xref: /aosp_15_r20/external/cldr/tools/cldr-code/src/main/java/org/unicode/cldr/tool/GenerateSidewaysView.java (revision 912701f9769bb47905792267661f0baf2b85bed5)
1 /*
2  **********************************************************************
3  * Copyright (c) 2002-2004, International Business Machines
4  * Corporation and others.  All Rights Reserved.
5  **********************************************************************
6  * Author: Mark Davis
7  **********************************************************************
8  */
9 package org.unicode.cldr.tool;
10 
11 import com.google.common.collect.ImmutableMap;
12 import com.google.common.collect.ImmutableSet;
13 import com.ibm.icu.dev.tool.shared.UOption;
14 import com.ibm.icu.dev.util.UnicodeMap;
15 import com.ibm.icu.impl.Relation;
16 import com.ibm.icu.impl.Utility;
17 import com.ibm.icu.lang.UCharacter;
18 import com.ibm.icu.lang.UScript;
19 import com.ibm.icu.text.BreakIterator;
20 import com.ibm.icu.text.Collator;
21 import com.ibm.icu.text.Normalizer;
22 import com.ibm.icu.text.RuleBasedCollator;
23 import com.ibm.icu.text.RuleBasedNumberFormat;
24 import com.ibm.icu.text.Transliterator;
25 import com.ibm.icu.text.UTF16;
26 import com.ibm.icu.text.UnicodeSet;
27 import com.ibm.icu.text.UnicodeSetIterator;
28 import com.ibm.icu.util.ULocale;
29 import java.io.File;
30 import java.io.IOException;
31 import java.io.PrintWriter;
32 import java.util.Collection;
33 import java.util.Comparator;
34 import java.util.Date;
35 import java.util.EnumSet;
36 import java.util.HashMap;
37 import java.util.HashSet;
38 import java.util.Locale;
39 import java.util.Map;
40 import java.util.Map.Entry;
41 import java.util.Set;
42 import java.util.TreeMap;
43 import java.util.TreeSet;
44 import java.util.regex.Matcher;
45 import org.unicode.cldr.draft.FileUtilities;
46 import org.unicode.cldr.tool.ShowData.DataShower;
47 import org.unicode.cldr.util.CLDRFile;
48 import org.unicode.cldr.util.CLDRFile.Status;
49 import org.unicode.cldr.util.CLDRPaths;
50 import org.unicode.cldr.util.CldrUtility;
51 import org.unicode.cldr.util.DtdData;
52 import org.unicode.cldr.util.DtdData.Attribute;
53 import org.unicode.cldr.util.DtdData.AttributeStatus;
54 import org.unicode.cldr.util.Factory;
55 import org.unicode.cldr.util.FileCopier;
56 import org.unicode.cldr.util.LanguageTagParser;
57 import org.unicode.cldr.util.LanguageTagParser.Fields;
58 import org.unicode.cldr.util.LocaleIDParser;
59 import org.unicode.cldr.util.PathHeader;
60 import org.unicode.cldr.util.PathHeader.PageId;
61 import org.unicode.cldr.util.PatternCache;
62 import org.unicode.cldr.util.SimpleFactory;
63 import org.unicode.cldr.util.StringId;
64 import org.unicode.cldr.util.TransliteratorUtilities;
65 import org.unicode.cldr.util.XPathParts;
66 import org.xml.sax.SAXException;
67 
/**
 * This is a simple class that walks through the CLDR hierarchy. It gathers together all the items
 * from all the locales that share the same element chain, and thus presents a "sideways" view of
 * the data, in files called by_type/X.html, where X is a type. X may be the concatenation of more
 * than one element, where the file would otherwise be too large.
 *
 * @author medavis
 */
76 /*
77  * Notes:
78  * http://xml.apache.org/xerces2-j/faq-grammars.html#faq-3
79  * http://developers.sun.com/dev/coolstuff/xml/readme.html
80  * http://lists.xml.org/archives/xml-dev/200007/msg00284.html
81  * http://java.sun.com/j2se/1.4.2/docs/api/org/xml/sax/DTDHandler.html
82  */
83 public class GenerateSidewaysView {
    // NOTE(review): not referenced in the visible part of this file; the name suggests it
    // suppresses output that is too large to commit — confirm against its uses.
    private static final boolean TOO_BIG_FOR_GITHUB = true;
    // Output subdirectory name; appended to CLDRPaths.CHART_DIRECTORY for the DESTDIR default.
    private static final String DIR_NAME = "by_type";
    // debug flags
    static final boolean DEBUG = false;
    static final boolean DEBUG2 = false;
    static final boolean DEBUG_SHOW_ADD = false;
    static final boolean DEBUG_ELEMENT = false;
    static final boolean DEBUG_SHOW_BAT = false;

    static final boolean FIX_ZONE_ALIASES = true;

    // Indexes into the options array; each value must equal the position of the
    // corresponding UOption in that array.
    private static final int HELP1 = 0,
            HELP2 = 1,
            SOURCEDIR = 2,
            DESTDIR = 3,
            MATCH = 4,
            SKIP = 5,
            TZADIR = 6,
            NONVALIDATING = 7,
            SHOW_DTD = 8,
            TRANSLIT = 9,
            PATH = 10;
106 
    // Command-line options, in the order given by the HELP1..PATH index constants.
    // main() reads PATH, MATCH and DESTDIR; the remaining options are parsed but are not
    // consulted in the visible part of this file.
    private static final UOption[] options = {
        UOption.HELP_H(),
        UOption.HELP_QUESTION_MARK(),
        UOption.SOURCEDIR().setDefault(CLDRPaths.MAIN_DIRECTORY),
        UOption.DESTDIR()
                .setDefault(
                        CLDRPaths.CHART_DIRECTORY
                                + DIR_NAME
                                + "/"), // C:/cvsdata/unicode/cldr/diff/by_type/
        UOption.create("match", 'm', UOption.REQUIRES_ARG).setDefault(".*"),
        UOption.create("skip", 'z', UOption.REQUIRES_ARG).setDefault("zh_(C|S|HK|M).*"),
        UOption.create("tzadir", 't', UOption.REQUIRES_ARG)
                .setDefault("C:\\ICU4J\\icu4j\\src\\com\\ibm\\icu\\dev\\tool\\cldr\\"),
        UOption.create("nonvalidating", 'n', UOption.NO_ARG),
        UOption.create("dtd", 'w', UOption.NO_ARG),
        UOption.create("transliterate", 'y', UOption.NO_ARG),
        UOption.create("path", 'p', UOption.REQUIRES_ARG),
    };
125 
    // Reusable matcher for CLDRFile.ALT_PROPOSED_PATTERN; it is reset(...) against each path
    // before use, so it carries state and is shared by all callers.
    private static final Matcher altProposedMatcher = CLDRFile.ALT_PROPOSED_PATTERN.matcher("");
    // private static final UnicodeSet ALL_CHARS = new UnicodeSet(0, 0x10FFFF);
    // Frozen set built from the pattern [[:m:]]; only referenced from commented-out code in
    // the visible part of this file.
    protected static final UnicodeSet COMBINING = new UnicodeSet("[[:m:]]").freeze();
129 
getFirstScript(UnicodeSet exemplars)130     static int getFirstScript(UnicodeSet exemplars) {
131         for (UnicodeSetIterator it = new UnicodeSetIterator(exemplars); it.next(); ) {
132             int script = UScript.getScript(it.codepoint);
133             if (script == UScript.COMMON || script == UScript.INHERITED) {
134                 continue;
135             }
136             return script;
137         }
138         return UScript.COMMON;
139     }
140 
141     static Comparator<Object> UCA;
142 
143     static {
144         RuleBasedCollator UCA2 = (RuleBasedCollator) Collator.getInstance(ULocale.ROOT);
145         UCA2.setNumericCollation(true);
146         UCA2.setStrength(Collator.IDENTICAL);
147         UCA =
148                 new org.unicode.cldr.util.MultiComparator(
149                         UCA2, new UTF16.StringComparator(true, false, 0));
150     }
151 
    // For each path header: value -> set of locale IDs carrying that value. Read by main()
    // and showExemplars(); locale IDs ending in "*" are rendered in italics by main().
    private static Map<PathHeader, Map<String, Set<String>>> path_value_locales = new TreeMap<>();
    // Reset at the top of main(); used to report the elapsed time at the end of the run.
    private static long startTime = System.currentTimeMillis();

    // English collator configured below with identical strength and numeric collation.
    static RuleBasedCollator standardCollation =
            (RuleBasedCollator) Collator.getInstance(ULocale.ENGLISH);

    static {
        standardCollation.setStrength(Collator.IDENTICAL);
        standardCollation.setNumericCollation(true);
    }

    // The "en" CLDRFile, assigned in main(); supplies English values and locale names.
    private static CLDRFile english;
    // private static DataShower dataShower = new DataShower();
    // Matcher for the --path option, or null when that option was not given (set in main()).
    private static Matcher pathMatcher;
166 
167     static final class OptionalPrinter {
168         PrintWriter printWriter;
169 
print(String s)170         public void print(String s) {
171             if (printWriter != null) {
172                 print(s);
173             }
174         }
175 
println()176         public void println() {
177             print("\n");
178         }
179 
close()180         public void close() {
181             if (printWriter != null) {
182                 close();
183             }
184         }
185     }
186 
    /**
     * Entry point: loads the locale data, then writes the by-type charts.
     *
     * <p>Steps, in order: parse options; build a factory over the main, annotations and
     * subdivisions directories (filtered by the {@code match} option); load English; prepare the
     * destination directory and static resources; call {@code loadInformation}; write
     * {@code index.html}; write one table per path, starting a new output file whenever the
     * file name computed by {@code getFileName2} changes; finally write the exemplar charts and
     * report the elapsed time.
     *
     * @param args command-line arguments, parsed against {@code options}
     * @throws SAXException declared by this method; the visible code does not show where it
     *     originates
     * @throws IOException if reading or writing files fails
     */
    public static void main(String[] args) throws SAXException, IOException {
        startTime = System.currentTimeMillis();
        ToolUtilities.registerExtraTransliterators();
        UOption.parseArgs(args, options);

        // Optional path filter; null means "accept every path".
        pathMatcher =
                options[PATH].value == null
                        ? null
                        : PatternCache.get(options[PATH].value).matcher("");

        // Note: the source directories are fixed here; the SOURCEDIR option is not consulted.
        File[] paths = {
            new File(CLDRPaths.MAIN_DIRECTORY),
            new File(CLDRPaths.ANNOTATIONS_DIRECTORY),
            new File(CLDRPaths.SUBDIVISIONS_DIRECTORY)
        };
        Factory cldrFactory = SimpleFactory.make(paths, options[MATCH].value);

        // Factory cldrFactory = Factory.make(options[SOURCEDIR].value, options[MATCH].value);
        // NOTE(review): the second argument presumably requests the resolved file — confirm.
        english = cldrFactory.make("en", true);
        pathHeaderFactory = PathHeader.getFactory(english);

        FileCopier.ensureDirectoryExists(options[DESTDIR].value);
        FileCopier.copy(
                GenerateSidewaysView.class,
                "bytype-index.css",
                options[DESTDIR].value,
                "index.css");
        FormattedFileWriter.copyIncludeHtmls(options[DESTDIR].value);

        // now get the info

        loadInformation(cldrFactory);
        // File name of the chart currently being written; "" until the first path is seen.
        String oldMain = "";
        PrintWriter out = null;

        System.out.println("Getting types " + path_value_locales.size());
        // Set<String> types = new TreeSet<String>();
        // for (PathHeader path : path_value_locales.keySet()) {
        // String main = getFileName2(path);
        // if (!main.equals(oldMain)) {
        // oldMain = main;
        // types.add(main);
        // }
        // }
        String headerString = getHeader(path_value_locales.keySet());
        // Instantiate the index page from its template, substituting the %...% placeholders.
        FileCopier.copyAndReplace(
                GenerateSidewaysView.class,
                "bytype-index.html",
                options[DESTDIR].value,
                "index.html",
                ImmutableMap.of(
                        "%header%",
                        headerString,
                        "%version%",
                        ToolConstants.CHART_DISPLAY_VERSION,
                        "%index%",
                        "../index.html",
                        "%index-title%",
                        "Main Charts Index",
                        "%date%",
                        CldrUtility.isoFormatDateOnly(new Date())));
        //        FileUtilities.copyFile(GenerateSidewaysView.class, "bytype-index.html",
        // options[DESTDIR].value, "index.html",
        //            new String[] { "%header%", headerString });

        System.out.println(
                "Printing files in " + new File(options[DESTDIR].value).getAbsolutePath());
        // Transliterator toLatin = Transliterator.getInstance("any-latin");
        toHTML = TransliteratorUtilities.toHTML;
        // UnicodeSet BIDI_R = new UnicodeSet("[[:Bidi_Class=R:][:Bidi_Class=AL:]]");

        // Last section header row written to the current file; reset for each new file.
        String oldHeader = "";
        // Created without a writer, so TSV output is dropped unless a writer is attached
        // elsewhere (NOTE(review): presumably by start(...) — confirm).
        OptionalPrinter tsvFile = new OptionalPrinter();

        // path_value_locales is a TreeMap, so paths are visited in PathHeader order.
        for (PathHeader path : path_value_locales.keySet()) {
            String main = getFileName2(path, null);
            if (!main.equals(oldMain)) {
                // The file name changed: switch to a new output file and open its table.
                oldMain = main;
                out =
                        start(
                                out,
                                main,
                                headerString,
                                path.getSection() + ":" + path.getPage(),
                                tsvFile);
                out.println("<table class='table'>");
                oldHeader = "";
            }
            String key = path.getCode();
            String anchor = toHTML.transliterate(key);

            String originalPath = path.getOriginalPath(); // prettyPath.getOriginal(path);
            String englishValue = english.getStringValue(originalPath);
            if (englishValue != null) {
                englishValue = "English: ‹" + englishValue + "›";
            } else {
                englishValue = "";
            }

            // Emit a header row only when the header changes; the literal text "null" is skipped.
            String header = path.getHeader();
            if (!header.equals(oldHeader) && !header.equals("null")) {
                out.println(
                        "<tr><th colSpan='2' class='pathHeader'>"
                                + CldrUtility.getDoubleLinkedText(header)
                                + "</th></tr>");
                oldHeader = header;
            }
            // Anchor derived from the hex form of the path's StringId.
            String anchorId = Long.toHexString(StringId.getId(path.getOriginalPath()));
            out.println(
                    "<tr>"
                            + "<th class='path'>"
                            + CldrUtility.getDoubleLinkedText(anchorId, anchor)
                            + "</th>"
                            + "<th class='path'>"
                            + toHTML.transliterate(englishValue)
                            + "</th>"
                            + "</tr>");
            Map<String, Set<String>> value_locales = path_value_locales.get(path);
            // One table row (and one TSV line) per distinct value of this path.
            for (String value : value_locales.keySet()) {
                // String outValue = toHTML.transliterate(value);
                // String transValue = value;
                // try {
                // transValue = toLatin.transliterate(value);
                // } catch (RuntimeException e) {
                // }
                // if (!transValue.equals(value)) {
                // outValue = "<span title='" + toHTML.transliterate(transValue) + "'>" + outValue +
                // "</span>";
                // }
                String valueClass = " class='value'";
                if (DataShower.getBidiStyle(value).length() != 0) {
                    valueClass = " class='rtl_value'";
                }
                out.println(
                        "<tr><th"
                                + valueClass
                                + ">"
                                + DataShower.getPrettyValue(value)
                                + "</th><td class='td'>");
                tsvFile.print(
                        path.getSection()
                                + "\t"
                                + path.getPage()
                                + "\t"
                                + path.getHeader()
                                + "\t"
                                + path.getCode()
                                + "\t"
                                + value
                                + "\t");

                Set<String> locales = value_locales.get(value);
                boolean first = true;
                // When root carries this value, most locales are folded into "all others" below.
                boolean containsRoot = locales.contains("root");
                for (String locale : locales) {
                    if (first) first = false;
                    else out.print(" ");
                    if (locale.endsWith("*")) {
                        // A trailing "*" marks the locale for italic display; strip it for output.
                        locale = locale.substring(0, locale.length() - 1);
                        out.print("<i>\u00B7" + locale + "\u00B7</i>");
                        tsvFile.print("\u00B7" + locale + "\u00B7");
                    } else if (!containsRoot) {
                        out.print("\u00B7" + locale + "\u00B7");
                        tsvFile.print("\u00B7" + locale + "\u00B7");
                    } else if (locale.contains("_")) {
                        // not same as root, but need to test for parent
                        // if the parent is not in the same list, then we include anyway.
                        // Cf http://unicode.org/cldr/trac/ticket/7228
                        String parent = LocaleIDParser.getParent(locale);
                        if (!locales.contains(parent)) {
                            out.print("<b>\u00B7" + locale + "\u00B7</b>");
                            tsvFile.print("\u00B7" + locale + "\u00B7");
                        }
                    }
                }
                if (containsRoot) {
                    out.print("<b>\u00B7all\u00B7others\u00B7</b>");
                    tsvFile.print("\u00B7all-others\u00B7");
                }
                out.println("</td></tr>");
                tsvFile.println();
            }
        }
        // Exemplar charts: each entry is {path, variant, title}.
        for (String[] pair : EXEMPLARS) {
            showExemplars(out, headerString, pair[0], pair[1], pair[2], tsvFile);
        }
        finish(out, tsvFile);
        finishAll(out, tsvFile);
        System.out.println(
                "Done in "
                        + new RuleBasedNumberFormat(
                                        new ULocale("en"), RuleBasedNumberFormat.DURATION)
                                .format((System.currentTimeMillis() - startTime) / 1000.0));
    }
381 
    // Exemplar charts to generate. Each row is {path, variant, title} and is passed to
    // showExemplars(...) in that order by main().
    static final String[][] EXEMPLARS = {
        {"//ldml/characters/exemplarCharacters", "main", "Main Exemplars"},
        {
            "//ldml/characters/exemplarCharacters[@type=\"punctuation\"]",
            "punctuation",
            "Punctuation Exemplars"
        },
        {"//ldml/characters/exemplarCharacters[@type=\"index\"]", "index", "Index Exemplars"},
        // TODO look at numbers, auxiliary
    };
392 
showExemplars( PrintWriter out, String headerString, String pathName, String variant, String title, OptionalPrinter tsvFile)393     private static PrintWriter showExemplars(
394             PrintWriter out,
395             String headerString,
396             String pathName,
397             String variant,
398             String title,
399             OptionalPrinter tsvFile)
400             throws IOException {
401         PathHeader ph = fixPath(pathName, null);
402         String filename = getFileName2(ph, variant);
403         out = start(out, filename, headerString, title, tsvFile);
404         Map<String, Set<String>> value_locales = path_value_locales.get(ph);
405 
406         // TODO change logic so that aux characters characters work well.
407 
408         Map<String, UnicodeMap<Set<String>>> script_UnicodeMap = new TreeMap<>();
409         // UnicodeMap mapping = new UnicodeMap();
410         UnicodeSet stuffToSkip = new UnicodeSet("[:Han:]");
411 
412         // get the locale information
413         UnicodeSet totalExemplars = new UnicodeSet();
414         for (String value : value_locales.keySet()) {
415             // flatten out UnicodeSet
416             UnicodeSet exemplars = new UnicodeSet(value);
417             if (variant.equals("main")) {
418                 UnicodeSet extras = new UnicodeSet();
419                 for (String item : exemplars) {
420                     extras.addAll(Normalizer.normalize(item, Normalizer.NFD));
421                 }
422                 exemplars.addAll(extras);
423             }
424             totalExemplars.addAll(exemplars);
425             exemplars.removeAll(stuffToSkip);
426 
427             Set<String> locales = value_locales.get(value);
428             // String script = UScript.getName(getFirstScript(exemplars));
429             for (String locale : locales) {
430                 checkTr(script_UnicodeMap);
431                 String key =
432                         locale.endsWith("*") ? locale.substring(0, locale.length() - 1) : locale;
433                 String script = LOCALE_TO_SCRIPT.get(key);
434                 // try a few variants until we get the script
435                 if (script == null && key.contains("_")) {
436                     String simpleParent = LanguageTagParser.getSimpleParent(key);
437                     script = LOCALE_TO_SCRIPT.get(simpleParent);
438                     if (script == null && simpleParent.contains("_")) {
439                         simpleParent = LanguageTagParser.getSimpleParent(simpleParent);
440                         script = LOCALE_TO_SCRIPT.get(simpleParent);
441                     }
442                 }
443                 if (script == null) {
444                     script = UScript.getName(UScript.UNKNOWN);
445                 }
446                 Set<String> temp = new HashSet<>();
447                 temp.add(locale);
448                 checkTr(script_UnicodeMap);
449                 UnicodeMap<Set<String>> mapping = script_UnicodeMap.get(script);
450                 if (mapping == null) {
451                     script_UnicodeMap.put(script, mapping = new UnicodeMap<>());
452                 }
453                 checkTr(script_UnicodeMap);
454                 mapping.composeWith(exemplars, temp, setComposer);
455                 checkTr(script_UnicodeMap);
456             }
457         }
458         System.out.println("@@@TOTAL:\t" + variant + "\t" + totalExemplars.toPattern(false));
459         for (String script : script_UnicodeMap.keySet()) {
460             UnicodeMap<Set<String>> mapping = script_UnicodeMap.get(script);
461             writeCharToLocaleMapping(out, script, mapping);
462         }
463         return out;
464     }
465 
checkTr(Map<String, UnicodeMap<Set<String>>> script_UnicodeMap)466     private static void checkTr(Map<String, UnicodeMap<Set<String>>> script_UnicodeMap) {
467         UnicodeMap<Set<String>> unicodeMap = script_UnicodeMap.get("Cyrillic");
468         if (unicodeMap == null) {
469             return;
470         }
471         Set<String> foo = unicodeMap.get(0x21);
472         if (foo == null) {
473             return;
474         }
475         if (foo.contains("tr")) {
476             System.out.println("huh?");
477         }
478     }
479 
writeCharToLocaleMapping( PrintWriter out, String script, UnicodeMap<Set<String>> mapping)480     private static void writeCharToLocaleMapping(
481             PrintWriter out, String script, UnicodeMap<Set<String>> mapping) {
482         BreakIterator charBreaks =
483                 BreakIterator.getCharacterInstance(ULocale.ROOT); // TODO, make default language for
484         // script
485         System.out.println("@@Exemplars for\t" + script + "\t" + mapping.keySet());
486         if (script.equals("Hangul")) { //  || script.equals("Common")
487             return; // skip these
488         }
489         // find out all the locales and all the characters
490         Set<String> allLocales = new TreeSet<>(UCA);
491         Set<String> allChars = new TreeSet<>(UCA);
492         Set<String> allStrings = new TreeSet<>(UCA);
493         for (Set<String> locales : mapping.getAvailableValues()) {
494             allLocales.addAll(locales);
495             UnicodeSet unicodeSet = mapping.keySet(locales);
496             for (String item : unicodeSet) {
497                 charBreaks.setText(item);
498                 int endFirst = charBreaks.next();
499                 if (endFirst == item.length()) {
500                     allChars.add(item);
501                 } else {
502                     allStrings.add(item);
503                 }
504             }
505         }
506         // get the columns, and show them
507         out.println("<table class='table' style='width:1%'>");
508         out.println("<caption>" + script + "</caption>");
509         exemplarHeader(out, allChars);
510 
511         for (String locale : allLocales) {
512             String headerHeader =
513                     "<th class='head'>"
514                             + cleanLocale(locale, false)
515                             + "</th><td class='head nowrap left'>"
516                             + cleanLocale(locale, true)
517                             + "</td>";
518             out.println("<tr>");
519             out.println(headerHeader);
520 
521             for (String item : allChars) {
522                 // String exemplarsWithoutBrackets = displayExemplars(item);
523                 if (mapping.get(item).contains(locale)) {
524                     out.println("<td class='cell'" + ">" + displayCharacter(item) + "</td>");
525                 } else {
526                     out.println("<td class='empty'>\u00a0</td>");
527                 }
528             }
529             // now strings, if any
530             StringBuilder strings = new StringBuilder();
531             int lastLineStart = 0;
532             for (String item : allStrings) {
533                 // String exemplarsWithoutBrackets = displayExemplars(item);
534                 if (mapping.get(item).contains(locale)) {
535                     int str_len = strings.length();
536                     if (str_len != 0) {
537                         if (str_len - lastLineStart > 20) {
538                             strings.append(System.lineSeparator());
539                             lastLineStart = str_len;
540                         } else {
541                             strings.append(' ');
542                         }
543                     }
544                     strings.append(displayCharacter(item));
545                 }
546             }
547             if (strings.length() == 0) {
548                 out.println("<td class='empty'>\u00a0</td>");
549             } else {
550                 out.println(
551                         "<td class='cell nowrap'>"
552                                 + displayCharacter(strings.toString())
553                                         .replace(System.lineSeparator(), "<br>")
554                                 + "</td>");
555             }
556 
557             out.println(headerHeader);
558             out.println("</tr>");
559         }
560         exemplarHeader(out, allChars);
561         out.println("</table>");
562         out.flush();
563     }
564 
characterTitle(String item)565     private static String characterTitle(String item) {
566         return ("title='U+"
567                 + toHTML.transform(
568                         Utility.hex(item, 4, ", U+", true, new StringBuilder())
569                                 + " "
570                                 + UCharacter.getName(item, ", "))
571                 + "'");
572     }
573 
exemplarHeader(PrintWriter out, Set<String> allChars)574     private static void exemplarHeader(PrintWriter out, Set<String> allChars) {
575         out.println("<tr>");
576         out.println("<th class='head nowrap' colSpan='2'>Locale \\\u00a0Chars</th>");
577         for (String item : allChars) {
578             out.println(
579                     "<th class='head' "
580                             + characterTitle(item)
581                             + ">"
582                             + displayCharacter(item)
583                             + "</th>");
584         }
585         out.println("<th class='head'>Clusters</th>");
586         out.println("<th class='head nowrap' colSpan='2'>Locale \\\u00a0Chars</th>");
587         out.println("</tr>");
588     }
589 
    // Frozen set of Mn, Me and default-ignorable code points. displayCharacter() pads an item
    // with no-break spaces when its first code point is in this set.
    static final UnicodeSet NONSPACING =
            new UnicodeSet("[[:Mn:][:Me:][:default_ignorable_code_point:]]").freeze();
592 
displayCharacter(String item)593     public static String displayCharacter(String item) {
594         if (item.length() == 0) return "<i>none</i>";
595         int ch = item.codePointAt(0);
596         if (NONSPACING.contains(ch)) {
597             item = "\u00a0" + item + "\u00a0";
598         }
599         String result = toHTML.transform(item);
600         return result;
601     }
602 
    // Shared, mutable parser reused by cleanLocale() on every call.
    static LanguageTagParser cleanLocaleParser = new LanguageTagParser();
    // Every LanguageTagParser field except SCRIPT; cleanLocale() uses it to format a locale
    // ID without its script.
    static Set<Fields> allButScripts = EnumSet.allOf(Fields.class);

    static {
        allButScripts.remove(Fields.SCRIPT);
    }
609 
cleanLocale(String item, boolean name)610     private static String cleanLocale(String item, boolean name) {
611         if (item == null) {
612             return "<i>null</i>";
613         }
614         boolean draft = item.endsWith("*");
615         if (draft) {
616             item = item.substring(0, item.length() - 1);
617         }
618         cleanLocaleParser.set(item);
619         item = cleanLocaleParser.toString(allButScripts);
620         String core = item;
621         item = toHTML.transform(item);
622         if (name) {
623             item = english.getName(core);
624             item = item == null ? "<i>null</i>" : toHTML.transform(item);
625         }
626         if (draft) {
627             item = "<i>" + item + "</i>";
628         }
629         return item;
630     }
631 
632     // private static void showExemplarRow(PrintWriter out, Set<String> allLocales, UnicodeSet
633     // lastChars, Set locales) {
634     // String exemplarsWithoutBrackets = displayExemplars(lastChars);
635     // out.println("<tr><th class='head'>" + exemplarsWithoutBrackets + "</th>");
636     // for (String item : allLocales) {
637     // String cleanItem;
638     // if (locales.contains(item)) {
639     // cleanItem = "<th class='value'>" + cleanLocale(item, false) + "</th>";
640     // } else {
641     // cleanItem = "<td class='value'>\u00a0</td>";
642     // }
643     // out.println(cleanItem);
644     // }
645     // out.println("</tr>");
646     // }
647 
648     // private static final StringTransform MyTransform = new StringTransform() {
649     //
650     // public String transform(String source) {
651     // StringBuilder builder = new StringBuilder();
652     // int cp = 0;
653     // builder.append("<span title='");
654     // String prefix = "";
655     // for (int i = 0; i < source.length(); i += UTF16.getCharCount(cp)) {
656     // cp = UTF16.charAt(source, i);
657     // if (i == 0) {
658     // if (COMBINING.contains(cp)) {
659     // prefix = "\u25CC";
660     // }
661     // } else {
662     // builder.append(" + ");
663     // }
664     // builder.append("U+").append(com.ibm.icu.impl.Utility.hex(cp,4)).append('
665     // ').append(UCharacter.getExtendedName(cp));
666     // }
667     // builder.append("'>").append(prefix).append(source).append("</span>");
668     // return builder.toString();
669     // }
670     //
671     // };
672 
673     // private static String displayExemplars(UnicodeSet lastChars) {
674     // String exemplarsWithoutBrackets = new PrettyPrinter()
675     // .setOrdering(UCA != null ? UCA : Collator.getInstance(ULocale.ROOT))
676     // .setSpaceComparator(UCA != null ? UCA : Collator.getInstance(ULocale.ROOT)
677     // .setStrength2(Collator.PRIMARY))
678     // .setCompressRanges(true)
679     // .setToQuote(ALL_CHARS)
680     // .setQuoter(MyTransform)
681     // .format(lastChars);
682     // exemplarsWithoutBrackets = exemplarsWithoutBrackets.substring(1,
683     // exemplarsWithoutBrackets.length() - 1);
684     // return exemplarsWithoutBrackets;
685     // }
686 
687     // private static boolean isNextCharacter(String last, String value) {
688     // if (UTF16.hasMoreCodePointsThan(last, 1)) return false;
689     // if (UTF16.hasMoreCodePointsThan(value, 1)) return false;
690     // int lastChar = UTF16.charAt(last,0);
691     // int valueChar = UTF16.charAt(value,0);
692     // return lastChar + 1 == valueChar;
693     // }
694 
695     static UnicodeMap.Composer<Set<String>> setComposer =
696             new UnicodeMap.Composer<>() {
697                 @Override
698                 public Set<String> compose(
699                         int codepoint, String string, Set<String> a, Set<String> b) {
700                     if (a == null) {
701                         return b;
702                     } else if (b == null) {
703                         return a;
704                     } else {
705                         TreeSet<String> result = new TreeSet<>(a);
706                         result.addAll(b);
707                         return result;
708                     }
709                 }
710             };
711 
712     static Map<String, String> LOCALE_TO_SCRIPT = new HashMap<>();
713 
loadInformation(Factory cldrFactory)714     private static void loadInformation(Factory cldrFactory) {
715         Set<String> alllocales = cldrFactory.getAvailable();
716         String[] postFix = new String[] {""};
717         // gather all information
718         // TODO tweek for value-laden attributes
719         for (String localeID : alllocales) {
720             System.out.println("Loading: " + localeID);
721             System.out.flush();
722 
723             CLDRFile cldrFile;
724             try {
725                 cldrFile = cldrFactory.make(localeID, localeID.equals("root"));
726             } catch (IllegalArgumentException e) {
727                 System.err.println("Couldn't open " + localeID);
728                 continue;
729             }
730             if (cldrFile.isNonInheriting()) continue;
731             for (String path : cldrFile) {
732                 if (pathMatcher != null && !pathMatcher.reset(path).matches()) {
733                     continue;
734                 }
735                 if (altProposedMatcher.reset(path).matches()) {
736                     continue;
737                 }
738                 if (path.indexOf("/alias") >= 0) continue;
739                 if (path.indexOf("/identity") >= 0) continue;
740                 if (path.indexOf("/references") >= 0) continue;
741                 PathHeader ph = fixPath(path, postFix);
742                 if (ph == null || ph.shouldHide()) {
743                     continue;
744                 }
745                 String fullPath = cldrFile.getFullXPath(path);
746                 String value = getValue(cldrFile, path, fullPath);
747                 if (value == null || CldrUtility.INHERITANCE_MARKER.equals(value)) {
748                     continue;
749                 }
750                 if (fullPath.indexOf("[@draft=\"unconfirmed\"]") >= 0
751                         || fullPath.indexOf("[@draft=\"provisional\"]") >= 0) {
752                     postFix[0] = "*";
753                 }
754                 if (path.equals("//ldml/characters/exemplarCharacters")) {
755                     UnicodeSet exemplars;
756                     try {
757                         exemplars = new UnicodeSet(value);
758                         String script = UScript.getName(getFirstScript(exemplars));
759                         LOCALE_TO_SCRIPT.put(localeID, script);
760                     } catch (Exception e) {
761 
762                     }
763                 }
764                 Map<String, Set<String>> value_locales = path_value_locales.get(ph);
765                 if (value_locales == null) {
766                     path_value_locales.put(ph, value_locales = new TreeMap<>(standardCollation));
767                 }
768                 Set<String> locales = value_locales.get(value);
769                 if (locales == null) {
770                     value_locales.put(value, locales = new TreeSet<>());
771                 }
772                 locales.add(localeID + postFix[0]);
773             }
774         }
775         Relation<String, String> sorted =
776                 Relation.of(new TreeMap<String, Set<String>>(), TreeSet.class);
777         for (Entry<String, String> s : LOCALE_TO_SCRIPT.entrySet()) {
778             sorted.put(s.getValue(), s.getKey());
779         }
780         for (Entry<String, Set<String>> s : sorted.keyValuesSet()) {
781             System.out.println(s);
782         }
783     }
784 
785     static PathHeader.Factory pathHeaderFactory;
786 
787     /**
788      * @param path
789      * @param localePrefix
790      * @return
791      */
fixPath(String path, String[] localePrefix)792     private static PathHeader fixPath(String path, String[] localePrefix) {
793         if (localePrefix != null) {
794             localePrefix[0] = "";
795         }
796         return pathHeaderFactory.fromPath(path);
797     }
798 
799     /**
800      * @param parts
801      * @param skipAttributes
802      */
getValueAttributes(XPathParts parts)803     private static String getValueAttributes(XPathParts parts) {
804         String element = parts.getElement(-1);
805         Collection<String> attributes = parts.getAttributeKeys(-1);
806         DtdData dtdData = parts.getDtdData();
807         StringBuilder sb = new StringBuilder();
808         for (String attributeName : attributes) {
809             if (skipSet.contains(attributeName)) {
810                 continue;
811             }
812             Attribute attribute = dtdData.getAttribute(element, attributeName);
813             AttributeStatus status = attribute.getStatus();
814             switch (status) {
815                 case distinguished:
816                 case metadata: // skip
817                     break;
818                 case value: // keep
819                     sb.append(
820                             attributeName + "=" + parts.getAttributeValue(-1, attributeName) + " ");
821                     break;
822             }
823         }
824         return sb.toString();
825     }
826 
827     static final Set<String> skipSet = ImmutableSet.of("draft", "alt");
828 
829     static Status status = new Status();
830 
831     /** */
getValue(CLDRFile cldrFile, String path, String fullPath)832     private static String getValue(CLDRFile cldrFile, String path, String fullPath) {
833         String value = cldrFile.getStringValue(path);
834         if (value == null) {
835             System.out.println("Null value for " + path);
836             return value;
837         }
838         cldrFile.getSourceLocaleID(path, status);
839         if (!path.equals(status.pathWhereFound)) {
840             // value = "[" + prettyPath.getPrettyPath(status.pathWhereFound, false) + "]";
841             value = null;
842             return value;
843         }
844         if (value.length() == 0) {
845             XPathParts parts = XPathParts.getFrozenInstance(fullPath);
846             value = getValueAttributes(parts);
847         }
848         return value;
849     }
850 
getFileName2(PathHeader header, String suffix)851     private static String getFileName2(PathHeader header, String suffix) {
852         String result =
853                 (header.getSection() + "." + header.getPage())
854                         .replace(" ", "_")
855                         .replace("/", "_")
856                         .replace("(", "_")
857                         .replace(")", "_");
858         if (suffix != null) {
859             result += "." + suffix;
860         }
861         return result.toLowerCase(Locale.ENGLISH);
862     }
863 
864     static String[] headerAndFooter = new String[2];
865     private static Transliterator toHTML;
866 
867     /**
868      * @param tsvFile TODO
869      * @param path2
870      */
start( PrintWriter out, String main, String headerString, String title, OptionalPrinter tsvFile)871     private static PrintWriter start(
872             PrintWriter out,
873             String main,
874             String headerString,
875             String title,
876             OptionalPrinter tsvFile)
877             throws IOException {
878         finish(out, tsvFile);
879         out = writeHeader(main, title, tsvFile);
880         out.println(headerString);
881         return out;
882     }
883 
getHeader(Set<PathHeader> set)884     public static String getHeader(Set<PathHeader> set) {
885         StringBuffer out = new StringBuffer("<table class='simple'><tr>");
886         String lastMain = "";
887         String lastSub = "";
888         for (PathHeader pathHeader : set) {
889             String mainName = pathHeader.getSection();
890             String subName = TransliteratorUtilities.toHTML.transform(pathHeader.getPage());
891             if (!mainName.equals(lastMain)) {
892                 if (lastMain.length() != 0) {
893                     out.append("</tr>" + System.lineSeparator() + "<tr>");
894                 }
895                 out.append(
896                         "<th align='right' nowrap style='vertical-align: top'><b>"
897                                 + TransliteratorUtilities.toHTML.transform(mainName)
898                                 + ":&nbsp;</b></th><td>");
899                 lastMain = mainName;
900                 lastSub = subName;
901             } else if (!subName.equals(lastSub)) {
902                 out.append(" | ");
903                 lastSub = subName;
904             } else {
905                 continue; // identical, skip
906             }
907             out.append("<a href='" + getFileName2(pathHeader, null) + ".html'>" + subName + "</a>");
908             if (pathHeader.getPageId() == PageId.Alphabetic_Information) {
909                 for (String[] pair : EXEMPLARS) {
910                     out.append(
911                             " | <a href='"
912                                     + getFileName2(pathHeader, pair[1])
913                                     + ".html'>"
914                                     + pair[2]
915                                     + "</a>");
916                 }
917             }
918             continue;
919         }
920         return out.append("</td></tr>" + System.lineSeparator() + "</table>").toString();
921     }
922 
writeHeader(String main, String title, OptionalPrinter tsvFile)923     private static PrintWriter writeHeader(String main, String title, OptionalPrinter tsvFile)
924             throws IOException {
925         PrintWriter out;
926         out = FileUtilities.openUTF8Writer(options[DESTDIR].value, main + ".html");
927         if (!TOO_BIG_FOR_GITHUB && tsvFile.printWriter == null) {
928             tsvFile.printWriter =
929                     FileUtilities.openUTF8Writer(
930                             Chart.getTsvDir(options[DESTDIR].value, DIR_NAME), DIR_NAME + ".tsv");
931             tsvFile.print("# By-Type Data\n");
932             tsvFile.print("# Section\tPage\tHeader\tCode\tValue\tLocales\n");
933         }
934 
935         ShowData.getChartTemplate(
936                 "By-Type Chart: " + title,
937                 ToolConstants.CHART_DISPLAY_VERSION,
938                 "",
939                 headerAndFooter,
940                 null,
941                 false);
942         out.println(headerAndFooter[0]);
943         return out;
944     }
945 
946     /**
947      * @param tsvFile TODO
948      */
finish(PrintWriter out, OptionalPrinter tsvFile)949     private static void finish(PrintWriter out, OptionalPrinter tsvFile) {
950         if (out == null) return;
951         out.println("</table>");
952         out.println(headerAndFooter[1]);
953         out.close();
954     }
955 
finishAll(PrintWriter out, OptionalPrinter tsvFile)956     private static void finishAll(PrintWriter out, OptionalPrinter tsvFile) {
957         // TODO Auto-generated method stub
958         // tsvFile.println("# EOF");
959         tsvFile.close();
960     }
961 }
962