xref: /aosp_15_r20/external/cldr/tools/cldr-code/src/main/java/org/unicode/cldr/util/ExtractCollationRules.java (revision 912701f9769bb47905792267661f0baf2b85bed5)
/*
 ******************************************************************************
 * Copyright (C) 2004-2005, International Business Machines Corporation and        *
 * others. All Rights Reserved.                                               *
 ******************************************************************************
 *
 *
 */
9 package org.unicode.cldr.util;
10 
11 import com.ibm.icu.text.Normalizer;
12 import com.ibm.icu.text.UTF16;
13 import com.ibm.icu.text.UnicodeSet;
14 import java.util.Iterator;
15 import java.util.Map;
16 import java.util.TreeMap;
17 
18 public class ExtractCollationRules {
19     Map<String, String> type_rules = new TreeMap<>();
20     StringBuffer rules = new StringBuffer();
21 
set(CLDRFile file)22     public ExtractCollationRules set(CLDRFile file) {
23         type_rules.clear();
24         String lastType = "";
25         rules.setLength(0);
26 
27         String context = null;
28 
29         for (Iterator it = file.iterator("//ldml/collations", file.getComparator());
30                 it.hasNext(); ) {
31 
32             // System.out.print(rules.substring(lastLen, rules.length()));
33             // lastLen = rules.length();
34 
35             String path = (String) it.next();
36             String value = file.getStringValue(path);
37             XPathParts parts = XPathParts.getFrozenInstance(path);
38             String type = parts.findAttributeValue("collation", "type");
39             if (!type.equals(lastType)) {
40                 lastType = type;
41                 type_rules.put(lastType, rules.toString());
42                 rules.setLength(0);
43             }
44             String mainType = parts.getElement(3);
45             // base?, settings?, suppress_contractions?, optimize?
46             // x: context?, ( p | pc | s | sc | t | tc | i | ic )*, extend?
47             if (mainType.equals("settings")) {
48                 writeSettings(parts.getAttributes(3), rules);
49                 continue;
50             } else if (mainType.equals("rules")) {
51                 String ruleType = parts.getElement(4);
52                 char c = ruleType.charAt(0);
53                 if (c == 'x') {
54                     ruleType = parts.getElement(5);
55                     c = ruleType.charAt(0);
56                 }
57                 boolean isMultiple = ruleType.length() > 1 && ruleType.charAt(1) == 'c';
58                 String lastContext = context;
59                 context = null;
60                 switch (c) {
61                     case 'r':
62                         appendOrdering("&", null, value, false, true);
63                         break;
64                     case 'p':
65                         appendOrdering("<", lastContext, value, isMultiple, true);
66                         break;
67                     case 's':
68                         appendOrdering("<<", lastContext, value, isMultiple, true);
69                         break;
70                     case 't':
71                         appendOrdering("<<<", lastContext, value, isMultiple, false);
72                         break;
73                     case 'i':
74                         appendOrdering("=", lastContext, value, isMultiple, false);
75                         break;
76                     case 'c':
77                         context = value;
78                         break;
79                     case 'e':
80                         appendOrdering("/", null, value, false, false);
81                         break;
82                     default:
83                         System.out.println("Couldn't handle: " + path + "\t" + value);
84                 }
85                 continue;
86             } else {
87 
88             }
89             System.out.println("Couldn't handle: " + path + "\t" + value);
90         }
91         type_rules.put(lastType, rules.toString());
92         return this;
93     }
94 
appendOrdering( String relation, String context, String valueAfter, boolean isMultiple, boolean lineBreakBefore)95     private void appendOrdering(
96             String relation,
97             String context,
98             String valueAfter,
99             boolean isMultiple,
100             boolean lineBreakBefore) {
101         if (isMultiple) {
102             int cp;
103             for (int i = 0; i < valueAfter.length(); i += UTF16.getCharCount(cp)) {
104                 cp = UTF16.charAt(valueAfter, i);
105                 if (lineBreakBefore) rules.append(CldrUtility.LINE_SEPARATOR);
106                 else rules.append(' ');
107                 rules.append(relation);
108                 if (context != null) rules.append(' ').append(quote(context));
109                 rules.append(' ').append(quote(UTF16.valueOf(cp)));
110             }
111         } else {
112             if (lineBreakBefore) rules.append(CldrUtility.LINE_SEPARATOR);
113             else rules.append(' ');
114             rules.append(relation);
115             if (context != null) rules.append(' ').append(quote(context));
116             rules.append(' ').append(quote(valueAfter));
117         }
118     }
119 
writeSettings(Map<String, String> attributes, StringBuffer results)120     private void writeSettings(Map<String, String> attributes, StringBuffer results) {
121         for (Iterator<String> it = attributes.keySet().iterator(); it.hasNext(); ) {
122             String attribute = it.next();
123             String value = attributes.get(attribute);
124             // TODO fix different cases
125             results.append("[" + attribute + " " + value + "]" + CldrUtility.LINE_SEPARATOR);
126             // if (attribute.equals("normalization")) {
127             //
128             // }
129         }
130     }
131 
iterator()132     public Iterator<String> iterator() {
133         return type_rules.keySet().iterator();
134     }
135 
getRules(Object key)136     public String getRules(Object key) {
137         return type_rules.get(key);
138     }
139 
140     static StringBuffer quoteOperandBuffer = new StringBuffer(); // faster
141 
142     static UnicodeSet needsQuoting = null;
143     static UnicodeSet needsUnicodeForm = null;
144 
quote(String s)145     static final String quote(String s) {
146         if (needsQuoting == null) {
147             /*
148              * c >= 'a' && c <= 'z'
149              * || c >= 'A' && c <= 'Z'
150              * || c >= '0' && c <= '9'
151              * || (c >= 0xA0 && !UCharacterProperty.isRuleWhiteSpace(c))
152              */
153             needsQuoting = new UnicodeSet("[[:whitespace:][:c:][:z:][:ascii:]-[a-zA-Z0-9]]"); //
154             // "[[:ascii:]-[a-zA-Z0-9]-[:c:]-[:z:]]"); // [:whitespace:][:c:][:z:]
155             // for (int i = 0; i <= 0x10FFFF; ++i) {
156             // if (UCharacterProperty.isRuleWhiteSpace(i)) needsQuoting.add(i);
157             // }
158             // needsQuoting.remove();
159             needsUnicodeForm = new UnicodeSet("[\\u000d\\u000a[:zl:][:zp:]]");
160         }
161         s = Normalizer.compose(s, false);
162         quoteOperandBuffer.setLength(0);
163         boolean noQuotes = true;
164         boolean inQuote = false;
165         int cp;
166         for (int i = 0; i < s.length(); i += UTF16.getCharCount(cp)) {
167             cp = UTF16.charAt(s, i);
168             if (!needsQuoting.contains(cp)) {
169                 if (inQuote) {
170                     quoteOperandBuffer.append('\'');
171                     inQuote = false;
172                 }
173                 quoteOperandBuffer.append(UTF16.valueOf(cp));
174             } else {
175                 noQuotes = false;
176                 if (cp == '\'') {
177                     quoteOperandBuffer.append("''");
178                 } else {
179                     if (!inQuote) {
180                         quoteOperandBuffer.append('\'');
181                         inQuote = true;
182                     }
183                     if (!needsUnicodeForm.contains(cp))
184                         quoteOperandBuffer.append(UTF16.valueOf(cp)); // cp != 0x2028
185                     else if (cp > 0xFFFF) {
186                         quoteOperandBuffer.append("\\U").append(hex(cp, 8));
187                     } else if (cp <= 0x20 || cp > 0x7E) {
188                         quoteOperandBuffer.append("\\u").append(hex(cp, 4));
189                     } else {
190                         quoteOperandBuffer.append(UTF16.valueOf(cp));
191                     }
192                 }
193             }
194             /*
195              * switch (c) {
196              * case '<': case '>': case '#': case '=': case '&': case '/':
197              * quoteOperandBuffer.append('\'').append(c).append('\'');
198              * break;
199              * case '\'':
200              * quoteOperandBuffer.append("''");
201              * break;
202              * default:
203              * if (0 <= c && c < 0x20 || 0x7F <= c && c < 0xA0) {
204              * quoteOperandBuffer.append("\\u").append(Utility.hex(c));
205              * break;
206              * }
207              * quoteOperandBuffer.append(c);
208              * break;
209              * }
210              */
211         }
212         if (inQuote) {
213             quoteOperandBuffer.append('\'');
214         }
215         if (noQuotes) return s; // faster
216         return quoteOperandBuffer.toString();
217     }
218 
hex(long i, int places)219     public static String hex(long i, int places) {
220         if (i == Long.MIN_VALUE) return "-8000000000000000";
221         boolean negative = i < 0;
222         if (negative) {
223             i = -i;
224         }
225         String result = Long.toString(i, 16).toUpperCase();
226         if (result.length() < places) {
227             result = "0000000000000000".substring(result.length(), places) + result;
228         }
229         if (negative) {
230             return '-' + result;
231         }
232         return result;
233     }
234 }
235