/*
 ******************************************************************************
 * Copyright (C) 2004-2005, International Business Machines Corporation and   *
 * others. All Rights Reserved.                                               *
 ******************************************************************************
 *
 *
 */
package org.unicode.cldr.util;

import com.ibm.icu.text.Normalizer;
import com.ibm.icu.text.UTF16;
import com.ibm.icu.text.UnicodeSet;
import java.util.Iterator;
import java.util.Map;
import java.util.TreeMap;

/**
 * Reads the {@code //ldml/collations} paths of a {@link CLDRFile} and renders them, one string per
 * collation type, as textual rules built from the relation tokens {@code &}, {@code <}, {@code <<},
 * {@code <<<}, {@code =} and {@code /}.
 *
 * <p>Typical use: call {@link #set(CLDRFile)}, walk the collation types with {@link #iterator()},
 * and fetch each rule string with {@link #getRules(Object)}.
 *
 * <p>Not thread-safe: instances accumulate into a shared per-instance buffer, and {@link
 * #quote(String)} uses a shared static scratch buffer and lazily initialized static sets.
 */
public class ExtractCollationRules {
    /** Rule text keyed by collation type; a {@link TreeMap}, so keys iterate in sorted order. */
    Map<String, String> type_rules = new TreeMap<>();

    /** Accumulates the rule text of the collation type currently being read by {@code set}. */
    StringBuffer rules = new StringBuffer();

    /**
     * Rebuilds the type-to-rules map from the collation paths of {@code file}, discarding any
     * previously extracted rules.
     *
     * <p>Paths are visited in the order produced by {@code file.iterator("//ldml/collations",
     * file.getComparator())}. Whenever the {@code type} attribute of the {@code collation} element
     * changes, the text accumulated so far is stored under the type it was accumulated for and a
     * fresh accumulation starts. Paths that cannot be interpreted are reported on {@code
     * System.out} and skipped.
     *
     * @param file the CLDR file whose collation paths are read
     * @return this object, to allow call chaining
     */
    public ExtractCollationRules set(CLDRFile file) {
        type_rules.clear();
        rules.setLength(0);

        String lastType = "";
        boolean sawTypedPath = false;
        String context = null;

        for (Iterator<String> it = file.iterator("//ldml/collations", file.getComparator());
                it.hasNext(); ) {
            String path = it.next();
            String value = file.getStringValue(path);
            XPathParts parts = XPathParts.getFrozenInstance(path);

            String type = parts.findAttributeValue("collation", "type");
            if (type == null) {
                // NOTE(review): presumably a path without a typed <collation> element; it
                // cannot be filed under any key (and previously caused a NullPointerException).
                System.out.println("Couldn't handle: " + path + "\t" + value);
                continue;
            }
            if (!type.equals(lastType)) {
                // Flush what was gathered under the type it belongs to BEFORE switching;
                // nothing has been gathered yet when the very first type is encountered.
                if (sawTypedPath) {
                    type_rules.put(lastType, rules.toString());
                }
                lastType = type;
                rules.setLength(0);
            }
            sawTypedPath = true;

            String mainType = parts.getElement(3);
            // base?, settings?, suppress_contractions?, optimize?
            // x: context?, ( p | pc | s | sc | t | tc | i | ic )*, extend?
            if (mainType.equals("settings")) {
                writeSettings(parts.getAttributes(3), rules);
                continue;
            }
            if (!mainType.equals("rules")) {
                System.out.println("Couldn't handle: " + path + "\t" + value);
                continue;
            }

            // Dispatch on the first letter of the rule element; for an "x" wrapper element the
            // element one level deeper decides.
            String ruleType = parts.getElement(4);
            char c = ruleType.charAt(0);
            if (c == 'x') {
                ruleType = parts.getElement(5);
                c = ruleType.charAt(0);
            }
            // A second letter 'c' (pc, sc, tc, ic) means the value is a run of characters that
            // each get their own relation.
            boolean isMultiple = ruleType.length() > 1 && ruleType.charAt(1) == 'c';

            // A context applies only to the rule element immediately following it.
            String lastContext = context;
            context = null;

            switch (c) {
                case 'r':
                    appendOrdering("&", null, value, false, true);
                    break;
                case 'p':
                    appendOrdering("<", lastContext, value, isMultiple, true);
                    break;
                case 's':
                    appendOrdering("<<", lastContext, value, isMultiple, true);
                    break;
                case 't':
                    appendOrdering("<<<", lastContext, value, isMultiple, false);
                    break;
                case 'i':
                    appendOrdering("=", lastContext, value, isMultiple, false);
                    break;
                case 'c':
                    context = value;
                    break;
                case 'e':
                    appendOrdering("/", null, value, false, false);
                    break;
                default:
                    System.out.println("Couldn't handle: " + path + "\t" + value);
            }
        }
        // Store the rules of the last type seen (or an empty entry under "" if there were none).
        type_rules.put(lastType, rules.toString());
        return this;
    }

    /**
     * Appends one relation (or, for multi-character values, one relation per code point) to the
     * rule buffer.
     *
     * @param relation the relation token to emit, e.g. {@code "<"} or {@code "&"}
     * @param context optional context string emitted (quoted) before the operand; may be null
     * @param valueAfter the operand, or the run of operands when {@code isMultiple} is true
     * @param isMultiple if true, every code point of {@code valueAfter} gets its own relation
     * @param lineBreakBefore if true each relation starts on a new line, otherwise it is preceded
     *     by a single space
     */
    private void appendOrdering(
            String relation,
            String context,
            String valueAfter,
            boolean isMultiple,
            boolean lineBreakBefore) {
        if (!isMultiple) {
            appendRelation(relation, context, valueAfter, lineBreakBefore);
            return;
        }
        int cp;
        for (int i = 0; i < valueAfter.length(); i += UTF16.getCharCount(cp)) {
            cp = UTF16.charAt(valueAfter, i);
            appendRelation(relation, context, UTF16.valueOf(cp), lineBreakBefore);
        }
    }

    /** Appends a single "separator relation [context] operand" group to the rule buffer. */
    private void appendRelation(
            String relation, String context, String operand, boolean lineBreakBefore) {
        if (lineBreakBefore) {
            rules.append(CldrUtility.LINE_SEPARATOR);
        } else {
            rules.append(' ');
        }
        rules.append(relation);
        if (context != null) {
            rules.append(' ').append(quote(context));
        }
        rules.append(' ').append(quote(operand));
    }

    /**
     * Writes each attribute as a bracketed {@code [name value]} line.
     *
     * @param attributes attribute names mapped to their values
     * @param results buffer the lines are appended to
     */
    private void writeSettings(Map<String, String> attributes, StringBuffer results) {
        for (Map.Entry<String, String> setting : attributes.entrySet()) {
            // TODO fix different cases
            results.append('[')
                    .append(setting.getKey())
                    .append(' ')
                    .append(setting.getValue())
                    .append(']')
                    .append(CldrUtility.LINE_SEPARATOR);
        }
    }

    /**
     * Returns an iterator over the collation types for which rules were extracted.
     *
     * @return iterator over the keys of the type-to-rules map
     */
    public Iterator<String> iterator() {
        return type_rules.keySet().iterator();
    }

    /**
     * Returns the rule text stored for a collation type.
     *
     * @param key the collation type, as returned by {@link #iterator()}
     * @return the rule text, or null if nothing is stored under {@code key}
     */
    public String getRules(Object key) {
        return type_rules.get(key);
    }

    /** Scratch buffer reused by every {@link #quote(String)} call; makes quote non-reentrant. */
    static StringBuffer quoteOperandBuffer = new StringBuffer(); // faster

    /** Code points that must be quoted in a rule; created on the first quote call. */
    static UnicodeSet needsQuoting = null;

    /** Code points written as escapes inside quotes; created on the first quote call. */
    static UnicodeSet needsUnicodeForm = null;

    /**
     * Returns {@code s}, composed with {@link Normalizer#compose(String, boolean)}, with every run
     * of characters from the needs-quoting set wrapped in apostrophes.
     *
     * <p>An apostrophe is always written doubled ({@code ''}). Inside a quoted run, members of the
     * needs-escaping set (CR, LF and the {@code zl}/{@code zp} classes) are written as {@code
     * \\uXXXX} or {@code \\UXXXXXXXX} escapes. If nothing needed quoting, the composed string is
     * returned as is.
     *
     * @param s the operand text to quote
     * @return the composed and, where required, quoted and escaped text
     */
    static final String quote(String s) {
        if (needsQuoting == null) {
            // Per the original author's note, the intent is to leave ASCII letters and digits,
            // and non-whitespace characters from U+00A0 upward, unquoted.
            needsQuoting = new UnicodeSet("[[:whitespace:][:c:][:z:][:ascii:]-[a-zA-Z0-9]]");
            needsUnicodeForm = new UnicodeSet("[\\u000d\\u000a[:zl:][:zp:]]");
        }
        s = Normalizer.compose(s, false);
        quoteOperandBuffer.setLength(0);
        boolean noQuotes = true;
        boolean inQuote = false;
        int cp;
        for (int i = 0; i < s.length(); i += UTF16.getCharCount(cp)) {
            cp = UTF16.charAt(s, i);

            if (!needsQuoting.contains(cp)) {
                // Plain character: close any open quoted run, then copy it through.
                if (inQuote) {
                    quoteOperandBuffer.append('\'');
                    inQuote = false;
                }
                quoteOperandBuffer.append(UTF16.valueOf(cp));
                continue;
            }

            noQuotes = false;
            if (cp == '\'') {
                // A doubled apostrophe is emitted whether or not a quoted run is open.
                quoteOperandBuffer.append("''");
                continue;
            }
            if (!inQuote) {
                quoteOperandBuffer.append('\'');
                inQuote = true;
            }
            if (!needsUnicodeForm.contains(cp)) {
                quoteOperandBuffer.append(UTF16.valueOf(cp)); // cp != 0x2028
            } else if (cp > 0xFFFF) {
                quoteOperandBuffer.append("\\U").append(hex(cp, 8));
            } else if (cp <= 0x20 || cp > 0x7E) {
                quoteOperandBuffer.append("\\u").append(hex(cp, 4));
            } else {
                quoteOperandBuffer.append(UTF16.valueOf(cp));
            }
        }
        if (inQuote) {
            quoteOperandBuffer.append('\'');
        }
        if (noQuotes) return s; // faster
        return quoteOperandBuffer.toString();
    }

    /**
     * Formats {@code i} as upper-case hexadecimal, left-padded with zeros to at least {@code
     * places} digits. Negative values are rendered as a minus sign followed by the padded
     * magnitude.
     *
     * @param i the value to format
     * @param places the minimum number of hex digits; any width is accepted, and values no larger
     *     than the natural digit count add no padding
     * @return the formatted string; {@code Long.MIN_VALUE} always yields {@code
     *     "-8000000000000000"} regardless of {@code places}
     */
    public static String hex(long i, int places) {
        // The magnitude of Long.MIN_VALUE is not representable, so it is special-cased.
        if (i == Long.MIN_VALUE) return "-8000000000000000";
        boolean negative = i < 0;
        String digits = Long.toString(negative ? -i : i, 16).toUpperCase();

        StringBuilder result = new StringBuilder();
        if (negative) {
            result.append('-');
        }
        // Pad with a loop rather than a fixed 16-zero template, so places > 16 cannot overrun.
        for (int pad = places - digits.length(); pad > 0; --pad) {
            result.append('0');
        }
        return result.append(digits).toString();
    }
}