1 package org.unicode.cldr.util; 2 3 import java.io.IOException; 4 import java.io.Reader; 5 6 /** 7 * Extremely simple class for parsing HTML. Extremely lenient. Call next() until DONE is returned. 8 * 9 * <p>Element content will be returned in the following sequence: 10 * 11 * <pre> 12 * ELEMENT_START 13 * ELEMENT strong 14 * ELEMENT_END 15 * ELEMENT_CONTENT Alphabetic code 16 * ELEMENT_START 17 * ELEMENT_POP 18 * ELEMENT strong 19 * ELEMENT_END 20 * </pre> 21 * 22 * while attributes will be returned as: 23 * 24 * <pre> 25 * ELEMENT_START 26 * ELEMENT div 27 * ATTRIBUTE id 28 * ATTRIBUTE_CONTENT mainContent 29 * ELEMENT_END 30 * </pre> 31 * 32 * @author markdavis 33 */ 34 public class SimpleHtmlParser { 35 public enum Type { 36 DONE, 37 /** No contents, set when we hit < */ 38 ELEMENT_START, 39 /** '<' contents/b */ 40 ELEMENT, 41 /** '<element/bcontents(=...) */ 42 ATTRIBUTE, 43 /** attribute=['"]contents['"] */ 44 ATTRIBUTE_CONTENT, 45 /** No contents, set when we hit '>' */ 46 ELEMENT_END, 47 /** No contents, set when we hit '/' after '<' */ 48 ELEMENT_POP, 49 /** '<!--' contents '-->' */ 50 QUOTE, 51 /** '<element>' contents '</element>' */ 52 ELEMENT_CONTENT 53 } 54 55 private enum State { 56 BASE, 57 IN_ELEMENT, 58 AFTER_ELEMENT, 59 IN_CONTENT, 60 IN_ATTRIBUTE, 61 IN_ATTRIBUTE_CONTENT, 62 IN_ATTRIBUTE_CONTENT1, 63 IN_ATTRIBUTE_CONTENT2, 64 ELEMENT_STOP, 65 IN_QUOTE 66 } 67 68 private Reader input; 69 70 private State state; 71 72 private Type bufferedReturn; 73 74 private int lineCount; 75 setReader(Reader input)76 public SimpleHtmlParser setReader(Reader input) { 77 this.input = input; 78 state = State.IN_CONTENT; 79 bufferedReturn = null; 80 lineCount = 0; 81 return this; 82 } 83 getLineCount()84 public int getLineCount() { 85 return lineCount; 86 } 87 next(StringBuilder result)88 public Type next(StringBuilder result) throws IOException { 89 result.setLength(0); 90 if (bufferedReturn != null) { 91 if (bufferedReturn == Type.DONE) { // once DONE, stay DONE 92 return Type.DONE; 93 } 94 Type temp = bufferedReturn; 95 bufferedReturn = null; 96 return temp; 97 } 98 while (true) { 99 char ch; 100 { 101 int chi = input.read(); 102 if (chi < 0) { 103 bufferedReturn = Type.DONE; 104 chi = 0; 105 } 106 ch = (char) chi; 107 if (ch == '\n') { 108 ++lineCount; 109 } 110 } 111 112 switch (state) { 113 case BASE: 114 if (ch == 0xFEFF) break; 115 // fall through! 116 117 case IN_CONTENT: 118 if (ch == '<') { 119 state = State.IN_ELEMENT; 120 bufferedReturn = Type.ELEMENT_START; 121 return Type.ELEMENT_CONTENT; 122 } 123 if (ch == 0) { 124 return Type.ELEMENT_CONTENT; 125 } 126 result.append(ch); 127 break; 128 129 case IN_ELEMENT: 130 if (ch <= ' ') { 131 if (equals(result, "!--")) { 132 state = State.IN_QUOTE; 133 result.setLength(0); 134 break; 135 } 136 state = State.AFTER_ELEMENT; 137 return Type.ELEMENT; 138 } 139 if (ch == '>') { 140 state = State.IN_CONTENT; 141 bufferedReturn = Type.ELEMENT_END; 142 return Type.ELEMENT; 143 } 144 if (ch == '/') { 145 return Type.ELEMENT_POP; 146 } 147 result.append(ch); 148 break; 149 150 case AFTER_ELEMENT: 151 if (ch <= ' ') break; 152 if (ch == '>') { 153 state = State.IN_CONTENT; 154 return Type.ELEMENT_END; 155 } 156 result.append(ch); 157 state = State.IN_ATTRIBUTE; 158 break; 159 160 case IN_ATTRIBUTE: 161 if (ch <= ' ') { 162 state = State.AFTER_ELEMENT; 163 return Type.ATTRIBUTE; 164 } 165 if (ch == '>') { 166 state = State.IN_CONTENT; 167 bufferedReturn = Type.ELEMENT_END; 168 return Type.ATTRIBUTE; 169 } 170 if (ch == '=') { 171 state = State.IN_ATTRIBUTE_CONTENT; 172 return Type.ATTRIBUTE; 173 } 174 result.append(ch); 175 break; 176 177 case IN_ATTRIBUTE_CONTENT: 178 if (ch <= ' ') { 179 break; 180 } 181 if (ch == '>') { 182 state = State.IN_CONTENT; 183 bufferedReturn = Type.ELEMENT_END; 184 return Type.ATTRIBUTE_CONTENT; 185 } 186 if (ch == '\'') { 187 state = State.IN_ATTRIBUTE_CONTENT1; 188 break; 189 } 190 if (ch == '"') { 191 state = State.IN_ATTRIBUTE_CONTENT2; 192 break; 193 } 194 result.append(ch); 195 break; 196 197 case IN_ATTRIBUTE_CONTENT1: 198 if (ch == 0 || ch == '\'') { 199 state = State.AFTER_ELEMENT; 200 return Type.ATTRIBUTE_CONTENT; 201 } 202 result.append(ch); 203 break; 204 205 case IN_ATTRIBUTE_CONTENT2: 206 if (ch == 0 || ch == '"') { 207 state = State.AFTER_ELEMENT; 208 return Type.ATTRIBUTE_CONTENT; 209 } 210 result.append(ch); 211 break; 212 213 case IN_QUOTE: 214 if (ch == 0) { 215 state = State.IN_CONTENT; 216 return Type.QUOTE; 217 } 218 if (ch == '>' && endsWith(result, "--")) { 219 result.setLength(result.length() - 2); 220 state = State.IN_CONTENT; 221 return Type.QUOTE; 222 } 223 result.append(ch); 224 break; 225 default: 226 } 227 } 228 } 229 endsWith(CharSequence a, CharSequence b)230 public static final boolean endsWith(CharSequence a, CharSequence b) { 231 int aStart = a.length() - b.length(); 232 if (aStart < 0) { 233 return false; 234 } 235 return regionEquals(a, aStart, b, 0, b.length()); 236 } 237 equals(CharSequence a, CharSequence b)238 public static final boolean equals(CharSequence a, CharSequence b) { 239 int len = a.length(); 240 if (len != b.length()) { 241 return false; 242 } 243 return regionEquals(a, 0, b, 0, len); 244 } 245 regionEquals(CharSequence a, int i, CharSequence b, int j, int len)246 public static boolean regionEquals(CharSequence a, int i, CharSequence b, int j, int len) { 247 for (; --len >= 0; ++i, ++j) { 248 if (a.charAt(i) != b.charAt(j)) { 249 return false; 250 } 251 } 252 return true; 253 } 254 writeResult(Type type, StringBuilder result, Appendable writer)255 public static void writeResult(Type type, StringBuilder result, Appendable writer) 256 throws IOException { 257 switch (type) { 258 case ELEMENT: 259 writer.append(result); 260 break; 261 case ELEMENT_START: 262 writer.append('<'); 263 break; 264 case ELEMENT_END: 265 writer.append('>'); 266 break; 267 case ATTRIBUTE: 268 writer.append(' ').append(result); 269 break; 270 case ATTRIBUTE_CONTENT: 271 writer.append("=\"").append(result).append('"'); 272 break; 273 case ELEMENT_CONTENT: 274 writer.append(result); 275 break; 276 case ELEMENT_POP: 277 writer.append('/'); 278 break; 279 case QUOTE: 280 writer.append(result); 281 break; 282 case DONE: 283 break; 284 default: 285 throw new IllegalArgumentException("Missing case: " + type); 286 } 287 } 288 } 289