xref: /aosp_15_r20/external/cldr/tools/cldr-code/src/main/java/org/unicode/cldr/util/SimpleHtmlParser.java (revision 912701f9769bb47905792267661f0baf2b85bed5)
1 package org.unicode.cldr.util;
2 
3 import java.io.IOException;
4 import java.io.Reader;
5 
6 /**
7  * Extremely simple class for parsing HTML. Extremely lenient. Call next() until DONE is returned.
8  *
9  * <p>Element content will be returned in the following sequence:
10  *
11  * <pre>
12  *  ELEMENT_START
13  *  ELEMENT strong
14  *  ELEMENT_END
15  *  ELEMENT_CONTENT Alphabetic code
16  *  ELEMENT_START
17  *  ELEMENT_POP
18  *  ELEMENT strong
19  *  ELEMENT_END
20  * </pre>
21  *
22  * while attributes will be returned as:
23  *
24  * <pre>
25  *  ELEMENT_START
26  *  ELEMENT div
27  *  ATTRIBUTE id
28  *  ATTRIBUTE_CONTENT mainContent
29  *  ELEMENT_END
30  * </pre>
31  *
32  * @author markdavis
33  */
public class SimpleHtmlParser {
    /** Kinds of token reported by {@link #next(StringBuilder)}. */
    public enum Type {
        /** End of input. Once returned, every later call returns it again. */
        DONE,
        /** No contents, set when we hit '&lt;'. */
        ELEMENT_START,
        /** The element name: the characters after '&lt;' up to whitespace or '&gt;'. */
        ELEMENT,
        /** An attribute name: characters inside the tag up to whitespace, '=' or '&gt;'. */
        ATTRIBUTE,
        /**
         * An attribute value, with surrounding single or double quotes removed. An unquoted value
         * runs up to '&gt;'; whitespace inside an unquoted value is dropped, not treated as a
         * terminator.
         */
        ATTRIBUTE_CONTENT,
        /** No contents, set when we hit '&gt;'. */
        ELEMENT_END,
        /**
         * Set when we hit '/' while reading an element name. Any name characters read before the
         * '/' (as in {@code <br/>}) are left in the result buffer with this token.
         */
        ELEMENT_POP,
        /**
         * The text of a comment: '&lt;!--' contents '--&gt;'. Only recognized when '!--' is
         * followed by a whitespace/control character.
         */
        QUOTE,
        /** Text outside of tags: '&lt;element&gt;' contents '&lt;/element&gt;'. May be empty. */
        ELEMENT_CONTENT
    }

    /** Internal scanner states. */
    private enum State {
        /** Like IN_CONTENT but skips U+FEFF; no code in this class ever enters this state. */
        BASE,
        /** Reading the element name just after '&lt;'. */
        IN_ELEMENT,
        /** Inside a tag, between the element name / attributes. */
        AFTER_ELEMENT,
        /** Reading text outside of tags. */
        IN_CONTENT,
        /** Reading an attribute name. */
        IN_ATTRIBUTE,
        /** Just after '=', before the value's first character. Also reads unquoted values. */
        IN_ATTRIBUTE_CONTENT,
        /** Reading a single-quoted attribute value. */
        IN_ATTRIBUTE_CONTENT1,
        /** Reading a double-quoted attribute value. */
        IN_ATTRIBUTE_CONTENT2,
        /** Not used by any code in this class. */
        ELEMENT_STOP,
        /** Reading comment text after '&lt;!--'. */
        IN_QUOTE
    }

    /** Source of characters; set by {@link #setReader(Reader)}. */
    private Reader input;

    private State state;

    /**
     * A token to hand out on the next call before reading any more input, or null. Set to DONE at
     * end of input, after which it is never cleared.
     */
    private Type bufferedReturn;

    /** Number of '\n' characters read so far. */
    private int lineCount;

    /**
     * Sets the input and resets all parsing state, so the same parser can be reused.
     *
     * @param input the reader to parse from; it is not closed by this class
     * @return this parser, for chaining
     */
    public SimpleHtmlParser setReader(Reader input) {
        this.input = input;
        state = State.IN_CONTENT;
        bufferedReturn = null;
        lineCount = 0;
        return this;
    }

    /**
     * @return the number of newline ('\n') characters read since the last {@link
     *     #setReader(Reader)}
     */
    public int getLineCount() {
        return lineCount;
    }

    /**
     * Reads the next token from the input.
     *
     * @param result buffer that is cleared on entry and then filled with the token's text (empty
     *     for tokens that carry no text)
     * @return the type of the token read, or {@link Type#DONE} at end of input (and on every call
     *     after that)
     * @throws IOException if the underlying reader fails
     * @throws IllegalStateException if {@link #setReader(Reader)} has not been called
     */
    public Type next(StringBuilder result) throws IOException {
        result.setLength(0);
        if (bufferedReturn != null) {
            if (bufferedReturn == Type.DONE) { // once DONE, stay DONE
                return Type.DONE;
            }
            Type temp = bufferedReturn;
            bufferedReturn = null;
            return temp;
        }
        if (input == null) {
            throw new IllegalStateException("setReader() must be called before next()");
        }
        while (true) {
            int chi = input.read();
            // End of input is presented to the state machine as the character 0, with DONE
            // buffered so that it is returned after whatever token is still pending.
            final boolean atEof = chi < 0;
            if (atEof) {
                bufferedReturn = Type.DONE;
                chi = 0;
            }
            final char ch = (char) chi;
            if (ch == '\n') {
                ++lineCount;
            }

            switch (state) {
                case BASE:
                    if (ch == 0xFEFF) break;
                    // fall through!

                case IN_CONTENT:
                    if (ch == '<') {
                        state = State.IN_ELEMENT;
                        bufferedReturn = Type.ELEMENT_START;
                        return Type.ELEMENT_CONTENT;
                    }
                    if (ch == 0) {
                        return Type.ELEMENT_CONTENT;
                    }
                    result.append(ch);
                    break;

                case IN_ELEMENT:
                    if (ch <= ' ') {
                        if (equals(result, "!--")) {
                            state = State.IN_QUOTE;
                            result.setLength(0);
                            break;
                        }
                        state = State.AFTER_ELEMENT;
                        return Type.ELEMENT;
                    }
                    if (ch == '>') {
                        state = State.IN_CONTENT;
                        bufferedReturn = Type.ELEMENT_END;
                        return Type.ELEMENT;
                    }
                    if (ch == '/') {
                        return Type.ELEMENT_POP;
                    }
                    result.append(ch);
                    break;

                case AFTER_ELEMENT:
                    if (atEof) {
                        // Nothing is pending here. Without this check the EOF marker (0) would be
                        // skipped as whitespace and the loop would re-read EOF forever.
                        return Type.DONE;
                    }
                    if (ch <= ' ') break;
                    if (ch == '>') {
                        state = State.IN_CONTENT;
                        return Type.ELEMENT_END;
                    }
                    result.append(ch);
                    state = State.IN_ATTRIBUTE;
                    break;

                case IN_ATTRIBUTE:
                    if (ch <= ' ') {
                        state = State.AFTER_ELEMENT;
                        return Type.ATTRIBUTE;
                    }
                    if (ch == '>') {
                        state = State.IN_CONTENT;
                        bufferedReturn = Type.ELEMENT_END;
                        return Type.ATTRIBUTE;
                    }
                    if (ch == '=') {
                        state = State.IN_ATTRIBUTE_CONTENT;
                        return Type.ATTRIBUTE;
                    }
                    result.append(ch);
                    break;

                case IN_ATTRIBUTE_CONTENT:
                    if (atEof) {
                        // Emit whatever value was collected (possibly empty), as the quoted
                        // states do, instead of skipping the EOF marker as whitespace forever.
                        return Type.ATTRIBUTE_CONTENT;
                    }
                    if (ch <= ' ') {
                        break;
                    }
                    if (ch == '>') {
                        state = State.IN_CONTENT;
                        bufferedReturn = Type.ELEMENT_END;
                        return Type.ATTRIBUTE_CONTENT;
                    }
                    if (ch == '\'') {
                        state = State.IN_ATTRIBUTE_CONTENT1;
                        break;
                    }
                    if (ch == '"') {
                        state = State.IN_ATTRIBUTE_CONTENT2;
                        break;
                    }
                    result.append(ch);
                    break;

                case IN_ATTRIBUTE_CONTENT1:
                    if (ch == 0 || ch == '\'') {
                        state = State.AFTER_ELEMENT;
                        return Type.ATTRIBUTE_CONTENT;
                    }
                    result.append(ch);
                    break;

                case IN_ATTRIBUTE_CONTENT2:
                    if (ch == 0 || ch == '"') {
                        state = State.AFTER_ELEMENT;
                        return Type.ATTRIBUTE_CONTENT;
                    }
                    result.append(ch);
                    break;

                case IN_QUOTE:
                    if (ch == 0) {
                        state = State.IN_CONTENT;
                        return Type.QUOTE;
                    }
                    if (ch == '>' && endsWith(result, "--")) {
                        // Drop the "--" of the closing "-->" from the comment text.
                        result.setLength(result.length() - 2);
                        state = State.IN_CONTENT;
                        return Type.QUOTE;
                    }
                    result.append(ch);
                    break;

                default:
                    // A state with no handling consumes input silently; make sure it still
                    // terminates at end of input.
                    if (atEof) {
                        return Type.DONE;
                    }
                    break;
            }
        }
    }

    /**
     * Tests whether one character sequence ends with another.
     *
     * @param a the sequence to examine
     * @param b the suffix to look for
     * @return true if the last {@code b.length()} characters of {@code a} equal {@code b}; false
     *     if {@code a} is shorter than {@code b}
     */
    public static final boolean endsWith(CharSequence a, CharSequence b) {
        int aStart = a.length() - b.length();
        if (aStart < 0) {
            return false;
        }
        return regionEquals(a, aStart, b, 0, b.length());
    }

    /**
     * Tests whether two character sequences have the same length and the same characters.
     *
     * @param a first sequence
     * @param b second sequence
     * @return true if both contain exactly the same characters in the same order
     */
    public static final boolean equals(CharSequence a, CharSequence b) {
        int len = a.length();
        if (len != b.length()) {
            return false;
        }
        return regionEquals(a, 0, b, 0, len);
    }

    /**
     * Compares {@code len} characters of {@code a} starting at {@code i} with {@code len}
     * characters of {@code b} starting at {@code j}. No bounds checking is done here; the caller
     * must make sure both regions lie inside their sequences.
     *
     * @param a first sequence
     * @param i start index in {@code a}
     * @param b second sequence
     * @param j start index in {@code b}
     * @param len number of characters to compare; zero or negative compares nothing
     * @return true if every compared pair of characters is equal
     */
    public static boolean regionEquals(CharSequence a, int i, CharSequence b, int j, int len) {
        for (; --len >= 0; ++i, ++j) {
            if (a.charAt(i) != b.charAt(j)) {
                return false;
            }
        }
        return true;
    }

    /**
     * Writes one token, as returned by {@link #next(StringBuilder)}, in HTML-like form. Attribute
     * values are always written in double quotes. QUOTE and ELEMENT_CONTENT text is written as-is,
     * without any delimiters or escaping.
     *
     * @param type the token type
     * @param result the token text; ignored for types that carry no text
     * @param writer destination for the output
     * @throws IOException if the writer fails
     * @throws IllegalArgumentException if {@code type} is not handled
     */
    public static void writeResult(Type type, StringBuilder result, Appendable writer)
            throws IOException {
        switch (type) {
            case ELEMENT:
                writer.append(result);
                break;
            case ELEMENT_START:
                writer.append('<');
                break;
            case ELEMENT_END:
                writer.append('>');
                break;
            case ATTRIBUTE:
                writer.append(' ').append(result);
                break;
            case ATTRIBUTE_CONTENT:
                writer.append("=\"").append(result).append('"');
                break;
            case ELEMENT_CONTENT:
                writer.append(result);
                break;
            case ELEMENT_POP:
                writer.append('/');
                break;
            case QUOTE:
                writer.append(result);
                break;
            case DONE:
                break;
            default:
                throw new IllegalArgumentException("Missing case: " + type);
        }
    }
}
289