1 /*
2 * HTMLparser.c : an HTML 4.0 non-verifying parser
3 *
4 * See Copyright for the status of this software.
5 *
6 * [email protected]
7 */
8
9 #define IN_LIBXML
10 #include "libxml.h"
11 #ifdef LIBXML_HTML_ENABLED
12
13 #include <string.h>
14 #include <ctype.h>
15 #include <stdlib.h>
16
17 #include <libxml/HTMLparser.h>
18 #include <libxml/xmlmemory.h>
19 #include <libxml/tree.h>
20 #include <libxml/parser.h>
21 #include <libxml/parserInternals.h>
22 #include <libxml/xmlerror.h>
23 #include <libxml/HTMLtree.h>
24 #include <libxml/entities.h>
25 #include <libxml/encoding.h>
26 #include <libxml/xmlIO.h>
27 #include <libxml/uri.h>
28
29 #include "private/buf.h"
30 #include "private/enc.h"
31 #include "private/error.h"
32 #include "private/html.h"
33 #include "private/io.h"
34 #include "private/parser.h"
35 #include "private/tree.h"
36
37 #define HTML_MAX_NAMELEN 1000
38 #define HTML_PARSER_BIG_BUFFER_SIZE 1000
39 #define HTML_PARSER_BUFFER_SIZE 100
40
41 static int htmlOmittedDefaultValue = 1;
42
43 xmlChar * htmlDecodeEntities(htmlParserCtxtPtr ctxt, int len,
44 xmlChar end, xmlChar end2, xmlChar end3);
45 static void htmlParseComment(htmlParserCtxtPtr ctxt);
46
47 /************************************************************************
48 * *
49 * Some factorized error routines *
50 * *
51 ************************************************************************/
52
53 /**
54 * htmlErrMemory:
55 * @ctxt: an HTML parser context
56 * @extra: extra information
57 *
58 * Handle a redefinition of attribute error
59 */
60 static void
htmlErrMemory(xmlParserCtxtPtr ctxt)61 htmlErrMemory(xmlParserCtxtPtr ctxt)
62 {
63 xmlCtxtErrMemory(ctxt);
64 }
65
66 /**
67 * htmlParseErr:
68 * @ctxt: an HTML parser context
69 * @error: the error number
70 * @msg: the error message
71 * @str1: string infor
72 * @str2: string infor
73 *
74 * Handle a fatal parser error, i.e. violating Well-Formedness constraints
75 */
76 static void LIBXML_ATTR_FORMAT(3,0)
htmlParseErr(xmlParserCtxtPtr ctxt,xmlParserErrors error,const char * msg,const xmlChar * str1,const xmlChar * str2)77 htmlParseErr(xmlParserCtxtPtr ctxt, xmlParserErrors error,
78 const char *msg, const xmlChar *str1, const xmlChar *str2)
79 {
80 xmlCtxtErr(ctxt, NULL, XML_FROM_HTML, error, XML_ERR_ERROR,
81 str1, str2, NULL, 0, msg, str1, str2);
82 }
83
84 /**
85 * htmlParseErrInt:
86 * @ctxt: an HTML parser context
87 * @error: the error number
88 * @msg: the error message
89 * @val: integer info
90 *
91 * Handle a fatal parser error, i.e. violating Well-Formedness constraints
92 */
93 static void LIBXML_ATTR_FORMAT(3,0)
htmlParseErrInt(xmlParserCtxtPtr ctxt,xmlParserErrors error,const char * msg,int val)94 htmlParseErrInt(xmlParserCtxtPtr ctxt, xmlParserErrors error,
95 const char *msg, int val)
96 {
97 xmlCtxtErr(ctxt, NULL, XML_FROM_HTML, error, XML_ERR_ERROR,
98 NULL, NULL, NULL, val, msg, val);
99 }
100
101 /************************************************************************
102 * *
103 * Parser stacks related functions and macros *
104 * *
105 ************************************************************************/
106
107 /**
108 * htmlnamePush:
109 * @ctxt: an HTML parser context
110 * @value: the element name
111 *
112 * Pushes a new element name on top of the name stack
113 *
114 * Returns -1 in case of error, the index in the stack otherwise
115 */
116 static int
htmlnamePush(htmlParserCtxtPtr ctxt,const xmlChar * value)117 htmlnamePush(htmlParserCtxtPtr ctxt, const xmlChar * value)
118 {
119 if ((ctxt->html < 3) && (xmlStrEqual(value, BAD_CAST "head")))
120 ctxt->html = 3;
121 if ((ctxt->html < 10) && (xmlStrEqual(value, BAD_CAST "body")))
122 ctxt->html = 10;
123 if (ctxt->nameNr >= ctxt->nameMax) {
124 size_t newSize = ctxt->nameMax * 2;
125 const xmlChar **tmp;
126
127 tmp = xmlRealloc((xmlChar **) ctxt->nameTab,
128 newSize * sizeof(ctxt->nameTab[0]));
129 if (tmp == NULL) {
130 htmlErrMemory(ctxt);
131 return (-1);
132 }
133 ctxt->nameTab = tmp;
134 ctxt->nameMax = newSize;
135 }
136 ctxt->nameTab[ctxt->nameNr] = value;
137 ctxt->name = value;
138 return (ctxt->nameNr++);
139 }
140 /**
141 * htmlnamePop:
142 * @ctxt: an HTML parser context
143 *
144 * Pops the top element name from the name stack
145 *
146 * Returns the name just removed
147 */
148 static const xmlChar *
htmlnamePop(htmlParserCtxtPtr ctxt)149 htmlnamePop(htmlParserCtxtPtr ctxt)
150 {
151 const xmlChar *ret;
152
153 if (ctxt->nameNr <= 0)
154 return (NULL);
155 ctxt->nameNr--;
156 if (ctxt->nameNr < 0)
157 return (NULL);
158 if (ctxt->nameNr > 0)
159 ctxt->name = ctxt->nameTab[ctxt->nameNr - 1];
160 else
161 ctxt->name = NULL;
162 ret = ctxt->nameTab[ctxt->nameNr];
163 ctxt->nameTab[ctxt->nameNr] = NULL;
164 return (ret);
165 }
166
167 /**
168 * htmlNodeInfoPush:
169 * @ctxt: an HTML parser context
170 * @value: the node info
171 *
172 * Pushes a new element name on top of the node info stack
173 *
174 * Returns 0 in case of error, the index in the stack otherwise
175 */
176 static int
htmlNodeInfoPush(htmlParserCtxtPtr ctxt,htmlParserNodeInfo * value)177 htmlNodeInfoPush(htmlParserCtxtPtr ctxt, htmlParserNodeInfo *value)
178 {
179 if (ctxt->nodeInfoNr >= ctxt->nodeInfoMax) {
180 if (ctxt->nodeInfoMax == 0)
181 ctxt->nodeInfoMax = 5;
182 ctxt->nodeInfoMax *= 2;
183 ctxt->nodeInfoTab = (htmlParserNodeInfo *)
184 xmlRealloc((htmlParserNodeInfo *)ctxt->nodeInfoTab,
185 ctxt->nodeInfoMax *
186 sizeof(ctxt->nodeInfoTab[0]));
187 if (ctxt->nodeInfoTab == NULL) {
188 htmlErrMemory(ctxt);
189 return (0);
190 }
191 }
192 ctxt->nodeInfoTab[ctxt->nodeInfoNr] = *value;
193 ctxt->nodeInfo = &ctxt->nodeInfoTab[ctxt->nodeInfoNr];
194 return (ctxt->nodeInfoNr++);
195 }
196
197 /**
198 * htmlNodeInfoPop:
199 * @ctxt: an HTML parser context
200 *
201 * Pops the top element name from the node info stack
202 *
203 * Returns 0 in case of error, the pointer to NodeInfo otherwise
204 */
205 static htmlParserNodeInfo *
htmlNodeInfoPop(htmlParserCtxtPtr ctxt)206 htmlNodeInfoPop(htmlParserCtxtPtr ctxt)
207 {
208 if (ctxt->nodeInfoNr <= 0)
209 return (NULL);
210 ctxt->nodeInfoNr--;
211 if (ctxt->nodeInfoNr < 0)
212 return (NULL);
213 if (ctxt->nodeInfoNr > 0)
214 ctxt->nodeInfo = &ctxt->nodeInfoTab[ctxt->nodeInfoNr - 1];
215 else
216 ctxt->nodeInfo = NULL;
217 return &ctxt->nodeInfoTab[ctxt->nodeInfoNr];
218 }
219
/*
 * Macros for accessing the content. Those should be used only by the parser,
 * and not exported. All of them expect a parser context variable named
 * `ctxt` to be in scope at the point of use.
 *
 * Dirty macros, i.e. one needs to make assumptions about the context to
 * use them
 *
 *   CUR_PTR  the current pointer (ctxt->input->cur) to the xmlChar to be
 *            parsed.
 *   BASE_PTR the base pointer (ctxt->input->base) of the current input.
 *   CUR, RAW the xmlChar at the current position. They should only be
 *            compared to ASCII values.
 *   UPPER    CUR converted with toupper().
 *   NXT(n)   the n'th next xmlChar. Same as CUR, it should be used only
 *            to compare on ASCII based substrings.
 *   UPP(n)   the n'th next xmlChar converted with toupper(). Same as CUR,
 *            it should be used only to compare on ASCII based substrings.
 *   SKIP(n)  advance the current pointer and the column counter by n. The
 *            line counter is not touched, so it must only be used to skip
 *            ASCII strings without newlines.
 *   SHRINK   when the parser is not progressive, more than 2 * INPUT_CHUNK
 *            bytes lie before the current position and fewer than
 *            2 * INPUT_CHUNK remain after it, call xmlParserShrink().
 *   GROW     when the parser is not progressive and fewer than INPUT_CHUNK
 *            bytes remain, call xmlParserGrow().
 *            SHRINK and GROW expand to a complete `if` statement including
 *            its terminating semicolon.
 *
 * Clean macros, not dependent on an ASCII context
 *
 *   SKIP_BLANKS  htmlSkipBlankChars(ctxt).
 *   NEXT         xmlNextChar(ctxt).
 *   NEXTL(l)     update the line/column counters for the current char and
 *                skip it, l being its length in xmlChars.
 *   CUR_CHAR(l)  htmlCurrentChar(ctxt, &l): the current char, its length
 *                being stored in l.
 *   COPY_BUF(l,b,i,v)  append the char v, of length l, to buffer b at
 *                index i and advance i accordingly (xmlCopyChar() is used
 *                when l is not 1).
 *
 * NEXTL and COPY_BUF expand to a single statement and need a trailing
 * semicolon.
 */

#define UPPER (toupper(*ctxt->input->cur))

#define SKIP(val) (ctxt->input->cur += (val), ctxt->input->col += (val))

#define NXT(val) (ctxt->input->cur[(val)])

#define UPP(val) (toupper(ctxt->input->cur[(val)]))

#define CUR_PTR ctxt->input->cur
#define BASE_PTR ctxt->input->base

#define SHRINK \
    if ((!PARSER_PROGRESSIVE(ctxt)) && \
        (ctxt->input->cur - ctxt->input->base > 2 * INPUT_CHUNK) && \
        (ctxt->input->end - ctxt->input->cur < 2 * INPUT_CHUNK)) \
        xmlParserShrink(ctxt);

#define GROW \
    if ((!PARSER_PROGRESSIVE(ctxt)) && \
        (ctxt->input->end - ctxt->input->cur < INPUT_CHUNK)) \
        xmlParserGrow(ctxt);

#define SKIP_BLANKS htmlSkipBlankChars(ctxt)

/* Imported from XML */

#define CUR (*ctxt->input->cur)
#define NEXT xmlNextChar(ctxt)

#define RAW (*ctxt->input->cur)


#define NEXTL(l) do { \
    if (*(ctxt->input->cur) == '\n') { \
        ctxt->input->line++; ctxt->input->col = 1; \
    } else ctxt->input->col++; \
    ctxt->input->cur += (l); \
  } while (0)

#define CUR_CHAR(l) htmlCurrentChar(ctxt, &(l))

#define COPY_BUF(l,b,i,v) do { \
    if ((l) == 1) (b)[(i)++] = (v); \
    else (i) += xmlCopyChar((l), &(b)[(i)], (v)); \
  } while (0)
297
298 /**
299 * htmlFindEncoding:
300 * @the HTML parser context
301 *
302 * Ty to find and encoding in the current data available in the input
303 * buffer this is needed to try to switch to the proper encoding when
304 * one face a character error.
305 * That's an heuristic, since it's operating outside of parsing it could
306 * try to use a meta which had been commented out, that's the reason it
307 * should only be used in case of error, not as a default.
308 *
309 * Returns an encoding string or NULL if not found, the string need to
310 * be freed
311 */
312 static xmlChar *
htmlFindEncoding(xmlParserCtxtPtr ctxt)313 htmlFindEncoding(xmlParserCtxtPtr ctxt) {
314 const xmlChar *start, *cur, *end;
315 xmlChar *ret;
316
317 if ((ctxt == NULL) || (ctxt->input == NULL) ||
318 (ctxt->input->flags & XML_INPUT_HAS_ENCODING))
319 return(NULL);
320 if ((ctxt->input->cur == NULL) || (ctxt->input->end == NULL))
321 return(NULL);
322
323 start = ctxt->input->cur;
324 end = ctxt->input->end;
325 /* we also expect the input buffer to be zero terminated */
326 if (*end != 0)
327 return(NULL);
328
329 cur = xmlStrcasestr(start, BAD_CAST "HTTP-EQUIV");
330 if (cur == NULL)
331 return(NULL);
332 cur = xmlStrcasestr(cur, BAD_CAST "CONTENT");
333 if (cur == NULL)
334 return(NULL);
335 cur = xmlStrcasestr(cur, BAD_CAST "CHARSET=");
336 if (cur == NULL)
337 return(NULL);
338 cur += 8;
339 start = cur;
340 while (((*cur >= 'A') && (*cur <= 'Z')) ||
341 ((*cur >= 'a') && (*cur <= 'z')) ||
342 ((*cur >= '0') && (*cur <= '9')) ||
343 (*cur == '-') || (*cur == '_') || (*cur == ':') || (*cur == '/'))
344 cur++;
345 if (cur == start)
346 return(NULL);
347 ret = xmlStrndup(start, cur - start);
348 if (ret == NULL)
349 htmlErrMemory(ctxt);
350 return(ret);
351 }
352
353 /**
354 * htmlCurrentChar:
355 * @ctxt: the HTML parser context
356 * @len: pointer to the length of the char read
357 *
358 * The current char value, if using UTF-8 this may actually span multiple
359 * bytes in the input buffer. Implement the end of line normalization:
360 * 2.11 End-of-Line Handling
361 * If the encoding is unspecified, in the case we find an ISO-Latin-1
362 * char, then the encoding converter is plugged in automatically.
363 *
364 * Returns the current char value and its length
365 */
366
367 static int
htmlCurrentChar(xmlParserCtxtPtr ctxt,int * len)368 htmlCurrentChar(xmlParserCtxtPtr ctxt, int *len) {
369 const unsigned char *cur;
370 unsigned char c;
371 unsigned int val;
372
373 if (ctxt->input->end - ctxt->input->cur < INPUT_CHUNK)
374 xmlParserGrow(ctxt);
375
376 if ((ctxt->input->flags & XML_INPUT_HAS_ENCODING) == 0) {
377 xmlChar * guess;
378
379 /*
380 * Assume it's a fixed length encoding (1) with
381 * a compatible encoding for the ASCII set, since
382 * HTML constructs only use < 128 chars
383 */
384 if (*ctxt->input->cur < 0x80) {
385 if (*ctxt->input->cur == 0) {
386 if (ctxt->input->cur < ctxt->input->end) {
387 htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
388 "Char 0x%X out of allowed range\n", 0);
389 *len = 1;
390 return(' ');
391 } else {
392 *len = 0;
393 return(0);
394 }
395 }
396 *len = 1;
397 return(*ctxt->input->cur);
398 }
399
400 /*
401 * Humm this is bad, do an automatic flow conversion
402 */
403 guess = htmlFindEncoding(ctxt);
404 if (guess == NULL) {
405 xmlSwitchEncoding(ctxt, XML_CHAR_ENCODING_8859_1);
406 } else {
407 xmlSwitchEncodingName(ctxt, (const char *) guess);
408 xmlFree(guess);
409 }
410 ctxt->input->flags |= XML_INPUT_HAS_ENCODING;
411 }
412
413 /*
414 * We are supposed to handle UTF8, check it's valid
415 * From rfc2044: encoding of the Unicode values on UTF-8:
416 *
417 * UCS-4 range (hex.) UTF-8 octet sequence (binary)
418 * 0000 0000-0000 007F 0xxxxxxx
419 * 0000 0080-0000 07FF 110xxxxx 10xxxxxx
420 * 0000 0800-0000 FFFF 1110xxxx 10xxxxxx 10xxxxxx
421 *
422 * Check for the 0x110000 limit too
423 */
424 cur = ctxt->input->cur;
425 c = *cur;
426 if (c & 0x80) {
427 size_t avail;
428
429 if ((c & 0x40) == 0)
430 goto encoding_error;
431
432 avail = ctxt->input->end - ctxt->input->cur;
433
434 if ((avail < 2) || ((cur[1] & 0xc0) != 0x80))
435 goto encoding_error;
436 if ((c & 0xe0) == 0xe0) {
437 if ((avail < 3) || ((cur[2] & 0xc0) != 0x80))
438 goto encoding_error;
439 if ((c & 0xf0) == 0xf0) {
440 if (((c & 0xf8) != 0xf0) ||
441 (avail < 4) || ((cur[3] & 0xc0) != 0x80))
442 goto encoding_error;
443 /* 4-byte code */
444 *len = 4;
445 val = (cur[0] & 0x7) << 18;
446 val |= (cur[1] & 0x3f) << 12;
447 val |= (cur[2] & 0x3f) << 6;
448 val |= cur[3] & 0x3f;
449 if (val < 0x10000)
450 goto encoding_error;
451 } else {
452 /* 3-byte code */
453 *len = 3;
454 val = (cur[0] & 0xf) << 12;
455 val |= (cur[1] & 0x3f) << 6;
456 val |= cur[2] & 0x3f;
457 if (val < 0x800)
458 goto encoding_error;
459 }
460 } else {
461 /* 2-byte code */
462 *len = 2;
463 val = (cur[0] & 0x1f) << 6;
464 val |= cur[1] & 0x3f;
465 if (val < 0x80)
466 goto encoding_error;
467 }
468 if (!IS_CHAR(val)) {
469 htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
470 "Char 0x%X out of allowed range\n", val);
471 }
472 return(val);
473 } else {
474 if (*ctxt->input->cur == 0) {
475 if (ctxt->input->cur < ctxt->input->end) {
476 htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
477 "Char 0x%X out of allowed range\n", 0);
478 *len = 1;
479 return(' ');
480 } else {
481 *len = 0;
482 return(0);
483 }
484 }
485 /* 1-byte code */
486 *len = 1;
487 return(*ctxt->input->cur);
488 }
489
490 encoding_error:
491 xmlCtxtErrIO(ctxt, XML_ERR_INVALID_ENCODING, NULL);
492
493 if ((ctxt->input->flags & XML_INPUT_HAS_ENCODING) == 0)
494 xmlSwitchEncoding(ctxt, XML_CHAR_ENCODING_8859_1);
495 *len = 1;
496 return(*ctxt->input->cur);
497 }
498
499 /**
500 * htmlSkipBlankChars:
501 * @ctxt: the HTML parser context
502 *
503 * skip all blanks character found at that point in the input streams.
504 *
505 * Returns the number of space chars skipped
506 */
507
508 static int
htmlSkipBlankChars(xmlParserCtxtPtr ctxt)509 htmlSkipBlankChars(xmlParserCtxtPtr ctxt) {
510 int res = 0;
511
512 while (IS_BLANK_CH(*(ctxt->input->cur))) {
513 if (*(ctxt->input->cur) == '\n') {
514 ctxt->input->line++; ctxt->input->col = 1;
515 } else ctxt->input->col++;
516 ctxt->input->cur++;
517 if (*ctxt->input->cur == 0)
518 xmlParserGrow(ctxt);
519 if (res < INT_MAX)
520 res++;
521 }
522 return(res);
523 }
524
525
526
527 /************************************************************************
528 * *
529 * The list of HTML elements and their properties *
530 * *
531 ************************************************************************/
532
533 /*
534 * Start Tag: 1 means the start tag can be omitted
535 * End Tag: 1 means the end tag can be omitted
536 * 2 means it's forbidden (empty elements)
537 * 3 means the tag is stylistic and should be closed easily
538 * Depr: this element is deprecated
539 * DTD: 1 means that this element is valid only in the Loose DTD
540 * 2 means that this element is valid only in the Frameset DTD
541 *
542 * Name,Start Tag,End Tag,Save End,Empty,Deprecated,DTD,inline,Description
543 , subElements , impliedsubelt , Attributes, userdata
544 */
545
/*
 * Definitions and a couple of vars for HTML Elements
 *
 * Each group macro expands to a comma separated list of element names,
 * meant to be used inside an array initializer. The matching NB_xxx macro
 * is the number of names in the group; composite counts are parenthesized
 * so that they can be used safely inside larger expressions.
 * PCDATA and MODIFIER expand to nothing.
 */

#define FONTSTYLE "tt", "i", "b", "u", "s", "strike", "big", "small"
#define NB_FONTSTYLE 8
#define PHRASE "em", "strong", "dfn", "code", "samp", "kbd", "var", "cite", "abbr", "acronym"
#define NB_PHRASE 10
#define SPECIAL "a", "img", "applet", "embed", "object", "font", "basefont", "br", "script", "map", "q", "sub", "sup", "span", "bdo", "iframe"
#define NB_SPECIAL 16
#define INLINE FONTSTYLE, PHRASE, SPECIAL, FORMCTRL
#define NB_INLINE (NB_PCDATA + NB_FONTSTYLE + NB_PHRASE + NB_SPECIAL + NB_FORMCTRL)
#define BLOCK HEADING, LIST, "pre", "p", "dl", "div", "center", "noscript", "noframes", "blockquote", "form", "isindex", "hr", "table", "fieldset", "address"
#define NB_BLOCK (NB_HEADING + NB_LIST + 14)
#define FORMCTRL "input", "select", "textarea", "label", "button"
#define NB_FORMCTRL 5
#define PCDATA
#define NB_PCDATA 0
#define HEADING "h1", "h2", "h3", "h4", "h5", "h6"
#define NB_HEADING 6
#define LIST "ul", "ol", "dir", "menu"
#define NB_LIST 4
#define MODIFIER
#define NB_MODIFIER 0
#define FLOW BLOCK,INLINE
#define NB_FLOW (NB_BLOCK + NB_INLINE)
#define EMPTY NULL


/* NULL-terminated lists of all the flow, resp. inline, element names */
static const char* const html_flow[] = { FLOW, NULL } ;
static const char* const html_inline[] = { INLINE, NULL } ;

/* placeholders: elts with content but no subelements */
static const char* const html_pcdata[] = { NULL } ;
#define html_cdata html_pcdata
579
580
/*
 * ... and for HTML Attributes
 *
 * Each group macro expands to a comma separated list of attribute names,
 * meant to be used inside an array initializer. The matching NB_xxx macro
 * is the number of names in the group.
 */

#define COREATTRS "id", "class", "style", "title"
#define NB_COREATTRS 4
#define I18N "lang", "dir"
#define NB_I18N 2
#define EVENTS "onclick", "ondblclick", "onmousedown", "onmouseup", "onmouseover", "onmouseout", "onkeypress", "onkeydown", "onkeyup"
#define NB_EVENTS 9
#define ATTRS COREATTRS,I18N,EVENTS
#define NB_ATTRS (NB_COREATTRS + NB_I18N + NB_EVENTS)
#define CELLHALIGN "align", "char", "charoff"
#define NB_CELLHALIGN 3
#define CELLVALIGN "valign"
#define NB_CELLVALIGN 1

/* NULL-terminated attribute name lists built from the groups above */
static const char* const html_attrs[] = { ATTRS, NULL } ;
static const char* const core_i18n_attrs[] = { COREATTRS, I18N, NULL } ;
static const char* const core_attrs[] = { COREATTRS, NULL } ;
static const char* const i18n_attrs[] = { I18N, NULL } ;
600
601
602 /* Other declarations that should go inline ... */
603 static const char* const a_attrs[] = { ATTRS, "charset", "type", "name",
604 "href", "hreflang", "rel", "rev", "accesskey", "shape", "coords",
605 "tabindex", "onfocus", "onblur", NULL } ;
606 static const char* const target_attr[] = { "target", NULL } ;
607 static const char* const rows_cols_attr[] = { "rows", "cols", NULL } ;
608 static const char* const alt_attr[] = { "alt", NULL } ;
609 static const char* const src_alt_attrs[] = { "src", "alt", NULL } ;
610 static const char* const href_attrs[] = { "href", NULL } ;
611 static const char* const clear_attrs[] = { "clear", NULL } ;
612 static const char* const inline_p[] = { INLINE, "p", NULL } ;
613
614 static const char* const flow_param[] = { FLOW, "param", NULL } ;
615 static const char* const applet_attrs[] = { COREATTRS , "codebase",
616 "archive", "alt", "name", "height", "width", "align",
617 "hspace", "vspace", NULL } ;
618 static const char* const area_attrs[] = { "shape", "coords", "href", "nohref",
619 "tabindex", "accesskey", "onfocus", "onblur", NULL } ;
620 static const char* const basefont_attrs[] =
621 { "id", "size", "color", "face", NULL } ;
622 static const char* const quote_attrs[] = { ATTRS, "cite", NULL } ;
623 static const char* const body_contents[] = { FLOW, "ins", "del", NULL } ;
624 static const char* const body_attrs[] = { ATTRS, "onload", "onunload", NULL } ;
625 static const char* const body_depr[] = { "background", "bgcolor", "text",
626 "link", "vlink", "alink", NULL } ;
627 static const char* const button_attrs[] = { ATTRS, "name", "value", "type",
628 "disabled", "tabindex", "accesskey", "onfocus", "onblur", NULL } ;
629
630
631 static const char* const col_attrs[] = { ATTRS, "span", "width", CELLHALIGN, CELLVALIGN, NULL } ;
632 static const char* const col_elt[] = { "col", NULL } ;
633 static const char* const edit_attrs[] = { ATTRS, "datetime", "cite", NULL } ;
634 static const char* const compact_attrs[] = { ATTRS, "compact", NULL } ;
635 static const char* const dl_contents[] = { "dt", "dd", NULL } ;
636 static const char* const compact_attr[] = { "compact", NULL } ;
637 static const char* const label_attr[] = { "label", NULL } ;
638 static const char* const fieldset_contents[] = { FLOW, "legend" } ;
639 static const char* const font_attrs[] = { COREATTRS, I18N, "size", "color", "face" , NULL } ;
640 static const char* const form_contents[] = { HEADING, LIST, INLINE, "pre", "p", "div", "center", "noscript", "noframes", "blockquote", "isindex", "hr", "table", "fieldset", "address", NULL } ;
641 static const char* const form_attrs[] = { ATTRS, "method", "enctype", "accept", "name", "onsubmit", "onreset", "accept-charset", NULL } ;
642 static const char* const frame_attrs[] = { COREATTRS, "longdesc", "name", "src", "frameborder", "marginwidth", "marginheight", "noresize", "scrolling" , NULL } ;
643 static const char* const frameset_attrs[] = { COREATTRS, "rows", "cols", "onload", "onunload", NULL } ;
644 static const char* const frameset_contents[] = { "frameset", "frame", "noframes", NULL } ;
645 static const char* const head_attrs[] = { I18N, "profile", NULL } ;
646 static const char* const head_contents[] = { "title", "isindex", "base", "script", "style", "meta", "link", "object", NULL } ;
647 static const char* const hr_depr[] = { "align", "noshade", "size", "width", NULL } ;
648 static const char* const version_attr[] = { "version", NULL } ;
649 static const char* const html_content[] = { "head", "body", "frameset", NULL } ;
650 static const char* const iframe_attrs[] = { COREATTRS, "longdesc", "name", "src", "frameborder", "marginwidth", "marginheight", "scrolling", "align", "height", "width", NULL } ;
651 static const char* const img_attrs[] = { ATTRS, "longdesc", "name", "height", "width", "usemap", "ismap", NULL } ;
652 static const char* const embed_attrs[] = { COREATTRS, "align", "alt", "border", "code", "codebase", "frameborder", "height", "hidden", "hspace", "name", "palette", "pluginspace", "pluginurl", "src", "type", "units", "vspace", "width", NULL } ;
653 static const char* const input_attrs[] = { ATTRS, "type", "name", "value", "checked", "disabled", "readonly", "size", "maxlength", "src", "alt", "usemap", "ismap", "tabindex", "accesskey", "onfocus", "onblur", "onselect", "onchange", "accept", NULL } ;
654 static const char* const prompt_attrs[] = { COREATTRS, I18N, "prompt", NULL } ;
655 static const char* const label_attrs[] = { ATTRS, "for", "accesskey", "onfocus", "onblur", NULL } ;
656 static const char* const legend_attrs[] = { ATTRS, "accesskey", NULL } ;
657 static const char* const align_attr[] = { "align", NULL } ;
658 static const char* const link_attrs[] = { ATTRS, "charset", "href", "hreflang", "type", "rel", "rev", "media", NULL } ;
659 static const char* const map_contents[] = { BLOCK, "area", NULL } ;
660 static const char* const name_attr[] = { "name", NULL } ;
661 static const char* const action_attr[] = { "action", NULL } ;
662 static const char* const blockli_elt[] = { BLOCK, "li", NULL } ;
663 static const char* const meta_attrs[] = { I18N, "http-equiv", "name", "scheme", "charset", NULL } ;
664 static const char* const content_attr[] = { "content", NULL } ;
665 static const char* const type_attr[] = { "type", NULL } ;
666 static const char* const noframes_content[] = { "body", FLOW MODIFIER, NULL } ;
667 static const char* const object_contents[] = { FLOW, "param", NULL } ;
668 static const char* const object_attrs[] = { ATTRS, "declare", "classid", "codebase", "data", "type", "codetype", "archive", "standby", "height", "width", "usemap", "name", "tabindex", NULL } ;
669 static const char* const object_depr[] = { "align", "border", "hspace", "vspace", NULL } ;
670 static const char* const ol_attrs[] = { "type", "compact", "start", NULL} ;
671 static const char* const option_elt[] = { "option", NULL } ;
672 static const char* const optgroup_attrs[] = { ATTRS, "disabled", NULL } ;
673 static const char* const option_attrs[] = { ATTRS, "disabled", "label", "selected", "value", NULL } ;
674 static const char* const param_attrs[] = { "id", "value", "valuetype", "type", NULL } ;
675 static const char* const width_attr[] = { "width", NULL } ;
676 static const char* const pre_content[] = { PHRASE, "tt", "i", "b", "u", "s", "strike", "a", "br", "script", "map", "q", "span", "bdo", "iframe", NULL } ;
677 static const char* const script_attrs[] = { "charset", "src", "defer", "event", "for", NULL } ;
678 static const char* const language_attr[] = { "language", NULL } ;
679 static const char* const select_content[] = { "optgroup", "option", NULL } ;
680 static const char* const select_attrs[] = { ATTRS, "name", "size", "multiple", "disabled", "tabindex", "onfocus", "onblur", "onchange", NULL } ;
681 static const char* const style_attrs[] = { I18N, "media", "title", NULL } ;
682 static const char* const table_attrs[] = { ATTRS, "summary", "width", "border", "frame", "rules", "cellspacing", "cellpadding", "datapagesize", NULL } ;
683 static const char* const table_depr[] = { "align", "bgcolor", NULL } ;
684 static const char* const table_contents[] = { "caption", "col", "colgroup", "thead", "tfoot", "tbody", "tr", NULL} ;
685 static const char* const tr_elt[] = { "tr", NULL } ;
686 static const char* const talign_attrs[] = { ATTRS, CELLHALIGN, CELLVALIGN, NULL} ;
687 static const char* const th_td_depr[] = { "nowrap", "bgcolor", "width", "height", NULL } ;
688 static const char* const th_td_attr[] = { ATTRS, "abbr", "axis", "headers", "scope", "rowspan", "colspan", CELLHALIGN, CELLVALIGN, NULL } ;
689 static const char* const textarea_attrs[] = { ATTRS, "name", "disabled", "readonly", "tabindex", "accesskey", "onfocus", "onblur", "onselect", "onchange", NULL } ;
690 static const char* const tr_contents[] = { "th", "td", NULL } ;
691 static const char* const bgcolor_attr[] = { "bgcolor", NULL } ;
692 static const char* const li_elt[] = { "li", NULL } ;
693 static const char* const ul_depr[] = { "type", "compact", NULL} ;
694 static const char* const dir_attr[] = { "dir", NULL} ;
695
696 #define DECL (const char**)
697
698 static const htmlElemDesc
699 html40ElementTable[] = {
700 { "a", 0, 0, 0, 0, 0, 0, 1, "anchor ",
701 DECL html_inline , NULL , DECL a_attrs , DECL target_attr, NULL
702 },
703 { "abbr", 0, 0, 0, 0, 0, 0, 1, "abbreviated form",
704 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
705 },
706 { "acronym", 0, 0, 0, 0, 0, 0, 1, "",
707 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
708 },
709 { "address", 0, 0, 0, 0, 0, 0, 0, "information on author ",
710 DECL inline_p , NULL , DECL html_attrs, NULL, NULL
711 },
712 { "applet", 0, 0, 0, 0, 1, 1, 2, "java applet ",
713 DECL flow_param , NULL , NULL , DECL applet_attrs, NULL
714 },
715 { "area", 0, 2, 2, 1, 0, 0, 0, "client-side image map area ",
716 EMPTY , NULL , DECL area_attrs , DECL target_attr, DECL alt_attr
717 },
718 { "b", 0, 3, 0, 0, 0, 0, 1, "bold text style",
719 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
720 },
721 { "base", 0, 2, 2, 1, 0, 0, 0, "document base uri ",
722 EMPTY , NULL , NULL , DECL target_attr, DECL href_attrs
723 },
724 { "basefont", 0, 2, 2, 1, 1, 1, 1, "base font size " ,
725 EMPTY , NULL , NULL, DECL basefont_attrs, NULL
726 },
727 { "bdo", 0, 0, 0, 0, 0, 0, 1, "i18n bidi over-ride ",
728 DECL html_inline , NULL , DECL core_i18n_attrs, NULL, DECL dir_attr
729 },
730 { "big", 0, 3, 0, 0, 0, 0, 1, "large text style",
731 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
732 },
733 { "blockquote", 0, 0, 0, 0, 0, 0, 0, "long quotation ",
734 DECL html_flow , NULL , DECL quote_attrs , NULL, NULL
735 },
736 { "body", 1, 1, 0, 0, 0, 0, 0, "document body ",
737 DECL body_contents , "div" , DECL body_attrs, DECL body_depr, NULL
738 },
739 { "br", 0, 2, 2, 1, 0, 0, 1, "forced line break ",
740 EMPTY , NULL , DECL core_attrs, DECL clear_attrs , NULL
741 },
742 { "button", 0, 0, 0, 0, 0, 0, 2, "push button ",
743 DECL html_flow MODIFIER , NULL , DECL button_attrs, NULL, NULL
744 },
745 { "caption", 0, 0, 0, 0, 0, 0, 0, "table caption ",
746 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
747 },
748 { "center", 0, 3, 0, 0, 1, 1, 0, "shorthand for div align=center ",
749 DECL html_flow , NULL , NULL, DECL html_attrs, NULL
750 },
751 { "cite", 0, 0, 0, 0, 0, 0, 1, "citation",
752 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
753 },
754 { "code", 0, 0, 0, 0, 0, 0, 1, "computer code fragment",
755 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
756 },
757 { "col", 0, 2, 2, 1, 0, 0, 0, "table column ",
758 EMPTY , NULL , DECL col_attrs , NULL, NULL
759 },
760 { "colgroup", 0, 1, 0, 0, 0, 0, 0, "table column group ",
761 DECL col_elt , "col" , DECL col_attrs , NULL, NULL
762 },
763 { "dd", 0, 1, 0, 0, 0, 0, 0, "definition description ",
764 DECL html_flow , NULL , DECL html_attrs, NULL, NULL
765 },
766 { "del", 0, 0, 0, 0, 0, 0, 2, "deleted text ",
767 DECL html_flow , NULL , DECL edit_attrs , NULL, NULL
768 },
769 { "dfn", 0, 0, 0, 0, 0, 0, 1, "instance definition",
770 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
771 },
772 { "dir", 0, 0, 0, 0, 1, 1, 0, "directory list",
773 DECL blockli_elt, "li" , NULL, DECL compact_attrs, NULL
774 },
775 { "div", 0, 0, 0, 0, 0, 0, 0, "generic language/style container",
776 DECL html_flow, NULL, DECL html_attrs, DECL align_attr, NULL
777 },
778 { "dl", 0, 0, 0, 0, 0, 0, 0, "definition list ",
779 DECL dl_contents , "dd" , DECL html_attrs, DECL compact_attr, NULL
780 },
781 { "dt", 0, 1, 0, 0, 0, 0, 0, "definition term ",
782 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
783 },
784 { "em", 0, 3, 0, 0, 0, 0, 1, "emphasis",
785 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
786 },
787 { "embed", 0, 1, 0, 0, 1, 1, 1, "generic embedded object ",
788 EMPTY, NULL, DECL embed_attrs, NULL, NULL
789 },
790 { "fieldset", 0, 0, 0, 0, 0, 0, 0, "form control group ",
791 DECL fieldset_contents , NULL, DECL html_attrs, NULL, NULL
792 },
793 { "font", 0, 3, 0, 0, 1, 1, 1, "local change to font ",
794 DECL html_inline, NULL, NULL, DECL font_attrs, NULL
795 },
796 { "form", 0, 0, 0, 0, 0, 0, 0, "interactive form ",
797 DECL form_contents, "fieldset", DECL form_attrs , DECL target_attr, DECL action_attr
798 },
799 { "frame", 0, 2, 2, 1, 0, 2, 0, "subwindow " ,
800 EMPTY, NULL, NULL, DECL frame_attrs, NULL
801 },
802 { "frameset", 0, 0, 0, 0, 0, 2, 0, "window subdivision" ,
803 DECL frameset_contents, "noframes" , NULL , DECL frameset_attrs, NULL
804 },
805 { "h1", 0, 0, 0, 0, 0, 0, 0, "heading ",
806 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
807 },
808 { "h2", 0, 0, 0, 0, 0, 0, 0, "heading ",
809 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
810 },
811 { "h3", 0, 0, 0, 0, 0, 0, 0, "heading ",
812 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
813 },
814 { "h4", 0, 0, 0, 0, 0, 0, 0, "heading ",
815 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
816 },
817 { "h5", 0, 0, 0, 0, 0, 0, 0, "heading ",
818 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
819 },
820 { "h6", 0, 0, 0, 0, 0, 0, 0, "heading ",
821 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
822 },
823 { "head", 1, 1, 0, 0, 0, 0, 0, "document head ",
824 DECL head_contents, NULL, DECL head_attrs, NULL, NULL
825 },
826 { "hr", 0, 2, 2, 1, 0, 0, 0, "horizontal rule " ,
827 EMPTY, NULL, DECL html_attrs, DECL hr_depr, NULL
828 },
829 { "html", 1, 1, 0, 0, 0, 0, 0, "document root element ",
830 DECL html_content , NULL , DECL i18n_attrs, DECL version_attr, NULL
831 },
832 { "i", 0, 3, 0, 0, 0, 0, 1, "italic text style",
833 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
834 },
835 { "iframe", 0, 0, 0, 0, 0, 1, 2, "inline subwindow ",
836 DECL html_flow, NULL, NULL, DECL iframe_attrs, NULL
837 },
838 { "img", 0, 2, 2, 1, 0, 0, 1, "embedded image ",
839 EMPTY, NULL, DECL img_attrs, DECL align_attr, DECL src_alt_attrs
840 },
841 { "input", 0, 2, 2, 1, 0, 0, 1, "form control ",
842 EMPTY, NULL, DECL input_attrs , DECL align_attr, NULL
843 },
844 { "ins", 0, 0, 0, 0, 0, 0, 2, "inserted text",
845 DECL html_flow, NULL, DECL edit_attrs, NULL, NULL
846 },
847 { "isindex", 0, 2, 2, 1, 1, 1, 0, "single line prompt ",
848 EMPTY, NULL, NULL, DECL prompt_attrs, NULL
849 },
850 { "kbd", 0, 0, 0, 0, 0, 0, 1, "text to be entered by the user",
851 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
852 },
853 { "label", 0, 0, 0, 0, 0, 0, 1, "form field label text ",
854 DECL html_inline MODIFIER, NULL, DECL label_attrs , NULL, NULL
855 },
856 { "legend", 0, 0, 0, 0, 0, 0, 0, "fieldset legend ",
857 DECL html_inline, NULL, DECL legend_attrs , DECL align_attr, NULL
858 },
859 { "li", 0, 1, 1, 0, 0, 0, 0, "list item ",
860 DECL html_flow, NULL, DECL html_attrs, NULL, NULL
861 },
862 { "link", 0, 2, 2, 1, 0, 0, 0, "a media-independent link ",
863 EMPTY, NULL, DECL link_attrs, DECL target_attr, NULL
864 },
865 { "map", 0, 0, 0, 0, 0, 0, 2, "client-side image map ",
866 DECL map_contents , NULL, DECL html_attrs , NULL, DECL name_attr
867 },
868 { "menu", 0, 0, 0, 0, 1, 1, 0, "menu list ",
869 DECL blockli_elt , NULL, NULL, DECL compact_attrs, NULL
870 },
871 { "meta", 0, 2, 2, 1, 0, 0, 0, "generic metainformation ",
872 EMPTY, NULL, DECL meta_attrs , NULL , DECL content_attr
873 },
874 { "noframes", 0, 0, 0, 0, 0, 2, 0, "alternate content container for non frame-based rendering ",
875 DECL noframes_content, "body" , DECL html_attrs, NULL, NULL
876 },
877 { "noscript", 0, 0, 0, 0, 0, 0, 0, "alternate content container for non script-based rendering ",
878 DECL html_flow, "div", DECL html_attrs, NULL, NULL
879 },
880 { "object", 0, 0, 0, 0, 0, 0, 2, "generic embedded object ",
881 DECL object_contents , "div" , DECL object_attrs, DECL object_depr, NULL
882 },
883 { "ol", 0, 0, 0, 0, 0, 0, 0, "ordered list ",
884 DECL li_elt , "li" , DECL html_attrs, DECL ol_attrs, NULL
885 },
886 { "optgroup", 0, 0, 0, 0, 0, 0, 0, "option group ",
887 DECL option_elt , "option", DECL optgroup_attrs, NULL, DECL label_attr
888 },
889 { "option", 0, 1, 0, 0, 0, 0, 0, "selectable choice " ,
890 DECL html_pcdata, NULL, DECL option_attrs, NULL, NULL
891 },
892 { "p", 0, 1, 0, 0, 0, 0, 0, "paragraph ",
893 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
894 },
895 { "param", 0, 2, 2, 1, 0, 0, 0, "named property value ",
896 EMPTY, NULL, DECL param_attrs, NULL, DECL name_attr
897 },
898 { "pre", 0, 0, 0, 0, 0, 0, 0, "preformatted text ",
899 DECL pre_content, NULL, DECL html_attrs, DECL width_attr, NULL
900 },
901 { "q", 0, 0, 0, 0, 0, 0, 1, "short inline quotation ",
902 DECL html_inline, NULL, DECL quote_attrs, NULL, NULL
903 },
904 { "s", 0, 3, 0, 0, 1, 1, 1, "strike-through text style",
905 DECL html_inline, NULL, NULL, DECL html_attrs, NULL
906 },
907 { "samp", 0, 0, 0, 0, 0, 0, 1, "sample program output, scripts, etc.",
908 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
909 },
910 { "script", 0, 0, 0, 0, 0, 0, 2, "script statements ",
911 DECL html_cdata, NULL, DECL script_attrs, DECL language_attr, DECL type_attr
912 },
913 { "select", 0, 0, 0, 0, 0, 0, 1, "option selector ",
914 DECL select_content, NULL, DECL select_attrs, NULL, NULL
915 },
916 { "small", 0, 3, 0, 0, 0, 0, 1, "small text style",
917 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
918 },
919 { "span", 0, 0, 0, 0, 0, 0, 1, "generic language/style container ",
920 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
921 },
922 { "strike", 0, 3, 0, 0, 1, 1, 1, "strike-through text",
923 DECL html_inline, NULL, NULL, DECL html_attrs, NULL
924 },
925 { "strong", 0, 3, 0, 0, 0, 0, 1, "strong emphasis",
926 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
927 },
928 { "style", 0, 0, 0, 0, 0, 0, 0, "style info ",
929 DECL html_cdata, NULL, DECL style_attrs, NULL, DECL type_attr
930 },
931 { "sub", 0, 3, 0, 0, 0, 0, 1, "subscript",
932 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
933 },
934 { "sup", 0, 3, 0, 0, 0, 0, 1, "superscript ",
935 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
936 },
937 { "table", 0, 0, 0, 0, 0, 0, 0, "",
938 DECL table_contents , "tr" , DECL table_attrs , DECL table_depr, NULL
939 },
940 { "tbody", 1, 0, 0, 0, 0, 0, 0, "table body ",
941 DECL tr_elt , "tr" , DECL talign_attrs, NULL, NULL
942 },
943 { "td", 0, 0, 0, 0, 0, 0, 0, "table data cell",
944 DECL html_flow, NULL, DECL th_td_attr, DECL th_td_depr, NULL
945 },
946 { "textarea", 0, 0, 0, 0, 0, 0, 1, "multi-line text field ",
947 DECL html_pcdata, NULL, DECL textarea_attrs, NULL, DECL rows_cols_attr
948 },
949 { "tfoot", 0, 1, 0, 0, 0, 0, 0, "table footer ",
950 DECL tr_elt , "tr" , DECL talign_attrs, NULL, NULL
951 },
952 { "th", 0, 1, 0, 0, 0, 0, 0, "table header cell",
953 DECL html_flow, NULL, DECL th_td_attr, DECL th_td_depr, NULL
954 },
955 { "thead", 0, 1, 0, 0, 0, 0, 0, "table header ",
956 DECL tr_elt , "tr" , DECL talign_attrs, NULL, NULL
957 },
958 { "title", 0, 0, 0, 0, 0, 0, 0, "document title ",
959 DECL html_pcdata, NULL, DECL i18n_attrs, NULL, NULL
960 },
961 { "tr", 0, 0, 0, 0, 0, 0, 0, "table row ",
962 DECL tr_contents , "td" , DECL talign_attrs, DECL bgcolor_attr, NULL
963 },
964 { "tt", 0, 3, 0, 0, 0, 0, 1, "teletype or monospaced text style",
965 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
966 },
967 { "u", 0, 3, 0, 0, 1, 1, 1, "underlined text style",
968 DECL html_inline, NULL, NULL, DECL html_attrs, NULL
969 },
970 { "ul", 0, 0, 0, 0, 0, 0, 0, "unordered list ",
971 DECL li_elt , "li" , DECL html_attrs, DECL ul_depr, NULL
972 },
973 { "var", 0, 0, 0, 0, 0, 0, 1, "instance of a variable or program argument",
974 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
975 }
976 };
977
/*
 * One auto-close rule: when the element named oldTag is the current open
 * element and a start tag named newTag is seen, oldTag is closed implicitly.
 */
typedef struct {
    const char *oldTag;     /* name of the currently open element */
    const char *newTag;     /* name of the start tag that closes it */
} htmlStartCloseEntry;

/*
 * Start tags that imply the end of the current element.
 *
 * The table is searched with bsearch() using htmlCompareStartClose(), so
 * the entries MUST stay sorted by oldTag first and newTag second, both in
 * strcmp() order. Entries are grouped below by oldTag.
 */
static const htmlStartCloseEntry htmlStartClose[] = {
    /* a */
    { "a", "a" }, { "a", "fieldset" }, { "a", "table" }, { "a", "td" },
    { "a", "th" },
    /* address */
    { "address", "dd" }, { "address", "dl" }, { "address", "dt" },
    { "address", "form" }, { "address", "li" }, { "address", "ul" },
    /* b */
    { "b", "center" }, { "b", "p" }, { "b", "td" }, { "b", "th" },
    /* big */
    { "big", "p" },
    /* caption */
    { "caption", "col" }, { "caption", "colgroup" }, { "caption", "tbody" },
    { "caption", "tfoot" }, { "caption", "thead" }, { "caption", "tr" },
    /* col */
    { "col", "col" }, { "col", "colgroup" }, { "col", "tbody" },
    { "col", "tfoot" }, { "col", "thead" }, { "col", "tr" },
    /* colgroup */
    { "colgroup", "colgroup" }, { "colgroup", "tbody" },
    { "colgroup", "tfoot" }, { "colgroup", "thead" }, { "colgroup", "tr" },
    /* dd */
    { "dd", "dt" },
    /* dir */
    { "dir", "dd" }, { "dir", "dl" }, { "dir", "dt" }, { "dir", "form" },
    { "dir", "ul" },
    /* dl */
    { "dl", "form" }, { "dl", "li" },
    /* dt */
    { "dt", "dd" }, { "dt", "dl" },
    /* font */
    { "font", "center" }, { "font", "td" }, { "font", "th" },
    /* form */
    { "form", "form" },
    /* h1 .. h6 */
    { "h1", "fieldset" }, { "h1", "form" }, { "h1", "li" }, { "h1", "p" },
    { "h1", "table" },
    { "h2", "fieldset" }, { "h2", "form" }, { "h2", "li" }, { "h2", "p" },
    { "h2", "table" },
    { "h3", "fieldset" }, { "h3", "form" }, { "h3", "li" }, { "h3", "p" },
    { "h3", "table" },
    { "h4", "fieldset" }, { "h4", "form" }, { "h4", "li" }, { "h4", "p" },
    { "h4", "table" },
    { "h5", "fieldset" }, { "h5", "form" }, { "h5", "li" }, { "h5", "p" },
    { "h5", "table" },
    { "h6", "fieldset" }, { "h6", "form" }, { "h6", "li" }, { "h6", "p" },
    { "h6", "table" },
    /* head */
    { "head", "a" }, { "head", "abbr" }, { "head", "acronym" },
    { "head", "address" }, { "head", "b" }, { "head", "bdo" },
    { "head", "big" }, { "head", "blockquote" }, { "head", "body" },
    { "head", "br" }, { "head", "center" }, { "head", "cite" },
    { "head", "code" }, { "head", "dd" }, { "head", "dfn" },
    { "head", "dir" }, { "head", "div" }, { "head", "dl" },
    { "head", "dt" }, { "head", "em" }, { "head", "fieldset" },
    { "head", "font" }, { "head", "form" }, { "head", "frameset" },
    { "head", "h1" }, { "head", "h2" }, { "head", "h3" },
    { "head", "h4" }, { "head", "h5" }, { "head", "h6" },
    { "head", "hr" }, { "head", "i" }, { "head", "iframe" },
    { "head", "img" }, { "head", "kbd" }, { "head", "li" },
    { "head", "listing" }, { "head", "map" }, { "head", "menu" },
    { "head", "ol" }, { "head", "p" }, { "head", "pre" },
    { "head", "q" }, { "head", "s" }, { "head", "samp" },
    { "head", "small" }, { "head", "span" }, { "head", "strike" },
    { "head", "strong" }, { "head", "sub" }, { "head", "sup" },
    { "head", "table" }, { "head", "tt" }, { "head", "u" },
    { "head", "ul" }, { "head", "var" }, { "head", "xmp" },
    /* hr */
    { "hr", "form" },
    /* i */
    { "i", "center" }, { "i", "p" }, { "i", "td" }, { "i", "th" },
    /* legend */
    { "legend", "fieldset" },
    /* li */
    { "li", "li" },
    /* link */
    { "link", "body" }, { "link", "frameset" },
    /* listing */
    { "listing", "dd" }, { "listing", "dl" }, { "listing", "dt" },
    { "listing", "fieldset" }, { "listing", "form" }, { "listing", "li" },
    { "listing", "table" }, { "listing", "ul" },
    /* menu */
    { "menu", "dd" }, { "menu", "dl" }, { "menu", "dt" }, { "menu", "form" },
    { "menu", "ul" },
    /* ol */
    { "ol", "form" },
    /* option */
    { "option", "optgroup" }, { "option", "option" },
    /* p */
    { "p", "address" }, { "p", "blockquote" }, { "p", "body" },
    { "p", "caption" }, { "p", "center" }, { "p", "col" },
    { "p", "colgroup" }, { "p", "dd" }, { "p", "dir" },
    { "p", "div" }, { "p", "dl" }, { "p", "dt" },
    { "p", "fieldset" }, { "p", "form" }, { "p", "frameset" },
    { "p", "h1" }, { "p", "h2" }, { "p", "h3" },
    { "p", "h4" }, { "p", "h5" }, { "p", "h6" },
    { "p", "head" }, { "p", "hr" }, { "p", "li" },
    { "p", "listing" }, { "p", "menu" }, { "p", "ol" },
    { "p", "p" }, { "p", "pre" }, { "p", "table" },
    { "p", "tbody" }, { "p", "td" }, { "p", "tfoot" },
    { "p", "th" }, { "p", "title" }, { "p", "tr" },
    { "p", "ul" }, { "p", "xmp" },
    /* pre */
    { "pre", "dd" }, { "pre", "dl" }, { "pre", "dt" },
    { "pre", "fieldset" }, { "pre", "form" }, { "pre", "li" },
    { "pre", "table" }, { "pre", "ul" },
    /* s, script, small, span, strike, style */
    { "s", "p" },
    { "script", "noscript" },
    { "small", "p" },
    { "span", "td" }, { "span", "th" },
    { "strike", "p" },
    { "style", "body" }, { "style", "frameset" },
    /* table sections, rows and cells */
    { "tbody", "tbody" }, { "tbody", "tfoot" },
    { "td", "tbody" }, { "td", "td" }, { "td", "tfoot" }, { "td", "th" },
    { "td", "tr" },
    { "tfoot", "tbody" },
    { "th", "tbody" }, { "th", "td" }, { "th", "tfoot" }, { "th", "th" },
    { "th", "tr" },
    { "thead", "tbody" }, { "thead", "tfoot" },
    /* title */
    { "title", "body" }, { "title", "frameset" },
    /* tr */
    { "tr", "tbody" }, { "tr", "tfoot" }, { "tr", "tr" },
    /* tt */
    { "tt", "p" },
    /* u */
    { "u", "p" }, { "u", "td" }, { "u", "th" },
    /* ul */
    { "ul", "address" }, { "ul", "form" }, { "ul", "menu" }, { "ul", "pre" },
    /* xmp */
    { "xmp", "dd" }, { "xmp", "dl" }, { "xmp", "dt" },
    { "xmp", "fieldset" }, { "xmp", "form" }, { "xmp", "li" },
    { "xmp", "table" }, { "xmp", "ul" }
};
1237
/*
 * HTML elements which are supposed not to have CDATA content; when
 * character data shows up directly inside one of them a p element is
 * implied (see htmlCheckParagraph()). The list is NULL-terminated.
 *
 * TODO: extend that list by reading the HTML SGML DTD on
 *       implied paragraph
 */
static const char *const htmlNoContentElements[] = {
    "html", "head",
    NULL    /* end-of-list sentinel */
};
1250
/*
 * HTML attributes whose content is of type %Script;
 *
 * NOTE: htmlIsScriptAttribute() rejects any name that does not start
 *       with "on" before scanning this list, so every entry added here
 *       must keep that prefix.
 */
static const char *const htmlScriptAttributes[] = {
    /* mouse events */
    "onclick", "ondblclick", "onmousedown", "onmouseup",
    "onmouseover", "onmousemove", "onmouseout",
    /* keyboard events */
    "onkeypress", "onkeydown", "onkeyup",
    /* document and form events */
    "onload", "onunload", "onfocus", "onblur",
    "onsubmit", "onreset", "onchange", "onselect"
};
1276
/*
 * Priorities used by the HTML parser to cope with broken pages: each
 * element gets a priority and an end tag may only close elements whose
 * priority is lower than or equal to its own. This lets the parser
 * decide what to do with extra end tags.
 */

typedef struct {
    const char *name;       /* element name, NULL for the final entry */
    int priority;           /* end tag priority of that element */
} elementPriority;

/*
 * Scanned linearly by htmlGetEndPriority(); the entry with a NULL name
 * ends the list and supplies the priority of every unlisted element.
 */
static const elementPriority htmlEndPriority[] = {
    { "div",   150 },
    { "td",    160 },
    { "th",    160 },
    { "tr",    170 },
    { "thead", 180 },
    { "tbody", 180 },
    { "tfoot", 180 },
    { "table", 190 },
    { "head",  200 },
    { "body",  200 },
    { "html",  220 },
    { NULL,    100 } /* Default priority */
};
1304
1305 /************************************************************************
1306 * *
1307 * functions to handle HTML specific data *
1308 * *
1309 ************************************************************************/
1310
/**
 * htmlInitAutoClose:
 *
 * DEPRECATED: This is a no-op.
 *
 * The function body is empty; calling it has no effect.
 */
void
htmlInitAutoClose(void)
{
    /* intentionally empty */
}
1319
1320 static int
htmlCompareTags(const void * key,const void * member)1321 htmlCompareTags(const void *key, const void *member) {
1322 const xmlChar *tag = (const xmlChar *) key;
1323 const htmlElemDesc *desc = (const htmlElemDesc *) member;
1324
1325 return(xmlStrcasecmp(tag, BAD_CAST desc->name));
1326 }
1327
1328 /**
1329 * htmlTagLookup:
1330 * @tag: The tag name in lowercase
1331 *
1332 * Lookup the HTML tag in the ElementTable
1333 *
1334 * Returns the related htmlElemDescPtr or NULL if not found.
1335 */
1336 const htmlElemDesc *
htmlTagLookup(const xmlChar * tag)1337 htmlTagLookup(const xmlChar *tag) {
1338 if (tag == NULL)
1339 return(NULL);
1340
1341 return((const htmlElemDesc *) bsearch(tag, html40ElementTable,
1342 sizeof(html40ElementTable) / sizeof(htmlElemDesc),
1343 sizeof(htmlElemDesc), htmlCompareTags));
1344 }
1345
1346 /**
1347 * htmlGetEndPriority:
1348 * @name: The name of the element to look up the priority for.
1349 *
1350 * Return value: The "endtag" priority.
1351 **/
1352 static int
htmlGetEndPriority(const xmlChar * name)1353 htmlGetEndPriority (const xmlChar *name) {
1354 int i = 0;
1355
1356 while ((htmlEndPriority[i].name != NULL) &&
1357 (!xmlStrEqual((const xmlChar *)htmlEndPriority[i].name, name)))
1358 i++;
1359
1360 return(htmlEndPriority[i].priority);
1361 }
1362
1363
1364 static int
htmlCompareStartClose(const void * vkey,const void * member)1365 htmlCompareStartClose(const void *vkey, const void *member) {
1366 const htmlStartCloseEntry *key = (const htmlStartCloseEntry *) vkey;
1367 const htmlStartCloseEntry *entry = (const htmlStartCloseEntry *) member;
1368 int ret;
1369
1370 ret = strcmp(key->oldTag, entry->oldTag);
1371 if (ret == 0)
1372 ret = strcmp(key->newTag, entry->newTag);
1373
1374 return(ret);
1375 }
1376
1377 /**
1378 * htmlCheckAutoClose:
1379 * @newtag: The new tag name
1380 * @oldtag: The old tag name
1381 *
1382 * Checks whether the new tag is one of the registered valid tags for
1383 * closing old.
1384 *
1385 * Returns 0 if no, 1 if yes.
1386 */
1387 static int
htmlCheckAutoClose(const xmlChar * newtag,const xmlChar * oldtag)1388 htmlCheckAutoClose(const xmlChar * newtag, const xmlChar * oldtag)
1389 {
1390 htmlStartCloseEntry key;
1391 void *res;
1392
1393 key.oldTag = (const char *) oldtag;
1394 key.newTag = (const char *) newtag;
1395 res = bsearch(&key, htmlStartClose,
1396 sizeof(htmlStartClose) / sizeof(htmlStartCloseEntry),
1397 sizeof(htmlStartCloseEntry), htmlCompareStartClose);
1398 return(res != NULL);
1399 }
1400
1401 /**
1402 * htmlAutoCloseOnClose:
1403 * @ctxt: an HTML parser context
1404 * @newtag: The new tag name
1405 * @force: force the tag closure
1406 *
1407 * The HTML DTD allows an ending tag to implicitly close other tags.
1408 */
1409 static void
htmlAutoCloseOnClose(htmlParserCtxtPtr ctxt,const xmlChar * newtag)1410 htmlAutoCloseOnClose(htmlParserCtxtPtr ctxt, const xmlChar * newtag)
1411 {
1412 const htmlElemDesc *info;
1413 int i, priority;
1414
1415 priority = htmlGetEndPriority(newtag);
1416
1417 for (i = (ctxt->nameNr - 1); i >= 0; i--) {
1418
1419 if (xmlStrEqual(newtag, ctxt->nameTab[i]))
1420 break;
1421 /*
1422 * A misplaced endtag can only close elements with lower
1423 * or equal priority, so if we find an element with higher
1424 * priority before we find an element with
1425 * matching name, we just ignore this endtag
1426 */
1427 if (htmlGetEndPriority(ctxt->nameTab[i]) > priority)
1428 return;
1429 }
1430 if (i < 0)
1431 return;
1432
1433 while (!xmlStrEqual(newtag, ctxt->name)) {
1434 info = htmlTagLookup(ctxt->name);
1435 if ((info != NULL) && (info->endTag == 3)) {
1436 htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,
1437 "Opening and ending tag mismatch: %s and %s\n",
1438 newtag, ctxt->name);
1439 }
1440 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
1441 ctxt->sax->endElement(ctxt->userData, ctxt->name);
1442 htmlnamePop(ctxt);
1443 }
1444 }
1445
1446 /**
1447 * htmlAutoCloseOnEnd:
1448 * @ctxt: an HTML parser context
1449 *
1450 * Close all remaining tags at the end of the stream
1451 */
1452 static void
htmlAutoCloseOnEnd(htmlParserCtxtPtr ctxt)1453 htmlAutoCloseOnEnd(htmlParserCtxtPtr ctxt)
1454 {
1455 int i;
1456
1457 if (ctxt->nameNr == 0)
1458 return;
1459 for (i = (ctxt->nameNr - 1); i >= 0; i--) {
1460 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
1461 ctxt->sax->endElement(ctxt->userData, ctxt->name);
1462 htmlnamePop(ctxt);
1463 }
1464 }
1465
1466 /**
1467 * htmlAutoClose:
1468 * @ctxt: an HTML parser context
1469 * @newtag: The new tag name or NULL
1470 *
1471 * The HTML DTD allows a tag to implicitly close other tags.
1472 * The list is kept in htmlStartClose array. This function is
1473 * called when a new tag has been detected and generates the
1474 * appropriates closes if possible/needed.
1475 * If newtag is NULL this mean we are at the end of the resource
1476 * and we should check
1477 */
1478 static void
htmlAutoClose(htmlParserCtxtPtr ctxt,const xmlChar * newtag)1479 htmlAutoClose(htmlParserCtxtPtr ctxt, const xmlChar * newtag)
1480 {
1481 if (newtag == NULL)
1482 return;
1483
1484 while ((ctxt->name != NULL) &&
1485 (htmlCheckAutoClose(newtag, ctxt->name))) {
1486 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
1487 ctxt->sax->endElement(ctxt->userData, ctxt->name);
1488 htmlnamePop(ctxt);
1489 }
1490 }
1491
1492 /**
1493 * htmlAutoCloseTag:
1494 * @doc: the HTML document
1495 * @name: The tag name
1496 * @elem: the HTML element
1497 *
1498 * The HTML DTD allows a tag to implicitly close other tags.
1499 * The list is kept in htmlStartClose array. This function checks
1500 * if the element or one of it's children would autoclose the
1501 * given tag.
1502 *
1503 * Returns 1 if autoclose, 0 otherwise
1504 */
1505 int
htmlAutoCloseTag(htmlDocPtr doc,const xmlChar * name,htmlNodePtr elem)1506 htmlAutoCloseTag(htmlDocPtr doc, const xmlChar *name, htmlNodePtr elem) {
1507 htmlNodePtr child;
1508
1509 if (elem == NULL) return(1);
1510 if (xmlStrEqual(name, elem->name)) return(0);
1511 if (htmlCheckAutoClose(elem->name, name)) return(1);
1512 child = elem->children;
1513 while (child != NULL) {
1514 if (htmlAutoCloseTag(doc, name, child)) return(1);
1515 child = child->next;
1516 }
1517 return(0);
1518 }
1519
1520 /**
1521 * htmlIsAutoClosed:
1522 * @doc: the HTML document
1523 * @elem: the HTML element
1524 *
1525 * The HTML DTD allows a tag to implicitly close other tags.
1526 * The list is kept in htmlStartClose array. This function checks
1527 * if a tag is autoclosed by one of it's child
1528 *
1529 * Returns 1 if autoclosed, 0 otherwise
1530 */
1531 int
htmlIsAutoClosed(htmlDocPtr doc,htmlNodePtr elem)1532 htmlIsAutoClosed(htmlDocPtr doc, htmlNodePtr elem) {
1533 htmlNodePtr child;
1534
1535 if (elem == NULL) return(1);
1536 child = elem->children;
1537 while (child != NULL) {
1538 if (htmlAutoCloseTag(doc, elem->name, child)) return(1);
1539 child = child->next;
1540 }
1541 return(0);
1542 }
1543
1544 /**
1545 * htmlCheckImplied:
1546 * @ctxt: an HTML parser context
1547 * @newtag: The new tag name
1548 *
1549 * The HTML DTD allows a tag to exists only implicitly
1550 * called when a new tag has been detected and generates the
1551 * appropriates implicit tags if missing
1552 */
1553 static void
htmlCheckImplied(htmlParserCtxtPtr ctxt,const xmlChar * newtag)1554 htmlCheckImplied(htmlParserCtxtPtr ctxt, const xmlChar *newtag) {
1555 int i;
1556
1557 if (ctxt->options & HTML_PARSE_NOIMPLIED)
1558 return;
1559 if (!htmlOmittedDefaultValue)
1560 return;
1561 if (xmlStrEqual(newtag, BAD_CAST"html"))
1562 return;
1563 if (ctxt->nameNr <= 0) {
1564 htmlnamePush(ctxt, BAD_CAST"html");
1565 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1566 ctxt->sax->startElement(ctxt->userData, BAD_CAST"html", NULL);
1567 }
1568 if ((xmlStrEqual(newtag, BAD_CAST"body")) || (xmlStrEqual(newtag, BAD_CAST"head")))
1569 return;
1570 if ((ctxt->nameNr <= 1) &&
1571 ((xmlStrEqual(newtag, BAD_CAST"script")) ||
1572 (xmlStrEqual(newtag, BAD_CAST"style")) ||
1573 (xmlStrEqual(newtag, BAD_CAST"meta")) ||
1574 (xmlStrEqual(newtag, BAD_CAST"link")) ||
1575 (xmlStrEqual(newtag, BAD_CAST"title")) ||
1576 (xmlStrEqual(newtag, BAD_CAST"base")))) {
1577 if (ctxt->html >= 3) {
1578 /* we already saw or generated an <head> before */
1579 return;
1580 }
1581 /*
1582 * dropped OBJECT ... i you put it first BODY will be
1583 * assumed !
1584 */
1585 htmlnamePush(ctxt, BAD_CAST"head");
1586 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1587 ctxt->sax->startElement(ctxt->userData, BAD_CAST"head", NULL);
1588 } else if ((!xmlStrEqual(newtag, BAD_CAST"noframes")) &&
1589 (!xmlStrEqual(newtag, BAD_CAST"frame")) &&
1590 (!xmlStrEqual(newtag, BAD_CAST"frameset"))) {
1591 if (ctxt->html >= 10) {
1592 /* we already saw or generated a <body> before */
1593 return;
1594 }
1595 for (i = 0;i < ctxt->nameNr;i++) {
1596 if (xmlStrEqual(ctxt->nameTab[i], BAD_CAST"body")) {
1597 return;
1598 }
1599 if (xmlStrEqual(ctxt->nameTab[i], BAD_CAST"head")) {
1600 return;
1601 }
1602 }
1603
1604 htmlnamePush(ctxt, BAD_CAST"body");
1605 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1606 ctxt->sax->startElement(ctxt->userData, BAD_CAST"body", NULL);
1607 }
1608 }
1609
1610 /**
1611 * htmlCheckParagraph
1612 * @ctxt: an HTML parser context
1613 *
1614 * Check whether a p element need to be implied before inserting
1615 * characters in the current element.
1616 *
1617 * Returns 1 if a paragraph has been inserted, 0 if not and -1
1618 * in case of error.
1619 */
1620
1621 static int
htmlCheckParagraph(htmlParserCtxtPtr ctxt)1622 htmlCheckParagraph(htmlParserCtxtPtr ctxt) {
1623 const xmlChar *tag;
1624 int i;
1625
1626 if (ctxt == NULL)
1627 return(-1);
1628 tag = ctxt->name;
1629 if (tag == NULL) {
1630 htmlAutoClose(ctxt, BAD_CAST"p");
1631 htmlCheckImplied(ctxt, BAD_CAST"p");
1632 htmlnamePush(ctxt, BAD_CAST"p");
1633 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1634 ctxt->sax->startElement(ctxt->userData, BAD_CAST"p", NULL);
1635 return(1);
1636 }
1637 if (!htmlOmittedDefaultValue)
1638 return(0);
1639 for (i = 0; htmlNoContentElements[i] != NULL; i++) {
1640 if (xmlStrEqual(tag, BAD_CAST htmlNoContentElements[i])) {
1641 htmlAutoClose(ctxt, BAD_CAST"p");
1642 htmlCheckImplied(ctxt, BAD_CAST"p");
1643 htmlnamePush(ctxt, BAD_CAST"p");
1644 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1645 ctxt->sax->startElement(ctxt->userData, BAD_CAST"p", NULL);
1646 return(1);
1647 }
1648 }
1649 return(0);
1650 }
1651
1652 /**
1653 * htmlIsScriptAttribute:
1654 * @name: an attribute name
1655 *
1656 * Check if an attribute is of content type Script
1657 *
1658 * Returns 1 is the attribute is a script 0 otherwise
1659 */
1660 int
htmlIsScriptAttribute(const xmlChar * name)1661 htmlIsScriptAttribute(const xmlChar *name) {
1662 unsigned int i;
1663
1664 if (name == NULL)
1665 return(0);
1666 /*
1667 * all script attributes start with 'on'
1668 */
1669 if ((name[0] != 'o') || (name[1] != 'n'))
1670 return(0);
1671 for (i = 0;
1672 i < sizeof(htmlScriptAttributes)/sizeof(htmlScriptAttributes[0]);
1673 i++) {
1674 if (xmlStrEqual(name, (const xmlChar *) htmlScriptAttributes[i]))
1675 return(1);
1676 }
1677 return(0);
1678 }
1679
1680 /************************************************************************
1681 * *
1682 * The list of HTML predefined entities *
1683 * *
1684 ************************************************************************/
1685
1686
1687 static const htmlEntityDesc html40EntitiesTable[] = {
1688 /*
1689 * the 4 absolute ones, plus apostrophe.
1690 */
1691 { 34, "quot", "quotation mark = APL quote, U+0022 ISOnum" },
1692 { 38, "amp", "ampersand, U+0026 ISOnum" },
1693 { 39, "apos", "single quote" },
1694 { 60, "lt", "less-than sign, U+003C ISOnum" },
1695 { 62, "gt", "greater-than sign, U+003E ISOnum" },
1696
1697 /*
1698 * A bunch still in the 128-255 range
1699 * Replacing them depend really on the charset used.
1700 */
1701 { 160, "nbsp", "no-break space = non-breaking space, U+00A0 ISOnum" },
1702 { 161, "iexcl","inverted exclamation mark, U+00A1 ISOnum" },
1703 { 162, "cent", "cent sign, U+00A2 ISOnum" },
1704 { 163, "pound","pound sign, U+00A3 ISOnum" },
1705 { 164, "curren","currency sign, U+00A4 ISOnum" },
1706 { 165, "yen", "yen sign = yuan sign, U+00A5 ISOnum" },
1707 { 166, "brvbar","broken bar = broken vertical bar, U+00A6 ISOnum" },
1708 { 167, "sect", "section sign, U+00A7 ISOnum" },
1709 { 168, "uml", "diaeresis = spacing diaeresis, U+00A8 ISOdia" },
1710 { 169, "copy", "copyright sign, U+00A9 ISOnum" },
1711 { 170, "ordf", "feminine ordinal indicator, U+00AA ISOnum" },
1712 { 171, "laquo","left-pointing double angle quotation mark = left pointing guillemet, U+00AB ISOnum" },
1713 { 172, "not", "not sign, U+00AC ISOnum" },
1714 { 173, "shy", "soft hyphen = discretionary hyphen, U+00AD ISOnum" },
1715 { 174, "reg", "registered sign = registered trade mark sign, U+00AE ISOnum" },
1716 { 175, "macr", "macron = spacing macron = overline = APL overbar, U+00AF ISOdia" },
1717 { 176, "deg", "degree sign, U+00B0 ISOnum" },
1718 { 177, "plusmn","plus-minus sign = plus-or-minus sign, U+00B1 ISOnum" },
1719 { 178, "sup2", "superscript two = superscript digit two = squared, U+00B2 ISOnum" },
1720 { 179, "sup3", "superscript three = superscript digit three = cubed, U+00B3 ISOnum" },
1721 { 180, "acute","acute accent = spacing acute, U+00B4 ISOdia" },
1722 { 181, "micro","micro sign, U+00B5 ISOnum" },
1723 { 182, "para", "pilcrow sign = paragraph sign, U+00B6 ISOnum" },
1724 { 183, "middot","middle dot = Georgian comma Greek middle dot, U+00B7 ISOnum" },
1725 { 184, "cedil","cedilla = spacing cedilla, U+00B8 ISOdia" },
1726 { 185, "sup1", "superscript one = superscript digit one, U+00B9 ISOnum" },
1727 { 186, "ordm", "masculine ordinal indicator, U+00BA ISOnum" },
1728 { 187, "raquo","right-pointing double angle quotation mark right pointing guillemet, U+00BB ISOnum" },
1729 { 188, "frac14","vulgar fraction one quarter = fraction one quarter, U+00BC ISOnum" },
1730 { 189, "frac12","vulgar fraction one half = fraction one half, U+00BD ISOnum" },
1731 { 190, "frac34","vulgar fraction three quarters = fraction three quarters, U+00BE ISOnum" },
1732 { 191, "iquest","inverted question mark = turned question mark, U+00BF ISOnum" },
1733 { 192, "Agrave","latin capital letter A with grave = latin capital letter A grave, U+00C0 ISOlat1" },
1734 { 193, "Aacute","latin capital letter A with acute, U+00C1 ISOlat1" },
1735 { 194, "Acirc","latin capital letter A with circumflex, U+00C2 ISOlat1" },
1736 { 195, "Atilde","latin capital letter A with tilde, U+00C3 ISOlat1" },
1737 { 196, "Auml", "latin capital letter A with diaeresis, U+00C4 ISOlat1" },
1738 { 197, "Aring","latin capital letter A with ring above = latin capital letter A ring, U+00C5 ISOlat1" },
1739 { 198, "AElig","latin capital letter AE = latin capital ligature AE, U+00C6 ISOlat1" },
1740 { 199, "Ccedil","latin capital letter C with cedilla, U+00C7 ISOlat1" },
1741 { 200, "Egrave","latin capital letter E with grave, U+00C8 ISOlat1" },
1742 { 201, "Eacute","latin capital letter E with acute, U+00C9 ISOlat1" },
1743 { 202, "Ecirc","latin capital letter E with circumflex, U+00CA ISOlat1" },
1744 { 203, "Euml", "latin capital letter E with diaeresis, U+00CB ISOlat1" },
1745 { 204, "Igrave","latin capital letter I with grave, U+00CC ISOlat1" },
1746 { 205, "Iacute","latin capital letter I with acute, U+00CD ISOlat1" },
1747 { 206, "Icirc","latin capital letter I with circumflex, U+00CE ISOlat1" },
1748 { 207, "Iuml", "latin capital letter I with diaeresis, U+00CF ISOlat1" },
1749 { 208, "ETH", "latin capital letter ETH, U+00D0 ISOlat1" },
1750 { 209, "Ntilde","latin capital letter N with tilde, U+00D1 ISOlat1" },
1751 { 210, "Ograve","latin capital letter O with grave, U+00D2 ISOlat1" },
1752 { 211, "Oacute","latin capital letter O with acute, U+00D3 ISOlat1" },
1753 { 212, "Ocirc","latin capital letter O with circumflex, U+00D4 ISOlat1" },
1754 { 213, "Otilde","latin capital letter O with tilde, U+00D5 ISOlat1" },
1755 { 214, "Ouml", "latin capital letter O with diaeresis, U+00D6 ISOlat1" },
1756 { 215, "times","multiplication sign, U+00D7 ISOnum" },
1757 { 216, "Oslash","latin capital letter O with stroke latin capital letter O slash, U+00D8 ISOlat1" },
1758 { 217, "Ugrave","latin capital letter U with grave, U+00D9 ISOlat1" },
1759 { 218, "Uacute","latin capital letter U with acute, U+00DA ISOlat1" },
1760 { 219, "Ucirc","latin capital letter U with circumflex, U+00DB ISOlat1" },
1761 { 220, "Uuml", "latin capital letter U with diaeresis, U+00DC ISOlat1" },
1762 { 221, "Yacute","latin capital letter Y with acute, U+00DD ISOlat1" },
1763 { 222, "THORN","latin capital letter THORN, U+00DE ISOlat1" },
1764 { 223, "szlig","latin small letter sharp s = ess-zed, U+00DF ISOlat1" },
1765 { 224, "agrave","latin small letter a with grave = latin small letter a grave, U+00E0 ISOlat1" },
1766 { 225, "aacute","latin small letter a with acute, U+00E1 ISOlat1" },
1767 { 226, "acirc","latin small letter a with circumflex, U+00E2 ISOlat1" },
1768 { 227, "atilde","latin small letter a with tilde, U+00E3 ISOlat1" },
1769 { 228, "auml", "latin small letter a with diaeresis, U+00E4 ISOlat1" },
1770 { 229, "aring","latin small letter a with ring above = latin small letter a ring, U+00E5 ISOlat1" },
1771 { 230, "aelig","latin small letter ae = latin small ligature ae, U+00E6 ISOlat1" },
1772 { 231, "ccedil","latin small letter c with cedilla, U+00E7 ISOlat1" },
1773 { 232, "egrave","latin small letter e with grave, U+00E8 ISOlat1" },
1774 { 233, "eacute","latin small letter e with acute, U+00E9 ISOlat1" },
1775 { 234, "ecirc","latin small letter e with circumflex, U+00EA ISOlat1" },
1776 { 235, "euml", "latin small letter e with diaeresis, U+00EB ISOlat1" },
1777 { 236, "igrave","latin small letter i with grave, U+00EC ISOlat1" },
1778 { 237, "iacute","latin small letter i with acute, U+00ED ISOlat1" },
1779 { 238, "icirc","latin small letter i with circumflex, U+00EE ISOlat1" },
1780 { 239, "iuml", "latin small letter i with diaeresis, U+00EF ISOlat1" },
1781 { 240, "eth", "latin small letter eth, U+00F0 ISOlat1" },
1782 { 241, "ntilde","latin small letter n with tilde, U+00F1 ISOlat1" },
1783 { 242, "ograve","latin small letter o with grave, U+00F2 ISOlat1" },
1784 { 243, "oacute","latin small letter o with acute, U+00F3 ISOlat1" },
1785 { 244, "ocirc","latin small letter o with circumflex, U+00F4 ISOlat1" },
1786 { 245, "otilde","latin small letter o with tilde, U+00F5 ISOlat1" },
1787 { 246, "ouml", "latin small letter o with diaeresis, U+00F6 ISOlat1" },
1788 { 247, "divide","division sign, U+00F7 ISOnum" },
1789 { 248, "oslash","latin small letter o with stroke, = latin small letter o slash, U+00F8 ISOlat1" },
1790 { 249, "ugrave","latin small letter u with grave, U+00F9 ISOlat1" },
1791 { 250, "uacute","latin small letter u with acute, U+00FA ISOlat1" },
1792 { 251, "ucirc","latin small letter u with circumflex, U+00FB ISOlat1" },
1793 { 252, "uuml", "latin small letter u with diaeresis, U+00FC ISOlat1" },
1794 { 253, "yacute","latin small letter y with acute, U+00FD ISOlat1" },
1795 { 254, "thorn","latin small letter thorn with, U+00FE ISOlat1" },
1796 { 255, "yuml", "latin small letter y with diaeresis, U+00FF ISOlat1" },
1797
1798 { 338, "OElig","latin capital ligature OE, U+0152 ISOlat2" },
1799 { 339, "oelig","latin small ligature oe, U+0153 ISOlat2" },
1800 { 352, "Scaron","latin capital letter S with caron, U+0160 ISOlat2" },
1801 { 353, "scaron","latin small letter s with caron, U+0161 ISOlat2" },
1802 { 376, "Yuml", "latin capital letter Y with diaeresis, U+0178 ISOlat2" },
1803
1804 /*
1805 * Anything below should really be kept as entities references
1806 */
1807 { 402, "fnof", "latin small f with hook = function = florin, U+0192 ISOtech" },
1808
1809 { 710, "circ", "modifier letter circumflex accent, U+02C6 ISOpub" },
1810 { 732, "tilde","small tilde, U+02DC ISOdia" },
1811
1812 { 913, "Alpha","greek capital letter alpha, U+0391" },
1813 { 914, "Beta", "greek capital letter beta, U+0392" },
1814 { 915, "Gamma","greek capital letter gamma, U+0393 ISOgrk3" },
1815 { 916, "Delta","greek capital letter delta, U+0394 ISOgrk3" },
1816 { 917, "Epsilon","greek capital letter epsilon, U+0395" },
1817 { 918, "Zeta", "greek capital letter zeta, U+0396" },
1818 { 919, "Eta", "greek capital letter eta, U+0397" },
1819 { 920, "Theta","greek capital letter theta, U+0398 ISOgrk3" },
1820 { 921, "Iota", "greek capital letter iota, U+0399" },
1821 { 922, "Kappa","greek capital letter kappa, U+039A" },
1822 { 923, "Lambda", "greek capital letter lambda, U+039B ISOgrk3" },
1823 { 924, "Mu", "greek capital letter mu, U+039C" },
1824 { 925, "Nu", "greek capital letter nu, U+039D" },
1825 { 926, "Xi", "greek capital letter xi, U+039E ISOgrk3" },
1826 { 927, "Omicron","greek capital letter omicron, U+039F" },
1827 { 928, "Pi", "greek capital letter pi, U+03A0 ISOgrk3" },
1828 { 929, "Rho", "greek capital letter rho, U+03A1" },
1829 { 931, "Sigma","greek capital letter sigma, U+03A3 ISOgrk3" },
1830 { 932, "Tau", "greek capital letter tau, U+03A4" },
1831 { 933, "Upsilon","greek capital letter upsilon, U+03A5 ISOgrk3" },
1832 { 934, "Phi", "greek capital letter phi, U+03A6 ISOgrk3" },
1833 { 935, "Chi", "greek capital letter chi, U+03A7" },
1834 { 936, "Psi", "greek capital letter psi, U+03A8 ISOgrk3" },
1835 { 937, "Omega","greek capital letter omega, U+03A9 ISOgrk3" },
1836
1837 { 945, "alpha","greek small letter alpha, U+03B1 ISOgrk3" },
1838 { 946, "beta", "greek small letter beta, U+03B2 ISOgrk3" },
1839 { 947, "gamma","greek small letter gamma, U+03B3 ISOgrk3" },
1840 { 948, "delta","greek small letter delta, U+03B4 ISOgrk3" },
1841 { 949, "epsilon","greek small letter epsilon, U+03B5 ISOgrk3" },
1842 { 950, "zeta", "greek small letter zeta, U+03B6 ISOgrk3" },
1843 { 951, "eta", "greek small letter eta, U+03B7 ISOgrk3" },
1844 { 952, "theta","greek small letter theta, U+03B8 ISOgrk3" },
1845 { 953, "iota", "greek small letter iota, U+03B9 ISOgrk3" },
1846 { 954, "kappa","greek small letter kappa, U+03BA ISOgrk3" },
1847 { 955, "lambda","greek small letter lambda, U+03BB ISOgrk3" },
1848 { 956, "mu", "greek small letter mu, U+03BC ISOgrk3" },
1849 { 957, "nu", "greek small letter nu, U+03BD ISOgrk3" },
1850 { 958, "xi", "greek small letter xi, U+03BE ISOgrk3" },
1851 { 959, "omicron","greek small letter omicron, U+03BF NEW" },
1852 { 960, "pi", "greek small letter pi, U+03C0 ISOgrk3" },
1853 { 961, "rho", "greek small letter rho, U+03C1 ISOgrk3" },
1854 { 962, "sigmaf","greek small letter final sigma, U+03C2 ISOgrk3" },
1855 { 963, "sigma","greek small letter sigma, U+03C3 ISOgrk3" },
1856 { 964, "tau", "greek small letter tau, U+03C4 ISOgrk3" },
1857 { 965, "upsilon","greek small letter upsilon, U+03C5 ISOgrk3" },
1858 { 966, "phi", "greek small letter phi, U+03C6 ISOgrk3" },
1859 { 967, "chi", "greek small letter chi, U+03C7 ISOgrk3" },
1860 { 968, "psi", "greek small letter psi, U+03C8 ISOgrk3" },
1861 { 969, "omega","greek small letter omega, U+03C9 ISOgrk3" },
1862 { 977, "thetasym","greek small letter theta symbol, U+03D1 NEW" },
1863 { 978, "upsih","greek upsilon with hook symbol, U+03D2 NEW" },
1864 { 982, "piv", "greek pi symbol, U+03D6 ISOgrk3" },
1865
1866 { 8194, "ensp", "en space, U+2002 ISOpub" },
1867 { 8195, "emsp", "em space, U+2003 ISOpub" },
1868 { 8201, "thinsp","thin space, U+2009 ISOpub" },
1869 { 8204, "zwnj", "zero width non-joiner, U+200C NEW RFC 2070" },
1870 { 8205, "zwj", "zero width joiner, U+200D NEW RFC 2070" },
1871 { 8206, "lrm", "left-to-right mark, U+200E NEW RFC 2070" },
1872 { 8207, "rlm", "right-to-left mark, U+200F NEW RFC 2070" },
1873 { 8211, "ndash","en dash, U+2013 ISOpub" },
1874 { 8212, "mdash","em dash, U+2014 ISOpub" },
1875 { 8216, "lsquo","left single quotation mark, U+2018 ISOnum" },
1876 { 8217, "rsquo","right single quotation mark, U+2019 ISOnum" },
1877 { 8218, "sbquo","single low-9 quotation mark, U+201A NEW" },
1878 { 8220, "ldquo","left double quotation mark, U+201C ISOnum" },
1879 { 8221, "rdquo","right double quotation mark, U+201D ISOnum" },
1880 { 8222, "bdquo","double low-9 quotation mark, U+201E NEW" },
1881 { 8224, "dagger","dagger, U+2020 ISOpub" },
1882 { 8225, "Dagger","double dagger, U+2021 ISOpub" },
1883
1884 { 8226, "bull", "bullet = black small circle, U+2022 ISOpub" },
1885 { 8230, "hellip","horizontal ellipsis = three dot leader, U+2026 ISOpub" },
1886
1887 { 8240, "permil","per mille sign, U+2030 ISOtech" },
1888
1889 { 8242, "prime","prime = minutes = feet, U+2032 ISOtech" },
1890 { 8243, "Prime","double prime = seconds = inches, U+2033 ISOtech" },
1891
1892 { 8249, "lsaquo","single left-pointing angle quotation mark, U+2039 ISO proposed" },
1893 { 8250, "rsaquo","single right-pointing angle quotation mark, U+203A ISO proposed" },
1894
1895 { 8254, "oline","overline = spacing overscore, U+203E NEW" },
1896 { 8260, "frasl","fraction slash, U+2044 NEW" },
1897
1898 { 8364, "euro", "euro sign, U+20AC NEW" },
1899
1900 { 8465, "image","blackletter capital I = imaginary part, U+2111 ISOamso" },
1901 { 8472, "weierp","script capital P = power set = Weierstrass p, U+2118 ISOamso" },
1902 { 8476, "real", "blackletter capital R = real part symbol, U+211C ISOamso" },
1903 { 8482, "trade","trade mark sign, U+2122 ISOnum" },
1904 { 8501, "alefsym","alef symbol = first transfinite cardinal, U+2135 NEW" },
1905 { 8592, "larr", "leftwards arrow, U+2190 ISOnum" },
1906 { 8593, "uarr", "upwards arrow, U+2191 ISOnum" },
1907 { 8594, "rarr", "rightwards arrow, U+2192 ISOnum" },
1908 { 8595, "darr", "downwards arrow, U+2193 ISOnum" },
1909 { 8596, "harr", "left right arrow, U+2194 ISOamsa" },
1910 { 8629, "crarr","downwards arrow with corner leftwards = carriage return, U+21B5 NEW" },
1911 { 8656, "lArr", "leftwards double arrow, U+21D0 ISOtech" },
1912 { 8657, "uArr", "upwards double arrow, U+21D1 ISOamsa" },
1913 { 8658, "rArr", "rightwards double arrow, U+21D2 ISOtech" },
1914 { 8659, "dArr", "downwards double arrow, U+21D3 ISOamsa" },
1915 { 8660, "hArr", "left right double arrow, U+21D4 ISOamsa" },
1916
1917 { 8704, "forall","for all, U+2200 ISOtech" },
1918 { 8706, "part", "partial differential, U+2202 ISOtech" },
1919 { 8707, "exist","there exists, U+2203 ISOtech" },
1920 { 8709, "empty","empty set = null set = diameter, U+2205 ISOamso" },
1921 { 8711, "nabla","nabla = backward difference, U+2207 ISOtech" },
1922 { 8712, "isin", "element of, U+2208 ISOtech" },
1923 { 8713, "notin","not an element of, U+2209 ISOtech" },
1924 { 8715, "ni", "contains as member, U+220B ISOtech" },
1925 { 8719, "prod", "n-ary product = product sign, U+220F ISOamsb" },
1926 { 8721, "sum", "n-ary summation, U+2211 ISOamsb" },
1927 { 8722, "minus","minus sign, U+2212 ISOtech" },
1928 { 8727, "lowast","asterisk operator, U+2217 ISOtech" },
1929 { 8730, "radic","square root = radical sign, U+221A ISOtech" },
1930 { 8733, "prop", "proportional to, U+221D ISOtech" },
1931 { 8734, "infin","infinity, U+221E ISOtech" },
1932 { 8736, "ang", "angle, U+2220 ISOamso" },
1933 { 8743, "and", "logical and = wedge, U+2227 ISOtech" },
1934 { 8744, "or", "logical or = vee, U+2228 ISOtech" },
1935 { 8745, "cap", "intersection = cap, U+2229 ISOtech" },
1936 { 8746, "cup", "union = cup, U+222A ISOtech" },
1937 { 8747, "int", "integral, U+222B ISOtech" },
1938 { 8756, "there4","therefore, U+2234 ISOtech" },
1939 { 8764, "sim", "tilde operator = varies with = similar to, U+223C ISOtech" },
1940 { 8773, "cong", "approximately equal to, U+2245 ISOtech" },
1941 { 8776, "asymp","almost equal to = asymptotic to, U+2248 ISOamsr" },
1942 { 8800, "ne", "not equal to, U+2260 ISOtech" },
1943 { 8801, "equiv","identical to, U+2261 ISOtech" },
1944 { 8804, "le", "less-than or equal to, U+2264 ISOtech" },
1945 { 8805, "ge", "greater-than or equal to, U+2265 ISOtech" },
1946 { 8834, "sub", "subset of, U+2282 ISOtech" },
1947 { 8835, "sup", "superset of, U+2283 ISOtech" },
1948 { 8836, "nsub", "not a subset of, U+2284 ISOamsn" },
1949 { 8838, "sube", "subset of or equal to, U+2286 ISOtech" },
1950 { 8839, "supe", "superset of or equal to, U+2287 ISOtech" },
1951 { 8853, "oplus","circled plus = direct sum, U+2295 ISOamsb" },
1952 { 8855, "otimes","circled times = vector product, U+2297 ISOamsb" },
1953 { 8869, "perp", "up tack = orthogonal to = perpendicular, U+22A5 ISOtech" },
1954 { 8901, "sdot", "dot operator, U+22C5 ISOamsb" },
1955 { 8968, "lceil","left ceiling = apl upstile, U+2308 ISOamsc" },
1956 { 8969, "rceil","right ceiling, U+2309 ISOamsc" },
1957 { 8970, "lfloor","left floor = apl downstile, U+230A ISOamsc" },
1958 { 8971, "rfloor","right floor, U+230B ISOamsc" },
1959 { 9001, "lang", "left-pointing angle bracket = bra, U+2329 ISOtech" },
1960 { 9002, "rang", "right-pointing angle bracket = ket, U+232A ISOtech" },
1961 { 9674, "loz", "lozenge, U+25CA ISOpub" },
1962
1963 { 9824, "spades","black spade suit, U+2660 ISOpub" },
1964 { 9827, "clubs","black club suit = shamrock, U+2663 ISOpub" },
1965 { 9829, "hearts","black heart suit = valentine, U+2665 ISOpub" },
1966 { 9830, "diams","black diamond suit, U+2666 ISOpub" },
1967
1968 };
1969
1970 /************************************************************************
1971 * *
1972 * Commodity functions to handle entities *
1973 * *
1974 ************************************************************************/
1975
/*
 * growBuffer:
 * @buffer: the NAME of an xmlChar * variable; a companion variable called
 *          <buffer>_size holding the current allocation size must also
 *          be in scope
 *
 * Macro used to grow the current buffer: <buffer>_size is doubled and
 * @buffer is reallocated to that size with xmlRealloc().
 *
 * On allocation failure the macro reports the error via
 * htmlErrMemory(ctxt), frees the old buffer and executes return(NULL)
 * in the ENCLOSING function. It can therefore only be used inside
 * functions that return a pointer and have a variable named ctxt in
 * scope.
 *
 * Since the storage may move, any pointer into the old buffer has to be
 * recomputed by the caller after the macro has run.
 *
 * NOTE: the expansion is a bare { } block, not a do { } while (0).
 */
#define growBuffer(buffer) {                                            \
    xmlChar *tmp;                                                       \
    buffer##_size *= 2;                                                 \
    tmp = (xmlChar *) xmlRealloc(buffer, buffer##_size);                \
    if (tmp == NULL) {                                                  \
        htmlErrMemory(ctxt);                                            \
        xmlFree(buffer);                                                \
        return(NULL);                                                   \
    }                                                                   \
    buffer = tmp;                                                       \
}
1990
1991 /**
1992 * htmlEntityLookup:
1993 * @name: the entity name
1994 *
1995 * Lookup the given entity in EntitiesTable
1996 *
1997 * TODO: the linear scan is really ugly, an hash table is really needed.
1998 *
1999 * Returns the associated htmlEntityDescPtr if found, NULL otherwise.
2000 */
2001 const htmlEntityDesc *
htmlEntityLookup(const xmlChar * name)2002 htmlEntityLookup(const xmlChar *name) {
2003 unsigned int i;
2004
2005 for (i = 0;i < (sizeof(html40EntitiesTable)/
2006 sizeof(html40EntitiesTable[0]));i++) {
2007 if (xmlStrEqual(name, BAD_CAST html40EntitiesTable[i].name)) {
2008 return((htmlEntityDescPtr) &html40EntitiesTable[i]);
2009 }
2010 }
2011 return(NULL);
2012 }
2013
2014 /**
2015 * htmlEntityValueLookup:
2016 * @value: the entity's unicode value
2017 *
2018 * Lookup the given entity in EntitiesTable
2019 *
2020 * TODO: the linear scan is really ugly, an hash table is really needed.
2021 *
2022 * Returns the associated htmlEntityDescPtr if found, NULL otherwise.
2023 */
2024 const htmlEntityDesc *
htmlEntityValueLookup(unsigned int value)2025 htmlEntityValueLookup(unsigned int value) {
2026 unsigned int i;
2027
2028 for (i = 0;i < (sizeof(html40EntitiesTable)/
2029 sizeof(html40EntitiesTable[0]));i++) {
2030 if (html40EntitiesTable[i].value >= value) {
2031 if (html40EntitiesTable[i].value > value)
2032 break;
2033 return((htmlEntityDescPtr) &html40EntitiesTable[i]);
2034 }
2035 }
2036 return(NULL);
2037 }
2038
2039 /**
2040 * UTF8ToHtml:
2041 * @out: a pointer to an array of bytes to store the result
2042 * @outlen: the length of @out
2043 * @in: a pointer to an array of UTF-8 chars
2044 * @inlen: the length of @in
2045 *
2046 * Take a block of UTF-8 chars in and try to convert it to an ASCII
2047 * plus HTML entities block of chars out.
2048 *
2049 * Returns 0 if success, -2 if the transcoding fails, or -1 otherwise
2050 * The value of @inlen after return is the number of octets consumed
2051 * as the return value is positive, else unpredictable.
2052 * The value of @outlen after return is the number of octets consumed.
2053 */
2054 int
UTF8ToHtml(unsigned char * out,int * outlen,const unsigned char * in,int * inlen)2055 UTF8ToHtml(unsigned char* out, int *outlen,
2056 const unsigned char* in, int *inlen) {
2057 const unsigned char* processed = in;
2058 const unsigned char* outend;
2059 const unsigned char* outstart = out;
2060 const unsigned char* instart = in;
2061 const unsigned char* inend;
2062 unsigned int c, d;
2063 int trailing;
2064
2065 if ((out == NULL) || (outlen == NULL) || (inlen == NULL)) return(-1);
2066 if (in == NULL) {
2067 /*
2068 * initialization nothing to do
2069 */
2070 *outlen = 0;
2071 *inlen = 0;
2072 return(0);
2073 }
2074 inend = in + (*inlen);
2075 outend = out + (*outlen);
2076 while (in < inend) {
2077 d = *in++;
2078 if (d < 0x80) { c= d; trailing= 0; }
2079 else if (d < 0xC0) {
2080 /* trailing byte in leading position */
2081 *outlen = out - outstart;
2082 *inlen = processed - instart;
2083 return(-2);
2084 } else if (d < 0xE0) { c= d & 0x1F; trailing= 1; }
2085 else if (d < 0xF0) { c= d & 0x0F; trailing= 2; }
2086 else if (d < 0xF8) { c= d & 0x07; trailing= 3; }
2087 else {
2088 /* no chance for this in Ascii */
2089 *outlen = out - outstart;
2090 *inlen = processed - instart;
2091 return(-2);
2092 }
2093
2094 if (inend - in < trailing) {
2095 break;
2096 }
2097
2098 for ( ; trailing; trailing--) {
2099 if ((in >= inend) || (((d= *in++) & 0xC0) != 0x80))
2100 break;
2101 c <<= 6;
2102 c |= d & 0x3F;
2103 }
2104
2105 /* assertion: c is a single UTF-4 value */
2106 if (c < 0x80) {
2107 if (out + 1 >= outend)
2108 break;
2109 *out++ = c;
2110 } else {
2111 int len;
2112 const htmlEntityDesc * ent;
2113 const char *cp;
2114 char nbuf[16];
2115
2116 /*
2117 * Try to lookup a predefined HTML entity for it
2118 */
2119
2120 ent = htmlEntityValueLookup(c);
2121 if (ent == NULL) {
2122 snprintf(nbuf, sizeof(nbuf), "#%u", c);
2123 cp = nbuf;
2124 }
2125 else
2126 cp = ent->name;
2127 len = strlen(cp);
2128 if (out + 2 + len >= outend)
2129 break;
2130 *out++ = '&';
2131 memcpy(out, cp, len);
2132 out += len;
2133 *out++ = ';';
2134 }
2135 processed = in;
2136 }
2137 *outlen = out - outstart;
2138 *inlen = processed - instart;
2139 return(0);
2140 }
2141
2142 /**
2143 * htmlEncodeEntities:
2144 * @out: a pointer to an array of bytes to store the result
2145 * @outlen: the length of @out
2146 * @in: a pointer to an array of UTF-8 chars
2147 * @inlen: the length of @in
2148 * @quoteChar: the quote character to escape (' or ") or zero.
2149 *
2150 * Take a block of UTF-8 chars in and try to convert it to an ASCII
2151 * plus HTML entities block of chars out.
2152 *
2153 * Returns 0 if success, -2 if the transcoding fails, or -1 otherwise
2154 * The value of @inlen after return is the number of octets consumed
2155 * as the return value is positive, else unpredictable.
2156 * The value of @outlen after return is the number of octets consumed.
2157 */
2158 int
htmlEncodeEntities(unsigned char * out,int * outlen,const unsigned char * in,int * inlen,int quoteChar)2159 htmlEncodeEntities(unsigned char* out, int *outlen,
2160 const unsigned char* in, int *inlen, int quoteChar) {
2161 const unsigned char* processed = in;
2162 const unsigned char* outend;
2163 const unsigned char* outstart = out;
2164 const unsigned char* instart = in;
2165 const unsigned char* inend;
2166 unsigned int c, d;
2167 int trailing;
2168
2169 if ((out == NULL) || (outlen == NULL) || (inlen == NULL) || (in == NULL))
2170 return(-1);
2171 outend = out + (*outlen);
2172 inend = in + (*inlen);
2173 while (in < inend) {
2174 d = *in++;
2175 if (d < 0x80) { c= d; trailing= 0; }
2176 else if (d < 0xC0) {
2177 /* trailing byte in leading position */
2178 *outlen = out - outstart;
2179 *inlen = processed - instart;
2180 return(-2);
2181 } else if (d < 0xE0) { c= d & 0x1F; trailing= 1; }
2182 else if (d < 0xF0) { c= d & 0x0F; trailing= 2; }
2183 else if (d < 0xF8) { c= d & 0x07; trailing= 3; }
2184 else {
2185 /* no chance for this in Ascii */
2186 *outlen = out - outstart;
2187 *inlen = processed - instart;
2188 return(-2);
2189 }
2190
2191 if (inend - in < trailing)
2192 break;
2193
2194 while (trailing--) {
2195 if (((d= *in++) & 0xC0) != 0x80) {
2196 *outlen = out - outstart;
2197 *inlen = processed - instart;
2198 return(-2);
2199 }
2200 c <<= 6;
2201 c |= d & 0x3F;
2202 }
2203
2204 /* assertion: c is a single UTF-4 value */
2205 if ((c < 0x80) && (c != (unsigned int) quoteChar) &&
2206 (c != '&') && (c != '<') && (c != '>')) {
2207 if (out >= outend)
2208 break;
2209 *out++ = c;
2210 } else {
2211 const htmlEntityDesc * ent;
2212 const char *cp;
2213 char nbuf[16];
2214 int len;
2215
2216 /*
2217 * Try to lookup a predefined HTML entity for it
2218 */
2219 ent = htmlEntityValueLookup(c);
2220 if (ent == NULL) {
2221 snprintf(nbuf, sizeof(nbuf), "#%u", c);
2222 cp = nbuf;
2223 }
2224 else
2225 cp = ent->name;
2226 len = strlen(cp);
2227 if (outend - out < len + 2)
2228 break;
2229 *out++ = '&';
2230 memcpy(out, cp, len);
2231 out += len;
2232 *out++ = ';';
2233 }
2234 processed = in;
2235 }
2236 *outlen = out - outstart;
2237 *inlen = processed - instart;
2238 return(0);
2239 }
2240
2241 /************************************************************************
2242 * *
2243 * Commodity functions, cleanup needed ? *
2244 * *
2245 ************************************************************************/
/*
 * Names of all tags allowing pc data from the html 4.01 loose dtd.
 * areBlanks() compares element names against this list to decide
 * whether a whitespace-only text run has to be kept.
 *
 * NOTE: it might be more appropriate to integrate this information
 *       into the html40ElementTable array but I don't want to risk any
 *       binary incompatibility
 */
static const char *allowPCData[] = {
    "a", "abbr", "acronym", "address", "applet", "b", "bdo", "big",
    "blockquote", "body", "button", "caption", "center", "cite", "code",
    "dd", "del", "dfn", "div", "dt", "em", "font", "form", "h1", "h2",
    "h3", "h4", "h5", "h6", "i", "iframe", "ins", "kbd", "label", "legend",
    "li", "noframes", "noscript", "object", "p", "pre", "q", "s", "samp",
    "small", "span", "strike", "strong", "td", "th", "tt", "u", "var"
};
2260
2261 /**
2262 * areBlanks:
2263 * @ctxt: an HTML parser context
2264 * @str: a xmlChar *
2265 * @len: the size of @str
2266 *
2267 * Is this a sequence of blank chars that one can ignore ?
2268 *
2269 * Returns 1 if ignorable 0 otherwise.
2270 */
2271
areBlanks(htmlParserCtxtPtr ctxt,const xmlChar * str,int len)2272 static int areBlanks(htmlParserCtxtPtr ctxt, const xmlChar *str, int len) {
2273 unsigned int i;
2274 int j;
2275 xmlNodePtr lastChild;
2276 xmlDtdPtr dtd;
2277
2278 for (j = 0;j < len;j++)
2279 if (!(IS_BLANK_CH(str[j]))) return(0);
2280
2281 if (CUR == 0) return(1);
2282 if (CUR != '<') return(0);
2283 if (ctxt->name == NULL)
2284 return(1);
2285 if (xmlStrEqual(ctxt->name, BAD_CAST"html"))
2286 return(1);
2287 if (xmlStrEqual(ctxt->name, BAD_CAST"head"))
2288 return(1);
2289
2290 /* Only strip CDATA children of the body tag for strict HTML DTDs */
2291 if (xmlStrEqual(ctxt->name, BAD_CAST "body") && ctxt->myDoc != NULL) {
2292 dtd = xmlGetIntSubset(ctxt->myDoc);
2293 if (dtd != NULL && dtd->ExternalID != NULL) {
2294 if (!xmlStrcasecmp(dtd->ExternalID, BAD_CAST "-//W3C//DTD HTML 4.01//EN") ||
2295 !xmlStrcasecmp(dtd->ExternalID, BAD_CAST "-//W3C//DTD HTML 4//EN"))
2296 return(1);
2297 }
2298 }
2299
2300 if (ctxt->node == NULL) return(0);
2301 lastChild = xmlGetLastChild(ctxt->node);
2302 while ((lastChild) && (lastChild->type == XML_COMMENT_NODE))
2303 lastChild = lastChild->prev;
2304 if (lastChild == NULL) {
2305 if ((ctxt->node->type != XML_ELEMENT_NODE) &&
2306 (ctxt->node->content != NULL)) return(0);
2307 /* keep ws in constructs like ...<b> </b>...
2308 for all tags "b" allowing PCDATA */
2309 for ( i = 0; i < sizeof(allowPCData)/sizeof(allowPCData[0]); i++ ) {
2310 if ( xmlStrEqual(ctxt->name, BAD_CAST allowPCData[i]) ) {
2311 return(0);
2312 }
2313 }
2314 } else if (xmlNodeIsText(lastChild)) {
2315 return(0);
2316 } else {
2317 /* keep ws in constructs like <p><b>xy</b> <i>z</i><p>
2318 for all tags "p" allowing PCDATA */
2319 for ( i = 0; i < sizeof(allowPCData)/sizeof(allowPCData[0]); i++ ) {
2320 if ( xmlStrEqual(lastChild->name, BAD_CAST allowPCData[i]) ) {
2321 return(0);
2322 }
2323 }
2324 }
2325 return(1);
2326 }
2327
2328 /**
2329 * htmlNewDocNoDtD:
2330 * @URI: URI for the dtd, or NULL
2331 * @ExternalID: the external ID of the DTD, or NULL
2332 *
2333 * Creates a new HTML document without a DTD node if @URI and @ExternalID
2334 * are NULL
2335 *
2336 * Returns a new document, do not initialize the DTD if not provided
2337 */
2338 htmlDocPtr
htmlNewDocNoDtD(const xmlChar * URI,const xmlChar * ExternalID)2339 htmlNewDocNoDtD(const xmlChar *URI, const xmlChar *ExternalID) {
2340 xmlDocPtr cur;
2341
2342 /*
2343 * Allocate a new document and fill the fields.
2344 */
2345 cur = (xmlDocPtr) xmlMalloc(sizeof(xmlDoc));
2346 if (cur == NULL)
2347 return(NULL);
2348 memset(cur, 0, sizeof(xmlDoc));
2349
2350 cur->type = XML_HTML_DOCUMENT_NODE;
2351 cur->version = NULL;
2352 cur->intSubset = NULL;
2353 cur->doc = cur;
2354 cur->name = NULL;
2355 cur->children = NULL;
2356 cur->extSubset = NULL;
2357 cur->oldNs = NULL;
2358 cur->encoding = NULL;
2359 cur->standalone = 1;
2360 cur->compression = 0;
2361 cur->ids = NULL;
2362 cur->refs = NULL;
2363 cur->_private = NULL;
2364 cur->charset = XML_CHAR_ENCODING_UTF8;
2365 cur->properties = XML_DOC_HTML | XML_DOC_USERBUILT;
2366 if ((ExternalID != NULL) ||
2367 (URI != NULL)) {
2368 xmlDtdPtr intSubset;
2369
2370 intSubset = xmlCreateIntSubset(cur, BAD_CAST "html", ExternalID, URI);
2371 if (intSubset == NULL) {
2372 xmlFree(cur);
2373 return(NULL);
2374 }
2375 }
2376 if ((__xmlRegisterCallbacks) && (xmlRegisterNodeDefaultValue))
2377 xmlRegisterNodeDefaultValue((xmlNodePtr)cur);
2378 return(cur);
2379 }
2380
2381 /**
2382 * htmlNewDoc:
2383 * @URI: URI for the dtd, or NULL
2384 * @ExternalID: the external ID of the DTD, or NULL
2385 *
2386 * Creates a new HTML document
2387 *
2388 * Returns a new document
2389 */
2390 htmlDocPtr
htmlNewDoc(const xmlChar * URI,const xmlChar * ExternalID)2391 htmlNewDoc(const xmlChar *URI, const xmlChar *ExternalID) {
2392 if ((URI == NULL) && (ExternalID == NULL))
2393 return(htmlNewDocNoDtD(
2394 BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd",
2395 BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN"));
2396
2397 return(htmlNewDocNoDtD(URI, ExternalID));
2398 }
2399
2400
2401 /************************************************************************
2402 * *
2403 * The parser itself *
2404 * Relates to http://www.w3.org/TR/html40 *
2405 * *
2406 ************************************************************************/
2407
2408 /************************************************************************
2409 * *
2410 * The parser itself *
2411 * *
2412 ************************************************************************/
2413
/*
 * Forward declaration: htmlParseName() falls back to this routine,
 * which is defined after it.
 */
static const xmlChar * htmlParseNameComplex(xmlParserCtxtPtr ctxt);
2415
2416 static void
htmlSkipBogusComment(htmlParserCtxtPtr ctxt)2417 htmlSkipBogusComment(htmlParserCtxtPtr ctxt) {
2418 int c;
2419
2420 htmlParseErr(ctxt, XML_HTML_INCORRECTLY_OPENED_COMMENT,
2421 "Incorrectly opened comment\n", NULL, NULL);
2422
2423 while (PARSER_STOPPED(ctxt) == 0) {
2424 c = CUR;
2425 if (c == 0)
2426 break;
2427 NEXT;
2428 if (c == '>')
2429 break;
2430 }
2431 }
2432
2433 /**
2434 * htmlParseHTMLName:
2435 * @ctxt: an HTML parser context
2436 *
2437 * parse an HTML tag or attribute name, note that we convert it to lowercase
2438 * since HTML names are not case-sensitive.
2439 *
2440 * Returns the Tag Name parsed or NULL
2441 */
2442
2443 static const xmlChar *
htmlParseHTMLName(htmlParserCtxtPtr ctxt)2444 htmlParseHTMLName(htmlParserCtxtPtr ctxt) {
2445 const xmlChar *ret;
2446 int i = 0;
2447 xmlChar loc[HTML_PARSER_BUFFER_SIZE];
2448
2449 if (!IS_ASCII_LETTER(CUR) && (CUR != '_') &&
2450 (CUR != ':') && (CUR != '.')) return(NULL);
2451
2452 while ((i < HTML_PARSER_BUFFER_SIZE) &&
2453 ((IS_ASCII_LETTER(CUR)) || (IS_ASCII_DIGIT(CUR)) ||
2454 (CUR == ':') || (CUR == '-') || (CUR == '_') ||
2455 (CUR == '.'))) {
2456 if ((CUR >= 'A') && (CUR <= 'Z')) loc[i] = CUR + 0x20;
2457 else loc[i] = CUR;
2458 i++;
2459
2460 NEXT;
2461 }
2462
2463 ret = xmlDictLookup(ctxt->dict, loc, i);
2464 if (ret == NULL)
2465 htmlErrMemory(ctxt);
2466
2467 return(ret);
2468 }
2469
2470
2471 /**
2472 * htmlParseHTMLName_nonInvasive:
2473 * @ctxt: an HTML parser context
2474 *
2475 * parse an HTML tag or attribute name, note that we convert it to lowercase
2476 * since HTML names are not case-sensitive, this doesn't consume the data
2477 * from the stream, it's a look-ahead
2478 *
2479 * Returns the Tag Name parsed or NULL
2480 */
2481
2482 static const xmlChar *
htmlParseHTMLName_nonInvasive(htmlParserCtxtPtr ctxt)2483 htmlParseHTMLName_nonInvasive(htmlParserCtxtPtr ctxt) {
2484 int i = 0;
2485 xmlChar loc[HTML_PARSER_BUFFER_SIZE];
2486 const xmlChar *ret;
2487
2488 if (!IS_ASCII_LETTER(NXT(1)) && (NXT(1) != '_') &&
2489 (NXT(1) != ':')) return(NULL);
2490
2491 while ((i < HTML_PARSER_BUFFER_SIZE) &&
2492 ((IS_ASCII_LETTER(NXT(1+i))) || (IS_ASCII_DIGIT(NXT(1+i))) ||
2493 (NXT(1+i) == ':') || (NXT(1+i) == '-') || (NXT(1+i) == '_'))) {
2494 if ((NXT(1+i) >= 'A') && (NXT(1+i) <= 'Z')) loc[i] = NXT(1+i) + 0x20;
2495 else loc[i] = NXT(1+i);
2496 i++;
2497 }
2498
2499 ret = xmlDictLookup(ctxt->dict, loc, i);
2500 if (ret == NULL)
2501 htmlErrMemory(ctxt);
2502
2503 return(ret);
2504 }
2505
2506
2507 /**
2508 * htmlParseName:
2509 * @ctxt: an HTML parser context
2510 *
2511 * parse an HTML name, this routine is case sensitive.
2512 *
2513 * Returns the Name parsed or NULL
2514 */
2515
2516 static const xmlChar *
htmlParseName(htmlParserCtxtPtr ctxt)2517 htmlParseName(htmlParserCtxtPtr ctxt) {
2518 const xmlChar *in;
2519 const xmlChar *ret;
2520 int count = 0;
2521
2522 GROW;
2523
2524 /*
2525 * Accelerator for simple ASCII names
2526 */
2527 in = ctxt->input->cur;
2528 if (((*in >= 0x61) && (*in <= 0x7A)) ||
2529 ((*in >= 0x41) && (*in <= 0x5A)) ||
2530 (*in == '_') || (*in == ':')) {
2531 in++;
2532 while (((*in >= 0x61) && (*in <= 0x7A)) ||
2533 ((*in >= 0x41) && (*in <= 0x5A)) ||
2534 ((*in >= 0x30) && (*in <= 0x39)) ||
2535 (*in == '_') || (*in == '-') ||
2536 (*in == ':') || (*in == '.'))
2537 in++;
2538
2539 if (in == ctxt->input->end)
2540 return(NULL);
2541
2542 if ((*in > 0) && (*in < 0x80)) {
2543 count = in - ctxt->input->cur;
2544 ret = xmlDictLookup(ctxt->dict, ctxt->input->cur, count);
2545 if (ret == NULL)
2546 htmlErrMemory(ctxt);
2547 ctxt->input->cur = in;
2548 ctxt->input->col += count;
2549 return(ret);
2550 }
2551 }
2552 return(htmlParseNameComplex(ctxt));
2553 }
2554
2555 static const xmlChar *
htmlParseNameComplex(xmlParserCtxtPtr ctxt)2556 htmlParseNameComplex(xmlParserCtxtPtr ctxt) {
2557 int len = 0, l;
2558 int c;
2559 int maxLength = (ctxt->options & XML_PARSE_HUGE) ?
2560 XML_MAX_TEXT_LENGTH :
2561 XML_MAX_NAME_LENGTH;
2562 const xmlChar *base = ctxt->input->base;
2563 const xmlChar *ret;
2564
2565 /*
2566 * Handler for more complex cases
2567 */
2568 c = CUR_CHAR(l);
2569 if ((c == ' ') || (c == '>') || (c == '/') || /* accelerators */
2570 (!IS_LETTER(c) && (c != '_') &&
2571 (c != ':'))) {
2572 return(NULL);
2573 }
2574
2575 while ((c != ' ') && (c != '>') && (c != '/') && /* test bigname.xml */
2576 ((IS_LETTER(c)) || (IS_DIGIT(c)) ||
2577 (c == '.') || (c == '-') ||
2578 (c == '_') || (c == ':') ||
2579 (IS_COMBINING(c)) ||
2580 (IS_EXTENDER(c)))) {
2581 len += l;
2582 if (len > maxLength) {
2583 htmlParseErr(ctxt, XML_ERR_NAME_TOO_LONG, "name too long", NULL, NULL);
2584 return(NULL);
2585 }
2586 NEXTL(l);
2587 c = CUR_CHAR(l);
2588 if (ctxt->input->base != base) {
2589 /*
2590 * We changed encoding from an unknown encoding
2591 * Input buffer changed location, so we better start again
2592 */
2593 return(htmlParseNameComplex(ctxt));
2594 }
2595 }
2596
2597 if (ctxt->input->cur - ctxt->input->base < len) {
2598 /* Sanity check */
2599 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
2600 "unexpected change of input buffer", NULL, NULL);
2601 return (NULL);
2602 }
2603
2604 ret = xmlDictLookup(ctxt->dict, ctxt->input->cur - len, len);
2605 if (ret == NULL)
2606 htmlErrMemory(ctxt);
2607
2608 return(ret);
2609 }
2610
2611
2612 /**
2613 * htmlParseHTMLAttribute:
2614 * @ctxt: an HTML parser context
2615 * @stop: a char stop value
2616 *
2617 * parse an HTML attribute value till the stop (quote), if
2618 * stop is 0 then it stops at the first space
2619 *
2620 * Returns the attribute parsed or NULL
2621 */
2622
2623 static xmlChar *
htmlParseHTMLAttribute(htmlParserCtxtPtr ctxt,const xmlChar stop)2624 htmlParseHTMLAttribute(htmlParserCtxtPtr ctxt, const xmlChar stop) {
2625 xmlChar *buffer = NULL;
2626 int buffer_size = 0;
2627 int maxLength = (ctxt->options & XML_PARSE_HUGE) ?
2628 XML_MAX_HUGE_LENGTH :
2629 XML_MAX_TEXT_LENGTH;
2630 xmlChar *out = NULL;
2631 const xmlChar *name = NULL;
2632 const xmlChar *cur = NULL;
2633 const htmlEntityDesc * ent;
2634
2635 /*
2636 * allocate a translation buffer.
2637 */
2638 buffer_size = HTML_PARSER_BUFFER_SIZE;
2639 buffer = (xmlChar *) xmlMallocAtomic(buffer_size);
2640 if (buffer == NULL) {
2641 htmlErrMemory(ctxt);
2642 return(NULL);
2643 }
2644 out = buffer;
2645
2646 /*
2647 * Ok loop until we reach one of the ending chars
2648 */
2649 while ((PARSER_STOPPED(ctxt) == 0) &&
2650 (CUR != 0) && (CUR != stop)) {
2651 if ((stop == 0) && (CUR == '>')) break;
2652 if ((stop == 0) && (IS_BLANK_CH(CUR))) break;
2653 if (CUR == '&') {
2654 if (NXT(1) == '#') {
2655 unsigned int c;
2656 int bits;
2657
2658 c = htmlParseCharRef(ctxt);
2659 if (c < 0x80)
2660 { *out++ = c; bits= -6; }
2661 else if (c < 0x800)
2662 { *out++ =((c >> 6) & 0x1F) | 0xC0; bits= 0; }
2663 else if (c < 0x10000)
2664 { *out++ =((c >> 12) & 0x0F) | 0xE0; bits= 6; }
2665 else
2666 { *out++ =((c >> 18) & 0x07) | 0xF0; bits= 12; }
2667
2668 for ( ; bits >= 0; bits-= 6) {
2669 *out++ = ((c >> bits) & 0x3F) | 0x80;
2670 }
2671
2672 if (out - buffer > buffer_size - 100) {
2673 int indx = out - buffer;
2674
2675 growBuffer(buffer);
2676 out = &buffer[indx];
2677 }
2678 } else {
2679 ent = htmlParseEntityRef(ctxt, &name);
2680 if (name == NULL) {
2681 *out++ = '&';
2682 if (out - buffer > buffer_size - 100) {
2683 int indx = out - buffer;
2684
2685 growBuffer(buffer);
2686 out = &buffer[indx];
2687 }
2688 } else if (ent == NULL) {
2689 *out++ = '&';
2690 cur = name;
2691 while (*cur != 0) {
2692 if (out - buffer > buffer_size - 100) {
2693 int indx = out - buffer;
2694
2695 growBuffer(buffer);
2696 out = &buffer[indx];
2697 }
2698 *out++ = *cur++;
2699 }
2700 } else {
2701 unsigned int c;
2702 int bits;
2703
2704 if (out - buffer > buffer_size - 100) {
2705 int indx = out - buffer;
2706
2707 growBuffer(buffer);
2708 out = &buffer[indx];
2709 }
2710 c = ent->value;
2711 if (c < 0x80)
2712 { *out++ = c; bits= -6; }
2713 else if (c < 0x800)
2714 { *out++ =((c >> 6) & 0x1F) | 0xC0; bits= 0; }
2715 else if (c < 0x10000)
2716 { *out++ =((c >> 12) & 0x0F) | 0xE0; bits= 6; }
2717 else
2718 { *out++ =((c >> 18) & 0x07) | 0xF0; bits= 12; }
2719
2720 for ( ; bits >= 0; bits-= 6) {
2721 *out++ = ((c >> bits) & 0x3F) | 0x80;
2722 }
2723 }
2724 }
2725 } else {
2726 unsigned int c;
2727 int bits, l;
2728
2729 if (out - buffer > buffer_size - 100) {
2730 int indx = out - buffer;
2731
2732 growBuffer(buffer);
2733 out = &buffer[indx];
2734 }
2735 c = CUR_CHAR(l);
2736 if (c < 0x80)
2737 { *out++ = c; bits= -6; }
2738 else if (c < 0x800)
2739 { *out++ =((c >> 6) & 0x1F) | 0xC0; bits= 0; }
2740 else if (c < 0x10000)
2741 { *out++ =((c >> 12) & 0x0F) | 0xE0; bits= 6; }
2742 else
2743 { *out++ =((c >> 18) & 0x07) | 0xF0; bits= 12; }
2744
2745 for ( ; bits >= 0; bits-= 6) {
2746 *out++ = ((c >> bits) & 0x3F) | 0x80;
2747 }
2748 NEXTL(l);
2749 }
2750 if (out - buffer > maxLength) {
2751 htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_NOT_FINISHED,
2752 "attribute value too long\n", NULL, NULL);
2753 xmlFree(buffer);
2754 return(NULL);
2755 }
2756 }
2757 *out = 0;
2758 return(buffer);
2759 }
2760
2761 /**
2762 * htmlParseEntityRef:
2763 * @ctxt: an HTML parser context
2764 * @str: location to store the entity name
2765 *
2766 * DEPRECATED: Internal function, don't use.
2767 *
2768 * parse an HTML ENTITY references
2769 *
2770 * [68] EntityRef ::= '&' Name ';'
2771 *
2772 * Returns the associated htmlEntityDescPtr if found, or NULL otherwise,
2773 * if non-NULL *str will have to be freed by the caller.
2774 */
2775 const htmlEntityDesc *
htmlParseEntityRef(htmlParserCtxtPtr ctxt,const xmlChar ** str)2776 htmlParseEntityRef(htmlParserCtxtPtr ctxt, const xmlChar **str) {
2777 const xmlChar *name;
2778 const htmlEntityDesc * ent = NULL;
2779
2780 if (str != NULL) *str = NULL;
2781 if ((ctxt == NULL) || (ctxt->input == NULL)) return(NULL);
2782
2783 if (CUR == '&') {
2784 NEXT;
2785 name = htmlParseName(ctxt);
2786 if (name == NULL) {
2787 htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
2788 "htmlParseEntityRef: no name\n", NULL, NULL);
2789 } else {
2790 GROW;
2791 if (CUR == ';') {
2792 if (str != NULL)
2793 *str = name;
2794
2795 /*
2796 * Lookup the entity in the table.
2797 */
2798 ent = htmlEntityLookup(name);
2799 if (ent != NULL) /* OK that's ugly !!! */
2800 NEXT;
2801 } else {
2802 htmlParseErr(ctxt, XML_ERR_ENTITYREF_SEMICOL_MISSING,
2803 "htmlParseEntityRef: expecting ';'\n",
2804 NULL, NULL);
2805 if (str != NULL)
2806 *str = name;
2807 }
2808 }
2809 }
2810 return(ent);
2811 }
2812
2813 /**
2814 * htmlParseAttValue:
2815 * @ctxt: an HTML parser context
2816 *
2817 * parse a value for an attribute
2818 * Note: the parser won't do substitution of entities here, this
2819 * will be handled later in xmlStringGetNodeList, unless it was
2820 * asked for ctxt->replaceEntities != 0
2821 *
2822 * Returns the AttValue parsed or NULL.
2823 */
2824
2825 static xmlChar *
htmlParseAttValue(htmlParserCtxtPtr ctxt)2826 htmlParseAttValue(htmlParserCtxtPtr ctxt) {
2827 xmlChar *ret = NULL;
2828
2829 if (CUR == '"') {
2830 NEXT;
2831 ret = htmlParseHTMLAttribute(ctxt, '"');
2832 if (CUR != '"') {
2833 htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_NOT_FINISHED,
2834 "AttValue: \" expected\n", NULL, NULL);
2835 } else
2836 NEXT;
2837 } else if (CUR == '\'') {
2838 NEXT;
2839 ret = htmlParseHTMLAttribute(ctxt, '\'');
2840 if (CUR != '\'') {
2841 htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_NOT_FINISHED,
2842 "AttValue: ' expected\n", NULL, NULL);
2843 } else
2844 NEXT;
2845 } else {
2846 /*
2847 * That's an HTMLism, the attribute value may not be quoted
2848 */
2849 ret = htmlParseHTMLAttribute(ctxt, 0);
2850 if (ret == NULL) {
2851 htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_WITHOUT_VALUE,
2852 "AttValue: no value found\n", NULL, NULL);
2853 }
2854 }
2855 return(ret);
2856 }
2857
2858 /**
2859 * htmlParseSystemLiteral:
2860 * @ctxt: an HTML parser context
2861 *
2862 * parse an HTML Literal
2863 *
2864 * [11] SystemLiteral ::= ('"' [^"]* '"') | ("'" [^']* "'")
2865 *
2866 * Returns the SystemLiteral parsed or NULL
2867 */
2868
2869 static xmlChar *
htmlParseSystemLiteral(htmlParserCtxtPtr ctxt)2870 htmlParseSystemLiteral(htmlParserCtxtPtr ctxt) {
2871 size_t len = 0, startPosition = 0;
2872 int err = 0;
2873 int quote;
2874 xmlChar *ret = NULL;
2875
2876 if ((CUR != '"') && (CUR != '\'')) {
2877 htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_STARTED,
2878 "SystemLiteral \" or ' expected\n", NULL, NULL);
2879 return(NULL);
2880 }
2881 quote = CUR;
2882 NEXT;
2883
2884 if (CUR_PTR < BASE_PTR)
2885 return(ret);
2886 startPosition = CUR_PTR - BASE_PTR;
2887
2888 while ((PARSER_STOPPED(ctxt) == 0) &&
2889 (CUR != 0) && (CUR != quote)) {
2890 /* TODO: Handle UTF-8 */
2891 if (!IS_CHAR_CH(CUR)) {
2892 htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
2893 "Invalid char in SystemLiteral 0x%X\n", CUR);
2894 err = 1;
2895 }
2896 NEXT;
2897 len++;
2898 }
2899 if (CUR != quote) {
2900 htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED,
2901 "Unfinished SystemLiteral\n", NULL, NULL);
2902 } else {
2903 if (err == 0) {
2904 ret = xmlStrndup((BASE_PTR+startPosition), len);
2905 if (ret == NULL) {
2906 htmlErrMemory(ctxt);
2907 return(NULL);
2908 }
2909 }
2910 NEXT;
2911 }
2912
2913 return(ret);
2914 }
2915
2916 /**
2917 * htmlParsePubidLiteral:
2918 * @ctxt: an HTML parser context
2919 *
2920 * parse an HTML public literal
2921 *
2922 * [12] PubidLiteral ::= '"' PubidChar* '"' | "'" (PubidChar - "'")* "'"
2923 *
2924 * Returns the PubidLiteral parsed or NULL.
2925 */
2926
2927 static xmlChar *
htmlParsePubidLiteral(htmlParserCtxtPtr ctxt)2928 htmlParsePubidLiteral(htmlParserCtxtPtr ctxt) {
2929 size_t len = 0, startPosition = 0;
2930 int err = 0;
2931 int quote;
2932 xmlChar *ret = NULL;
2933
2934 if ((CUR != '"') && (CUR != '\'')) {
2935 htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_STARTED,
2936 "PubidLiteral \" or ' expected\n", NULL, NULL);
2937 return(NULL);
2938 }
2939 quote = CUR;
2940 NEXT;
2941
2942 /*
2943 * Name ::= (Letter | '_') (NameChar)*
2944 */
2945 if (CUR_PTR < BASE_PTR)
2946 return(ret);
2947 startPosition = CUR_PTR - BASE_PTR;
2948
2949 while ((PARSER_STOPPED(ctxt) == 0) &&
2950 (CUR != 0) && (CUR != quote)) {
2951 if (!IS_PUBIDCHAR_CH(CUR)) {
2952 htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
2953 "Invalid char in PubidLiteral 0x%X\n", CUR);
2954 err = 1;
2955 }
2956 len++;
2957 NEXT;
2958 }
2959
2960 if (CUR != quote) {
2961 htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED,
2962 "Unfinished PubidLiteral\n", NULL, NULL);
2963 } else {
2964 if (err == 0) {
2965 ret = xmlStrndup((BASE_PTR + startPosition), len);
2966 if (ret == NULL) {
2967 htmlErrMemory(ctxt);
2968 return(NULL);
2969 }
2970 }
2971 NEXT;
2972 }
2973
2974 return(ret);
2975 }
2976
2977 /**
2978 * htmlParseScript:
2979 * @ctxt: an HTML parser context
2980 *
2981 * parse the content of an HTML SCRIPT or STYLE element
2982 * http://www.w3.org/TR/html4/sgml/dtd.html#Script
2983 * http://www.w3.org/TR/html4/sgml/dtd.html#StyleSheet
2984 * http://www.w3.org/TR/html4/types.html#type-script
2985 * http://www.w3.org/TR/html4/types.html#h-6.15
2986 * http://www.w3.org/TR/html4/appendix/notes.html#h-B.3.2.1
2987 *
2988 * Script data ( %Script; in the DTD) can be the content of the SCRIPT
2989 * element and the value of intrinsic event attributes. User agents must
2990 * not evaluate script data as HTML markup but instead must pass it on as
2991 * data to a script engine.
2992 * NOTES:
2993 * - The content is passed like CDATA
2994 * - the attributes for style and scripting "onXXX" are also described
2995 * as CDATA but SGML allows entities references in attributes so their
2996 * processing is identical as other attributes
2997 */
2998 static void
htmlParseScript(htmlParserCtxtPtr ctxt)2999 htmlParseScript(htmlParserCtxtPtr ctxt) {
3000 xmlChar buf[HTML_PARSER_BIG_BUFFER_SIZE + 5];
3001 int nbchar = 0;
3002 int cur,l;
3003
3004 cur = CUR_CHAR(l);
3005 while (cur != 0) {
3006 if ((cur == '<') && (NXT(1) == '/')) {
3007 /*
3008 * One should break here, the specification is clear:
3009 * Authors should therefore escape "</" within the content.
3010 * Escape mechanisms are specific to each scripting or
3011 * style sheet language.
3012 *
3013 * In recovery mode, only break if end tag match the
3014 * current tag, effectively ignoring all tags inside the
3015 * script/style block and treating the entire block as
3016 * CDATA.
3017 */
3018 if (ctxt->recovery) {
3019 if (xmlStrncasecmp(ctxt->name, ctxt->input->cur+2,
3020 xmlStrlen(ctxt->name)) == 0)
3021 {
3022 break; /* while */
3023 } else {
3024 htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,
3025 "Element %s embeds close tag\n",
3026 ctxt->name, NULL);
3027 }
3028 } else {
3029 if (((NXT(2) >= 'A') && (NXT(2) <= 'Z')) ||
3030 ((NXT(2) >= 'a') && (NXT(2) <= 'z')))
3031 {
3032 break; /* while */
3033 }
3034 }
3035 }
3036 if (IS_CHAR(cur)) {
3037 COPY_BUF(l,buf,nbchar,cur);
3038 } else {
3039 htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
3040 "Invalid char in CDATA 0x%X\n", cur);
3041 }
3042 NEXTL(l);
3043 if (nbchar >= HTML_PARSER_BIG_BUFFER_SIZE) {
3044 buf[nbchar] = 0;
3045 if (ctxt->sax->cdataBlock!= NULL) {
3046 /*
3047 * Insert as CDATA, which is the same as HTML_PRESERVE_NODE
3048 */
3049 ctxt->sax->cdataBlock(ctxt->userData, buf, nbchar);
3050 } else if (ctxt->sax->characters != NULL) {
3051 ctxt->sax->characters(ctxt->userData, buf, nbchar);
3052 }
3053 nbchar = 0;
3054 SHRINK;
3055 }
3056 cur = CUR_CHAR(l);
3057 }
3058
3059 if ((nbchar != 0) && (ctxt->sax != NULL) && (!ctxt->disableSAX)) {
3060 buf[nbchar] = 0;
3061 if (ctxt->sax->cdataBlock!= NULL) {
3062 /*
3063 * Insert as CDATA, which is the same as HTML_PRESERVE_NODE
3064 */
3065 ctxt->sax->cdataBlock(ctxt->userData, buf, nbchar);
3066 } else if (ctxt->sax->characters != NULL) {
3067 ctxt->sax->characters(ctxt->userData, buf, nbchar);
3068 }
3069 }
3070 }
3071
3072
3073 /**
3074 * htmlParseCharDataInternal:
3075 * @ctxt: an HTML parser context
3076 * @readahead: optional read ahead character in ascii range
3077 *
3078 * parse a CharData section.
3079 * if we are within a CDATA section ']]>' marks an end of section.
3080 *
3081 * [14] CharData ::= [^<&]* - ([^<&]* ']]>' [^<&]*)
3082 */
3083
3084 static void
htmlParseCharDataInternal(htmlParserCtxtPtr ctxt,int readahead)3085 htmlParseCharDataInternal(htmlParserCtxtPtr ctxt, int readahead) {
3086 xmlChar buf[HTML_PARSER_BIG_BUFFER_SIZE + 6];
3087 int nbchar = 0;
3088 int cur, l;
3089
3090 if (readahead)
3091 buf[nbchar++] = readahead;
3092
3093 cur = CUR_CHAR(l);
3094 while ((cur != '<') &&
3095 (cur != '&') &&
3096 (cur != 0) &&
3097 (!PARSER_STOPPED(ctxt))) {
3098 if (!(IS_CHAR(cur))) {
3099 htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
3100 "Invalid char in CDATA 0x%X\n", cur);
3101 } else {
3102 COPY_BUF(l,buf,nbchar,cur);
3103 }
3104 NEXTL(l);
3105 if (nbchar >= HTML_PARSER_BIG_BUFFER_SIZE) {
3106 buf[nbchar] = 0;
3107
3108 /*
3109 * Ok the segment is to be consumed as chars.
3110 */
3111 if ((ctxt->sax != NULL) && (!ctxt->disableSAX)) {
3112 if (areBlanks(ctxt, buf, nbchar)) {
3113 if (ctxt->keepBlanks) {
3114 if (ctxt->sax->characters != NULL)
3115 ctxt->sax->characters(ctxt->userData, buf, nbchar);
3116 } else {
3117 if (ctxt->sax->ignorableWhitespace != NULL)
3118 ctxt->sax->ignorableWhitespace(ctxt->userData,
3119 buf, nbchar);
3120 }
3121 } else {
3122 htmlCheckParagraph(ctxt);
3123 if (ctxt->sax->characters != NULL)
3124 ctxt->sax->characters(ctxt->userData, buf, nbchar);
3125 }
3126 }
3127 nbchar = 0;
3128 SHRINK;
3129 }
3130 cur = CUR_CHAR(l);
3131 }
3132 if (nbchar != 0) {
3133 buf[nbchar] = 0;
3134
3135 /*
3136 * Ok the segment is to be consumed as chars.
3137 */
3138 if ((ctxt->sax != NULL) && (!ctxt->disableSAX)) {
3139 if (areBlanks(ctxt, buf, nbchar)) {
3140 if (ctxt->keepBlanks) {
3141 if (ctxt->sax->characters != NULL)
3142 ctxt->sax->characters(ctxt->userData, buf, nbchar);
3143 } else {
3144 if (ctxt->sax->ignorableWhitespace != NULL)
3145 ctxt->sax->ignorableWhitespace(ctxt->userData,
3146 buf, nbchar);
3147 }
3148 } else {
3149 htmlCheckParagraph(ctxt);
3150 if (ctxt->sax->characters != NULL)
3151 ctxt->sax->characters(ctxt->userData, buf, nbchar);
3152 }
3153 }
3154 }
3155 }
3156
3157 /**
3158 * htmlParseCharData:
3159 * @ctxt: an HTML parser context
3160 *
3161 * parse a CharData section.
3162 * if we are within a CDATA section ']]>' marks an end of section.
3163 *
3164 * [14] CharData ::= [^<&]* - ([^<&]* ']]>' [^<&]*)
3165 */
3166
3167 static void
htmlParseCharData(htmlParserCtxtPtr ctxt)3168 htmlParseCharData(htmlParserCtxtPtr ctxt) {
3169 htmlParseCharDataInternal(ctxt, 0);
3170 }
3171
3172 /**
3173 * htmlParseExternalID:
3174 * @ctxt: an HTML parser context
3175 * @publicID: a xmlChar** receiving PubidLiteral
3176 *
3177 * Parse an External ID or a Public ID
3178 *
3179 * [75] ExternalID ::= 'SYSTEM' S SystemLiteral
3180 * | 'PUBLIC' S PubidLiteral S SystemLiteral
3181 *
3182 * [83] PublicID ::= 'PUBLIC' S PubidLiteral
3183 *
3184 * Returns the function returns SystemLiteral and in the second
3185 * case publicID receives PubidLiteral, is strict is off
3186 * it is possible to return NULL and have publicID set.
3187 */
3188
3189 static xmlChar *
htmlParseExternalID(htmlParserCtxtPtr ctxt,xmlChar ** publicID)3190 htmlParseExternalID(htmlParserCtxtPtr ctxt, xmlChar **publicID) {
3191 xmlChar *URI = NULL;
3192
3193 if ((UPPER == 'S') && (UPP(1) == 'Y') &&
3194 (UPP(2) == 'S') && (UPP(3) == 'T') &&
3195 (UPP(4) == 'E') && (UPP(5) == 'M')) {
3196 SKIP(6);
3197 if (!IS_BLANK_CH(CUR)) {
3198 htmlParseErr(ctxt, XML_ERR_SPACE_REQUIRED,
3199 "Space required after 'SYSTEM'\n", NULL, NULL);
3200 }
3201 SKIP_BLANKS;
3202 URI = htmlParseSystemLiteral(ctxt);
3203 if (URI == NULL) {
3204 htmlParseErr(ctxt, XML_ERR_URI_REQUIRED,
3205 "htmlParseExternalID: SYSTEM, no URI\n", NULL, NULL);
3206 }
3207 } else if ((UPPER == 'P') && (UPP(1) == 'U') &&
3208 (UPP(2) == 'B') && (UPP(3) == 'L') &&
3209 (UPP(4) == 'I') && (UPP(5) == 'C')) {
3210 SKIP(6);
3211 if (!IS_BLANK_CH(CUR)) {
3212 htmlParseErr(ctxt, XML_ERR_SPACE_REQUIRED,
3213 "Space required after 'PUBLIC'\n", NULL, NULL);
3214 }
3215 SKIP_BLANKS;
3216 *publicID = htmlParsePubidLiteral(ctxt);
3217 if (*publicID == NULL) {
3218 htmlParseErr(ctxt, XML_ERR_PUBID_REQUIRED,
3219 "htmlParseExternalID: PUBLIC, no Public Identifier\n",
3220 NULL, NULL);
3221 }
3222 SKIP_BLANKS;
3223 if ((CUR == '"') || (CUR == '\'')) {
3224 URI = htmlParseSystemLiteral(ctxt);
3225 }
3226 }
3227 return(URI);
3228 }
3229
3230 /**
3231 * htmlParsePI:
3232 * @ctxt: an HTML parser context
3233 *
3234 * Parse an XML Processing Instruction. HTML5 doesn't allow processing
3235 * instructions, so this will be removed at some point.
3236 */
3237 static void
htmlParsePI(htmlParserCtxtPtr ctxt)3238 htmlParsePI(htmlParserCtxtPtr ctxt) {
3239 xmlChar *buf = NULL;
3240 int len = 0;
3241 int size = HTML_PARSER_BUFFER_SIZE;
3242 int cur, l;
3243 int maxLength = (ctxt->options & XML_PARSE_HUGE) ?
3244 XML_MAX_HUGE_LENGTH :
3245 XML_MAX_TEXT_LENGTH;
3246 const xmlChar *target;
3247 xmlParserInputState state;
3248
3249 if ((RAW == '<') && (NXT(1) == '?')) {
3250 state = ctxt->instate;
3251 ctxt->instate = XML_PARSER_PI;
3252 /*
3253 * this is a Processing Instruction.
3254 */
3255 SKIP(2);
3256
3257 /*
3258 * Parse the target name and check for special support like
3259 * namespace.
3260 */
3261 target = htmlParseName(ctxt);
3262 if (target != NULL) {
3263 if (RAW == '>') {
3264 SKIP(1);
3265
3266 /*
3267 * SAX: PI detected.
3268 */
3269 if ((ctxt->sax) && (!ctxt->disableSAX) &&
3270 (ctxt->sax->processingInstruction != NULL))
3271 ctxt->sax->processingInstruction(ctxt->userData,
3272 target, NULL);
3273 goto done;
3274 }
3275 buf = (xmlChar *) xmlMallocAtomic(size);
3276 if (buf == NULL) {
3277 htmlErrMemory(ctxt);
3278 return;
3279 }
3280 cur = CUR;
3281 if (!IS_BLANK(cur)) {
3282 htmlParseErr(ctxt, XML_ERR_SPACE_REQUIRED,
3283 "ParsePI: PI %s space expected\n", target, NULL);
3284 }
3285 SKIP_BLANKS;
3286 cur = CUR_CHAR(l);
3287 while ((cur != 0) && (cur != '>')) {
3288 if (len + 5 >= size) {
3289 xmlChar *tmp;
3290
3291 size *= 2;
3292 tmp = (xmlChar *) xmlRealloc(buf, size);
3293 if (tmp == NULL) {
3294 htmlErrMemory(ctxt);
3295 xmlFree(buf);
3296 return;
3297 }
3298 buf = tmp;
3299 }
3300 if (IS_CHAR(cur)) {
3301 COPY_BUF(l,buf,len,cur);
3302 } else {
3303 htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
3304 "Invalid char in processing instruction "
3305 "0x%X\n", cur);
3306 }
3307 if (len > maxLength) {
3308 htmlParseErr(ctxt, XML_ERR_PI_NOT_FINISHED,
3309 "PI %s too long", target, NULL);
3310 xmlFree(buf);
3311 goto done;
3312 }
3313 NEXTL(l);
3314 cur = CUR_CHAR(l);
3315 }
3316 buf[len] = 0;
3317 if (cur != '>') {
3318 htmlParseErr(ctxt, XML_ERR_PI_NOT_FINISHED,
3319 "ParsePI: PI %s never end ...\n", target, NULL);
3320 } else {
3321 SKIP(1);
3322
3323 /*
3324 * SAX: PI detected.
3325 */
3326 if ((ctxt->sax) && (!ctxt->disableSAX) &&
3327 (ctxt->sax->processingInstruction != NULL))
3328 ctxt->sax->processingInstruction(ctxt->userData,
3329 target, buf);
3330 }
3331 xmlFree(buf);
3332 } else {
3333 htmlParseErr(ctxt, XML_ERR_PI_NOT_STARTED,
3334 "PI is not started correctly", NULL, NULL);
3335 }
3336
3337 done:
3338 ctxt->instate = state;
3339 }
3340 }
3341
/**
 * htmlParseComment:
 * @ctxt:  an HTML parser context
 *
 * Parse an HTML comment. The input must be positioned on "<!--",
 * otherwise nothing is done. The comment text, without the
 * delimiters, is passed to the comment SAX callback.
 *
 * Besides the regular "-->" terminator, the forms "<!-->", "<!--->"
 * and a closing "--!>" are accepted after raising an error.
 *
 * The scanner keeps the last three characters read in q, r and cur
 * (oldest first). A character is only copied to the buffer when it
 * is shifted out of q, so the "--" in front of the terminator never
 * reaches the buffer.
 */
static void
htmlParseComment(htmlParserCtxtPtr ctxt) {
    xmlChar *buf = NULL;
    int len;
    int size = HTML_PARSER_BUFFER_SIZE;
    int q, ql;      /* oldest character of the window and its length */
    int r, rl;      /* middle character of the window and its length */
    int cur, l;     /* newest character of the window and its length */
    int next, nl;   /* look-ahead used to detect "--!>" */
    int maxLength = (ctxt->options & XML_PARSE_HUGE) ?
                    XML_MAX_HUGE_LENGTH :
                    XML_MAX_TEXT_LENGTH;
    xmlParserInputState state;

    /*
     * Check that there is a comment right here.
     */
    if ((RAW != '<') || (NXT(1) != '!') ||
        (NXT(2) != '-') || (NXT(3) != '-')) return;

    state = ctxt->instate;
    ctxt->instate = XML_PARSER_COMMENT;
    SKIP(4);
    buf = (xmlChar *) xmlMallocAtomic(size);
    if (buf == NULL) {
        /* NOTE(review): ctxt->instate is not restored on this path */
        htmlErrMemory(ctxt);
        return;
    }
    len = 0;
    buf[len] = 0;
    q = CUR_CHAR(ql);
    if (q == 0)
        goto unfinished;
    if (q == '>') {
        /* "<!-->": empty comment, the '>' is consumed at "finished" */
        htmlParseErr(ctxt, XML_ERR_COMMENT_ABRUPTLY_ENDED, "Comment abruptly ended", NULL, NULL);
        cur = '>';
        goto finished;
    }
    NEXTL(ql);
    r = CUR_CHAR(rl);
    if (r == 0)
        goto unfinished;
    if (q == '-' && r == '>') {
        /* "<!--->": empty comment, the '>' is consumed at "finished" */
        htmlParseErr(ctxt, XML_ERR_COMMENT_ABRUPTLY_ENDED, "Comment abruptly ended", NULL, NULL);
        cur = '>';
        goto finished;
    }
    NEXTL(rl);
    cur = CUR_CHAR(l);
    /* loop until the window holds "-->" or the input is exhausted */
    while ((cur != 0) &&
           ((cur != '>') ||
            (r != '-') || (q != '-'))) {
        NEXTL(l);
        next = CUR_CHAR(nl);

        if ((q == '-') && (r == '-') && (cur == '!') && (next == '>')) {
            /*
             * The '!' has been consumed above; setting cur lets the
             * code at "finished" consume the '>'.
             */
            htmlParseErr(ctxt, XML_ERR_COMMENT_NOT_FINISHED,
                         "Comment incorrectly closed by '--!>'", NULL, NULL);
            cur = '>';
            break;
        }

        /* keep room for one multi-byte character plus the terminator */
        if (len + 5 >= size) {
            xmlChar *tmp;

            size *= 2;
            tmp = (xmlChar *) xmlRealloc(buf, size);
            if (tmp == NULL) {
                /* NOTE(review): ctxt->instate is not restored here */
                xmlFree(buf);
                htmlErrMemory(ctxt);
                return;
            }
            buf = tmp;
        }
        /* q leaves the window: this is the character that is stored */
        if (IS_CHAR(q)) {
            COPY_BUF(ql,buf,len,q);
        } else {
            htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
                            "Invalid char in comment 0x%X\n", q);
        }
        if (len > maxLength) {
            htmlParseErr(ctxt, XML_ERR_COMMENT_NOT_FINISHED,
                         "comment too long", NULL, NULL);
            xmlFree(buf);
            ctxt->instate = state;
            return;
        }

        /* slide the window by one character */
        q = r;
        ql = rl;
        r = cur;
        rl = l;
        cur = next;
        l = nl;
    }
finished:
    buf[len] = 0;
    if (cur == '>') {
        /* consume the closing '>' and report the comment */
        NEXT;
        if ((ctxt->sax != NULL) && (ctxt->sax->comment != NULL) &&
            (!ctxt->disableSAX))
            ctxt->sax->comment(ctxt->userData, buf);
        xmlFree(buf);
        ctxt->instate = state;
        return;
    }

    /* reached by goto, or by falling through when the input ran out */
unfinished:
    /* NOTE(review): ctxt->instate is not restored on this path */
    htmlParseErr(ctxt, XML_ERR_COMMENT_NOT_FINISHED,
                 "Comment not terminated \n<!--%.50s\n", buf, NULL);
    xmlFree(buf);
}
3460
3461 /**
3462 * htmlParseCharRef:
3463 * @ctxt: an HTML parser context
3464 *
3465 * DEPRECATED: Internal function, don't use.
3466 *
3467 * parse Reference declarations
3468 *
3469 * [66] CharRef ::= '&#' [0-9]+ ';' |
3470 * '&#x' [0-9a-fA-F]+ ';'
3471 *
3472 * Returns the value parsed (as an int)
3473 */
3474 int
htmlParseCharRef(htmlParserCtxtPtr ctxt)3475 htmlParseCharRef(htmlParserCtxtPtr ctxt) {
3476 int val = 0;
3477
3478 if ((ctxt == NULL) || (ctxt->input == NULL))
3479 return(0);
3480 if ((CUR == '&') && (NXT(1) == '#') &&
3481 ((NXT(2) == 'x') || NXT(2) == 'X')) {
3482 SKIP(3);
3483 while (CUR != ';') {
3484 if ((CUR >= '0') && (CUR <= '9')) {
3485 if (val < 0x110000)
3486 val = val * 16 + (CUR - '0');
3487 } else if ((CUR >= 'a') && (CUR <= 'f')) {
3488 if (val < 0x110000)
3489 val = val * 16 + (CUR - 'a') + 10;
3490 } else if ((CUR >= 'A') && (CUR <= 'F')) {
3491 if (val < 0x110000)
3492 val = val * 16 + (CUR - 'A') + 10;
3493 } else {
3494 htmlParseErr(ctxt, XML_ERR_INVALID_HEX_CHARREF,
3495 "htmlParseCharRef: missing semicolon\n",
3496 NULL, NULL);
3497 break;
3498 }
3499 NEXT;
3500 }
3501 if (CUR == ';')
3502 NEXT;
3503 } else if ((CUR == '&') && (NXT(1) == '#')) {
3504 SKIP(2);
3505 while (CUR != ';') {
3506 if ((CUR >= '0') && (CUR <= '9')) {
3507 if (val < 0x110000)
3508 val = val * 10 + (CUR - '0');
3509 } else {
3510 htmlParseErr(ctxt, XML_ERR_INVALID_DEC_CHARREF,
3511 "htmlParseCharRef: missing semicolon\n",
3512 NULL, NULL);
3513 break;
3514 }
3515 NEXT;
3516 }
3517 if (CUR == ';')
3518 NEXT;
3519 } else {
3520 htmlParseErr(ctxt, XML_ERR_INVALID_CHARREF,
3521 "htmlParseCharRef: invalid value\n", NULL, NULL);
3522 }
3523 /*
3524 * Check the value IS_CHAR ...
3525 */
3526 if (IS_CHAR(val)) {
3527 return(val);
3528 } else if (val >= 0x110000) {
3529 htmlParseErr(ctxt, XML_ERR_INVALID_CHAR,
3530 "htmlParseCharRef: value too large\n", NULL, NULL);
3531 } else {
3532 htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
3533 "htmlParseCharRef: invalid xmlChar value %d\n",
3534 val);
3535 }
3536 return(0);
3537 }
3538
3539
3540 /**
3541 * htmlParseDocTypeDecl:
3542 * @ctxt: an HTML parser context
3543 *
3544 * parse a DOCTYPE declaration
3545 *
3546 * [28] doctypedecl ::= '<!DOCTYPE' S Name (S ExternalID)? S?
3547 * ('[' (markupdecl | PEReference | S)* ']' S?)? '>'
3548 */
3549
3550 static void
htmlParseDocTypeDecl(htmlParserCtxtPtr ctxt)3551 htmlParseDocTypeDecl(htmlParserCtxtPtr ctxt) {
3552 const xmlChar *name;
3553 xmlChar *ExternalID = NULL;
3554 xmlChar *URI = NULL;
3555
3556 /*
3557 * We know that '<!DOCTYPE' has been detected.
3558 */
3559 SKIP(9);
3560
3561 SKIP_BLANKS;
3562
3563 /*
3564 * Parse the DOCTYPE name.
3565 */
3566 name = htmlParseName(ctxt);
3567 if (name == NULL) {
3568 htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
3569 "htmlParseDocTypeDecl : no DOCTYPE name !\n",
3570 NULL, NULL);
3571 }
3572 /*
3573 * Check that upper(name) == "HTML" !!!!!!!!!!!!!
3574 */
3575
3576 SKIP_BLANKS;
3577
3578 /*
3579 * Check for SystemID and ExternalID
3580 */
3581 URI = htmlParseExternalID(ctxt, &ExternalID);
3582 SKIP_BLANKS;
3583
3584 /*
3585 * We should be at the end of the DOCTYPE declaration.
3586 */
3587 if (CUR != '>') {
3588 htmlParseErr(ctxt, XML_ERR_DOCTYPE_NOT_FINISHED,
3589 "DOCTYPE improperly terminated\n", NULL, NULL);
3590 /* Ignore bogus content */
3591 while ((CUR != 0) && (CUR != '>') &&
3592 (PARSER_STOPPED(ctxt) == 0))
3593 NEXT;
3594 }
3595 if (CUR == '>')
3596 NEXT;
3597
3598 /*
3599 * Create or update the document accordingly to the DOCTYPE
3600 */
3601 if ((ctxt->sax != NULL) && (ctxt->sax->internalSubset != NULL) &&
3602 (!ctxt->disableSAX))
3603 ctxt->sax->internalSubset(ctxt->userData, name, ExternalID, URI);
3604
3605 /*
3606 * Cleanup, since we don't use all those identifiers
3607 */
3608 if (URI != NULL) xmlFree(URI);
3609 if (ExternalID != NULL) xmlFree(ExternalID);
3610 }
3611
3612 /**
3613 * htmlParseAttribute:
3614 * @ctxt: an HTML parser context
3615 * @value: a xmlChar ** used to store the value of the attribute
3616 *
3617 * parse an attribute
3618 *
3619 * [41] Attribute ::= Name Eq AttValue
3620 *
3621 * [25] Eq ::= S? '=' S?
3622 *
3623 * With namespace:
3624 *
3625 * [NS 11] Attribute ::= QName Eq AttValue
3626 *
3627 * Also the case QName == xmlns:??? is handled independently as a namespace
3628 * definition.
3629 *
3630 * Returns the attribute name, and the value in *value.
3631 */
3632
3633 static const xmlChar *
htmlParseAttribute(htmlParserCtxtPtr ctxt,xmlChar ** value)3634 htmlParseAttribute(htmlParserCtxtPtr ctxt, xmlChar **value) {
3635 const xmlChar *name;
3636 xmlChar *val = NULL;
3637
3638 *value = NULL;
3639 name = htmlParseHTMLName(ctxt);
3640 if (name == NULL) {
3641 htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
3642 "error parsing attribute name\n", NULL, NULL);
3643 return(NULL);
3644 }
3645
3646 /*
3647 * read the value
3648 */
3649 SKIP_BLANKS;
3650 if (CUR == '=') {
3651 NEXT;
3652 SKIP_BLANKS;
3653 val = htmlParseAttValue(ctxt);
3654 }
3655
3656 *value = val;
3657 return(name);
3658 }
3659
3660 /**
3661 * htmlCheckEncoding:
3662 * @ctxt: an HTML parser context
3663 * @attvalue: the attribute value
3664 *
3665 * Checks an http-equiv attribute from a Meta tag to detect
3666 * the encoding
3667 * If a new encoding is detected the parser is switched to decode
3668 * it and pass UTF8
3669 */
3670 static void
htmlCheckEncoding(htmlParserCtxtPtr ctxt,const xmlChar * attvalue)3671 htmlCheckEncoding(htmlParserCtxtPtr ctxt, const xmlChar *attvalue) {
3672 const xmlChar *encoding;
3673 xmlChar *copy;
3674
3675 if (!attvalue)
3676 return;
3677
3678 encoding = xmlStrcasestr(attvalue, BAD_CAST"charset");
3679 if (encoding != NULL) {
3680 encoding += 7;
3681 }
3682 /*
3683 * skip blank
3684 */
3685 if (encoding && IS_BLANK_CH(*encoding))
3686 encoding = xmlStrcasestr(attvalue, BAD_CAST"=");
3687 if (encoding && *encoding == '=') {
3688 encoding ++;
3689 copy = xmlStrdup(encoding);
3690 if (copy == NULL)
3691 htmlErrMemory(ctxt);
3692 xmlSetDeclaredEncoding(ctxt, copy);
3693 }
3694 }
3695
3696 /**
3697 * htmlCheckMeta:
3698 * @ctxt: an HTML parser context
3699 * @atts: the attributes values
3700 *
3701 * Checks an attributes from a Meta tag
3702 */
3703 static void
htmlCheckMeta(htmlParserCtxtPtr ctxt,const xmlChar ** atts)3704 htmlCheckMeta(htmlParserCtxtPtr ctxt, const xmlChar **atts) {
3705 int i;
3706 const xmlChar *att, *value;
3707 int http = 0;
3708 const xmlChar *content = NULL;
3709
3710 if ((ctxt == NULL) || (atts == NULL))
3711 return;
3712
3713 i = 0;
3714 att = atts[i++];
3715 while (att != NULL) {
3716 value = atts[i++];
3717 if (value != NULL) {
3718 if ((!xmlStrcasecmp(att, BAD_CAST "http-equiv")) &&
3719 (!xmlStrcasecmp(value, BAD_CAST "Content-Type"))) {
3720 http = 1;
3721 } else if (!xmlStrcasecmp(att, BAD_CAST "charset")) {
3722 xmlChar *copy;
3723
3724 copy = xmlStrdup(value);
3725 if (copy == NULL)
3726 htmlErrMemory(ctxt);
3727 xmlSetDeclaredEncoding(ctxt, copy);
3728 } else if (!xmlStrcasecmp(att, BAD_CAST "content")) {
3729 content = value;
3730 }
3731 }
3732 att = atts[i++];
3733 }
3734 if ((http) && (content != NULL))
3735 htmlCheckEncoding(ctxt, content);
3736
3737 }
3738
/**
 * htmlParseStartTag:
 * @ctxt:  an HTML parser context
 *
 * Parse a start tag, for a regular element or an EmptyElement. The
 * input must be positioned on the '<'. In both cases the closing
 * chars of the tag ('>' or '/>') are not parsed.
 *
 * [40] STag ::= '<' Name (S Attribute)* S? '>'
 *
 * [44] EmptyElemTag ::= '<' Name (S Attribute)* S? '/>'
 *
 * With namespace:
 *
 * [NS 8] STag ::= '<' QName (S Attribute)* S? '>'
 *
 * [NS 10] EmptyElement ::= '<' QName (S Attribute)* S? '/>'
 *
 * Unless the tag is discarded, the element name is pushed with
 * htmlnamePush() and the startElement SAX callback is invoked. A
 * misplaced <html>, <head> or <body> tag is discarded: its attributes
 * are still parsed, but nothing is pushed or reported.
 *
 * Returns 0 in case of success, -1 in case of error and 1 if discarded
 */

static int
htmlParseStartTag(htmlParserCtxtPtr ctxt) {
    const xmlChar *name;
    const xmlChar *attname;
    xmlChar *attvalue;
    const xmlChar **atts;   /* name/value pairs, shared through ctxt->atts */
    int nbatts = 0;         /* number of used slots in atts (2 per attribute) */
    int maxatts;            /* allocated slots in atts */
    int meta = 0;
    int i;
    int discardtag = 0;

    if ((ctxt == NULL) || (ctxt->input == NULL))
        return -1;
    if (CUR != '<') return -1;
    NEXT;

    /* reuse the attribute array kept in the context, if any */
    atts = ctxt->atts;
    maxatts = ctxt->maxatts;

    GROW;
    name = htmlParseHTMLName(ctxt);
    if (name == NULL) {
        htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
                     "htmlParseStartTag: invalid element name\n",
                     NULL, NULL);
        /* Dump the bogus tag like browsers do */
        while ((CUR != 0) && (CUR != '>') &&
               (PARSER_STOPPED(ctxt) == 0))
            NEXT;
        return -1;
    }
    if (xmlStrEqual(name, BAD_CAST"meta"))
        meta = 1;

    /*
     * Check for auto-closure of HTML elements.
     */
    htmlAutoClose(ctxt, name);

    /*
     * Check for implied HTML elements.
     */
    htmlCheckImplied(ctxt, name);

    /*
     * Avoid html at any level > 0, head at any level != 1
     * or any attempt to recurse body
     *
     * NOTE(review): ctxt->depth is incremented for every discarded
     * tag; presumably this is balanced when the matching end tag is
     * handled - confirm against the end tag parser.
     */
    if ((ctxt->nameNr > 0) && (xmlStrEqual(name, BAD_CAST"html"))) {
        htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
                     "htmlParseStartTag: misplaced <html> tag\n",
                     name, NULL);
        discardtag = 1;
        ctxt->depth++;
    }
    if ((ctxt->nameNr != 1) &&
        (xmlStrEqual(name, BAD_CAST"head"))) {
        htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
                     "htmlParseStartTag: misplaced <head> tag\n",
                     name, NULL);
        discardtag = 1;
        ctxt->depth++;
    }
    if (xmlStrEqual(name, BAD_CAST"body")) {
        int indx;
        /* a body is misplaced if one is already open on the name stack */
        for (indx = 0;indx < ctxt->nameNr;indx++) {
            if (xmlStrEqual(ctxt->nameTab[indx], BAD_CAST"body")) {
                htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
                             "htmlParseStartTag: misplaced <body> tag\n",
                             name, NULL);
                discardtag = 1;
                ctxt->depth++;
            }
        }
    }

    /*
     * Now parse the attributes, it ends up with the ending
     *
     * (S Attribute)* S?
     */
    SKIP_BLANKS;
    while ((CUR != 0) &&
           (CUR != '>') &&
           ((CUR != '/') || (NXT(1) != '>')) &&
           (PARSER_STOPPED(ctxt) == 0)) {
        GROW;
        attname = htmlParseAttribute(ctxt, &attvalue);
        if (attname != NULL) {

            /*
             * Well formedness requires at most one declaration of an attribute
             * A repeated attribute is reported and its value dropped.
             */
            for (i = 0; i < nbatts;i += 2) {
                if (xmlStrEqual(atts[i], attname)) {
                    htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_REDEFINED,
                                 "Attribute %s redefined\n", attname, NULL);
                    if (attvalue != NULL)
                        xmlFree(attvalue);
                    goto failed;
                }
            }

            /*
             * Add the pair to atts
             */
            if (atts == NULL) {
                maxatts = 22; /* allow for 10 attrs by default */
                atts = (const xmlChar **)
                       xmlMalloc(maxatts * sizeof(xmlChar *));
                if (atts == NULL) {
                    htmlErrMemory(ctxt);
                    if (attvalue != NULL)
                        xmlFree(attvalue);
                    goto failed;
                }
                ctxt->atts = atts;
                ctxt->maxatts = maxatts;
            } else if (nbatts + 4 > maxatts) {
                /* need room for the new pair plus the NULL pair */
                const xmlChar **n;

                maxatts *= 2;
                n = (const xmlChar **) xmlRealloc((void *) atts,
                                             maxatts * sizeof(const xmlChar *));
                if (n == NULL) {
                    htmlErrMemory(ctxt);
                    if (attvalue != NULL)
                        xmlFree(attvalue);
                    goto failed;
                }
                atts = n;
                ctxt->atts = atts;
                ctxt->maxatts = maxatts;
            }
            /* store the pair and keep the array terminated by a NULL pair */
            atts[nbatts++] = attname;
            atts[nbatts++] = attvalue;
            atts[nbatts] = NULL;
            atts[nbatts + 1] = NULL;
        }
        else {
            if (attvalue != NULL)
                xmlFree(attvalue);
            /* Dump the bogus attribute string up to the next blank or
             * the end of the tag. */
            while ((CUR != 0) &&
                   !(IS_BLANK_CH(CUR)) && (CUR != '>') &&
                   ((CUR != '/') || (NXT(1) != '>')) &&
                   (PARSER_STOPPED(ctxt) == 0))
                NEXT;
        }

        /*
         * Reached by falling through as well as by goto from the
         * error paths above; the loop then goes on with the next
         * attribute.
         */
failed:
        SKIP_BLANKS;
    }

    /*
     * Handle specific association to the META tag
     */
    if (meta && (nbatts != 0))
        htmlCheckMeta(ctxt, atts);

    /*
     * SAX: Start of Element !
     */
    if (!discardtag) {
        htmlnamePush(ctxt, name);
        if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL)) {
            if (nbatts != 0)
                ctxt->sax->startElement(ctxt->userData, name, atts);
            else
                ctxt->sax->startElement(ctxt->userData, name, NULL);
        }
    }

    /*
     * Only the values (odd indexes) are freed here; the names are
     * not. The array itself stays in ctxt->atts.
     */
    if (atts != NULL) {
        for (i = 1;i < nbatts;i += 2) {
            if (atts[i] != NULL)
                xmlFree((xmlChar *) atts[i]);
        }
    }

    return(discardtag);
}
3943
3944 /**
3945 * htmlParseEndTag:
3946 * @ctxt: an HTML parser context
3947 *
3948 * parse an end of tag
3949 *
3950 * [42] ETag ::= '</' Name S? '>'
3951 *
3952 * With namespace
3953 *
3954 * [NS 9] ETag ::= '</' QName S? '>'
3955 *
3956 * Returns 1 if the current level should be closed.
3957 */
3958
3959 static int
htmlParseEndTag(htmlParserCtxtPtr ctxt)3960 htmlParseEndTag(htmlParserCtxtPtr ctxt)
3961 {
3962 const xmlChar *name;
3963 const xmlChar *oldname;
3964 int i, ret;
3965
3966 if ((CUR != '<') || (NXT(1) != '/')) {
3967 htmlParseErr(ctxt, XML_ERR_LTSLASH_REQUIRED,
3968 "htmlParseEndTag: '</' not found\n", NULL, NULL);
3969 return (0);
3970 }
3971 SKIP(2);
3972
3973 name = htmlParseHTMLName(ctxt);
3974 if (name == NULL)
3975 return (0);
3976 /*
3977 * We should definitely be at the ending "S? '>'" part
3978 */
3979 SKIP_BLANKS;
3980 if (CUR != '>') {
3981 htmlParseErr(ctxt, XML_ERR_GT_REQUIRED,
3982 "End tag : expected '>'\n", NULL, NULL);
3983 /* Skip to next '>' */
3984 while ((PARSER_STOPPED(ctxt) == 0) &&
3985 (CUR != 0) && (CUR != '>'))
3986 NEXT;
3987 }
3988 if (CUR == '>')
3989 NEXT;
3990
3991 /*
3992 * if we ignored misplaced tags in htmlParseStartTag don't pop them
3993 * out now.
3994 */
3995 if ((ctxt->depth > 0) &&
3996 (xmlStrEqual(name, BAD_CAST "html") ||
3997 xmlStrEqual(name, BAD_CAST "body") ||
3998 xmlStrEqual(name, BAD_CAST "head"))) {
3999 ctxt->depth--;
4000 return (0);
4001 }
4002
4003 /*
4004 * If the name read is not one of the element in the parsing stack
4005 * then return, it's just an error.
4006 */
4007 for (i = (ctxt->nameNr - 1); i >= 0; i--) {
4008 if (xmlStrEqual(name, ctxt->nameTab[i]))
4009 break;
4010 }
4011 if (i < 0) {
4012 htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,
4013 "Unexpected end tag : %s\n", name, NULL);
4014 return (0);
4015 }
4016
4017
4018 /*
4019 * Check for auto-closure of HTML elements.
4020 */
4021
4022 htmlAutoCloseOnClose(ctxt, name);
4023
4024 /*
4025 * Well formedness constraints, opening and closing must match.
4026 * With the exception that the autoclose may have popped stuff out
4027 * of the stack.
4028 */
4029 if ((ctxt->name != NULL) && (!xmlStrEqual(ctxt->name, name))) {
4030 htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,
4031 "Opening and ending tag mismatch: %s and %s\n",
4032 name, ctxt->name);
4033 }
4034
4035 /*
4036 * SAX: End of Tag
4037 */
4038 oldname = ctxt->name;
4039 if ((oldname != NULL) && (xmlStrEqual(oldname, name))) {
4040 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4041 ctxt->sax->endElement(ctxt->userData, name);
4042 htmlNodeInfoPop(ctxt);
4043 htmlnamePop(ctxt);
4044 ret = 1;
4045 } else {
4046 ret = 0;
4047 }
4048
4049 return (ret);
4050 }
4051
4052
4053 /**
4054 * htmlParseReference:
4055 * @ctxt: an HTML parser context
4056 *
4057 * parse and handle entity references in content,
4058 * this will end-up in a call to character() since this is either a
4059 * CharRef, or a predefined entity.
4060 */
4061 static void
htmlParseReference(htmlParserCtxtPtr ctxt)4062 htmlParseReference(htmlParserCtxtPtr ctxt) {
4063 const htmlEntityDesc * ent;
4064 xmlChar out[6];
4065 const xmlChar *name;
4066 if (CUR != '&') return;
4067
4068 if (NXT(1) == '#') {
4069 unsigned int c;
4070 int bits, i = 0;
4071
4072 c = htmlParseCharRef(ctxt);
4073 if (c == 0)
4074 return;
4075
4076 if (c < 0x80) { out[i++]= c; bits= -6; }
4077 else if (c < 0x800) { out[i++]=((c >> 6) & 0x1F) | 0xC0; bits= 0; }
4078 else if (c < 0x10000) { out[i++]=((c >> 12) & 0x0F) | 0xE0; bits= 6; }
4079 else { out[i++]=((c >> 18) & 0x07) | 0xF0; bits= 12; }
4080
4081 for ( ; bits >= 0; bits-= 6) {
4082 out[i++]= ((c >> bits) & 0x3F) | 0x80;
4083 }
4084 out[i] = 0;
4085
4086 htmlCheckParagraph(ctxt);
4087 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
4088 ctxt->sax->characters(ctxt->userData, out, i);
4089 } else {
4090 ent = htmlParseEntityRef(ctxt, &name);
4091 if (name == NULL) {
4092 htmlCheckParagraph(ctxt);
4093 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
4094 ctxt->sax->characters(ctxt->userData, BAD_CAST "&", 1);
4095 return;
4096 }
4097 if ((ent == NULL) || !(ent->value > 0)) {
4098 htmlCheckParagraph(ctxt);
4099 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL)) {
4100 ctxt->sax->characters(ctxt->userData, BAD_CAST "&", 1);
4101 ctxt->sax->characters(ctxt->userData, name, xmlStrlen(name));
4102 /* ctxt->sax->characters(ctxt->userData, BAD_CAST ";", 1); */
4103 }
4104 } else {
4105 unsigned int c;
4106 int bits, i = 0;
4107
4108 c = ent->value;
4109 if (c < 0x80)
4110 { out[i++]= c; bits= -6; }
4111 else if (c < 0x800)
4112 { out[i++]=((c >> 6) & 0x1F) | 0xC0; bits= 0; }
4113 else if (c < 0x10000)
4114 { out[i++]=((c >> 12) & 0x0F) | 0xE0; bits= 6; }
4115 else
4116 { out[i++]=((c >> 18) & 0x07) | 0xF0; bits= 12; }
4117
4118 for ( ; bits >= 0; bits-= 6) {
4119 out[i++]= ((c >> bits) & 0x3F) | 0x80;
4120 }
4121 out[i] = 0;
4122
4123 htmlCheckParagraph(ctxt);
4124 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
4125 ctxt->sax->characters(ctxt->userData, out, i);
4126 }
4127 }
4128 }
4129
4130 /**
4131 * htmlParseContent:
4132 * @ctxt: an HTML parser context
4133 *
4134 * Parse a content: comment, sub-element, reference or text.
4135 * Kept for compatibility with old code
4136 */
4137
4138 static void
htmlParseContent(htmlParserCtxtPtr ctxt)4139 htmlParseContent(htmlParserCtxtPtr ctxt) {
4140 xmlChar *currentNode;
4141 int depth;
4142 const xmlChar *name;
4143
4144 currentNode = xmlStrdup(ctxt->name);
4145 depth = ctxt->nameNr;
4146 while (!PARSER_STOPPED(ctxt)) {
4147 GROW;
4148
4149 /*
4150 * Our tag or one of it's parent or children is ending.
4151 */
4152 if ((CUR == '<') && (NXT(1) == '/')) {
4153 if (htmlParseEndTag(ctxt) &&
4154 ((currentNode != NULL) || (ctxt->nameNr == 0))) {
4155 if (currentNode != NULL)
4156 xmlFree(currentNode);
4157 return;
4158 }
4159 continue; /* while */
4160 }
4161
4162 else if ((CUR == '<') &&
4163 ((IS_ASCII_LETTER(NXT(1))) ||
4164 (NXT(1) == '_') || (NXT(1) == ':'))) {
4165 name = htmlParseHTMLName_nonInvasive(ctxt);
4166 if (name == NULL) {
4167 htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
4168 "htmlParseStartTag: invalid element name\n",
4169 NULL, NULL);
4170 /* Dump the bogus tag like browsers do */
4171 while ((CUR != 0) && (CUR != '>'))
4172 NEXT;
4173
4174 if (currentNode != NULL)
4175 xmlFree(currentNode);
4176 return;
4177 }
4178
4179 if (ctxt->name != NULL) {
4180 if (htmlCheckAutoClose(name, ctxt->name) == 1) {
4181 htmlAutoClose(ctxt, name);
4182 continue;
4183 }
4184 }
4185 }
4186
4187 /*
4188 * Has this node been popped out during parsing of
4189 * the next element
4190 */
4191 if ((ctxt->nameNr > 0) && (depth >= ctxt->nameNr) &&
4192 (!xmlStrEqual(currentNode, ctxt->name)))
4193 {
4194 if (currentNode != NULL) xmlFree(currentNode);
4195 return;
4196 }
4197
4198 if ((CUR != 0) && ((xmlStrEqual(currentNode, BAD_CAST"script")) ||
4199 (xmlStrEqual(currentNode, BAD_CAST"style")))) {
4200 /*
4201 * Handle SCRIPT/STYLE separately
4202 */
4203 htmlParseScript(ctxt);
4204 }
4205
4206 else if ((CUR == '<') && (NXT(1) == '!')) {
4207 /*
4208 * Sometimes DOCTYPE arrives in the middle of the document
4209 */
4210 if ((UPP(2) == 'D') && (UPP(3) == 'O') &&
4211 (UPP(4) == 'C') && (UPP(5) == 'T') &&
4212 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
4213 (UPP(8) == 'E')) {
4214 htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
4215 "Misplaced DOCTYPE declaration\n",
4216 BAD_CAST "DOCTYPE" , NULL);
4217 htmlParseDocTypeDecl(ctxt);
4218 }
4219 /*
4220 * First case : a comment
4221 */
4222 else if ((NXT(2) == '-') && (NXT(3) == '-')) {
4223 htmlParseComment(ctxt);
4224 }
4225 else {
4226 htmlSkipBogusComment(ctxt);
4227 }
4228 }
4229
4230 /*
4231 * Second case : a Processing Instruction.
4232 */
4233 else if ((CUR == '<') && (NXT(1) == '?')) {
4234 htmlParsePI(ctxt);
4235 }
4236
4237 /*
4238 * Third case : a sub-element.
4239 */
4240 else if ((CUR == '<') && IS_ASCII_LETTER(NXT(1))) {
4241 htmlParseElement(ctxt);
4242 }
4243 else if (CUR == '<') {
4244 if ((ctxt->sax != NULL) && (!ctxt->disableSAX) &&
4245 (ctxt->sax->characters != NULL))
4246 ctxt->sax->characters(ctxt->userData, BAD_CAST "<", 1);
4247 NEXT;
4248 }
4249
4250 /*
4251 * Fourth case : a reference. If if has not been resolved,
4252 * parsing returns it's Name, create the node
4253 */
4254 else if (CUR == '&') {
4255 htmlParseReference(ctxt);
4256 }
4257
4258 /*
4259 * Fifth case : end of the resource
4260 */
4261 else if (CUR == 0) {
4262 htmlAutoCloseOnEnd(ctxt);
4263 break;
4264 }
4265
4266 /*
4267 * Last case, text. Note that References are handled directly.
4268 */
4269 else {
4270 htmlParseCharData(ctxt);
4271 }
4272
4273 SHRINK;
4274 GROW;
4275 }
4276 if (currentNode != NULL) xmlFree(currentNode);
4277 }
4278
4279 /**
4280 * htmlParseElement:
4281 * @ctxt: an HTML parser context
4282 *
4283 * DEPRECATED: Internal function, don't use.
4284 *
4285 * parse an HTML element, this is highly recursive
4286 * this is kept for compatibility with previous code versions
4287 *
4288 * [39] element ::= EmptyElemTag | STag content ETag
4289 *
4290 * [41] Attribute ::= Name Eq AttValue
4291 */
4292
4293 void
htmlParseElement(htmlParserCtxtPtr ctxt)4294 htmlParseElement(htmlParserCtxtPtr ctxt) {
4295 const xmlChar *name;
4296 xmlChar *currentNode = NULL;
4297 const htmlElemDesc * info;
4298 htmlParserNodeInfo node_info;
4299 int failed;
4300 int depth;
4301 const xmlChar *oldptr;
4302
4303 if ((ctxt == NULL) || (ctxt->input == NULL))
4304 return;
4305
4306 /* Capture start position */
4307 if (ctxt->record_info) {
4308 node_info.begin_pos = ctxt->input->consumed +
4309 (CUR_PTR - ctxt->input->base);
4310 node_info.begin_line = ctxt->input->line;
4311 }
4312
4313 failed = htmlParseStartTag(ctxt);
4314 name = ctxt->name;
4315 if ((failed == -1) || (name == NULL)) {
4316 if (CUR == '>')
4317 NEXT;
4318 return;
4319 }
4320
4321 /*
4322 * Lookup the info for that element.
4323 */
4324 info = htmlTagLookup(name);
4325 if (info == NULL) {
4326 htmlParseErr(ctxt, XML_HTML_UNKNOWN_TAG,
4327 "Tag %s invalid\n", name, NULL);
4328 }
4329
4330 /*
4331 * Check for an Empty Element labeled the XML/SGML way
4332 */
4333 if ((CUR == '/') && (NXT(1) == '>')) {
4334 SKIP(2);
4335 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4336 ctxt->sax->endElement(ctxt->userData, name);
4337 htmlnamePop(ctxt);
4338 return;
4339 }
4340
4341 if (CUR == '>') {
4342 NEXT;
4343 } else {
4344 htmlParseErr(ctxt, XML_ERR_GT_REQUIRED,
4345 "Couldn't find end of Start Tag %s\n", name, NULL);
4346
4347 /*
4348 * end of parsing of this node.
4349 */
4350 if (xmlStrEqual(name, ctxt->name)) {
4351 nodePop(ctxt);
4352 htmlnamePop(ctxt);
4353 }
4354
4355 /*
4356 * Capture end position and add node
4357 */
4358 if (ctxt->record_info) {
4359 node_info.end_pos = ctxt->input->consumed +
4360 (CUR_PTR - ctxt->input->base);
4361 node_info.end_line = ctxt->input->line;
4362 node_info.node = ctxt->node;
4363 xmlParserAddNodeInfo(ctxt, &node_info);
4364 }
4365 return;
4366 }
4367
4368 /*
4369 * Check for an Empty Element from DTD definition
4370 */
4371 if ((info != NULL) && (info->empty)) {
4372 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4373 ctxt->sax->endElement(ctxt->userData, name);
4374 htmlnamePop(ctxt);
4375 return;
4376 }
4377
4378 /*
4379 * Parse the content of the element:
4380 */
4381 currentNode = xmlStrdup(ctxt->name);
4382 depth = ctxt->nameNr;
4383 while (CUR != 0) {
4384 oldptr = ctxt->input->cur;
4385 htmlParseContent(ctxt);
4386 if (oldptr==ctxt->input->cur) break;
4387 if (ctxt->nameNr < depth) break;
4388 }
4389
4390 /*
4391 * Capture end position and add node
4392 */
4393 if ( currentNode != NULL && ctxt->record_info ) {
4394 node_info.end_pos = ctxt->input->consumed +
4395 (CUR_PTR - ctxt->input->base);
4396 node_info.end_line = ctxt->input->line;
4397 node_info.node = ctxt->node;
4398 xmlParserAddNodeInfo(ctxt, &node_info);
4399 }
4400 if (CUR == 0) {
4401 htmlAutoCloseOnEnd(ctxt);
4402 }
4403
4404 if (currentNode != NULL)
4405 xmlFree(currentNode);
4406 }
4407
4408 static void
htmlParserFinishElementParsing(htmlParserCtxtPtr ctxt)4409 htmlParserFinishElementParsing(htmlParserCtxtPtr ctxt) {
4410 /*
4411 * Capture end position and add node
4412 */
4413 if ( ctxt->node != NULL && ctxt->record_info ) {
4414 ctxt->nodeInfo->end_pos = ctxt->input->consumed +
4415 (CUR_PTR - ctxt->input->base);
4416 ctxt->nodeInfo->end_line = ctxt->input->line;
4417 ctxt->nodeInfo->node = ctxt->node;
4418 xmlParserAddNodeInfo(ctxt, ctxt->nodeInfo);
4419 htmlNodeInfoPop(ctxt);
4420 }
4421 if (CUR == 0) {
4422 htmlAutoCloseOnEnd(ctxt);
4423 }
4424 }
4425
4426 /**
4427 * htmlParseElementInternal:
4428 * @ctxt: an HTML parser context
4429 *
4430 * parse an HTML element, new version, non recursive
4431 *
4432 * [39] element ::= EmptyElemTag | STag content ETag
4433 *
4434 * [41] Attribute ::= Name Eq AttValue
4435 */
4436
4437 static void
htmlParseElementInternal(htmlParserCtxtPtr ctxt)4438 htmlParseElementInternal(htmlParserCtxtPtr ctxt) {
4439 const xmlChar *name;
4440 const htmlElemDesc * info;
4441 htmlParserNodeInfo node_info = { NULL, 0, 0, 0, 0 };
4442 int failed;
4443
4444 if ((ctxt == NULL) || (ctxt->input == NULL))
4445 return;
4446
4447 /* Capture start position */
4448 if (ctxt->record_info) {
4449 node_info.begin_pos = ctxt->input->consumed +
4450 (CUR_PTR - ctxt->input->base);
4451 node_info.begin_line = ctxt->input->line;
4452 }
4453
4454 failed = htmlParseStartTag(ctxt);
4455 name = ctxt->name;
4456 if ((failed == -1) || (name == NULL)) {
4457 if (CUR == '>')
4458 NEXT;
4459 return;
4460 }
4461
4462 /*
4463 * Lookup the info for that element.
4464 */
4465 info = htmlTagLookup(name);
4466 if (info == NULL) {
4467 htmlParseErr(ctxt, XML_HTML_UNKNOWN_TAG,
4468 "Tag %s invalid\n", name, NULL);
4469 }
4470
4471 /*
4472 * Check for an Empty Element labeled the XML/SGML way
4473 */
4474 if ((CUR == '/') && (NXT(1) == '>')) {
4475 SKIP(2);
4476 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4477 ctxt->sax->endElement(ctxt->userData, name);
4478 htmlnamePop(ctxt);
4479 return;
4480 }
4481
4482 if (CUR == '>') {
4483 NEXT;
4484 } else {
4485 htmlParseErr(ctxt, XML_ERR_GT_REQUIRED,
4486 "Couldn't find end of Start Tag %s\n", name, NULL);
4487
4488 /*
4489 * end of parsing of this node.
4490 */
4491 if (xmlStrEqual(name, ctxt->name)) {
4492 nodePop(ctxt);
4493 htmlnamePop(ctxt);
4494 }
4495
4496 if (ctxt->record_info)
4497 htmlNodeInfoPush(ctxt, &node_info);
4498 htmlParserFinishElementParsing(ctxt);
4499 return;
4500 }
4501
4502 /*
4503 * Check for an Empty Element from DTD definition
4504 */
4505 if ((info != NULL) && (info->empty)) {
4506 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4507 ctxt->sax->endElement(ctxt->userData, name);
4508 htmlnamePop(ctxt);
4509 return;
4510 }
4511
4512 if (ctxt->record_info)
4513 htmlNodeInfoPush(ctxt, &node_info);
4514 }
4515
4516 /**
4517 * htmlParseContentInternal:
4518 * @ctxt: an HTML parser context
4519 *
4520 * Parse a content: comment, sub-element, reference or text.
4521 * New version for non recursive htmlParseElementInternal
4522 */
4523
4524 static void
htmlParseContentInternal(htmlParserCtxtPtr ctxt)4525 htmlParseContentInternal(htmlParserCtxtPtr ctxt) {
4526 xmlChar *currentNode;
4527 int depth;
4528 const xmlChar *name;
4529
4530 depth = ctxt->nameNr;
4531 if (depth <= 0) {
4532 currentNode = NULL;
4533 } else {
4534 currentNode = xmlStrdup(ctxt->name);
4535 if (currentNode == NULL) {
4536 htmlErrMemory(ctxt);
4537 return;
4538 }
4539 }
4540 while (PARSER_STOPPED(ctxt) == 0) {
4541 GROW;
4542
4543 /*
4544 * Our tag or one of it's parent or children is ending.
4545 */
4546 if ((CUR == '<') && (NXT(1) == '/')) {
4547 if (htmlParseEndTag(ctxt) &&
4548 ((currentNode != NULL) || (ctxt->nameNr == 0))) {
4549 if (currentNode != NULL)
4550 xmlFree(currentNode);
4551
4552 depth = ctxt->nameNr;
4553 if (depth <= 0) {
4554 currentNode = NULL;
4555 } else {
4556 currentNode = xmlStrdup(ctxt->name);
4557 if (currentNode == NULL) {
4558 htmlErrMemory(ctxt);
4559 break;
4560 }
4561 }
4562 }
4563 continue; /* while */
4564 }
4565
4566 else if ((CUR == '<') &&
4567 ((IS_ASCII_LETTER(NXT(1))) ||
4568 (NXT(1) == '_') || (NXT(1) == ':'))) {
4569 name = htmlParseHTMLName_nonInvasive(ctxt);
4570 if (name == NULL) {
4571 htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
4572 "htmlParseStartTag: invalid element name\n",
4573 NULL, NULL);
4574 /* Dump the bogus tag like browsers do */
4575 while ((CUR == 0) && (CUR != '>'))
4576 NEXT;
4577
4578 htmlParserFinishElementParsing(ctxt);
4579 if (currentNode != NULL)
4580 xmlFree(currentNode);
4581
4582 if (ctxt->name == NULL) {
4583 currentNode = NULL;
4584 } else {
4585 currentNode = xmlStrdup(ctxt->name);
4586 if (currentNode == NULL) {
4587 htmlErrMemory(ctxt);
4588 break;
4589 }
4590 }
4591 depth = ctxt->nameNr;
4592 continue;
4593 }
4594
4595 if (ctxt->name != NULL) {
4596 if (htmlCheckAutoClose(name, ctxt->name) == 1) {
4597 htmlAutoClose(ctxt, name);
4598 continue;
4599 }
4600 }
4601 }
4602
4603 /*
4604 * Has this node been popped out during parsing of
4605 * the next element
4606 */
4607 if ((ctxt->nameNr > 0) && (depth >= ctxt->nameNr) &&
4608 (!xmlStrEqual(currentNode, ctxt->name)))
4609 {
4610 htmlParserFinishElementParsing(ctxt);
4611 if (currentNode != NULL) xmlFree(currentNode);
4612
4613 if (ctxt->name == NULL) {
4614 currentNode = NULL;
4615 } else {
4616 currentNode = xmlStrdup(ctxt->name);
4617 if (currentNode == NULL) {
4618 htmlErrMemory(ctxt);
4619 break;
4620 }
4621 }
4622 depth = ctxt->nameNr;
4623 continue;
4624 }
4625
4626 if ((CUR != 0) && ((xmlStrEqual(currentNode, BAD_CAST"script")) ||
4627 (xmlStrEqual(currentNode, BAD_CAST"style")))) {
4628 /*
4629 * Handle SCRIPT/STYLE separately
4630 */
4631 htmlParseScript(ctxt);
4632 }
4633
4634 else if ((CUR == '<') && (NXT(1) == '!')) {
4635 /*
4636 * Sometimes DOCTYPE arrives in the middle of the document
4637 */
4638 if ((UPP(2) == 'D') && (UPP(3) == 'O') &&
4639 (UPP(4) == 'C') && (UPP(5) == 'T') &&
4640 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
4641 (UPP(8) == 'E')) {
4642 htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
4643 "Misplaced DOCTYPE declaration\n",
4644 BAD_CAST "DOCTYPE" , NULL);
4645 htmlParseDocTypeDecl(ctxt);
4646 }
4647 /*
4648 * First case : a comment
4649 */
4650 else if ((NXT(2) == '-') && (NXT(3) == '-')) {
4651 htmlParseComment(ctxt);
4652 }
4653 else {
4654 htmlSkipBogusComment(ctxt);
4655 }
4656 }
4657
4658 /*
4659 * Second case : a Processing Instruction.
4660 */
4661 else if ((CUR == '<') && (NXT(1) == '?')) {
4662 htmlParsePI(ctxt);
4663 }
4664
4665 /*
4666 * Third case : a sub-element.
4667 */
4668 else if ((CUR == '<') && IS_ASCII_LETTER(NXT(1))) {
4669 htmlParseElementInternal(ctxt);
4670 if (currentNode != NULL) xmlFree(currentNode);
4671
4672 if (ctxt->name == NULL) {
4673 currentNode = NULL;
4674 } else {
4675 currentNode = xmlStrdup(ctxt->name);
4676 if (currentNode == NULL) {
4677 htmlErrMemory(ctxt);
4678 break;
4679 }
4680 }
4681 depth = ctxt->nameNr;
4682 }
4683 else if (CUR == '<') {
4684 if ((ctxt->sax != NULL) && (!ctxt->disableSAX) &&
4685 (ctxt->sax->characters != NULL))
4686 ctxt->sax->characters(ctxt->userData, BAD_CAST "<", 1);
4687 NEXT;
4688 }
4689
4690 /*
4691 * Fourth case : a reference. If if has not been resolved,
4692 * parsing returns it's Name, create the node
4693 */
4694 else if (CUR == '&') {
4695 htmlParseReference(ctxt);
4696 }
4697
4698 /*
4699 * Fifth case : end of the resource
4700 */
4701 else if (CUR == 0) {
4702 htmlAutoCloseOnEnd(ctxt);
4703 break;
4704 }
4705
4706 /*
4707 * Last case, text. Note that References are handled directly.
4708 */
4709 else {
4710 htmlParseCharData(ctxt);
4711 }
4712
4713 SHRINK;
4714 GROW;
4715 }
4716 if (currentNode != NULL) xmlFree(currentNode);
4717 }
4718
4719 /**
4720 * htmlParseContent:
4721 * @ctxt: an HTML parser context
4722 *
4723 * Parse a content: comment, sub-element, reference or text.
4724 * This is the entry point when called from parser.c
4725 */
4726
4727 void
__htmlParseContent(void * ctxt)4728 __htmlParseContent(void *ctxt) {
4729 if (ctxt != NULL)
4730 htmlParseContentInternal((htmlParserCtxtPtr) ctxt);
4731 }
4732
4733 /**
4734 * htmlParseDocument:
4735 * @ctxt: an HTML parser context
4736 *
4737 * Parse an HTML document and invoke the SAX handlers. This is useful
4738 * if you're only interested in custom SAX callbacks. If you want a
4739 * document tree, use htmlCtxtParseDocument.
4740 *
4741 * Returns 0, -1 in case of error.
4742 */
4743
4744 int
htmlParseDocument(htmlParserCtxtPtr ctxt)4745 htmlParseDocument(htmlParserCtxtPtr ctxt) {
4746 xmlDtdPtr dtd;
4747
4748 if ((ctxt == NULL) || (ctxt->input == NULL))
4749 return(-1);
4750
4751 /*
4752 * Document locator is unused. Only for backward compatibility.
4753 */
4754 if ((ctxt->sax) && (ctxt->sax->setDocumentLocator)) {
4755 xmlSAXLocator copy = xmlDefaultSAXLocator;
4756 ctxt->sax->setDocumentLocator(ctxt->userData, ©);
4757 }
4758
4759 xmlDetectEncoding(ctxt);
4760
4761 /*
4762 * This is wrong but matches long-standing behavior. In most cases,
4763 * a document starting with an XML declaration will specify UTF-8.
4764 */
4765 if (((ctxt->input->flags & XML_INPUT_HAS_ENCODING) == 0) &&
4766 (xmlStrncmp(ctxt->input->cur, BAD_CAST "<?xm", 4) == 0))
4767 xmlSwitchEncoding(ctxt, XML_CHAR_ENCODING_UTF8);
4768
4769 /*
4770 * Wipe out everything which is before the first '<'
4771 */
4772 SKIP_BLANKS;
4773 if (CUR == 0) {
4774 htmlParseErr(ctxt, XML_ERR_DOCUMENT_EMPTY,
4775 "Document is empty\n", NULL, NULL);
4776 }
4777
4778 if ((ctxt->sax) && (ctxt->sax->startDocument) && (!ctxt->disableSAX))
4779 ctxt->sax->startDocument(ctxt->userData);
4780
4781 /*
4782 * Parse possible comments and PIs before any content
4783 */
4784 while (((CUR == '<') && (NXT(1) == '!') &&
4785 (NXT(2) == '-') && (NXT(3) == '-')) ||
4786 ((CUR == '<') && (NXT(1) == '?'))) {
4787 htmlParseComment(ctxt);
4788 htmlParsePI(ctxt);
4789 SKIP_BLANKS;
4790 }
4791
4792
4793 /*
4794 * Then possibly doc type declaration(s) and more Misc
4795 * (doctypedecl Misc*)?
4796 */
4797 if ((CUR == '<') && (NXT(1) == '!') &&
4798 (UPP(2) == 'D') && (UPP(3) == 'O') &&
4799 (UPP(4) == 'C') && (UPP(5) == 'T') &&
4800 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
4801 (UPP(8) == 'E')) {
4802 htmlParseDocTypeDecl(ctxt);
4803 }
4804 SKIP_BLANKS;
4805
4806 /*
4807 * Parse possible comments and PIs before any content
4808 */
4809 while ((PARSER_STOPPED(ctxt) == 0) &&
4810 (((CUR == '<') && (NXT(1) == '!') &&
4811 (NXT(2) == '-') && (NXT(3) == '-')) ||
4812 ((CUR == '<') && (NXT(1) == '?')))) {
4813 htmlParseComment(ctxt);
4814 htmlParsePI(ctxt);
4815 SKIP_BLANKS;
4816 }
4817
4818 /*
4819 * Time to start parsing the tree itself
4820 */
4821 htmlParseContentInternal(ctxt);
4822
4823 /*
4824 * autoclose
4825 */
4826 if (CUR == 0)
4827 htmlAutoCloseOnEnd(ctxt);
4828
4829
4830 /*
4831 * SAX: end of the document processing.
4832 */
4833 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
4834 ctxt->sax->endDocument(ctxt->userData);
4835
4836 if ((!(ctxt->options & HTML_PARSE_NODEFDTD)) && (ctxt->myDoc != NULL)) {
4837 dtd = xmlGetIntSubset(ctxt->myDoc);
4838 if (dtd == NULL) {
4839 ctxt->myDoc->intSubset =
4840 xmlCreateIntSubset(ctxt->myDoc, BAD_CAST "html",
4841 BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN",
4842 BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd");
4843 if (ctxt->myDoc->intSubset == NULL)
4844 htmlErrMemory(ctxt);
4845 }
4846 }
4847 if (! ctxt->wellFormed) return(-1);
4848 return(0);
4849 }
4850
4851
4852 /************************************************************************
4853 * *
4854 * Parser contexts handling *
4855 * *
4856 ************************************************************************/
4857
4858 /**
4859 * htmlInitParserCtxt:
4860 * @ctxt: an HTML parser context
4861 * @sax: SAX handler
4862 * @userData: user data
4863 *
4864 * Initialize a parser context
4865 *
4866 * Returns 0 in case of success and -1 in case of error
4867 */
4868
4869 static int
htmlInitParserCtxt(htmlParserCtxtPtr ctxt,const htmlSAXHandler * sax,void * userData)4870 htmlInitParserCtxt(htmlParserCtxtPtr ctxt, const htmlSAXHandler *sax,
4871 void *userData)
4872 {
4873 if (ctxt == NULL) return(-1);
4874 memset(ctxt, 0, sizeof(htmlParserCtxt));
4875
4876 ctxt->dict = xmlDictCreate();
4877 if (ctxt->dict == NULL)
4878 return(-1);
4879
4880 if (ctxt->sax == NULL)
4881 ctxt->sax = (htmlSAXHandler *) xmlMalloc(sizeof(htmlSAXHandler));
4882 if (ctxt->sax == NULL)
4883 return(-1);
4884 if (sax == NULL) {
4885 memset(ctxt->sax, 0, sizeof(htmlSAXHandler));
4886 xmlSAX2InitHtmlDefaultSAXHandler(ctxt->sax);
4887 ctxt->userData = ctxt;
4888 } else {
4889 memcpy(ctxt->sax, sax, sizeof(htmlSAXHandler));
4890 ctxt->userData = userData ? userData : ctxt;
4891 }
4892
4893 /* Allocate the Input stack */
4894 ctxt->inputTab = (htmlParserInputPtr *)
4895 xmlMalloc(5 * sizeof(htmlParserInputPtr));
4896 if (ctxt->inputTab == NULL)
4897 return(-1);
4898 ctxt->inputNr = 0;
4899 ctxt->inputMax = 5;
4900 ctxt->input = NULL;
4901 ctxt->version = NULL;
4902 ctxt->encoding = NULL;
4903 ctxt->standalone = -1;
4904 ctxt->instate = XML_PARSER_START;
4905
4906 /* Allocate the Node stack */
4907 ctxt->nodeTab = (htmlNodePtr *) xmlMalloc(10 * sizeof(htmlNodePtr));
4908 if (ctxt->nodeTab == NULL)
4909 return(-1);
4910 ctxt->nodeNr = 0;
4911 ctxt->nodeMax = 10;
4912 ctxt->node = NULL;
4913
4914 /* Allocate the Name stack */
4915 ctxt->nameTab = (const xmlChar **) xmlMalloc(10 * sizeof(xmlChar *));
4916 if (ctxt->nameTab == NULL)
4917 return(-1);
4918 ctxt->nameNr = 0;
4919 ctxt->nameMax = 10;
4920 ctxt->name = NULL;
4921
4922 ctxt->nodeInfoTab = NULL;
4923 ctxt->nodeInfoNr = 0;
4924 ctxt->nodeInfoMax = 0;
4925
4926 ctxt->myDoc = NULL;
4927 ctxt->wellFormed = 1;
4928 ctxt->replaceEntities = 0;
4929 ctxt->linenumbers = xmlLineNumbersDefaultValue;
4930 ctxt->keepBlanks = xmlKeepBlanksDefaultValue;
4931 ctxt->html = 1;
4932 ctxt->vctxt.flags = XML_VCTXT_USE_PCTXT;
4933 ctxt->vctxt.userData = ctxt;
4934 ctxt->vctxt.error = xmlParserValidityError;
4935 ctxt->vctxt.warning = xmlParserValidityWarning;
4936 ctxt->record_info = 0;
4937 ctxt->validate = 0;
4938 ctxt->checkIndex = 0;
4939 ctxt->catalogs = NULL;
4940 xmlInitNodeInfoSeq(&ctxt->node_seq);
4941 return(0);
4942 }
4943
4944 /**
4945 * htmlFreeParserCtxt:
4946 * @ctxt: an HTML parser context
4947 *
4948 * Free all the memory used by a parser context. However the parsed
4949 * document in ctxt->myDoc is not freed.
4950 */
4951
4952 void
htmlFreeParserCtxt(htmlParserCtxtPtr ctxt)4953 htmlFreeParserCtxt(htmlParserCtxtPtr ctxt)
4954 {
4955 xmlFreeParserCtxt(ctxt);
4956 }
4957
4958 /**
4959 * htmlNewParserCtxt:
4960 *
4961 * Allocate and initialize a new HTML parser context.
4962 *
4963 * This can be used to parse HTML documents into DOM trees with
4964 * functions like xmlCtxtReadFile or xmlCtxtReadMemory.
4965 *
4966 * See htmlCtxtUseOptions for parser options.
4967 *
4968 * See xmlCtxtSetErrorHandler for advanced error handling.
4969 *
4970 * See xmlNewInputURL, xmlNewInputMemory, xmlNewInputIO and similar
4971 * functions for advanced input control.
4972 *
4973 * See htmlNewSAXParserCtxt for custom SAX parsers.
4974 *
4975 * Returns the htmlParserCtxtPtr or NULL in case of allocation error
4976 */
4977
4978 htmlParserCtxtPtr
htmlNewParserCtxt(void)4979 htmlNewParserCtxt(void)
4980 {
4981 return(htmlNewSAXParserCtxt(NULL, NULL));
4982 }
4983
4984 /**
4985 * htmlNewSAXParserCtxt:
4986 * @sax: SAX handler
4987 * @userData: user data
4988 *
4989 * Allocate and initialize a new HTML SAX parser context. If userData
4990 * is NULL, the parser context will be passed as user data.
4991 *
4992 * Available since 2.11.0. If you want support older versions,
4993 * it's best to invoke htmlNewParserCtxt and set ctxt->sax with
4994 * struct assignment.
4995 *
4996 * Also see htmlNewParserCtxt.
4997 *
4998 * Returns the htmlParserCtxtPtr or NULL in case of allocation error
4999 */
5000
5001 htmlParserCtxtPtr
htmlNewSAXParserCtxt(const htmlSAXHandler * sax,void * userData)5002 htmlNewSAXParserCtxt(const htmlSAXHandler *sax, void *userData)
5003 {
5004 xmlParserCtxtPtr ctxt;
5005
5006 xmlInitParser();
5007
5008 ctxt = (xmlParserCtxtPtr) xmlMalloc(sizeof(xmlParserCtxt));
5009 if (ctxt == NULL)
5010 return(NULL);
5011 memset(ctxt, 0, sizeof(xmlParserCtxt));
5012 if (htmlInitParserCtxt(ctxt, sax, userData) < 0) {
5013 htmlFreeParserCtxt(ctxt);
5014 return(NULL);
5015 }
5016 return(ctxt);
5017 }
5018
5019 static htmlParserCtxtPtr
htmlCreateMemoryParserCtxtInternal(const char * url,const char * buffer,size_t size,const char * encoding)5020 htmlCreateMemoryParserCtxtInternal(const char *url,
5021 const char *buffer, size_t size,
5022 const char *encoding) {
5023 xmlParserCtxtPtr ctxt;
5024 xmlParserInputPtr input;
5025
5026 if (buffer == NULL)
5027 return(NULL);
5028
5029 ctxt = htmlNewParserCtxt();
5030 if (ctxt == NULL)
5031 return(NULL);
5032
5033 input = xmlNewInputMemory(ctxt, url, buffer, size, encoding, 0);
5034 if (input == NULL) {
5035 xmlFreeParserCtxt(ctxt);
5036 return(NULL);
5037 }
5038
5039 inputPush(ctxt, input);
5040
5041 return(ctxt);
5042 }
5043
5044 /**
5045 * htmlCreateMemoryParserCtxt:
5046 * @buffer: a pointer to a char array
5047 * @size: the size of the array
5048 *
5049 * DEPRECATED: Use htmlNewParserCtxt and htmlCtxtReadMemory.
5050 *
5051 * Create a parser context for an HTML in-memory document. The input
5052 * buffer must not contain any terminating null bytes.
5053 *
5054 * Returns the new parser context or NULL
5055 */
5056 htmlParserCtxtPtr
htmlCreateMemoryParserCtxt(const char * buffer,int size)5057 htmlCreateMemoryParserCtxt(const char *buffer, int size) {
5058 if (size <= 0)
5059 return(NULL);
5060
5061 return(htmlCreateMemoryParserCtxtInternal(NULL, buffer, size, NULL));
5062 }
5063
5064 /**
5065 * htmlCreateDocParserCtxt:
5066 * @str: a pointer to an array of xmlChar
5067 * @encoding: encoding (optional)
5068 *
5069 * Create a parser context for a null-terminated string.
5070 *
5071 * Returns the new parser context or NULL if a memory allocation failed.
5072 */
5073 static htmlParserCtxtPtr
htmlCreateDocParserCtxt(const xmlChar * str,const char * url,const char * encoding)5074 htmlCreateDocParserCtxt(const xmlChar *str, const char *url,
5075 const char *encoding) {
5076 xmlParserCtxtPtr ctxt;
5077 xmlParserInputPtr input;
5078
5079 if (str == NULL)
5080 return(NULL);
5081
5082 ctxt = htmlNewParserCtxt();
5083 if (ctxt == NULL)
5084 return(NULL);
5085
5086 input = xmlNewInputString(ctxt, url, (const char *) str, encoding, 0);
5087 if (input == NULL) {
5088 xmlFreeParserCtxt(ctxt);
5089 return(NULL);
5090 }
5091
5092 inputPush(ctxt, input);
5093
5094 return(ctxt);
5095 }
5096
5097 #ifdef LIBXML_PUSH_ENABLED
5098 /************************************************************************
5099 * *
5100 * Progressive parsing interfaces *
5101 * *
5102 ************************************************************************/
5103
5104 /**
5105 * htmlParseLookupSequence:
5106 * @ctxt: an HTML parser context
5107 * @first: the first char to lookup
5108 * @next: the next char to lookup or zero
5109 * @third: the next char to lookup or zero
5110 * @ignoreattrval: skip over attribute values
5111 *
5112 * Try to find if a sequence (first, next, third) or just (first next) or
5113 * (first) is available in the input stream.
5114 * This function has a side effect of (possibly) incrementing ctxt->checkIndex
5115 * to avoid rescanning sequences of bytes, it DOES change the state of the
5116 * parser, do not use liberally.
5117 * This is basically similar to xmlParseLookupSequence()
5118 *
5119 * Returns the index to the current parsing point if the full sequence
5120 * is available, -1 otherwise.
5121 */
5122 static int
htmlParseLookupSequence(htmlParserCtxtPtr ctxt,xmlChar first,xmlChar next,xmlChar third,int ignoreattrval)5123 htmlParseLookupSequence(htmlParserCtxtPtr ctxt, xmlChar first,
5124 xmlChar next, xmlChar third, int ignoreattrval)
5125 {
5126 size_t base, len;
5127 htmlParserInputPtr in;
5128 const xmlChar *buf;
5129 int quote;
5130
5131 in = ctxt->input;
5132 if (in == NULL)
5133 return (-1);
5134
5135 base = ctxt->checkIndex;
5136 quote = ctxt->endCheckState;
5137
5138 buf = in->cur;
5139 len = in->end - in->cur;
5140
5141 /* take into account the sequence length */
5142 if (third)
5143 len -= 2;
5144 else if (next)
5145 len--;
5146 for (; base < len; base++) {
5147 if (base >= INT_MAX / 2) {
5148 ctxt->checkIndex = 0;
5149 ctxt->endCheckState = 0;
5150 return (base - 2);
5151 }
5152 if (ignoreattrval) {
5153 if (quote) {
5154 if (buf[base] == quote)
5155 quote = 0;
5156 continue;
5157 }
5158 if (buf[base] == '"' || buf[base] == '\'') {
5159 quote = buf[base];
5160 continue;
5161 }
5162 }
5163 if (buf[base] == first) {
5164 if (third != 0) {
5165 if ((buf[base + 1] != next) || (buf[base + 2] != third))
5166 continue;
5167 } else if (next != 0) {
5168 if (buf[base + 1] != next)
5169 continue;
5170 }
5171 ctxt->checkIndex = 0;
5172 ctxt->endCheckState = 0;
5173 return (base);
5174 }
5175 }
5176 ctxt->checkIndex = base;
5177 ctxt->endCheckState = quote;
5178 return (-1);
5179 }
5180
5181 /**
5182 * htmlParseLookupCommentEnd:
5183 * @ctxt: an HTML parser context
5184 *
5185 * Try to find a comment end tag in the input stream
5186 * The search includes "-->" as well as WHATWG-recommended incorrectly-closed tags.
5187 * (See https://html.spec.whatwg.org/multipage/parsing.html#parse-error-incorrectly-closed-comment)
5188 * This function has a side effect of (possibly) incrementing ctxt->checkIndex
5189 * to avoid rescanning sequences of bytes, it DOES change the state of the
5190 * parser, do not use liberally.
5191 * This wraps to htmlParseLookupSequence()
5192 *
5193 * Returns the index to the current parsing point if the full sequence is available, -1 otherwise.
5194 */
5195 static int
htmlParseLookupCommentEnd(htmlParserCtxtPtr ctxt)5196 htmlParseLookupCommentEnd(htmlParserCtxtPtr ctxt)
5197 {
5198 int mark = 0;
5199 int offset;
5200
5201 while (1) {
5202 mark = htmlParseLookupSequence(ctxt, '-', '-', 0, 0);
5203 if (mark < 0)
5204 break;
5205 if ((NXT(mark+2) == '>') ||
5206 ((NXT(mark+2) == '!') && (NXT(mark+3) == '>'))) {
5207 ctxt->checkIndex = 0;
5208 break;
5209 }
5210 offset = (NXT(mark+2) == '!') ? 3 : 2;
5211 if (mark + offset >= ctxt->input->end - ctxt->input->cur) {
5212 ctxt->checkIndex = mark;
5213 return(-1);
5214 }
5215 ctxt->checkIndex = mark + 1;
5216 }
5217 return mark;
5218 }
5219
5220
5221 /**
5222 * htmlParseTryOrFinish:
5223 * @ctxt: an HTML parser context
5224 * @terminate: last chunk indicator
5225 *
5226 * Try to progress on parsing
5227 *
5228 * Returns zero if no parsing was possible
5229 */
5230 static int
htmlParseTryOrFinish(htmlParserCtxtPtr ctxt,int terminate)5231 htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) {
5232 int ret = 0;
5233 htmlParserInputPtr in;
5234 ptrdiff_t avail = 0;
5235 xmlChar cur, next;
5236
5237 htmlParserNodeInfo node_info;
5238
5239 while (PARSER_STOPPED(ctxt) == 0) {
5240
5241 in = ctxt->input;
5242 if (in == NULL) break;
5243 avail = in->end - in->cur;
5244 if ((avail == 0) && (terminate)) {
5245 htmlAutoCloseOnEnd(ctxt);
5246 if ((ctxt->nameNr == 0) && (ctxt->instate != XML_PARSER_EOF)) {
5247 /*
5248 * SAX: end of the document processing.
5249 */
5250 ctxt->instate = XML_PARSER_EOF;
5251 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
5252 ctxt->sax->endDocument(ctxt->userData);
5253 }
5254 }
5255 if (avail < 1)
5256 goto done;
5257 /*
5258 * This is done to make progress and avoid an infinite loop
5259 * if a parsing attempt was aborted by hitting a NUL byte. After
5260 * changing htmlCurrentChar, this probably isn't necessary anymore.
5261 * We should consider removing this check.
5262 */
5263 cur = in->cur[0];
5264 if (cur == 0) {
5265 SKIP(1);
5266 continue;
5267 }
5268
5269 switch (ctxt->instate) {
5270 case XML_PARSER_EOF:
5271 /*
5272 * Document parsing is done !
5273 */
5274 goto done;
5275 case XML_PARSER_START:
5276 /*
5277 * This is wrong but matches long-standing behavior. In most
5278 * cases, a document starting with an XML declaration will
5279 * specify UTF-8.
5280 */
5281 if (((ctxt->input->flags & XML_INPUT_HAS_ENCODING) == 0) &&
5282 (xmlStrncmp(ctxt->input->cur, BAD_CAST "<?xm", 4) == 0)) {
5283 xmlSwitchEncoding(ctxt, XML_CHAR_ENCODING_UTF8);
5284 }
5285
5286 /*
5287 * Very first chars read from the document flow.
5288 */
5289 cur = in->cur[0];
5290 if (IS_BLANK_CH(cur)) {
5291 SKIP_BLANKS;
5292 avail = in->end - in->cur;
5293 }
5294 if ((ctxt->sax) && (ctxt->sax->setDocumentLocator)) {
5295 xmlSAXLocator copy = xmlDefaultSAXLocator;
5296 ctxt->sax->setDocumentLocator(ctxt->userData, ©);
5297 }
5298 if ((ctxt->sax) && (ctxt->sax->startDocument) &&
5299 (!ctxt->disableSAX))
5300 ctxt->sax->startDocument(ctxt->userData);
5301
5302 cur = in->cur[0];
5303 next = in->cur[1];
5304 if ((cur == '<') && (next == '!') &&
5305 (UPP(2) == 'D') && (UPP(3) == 'O') &&
5306 (UPP(4) == 'C') && (UPP(5) == 'T') &&
5307 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
5308 (UPP(8) == 'E')) {
5309 if ((!terminate) &&
5310 (htmlParseLookupSequence(ctxt, '>', 0, 0, 1) < 0))
5311 goto done;
5312 htmlParseDocTypeDecl(ctxt);
5313 ctxt->instate = XML_PARSER_PROLOG;
5314 } else {
5315 ctxt->instate = XML_PARSER_MISC;
5316 }
5317 break;
5318 case XML_PARSER_MISC:
5319 SKIP_BLANKS;
5320 avail = in->end - in->cur;
5321 /*
5322 * no chars in buffer
5323 */
5324 if (avail < 1)
5325 goto done;
5326 /*
5327 * not enough chars in buffer
5328 */
5329 if (avail < 2) {
5330 if (!terminate)
5331 goto done;
5332 else
5333 next = ' ';
5334 } else {
5335 next = in->cur[1];
5336 }
5337 cur = in->cur[0];
5338 if ((cur == '<') && (next == '!') &&
5339 (in->cur[2] == '-') && (in->cur[3] == '-')) {
5340 if ((!terminate) && (htmlParseLookupCommentEnd(ctxt) < 0))
5341 goto done;
5342 htmlParseComment(ctxt);
5343 ctxt->instate = XML_PARSER_MISC;
5344 } else if ((cur == '<') && (next == '?')) {
5345 if ((!terminate) &&
5346 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
5347 goto done;
5348 htmlParsePI(ctxt);
5349 ctxt->instate = XML_PARSER_MISC;
5350 } else if ((cur == '<') && (next == '!') &&
5351 (UPP(2) == 'D') && (UPP(3) == 'O') &&
5352 (UPP(4) == 'C') && (UPP(5) == 'T') &&
5353 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
5354 (UPP(8) == 'E')) {
5355 if ((!terminate) &&
5356 (htmlParseLookupSequence(ctxt, '>', 0, 0, 1) < 0))
5357 goto done;
5358 htmlParseDocTypeDecl(ctxt);
5359 ctxt->instate = XML_PARSER_PROLOG;
5360 } else if ((cur == '<') && (next == '!') &&
5361 (avail < 9)) {
5362 goto done;
5363 } else {
5364 ctxt->instate = XML_PARSER_CONTENT;
5365 }
5366 break;
5367 case XML_PARSER_PROLOG:
5368 SKIP_BLANKS;
5369 avail = in->end - in->cur;
5370 if (avail < 2)
5371 goto done;
5372 cur = in->cur[0];
5373 next = in->cur[1];
5374 if ((cur == '<') && (next == '!') &&
5375 (in->cur[2] == '-') && (in->cur[3] == '-')) {
5376 if ((!terminate) && (htmlParseLookupCommentEnd(ctxt) < 0))
5377 goto done;
5378 htmlParseComment(ctxt);
5379 ctxt->instate = XML_PARSER_PROLOG;
5380 } else if ((cur == '<') && (next == '?')) {
5381 if ((!terminate) &&
5382 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
5383 goto done;
5384 htmlParsePI(ctxt);
5385 ctxt->instate = XML_PARSER_PROLOG;
5386 } else if ((cur == '<') && (next == '!') &&
5387 (avail < 4)) {
5388 goto done;
5389 } else {
5390 ctxt->instate = XML_PARSER_CONTENT;
5391 }
5392 break;
5393 case XML_PARSER_EPILOG:
5394 avail = in->end - in->cur;
5395 if (avail < 1)
5396 goto done;
5397 cur = in->cur[0];
5398 if (IS_BLANK_CH(cur)) {
5399 htmlParseCharData(ctxt);
5400 goto done;
5401 }
5402 if (avail < 2)
5403 goto done;
5404 next = in->cur[1];
5405 if ((cur == '<') && (next == '!') &&
5406 (in->cur[2] == '-') && (in->cur[3] == '-')) {
5407 if ((!terminate) && (htmlParseLookupCommentEnd(ctxt) < 0))
5408 goto done;
5409 htmlParseComment(ctxt);
5410 ctxt->instate = XML_PARSER_EPILOG;
5411 } else if ((cur == '<') && (next == '?')) {
5412 if ((!terminate) &&
5413 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
5414 goto done;
5415 htmlParsePI(ctxt);
5416 ctxt->instate = XML_PARSER_EPILOG;
5417 } else if ((cur == '<') && (next == '!') &&
5418 (avail < 4)) {
5419 goto done;
5420 } else {
5421 ctxt->errNo = XML_ERR_DOCUMENT_END;
5422 ctxt->wellFormed = 0;
5423 ctxt->instate = XML_PARSER_EOF;
5424 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
5425 ctxt->sax->endDocument(ctxt->userData);
5426 goto done;
5427 }
5428 break;
5429 case XML_PARSER_START_TAG: {
5430 const xmlChar *name;
5431 int failed;
5432 const htmlElemDesc * info;
5433
5434 /*
5435 * no chars in buffer
5436 */
5437 if (avail < 1)
5438 goto done;
5439 /*
5440 * not enough chars in buffer
5441 */
5442 if (avail < 2) {
5443 if (!terminate)
5444 goto done;
5445 else
5446 next = ' ';
5447 } else {
5448 next = in->cur[1];
5449 }
5450 cur = in->cur[0];
5451 if (cur != '<') {
5452 ctxt->instate = XML_PARSER_CONTENT;
5453 break;
5454 }
5455 if (next == '/') {
5456 ctxt->instate = XML_PARSER_END_TAG;
5457 ctxt->checkIndex = 0;
5458 break;
5459 }
5460 if ((!terminate) &&
5461 (htmlParseLookupSequence(ctxt, '>', 0, 0, 1) < 0))
5462 goto done;
5463
5464 /* Capture start position */
5465 if (ctxt->record_info) {
5466 node_info.begin_pos = ctxt->input->consumed +
5467 (CUR_PTR - ctxt->input->base);
5468 node_info.begin_line = ctxt->input->line;
5469 }
5470
5471
5472 failed = htmlParseStartTag(ctxt);
5473 name = ctxt->name;
5474 if ((failed == -1) ||
5475 (name == NULL)) {
5476 if (CUR == '>')
5477 NEXT;
5478 break;
5479 }
5480
5481 /*
5482 * Lookup the info for that element.
5483 */
5484 info = htmlTagLookup(name);
5485 if (info == NULL) {
5486 htmlParseErr(ctxt, XML_HTML_UNKNOWN_TAG,
5487 "Tag %s invalid\n", name, NULL);
5488 }
5489
5490 /*
5491 * Check for an Empty Element labeled the XML/SGML way
5492 */
5493 if ((CUR == '/') && (NXT(1) == '>')) {
5494 SKIP(2);
5495 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
5496 ctxt->sax->endElement(ctxt->userData, name);
5497 htmlnamePop(ctxt);
5498 ctxt->instate = XML_PARSER_CONTENT;
5499 break;
5500 }
5501
5502 if (CUR == '>') {
5503 NEXT;
5504 } else {
5505 htmlParseErr(ctxt, XML_ERR_GT_REQUIRED,
5506 "Couldn't find end of Start Tag %s\n",
5507 name, NULL);
5508
5509 /*
5510 * end of parsing of this node.
5511 */
5512 if (xmlStrEqual(name, ctxt->name)) {
5513 nodePop(ctxt);
5514 htmlnamePop(ctxt);
5515 }
5516
5517 if (ctxt->record_info)
5518 htmlNodeInfoPush(ctxt, &node_info);
5519
5520 ctxt->instate = XML_PARSER_CONTENT;
5521 break;
5522 }
5523
5524 /*
5525 * Check for an Empty Element from DTD definition
5526 */
5527 if ((info != NULL) && (info->empty)) {
5528 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
5529 ctxt->sax->endElement(ctxt->userData, name);
5530 htmlnamePop(ctxt);
5531 }
5532
5533 if (ctxt->record_info)
5534 htmlNodeInfoPush(ctxt, &node_info);
5535
5536 ctxt->instate = XML_PARSER_CONTENT;
5537 break;
5538 }
5539 case XML_PARSER_CONTENT: {
5540 xmlChar chr[2] = { 0, 0 };
5541
5542 /*
5543 * Handle preparsed entities and charRef
5544 */
5545 if ((avail == 1) && (terminate)) {
5546 cur = in->cur[0];
5547 if ((cur != '<') && (cur != '&')) {
5548 if (ctxt->sax != NULL) {
5549 chr[0] = cur;
5550 if (IS_BLANK_CH(cur)) {
5551 if (ctxt->keepBlanks) {
5552 if (ctxt->sax->characters != NULL)
5553 ctxt->sax->characters(
5554 ctxt->userData, chr, 1);
5555 } else {
5556 if (ctxt->sax->ignorableWhitespace != NULL)
5557 ctxt->sax->ignorableWhitespace(
5558 ctxt->userData, chr, 1);
5559 }
5560 } else {
5561 htmlCheckParagraph(ctxt);
5562 if (ctxt->sax->characters != NULL)
5563 ctxt->sax->characters(
5564 ctxt->userData, chr, 1);
5565 }
5566 }
5567 ctxt->checkIndex = 0;
5568 in->cur++;
5569 break;
5570 }
5571 }
5572 if (avail < 2)
5573 goto done;
5574 cur = in->cur[0];
5575 next = in->cur[1];
5576 if ((xmlStrEqual(ctxt->name, BAD_CAST"script")) ||
5577 (xmlStrEqual(ctxt->name, BAD_CAST"style"))) {
5578 /*
5579 * Handle SCRIPT/STYLE separately
5580 */
5581 if (!terminate) {
5582 int idx;
5583 xmlChar val;
5584
5585 idx = htmlParseLookupSequence(ctxt, '<', '/', 0, 0);
5586 if (idx < 0)
5587 goto done;
5588 val = in->cur[idx + 2];
5589 if (val == 0) { /* bad cut of input */
5590 /*
5591 * FIXME: htmlParseScript checks for additional
5592 * characters after '</'.
5593 */
5594 ctxt->checkIndex = idx;
5595 goto done;
5596 }
5597 }
5598 htmlParseScript(ctxt);
5599 if ((cur == '<') && (next == '/')) {
5600 ctxt->instate = XML_PARSER_END_TAG;
5601 ctxt->checkIndex = 0;
5602 break;
5603 }
5604 } else if ((cur == '<') && (next == '!')) {
5605 if (avail < 4)
5606 goto done;
5607 /*
5608 * Sometimes DOCTYPE arrives in the middle of the document
5609 */
5610 if ((UPP(2) == 'D') && (UPP(3) == 'O') &&
5611 (UPP(4) == 'C') && (UPP(5) == 'T') &&
5612 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
5613 (UPP(8) == 'E')) {
5614 if ((!terminate) &&
5615 (htmlParseLookupSequence(ctxt, '>', 0, 0, 1) < 0))
5616 goto done;
5617 htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
5618 "Misplaced DOCTYPE declaration\n",
5619 BAD_CAST "DOCTYPE" , NULL);
5620 htmlParseDocTypeDecl(ctxt);
5621 } else if ((in->cur[2] == '-') && (in->cur[3] == '-')) {
5622 if ((!terminate) &&
5623 (htmlParseLookupCommentEnd(ctxt) < 0))
5624 goto done;
5625 htmlParseComment(ctxt);
5626 ctxt->instate = XML_PARSER_CONTENT;
5627 } else {
5628 if ((!terminate) &&
5629 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
5630 goto done;
5631 htmlSkipBogusComment(ctxt);
5632 }
5633 } else if ((cur == '<') && (next == '?')) {
5634 if ((!terminate) &&
5635 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
5636 goto done;
5637 htmlParsePI(ctxt);
5638 ctxt->instate = XML_PARSER_CONTENT;
5639 } else if ((cur == '<') && (next == '/')) {
5640 ctxt->instate = XML_PARSER_END_TAG;
5641 ctxt->checkIndex = 0;
5642 break;
5643 } else if ((cur == '<') && IS_ASCII_LETTER(next)) {
5644 if ((!terminate) && (next == 0))
5645 goto done;
5646 ctxt->instate = XML_PARSER_START_TAG;
5647 ctxt->checkIndex = 0;
5648 break;
5649 } else if (cur == '<') {
5650 if ((ctxt->sax != NULL) && (!ctxt->disableSAX) &&
5651 (ctxt->sax->characters != NULL))
5652 ctxt->sax->characters(ctxt->userData,
5653 BAD_CAST "<", 1);
5654 NEXT;
5655 } else {
5656 /*
5657 * check that the text sequence is complete
5658 * before handing out the data to the parser
5659 * to avoid problems with erroneous end of
5660 * data detection.
5661 */
5662 if ((!terminate) &&
5663 (htmlParseLookupSequence(ctxt, '<', 0, 0, 0) < 0))
5664 goto done;
5665 ctxt->checkIndex = 0;
5666 while ((PARSER_STOPPED(ctxt) == 0) &&
5667 (cur != '<') && (in->cur < in->end)) {
5668 if (cur == '&') {
5669 htmlParseReference(ctxt);
5670 } else {
5671 htmlParseCharData(ctxt);
5672 }
5673 cur = in->cur[0];
5674 }
5675 }
5676
5677 break;
5678 }
5679 case XML_PARSER_END_TAG:
5680 if (avail < 2)
5681 goto done;
5682 if ((!terminate) &&
5683 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
5684 goto done;
5685 htmlParseEndTag(ctxt);
5686 if (ctxt->nameNr == 0) {
5687 ctxt->instate = XML_PARSER_EPILOG;
5688 } else {
5689 ctxt->instate = XML_PARSER_CONTENT;
5690 }
5691 ctxt->checkIndex = 0;
5692 break;
5693 default:
5694 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5695 "HPP: internal error\n", NULL, NULL);
5696 ctxt->instate = XML_PARSER_EOF;
5697 break;
5698 }
5699 }
5700 done:
5701 if ((avail == 0) && (terminate)) {
5702 htmlAutoCloseOnEnd(ctxt);
5703 if ((ctxt->nameNr == 0) && (ctxt->instate != XML_PARSER_EOF)) {
5704 /*
5705 * SAX: end of the document processing.
5706 */
5707 ctxt->instate = XML_PARSER_EOF;
5708 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
5709 ctxt->sax->endDocument(ctxt->userData);
5710 }
5711 }
5712 if ((!(ctxt->options & HTML_PARSE_NODEFDTD)) && (ctxt->myDoc != NULL) &&
5713 ((terminate) || (ctxt->instate == XML_PARSER_EOF) ||
5714 (ctxt->instate == XML_PARSER_EPILOG))) {
5715 xmlDtdPtr dtd;
5716 dtd = xmlGetIntSubset(ctxt->myDoc);
5717 if (dtd == NULL) {
5718 ctxt->myDoc->intSubset =
5719 xmlCreateIntSubset(ctxt->myDoc, BAD_CAST "html",
5720 BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN",
5721 BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd");
5722 if (ctxt->myDoc->intSubset == NULL)
5723 htmlErrMemory(ctxt);
5724 }
5725 }
5726 return(ret);
5727 }
5728
5729 /**
5730 * htmlParseChunk:
5731 * @ctxt: an HTML parser context
5732 * @chunk: chunk of memory
5733 * @size: size of chunk in bytes
5734 * @terminate: last chunk indicator
5735 *
5736 * Parse a chunk of memory in push parser mode.
5737 *
5738 * Assumes that the parser context was initialized with
5739 * htmlCreatePushParserCtxt.
5740 *
5741 * The last chunk, which will often be empty, must be marked with
5742 * the @terminate flag. With the default SAX callbacks, the resulting
5743 * document will be available in ctxt->myDoc. This pointer will not
5744 * be freed by the library.
5745 *
5746 * If the document isn't well-formed, ctxt->myDoc is set to NULL.
5747 *
5748 * Returns an xmlParserErrors code (0 on success).
5749 */
5750 int
htmlParseChunk(htmlParserCtxtPtr ctxt,const char * chunk,int size,int terminate)5751 htmlParseChunk(htmlParserCtxtPtr ctxt, const char *chunk, int size,
5752 int terminate) {
5753 if ((ctxt == NULL) || (ctxt->input == NULL))
5754 return(XML_ERR_ARGUMENT);
5755 if (PARSER_STOPPED(ctxt) != 0)
5756 return(ctxt->errNo);
5757 if ((size > 0) && (chunk != NULL) && (ctxt->input != NULL) &&
5758 (ctxt->input->buf != NULL)) {
5759 size_t pos = ctxt->input->cur - ctxt->input->base;
5760 int res;
5761
5762 res = xmlParserInputBufferPush(ctxt->input->buf, size, chunk);
5763 xmlBufUpdateInput(ctxt->input->buf->buffer, ctxt->input, pos);
5764 if (res < 0) {
5765 htmlParseErr(ctxt, ctxt->input->buf->error,
5766 "xmlParserInputBufferPush failed", NULL, NULL);
5767 xmlHaltParser(ctxt);
5768 return (ctxt->errNo);
5769 }
5770 }
5771 htmlParseTryOrFinish(ctxt, terminate);
5772 if (terminate) {
5773 if (ctxt->instate != XML_PARSER_EOF) {
5774 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
5775 ctxt->sax->endDocument(ctxt->userData);
5776 }
5777 ctxt->instate = XML_PARSER_EOF;
5778 }
5779 return((xmlParserErrors) ctxt->errNo);
5780 }
5781
5782 /************************************************************************
5783 * *
5784 * User entry points *
5785 * *
5786 ************************************************************************/
5787
5788 /**
5789 * htmlCreatePushParserCtxt:
5790 * @sax: a SAX handler (optional)
5791 * @user_data: The user data returned on SAX callbacks (optional)
5792 * @chunk: a pointer to an array of chars (optional)
5793 * @size: number of chars in the array
5794 * @filename: only used for error reporting (optional)
5795 * @enc: encoding (deprecated, pass XML_CHAR_ENCODING_NONE)
5796 *
5797 * Create a parser context for using the HTML parser in push mode.
5798 *
5799 * Returns the new parser context or NULL if a memory allocation
5800 * failed.
5801 */
5802 htmlParserCtxtPtr
htmlCreatePushParserCtxt(htmlSAXHandlerPtr sax,void * user_data,const char * chunk,int size,const char * filename,xmlCharEncoding enc)5803 htmlCreatePushParserCtxt(htmlSAXHandlerPtr sax, void *user_data,
5804 const char *chunk, int size, const char *filename,
5805 xmlCharEncoding enc) {
5806 htmlParserCtxtPtr ctxt;
5807 htmlParserInputPtr input;
5808 const char *encoding;
5809
5810 ctxt = htmlNewSAXParserCtxt(sax, user_data);
5811 if (ctxt == NULL)
5812 return(NULL);
5813
5814 encoding = xmlGetCharEncodingName(enc);
5815 input = xmlNewInputPush(ctxt, filename, chunk, size, encoding);
5816 if (input == NULL) {
5817 htmlFreeParserCtxt(ctxt);
5818 return(NULL);
5819 }
5820 inputPush(ctxt, input);
5821
5822 return(ctxt);
5823 }
5824 #endif /* LIBXML_PUSH_ENABLED */
5825
5826 /**
5827 * htmlSAXParseDoc:
5828 * @cur: a pointer to an array of xmlChar
5829 * @encoding: a free form C string describing the HTML document encoding, or NULL
5830 * @sax: the SAX handler block
5831 * @userData: if using SAX, this pointer will be provided on callbacks.
5832 *
5833 * DEPRECATED: Use htmlNewSAXParserCtxt and htmlCtxtReadDoc.
5834 *
5835 * Parse an HTML in-memory document. If sax is not NULL, use the SAX callbacks
5836 * to handle parse events. If sax is NULL, fallback to the default DOM
5837 * behavior and return a tree.
5838 *
5839 * Returns the resulting document tree unless SAX is NULL or the document is
5840 * not well formed.
5841 */
5842
5843 htmlDocPtr
htmlSAXParseDoc(const xmlChar * cur,const char * encoding,htmlSAXHandlerPtr sax,void * userData)5844 htmlSAXParseDoc(const xmlChar *cur, const char *encoding,
5845 htmlSAXHandlerPtr sax, void *userData) {
5846 htmlDocPtr ret;
5847 htmlParserCtxtPtr ctxt;
5848
5849 if (cur == NULL)
5850 return(NULL);
5851
5852 ctxt = htmlCreateDocParserCtxt(cur, NULL, encoding);
5853 if (ctxt == NULL)
5854 return(NULL);
5855
5856 if (sax != NULL) {
5857 *ctxt->sax = *sax;
5858 ctxt->userData = userData;
5859 }
5860
5861 htmlParseDocument(ctxt);
5862 ret = ctxt->myDoc;
5863 htmlFreeParserCtxt(ctxt);
5864
5865 return(ret);
5866 }
5867
5868 /**
5869 * htmlParseDoc:
5870 * @cur: a pointer to an array of xmlChar
5871 * @encoding: the encoding (optional)
5872 *
5873 * DEPRECATED: Use htmlReadDoc.
5874 *
5875 * Parse an HTML in-memory document and build a tree.
5876 *
5877 * This function uses deprecated global parser options.
5878 *
5879 * Returns the resulting document tree
5880 */
5881
5882 htmlDocPtr
htmlParseDoc(const xmlChar * cur,const char * encoding)5883 htmlParseDoc(const xmlChar *cur, const char *encoding) {
5884 return(htmlSAXParseDoc(cur, encoding, NULL, NULL));
5885 }
5886
5887
5888 /**
5889 * htmlCreateFileParserCtxt:
5890 * @filename: the filename
5891 * @encoding: optional encoding
5892 *
5893 * DEPRECATED: Use htmlNewParserCtxt and htmlCtxtReadFile.
5894 *
5895 * Create a parser context to read from a file.
5896 *
5897 * A non-NULL encoding overrides encoding declarations in the document.
5898 *
5899 * Automatic support for ZLIB/Compress compressed document is provided
5900 * by default if found at compile-time.
5901 *
5902 * Returns the new parser context or NULL if a memory allocation failed.
5903 */
5904 htmlParserCtxtPtr
htmlCreateFileParserCtxt(const char * filename,const char * encoding)5905 htmlCreateFileParserCtxt(const char *filename, const char *encoding)
5906 {
5907 htmlParserCtxtPtr ctxt;
5908 htmlParserInputPtr input;
5909
5910 if (filename == NULL)
5911 return(NULL);
5912
5913 ctxt = htmlNewParserCtxt();
5914 if (ctxt == NULL) {
5915 return(NULL);
5916 }
5917
5918 input = xmlNewInputURL(ctxt, filename, NULL, encoding, 0);
5919 if (input == NULL) {
5920 xmlFreeParserCtxt(ctxt);
5921 return(NULL);
5922 }
5923 inputPush(ctxt, input);
5924
5925 return(ctxt);
5926 }
5927
5928 /**
5929 * htmlSAXParseFile:
5930 * @filename: the filename
5931 * @encoding: encoding (optional)
5932 * @sax: the SAX handler block
5933 * @userData: if using SAX, this pointer will be provided on callbacks.
5934 *
5935 * DEPRECATED: Use htmlNewSAXParserCtxt and htmlCtxtReadFile.
5936 *
5937 * parse an HTML file and build a tree. Automatic support for ZLIB/Compress
5938 * compressed document is provided by default if found at compile-time.
5939 * It use the given SAX function block to handle the parsing callback.
5940 * If sax is NULL, fallback to the default DOM tree building routines.
5941 *
5942 * Returns the resulting document tree unless SAX is NULL or the document is
5943 * not well formed.
5944 */
5945
5946 htmlDocPtr
htmlSAXParseFile(const char * filename,const char * encoding,htmlSAXHandlerPtr sax,void * userData)5947 htmlSAXParseFile(const char *filename, const char *encoding, htmlSAXHandlerPtr sax,
5948 void *userData) {
5949 htmlDocPtr ret;
5950 htmlParserCtxtPtr ctxt;
5951 htmlSAXHandlerPtr oldsax = NULL;
5952
5953 ctxt = htmlCreateFileParserCtxt(filename, encoding);
5954 if (ctxt == NULL) return(NULL);
5955 if (sax != NULL) {
5956 oldsax = ctxt->sax;
5957 ctxt->sax = sax;
5958 ctxt->userData = userData;
5959 }
5960
5961 htmlParseDocument(ctxt);
5962
5963 ret = ctxt->myDoc;
5964 if (sax != NULL) {
5965 ctxt->sax = oldsax;
5966 ctxt->userData = NULL;
5967 }
5968 htmlFreeParserCtxt(ctxt);
5969
5970 return(ret);
5971 }
5972
5973 /**
5974 * htmlParseFile:
5975 * @filename: the filename
5976 * @encoding: encoding (optional)
5977 *
5978 * Parse an HTML file and build a tree.
5979 *
5980 * See xmlNewInputURL for details.
5981 *
5982 * Returns the resulting document tree
5983 */
5984
5985 htmlDocPtr
htmlParseFile(const char * filename,const char * encoding)5986 htmlParseFile(const char *filename, const char *encoding) {
5987 return(htmlSAXParseFile(filename, encoding, NULL, NULL));
5988 }
5989
5990 /**
5991 * htmlHandleOmittedElem:
5992 * @val: int 0 or 1
5993 *
5994 * Set and return the previous value for handling HTML omitted tags.
5995 *
5996 * Returns the last value for 0 for no handling, 1 for auto insertion.
5997 */
5998
5999 int
htmlHandleOmittedElem(int val)6000 htmlHandleOmittedElem(int val) {
6001 int old = htmlOmittedDefaultValue;
6002
6003 htmlOmittedDefaultValue = val;
6004 return(old);
6005 }
6006
6007 /**
6008 * htmlElementAllowedHere:
6009 * @parent: HTML parent element
6010 * @elt: HTML element
6011 *
6012 * Checks whether an HTML element may be a direct child of a parent element.
6013 * Note - doesn't check for deprecated elements
6014 *
6015 * Returns 1 if allowed; 0 otherwise.
6016 */
6017 int
htmlElementAllowedHere(const htmlElemDesc * parent,const xmlChar * elt)6018 htmlElementAllowedHere(const htmlElemDesc* parent, const xmlChar* elt) {
6019 const char** p ;
6020
6021 if ( ! elt || ! parent || ! parent->subelts )
6022 return 0 ;
6023
6024 for ( p = parent->subelts; *p; ++p )
6025 if ( !xmlStrcmp((const xmlChar *)*p, elt) )
6026 return 1 ;
6027
6028 return 0 ;
6029 }
6030 /**
6031 * htmlElementStatusHere:
6032 * @parent: HTML parent element
6033 * @elt: HTML element
6034 *
6035 * Checks whether an HTML element may be a direct child of a parent element.
6036 * and if so whether it is valid or deprecated.
6037 *
6038 * Returns one of HTML_VALID, HTML_DEPRECATED, HTML_INVALID
6039 */
6040 htmlStatus
htmlElementStatusHere(const htmlElemDesc * parent,const htmlElemDesc * elt)6041 htmlElementStatusHere(const htmlElemDesc* parent, const htmlElemDesc* elt) {
6042 if ( ! parent || ! elt )
6043 return HTML_INVALID ;
6044 if ( ! htmlElementAllowedHere(parent, (const xmlChar*) elt->name ) )
6045 return HTML_INVALID ;
6046
6047 return ( elt->dtd == 0 ) ? HTML_VALID : HTML_DEPRECATED ;
6048 }
6049 /**
6050 * htmlAttrAllowed:
6051 * @elt: HTML element
6052 * @attr: HTML attribute
6053 * @legacy: whether to allow deprecated attributes
6054 *
6055 * Checks whether an attribute is valid for an element
6056 * Has full knowledge of Required and Deprecated attributes
6057 *
6058 * Returns one of HTML_REQUIRED, HTML_VALID, HTML_DEPRECATED, HTML_INVALID
6059 */
6060 htmlStatus
htmlAttrAllowed(const htmlElemDesc * elt,const xmlChar * attr,int legacy)6061 htmlAttrAllowed(const htmlElemDesc* elt, const xmlChar* attr, int legacy) {
6062 const char** p ;
6063
6064 if ( !elt || ! attr )
6065 return HTML_INVALID ;
6066
6067 if ( elt->attrs_req )
6068 for ( p = elt->attrs_req; *p; ++p)
6069 if ( !xmlStrcmp((const xmlChar*)*p, attr) )
6070 return HTML_REQUIRED ;
6071
6072 if ( elt->attrs_opt )
6073 for ( p = elt->attrs_opt; *p; ++p)
6074 if ( !xmlStrcmp((const xmlChar*)*p, attr) )
6075 return HTML_VALID ;
6076
6077 if ( legacy && elt->attrs_depr )
6078 for ( p = elt->attrs_depr; *p; ++p)
6079 if ( !xmlStrcmp((const xmlChar*)*p, attr) )
6080 return HTML_DEPRECATED ;
6081
6082 return HTML_INVALID ;
6083 }
6084 /**
6085 * htmlNodeStatus:
6086 * @node: an htmlNodePtr in a tree
6087 * @legacy: whether to allow deprecated elements (YES is faster here
6088 * for Element nodes)
6089 *
6090 * Checks whether the tree node is valid. Experimental (the author
6091 * only uses the HTML enhancements in a SAX parser)
6092 *
6093 * Return: for Element nodes, a return from htmlElementAllowedHere (if
6094 * legacy allowed) or htmlElementStatusHere (otherwise).
6095 * for Attribute nodes, a return from htmlAttrAllowed
6096 * for other nodes, HTML_NA (no checks performed)
6097 */
6098 htmlStatus
htmlNodeStatus(htmlNodePtr node,int legacy)6099 htmlNodeStatus(htmlNodePtr node, int legacy) {
6100 if ( ! node )
6101 return HTML_INVALID ;
6102
6103 switch ( node->type ) {
6104 case XML_ELEMENT_NODE:
6105 return legacy
6106 ? ( htmlElementAllowedHere (
6107 htmlTagLookup(node->parent->name) , node->name
6108 ) ? HTML_VALID : HTML_INVALID )
6109 : htmlElementStatusHere(
6110 htmlTagLookup(node->parent->name) ,
6111 htmlTagLookup(node->name) )
6112 ;
6113 case XML_ATTRIBUTE_NODE:
6114 return htmlAttrAllowed(
6115 htmlTagLookup(node->parent->name) , node->name, legacy) ;
6116 default: return HTML_NA ;
6117 }
6118 }
6119 /************************************************************************
6120 * *
6121 * New set (2.6.0) of simpler and more flexible APIs *
6122 * *
6123 ************************************************************************/
/**
 * DICT_FREE:
 * @str:  a string
 *
 * Free a string if it is not owned by the "dict" dictionary in the
 * current scope. The expansion is wrapped in do { } while (0) so that
 * "DICT_FREE(x);" is exactly one statement, even when it is the body
 * of an unbraced if/else.
 */
#define DICT_FREE(str)						\
    do {							\
	if ((str) && ((!dict) ||				\
	    (xmlDictOwns(dict, (const xmlChar *)(str)) == 0)))	\
	    xmlFree((char *)(str));				\
    } while (0)
6135
6136 /**
6137 * htmlCtxtReset:
6138 * @ctxt: an HTML parser context
6139 *
6140 * Reset a parser context
6141 */
6142 void
htmlCtxtReset(htmlParserCtxtPtr ctxt)6143 htmlCtxtReset(htmlParserCtxtPtr ctxt)
6144 {
6145 xmlParserInputPtr input;
6146 xmlDictPtr dict;
6147
6148 if (ctxt == NULL)
6149 return;
6150
6151 dict = ctxt->dict;
6152
6153 while ((input = inputPop(ctxt)) != NULL) { /* Non consuming */
6154 xmlFreeInputStream(input);
6155 }
6156 ctxt->inputNr = 0;
6157 ctxt->input = NULL;
6158
6159 ctxt->spaceNr = 0;
6160 if (ctxt->spaceTab != NULL) {
6161 ctxt->spaceTab[0] = -1;
6162 ctxt->space = &ctxt->spaceTab[0];
6163 } else {
6164 ctxt->space = NULL;
6165 }
6166
6167
6168 ctxt->nodeNr = 0;
6169 ctxt->node = NULL;
6170
6171 ctxt->nameNr = 0;
6172 ctxt->name = NULL;
6173
6174 ctxt->nsNr = 0;
6175
6176 DICT_FREE(ctxt->version);
6177 ctxt->version = NULL;
6178 DICT_FREE(ctxt->encoding);
6179 ctxt->encoding = NULL;
6180 DICT_FREE(ctxt->extSubURI);
6181 ctxt->extSubURI = NULL;
6182 DICT_FREE(ctxt->extSubSystem);
6183 ctxt->extSubSystem = NULL;
6184 if (ctxt->myDoc != NULL)
6185 xmlFreeDoc(ctxt->myDoc);
6186 ctxt->myDoc = NULL;
6187
6188 ctxt->standalone = -1;
6189 ctxt->hasExternalSubset = 0;
6190 ctxt->hasPErefs = 0;
6191 ctxt->html = 1;
6192 ctxt->instate = XML_PARSER_START;
6193
6194 ctxt->wellFormed = 1;
6195 ctxt->nsWellFormed = 1;
6196 ctxt->disableSAX = 0;
6197 ctxt->valid = 1;
6198 ctxt->vctxt.userData = ctxt;
6199 ctxt->vctxt.flags = XML_VCTXT_USE_PCTXT;
6200 ctxt->vctxt.error = xmlParserValidityError;
6201 ctxt->vctxt.warning = xmlParserValidityWarning;
6202 ctxt->record_info = 0;
6203 ctxt->checkIndex = 0;
6204 ctxt->endCheckState = 0;
6205 ctxt->inSubset = 0;
6206 ctxt->errNo = XML_ERR_OK;
6207 ctxt->depth = 0;
6208 ctxt->catalogs = NULL;
6209 xmlInitNodeInfoSeq(&ctxt->node_seq);
6210
6211 if (ctxt->attsDefault != NULL) {
6212 xmlHashFree(ctxt->attsDefault, xmlHashDefaultDeallocator);
6213 ctxt->attsDefault = NULL;
6214 }
6215 if (ctxt->attsSpecial != NULL) {
6216 xmlHashFree(ctxt->attsSpecial, NULL);
6217 ctxt->attsSpecial = NULL;
6218 }
6219
6220 ctxt->nbErrors = 0;
6221 ctxt->nbWarnings = 0;
6222 if (ctxt->lastError.code != XML_ERR_OK)
6223 xmlResetError(&ctxt->lastError);
6224 }
6225
6226 /**
6227 * htmlCtxtUseOptions:
6228 * @ctxt: an HTML parser context
6229 * @options: a combination of htmlParserOption(s)
6230 *
6231 * Applies the options to the parser context
6232 *
6233 * Returns 0 in case of success, the set of unknown or unimplemented options
6234 * in case of error.
6235 */
6236 int
htmlCtxtUseOptions(htmlParserCtxtPtr ctxt,int options)6237 htmlCtxtUseOptions(htmlParserCtxtPtr ctxt, int options)
6238 {
6239 if (ctxt == NULL)
6240 return(-1);
6241
6242 if (options & HTML_PARSE_NOWARNING) {
6243 ctxt->sax->warning = NULL;
6244 ctxt->vctxt.warning = NULL;
6245 options -= XML_PARSE_NOWARNING;
6246 ctxt->options |= XML_PARSE_NOWARNING;
6247 }
6248 if (options & HTML_PARSE_NOERROR) {
6249 ctxt->sax->error = NULL;
6250 ctxt->vctxt.error = NULL;
6251 ctxt->sax->fatalError = NULL;
6252 options -= XML_PARSE_NOERROR;
6253 ctxt->options |= XML_PARSE_NOERROR;
6254 }
6255 if (options & HTML_PARSE_PEDANTIC) {
6256 ctxt->pedantic = 1;
6257 options -= XML_PARSE_PEDANTIC;
6258 ctxt->options |= XML_PARSE_PEDANTIC;
6259 } else
6260 ctxt->pedantic = 0;
6261 if (options & XML_PARSE_NOBLANKS) {
6262 ctxt->keepBlanks = 0;
6263 ctxt->sax->ignorableWhitespace = xmlSAX2IgnorableWhitespace;
6264 options -= XML_PARSE_NOBLANKS;
6265 ctxt->options |= XML_PARSE_NOBLANKS;
6266 } else
6267 ctxt->keepBlanks = 1;
6268 if (options & HTML_PARSE_RECOVER) {
6269 ctxt->recovery = 1;
6270 options -= HTML_PARSE_RECOVER;
6271 } else
6272 ctxt->recovery = 0;
6273 if (options & HTML_PARSE_COMPACT) {
6274 ctxt->options |= HTML_PARSE_COMPACT;
6275 options -= HTML_PARSE_COMPACT;
6276 }
6277 if (options & XML_PARSE_HUGE) {
6278 ctxt->options |= XML_PARSE_HUGE;
6279 options -= XML_PARSE_HUGE;
6280 }
6281 if (options & HTML_PARSE_NODEFDTD) {
6282 ctxt->options |= HTML_PARSE_NODEFDTD;
6283 options -= HTML_PARSE_NODEFDTD;
6284 }
6285 if (options & HTML_PARSE_IGNORE_ENC) {
6286 ctxt->options |= HTML_PARSE_IGNORE_ENC;
6287 options -= HTML_PARSE_IGNORE_ENC;
6288 }
6289 if (options & HTML_PARSE_NOIMPLIED) {
6290 ctxt->options |= HTML_PARSE_NOIMPLIED;
6291 options -= HTML_PARSE_NOIMPLIED;
6292 }
6293 ctxt->dictNames = 0;
6294 ctxt->linenumbers = 1;
6295 return (options);
6296 }
6297
6298 /**
6299 * htmlCtxtParseDocument:
6300 * @ctxt: an HTML parser context
6301 *
6302 * Parse an HTML document and return the resulting document tree.
6303 *
6304 * Returns the resulting document tree or NULL
6305 */
6306 htmlDocPtr
htmlCtxtParseDocument(htmlParserCtxtPtr ctxt,xmlParserInputPtr input)6307 htmlCtxtParseDocument(htmlParserCtxtPtr ctxt, xmlParserInputPtr input)
6308 {
6309 htmlDocPtr ret;
6310
6311 if ((ctxt == NULL) || (input == NULL))
6312 return(NULL);
6313
6314 /* assert(ctxt->inputNr == 0); */
6315 while (ctxt->inputNr > 0)
6316 xmlFreeInputStream(inputPop(ctxt));
6317
6318 if (inputPush(ctxt, input) < 0) {
6319 xmlFreeInputStream(input);
6320 return(NULL);
6321 }
6322
6323 ctxt->html = 1;
6324 htmlParseDocument(ctxt);
6325
6326 if (ctxt->errNo != XML_ERR_NO_MEMORY) {
6327 ret = ctxt->myDoc;
6328 } else {
6329 ret = NULL;
6330 xmlFreeDoc(ctxt->myDoc);
6331 }
6332 ctxt->myDoc = NULL;
6333
6334 /* assert(ctxt->inputNr == 1); */
6335 while (ctxt->inputNr > 0)
6336 xmlFreeInputStream(inputPop(ctxt));
6337
6338 return(ret);
6339 }
6340
6341 /**
6342 * htmlReadDoc:
6343 * @str: a pointer to a zero terminated string
6344 * @url: only used for error reporting (optoinal)
6345 * @encoding: the document encoding (optional)
6346 * @options: a combination of htmlParserOptions
6347 *
6348 * Convenience function to parse an HTML document from a zero-terminated
6349 * string.
6350 *
6351 * See htmlCtxtReadDoc for details.
6352 *
6353 * Returns the resulting document tree.
6354 */
6355 htmlDocPtr
htmlReadDoc(const xmlChar * str,const char * url,const char * encoding,int options)6356 htmlReadDoc(const xmlChar *str, const char *url, const char *encoding,
6357 int options)
6358 {
6359 htmlParserCtxtPtr ctxt;
6360 xmlParserInputPtr input;
6361 htmlDocPtr doc;
6362
6363 ctxt = htmlNewParserCtxt();
6364 if (ctxt == NULL)
6365 return(NULL);
6366
6367 htmlCtxtUseOptions(ctxt, options);
6368
6369 input = xmlNewInputString(ctxt, url, (const char *) str, encoding,
6370 XML_INPUT_BUF_STATIC);
6371
6372 doc = htmlCtxtParseDocument(ctxt, input);
6373
6374 htmlFreeParserCtxt(ctxt);
6375 return(doc);
6376 }
6377
6378 /**
6379 * htmlReadFile:
6380 * @filename: a file or URL
6381 * @encoding: the document encoding (optional)
6382 * @options: a combination of htmlParserOptions
6383 *
6384 * Convenience function to parse an HTML file from the filesystem,
6385 * the network or a global user-defined resource loader.
6386 *
6387 * See htmlCtxtReadFile for details.
6388 *
6389 * Returns the resulting document tree.
6390 */
6391 htmlDocPtr
htmlReadFile(const char * filename,const char * encoding,int options)6392 htmlReadFile(const char *filename, const char *encoding, int options)
6393 {
6394 htmlParserCtxtPtr ctxt;
6395 xmlParserInputPtr input;
6396 htmlDocPtr doc;
6397
6398 ctxt = htmlNewParserCtxt();
6399 if (ctxt == NULL)
6400 return(NULL);
6401
6402 htmlCtxtUseOptions(ctxt, options);
6403
6404 input = xmlNewInputURL(ctxt, filename, NULL, encoding, 0);
6405
6406 doc = htmlCtxtParseDocument(ctxt, input);
6407
6408 htmlFreeParserCtxt(ctxt);
6409 return(doc);
6410 }
6411
6412 /**
6413 * htmlReadMemory:
6414 * @buffer: a pointer to a char array
6415 * @size: the size of the array
6416 * @url: only used for error reporting (optional)
6417 * @encoding: the document encoding, or NULL
6418 * @options: a combination of htmlParserOption(s)
6419 *
6420 * Convenience function to parse an HTML document from memory.
6421 * The input buffer must not contain any terminating null bytes.
6422 *
6423 * See htmlCtxtReadMemory for details.
6424 *
6425 * Returns the resulting document tree
6426 */
6427 htmlDocPtr
htmlReadMemory(const char * buffer,int size,const char * url,const char * encoding,int options)6428 htmlReadMemory(const char *buffer, int size, const char *url,
6429 const char *encoding, int options)
6430 {
6431 htmlParserCtxtPtr ctxt;
6432 xmlParserInputPtr input;
6433 htmlDocPtr doc;
6434
6435 if (size < 0)
6436 return(NULL);
6437
6438 ctxt = htmlNewParserCtxt();
6439 if (ctxt == NULL)
6440 return(NULL);
6441
6442 htmlCtxtUseOptions(ctxt, options);
6443
6444 input = xmlNewInputMemory(ctxt, url, buffer, size, encoding,
6445 XML_INPUT_BUF_STATIC);
6446
6447 doc = htmlCtxtParseDocument(ctxt, input);
6448
6449 htmlFreeParserCtxt(ctxt);
6450 return(doc);
6451 }
6452
6453 /**
6454 * htmlReadFd:
6455 * @fd: an open file descriptor
6456 * @url: only used for error reporting (optional)
6457 * @encoding: the document encoding, or NULL
6458 * @options: a combination of htmlParserOptions
6459 *
6460 * Convenience function to parse an HTML document from a
6461 * file descriptor.
6462 *
6463 * NOTE that the file descriptor will not be closed when the
6464 * context is freed or reset.
6465 *
6466 * See htmlCtxtReadFd for details.
6467 *
6468 * Returns the resulting document tree
6469 */
6470 htmlDocPtr
htmlReadFd(int fd,const char * url,const char * encoding,int options)6471 htmlReadFd(int fd, const char *url, const char *encoding, int options)
6472 {
6473 htmlParserCtxtPtr ctxt;
6474 xmlParserInputPtr input;
6475 htmlDocPtr doc;
6476
6477 ctxt = htmlNewParserCtxt();
6478 if (ctxt == NULL)
6479 return(NULL);
6480
6481 htmlCtxtUseOptions(ctxt, options);
6482
6483 input = xmlNewInputFd(ctxt, url, fd, encoding, 0);
6484 input->buf->closecallback = NULL;
6485
6486 doc = htmlCtxtParseDocument(ctxt, input);
6487
6488 htmlFreeParserCtxt(ctxt);
6489 return(doc);
6490 }
6491
6492 /**
6493 * htmlReadIO:
6494 * @ioread: an I/O read function
6495 * @ioclose: an I/O close function (optional)
6496 * @ioctx: an I/O handler
6497 * @url: only used for error reporting (optional)
6498 * @encoding: the document encoding (optional)
6499 * @options: a combination of htmlParserOption(s)
6500 *
6501 * Convenience function to parse an HTML document from I/O functions
6502 * and context.
6503 *
6504 * See htmlCtxtReadIO for details.
6505 *
6506 * Returns the resulting document tree
6507 */
6508 htmlDocPtr
htmlReadIO(xmlInputReadCallback ioread,xmlInputCloseCallback ioclose,void * ioctx,const char * url,const char * encoding,int options)6509 htmlReadIO(xmlInputReadCallback ioread, xmlInputCloseCallback ioclose,
6510 void *ioctx, const char *url, const char *encoding, int options)
6511 {
6512 htmlParserCtxtPtr ctxt;
6513 xmlParserInputPtr input;
6514 htmlDocPtr doc;
6515
6516 ctxt = htmlNewParserCtxt();
6517 if (ctxt == NULL)
6518 return (NULL);
6519
6520 htmlCtxtUseOptions(ctxt, options);
6521
6522 input = xmlNewInputIO(ctxt, url, ioread, ioclose, ioctx, encoding, 0);
6523
6524 doc = htmlCtxtParseDocument(ctxt, input);
6525
6526 htmlFreeParserCtxt(ctxt);
6527 return(doc);
6528 }
6529
6530 /**
6531 * htmlCtxtReadDoc:
6532 * @ctxt: an HTML parser context
6533 * @str: a pointer to a zero terminated string
6534 * @URL: only used for error reporting (optional)
6535 * @encoding: the document encoding (optional)
6536 * @options: a combination of htmlParserOptions
6537 *
6538 * Parse an HTML in-memory document and build a tree.
6539 *
6540 * See htmlCtxtUseOptions for details.
6541 *
6542 * Returns the resulting document tree
6543 */
6544 htmlDocPtr
htmlCtxtReadDoc(htmlParserCtxtPtr ctxt,const xmlChar * str,const char * URL,const char * encoding,int options)6545 htmlCtxtReadDoc(htmlParserCtxtPtr ctxt, const xmlChar *str,
6546 const char *URL, const char *encoding, int options)
6547 {
6548 xmlParserInputPtr input;
6549
6550 if (ctxt == NULL)
6551 return (NULL);
6552
6553 htmlCtxtReset(ctxt);
6554 htmlCtxtUseOptions(ctxt, options);
6555
6556 input = xmlNewInputString(ctxt, URL, (const char *) str, encoding, 0);
6557
6558 return(htmlCtxtParseDocument(ctxt, input));
6559 }
6560
6561 /**
6562 * htmlCtxtReadFile:
6563 * @ctxt: an HTML parser context
6564 * @filename: a file or URL
6565 * @encoding: the document encoding (optional)
6566 * @options: a combination of htmlParserOptions
6567 *
6568 * Parse an HTML file from the filesystem, the network or a
6569 * user-defined resource loader.
6570 *
6571 * See xmlNewInputURL and htmlCtxtUseOptions for details.
6572 *
6573 * Returns the resulting document tree
6574 */
6575 htmlDocPtr
htmlCtxtReadFile(htmlParserCtxtPtr ctxt,const char * filename,const char * encoding,int options)6576 htmlCtxtReadFile(htmlParserCtxtPtr ctxt, const char *filename,
6577 const char *encoding, int options)
6578 {
6579 xmlParserInputPtr input;
6580
6581 if (ctxt == NULL)
6582 return (NULL);
6583
6584 htmlCtxtReset(ctxt);
6585 htmlCtxtUseOptions(ctxt, options);
6586
6587 input = xmlNewInputURL(ctxt, filename, NULL, encoding, 0);
6588
6589 return(htmlCtxtParseDocument(ctxt, input));
6590 }
6591
6592 /**
6593 * htmlCtxtReadMemory:
6594 * @ctxt: an HTML parser context
6595 * @buffer: a pointer to a char array
6596 * @size: the size of the array
6597 * @URL: only used for error reporting (optional)
6598 * @encoding: the document encoding (optinal)
6599 * @options: a combination of htmlParserOptions
6600 *
6601 * Parse an HTML in-memory document and build a tree. The input buffer must
6602 * not contain any terminating null bytes.
6603 *
6604 * See htmlCtxtUseOptions for details.
6605 *
6606 * Returns the resulting document tree
6607 */
6608 htmlDocPtr
htmlCtxtReadMemory(htmlParserCtxtPtr ctxt,const char * buffer,int size,const char * URL,const char * encoding,int options)6609 htmlCtxtReadMemory(htmlParserCtxtPtr ctxt, const char *buffer, int size,
6610 const char *URL, const char *encoding, int options)
6611 {
6612 xmlParserInputPtr input;
6613
6614 if ((ctxt == NULL) || (size < 0))
6615 return (NULL);
6616
6617 htmlCtxtReset(ctxt);
6618 htmlCtxtUseOptions(ctxt, options);
6619
6620 input = xmlNewInputMemory(ctxt, URL, buffer, size, encoding,
6621 XML_INPUT_BUF_STATIC);
6622
6623 return(htmlCtxtParseDocument(ctxt, input));
6624 }
6625
6626 /**
6627 * htmlCtxtReadFd:
6628 * @ctxt: an HTML parser context
6629 * @fd: an open file descriptor
6630 * @URL: only used for error reporting (optional)
6631 * @encoding: the document encoding (optinal)
6632 * @options: a combination of htmlParserOptions
6633 *
6634 * Parse an HTML from a file descriptor and build a tree.
6635 *
6636 * See htmlCtxtUseOptions for details.
6637 *
6638 * NOTE that the file descriptor will not be closed when the
6639 * context is freed or reset.
6640 *
6641 * Returns the resulting document tree
6642 */
6643 htmlDocPtr
htmlCtxtReadFd(htmlParserCtxtPtr ctxt,int fd,const char * URL,const char * encoding,int options)6644 htmlCtxtReadFd(htmlParserCtxtPtr ctxt, int fd,
6645 const char *URL, const char *encoding, int options)
6646 {
6647 xmlParserInputPtr input;
6648
6649 if (ctxt == NULL)
6650 return(NULL);
6651
6652 htmlCtxtReset(ctxt);
6653 htmlCtxtUseOptions(ctxt, options);
6654
6655 input = xmlNewInputFd(ctxt, URL, fd, encoding, 0);
6656 input->buf->closecallback = NULL;
6657
6658 return(htmlCtxtParseDocument(ctxt, input));
6659 }
6660
6661 /**
6662 * htmlCtxtReadIO:
6663 * @ctxt: an HTML parser context
6664 * @ioread: an I/O read function
6665 * @ioclose: an I/O close function
6666 * @ioctx: an I/O handler
6667 * @URL: the base URL to use for the document
6668 * @encoding: the document encoding, or NULL
6669 * @options: a combination of htmlParserOption(s)
6670 *
6671 * Parse an HTML document from I/O functions and source and build a tree.
6672 *
6673 * See xmlNewInputIO and htmlCtxtUseOptions for details.
6674 *
6675 * Returns the resulting document tree
6676 */
6677 htmlDocPtr
htmlCtxtReadIO(htmlParserCtxtPtr ctxt,xmlInputReadCallback ioread,xmlInputCloseCallback ioclose,void * ioctx,const char * URL,const char * encoding,int options)6678 htmlCtxtReadIO(htmlParserCtxtPtr ctxt, xmlInputReadCallback ioread,
6679 xmlInputCloseCallback ioclose, void *ioctx,
6680 const char *URL,
6681 const char *encoding, int options)
6682 {
6683 xmlParserInputPtr input;
6684
6685 if (ctxt == NULL)
6686 return (NULL);
6687
6688 htmlCtxtReset(ctxt);
6689 htmlCtxtUseOptions(ctxt, options);
6690
6691 input = xmlNewInputIO(ctxt, URL, ioread, ioclose, ioctx, encoding, 0);
6692
6693 return(htmlCtxtParseDocument(ctxt, input));
6694 }
6695
6696 #endif /* LIBXML_HTML_ENABLED */
6697