xref: /aosp_15_r20/external/libxml2/HTMLparser.c (revision 7c5688314b92172186c154356a6374bf7684c3ca)
1 /*
2  * HTMLparser.c : an HTML parser
3  *
4  * References:
5  *   HTML Living Standard
6  *     https://html.spec.whatwg.org/multipage/parsing.html
7  *
8  * Tokenization now conforms to HTML5. Tree construction still follows
9  * a custom, non-standard implementation. See:
10  *
11  *     https://gitlab.gnome.org/GNOME/libxml2/-/issues/211
12  *
13  * See Copyright for the status of this software.
14  *
15  * [email protected]
16  */
17 
18 #define IN_LIBXML
19 #include "libxml.h"
20 #ifdef LIBXML_HTML_ENABLED
21 
22 #include <string.h>
23 #include <ctype.h>
24 #include <stdlib.h>
25 
26 #include <libxml/HTMLparser.h>
27 #include <libxml/xmlmemory.h>
28 #include <libxml/tree.h>
29 #include <libxml/parser.h>
30 #include <libxml/parserInternals.h>
31 #include <libxml/xmlerror.h>
32 #include <libxml/HTMLtree.h>
33 #include <libxml/entities.h>
34 #include <libxml/encoding.h>
35 #include <libxml/xmlIO.h>
36 #include <libxml/uri.h>
37 
38 #include "private/buf.h"
39 #include "private/dict.h"
40 #include "private/enc.h"
41 #include "private/error.h"
42 #include "private/html.h"
43 #include "private/io.h"
44 #include "private/parser.h"
45 #include "private/tree.h"
46 
/* Size limits used by the HTML parser (values in bytes/characters). */
#define HTML_MAX_NAMELEN 1000
#define HTML_PARSER_BIG_BUFFER_SIZE 1000
#define HTML_PARSER_BUFFER_SIZE 100

/*
 * Character classification helpers. They are plain macros, so the
 * argument is evaluated more than once: only pass expressions without
 * side effects.
 */

/* HTML whitespace: space, TAB, LF, FF and CR (vertical tab 0x0B is not). */
#define IS_WS_HTML(c) \
    (((c) == 0x20) || ((c) == 0x09) || ((c) == 0x0A) || \
     ((c) == 0x0C) || ((c) == 0x0D))

/* ASCII hex digit; OR-ing with 0x20 folds 'A'..'F' onto 'a'..'f'. */
#define IS_HEX_DIGIT(c) \
    ((((((c) | 0x20) >= 'a')) && ((((c) | 0x20) <= 'f'))) || \
     (IS_ASCII_DIGIT(c)))

/* ASCII upper-case letter 'A'..'Z'. */
#define IS_UPPER(c) \
    (((c) <= 'Z') && ((c) >= 'A'))

/* ASCII letter or ASCII digit. */
#define IS_ALNUM(c) \
    (IS_ASCII_DIGIT(c) || IS_ASCII_LETTER(c))
64 
/*
 * A 64-bit character set for code points 0..63, stored as two 32-bit
 * words: word 0 holds code points 0..31, word 1 holds 32..63, one bit
 * per code point (this is the layout htmlMaskMatch() tests).
 */
typedef const unsigned htmlAsciiMask[2];

/* Bit for a code point in 0..31 (word 0) / in 32..63 (word 1). */
#define HTML_MASK_LO(ch) (1u << (ch))
#define HTML_MASK_HI(ch) (1u << ((ch) - 32))

/* double quote */
static htmlAsciiMask MASK_DQ = { 0, HTML_MASK_HI('"') };

/* single quote */
static htmlAsciiMask MASK_SQ = { 0, HTML_MASK_HI('\'') };

/* '>' */
static htmlAsciiMask MASK_GT = { 0, HTML_MASK_HI('>') };

/* '-' */
static htmlAsciiMask MASK_DASH = { 0, HTML_MASK_HI('-') };

/* TAB, LF, FF, CR, space or '>' */
static htmlAsciiMask MASK_WS_GT = {
    HTML_MASK_LO(0x09) | HTML_MASK_LO(0x0A) |
    HTML_MASK_LO(0x0C) | HTML_MASK_LO(0x0D),
    HTML_MASK_HI(' ') | HTML_MASK_HI('>')
};

/* double quote or '>' */
static htmlAsciiMask MASK_DQ_GT = {
    0,
    HTML_MASK_HI('"') | HTML_MASK_HI('>')
};

/* single quote or '>' */
static htmlAsciiMask MASK_SQ_GT = {
    0,
    HTML_MASK_HI('\'') | HTML_MASK_HI('>')
};

#undef HTML_MASK_LO
#undef HTML_MASK_HI
95 
96 static int htmlOmittedDefaultValue = 1;
97 
98 static int
99 htmlParseElementInternal(htmlParserCtxtPtr ctxt);
100 
101 /************************************************************************
102  *									*
103  *		Some factorized error routines				*
104  *									*
105  ************************************************************************/
106 
107 /**
108  * htmlErrMemory:
109  * @ctxt:  an HTML parser context
110  * @extra:  extra information
111  *
112  * Handle a redefinition of attribute error
113  */
114 static void
htmlErrMemory(xmlParserCtxtPtr ctxt)115 htmlErrMemory(xmlParserCtxtPtr ctxt)
116 {
117     xmlCtxtErrMemory(ctxt);
118 }
119 
120 /**
121  * htmlParseErr:
122  * @ctxt:  an HTML parser context
123  * @error:  the error number
124  * @msg:  the error message
125  * @str1:  string infor
126  * @str2:  string infor
127  *
128  * Handle a fatal parser error, i.e. violating Well-Formedness constraints
129  */
130 static void LIBXML_ATTR_FORMAT(3,0)
htmlParseErr(xmlParserCtxtPtr ctxt,xmlParserErrors error,const char * msg,const xmlChar * str1,const xmlChar * str2)131 htmlParseErr(xmlParserCtxtPtr ctxt, xmlParserErrors error,
132              const char *msg, const xmlChar *str1, const xmlChar *str2)
133 {
134     xmlCtxtErr(ctxt, NULL, XML_FROM_HTML, error, XML_ERR_ERROR,
135                str1, str2, NULL, 0, msg, str1, str2);
136 }
137 
138 /************************************************************************
139  *									*
140  *	Parser stacks related functions and macros		*
141  *									*
142  ************************************************************************/
143 
144 /**
145  * htmlnamePush:
146  * @ctxt:  an HTML parser context
147  * @value:  the element name
148  *
149  * Pushes a new element name on top of the name stack
150  *
151  * Returns -1 in case of error, the index in the stack otherwise
152  */
153 static int
htmlnamePush(htmlParserCtxtPtr ctxt,const xmlChar * value)154 htmlnamePush(htmlParserCtxtPtr ctxt, const xmlChar * value)
155 {
156     if ((ctxt->html < 3) && (xmlStrEqual(value, BAD_CAST "head")))
157         ctxt->html = 3;
158     if ((ctxt->html < 10) && (xmlStrEqual(value, BAD_CAST "body")))
159         ctxt->html = 10;
160     if (ctxt->nameNr >= ctxt->nameMax) {
161         size_t newSize = ctxt->nameMax * 2;
162         const xmlChar **tmp;
163 
164         tmp = xmlRealloc((xmlChar **) ctxt->nameTab,
165                          newSize * sizeof(ctxt->nameTab[0]));
166         if (tmp == NULL) {
167             htmlErrMemory(ctxt);
168             return (-1);
169         }
170         ctxt->nameTab = tmp;
171         ctxt->nameMax = newSize;
172     }
173     ctxt->nameTab[ctxt->nameNr] = value;
174     ctxt->name = value;
175     return (ctxt->nameNr++);
176 }
177 /**
178  * htmlnamePop:
179  * @ctxt: an HTML parser context
180  *
181  * Pops the top element name from the name stack
182  *
183  * Returns the name just removed
184  */
185 static const xmlChar *
htmlnamePop(htmlParserCtxtPtr ctxt)186 htmlnamePop(htmlParserCtxtPtr ctxt)
187 {
188     const xmlChar *ret;
189 
190     if (ctxt->nameNr <= 0)
191         return (NULL);
192     ctxt->nameNr--;
193     if (ctxt->nameNr < 0)
194         return (NULL);
195     if (ctxt->nameNr > 0)
196         ctxt->name = ctxt->nameTab[ctxt->nameNr - 1];
197     else
198         ctxt->name = NULL;
199     ret = ctxt->nameTab[ctxt->nameNr];
200     ctxt->nameTab[ctxt->nameNr] = NULL;
201     return (ret);
202 }
203 
204 /**
205  * htmlNodeInfoPush:
206  * @ctxt:  an HTML parser context
207  * @value:  the node info
208  *
209  * Pushes a new element name on top of the node info stack
210  *
211  * Returns 0 in case of error, the index in the stack otherwise
212  */
213 static int
htmlNodeInfoPush(htmlParserCtxtPtr ctxt,htmlParserNodeInfo * value)214 htmlNodeInfoPush(htmlParserCtxtPtr ctxt, htmlParserNodeInfo *value)
215 {
216     if (ctxt->nodeInfoNr >= ctxt->nodeInfoMax) {
217         if (ctxt->nodeInfoMax == 0)
218                 ctxt->nodeInfoMax = 5;
219         ctxt->nodeInfoMax *= 2;
220         ctxt->nodeInfoTab = (htmlParserNodeInfo *)
221                          xmlRealloc((htmlParserNodeInfo *)ctxt->nodeInfoTab,
222                                     ctxt->nodeInfoMax *
223                                     sizeof(ctxt->nodeInfoTab[0]));
224         if (ctxt->nodeInfoTab == NULL) {
225             htmlErrMemory(ctxt);
226             return (0);
227         }
228     }
229     ctxt->nodeInfoTab[ctxt->nodeInfoNr] = *value;
230     ctxt->nodeInfo = &ctxt->nodeInfoTab[ctxt->nodeInfoNr];
231     return (ctxt->nodeInfoNr++);
232 }
233 
234 /**
235  * htmlNodeInfoPop:
236  * @ctxt:  an HTML parser context
237  *
238  * Pops the top element name from the node info stack
239  *
240  * Returns 0 in case of error, the pointer to NodeInfo otherwise
241  */
242 static htmlParserNodeInfo *
htmlNodeInfoPop(htmlParserCtxtPtr ctxt)243 htmlNodeInfoPop(htmlParserCtxtPtr ctxt)
244 {
245     if (ctxt->nodeInfoNr <= 0)
246         return (NULL);
247     ctxt->nodeInfoNr--;
248     if (ctxt->nodeInfoNr < 0)
249         return (NULL);
250     if (ctxt->nodeInfoNr > 0)
251         ctxt->nodeInfo = &ctxt->nodeInfoTab[ctxt->nodeInfoNr - 1];
252     else
253         ctxt->nodeInfo = NULL;
254     return &ctxt->nodeInfoTab[ctxt->nodeInfoNr];
255 }
256 
/*
 * Macros for accessing the content. Those should be used only by the parser,
 * and not exported.
 *
 * All of them expect a variable named `ctxt` (the parser context) to be
 * in scope and read or update ctxt->input directly.
 *
 *   CUR_PTR  the current pointer to the xmlChar to be parsed (an lvalue).
 *   BASE_PTR the base pointer of the current input buffer.
 *   CUR      the xmlChar at the current position. Only compare it with
 *            ASCII values: with UTF-8 input it is a single byte, not a
 *            whole character.
 *   NXT(n)   the n'th next xmlChar. Same restriction as CUR.
 *   UPPER    CUR converted to uppercase with toupper().
 *   UPP(n)   NXT(n) converted to uppercase with toupper(). Same
 *            restriction as CUR.
 *   SKIP(n)  advance the current pointer and the column counter by n.
 *            The line counter is not touched, so only skip ASCII
 *            strings without newlines. n is evaluated twice.
 *   SHRINK   call xmlParserShrink() when the parser is not progressive,
 *            more than 2 * INPUT_CHUNK bytes lie behind the current
 *            position and fewer than 2 * INPUT_CHUNK bytes lie ahead.
 *   GROW     call xmlParserGrow() when the parser is not progressive
 *            and fewer than INPUT_CHUNK bytes lie ahead.
 *   SKIP_BLANKS  shorthand for htmlSkipBlankChars(ctxt).
 *
 * SHRINK and GROW expand to a complete `if` statement including its
 * terminating semicolon. Do not use them as the unbraced body of an
 * `if` that has an `else` branch.
 *
 * NOTE(review): older versions of this comment also listed a COPY(to)
 * macro; no such macro is defined in this section.
 */

#define UPPER (toupper(*ctxt->input->cur))

/* Parenthesized so the comma expression stays one operand wherever used. */
#define SKIP(val) (ctxt->input->cur += (val), ctxt->input->col += (val))

#define NXT(val) (ctxt->input->cur[(val)])

#define UPP(val) (toupper(ctxt->input->cur[(val)]))

#define CUR_PTR ctxt->input->cur
#define BASE_PTR ctxt->input->base

#define SHRINK \
    if ((!PARSER_PROGRESSIVE(ctxt)) && \
        (ctxt->input->cur - ctxt->input->base > 2 * INPUT_CHUNK) && \
	(ctxt->input->end - ctxt->input->cur < 2 * INPUT_CHUNK)) \
	xmlParserShrink(ctxt);

#define GROW \
    if ((!PARSER_PROGRESSIVE(ctxt)) && \
        (ctxt->input->end - ctxt->input->cur < INPUT_CHUNK)) \
	xmlParserGrow(ctxt);

#define SKIP_BLANKS htmlSkipBlankChars(ctxt)

/* Imported from XML */

#define CUR (*ctxt->input->cur)
308 
309 /**
310  * htmlFindEncoding:
311  * @the HTML parser context
312  *
313  * Ty to find and encoding in the current data available in the input
314  * buffer this is needed to try to switch to the proper encoding when
315  * one face a character error.
316  * That's an heuristic, since it's operating outside of parsing it could
317  * try to use a meta which had been commented out, that's the reason it
318  * should only be used in case of error, not as a default.
319  *
320  * Returns an encoding string or NULL if not found, the string need to
321  *   be freed
322  */
323 static xmlChar *
htmlFindEncoding(xmlParserCtxtPtr ctxt)324 htmlFindEncoding(xmlParserCtxtPtr ctxt) {
325     const xmlChar *start, *cur, *end;
326     xmlChar *ret;
327 
328     if ((ctxt == NULL) || (ctxt->input == NULL) ||
329         (ctxt->input->flags & XML_INPUT_HAS_ENCODING))
330         return(NULL);
331     if ((ctxt->input->cur == NULL) || (ctxt->input->end == NULL))
332         return(NULL);
333 
334     start = ctxt->input->cur;
335     end = ctxt->input->end;
336     /* we also expect the input buffer to be zero terminated */
337     if (*end != 0)
338         return(NULL);
339 
340     cur = xmlStrcasestr(start, BAD_CAST "HTTP-EQUIV");
341     if (cur == NULL)
342         return(NULL);
343     cur = xmlStrcasestr(cur, BAD_CAST  "CONTENT");
344     if (cur == NULL)
345         return(NULL);
346     cur = xmlStrcasestr(cur, BAD_CAST  "CHARSET=");
347     if (cur == NULL)
348         return(NULL);
349     cur += 8;
350     start = cur;
351     while ((IS_ALNUM(*cur)) ||
352            (*cur == '-') || (*cur == '_') || (*cur == ':') || (*cur == '/'))
353            cur++;
354     if (cur == start)
355         return(NULL);
356     ret = xmlStrndup(start, cur - start);
357     if (ret == NULL)
358         htmlErrMemory(ctxt);
359     return(ret);
360 }
361 
362 static int
htmlMaskMatch(htmlAsciiMask mask,unsigned c)363 htmlMaskMatch(htmlAsciiMask mask, unsigned c) {
364     if (c >= 64)
365         return(0);
366     return((mask[c/32] >> (c & 31)) & 1);
367 }
368 
369 static int
htmlValidateUtf8(xmlParserCtxtPtr ctxt,const xmlChar * str,size_t len)370 htmlValidateUtf8(xmlParserCtxtPtr ctxt, const xmlChar *str, size_t len) {
371     unsigned c = str[0];
372     int size;
373 
374     if (c < 0xC2) {
375         goto invalid;
376     } else if (c < 0xE0) {
377         if (len < 2)
378             goto incomplete;
379         if ((str[1] & 0xC0) != 0x80)
380             goto invalid;
381         size = 2;
382     } else if (c < 0xF0) {
383         unsigned v;
384 
385         if (len < 3)
386             goto incomplete;
387 
388         v = str[1] << 8 | str[2]; /* hint to generate 16-bit load */
389         v |= c << 16;
390 
391         if (((v & 0x00C0C0) != 0x008080) ||
392             ((v & 0x0F2000) == 0x000000) ||
393             ((v & 0x0F2000) == 0x0D2000))
394             goto invalid;
395 
396         size = 3;
397     } else {
398         unsigned v;
399 
400         if (len < 4)
401             goto incomplete;
402 
403         v = c << 24 | str[1] << 16 | str[2] << 8 | str[3];
404 
405         if (((v & 0x00C0C0C0) != 0x00808080) ||
406             (v < 0xF0900000) || (v >= 0xF4900000))
407             goto invalid;
408 
409         size = 4;
410     }
411 
412     return(size);
413 
414 incomplete:
415     return(0);
416 
417 invalid:
418     /* Only report the first error */
419     if ((ctxt->input->flags & XML_INPUT_ENCODING_ERROR) == 0) {
420         htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
421                      "Invalid bytes in character encoding", NULL, NULL);
422         ctxt->input->flags |= XML_INPUT_ENCODING_ERROR;
423     }
424 
425     return(-1);
426 }
427 
428 /**
429  * htmlSkipBlankChars:
430  * @ctxt:  the HTML parser context
431  *
432  * skip all blanks character found at that point in the input streams.
433  *
434  * Returns the number of space chars skipped
435  */
436 
437 static int
htmlSkipBlankChars(xmlParserCtxtPtr ctxt)438 htmlSkipBlankChars(xmlParserCtxtPtr ctxt) {
439     const xmlChar *cur = ctxt->input->cur;
440     size_t avail = ctxt->input->end - cur;
441     int res = 0;
442     int line = ctxt->input->line;
443     int col = ctxt->input->col;
444 
445     while (!PARSER_STOPPED(ctxt)) {
446         if (avail == 0) {
447             ctxt->input->cur = cur;
448             GROW;
449             cur = ctxt->input->cur;
450             avail = ctxt->input->end - cur;
451 
452             if (avail == 0)
453                 break;
454         }
455 
456         if (*cur == '\n') {
457             line++;
458             col = 1;
459         } else if (IS_WS_HTML(*cur)) {
460             col++;
461         } else {
462             break;
463         }
464 
465         cur += 1;
466         avail -= 1;
467 
468 	if (res < INT_MAX)
469 	    res++;
470     }
471 
472     ctxt->input->cur = cur;
473     ctxt->input->line = line;
474     ctxt->input->col = col;
475 
476     if (res > 8)
477         GROW;
478 
479     return(res);
480 }
481 
482 
483 
484 /************************************************************************
485  *									*
486  *	The list of HTML elements and their properties		*
487  *									*
488  ************************************************************************/
489 
490 /*
491  *  Start Tag: 1 means the start tag can be omitted
492  *  End Tag:   1 means the end tag can be omitted
493  *             2 means it's forbidden (empty elements)
494  *             3 means the tag is stylistic and should be closed easily
495  *  Depr:      this element is deprecated
496  *  DTD:       1 means that this element is valid only in the Loose DTD
497  *             2 means that this element is valid only in the Frameset DTD
498  *
499  * Name,Start Tag,End Tag,Save End,Empty,Deprecated,DTD,inline,Description
500  */
501 
502 #define DATA_RCDATA         1
503 #define DATA_RAWTEXT        2
504 #define DATA_PLAINTEXT      3
505 #define DATA_SCRIPT         4
506 #define DATA_SCRIPT_ESC1    5
507 #define DATA_SCRIPT_ESC2    6
508 
509 static const htmlElemDesc
510 html40ElementTable[] = {
511 { "a",		0, 0, 0, 0, 0, 0, 1, "anchor ",
512 	NULL, NULL, NULL, NULL, NULL,
513 	0
514 },
515 { "abbr",	0, 0, 0, 0, 0, 0, 1, "abbreviated form",
516 	NULL, NULL, NULL, NULL, NULL,
517 	0
518 },
519 { "acronym",	0, 0, 0, 0, 0, 0, 1, "",
520 	NULL, NULL, NULL, NULL, NULL,
521 	0
522 },
523 { "address",	0, 0, 0, 0, 0, 0, 0, "information on author ",
524 	NULL, NULL, NULL, NULL, NULL,
525 	0
526 },
527 { "applet",	0, 0, 0, 0, 1, 1, 2, "java applet ",
528 	NULL, NULL, NULL, NULL, NULL,
529 	0
530 },
531 { "area",	0, 2, 2, 1, 0, 0, 0, "client-side image map area ",
532 	NULL, NULL, NULL, NULL, NULL,
533 	0
534 },
535 { "b",		0, 3, 0, 0, 0, 0, 1, "bold text style",
536 	NULL, NULL, NULL, NULL, NULL,
537 	0
538 },
539 { "base",	0, 2, 2, 1, 0, 0, 0, "document base uri ",
540 	NULL, NULL, NULL, NULL, NULL,
541 	0
542 },
543 { "basefont",	0, 2, 2, 1, 1, 1, 1, "base font size " ,
544 	NULL, NULL, NULL, NULL, NULL,
545 	0
546 },
547 { "bdo",	0, 0, 0, 0, 0, 0, 1, "i18n bidi over-ride ",
548 	NULL, NULL, NULL, NULL, NULL,
549 	0
550 },
551 { "big",	0, 3, 0, 0, 0, 0, 1, "large text style",
552 	NULL, NULL, NULL, NULL, NULL,
553 	0
554 },
555 { "blockquote",	0, 0, 0, 0, 0, 0, 0, "long quotation ",
556 	NULL, NULL, NULL, NULL, NULL,
557 	0
558 },
559 { "body",	1, 1, 0, 0, 0, 0, 0, "document body ",
560 	NULL, NULL, NULL, NULL, NULL,
561 	0
562 },
563 { "br",		0, 2, 2, 1, 0, 0, 1, "forced line break ",
564 	NULL, NULL, NULL, NULL, NULL,
565 	0
566 },
567 { "button",	0, 0, 0, 0, 0, 0, 2, "push button ",
568 	NULL, NULL, NULL, NULL, NULL,
569 	0
570 },
571 { "caption",	0, 0, 0, 0, 0, 0, 0, "table caption ",
572 	NULL, NULL, NULL, NULL, NULL,
573 	0
574 },
575 { "center",	0, 3, 0, 0, 1, 1, 0, "shorthand for div align=center ",
576 	NULL, NULL, NULL, NULL, NULL,
577 	0
578 },
579 { "cite",	0, 0, 0, 0, 0, 0, 1, "citation",
580 	NULL, NULL, NULL, NULL, NULL,
581 	0
582 },
583 { "code",	0, 0, 0, 0, 0, 0, 1, "computer code fragment",
584 	NULL, NULL, NULL, NULL, NULL,
585 	0
586 },
587 { "col",	0, 2, 2, 1, 0, 0, 0, "table column ",
588 	NULL, NULL, NULL, NULL, NULL,
589 	0
590 },
591 { "colgroup",	0, 1, 0, 0, 0, 0, 0, "table column group ",
592 	NULL, NULL, NULL, NULL, NULL,
593 	0
594 },
595 { "dd",		0, 1, 0, 0, 0, 0, 0, "definition description ",
596 	NULL, NULL, NULL, NULL, NULL,
597 	0
598 },
599 { "del",	0, 0, 0, 0, 0, 0, 2, "deleted text ",
600 	NULL, NULL, NULL, NULL, NULL,
601 	0
602 },
603 { "dfn",	0, 0, 0, 0, 0, 0, 1, "instance definition",
604 	NULL, NULL, NULL, NULL, NULL,
605 	0
606 },
607 { "dir",	0, 0, 0, 0, 1, 1, 0, "directory list",
608 	NULL, NULL, NULL, NULL, NULL,
609 	0
610 },
611 { "div",	0, 0, 0, 0, 0, 0, 0, "generic language/style container",
612 	NULL, NULL, NULL, NULL, NULL,
613 	0
614 },
615 { "dl",		0, 0, 0, 0, 0, 0, 0, "definition list ",
616 	NULL, NULL, NULL, NULL, NULL,
617 	0
618 },
619 { "dt",		0, 1, 0, 0, 0, 0, 0, "definition term ",
620 	NULL, NULL, NULL, NULL, NULL,
621 	0
622 },
623 { "em",		0, 3, 0, 0, 0, 0, 1, "emphasis",
624 	NULL, NULL, NULL, NULL, NULL,
625 	0
626 },
627 { "embed",	0, 1, 0, 0, 1, 1, 1, "generic embedded object ",
628 	NULL, NULL, NULL, NULL, NULL,
629 	0
630 },
631 { "fieldset",	0, 0, 0, 0, 0, 0, 0, "form control group ",
632 	NULL, NULL, NULL, NULL, NULL,
633 	0
634 },
635 { "font",	0, 3, 0, 0, 1, 1, 1, "local change to font ",
636 	NULL, NULL, NULL, NULL, NULL,
637 	0
638 },
639 { "form",	0, 0, 0, 0, 0, 0, 0, "interactive form ",
640 	NULL, NULL, NULL, NULL, NULL,
641 	0
642 },
643 { "frame",	0, 2, 2, 1, 0, 2, 0, "subwindow " ,
644 	NULL, NULL, NULL, NULL, NULL,
645 	0
646 },
647 { "frameset",	0, 0, 0, 0, 0, 2, 0, "window subdivision" ,
648 	NULL, NULL, NULL, NULL, NULL,
649 	0
650 },
651 { "h1",		0, 0, 0, 0, 0, 0, 0, "heading ",
652 	NULL, NULL, NULL, NULL, NULL,
653 	0
654 },
655 { "h2",		0, 0, 0, 0, 0, 0, 0, "heading ",
656 	NULL, NULL, NULL, NULL, NULL,
657 	0
658 },
659 { "h3",		0, 0, 0, 0, 0, 0, 0, "heading ",
660 	NULL, NULL, NULL, NULL, NULL,
661 	0
662 },
663 { "h4",		0, 0, 0, 0, 0, 0, 0, "heading ",
664 	NULL, NULL, NULL, NULL, NULL,
665 	0
666 },
667 { "h5",		0, 0, 0, 0, 0, 0, 0, "heading ",
668 	NULL, NULL, NULL, NULL, NULL,
669 	0
670 },
671 { "h6",		0, 0, 0, 0, 0, 0, 0, "heading ",
672 	NULL, NULL, NULL, NULL, NULL,
673 	0
674 },
675 { "head",	1, 1, 0, 0, 0, 0, 0, "document head ",
676 	NULL, NULL, NULL, NULL, NULL,
677 	0
678 },
679 { "hr",		0, 2, 2, 1, 0, 0, 0, "horizontal rule " ,
680 	NULL, NULL, NULL, NULL, NULL,
681 	0
682 },
683 { "html",	1, 1, 0, 0, 0, 0, 0, "document root element ",
684 	NULL, NULL, NULL, NULL, NULL,
685 	0
686 },
687 { "i",		0, 3, 0, 0, 0, 0, 1, "italic text style",
688 	NULL, NULL, NULL, NULL, NULL,
689 	0
690 },
691 { "iframe",	0, 0, 0, 0, 0, 1, 2, "inline subwindow ",
692 	NULL, NULL, NULL, NULL, NULL,
693 	DATA_RAWTEXT
694 },
695 { "img",	0, 2, 2, 1, 0, 0, 1, "embedded image ",
696 	NULL, NULL, NULL, NULL, NULL,
697 	0
698 },
699 { "input",	0, 2, 2, 1, 0, 0, 1, "form control ",
700 	NULL, NULL, NULL, NULL, NULL,
701 	0
702 },
703 { "ins",	0, 0, 0, 0, 0, 0, 2, "inserted text",
704 	NULL, NULL, NULL, NULL, NULL,
705 	0
706 },
707 { "isindex",	0, 2, 2, 1, 1, 1, 0, "single line prompt ",
708 	NULL, NULL, NULL, NULL, NULL,
709 	0
710 },
711 { "kbd",	0, 0, 0, 0, 0, 0, 1, "text to be entered by the user",
712 	NULL, NULL, NULL, NULL, NULL,
713 	0
714 },
715 { "label",	0, 0, 0, 0, 0, 0, 1, "form field label text ",
716 	NULL, NULL, NULL, NULL, NULL,
717 	0
718 },
719 { "legend",	0, 0, 0, 0, 0, 0, 0, "fieldset legend ",
720 	NULL, NULL, NULL, NULL, NULL,
721 	0
722 },
723 { "li",		0, 1, 1, 0, 0, 0, 0, "list item ",
724 	NULL, NULL, NULL, NULL, NULL,
725 	0
726 },
727 { "link",	0, 2, 2, 1, 0, 0, 0, "a media-independent link ",
728 	NULL, NULL, NULL, NULL, NULL,
729 	0
730 },
731 { "map",	0, 0, 0, 0, 0, 0, 2, "client-side image map ",
732 	NULL, NULL, NULL, NULL, NULL,
733 	0
734 },
735 { "menu",	0, 0, 0, 0, 1, 1, 0, "menu list ",
736 	NULL, NULL, NULL, NULL, NULL,
737 	0
738 },
739 { "meta",	0, 2, 2, 1, 0, 0, 0, "generic metainformation ",
740 	NULL, NULL, NULL, NULL, NULL,
741 	0
742 },
743 { "noembed",	0, 0, 0, 0, 0, 0, 0, "",
744 	NULL, NULL, NULL, NULL, NULL,
745 	DATA_RAWTEXT
746 },
747 { "noframes",	0, 0, 0, 0, 0, 2, 0, "alternate content container for non frame-based rendering ",
748 	NULL, NULL, NULL, NULL, NULL,
749 	DATA_RAWTEXT
750 },
751 { "noscript",	0, 0, 0, 0, 0, 0, 0, "alternate content container for non script-based rendering ",
752 	NULL, NULL, NULL, NULL, NULL,
753 	0
754 },
755 { "object",	0, 0, 0, 0, 0, 0, 2, "generic embedded object ",
756 	NULL, NULL, NULL, NULL, NULL,
757 	0
758 },
759 { "ol",		0, 0, 0, 0, 0, 0, 0, "ordered list ",
760 	NULL, NULL, NULL, NULL, NULL,
761 	0
762 },
763 { "optgroup",	0, 0, 0, 0, 0, 0, 0, "option group ",
764 	NULL, NULL, NULL, NULL, NULL,
765 	0
766 },
767 { "option",	0, 1, 0, 0, 0, 0, 0, "selectable choice " ,
768 	NULL, NULL, NULL, NULL, NULL,
769 	0
770 },
771 { "p",		0, 1, 0, 0, 0, 0, 0, "paragraph ",
772 	NULL, NULL, NULL, NULL, NULL,
773 	0
774 },
775 { "param",	0, 2, 2, 1, 0, 0, 0, "named property value ",
776 	NULL, NULL, NULL, NULL, NULL,
777 	0
778 },
779 { "plaintext",	0, 0, 0, 0, 0, 0, 0, "",
780 	NULL, NULL, NULL, NULL, NULL,
781 	DATA_PLAINTEXT
782 },
783 { "pre",	0, 0, 0, 0, 0, 0, 0, "preformatted text ",
784 	NULL, NULL, NULL, NULL, NULL,
785 	0
786 },
787 { "q",		0, 0, 0, 0, 0, 0, 1, "short inline quotation ",
788 	NULL, NULL, NULL, NULL, NULL,
789 	0
790 },
791 { "s",		0, 3, 0, 0, 1, 1, 1, "strike-through text style",
792 	NULL, NULL, NULL, NULL, NULL,
793 	0
794 },
795 { "samp",	0, 0, 0, 0, 0, 0, 1, "sample program output, scripts, etc.",
796 	NULL, NULL, NULL, NULL, NULL,
797 	0
798 },
799 { "script",	0, 0, 0, 0, 0, 0, 2, "script statements ",
800 	NULL, NULL, NULL, NULL, NULL,
801 	DATA_SCRIPT
802 },
803 { "select",	0, 0, 0, 0, 0, 0, 1, "option selector ",
804 	NULL, NULL, NULL, NULL, NULL,
805 	0
806 },
807 { "small",	0, 3, 0, 0, 0, 0, 1, "small text style",
808 	NULL, NULL, NULL, NULL, NULL,
809 	0
810 },
811 { "span",	0, 0, 0, 0, 0, 0, 1, "generic language/style container ",
812 	NULL, NULL, NULL, NULL, NULL,
813 	0
814 },
815 { "strike",	0, 3, 0, 0, 1, 1, 1, "strike-through text",
816 	NULL, NULL, NULL, NULL, NULL,
817 	0
818 },
819 { "strong",	0, 3, 0, 0, 0, 0, 1, "strong emphasis",
820 	NULL, NULL, NULL, NULL, NULL,
821 	0
822 },
823 { "style",	0, 0, 0, 0, 0, 0, 0, "style info ",
824 	NULL, NULL, NULL, NULL, NULL,
825 	DATA_RAWTEXT
826 },
827 { "sub",	0, 3, 0, 0, 0, 0, 1, "subscript",
828 	NULL, NULL, NULL, NULL, NULL,
829 	0
830 },
831 { "sup",	0, 3, 0, 0, 0, 0, 1, "superscript ",
832 	NULL, NULL, NULL, NULL, NULL,
833 	0
834 },
835 { "table",	0, 0, 0, 0, 0, 0, 0, "",
836 	NULL, NULL, NULL, NULL, NULL,
837 	0
838 },
839 { "tbody",	1, 0, 0, 0, 0, 0, 0, "table body ",
840 	NULL, NULL, NULL, NULL, NULL,
841 	0
842 },
843 { "td",		0, 0, 0, 0, 0, 0, 0, "table data cell",
844 	NULL, NULL, NULL, NULL, NULL,
845 	0
846 },
847 { "textarea",	0, 0, 0, 0, 0, 0, 1, "multi-line text field ",
848 	NULL, NULL, NULL, NULL, NULL,
849 	DATA_RCDATA
850 },
851 { "tfoot",	0, 1, 0, 0, 0, 0, 0, "table footer ",
852 	NULL, NULL, NULL, NULL, NULL,
853 	0
854 },
855 { "th",		0, 1, 0, 0, 0, 0, 0, "table header cell",
856 	NULL, NULL, NULL, NULL, NULL,
857 	0
858 },
859 { "thead",	0, 1, 0, 0, 0, 0, 0, "table header ",
860 	NULL, NULL, NULL, NULL, NULL,
861 	0
862 },
863 { "title",	0, 0, 0, 0, 0, 0, 0, "document title ",
864 	NULL, NULL, NULL, NULL, NULL,
865 	DATA_RCDATA
866 },
867 { "tr",		0, 0, 0, 0, 0, 0, 0, "table row ",
868 	NULL, NULL, NULL, NULL, NULL,
869 	0
870 },
871 { "tt",		0, 3, 0, 0, 0, 0, 1, "teletype or monospaced text style",
872 	NULL, NULL, NULL, NULL, NULL,
873 	0
874 },
875 { "u",		0, 3, 0, 0, 1, 1, 1, "underlined text style",
876 	NULL, NULL, NULL, NULL, NULL,
877 	0
878 },
879 { "ul",		0, 0, 0, 0, 0, 0, 0, "unordered list ",
880 	NULL, NULL, NULL, NULL, NULL,
881 	0
882 },
883 { "var",	0, 0, 0, 0, 0, 0, 1, "instance of a variable or program argument",
884 	NULL, NULL, NULL, NULL, NULL,
885 	0
886 },
887 { "xmp",	0, 0, 0, 0, 0, 0, 1, "",
888 	NULL, NULL, NULL, NULL, NULL,
889 	DATA_RAWTEXT
890 }
891 };
892 
/*
 * One auto-close rule: a pair of element names.
 * NOTE(review): judging by the field names and the table comment, a
 * start tag `newTag` implies the end of a currently open `oldTag`
 * element - confirm against the code that consults the table.
 */
typedef struct {
    const char *oldTag;
    const char *newTag;
} htmlStartCloseEntry;

/*
 * start tags that imply the end of current element
 *
 * The entries are listed in ascending order of (oldTag, newTag); keep
 * them that way when adding rules.
 * NOTE(review): presumably a lookup elsewhere depends on this ordering
 * - confirm before reordering.
 */
static const htmlStartCloseEntry htmlStartClose[] = {
    /* a */
    { "a", "a" }, { "a", "fieldset" }, { "a", "table" }, { "a", "td" },
    { "a", "th" },
    /* address */
    { "address", "dd" }, { "address", "dl" }, { "address", "dt" },
    { "address", "form" }, { "address", "li" }, { "address", "ul" },
    /* b */
    { "b", "center" }, { "b", "p" }, { "b", "td" }, { "b", "th" },
    /* big */
    { "big", "p" },
    /* caption */
    { "caption", "col" }, { "caption", "colgroup" }, { "caption", "tbody" },
    { "caption", "tfoot" }, { "caption", "thead" }, { "caption", "tr" },
    /* col */
    { "col", "col" }, { "col", "colgroup" }, { "col", "tbody" },
    { "col", "tfoot" }, { "col", "thead" }, { "col", "tr" },
    /* colgroup */
    { "colgroup", "colgroup" }, { "colgroup", "tbody" },
    { "colgroup", "tfoot" }, { "colgroup", "thead" }, { "colgroup", "tr" },
    /* dd */
    { "dd", "dt" },
    /* dir */
    { "dir", "dd" }, { "dir", "dl" }, { "dir", "dt" }, { "dir", "form" },
    { "dir", "ul" },
    /* dl */
    { "dl", "form" }, { "dl", "li" },
    /* dt */
    { "dt", "dd" }, { "dt", "dl" },
    /* font */
    { "font", "center" }, { "font", "td" }, { "font", "th" },
    /* form */
    { "form", "form" },
    /* h1 .. h6 */
    { "h1", "fieldset" }, { "h1", "form" }, { "h1", "li" }, { "h1", "p" },
    { "h1", "table" },
    { "h2", "fieldset" }, { "h2", "form" }, { "h2", "li" }, { "h2", "p" },
    { "h2", "table" },
    { "h3", "fieldset" }, { "h3", "form" }, { "h3", "li" }, { "h3", "p" },
    { "h3", "table" },
    { "h4", "fieldset" }, { "h4", "form" }, { "h4", "li" }, { "h4", "p" },
    { "h4", "table" },
    { "h5", "fieldset" }, { "h5", "form" }, { "h5", "li" }, { "h5", "p" },
    { "h5", "table" },
    { "h6", "fieldset" }, { "h6", "form" }, { "h6", "li" }, { "h6", "p" },
    { "h6", "table" },
    /* head */
    { "head", "a" }, { "head", "abbr" }, { "head", "acronym" },
    { "head", "address" }, { "head", "b" }, { "head", "bdo" },
    { "head", "big" }, { "head", "blockquote" }, { "head", "body" },
    { "head", "br" }, { "head", "center" }, { "head", "cite" },
    { "head", "code" }, { "head", "dd" }, { "head", "dfn" },
    { "head", "dir" }, { "head", "div" }, { "head", "dl" },
    { "head", "dt" }, { "head", "em" }, { "head", "fieldset" },
    { "head", "font" }, { "head", "form" }, { "head", "frameset" },
    { "head", "h1" }, { "head", "h2" }, { "head", "h3" },
    { "head", "h4" }, { "head", "h5" }, { "head", "h6" },
    { "head", "hr" }, { "head", "i" }, { "head", "iframe" },
    { "head", "img" }, { "head", "kbd" }, { "head", "li" },
    { "head", "listing" }, { "head", "map" }, { "head", "menu" },
    { "head", "ol" }, { "head", "p" }, { "head", "pre" },
    { "head", "q" }, { "head", "s" }, { "head", "samp" },
    { "head", "small" }, { "head", "span" }, { "head", "strike" },
    { "head", "strong" }, { "head", "sub" }, { "head", "sup" },
    { "head", "table" }, { "head", "tt" }, { "head", "u" },
    { "head", "ul" }, { "head", "var" }, { "head", "xmp" },
    /* hr */
    { "hr", "form" },
    /* i */
    { "i", "center" }, { "i", "p" }, { "i", "td" }, { "i", "th" },
    /* legend */
    { "legend", "fieldset" },
    /* li */
    { "li", "li" },
    /* link */
    { "link", "body" }, { "link", "frameset" },
    /* listing */
    { "listing", "dd" }, { "listing", "dl" }, { "listing", "dt" },
    { "listing", "fieldset" }, { "listing", "form" }, { "listing", "li" },
    { "listing", "table" }, { "listing", "ul" },
    /* menu */
    { "menu", "dd" }, { "menu", "dl" }, { "menu", "dt" },
    { "menu", "form" }, { "menu", "ul" },
    /* ol */
    { "ol", "form" },
    /* option */
    { "option", "optgroup" }, { "option", "option" },
    /* p */
    { "p", "address" }, { "p", "blockquote" }, { "p", "body" },
    { "p", "caption" }, { "p", "center" }, { "p", "col" },
    { "p", "colgroup" }, { "p", "dd" }, { "p", "dir" },
    { "p", "div" }, { "p", "dl" }, { "p", "dt" },
    { "p", "fieldset" }, { "p", "form" }, { "p", "frameset" },
    { "p", "h1" }, { "p", "h2" }, { "p", "h3" },
    { "p", "h4" }, { "p", "h5" }, { "p", "h6" },
    { "p", "head" }, { "p", "hr" }, { "p", "li" },
    { "p", "listing" }, { "p", "menu" }, { "p", "ol" },
    { "p", "p" }, { "p", "pre" }, { "p", "table" },
    { "p", "tbody" }, { "p", "td" }, { "p", "tfoot" },
    { "p", "th" }, { "p", "title" }, { "p", "tr" },
    { "p", "ul" }, { "p", "xmp" },
    /* pre */
    { "pre", "dd" }, { "pre", "dl" }, { "pre", "dt" },
    { "pre", "fieldset" }, { "pre", "form" }, { "pre", "li" },
    { "pre", "table" }, { "pre", "ul" },
    /* s */
    { "s", "p" },
    /* script */
    { "script", "noscript" },
    /* small */
    { "small", "p" },
    /* span */
    { "span", "td" }, { "span", "th" },
    /* strike */
    { "strike", "p" },
    /* style */
    { "style", "body" }, { "style", "frameset" },
    /* tbody */
    { "tbody", "tbody" }, { "tbody", "tfoot" },
    /* td */
    { "td", "tbody" }, { "td", "td" }, { "td", "tfoot" }, { "td", "th" },
    { "td", "tr" },
    /* tfoot */
    { "tfoot", "tbody" },
    /* th */
    { "th", "tbody" }, { "th", "td" }, { "th", "tfoot" }, { "th", "th" },
    { "th", "tr" },
    /* thead */
    { "thead", "tbody" }, { "thead", "tfoot" },
    /* title */
    { "title", "body" }, { "title", "frameset" },
    /* tr */
    { "tr", "tbody" }, { "tr", "tfoot" }, { "tr", "tr" },
    /* tt */
    { "tt", "p" },
    /* u */
    { "u", "p" }, { "u", "td" }, { "u", "th" },
    /* ul */
    { "ul", "address" }, { "ul", "form" }, { "ul", "menu" }, { "ul", "pre" },
    /* xmp */
    { "xmp", "dd" }, { "xmp", "dl" }, { "xmp", "dt" },
    { "xmp", "fieldset" }, { "xmp", "form" }, { "xmp", "li" },
    { "xmp", "table" }, { "xmp", "ul" }
};
1152 
/*
 * Names of the elements that are not supposed to hold character data
 * directly. htmlCheckParagraph() walks this NULL-terminated list and
 * implies a p element when one of them is the current element.
 *
 * TODO: extend that list by reading the HTML SGML DTD on
 *       implied paragraph
 */
static const char *const htmlNoContentElements[] = { "html", "head", NULL };
1165 
/*
 * Attribute names whose content is of type %Script;.
 * NOTE: htmlIsScriptAttribute() rejects every name that does not begin
 *       with "on" before it consults this table, so any entry added
 *       here has to keep that prefix. The table has no terminator; its
 *       length is taken with sizeof.
 */
static const char *const htmlScriptAttributes[] = {
    /* mouse */
    "onclick", "ondblclick", "onmousedown", "onmouseup",
    "onmouseover", "onmousemove", "onmouseout",
    /* keyboard */
    "onkeypress", "onkeydown", "onkeyup",
    /* document and form */
    "onload", "onunload", "onfocus", "onblur",
    "onsubmit", "onreset", "onchange", "onselect"
};
1191 
/*
 * End tag priorities, used by the parser to cope with broken HTML
 * pages. Every element gets a priority, and a stray end tag may only
 * close open elements whose priority is lower than or equal to its
 * own. This lets the parser decide what to do with extra end tags.
 */

typedef struct {
    const char *name;   /* element name, NULL in the terminating entry */
    int priority;       /* end tag priority of that element */
} elementPriority;

static const elementPriority htmlEndPriority[] = {
    { "div",   150 },
    { "td",    160 },
    { "th",    160 },
    { "tr",    170 },
    { "thead", 180 },
    { "tbody", 180 },
    { "tfoot", 180 },
    { "table", 190 },
    { "head",  200 },
    { "body",  200 },
    { "html",  220 },
    { NULL,    100 }    /* Default priority */
};
1219 
1220 /************************************************************************
1221  *									*
1222  *	functions to handle HTML specific data			*
1223  *									*
1224  ************************************************************************/
1225 
1226 static void
htmlParserFinishElementParsing(htmlParserCtxtPtr ctxt)1227 htmlParserFinishElementParsing(htmlParserCtxtPtr ctxt) {
1228     /*
1229      * Capture end position and add node
1230      */
1231     if ( ctxt->node != NULL && ctxt->record_info ) {
1232        ctxt->nodeInfo->end_pos = ctxt->input->consumed +
1233                                 (CUR_PTR - ctxt->input->base);
1234        ctxt->nodeInfo->end_line = ctxt->input->line;
1235        ctxt->nodeInfo->node = ctxt->node;
1236        xmlParserAddNodeInfo(ctxt, ctxt->nodeInfo);
1237        htmlNodeInfoPop(ctxt);
1238     }
1239 }
1240 
/**
 * htmlInitAutoClose:
 *
 * DEPRECATED: This is a no-op, calling it has no effect.
 */
void
htmlInitAutoClose(void)
{
    return;
}
1249 
1250 static int
htmlCompareTags(const void * key,const void * member)1251 htmlCompareTags(const void *key, const void *member) {
1252     const xmlChar *tag = (const xmlChar *) key;
1253     const htmlElemDesc *desc = (const htmlElemDesc *) member;
1254 
1255     return(xmlStrcasecmp(tag, BAD_CAST desc->name));
1256 }
1257 
1258 /**
1259  * htmlTagLookup:
1260  * @tag:  The tag name in lowercase
1261  *
1262  * Lookup the HTML tag in the ElementTable
1263  *
1264  * Returns the related htmlElemDescPtr or NULL if not found.
1265  */
1266 const htmlElemDesc *
htmlTagLookup(const xmlChar * tag)1267 htmlTagLookup(const xmlChar *tag) {
1268     if (tag == NULL)
1269         return(NULL);
1270 
1271     return((const htmlElemDesc *) bsearch(tag, html40ElementTable,
1272                 sizeof(html40ElementTable) / sizeof(htmlElemDesc),
1273                 sizeof(htmlElemDesc), htmlCompareTags));
1274 }
1275 
1276 /**
1277  * htmlGetEndPriority:
1278  * @name: The name of the element to look up the priority for.
1279  *
1280  * Return value: The "endtag" priority.
1281  **/
1282 static int
htmlGetEndPriority(const xmlChar * name)1283 htmlGetEndPriority (const xmlChar *name) {
1284     int i = 0;
1285 
1286     while ((htmlEndPriority[i].name != NULL) &&
1287 	   (!xmlStrEqual((const xmlChar *)htmlEndPriority[i].name, name)))
1288 	i++;
1289 
1290     return(htmlEndPriority[i].priority);
1291 }
1292 
1293 
1294 static int
htmlCompareStartClose(const void * vkey,const void * member)1295 htmlCompareStartClose(const void *vkey, const void *member) {
1296     const htmlStartCloseEntry *key = (const htmlStartCloseEntry *) vkey;
1297     const htmlStartCloseEntry *entry = (const htmlStartCloseEntry *) member;
1298     int ret;
1299 
1300     ret = strcmp(key->oldTag, entry->oldTag);
1301     if (ret == 0)
1302         ret = strcmp(key->newTag, entry->newTag);
1303 
1304     return(ret);
1305 }
1306 
1307 /**
1308  * htmlCheckAutoClose:
1309  * @newtag:  The new tag name
1310  * @oldtag:  The old tag name
1311  *
1312  * Checks whether the new tag is one of the registered valid tags for
1313  * closing old.
1314  *
1315  * Returns 0 if no, 1 if yes.
1316  */
1317 static int
htmlCheckAutoClose(const xmlChar * newtag,const xmlChar * oldtag)1318 htmlCheckAutoClose(const xmlChar * newtag, const xmlChar * oldtag)
1319 {
1320     htmlStartCloseEntry key;
1321     void *res;
1322 
1323     key.oldTag = (const char *) oldtag;
1324     key.newTag = (const char *) newtag;
1325     res = bsearch(&key, htmlStartClose,
1326             sizeof(htmlStartClose) / sizeof(htmlStartCloseEntry),
1327             sizeof(htmlStartCloseEntry), htmlCompareStartClose);
1328     return(res != NULL);
1329 }
1330 
1331 /**
1332  * htmlAutoCloseOnClose:
1333  * @ctxt:  an HTML parser context
1334  * @newtag:  The new tag name
1335  * @force:  force the tag closure
1336  *
1337  * The HTML DTD allows an ending tag to implicitly close other tags.
1338  */
1339 static void
htmlAutoCloseOnClose(htmlParserCtxtPtr ctxt,const xmlChar * newtag)1340 htmlAutoCloseOnClose(htmlParserCtxtPtr ctxt, const xmlChar * newtag)
1341 {
1342     const htmlElemDesc *info;
1343     int i, priority;
1344 
1345     if (ctxt->options & HTML_PARSE_HTML5)
1346         return;
1347 
1348     priority = htmlGetEndPriority(newtag);
1349 
1350     for (i = (ctxt->nameNr - 1); i >= 0; i--) {
1351 
1352         if (xmlStrEqual(newtag, ctxt->nameTab[i]))
1353             break;
1354         /*
1355          * A misplaced endtag can only close elements with lower
1356          * or equal priority, so if we find an element with higher
1357          * priority before we find an element with
1358          * matching name, we just ignore this endtag
1359          */
1360         if (htmlGetEndPriority(ctxt->nameTab[i]) > priority)
1361             return;
1362     }
1363     if (i < 0)
1364         return;
1365 
1366     while (!xmlStrEqual(newtag, ctxt->name)) {
1367         info = htmlTagLookup(ctxt->name);
1368         if ((info != NULL) && (info->endTag == 3)) {
1369             htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,
1370 	                 "Opening and ending tag mismatch: %s and %s\n",
1371 			 newtag, ctxt->name);
1372         }
1373 	htmlParserFinishElementParsing(ctxt);
1374         if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
1375             ctxt->sax->endElement(ctxt->userData, ctxt->name);
1376 	htmlnamePop(ctxt);
1377     }
1378 }
1379 
1380 /**
1381  * htmlAutoCloseOnEnd:
1382  * @ctxt:  an HTML parser context
1383  *
1384  * Close all remaining tags at the end of the stream
1385  */
1386 static void
htmlAutoCloseOnEnd(htmlParserCtxtPtr ctxt)1387 htmlAutoCloseOnEnd(htmlParserCtxtPtr ctxt)
1388 {
1389     int i;
1390 
1391     if (ctxt->options & HTML_PARSE_HTML5)
1392         return;
1393 
1394     if (ctxt->nameNr == 0)
1395         return;
1396     for (i = (ctxt->nameNr - 1); i >= 0; i--) {
1397 	htmlParserFinishElementParsing(ctxt);
1398         if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
1399             ctxt->sax->endElement(ctxt->userData, ctxt->name);
1400 	htmlnamePop(ctxt);
1401     }
1402 }
1403 
1404 /**
1405  * htmlAutoClose:
1406  * @ctxt:  an HTML parser context
1407  * @newtag:  The new tag name or NULL
1408  *
1409  * The HTML DTD allows a tag to implicitly close other tags.
1410  * The list is kept in htmlStartClose array. This function is
1411  * called when a new tag has been detected and generates the
1412  * appropriates closes if possible/needed.
1413  * If newtag is NULL this mean we are at the end of the resource
1414  * and we should check
1415  */
1416 static void
htmlAutoClose(htmlParserCtxtPtr ctxt,const xmlChar * newtag)1417 htmlAutoClose(htmlParserCtxtPtr ctxt, const xmlChar * newtag)
1418 {
1419     if (ctxt->options & HTML_PARSE_HTML5)
1420         return;
1421 
1422     if (newtag == NULL)
1423         return;
1424 
1425     while ((ctxt->name != NULL) &&
1426            (htmlCheckAutoClose(newtag, ctxt->name))) {
1427 	htmlParserFinishElementParsing(ctxt);
1428         if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
1429             ctxt->sax->endElement(ctxt->userData, ctxt->name);
1430 	htmlnamePop(ctxt);
1431     }
1432 }
1433 
1434 /**
1435  * htmlAutoCloseTag:
1436  * @doc:  the HTML document
1437  * @name:  The tag name
1438  * @elem:  the HTML element
1439  *
1440  * DEPRECATED: Internal function, don't use.
1441  *
1442  * The HTML DTD allows a tag to implicitly close other tags.
1443  * The list is kept in htmlStartClose array. This function checks
1444  * if the element or one of it's children would autoclose the
1445  * given tag.
1446  *
1447  * Returns 1 if autoclose, 0 otherwise
1448  */
1449 int
htmlAutoCloseTag(htmlDocPtr doc,const xmlChar * name,htmlNodePtr elem)1450 htmlAutoCloseTag(htmlDocPtr doc, const xmlChar *name, htmlNodePtr elem) {
1451     htmlNodePtr child;
1452 
1453     if (elem == NULL) return(1);
1454     if (xmlStrEqual(name, elem->name)) return(0);
1455     if (htmlCheckAutoClose(elem->name, name)) return(1);
1456     child = elem->children;
1457     while (child != NULL) {
1458         if (htmlAutoCloseTag(doc, name, child)) return(1);
1459 	child = child->next;
1460     }
1461     return(0);
1462 }
1463 
1464 /**
1465  * htmlIsAutoClosed:
1466  * @doc:  the HTML document
1467  * @elem:  the HTML element
1468  *
1469  * DEPRECATED: Internal function, don't use.
1470  *
1471  * The HTML DTD allows a tag to implicitly close other tags.
1472  * The list is kept in htmlStartClose array. This function checks
1473  * if a tag is autoclosed by one of it's child
1474  *
1475  * Returns 1 if autoclosed, 0 otherwise
1476  */
1477 int
htmlIsAutoClosed(htmlDocPtr doc,htmlNodePtr elem)1478 htmlIsAutoClosed(htmlDocPtr doc, htmlNodePtr elem) {
1479     htmlNodePtr child;
1480 
1481     if (elem == NULL) return(1);
1482     child = elem->children;
1483     while (child != NULL) {
1484 	if (htmlAutoCloseTag(doc, elem->name, child)) return(1);
1485 	child = child->next;
1486     }
1487     return(0);
1488 }
1489 
1490 /**
1491  * htmlCheckImplied:
1492  * @ctxt:  an HTML parser context
1493  * @newtag:  The new tag name
1494  *
1495  * The HTML DTD allows a tag to exists only implicitly
1496  * called when a new tag has been detected and generates the
1497  * appropriates implicit tags if missing
1498  */
1499 static void
htmlCheckImplied(htmlParserCtxtPtr ctxt,const xmlChar * newtag)1500 htmlCheckImplied(htmlParserCtxtPtr ctxt, const xmlChar *newtag) {
1501     int i;
1502 
1503     if (ctxt->options & (HTML_PARSE_NOIMPLIED | HTML_PARSE_HTML5))
1504         return;
1505     if (!htmlOmittedDefaultValue)
1506 	return;
1507     if (xmlStrEqual(newtag, BAD_CAST"html"))
1508 	return;
1509     if (ctxt->nameNr <= 0) {
1510 	htmlnamePush(ctxt, BAD_CAST"html");
1511 	if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1512 	    ctxt->sax->startElement(ctxt->userData, BAD_CAST"html", NULL);
1513     }
1514     if ((xmlStrEqual(newtag, BAD_CAST"body")) || (xmlStrEqual(newtag, BAD_CAST"head")))
1515         return;
1516     if ((ctxt->nameNr <= 1) &&
1517         ((xmlStrEqual(newtag, BAD_CAST"script")) ||
1518 	 (xmlStrEqual(newtag, BAD_CAST"style")) ||
1519 	 (xmlStrEqual(newtag, BAD_CAST"meta")) ||
1520 	 (xmlStrEqual(newtag, BAD_CAST"link")) ||
1521 	 (xmlStrEqual(newtag, BAD_CAST"title")) ||
1522 	 (xmlStrEqual(newtag, BAD_CAST"base")))) {
1523         if (ctxt->html >= 3) {
1524             /* we already saw or generated an <head> before */
1525             return;
1526         }
1527         /*
1528          * dropped OBJECT ... i you put it first BODY will be
1529          * assumed !
1530          */
1531         htmlnamePush(ctxt, BAD_CAST"head");
1532         if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1533             ctxt->sax->startElement(ctxt->userData, BAD_CAST"head", NULL);
1534     } else if ((!xmlStrEqual(newtag, BAD_CAST"noframes")) &&
1535 	       (!xmlStrEqual(newtag, BAD_CAST"frame")) &&
1536 	       (!xmlStrEqual(newtag, BAD_CAST"frameset"))) {
1537         if (ctxt->html >= 10) {
1538             /* we already saw or generated a <body> before */
1539             return;
1540         }
1541 	for (i = 0;i < ctxt->nameNr;i++) {
1542 	    if (xmlStrEqual(ctxt->nameTab[i], BAD_CAST"body")) {
1543 		return;
1544 	    }
1545 	    if (xmlStrEqual(ctxt->nameTab[i], BAD_CAST"head")) {
1546 		return;
1547 	    }
1548 	}
1549 
1550 	htmlnamePush(ctxt, BAD_CAST"body");
1551 	if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1552 	    ctxt->sax->startElement(ctxt->userData, BAD_CAST"body", NULL);
1553     }
1554 }
1555 
1556 /**
1557  * htmlCheckParagraph
1558  * @ctxt:  an HTML parser context
1559  *
1560  * Check whether a p element need to be implied before inserting
1561  * characters in the current element.
1562  *
1563  * Returns 1 if a paragraph has been inserted, 0 if not and -1
1564  *         in case of error.
1565  */
1566 
1567 static int
htmlCheckParagraph(htmlParserCtxtPtr ctxt)1568 htmlCheckParagraph(htmlParserCtxtPtr ctxt) {
1569     const xmlChar *tag;
1570     int i;
1571 
1572     if (ctxt == NULL)
1573 	return(-1);
1574     if (ctxt->options & HTML_PARSE_HTML5)
1575         return(0);
1576 
1577     tag = ctxt->name;
1578     if (tag == NULL) {
1579 	htmlAutoClose(ctxt, BAD_CAST"p");
1580 	htmlCheckImplied(ctxt, BAD_CAST"p");
1581 	htmlnamePush(ctxt, BAD_CAST"p");
1582 	if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1583 	    ctxt->sax->startElement(ctxt->userData, BAD_CAST"p", NULL);
1584 	return(1);
1585     }
1586     if (!htmlOmittedDefaultValue)
1587 	return(0);
1588     for (i = 0; htmlNoContentElements[i] != NULL; i++) {
1589 	if (xmlStrEqual(tag, BAD_CAST htmlNoContentElements[i])) {
1590 	    htmlAutoClose(ctxt, BAD_CAST"p");
1591 	    htmlCheckImplied(ctxt, BAD_CAST"p");
1592 	    htmlnamePush(ctxt, BAD_CAST"p");
1593 	    if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1594 		ctxt->sax->startElement(ctxt->userData, BAD_CAST"p", NULL);
1595 	    return(1);
1596 	}
1597     }
1598     return(0);
1599 }
1600 
1601 /**
1602  * htmlIsScriptAttribute:
1603  * @name:  an attribute name
1604  *
1605  * Check if an attribute is of content type Script
1606  *
1607  * Returns 1 is the attribute is a script 0 otherwise
1608  */
1609 int
htmlIsScriptAttribute(const xmlChar * name)1610 htmlIsScriptAttribute(const xmlChar *name) {
1611     unsigned int i;
1612 
1613     if (name == NULL)
1614       return(0);
1615     /*
1616      * all script attributes start with 'on'
1617      */
1618     if ((name[0] != 'o') || (name[1] != 'n'))
1619       return(0);
1620     for (i = 0;
1621 	 i < sizeof(htmlScriptAttributes)/sizeof(htmlScriptAttributes[0]);
1622 	 i++) {
1623 	if (xmlStrEqual(name, (const xmlChar *) htmlScriptAttributes[i]))
1624 	    return(1);
1625     }
1626     return(0);
1627 }
1628 
1629 /************************************************************************
1630  *									*
1631  *	The list of HTML predefined entities			*
1632  *									*
1633  ************************************************************************/
1634 
1635 
1636 static const htmlEntityDesc  html40EntitiesTable[] = {
1637 /*
1638  * the 4 absolute ones, plus apostrophe.
1639  */
1640 { 34,	"quot",	"quotation mark = APL quote, U+0022 ISOnum" },
1641 { 38,	"amp",	"ampersand, U+0026 ISOnum" },
1642 { 39,	"apos",	"single quote" },
1643 { 60,	"lt",	"less-than sign, U+003C ISOnum" },
1644 { 62,	"gt",	"greater-than sign, U+003E ISOnum" },
1645 
1646 /*
1647  * A bunch still in the 128-255 range
1648  * Replacing them depend really on the charset used.
1649  */
1650 { 160,	"nbsp",	"no-break space = non-breaking space, U+00A0 ISOnum" },
1651 { 161,	"iexcl","inverted exclamation mark, U+00A1 ISOnum" },
1652 { 162,	"cent",	"cent sign, U+00A2 ISOnum" },
1653 { 163,	"pound","pound sign, U+00A3 ISOnum" },
1654 { 164,	"curren","currency sign, U+00A4 ISOnum" },
1655 { 165,	"yen",	"yen sign = yuan sign, U+00A5 ISOnum" },
1656 { 166,	"brvbar","broken bar = broken vertical bar, U+00A6 ISOnum" },
1657 { 167,	"sect",	"section sign, U+00A7 ISOnum" },
1658 { 168,	"uml",	"diaeresis = spacing diaeresis, U+00A8 ISOdia" },
1659 { 169,	"copy",	"copyright sign, U+00A9 ISOnum" },
1660 { 170,	"ordf",	"feminine ordinal indicator, U+00AA ISOnum" },
1661 { 171,	"laquo","left-pointing double angle quotation mark = left pointing guillemet, U+00AB ISOnum" },
1662 { 172,	"not",	"not sign, U+00AC ISOnum" },
1663 { 173,	"shy",	"soft hyphen = discretionary hyphen, U+00AD ISOnum" },
1664 { 174,	"reg",	"registered sign = registered trade mark sign, U+00AE ISOnum" },
1665 { 175,	"macr",	"macron = spacing macron = overline = APL overbar, U+00AF ISOdia" },
1666 { 176,	"deg",	"degree sign, U+00B0 ISOnum" },
1667 { 177,	"plusmn","plus-minus sign = plus-or-minus sign, U+00B1 ISOnum" },
1668 { 178,	"sup2",	"superscript two = superscript digit two = squared, U+00B2 ISOnum" },
1669 { 179,	"sup3",	"superscript three = superscript digit three = cubed, U+00B3 ISOnum" },
1670 { 180,	"acute","acute accent = spacing acute, U+00B4 ISOdia" },
1671 { 181,	"micro","micro sign, U+00B5 ISOnum" },
1672 { 182,	"para",	"pilcrow sign = paragraph sign, U+00B6 ISOnum" },
1673 { 183,	"middot","middle dot = Georgian comma Greek middle dot, U+00B7 ISOnum" },
1674 { 184,	"cedil","cedilla = spacing cedilla, U+00B8 ISOdia" },
1675 { 185,	"sup1",	"superscript one = superscript digit one, U+00B9 ISOnum" },
1676 { 186,	"ordm",	"masculine ordinal indicator, U+00BA ISOnum" },
1677 { 187,	"raquo","right-pointing double angle quotation mark right pointing guillemet, U+00BB ISOnum" },
1678 { 188,	"frac14","vulgar fraction one quarter = fraction one quarter, U+00BC ISOnum" },
1679 { 189,	"frac12","vulgar fraction one half = fraction one half, U+00BD ISOnum" },
1680 { 190,	"frac34","vulgar fraction three quarters = fraction three quarters, U+00BE ISOnum" },
1681 { 191,	"iquest","inverted question mark = turned question mark, U+00BF ISOnum" },
1682 { 192,	"Agrave","latin capital letter A with grave = latin capital letter A grave, U+00C0 ISOlat1" },
1683 { 193,	"Aacute","latin capital letter A with acute, U+00C1 ISOlat1" },
1684 { 194,	"Acirc","latin capital letter A with circumflex, U+00C2 ISOlat1" },
1685 { 195,	"Atilde","latin capital letter A with tilde, U+00C3 ISOlat1" },
1686 { 196,	"Auml",	"latin capital letter A with diaeresis, U+00C4 ISOlat1" },
1687 { 197,	"Aring","latin capital letter A with ring above = latin capital letter A ring, U+00C5 ISOlat1" },
1688 { 198,	"AElig","latin capital letter AE = latin capital ligature AE, U+00C6 ISOlat1" },
1689 { 199,	"Ccedil","latin capital letter C with cedilla, U+00C7 ISOlat1" },
1690 { 200,	"Egrave","latin capital letter E with grave, U+00C8 ISOlat1" },
1691 { 201,	"Eacute","latin capital letter E with acute, U+00C9 ISOlat1" },
1692 { 202,	"Ecirc","latin capital letter E with circumflex, U+00CA ISOlat1" },
1693 { 203,	"Euml",	"latin capital letter E with diaeresis, U+00CB ISOlat1" },
1694 { 204,	"Igrave","latin capital letter I with grave, U+00CC ISOlat1" },
1695 { 205,	"Iacute","latin capital letter I with acute, U+00CD ISOlat1" },
1696 { 206,	"Icirc","latin capital letter I with circumflex, U+00CE ISOlat1" },
1697 { 207,	"Iuml",	"latin capital letter I with diaeresis, U+00CF ISOlat1" },
1698 { 208,	"ETH",	"latin capital letter ETH, U+00D0 ISOlat1" },
1699 { 209,	"Ntilde","latin capital letter N with tilde, U+00D1 ISOlat1" },
1700 { 210,	"Ograve","latin capital letter O with grave, U+00D2 ISOlat1" },
1701 { 211,	"Oacute","latin capital letter O with acute, U+00D3 ISOlat1" },
1702 { 212,	"Ocirc","latin capital letter O with circumflex, U+00D4 ISOlat1" },
1703 { 213,	"Otilde","latin capital letter O with tilde, U+00D5 ISOlat1" },
1704 { 214,	"Ouml",	"latin capital letter O with diaeresis, U+00D6 ISOlat1" },
1705 { 215,	"times","multiplication sign, U+00D7 ISOnum" },
1706 { 216,	"Oslash","latin capital letter O with stroke latin capital letter O slash, U+00D8 ISOlat1" },
1707 { 217,	"Ugrave","latin capital letter U with grave, U+00D9 ISOlat1" },
1708 { 218,	"Uacute","latin capital letter U with acute, U+00DA ISOlat1" },
1709 { 219,	"Ucirc","latin capital letter U with circumflex, U+00DB ISOlat1" },
1710 { 220,	"Uuml",	"latin capital letter U with diaeresis, U+00DC ISOlat1" },
1711 { 221,	"Yacute","latin capital letter Y with acute, U+00DD ISOlat1" },
1712 { 222,	"THORN","latin capital letter THORN, U+00DE ISOlat1" },
1713 { 223,	"szlig","latin small letter sharp s = ess-zed, U+00DF ISOlat1" },
1714 { 224,	"agrave","latin small letter a with grave = latin small letter a grave, U+00E0 ISOlat1" },
1715 { 225,	"aacute","latin small letter a with acute, U+00E1 ISOlat1" },
1716 { 226,	"acirc","latin small letter a with circumflex, U+00E2 ISOlat1" },
1717 { 227,	"atilde","latin small letter a with tilde, U+00E3 ISOlat1" },
1718 { 228,	"auml",	"latin small letter a with diaeresis, U+00E4 ISOlat1" },
1719 { 229,	"aring","latin small letter a with ring above = latin small letter a ring, U+00E5 ISOlat1" },
1720 { 230,	"aelig","latin small letter ae = latin small ligature ae, U+00E6 ISOlat1" },
1721 { 231,	"ccedil","latin small letter c with cedilla, U+00E7 ISOlat1" },
1722 { 232,	"egrave","latin small letter e with grave, U+00E8 ISOlat1" },
1723 { 233,	"eacute","latin small letter e with acute, U+00E9 ISOlat1" },
1724 { 234,	"ecirc","latin small letter e with circumflex, U+00EA ISOlat1" },
1725 { 235,	"euml",	"latin small letter e with diaeresis, U+00EB ISOlat1" },
1726 { 236,	"igrave","latin small letter i with grave, U+00EC ISOlat1" },
1727 { 237,	"iacute","latin small letter i with acute, U+00ED ISOlat1" },
1728 { 238,	"icirc","latin small letter i with circumflex, U+00EE ISOlat1" },
1729 { 239,	"iuml",	"latin small letter i with diaeresis, U+00EF ISOlat1" },
1730 { 240,	"eth",	"latin small letter eth, U+00F0 ISOlat1" },
1731 { 241,	"ntilde","latin small letter n with tilde, U+00F1 ISOlat1" },
1732 { 242,	"ograve","latin small letter o with grave, U+00F2 ISOlat1" },
1733 { 243,	"oacute","latin small letter o with acute, U+00F3 ISOlat1" },
1734 { 244,	"ocirc","latin small letter o with circumflex, U+00F4 ISOlat1" },
1735 { 245,	"otilde","latin small letter o with tilde, U+00F5 ISOlat1" },
1736 { 246,	"ouml",	"latin small letter o with diaeresis, U+00F6 ISOlat1" },
1737 { 247,	"divide","division sign, U+00F7 ISOnum" },
1738 { 248,	"oslash","latin small letter o with stroke, = latin small letter o slash, U+00F8 ISOlat1" },
1739 { 249,	"ugrave","latin small letter u with grave, U+00F9 ISOlat1" },
1740 { 250,	"uacute","latin small letter u with acute, U+00FA ISOlat1" },
1741 { 251,	"ucirc","latin small letter u with circumflex, U+00FB ISOlat1" },
1742 { 252,	"uuml",	"latin small letter u with diaeresis, U+00FC ISOlat1" },
1743 { 253,	"yacute","latin small letter y with acute, U+00FD ISOlat1" },
1744 { 254,	"thorn","latin small letter thorn with, U+00FE ISOlat1" },
1745 { 255,	"yuml",	"latin small letter y with diaeresis, U+00FF ISOlat1" },
1746 
1747 { 338,	"OElig","latin capital ligature OE, U+0152 ISOlat2" },
1748 { 339,	"oelig","latin small ligature oe, U+0153 ISOlat2" },
1749 { 352,	"Scaron","latin capital letter S with caron, U+0160 ISOlat2" },
1750 { 353,	"scaron","latin small letter s with caron, U+0161 ISOlat2" },
1751 { 376,	"Yuml",	"latin capital letter Y with diaeresis, U+0178 ISOlat2" },
1752 
1753 /*
1754  * Anything below should really be kept as entities references
1755  */
1756 { 402,	"fnof",	"latin small f with hook = function = florin, U+0192 ISOtech" },
1757 
1758 { 710,	"circ",	"modifier letter circumflex accent, U+02C6 ISOpub" },
1759 { 732,	"tilde","small tilde, U+02DC ISOdia" },
1760 
1761 { 913,	"Alpha","greek capital letter alpha, U+0391" },
1762 { 914,	"Beta",	"greek capital letter beta, U+0392" },
1763 { 915,	"Gamma","greek capital letter gamma, U+0393 ISOgrk3" },
1764 { 916,	"Delta","greek capital letter delta, U+0394 ISOgrk3" },
1765 { 917,	"Epsilon","greek capital letter epsilon, U+0395" },
1766 { 918,	"Zeta",	"greek capital letter zeta, U+0396" },
1767 { 919,	"Eta",	"greek capital letter eta, U+0397" },
1768 { 920,	"Theta","greek capital letter theta, U+0398 ISOgrk3" },
1769 { 921,	"Iota",	"greek capital letter iota, U+0399" },
1770 { 922,	"Kappa","greek capital letter kappa, U+039A" },
1771 { 923,	"Lambda", "greek capital letter lambda, U+039B ISOgrk3" },
1772 { 924,	"Mu",	"greek capital letter mu, U+039C" },
1773 { 925,	"Nu",	"greek capital letter nu, U+039D" },
1774 { 926,	"Xi",	"greek capital letter xi, U+039E ISOgrk3" },
1775 { 927,	"Omicron","greek capital letter omicron, U+039F" },
1776 { 928,	"Pi",	"greek capital letter pi, U+03A0 ISOgrk3" },
1777 { 929,	"Rho",	"greek capital letter rho, U+03A1" },
1778 { 931,	"Sigma","greek capital letter sigma, U+03A3 ISOgrk3" },
1779 { 932,	"Tau",	"greek capital letter tau, U+03A4" },
1780 { 933,	"Upsilon","greek capital letter upsilon, U+03A5 ISOgrk3" },
1781 { 934,	"Phi",	"greek capital letter phi, U+03A6 ISOgrk3" },
1782 { 935,	"Chi",	"greek capital letter chi, U+03A7" },
1783 { 936,	"Psi",	"greek capital letter psi, U+03A8 ISOgrk3" },
1784 { 937,	"Omega","greek capital letter omega, U+03A9 ISOgrk3" },
1785 
1786 { 945,	"alpha","greek small letter alpha, U+03B1 ISOgrk3" },
1787 { 946,	"beta",	"greek small letter beta, U+03B2 ISOgrk3" },
1788 { 947,	"gamma","greek small letter gamma, U+03B3 ISOgrk3" },
1789 { 948,	"delta","greek small letter delta, U+03B4 ISOgrk3" },
1790 { 949,	"epsilon","greek small letter epsilon, U+03B5 ISOgrk3" },
1791 { 950,	"zeta",	"greek small letter zeta, U+03B6 ISOgrk3" },
1792 { 951,	"eta",	"greek small letter eta, U+03B7 ISOgrk3" },
1793 { 952,	"theta","greek small letter theta, U+03B8 ISOgrk3" },
1794 { 953,	"iota",	"greek small letter iota, U+03B9 ISOgrk3" },
1795 { 954,	"kappa","greek small letter kappa, U+03BA ISOgrk3" },
1796 { 955,	"lambda","greek small letter lambda, U+03BB ISOgrk3" },
1797 { 956,	"mu",	"greek small letter mu, U+03BC ISOgrk3" },
1798 { 957,	"nu",	"greek small letter nu, U+03BD ISOgrk3" },
1799 { 958,	"xi",	"greek small letter xi, U+03BE ISOgrk3" },
1800 { 959,	"omicron","greek small letter omicron, U+03BF NEW" },
1801 { 960,	"pi",	"greek small letter pi, U+03C0 ISOgrk3" },
1802 { 961,	"rho",	"greek small letter rho, U+03C1 ISOgrk3" },
1803 { 962,	"sigmaf","greek small letter final sigma, U+03C2 ISOgrk3" },
1804 { 963,	"sigma","greek small letter sigma, U+03C3 ISOgrk3" },
1805 { 964,	"tau",	"greek small letter tau, U+03C4 ISOgrk3" },
1806 { 965,	"upsilon","greek small letter upsilon, U+03C5 ISOgrk3" },
1807 { 966,	"phi",	"greek small letter phi, U+03C6 ISOgrk3" },
1808 { 967,	"chi",	"greek small letter chi, U+03C7 ISOgrk3" },
1809 { 968,	"psi",	"greek small letter psi, U+03C8 ISOgrk3" },
1810 { 969,	"omega","greek small letter omega, U+03C9 ISOgrk3" },
1811 { 977,	"thetasym","greek small letter theta symbol, U+03D1 NEW" },
1812 { 978,	"upsih","greek upsilon with hook symbol, U+03D2 NEW" },
1813 { 982,	"piv",	"greek pi symbol, U+03D6 ISOgrk3" },
1814 
1815 { 8194,	"ensp",	"en space, U+2002 ISOpub" },
1816 { 8195,	"emsp",	"em space, U+2003 ISOpub" },
1817 { 8201,	"thinsp","thin space, U+2009 ISOpub" },
1818 { 8204,	"zwnj",	"zero width non-joiner, U+200C NEW RFC 2070" },
1819 { 8205,	"zwj",	"zero width joiner, U+200D NEW RFC 2070" },
1820 { 8206,	"lrm",	"left-to-right mark, U+200E NEW RFC 2070" },
1821 { 8207,	"rlm",	"right-to-left mark, U+200F NEW RFC 2070" },
1822 { 8211,	"ndash","en dash, U+2013 ISOpub" },
1823 { 8212,	"mdash","em dash, U+2014 ISOpub" },
1824 { 8216,	"lsquo","left single quotation mark, U+2018 ISOnum" },
1825 { 8217,	"rsquo","right single quotation mark, U+2019 ISOnum" },
1826 { 8218,	"sbquo","single low-9 quotation mark, U+201A NEW" },
1827 { 8220,	"ldquo","left double quotation mark, U+201C ISOnum" },
1828 { 8221,	"rdquo","right double quotation mark, U+201D ISOnum" },
1829 { 8222,	"bdquo","double low-9 quotation mark, U+201E NEW" },
1830 { 8224,	"dagger","dagger, U+2020 ISOpub" },
1831 { 8225,	"Dagger","double dagger, U+2021 ISOpub" },
1832 
1833 { 8226,	"bull",	"bullet = black small circle, U+2022 ISOpub" },
1834 { 8230,	"hellip","horizontal ellipsis = three dot leader, U+2026 ISOpub" },
1835 
1836 { 8240,	"permil","per mille sign, U+2030 ISOtech" },
1837 
1838 { 8242,	"prime","prime = minutes = feet, U+2032 ISOtech" },
1839 { 8243,	"Prime","double prime = seconds = inches, U+2033 ISOtech" },
1840 
1841 { 8249,	"lsaquo","single left-pointing angle quotation mark, U+2039 ISO proposed" },
1842 { 8250,	"rsaquo","single right-pointing angle quotation mark, U+203A ISO proposed" },
1843 
1844 { 8254,	"oline","overline = spacing overscore, U+203E NEW" },
1845 { 8260,	"frasl","fraction slash, U+2044 NEW" },
1846 
1847 { 8364,	"euro",	"euro sign, U+20AC NEW" },
1848 
1849 { 8465,	"image","blackletter capital I = imaginary part, U+2111 ISOamso" },
1850 { 8472,	"weierp","script capital P = power set = Weierstrass p, U+2118 ISOamso" },
1851 { 8476,	"real",	"blackletter capital R = real part symbol, U+211C ISOamso" },
1852 { 8482,	"trade","trade mark sign, U+2122 ISOnum" },
1853 { 8501,	"alefsym","alef symbol = first transfinite cardinal, U+2135 NEW" },
1854 { 8592,	"larr",	"leftwards arrow, U+2190 ISOnum" },
1855 { 8593,	"uarr",	"upwards arrow, U+2191 ISOnum" },
1856 { 8594,	"rarr",	"rightwards arrow, U+2192 ISOnum" },
1857 { 8595,	"darr",	"downwards arrow, U+2193 ISOnum" },
1858 { 8596,	"harr",	"left right arrow, U+2194 ISOamsa" },
1859 { 8629,	"crarr","downwards arrow with corner leftwards = carriage return, U+21B5 NEW" },
1860 { 8656,	"lArr",	"leftwards double arrow, U+21D0 ISOtech" },
1861 { 8657,	"uArr",	"upwards double arrow, U+21D1 ISOamsa" },
1862 { 8658,	"rArr",	"rightwards double arrow, U+21D2 ISOtech" },
1863 { 8659,	"dArr",	"downwards double arrow, U+21D3 ISOamsa" },
1864 { 8660,	"hArr",	"left right double arrow, U+21D4 ISOamsa" },
1865 
1866 { 8704,	"forall","for all, U+2200 ISOtech" },
1867 { 8706,	"part",	"partial differential, U+2202 ISOtech" },
1868 { 8707,	"exist","there exists, U+2203 ISOtech" },
1869 { 8709,	"empty","empty set = null set = diameter, U+2205 ISOamso" },
1870 { 8711,	"nabla","nabla = backward difference, U+2207 ISOtech" },
1871 { 8712,	"isin",	"element of, U+2208 ISOtech" },
1872 { 8713,	"notin","not an element of, U+2209 ISOtech" },
1873 { 8715,	"ni",	"contains as member, U+220B ISOtech" },
1874 { 8719,	"prod",	"n-ary product = product sign, U+220F ISOamsb" },
1875 { 8721,	"sum",	"n-ary summation, U+2211 ISOamsb" },
1876 { 8722,	"minus","minus sign, U+2212 ISOtech" },
1877 { 8727,	"lowast","asterisk operator, U+2217 ISOtech" },
1878 { 8730,	"radic","square root = radical sign, U+221A ISOtech" },
1879 { 8733,	"prop",	"proportional to, U+221D ISOtech" },
1880 { 8734,	"infin","infinity, U+221E ISOtech" },
1881 { 8736,	"ang",	"angle, U+2220 ISOamso" },
1882 { 8743,	"and",	"logical and = wedge, U+2227 ISOtech" },
1883 { 8744,	"or",	"logical or = vee, U+2228 ISOtech" },
1884 { 8745,	"cap",	"intersection = cap, U+2229 ISOtech" },
1885 { 8746,	"cup",	"union = cup, U+222A ISOtech" },
1886 { 8747,	"int",	"integral, U+222B ISOtech" },
1887 { 8756,	"there4","therefore, U+2234 ISOtech" },
1888 { 8764,	"sim",	"tilde operator = varies with = similar to, U+223C ISOtech" },
1889 { 8773,	"cong",	"approximately equal to, U+2245 ISOtech" },
1890 { 8776,	"asymp","almost equal to = asymptotic to, U+2248 ISOamsr" },
1891 { 8800,	"ne",	"not equal to, U+2260 ISOtech" },
1892 { 8801,	"equiv","identical to, U+2261 ISOtech" },
1893 { 8804,	"le",	"less-than or equal to, U+2264 ISOtech" },
1894 { 8805,	"ge",	"greater-than or equal to, U+2265 ISOtech" },
1895 { 8834,	"sub",	"subset of, U+2282 ISOtech" },
1896 { 8835,	"sup",	"superset of, U+2283 ISOtech" },
1897 { 8836,	"nsub",	"not a subset of, U+2284 ISOamsn" },
1898 { 8838,	"sube",	"subset of or equal to, U+2286 ISOtech" },
1899 { 8839,	"supe",	"superset of or equal to, U+2287 ISOtech" },
1900 { 8853,	"oplus","circled plus = direct sum, U+2295 ISOamsb" },
1901 { 8855,	"otimes","circled times = vector product, U+2297 ISOamsb" },
1902 { 8869,	"perp",	"up tack = orthogonal to = perpendicular, U+22A5 ISOtech" },
1903 { 8901,	"sdot",	"dot operator, U+22C5 ISOamsb" },
1904 { 8968,	"lceil","left ceiling = apl upstile, U+2308 ISOamsc" },
1905 { 8969,	"rceil","right ceiling, U+2309 ISOamsc" },
1906 { 8970,	"lfloor","left floor = apl downstile, U+230A ISOamsc" },
1907 { 8971,	"rfloor","right floor, U+230B ISOamsc" },
1908 { 9001,	"lang",	"left-pointing angle bracket = bra, U+2329 ISOtech" },
1909 { 9002,	"rang",	"right-pointing angle bracket = ket, U+232A ISOtech" },
1910 { 9674,	"loz",	"lozenge, U+25CA ISOpub" },
1911 
1912 { 9824,	"spades","black spade suit, U+2660 ISOpub" },
1913 { 9827,	"clubs","black club suit = shamrock, U+2663 ISOpub" },
1914 { 9829,	"hearts","black heart suit = valentine, U+2665 ISOpub" },
1915 { 9830,	"diams","black diamond suit, U+2666 ISOpub" },
1916 
1917 };
1918 
1919 /************************************************************************
1920  *									*
1921  *		Commodity functions to handle entities			*
1922  *									*
1923  ************************************************************************/
1924 
1925 /**
1926  * htmlEntityLookup:
1927  * @name: the entity name
1928  *
1929  * Lookup the given entity in EntitiesTable
1930  *
1931  * TODO: the linear scan is really ugly, an hash table is really needed.
1932  *
1933  * Returns the associated htmlEntityDescPtr if found, NULL otherwise.
1934  */
1935 const htmlEntityDesc *
htmlEntityLookup(const xmlChar * name)1936 htmlEntityLookup(const xmlChar *name) {
1937     unsigned int i;
1938 
1939     for (i = 0;i < (sizeof(html40EntitiesTable)/
1940                     sizeof(html40EntitiesTable[0]));i++) {
1941         if (xmlStrEqual(name, BAD_CAST html40EntitiesTable[i].name)) {
1942             return((htmlEntityDescPtr) &html40EntitiesTable[i]);
1943 	}
1944     }
1945     return(NULL);
1946 }
1947 
1948 static int
htmlCompareEntityDesc(const void * vkey,const void * vdesc)1949 htmlCompareEntityDesc(const void *vkey, const void *vdesc) {
1950     const unsigned *key = vkey;
1951     const htmlEntityDesc *desc = vdesc;
1952 
1953     return((int) *key - (int) desc->value);
1954 }
1955 
1956 /**
1957  * htmlEntityValueLookup:
1958  * @value: the entity's unicode value
1959  *
1960  * Lookup the given entity in EntitiesTable
1961  *
1962  * TODO: the linear scan is really ugly, an hash table is really needed.
1963  *
1964  * Returns the associated htmlEntityDescPtr if found, NULL otherwise.
1965  */
1966 const htmlEntityDesc *
htmlEntityValueLookup(unsigned int value)1967 htmlEntityValueLookup(unsigned int value) {
1968     const htmlEntityDesc *desc;
1969     size_t nmemb;
1970 
1971     nmemb = sizeof(html40EntitiesTable) / sizeof(html40EntitiesTable[0]);
1972     desc = bsearch(&value, html40EntitiesTable, nmemb, sizeof(htmlEntityDesc),
1973                    htmlCompareEntityDesc);
1974 
1975     return(desc);
1976 }
1977 
1978 /**
1979  * UTF8ToHtml:
1980  * @out:  a pointer to an array of bytes to store the result
1981  * @outlen:  the length of @out
1982  * @in:  a pointer to an array of UTF-8 chars
1983  * @inlen:  the length of @in
1984  *
1985  * Take a block of UTF-8 chars in and try to convert it to an ASCII
1986  * plus HTML entities block of chars out.
1987  *
1988  * Returns 0 if success, -2 if the transcoding fails, or -1 otherwise
1989  * The value of @inlen after return is the number of octets consumed
1990  *     as the return value is positive, else unpredictable.
1991  * The value of @outlen after return is the number of octets consumed.
1992  */
1993 int
UTF8ToHtml(unsigned char * out,int * outlen,const unsigned char * in,int * inlen)1994 UTF8ToHtml(unsigned char* out, int *outlen,
1995            const unsigned char* in, int *inlen) {
1996     const unsigned char* instart = in;
1997     const unsigned char* inend;
1998     unsigned char* outstart = out;
1999     unsigned char* outend;
2000     int ret = XML_ENC_ERR_SPACE;
2001 
2002     if ((out == NULL) || (outlen == NULL) || (inlen == NULL))
2003         return(XML_ENC_ERR_INTERNAL);
2004 
2005     if (in == NULL) {
2006         /*
2007 	 * initialization nothing to do
2008 	 */
2009 	*outlen = 0;
2010 	*inlen = 0;
2011 	return(XML_ENC_ERR_SUCCESS);
2012     }
2013 
2014     inend = in + *inlen;
2015     outend = out + *outlen;
2016     while (in < inend) {
2017         const htmlEntityDesc *ent;
2018         const char *cp;
2019         char nbuf[16];
2020         unsigned c, d;
2021         int seqlen, len, i;
2022 
2023 	d = *in;
2024 
2025 	if (d < 0x80) {
2026             if (out >= outend)
2027                 goto done;
2028             *out++ = d;
2029             in += 1;
2030             continue;
2031         }
2032 
2033         if (d < 0xE0)      { c = d & 0x1F; seqlen = 2; }
2034         else if (d < 0xF0) { c = d & 0x0F; seqlen = 3; }
2035         else               { c = d & 0x07; seqlen = 4; }
2036 
2037 	if (inend - in < seqlen)
2038 	    break;
2039 
2040 	for (i = 1; i < seqlen; i++) {
2041 	    d = in[i];
2042 	    c <<= 6;
2043 	    c |= d & 0x3F;
2044 	}
2045 
2046         /*
2047          * Try to lookup a predefined HTML entity for it
2048          */
2049         ent = htmlEntityValueLookup(c);
2050 
2051         if (ent == NULL) {
2052           snprintf(nbuf, sizeof(nbuf), "#%u", c);
2053           cp = nbuf;
2054         } else {
2055           cp = ent->name;
2056         }
2057 
2058         len = strlen(cp);
2059         if (outend - out < len + 2)
2060             goto done;
2061 
2062         *out++ = '&';
2063         memcpy(out, cp, len);
2064         out += len;
2065         *out++ = ';';
2066 
2067         in += seqlen;
2068     }
2069 
2070     ret = out - outstart;
2071 
2072 done:
2073     *outlen = out - outstart;
2074     *inlen = in - instart;
2075     return(ret);
2076 }
2077 
2078 /**
2079  * htmlEncodeEntities:
2080  * @out:  a pointer to an array of bytes to store the result
2081  * @outlen:  the length of @out
2082  * @in:  a pointer to an array of UTF-8 chars
2083  * @inlen:  the length of @in
2084  * @quoteChar: the quote character to escape (' or ") or zero.
2085  *
2086  * Take a block of UTF-8 chars in and try to convert it to an ASCII
2087  * plus HTML entities block of chars out.
2088  *
2089  * Returns 0 if success, -2 if the transcoding fails, or -1 otherwise
2090  * The value of @inlen after return is the number of octets consumed
2091  *     as the return value is positive, else unpredictable.
2092  * The value of @outlen after return is the number of octets consumed.
2093  */
2094 int
htmlEncodeEntities(unsigned char * out,int * outlen,const unsigned char * in,int * inlen,int quoteChar)2095 htmlEncodeEntities(unsigned char* out, int *outlen,
2096 		   const unsigned char* in, int *inlen, int quoteChar) {
2097     const unsigned char* processed = in;
2098     const unsigned char* outend;
2099     const unsigned char* outstart = out;
2100     const unsigned char* instart = in;
2101     const unsigned char* inend;
2102     unsigned int c, d;
2103     int trailing;
2104 
2105     if ((out == NULL) || (outlen == NULL) || (inlen == NULL) || (in == NULL))
2106         return(-1);
2107     outend = out + (*outlen);
2108     inend = in + (*inlen);
2109     while (in < inend) {
2110 	d = *in++;
2111 	if      (d < 0x80)  { c= d; trailing= 0; }
2112 	else if (d < 0xC0) {
2113 	    /* trailing byte in leading position */
2114 	    *outlen = out - outstart;
2115 	    *inlen = processed - instart;
2116 	    return(-2);
2117         } else if (d < 0xE0)  { c= d & 0x1F; trailing= 1; }
2118         else if (d < 0xF0)  { c= d & 0x0F; trailing= 2; }
2119         else if (d < 0xF8)  { c= d & 0x07; trailing= 3; }
2120 	else {
2121 	    /* no chance for this in Ascii */
2122 	    *outlen = out - outstart;
2123 	    *inlen = processed - instart;
2124 	    return(-2);
2125 	}
2126 
2127 	if (inend - in < trailing)
2128 	    break;
2129 
2130 	while (trailing--) {
2131 	    if (((d= *in++) & 0xC0) != 0x80) {
2132 		*outlen = out - outstart;
2133 		*inlen = processed - instart;
2134 		return(-2);
2135 	    }
2136 	    c <<= 6;
2137 	    c |= d & 0x3F;
2138 	}
2139 
2140 	/* assertion: c is a single UTF-4 value */
2141 	if ((c < 0x80) && (c != (unsigned int) quoteChar) &&
2142 	    (c != '&') && (c != '<') && (c != '>')) {
2143 	    if (out >= outend)
2144 		break;
2145 	    *out++ = c;
2146 	} else {
2147 	    const htmlEntityDesc * ent;
2148 	    const char *cp;
2149 	    char nbuf[16];
2150 	    int len;
2151 
2152 	    /*
2153 	     * Try to lookup a predefined HTML entity for it
2154 	     */
2155 	    ent = htmlEntityValueLookup(c);
2156 	    if (ent == NULL) {
2157 		snprintf(nbuf, sizeof(nbuf), "#%u", c);
2158 		cp = nbuf;
2159 	    }
2160 	    else
2161 		cp = ent->name;
2162 	    len = strlen(cp);
2163 	    if (outend - out < len + 2)
2164 		break;
2165 	    *out++ = '&';
2166 	    memcpy(out, cp, len);
2167 	    out += len;
2168 	    *out++ = ';';
2169 	}
2170 	processed = in;
2171     }
2172     *outlen = out - outstart;
2173     *inlen = processed - instart;
2174     return(0);
2175 }
2176 
2177 /************************************************************************
2178  *									*
2179  *		Commodity functions, cleanup needed ?			*
2180  *									*
2181  ************************************************************************/
/*
 * Names of all tags allowing PCDATA according to the HTML 4.01 loose
 * DTD, one group per initial letter.
 * NOTE: it might be more appropriate to integrate this information
 * into the html40ElementTable array, but it is kept separate to avoid
 * any risk of binary incompatibility.
 */
static const char *allowPCData[] = {
    "a", "abbr", "acronym", "address", "applet",
    "b", "bdo", "big", "blockquote", "body", "button",
    "caption", "center", "cite", "code",
    "dd", "del", "dfn", "div", "dt",
    "em",
    "font", "form",
    "h1", "h2", "h3", "h4", "h5", "h6",
    "i", "iframe", "ins",
    "kbd",
    "label", "legend", "li",
    "noframes", "noscript",
    "object",
    "p", "pre",
    "q",
    "s", "samp", "small", "span", "strike", "strong",
    "td", "th", "tt",
    "u",
    "var"
};
2196 
2197 /**
2198  * areBlanks:
2199  * @ctxt:  an HTML parser context
2200  * @str:  a xmlChar *
2201  * @len:  the size of @str
2202  *
2203  * Is this a sequence of blank chars that one can ignore ?
2204  *
2205  * Returns 1 if ignorable 0 if whitespace, -1 otherwise.
2206  */
2207 
areBlanks(htmlParserCtxtPtr ctxt,const xmlChar * str,int len)2208 static int areBlanks(htmlParserCtxtPtr ctxt, const xmlChar *str, int len) {
2209     unsigned int i;
2210     int j;
2211     xmlNodePtr lastChild;
2212     xmlDtdPtr dtd;
2213 
2214     for (j = 0;j < len;j++)
2215         if (!(IS_WS_HTML(str[j]))) return(-1);
2216 
2217     if (CUR == 0) return(1);
2218     if (CUR != '<') return(0);
2219     if (ctxt->name == NULL)
2220 	return(1);
2221     if (xmlStrEqual(ctxt->name, BAD_CAST"html"))
2222 	return(1);
2223     if (xmlStrEqual(ctxt->name, BAD_CAST"head"))
2224 	return(1);
2225 
2226     /* Only strip CDATA children of the body tag for strict HTML DTDs */
2227     if (xmlStrEqual(ctxt->name, BAD_CAST "body") && ctxt->myDoc != NULL) {
2228         dtd = xmlGetIntSubset(ctxt->myDoc);
2229         if (dtd != NULL && dtd->ExternalID != NULL) {
2230             if (!xmlStrcasecmp(dtd->ExternalID, BAD_CAST "-//W3C//DTD HTML 4.01//EN") ||
2231                     !xmlStrcasecmp(dtd->ExternalID, BAD_CAST "-//W3C//DTD HTML 4//EN"))
2232                 return(1);
2233         }
2234     }
2235 
2236     if (ctxt->node == NULL) return(0);
2237     lastChild = xmlGetLastChild(ctxt->node);
2238     while ((lastChild) && (lastChild->type == XML_COMMENT_NODE))
2239 	lastChild = lastChild->prev;
2240     if (lastChild == NULL) {
2241         if ((ctxt->node->type != XML_ELEMENT_NODE) &&
2242             (ctxt->node->content != NULL)) return(0);
2243 	/* keep ws in constructs like ...<b> </b>...
2244 	   for all tags "b" allowing PCDATA */
2245 	for ( i = 0; i < sizeof(allowPCData)/sizeof(allowPCData[0]); i++ ) {
2246 	    if ( xmlStrEqual(ctxt->name, BAD_CAST allowPCData[i]) ) {
2247 		return(0);
2248 	    }
2249 	}
2250     } else if (xmlNodeIsText(lastChild)) {
2251         return(0);
2252     } else {
2253 	/* keep ws in constructs like <p><b>xy</b> <i>z</i><p>
2254 	   for all tags "p" allowing PCDATA */
2255 	for ( i = 0; i < sizeof(allowPCData)/sizeof(allowPCData[0]); i++ ) {
2256 	    if ( xmlStrEqual(lastChild->name, BAD_CAST allowPCData[i]) ) {
2257 		return(0);
2258 	    }
2259 	}
2260     }
2261     return(1);
2262 }
2263 
2264 /**
2265  * htmlNewDocNoDtD:
2266  * @URI:  URI for the dtd, or NULL
2267  * @ExternalID:  the external ID of the DTD, or NULL
2268  *
2269  * Creates a new HTML document without a DTD node if @URI and @ExternalID
2270  * are NULL
2271  *
2272  * Returns a new document, do not initialize the DTD if not provided
2273  */
2274 htmlDocPtr
htmlNewDocNoDtD(const xmlChar * URI,const xmlChar * ExternalID)2275 htmlNewDocNoDtD(const xmlChar *URI, const xmlChar *ExternalID) {
2276     xmlDocPtr cur;
2277 
2278     /*
2279      * Allocate a new document and fill the fields.
2280      */
2281     cur = (xmlDocPtr) xmlMalloc(sizeof(xmlDoc));
2282     if (cur == NULL)
2283 	return(NULL);
2284     memset(cur, 0, sizeof(xmlDoc));
2285 
2286     cur->type = XML_HTML_DOCUMENT_NODE;
2287     cur->version = NULL;
2288     cur->intSubset = NULL;
2289     cur->doc = cur;
2290     cur->name = NULL;
2291     cur->children = NULL;
2292     cur->extSubset = NULL;
2293     cur->oldNs = NULL;
2294     cur->encoding = NULL;
2295     cur->standalone = 1;
2296     cur->compression = 0;
2297     cur->ids = NULL;
2298     cur->refs = NULL;
2299     cur->_private = NULL;
2300     cur->charset = XML_CHAR_ENCODING_UTF8;
2301     cur->properties = XML_DOC_HTML | XML_DOC_USERBUILT;
2302     if ((ExternalID != NULL) ||
2303 	(URI != NULL)) {
2304         xmlDtdPtr intSubset;
2305 
2306 	intSubset = xmlCreateIntSubset(cur, BAD_CAST "html", ExternalID, URI);
2307         if (intSubset == NULL) {
2308             xmlFree(cur);
2309             return(NULL);
2310         }
2311     }
2312     if ((xmlRegisterCallbacks) && (xmlRegisterNodeDefaultValue))
2313 	xmlRegisterNodeDefaultValue((xmlNodePtr)cur);
2314     return(cur);
2315 }
2316 
2317 /**
2318  * htmlNewDoc:
2319  * @URI:  URI for the dtd, or NULL
2320  * @ExternalID:  the external ID of the DTD, or NULL
2321  *
2322  * Creates a new HTML document
2323  *
2324  * Returns a new document
2325  */
2326 htmlDocPtr
htmlNewDoc(const xmlChar * URI,const xmlChar * ExternalID)2327 htmlNewDoc(const xmlChar *URI, const xmlChar *ExternalID) {
2328     if ((URI == NULL) && (ExternalID == NULL))
2329 	return(htmlNewDocNoDtD(
2330 		    BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd",
2331 		    BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN"));
2332 
2333     return(htmlNewDocNoDtD(URI, ExternalID));
2334 }
2335 
2336 
2337 /************************************************************************
2338  *									*
2339  *			The parser itself				*
2340  *	Relates to http://www.w3.org/TR/html40				*
2341  *									*
2342  ************************************************************************/
2343 
2344 /************************************************************************
2345  *									*
2346  *			The parser itself				*
2347  *									*
2348  ************************************************************************/
2349 
2350 /**
2351  * htmlParseHTMLName:
2352  * @ctxt:  an HTML parser context
2353  *
2354  * parse an HTML tag or attribute name, note that we convert it to lowercase
2355  * since HTML names are not case-sensitive.
2356  *
2357  * Returns the Tag Name parsed or NULL
2358  */
2359 
2360 static xmlHashedString
htmlParseHTMLName(htmlParserCtxtPtr ctxt,int attr)2361 htmlParseHTMLName(htmlParserCtxtPtr ctxt, int attr) {
2362     xmlHashedString ret;
2363     xmlChar buf[HTML_PARSER_BUFFER_SIZE];
2364     const xmlChar *in;
2365     size_t avail;
2366     int eof = PARSER_PROGRESSIVE(ctxt);
2367     int nbchar = 0;
2368     int stop = attr ? '=' : ' ';
2369 
2370     in = ctxt->input->cur;
2371     avail = ctxt->input->end - in;
2372 
2373     while (1) {
2374         int c, size;
2375 
2376         if ((!eof) && (avail < 32)) {
2377             size_t oldAvail = avail;
2378 
2379             ctxt->input->cur = in;
2380 
2381             SHRINK;
2382             xmlParserGrow(ctxt);
2383 
2384             in = ctxt->input->cur;
2385             avail = ctxt->input->end - in;
2386 
2387             if (oldAvail == avail)
2388                 eof = 1;
2389         }
2390 
2391         if (avail == 0)
2392             break;
2393 
2394         c = *in;
2395         size = 1;
2396 
2397         if ((nbchar != 0) &&
2398             ((c == '/') || (c == '>') || (c == stop) ||
2399              (IS_WS_HTML(c))))
2400             break;
2401 
2402         if (c == 0) {
2403             if (nbchar + 3 <= HTML_PARSER_BUFFER_SIZE) {
2404                 buf[nbchar++] = 0xEF;
2405                 buf[nbchar++] = 0xBF;
2406                 buf[nbchar++] = 0xBD;
2407             }
2408         } else if (c < 0x80) {
2409             if (nbchar < HTML_PARSER_BUFFER_SIZE) {
2410                 if (IS_UPPER(c))
2411                     c += 0x20;
2412                 buf[nbchar++] = c;
2413             }
2414         } else {
2415             size = htmlValidateUtf8(ctxt, in, avail);
2416 
2417             if (size > 0) {
2418                 if (nbchar + size <= HTML_PARSER_BUFFER_SIZE) {
2419                     memcpy(buf + nbchar, in, size);
2420                     nbchar += size;
2421                 }
2422             } else {
2423                 size = 1;
2424 
2425                 if (nbchar + 3 <= HTML_PARSER_BUFFER_SIZE) {
2426                     buf[nbchar++] = 0xEF;
2427                     buf[nbchar++] = 0xBF;
2428                     buf[nbchar++] = 0xBD;
2429                 }
2430             }
2431         }
2432 
2433         in += size;
2434         avail -= size;
2435     }
2436 
2437     ctxt->input->cur = in;
2438 
2439     SHRINK;
2440 
2441     ret = xmlDictLookupHashed(ctxt->dict, buf, nbchar);
2442     if (ret.name == NULL)
2443         htmlErrMemory(ctxt);
2444 
2445     return(ret);
2446 }
2447 
/*
 * Replacement code points for the values 0x80..0x9F, indexed by
 * (value - 0x80). htmlCodePointToUtf8() substitutes these before
 * encoding.
 * NOTE(review): the values look like the Windows-1252 mapping that the
 * HTML standard prescribes for numeric character references - confirm.
 */
static const short htmlC1Remap[32] = {
    /* 0x80 - 0x83 */ 0x20AC, 0x0081, 0x201A, 0x0192,
    /* 0x84 - 0x87 */ 0x201E, 0x2026, 0x2020, 0x2021,
    /* 0x88 - 0x8B */ 0x02C6, 0x2030, 0x0160, 0x2039,
    /* 0x8C - 0x8F */ 0x0152, 0x008D, 0x017D, 0x008F,
    /* 0x90 - 0x93 */ 0x0090, 0x2018, 0x2019, 0x201C,
    /* 0x94 - 0x97 */ 0x201D, 0x2022, 0x2013, 0x2014,
    /* 0x98 - 0x9B */ 0x02DC, 0x2122, 0x0161, 0x203A,
    /* 0x9C - 0x9F */ 0x0153, 0x009D, 0x017E, 0x0178
};
2454 
2455 static const xmlChar *
htmlCodePointToUtf8(int c,xmlChar * out,int * osize)2456 htmlCodePointToUtf8(int c, xmlChar *out, int *osize) {
2457     int i = 0;
2458     int bits, hi;
2459 
2460     if ((c >= 0x80) && (c < 0xA0)) {
2461         c = htmlC1Remap[c - 0x80];
2462     } else if ((c <= 0) ||
2463                ((c >= 0xD800) && (c < 0xE000)) ||
2464                (c > 0x10FFFF)) {
2465         c = 0xFFFD;
2466     }
2467 
2468     if      (c <    0x80) { bits =  0; hi = 0x00; }
2469     else if (c <   0x800) { bits =  6; hi = 0xC0; }
2470     else if (c < 0x10000) { bits = 12; hi = 0xE0; }
2471     else                  { bits = 18; hi = 0xF0; }
2472 
2473     out[i++] = (c >> bits) | hi;
2474 
2475     while (bits > 0) {
2476         bits -= 6;
2477         out[i++] = ((c >> bits) & 0x3F) | 0x80;
2478     }
2479 
2480     *osize = i;
2481     return(out);
2482 }
2483 
2484 #include "html5ent.inc"
2485 
2486 #define ENT_F_SEMICOLON 0x80u
2487 #define ENT_F_SUBTABLE  0x40u
2488 #define ENT_F_ALL       0xC0u
2489 
2490 static const xmlChar *
htmlFindEntityPrefix(const xmlChar * string,size_t slen,int isAttr,int * nlen,int * rlen)2491 htmlFindEntityPrefix(const xmlChar *string, size_t slen, int isAttr,
2492                      int *nlen, int *rlen) {
2493     const xmlChar *match = NULL;
2494     unsigned left, right;
2495     int first = string[0];
2496     size_t matchLen = 0;
2497     size_t soff = 1;
2498 
2499     if (slen < 2)
2500         return(NULL);
2501     if (!IS_ASCII_LETTER(first))
2502         return(NULL);
2503 
2504     /*
2505      * Look up range by first character
2506      */
2507     first &= 63;
2508     left = htmlEntAlpha[first*3] | htmlEntAlpha[first*3+1] << 8;
2509     right = left + htmlEntAlpha[first*3+2];
2510 
2511     /*
2512      * Binary search
2513      */
2514     while (left < right) {
2515         const xmlChar *bytes;
2516         unsigned mid;
2517         size_t len;
2518         int cmp;
2519 
2520         mid = left + (right - left) / 2;
2521         bytes = htmlEntStrings + htmlEntValues[mid];
2522         len = bytes[0] & ~ENT_F_ALL;
2523 
2524         cmp = string[soff] - bytes[1];
2525 
2526         if (cmp == 0) {
2527             if (slen < len) {
2528                 cmp = strncmp((const char *) string + soff + 1,
2529                               (const char *) bytes + 2,
2530                               slen - 1);
2531                 /* Prefix can never match */
2532                 if (cmp == 0)
2533                     break;
2534             } else {
2535                 cmp = strncmp((const char *) string + soff + 1,
2536                               (const char *) bytes + 2,
2537                               len - 1);
2538             }
2539         }
2540 
2541         if (cmp < 0) {
2542             right = mid;
2543         } else if (cmp > 0) {
2544             left = mid + 1;
2545         } else {
2546             int term = soff + len < slen ? string[soff + len] : 0;
2547             int isAlnum, isTerm;
2548 
2549             isAlnum = IS_ALNUM(term);
2550             isTerm = ((term == ';') ||
2551                       ((bytes[0] & ENT_F_SEMICOLON) &&
2552                        ((!isAttr) ||
2553                         ((!isAlnum) && (term != '=')))));
2554 
2555             if (isTerm) {
2556                 match = bytes + len + 1;
2557                 matchLen = soff + len;
2558                 if (term == ';')
2559                     matchLen += 1;
2560             }
2561 
2562             if (bytes[0] & ENT_F_SUBTABLE) {
2563                 if (isTerm)
2564                     match += 2;
2565 
2566                 if ((isAlnum) && (soff + len < slen)) {
2567                     left = mid + bytes[len + 1];
2568                     right = left + bytes[len + 2];
2569                     soff += len;
2570                     continue;
2571                 }
2572             }
2573 
2574             break;
2575         }
2576     }
2577 
2578     if (match == NULL)
2579         return(NULL);
2580 
2581     *nlen = matchLen;
2582     *rlen = match[0];
2583     return(match + 1);
2584 }
2585 
2586 /**
2587  * htmlParseData:
2588  * @ctxt:  an HTML parser context
2589  * @mask:  mask of terminating characters
2590  * @comment:  true if parsing a comment
2591  * @refs:  true if references are allowed
2592  * @maxLength:  maximum output length
2593  *
2594  * Parse data until terminator is reached.
2595  *
2596  * Returns the parsed string or NULL in case of errors.
2597  */
2598 
2599 static xmlChar *
htmlParseData(htmlParserCtxtPtr ctxt,htmlAsciiMask mask,int comment,int refs,int maxLength)2600 htmlParseData(htmlParserCtxtPtr ctxt, htmlAsciiMask mask,
2601               int comment, int refs, int maxLength) {
2602     xmlParserInputPtr input = ctxt->input;
2603     xmlChar *ret = NULL;
2604     xmlChar *buffer;
2605     xmlChar utf8Char[4];
2606     size_t buffer_size;
2607     size_t used;
2608     int eof = PARSER_PROGRESSIVE(ctxt);
2609     int line, col;
2610     int termSkip = -1;
2611 
2612     used = 0;
2613     buffer_size = ctxt->spaceMax;
2614     buffer = (xmlChar *) ctxt->spaceTab;
2615     if (buffer == NULL) {
2616         buffer_size = 500;
2617         buffer = xmlMalloc(buffer_size + 1);
2618         if (buffer == NULL) {
2619             htmlErrMemory(ctxt);
2620             return(NULL);
2621         }
2622     }
2623 
2624     line = input->line;
2625     col = input->col;
2626 
2627     while (!PARSER_STOPPED(ctxt)) {
2628         const xmlChar *chunk, *in, *repl;
2629         size_t avail, chunkSize, extraSize;
2630         int replSize;
2631         int skip = 0;
2632         int ncr = 0;
2633         int ncrSize = 0;
2634         int cp = 0;
2635 
2636         chunk = input->cur;
2637         avail = input->end - chunk;
2638         in = chunk;
2639 
2640         repl = BAD_CAST "";
2641         replSize = 0;
2642 
2643         while (!PARSER_STOPPED(ctxt)) {
2644             size_t j;
2645             int cur, size;
2646 
2647             if ((!eof) && (avail <= 64)) {
2648                 size_t oldAvail = avail;
2649                 size_t off = in - chunk;
2650 
2651                 input->cur = in;
2652 
2653                 xmlParserGrow(ctxt);
2654 
2655                 in = input->cur;
2656                 chunk = in - off;
2657                 input->cur = chunk;
2658                 avail = input->end - in;
2659 
2660                 if (oldAvail == avail)
2661                     eof = 1;
2662             }
2663 
2664             if (avail == 0) {
2665                 termSkip = 0;
2666                 break;
2667             }
2668 
2669             cur = *in;
2670             size = 1;
2671             col += 1;
2672 
2673             if (htmlMaskMatch(mask, cur)) {
2674                 if (comment) {
2675                     if (avail < 2) {
2676                         termSkip = 1;
2677                     } else if (in[1] == '-') {
2678                         if  (avail < 3) {
2679                             termSkip = 2;
2680                         } else if (in[2] == '>') {
2681                             termSkip = 3;
2682                         } else if (in[2] == '!') {
2683                             if (avail < 4)
2684                                 termSkip = 3;
2685                             else if (in[3] == '>')
2686                                 termSkip = 4;
2687                         }
2688                     }
2689 
2690                     if (termSkip >= 0)
2691                         break;
2692                 } else {
2693                     termSkip = 0;
2694                     break;
2695                 }
2696             }
2697 
2698             if (ncr) {
2699                 int lc = cur | 0x20;
2700                 int digit;
2701 
2702                 if ((cur >= '0') && (cur <= '9')) {
2703                     digit = cur - '0';
2704                 } else if ((ncr == 16) && (lc >= 'a') && (lc <= 'f')) {
2705                     digit = (lc - 'a') + 10;
2706                 } else {
2707                     if (cur == ';') {
2708                         in += 1;
2709                         size += 1;
2710                         ncrSize += 1;
2711                     }
2712                     goto next_chunk;
2713                 }
2714 
2715                 cp = cp * ncr + digit;
2716                 if (cp >= 0x110000)
2717                     cp = 0x110000;
2718 
2719                 ncrSize += 1;
2720 
2721                 goto next_char;
2722             }
2723 
2724             switch (cur) {
2725             case '&':
2726                 if (!refs)
2727                     break;
2728 
2729                 j = 1;
2730 
2731                 if ((j < avail) && (in[j] == '#')) {
2732                     j += 1;
2733                     if (j < avail) {
2734                         if ((in[j] | 0x20) == 'x') {
2735                             j += 1;
2736                             if ((j < avail) && (IS_HEX_DIGIT(in[j]))) {
2737                                 ncr = 16;
2738                                 size = 3;
2739                                 ncrSize = 3;
2740                                 cp = 0;
2741                             }
2742                         } else if (IS_ASCII_DIGIT(in[j])) {
2743                             ncr = 10;
2744                             size = 2;
2745                             ncrSize = 2;
2746                             cp = 0;
2747                         }
2748                     }
2749                 } else {
2750                     repl = htmlFindEntityPrefix(in + j,
2751                                                 avail - j,
2752                                                 /* isAttr */ 1,
2753                                                 &skip, &replSize);
2754                     if (repl != NULL) {
2755                         skip += 1;
2756                         goto next_chunk;
2757                     }
2758 
2759                     skip = 0;
2760                 }
2761 
2762                 break;
2763 
2764             case '\0':
2765                 skip = 1;
2766                 repl = BAD_CAST "\xEF\xBF\xBD";
2767                 replSize = 3;
2768                 goto next_chunk;
2769 
2770             case '\n':
2771                 line += 1;
2772                 col = 1;
2773                 break;
2774 
2775             case '\r':
2776                 skip = 1;
2777                 if (in[1] != 0x0A) {
2778                     repl = BAD_CAST "\x0A";
2779                     replSize = 1;
2780                 }
2781                 goto next_chunk;
2782 
2783             default:
2784                 if (cur < 0x80)
2785                     break;
2786 
2787                 if ((input->flags & XML_INPUT_HAS_ENCODING) == 0) {
2788                     xmlChar * guess;
2789 
2790                     guess = htmlFindEncoding(ctxt);
2791                     if (guess == NULL) {
2792                         xmlSwitchEncoding(ctxt, XML_CHAR_ENCODING_8859_1);
2793                     } else {
2794                         xmlSwitchEncodingName(ctxt, (const char *) guess);
2795                         xmlFree(guess);
2796                     }
2797                     input->flags |= XML_INPUT_HAS_ENCODING;
2798 
2799                     goto restart;
2800                 }
2801 
2802                 size = htmlValidateUtf8(ctxt, in, avail);
2803 
2804                 if (size <= 0) {
2805                     skip = 1;
2806                     repl = BAD_CAST "\xEF\xBF\xBD";
2807                     replSize = 3;
2808                     goto next_chunk;
2809                 }
2810 
2811                 break;
2812             }
2813 
2814 next_char:
2815             in += size;
2816             avail -= size;
2817         }
2818 
2819 next_chunk:
2820         if (ncrSize > 0) {
2821             skip = ncrSize;
2822             in -= ncrSize;
2823 
2824             repl = htmlCodePointToUtf8(cp, utf8Char, &replSize);
2825         }
2826 
2827         chunkSize = in - chunk;
2828         extraSize = chunkSize + replSize;
2829 
2830         if (extraSize > maxLength - used) {
2831             htmlParseErr(ctxt, XML_ERR_RESOURCE_LIMIT,
2832                          "value too long\n", NULL, NULL);
2833             goto error;
2834         }
2835 
2836         if (extraSize > buffer_size - used) {
2837             size_t newSize = (used + extraSize) * 2;
2838             xmlChar *tmp = (xmlChar *) xmlRealloc(buffer, newSize + 1);
2839 
2840             if (tmp == NULL) {
2841                 htmlErrMemory(ctxt);
2842                 goto error;
2843             }
2844             buffer = tmp;
2845             buffer_size = newSize;
2846         }
2847 
2848         if (chunkSize > 0) {
2849             input->cur += chunkSize;
2850             memcpy(buffer + used, chunk, chunkSize);
2851             used += chunkSize;
2852         }
2853 
2854         input->cur += skip;
2855         if (replSize > 0) {
2856             memcpy(buffer + used, repl, replSize);
2857             used += replSize;
2858         }
2859 
2860         SHRINK;
2861 
2862         if (termSkip >= 0)
2863             break;
2864 
2865 restart:
2866         ;
2867     }
2868 
2869     if (termSkip > 0) {
2870         input->cur += termSkip;
2871         col += termSkip;
2872     }
2873 
2874     input->line = line;
2875     input->col = col;
2876 
2877     ret = xmlMalloc(used + 1);
2878     if (ret == NULL) {
2879         htmlErrMemory(ctxt);
2880     } else {
2881         memcpy(ret, buffer, used);
2882         ret[used] = 0;
2883     }
2884 
2885 error:
2886     ctxt->spaceTab = (void *) buffer;
2887     ctxt->spaceMax = buffer_size;
2888 
2889     return(ret);
2890 }
2891 
2892 /**
2893  * htmlParseEntityRef:
2894  * @ctxt:  an HTML parser context
2895  * @str:  location to store the entity name
2896  *
2897  * DEPRECATED: Internal function, don't use.
2898  *
2899  * Returns NULL.
2900  */
2901 const htmlEntityDesc *
htmlParseEntityRef(htmlParserCtxtPtr ctxt ATTRIBUTE_UNUSED,const xmlChar ** str ATTRIBUTE_UNUSED)2902 htmlParseEntityRef(htmlParserCtxtPtr ctxt ATTRIBUTE_UNUSED,
2903                    const xmlChar **str ATTRIBUTE_UNUSED) {
2904     return(NULL);
2905 }
2906 
2907 /**
2908  * htmlParseAttValue:
2909  * @ctxt:  an HTML parser context
2910  *
2911  * parse a value for an attribute
2912  * Note: the parser won't do substitution of entities here, this
2913  * will be handled later in xmlStringGetNodeList, unless it was
2914  * asked for ctxt->replaceEntities != 0
2915  *
2916  * Returns the AttValue parsed or NULL.
2917  */
2918 
2919 static xmlChar *
htmlParseAttValue(htmlParserCtxtPtr ctxt)2920 htmlParseAttValue(htmlParserCtxtPtr ctxt) {
2921     xmlChar *ret = NULL;
2922     int maxLength = (ctxt->options & HTML_PARSE_HUGE) ?
2923                     XML_MAX_HUGE_LENGTH :
2924                     XML_MAX_TEXT_LENGTH;
2925 
2926     if (CUR == '"') {
2927         SKIP(1);
2928 	ret = htmlParseData(ctxt, MASK_DQ, 0, 1, maxLength);
2929         if (CUR == '"')
2930             SKIP(1);
2931     } else if (CUR == '\'') {
2932         SKIP(1);
2933 	ret = htmlParseData(ctxt, MASK_SQ, 0, 1, maxLength);
2934         if (CUR == '\'')
2935             SKIP(1);
2936     } else {
2937 	ret = htmlParseData(ctxt, MASK_WS_GT, 0, 1, maxLength);
2938     }
2939     return(ret);
2940 }
2941 
2942 static void
htmlCharDataSAXCallback(htmlParserCtxtPtr ctxt,const xmlChar * buf,int size,int mode)2943 htmlCharDataSAXCallback(htmlParserCtxtPtr ctxt, const xmlChar *buf,
2944                         int size, int mode) {
2945     if ((ctxt->sax == NULL) || (ctxt->disableSAX))
2946         return;
2947 
2948     if ((mode == 0) || (mode == DATA_RCDATA) ||
2949         (ctxt->sax->cdataBlock == NULL)) {
2950         int blank = areBlanks(ctxt, buf, size);
2951 
2952         if ((mode == 0) && (blank > 0) && (!ctxt->keepBlanks)) {
2953             if (ctxt->sax->ignorableWhitespace != NULL)
2954                 ctxt->sax->ignorableWhitespace(ctxt->userData,
2955                                                buf, size);
2956         } else {
2957             if ((mode == 0) && (blank < 0))
2958                 htmlCheckParagraph(ctxt);
2959 
2960             if (ctxt->sax->characters != NULL)
2961                 ctxt->sax->characters(ctxt->userData, buf, size);
2962         }
2963     } else {
2964         /*
2965          * Insert as CDATA, which is the same as HTML_PRESERVE_NODE
2966          */
2967         ctxt->sax->cdataBlock(ctxt->userData, buf, size);
2968     }
2969 }
2970 
2971 /**
2972  * htmlParseCharData:
2973  * @ctxt:  an HTML parser context
2974  * @terminate: true if the input buffer is complete
2975  *
2976  * Parse character data and references.
2977  */
2978 
2979 static int
htmlParseCharData(htmlParserCtxtPtr ctxt)2980 htmlParseCharData(htmlParserCtxtPtr ctxt) {
2981     xmlParserInputPtr input = ctxt->input;
2982     xmlChar utf8Char[4];
2983     int complete = 0;
2984     int done = 0;
2985     int mode;
2986     int eof = PARSER_PROGRESSIVE(ctxt);
2987     int line, col;
2988 
2989     mode = ctxt->endCheckState;
2990 
2991     line = input->line;
2992     col = input->col;
2993 
2994     while (!PARSER_STOPPED(ctxt)) {
2995         const xmlChar *chunk, *in, *repl;
2996         size_t avail;
2997         int replSize;
2998         int skip = 0;
2999         int ncr = 0;
3000         int ncrSize = 0;
3001         int cp = 0;
3002 
3003         chunk = input->cur;
3004         avail = input->end - chunk;
3005         in = chunk;
3006 
3007         repl = BAD_CAST "";
3008         replSize = 0;
3009 
3010         while (!PARSER_STOPPED(ctxt)) {
3011             size_t j;
3012             int cur, size;
3013 
3014             if (avail <= 64) {
3015                 if (!eof) {
3016                     size_t oldAvail = avail;
3017                     size_t off = in - chunk;
3018 
3019                     input->cur = in;
3020 
3021                     xmlParserGrow(ctxt);
3022 
3023                     in = input->cur;
3024                     chunk = in - off;
3025                     input->cur = chunk;
3026                     avail = input->end - in;
3027 
3028                     if (oldAvail == avail)
3029                         eof = 1;
3030                 }
3031 
3032                 if (avail == 0) {
3033                     done = 1;
3034                     break;
3035                 }
3036             }
3037 
3038             /* Accelerator */
3039             if (!ncr) {
3040                 while (avail > 0) {
3041                     static const unsigned mask[8] = {
3042                         0x00002401, 0x10002040,
3043                         0x00000000, 0x00000000,
3044                         0xFFFFFFFF, 0xFFFFFFFF,
3045                         0xFFFFFFFF, 0xFFFFFFFF
3046                     };
3047                     cur = *in;
3048                     if ((1u << (cur & 0x1F)) & mask[cur >> 5])
3049                         break;
3050                     col += 1;
3051                     in += 1;
3052                     avail -= 1;
3053                 }
3054 
3055                 if ((!eof) && (avail <= 64))
3056                     continue;
3057                 if (avail == 0)
3058                     continue;
3059             }
3060 
3061             cur = *in;
3062             size = 1;
3063             col += 1;
3064 
3065             if (ncr) {
3066                 int lc = cur | 0x20;
3067                 int digit;
3068 
3069                 if ((cur >= '0') && (cur <= '9')) {
3070                     digit = cur - '0';
3071                 } else if ((ncr == 16) && (lc >= 'a') && (lc <= 'f')) {
3072                     digit = (lc - 'a') + 10;
3073                 } else {
3074                     if (cur == ';') {
3075                         in += 1;
3076                         size += 1;
3077                         ncrSize += 1;
3078                     }
3079                     goto next_chunk;
3080                 }
3081 
3082                 cp = cp * ncr + digit;
3083                 if (cp >= 0x110000)
3084                     cp = 0x110000;
3085 
3086                 ncrSize += 1;
3087 
3088                 goto next_char;
3089             }
3090 
3091             switch (cur) {
3092             case '<':
3093                 if (mode == 0) {
3094                     done = 1;
3095                     goto next_chunk;
3096                 }
3097                 if (mode == DATA_PLAINTEXT)
3098                     break;
3099 
3100                 j = 1;
3101                 if (j < avail) {
3102                     if ((mode == DATA_SCRIPT) && (in[j] == '!')) {
3103                         /* Check for comment start */
3104 
3105                         j += 1;
3106                         if ((j < avail) && (in[j] == '-')) {
3107                             j += 1;
3108                             if ((j < avail) && (in[j] == '-'))
3109                                 mode = DATA_SCRIPT_ESC1;
3110                         }
3111                     } else {
3112                         int i = 0;
3113                         int solidus = 0;
3114 
3115                         /* Check for tag */
3116 
3117                         if (in[j] == '/') {
3118                             j += 1;
3119                             solidus = 1;
3120                         }
3121 
3122                         if ((solidus) || (mode == DATA_SCRIPT_ESC1)) {
3123                             while ((j < avail) &&
3124                                    (ctxt->name[i] != 0) &&
3125                                    (ctxt->name[i] == (in[j] | 0x20))) {
3126                                 i += 1;
3127                                 j += 1;
3128                             }
3129 
3130                             if ((ctxt->name[i] == 0) && (j < avail)) {
3131                                 int c = in[j];
3132 
3133                                 if ((c == '>') || (c == '/') ||
3134                                     (IS_WS_HTML(c))) {
3135                                     if ((mode == DATA_SCRIPT_ESC1) &&
3136                                         (!solidus)) {
3137                                         mode = DATA_SCRIPT_ESC2;
3138                                     } else if (mode == DATA_SCRIPT_ESC2) {
3139                                         mode = DATA_SCRIPT_ESC1;
3140                                     } else {
3141                                         complete = 1;
3142                                         done = 1;
3143                                         goto next_chunk;
3144                                     }
3145                                 }
3146                             }
3147                         }
3148                     }
3149                 }
3150 
3151                 if ((mode != 0) && (PARSER_PROGRESSIVE(ctxt))) {
3152                     in += 1;
3153                     done = 1;
3154                     goto next_chunk;
3155                 }
3156 
3157                 break;
3158 
3159             case '-':
3160                 if ((mode != DATA_SCRIPT_ESC1) && (mode != DATA_SCRIPT_ESC2))
3161                     break;
3162 
3163                 /* Check for comment end */
3164 
3165                 j = 1;
3166                 if ((j < avail) && (in[j] == '-')) {
3167                     j += 1;
3168                     if ((j < avail) && (in[j] == '>'))
3169                         mode = DATA_SCRIPT;
3170                 }
3171 
3172                 break;
3173 
3174             case '&':
3175                 if ((mode != 0) && (mode != DATA_RCDATA))
3176                     break;
3177 
3178                 j = 1;
3179 
3180                 if ((j < avail) && (in[j] == '#')) {
3181                     j += 1;
3182                     if (j < avail) {
3183                         if ((in[j] | 0x20) == 'x') {
3184                             j += 1;
3185                             if ((j < avail) && (IS_HEX_DIGIT(in[j]))) {
3186                                 ncr = 16;
3187                                 size = 3;
3188                                 ncrSize = 3;
3189                                 cp = 0;
3190                             }
3191                         } else if (IS_ASCII_DIGIT(in[j])) {
3192                             ncr = 10;
3193                             size = 2;
3194                             ncrSize = 2;
3195                             cp = 0;
3196                         }
3197                     }
3198                 } else {
3199                     repl = htmlFindEntityPrefix(in + j,
3200                                                 avail - j,
3201                                                 /* isAttr */ 0,
3202                                                 &skip, &replSize);
3203                     if (repl != NULL) {
3204                         skip += 1;
3205                         goto next_chunk;
3206                     }
3207 
3208                     skip = 0;
3209                 }
3210 
3211                 break;
3212 
3213             case '\0':
3214                 skip = 1;
3215                 repl = BAD_CAST "\xEF\xBF\xBD";
3216                 replSize = 3;
3217                 goto next_chunk;
3218 
3219             case '\n':
3220                 line += 1;
3221                 col = 1;
3222                 break;
3223 
3224             case '\r':
3225                 skip = 1;
3226                 if (in[1] != 0x0A) {
3227                     repl = BAD_CAST "\x0A";
3228                     replSize = 1;
3229                 }
3230                 goto next_chunk;
3231 
3232             default:
3233                 if (cur < 0x80)
3234                     break;
3235 
3236                 if ((input->flags & XML_INPUT_HAS_ENCODING) == 0) {
3237                     xmlChar * guess;
3238 
3239                     guess = htmlFindEncoding(ctxt);
3240                     if (guess == NULL) {
3241                         xmlSwitchEncoding(ctxt, XML_CHAR_ENCODING_8859_1);
3242                     } else {
3243                         xmlSwitchEncodingName(ctxt, (const char *) guess);
3244                         xmlFree(guess);
3245                     }
3246                     input->flags |= XML_INPUT_HAS_ENCODING;
3247 
3248                     goto restart;
3249                 }
3250 
3251                 size = htmlValidateUtf8(ctxt, in, avail);
3252 
3253                 if (size <= 0) {
3254                     skip = 1;
3255                     repl = BAD_CAST "\xEF\xBF\xBD";
3256                     replSize = 3;
3257                     goto next_chunk;
3258                 }
3259 
3260                 break;
3261             }
3262 
3263 next_char:
3264             in += size;
3265             avail -= size;
3266         }
3267 
3268 next_chunk:
3269         if (ncrSize > 0) {
3270             skip = ncrSize;
3271             in -= ncrSize;
3272 
3273             repl = htmlCodePointToUtf8(cp, utf8Char, &replSize);
3274         }
3275 
3276         if (in > chunk) {
3277             input->cur += in - chunk;
3278             htmlCharDataSAXCallback(ctxt, chunk, in - chunk, mode);
3279         }
3280 
3281         input->cur += skip;
3282         if (replSize > 0)
3283             htmlCharDataSAXCallback(ctxt, repl, replSize, mode);
3284 
3285         SHRINK;
3286 
3287         if (done)
3288             break;
3289 
3290 restart:
3291         ;
3292     }
3293 
3294     input->line = line;
3295     input->col = col;
3296 
3297     if (complete)
3298         ctxt->endCheckState = 0;
3299     else
3300         ctxt->endCheckState = mode;
3301 
3302     return(complete);
3303 }
3304 
3305 /**
3306  * htmlParseComment:
3307  * @ctxt:  an HTML parser context
3308  * @bogus:  true if this is a bogus comment
3309  *
3310  * Parse an HTML comment
3311  */
3312 static void
htmlParseComment(htmlParserCtxtPtr ctxt,int bogus)3313 htmlParseComment(htmlParserCtxtPtr ctxt, int bogus) {
3314     const xmlChar *comment = BAD_CAST "";
3315     xmlChar *buf = NULL;
3316     int maxLength = (ctxt->options & HTML_PARSE_HUGE) ?
3317                     XML_MAX_HUGE_LENGTH :
3318                     XML_MAX_TEXT_LENGTH;
3319 
3320     if (bogus) {
3321         buf = htmlParseData(ctxt, MASK_GT, 0, 0, maxLength);
3322         if (CUR == '>')
3323             SKIP(1);
3324         comment = buf;
3325     } else {
3326         if (CUR == '>') {
3327             SKIP(1);
3328         } else if ((CUR == '-') && (NXT(1) == '>')) {
3329             SKIP(2);
3330         } else {
3331             buf = htmlParseData(ctxt, MASK_DASH, 1, 0, maxLength);
3332             comment = buf;
3333         }
3334     }
3335 
3336     if (comment == NULL)
3337         return;
3338 
3339     if ((ctxt->sax != NULL) && (ctxt->sax->comment != NULL) &&
3340         (!ctxt->disableSAX))
3341         ctxt->sax->comment(ctxt->userData, comment);
3342 
3343     xmlFree(buf);
3344 }
3345 
3346 /**
3347  * htmlParseCharRef:
3348  * @ctxt:  an HTML parser context
3349  *
3350  * DEPRECATED: Internal function, don't use.
3351  *
3352  * Returns 0
3353  */
3354 int
htmlParseCharRef(htmlParserCtxtPtr ctxt ATTRIBUTE_UNUSED)3355 htmlParseCharRef(htmlParserCtxtPtr ctxt ATTRIBUTE_UNUSED) {
3356     return(0);
3357 }
3358 
3359 
3360 /**
3361  * htmlParseDoctypeLiteral:
3362  * @ctxt:  an HTML parser context
3363  *
3364  * Parse a DOCTYPE SYTSTEM or PUBLIC literal.
3365  *
3366  * Returns the literal or NULL in case of error.
3367  */
3368 
3369 static xmlChar *
htmlParseDoctypeLiteral(htmlParserCtxtPtr ctxt)3370 htmlParseDoctypeLiteral(htmlParserCtxtPtr ctxt) {
3371     xmlChar *ret;
3372     int maxLength = (ctxt->options & HTML_PARSE_HUGE) ?
3373                     XML_MAX_TEXT_LENGTH :
3374                     XML_MAX_NAME_LENGTH;
3375 
3376     if (CUR == '"') {
3377         SKIP(1);
3378         ret = htmlParseData(ctxt, MASK_DQ_GT, 0, 0, maxLength);
3379         if (CUR == '"')
3380             SKIP(1);
3381     } else if (CUR == '\'') {
3382         SKIP(1);
3383         ret = htmlParseData(ctxt, MASK_SQ_GT, 0, 0, maxLength);
3384         if (CUR == '\'')
3385             SKIP(1);
3386     } else {
3387         return(NULL);
3388     }
3389 
3390     return(ret);
3391 }
3392 
3393 static void
htmlSkipBogusDoctype(htmlParserCtxtPtr ctxt)3394 htmlSkipBogusDoctype(htmlParserCtxtPtr ctxt) {
3395     const xmlChar *in;
3396     size_t avail;
3397     int eof = PARSER_PROGRESSIVE(ctxt);
3398     int line, col;
3399 
3400     line = ctxt->input->line;
3401     col = ctxt->input->col;
3402 
3403     in = ctxt->input->cur;
3404     avail = ctxt->input->end - in;
3405 
3406     while (!PARSER_STOPPED(ctxt)) {
3407         int cur;
3408 
3409         if ((!eof) && (avail <= 64)) {
3410             size_t oldAvail = avail;
3411 
3412             ctxt->input->cur = in;
3413 
3414             xmlParserGrow(ctxt);
3415 
3416             in = ctxt->input->cur;
3417             avail = ctxt->input->end - in;
3418 
3419             if (oldAvail == avail)
3420                 eof = 1;
3421         }
3422 
3423         if (avail == 0)
3424             break;
3425 
3426         col += 1;
3427 
3428         cur = *in;
3429         if (cur == '>') {
3430             in += 1;
3431             break;
3432         } else if (cur == 0x0A) {
3433             line += 1;
3434             col = 1;
3435         }
3436 
3437         in += 1;
3438         avail -= 1;
3439 
3440         SHRINK;
3441     }
3442 
3443     ctxt->input->cur = in;
3444     ctxt->input->line = line;
3445     ctxt->input->col = col;
3446 }
3447 
3448 /**
3449  * htmlParseDocTypeDecl:
3450  * @ctxt:  an HTML parser context
3451  *
3452  * Parse a DOCTYPE declaration.
3453  */
3454 
3455 static void
htmlParseDocTypeDecl(htmlParserCtxtPtr ctxt)3456 htmlParseDocTypeDecl(htmlParserCtxtPtr ctxt) {
3457     xmlChar *name = NULL;
3458     xmlChar *publicId = NULL;
3459     xmlChar *URI = NULL;
3460     int maxLength = (ctxt->options & HTML_PARSE_HUGE) ?
3461                     XML_MAX_TEXT_LENGTH :
3462                     XML_MAX_NAME_LENGTH;
3463 
3464     /*
3465      * We know that '<!DOCTYPE' has been detected.
3466      */
3467     SKIP(9);
3468 
3469     SKIP_BLANKS;
3470 
3471     if ((ctxt->input->cur < ctxt->input->end) && (CUR != '>')) {
3472         name = htmlParseData(ctxt, MASK_WS_GT, 0, 0, maxLength);
3473 
3474         if ((ctxt->options & HTML_PARSE_HTML5) && (name != NULL)) {
3475             xmlChar *cur;
3476 
3477             for (cur = name; *cur; cur++) {
3478                 if (IS_UPPER(*cur))
3479                     *cur += 0x20;
3480             }
3481         }
3482 
3483         SKIP_BLANKS;
3484     }
3485 
3486     /*
3487      * Check for SystemID and publicId
3488      */
3489     if ((UPPER == 'P') && (UPP(1) == 'U') &&
3490 	(UPP(2) == 'B') && (UPP(3) == 'L') &&
3491 	(UPP(4) == 'I') && (UPP(5) == 'C')) {
3492         SKIP(6);
3493         SKIP_BLANKS;
3494 	publicId = htmlParseDoctypeLiteral(ctxt);
3495 	if (publicId == NULL)
3496             goto bogus;
3497         SKIP_BLANKS;
3498 	URI = htmlParseDoctypeLiteral(ctxt);
3499     } else if ((UPPER == 'S') && (UPP(1) == 'Y') &&
3500                (UPP(2) == 'S') && (UPP(3) == 'T') &&
3501 	       (UPP(4) == 'E') && (UPP(5) == 'M')) {
3502         SKIP(6);
3503         SKIP_BLANKS;
3504 	URI = htmlParseDoctypeLiteral(ctxt);
3505     }
3506 
3507 bogus:
3508     htmlSkipBogusDoctype(ctxt);
3509 
3510     /*
3511      * Create or update the document accordingly to the DOCTYPE
3512      */
3513     if ((ctxt->sax != NULL) && (ctxt->sax->internalSubset != NULL) &&
3514 	(!ctxt->disableSAX))
3515 	ctxt->sax->internalSubset(ctxt->userData, name, publicId, URI);
3516 
3517     xmlFree(name);
3518     xmlFree(URI);
3519     xmlFree(publicId);
3520 }
3521 
3522 /**
3523  * htmlParseAttribute:
3524  * @ctxt:  an HTML parser context
3525  * @value:  a xmlChar ** used to store the value of the attribute
3526  *
3527  * parse an attribute
3528  *
3529  * [41] Attribute ::= Name Eq AttValue
3530  *
3531  * [25] Eq ::= S? '=' S?
3532  *
3533  * With namespace:
3534  *
3535  * [NS 11] Attribute ::= QName Eq AttValue
3536  *
3537  * Also the case QName == xmlns:??? is handled independently as a namespace
3538  * definition.
3539  *
3540  * Returns the attribute name, and the value in *value.
3541  */
3542 
3543 static xmlHashedString
htmlParseAttribute(htmlParserCtxtPtr ctxt,xmlChar ** value)3544 htmlParseAttribute(htmlParserCtxtPtr ctxt, xmlChar **value) {
3545     xmlHashedString hname;
3546     xmlChar *val = NULL;
3547 
3548     *value = NULL;
3549     hname = htmlParseHTMLName(ctxt, 1);
3550     if (hname.name == NULL)
3551         return(hname);
3552 
3553     /*
3554      * read the value
3555      */
3556     SKIP_BLANKS;
3557     if (CUR == '=') {
3558         SKIP(1);
3559 	SKIP_BLANKS;
3560 	val = htmlParseAttValue(ctxt);
3561     }
3562 
3563     *value = val;
3564     return(hname);
3565 }
3566 
3567 /**
3568  * htmlCheckEncoding:
3569  * @ctxt:  an HTML parser context
3570  * @attvalue: the attribute value
3571  *
3572  * Checks an http-equiv attribute from a Meta tag to detect
3573  * the encoding
3574  * If a new encoding is detected the parser is switched to decode
3575  * it and pass UTF8
3576  */
3577 static void
htmlCheckEncoding(htmlParserCtxtPtr ctxt,const xmlChar * attvalue)3578 htmlCheckEncoding(htmlParserCtxtPtr ctxt, const xmlChar *attvalue) {
3579     const xmlChar *encoding;
3580     xmlChar *copy;
3581 
3582     if (!attvalue)
3583 	return;
3584 
3585     encoding = xmlStrcasestr(attvalue, BAD_CAST"charset");
3586     if (encoding != NULL) {
3587 	encoding += 7;
3588     }
3589     /*
3590      * skip blank
3591      */
3592     if (encoding && IS_WS_HTML(*encoding))
3593 	encoding = xmlStrcasestr(attvalue, BAD_CAST"=");
3594     if (encoding && *encoding == '=') {
3595 	encoding ++;
3596         copy = xmlStrdup(encoding);
3597         if (copy == NULL)
3598             htmlErrMemory(ctxt);
3599 	xmlSetDeclaredEncoding(ctxt, copy);
3600     }
3601 }
3602 
3603 /**
3604  * htmlCheckMeta:
3605  * @ctxt:  an HTML parser context
3606  * @atts:  the attributes values
3607  *
3608  * Checks an attributes from a Meta tag
3609  */
3610 static void
htmlCheckMeta(htmlParserCtxtPtr ctxt,const xmlChar ** atts)3611 htmlCheckMeta(htmlParserCtxtPtr ctxt, const xmlChar **atts) {
3612     int i;
3613     const xmlChar *att, *value;
3614     int http = 0;
3615     const xmlChar *content = NULL;
3616 
3617     if ((ctxt == NULL) || (atts == NULL))
3618 	return;
3619 
3620     i = 0;
3621     att = atts[i++];
3622     while (att != NULL) {
3623 	value = atts[i++];
3624         if (value != NULL) {
3625             if ((!xmlStrcasecmp(att, BAD_CAST "http-equiv")) &&
3626                 (!xmlStrcasecmp(value, BAD_CAST "Content-Type"))) {
3627                 http = 1;
3628             } else if (!xmlStrcasecmp(att, BAD_CAST "charset")) {
3629                 xmlChar *copy;
3630 
3631                 copy = xmlStrdup(value);
3632                 if (copy == NULL)
3633                     htmlErrMemory(ctxt);
3634                 xmlSetDeclaredEncoding(ctxt, copy);
3635             } else if (!xmlStrcasecmp(att, BAD_CAST "content")) {
3636                 content = value;
3637             }
3638         }
3639 	att = atts[i++];
3640     }
3641     if ((http) && (content != NULL))
3642 	htmlCheckEncoding(ctxt, content);
3643 
3644 }
3645 
3646 /**
3647  * htmlAttrHashInsert:
3648  * @ctxt: parser context
3649  * @size: size of the hash table
3650  * @name: attribute name
3651  * @hashValue: hash value of name
3652  * @aindex: attribute index (this is a multiple of 5)
3653  *
3654  * Inserts a new attribute into the hash table.
3655  *
3656  * Returns INT_MAX if no existing attribute was found, the attribute
3657  * index if an attribute was found, -1 if a memory allocation failed.
3658  */
3659 static int
htmlAttrHashInsert(xmlParserCtxtPtr ctxt,unsigned size,const xmlChar * name,unsigned hashValue,int aindex)3660 htmlAttrHashInsert(xmlParserCtxtPtr ctxt, unsigned size, const xmlChar *name,
3661                    unsigned hashValue, int aindex) {
3662     xmlAttrHashBucket *table = ctxt->attrHash;
3663     xmlAttrHashBucket *bucket;
3664     unsigned hindex;
3665 
3666     hindex = hashValue & (size - 1);
3667     bucket = &table[hindex];
3668 
3669     while (bucket->index >= 0) {
3670         const xmlChar **atts = &ctxt->atts[bucket->index];
3671 
3672         if (name == atts[0])
3673             return(bucket->index);
3674 
3675         hindex++;
3676         bucket++;
3677         if (hindex >= size) {
3678             hindex = 0;
3679             bucket = table;
3680         }
3681     }
3682 
3683     bucket->index = aindex;
3684 
3685     return(INT_MAX);
3686 }
3687 
3688 /**
3689  * htmlParseStartTag:
3690  * @ctxt:  an HTML parser context
3691  *
3692  * parse a start of tag either for rule element or
3693  * EmptyElement. In both case we don't parse the tag closing chars.
3694  *
3695  * [40] STag ::= '<' Name (S Attribute)* S? '>'
3696  *
3697  * [44] EmptyElemTag ::= '<' Name (S Attribute)* S? '/>'
3698  *
3699  * With namespace:
3700  *
3701  * [NS 8] STag ::= '<' QName (S Attribute)* S? '>'
3702  *
3703  * [NS 10] EmptyElement ::= '<' QName (S Attribute)* S? '/>'
3704  *
3705  * Returns 0 in case of success, -1 in case of error and 1 if discarded
3706  */
3707 
3708 static void
htmlParseStartTag(htmlParserCtxtPtr ctxt)3709 htmlParseStartTag(htmlParserCtxtPtr ctxt) {
3710     const xmlChar *name;
3711     const xmlChar *attname;
3712     xmlChar *attvalue;
3713     const xmlChar **atts;
3714     int nbatts = 0;
3715     int maxatts;
3716     int meta = 0;
3717     int i;
3718     int discardtag = 0;
3719 
3720     ctxt->endCheckState = 0;
3721 
3722     SKIP(1);
3723 
3724     atts = ctxt->atts;
3725     maxatts = ctxt->maxatts;
3726 
3727     GROW;
3728     name = htmlParseHTMLName(ctxt, 0).name;
3729     if (name == NULL)
3730         return;
3731     if (xmlStrEqual(name, BAD_CAST"meta"))
3732 	meta = 1;
3733 
3734     if ((ctxt->options & HTML_PARSE_HTML5) == 0) {
3735         /*
3736          * Check for auto-closure of HTML elements.
3737          */
3738         htmlAutoClose(ctxt, name);
3739 
3740         /*
3741          * Check for implied HTML elements.
3742          */
3743         htmlCheckImplied(ctxt, name);
3744 
3745         /*
3746          * Avoid html at any level > 0, head at any level != 1
3747          * or any attempt to recurse body
3748          */
3749         if ((ctxt->nameNr > 0) && (xmlStrEqual(name, BAD_CAST"html"))) {
3750             htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
3751                          "htmlParseStartTag: misplaced <html> tag\n",
3752                          name, NULL);
3753             discardtag = 1;
3754             ctxt->depth++;
3755         }
3756         if ((ctxt->nameNr != 1) &&
3757             (xmlStrEqual(name, BAD_CAST"head"))) {
3758             htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
3759                          "htmlParseStartTag: misplaced <head> tag\n",
3760                          name, NULL);
3761             discardtag = 1;
3762             ctxt->depth++;
3763         }
3764         if (xmlStrEqual(name, BAD_CAST"body")) {
3765             int indx;
3766             for (indx = 0;indx < ctxt->nameNr;indx++) {
3767                 if (xmlStrEqual(ctxt->nameTab[indx], BAD_CAST"body")) {
3768                     htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
3769                                  "htmlParseStartTag: misplaced <body> tag\n",
3770                                  name, NULL);
3771                     discardtag = 1;
3772                     ctxt->depth++;
3773                 }
3774             }
3775         }
3776     }
3777 
3778     /*
3779      * Now parse the attributes, it ends up with the ending
3780      *
3781      * (S Attribute)* S?
3782      */
3783     SKIP_BLANKS;
3784     while ((ctxt->input->cur < ctxt->input->end) &&
3785            (CUR != '>') &&
3786 	   ((CUR != '/') || (NXT(1) != '>')) &&
3787            (PARSER_STOPPED(ctxt) == 0)) {
3788         xmlHashedString hattname;
3789 
3790         /*  unexpected-solidus-in-tag */
3791         if (CUR == '/') {
3792             SKIP(1);
3793             SKIP_BLANKS;
3794             continue;
3795         }
3796 	GROW;
3797 	hattname = htmlParseAttribute(ctxt, &attvalue);
3798         attname = hattname.name;
3799 
3800         if (attname != NULL) {
3801 	    /*
3802 	     * Add the pair to atts
3803 	     */
3804 	    if (nbatts + 4 > maxatts) {
3805 	        const xmlChar **tmp;
3806                 unsigned *utmp;
3807                 size_t newSize = maxatts ? maxatts * 2 : 22;
3808 
3809 	        tmp = xmlMalloc(newSize * sizeof(tmp[0]));
3810 		if (tmp == NULL) {
3811 		    htmlErrMemory(ctxt);
3812 		    if (attvalue != NULL)
3813 			xmlFree(attvalue);
3814 		    goto failed;
3815 		}
3816 
3817 	        utmp = xmlRealloc(ctxt->attallocs,
3818                                   newSize / 2 * sizeof(utmp[0]));
3819 		if (utmp == NULL) {
3820 		    htmlErrMemory(ctxt);
3821 		    if (attvalue != NULL)
3822 			xmlFree(attvalue);
3823                     xmlFree(tmp);
3824 		    goto failed;
3825 		}
3826 
3827                 if (maxatts > 0)
3828                     memcpy(tmp, atts, maxatts * sizeof(tmp[0]));
3829                 xmlFree(atts);
3830 
3831                 atts = tmp;
3832                 maxatts = newSize;
3833 		ctxt->atts = atts;
3834                 ctxt->attallocs = utmp;
3835 		ctxt->maxatts = maxatts;
3836 	    }
3837 
3838             ctxt->attallocs[nbatts/2] = hattname.hashValue;
3839 	    atts[nbatts++] = attname;
3840 	    atts[nbatts++] = attvalue;
3841 	}
3842 	else {
3843 	    if (attvalue != NULL)
3844 	        xmlFree(attvalue);
3845 	}
3846 
3847 failed:
3848 	SKIP_BLANKS;
3849     }
3850 
3851     if (ctxt->input->cur >= ctxt->input->end) {
3852         discardtag = 1;
3853         goto done;
3854     }
3855 
3856     /*
3857      * Verify that attribute names are unique.
3858      */
3859     if (nbatts > 2) {
3860         unsigned attrHashSize;
3861         int j, k;
3862 
3863         attrHashSize = 4;
3864         while (attrHashSize / 2 < (unsigned) nbatts / 2)
3865             attrHashSize *= 2;
3866 
3867         if (attrHashSize > ctxt->attrHashMax) {
3868             xmlAttrHashBucket *tmp;
3869 
3870             tmp = xmlRealloc(ctxt->attrHash, attrHashSize * sizeof(tmp[0]));
3871             if (tmp == NULL) {
3872                 htmlErrMemory(ctxt);
3873                 goto done;
3874             }
3875 
3876             ctxt->attrHash = tmp;
3877             ctxt->attrHashMax = attrHashSize;
3878         }
3879 
3880         memset(ctxt->attrHash, -1, attrHashSize * sizeof(ctxt->attrHash[0]));
3881 
3882         for (i = 0, j = 0, k = 0; i < nbatts; i += 2, k++) {
3883             unsigned hashValue;
3884             int res;
3885 
3886             attname = atts[i];
3887             hashValue = ctxt->attallocs[k] | 0x80000000;
3888 
3889             res = htmlAttrHashInsert(ctxt, attrHashSize, attname,
3890                                     hashValue, j);
3891             if (res < 0)
3892                 continue;
3893 
3894             if (res == INT_MAX) {
3895                 atts[j] = atts[i];
3896                 atts[j+1] = atts[i+1];
3897                 j += 2;
3898             } else {
3899                 xmlFree((xmlChar *) atts[i+1]);
3900             }
3901         }
3902 
3903         nbatts = j;
3904     }
3905 
3906     if (nbatts > 0) {
3907         atts[nbatts] = NULL;
3908         atts[nbatts + 1] = NULL;
3909 
3910         /*
3911          * Handle specific association to the META tag
3912          */
3913         if (meta)
3914             htmlCheckMeta(ctxt, atts);
3915     }
3916 
3917     /*
3918      * SAX: Start of Element !
3919      */
3920     if (!discardtag) {
3921         if (ctxt->options & HTML_PARSE_HTML5) {
3922             if (ctxt->nameNr > 0)
3923                 htmlnamePop(ctxt);
3924         }
3925 
3926 	htmlnamePush(ctxt, name);
3927 	if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL)) {
3928 	    if (nbatts != 0)
3929 		ctxt->sax->startElement(ctxt->userData, name, atts);
3930 	    else
3931 		ctxt->sax->startElement(ctxt->userData, name, NULL);
3932 	}
3933     }
3934 
3935 done:
3936     if (atts != NULL) {
3937         for (i = 1;i < nbatts;i += 2) {
3938 	    if (atts[i] != NULL)
3939 		xmlFree((xmlChar *) atts[i]);
3940 	}
3941     }
3942 }
3943 
3944 /**
3945  * htmlParseEndTag:
3946  * @ctxt:  an HTML parser context
3947  *
3948  * parse an end of tag
3949  *
3950  * [42] ETag ::= '</' Name S? '>'
3951  *
3952  * With namespace
3953  *
3954  * [NS 9] ETag ::= '</' QName S? '>'
3955  *
3956  * Returns 1 if the current level should be closed.
3957  */
3958 
3959 static void
htmlParseEndTag(htmlParserCtxtPtr ctxt)3960 htmlParseEndTag(htmlParserCtxtPtr ctxt)
3961 {
3962     const xmlChar *name;
3963     const xmlChar *oldname;
3964     int i;
3965 
3966     ctxt->endCheckState = 0;
3967 
3968     SKIP(2);
3969 
3970     if (CUR == '>') {
3971         SKIP(1);
3972         return;
3973     }
3974 
3975     if (!IS_ASCII_LETTER(CUR)) {
3976         htmlParseComment(ctxt, /* bogus */ 1);
3977         return;
3978     }
3979 
3980     name = htmlParseHTMLName(ctxt, 0).name;
3981     if (name == NULL)
3982         return;
3983 
3984     /*
3985      * Parse and ignore attributes.
3986      */
3987     SKIP_BLANKS;
3988     while ((ctxt->input->cur < ctxt->input->end) &&
3989            (CUR != '>') &&
3990 	   ((CUR != '/') || (NXT(1) != '>')) &&
3991            (ctxt->instate != XML_PARSER_EOF)) {
3992         xmlChar *attvalue = NULL;
3993 
3994         /*  unexpected-solidus-in-tag */
3995         if (CUR == '/') {
3996             SKIP(1);
3997             SKIP_BLANKS;
3998             continue;
3999         }
4000 	GROW;
4001 	htmlParseAttribute(ctxt, &attvalue);
4002         if (attvalue != NULL)
4003             xmlFree(attvalue);
4004 
4005 	SKIP_BLANKS;
4006     }
4007 
4008     if (CUR == '>') {
4009         SKIP(1);
4010     } else if ((CUR == '/') && (NXT(1) == '>')) {
4011         SKIP(2);
4012     } else {
4013         return;
4014     }
4015 
4016     if (ctxt->options & HTML_PARSE_HTML5) {
4017         if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4018             ctxt->sax->endElement(ctxt->userData, name);
4019         return;
4020     }
4021 
4022     /*
4023      * if we ignored misplaced tags in htmlParseStartTag don't pop them
4024      * out now.
4025      */
4026     if ((ctxt->depth > 0) &&
4027         (xmlStrEqual(name, BAD_CAST "html") ||
4028          xmlStrEqual(name, BAD_CAST "body") ||
4029 	 xmlStrEqual(name, BAD_CAST "head"))) {
4030 	ctxt->depth--;
4031 	return;
4032     }
4033 
4034     /*
4035      * If the name read is not one of the element in the parsing stack
4036      * then return, it's just an error.
4037      */
4038     for (i = (ctxt->nameNr - 1); i >= 0; i--) {
4039         if (xmlStrEqual(name, ctxt->nameTab[i]))
4040             break;
4041     }
4042     if (i < 0) {
4043         htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,
4044 	             "Unexpected end tag : %s\n", name, NULL);
4045         return;
4046     }
4047 
4048 
4049     /*
4050      * Check for auto-closure of HTML elements.
4051      */
4052 
4053     htmlAutoCloseOnClose(ctxt, name);
4054 
4055     /*
4056      * Well formedness constraints, opening and closing must match.
4057      * With the exception that the autoclose may have popped stuff out
4058      * of the stack.
4059      */
4060     if ((ctxt->name != NULL) && (!xmlStrEqual(ctxt->name, name))) {
4061         htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,
4062                      "Opening and ending tag mismatch: %s and %s\n",
4063                      name, ctxt->name);
4064     }
4065 
4066     /*
4067      * SAX: End of Tag
4068      */
4069     oldname = ctxt->name;
4070     if ((oldname != NULL) && (xmlStrEqual(oldname, name))) {
4071 	htmlParserFinishElementParsing(ctxt);
4072         if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4073             ctxt->sax->endElement(ctxt->userData, name);
4074         htmlnamePop(ctxt);
4075     }
4076 }
4077 
4078 /**
4079  * htmlParseContent:
4080  * @ctxt:  an HTML parser context
4081  *
4082  * Parse a content: comment, sub-element, reference or text.
4083  * New version for non recursive htmlParseElementInternal
4084  */
4085 
4086 static void
htmlParseContent(htmlParserCtxtPtr ctxt)4087 htmlParseContent(htmlParserCtxtPtr ctxt) {
4088     while ((PARSER_STOPPED(ctxt) == 0) &&
4089            (ctxt->input->cur < ctxt->input->end)) {
4090         int mode;
4091 
4092         GROW;
4093         mode = ctxt->endCheckState;
4094 
4095         if ((mode == 0) && (CUR == '<')) {
4096             if (NXT(1) == '/') {
4097 	        htmlParseEndTag(ctxt);
4098             } else if (NXT(1) == '!') {
4099                 /*
4100                  * Sometimes DOCTYPE arrives in the middle of the document
4101                  */
4102                 if ((UPP(2) == 'D') && (UPP(3) == 'O') &&
4103                     (UPP(4) == 'C') && (UPP(5) == 'T') &&
4104                     (UPP(6) == 'Y') && (UPP(7) == 'P') &&
4105                     (UPP(8) == 'E')) {
4106                     htmlParseDocTypeDecl(ctxt);
4107                 } else if ((NXT(2) == '-') && (NXT(3) == '-')) {
4108                     SKIP(4);
4109                     htmlParseComment(ctxt, /* bogus */ 0);
4110                 } else {
4111                     SKIP(2);
4112                     htmlParseComment(ctxt, /* bogus */ 1);
4113                 }
4114             } else if (NXT(1) == '?') {
4115                 SKIP(1);
4116                 htmlParseComment(ctxt, /* bogus */ 1);
4117             } else if (IS_ASCII_LETTER(NXT(1))) {
4118                 htmlParseElementInternal(ctxt);
4119             } else {
4120                 htmlCheckParagraph(ctxt);
4121                 if ((ctxt->sax != NULL) && (!ctxt->disableSAX) &&
4122                     (ctxt->sax->characters != NULL))
4123                     ctxt->sax->characters(ctxt->userData, BAD_CAST "<", 1);
4124                 SKIP(1);
4125             }
4126         } else {
4127             htmlParseCharData(ctxt);
4128         }
4129 
4130         SHRINK;
4131         GROW;
4132     }
4133 
4134     if (ctxt->input->cur >= ctxt->input->end)
4135         htmlAutoCloseOnEnd(ctxt);
4136 }
4137 
4138 /**
4139  * htmlParseElementInternal:
4140  * @ctxt:  an HTML parser context
4141  *
4142  * parse an HTML element, new version, non recursive
4143  *
4144  * [39] element ::= EmptyElemTag | STag content ETag
4145  *
4146  * [41] Attribute ::= Name Eq AttValue
4147  */
4148 
4149 static int
htmlParseElementInternal(htmlParserCtxtPtr ctxt)4150 htmlParseElementInternal(htmlParserCtxtPtr ctxt) {
4151     const xmlChar *name;
4152     const htmlElemDesc * info;
4153     htmlParserNodeInfo node_info = { NULL, 0, 0, 0, 0 };
4154 
4155     if ((ctxt == NULL) || (ctxt->input == NULL))
4156 	return(0);
4157 
4158     /* Capture start position */
4159     if (ctxt->record_info) {
4160         node_info.begin_pos = ctxt->input->consumed +
4161                           (CUR_PTR - ctxt->input->base);
4162 	node_info.begin_line = ctxt->input->line;
4163     }
4164 
4165     htmlParseStartTag(ctxt);
4166     name = ctxt->name;
4167     if (name == NULL)
4168         return(0);
4169 
4170     if (ctxt->record_info)
4171         htmlNodeInfoPush(ctxt, &node_info);
4172 
4173     /*
4174      * Check for an Empty Element labeled the XML/SGML way
4175      */
4176     if ((CUR == '/') && (NXT(1) == '>')) {
4177         SKIP(2);
4178         htmlParserFinishElementParsing(ctxt);
4179         if ((ctxt->options & HTML_PARSE_HTML5) == 0) {
4180             if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4181                 ctxt->sax->endElement(ctxt->userData, name);
4182         }
4183 	htmlnamePop(ctxt);
4184 	return(0);
4185     }
4186 
4187     if (CUR != '>')
4188         return(0);
4189     SKIP(1);
4190 
4191     /*
4192      * Lookup the info for that element.
4193      */
4194     info = htmlTagLookup(name);
4195 
4196     /*
4197      * Check for an Empty Element from DTD definition
4198      */
4199     if ((info != NULL) && (info->empty)) {
4200         htmlParserFinishElementParsing(ctxt);
4201         if ((ctxt->options & HTML_PARSE_HTML5) == 0) {
4202             if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4203                 ctxt->sax->endElement(ctxt->userData, name);
4204         }
4205 	htmlnamePop(ctxt);
4206 	return(0);
4207     }
4208 
4209     if (info != NULL)
4210         ctxt->endCheckState = info->dataMode;
4211 
4212     return(1);
4213 }
4214 
4215 /**
4216  * htmlParseElement:
4217  * @ctxt:  an HTML parser context
4218  *
4219  * DEPRECATED: Internal function, don't use.
4220  *
4221  * parse an HTML element, this is highly recursive
4222  * this is kept for compatibility with previous code versions
4223  *
4224  * [39] element ::= EmptyElemTag | STag content ETag
4225  *
4226  * [41] Attribute ::= Name Eq AttValue
4227  */
4228 
4229 void
htmlParseElement(htmlParserCtxtPtr ctxt)4230 htmlParseElement(htmlParserCtxtPtr ctxt) {
4231     const xmlChar *oldptr;
4232     int depth;
4233 
4234     if ((ctxt == NULL) || (ctxt->input == NULL))
4235 	return;
4236 
4237     if (htmlParseElementInternal(ctxt) == 0)
4238         return;
4239 
4240     /*
4241      * Parse the content of the element:
4242      */
4243     depth = ctxt->nameNr;
4244     while (CUR != 0) {
4245 	oldptr = ctxt->input->cur;
4246 	htmlParseContent(ctxt);
4247 	if (oldptr==ctxt->input->cur) break;
4248 	if (ctxt->nameNr < depth) break;
4249     }
4250 
4251     if (CUR == 0) {
4252 	htmlAutoCloseOnEnd(ctxt);
4253     }
4254 }
4255 
4256 xmlNodePtr
htmlCtxtParseContentInternal(htmlParserCtxtPtr ctxt,xmlParserInputPtr input)4257 htmlCtxtParseContentInternal(htmlParserCtxtPtr ctxt, xmlParserInputPtr input) {
4258     xmlNodePtr root;
4259     xmlNodePtr list = NULL;
4260     xmlChar *rootName = BAD_CAST "#root";
4261 
4262     root = xmlNewDocNode(ctxt->myDoc, NULL, rootName, NULL);
4263     if (root == NULL) {
4264         htmlErrMemory(ctxt);
4265         return(NULL);
4266     }
4267 
4268     if (xmlPushInput(ctxt, input) < 0) {
4269         xmlFreeNode(root);
4270         return(NULL);
4271     }
4272 
4273     htmlnamePush(ctxt, rootName);
4274     nodePush(ctxt, root);
4275 
4276     htmlParseContent(ctxt);
4277 
4278     /* TODO: Use xmlCtxtIsCatastrophicError */
4279     if (ctxt->errNo != XML_ERR_NO_MEMORY) {
4280         xmlNodePtr cur;
4281 
4282         /*
4283          * Unlink newly created node list.
4284          */
4285         list = root->children;
4286         root->children = NULL;
4287         root->last = NULL;
4288         for (cur = list; cur != NULL; cur = cur->next)
4289             cur->parent = NULL;
4290     }
4291 
4292     nodePop(ctxt);
4293     htmlnamePop(ctxt);
4294 
4295     /* xmlPopInput would free the stream */
4296     inputPop(ctxt);
4297 
4298     xmlFreeNode(root);
4299     return(list);
4300 }
4301 
4302 /**
4303  * htmlParseDocument:
4304  * @ctxt:  an HTML parser context
4305  *
4306  * Parse an HTML document and invoke the SAX handlers. This is useful
4307  * if you're only interested in custom SAX callbacks. If you want a
4308  * document tree, use htmlCtxtParseDocument.
4309  *
4310  * Returns 0, -1 in case of error.
4311  */
4312 
4313 int
htmlParseDocument(htmlParserCtxtPtr ctxt)4314 htmlParseDocument(htmlParserCtxtPtr ctxt) {
4315     xmlDtdPtr dtd;
4316 
4317     if ((ctxt == NULL) || (ctxt->input == NULL))
4318 	return(-1);
4319 
4320     if ((ctxt->sax) && (ctxt->sax->setDocumentLocator)) {
4321         ctxt->sax->setDocumentLocator(ctxt->userData,
4322                 (xmlSAXLocator *) &xmlDefaultSAXLocator);
4323     }
4324 
4325     xmlDetectEncoding(ctxt);
4326 
4327     /*
4328      * This is wrong but matches long-standing behavior. In most cases,
4329      * a document starting with an XML declaration will specify UTF-8.
4330      */
4331     if (((ctxt->input->flags & XML_INPUT_HAS_ENCODING) == 0) &&
4332         (xmlStrncmp(ctxt->input->cur, BAD_CAST "<?xm", 4) == 0))
4333         xmlSwitchEncoding(ctxt, XML_CHAR_ENCODING_UTF8);
4334 
4335     /*
4336      * Wipe out everything which is before the first '<'
4337      */
4338     SKIP_BLANKS;
4339 
4340     if ((ctxt->sax) && (ctxt->sax->startDocument) && (!ctxt->disableSAX))
4341 	ctxt->sax->startDocument(ctxt->userData);
4342 
4343     /*
4344      * Parse possible comments and PIs before any content
4345      */
4346     while (CUR == '<') {
4347         if ((NXT(1) == '!') && (NXT(2) == '-') && (NXT(3) == '-')) {
4348             SKIP(4);
4349             htmlParseComment(ctxt, /* bogus */ 0);
4350         } else if (NXT(1) == '?') {
4351             SKIP(1);
4352             htmlParseComment(ctxt, /* bogus */ 1);
4353         } else {
4354             break;
4355         }
4356 	SKIP_BLANKS;
4357     }
4358 
4359     /*
4360      * Then possibly doc type declaration(s) and more Misc
4361      * (doctypedecl Misc*)?
4362      */
4363     if ((CUR == '<') && (NXT(1) == '!') &&
4364 	(UPP(2) == 'D') && (UPP(3) == 'O') &&
4365 	(UPP(4) == 'C') && (UPP(5) == 'T') &&
4366 	(UPP(6) == 'Y') && (UPP(7) == 'P') &&
4367 	(UPP(8) == 'E')) {
4368         ctxt->instate = XML_PARSER_MISC;
4369 	htmlParseDocTypeDecl(ctxt);
4370     }
4371     SKIP_BLANKS;
4372 
4373     /*
4374      * Parse possible comments and PIs before any content
4375      */
4376     ctxt->instate = XML_PARSER_PROLOG;
4377     while (CUR == '<') {
4378         if ((NXT(1) == '!') && (NXT(2) == '-') && (NXT(3) == '-')) {
4379             SKIP(4);
4380             htmlParseComment(ctxt, /* bogus */ 0);
4381         } else if (NXT(1) == '?') {
4382             SKIP(1);
4383             htmlParseComment(ctxt, /* bogus */ 1);
4384         } else {
4385             break;
4386         }
4387 	SKIP_BLANKS;
4388     }
4389 
4390     /*
4391      * Time to start parsing the tree itself
4392      */
4393     ctxt->instate = XML_PARSER_CONTENT;
4394     htmlParseContent(ctxt);
4395 
4396     /*
4397      * autoclose
4398      */
4399     if (CUR == 0)
4400 	htmlAutoCloseOnEnd(ctxt);
4401 
4402 
4403     /*
4404      * SAX: end of the document processing.
4405      */
4406     if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
4407         ctxt->sax->endDocument(ctxt->userData);
4408 
4409     if ((!(ctxt->options & HTML_PARSE_NODEFDTD)) && (ctxt->myDoc != NULL)) {
4410 	dtd = xmlGetIntSubset(ctxt->myDoc);
4411 	if (dtd == NULL) {
4412 	    ctxt->myDoc->intSubset =
4413 		xmlCreateIntSubset(ctxt->myDoc, BAD_CAST "html",
4414 		    BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN",
4415 		    BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd");
4416             if (ctxt->myDoc->intSubset == NULL)
4417                 htmlErrMemory(ctxt);
4418         }
4419     }
4420     if (! ctxt->wellFormed) return(-1);
4421     return(0);
4422 }
4423 
4424 
4425 /************************************************************************
4426  *									*
4427  *			Parser contexts handling			*
4428  *									*
4429  ************************************************************************/
4430 
4431 /**
4432  * htmlInitParserCtxt:
4433  * @ctxt:  an HTML parser context
4434  * @sax:  SAX handler
4435  * @userData:  user data
4436  *
4437  * Initialize a parser context
4438  *
4439  * Returns 0 in case of success and -1 in case of error
4440  */
4441 
4442 static int
htmlInitParserCtxt(htmlParserCtxtPtr ctxt,const htmlSAXHandler * sax,void * userData)4443 htmlInitParserCtxt(htmlParserCtxtPtr ctxt, const htmlSAXHandler *sax,
4444                    void *userData)
4445 {
4446     if (ctxt == NULL) return(-1);
4447     memset(ctxt, 0, sizeof(htmlParserCtxt));
4448 
4449     ctxt->dict = xmlDictCreate();
4450     if (ctxt->dict == NULL)
4451 	return(-1);
4452 
4453     if (ctxt->sax == NULL)
4454         ctxt->sax = (htmlSAXHandler *) xmlMalloc(sizeof(htmlSAXHandler));
4455     if (ctxt->sax == NULL)
4456 	return(-1);
4457     if (sax == NULL) {
4458         memset(ctxt->sax, 0, sizeof(htmlSAXHandler));
4459         xmlSAX2InitHtmlDefaultSAXHandler(ctxt->sax);
4460         ctxt->userData = ctxt;
4461     } else {
4462         memcpy(ctxt->sax, sax, sizeof(htmlSAXHandler));
4463         ctxt->userData = userData ? userData : ctxt;
4464     }
4465 
4466     /* Allocate the Input stack */
4467     ctxt->inputTab = (htmlParserInputPtr *)
4468                       xmlMalloc(5 * sizeof(htmlParserInputPtr));
4469     if (ctxt->inputTab == NULL)
4470 	return(-1);
4471     ctxt->inputNr = 0;
4472     ctxt->inputMax = 5;
4473     ctxt->input = NULL;
4474     ctxt->version = NULL;
4475     ctxt->encoding = NULL;
4476     ctxt->standalone = -1;
4477     ctxt->instate = XML_PARSER_START;
4478 
4479     /* Allocate the Node stack */
4480     ctxt->nodeTab = (htmlNodePtr *) xmlMalloc(10 * sizeof(htmlNodePtr));
4481     if (ctxt->nodeTab == NULL)
4482 	return(-1);
4483     ctxt->nodeNr = 0;
4484     ctxt->nodeMax = 10;
4485     ctxt->node = NULL;
4486 
4487     /* Allocate the Name stack */
4488     ctxt->nameTab = (const xmlChar **) xmlMalloc(10 * sizeof(xmlChar *));
4489     if (ctxt->nameTab == NULL)
4490 	return(-1);
4491     ctxt->nameNr = 0;
4492     ctxt->nameMax = 10;
4493     ctxt->name = NULL;
4494 
4495     ctxt->nodeInfoTab = NULL;
4496     ctxt->nodeInfoNr  = 0;
4497     ctxt->nodeInfoMax = 0;
4498 
4499     ctxt->myDoc = NULL;
4500     ctxt->wellFormed = 1;
4501     ctxt->replaceEntities = 0;
4502     ctxt->linenumbers = xmlLineNumbersDefaultValue;
4503     ctxt->keepBlanks = xmlKeepBlanksDefaultValue;
4504     ctxt->html = 1;
4505     ctxt->vctxt.flags = XML_VCTXT_USE_PCTXT;
4506     ctxt->vctxt.userData = ctxt;
4507     ctxt->vctxt.error = xmlParserValidityError;
4508     ctxt->vctxt.warning = xmlParserValidityWarning;
4509     ctxt->record_info = 0;
4510     ctxt->validate = 0;
4511     ctxt->checkIndex = 0;
4512     ctxt->catalogs = NULL;
4513     xmlInitNodeInfoSeq(&ctxt->node_seq);
4514     return(0);
4515 }
4516 
4517 /**
4518  * htmlFreeParserCtxt:
4519  * @ctxt:  an HTML parser context
4520  *
4521  * Free all the memory used by a parser context. However the parsed
4522  * document in ctxt->myDoc is not freed.
4523  */
4524 
4525 void
htmlFreeParserCtxt(htmlParserCtxtPtr ctxt)4526 htmlFreeParserCtxt(htmlParserCtxtPtr ctxt)
4527 {
4528     xmlFreeParserCtxt(ctxt);
4529 }
4530 
4531 /**
4532  * htmlNewParserCtxt:
4533  *
4534  * Allocate and initialize a new HTML parser context.
4535  *
4536  * This can be used to parse HTML documents into DOM trees with
4537  * functions like xmlCtxtReadFile or xmlCtxtReadMemory.
4538  *
4539  * See htmlCtxtUseOptions for parser options.
4540  *
4541  * See xmlCtxtSetErrorHandler for advanced error handling.
4542  *
4543  * See htmlNewSAXParserCtxt for custom SAX parsers.
4544  *
4545  * Returns the htmlParserCtxtPtr or NULL in case of allocation error
4546  */
4547 
4548 htmlParserCtxtPtr
htmlNewParserCtxt(void)4549 htmlNewParserCtxt(void)
4550 {
4551     return(htmlNewSAXParserCtxt(NULL, NULL));
4552 }
4553 
4554 /**
4555  * htmlNewSAXParserCtxt:
4556  * @sax:  SAX handler
4557  * @userData:  user data
4558  *
4559  * Allocate and initialize a new HTML SAX parser context. If userData
4560  * is NULL, the parser context will be passed as user data.
4561  *
4562  * Available since 2.11.0. If you want support older versions,
4563  * it's best to invoke htmlNewParserCtxt and set ctxt->sax with
4564  * struct assignment.
4565  *
4566  * Also see htmlNewParserCtxt.
4567  *
4568  * Returns the htmlParserCtxtPtr or NULL in case of allocation error
4569  */
4570 
4571 htmlParserCtxtPtr
htmlNewSAXParserCtxt(const htmlSAXHandler * sax,void * userData)4572 htmlNewSAXParserCtxt(const htmlSAXHandler *sax, void *userData)
4573 {
4574     xmlParserCtxtPtr ctxt;
4575 
4576     xmlInitParser();
4577 
4578     ctxt = (xmlParserCtxtPtr) xmlMalloc(sizeof(xmlParserCtxt));
4579     if (ctxt == NULL)
4580 	return(NULL);
4581     memset(ctxt, 0, sizeof(xmlParserCtxt));
4582     if (htmlInitParserCtxt(ctxt, sax, userData) < 0) {
4583         htmlFreeParserCtxt(ctxt);
4584 	return(NULL);
4585     }
4586     return(ctxt);
4587 }
4588 
4589 static htmlParserCtxtPtr
htmlCreateMemoryParserCtxtInternal(const char * url,const char * buffer,size_t size,const char * encoding)4590 htmlCreateMemoryParserCtxtInternal(const char *url,
4591                                    const char *buffer, size_t size,
4592                                    const char *encoding) {
4593     xmlParserCtxtPtr ctxt;
4594     xmlParserInputPtr input;
4595 
4596     if (buffer == NULL)
4597 	return(NULL);
4598 
4599     ctxt = htmlNewParserCtxt();
4600     if (ctxt == NULL)
4601 	return(NULL);
4602 
4603     input = xmlCtxtNewInputFromMemory(ctxt, url, buffer, size, encoding, 0);
4604     if (input == NULL) {
4605 	xmlFreeParserCtxt(ctxt);
4606         return(NULL);
4607     }
4608 
4609     if (inputPush(ctxt, input) < 0) {
4610         xmlFreeInputStream(input);
4611         xmlFreeParserCtxt(ctxt);
4612         return(NULL);
4613     }
4614 
4615     return(ctxt);
4616 }
4617 
4618 /**
4619  * htmlCreateMemoryParserCtxt:
4620  * @buffer:  a pointer to a char array
4621  * @size:  the size of the array
4622  *
4623  * DEPRECATED: Use htmlNewParserCtxt and htmlCtxtReadMemory.
4624  *
4625  * Create a parser context for an HTML in-memory document. The input
4626  * buffer must not contain any terminating null bytes.
4627  *
4628  * Returns the new parser context or NULL
4629  */
4630 htmlParserCtxtPtr
htmlCreateMemoryParserCtxt(const char * buffer,int size)4631 htmlCreateMemoryParserCtxt(const char *buffer, int size) {
4632     if (size <= 0)
4633 	return(NULL);
4634 
4635     return(htmlCreateMemoryParserCtxtInternal(NULL, buffer, size, NULL));
4636 }
4637 
4638 /**
4639  * htmlCreateDocParserCtxt:
4640  * @str:  a pointer to an array of xmlChar
4641  * @encoding:  encoding (optional)
4642  *
4643  * Create a parser context for a null-terminated string.
4644  *
4645  * Returns the new parser context or NULL if a memory allocation failed.
4646  */
4647 static htmlParserCtxtPtr
htmlCreateDocParserCtxt(const xmlChar * str,const char * url,const char * encoding)4648 htmlCreateDocParserCtxt(const xmlChar *str, const char *url,
4649                         const char *encoding) {
4650     xmlParserCtxtPtr ctxt;
4651     xmlParserInputPtr input;
4652 
4653     if (str == NULL)
4654 	return(NULL);
4655 
4656     ctxt = htmlNewParserCtxt();
4657     if (ctxt == NULL)
4658 	return(NULL);
4659 
4660     input = xmlCtxtNewInputFromString(ctxt, url, (const char *) str,
4661                                       encoding, 0);
4662     if (input == NULL) {
4663 	xmlFreeParserCtxt(ctxt);
4664 	return(NULL);
4665     }
4666 
4667     if (inputPush(ctxt, input) < 0) {
4668         xmlFreeInputStream(input);
4669         xmlFreeParserCtxt(ctxt);
4670         return(NULL);
4671     }
4672 
4673     return(ctxt);
4674 }
4675 
4676 #ifdef LIBXML_PUSH_ENABLED
4677 /************************************************************************
4678  *									*
4679  *	Progressive parsing interfaces				*
4680  *									*
4681  ************************************************************************/
4682 
/*
 * States of the tag scanner in htmlParseLookupGt. The scanner is
 * restarted by storing 0 in ctxt->endCheckState, so LSTATE_TAG_NAME
 * has to remain 0.
 */
enum xmlLookupStates {
    LSTATE_TAG_NAME           = 0,  /* in the tag name */
    LSTATE_BEFORE_ATTR_NAME   = 1,  /* before an attribute name */
    LSTATE_ATTR_NAME          = 2,  /* in an attribute name */
    LSTATE_AFTER_ATTR_NAME    = 3,  /* whitespace after an attribute name */
    LSTATE_BEFORE_ATTR_VALUE  = 4,  /* after '=', before the value */
    LSTATE_ATTR_VALUE_DQUOTED = 5,  /* in a double-quoted value */
    LSTATE_ATTR_VALUE_SQUOTED = 6,  /* in a single-quoted value */
    LSTATE_ATTR_VALUE_UNQUOTED = 7  /* in an unquoted value */
};
4693 
4694 /**
4695  * htmlParseLookupGt:
4696  * @ctxt:  an HTML parser context
4697  *
4698  * Check whether there's enough data in the input buffer to finish parsing
4699  * a tag. This has to take quotes into account.
4700  */
4701 static int
htmlParseLookupGt(xmlParserCtxtPtr ctxt)4702 htmlParseLookupGt(xmlParserCtxtPtr ctxt) {
4703     const xmlChar *cur;
4704     const xmlChar *end = ctxt->input->end;
4705     int state = ctxt->endCheckState;
4706     size_t index;
4707 
4708     if (ctxt->checkIndex == 0)
4709         cur = ctxt->input->cur + 2; /* Skip '<a' or '</' */
4710     else
4711         cur = ctxt->input->cur + ctxt->checkIndex;
4712 
4713     while (cur < end) {
4714         int c = *cur++;
4715 
4716         if (state != LSTATE_ATTR_VALUE_SQUOTED &&
4717             state != LSTATE_ATTR_VALUE_DQUOTED) {
4718             if (c == '/' &&
4719                 state != LSTATE_BEFORE_ATTR_VALUE &&
4720                 state != LSTATE_ATTR_VALUE_UNQUOTED) {
4721                 state = LSTATE_BEFORE_ATTR_NAME;
4722                 continue;
4723             } else if (c == '>') {
4724                 ctxt->checkIndex = 0;
4725                 ctxt->endCheckState = 0;
4726                 return(0);
4727             }
4728         }
4729 
4730         switch (state) {
4731             case LSTATE_TAG_NAME:
4732                 if (IS_WS_HTML(c))
4733                     state = LSTATE_BEFORE_ATTR_NAME;
4734                 break;
4735 
4736             case LSTATE_BEFORE_ATTR_NAME:
4737                 if (!IS_WS_HTML(c))
4738                     state = LSTATE_ATTR_NAME;
4739                 break;
4740 
4741             case LSTATE_ATTR_NAME:
4742                 if (c == '=')
4743                     state = LSTATE_BEFORE_ATTR_VALUE;
4744                 else if (IS_WS_HTML(c))
4745                     state = LSTATE_AFTER_ATTR_NAME;
4746                 break;
4747 
4748             case LSTATE_AFTER_ATTR_NAME:
4749                 if (c == '=')
4750                     state = LSTATE_BEFORE_ATTR_VALUE;
4751                 else if (!IS_WS_HTML(c))
4752                     state = LSTATE_ATTR_NAME;
4753                 break;
4754 
4755             case LSTATE_BEFORE_ATTR_VALUE:
4756                 if (c == '"')
4757                     state = LSTATE_ATTR_VALUE_DQUOTED;
4758                 else if (c == '\'')
4759                     state = LSTATE_ATTR_VALUE_SQUOTED;
4760                 else if (!IS_WS_HTML(c))
4761                     state = LSTATE_ATTR_VALUE_UNQUOTED;
4762                 break;
4763 
4764             case LSTATE_ATTR_VALUE_DQUOTED:
4765                 if (c == '"')
4766                     state = LSTATE_BEFORE_ATTR_NAME;
4767                 break;
4768 
4769             case LSTATE_ATTR_VALUE_SQUOTED:
4770                 if (c == '\'')
4771                     state = LSTATE_BEFORE_ATTR_NAME;
4772                 break;
4773 
4774             case LSTATE_ATTR_VALUE_UNQUOTED:
4775                 if (IS_WS_HTML(c))
4776                     state = LSTATE_BEFORE_ATTR_NAME;
4777                 break;
4778         }
4779     }
4780 
4781     index = cur - ctxt->input->cur;
4782     if (index > LONG_MAX) {
4783         ctxt->checkIndex = 0;
4784         ctxt->endCheckState = 0;
4785         return(0);
4786     }
4787     ctxt->checkIndex = index;
4788     ctxt->endCheckState = state;
4789     return(-1);
4790 }
4791 
4792 /**
4793  * htmlParseLookupString:
4794  * @ctxt:  an XML parser context
4795  * @startDelta: delta to apply at the start
4796  * @str:  string
4797  * @strLen:  length of string
4798  *
4799  * Check whether the input buffer contains a string.
4800  */
4801 static int
htmlParseLookupString(xmlParserCtxtPtr ctxt,size_t startDelta,const char * str,size_t strLen,size_t extraLen)4802 htmlParseLookupString(xmlParserCtxtPtr ctxt, size_t startDelta,
4803                       const char *str, size_t strLen, size_t extraLen) {
4804     const xmlChar *end = ctxt->input->end;
4805     const xmlChar *cur, *term;
4806     size_t index, rescan;
4807     int ret;
4808 
4809     if (ctxt->checkIndex == 0) {
4810         cur = ctxt->input->cur + startDelta;
4811     } else {
4812         cur = ctxt->input->cur + ctxt->checkIndex;
4813     }
4814 
4815     term = BAD_CAST strstr((const char *) cur, str);
4816     if ((term != NULL) &&
4817         ((size_t) (ctxt->input->end - term) >= extraLen + 1)) {
4818         ctxt->checkIndex = 0;
4819 
4820         if (term - ctxt->input->cur > INT_MAX / 2)
4821             ret = INT_MAX / 2;
4822         else
4823             ret = term - ctxt->input->cur;
4824 
4825         return(ret);
4826     }
4827 
4828     /* Rescan (strLen + extraLen - 1) characters. */
4829     rescan = strLen + extraLen - 1;
4830     if ((size_t) (end - cur) <= rescan)
4831         end = cur;
4832     else
4833         end -= rescan;
4834     index = end - ctxt->input->cur;
4835     if (index > INT_MAX / 2) {
4836         ctxt->checkIndex = 0;
4837         ret = INT_MAX / 2;
4838     } else {
4839         ctxt->checkIndex = index;
4840         ret = -1;
4841     }
4842 
4843     return(ret);
4844 }
4845 
4846 /**
4847  * htmlParseLookupCommentEnd:
4848  * @ctxt: an HTML parser context
4849  *
4850  * Try to find a comment end tag in the input stream
4851  * The search includes "-->" as well as WHATWG-recommended incorrectly-closed tags.
4852  * (See https://html.spec.whatwg.org/multipage/parsing.html#parse-error-incorrectly-closed-comment)
4853  * This function has a side effect of (possibly) incrementing ctxt->checkIndex
4854  * to avoid rescanning sequences of bytes, it DOES change the state of the
4855  * parser, do not use liberally.
4856  *
4857  * Returns the index to the current parsing point if the full sequence is available, -1 otherwise.
4858  */
4859 static int
htmlParseLookupCommentEnd(htmlParserCtxtPtr ctxt)4860 htmlParseLookupCommentEnd(htmlParserCtxtPtr ctxt)
4861 {
4862     int mark = 0;
4863     int offset;
4864 
4865     while (1) {
4866 	mark = htmlParseLookupString(ctxt, 2, "--", 2, 0);
4867 	if (mark < 0)
4868             break;
4869         if ((NXT(mark+2) == '>') ||
4870 	    ((NXT(mark+2) == '!') && (NXT(mark+3) == '>'))) {
4871             ctxt->checkIndex = 0;
4872 	    break;
4873 	}
4874         offset = (NXT(mark+2) == '!') ? 3 : 2;
4875         if (mark + offset >= ctxt->input->end - ctxt->input->cur) {
4876 	    ctxt->checkIndex = mark;
4877             return(-1);
4878         }
4879 	ctxt->checkIndex = mark + 1;
4880     }
4881     return mark;
4882 }
4883 
4884 
4885 /**
4886  * htmlParseTryOrFinish:
4887  * @ctxt:  an HTML parser context
4888  * @terminate:  last chunk indicator
4889  *
4890  * Try to progress on parsing
4891  *
4892  * Returns zero if no parsing was possible
4893  */
4894 static int
htmlParseTryOrFinish(htmlParserCtxtPtr ctxt,int terminate)4895 htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) {
4896     int ret = 0;
4897     htmlParserInputPtr in;
4898     ptrdiff_t avail = 0;
4899     int cur;
4900 
4901     htmlParserNodeInfo node_info;
4902 
4903     while (PARSER_STOPPED(ctxt) == 0) {
4904 
4905 	in = ctxt->input;
4906 	if (in == NULL) break;
4907 	avail = in->end - in->cur;
4908 	if ((avail == 0) && (terminate)) {
4909 	    htmlAutoCloseOnEnd(ctxt);
4910 	    if ((ctxt->nameNr == 0) && (ctxt->instate != XML_PARSER_EOF)) {
4911 		/*
4912 		 * SAX: end of the document processing.
4913 		 */
4914 		ctxt->instate = XML_PARSER_EOF;
4915 		if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
4916 		    ctxt->sax->endDocument(ctxt->userData);
4917 	    }
4918 	}
4919         if (avail < 1)
4920 	    goto done;
4921 	cur = in->cur[0];
4922 
4923         switch (ctxt->instate) {
4924             case XML_PARSER_EOF:
4925 	        /*
4926 		 * Document parsing is done !
4927 		 */
4928 	        goto done;
4929             case XML_PARSER_START:
4930                 /*
4931                  * This is wrong but matches long-standing behavior. In most
4932                  * cases, a document starting with an XML declaration will
4933                  * specify UTF-8.
4934                  */
4935                 if (((ctxt->input->flags & XML_INPUT_HAS_ENCODING) == 0) &&
4936                     (xmlStrncmp(ctxt->input->cur, BAD_CAST "<?xm", 4) == 0)) {
4937                     xmlSwitchEncoding(ctxt, XML_CHAR_ENCODING_UTF8);
4938                 }
4939 
4940                 if ((ctxt->sax) && (ctxt->sax->setDocumentLocator)) {
4941                     ctxt->sax->setDocumentLocator(ctxt->userData,
4942                             (xmlSAXLocator *) &xmlDefaultSAXLocator);
4943                 }
4944 		if ((ctxt->sax) && (ctxt->sax->startDocument) &&
4945 	            (!ctxt->disableSAX))
4946 		    ctxt->sax->startDocument(ctxt->userData);
4947 
4948                 /* Allow callback to modify state */
4949                 if (ctxt->instate == XML_PARSER_START)
4950                     ctxt->instate = XML_PARSER_MISC;
4951 		break;
4952             case XML_PARSER_START_TAG: {
4953 	        const xmlChar *name;
4954 		int next;
4955 		const htmlElemDesc * info;
4956 
4957 		/*
4958 		 * not enough chars in buffer
4959 		 */
4960 		if (avail < 2)
4961 		    goto done;
4962 		cur = in->cur[0];
4963 		next = in->cur[1];
4964 	        if (cur != '<') {
4965 		    ctxt->instate = XML_PARSER_CONTENT;
4966 		    break;
4967 		}
4968 		if (next == '/') {
4969 		    ctxt->instate = XML_PARSER_END_TAG;
4970 		    ctxt->checkIndex = 0;
4971 		    break;
4972 		}
4973 		if ((!terminate) &&
4974 		    (htmlParseLookupGt(ctxt) < 0))
4975 		    goto done;
4976 
4977                 /* Capture start position */
4978 	        if (ctxt->record_info) {
4979 	             node_info.begin_pos = ctxt->input->consumed +
4980 	                                (CUR_PTR - ctxt->input->base);
4981 	             node_info.begin_line = ctxt->input->line;
4982 	        }
4983 
4984 
4985 		htmlParseStartTag(ctxt);
4986 		name = ctxt->name;
4987 		if (name == NULL)
4988 		    break;
4989 
4990 		/*
4991 		 * Check for an Empty Element labeled the XML/SGML way
4992 		 */
4993 		if ((CUR == '/') && (NXT(1) == '>')) {
4994 		    SKIP(2);
4995                     htmlParserFinishElementParsing(ctxt);
4996                     if ((ctxt->options & HTML_PARSE_HTML5) == 0) {
4997                         if ((ctxt->sax != NULL) &&
4998                             (ctxt->sax->endElement != NULL))
4999                             ctxt->sax->endElement(ctxt->userData, name);
5000                     }
5001 		    htmlnamePop(ctxt);
5002 		    ctxt->instate = XML_PARSER_CONTENT;
5003 		    break;
5004 		}
5005 
5006 		if (CUR != '>')
5007                     break;
5008 		SKIP(1);
5009 
5010 		/*
5011 		 * Lookup the info for that element.
5012 		 */
5013 		info = htmlTagLookup(name);
5014 
5015 		/*
5016 		 * Check for an Empty Element from DTD definition
5017 		 */
5018 		if ((info != NULL) && (info->empty)) {
5019                     htmlParserFinishElementParsing(ctxt);
5020                     if ((ctxt->options & HTML_PARSE_HTML5) == 0) {
5021                         if ((ctxt->sax != NULL) &&
5022                             (ctxt->sax->endElement != NULL))
5023                             ctxt->sax->endElement(ctxt->userData, name);
5024                     }
5025 		    htmlnamePop(ctxt);
5026 		}
5027 
5028 		if (info != NULL)
5029                     ctxt->endCheckState = info->dataMode;
5030 
5031                 if (ctxt->record_info)
5032 	            htmlNodeInfoPush(ctxt, &node_info);
5033 
5034 		ctxt->instate = XML_PARSER_CONTENT;
5035                 break;
5036 	    }
5037             case XML_PARSER_MISC:
5038             case XML_PARSER_PROLOG:
5039             case XML_PARSER_CONTENT:
5040             case XML_PARSER_EPILOG: {
5041                 int mode;
5042 
5043                 if ((ctxt->instate == XML_PARSER_MISC) ||
5044                     (ctxt->instate == XML_PARSER_PROLOG)) {
5045                     SKIP_BLANKS;
5046                     avail = in->end - in->cur;
5047                 }
5048 
5049 		if (avail < 1)
5050 		    goto done;
5051 		cur = in->cur[0];
5052                 mode = ctxt->endCheckState;
5053 
5054                 if (mode != 0) {
5055                     while ((PARSER_STOPPED(ctxt) == 0) &&
5056                            (in->cur < in->end)) {
5057                         size_t extra;
5058 
5059                         extra = strlen((const char *) ctxt->name) + 2;
5060 
5061                         if ((!terminate) &&
5062                             (htmlParseLookupString(ctxt, 0, "<", 1,
5063                                                    extra) < 0))
5064                             goto done;
5065                         ctxt->checkIndex = 0;
5066 
5067                         if (htmlParseCharData(ctxt))
5068                             break;
5069                     }
5070 
5071                     break;
5072 		} else if (cur == '<') {
5073                     int next;
5074 
5075                     if (avail < 2) {
5076                         if (!terminate)
5077                             goto done;
5078                         next = ' ';
5079                     } else {
5080                         next = in->cur[1];
5081                     }
5082 
5083                     if (next == '!') {
5084                         if ((!terminate) && (avail < 4))
5085                             goto done;
5086                         if ((in->cur[2] == '-') && (in->cur[3] == '-')) {
5087                             if ((!terminate) &&
5088                                 (htmlParseLookupCommentEnd(ctxt) < 0))
5089                                 goto done;
5090                             SKIP(4);
5091                             htmlParseComment(ctxt, /* bogus */ 0);
5092                             break;
5093                         }
5094 
5095                         if ((!terminate) && (avail < 9))
5096                             goto done;
5097                         if ((UPP(2) == 'D') && (UPP(3) == 'O') &&
5098                             (UPP(4) == 'C') && (UPP(5) == 'T') &&
5099                             (UPP(6) == 'Y') && (UPP(7) == 'P') &&
5100                             (UPP(8) == 'E')) {
5101                             if ((!terminate) &&
5102                                 (htmlParseLookupString(ctxt, 9, ">", 1,
5103                                                        0) < 0))
5104                                 goto done;
5105                             htmlParseDocTypeDecl(ctxt);
5106                             if (ctxt->instate == XML_PARSER_MISC)
5107                                 ctxt->instate = XML_PARSER_PROLOG;
5108                         } else {
5109                             if ((!terminate) &&
5110                                 (htmlParseLookupString(ctxt, 2, ">", 1, 0) < 0))
5111                                 goto done;
5112                             SKIP(2);
5113                             htmlParseComment(ctxt, /* bogus */ 1);
5114                         }
5115                     } else if (next == '?') {
5116                         if ((!terminate) &&
5117                             (htmlParseLookupString(ctxt, 2, ">", 1, 0) < 0))
5118                             goto done;
5119                         SKIP(1);
5120                         htmlParseComment(ctxt, /* bogus */ 1);
5121                     } else if (next == '/') {
5122                         ctxt->instate = XML_PARSER_END_TAG;
5123                         ctxt->checkIndex = 0;
5124                         break;
5125                     } else if (IS_ASCII_LETTER(next)) {
5126                         if ((!terminate) && (next == 0))
5127                             goto done;
5128                         ctxt->instate = XML_PARSER_START_TAG;
5129                         ctxt->checkIndex = 0;
5130                         break;
5131                     } else {
5132                         ctxt->instate = XML_PARSER_CONTENT;
5133                         htmlCheckParagraph(ctxt);
5134                         if ((ctxt->sax != NULL) && (!ctxt->disableSAX) &&
5135                             (ctxt->sax->characters != NULL))
5136                             ctxt->sax->characters(ctxt->userData,
5137                                                   BAD_CAST "<", 1);
5138                         SKIP(1);
5139                     }
5140                 } else {
5141                     /*
5142                      * check that the text sequence is complete
5143                      * before handing out the data to the parser
5144                      * to avoid problems with erroneous end of
5145                      * data detection.
5146                      */
5147                     if ((!terminate) &&
5148                         (htmlParseLookupString(ctxt, 0, "<", 1, 0) < 0))
5149                         goto done;
5150                     ctxt->checkIndex = 0;
5151                     htmlParseCharData(ctxt);
5152 		}
5153 
5154 		break;
5155 	    }
5156             case XML_PARSER_END_TAG:
5157 		if ((terminate) && (avail == 2)) {
5158                     htmlCheckParagraph(ctxt);
5159                     if ((ctxt->sax != NULL) && (!ctxt->disableSAX) &&
5160                         (ctxt->sax->characters != NULL))
5161                         ctxt->sax->characters(ctxt->userData,
5162                                               BAD_CAST "</", 2);
5163 		    goto done;
5164                 }
5165 		if ((!terminate) &&
5166 		    (htmlParseLookupGt(ctxt) < 0))
5167 		    goto done;
5168 		htmlParseEndTag(ctxt);
5169 		if (ctxt->nameNr == 0) {
5170 		    ctxt->instate = XML_PARSER_EPILOG;
5171 		} else {
5172 		    ctxt->instate = XML_PARSER_CONTENT;
5173 		}
5174 		ctxt->checkIndex = 0;
5175 	        break;
5176 	    default:
5177 		htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5178 			     "HPP: internal error\n", NULL, NULL);
5179 		ctxt->instate = XML_PARSER_EOF;
5180 		break;
5181 	}
5182     }
5183 done:
5184     if ((avail == 0) && (terminate)) {
5185 	htmlAutoCloseOnEnd(ctxt);
5186 	if ((ctxt->nameNr == 0) && (ctxt->instate != XML_PARSER_EOF)) {
5187 	    /*
5188 	     * SAX: end of the document processing.
5189 	     */
5190 	    ctxt->instate = XML_PARSER_EOF;
5191 	    if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
5192 		ctxt->sax->endDocument(ctxt->userData);
5193 	}
5194     }
5195     if ((!(ctxt->options & HTML_PARSE_NODEFDTD)) && (ctxt->myDoc != NULL) &&
5196 	((terminate) || (ctxt->instate == XML_PARSER_EOF) ||
5197 	 (ctxt->instate == XML_PARSER_EPILOG))) {
5198 	xmlDtdPtr dtd;
5199 	dtd = xmlGetIntSubset(ctxt->myDoc);
5200 	if (dtd == NULL) {
5201 	    ctxt->myDoc->intSubset =
5202 		xmlCreateIntSubset(ctxt->myDoc, BAD_CAST "html",
5203 		    BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN",
5204 		    BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd");
5205             if (ctxt->myDoc->intSubset == NULL)
5206                 htmlErrMemory(ctxt);
5207         }
5208     }
5209     return(ret);
5210 }
5211 
5212 /**
5213  * htmlParseChunk:
5214  * @ctxt:  an HTML parser context
5215  * @chunk:  chunk of memory
5216  * @size:  size of chunk in bytes
5217  * @terminate:  last chunk indicator
5218  *
5219  * Parse a chunk of memory in push parser mode.
5220  *
5221  * Assumes that the parser context was initialized with
5222  * htmlCreatePushParserCtxt.
5223  *
5224  * The last chunk, which will often be empty, must be marked with
5225  * the @terminate flag. With the default SAX callbacks, the resulting
5226  * document will be available in ctxt->myDoc. This pointer will not
5227  * be freed by the library.
5228  *
5229  * If the document isn't well-formed, ctxt->myDoc is set to NULL.
5230  *
5231  * Returns an xmlParserErrors code (0 on success).
5232  */
5233 int
htmlParseChunk(htmlParserCtxtPtr ctxt,const char * chunk,int size,int terminate)5234 htmlParseChunk(htmlParserCtxtPtr ctxt, const char *chunk, int size,
5235               int terminate) {
5236     if ((ctxt == NULL) || (ctxt->input == NULL))
5237 	return(XML_ERR_ARGUMENT);
5238     if (PARSER_STOPPED(ctxt) != 0)
5239         return(ctxt->errNo);
5240     if ((size > 0) && (chunk != NULL) && (ctxt->input != NULL) &&
5241         (ctxt->input->buf != NULL))  {
5242 	size_t pos = ctxt->input->cur - ctxt->input->base;
5243 	int res;
5244 
5245 	res = xmlParserInputBufferPush(ctxt->input->buf, size, chunk);
5246         xmlBufUpdateInput(ctxt->input->buf->buffer, ctxt->input, pos);
5247 	if (res < 0) {
5248             htmlParseErr(ctxt, ctxt->input->buf->error,
5249                          "xmlParserInputBufferPush failed", NULL, NULL);
5250             xmlHaltParser(ctxt);
5251 	    return (ctxt->errNo);
5252 	}
5253     }
5254     htmlParseTryOrFinish(ctxt, terminate);
5255     if (terminate) {
5256 	if (ctxt->instate != XML_PARSER_EOF) {
5257 	    if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
5258 		ctxt->sax->endDocument(ctxt->userData);
5259 	}
5260 	ctxt->instate = XML_PARSER_EOF;
5261     }
5262     return((xmlParserErrors) ctxt->errNo);
5263 }
5264 
5265 /************************************************************************
5266  *									*
5267  *			User entry points				*
5268  *									*
5269  ************************************************************************/
5270 
5271 /**
5272  * htmlCreatePushParserCtxt:
5273  * @sax:  a SAX handler (optional)
5274  * @user_data:  The user data returned on SAX callbacks (optional)
5275  * @chunk:  a pointer to an array of chars (optional)
5276  * @size:  number of chars in the array
5277  * @filename:  only used for error reporting (optional)
5278  * @enc:  encoding (deprecated, pass XML_CHAR_ENCODING_NONE)
5279  *
5280  * Create a parser context for using the HTML parser in push mode.
5281  *
5282  * Returns the new parser context or NULL if a memory allocation
5283  * failed.
5284  */
5285 htmlParserCtxtPtr
htmlCreatePushParserCtxt(htmlSAXHandlerPtr sax,void * user_data,const char * chunk,int size,const char * filename,xmlCharEncoding enc)5286 htmlCreatePushParserCtxt(htmlSAXHandlerPtr sax, void *user_data,
5287                          const char *chunk, int size, const char *filename,
5288 			 xmlCharEncoding enc) {
5289     htmlParserCtxtPtr ctxt;
5290     htmlParserInputPtr input;
5291     const char *encoding;
5292 
5293     ctxt = htmlNewSAXParserCtxt(sax, user_data);
5294     if (ctxt == NULL)
5295 	return(NULL);
5296 
5297     encoding = xmlGetCharEncodingName(enc);
5298     input = xmlNewPushInput(filename, chunk, size);
5299     if (input == NULL) {
5300 	htmlFreeParserCtxt(ctxt);
5301 	return(NULL);
5302     }
5303 
5304     if (inputPush(ctxt, input) < 0) {
5305         xmlFreeInputStream(input);
5306         xmlFreeParserCtxt(ctxt);
5307         return(NULL);
5308     }
5309 
5310     if (encoding != NULL)
5311         xmlSwitchEncodingName(ctxt, encoding);
5312 
5313     return(ctxt);
5314 }
5315 #endif /* LIBXML_PUSH_ENABLED */
5316 
5317 /**
5318  * htmlSAXParseDoc:
5319  * @cur:  a pointer to an array of xmlChar
5320  * @encoding:  a free form C string describing the HTML document encoding, or NULL
5321  * @sax:  the SAX handler block
5322  * @userData: if using SAX, this pointer will be provided on callbacks.
5323  *
5324  * DEPRECATED: Use htmlNewSAXParserCtxt and htmlCtxtReadDoc.
5325  *
5326  * Parse an HTML in-memory document. If sax is not NULL, use the SAX callbacks
5327  * to handle parse events. If sax is NULL, fallback to the default DOM
5328  * behavior and return a tree.
5329  *
5330  * Returns the resulting document tree unless SAX is NULL or the document is
5331  *     not well formed.
5332  */
5333 
5334 htmlDocPtr
htmlSAXParseDoc(const xmlChar * cur,const char * encoding,htmlSAXHandlerPtr sax,void * userData)5335 htmlSAXParseDoc(const xmlChar *cur, const char *encoding,
5336                 htmlSAXHandlerPtr sax, void *userData) {
5337     htmlDocPtr ret;
5338     htmlParserCtxtPtr ctxt;
5339 
5340     if (cur == NULL)
5341         return(NULL);
5342 
5343     ctxt = htmlCreateDocParserCtxt(cur, NULL, encoding);
5344     if (ctxt == NULL)
5345         return(NULL);
5346 
5347     if (sax != NULL) {
5348         *ctxt->sax = *sax;
5349         ctxt->userData = userData;
5350     }
5351 
5352     htmlParseDocument(ctxt);
5353     ret = ctxt->myDoc;
5354     htmlFreeParserCtxt(ctxt);
5355 
5356     return(ret);
5357 }
5358 
5359 /**
5360  * htmlParseDoc:
5361  * @cur:  a pointer to an array of xmlChar
5362  * @encoding:  the encoding (optional)
5363  *
5364  * DEPRECATED: Use htmlReadDoc.
5365  *
5366  * Parse an HTML in-memory document and build a tree.
5367  *
5368  * This function uses deprecated global parser options.
5369  *
5370  * Returns the resulting document tree
5371  */
5372 
5373 htmlDocPtr
htmlParseDoc(const xmlChar * cur,const char * encoding)5374 htmlParseDoc(const xmlChar *cur, const char *encoding) {
5375     return(htmlSAXParseDoc(cur, encoding, NULL, NULL));
5376 }
5377 
5378 
5379 /**
5380  * htmlCreateFileParserCtxt:
5381  * @filename:  the filename
5382  * @encoding:  optional encoding
5383  *
5384  * DEPRECATED: Use htmlNewParserCtxt and htmlCtxtReadFile.
5385  *
5386  * Create a parser context to read from a file.
5387  *
5388  * A non-NULL encoding overrides encoding declarations in the document.
5389  *
5390  * Automatic support for ZLIB/Compress compressed document is provided
5391  * by default if found at compile-time.
5392  *
5393  * Returns the new parser context or NULL if a memory allocation failed.
5394  */
5395 htmlParserCtxtPtr
htmlCreateFileParserCtxt(const char * filename,const char * encoding)5396 htmlCreateFileParserCtxt(const char *filename, const char *encoding)
5397 {
5398     htmlParserCtxtPtr ctxt;
5399     htmlParserInputPtr input;
5400 
5401     if (filename == NULL)
5402         return(NULL);
5403 
5404     ctxt = htmlNewParserCtxt();
5405     if (ctxt == NULL) {
5406 	return(NULL);
5407     }
5408 
5409     input = xmlCtxtNewInputFromUrl(ctxt, filename, NULL, encoding, 0);
5410     if (input == NULL) {
5411 	xmlFreeParserCtxt(ctxt);
5412 	return(NULL);
5413     }
5414     if (inputPush(ctxt, input) < 0) {
5415         xmlFreeInputStream(input);
5416         xmlFreeParserCtxt(ctxt);
5417         return(NULL);
5418     }
5419 
5420     return(ctxt);
5421 }
5422 
5423 /**
5424  * htmlSAXParseFile:
5425  * @filename:  the filename
5426  * @encoding:  encoding (optional)
5427  * @sax:  the SAX handler block
5428  * @userData: if using SAX, this pointer will be provided on callbacks.
5429  *
5430  * DEPRECATED: Use htmlNewSAXParserCtxt and htmlCtxtReadFile.
5431  *
5432  * parse an HTML file and build a tree. Automatic support for ZLIB/Compress
5433  * compressed document is provided by default if found at compile-time.
5434  * It use the given SAX function block to handle the parsing callback.
5435  * If sax is NULL, fallback to the default DOM tree building routines.
5436  *
5437  * Returns the resulting document tree unless SAX is NULL or the document is
5438  *     not well formed.
5439  */
5440 
5441 htmlDocPtr
htmlSAXParseFile(const char * filename,const char * encoding,htmlSAXHandlerPtr sax,void * userData)5442 htmlSAXParseFile(const char *filename, const char *encoding, htmlSAXHandlerPtr sax,
5443                  void *userData) {
5444     htmlDocPtr ret;
5445     htmlParserCtxtPtr ctxt;
5446     htmlSAXHandlerPtr oldsax = NULL;
5447 
5448     ctxt = htmlCreateFileParserCtxt(filename, encoding);
5449     if (ctxt == NULL) return(NULL);
5450     if (sax != NULL) {
5451 	oldsax = ctxt->sax;
5452         ctxt->sax = sax;
5453         ctxt->userData = userData;
5454     }
5455 
5456     htmlParseDocument(ctxt);
5457 
5458     ret = ctxt->myDoc;
5459     if (sax != NULL) {
5460         ctxt->sax = oldsax;
5461         ctxt->userData = NULL;
5462     }
5463     htmlFreeParserCtxt(ctxt);
5464 
5465     return(ret);
5466 }
5467 
5468 /**
5469  * htmlParseFile:
5470  * @filename:  the filename
5471  * @encoding:  encoding (optional)
5472  *
5473  * Parse an HTML file and build a tree.
5474  *
5475  * Returns the resulting document tree
5476  */
5477 
5478 htmlDocPtr
htmlParseFile(const char * filename,const char * encoding)5479 htmlParseFile(const char *filename, const char *encoding) {
5480     return(htmlSAXParseFile(filename, encoding, NULL, NULL));
5481 }
5482 
5483 /**
5484  * htmlHandleOmittedElem:
5485  * @val:  int 0 or 1
5486  *
5487  * DEPRECATED: Use HTML_PARSE_NOIMPLIED
5488  *
5489  * Set and return the previous value for handling HTML omitted tags.
5490  *
5491  * Returns the last value for 0 for no handling, 1 for auto insertion.
5492  */
5493 
5494 int
htmlHandleOmittedElem(int val)5495 htmlHandleOmittedElem(int val) {
5496     int old = htmlOmittedDefaultValue;
5497 
5498     htmlOmittedDefaultValue = val;
5499     return(old);
5500 }
5501 
5502 /**
5503  * htmlElementAllowedHere:
5504  * @parent: HTML parent element
5505  * @elt: HTML element
5506  *
5507  * DEPRECATED: Don't use.
5508  *
5509  * Returns 1
5510  */
5511 int
htmlElementAllowedHere(const htmlElemDesc * parent ATTRIBUTE_UNUSED,const xmlChar * elt ATTRIBUTE_UNUSED)5512 htmlElementAllowedHere(const htmlElemDesc* parent ATTRIBUTE_UNUSED,
5513                        const xmlChar* elt ATTRIBUTE_UNUSED) {
5514     return(1);
5515 }
5516 
5517 /**
5518  * htmlElementStatusHere:
5519  * @parent: HTML parent element
5520  * @elt: HTML element
5521  *
5522  * DEPRECATED: Don't use.
5523  *
5524  * Returns HTML_VALID
5525  */
5526 htmlStatus
htmlElementStatusHere(const htmlElemDesc * parent ATTRIBUTE_UNUSED,const htmlElemDesc * elt ATTRIBUTE_UNUSED)5527 htmlElementStatusHere(const htmlElemDesc* parent ATTRIBUTE_UNUSED,
5528                       const htmlElemDesc* elt ATTRIBUTE_UNUSED) {
5529     return(HTML_VALID);
5530 }
5531 
5532 /**
5533  * htmlAttrAllowed:
5534  * @elt: HTML element
5535  * @attr: HTML attribute
5536  * @legacy: whether to allow deprecated attributes
5537  *
5538  * DEPRECATED: Don't use.
5539  *
5540  * Returns HTML_VALID
5541  */
5542 htmlStatus
htmlAttrAllowed(const htmlElemDesc * elt ATTRIBUTE_UNUSED,const xmlChar * attr ATTRIBUTE_UNUSED,int legacy ATTRIBUTE_UNUSED)5543 htmlAttrAllowed(const htmlElemDesc* elt ATTRIBUTE_UNUSED,
5544                 const xmlChar* attr ATTRIBUTE_UNUSED,
5545                 int legacy ATTRIBUTE_UNUSED) {
5546     return(HTML_VALID);
5547 }
5548 
5549 /**
5550  * htmlNodeStatus:
5551  * @node: an htmlNodePtr in a tree
5552  * @legacy: whether to allow deprecated elements (YES is faster here
5553  *	for Element nodes)
5554  *
5555  * DEPRECATED: Don't use.
5556  *
5557  * Returns HTML_VALID
5558  */
5559 htmlStatus
htmlNodeStatus(htmlNodePtr node ATTRIBUTE_UNUSED,int legacy ATTRIBUTE_UNUSED)5560 htmlNodeStatus(htmlNodePtr node ATTRIBUTE_UNUSED,
5561                int legacy ATTRIBUTE_UNUSED) {
5562     return(HTML_VALID);
5563 }
5564 
5565 /************************************************************************
5566  *									*
5567  *	New set (2.6.0) of simpler and more flexible APIs		*
5568  *									*
5569  ************************************************************************/
/**
 * DICT_FREE:
 * @str:  a string
 *
 * Free a string if it is not owned by the "dict" dictionary in the
 * current scope. A variable named "dict" must be visible where the
 * macro is used; a NULL @str is ignored.
 *
 * The body is wrapped in do { } while (0) so that the macro expands
 * to a single statement and can be used safely in unbraced if/else
 * branches. Invocations must be followed by a semicolon.
 */
#define DICT_FREE(str)						\
    do {							\
	if ((str) && ((!dict) ||				\
	    (xmlDictOwns(dict, (const xmlChar *)(str)) == 0)))	\
	    xmlFree((char *)(str));				\
    } while (0)
5581 
5582 /**
5583  * htmlCtxtReset:
5584  * @ctxt: an HTML parser context
5585  *
5586  * Reset a parser context
5587  */
5588 void
htmlCtxtReset(htmlParserCtxtPtr ctxt)5589 htmlCtxtReset(htmlParserCtxtPtr ctxt)
5590 {
5591     xmlParserInputPtr input;
5592     xmlDictPtr dict;
5593 
5594     if (ctxt == NULL)
5595         return;
5596 
5597     dict = ctxt->dict;
5598 
5599     while ((input = inputPop(ctxt)) != NULL) { /* Non consuming */
5600         xmlFreeInputStream(input);
5601     }
5602     ctxt->inputNr = 0;
5603     ctxt->input = NULL;
5604 
5605     ctxt->spaceNr = 0;
5606     if (ctxt->spaceTab != NULL) {
5607 	ctxt->spaceTab[0] = -1;
5608 	ctxt->space = &ctxt->spaceTab[0];
5609     } else {
5610 	ctxt->space = NULL;
5611     }
5612 
5613 
5614     ctxt->nodeNr = 0;
5615     ctxt->node = NULL;
5616 
5617     ctxt->nameNr = 0;
5618     ctxt->name = NULL;
5619 
5620     ctxt->nsNr = 0;
5621 
5622     DICT_FREE(ctxt->version);
5623     ctxt->version = NULL;
5624     DICT_FREE(ctxt->encoding);
5625     ctxt->encoding = NULL;
5626     DICT_FREE(ctxt->extSubURI);
5627     ctxt->extSubURI = NULL;
5628     DICT_FREE(ctxt->extSubSystem);
5629     ctxt->extSubSystem = NULL;
5630 
5631     if (ctxt->directory != NULL) {
5632         xmlFree(ctxt->directory);
5633         ctxt->directory = NULL;
5634     }
5635 
5636     if (ctxt->myDoc != NULL)
5637         xmlFreeDoc(ctxt->myDoc);
5638     ctxt->myDoc = NULL;
5639 
5640     ctxt->standalone = -1;
5641     ctxt->hasExternalSubset = 0;
5642     ctxt->hasPErefs = 0;
5643     ctxt->html = 1;
5644     ctxt->instate = XML_PARSER_START;
5645 
5646     ctxt->wellFormed = 1;
5647     ctxt->nsWellFormed = 1;
5648     ctxt->disableSAX = 0;
5649     ctxt->valid = 1;
5650     ctxt->vctxt.userData = ctxt;
5651     ctxt->vctxt.flags = XML_VCTXT_USE_PCTXT;
5652     ctxt->vctxt.error = xmlParserValidityError;
5653     ctxt->vctxt.warning = xmlParserValidityWarning;
5654     ctxt->record_info = 0;
5655     ctxt->checkIndex = 0;
5656     ctxt->endCheckState = 0;
5657     ctxt->inSubset = 0;
5658     ctxt->errNo = XML_ERR_OK;
5659     ctxt->depth = 0;
5660     ctxt->catalogs = NULL;
5661     xmlInitNodeInfoSeq(&ctxt->node_seq);
5662 
5663     if (ctxt->attsDefault != NULL) {
5664         xmlHashFree(ctxt->attsDefault, xmlHashDefaultDeallocator);
5665         ctxt->attsDefault = NULL;
5666     }
5667     if (ctxt->attsSpecial != NULL) {
5668         xmlHashFree(ctxt->attsSpecial, NULL);
5669         ctxt->attsSpecial = NULL;
5670     }
5671 
5672     ctxt->nbErrors = 0;
5673     ctxt->nbWarnings = 0;
5674     if (ctxt->lastError.code != XML_ERR_OK)
5675         xmlResetError(&ctxt->lastError);
5676 }
5677 
5678 static int
htmlCtxtSetOptionsInternal(xmlParserCtxtPtr ctxt,int options,int keepMask)5679 htmlCtxtSetOptionsInternal(xmlParserCtxtPtr ctxt, int options, int keepMask)
5680 {
5681     int allMask;
5682 
5683     if (ctxt == NULL)
5684         return(-1);
5685 
5686     allMask = HTML_PARSE_RECOVER |
5687               HTML_PARSE_HTML5 |
5688               HTML_PARSE_NODEFDTD |
5689               HTML_PARSE_NOERROR |
5690               HTML_PARSE_NOWARNING |
5691               HTML_PARSE_PEDANTIC |
5692               HTML_PARSE_NOBLANKS |
5693               HTML_PARSE_NONET |
5694               HTML_PARSE_NOIMPLIED |
5695               HTML_PARSE_COMPACT |
5696               HTML_PARSE_HUGE |
5697               HTML_PARSE_IGNORE_ENC |
5698               HTML_PARSE_BIG_LINES;
5699 
5700     ctxt->options = (ctxt->options & keepMask) | (options & allMask);
5701 
5702     /*
5703      * For some options, struct members are historically the source
5704      * of truth. See xmlCtxtSetOptionsInternal.
5705      */
5706     ctxt->keepBlanks = (options & HTML_PARSE_NOBLANKS) ? 0 : 1;
5707 
5708     /*
5709      * Changing SAX callbacks is a bad idea. This should be fixed.
5710      */
5711     if (options & HTML_PARSE_NOBLANKS) {
5712         ctxt->sax->ignorableWhitespace = xmlSAX2IgnorableWhitespace;
5713     }
5714     if (options & HTML_PARSE_HUGE) {
5715         if (ctxt->dict != NULL)
5716             xmlDictSetLimit(ctxt->dict, 0);
5717     }
5718 
5719     /*
5720      * It would be useful to allow this feature.
5721      */
5722     ctxt->dictNames = 0;
5723 
5724     ctxt->linenumbers = 1;
5725 
5726     return(options & ~allMask);
5727 }
5728 
5729 /**
5730  * htmlCtxtSetOptions:
5731  * @ctxt: an HTML parser context
5732  * @options:  a bitmask of xmlParserOption values
5733  *
5734  * Applies the options to the parser context. Unset options are
5735  * cleared.
5736  *
5737  * Available since 2.14.0. With older versions, you can use
5738  * htmlCtxtUseOptions.
5739  *
5740  * HTML_PARSE_RECOVER
5741  *
5742  * No effect as of 2.14.0.
5743  *
5744  * HTML_PARSE_HTML5
5745  *
5746  * Make the tokenizer emit a SAX callback for each token. This results
5747  * in unbalanced invocations of startElement and endElement.
5748  *
5749  * For now, this is only usable with custom SAX callbacks.
5750  *
5751  * HTML_PARSE_NODEFDTD
5752  *
5753  * Do not default to a doctype if none was found.
5754  *
5755  * HTML_PARSE_NOERROR
5756  *
5757  * Disable error and warning reports to the error handlers.
5758  * Errors are still accessible with xmlCtxtGetLastError.
5759  *
5760  * HTML_PARSE_NOWARNING
5761  *
5762  * Disable warning reports.
5763  *
5764  * HTML_PARSE_PEDANTIC
5765  *
5766  * No effect.
5767  *
5768  * HTML_PARSE_NOBLANKS
5769  *
5770  * Remove some text nodes containing only whitespace from the
5771  * result document. Which nodes are removed depends on a conservative
5772  * heuristic. The reindenting feature of the serialization code relies
5773  * on this option to be set when parsing. Use of this option is
5774  * DISCOURAGED.
5775  *
5776  * HTML_PARSE_NONET
5777  *
5778  * No effect.
5779  *
5780  * HTML_PARSE_NOIMPLIED
5781  *
5782  * Do not add implied html, head or body elements.
5783  *
5784  * HTML_PARSE_COMPACT
5785  *
5786  * Store small strings directly in the node struct to save
5787  * memory.
5788  *
5789  * HTML_PARSE_HUGE
5790  *
5791  * Relax some internal limits.
5792  *
 * Available since 2.14.0. With older versions, use XML_PARSE_HUGE
 * instead.
5795  *
5796  * Maximum size of text nodes, tags, comments, CDATA sections
5797  *
5798  * normal: 10M
5799  * huge:    1B
5800  *
5801  * Maximum size of names, system literals, pubid literals
5802  *
5803  * normal: 50K
5804  * huge:   10M
5805  *
5806  * Maximum nesting depth of elements
5807  *
5808  * normal:  256
5809  * huge:   2048
5810  *
5811  * HTML_PARSE_IGNORE_ENC
5812  *
5813  * Ignore the encoding in the HTML declaration. This option is
5814  * mostly unneeded these days. The only effect is to enforce
5815  * UTF-8 decoding of ASCII-like data.
5816  *
5817  * HTML_PARSE_BIG_LINES
5818  *
5819  * Enable reporting of line numbers larger than 65535.
5820  *
5821  * Available since 2.14.0.
5822  *
5823  * Returns 0 in case of success, the set of unknown or unimplemented options
5824  *         in case of error.
5825  */
5826 int
htmlCtxtSetOptions(xmlParserCtxtPtr ctxt,int options)5827 htmlCtxtSetOptions(xmlParserCtxtPtr ctxt, int options)
5828 {
5829     return(htmlCtxtSetOptionsInternal(ctxt, options, 0));
5830 }
5831 
5832 /**
5833  * htmlCtxtUseOptions:
5834  * @ctxt: an HTML parser context
5835  * @options:  a combination of htmlParserOption(s)
5836  *
5837  * DEPRECATED: Use htmlCtxtSetOptions.
5838  *
5839  * Applies the options to the parser context. The following options
5840  * are never cleared and can only be enabled:
5841  *
5842  * HTML_PARSE_NODEFDTD
5843  * HTML_PARSE_NOERROR
5844  * HTML_PARSE_NOWARNING
5845  * HTML_PARSE_NOIMPLIED
5846  * HTML_PARSE_COMPACT
5847  * HTML_PARSE_HUGE
5848  * HTML_PARSE_IGNORE_ENC
5849  * HTML_PARSE_BIG_LINES
5850  *
5851  * Returns 0 in case of success, the set of unknown or unimplemented options
5852  *         in case of error.
5853  */
5854 int
htmlCtxtUseOptions(htmlParserCtxtPtr ctxt,int options)5855 htmlCtxtUseOptions(htmlParserCtxtPtr ctxt, int options)
5856 {
5857     int keepMask;
5858 
5859     /*
5860      * For historic reasons, some options can only be enabled.
5861      */
5862     keepMask = HTML_PARSE_NODEFDTD |
5863                HTML_PARSE_NOERROR |
5864                HTML_PARSE_NOWARNING |
5865                HTML_PARSE_NOIMPLIED |
5866                HTML_PARSE_COMPACT |
5867                HTML_PARSE_HUGE |
5868                HTML_PARSE_IGNORE_ENC |
5869                HTML_PARSE_BIG_LINES;
5870 
5871     return(htmlCtxtSetOptionsInternal(ctxt, options, keepMask));
5872 }
5873 
5874 /**
5875  * htmlCtxtParseDocument:
5876  * @ctxt:  an HTML parser context
5877  * @input:  parser input
5878  *
5879  * Parse an HTML document and return the resulting document tree.
5880  *
5881  * Available since 2.13.0.
5882  *
5883  * Returns the resulting document tree or NULL
5884  */
5885 htmlDocPtr
htmlCtxtParseDocument(htmlParserCtxtPtr ctxt,xmlParserInputPtr input)5886 htmlCtxtParseDocument(htmlParserCtxtPtr ctxt, xmlParserInputPtr input)
5887 {
5888     htmlDocPtr ret;
5889 
5890     if ((ctxt == NULL) || (input == NULL))
5891         return(NULL);
5892 
5893     /* assert(ctxt->inputNr == 0); */
5894     while (ctxt->inputNr > 0)
5895         xmlFreeInputStream(inputPop(ctxt));
5896 
5897     if (inputPush(ctxt, input) < 0) {
5898         xmlFreeInputStream(input);
5899         return(NULL);
5900     }
5901 
5902     ctxt->html = 1;
5903     htmlParseDocument(ctxt);
5904 
5905     if (ctxt->errNo != XML_ERR_NO_MEMORY) {
5906         ret = ctxt->myDoc;
5907     } else {
5908         ret = NULL;
5909         xmlFreeDoc(ctxt->myDoc);
5910     }
5911     ctxt->myDoc = NULL;
5912 
5913     /* assert(ctxt->inputNr == 1); */
5914     while (ctxt->inputNr > 0)
5915         xmlFreeInputStream(inputPop(ctxt));
5916 
5917     return(ret);
5918 }
5919 
5920 /**
5921  * htmlReadDoc:
5922  * @str:  a pointer to a zero terminated string
5923  * @url:  only used for error reporting (optoinal)
5924  * @encoding:  the document encoding (optional)
5925  * @options:  a combination of htmlParserOptions
5926  *
5927  * Convenience function to parse an HTML document from a zero-terminated
5928  * string.
5929  *
5930  * See htmlCtxtReadDoc for details.
5931  *
5932  * Returns the resulting document tree.
5933  */
5934 htmlDocPtr
htmlReadDoc(const xmlChar * str,const char * url,const char * encoding,int options)5935 htmlReadDoc(const xmlChar *str, const char *url, const char *encoding,
5936             int options)
5937 {
5938     htmlParserCtxtPtr ctxt;
5939     xmlParserInputPtr input;
5940     htmlDocPtr doc;
5941 
5942     ctxt = htmlNewParserCtxt();
5943     if (ctxt == NULL)
5944         return(NULL);
5945 
5946     htmlCtxtUseOptions(ctxt, options);
5947 
5948     input = xmlCtxtNewInputFromString(ctxt, url, (const char *) str, encoding,
5949                                       XML_INPUT_BUF_STATIC);
5950 
5951     doc = htmlCtxtParseDocument(ctxt, input);
5952 
5953     htmlFreeParserCtxt(ctxt);
5954     return(doc);
5955 }
5956 
5957 /**
5958  * htmlReadFile:
5959  * @filename:  a file or URL
5960  * @encoding:  the document encoding (optional)
5961  * @options:  a combination of htmlParserOptions
5962  *
5963  * Convenience function to parse an HTML file from the filesystem,
5964  * the network or a global user-defined resource loader.
5965  *
5966  * See htmlCtxtReadFile for details.
5967  *
5968  * Returns the resulting document tree.
5969  */
5970 htmlDocPtr
htmlReadFile(const char * filename,const char * encoding,int options)5971 htmlReadFile(const char *filename, const char *encoding, int options)
5972 {
5973     htmlParserCtxtPtr ctxt;
5974     xmlParserInputPtr input;
5975     htmlDocPtr doc;
5976 
5977     ctxt = htmlNewParserCtxt();
5978     if (ctxt == NULL)
5979         return(NULL);
5980 
5981     htmlCtxtUseOptions(ctxt, options);
5982 
5983     input = xmlCtxtNewInputFromUrl(ctxt, filename, NULL, encoding, 0);
5984 
5985     doc = htmlCtxtParseDocument(ctxt, input);
5986 
5987     htmlFreeParserCtxt(ctxt);
5988     return(doc);
5989 }
5990 
5991 /**
5992  * htmlReadMemory:
5993  * @buffer:  a pointer to a char array
5994  * @size:  the size of the array
5995  * @url:  only used for error reporting (optional)
5996  * @encoding:  the document encoding, or NULL
5997  * @options:  a combination of htmlParserOption(s)
5998  *
5999  * Convenience function to parse an HTML document from memory.
6000  * The input buffer must not contain any terminating null bytes.
6001  *
6002  * See htmlCtxtReadMemory for details.
6003  *
6004  * Returns the resulting document tree
6005  */
6006 htmlDocPtr
htmlReadMemory(const char * buffer,int size,const char * url,const char * encoding,int options)6007 htmlReadMemory(const char *buffer, int size, const char *url,
6008                const char *encoding, int options)
6009 {
6010     htmlParserCtxtPtr ctxt;
6011     xmlParserInputPtr input;
6012     htmlDocPtr doc;
6013 
6014     if (size < 0)
6015 	return(NULL);
6016 
6017     ctxt = htmlNewParserCtxt();
6018     if (ctxt == NULL)
6019         return(NULL);
6020 
6021     htmlCtxtUseOptions(ctxt, options);
6022 
6023     input = xmlCtxtNewInputFromMemory(ctxt, url, buffer, size, encoding,
6024                                       XML_INPUT_BUF_STATIC);
6025 
6026     doc = htmlCtxtParseDocument(ctxt, input);
6027 
6028     htmlFreeParserCtxt(ctxt);
6029     return(doc);
6030 }
6031 
6032 /**
6033  * htmlReadFd:
6034  * @fd:  an open file descriptor
6035  * @url:  only used for error reporting (optional)
6036  * @encoding:  the document encoding, or NULL
6037  * @options:  a combination of htmlParserOptions
6038  *
6039  * Convenience function to parse an HTML document from a
6040  * file descriptor.
6041  *
6042  * NOTE that the file descriptor will not be closed when the
6043  * context is freed or reset.
6044  *
6045  * See htmlCtxtReadFd for details.
6046  *
6047  * Returns the resulting document tree
6048  */
6049 htmlDocPtr
htmlReadFd(int fd,const char * url,const char * encoding,int options)6050 htmlReadFd(int fd, const char *url, const char *encoding, int options)
6051 {
6052     htmlParserCtxtPtr ctxt;
6053     xmlParserInputPtr input;
6054     htmlDocPtr doc;
6055 
6056     ctxt = htmlNewParserCtxt();
6057     if (ctxt == NULL)
6058         return(NULL);
6059 
6060     htmlCtxtUseOptions(ctxt, options);
6061 
6062     input = xmlCtxtNewInputFromFd(ctxt, url, fd, encoding, 0);
6063 
6064     doc = htmlCtxtParseDocument(ctxt, input);
6065 
6066     htmlFreeParserCtxt(ctxt);
6067     return(doc);
6068 }
6069 
6070 /**
6071  * htmlReadIO:
6072  * @ioread:  an I/O read function
6073  * @ioclose:  an I/O close function (optional)
6074  * @ioctx:  an I/O handler
6075  * @url:  only used for error reporting (optional)
6076  * @encoding:  the document encoding (optional)
6077  * @options:  a combination of htmlParserOption(s)
6078  *
6079  * Convenience function to parse an HTML document from I/O functions
6080  * and context.
6081  *
6082  * See htmlCtxtReadIO for details.
6083  *
6084  * Returns the resulting document tree
6085  */
6086 htmlDocPtr
htmlReadIO(xmlInputReadCallback ioread,xmlInputCloseCallback ioclose,void * ioctx,const char * url,const char * encoding,int options)6087 htmlReadIO(xmlInputReadCallback ioread, xmlInputCloseCallback ioclose,
6088           void *ioctx, const char *url, const char *encoding, int options)
6089 {
6090     htmlParserCtxtPtr ctxt;
6091     xmlParserInputPtr input;
6092     htmlDocPtr doc;
6093 
6094     ctxt = htmlNewParserCtxt();
6095     if (ctxt == NULL)
6096         return (NULL);
6097 
6098     htmlCtxtUseOptions(ctxt, options);
6099 
6100     input = xmlCtxtNewInputFromIO(ctxt, url, ioread, ioclose, ioctx,
6101                                   encoding, 0);
6102 
6103     doc = htmlCtxtParseDocument(ctxt, input);
6104 
6105     htmlFreeParserCtxt(ctxt);
6106     return(doc);
6107 }
6108 
6109 /**
6110  * htmlCtxtReadDoc:
6111  * @ctxt:  an HTML parser context
6112  * @str:  a pointer to a zero terminated string
6113  * @URL:  only used for error reporting (optional)
6114  * @encoding:  the document encoding (optional)
6115  * @options:  a combination of htmlParserOptions
6116  *
6117  * Parse an HTML in-memory document and build a tree.
6118  *
6119  * See htmlCtxtUseOptions for details.
6120  *
6121  * Returns the resulting document tree
6122  */
6123 htmlDocPtr
htmlCtxtReadDoc(htmlParserCtxtPtr ctxt,const xmlChar * str,const char * URL,const char * encoding,int options)6124 htmlCtxtReadDoc(htmlParserCtxtPtr ctxt, const xmlChar *str,
6125                 const char *URL, const char *encoding, int options)
6126 {
6127     xmlParserInputPtr input;
6128 
6129     if (ctxt == NULL)
6130         return (NULL);
6131 
6132     htmlCtxtReset(ctxt);
6133     htmlCtxtUseOptions(ctxt, options);
6134 
6135     input = xmlCtxtNewInputFromString(ctxt, URL, (const char *) str,
6136                                       encoding, 0);
6137 
6138     return(htmlCtxtParseDocument(ctxt, input));
6139 }
6140 
6141 /**
6142  * htmlCtxtReadFile:
6143  * @ctxt:  an HTML parser context
6144  * @filename:  a file or URL
6145  * @encoding:  the document encoding (optional)
6146  * @options:  a combination of htmlParserOptions
6147  *
6148  * Parse an HTML file from the filesystem, the network or a
6149  * user-defined resource loader.
6150  *
6151  * See htmlCtxtUseOptions for details.
6152  *
6153  * Returns the resulting document tree
6154  */
6155 htmlDocPtr
htmlCtxtReadFile(htmlParserCtxtPtr ctxt,const char * filename,const char * encoding,int options)6156 htmlCtxtReadFile(htmlParserCtxtPtr ctxt, const char *filename,
6157                 const char *encoding, int options)
6158 {
6159     xmlParserInputPtr input;
6160 
6161     if (ctxt == NULL)
6162         return (NULL);
6163 
6164     htmlCtxtReset(ctxt);
6165     htmlCtxtUseOptions(ctxt, options);
6166 
6167     input = xmlCtxtNewInputFromUrl(ctxt, filename, NULL, encoding, 0);
6168 
6169     return(htmlCtxtParseDocument(ctxt, input));
6170 }
6171 
6172 /**
6173  * htmlCtxtReadMemory:
6174  * @ctxt:  an HTML parser context
6175  * @buffer:  a pointer to a char array
6176  * @size:  the size of the array
6177  * @URL:  only used for error reporting (optional)
6178  * @encoding:  the document encoding (optinal)
6179  * @options:  a combination of htmlParserOptions
6180  *
6181  * Parse an HTML in-memory document and build a tree. The input buffer must
6182  * not contain any terminating null bytes.
6183  *
6184  * See htmlCtxtUseOptions for details.
6185  *
6186  * Returns the resulting document tree
6187  */
6188 htmlDocPtr
htmlCtxtReadMemory(htmlParserCtxtPtr ctxt,const char * buffer,int size,const char * URL,const char * encoding,int options)6189 htmlCtxtReadMemory(htmlParserCtxtPtr ctxt, const char *buffer, int size,
6190                   const char *URL, const char *encoding, int options)
6191 {
6192     xmlParserInputPtr input;
6193 
6194     if ((ctxt == NULL) || (size < 0))
6195         return (NULL);
6196 
6197     htmlCtxtReset(ctxt);
6198     htmlCtxtUseOptions(ctxt, options);
6199 
6200     input = xmlCtxtNewInputFromMemory(ctxt, URL, buffer, size, encoding,
6201                                       XML_INPUT_BUF_STATIC);
6202 
6203     return(htmlCtxtParseDocument(ctxt, input));
6204 }
6205 
6206 /**
6207  * htmlCtxtReadFd:
6208  * @ctxt:  an HTML parser context
6209  * @fd:  an open file descriptor
6210  * @URL:  only used for error reporting (optional)
6211  * @encoding:  the document encoding (optinal)
6212  * @options:  a combination of htmlParserOptions
6213  *
6214  * Parse an HTML from a file descriptor and build a tree.
6215  *
6216  * See htmlCtxtUseOptions for details.
6217  *
6218  * NOTE that the file descriptor will not be closed when the
6219  * context is freed or reset.
6220  *
6221  * Returns the resulting document tree
6222  */
6223 htmlDocPtr
htmlCtxtReadFd(htmlParserCtxtPtr ctxt,int fd,const char * URL,const char * encoding,int options)6224 htmlCtxtReadFd(htmlParserCtxtPtr ctxt, int fd,
6225               const char *URL, const char *encoding, int options)
6226 {
6227     xmlParserInputPtr input;
6228 
6229     if (ctxt == NULL)
6230         return(NULL);
6231 
6232     htmlCtxtReset(ctxt);
6233     htmlCtxtUseOptions(ctxt, options);
6234 
6235     input = xmlCtxtNewInputFromFd(ctxt, URL, fd, encoding, 0);
6236 
6237     return(htmlCtxtParseDocument(ctxt, input));
6238 }
6239 
6240 /**
6241  * htmlCtxtReadIO:
6242  * @ctxt:  an HTML parser context
6243  * @ioread:  an I/O read function
6244  * @ioclose:  an I/O close function
6245  * @ioctx:  an I/O handler
6246  * @URL:  the base URL to use for the document
6247  * @encoding:  the document encoding, or NULL
6248  * @options:  a combination of htmlParserOption(s)
6249  *
6250  * Parse an HTML document from I/O functions and source and build a tree.
6251  *
6252  * See htmlCtxtUseOptions for details.
6253  *
6254  * Returns the resulting document tree
6255  */
6256 htmlDocPtr
htmlCtxtReadIO(htmlParserCtxtPtr ctxt,xmlInputReadCallback ioread,xmlInputCloseCallback ioclose,void * ioctx,const char * URL,const char * encoding,int options)6257 htmlCtxtReadIO(htmlParserCtxtPtr ctxt, xmlInputReadCallback ioread,
6258               xmlInputCloseCallback ioclose, void *ioctx,
6259 	      const char *URL,
6260               const char *encoding, int options)
6261 {
6262     xmlParserInputPtr input;
6263 
6264     if (ctxt == NULL)
6265         return (NULL);
6266 
6267     htmlCtxtReset(ctxt);
6268     htmlCtxtUseOptions(ctxt, options);
6269 
6270     input = xmlCtxtNewInputFromIO(ctxt, URL, ioread, ioclose, ioctx,
6271                                   encoding, 0);
6272 
6273     return(htmlCtxtParseDocument(ctxt, input));
6274 }
6275 
6276 #endif /* LIBXML_HTML_ENABLED */
6277