/* xref: /aosp_15_r20/external/cronet/third_party/libxml/src/HTMLparser.c (revision 6777b5387eb2ff775bb5750e3f5d96f37fb7352b) */
/*
 * HTMLparser.c : an HTML 4.0 non-verifying parser
 *
 * See Copyright for the status of this software.
 *
 * daniel@veillard.com
 */

#define IN_LIBXML
#include "libxml.h"
#ifdef LIBXML_HTML_ENABLED

#include <string.h>
#include <ctype.h>
#include <limits.h>
#include <stdlib.h>

#include <libxml/HTMLparser.h>
#include <libxml/xmlmemory.h>
#include <libxml/tree.h>
#include <libxml/parser.h>
#include <libxml/parserInternals.h>
#include <libxml/xmlerror.h>
#include <libxml/HTMLtree.h>
#include <libxml/entities.h>
#include <libxml/encoding.h>
#include <libxml/xmlIO.h>
#include <libxml/uri.h>

#include "private/buf.h"
#include "private/enc.h"
#include "private/error.h"
#include "private/html.h"
#include "private/io.h"
#include "private/parser.h"
#include "private/tree.h"

37 #define HTML_MAX_NAMELEN 1000
38 #define HTML_PARSER_BIG_BUFFER_SIZE 1000
39 #define HTML_PARSER_BUFFER_SIZE 100
40 
41 static int htmlOmittedDefaultValue = 1;
42 
43 xmlChar * htmlDecodeEntities(htmlParserCtxtPtr ctxt, int len,
44 			     xmlChar end, xmlChar  end2, xmlChar end3);
45 static void htmlParseComment(htmlParserCtxtPtr ctxt);
46 
47 /************************************************************************
48  *									*
49  *		Some factorized error routines				*
50  *									*
51  ************************************************************************/
52 
53 /**
54  * htmlErrMemory:
55  * @ctxt:  an HTML parser context
56  * @extra:  extra information
57  *
58  * Handle a redefinition of attribute error
59  */
60 static void
htmlErrMemory(xmlParserCtxtPtr ctxt)61 htmlErrMemory(xmlParserCtxtPtr ctxt)
62 {
63     xmlCtxtErrMemory(ctxt);
64 }
65 
66 /**
67  * htmlParseErr:
68  * @ctxt:  an HTML parser context
69  * @error:  the error number
70  * @msg:  the error message
71  * @str1:  string infor
72  * @str2:  string infor
73  *
74  * Handle a fatal parser error, i.e. violating Well-Formedness constraints
75  */
76 static void LIBXML_ATTR_FORMAT(3,0)
htmlParseErr(xmlParserCtxtPtr ctxt,xmlParserErrors error,const char * msg,const xmlChar * str1,const xmlChar * str2)77 htmlParseErr(xmlParserCtxtPtr ctxt, xmlParserErrors error,
78              const char *msg, const xmlChar *str1, const xmlChar *str2)
79 {
80     xmlCtxtErr(ctxt, NULL, XML_FROM_HTML, error, XML_ERR_ERROR,
81                str1, str2, NULL, 0, msg, str1, str2);
82 }
83 
84 /**
85  * htmlParseErrInt:
86  * @ctxt:  an HTML parser context
87  * @error:  the error number
88  * @msg:  the error message
89  * @val:  integer info
90  *
91  * Handle a fatal parser error, i.e. violating Well-Formedness constraints
92  */
93 static void LIBXML_ATTR_FORMAT(3,0)
htmlParseErrInt(xmlParserCtxtPtr ctxt,xmlParserErrors error,const char * msg,int val)94 htmlParseErrInt(xmlParserCtxtPtr ctxt, xmlParserErrors error,
95              const char *msg, int val)
96 {
97     xmlCtxtErr(ctxt, NULL, XML_FROM_HTML, error, XML_ERR_ERROR,
98                NULL, NULL, NULL, val, msg, val);
99 }
100 
101 /************************************************************************
102  *									*
103  *	Parser stacks related functions and macros		*
104  *									*
105  ************************************************************************/
106 
107 /**
108  * htmlnamePush:
109  * @ctxt:  an HTML parser context
110  * @value:  the element name
111  *
112  * Pushes a new element name on top of the name stack
113  *
114  * Returns -1 in case of error, the index in the stack otherwise
115  */
116 static int
htmlnamePush(htmlParserCtxtPtr ctxt,const xmlChar * value)117 htmlnamePush(htmlParserCtxtPtr ctxt, const xmlChar * value)
118 {
119     if ((ctxt->html < 3) && (xmlStrEqual(value, BAD_CAST "head")))
120         ctxt->html = 3;
121     if ((ctxt->html < 10) && (xmlStrEqual(value, BAD_CAST "body")))
122         ctxt->html = 10;
123     if (ctxt->nameNr >= ctxt->nameMax) {
124         size_t newSize = ctxt->nameMax * 2;
125         const xmlChar **tmp;
126 
127         tmp = xmlRealloc((xmlChar **) ctxt->nameTab,
128                          newSize * sizeof(ctxt->nameTab[0]));
129         if (tmp == NULL) {
130             htmlErrMemory(ctxt);
131             return (-1);
132         }
133         ctxt->nameTab = tmp;
134         ctxt->nameMax = newSize;
135     }
136     ctxt->nameTab[ctxt->nameNr] = value;
137     ctxt->name = value;
138     return (ctxt->nameNr++);
139 }
140 /**
141  * htmlnamePop:
142  * @ctxt: an HTML parser context
143  *
144  * Pops the top element name from the name stack
145  *
146  * Returns the name just removed
147  */
148 static const xmlChar *
htmlnamePop(htmlParserCtxtPtr ctxt)149 htmlnamePop(htmlParserCtxtPtr ctxt)
150 {
151     const xmlChar *ret;
152 
153     if (ctxt->nameNr <= 0)
154         return (NULL);
155     ctxt->nameNr--;
156     if (ctxt->nameNr < 0)
157         return (NULL);
158     if (ctxt->nameNr > 0)
159         ctxt->name = ctxt->nameTab[ctxt->nameNr - 1];
160     else
161         ctxt->name = NULL;
162     ret = ctxt->nameTab[ctxt->nameNr];
163     ctxt->nameTab[ctxt->nameNr] = NULL;
164     return (ret);
165 }
166 
167 /**
168  * htmlNodeInfoPush:
169  * @ctxt:  an HTML parser context
170  * @value:  the node info
171  *
172  * Pushes a new element name on top of the node info stack
173  *
174  * Returns 0 in case of error, the index in the stack otherwise
175  */
176 static int
htmlNodeInfoPush(htmlParserCtxtPtr ctxt,htmlParserNodeInfo * value)177 htmlNodeInfoPush(htmlParserCtxtPtr ctxt, htmlParserNodeInfo *value)
178 {
179     if (ctxt->nodeInfoNr >= ctxt->nodeInfoMax) {
180         if (ctxt->nodeInfoMax == 0)
181                 ctxt->nodeInfoMax = 5;
182         ctxt->nodeInfoMax *= 2;
183         ctxt->nodeInfoTab = (htmlParserNodeInfo *)
184                          xmlRealloc((htmlParserNodeInfo *)ctxt->nodeInfoTab,
185                                     ctxt->nodeInfoMax *
186                                     sizeof(ctxt->nodeInfoTab[0]));
187         if (ctxt->nodeInfoTab == NULL) {
188             htmlErrMemory(ctxt);
189             return (0);
190         }
191     }
192     ctxt->nodeInfoTab[ctxt->nodeInfoNr] = *value;
193     ctxt->nodeInfo = &ctxt->nodeInfoTab[ctxt->nodeInfoNr];
194     return (ctxt->nodeInfoNr++);
195 }
196 
197 /**
198  * htmlNodeInfoPop:
199  * @ctxt:  an HTML parser context
200  *
201  * Pops the top element name from the node info stack
202  *
203  * Returns 0 in case of error, the pointer to NodeInfo otherwise
204  */
205 static htmlParserNodeInfo *
htmlNodeInfoPop(htmlParserCtxtPtr ctxt)206 htmlNodeInfoPop(htmlParserCtxtPtr ctxt)
207 {
208     if (ctxt->nodeInfoNr <= 0)
209         return (NULL);
210     ctxt->nodeInfoNr--;
211     if (ctxt->nodeInfoNr < 0)
212         return (NULL);
213     if (ctxt->nodeInfoNr > 0)
214         ctxt->nodeInfo = &ctxt->nodeInfoTab[ctxt->nodeInfoNr - 1];
215     else
216         ctxt->nodeInfo = NULL;
217     return &ctxt->nodeInfoTab[ctxt->nodeInfoNr];
218 }
219 
/*
 * Macros for accessing the content. Those should be used only by the parser,
 * and not exported. All of them expect a parser context variable named
 * `ctxt` to be in scope.
 *
 * Dirty macros, i.e. one needs to make assumptions on the context to use
 * them:
 *
 *   CUR_PTR  the current pointer to the xmlChar to be parsed.
 *   BASE_PTR the base pointer of the current input.
 *   CUR, RAW the xmlChar at CUR_PTR. Should only be compared to ASCII
 *            values, otherwise it would break with multi-byte encodings.
 *   NXT(n)   the n'th next xmlChar. Same as CUR, it should be used only
 *            to compare on ASCII based substrings.
 *   UPPER    CUR converted to uppercase with toupper().
 *   UPP(n)   the n'th next xmlChar converted to uppercase. Same as CUR,
 *            it should be used only to compare on ASCII based substrings.
 *   SKIP(n)  skip n xmlChars, advancing the column by n as well; must only
 *            be used to skip ASCII strings without newlines.
 *   SHRINK, GROW
 *            call xmlParserShrink() / xmlParserGrow() when the parser is
 *            not progressive and the input window calls for it.
 *            NOTE(review): both expand to a bare `if (...) stmt;`, so they
 *            must be used as a full statement and never directly before
 *            an `else`.
 *
 * Clean macros, not dependent on an ASCII context:
 *
 *   NEXT       skip to the next character via xmlNextChar().
 *   NEXTL(l)   skip the current character, which is l xmlChars long,
 *              updating line and column.
 *   CUR_CHAR(l) decode the current character with htmlCurrentChar() and
 *              store its length in l.
 *   COPY_BUF(l,b,i,v)
 *              append character v of encoded length l to b at index i and
 *              advance i.
 *              NOTE(review): expands to a bare if/else; same statement
 *              restriction as SHRINK/GROW.
 */

#define UPPER (toupper(*ctxt->input->cur))

#define SKIP(val) (ctxt->input->cur += (val), ctxt->input->col += (val))

#define NXT(val) ctxt->input->cur[(val)]

#define UPP(val) (toupper(ctxt->input->cur[(val)]))

#define CUR_PTR ctxt->input->cur
#define BASE_PTR ctxt->input->base

#define SHRINK \
    if ((!PARSER_PROGRESSIVE(ctxt)) && \
        (ctxt->input->cur - ctxt->input->base > 2 * INPUT_CHUNK) && \
	(ctxt->input->end - ctxt->input->cur < 2 * INPUT_CHUNK)) \
	xmlParserShrink(ctxt);

#define GROW \
    if ((!PARSER_PROGRESSIVE(ctxt)) && \
        (ctxt->input->end - ctxt->input->cur < INPUT_CHUNK)) \
	xmlParserGrow(ctxt);

#define SKIP_BLANKS htmlSkipBlankChars(ctxt)

/* Imported from XML */

#define CUR (*ctxt->input->cur)
#define NEXT xmlNextChar(ctxt)

#define RAW (*ctxt->input->cur)


#define NEXTL(l) do {							\
    if (*(ctxt->input->cur) == '\n') {					\
	ctxt->input->line++; ctxt->input->col = 1;			\
    } else ctxt->input->col++;						\
    ctxt->input->cur += (l);						\
  } while (0)

#define CUR_CHAR(l) htmlCurrentChar(ctxt, &(l))

#define COPY_BUF(l,b,i,v)						\
    if (l == 1) b[i++] = v;						\
    else i += xmlCopyChar(l,&b[i],v)

298 /**
299  * htmlFindEncoding:
300  * @the HTML parser context
301  *
302  * Ty to find and encoding in the current data available in the input
303  * buffer this is needed to try to switch to the proper encoding when
304  * one face a character error.
305  * That's an heuristic, since it's operating outside of parsing it could
306  * try to use a meta which had been commented out, that's the reason it
307  * should only be used in case of error, not as a default.
308  *
309  * Returns an encoding string or NULL if not found, the string need to
310  *   be freed
311  */
312 static xmlChar *
htmlFindEncoding(xmlParserCtxtPtr ctxt)313 htmlFindEncoding(xmlParserCtxtPtr ctxt) {
314     const xmlChar *start, *cur, *end;
315     xmlChar *ret;
316 
317     if ((ctxt == NULL) || (ctxt->input == NULL) ||
318         (ctxt->input->flags & XML_INPUT_HAS_ENCODING))
319         return(NULL);
320     if ((ctxt->input->cur == NULL) || (ctxt->input->end == NULL))
321         return(NULL);
322 
323     start = ctxt->input->cur;
324     end = ctxt->input->end;
325     /* we also expect the input buffer to be zero terminated */
326     if (*end != 0)
327         return(NULL);
328 
329     cur = xmlStrcasestr(start, BAD_CAST "HTTP-EQUIV");
330     if (cur == NULL)
331         return(NULL);
332     cur = xmlStrcasestr(cur, BAD_CAST  "CONTENT");
333     if (cur == NULL)
334         return(NULL);
335     cur = xmlStrcasestr(cur, BAD_CAST  "CHARSET=");
336     if (cur == NULL)
337         return(NULL);
338     cur += 8;
339     start = cur;
340     while (((*cur >= 'A') && (*cur <= 'Z')) ||
341            ((*cur >= 'a') && (*cur <= 'z')) ||
342            ((*cur >= '0') && (*cur <= '9')) ||
343            (*cur == '-') || (*cur == '_') || (*cur == ':') || (*cur == '/'))
344            cur++;
345     if (cur == start)
346         return(NULL);
347     ret = xmlStrndup(start, cur - start);
348     if (ret == NULL)
349         htmlErrMemory(ctxt);
350     return(ret);
351 }
352 
353 /**
354  * htmlCurrentChar:
355  * @ctxt:  the HTML parser context
356  * @len:  pointer to the length of the char read
357  *
358  * The current char value, if using UTF-8 this may actually span multiple
359  * bytes in the input buffer. Implement the end of line normalization:
360  * 2.11 End-of-Line Handling
361  * If the encoding is unspecified, in the case we find an ISO-Latin-1
362  * char, then the encoding converter is plugged in automatically.
363  *
364  * Returns the current char value and its length
365  */
366 
367 static int
htmlCurrentChar(xmlParserCtxtPtr ctxt,int * len)368 htmlCurrentChar(xmlParserCtxtPtr ctxt, int *len) {
369     const unsigned char *cur;
370     unsigned char c;
371     unsigned int val;
372 
373     if (ctxt->input->end - ctxt->input->cur < INPUT_CHUNK)
374         xmlParserGrow(ctxt);
375 
376     if ((ctxt->input->flags & XML_INPUT_HAS_ENCODING) == 0) {
377         xmlChar * guess;
378 
379         /*
380          * Assume it's a fixed length encoding (1) with
381          * a compatible encoding for the ASCII set, since
382          * HTML constructs only use < 128 chars
383          */
384         if (*ctxt->input->cur < 0x80) {
385             if (*ctxt->input->cur == 0) {
386                 if (ctxt->input->cur < ctxt->input->end) {
387                     htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
388                                     "Char 0x%X out of allowed range\n", 0);
389                     *len = 1;
390                     return(' ');
391                 } else {
392                     *len = 0;
393                     return(0);
394                 }
395             }
396             *len = 1;
397             return(*ctxt->input->cur);
398         }
399 
400         /*
401          * Humm this is bad, do an automatic flow conversion
402          */
403         guess = htmlFindEncoding(ctxt);
404         if (guess == NULL) {
405             xmlSwitchEncoding(ctxt, XML_CHAR_ENCODING_8859_1);
406         } else {
407             xmlSwitchEncodingName(ctxt, (const char *) guess);
408             xmlFree(guess);
409         }
410         ctxt->input->flags |= XML_INPUT_HAS_ENCODING;
411     }
412 
413     /*
414      * We are supposed to handle UTF8, check it's valid
415      * From rfc2044: encoding of the Unicode values on UTF-8:
416      *
417      * UCS-4 range (hex.)           UTF-8 octet sequence (binary)
418      * 0000 0000-0000 007F   0xxxxxxx
419      * 0000 0080-0000 07FF   110xxxxx 10xxxxxx
420      * 0000 0800-0000 FFFF   1110xxxx 10xxxxxx 10xxxxxx
421      *
422      * Check for the 0x110000 limit too
423      */
424     cur = ctxt->input->cur;
425     c = *cur;
426     if (c & 0x80) {
427         size_t avail;
428 
429         if ((c & 0x40) == 0)
430             goto encoding_error;
431 
432         avail = ctxt->input->end - ctxt->input->cur;
433 
434         if ((avail < 2) || ((cur[1] & 0xc0) != 0x80))
435             goto encoding_error;
436         if ((c & 0xe0) == 0xe0) {
437             if ((avail < 3) || ((cur[2] & 0xc0) != 0x80))
438                 goto encoding_error;
439             if ((c & 0xf0) == 0xf0) {
440                 if (((c & 0xf8) != 0xf0) ||
441                     (avail < 4) || ((cur[3] & 0xc0) != 0x80))
442                     goto encoding_error;
443                 /* 4-byte code */
444                 *len = 4;
445                 val = (cur[0] & 0x7) << 18;
446                 val |= (cur[1] & 0x3f) << 12;
447                 val |= (cur[2] & 0x3f) << 6;
448                 val |= cur[3] & 0x3f;
449                 if (val < 0x10000)
450                     goto encoding_error;
451             } else {
452               /* 3-byte code */
453                 *len = 3;
454                 val = (cur[0] & 0xf) << 12;
455                 val |= (cur[1] & 0x3f) << 6;
456                 val |= cur[2] & 0x3f;
457                 if (val < 0x800)
458                     goto encoding_error;
459             }
460         } else {
461           /* 2-byte code */
462             *len = 2;
463             val = (cur[0] & 0x1f) << 6;
464             val |= cur[1] & 0x3f;
465             if (val < 0x80)
466                 goto encoding_error;
467         }
468         if (!IS_CHAR(val)) {
469             htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
470                             "Char 0x%X out of allowed range\n", val);
471         }
472         return(val);
473     } else {
474         if (*ctxt->input->cur == 0) {
475             if (ctxt->input->cur < ctxt->input->end) {
476                 htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
477                                 "Char 0x%X out of allowed range\n", 0);
478                 *len = 1;
479                 return(' ');
480             } else {
481                 *len = 0;
482                 return(0);
483             }
484         }
485         /* 1-byte code */
486         *len = 1;
487         return(*ctxt->input->cur);
488     }
489 
490 encoding_error:
491     xmlCtxtErrIO(ctxt, XML_ERR_INVALID_ENCODING, NULL);
492 
493     if ((ctxt->input->flags & XML_INPUT_HAS_ENCODING) == 0)
494         xmlSwitchEncoding(ctxt, XML_CHAR_ENCODING_8859_1);
495     *len = 1;
496     return(*ctxt->input->cur);
497 }
498 
499 /**
500  * htmlSkipBlankChars:
501  * @ctxt:  the HTML parser context
502  *
503  * skip all blanks character found at that point in the input streams.
504  *
505  * Returns the number of space chars skipped
506  */
507 
508 static int
htmlSkipBlankChars(xmlParserCtxtPtr ctxt)509 htmlSkipBlankChars(xmlParserCtxtPtr ctxt) {
510     int res = 0;
511 
512     while (IS_BLANK_CH(*(ctxt->input->cur))) {
513         if (*(ctxt->input->cur) == '\n') {
514             ctxt->input->line++; ctxt->input->col = 1;
515         } else ctxt->input->col++;
516         ctxt->input->cur++;
517         if (*ctxt->input->cur == 0)
518             xmlParserGrow(ctxt);
519 	if (res < INT_MAX)
520 	    res++;
521     }
522     return(res);
523 }
524 
525 
526 
/************************************************************************
 *									*
 *	The list of HTML elements and their properties		*
 *									*
 ************************************************************************/

/*
 *  Start Tag: 1 means the start tag can be omitted
 *  End Tag:   1 means the end tag can be omitted
 *             2 means it's forbidden (empty elements)
 *             3 means the tag is stylistic and should be closed easily
 *  Depr:      this element is deprecated
 *  DTD:       1 means that this element is valid only in the Loose DTD
 *             2 means that this element is valid only in the Frameset DTD
 *
 * Name,Start Tag,End Tag,Save End,Empty,Deprecated,DTD,inline,Description
	, subElements , impliedsubelt , Attributes, userdata
 */

/*
 * Definitions and a couple of vars for HTML Elements.
 *
 * Each group macro expands to a comma-separated list of element names and
 * is paired with an NB_* macro giving the number of names in the list.
 * The NB_* sums are parenthesized so they can be used safely inside
 * larger expressions.
 */

#define FONTSTYLE "tt", "i", "b", "u", "s", "strike", "big", "small"
#define NB_FONTSTYLE 8
#define PHRASE "em", "strong", "dfn", "code", "samp", "kbd", "var", "cite", "abbr", "acronym"
#define NB_PHRASE 10
#define SPECIAL "a", "img", "applet", "embed", "object", "font", "basefont", "br", "script", "map", "q", "sub", "sup", "span", "bdo", "iframe"
#define NB_SPECIAL 16
#define INLINE FONTSTYLE, PHRASE, SPECIAL, FORMCTRL
#define NB_INLINE (NB_PCDATA + NB_FONTSTYLE + NB_PHRASE + NB_SPECIAL + NB_FORMCTRL)
#define BLOCK HEADING, LIST, "pre", "p", "dl", "div", "center", "noscript", "noframes", "blockquote", "form", "isindex", "hr", "table", "fieldset", "address"
#define NB_BLOCK (NB_HEADING + NB_LIST + 14)
#define FORMCTRL "input", "select", "textarea", "label", "button"
#define NB_FORMCTRL 5
#define PCDATA
#define NB_PCDATA 0
#define HEADING "h1", "h2", "h3", "h4", "h5", "h6"
#define NB_HEADING 6
#define LIST "ul", "ol", "dir", "menu"
#define NB_LIST 4
#define MODIFIER
#define NB_MODIFIER 0
#define FLOW BLOCK,INLINE
#define NB_FLOW (NB_BLOCK + NB_INLINE)
#define EMPTY NULL


/* NULL-terminated element name lists */
static const char* const html_flow[] = { FLOW, NULL } ;
static const char* const html_inline[] = { INLINE, NULL } ;

/* placeholders: elts with content but no subelements */
static const char* const html_pcdata[] = { NULL } ;
#define html_cdata html_pcdata


/*
 * ... and for HTML Attributes.
 *
 * Same layout as the element groups: each list macro is paired with an
 * NB_* macro giving the number of names it expands to.
 */

#define COREATTRS "id", "class", "style", "title"
#define NB_COREATTRS 4
#define I18N "lang", "dir"
#define NB_I18N 2
#define EVENTS "onclick", "ondblclick", "onmousedown", "onmouseup", "onmouseover", "onmouseout", "onkeypress", "onkeydown", "onkeyup"
#define NB_EVENTS 9
#define ATTRS COREATTRS,I18N,EVENTS
#define NB_ATTRS (NB_COREATTRS + NB_I18N + NB_EVENTS)
#define CELLHALIGN "align", "char", "charoff"
#define NB_CELLHALIGN 3
#define CELLVALIGN "valign"
#define NB_CELLVALIGN 1

/* NULL-terminated attribute name lists */
static const char* const html_attrs[] = { ATTRS, NULL } ;
static const char* const core_i18n_attrs[] = { COREATTRS, I18N, NULL } ;
static const char* const core_attrs[] = { COREATTRS, NULL } ;
static const char* const i18n_attrs[] = { I18N, NULL } ;


602 /* Other declarations that should go inline ... */
603 static const char* const a_attrs[] = { ATTRS, "charset", "type", "name",
604 	"href", "hreflang", "rel", "rev", "accesskey", "shape", "coords",
605 	"tabindex", "onfocus", "onblur", NULL } ;
606 static const char* const target_attr[] = { "target", NULL } ;
607 static const char* const rows_cols_attr[] = { "rows", "cols", NULL } ;
608 static const char* const alt_attr[] = { "alt", NULL } ;
609 static const char* const src_alt_attrs[] = { "src", "alt", NULL } ;
610 static const char* const href_attrs[] = { "href", NULL } ;
611 static const char* const clear_attrs[] = { "clear", NULL } ;
612 static const char* const inline_p[] = { INLINE, "p", NULL } ;
613 
614 static const char* const flow_param[] = { FLOW, "param", NULL } ;
615 static const char* const applet_attrs[] = { COREATTRS , "codebase",
616 		"archive", "alt", "name", "height", "width", "align",
617 		"hspace", "vspace", NULL } ;
618 static const char* const area_attrs[] = { "shape", "coords", "href", "nohref",
619 	"tabindex", "accesskey", "onfocus", "onblur", NULL } ;
620 static const char* const basefont_attrs[] =
621 	{ "id", "size", "color", "face", NULL } ;
622 static const char* const quote_attrs[] = { ATTRS, "cite", NULL } ;
623 static const char* const body_contents[] = { FLOW, "ins", "del", NULL } ;
624 static const char* const body_attrs[] = { ATTRS, "onload", "onunload", NULL } ;
625 static const char* const body_depr[] = { "background", "bgcolor", "text",
626 	"link", "vlink", "alink", NULL } ;
627 static const char* const button_attrs[] = { ATTRS, "name", "value", "type",
628 	"disabled", "tabindex", "accesskey", "onfocus", "onblur", NULL } ;
629 
630 
631 static const char* const col_attrs[] = { ATTRS, "span", "width", CELLHALIGN, CELLVALIGN, NULL } ;
632 static const char* const col_elt[] = { "col", NULL } ;
633 static const char* const edit_attrs[] = { ATTRS, "datetime", "cite", NULL } ;
634 static const char* const compact_attrs[] = { ATTRS, "compact", NULL } ;
635 static const char* const dl_contents[] = { "dt", "dd", NULL } ;
636 static const char* const compact_attr[] = { "compact", NULL } ;
637 static const char* const label_attr[] = { "label", NULL } ;
638 static const char* const fieldset_contents[] = { FLOW, "legend" } ;
639 static const char* const font_attrs[] = { COREATTRS, I18N, "size", "color", "face" , NULL } ;
640 static const char* const form_contents[] = { HEADING, LIST, INLINE, "pre", "p", "div", "center", "noscript", "noframes", "blockquote", "isindex", "hr", "table", "fieldset", "address", NULL } ;
641 static const char* const form_attrs[] = { ATTRS, "method", "enctype", "accept", "name", "onsubmit", "onreset", "accept-charset", NULL } ;
642 static const char* const frame_attrs[] = { COREATTRS, "longdesc", "name", "src", "frameborder", "marginwidth", "marginheight", "noresize", "scrolling" , NULL } ;
643 static const char* const frameset_attrs[] = { COREATTRS, "rows", "cols", "onload", "onunload", NULL } ;
644 static const char* const frameset_contents[] = { "frameset", "frame", "noframes", NULL } ;
645 static const char* const head_attrs[] = { I18N, "profile", NULL } ;
646 static const char* const head_contents[] = { "title", "isindex", "base", "script", "style", "meta", "link", "object", NULL } ;
647 static const char* const hr_depr[] = { "align", "noshade", "size", "width", NULL } ;
648 static const char* const version_attr[] = { "version", NULL } ;
649 static const char* const html_content[] = { "head", "body", "frameset", NULL } ;
650 static const char* const iframe_attrs[] = { COREATTRS, "longdesc", "name", "src", "frameborder", "marginwidth", "marginheight", "scrolling", "align", "height", "width", NULL } ;
651 static const char* const img_attrs[] = { ATTRS, "longdesc", "name", "height", "width", "usemap", "ismap", NULL } ;
652 static const char* const embed_attrs[] = { COREATTRS, "align", "alt", "border", "code", "codebase", "frameborder", "height", "hidden", "hspace", "name", "palette", "pluginspace", "pluginurl", "src", "type", "units", "vspace", "width", NULL } ;
653 static const char* const input_attrs[] = { ATTRS, "type", "name", "value", "checked", "disabled", "readonly", "size", "maxlength", "src", "alt", "usemap", "ismap", "tabindex", "accesskey", "onfocus", "onblur", "onselect", "onchange", "accept", NULL } ;
654 static const char* const prompt_attrs[] = { COREATTRS, I18N, "prompt", NULL } ;
655 static const char* const label_attrs[] = { ATTRS, "for", "accesskey", "onfocus", "onblur", NULL } ;
656 static const char* const legend_attrs[] = { ATTRS, "accesskey", NULL } ;
657 static const char* const align_attr[] = { "align", NULL } ;
658 static const char* const link_attrs[] = { ATTRS, "charset", "href", "hreflang", "type", "rel", "rev", "media", NULL } ;
659 static const char* const map_contents[] = { BLOCK, "area", NULL } ;
660 static const char* const name_attr[] = { "name", NULL } ;
661 static const char* const action_attr[] = { "action", NULL } ;
662 static const char* const blockli_elt[] = { BLOCK, "li", NULL } ;
663 static const char* const meta_attrs[] = { I18N, "http-equiv", "name", "scheme", "charset", NULL } ;
664 static const char* const content_attr[] = { "content", NULL } ;
665 static const char* const type_attr[] = { "type", NULL } ;
666 static const char* const noframes_content[] = { "body", FLOW MODIFIER, NULL } ;
667 static const char* const object_contents[] = { FLOW, "param", NULL } ;
668 static const char* const object_attrs[] = { ATTRS, "declare", "classid", "codebase", "data", "type", "codetype", "archive", "standby", "height", "width", "usemap", "name", "tabindex", NULL } ;
669 static const char* const object_depr[] = { "align", "border", "hspace", "vspace", NULL } ;
670 static const char* const ol_attrs[] = { "type", "compact", "start", NULL} ;
671 static const char* const option_elt[] = { "option", NULL } ;
672 static const char* const optgroup_attrs[] = { ATTRS, "disabled", NULL } ;
673 static const char* const option_attrs[] = { ATTRS, "disabled", "label", "selected", "value", NULL } ;
674 static const char* const param_attrs[] = { "id", "value", "valuetype", "type", NULL } ;
675 static const char* const width_attr[] = { "width", NULL } ;
676 static const char* const pre_content[] = { PHRASE, "tt", "i", "b", "u", "s", "strike", "a", "br", "script", "map", "q", "span", "bdo", "iframe", NULL } ;
677 static const char* const script_attrs[] = { "charset", "src", "defer", "event", "for", NULL } ;
678 static const char* const language_attr[] = { "language", NULL } ;
679 static const char* const select_content[] = { "optgroup", "option", NULL } ;
680 static const char* const select_attrs[] = { ATTRS, "name", "size", "multiple", "disabled", "tabindex", "onfocus", "onblur", "onchange", NULL } ;
681 static const char* const style_attrs[] = { I18N, "media", "title", NULL } ;
682 static const char* const table_attrs[] = { ATTRS, "summary", "width", "border", "frame", "rules", "cellspacing", "cellpadding", "datapagesize", NULL } ;
683 static const char* const table_depr[] = { "align", "bgcolor", NULL } ;
684 static const char* const table_contents[] = { "caption", "col", "colgroup", "thead", "tfoot", "tbody", "tr", NULL} ;
685 static const char* const tr_elt[] = { "tr", NULL } ;
686 static const char* const talign_attrs[] = { ATTRS, CELLHALIGN, CELLVALIGN, NULL} ;
687 static const char* const th_td_depr[] = { "nowrap", "bgcolor", "width", "height", NULL } ;
688 static const char* const th_td_attr[] = { ATTRS, "abbr", "axis", "headers", "scope", "rowspan", "colspan", CELLHALIGN, CELLVALIGN, NULL } ;
689 static const char* const textarea_attrs[] = { ATTRS, "name", "disabled", "readonly", "tabindex", "accesskey", "onfocus", "onblur", "onselect", "onchange", NULL } ;
690 static const char* const tr_contents[] = { "th", "td", NULL } ;
691 static const char* const bgcolor_attr[] = { "bgcolor", NULL } ;
692 static const char* const li_elt[] = { "li", NULL } ;
693 static const char* const ul_depr[] = { "type", "compact", NULL} ;
694 static const char* const dir_attr[] = { "dir", NULL} ;
695 
696 #define DECL (const char**)
697 
698 static const htmlElemDesc
699 html40ElementTable[] = {
700 { "a",		0, 0, 0, 0, 0, 0, 1, "anchor ",
701 	DECL html_inline , NULL , DECL a_attrs , DECL target_attr, NULL
702 },
703 { "abbr",	0, 0, 0, 0, 0, 0, 1, "abbreviated form",
704 	DECL html_inline , NULL , DECL html_attrs, NULL, NULL
705 },
706 { "acronym",	0, 0, 0, 0, 0, 0, 1, "",
707 	DECL html_inline , NULL , DECL html_attrs, NULL, NULL
708 },
709 { "address",	0, 0, 0, 0, 0, 0, 0, "information on author ",
710 	DECL inline_p  , NULL , DECL html_attrs, NULL, NULL
711 },
712 { "applet",	0, 0, 0, 0, 1, 1, 2, "java applet ",
713 	DECL flow_param , NULL , NULL , DECL applet_attrs, NULL
714 },
715 { "area",	0, 2, 2, 1, 0, 0, 0, "client-side image map area ",
716 	EMPTY ,  NULL , DECL area_attrs , DECL target_attr, DECL alt_attr
717 },
718 { "b",		0, 3, 0, 0, 0, 0, 1, "bold text style",
719 	DECL html_inline , NULL , DECL html_attrs, NULL, NULL
720 },
721 { "base",	0, 2, 2, 1, 0, 0, 0, "document base uri ",
722 	EMPTY , NULL , NULL , DECL target_attr, DECL href_attrs
723 },
724 { "basefont",	0, 2, 2, 1, 1, 1, 1, "base font size " ,
725 	EMPTY , NULL , NULL, DECL basefont_attrs, NULL
726 },
727 { "bdo",	0, 0, 0, 0, 0, 0, 1, "i18n bidi over-ride ",
728 	DECL html_inline , NULL , DECL core_i18n_attrs, NULL, DECL dir_attr
729 },
730 { "big",	0, 3, 0, 0, 0, 0, 1, "large text style",
731 	DECL html_inline , NULL , DECL html_attrs, NULL, NULL
732 },
733 { "blockquote",	0, 0, 0, 0, 0, 0, 0, "long quotation ",
734 	DECL html_flow , NULL , DECL quote_attrs , NULL, NULL
735 },
736 { "body",	1, 1, 0, 0, 0, 0, 0, "document body ",
737 	DECL body_contents , "div" , DECL body_attrs, DECL body_depr, NULL
738 },
739 { "br",		0, 2, 2, 1, 0, 0, 1, "forced line break ",
740 	EMPTY , NULL , DECL core_attrs, DECL clear_attrs , NULL
741 },
742 { "button",	0, 0, 0, 0, 0, 0, 2, "push button ",
743 	DECL html_flow MODIFIER , NULL , DECL button_attrs, NULL, NULL
744 },
745 { "caption",	0, 0, 0, 0, 0, 0, 0, "table caption ",
746 	DECL html_inline , NULL , DECL html_attrs, NULL, NULL
747 },
748 { "center",	0, 3, 0, 0, 1, 1, 0, "shorthand for div align=center ",
749 	DECL html_flow , NULL , NULL, DECL html_attrs, NULL
750 },
751 { "cite",	0, 0, 0, 0, 0, 0, 1, "citation",
752 	DECL html_inline , NULL , DECL html_attrs, NULL, NULL
753 },
754 { "code",	0, 0, 0, 0, 0, 0, 1, "computer code fragment",
755 	DECL html_inline , NULL , DECL html_attrs, NULL, NULL
756 },
757 { "col",	0, 2, 2, 1, 0, 0, 0, "table column ",
758 	EMPTY , NULL , DECL col_attrs , NULL, NULL
759 },
760 { "colgroup",	0, 1, 0, 0, 0, 0, 0, "table column group ",
761 	DECL col_elt , "col" , DECL col_attrs , NULL, NULL
762 },
763 { "dd",		0, 1, 0, 0, 0, 0, 0, "definition description ",
764 	DECL html_flow , NULL , DECL html_attrs, NULL, NULL
765 },
766 { "del",	0, 0, 0, 0, 0, 0, 2, "deleted text ",
767 	DECL html_flow , NULL , DECL edit_attrs , NULL, NULL
768 },
769 { "dfn",	0, 0, 0, 0, 0, 0, 1, "instance definition",
770 	DECL html_inline , NULL , DECL html_attrs, NULL, NULL
771 },
772 { "dir",	0, 0, 0, 0, 1, 1, 0, "directory list",
773 	DECL blockli_elt, "li" , NULL, DECL compact_attrs, NULL
774 },
775 { "div",	0, 0, 0, 0, 0, 0, 0, "generic language/style container",
776 	DECL html_flow, NULL, DECL html_attrs, DECL align_attr, NULL
777 },
778 { "dl",		0, 0, 0, 0, 0, 0, 0, "definition list ",
779 	DECL dl_contents , "dd" , DECL html_attrs, DECL compact_attr, NULL
780 },
781 { "dt",		0, 1, 0, 0, 0, 0, 0, "definition term ",
782 	DECL html_inline, NULL, DECL html_attrs, NULL, NULL
783 },
784 { "em",		0, 3, 0, 0, 0, 0, 1, "emphasis",
785 	DECL html_inline, NULL, DECL html_attrs, NULL, NULL
786 },
787 { "embed",	0, 1, 0, 0, 1, 1, 1, "generic embedded object ",
788 	EMPTY, NULL, DECL embed_attrs, NULL, NULL
789 },
790 { "fieldset",	0, 0, 0, 0, 0, 0, 0, "form control group ",
791 	DECL fieldset_contents , NULL, DECL html_attrs, NULL, NULL
792 },
793 { "font",	0, 3, 0, 0, 1, 1, 1, "local change to font ",
794 	DECL html_inline, NULL, NULL, DECL font_attrs, NULL
795 },
796 { "form",	0, 0, 0, 0, 0, 0, 0, "interactive form ",
797 	DECL form_contents, "fieldset", DECL form_attrs , DECL target_attr, DECL action_attr
798 },
799 { "frame",	0, 2, 2, 1, 0, 2, 0, "subwindow " ,
800 	EMPTY, NULL, NULL, DECL frame_attrs, NULL
801 },
802 { "frameset",	0, 0, 0, 0, 0, 2, 0, "window subdivision" ,
803 	DECL frameset_contents, "noframes" , NULL , DECL frameset_attrs, NULL
804 },
805 { "h1",		0, 0, 0, 0, 0, 0, 0, "heading ",
806 	DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
807 },
808 { "h2",		0, 0, 0, 0, 0, 0, 0, "heading ",
809 	DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
810 },
811 { "h3",		0, 0, 0, 0, 0, 0, 0, "heading ",
812 	DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
813 },
814 { "h4",		0, 0, 0, 0, 0, 0, 0, "heading ",
815 	DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
816 },
817 { "h5",		0, 0, 0, 0, 0, 0, 0, "heading ",
818 	DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
819 },
820 { "h6",		0, 0, 0, 0, 0, 0, 0, "heading ",
821 	DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
822 },
823 { "head",	1, 1, 0, 0, 0, 0, 0, "document head ",
824 	DECL head_contents, NULL, DECL head_attrs, NULL, NULL
825 },
826 { "hr",		0, 2, 2, 1, 0, 0, 0, "horizontal rule " ,
827 	EMPTY, NULL, DECL html_attrs, DECL hr_depr, NULL
828 },
829 { "html",	1, 1, 0, 0, 0, 0, 0, "document root element ",
830 	DECL html_content , NULL , DECL i18n_attrs, DECL version_attr, NULL
831 },
832 { "i",		0, 3, 0, 0, 0, 0, 1, "italic text style",
833 	DECL html_inline, NULL, DECL html_attrs, NULL, NULL
834 },
835 { "iframe",	0, 0, 0, 0, 0, 1, 2, "inline subwindow ",
836 	DECL html_flow, NULL, NULL, DECL iframe_attrs, NULL
837 },
838 { "img",	0, 2, 2, 1, 0, 0, 1, "embedded image ",
839 	EMPTY, NULL, DECL img_attrs, DECL align_attr, DECL src_alt_attrs
840 },
841 { "input",	0, 2, 2, 1, 0, 0, 1, "form control ",
842 	EMPTY, NULL, DECL input_attrs , DECL align_attr, NULL
843 },
844 { "ins",	0, 0, 0, 0, 0, 0, 2, "inserted text",
845 	DECL html_flow, NULL, DECL edit_attrs, NULL, NULL
846 },
847 { "isindex",	0, 2, 2, 1, 1, 1, 0, "single line prompt ",
848 	EMPTY, NULL, NULL, DECL prompt_attrs, NULL
849 },
850 { "kbd",	0, 0, 0, 0, 0, 0, 1, "text to be entered by the user",
851 	DECL html_inline, NULL, DECL html_attrs, NULL, NULL
852 },
853 { "label",	0, 0, 0, 0, 0, 0, 1, "form field label text ",
854 	DECL html_inline MODIFIER, NULL, DECL label_attrs , NULL, NULL
855 },
856 { "legend",	0, 0, 0, 0, 0, 0, 0, "fieldset legend ",
857 	DECL html_inline, NULL, DECL legend_attrs , DECL align_attr, NULL
858 },
859 { "li",		0, 1, 1, 0, 0, 0, 0, "list item ",
860 	DECL html_flow, NULL, DECL html_attrs, NULL, NULL
861 },
862 { "link",	0, 2, 2, 1, 0, 0, 0, "a media-independent link ",
863 	EMPTY, NULL, DECL link_attrs, DECL target_attr, NULL
864 },
865 { "map",	0, 0, 0, 0, 0, 0, 2, "client-side image map ",
866 	DECL map_contents , NULL, DECL html_attrs , NULL, DECL name_attr
867 },
868 { "menu",	0, 0, 0, 0, 1, 1, 0, "menu list ",
869 	DECL blockli_elt , NULL, NULL, DECL compact_attrs, NULL
870 },
871 { "meta",	0, 2, 2, 1, 0, 0, 0, "generic metainformation ",
872 	EMPTY, NULL, DECL meta_attrs , NULL , DECL content_attr
873 },
874 { "noframes",	0, 0, 0, 0, 0, 2, 0, "alternate content container for non frame-based rendering ",
875 	DECL noframes_content, "body" , DECL html_attrs, NULL, NULL
876 },
877 { "noscript",	0, 0, 0, 0, 0, 0, 0, "alternate content container for non script-based rendering ",
878 	DECL html_flow, "div", DECL html_attrs, NULL, NULL
879 },
880 { "object",	0, 0, 0, 0, 0, 0, 2, "generic embedded object ",
881 	DECL object_contents , "div" , DECL object_attrs, DECL object_depr, NULL
882 },
883 { "ol",		0, 0, 0, 0, 0, 0, 0, "ordered list ",
884 	DECL li_elt , "li" , DECL html_attrs, DECL ol_attrs, NULL
885 },
886 { "optgroup",	0, 0, 0, 0, 0, 0, 0, "option group ",
887 	DECL option_elt , "option", DECL optgroup_attrs, NULL, DECL label_attr
888 },
889 { "option",	0, 1, 0, 0, 0, 0, 0, "selectable choice " ,
890 	DECL html_pcdata, NULL, DECL option_attrs, NULL, NULL
891 },
892 { "p",		0, 1, 0, 0, 0, 0, 0, "paragraph ",
893 	DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
894 },
895 { "param",	0, 2, 2, 1, 0, 0, 0, "named property value ",
896 	EMPTY, NULL, DECL param_attrs, NULL, DECL name_attr
897 },
898 { "pre",	0, 0, 0, 0, 0, 0, 0, "preformatted text ",
899 	DECL pre_content, NULL, DECL html_attrs, DECL width_attr, NULL
900 },
901 { "q",		0, 0, 0, 0, 0, 0, 1, "short inline quotation ",
902 	DECL html_inline, NULL, DECL quote_attrs, NULL, NULL
903 },
904 { "s",		0, 3, 0, 0, 1, 1, 1, "strike-through text style",
905 	DECL html_inline, NULL, NULL, DECL html_attrs, NULL
906 },
907 { "samp",	0, 0, 0, 0, 0, 0, 1, "sample program output, scripts, etc.",
908 	DECL html_inline, NULL, DECL html_attrs, NULL, NULL
909 },
910 { "script",	0, 0, 0, 0, 0, 0, 2, "script statements ",
911 	DECL html_cdata, NULL, DECL script_attrs, DECL language_attr, DECL type_attr
912 },
913 { "select",	0, 0, 0, 0, 0, 0, 1, "option selector ",
914 	DECL select_content, NULL, DECL select_attrs, NULL, NULL
915 },
916 { "small",	0, 3, 0, 0, 0, 0, 1, "small text style",
917 	DECL html_inline, NULL, DECL html_attrs, NULL, NULL
918 },
919 { "span",	0, 0, 0, 0, 0, 0, 1, "generic language/style container ",
920 	DECL html_inline, NULL, DECL html_attrs, NULL, NULL
921 },
922 { "strike",	0, 3, 0, 0, 1, 1, 1, "strike-through text",
923 	DECL html_inline, NULL, NULL, DECL html_attrs, NULL
924 },
925 { "strong",	0, 3, 0, 0, 0, 0, 1, "strong emphasis",
926 	DECL html_inline, NULL, DECL html_attrs, NULL, NULL
927 },
928 { "style",	0, 0, 0, 0, 0, 0, 0, "style info ",
929 	DECL html_cdata, NULL, DECL style_attrs, NULL, DECL type_attr
930 },
931 { "sub",	0, 3, 0, 0, 0, 0, 1, "subscript",
932 	DECL html_inline, NULL, DECL html_attrs, NULL, NULL
933 },
934 { "sup",	0, 3, 0, 0, 0, 0, 1, "superscript ",
935 	DECL html_inline, NULL, DECL html_attrs, NULL, NULL
936 },
937 { "table",	0, 0, 0, 0, 0, 0, 0, "",
938 	DECL table_contents , "tr" , DECL table_attrs , DECL table_depr, NULL
939 },
940 { "tbody",	1, 0, 0, 0, 0, 0, 0, "table body ",
941 	DECL tr_elt , "tr" , DECL talign_attrs, NULL, NULL
942 },
943 { "td",		0, 0, 0, 0, 0, 0, 0, "table data cell",
944 	DECL html_flow, NULL, DECL th_td_attr, DECL th_td_depr, NULL
945 },
946 { "textarea",	0, 0, 0, 0, 0, 0, 1, "multi-line text field ",
947 	DECL html_pcdata, NULL, DECL textarea_attrs, NULL, DECL rows_cols_attr
948 },
949 { "tfoot",	0, 1, 0, 0, 0, 0, 0, "table footer ",
950 	DECL tr_elt , "tr" , DECL talign_attrs, NULL, NULL
951 },
952 { "th",		0, 1, 0, 0, 0, 0, 0, "table header cell",
953 	DECL html_flow, NULL, DECL th_td_attr, DECL th_td_depr, NULL
954 },
955 { "thead",	0, 1, 0, 0, 0, 0, 0, "table header ",
956 	DECL tr_elt , "tr" , DECL talign_attrs, NULL, NULL
957 },
958 { "title",	0, 0, 0, 0, 0, 0, 0, "document title ",
959 	DECL html_pcdata, NULL, DECL i18n_attrs, NULL, NULL
960 },
961 { "tr",		0, 0, 0, 0, 0, 0, 0, "table row ",
962 	DECL tr_contents , "td" , DECL talign_attrs, DECL bgcolor_attr, NULL
963 },
964 { "tt",		0, 3, 0, 0, 0, 0, 1, "teletype or monospaced text style",
965 	DECL html_inline, NULL, DECL html_attrs, NULL, NULL
966 },
967 { "u",		0, 3, 0, 0, 1, 1, 1, "underlined text style",
968 	DECL html_inline, NULL, NULL, DECL html_attrs, NULL
969 },
970 { "ul",		0, 0, 0, 0, 0, 0, 0, "unordered list ",
971 	DECL li_elt , "li" , DECL html_attrs, DECL ul_depr, NULL
972 },
973 { "var",	0, 0, 0, 0, 0, 0, 1, "instance of a variable or program argument",
974 	DECL html_inline, NULL, DECL html_attrs, NULL, NULL
975 }
976 };
977 
/*
 * One auto-close rule: an open element named oldTag is implicitly
 * ended when a start tag named newTag shows up.
 */
typedef struct {
    const char *oldTag;     /* name of the element that gets closed */
    const char *newTag;     /* name of the start tag that closes it */
} htmlStartCloseEntry;

/*
 * Start tags that imply the end of the current element.
 *
 * The table is looked up with bsearch() using htmlCompareStartClose(),
 * so entries must stay sorted by oldTag first and newTag second, both
 * in strcmp() order. Entries are grouped below by oldTag.
 */
static const htmlStartCloseEntry htmlStartClose[] = {
    /* a */
    { "a", "a" }, { "a", "fieldset" }, { "a", "table" }, { "a", "td" },
    { "a", "th" },
    /* address */
    { "address", "dd" }, { "address", "dl" }, { "address", "dt" },
    { "address", "form" }, { "address", "li" }, { "address", "ul" },
    /* b */
    { "b", "center" }, { "b", "p" }, { "b", "td" }, { "b", "th" },
    /* big */
    { "big", "p" },
    /* caption */
    { "caption", "col" }, { "caption", "colgroup" }, { "caption", "tbody" },
    { "caption", "tfoot" }, { "caption", "thead" }, { "caption", "tr" },
    /* col */
    { "col", "col" }, { "col", "colgroup" }, { "col", "tbody" },
    { "col", "tfoot" }, { "col", "thead" }, { "col", "tr" },
    /* colgroup */
    { "colgroup", "colgroup" }, { "colgroup", "tbody" },
    { "colgroup", "tfoot" }, { "colgroup", "thead" }, { "colgroup", "tr" },
    /* dd */
    { "dd", "dt" },
    /* dir */
    { "dir", "dd" }, { "dir", "dl" }, { "dir", "dt" }, { "dir", "form" },
    { "dir", "ul" },
    /* dl */
    { "dl", "form" }, { "dl", "li" },
    /* dt */
    { "dt", "dd" }, { "dt", "dl" },
    /* font */
    { "font", "center" }, { "font", "td" }, { "font", "th" },
    /* form */
    { "form", "form" },
    /* h1 .. h6 */
    { "h1", "fieldset" }, { "h1", "form" }, { "h1", "li" }, { "h1", "p" },
    { "h1", "table" },
    { "h2", "fieldset" }, { "h2", "form" }, { "h2", "li" }, { "h2", "p" },
    { "h2", "table" },
    { "h3", "fieldset" }, { "h3", "form" }, { "h3", "li" }, { "h3", "p" },
    { "h3", "table" },
    { "h4", "fieldset" }, { "h4", "form" }, { "h4", "li" }, { "h4", "p" },
    { "h4", "table" },
    { "h5", "fieldset" }, { "h5", "form" }, { "h5", "li" }, { "h5", "p" },
    { "h5", "table" },
    { "h6", "fieldset" }, { "h6", "form" }, { "h6", "li" }, { "h6", "p" },
    { "h6", "table" },
    /* head */
    { "head", "a" }, { "head", "abbr" }, { "head", "acronym" },
    { "head", "address" }, { "head", "b" }, { "head", "bdo" },
    { "head", "big" }, { "head", "blockquote" }, { "head", "body" },
    { "head", "br" }, { "head", "center" }, { "head", "cite" },
    { "head", "code" }, { "head", "dd" }, { "head", "dfn" },
    { "head", "dir" }, { "head", "div" }, { "head", "dl" },
    { "head", "dt" }, { "head", "em" }, { "head", "fieldset" },
    { "head", "font" }, { "head", "form" }, { "head", "frameset" },
    { "head", "h1" }, { "head", "h2" }, { "head", "h3" },
    { "head", "h4" }, { "head", "h5" }, { "head", "h6" },
    { "head", "hr" }, { "head", "i" }, { "head", "iframe" },
    { "head", "img" }, { "head", "kbd" }, { "head", "li" },
    { "head", "listing" }, { "head", "map" }, { "head", "menu" },
    { "head", "ol" }, { "head", "p" }, { "head", "pre" },
    { "head", "q" }, { "head", "s" }, { "head", "samp" },
    { "head", "small" }, { "head", "span" }, { "head", "strike" },
    { "head", "strong" }, { "head", "sub" }, { "head", "sup" },
    { "head", "table" }, { "head", "tt" }, { "head", "u" },
    { "head", "ul" }, { "head", "var" }, { "head", "xmp" },
    /* hr */
    { "hr", "form" },
    /* i */
    { "i", "center" }, { "i", "p" }, { "i", "td" }, { "i", "th" },
    /* legend */
    { "legend", "fieldset" },
    /* li */
    { "li", "li" },
    /* link */
    { "link", "body" }, { "link", "frameset" },
    /* listing */
    { "listing", "dd" }, { "listing", "dl" }, { "listing", "dt" },
    { "listing", "fieldset" }, { "listing", "form" }, { "listing", "li" },
    { "listing", "table" }, { "listing", "ul" },
    /* menu */
    { "menu", "dd" }, { "menu", "dl" }, { "menu", "dt" }, { "menu", "form" },
    { "menu", "ul" },
    /* ol */
    { "ol", "form" },
    /* option */
    { "option", "optgroup" }, { "option", "option" },
    /* p */
    { "p", "address" }, { "p", "blockquote" }, { "p", "body" },
    { "p", "caption" }, { "p", "center" }, { "p", "col" },
    { "p", "colgroup" }, { "p", "dd" }, { "p", "dir" },
    { "p", "div" }, { "p", "dl" }, { "p", "dt" },
    { "p", "fieldset" }, { "p", "form" }, { "p", "frameset" },
    { "p", "h1" }, { "p", "h2" }, { "p", "h3" },
    { "p", "h4" }, { "p", "h5" }, { "p", "h6" },
    { "p", "head" }, { "p", "hr" }, { "p", "li" },
    { "p", "listing" }, { "p", "menu" }, { "p", "ol" },
    { "p", "p" }, { "p", "pre" }, { "p", "table" },
    { "p", "tbody" }, { "p", "td" }, { "p", "tfoot" },
    { "p", "th" }, { "p", "title" }, { "p", "tr" },
    { "p", "ul" }, { "p", "xmp" },
    /* pre */
    { "pre", "dd" }, { "pre", "dl" }, { "pre", "dt" },
    { "pre", "fieldset" }, { "pre", "form" }, { "pre", "li" },
    { "pre", "table" }, { "pre", "ul" },
    /* s, script, small, span, strike, style */
    { "s", "p" },
    { "script", "noscript" },
    { "small", "p" },
    { "span", "td" }, { "span", "th" },
    { "strike", "p" },
    { "style", "body" }, { "style", "frameset" },
    /* table parts */
    { "tbody", "tbody" }, { "tbody", "tfoot" },
    { "td", "tbody" }, { "td", "td" }, { "td", "tfoot" }, { "td", "th" },
    { "td", "tr" },
    { "tfoot", "tbody" },
    { "th", "tbody" }, { "th", "td" }, { "th", "tfoot" }, { "th", "th" },
    { "th", "tr" },
    { "thead", "tbody" }, { "thead", "tfoot" },
    /* title */
    { "title", "body" }, { "title", "frameset" },
    /* tr */
    { "tr", "tbody" }, { "tr", "tfoot" }, { "tr", "tr" },
    /* tt, u, ul */
    { "tt", "p" },
    { "u", "p" }, { "u", "td" }, { "u", "th" },
    { "ul", "address" }, { "ul", "form" }, { "ul", "menu" }, { "ul", "pre" },
    /* xmp */
    { "xmp", "dd" }, { "xmp", "dl" }, { "xmp", "dt" },
    { "xmp", "fieldset" }, { "xmp", "form" }, { "xmp", "li" },
    { "xmp", "table" }, { "xmp", "ul" }
};
1237 
/*
 * HTML elements which are supposed not to have CDATA content: when
 * one of them is the current element, htmlCheckParagraph() implies
 * a p element. The list is terminated by a NULL entry.
 *
 * TODO: extend that list by reading the HTML SGML DTD on
 *       implied paragraph
 */
static const char *const htmlNoContentElements[] = { "html", "head", NULL };
1250 
/*
 * HTML attributes whose content is of type %Script;.
 *
 * NOTE: htmlIsScriptAttribute() rejects any name that does not start
 *       with "on" before consulting this list, so check that function
 *       when adding an entry.
 */
static const char *const htmlScriptAttributes[] = {
    /* mouse */
    "onclick", "ondblclick", "onmousedown", "onmouseup",
    "onmouseover", "onmousemove", "onmouseout",
    /* keyboard */
    "onkeypress", "onkeydown", "onkeyup",
    /* document and form */
    "onload", "onunload", "onfocus", "onblur",
    "onsubmit", "onreset", "onchange", "onselect"
};
1276 
/*
 * End-tag priorities, used by the HTML parser to cope with broken
 * pages. Each element gets a priority, which lets the parser decide
 * what to do with an extra end tag: an end tag may only close
 * elements whose priority is lower than or equal to its own.
 */

typedef struct {
    const char *name;   /* element name, NULL in the terminating entry */
    int priority;       /* end-tag priority of that element */
} elementPriority;

/*
 * Scanned linearly by htmlGetEndPriority(). The final entry, whose
 * name is NULL, ends the scan and supplies the default priority.
 */
static const elementPriority htmlEndPriority[] = {
    { "div",   150 },
    { "td",    160 },
    { "th",    160 },
    { "tr",    170 },
    { "thead", 180 },
    { "tbody", 180 },
    { "tfoot", 180 },
    { "table", 190 },
    { "head",  200 },
    { "body",  200 },
    { "html",  220 },
    { NULL,    100 }    /* Default priority */
};
1304 
1305 /************************************************************************
1306  *									*
1307  *	functions to handle HTML specific data			*
1308  *									*
1309  ************************************************************************/
1310 
/**
 * htmlInitAutoClose:
 *
 * DEPRECATED: This is a no-op.
 *
 * The function takes no arguments, has an empty body and returns
 * nothing.
 */
void
htmlInitAutoClose(void)
{
    /* intentionally empty */
}
1319 
1320 static int
htmlCompareTags(const void * key,const void * member)1321 htmlCompareTags(const void *key, const void *member) {
1322     const xmlChar *tag = (const xmlChar *) key;
1323     const htmlElemDesc *desc = (const htmlElemDesc *) member;
1324 
1325     return(xmlStrcasecmp(tag, BAD_CAST desc->name));
1326 }
1327 
1328 /**
1329  * htmlTagLookup:
1330  * @tag:  The tag name in lowercase
1331  *
1332  * Lookup the HTML tag in the ElementTable
1333  *
1334  * Returns the related htmlElemDescPtr or NULL if not found.
1335  */
1336 const htmlElemDesc *
htmlTagLookup(const xmlChar * tag)1337 htmlTagLookup(const xmlChar *tag) {
1338     if (tag == NULL)
1339         return(NULL);
1340 
1341     return((const htmlElemDesc *) bsearch(tag, html40ElementTable,
1342                 sizeof(html40ElementTable) / sizeof(htmlElemDesc),
1343                 sizeof(htmlElemDesc), htmlCompareTags));
1344 }
1345 
1346 /**
1347  * htmlGetEndPriority:
1348  * @name: The name of the element to look up the priority for.
1349  *
1350  * Return value: The "endtag" priority.
1351  **/
1352 static int
htmlGetEndPriority(const xmlChar * name)1353 htmlGetEndPriority (const xmlChar *name) {
1354     int i = 0;
1355 
1356     while ((htmlEndPriority[i].name != NULL) &&
1357 	   (!xmlStrEqual((const xmlChar *)htmlEndPriority[i].name, name)))
1358 	i++;
1359 
1360     return(htmlEndPriority[i].priority);
1361 }
1362 
1363 
1364 static int
htmlCompareStartClose(const void * vkey,const void * member)1365 htmlCompareStartClose(const void *vkey, const void *member) {
1366     const htmlStartCloseEntry *key = (const htmlStartCloseEntry *) vkey;
1367     const htmlStartCloseEntry *entry = (const htmlStartCloseEntry *) member;
1368     int ret;
1369 
1370     ret = strcmp(key->oldTag, entry->oldTag);
1371     if (ret == 0)
1372         ret = strcmp(key->newTag, entry->newTag);
1373 
1374     return(ret);
1375 }
1376 
1377 /**
1378  * htmlCheckAutoClose:
1379  * @newtag:  The new tag name
1380  * @oldtag:  The old tag name
1381  *
1382  * Checks whether the new tag is one of the registered valid tags for
1383  * closing old.
1384  *
1385  * Returns 0 if no, 1 if yes.
1386  */
1387 static int
htmlCheckAutoClose(const xmlChar * newtag,const xmlChar * oldtag)1388 htmlCheckAutoClose(const xmlChar * newtag, const xmlChar * oldtag)
1389 {
1390     htmlStartCloseEntry key;
1391     void *res;
1392 
1393     key.oldTag = (const char *) oldtag;
1394     key.newTag = (const char *) newtag;
1395     res = bsearch(&key, htmlStartClose,
1396             sizeof(htmlStartClose) / sizeof(htmlStartCloseEntry),
1397             sizeof(htmlStartCloseEntry), htmlCompareStartClose);
1398     return(res != NULL);
1399 }
1400 
1401 /**
1402  * htmlAutoCloseOnClose:
1403  * @ctxt:  an HTML parser context
1404  * @newtag:  The new tag name
1405  * @force:  force the tag closure
1406  *
1407  * The HTML DTD allows an ending tag to implicitly close other tags.
1408  */
1409 static void
htmlAutoCloseOnClose(htmlParserCtxtPtr ctxt,const xmlChar * newtag)1410 htmlAutoCloseOnClose(htmlParserCtxtPtr ctxt, const xmlChar * newtag)
1411 {
1412     const htmlElemDesc *info;
1413     int i, priority;
1414 
1415     priority = htmlGetEndPriority(newtag);
1416 
1417     for (i = (ctxt->nameNr - 1); i >= 0; i--) {
1418 
1419         if (xmlStrEqual(newtag, ctxt->nameTab[i]))
1420             break;
1421         /*
1422          * A misplaced endtag can only close elements with lower
1423          * or equal priority, so if we find an element with higher
1424          * priority before we find an element with
1425          * matching name, we just ignore this endtag
1426          */
1427         if (htmlGetEndPriority(ctxt->nameTab[i]) > priority)
1428             return;
1429     }
1430     if (i < 0)
1431         return;
1432 
1433     while (!xmlStrEqual(newtag, ctxt->name)) {
1434         info = htmlTagLookup(ctxt->name);
1435         if ((info != NULL) && (info->endTag == 3)) {
1436             htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,
1437 	                 "Opening and ending tag mismatch: %s and %s\n",
1438 			 newtag, ctxt->name);
1439         }
1440         if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
1441             ctxt->sax->endElement(ctxt->userData, ctxt->name);
1442 	htmlnamePop(ctxt);
1443     }
1444 }
1445 
1446 /**
1447  * htmlAutoCloseOnEnd:
1448  * @ctxt:  an HTML parser context
1449  *
1450  * Close all remaining tags at the end of the stream
1451  */
1452 static void
htmlAutoCloseOnEnd(htmlParserCtxtPtr ctxt)1453 htmlAutoCloseOnEnd(htmlParserCtxtPtr ctxt)
1454 {
1455     int i;
1456 
1457     if (ctxt->nameNr == 0)
1458         return;
1459     for (i = (ctxt->nameNr - 1); i >= 0; i--) {
1460         if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
1461             ctxt->sax->endElement(ctxt->userData, ctxt->name);
1462 	htmlnamePop(ctxt);
1463     }
1464 }
1465 
1466 /**
1467  * htmlAutoClose:
1468  * @ctxt:  an HTML parser context
1469  * @newtag:  The new tag name or NULL
1470  *
1471  * The HTML DTD allows a tag to implicitly close other tags.
1472  * The list is kept in htmlStartClose array. This function is
1473  * called when a new tag has been detected and generates the
1474  * appropriates closes if possible/needed.
1475  * If newtag is NULL this mean we are at the end of the resource
1476  * and we should check
1477  */
1478 static void
htmlAutoClose(htmlParserCtxtPtr ctxt,const xmlChar * newtag)1479 htmlAutoClose(htmlParserCtxtPtr ctxt, const xmlChar * newtag)
1480 {
1481     if (newtag == NULL)
1482         return;
1483 
1484     while ((ctxt->name != NULL) &&
1485            (htmlCheckAutoClose(newtag, ctxt->name))) {
1486         if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
1487             ctxt->sax->endElement(ctxt->userData, ctxt->name);
1488 	htmlnamePop(ctxt);
1489     }
1490 }
1491 
1492 /**
1493  * htmlAutoCloseTag:
1494  * @doc:  the HTML document
1495  * @name:  The tag name
1496  * @elem:  the HTML element
1497  *
1498  * The HTML DTD allows a tag to implicitly close other tags.
1499  * The list is kept in htmlStartClose array. This function checks
1500  * if the element or one of it's children would autoclose the
1501  * given tag.
1502  *
1503  * Returns 1 if autoclose, 0 otherwise
1504  */
1505 int
htmlAutoCloseTag(htmlDocPtr doc,const xmlChar * name,htmlNodePtr elem)1506 htmlAutoCloseTag(htmlDocPtr doc, const xmlChar *name, htmlNodePtr elem) {
1507     htmlNodePtr child;
1508 
1509     if (elem == NULL) return(1);
1510     if (xmlStrEqual(name, elem->name)) return(0);
1511     if (htmlCheckAutoClose(elem->name, name)) return(1);
1512     child = elem->children;
1513     while (child != NULL) {
1514         if (htmlAutoCloseTag(doc, name, child)) return(1);
1515 	child = child->next;
1516     }
1517     return(0);
1518 }
1519 
1520 /**
1521  * htmlIsAutoClosed:
1522  * @doc:  the HTML document
1523  * @elem:  the HTML element
1524  *
1525  * The HTML DTD allows a tag to implicitly close other tags.
1526  * The list is kept in htmlStartClose array. This function checks
1527  * if a tag is autoclosed by one of it's child
1528  *
1529  * Returns 1 if autoclosed, 0 otherwise
1530  */
1531 int
htmlIsAutoClosed(htmlDocPtr doc,htmlNodePtr elem)1532 htmlIsAutoClosed(htmlDocPtr doc, htmlNodePtr elem) {
1533     htmlNodePtr child;
1534 
1535     if (elem == NULL) return(1);
1536     child = elem->children;
1537     while (child != NULL) {
1538 	if (htmlAutoCloseTag(doc, elem->name, child)) return(1);
1539 	child = child->next;
1540     }
1541     return(0);
1542 }
1543 
1544 /**
1545  * htmlCheckImplied:
1546  * @ctxt:  an HTML parser context
1547  * @newtag:  The new tag name
1548  *
1549  * The HTML DTD allows a tag to exists only implicitly
1550  * called when a new tag has been detected and generates the
1551  * appropriates implicit tags if missing
1552  */
1553 static void
htmlCheckImplied(htmlParserCtxtPtr ctxt,const xmlChar * newtag)1554 htmlCheckImplied(htmlParserCtxtPtr ctxt, const xmlChar *newtag) {
1555     int i;
1556 
1557     if (ctxt->options & HTML_PARSE_NOIMPLIED)
1558         return;
1559     if (!htmlOmittedDefaultValue)
1560 	return;
1561     if (xmlStrEqual(newtag, BAD_CAST"html"))
1562 	return;
1563     if (ctxt->nameNr <= 0) {
1564 	htmlnamePush(ctxt, BAD_CAST"html");
1565 	if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1566 	    ctxt->sax->startElement(ctxt->userData, BAD_CAST"html", NULL);
1567     }
1568     if ((xmlStrEqual(newtag, BAD_CAST"body")) || (xmlStrEqual(newtag, BAD_CAST"head")))
1569         return;
1570     if ((ctxt->nameNr <= 1) &&
1571         ((xmlStrEqual(newtag, BAD_CAST"script")) ||
1572 	 (xmlStrEqual(newtag, BAD_CAST"style")) ||
1573 	 (xmlStrEqual(newtag, BAD_CAST"meta")) ||
1574 	 (xmlStrEqual(newtag, BAD_CAST"link")) ||
1575 	 (xmlStrEqual(newtag, BAD_CAST"title")) ||
1576 	 (xmlStrEqual(newtag, BAD_CAST"base")))) {
1577         if (ctxt->html >= 3) {
1578             /* we already saw or generated an <head> before */
1579             return;
1580         }
1581         /*
1582          * dropped OBJECT ... i you put it first BODY will be
1583          * assumed !
1584          */
1585         htmlnamePush(ctxt, BAD_CAST"head");
1586         if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1587             ctxt->sax->startElement(ctxt->userData, BAD_CAST"head", NULL);
1588     } else if ((!xmlStrEqual(newtag, BAD_CAST"noframes")) &&
1589 	       (!xmlStrEqual(newtag, BAD_CAST"frame")) &&
1590 	       (!xmlStrEqual(newtag, BAD_CAST"frameset"))) {
1591         if (ctxt->html >= 10) {
1592             /* we already saw or generated a <body> before */
1593             return;
1594         }
1595 	for (i = 0;i < ctxt->nameNr;i++) {
1596 	    if (xmlStrEqual(ctxt->nameTab[i], BAD_CAST"body")) {
1597 		return;
1598 	    }
1599 	    if (xmlStrEqual(ctxt->nameTab[i], BAD_CAST"head")) {
1600 		return;
1601 	    }
1602 	}
1603 
1604 	htmlnamePush(ctxt, BAD_CAST"body");
1605 	if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1606 	    ctxt->sax->startElement(ctxt->userData, BAD_CAST"body", NULL);
1607     }
1608 }
1609 
1610 /**
1611  * htmlCheckParagraph
1612  * @ctxt:  an HTML parser context
1613  *
1614  * Check whether a p element need to be implied before inserting
1615  * characters in the current element.
1616  *
1617  * Returns 1 if a paragraph has been inserted, 0 if not and -1
1618  *         in case of error.
1619  */
1620 
1621 static int
htmlCheckParagraph(htmlParserCtxtPtr ctxt)1622 htmlCheckParagraph(htmlParserCtxtPtr ctxt) {
1623     const xmlChar *tag;
1624     int i;
1625 
1626     if (ctxt == NULL)
1627 	return(-1);
1628     tag = ctxt->name;
1629     if (tag == NULL) {
1630 	htmlAutoClose(ctxt, BAD_CAST"p");
1631 	htmlCheckImplied(ctxt, BAD_CAST"p");
1632 	htmlnamePush(ctxt, BAD_CAST"p");
1633 	if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1634 	    ctxt->sax->startElement(ctxt->userData, BAD_CAST"p", NULL);
1635 	return(1);
1636     }
1637     if (!htmlOmittedDefaultValue)
1638 	return(0);
1639     for (i = 0; htmlNoContentElements[i] != NULL; i++) {
1640 	if (xmlStrEqual(tag, BAD_CAST htmlNoContentElements[i])) {
1641 	    htmlAutoClose(ctxt, BAD_CAST"p");
1642 	    htmlCheckImplied(ctxt, BAD_CAST"p");
1643 	    htmlnamePush(ctxt, BAD_CAST"p");
1644 	    if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1645 		ctxt->sax->startElement(ctxt->userData, BAD_CAST"p", NULL);
1646 	    return(1);
1647 	}
1648     }
1649     return(0);
1650 }
1651 
1652 /**
1653  * htmlIsScriptAttribute:
1654  * @name:  an attribute name
1655  *
1656  * Check if an attribute is of content type Script
1657  *
1658  * Returns 1 is the attribute is a script 0 otherwise
1659  */
1660 int
htmlIsScriptAttribute(const xmlChar * name)1661 htmlIsScriptAttribute(const xmlChar *name) {
1662     unsigned int i;
1663 
1664     if (name == NULL)
1665       return(0);
1666     /*
1667      * all script attributes start with 'on'
1668      */
1669     if ((name[0] != 'o') || (name[1] != 'n'))
1670       return(0);
1671     for (i = 0;
1672 	 i < sizeof(htmlScriptAttributes)/sizeof(htmlScriptAttributes[0]);
1673 	 i++) {
1674 	if (xmlStrEqual(name, (const xmlChar *) htmlScriptAttributes[i]))
1675 	    return(1);
1676     }
1677     return(0);
1678 }
1679 
1680 /************************************************************************
1681  *									*
1682  *	The list of HTML predefined entities			*
1683  *									*
1684  ************************************************************************/
1685 
1686 
/*
 * Table of the predefined HTML entities. Each entry holds, in order,
 * the Unicode code point, the entity name and a textual description.
 *
 * The entries are listed by increasing code point.
 * htmlEntityValueLookup() stops scanning at the first entry whose value
 * is greater than the one searched for, so a new entry must be inserted
 * at its sorted position or value lookups will miss it.
 */
static const htmlEntityDesc  html40EntitiesTable[] = {
/*
 * the 4 absolute ones, plus apostrophe.
 */
{ 34,	"quot",	"quotation mark = APL quote, U+0022 ISOnum" },
{ 38,	"amp",	"ampersand, U+0026 ISOnum" },
{ 39,	"apos",	"single quote" },
{ 60,	"lt",	"less-than sign, U+003C ISOnum" },
{ 62,	"gt",	"greater-than sign, U+003E ISOnum" },

/*
 * A bunch still in the 128-255 range.
 * Whether to replace them really depends on the charset used.
 */
{ 160,	"nbsp",	"no-break space = non-breaking space, U+00A0 ISOnum" },
{ 161,	"iexcl","inverted exclamation mark, U+00A1 ISOnum" },
{ 162,	"cent",	"cent sign, U+00A2 ISOnum" },
{ 163,	"pound","pound sign, U+00A3 ISOnum" },
{ 164,	"curren","currency sign, U+00A4 ISOnum" },
{ 165,	"yen",	"yen sign = yuan sign, U+00A5 ISOnum" },
{ 166,	"brvbar","broken bar = broken vertical bar, U+00A6 ISOnum" },
{ 167,	"sect",	"section sign, U+00A7 ISOnum" },
{ 168,	"uml",	"diaeresis = spacing diaeresis, U+00A8 ISOdia" },
{ 169,	"copy",	"copyright sign, U+00A9 ISOnum" },
{ 170,	"ordf",	"feminine ordinal indicator, U+00AA ISOnum" },
{ 171,	"laquo","left-pointing double angle quotation mark = left pointing guillemet, U+00AB ISOnum" },
{ 172,	"not",	"not sign, U+00AC ISOnum" },
{ 173,	"shy",	"soft hyphen = discretionary hyphen, U+00AD ISOnum" },
{ 174,	"reg",	"registered sign = registered trade mark sign, U+00AE ISOnum" },
{ 175,	"macr",	"macron = spacing macron = overline = APL overbar, U+00AF ISOdia" },
{ 176,	"deg",	"degree sign, U+00B0 ISOnum" },
{ 177,	"plusmn","plus-minus sign = plus-or-minus sign, U+00B1 ISOnum" },
{ 178,	"sup2",	"superscript two = superscript digit two = squared, U+00B2 ISOnum" },
{ 179,	"sup3",	"superscript three = superscript digit three = cubed, U+00B3 ISOnum" },
{ 180,	"acute","acute accent = spacing acute, U+00B4 ISOdia" },
{ 181,	"micro","micro sign, U+00B5 ISOnum" },
{ 182,	"para",	"pilcrow sign = paragraph sign, U+00B6 ISOnum" },
{ 183,	"middot","middle dot = Georgian comma Greek middle dot, U+00B7 ISOnum" },
{ 184,	"cedil","cedilla = spacing cedilla, U+00B8 ISOdia" },
{ 185,	"sup1",	"superscript one = superscript digit one, U+00B9 ISOnum" },
{ 186,	"ordm",	"masculine ordinal indicator, U+00BA ISOnum" },
{ 187,	"raquo","right-pointing double angle quotation mark right pointing guillemet, U+00BB ISOnum" },
{ 188,	"frac14","vulgar fraction one quarter = fraction one quarter, U+00BC ISOnum" },
{ 189,	"frac12","vulgar fraction one half = fraction one half, U+00BD ISOnum" },
{ 190,	"frac34","vulgar fraction three quarters = fraction three quarters, U+00BE ISOnum" },
{ 191,	"iquest","inverted question mark = turned question mark, U+00BF ISOnum" },
{ 192,	"Agrave","latin capital letter A with grave = latin capital letter A grave, U+00C0 ISOlat1" },
{ 193,	"Aacute","latin capital letter A with acute, U+00C1 ISOlat1" },
{ 194,	"Acirc","latin capital letter A with circumflex, U+00C2 ISOlat1" },
{ 195,	"Atilde","latin capital letter A with tilde, U+00C3 ISOlat1" },
{ 196,	"Auml",	"latin capital letter A with diaeresis, U+00C4 ISOlat1" },
{ 197,	"Aring","latin capital letter A with ring above = latin capital letter A ring, U+00C5 ISOlat1" },
{ 198,	"AElig","latin capital letter AE = latin capital ligature AE, U+00C6 ISOlat1" },
{ 199,	"Ccedil","latin capital letter C with cedilla, U+00C7 ISOlat1" },
{ 200,	"Egrave","latin capital letter E with grave, U+00C8 ISOlat1" },
{ 201,	"Eacute","latin capital letter E with acute, U+00C9 ISOlat1" },
{ 202,	"Ecirc","latin capital letter E with circumflex, U+00CA ISOlat1" },
{ 203,	"Euml",	"latin capital letter E with diaeresis, U+00CB ISOlat1" },
{ 204,	"Igrave","latin capital letter I with grave, U+00CC ISOlat1" },
{ 205,	"Iacute","latin capital letter I with acute, U+00CD ISOlat1" },
{ 206,	"Icirc","latin capital letter I with circumflex, U+00CE ISOlat1" },
{ 207,	"Iuml",	"latin capital letter I with diaeresis, U+00CF ISOlat1" },
{ 208,	"ETH",	"latin capital letter ETH, U+00D0 ISOlat1" },
{ 209,	"Ntilde","latin capital letter N with tilde, U+00D1 ISOlat1" },
{ 210,	"Ograve","latin capital letter O with grave, U+00D2 ISOlat1" },
{ 211,	"Oacute","latin capital letter O with acute, U+00D3 ISOlat1" },
{ 212,	"Ocirc","latin capital letter O with circumflex, U+00D4 ISOlat1" },
{ 213,	"Otilde","latin capital letter O with tilde, U+00D5 ISOlat1" },
{ 214,	"Ouml",	"latin capital letter O with diaeresis, U+00D6 ISOlat1" },
{ 215,	"times","multiplication sign, U+00D7 ISOnum" },
{ 216,	"Oslash","latin capital letter O with stroke latin capital letter O slash, U+00D8 ISOlat1" },
{ 217,	"Ugrave","latin capital letter U with grave, U+00D9 ISOlat1" },
{ 218,	"Uacute","latin capital letter U with acute, U+00DA ISOlat1" },
{ 219,	"Ucirc","latin capital letter U with circumflex, U+00DB ISOlat1" },
{ 220,	"Uuml",	"latin capital letter U with diaeresis, U+00DC ISOlat1" },
{ 221,	"Yacute","latin capital letter Y with acute, U+00DD ISOlat1" },
{ 222,	"THORN","latin capital letter THORN, U+00DE ISOlat1" },
{ 223,	"szlig","latin small letter sharp s = ess-zed, U+00DF ISOlat1" },
{ 224,	"agrave","latin small letter a with grave = latin small letter a grave, U+00E0 ISOlat1" },
{ 225,	"aacute","latin small letter a with acute, U+00E1 ISOlat1" },
{ 226,	"acirc","latin small letter a with circumflex, U+00E2 ISOlat1" },
{ 227,	"atilde","latin small letter a with tilde, U+00E3 ISOlat1" },
{ 228,	"auml",	"latin small letter a with diaeresis, U+00E4 ISOlat1" },
{ 229,	"aring","latin small letter a with ring above = latin small letter a ring, U+00E5 ISOlat1" },
{ 230,	"aelig","latin small letter ae = latin small ligature ae, U+00E6 ISOlat1" },
{ 231,	"ccedil","latin small letter c with cedilla, U+00E7 ISOlat1" },
{ 232,	"egrave","latin small letter e with grave, U+00E8 ISOlat1" },
{ 233,	"eacute","latin small letter e with acute, U+00E9 ISOlat1" },
{ 234,	"ecirc","latin small letter e with circumflex, U+00EA ISOlat1" },
{ 235,	"euml",	"latin small letter e with diaeresis, U+00EB ISOlat1" },
{ 236,	"igrave","latin small letter i with grave, U+00EC ISOlat1" },
{ 237,	"iacute","latin small letter i with acute, U+00ED ISOlat1" },
{ 238,	"icirc","latin small letter i with circumflex, U+00EE ISOlat1" },
{ 239,	"iuml",	"latin small letter i with diaeresis, U+00EF ISOlat1" },
{ 240,	"eth",	"latin small letter eth, U+00F0 ISOlat1" },
{ 241,	"ntilde","latin small letter n with tilde, U+00F1 ISOlat1" },
{ 242,	"ograve","latin small letter o with grave, U+00F2 ISOlat1" },
{ 243,	"oacute","latin small letter o with acute, U+00F3 ISOlat1" },
{ 244,	"ocirc","latin small letter o with circumflex, U+00F4 ISOlat1" },
{ 245,	"otilde","latin small letter o with tilde, U+00F5 ISOlat1" },
{ 246,	"ouml",	"latin small letter o with diaeresis, U+00F6 ISOlat1" },
{ 247,	"divide","division sign, U+00F7 ISOnum" },
{ 248,	"oslash","latin small letter o with stroke, = latin small letter o slash, U+00F8 ISOlat1" },
{ 249,	"ugrave","latin small letter u with grave, U+00F9 ISOlat1" },
{ 250,	"uacute","latin small letter u with acute, U+00FA ISOlat1" },
{ 251,	"ucirc","latin small letter u with circumflex, U+00FB ISOlat1" },
{ 252,	"uuml",	"latin small letter u with diaeresis, U+00FC ISOlat1" },
{ 253,	"yacute","latin small letter y with acute, U+00FD ISOlat1" },
{ 254,	"thorn","latin small letter thorn with, U+00FE ISOlat1" },
{ 255,	"yuml",	"latin small letter y with diaeresis, U+00FF ISOlat1" },

{ 338,	"OElig","latin capital ligature OE, U+0152 ISOlat2" },
{ 339,	"oelig","latin small ligature oe, U+0153 ISOlat2" },
{ 352,	"Scaron","latin capital letter S with caron, U+0160 ISOlat2" },
{ 353,	"scaron","latin small letter s with caron, U+0161 ISOlat2" },
{ 376,	"Yuml",	"latin capital letter Y with diaeresis, U+0178 ISOlat2" },

/*
 * Anything below should really be kept as entity references
 */
{ 402,	"fnof",	"latin small f with hook = function = florin, U+0192 ISOtech" },

{ 710,	"circ",	"modifier letter circumflex accent, U+02C6 ISOpub" },
{ 732,	"tilde","small tilde, U+02DC ISOdia" },

{ 913,	"Alpha","greek capital letter alpha, U+0391" },
{ 914,	"Beta",	"greek capital letter beta, U+0392" },
{ 915,	"Gamma","greek capital letter gamma, U+0393 ISOgrk3" },
{ 916,	"Delta","greek capital letter delta, U+0394 ISOgrk3" },
{ 917,	"Epsilon","greek capital letter epsilon, U+0395" },
{ 918,	"Zeta",	"greek capital letter zeta, U+0396" },
{ 919,	"Eta",	"greek capital letter eta, U+0397" },
{ 920,	"Theta","greek capital letter theta, U+0398 ISOgrk3" },
{ 921,	"Iota",	"greek capital letter iota, U+0399" },
{ 922,	"Kappa","greek capital letter kappa, U+039A" },
{ 923,	"Lambda", "greek capital letter lambda, U+039B ISOgrk3" },
{ 924,	"Mu",	"greek capital letter mu, U+039C" },
{ 925,	"Nu",	"greek capital letter nu, U+039D" },
{ 926,	"Xi",	"greek capital letter xi, U+039E ISOgrk3" },
{ 927,	"Omicron","greek capital letter omicron, U+039F" },
{ 928,	"Pi",	"greek capital letter pi, U+03A0 ISOgrk3" },
{ 929,	"Rho",	"greek capital letter rho, U+03A1" },
{ 931,	"Sigma","greek capital letter sigma, U+03A3 ISOgrk3" },
{ 932,	"Tau",	"greek capital letter tau, U+03A4" },
{ 933,	"Upsilon","greek capital letter upsilon, U+03A5 ISOgrk3" },
{ 934,	"Phi",	"greek capital letter phi, U+03A6 ISOgrk3" },
{ 935,	"Chi",	"greek capital letter chi, U+03A7" },
{ 936,	"Psi",	"greek capital letter psi, U+03A8 ISOgrk3" },
{ 937,	"Omega","greek capital letter omega, U+03A9 ISOgrk3" },

{ 945,	"alpha","greek small letter alpha, U+03B1 ISOgrk3" },
{ 946,	"beta",	"greek small letter beta, U+03B2 ISOgrk3" },
{ 947,	"gamma","greek small letter gamma, U+03B3 ISOgrk3" },
{ 948,	"delta","greek small letter delta, U+03B4 ISOgrk3" },
{ 949,	"epsilon","greek small letter epsilon, U+03B5 ISOgrk3" },
{ 950,	"zeta",	"greek small letter zeta, U+03B6 ISOgrk3" },
{ 951,	"eta",	"greek small letter eta, U+03B7 ISOgrk3" },
{ 952,	"theta","greek small letter theta, U+03B8 ISOgrk3" },
{ 953,	"iota",	"greek small letter iota, U+03B9 ISOgrk3" },
{ 954,	"kappa","greek small letter kappa, U+03BA ISOgrk3" },
{ 955,	"lambda","greek small letter lambda, U+03BB ISOgrk3" },
{ 956,	"mu",	"greek small letter mu, U+03BC ISOgrk3" },
{ 957,	"nu",	"greek small letter nu, U+03BD ISOgrk3" },
{ 958,	"xi",	"greek small letter xi, U+03BE ISOgrk3" },
{ 959,	"omicron","greek small letter omicron, U+03BF NEW" },
{ 960,	"pi",	"greek small letter pi, U+03C0 ISOgrk3" },
{ 961,	"rho",	"greek small letter rho, U+03C1 ISOgrk3" },
{ 962,	"sigmaf","greek small letter final sigma, U+03C2 ISOgrk3" },
{ 963,	"sigma","greek small letter sigma, U+03C3 ISOgrk3" },
{ 964,	"tau",	"greek small letter tau, U+03C4 ISOgrk3" },
{ 965,	"upsilon","greek small letter upsilon, U+03C5 ISOgrk3" },
{ 966,	"phi",	"greek small letter phi, U+03C6 ISOgrk3" },
{ 967,	"chi",	"greek small letter chi, U+03C7 ISOgrk3" },
{ 968,	"psi",	"greek small letter psi, U+03C8 ISOgrk3" },
{ 969,	"omega","greek small letter omega, U+03C9 ISOgrk3" },
{ 977,	"thetasym","greek small letter theta symbol, U+03D1 NEW" },
{ 978,	"upsih","greek upsilon with hook symbol, U+03D2 NEW" },
{ 982,	"piv",	"greek pi symbol, U+03D6 ISOgrk3" },

{ 8194,	"ensp",	"en space, U+2002 ISOpub" },
{ 8195,	"emsp",	"em space, U+2003 ISOpub" },
{ 8201,	"thinsp","thin space, U+2009 ISOpub" },
{ 8204,	"zwnj",	"zero width non-joiner, U+200C NEW RFC 2070" },
{ 8205,	"zwj",	"zero width joiner, U+200D NEW RFC 2070" },
{ 8206,	"lrm",	"left-to-right mark, U+200E NEW RFC 2070" },
{ 8207,	"rlm",	"right-to-left mark, U+200F NEW RFC 2070" },
{ 8211,	"ndash","en dash, U+2013 ISOpub" },
{ 8212,	"mdash","em dash, U+2014 ISOpub" },
{ 8216,	"lsquo","left single quotation mark, U+2018 ISOnum" },
{ 8217,	"rsquo","right single quotation mark, U+2019 ISOnum" },
{ 8218,	"sbquo","single low-9 quotation mark, U+201A NEW" },
{ 8220,	"ldquo","left double quotation mark, U+201C ISOnum" },
{ 8221,	"rdquo","right double quotation mark, U+201D ISOnum" },
{ 8222,	"bdquo","double low-9 quotation mark, U+201E NEW" },
{ 8224,	"dagger","dagger, U+2020 ISOpub" },
{ 8225,	"Dagger","double dagger, U+2021 ISOpub" },

{ 8226,	"bull",	"bullet = black small circle, U+2022 ISOpub" },
{ 8230,	"hellip","horizontal ellipsis = three dot leader, U+2026 ISOpub" },

{ 8240,	"permil","per mille sign, U+2030 ISOtech" },

{ 8242,	"prime","prime = minutes = feet, U+2032 ISOtech" },
{ 8243,	"Prime","double prime = seconds = inches, U+2033 ISOtech" },

{ 8249,	"lsaquo","single left-pointing angle quotation mark, U+2039 ISO proposed" },
{ 8250,	"rsaquo","single right-pointing angle quotation mark, U+203A ISO proposed" },

{ 8254,	"oline","overline = spacing overscore, U+203E NEW" },
{ 8260,	"frasl","fraction slash, U+2044 NEW" },

{ 8364,	"euro",	"euro sign, U+20AC NEW" },

{ 8465,	"image","blackletter capital I = imaginary part, U+2111 ISOamso" },
{ 8472,	"weierp","script capital P = power set = Weierstrass p, U+2118 ISOamso" },
{ 8476,	"real",	"blackletter capital R = real part symbol, U+211C ISOamso" },
{ 8482,	"trade","trade mark sign, U+2122 ISOnum" },
{ 8501,	"alefsym","alef symbol = first transfinite cardinal, U+2135 NEW" },
{ 8592,	"larr",	"leftwards arrow, U+2190 ISOnum" },
{ 8593,	"uarr",	"upwards arrow, U+2191 ISOnum" },
{ 8594,	"rarr",	"rightwards arrow, U+2192 ISOnum" },
{ 8595,	"darr",	"downwards arrow, U+2193 ISOnum" },
{ 8596,	"harr",	"left right arrow, U+2194 ISOamsa" },
{ 8629,	"crarr","downwards arrow with corner leftwards = carriage return, U+21B5 NEW" },
{ 8656,	"lArr",	"leftwards double arrow, U+21D0 ISOtech" },
{ 8657,	"uArr",	"upwards double arrow, U+21D1 ISOamsa" },
{ 8658,	"rArr",	"rightwards double arrow, U+21D2 ISOtech" },
{ 8659,	"dArr",	"downwards double arrow, U+21D3 ISOamsa" },
{ 8660,	"hArr",	"left right double arrow, U+21D4 ISOamsa" },

{ 8704,	"forall","for all, U+2200 ISOtech" },
{ 8706,	"part",	"partial differential, U+2202 ISOtech" },
{ 8707,	"exist","there exists, U+2203 ISOtech" },
{ 8709,	"empty","empty set = null set = diameter, U+2205 ISOamso" },
{ 8711,	"nabla","nabla = backward difference, U+2207 ISOtech" },
{ 8712,	"isin",	"element of, U+2208 ISOtech" },
{ 8713,	"notin","not an element of, U+2209 ISOtech" },
{ 8715,	"ni",	"contains as member, U+220B ISOtech" },
{ 8719,	"prod",	"n-ary product = product sign, U+220F ISOamsb" },
{ 8721,	"sum",	"n-ary summation, U+2211 ISOamsb" },
{ 8722,	"minus","minus sign, U+2212 ISOtech" },
{ 8727,	"lowast","asterisk operator, U+2217 ISOtech" },
{ 8730,	"radic","square root = radical sign, U+221A ISOtech" },
{ 8733,	"prop",	"proportional to, U+221D ISOtech" },
{ 8734,	"infin","infinity, U+221E ISOtech" },
{ 8736,	"ang",	"angle, U+2220 ISOamso" },
{ 8743,	"and",	"logical and = wedge, U+2227 ISOtech" },
{ 8744,	"or",	"logical or = vee, U+2228 ISOtech" },
{ 8745,	"cap",	"intersection = cap, U+2229 ISOtech" },
{ 8746,	"cup",	"union = cup, U+222A ISOtech" },
{ 8747,	"int",	"integral, U+222B ISOtech" },
{ 8756,	"there4","therefore, U+2234 ISOtech" },
{ 8764,	"sim",	"tilde operator = varies with = similar to, U+223C ISOtech" },
{ 8773,	"cong",	"approximately equal to, U+2245 ISOtech" },
{ 8776,	"asymp","almost equal to = asymptotic to, U+2248 ISOamsr" },
{ 8800,	"ne",	"not equal to, U+2260 ISOtech" },
{ 8801,	"equiv","identical to, U+2261 ISOtech" },
{ 8804,	"le",	"less-than or equal to, U+2264 ISOtech" },
{ 8805,	"ge",	"greater-than or equal to, U+2265 ISOtech" },
{ 8834,	"sub",	"subset of, U+2282 ISOtech" },
{ 8835,	"sup",	"superset of, U+2283 ISOtech" },
{ 8836,	"nsub",	"not a subset of, U+2284 ISOamsn" },
{ 8838,	"sube",	"subset of or equal to, U+2286 ISOtech" },
{ 8839,	"supe",	"superset of or equal to, U+2287 ISOtech" },
{ 8853,	"oplus","circled plus = direct sum, U+2295 ISOamsb" },
{ 8855,	"otimes","circled times = vector product, U+2297 ISOamsb" },
{ 8869,	"perp",	"up tack = orthogonal to = perpendicular, U+22A5 ISOtech" },
{ 8901,	"sdot",	"dot operator, U+22C5 ISOamsb" },
{ 8968,	"lceil","left ceiling = apl upstile, U+2308 ISOamsc" },
{ 8969,	"rceil","right ceiling, U+2309 ISOamsc" },
{ 8970,	"lfloor","left floor = apl downstile, U+230A ISOamsc" },
{ 8971,	"rfloor","right floor, U+230B ISOamsc" },
{ 9001,	"lang",	"left-pointing angle bracket = bra, U+2329 ISOtech" },
{ 9002,	"rang",	"right-pointing angle bracket = ket, U+232A ISOtech" },
{ 9674,	"loz",	"lozenge, U+25CA ISOpub" },

{ 9824,	"spades","black spade suit, U+2660 ISOpub" },
{ 9827,	"clubs","black club suit = shamrock, U+2663 ISOpub" },
{ 9829,	"hearts","black heart suit = valentine, U+2665 ISOpub" },
{ 9830,	"diams","black diamond suit, U+2666 ISOpub" },

};
1969 
1970 /************************************************************************
1971  *									*
1972  *		Commodity functions to handle entities			*
1973  *									*
1974  ************************************************************************/
1975 
/*
 * Macro used to grow the current buffer.
 *
 * Doubles <buffer>_size, then reallocates @buffer to that size with
 * xmlRealloc(). If the reallocation fails, the error is reported through
 * htmlErrMemory(ctxt), the old buffer is freed and the ENCLOSING function
 * returns NULL.
 *
 * The expansion relies on names that must exist at the call site:
 *  - a variable called <buffer>_size (token-pasted from the argument);
 *  - a variable called ctxt, passed to htmlErrMemory();
 *  - an enclosing function for which "return(NULL)" is valid.
 * It also declares a block-local variable named tmp, so the argument
 * itself must not be called tmp.
 *
 * NOTE(review): the size is doubled without an overflow check; this
 * presumably relies on callers keeping buffers small -- confirm.
 */
#define growBuffer(buffer) {						\
    xmlChar *tmp;							\
    buffer##_size *= 2;							\
    tmp = (xmlChar *) xmlRealloc(buffer, buffer##_size); 		\
    if (tmp == NULL) {							\
	htmlErrMemory(ctxt);			\
	xmlFree(buffer);						\
	return(NULL);							\
    }									\
    buffer = tmp;							\
}
1990 
1991 /**
1992  * htmlEntityLookup:
1993  * @name: the entity name
1994  *
1995  * Lookup the given entity in EntitiesTable
1996  *
1997  * TODO: the linear scan is really ugly, an hash table is really needed.
1998  *
1999  * Returns the associated htmlEntityDescPtr if found, NULL otherwise.
2000  */
2001 const htmlEntityDesc *
htmlEntityLookup(const xmlChar * name)2002 htmlEntityLookup(const xmlChar *name) {
2003     unsigned int i;
2004 
2005     for (i = 0;i < (sizeof(html40EntitiesTable)/
2006                     sizeof(html40EntitiesTable[0]));i++) {
2007         if (xmlStrEqual(name, BAD_CAST html40EntitiesTable[i].name)) {
2008             return((htmlEntityDescPtr) &html40EntitiesTable[i]);
2009 	}
2010     }
2011     return(NULL);
2012 }
2013 
2014 /**
2015  * htmlEntityValueLookup:
2016  * @value: the entity's unicode value
2017  *
2018  * Lookup the given entity in EntitiesTable
2019  *
2020  * TODO: the linear scan is really ugly, an hash table is really needed.
2021  *
2022  * Returns the associated htmlEntityDescPtr if found, NULL otherwise.
2023  */
2024 const htmlEntityDesc *
htmlEntityValueLookup(unsigned int value)2025 htmlEntityValueLookup(unsigned int value) {
2026     unsigned int i;
2027 
2028     for (i = 0;i < (sizeof(html40EntitiesTable)/
2029                     sizeof(html40EntitiesTable[0]));i++) {
2030         if (html40EntitiesTable[i].value >= value) {
2031 	    if (html40EntitiesTable[i].value > value)
2032 		break;
2033             return((htmlEntityDescPtr) &html40EntitiesTable[i]);
2034 	}
2035     }
2036     return(NULL);
2037 }
2038 
2039 /**
2040  * UTF8ToHtml:
2041  * @out:  a pointer to an array of bytes to store the result
2042  * @outlen:  the length of @out
2043  * @in:  a pointer to an array of UTF-8 chars
2044  * @inlen:  the length of @in
2045  *
2046  * Take a block of UTF-8 chars in and try to convert it to an ASCII
2047  * plus HTML entities block of chars out.
2048  *
2049  * Returns 0 if success, -2 if the transcoding fails, or -1 otherwise
2050  * The value of @inlen after return is the number of octets consumed
2051  *     as the return value is positive, else unpredictable.
2052  * The value of @outlen after return is the number of octets consumed.
2053  */
2054 int
UTF8ToHtml(unsigned char * out,int * outlen,const unsigned char * in,int * inlen)2055 UTF8ToHtml(unsigned char* out, int *outlen,
2056               const unsigned char* in, int *inlen) {
2057     const unsigned char* processed = in;
2058     const unsigned char* outend;
2059     const unsigned char* outstart = out;
2060     const unsigned char* instart = in;
2061     const unsigned char* inend;
2062     unsigned int c, d;
2063     int trailing;
2064 
2065     if ((out == NULL) || (outlen == NULL) || (inlen == NULL)) return(-1);
2066     if (in == NULL) {
2067         /*
2068 	 * initialization nothing to do
2069 	 */
2070 	*outlen = 0;
2071 	*inlen = 0;
2072 	return(0);
2073     }
2074     inend = in + (*inlen);
2075     outend = out + (*outlen);
2076     while (in < inend) {
2077 	d = *in++;
2078 	if      (d < 0x80)  { c= d; trailing= 0; }
2079 	else if (d < 0xC0) {
2080 	    /* trailing byte in leading position */
2081 	    *outlen = out - outstart;
2082 	    *inlen = processed - instart;
2083 	    return(-2);
2084         } else if (d < 0xE0)  { c= d & 0x1F; trailing= 1; }
2085         else if (d < 0xF0)  { c= d & 0x0F; trailing= 2; }
2086         else if (d < 0xF8)  { c= d & 0x07; trailing= 3; }
2087 	else {
2088 	    /* no chance for this in Ascii */
2089 	    *outlen = out - outstart;
2090 	    *inlen = processed - instart;
2091 	    return(-2);
2092 	}
2093 
2094 	if (inend - in < trailing) {
2095 	    break;
2096 	}
2097 
2098 	for ( ; trailing; trailing--) {
2099 	    if ((in >= inend) || (((d= *in++) & 0xC0) != 0x80))
2100 		break;
2101 	    c <<= 6;
2102 	    c |= d & 0x3F;
2103 	}
2104 
2105 	/* assertion: c is a single UTF-4 value */
2106 	if (c < 0x80) {
2107 	    if (out + 1 >= outend)
2108 		break;
2109 	    *out++ = c;
2110 	} else {
2111 	    int len;
2112 	    const htmlEntityDesc * ent;
2113 	    const char *cp;
2114 	    char nbuf[16];
2115 
2116 	    /*
2117 	     * Try to lookup a predefined HTML entity for it
2118 	     */
2119 
2120 	    ent = htmlEntityValueLookup(c);
2121 	    if (ent == NULL) {
2122 	      snprintf(nbuf, sizeof(nbuf), "#%u", c);
2123 	      cp = nbuf;
2124 	    }
2125 	    else
2126 	      cp = ent->name;
2127 	    len = strlen(cp);
2128 	    if (out + 2 + len >= outend)
2129 		break;
2130 	    *out++ = '&';
2131 	    memcpy(out, cp, len);
2132 	    out += len;
2133 	    *out++ = ';';
2134 	}
2135 	processed = in;
2136     }
2137     *outlen = out - outstart;
2138     *inlen = processed - instart;
2139     return(0);
2140 }
2141 
2142 /**
2143  * htmlEncodeEntities:
2144  * @out:  a pointer to an array of bytes to store the result
2145  * @outlen:  the length of @out
2146  * @in:  a pointer to an array of UTF-8 chars
2147  * @inlen:  the length of @in
2148  * @quoteChar: the quote character to escape (' or ") or zero.
2149  *
2150  * Take a block of UTF-8 chars in and try to convert it to an ASCII
2151  * plus HTML entities block of chars out.
2152  *
2153  * Returns 0 if success, -2 if the transcoding fails, or -1 otherwise
2154  * The value of @inlen after return is the number of octets consumed
2155  *     as the return value is positive, else unpredictable.
2156  * The value of @outlen after return is the number of octets consumed.
2157  */
2158 int
htmlEncodeEntities(unsigned char * out,int * outlen,const unsigned char * in,int * inlen,int quoteChar)2159 htmlEncodeEntities(unsigned char* out, int *outlen,
2160 		   const unsigned char* in, int *inlen, int quoteChar) {
2161     const unsigned char* processed = in;
2162     const unsigned char* outend;
2163     const unsigned char* outstart = out;
2164     const unsigned char* instart = in;
2165     const unsigned char* inend;
2166     unsigned int c, d;
2167     int trailing;
2168 
2169     if ((out == NULL) || (outlen == NULL) || (inlen == NULL) || (in == NULL))
2170         return(-1);
2171     outend = out + (*outlen);
2172     inend = in + (*inlen);
2173     while (in < inend) {
2174 	d = *in++;
2175 	if      (d < 0x80)  { c= d; trailing= 0; }
2176 	else if (d < 0xC0) {
2177 	    /* trailing byte in leading position */
2178 	    *outlen = out - outstart;
2179 	    *inlen = processed - instart;
2180 	    return(-2);
2181         } else if (d < 0xE0)  { c= d & 0x1F; trailing= 1; }
2182         else if (d < 0xF0)  { c= d & 0x0F; trailing= 2; }
2183         else if (d < 0xF8)  { c= d & 0x07; trailing= 3; }
2184 	else {
2185 	    /* no chance for this in Ascii */
2186 	    *outlen = out - outstart;
2187 	    *inlen = processed - instart;
2188 	    return(-2);
2189 	}
2190 
2191 	if (inend - in < trailing)
2192 	    break;
2193 
2194 	while (trailing--) {
2195 	    if (((d= *in++) & 0xC0) != 0x80) {
2196 		*outlen = out - outstart;
2197 		*inlen = processed - instart;
2198 		return(-2);
2199 	    }
2200 	    c <<= 6;
2201 	    c |= d & 0x3F;
2202 	}
2203 
2204 	/* assertion: c is a single UTF-4 value */
2205 	if ((c < 0x80) && (c != (unsigned int) quoteChar) &&
2206 	    (c != '&') && (c != '<') && (c != '>')) {
2207 	    if (out >= outend)
2208 		break;
2209 	    *out++ = c;
2210 	} else {
2211 	    const htmlEntityDesc * ent;
2212 	    const char *cp;
2213 	    char nbuf[16];
2214 	    int len;
2215 
2216 	    /*
2217 	     * Try to lookup a predefined HTML entity for it
2218 	     */
2219 	    ent = htmlEntityValueLookup(c);
2220 	    if (ent == NULL) {
2221 		snprintf(nbuf, sizeof(nbuf), "#%u", c);
2222 		cp = nbuf;
2223 	    }
2224 	    else
2225 		cp = ent->name;
2226 	    len = strlen(cp);
2227 	    if (outend - out < len + 2)
2228 		break;
2229 	    *out++ = '&';
2230 	    memcpy(out, cp, len);
2231 	    out += len;
2232 	    *out++ = ';';
2233 	}
2234 	processed = in;
2235     }
2236     *outlen = out - outstart;
2237     *inlen = processed - instart;
2238     return(0);
2239 }
2240 
2241 /************************************************************************
2242  *									*
2243  *		Commodity functions, cleanup needed ?			*
2244  *									*
2245  ************************************************************************/
/*
 * all tags allowing pc data from the html 4.01 loose dtd
 * NOTE: it might be more appropriate to integrate this information
 * into the html40ElementTable array but I don't want to risk any
 * binary incompatibility
 *
 * Both the strings and the array of pointers are constant, so the
 * whole table is read-only.
 */
static const char * const allowPCData[] = {
    "a", "abbr", "acronym", "address", "applet", "b", "bdo", "big",
    "blockquote", "body", "button", "caption", "center", "cite", "code",
    "dd", "del", "dfn", "div", "dt", "em", "font", "form", "h1", "h2",
    "h3", "h4", "h5", "h6", "i", "iframe", "ins", "kbd", "label", "legend",
    "li", "noframes", "noscript", "object", "p", "pre", "q", "s", "samp",
    "small", "span", "strike", "strong", "td", "th", "tt", "u", "var"
};
2260 
2261 /**
2262  * areBlanks:
2263  * @ctxt:  an HTML parser context
2264  * @str:  a xmlChar *
2265  * @len:  the size of @str
2266  *
2267  * Is this a sequence of blank chars that one can ignore ?
2268  *
2269  * Returns 1 if ignorable 0 otherwise.
2270  */
2271 
areBlanks(htmlParserCtxtPtr ctxt,const xmlChar * str,int len)2272 static int areBlanks(htmlParserCtxtPtr ctxt, const xmlChar *str, int len) {
2273     unsigned int i;
2274     int j;
2275     xmlNodePtr lastChild;
2276     xmlDtdPtr dtd;
2277 
2278     for (j = 0;j < len;j++)
2279         if (!(IS_BLANK_CH(str[j]))) return(0);
2280 
2281     if (CUR == 0) return(1);
2282     if (CUR != '<') return(0);
2283     if (ctxt->name == NULL)
2284 	return(1);
2285     if (xmlStrEqual(ctxt->name, BAD_CAST"html"))
2286 	return(1);
2287     if (xmlStrEqual(ctxt->name, BAD_CAST"head"))
2288 	return(1);
2289 
2290     /* Only strip CDATA children of the body tag for strict HTML DTDs */
2291     if (xmlStrEqual(ctxt->name, BAD_CAST "body") && ctxt->myDoc != NULL) {
2292         dtd = xmlGetIntSubset(ctxt->myDoc);
2293         if (dtd != NULL && dtd->ExternalID != NULL) {
2294             if (!xmlStrcasecmp(dtd->ExternalID, BAD_CAST "-//W3C//DTD HTML 4.01//EN") ||
2295                     !xmlStrcasecmp(dtd->ExternalID, BAD_CAST "-//W3C//DTD HTML 4//EN"))
2296                 return(1);
2297         }
2298     }
2299 
2300     if (ctxt->node == NULL) return(0);
2301     lastChild = xmlGetLastChild(ctxt->node);
2302     while ((lastChild) && (lastChild->type == XML_COMMENT_NODE))
2303 	lastChild = lastChild->prev;
2304     if (lastChild == NULL) {
2305         if ((ctxt->node->type != XML_ELEMENT_NODE) &&
2306             (ctxt->node->content != NULL)) return(0);
2307 	/* keep ws in constructs like ...<b> </b>...
2308 	   for all tags "b" allowing PCDATA */
2309 	for ( i = 0; i < sizeof(allowPCData)/sizeof(allowPCData[0]); i++ ) {
2310 	    if ( xmlStrEqual(ctxt->name, BAD_CAST allowPCData[i]) ) {
2311 		return(0);
2312 	    }
2313 	}
2314     } else if (xmlNodeIsText(lastChild)) {
2315         return(0);
2316     } else {
2317 	/* keep ws in constructs like <p><b>xy</b> <i>z</i><p>
2318 	   for all tags "p" allowing PCDATA */
2319 	for ( i = 0; i < sizeof(allowPCData)/sizeof(allowPCData[0]); i++ ) {
2320 	    if ( xmlStrEqual(lastChild->name, BAD_CAST allowPCData[i]) ) {
2321 		return(0);
2322 	    }
2323 	}
2324     }
2325     return(1);
2326 }
2327 
2328 /**
2329  * htmlNewDocNoDtD:
2330  * @URI:  URI for the dtd, or NULL
2331  * @ExternalID:  the external ID of the DTD, or NULL
2332  *
2333  * Creates a new HTML document without a DTD node if @URI and @ExternalID
2334  * are NULL
2335  *
2336  * Returns a new document, do not initialize the DTD if not provided
2337  */
2338 htmlDocPtr
htmlNewDocNoDtD(const xmlChar * URI,const xmlChar * ExternalID)2339 htmlNewDocNoDtD(const xmlChar *URI, const xmlChar *ExternalID) {
2340     xmlDocPtr cur;
2341 
2342     /*
2343      * Allocate a new document and fill the fields.
2344      */
2345     cur = (xmlDocPtr) xmlMalloc(sizeof(xmlDoc));
2346     if (cur == NULL)
2347 	return(NULL);
2348     memset(cur, 0, sizeof(xmlDoc));
2349 
2350     cur->type = XML_HTML_DOCUMENT_NODE;
2351     cur->version = NULL;
2352     cur->intSubset = NULL;
2353     cur->doc = cur;
2354     cur->name = NULL;
2355     cur->children = NULL;
2356     cur->extSubset = NULL;
2357     cur->oldNs = NULL;
2358     cur->encoding = NULL;
2359     cur->standalone = 1;
2360     cur->compression = 0;
2361     cur->ids = NULL;
2362     cur->refs = NULL;
2363     cur->_private = NULL;
2364     cur->charset = XML_CHAR_ENCODING_UTF8;
2365     cur->properties = XML_DOC_HTML | XML_DOC_USERBUILT;
2366     if ((ExternalID != NULL) ||
2367 	(URI != NULL)) {
2368         xmlDtdPtr intSubset;
2369 
2370 	intSubset = xmlCreateIntSubset(cur, BAD_CAST "html", ExternalID, URI);
2371         if (intSubset == NULL) {
2372             xmlFree(cur);
2373             return(NULL);
2374         }
2375     }
2376     if ((__xmlRegisterCallbacks) && (xmlRegisterNodeDefaultValue))
2377 	xmlRegisterNodeDefaultValue((xmlNodePtr)cur);
2378     return(cur);
2379 }
2380 
2381 /**
2382  * htmlNewDoc:
2383  * @URI:  URI for the dtd, or NULL
2384  * @ExternalID:  the external ID of the DTD, or NULL
2385  *
2386  * Creates a new HTML document
2387  *
2388  * Returns a new document
2389  */
2390 htmlDocPtr
htmlNewDoc(const xmlChar * URI,const xmlChar * ExternalID)2391 htmlNewDoc(const xmlChar *URI, const xmlChar *ExternalID) {
2392     if ((URI == NULL) && (ExternalID == NULL))
2393 	return(htmlNewDocNoDtD(
2394 		    BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd",
2395 		    BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN"));
2396 
2397     return(htmlNewDocNoDtD(URI, ExternalID));
2398 }
2399 
2400 
2401 /************************************************************************
2402  *									*
2403  *			The parser itself				*
2404  *	Relates to http://www.w3.org/TR/html40				*
2405  *									*
2406  ************************************************************************/
2407 
2408 /************************************************************************
2409  *									*
2410  *			The parser itself				*
2411  *									*
2412  ************************************************************************/
2413 
2414 static const xmlChar * htmlParseNameComplex(xmlParserCtxtPtr ctxt);
2415 
2416 static void
htmlSkipBogusComment(htmlParserCtxtPtr ctxt)2417 htmlSkipBogusComment(htmlParserCtxtPtr ctxt) {
2418     int c;
2419 
2420     htmlParseErr(ctxt, XML_HTML_INCORRECTLY_OPENED_COMMENT,
2421                  "Incorrectly opened comment\n", NULL, NULL);
2422 
2423     while (PARSER_STOPPED(ctxt) == 0) {
2424         c = CUR;
2425         if (c == 0)
2426             break;
2427         NEXT;
2428         if (c == '>')
2429             break;
2430     }
2431 }
2432 
2433 /**
2434  * htmlParseHTMLName:
2435  * @ctxt:  an HTML parser context
2436  *
2437  * parse an HTML tag or attribute name, note that we convert it to lowercase
2438  * since HTML names are not case-sensitive.
2439  *
2440  * Returns the Tag Name parsed or NULL
2441  */
2442 
2443 static const xmlChar *
htmlParseHTMLName(htmlParserCtxtPtr ctxt)2444 htmlParseHTMLName(htmlParserCtxtPtr ctxt) {
2445     const xmlChar *ret;
2446     int i = 0;
2447     xmlChar loc[HTML_PARSER_BUFFER_SIZE];
2448 
2449     if (!IS_ASCII_LETTER(CUR) && (CUR != '_') &&
2450         (CUR != ':') && (CUR != '.')) return(NULL);
2451 
2452     while ((i < HTML_PARSER_BUFFER_SIZE) &&
2453            ((IS_ASCII_LETTER(CUR)) || (IS_ASCII_DIGIT(CUR)) ||
2454 	   (CUR == ':') || (CUR == '-') || (CUR == '_') ||
2455            (CUR == '.'))) {
2456 	if ((CUR >= 'A') && (CUR <= 'Z')) loc[i] = CUR + 0x20;
2457         else loc[i] = CUR;
2458 	i++;
2459 
2460 	NEXT;
2461     }
2462 
2463     ret = xmlDictLookup(ctxt->dict, loc, i);
2464     if (ret == NULL)
2465         htmlErrMemory(ctxt);
2466 
2467     return(ret);
2468 }
2469 
2470 
2471 /**
2472  * htmlParseHTMLName_nonInvasive:
2473  * @ctxt:  an HTML parser context
2474  *
2475  * parse an HTML tag or attribute name, note that we convert it to lowercase
2476  * since HTML names are not case-sensitive, this doesn't consume the data
2477  * from the stream, it's a look-ahead
2478  *
2479  * Returns the Tag Name parsed or NULL
2480  */
2481 
2482 static const xmlChar *
htmlParseHTMLName_nonInvasive(htmlParserCtxtPtr ctxt)2483 htmlParseHTMLName_nonInvasive(htmlParserCtxtPtr ctxt) {
2484     int i = 0;
2485     xmlChar loc[HTML_PARSER_BUFFER_SIZE];
2486     const xmlChar *ret;
2487 
2488     if (!IS_ASCII_LETTER(NXT(1)) && (NXT(1) != '_') &&
2489         (NXT(1) != ':')) return(NULL);
2490 
2491     while ((i < HTML_PARSER_BUFFER_SIZE) &&
2492            ((IS_ASCII_LETTER(NXT(1+i))) || (IS_ASCII_DIGIT(NXT(1+i))) ||
2493 	   (NXT(1+i) == ':') || (NXT(1+i) == '-') || (NXT(1+i) == '_'))) {
2494 	if ((NXT(1+i) >= 'A') && (NXT(1+i) <= 'Z')) loc[i] = NXT(1+i) + 0x20;
2495         else loc[i] = NXT(1+i);
2496 	i++;
2497     }
2498 
2499     ret = xmlDictLookup(ctxt->dict, loc, i);
2500     if (ret == NULL)
2501         htmlErrMemory(ctxt);
2502 
2503     return(ret);
2504 }
2505 
2506 
2507 /**
2508  * htmlParseName:
2509  * @ctxt:  an HTML parser context
2510  *
2511  * parse an HTML name, this routine is case sensitive.
2512  *
2513  * Returns the Name parsed or NULL
2514  */
2515 
2516 static const xmlChar *
htmlParseName(htmlParserCtxtPtr ctxt)2517 htmlParseName(htmlParserCtxtPtr ctxt) {
2518     const xmlChar *in;
2519     const xmlChar *ret;
2520     int count = 0;
2521 
2522     GROW;
2523 
2524     /*
2525      * Accelerator for simple ASCII names
2526      */
2527     in = ctxt->input->cur;
2528     if (((*in >= 0x61) && (*in <= 0x7A)) ||
2529 	((*in >= 0x41) && (*in <= 0x5A)) ||
2530 	(*in == '_') || (*in == ':')) {
2531 	in++;
2532 	while (((*in >= 0x61) && (*in <= 0x7A)) ||
2533 	       ((*in >= 0x41) && (*in <= 0x5A)) ||
2534 	       ((*in >= 0x30) && (*in <= 0x39)) ||
2535 	       (*in == '_') || (*in == '-') ||
2536 	       (*in == ':') || (*in == '.'))
2537 	    in++;
2538 
2539 	if (in == ctxt->input->end)
2540 	    return(NULL);
2541 
2542 	if ((*in > 0) && (*in < 0x80)) {
2543 	    count = in - ctxt->input->cur;
2544 	    ret = xmlDictLookup(ctxt->dict, ctxt->input->cur, count);
2545             if (ret == NULL)
2546                 htmlErrMemory(ctxt);
2547 	    ctxt->input->cur = in;
2548 	    ctxt->input->col += count;
2549 	    return(ret);
2550 	}
2551     }
2552     return(htmlParseNameComplex(ctxt));
2553 }
2554 
2555 static const xmlChar *
htmlParseNameComplex(xmlParserCtxtPtr ctxt)2556 htmlParseNameComplex(xmlParserCtxtPtr ctxt) {
2557     int len = 0, l;
2558     int c;
2559     int maxLength = (ctxt->options & XML_PARSE_HUGE) ?
2560                     XML_MAX_TEXT_LENGTH :
2561                     XML_MAX_NAME_LENGTH;
2562     const xmlChar *base = ctxt->input->base;
2563     const xmlChar *ret;
2564 
2565     /*
2566      * Handler for more complex cases
2567      */
2568     c = CUR_CHAR(l);
2569     if ((c == ' ') || (c == '>') || (c == '/') || /* accelerators */
2570 	(!IS_LETTER(c) && (c != '_') &&
2571          (c != ':'))) {
2572 	return(NULL);
2573     }
2574 
2575     while ((c != ' ') && (c != '>') && (c != '/') && /* test bigname.xml */
2576 	   ((IS_LETTER(c)) || (IS_DIGIT(c)) ||
2577             (c == '.') || (c == '-') ||
2578 	    (c == '_') || (c == ':') ||
2579 	    (IS_COMBINING(c)) ||
2580 	    (IS_EXTENDER(c)))) {
2581 	len += l;
2582         if (len > maxLength) {
2583             htmlParseErr(ctxt, XML_ERR_NAME_TOO_LONG, "name too long", NULL, NULL);
2584             return(NULL);
2585         }
2586 	NEXTL(l);
2587 	c = CUR_CHAR(l);
2588 	if (ctxt->input->base != base) {
2589 	    /*
2590 	     * We changed encoding from an unknown encoding
2591 	     * Input buffer changed location, so we better start again
2592 	     */
2593 	    return(htmlParseNameComplex(ctxt));
2594 	}
2595     }
2596 
2597     if (ctxt->input->cur - ctxt->input->base < len) {
2598         /* Sanity check */
2599 	htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
2600                      "unexpected change of input buffer", NULL, NULL);
2601         return (NULL);
2602     }
2603 
2604     ret = xmlDictLookup(ctxt->dict, ctxt->input->cur - len, len);
2605     if (ret == NULL)
2606         htmlErrMemory(ctxt);
2607 
2608     return(ret);
2609 }
2610 
2611 
2612 /**
2613  * htmlParseHTMLAttribute:
2614  * @ctxt:  an HTML parser context
2615  * @stop:  a char stop value
2616  *
2617  * parse an HTML attribute value till the stop (quote), if
2618  * stop is 0 then it stops at the first space
2619  *
2620  * Returns the attribute parsed or NULL
2621  */
2622 
2623 static xmlChar *
htmlParseHTMLAttribute(htmlParserCtxtPtr ctxt,const xmlChar stop)2624 htmlParseHTMLAttribute(htmlParserCtxtPtr ctxt, const xmlChar stop) {
2625     xmlChar *buffer = NULL;
2626     int buffer_size = 0;
2627     int maxLength = (ctxt->options & XML_PARSE_HUGE) ?
2628                     XML_MAX_HUGE_LENGTH :
2629                     XML_MAX_TEXT_LENGTH;
2630     xmlChar *out = NULL;
2631     const xmlChar *name = NULL;
2632     const xmlChar *cur = NULL;
2633     const htmlEntityDesc * ent;
2634 
2635     /*
2636      * allocate a translation buffer.
2637      */
2638     buffer_size = HTML_PARSER_BUFFER_SIZE;
2639     buffer = (xmlChar *) xmlMallocAtomic(buffer_size);
2640     if (buffer == NULL) {
2641 	htmlErrMemory(ctxt);
2642 	return(NULL);
2643     }
2644     out = buffer;
2645 
2646     /*
2647      * Ok loop until we reach one of the ending chars
2648      */
2649     while ((PARSER_STOPPED(ctxt) == 0) &&
2650            (CUR != 0) && (CUR != stop)) {
2651 	if ((stop == 0) && (CUR == '>')) break;
2652 	if ((stop == 0) && (IS_BLANK_CH(CUR))) break;
2653         if (CUR == '&') {
2654 	    if (NXT(1) == '#') {
2655 		unsigned int c;
2656 		int bits;
2657 
2658 		c = htmlParseCharRef(ctxt);
2659 		if      (c <    0x80)
2660 		        { *out++  = c;                bits= -6; }
2661 		else if (c <   0x800)
2662 		        { *out++  =((c >>  6) & 0x1F) | 0xC0;  bits=  0; }
2663 		else if (c < 0x10000)
2664 		        { *out++  =((c >> 12) & 0x0F) | 0xE0;  bits=  6; }
2665 		else
2666 		        { *out++  =((c >> 18) & 0x07) | 0xF0;  bits= 12; }
2667 
2668 		for ( ; bits >= 0; bits-= 6) {
2669 		    *out++  = ((c >> bits) & 0x3F) | 0x80;
2670 		}
2671 
2672 		if (out - buffer > buffer_size - 100) {
2673 			int indx = out - buffer;
2674 
2675 			growBuffer(buffer);
2676 			out = &buffer[indx];
2677 		}
2678 	    } else {
2679 		ent = htmlParseEntityRef(ctxt, &name);
2680 		if (name == NULL) {
2681 		    *out++ = '&';
2682 		    if (out - buffer > buffer_size - 100) {
2683 			int indx = out - buffer;
2684 
2685 			growBuffer(buffer);
2686 			out = &buffer[indx];
2687 		    }
2688 		} else if (ent == NULL) {
2689 		    *out++ = '&';
2690 		    cur = name;
2691 		    while (*cur != 0) {
2692 			if (out - buffer > buffer_size - 100) {
2693 			    int indx = out - buffer;
2694 
2695 			    growBuffer(buffer);
2696 			    out = &buffer[indx];
2697 			}
2698 			*out++ = *cur++;
2699 		    }
2700 		} else {
2701 		    unsigned int c;
2702 		    int bits;
2703 
2704 		    if (out - buffer > buffer_size - 100) {
2705 			int indx = out - buffer;
2706 
2707 			growBuffer(buffer);
2708 			out = &buffer[indx];
2709 		    }
2710 		    c = ent->value;
2711 		    if      (c <    0x80)
2712 			{ *out++  = c;                bits= -6; }
2713 		    else if (c <   0x800)
2714 			{ *out++  =((c >>  6) & 0x1F) | 0xC0;  bits=  0; }
2715 		    else if (c < 0x10000)
2716 			{ *out++  =((c >> 12) & 0x0F) | 0xE0;  bits=  6; }
2717 		    else
2718 			{ *out++  =((c >> 18) & 0x07) | 0xF0;  bits= 12; }
2719 
2720 		    for ( ; bits >= 0; bits-= 6) {
2721 			*out++  = ((c >> bits) & 0x3F) | 0x80;
2722 		    }
2723 		}
2724 	    }
2725 	} else {
2726 	    unsigned int c;
2727 	    int bits, l;
2728 
2729 	    if (out - buffer > buffer_size - 100) {
2730 		int indx = out - buffer;
2731 
2732 		growBuffer(buffer);
2733 		out = &buffer[indx];
2734 	    }
2735 	    c = CUR_CHAR(l);
2736 	    if      (c <    0x80)
2737 		    { *out++  = c;                bits= -6; }
2738 	    else if (c <   0x800)
2739 		    { *out++  =((c >>  6) & 0x1F) | 0xC0;  bits=  0; }
2740 	    else if (c < 0x10000)
2741 		    { *out++  =((c >> 12) & 0x0F) | 0xE0;  bits=  6; }
2742 	    else
2743 		    { *out++  =((c >> 18) & 0x07) | 0xF0;  bits= 12; }
2744 
2745 	    for ( ; bits >= 0; bits-= 6) {
2746 		*out++  = ((c >> bits) & 0x3F) | 0x80;
2747 	    }
2748 	    NEXTL(l);
2749 	}
2750         if (out - buffer > maxLength) {
2751             htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_NOT_FINISHED,
2752                          "attribute value too long\n", NULL, NULL);
2753             xmlFree(buffer);
2754             return(NULL);
2755         }
2756     }
2757     *out = 0;
2758     return(buffer);
2759 }
2760 
2761 /**
2762  * htmlParseEntityRef:
2763  * @ctxt:  an HTML parser context
2764  * @str:  location to store the entity name
2765  *
2766  * DEPRECATED: Internal function, don't use.
2767  *
2768  * parse an HTML ENTITY references
2769  *
2770  * [68] EntityRef ::= '&' Name ';'
2771  *
2772  * Returns the associated htmlEntityDescPtr if found, or NULL otherwise,
2773  *         if non-NULL *str will have to be freed by the caller.
2774  */
2775 const htmlEntityDesc *
htmlParseEntityRef(htmlParserCtxtPtr ctxt,const xmlChar ** str)2776 htmlParseEntityRef(htmlParserCtxtPtr ctxt, const xmlChar **str) {
2777     const xmlChar *name;
2778     const htmlEntityDesc * ent = NULL;
2779 
2780     if (str != NULL) *str = NULL;
2781     if ((ctxt == NULL) || (ctxt->input == NULL)) return(NULL);
2782 
2783     if (CUR == '&') {
2784         NEXT;
2785         name = htmlParseName(ctxt);
2786 	if (name == NULL) {
2787 	    htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
2788 	                 "htmlParseEntityRef: no name\n", NULL, NULL);
2789 	} else {
2790 	    GROW;
2791 	    if (CUR == ';') {
2792 	        if (str != NULL)
2793 		    *str = name;
2794 
2795 		/*
2796 		 * Lookup the entity in the table.
2797 		 */
2798 		ent = htmlEntityLookup(name);
2799 		if (ent != NULL) /* OK that's ugly !!! */
2800 		    NEXT;
2801 	    } else {
2802 		htmlParseErr(ctxt, XML_ERR_ENTITYREF_SEMICOL_MISSING,
2803 		             "htmlParseEntityRef: expecting ';'\n",
2804 			     NULL, NULL);
2805 	        if (str != NULL)
2806 		    *str = name;
2807 	    }
2808 	}
2809     }
2810     return(ent);
2811 }
2812 
2813 /**
2814  * htmlParseAttValue:
2815  * @ctxt:  an HTML parser context
2816  *
2817  * parse a value for an attribute
2818  * Note: the parser won't do substitution of entities here, this
2819  * will be handled later in xmlStringGetNodeList, unless it was
2820  * asked for ctxt->replaceEntities != 0
2821  *
2822  * Returns the AttValue parsed or NULL.
2823  */
2824 
2825 static xmlChar *
htmlParseAttValue(htmlParserCtxtPtr ctxt)2826 htmlParseAttValue(htmlParserCtxtPtr ctxt) {
2827     xmlChar *ret = NULL;
2828 
2829     if (CUR == '"') {
2830         NEXT;
2831 	ret = htmlParseHTMLAttribute(ctxt, '"');
2832         if (CUR != '"') {
2833 	    htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_NOT_FINISHED,
2834 	                 "AttValue: \" expected\n", NULL, NULL);
2835 	} else
2836 	    NEXT;
2837     } else if (CUR == '\'') {
2838         NEXT;
2839 	ret = htmlParseHTMLAttribute(ctxt, '\'');
2840         if (CUR != '\'') {
2841 	    htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_NOT_FINISHED,
2842 	                 "AttValue: ' expected\n", NULL, NULL);
2843 	} else
2844 	    NEXT;
2845     } else {
2846         /*
2847 	 * That's an HTMLism, the attribute value may not be quoted
2848 	 */
2849 	ret = htmlParseHTMLAttribute(ctxt, 0);
2850 	if (ret == NULL) {
2851 	    htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_WITHOUT_VALUE,
2852 	                 "AttValue: no value found\n", NULL, NULL);
2853 	}
2854     }
2855     return(ret);
2856 }
2857 
2858 /**
2859  * htmlParseSystemLiteral:
2860  * @ctxt:  an HTML parser context
2861  *
2862  * parse an HTML Literal
2863  *
2864  * [11] SystemLiteral ::= ('"' [^"]* '"') | ("'" [^']* "'")
2865  *
2866  * Returns the SystemLiteral parsed or NULL
2867  */
2868 
2869 static xmlChar *
htmlParseSystemLiteral(htmlParserCtxtPtr ctxt)2870 htmlParseSystemLiteral(htmlParserCtxtPtr ctxt) {
2871     size_t len = 0, startPosition = 0;
2872     int err = 0;
2873     int quote;
2874     xmlChar *ret = NULL;
2875 
2876     if ((CUR != '"') && (CUR != '\'')) {
2877 	htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_STARTED,
2878 	             "SystemLiteral \" or ' expected\n", NULL, NULL);
2879         return(NULL);
2880     }
2881     quote = CUR;
2882     NEXT;
2883 
2884     if (CUR_PTR < BASE_PTR)
2885         return(ret);
2886     startPosition = CUR_PTR - BASE_PTR;
2887 
2888     while ((PARSER_STOPPED(ctxt) == 0) &&
2889            (CUR != 0) && (CUR != quote)) {
2890         /* TODO: Handle UTF-8 */
2891         if (!IS_CHAR_CH(CUR)) {
2892             htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
2893                             "Invalid char in SystemLiteral 0x%X\n", CUR);
2894             err = 1;
2895         }
2896         NEXT;
2897         len++;
2898     }
2899     if (CUR != quote) {
2900         htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED,
2901                      "Unfinished SystemLiteral\n", NULL, NULL);
2902     } else {
2903         if (err == 0) {
2904             ret = xmlStrndup((BASE_PTR+startPosition), len);
2905             if (ret == NULL) {
2906                 htmlErrMemory(ctxt);
2907                 return(NULL);
2908             }
2909         }
2910         NEXT;
2911     }
2912 
2913     return(ret);
2914 }
2915 
2916 /**
2917  * htmlParsePubidLiteral:
2918  * @ctxt:  an HTML parser context
2919  *
2920  * parse an HTML public literal
2921  *
2922  * [12] PubidLiteral ::= '"' PubidChar* '"' | "'" (PubidChar - "'")* "'"
2923  *
2924  * Returns the PubidLiteral parsed or NULL.
2925  */
2926 
2927 static xmlChar *
htmlParsePubidLiteral(htmlParserCtxtPtr ctxt)2928 htmlParsePubidLiteral(htmlParserCtxtPtr ctxt) {
2929     size_t len = 0, startPosition = 0;
2930     int err = 0;
2931     int quote;
2932     xmlChar *ret = NULL;
2933 
2934     if ((CUR != '"') && (CUR != '\'')) {
2935 	htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_STARTED,
2936 	             "PubidLiteral \" or ' expected\n", NULL, NULL);
2937         return(NULL);
2938     }
2939     quote = CUR;
2940     NEXT;
2941 
2942     /*
2943      * Name ::= (Letter | '_') (NameChar)*
2944      */
2945     if (CUR_PTR < BASE_PTR)
2946         return(ret);
2947     startPosition = CUR_PTR - BASE_PTR;
2948 
2949     while ((PARSER_STOPPED(ctxt) == 0) &&
2950            (CUR != 0) && (CUR != quote)) {
2951         if (!IS_PUBIDCHAR_CH(CUR)) {
2952             htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
2953                             "Invalid char in PubidLiteral 0x%X\n", CUR);
2954             err = 1;
2955         }
2956         len++;
2957         NEXT;
2958     }
2959 
2960     if (CUR != quote) {
2961         htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED,
2962                      "Unfinished PubidLiteral\n", NULL, NULL);
2963     } else {
2964         if (err == 0) {
2965             ret = xmlStrndup((BASE_PTR + startPosition), len);
2966             if (ret == NULL) {
2967                 htmlErrMemory(ctxt);
2968                 return(NULL);
2969             }
2970         }
2971         NEXT;
2972     }
2973 
2974     return(ret);
2975 }
2976 
2977 /**
2978  * htmlParseScript:
2979  * @ctxt:  an HTML parser context
2980  *
2981  * parse the content of an HTML SCRIPT or STYLE element
2982  * http://www.w3.org/TR/html4/sgml/dtd.html#Script
2983  * http://www.w3.org/TR/html4/sgml/dtd.html#StyleSheet
2984  * http://www.w3.org/TR/html4/types.html#type-script
2985  * http://www.w3.org/TR/html4/types.html#h-6.15
2986  * http://www.w3.org/TR/html4/appendix/notes.html#h-B.3.2.1
2987  *
2988  * Script data ( %Script; in the DTD) can be the content of the SCRIPT
2989  * element and the value of intrinsic event attributes. User agents must
2990  * not evaluate script data as HTML markup but instead must pass it on as
2991  * data to a script engine.
2992  * NOTES:
2993  * - The content is passed like CDATA
2994  * - the attributes for style and scripting "onXXX" are also described
2995  *   as CDATA but SGML allows entities references in attributes so their
2996  *   processing is identical as other attributes
2997  */
2998 static void
htmlParseScript(htmlParserCtxtPtr ctxt)2999 htmlParseScript(htmlParserCtxtPtr ctxt) {
3000     xmlChar buf[HTML_PARSER_BIG_BUFFER_SIZE + 5];
3001     int nbchar = 0;
3002     int cur,l;
3003 
3004     cur = CUR_CHAR(l);
3005     while (cur != 0) {
3006 	if ((cur == '<') && (NXT(1) == '/')) {
3007             /*
3008              * One should break here, the specification is clear:
3009              * Authors should therefore escape "</" within the content.
3010              * Escape mechanisms are specific to each scripting or
3011              * style sheet language.
3012              *
3013              * In recovery mode, only break if end tag match the
3014              * current tag, effectively ignoring all tags inside the
3015              * script/style block and treating the entire block as
3016              * CDATA.
3017              */
3018             if (ctxt->recovery) {
3019                 if (xmlStrncasecmp(ctxt->name, ctxt->input->cur+2,
3020 				   xmlStrlen(ctxt->name)) == 0)
3021                 {
3022                     break; /* while */
3023                 } else {
3024 		    htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,
3025 				 "Element %s embeds close tag\n",
3026 		                 ctxt->name, NULL);
3027 		}
3028             } else {
3029                 if (((NXT(2) >= 'A') && (NXT(2) <= 'Z')) ||
3030                     ((NXT(2) >= 'a') && (NXT(2) <= 'z')))
3031                 {
3032                     break; /* while */
3033                 }
3034             }
3035 	}
3036         if (IS_CHAR(cur)) {
3037 	    COPY_BUF(l,buf,nbchar,cur);
3038         } else {
3039             htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
3040                             "Invalid char in CDATA 0x%X\n", cur);
3041         }
3042 	NEXTL(l);
3043 	if (nbchar >= HTML_PARSER_BIG_BUFFER_SIZE) {
3044             buf[nbchar] = 0;
3045 	    if (ctxt->sax->cdataBlock!= NULL) {
3046 		/*
3047 		 * Insert as CDATA, which is the same as HTML_PRESERVE_NODE
3048 		 */
3049 		ctxt->sax->cdataBlock(ctxt->userData, buf, nbchar);
3050 	    } else if (ctxt->sax->characters != NULL) {
3051 		ctxt->sax->characters(ctxt->userData, buf, nbchar);
3052 	    }
3053 	    nbchar = 0;
3054             SHRINK;
3055 	}
3056 	cur = CUR_CHAR(l);
3057     }
3058 
3059     if ((nbchar != 0) && (ctxt->sax != NULL) && (!ctxt->disableSAX)) {
3060         buf[nbchar] = 0;
3061 	if (ctxt->sax->cdataBlock!= NULL) {
3062 	    /*
3063 	     * Insert as CDATA, which is the same as HTML_PRESERVE_NODE
3064 	     */
3065 	    ctxt->sax->cdataBlock(ctxt->userData, buf, nbchar);
3066 	} else if (ctxt->sax->characters != NULL) {
3067 	    ctxt->sax->characters(ctxt->userData, buf, nbchar);
3068 	}
3069     }
3070 }
3071 
3072 
3073 /**
3074  * htmlParseCharDataInternal:
3075  * @ctxt:  an HTML parser context
3076  * @readahead: optional read ahead character in ascii range
3077  *
3078  * parse a CharData section.
3079  * if we are within a CDATA section ']]>' marks an end of section.
3080  *
3081  * [14] CharData ::= [^<&]* - ([^<&]* ']]>' [^<&]*)
3082  */
3083 
3084 static void
htmlParseCharDataInternal(htmlParserCtxtPtr ctxt,int readahead)3085 htmlParseCharDataInternal(htmlParserCtxtPtr ctxt, int readahead) {
3086     xmlChar buf[HTML_PARSER_BIG_BUFFER_SIZE + 6];
3087     int nbchar = 0;
3088     int cur, l;
3089 
3090     if (readahead)
3091         buf[nbchar++] = readahead;
3092 
3093     cur = CUR_CHAR(l);
3094     while ((cur != '<') &&
3095            (cur != '&') &&
3096 	   (cur != 0) &&
3097            (!PARSER_STOPPED(ctxt))) {
3098 	if (!(IS_CHAR(cur))) {
3099 	    htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
3100 	                "Invalid char in CDATA 0x%X\n", cur);
3101 	} else {
3102 	    COPY_BUF(l,buf,nbchar,cur);
3103 	}
3104 	NEXTL(l);
3105 	if (nbchar >= HTML_PARSER_BIG_BUFFER_SIZE) {
3106             buf[nbchar] = 0;
3107 
3108 	    /*
3109 	     * Ok the segment is to be consumed as chars.
3110 	     */
3111 	    if ((ctxt->sax != NULL) && (!ctxt->disableSAX)) {
3112 		if (areBlanks(ctxt, buf, nbchar)) {
3113 		    if (ctxt->keepBlanks) {
3114 			if (ctxt->sax->characters != NULL)
3115 			    ctxt->sax->characters(ctxt->userData, buf, nbchar);
3116 		    } else {
3117 			if (ctxt->sax->ignorableWhitespace != NULL)
3118 			    ctxt->sax->ignorableWhitespace(ctxt->userData,
3119 			                                   buf, nbchar);
3120 		    }
3121 		} else {
3122 		    htmlCheckParagraph(ctxt);
3123 		    if (ctxt->sax->characters != NULL)
3124 			ctxt->sax->characters(ctxt->userData, buf, nbchar);
3125 		}
3126 	    }
3127 	    nbchar = 0;
3128             SHRINK;
3129 	}
3130 	cur = CUR_CHAR(l);
3131     }
3132     if (nbchar != 0) {
3133         buf[nbchar] = 0;
3134 
3135 	/*
3136 	 * Ok the segment is to be consumed as chars.
3137 	 */
3138 	if ((ctxt->sax != NULL) && (!ctxt->disableSAX)) {
3139 	    if (areBlanks(ctxt, buf, nbchar)) {
3140 		if (ctxt->keepBlanks) {
3141 		    if (ctxt->sax->characters != NULL)
3142 			ctxt->sax->characters(ctxt->userData, buf, nbchar);
3143 		} else {
3144 		    if (ctxt->sax->ignorableWhitespace != NULL)
3145 			ctxt->sax->ignorableWhitespace(ctxt->userData,
3146 			                               buf, nbchar);
3147 		}
3148 	    } else {
3149 		htmlCheckParagraph(ctxt);
3150 		if (ctxt->sax->characters != NULL)
3151 		    ctxt->sax->characters(ctxt->userData, buf, nbchar);
3152 	    }
3153 	}
3154     }
3155 }
3156 
3157 /**
3158  * htmlParseCharData:
3159  * @ctxt:  an HTML parser context
3160  *
3161  * parse a CharData section.
3162  * if we are within a CDATA section ']]>' marks an end of section.
3163  *
3164  * [14] CharData ::= [^<&]* - ([^<&]* ']]>' [^<&]*)
3165  */
3166 
3167 static void
htmlParseCharData(htmlParserCtxtPtr ctxt)3168 htmlParseCharData(htmlParserCtxtPtr ctxt) {
3169     htmlParseCharDataInternal(ctxt, 0);
3170 }
3171 
3172 /**
3173  * htmlParseExternalID:
3174  * @ctxt:  an HTML parser context
3175  * @publicID:  a xmlChar** receiving PubidLiteral
3176  *
3177  * Parse an External ID or a Public ID
3178  *
3179  * [75] ExternalID ::= 'SYSTEM' S SystemLiteral
3180  *                   | 'PUBLIC' S PubidLiteral S SystemLiteral
3181  *
3182  * [83] PublicID ::= 'PUBLIC' S PubidLiteral
3183  *
3184  * Returns the function returns SystemLiteral and in the second
3185  *                case publicID receives PubidLiteral, is strict is off
3186  *                it is possible to return NULL and have publicID set.
3187  */
3188 
3189 static xmlChar *
htmlParseExternalID(htmlParserCtxtPtr ctxt,xmlChar ** publicID)3190 htmlParseExternalID(htmlParserCtxtPtr ctxt, xmlChar **publicID) {
3191     xmlChar *URI = NULL;
3192 
3193     if ((UPPER == 'S') && (UPP(1) == 'Y') &&
3194          (UPP(2) == 'S') && (UPP(3) == 'T') &&
3195 	 (UPP(4) == 'E') && (UPP(5) == 'M')) {
3196         SKIP(6);
3197 	if (!IS_BLANK_CH(CUR)) {
3198 	    htmlParseErr(ctxt, XML_ERR_SPACE_REQUIRED,
3199 	                 "Space required after 'SYSTEM'\n", NULL, NULL);
3200 	}
3201         SKIP_BLANKS;
3202 	URI = htmlParseSystemLiteral(ctxt);
3203 	if (URI == NULL) {
3204 	    htmlParseErr(ctxt, XML_ERR_URI_REQUIRED,
3205 	                 "htmlParseExternalID: SYSTEM, no URI\n", NULL, NULL);
3206         }
3207     } else if ((UPPER == 'P') && (UPP(1) == 'U') &&
3208 	       (UPP(2) == 'B') && (UPP(3) == 'L') &&
3209 	       (UPP(4) == 'I') && (UPP(5) == 'C')) {
3210         SKIP(6);
3211 	if (!IS_BLANK_CH(CUR)) {
3212 	    htmlParseErr(ctxt, XML_ERR_SPACE_REQUIRED,
3213 	                 "Space required after 'PUBLIC'\n", NULL, NULL);
3214 	}
3215         SKIP_BLANKS;
3216 	*publicID = htmlParsePubidLiteral(ctxt);
3217 	if (*publicID == NULL) {
3218 	    htmlParseErr(ctxt, XML_ERR_PUBID_REQUIRED,
3219 	                 "htmlParseExternalID: PUBLIC, no Public Identifier\n",
3220 			 NULL, NULL);
3221 	}
3222         SKIP_BLANKS;
3223         if ((CUR == '"') || (CUR == '\'')) {
3224 	    URI = htmlParseSystemLiteral(ctxt);
3225 	}
3226     }
3227     return(URI);
3228 }
3229 
3230 /**
3231  * htmlParsePI:
3232  * @ctxt:  an HTML parser context
3233  *
3234  * Parse an XML Processing Instruction. HTML5 doesn't allow processing
3235  * instructions, so this will be removed at some point.
3236  */
3237 static void
htmlParsePI(htmlParserCtxtPtr ctxt)3238 htmlParsePI(htmlParserCtxtPtr ctxt) {
3239     xmlChar *buf = NULL;
3240     int len = 0;
3241     int size = HTML_PARSER_BUFFER_SIZE;
3242     int cur, l;
3243     int maxLength = (ctxt->options & XML_PARSE_HUGE) ?
3244                     XML_MAX_HUGE_LENGTH :
3245                     XML_MAX_TEXT_LENGTH;
3246     const xmlChar *target;
3247     xmlParserInputState state;
3248 
3249     if ((RAW == '<') && (NXT(1) == '?')) {
3250 	state = ctxt->instate;
3251         ctxt->instate = XML_PARSER_PI;
3252 	/*
3253 	 * this is a Processing Instruction.
3254 	 */
3255 	SKIP(2);
3256 
3257 	/*
3258 	 * Parse the target name and check for special support like
3259 	 * namespace.
3260 	 */
3261         target = htmlParseName(ctxt);
3262 	if (target != NULL) {
3263 	    if (RAW == '>') {
3264 		SKIP(1);
3265 
3266 		/*
3267 		 * SAX: PI detected.
3268 		 */
3269 		if ((ctxt->sax) && (!ctxt->disableSAX) &&
3270 		    (ctxt->sax->processingInstruction != NULL))
3271 		    ctxt->sax->processingInstruction(ctxt->userData,
3272 		                                     target, NULL);
3273                 goto done;
3274 	    }
3275 	    buf = (xmlChar *) xmlMallocAtomic(size);
3276 	    if (buf == NULL) {
3277 		htmlErrMemory(ctxt);
3278 		return;
3279 	    }
3280 	    cur = CUR;
3281 	    if (!IS_BLANK(cur)) {
3282 		htmlParseErr(ctxt, XML_ERR_SPACE_REQUIRED,
3283 			  "ParsePI: PI %s space expected\n", target, NULL);
3284 	    }
3285             SKIP_BLANKS;
3286 	    cur = CUR_CHAR(l);
3287 	    while ((cur != 0) && (cur != '>')) {
3288 		if (len + 5 >= size) {
3289 		    xmlChar *tmp;
3290 
3291 		    size *= 2;
3292 		    tmp = (xmlChar *) xmlRealloc(buf, size);
3293 		    if (tmp == NULL) {
3294 			htmlErrMemory(ctxt);
3295 			xmlFree(buf);
3296 			return;
3297 		    }
3298 		    buf = tmp;
3299 		}
3300                 if (IS_CHAR(cur)) {
3301 		    COPY_BUF(l,buf,len,cur);
3302                 } else {
3303                     htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
3304                                     "Invalid char in processing instruction "
3305                                     "0x%X\n", cur);
3306                 }
3307                 if (len > maxLength) {
3308                     htmlParseErr(ctxt, XML_ERR_PI_NOT_FINISHED,
3309                                  "PI %s too long", target, NULL);
3310                     xmlFree(buf);
3311                     goto done;
3312                 }
3313 		NEXTL(l);
3314 		cur = CUR_CHAR(l);
3315 	    }
3316 	    buf[len] = 0;
3317 	    if (cur != '>') {
3318 		htmlParseErr(ctxt, XML_ERR_PI_NOT_FINISHED,
3319 		      "ParsePI: PI %s never end ...\n", target, NULL);
3320 	    } else {
3321 		SKIP(1);
3322 
3323 		/*
3324 		 * SAX: PI detected.
3325 		 */
3326 		if ((ctxt->sax) && (!ctxt->disableSAX) &&
3327 		    (ctxt->sax->processingInstruction != NULL))
3328 		    ctxt->sax->processingInstruction(ctxt->userData,
3329 		                                     target, buf);
3330 	    }
3331 	    xmlFree(buf);
3332 	} else {
3333 	    htmlParseErr(ctxt, XML_ERR_PI_NOT_STARTED,
3334                          "PI is not started correctly", NULL, NULL);
3335 	}
3336 
3337 done:
3338 	ctxt->instate = state;
3339     }
3340 }
3341 
3342 /**
3343  * htmlParseComment:
3344  * @ctxt:  an HTML parser context
3345  *
3346  * Parse an HTML comment
3347  */
3348 static void
htmlParseComment(htmlParserCtxtPtr ctxt)3349 htmlParseComment(htmlParserCtxtPtr ctxt) {
3350     xmlChar *buf = NULL;
3351     int len;
3352     int size = HTML_PARSER_BUFFER_SIZE;
3353     int q, ql;
3354     int r, rl;
3355     int cur, l;
3356     int next, nl;
3357     int maxLength = (ctxt->options & XML_PARSE_HUGE) ?
3358                     XML_MAX_HUGE_LENGTH :
3359                     XML_MAX_TEXT_LENGTH;
3360     xmlParserInputState state;
3361 
3362     /*
3363      * Check that there is a comment right here.
3364      */
3365     if ((RAW != '<') || (NXT(1) != '!') ||
3366         (NXT(2) != '-') || (NXT(3) != '-')) return;
3367 
3368     state = ctxt->instate;
3369     ctxt->instate = XML_PARSER_COMMENT;
3370     SKIP(4);
3371     buf = (xmlChar *) xmlMallocAtomic(size);
3372     if (buf == NULL) {
3373         htmlErrMemory(ctxt);
3374 	return;
3375     }
3376     len = 0;
3377     buf[len] = 0;
3378     q = CUR_CHAR(ql);
3379     if (q == 0)
3380         goto unfinished;
3381     if (q == '>') {
3382         htmlParseErr(ctxt, XML_ERR_COMMENT_ABRUPTLY_ENDED, "Comment abruptly ended", NULL, NULL);
3383         cur = '>';
3384         goto finished;
3385     }
3386     NEXTL(ql);
3387     r = CUR_CHAR(rl);
3388     if (r == 0)
3389         goto unfinished;
3390     if (q == '-' && r == '>') {
3391         htmlParseErr(ctxt, XML_ERR_COMMENT_ABRUPTLY_ENDED, "Comment abruptly ended", NULL, NULL);
3392         cur = '>';
3393         goto finished;
3394     }
3395     NEXTL(rl);
3396     cur = CUR_CHAR(l);
3397     while ((cur != 0) &&
3398            ((cur != '>') ||
3399 	    (r != '-') || (q != '-'))) {
3400 	NEXTL(l);
3401 	next = CUR_CHAR(nl);
3402 
3403 	if ((q == '-') && (r == '-') && (cur == '!') && (next == '>')) {
3404 	  htmlParseErr(ctxt, XML_ERR_COMMENT_NOT_FINISHED,
3405 		       "Comment incorrectly closed by '--!>'", NULL, NULL);
3406 	  cur = '>';
3407 	  break;
3408 	}
3409 
3410 	if (len + 5 >= size) {
3411 	    xmlChar *tmp;
3412 
3413 	    size *= 2;
3414 	    tmp = (xmlChar *) xmlRealloc(buf, size);
3415 	    if (tmp == NULL) {
3416 	        xmlFree(buf);
3417 	        htmlErrMemory(ctxt);
3418 		return;
3419 	    }
3420 	    buf = tmp;
3421 	}
3422         if (IS_CHAR(q)) {
3423 	    COPY_BUF(ql,buf,len,q);
3424         } else {
3425             htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
3426                             "Invalid char in comment 0x%X\n", q);
3427         }
3428         if (len > maxLength) {
3429             htmlParseErr(ctxt, XML_ERR_COMMENT_NOT_FINISHED,
3430                          "comment too long", NULL, NULL);
3431             xmlFree(buf);
3432             ctxt->instate = state;
3433             return;
3434         }
3435 
3436 	q = r;
3437 	ql = rl;
3438 	r = cur;
3439 	rl = l;
3440 	cur = next;
3441 	l = nl;
3442     }
3443 finished:
3444     buf[len] = 0;
3445     if (cur == '>') {
3446         NEXT;
3447 	if ((ctxt->sax != NULL) && (ctxt->sax->comment != NULL) &&
3448 	    (!ctxt->disableSAX))
3449 	    ctxt->sax->comment(ctxt->userData, buf);
3450 	xmlFree(buf);
3451 	ctxt->instate = state;
3452 	return;
3453     }
3454 
3455 unfinished:
3456     htmlParseErr(ctxt, XML_ERR_COMMENT_NOT_FINISHED,
3457 		 "Comment not terminated \n<!--%.50s\n", buf, NULL);
3458     xmlFree(buf);
3459 }
3460 
3461 /**
3462  * htmlParseCharRef:
3463  * @ctxt:  an HTML parser context
3464  *
3465  * DEPRECATED: Internal function, don't use.
3466  *
3467  * parse Reference declarations
3468  *
3469  * [66] CharRef ::= '&#' [0-9]+ ';' |
3470  *                  '&#x' [0-9a-fA-F]+ ';'
3471  *
3472  * Returns the value parsed (as an int)
3473  */
3474 int
htmlParseCharRef(htmlParserCtxtPtr ctxt)3475 htmlParseCharRef(htmlParserCtxtPtr ctxt) {
3476     int val = 0;
3477 
3478     if ((ctxt == NULL) || (ctxt->input == NULL))
3479         return(0);
3480     if ((CUR == '&') && (NXT(1) == '#') &&
3481         ((NXT(2) == 'x') || NXT(2) == 'X')) {
3482 	SKIP(3);
3483 	while (CUR != ';') {
3484 	    if ((CUR >= '0') && (CUR <= '9')) {
3485                 if (val < 0x110000)
3486 	            val = val * 16 + (CUR - '0');
3487             } else if ((CUR >= 'a') && (CUR <= 'f')) {
3488                 if (val < 0x110000)
3489 	            val = val * 16 + (CUR - 'a') + 10;
3490             } else if ((CUR >= 'A') && (CUR <= 'F')) {
3491                 if (val < 0x110000)
3492 	            val = val * 16 + (CUR - 'A') + 10;
3493             } else {
3494 	        htmlParseErr(ctxt, XML_ERR_INVALID_HEX_CHARREF,
3495 		             "htmlParseCharRef: missing semicolon\n",
3496 			     NULL, NULL);
3497 		break;
3498 	    }
3499 	    NEXT;
3500 	}
3501 	if (CUR == ';')
3502 	    NEXT;
3503     } else if  ((CUR == '&') && (NXT(1) == '#')) {
3504 	SKIP(2);
3505 	while (CUR != ';') {
3506 	    if ((CUR >= '0') && (CUR <= '9')) {
3507                 if (val < 0x110000)
3508 	            val = val * 10 + (CUR - '0');
3509             } else {
3510 	        htmlParseErr(ctxt, XML_ERR_INVALID_DEC_CHARREF,
3511 		             "htmlParseCharRef: missing semicolon\n",
3512 			     NULL, NULL);
3513 		break;
3514 	    }
3515 	    NEXT;
3516 	}
3517 	if (CUR == ';')
3518 	    NEXT;
3519     } else {
3520 	htmlParseErr(ctxt, XML_ERR_INVALID_CHARREF,
3521 	             "htmlParseCharRef: invalid value\n", NULL, NULL);
3522     }
3523     /*
3524      * Check the value IS_CHAR ...
3525      */
3526     if (IS_CHAR(val)) {
3527         return(val);
3528     } else if (val >= 0x110000) {
3529 	htmlParseErr(ctxt, XML_ERR_INVALID_CHAR,
3530 		     "htmlParseCharRef: value too large\n", NULL, NULL);
3531     } else {
3532 	htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
3533 			"htmlParseCharRef: invalid xmlChar value %d\n",
3534 			val);
3535     }
3536     return(0);
3537 }
3538 
3539 
3540 /**
3541  * htmlParseDocTypeDecl:
3542  * @ctxt:  an HTML parser context
3543  *
3544  * parse a DOCTYPE declaration
3545  *
3546  * [28] doctypedecl ::= '<!DOCTYPE' S Name (S ExternalID)? S?
3547  *                      ('[' (markupdecl | PEReference | S)* ']' S?)? '>'
3548  */
3549 
3550 static void
htmlParseDocTypeDecl(htmlParserCtxtPtr ctxt)3551 htmlParseDocTypeDecl(htmlParserCtxtPtr ctxt) {
3552     const xmlChar *name;
3553     xmlChar *ExternalID = NULL;
3554     xmlChar *URI = NULL;
3555 
3556     /*
3557      * We know that '<!DOCTYPE' has been detected.
3558      */
3559     SKIP(9);
3560 
3561     SKIP_BLANKS;
3562 
3563     /*
3564      * Parse the DOCTYPE name.
3565      */
3566     name = htmlParseName(ctxt);
3567     if (name == NULL) {
3568 	htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
3569 	             "htmlParseDocTypeDecl : no DOCTYPE name !\n",
3570 		     NULL, NULL);
3571     }
3572     /*
3573      * Check that upper(name) == "HTML" !!!!!!!!!!!!!
3574      */
3575 
3576     SKIP_BLANKS;
3577 
3578     /*
3579      * Check for SystemID and ExternalID
3580      */
3581     URI = htmlParseExternalID(ctxt, &ExternalID);
3582     SKIP_BLANKS;
3583 
3584     /*
3585      * We should be at the end of the DOCTYPE declaration.
3586      */
3587     if (CUR != '>') {
3588 	htmlParseErr(ctxt, XML_ERR_DOCTYPE_NOT_FINISHED,
3589 	             "DOCTYPE improperly terminated\n", NULL, NULL);
3590         /* Ignore bogus content */
3591         while ((CUR != 0) && (CUR != '>') &&
3592                (PARSER_STOPPED(ctxt) == 0))
3593             NEXT;
3594     }
3595     if (CUR == '>')
3596         NEXT;
3597 
3598     /*
3599      * Create or update the document accordingly to the DOCTYPE
3600      */
3601     if ((ctxt->sax != NULL) && (ctxt->sax->internalSubset != NULL) &&
3602 	(!ctxt->disableSAX))
3603 	ctxt->sax->internalSubset(ctxt->userData, name, ExternalID, URI);
3604 
3605     /*
3606      * Cleanup, since we don't use all those identifiers
3607      */
3608     if (URI != NULL) xmlFree(URI);
3609     if (ExternalID != NULL) xmlFree(ExternalID);
3610 }
3611 
3612 /**
3613  * htmlParseAttribute:
3614  * @ctxt:  an HTML parser context
3615  * @value:  a xmlChar ** used to store the value of the attribute
3616  *
3617  * parse an attribute
3618  *
3619  * [41] Attribute ::= Name Eq AttValue
3620  *
3621  * [25] Eq ::= S? '=' S?
3622  *
3623  * With namespace:
3624  *
3625  * [NS 11] Attribute ::= QName Eq AttValue
3626  *
3627  * Also the case QName == xmlns:??? is handled independently as a namespace
3628  * definition.
3629  *
3630  * Returns the attribute name, and the value in *value.
3631  */
3632 
3633 static const xmlChar *
htmlParseAttribute(htmlParserCtxtPtr ctxt,xmlChar ** value)3634 htmlParseAttribute(htmlParserCtxtPtr ctxt, xmlChar **value) {
3635     const xmlChar *name;
3636     xmlChar *val = NULL;
3637 
3638     *value = NULL;
3639     name = htmlParseHTMLName(ctxt);
3640     if (name == NULL) {
3641 	htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
3642 	             "error parsing attribute name\n", NULL, NULL);
3643         return(NULL);
3644     }
3645 
3646     /*
3647      * read the value
3648      */
3649     SKIP_BLANKS;
3650     if (CUR == '=') {
3651         NEXT;
3652 	SKIP_BLANKS;
3653 	val = htmlParseAttValue(ctxt);
3654     }
3655 
3656     *value = val;
3657     return(name);
3658 }
3659 
3660 /**
3661  * htmlCheckEncoding:
3662  * @ctxt:  an HTML parser context
3663  * @attvalue: the attribute value
3664  *
3665  * Checks an http-equiv attribute from a Meta tag to detect
3666  * the encoding
3667  * If a new encoding is detected the parser is switched to decode
3668  * it and pass UTF8
3669  */
3670 static void
htmlCheckEncoding(htmlParserCtxtPtr ctxt,const xmlChar * attvalue)3671 htmlCheckEncoding(htmlParserCtxtPtr ctxt, const xmlChar *attvalue) {
3672     const xmlChar *encoding;
3673     xmlChar *copy;
3674 
3675     if (!attvalue)
3676 	return;
3677 
3678     encoding = xmlStrcasestr(attvalue, BAD_CAST"charset");
3679     if (encoding != NULL) {
3680 	encoding += 7;
3681     }
3682     /*
3683      * skip blank
3684      */
3685     if (encoding && IS_BLANK_CH(*encoding))
3686 	encoding = xmlStrcasestr(attvalue, BAD_CAST"=");
3687     if (encoding && *encoding == '=') {
3688 	encoding ++;
3689         copy = xmlStrdup(encoding);
3690         if (copy == NULL)
3691             htmlErrMemory(ctxt);
3692 	xmlSetDeclaredEncoding(ctxt, copy);
3693     }
3694 }
3695 
3696 /**
3697  * htmlCheckMeta:
3698  * @ctxt:  an HTML parser context
3699  * @atts:  the attributes values
3700  *
3701  * Checks an attributes from a Meta tag
3702  */
3703 static void
htmlCheckMeta(htmlParserCtxtPtr ctxt,const xmlChar ** atts)3704 htmlCheckMeta(htmlParserCtxtPtr ctxt, const xmlChar **atts) {
3705     int i;
3706     const xmlChar *att, *value;
3707     int http = 0;
3708     const xmlChar *content = NULL;
3709 
3710     if ((ctxt == NULL) || (atts == NULL))
3711 	return;
3712 
3713     i = 0;
3714     att = atts[i++];
3715     while (att != NULL) {
3716 	value = atts[i++];
3717         if (value != NULL) {
3718             if ((!xmlStrcasecmp(att, BAD_CAST "http-equiv")) &&
3719                 (!xmlStrcasecmp(value, BAD_CAST "Content-Type"))) {
3720                 http = 1;
3721             } else if (!xmlStrcasecmp(att, BAD_CAST "charset")) {
3722                 xmlChar *copy;
3723 
3724                 copy = xmlStrdup(value);
3725                 if (copy == NULL)
3726                     htmlErrMemory(ctxt);
3727                 xmlSetDeclaredEncoding(ctxt, copy);
3728             } else if (!xmlStrcasecmp(att, BAD_CAST "content")) {
3729                 content = value;
3730             }
3731         }
3732 	att = atts[i++];
3733     }
3734     if ((http) && (content != NULL))
3735 	htmlCheckEncoding(ctxt, content);
3736 
3737 }
3738 
3739 /**
3740  * htmlParseStartTag:
3741  * @ctxt:  an HTML parser context
3742  *
3743  * parse a start of tag either for rule element or
3744  * EmptyElement. In both case we don't parse the tag closing chars.
3745  *
3746  * [40] STag ::= '<' Name (S Attribute)* S? '>'
3747  *
3748  * [44] EmptyElemTag ::= '<' Name (S Attribute)* S? '/>'
3749  *
3750  * With namespace:
3751  *
3752  * [NS 8] STag ::= '<' QName (S Attribute)* S? '>'
3753  *
3754  * [NS 10] EmptyElement ::= '<' QName (S Attribute)* S? '/>'
3755  *
3756  * Returns 0 in case of success, -1 in case of error and 1 if discarded
3757  */
3758 
3759 static int
htmlParseStartTag(htmlParserCtxtPtr ctxt)3760 htmlParseStartTag(htmlParserCtxtPtr ctxt) {
3761     const xmlChar *name;
3762     const xmlChar *attname;
3763     xmlChar *attvalue;
3764     const xmlChar **atts;
3765     int nbatts = 0;
3766     int maxatts;
3767     int meta = 0;
3768     int i;
3769     int discardtag = 0;
3770 
3771     if ((ctxt == NULL) || (ctxt->input == NULL))
3772 	return -1;
3773     if (CUR != '<') return -1;
3774     NEXT;
3775 
3776     atts = ctxt->atts;
3777     maxatts = ctxt->maxatts;
3778 
3779     GROW;
3780     name = htmlParseHTMLName(ctxt);
3781     if (name == NULL) {
3782 	htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
3783 	             "htmlParseStartTag: invalid element name\n",
3784 		     NULL, NULL);
3785 	/* Dump the bogus tag like browsers do */
3786 	while ((CUR != 0) && (CUR != '>') &&
3787                (PARSER_STOPPED(ctxt) == 0))
3788 	    NEXT;
3789         return -1;
3790     }
3791     if (xmlStrEqual(name, BAD_CAST"meta"))
3792 	meta = 1;
3793 
3794     /*
3795      * Check for auto-closure of HTML elements.
3796      */
3797     htmlAutoClose(ctxt, name);
3798 
3799     /*
3800      * Check for implied HTML elements.
3801      */
3802     htmlCheckImplied(ctxt, name);
3803 
3804     /*
3805      * Avoid html at any level > 0, head at any level != 1
3806      * or any attempt to recurse body
3807      */
3808     if ((ctxt->nameNr > 0) && (xmlStrEqual(name, BAD_CAST"html"))) {
3809 	htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
3810 	             "htmlParseStartTag: misplaced <html> tag\n",
3811 		     name, NULL);
3812 	discardtag = 1;
3813 	ctxt->depth++;
3814     }
3815     if ((ctxt->nameNr != 1) &&
3816 	(xmlStrEqual(name, BAD_CAST"head"))) {
3817 	htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
3818 	             "htmlParseStartTag: misplaced <head> tag\n",
3819 		     name, NULL);
3820 	discardtag = 1;
3821 	ctxt->depth++;
3822     }
3823     if (xmlStrEqual(name, BAD_CAST"body")) {
3824 	int indx;
3825 	for (indx = 0;indx < ctxt->nameNr;indx++) {
3826 	    if (xmlStrEqual(ctxt->nameTab[indx], BAD_CAST"body")) {
3827 		htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
3828 		             "htmlParseStartTag: misplaced <body> tag\n",
3829 			     name, NULL);
3830 		discardtag = 1;
3831 		ctxt->depth++;
3832 	    }
3833 	}
3834     }
3835 
3836     /*
3837      * Now parse the attributes, it ends up with the ending
3838      *
3839      * (S Attribute)* S?
3840      */
3841     SKIP_BLANKS;
3842     while ((CUR != 0) &&
3843            (CUR != '>') &&
3844 	   ((CUR != '/') || (NXT(1) != '>')) &&
3845            (PARSER_STOPPED(ctxt) == 0)) {
3846 	GROW;
3847 	attname = htmlParseAttribute(ctxt, &attvalue);
3848         if (attname != NULL) {
3849 
3850 	    /*
3851 	     * Well formedness requires at most one declaration of an attribute
3852 	     */
3853 	    for (i = 0; i < nbatts;i += 2) {
3854 	        if (xmlStrEqual(atts[i], attname)) {
3855 		    htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_REDEFINED,
3856 		                 "Attribute %s redefined\n", attname, NULL);
3857 		    if (attvalue != NULL)
3858 			xmlFree(attvalue);
3859 		    goto failed;
3860 		}
3861 	    }
3862 
3863 	    /*
3864 	     * Add the pair to atts
3865 	     */
3866 	    if (atts == NULL) {
3867 	        maxatts = 22; /* allow for 10 attrs by default */
3868 	        atts = (const xmlChar **)
3869 		       xmlMalloc(maxatts * sizeof(xmlChar *));
3870 		if (atts == NULL) {
3871 		    htmlErrMemory(ctxt);
3872 		    if (attvalue != NULL)
3873 			xmlFree(attvalue);
3874 		    goto failed;
3875 		}
3876 		ctxt->atts = atts;
3877 		ctxt->maxatts = maxatts;
3878 	    } else if (nbatts + 4 > maxatts) {
3879 	        const xmlChar **n;
3880 
3881 	        maxatts *= 2;
3882 	        n = (const xmlChar **) xmlRealloc((void *) atts,
3883 					     maxatts * sizeof(const xmlChar *));
3884 		if (n == NULL) {
3885 		    htmlErrMemory(ctxt);
3886 		    if (attvalue != NULL)
3887 			xmlFree(attvalue);
3888 		    goto failed;
3889 		}
3890 		atts = n;
3891 		ctxt->atts = atts;
3892 		ctxt->maxatts = maxatts;
3893 	    }
3894 	    atts[nbatts++] = attname;
3895 	    atts[nbatts++] = attvalue;
3896 	    atts[nbatts] = NULL;
3897 	    atts[nbatts + 1] = NULL;
3898 	}
3899 	else {
3900 	    if (attvalue != NULL)
3901 	        xmlFree(attvalue);
3902 	    /* Dump the bogus attribute string up to the next blank or
3903 	     * the end of the tag. */
3904 	    while ((CUR != 0) &&
3905 	           !(IS_BLANK_CH(CUR)) && (CUR != '>') &&
3906 		   ((CUR != '/') || (NXT(1) != '>')) &&
3907                    (PARSER_STOPPED(ctxt) == 0))
3908 		NEXT;
3909 	}
3910 
3911 failed:
3912 	SKIP_BLANKS;
3913     }
3914 
3915     /*
3916      * Handle specific association to the META tag
3917      */
3918     if (meta && (nbatts != 0))
3919 	htmlCheckMeta(ctxt, atts);
3920 
3921     /*
3922      * SAX: Start of Element !
3923      */
3924     if (!discardtag) {
3925 	htmlnamePush(ctxt, name);
3926 	if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL)) {
3927 	    if (nbatts != 0)
3928 		ctxt->sax->startElement(ctxt->userData, name, atts);
3929 	    else
3930 		ctxt->sax->startElement(ctxt->userData, name, NULL);
3931 	}
3932     }
3933 
3934     if (atts != NULL) {
3935         for (i = 1;i < nbatts;i += 2) {
3936 	    if (atts[i] != NULL)
3937 		xmlFree((xmlChar *) atts[i]);
3938 	}
3939     }
3940 
3941     return(discardtag);
3942 }
3943 
3944 /**
3945  * htmlParseEndTag:
3946  * @ctxt:  an HTML parser context
3947  *
3948  * parse an end of tag
3949  *
3950  * [42] ETag ::= '</' Name S? '>'
3951  *
3952  * With namespace
3953  *
3954  * [NS 9] ETag ::= '</' QName S? '>'
3955  *
3956  * Returns 1 if the current level should be closed.
3957  */
3958 
3959 static int
htmlParseEndTag(htmlParserCtxtPtr ctxt)3960 htmlParseEndTag(htmlParserCtxtPtr ctxt)
3961 {
3962     const xmlChar *name;
3963     const xmlChar *oldname;
3964     int i, ret;
3965 
3966     if ((CUR != '<') || (NXT(1) != '/')) {
3967         htmlParseErr(ctxt, XML_ERR_LTSLASH_REQUIRED,
3968 	             "htmlParseEndTag: '</' not found\n", NULL, NULL);
3969         return (0);
3970     }
3971     SKIP(2);
3972 
3973     name = htmlParseHTMLName(ctxt);
3974     if (name == NULL)
3975         return (0);
3976     /*
3977      * We should definitely be at the ending "S? '>'" part
3978      */
3979     SKIP_BLANKS;
3980     if (CUR != '>') {
3981         htmlParseErr(ctxt, XML_ERR_GT_REQUIRED,
3982 	             "End tag : expected '>'\n", NULL, NULL);
3983         /* Skip to next '>' */
3984         while ((PARSER_STOPPED(ctxt) == 0) &&
3985                (CUR != 0) && (CUR != '>'))
3986             NEXT;
3987     }
3988     if (CUR == '>')
3989         NEXT;
3990 
3991     /*
3992      * if we ignored misplaced tags in htmlParseStartTag don't pop them
3993      * out now.
3994      */
3995     if ((ctxt->depth > 0) &&
3996         (xmlStrEqual(name, BAD_CAST "html") ||
3997          xmlStrEqual(name, BAD_CAST "body") ||
3998 	 xmlStrEqual(name, BAD_CAST "head"))) {
3999 	ctxt->depth--;
4000 	return (0);
4001     }
4002 
4003     /*
4004      * If the name read is not one of the element in the parsing stack
4005      * then return, it's just an error.
4006      */
4007     for (i = (ctxt->nameNr - 1); i >= 0; i--) {
4008         if (xmlStrEqual(name, ctxt->nameTab[i]))
4009             break;
4010     }
4011     if (i < 0) {
4012         htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,
4013 	             "Unexpected end tag : %s\n", name, NULL);
4014         return (0);
4015     }
4016 
4017 
4018     /*
4019      * Check for auto-closure of HTML elements.
4020      */
4021 
4022     htmlAutoCloseOnClose(ctxt, name);
4023 
4024     /*
4025      * Well formedness constraints, opening and closing must match.
4026      * With the exception that the autoclose may have popped stuff out
4027      * of the stack.
4028      */
4029     if ((ctxt->name != NULL) && (!xmlStrEqual(ctxt->name, name))) {
4030         htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,
4031                      "Opening and ending tag mismatch: %s and %s\n",
4032                      name, ctxt->name);
4033     }
4034 
4035     /*
4036      * SAX: End of Tag
4037      */
4038     oldname = ctxt->name;
4039     if ((oldname != NULL) && (xmlStrEqual(oldname, name))) {
4040         if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4041             ctxt->sax->endElement(ctxt->userData, name);
4042 	htmlNodeInfoPop(ctxt);
4043         htmlnamePop(ctxt);
4044         ret = 1;
4045     } else {
4046         ret = 0;
4047     }
4048 
4049     return (ret);
4050 }
4051 
4052 
4053 /**
4054  * htmlParseReference:
4055  * @ctxt:  an HTML parser context
4056  *
4057  * parse and handle entity references in content,
4058  * this will end-up in a call to character() since this is either a
4059  * CharRef, or a predefined entity.
4060  */
4061 static void
htmlParseReference(htmlParserCtxtPtr ctxt)4062 htmlParseReference(htmlParserCtxtPtr ctxt) {
4063     const htmlEntityDesc * ent;
4064     xmlChar out[6];
4065     const xmlChar *name;
4066     if (CUR != '&') return;
4067 
4068     if (NXT(1) == '#') {
4069 	unsigned int c;
4070 	int bits, i = 0;
4071 
4072 	c = htmlParseCharRef(ctxt);
4073 	if (c == 0)
4074 	    return;
4075 
4076         if      (c <    0x80) { out[i++]= c;                bits= -6; }
4077         else if (c <   0x800) { out[i++]=((c >>  6) & 0x1F) | 0xC0;  bits=  0; }
4078         else if (c < 0x10000) { out[i++]=((c >> 12) & 0x0F) | 0xE0;  bits=  6; }
4079         else                  { out[i++]=((c >> 18) & 0x07) | 0xF0;  bits= 12; }
4080 
4081         for ( ; bits >= 0; bits-= 6) {
4082             out[i++]= ((c >> bits) & 0x3F) | 0x80;
4083         }
4084 	out[i] = 0;
4085 
4086 	htmlCheckParagraph(ctxt);
4087 	if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
4088 	    ctxt->sax->characters(ctxt->userData, out, i);
4089     } else {
4090 	ent = htmlParseEntityRef(ctxt, &name);
4091 	if (name == NULL) {
4092 	    htmlCheckParagraph(ctxt);
4093 	    if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
4094 	        ctxt->sax->characters(ctxt->userData, BAD_CAST "&", 1);
4095 	    return;
4096 	}
4097 	if ((ent == NULL) || !(ent->value > 0)) {
4098 	    htmlCheckParagraph(ctxt);
4099 	    if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL)) {
4100 		ctxt->sax->characters(ctxt->userData, BAD_CAST "&", 1);
4101 		ctxt->sax->characters(ctxt->userData, name, xmlStrlen(name));
4102 		/* ctxt->sax->characters(ctxt->userData, BAD_CAST ";", 1); */
4103 	    }
4104 	} else {
4105 	    unsigned int c;
4106 	    int bits, i = 0;
4107 
4108 	    c = ent->value;
4109 	    if      (c <    0x80)
4110 	            { out[i++]= c;                bits= -6; }
4111 	    else if (c <   0x800)
4112 	            { out[i++]=((c >>  6) & 0x1F) | 0xC0;  bits=  0; }
4113 	    else if (c < 0x10000)
4114 	            { out[i++]=((c >> 12) & 0x0F) | 0xE0;  bits=  6; }
4115 	    else
4116 	            { out[i++]=((c >> 18) & 0x07) | 0xF0;  bits= 12; }
4117 
4118 	    for ( ; bits >= 0; bits-= 6) {
4119 		out[i++]= ((c >> bits) & 0x3F) | 0x80;
4120 	    }
4121 	    out[i] = 0;
4122 
4123 	    htmlCheckParagraph(ctxt);
4124 	    if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
4125 		ctxt->sax->characters(ctxt->userData, out, i);
4126 	}
4127     }
4128 }
4129 
4130 /**
4131  * htmlParseContent:
4132  * @ctxt:  an HTML parser context
4133  *
4134  * Parse a content: comment, sub-element, reference or text.
4135  * Kept for compatibility with old code
4136  */
4137 
4138 static void
htmlParseContent(htmlParserCtxtPtr ctxt)4139 htmlParseContent(htmlParserCtxtPtr ctxt) {
4140     xmlChar *currentNode;
4141     int depth;
4142     const xmlChar *name;
4143 
4144     currentNode = xmlStrdup(ctxt->name);
4145     depth = ctxt->nameNr;
4146     while (!PARSER_STOPPED(ctxt)) {
4147         GROW;
4148 
4149 	/*
4150 	 * Our tag or one of it's parent or children is ending.
4151 	 */
4152         if ((CUR == '<') && (NXT(1) == '/')) {
4153 	    if (htmlParseEndTag(ctxt) &&
4154 		((currentNode != NULL) || (ctxt->nameNr == 0))) {
4155 		if (currentNode != NULL)
4156 		    xmlFree(currentNode);
4157 		return;
4158 	    }
4159 	    continue; /* while */
4160         }
4161 
4162 	else if ((CUR == '<') &&
4163 	         ((IS_ASCII_LETTER(NXT(1))) ||
4164 		  (NXT(1) == '_') || (NXT(1) == ':'))) {
4165 	    name = htmlParseHTMLName_nonInvasive(ctxt);
4166 	    if (name == NULL) {
4167 	        htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
4168 			 "htmlParseStartTag: invalid element name\n",
4169 			 NULL, NULL);
4170 	        /* Dump the bogus tag like browsers do */
4171                 while ((CUR != 0) && (CUR != '>'))
4172 	            NEXT;
4173 
4174 	        if (currentNode != NULL)
4175 	            xmlFree(currentNode);
4176 	        return;
4177 	    }
4178 
4179 	    if (ctxt->name != NULL) {
4180 	        if (htmlCheckAutoClose(name, ctxt->name) == 1) {
4181 	            htmlAutoClose(ctxt, name);
4182 	            continue;
4183 	        }
4184 	    }
4185 	}
4186 
4187 	/*
4188 	 * Has this node been popped out during parsing of
4189 	 * the next element
4190 	 */
4191         if ((ctxt->nameNr > 0) && (depth >= ctxt->nameNr) &&
4192 	    (!xmlStrEqual(currentNode, ctxt->name)))
4193 	     {
4194 	    if (currentNode != NULL) xmlFree(currentNode);
4195 	    return;
4196 	}
4197 
4198 	if ((CUR != 0) && ((xmlStrEqual(currentNode, BAD_CAST"script")) ||
4199 	    (xmlStrEqual(currentNode, BAD_CAST"style")))) {
4200 	    /*
4201 	     * Handle SCRIPT/STYLE separately
4202 	     */
4203 	    htmlParseScript(ctxt);
4204 	}
4205 
4206         else if ((CUR == '<') && (NXT(1) == '!')) {
4207             /*
4208              * Sometimes DOCTYPE arrives in the middle of the document
4209              */
4210             if ((UPP(2) == 'D') && (UPP(3) == 'O') &&
4211                 (UPP(4) == 'C') && (UPP(5) == 'T') &&
4212                 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
4213                 (UPP(8) == 'E')) {
4214                 htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
4215                              "Misplaced DOCTYPE declaration\n",
4216                              BAD_CAST "DOCTYPE" , NULL);
4217                 htmlParseDocTypeDecl(ctxt);
4218             }
4219             /*
4220              * First case :  a comment
4221              */
4222             else if ((NXT(2) == '-') && (NXT(3) == '-')) {
4223                 htmlParseComment(ctxt);
4224             }
4225             else {
4226                 htmlSkipBogusComment(ctxt);
4227             }
4228         }
4229 
4230         /*
4231          * Second case : a Processing Instruction.
4232          */
4233         else if ((CUR == '<') && (NXT(1) == '?')) {
4234             htmlParsePI(ctxt);
4235         }
4236 
4237         /*
4238          * Third case :  a sub-element.
4239          */
4240         else if ((CUR == '<') && IS_ASCII_LETTER(NXT(1))) {
4241             htmlParseElement(ctxt);
4242         }
4243         else if (CUR == '<') {
4244             if ((ctxt->sax != NULL) && (!ctxt->disableSAX) &&
4245                 (ctxt->sax->characters != NULL))
4246                 ctxt->sax->characters(ctxt->userData, BAD_CAST "<", 1);
4247             NEXT;
4248         }
4249 
4250         /*
4251          * Fourth case : a reference. If if has not been resolved,
4252          *    parsing returns it's Name, create the node
4253          */
4254         else if (CUR == '&') {
4255             htmlParseReference(ctxt);
4256         }
4257 
4258         /*
4259          * Fifth case : end of the resource
4260          */
4261         else if (CUR == 0) {
4262             htmlAutoCloseOnEnd(ctxt);
4263             break;
4264         }
4265 
4266         /*
4267          * Last case, text. Note that References are handled directly.
4268          */
4269         else {
4270             htmlParseCharData(ctxt);
4271         }
4272 
4273         SHRINK;
4274         GROW;
4275     }
4276     if (currentNode != NULL) xmlFree(currentNode);
4277 }
4278 
4279 /**
4280  * htmlParseElement:
4281  * @ctxt:  an HTML parser context
4282  *
4283  * DEPRECATED: Internal function, don't use.
4284  *
4285  * parse an HTML element, this is highly recursive
4286  * this is kept for compatibility with previous code versions
4287  *
4288  * [39] element ::= EmptyElemTag | STag content ETag
4289  *
4290  * [41] Attribute ::= Name Eq AttValue
4291  */
4292 
4293 void
htmlParseElement(htmlParserCtxtPtr ctxt)4294 htmlParseElement(htmlParserCtxtPtr ctxt) {
4295     const xmlChar *name;
4296     xmlChar *currentNode = NULL;
4297     const htmlElemDesc * info;
4298     htmlParserNodeInfo node_info;
4299     int failed;
4300     int depth;
4301     const xmlChar *oldptr;
4302 
4303     if ((ctxt == NULL) || (ctxt->input == NULL))
4304 	return;
4305 
4306     /* Capture start position */
4307     if (ctxt->record_info) {
4308         node_info.begin_pos = ctxt->input->consumed +
4309                           (CUR_PTR - ctxt->input->base);
4310 	node_info.begin_line = ctxt->input->line;
4311     }
4312 
4313     failed = htmlParseStartTag(ctxt);
4314     name = ctxt->name;
4315     if ((failed == -1) || (name == NULL)) {
4316 	if (CUR == '>')
4317 	    NEXT;
4318         return;
4319     }
4320 
4321     /*
4322      * Lookup the info for that element.
4323      */
4324     info = htmlTagLookup(name);
4325     if (info == NULL) {
4326 	htmlParseErr(ctxt, XML_HTML_UNKNOWN_TAG,
4327 	             "Tag %s invalid\n", name, NULL);
4328     }
4329 
4330     /*
4331      * Check for an Empty Element labeled the XML/SGML way
4332      */
4333     if ((CUR == '/') && (NXT(1) == '>')) {
4334         SKIP(2);
4335 	if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4336 	    ctxt->sax->endElement(ctxt->userData, name);
4337 	htmlnamePop(ctxt);
4338 	return;
4339     }
4340 
4341     if (CUR == '>') {
4342         NEXT;
4343     } else {
4344 	htmlParseErr(ctxt, XML_ERR_GT_REQUIRED,
4345 	             "Couldn't find end of Start Tag %s\n", name, NULL);
4346 
4347 	/*
4348 	 * end of parsing of this node.
4349 	 */
4350 	if (xmlStrEqual(name, ctxt->name)) {
4351 	    nodePop(ctxt);
4352 	    htmlnamePop(ctxt);
4353 	}
4354 
4355 	/*
4356 	 * Capture end position and add node
4357 	 */
4358 	if (ctxt->record_info) {
4359 	   node_info.end_pos = ctxt->input->consumed +
4360 			      (CUR_PTR - ctxt->input->base);
4361 	   node_info.end_line = ctxt->input->line;
4362 	   node_info.node = ctxt->node;
4363 	   xmlParserAddNodeInfo(ctxt, &node_info);
4364 	}
4365 	return;
4366     }
4367 
4368     /*
4369      * Check for an Empty Element from DTD definition
4370      */
4371     if ((info != NULL) && (info->empty)) {
4372 	if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4373 	    ctxt->sax->endElement(ctxt->userData, name);
4374 	htmlnamePop(ctxt);
4375 	return;
4376     }
4377 
4378     /*
4379      * Parse the content of the element:
4380      */
4381     currentNode = xmlStrdup(ctxt->name);
4382     depth = ctxt->nameNr;
4383     while (CUR != 0) {
4384 	oldptr = ctxt->input->cur;
4385 	htmlParseContent(ctxt);
4386 	if (oldptr==ctxt->input->cur) break;
4387 	if (ctxt->nameNr < depth) break;
4388     }
4389 
4390     /*
4391      * Capture end position and add node
4392      */
4393     if ( currentNode != NULL && ctxt->record_info ) {
4394        node_info.end_pos = ctxt->input->consumed +
4395                           (CUR_PTR - ctxt->input->base);
4396        node_info.end_line = ctxt->input->line;
4397        node_info.node = ctxt->node;
4398        xmlParserAddNodeInfo(ctxt, &node_info);
4399     }
4400     if (CUR == 0) {
4401 	htmlAutoCloseOnEnd(ctxt);
4402     }
4403 
4404     if (currentNode != NULL)
4405 	xmlFree(currentNode);
4406 }
4407 
4408 static void
htmlParserFinishElementParsing(htmlParserCtxtPtr ctxt)4409 htmlParserFinishElementParsing(htmlParserCtxtPtr ctxt) {
4410     /*
4411      * Capture end position and add node
4412      */
4413     if ( ctxt->node != NULL && ctxt->record_info ) {
4414        ctxt->nodeInfo->end_pos = ctxt->input->consumed +
4415                                 (CUR_PTR - ctxt->input->base);
4416        ctxt->nodeInfo->end_line = ctxt->input->line;
4417        ctxt->nodeInfo->node = ctxt->node;
4418        xmlParserAddNodeInfo(ctxt, ctxt->nodeInfo);
4419        htmlNodeInfoPop(ctxt);
4420     }
4421     if (CUR == 0) {
4422        htmlAutoCloseOnEnd(ctxt);
4423     }
4424 }
4425 
4426 /**
4427  * htmlParseElementInternal:
4428  * @ctxt:  an HTML parser context
4429  *
4430  * parse an HTML element, new version, non recursive
4431  *
4432  * [39] element ::= EmptyElemTag | STag content ETag
4433  *
4434  * [41] Attribute ::= Name Eq AttValue
4435  */
4436 
4437 static void
htmlParseElementInternal(htmlParserCtxtPtr ctxt)4438 htmlParseElementInternal(htmlParserCtxtPtr ctxt) {
4439     const xmlChar *name;
4440     const htmlElemDesc * info;
4441     htmlParserNodeInfo node_info = { NULL, 0, 0, 0, 0 };
4442     int failed;
4443 
4444     if ((ctxt == NULL) || (ctxt->input == NULL))
4445 	return;
4446 
4447     /* Capture start position */
4448     if (ctxt->record_info) {
4449         node_info.begin_pos = ctxt->input->consumed +
4450                           (CUR_PTR - ctxt->input->base);
4451 	node_info.begin_line = ctxt->input->line;
4452     }
4453 
4454     failed = htmlParseStartTag(ctxt);
4455     name = ctxt->name;
4456     if ((failed == -1) || (name == NULL)) {
4457 	if (CUR == '>')
4458 	    NEXT;
4459         return;
4460     }
4461 
4462     /*
4463      * Lookup the info for that element.
4464      */
4465     info = htmlTagLookup(name);
4466     if (info == NULL) {
4467 	htmlParseErr(ctxt, XML_HTML_UNKNOWN_TAG,
4468 	             "Tag %s invalid\n", name, NULL);
4469     }
4470 
4471     /*
4472      * Check for an Empty Element labeled the XML/SGML way
4473      */
4474     if ((CUR == '/') && (NXT(1) == '>')) {
4475         SKIP(2);
4476 	if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4477 	    ctxt->sax->endElement(ctxt->userData, name);
4478 	htmlnamePop(ctxt);
4479 	return;
4480     }
4481 
4482     if (CUR == '>') {
4483         NEXT;
4484     } else {
4485 	htmlParseErr(ctxt, XML_ERR_GT_REQUIRED,
4486 	             "Couldn't find end of Start Tag %s\n", name, NULL);
4487 
4488 	/*
4489 	 * end of parsing of this node.
4490 	 */
4491 	if (xmlStrEqual(name, ctxt->name)) {
4492 	    nodePop(ctxt);
4493 	    htmlnamePop(ctxt);
4494 	}
4495 
4496         if (ctxt->record_info)
4497             htmlNodeInfoPush(ctxt, &node_info);
4498         htmlParserFinishElementParsing(ctxt);
4499 	return;
4500     }
4501 
4502     /*
4503      * Check for an Empty Element from DTD definition
4504      */
4505     if ((info != NULL) && (info->empty)) {
4506 	if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4507 	    ctxt->sax->endElement(ctxt->userData, name);
4508 	htmlnamePop(ctxt);
4509 	return;
4510     }
4511 
4512     if (ctxt->record_info)
4513         htmlNodeInfoPush(ctxt, &node_info);
4514 }
4515 
4516 /**
4517  * htmlParseContentInternal:
4518  * @ctxt:  an HTML parser context
4519  *
4520  * Parse a content: comment, sub-element, reference or text.
4521  * New version for non recursive htmlParseElementInternal
4522  */
4523 
4524 static void
htmlParseContentInternal(htmlParserCtxtPtr ctxt)4525 htmlParseContentInternal(htmlParserCtxtPtr ctxt) {
4526     xmlChar *currentNode;
4527     int depth;
4528     const xmlChar *name;
4529 
4530     depth = ctxt->nameNr;
4531     if (depth <= 0) {
4532         currentNode = NULL;
4533     } else {
4534         currentNode = xmlStrdup(ctxt->name);
4535         if (currentNode == NULL) {
4536             htmlErrMemory(ctxt);
4537             return;
4538         }
4539     }
4540     while (PARSER_STOPPED(ctxt) == 0) {
4541         GROW;
4542 
4543 	/*
4544 	 * Our tag or one of it's parent or children is ending.
4545 	 */
4546         if ((CUR == '<') && (NXT(1) == '/')) {
4547 	    if (htmlParseEndTag(ctxt) &&
4548 		((currentNode != NULL) || (ctxt->nameNr == 0))) {
4549 		if (currentNode != NULL)
4550 		    xmlFree(currentNode);
4551 
4552 	        depth = ctxt->nameNr;
4553                 if (depth <= 0) {
4554                     currentNode = NULL;
4555                 } else {
4556                     currentNode = xmlStrdup(ctxt->name);
4557                     if (currentNode == NULL) {
4558                         htmlErrMemory(ctxt);
4559                         break;
4560                     }
4561                 }
4562 	    }
4563 	    continue; /* while */
4564         }
4565 
4566 	else if ((CUR == '<') &&
4567 	         ((IS_ASCII_LETTER(NXT(1))) ||
4568 		  (NXT(1) == '_') || (NXT(1) == ':'))) {
4569 	    name = htmlParseHTMLName_nonInvasive(ctxt);
4570 	    if (name == NULL) {
4571 	        htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
4572 			 "htmlParseStartTag: invalid element name\n",
4573 			 NULL, NULL);
4574 	        /* Dump the bogus tag like browsers do */
4575 	        while ((CUR == 0) && (CUR != '>'))
4576 	            NEXT;
4577 
4578 	        htmlParserFinishElementParsing(ctxt);
4579 	        if (currentNode != NULL)
4580 	            xmlFree(currentNode);
4581 
4582                 if (ctxt->name == NULL) {
4583                     currentNode = NULL;
4584                 } else {
4585                     currentNode = xmlStrdup(ctxt->name);
4586                     if (currentNode == NULL) {
4587                         htmlErrMemory(ctxt);
4588                         break;
4589                     }
4590                 }
4591 	        depth = ctxt->nameNr;
4592 	        continue;
4593 	    }
4594 
4595 	    if (ctxt->name != NULL) {
4596 	        if (htmlCheckAutoClose(name, ctxt->name) == 1) {
4597 	            htmlAutoClose(ctxt, name);
4598 	            continue;
4599 	        }
4600 	    }
4601 	}
4602 
4603 	/*
4604 	 * Has this node been popped out during parsing of
4605 	 * the next element
4606 	 */
4607         if ((ctxt->nameNr > 0) && (depth >= ctxt->nameNr) &&
4608 	    (!xmlStrEqual(currentNode, ctxt->name)))
4609 	     {
4610 	    htmlParserFinishElementParsing(ctxt);
4611 	    if (currentNode != NULL) xmlFree(currentNode);
4612 
4613             if (ctxt->name == NULL) {
4614                 currentNode = NULL;
4615             } else {
4616                 currentNode = xmlStrdup(ctxt->name);
4617                 if (currentNode == NULL) {
4618                     htmlErrMemory(ctxt);
4619                     break;
4620                 }
4621             }
4622 	    depth = ctxt->nameNr;
4623 	    continue;
4624 	}
4625 
4626 	if ((CUR != 0) && ((xmlStrEqual(currentNode, BAD_CAST"script")) ||
4627 	    (xmlStrEqual(currentNode, BAD_CAST"style")))) {
4628 	    /*
4629 	     * Handle SCRIPT/STYLE separately
4630 	     */
4631 	    htmlParseScript(ctxt);
4632 	}
4633 
4634         else if ((CUR == '<') && (NXT(1) == '!')) {
4635             /*
4636              * Sometimes DOCTYPE arrives in the middle of the document
4637              */
4638             if ((UPP(2) == 'D') && (UPP(3) == 'O') &&
4639                 (UPP(4) == 'C') && (UPP(5) == 'T') &&
4640                 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
4641                 (UPP(8) == 'E')) {
4642                 htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
4643                              "Misplaced DOCTYPE declaration\n",
4644                              BAD_CAST "DOCTYPE" , NULL);
4645                 htmlParseDocTypeDecl(ctxt);
4646             }
4647             /*
4648              * First case :  a comment
4649              */
4650             else if ((NXT(2) == '-') && (NXT(3) == '-')) {
4651                 htmlParseComment(ctxt);
4652             }
4653             else {
4654                 htmlSkipBogusComment(ctxt);
4655             }
4656         }
4657 
4658         /*
4659          * Second case : a Processing Instruction.
4660          */
4661         else if ((CUR == '<') && (NXT(1) == '?')) {
4662             htmlParsePI(ctxt);
4663         }
4664 
4665         /*
4666          * Third case :  a sub-element.
4667          */
4668         else if ((CUR == '<') && IS_ASCII_LETTER(NXT(1))) {
4669             htmlParseElementInternal(ctxt);
4670             if (currentNode != NULL) xmlFree(currentNode);
4671 
4672             if (ctxt->name == NULL) {
4673                 currentNode = NULL;
4674             } else {
4675                 currentNode = xmlStrdup(ctxt->name);
4676                 if (currentNode == NULL) {
4677                     htmlErrMemory(ctxt);
4678                     break;
4679                 }
4680             }
4681             depth = ctxt->nameNr;
4682         }
4683         else if (CUR == '<') {
4684             if ((ctxt->sax != NULL) && (!ctxt->disableSAX) &&
4685                 (ctxt->sax->characters != NULL))
4686                 ctxt->sax->characters(ctxt->userData, BAD_CAST "<", 1);
4687             NEXT;
4688         }
4689 
4690         /*
4691          * Fourth case : a reference. If if has not been resolved,
4692          *    parsing returns it's Name, create the node
4693          */
4694         else if (CUR == '&') {
4695             htmlParseReference(ctxt);
4696         }
4697 
4698         /*
4699          * Fifth case : end of the resource
4700          */
4701         else if (CUR == 0) {
4702             htmlAutoCloseOnEnd(ctxt);
4703             break;
4704         }
4705 
4706         /*
4707          * Last case, text. Note that References are handled directly.
4708          */
4709         else {
4710             htmlParseCharData(ctxt);
4711         }
4712 
4713         SHRINK;
4714         GROW;
4715     }
4716     if (currentNode != NULL) xmlFree(currentNode);
4717 }
4718 
4719 /**
4720  * htmlParseContent:
4721  * @ctxt:  an HTML parser context
4722  *
4723  * Parse a content: comment, sub-element, reference or text.
4724  * This is the entry point when called from parser.c
4725  */
4726 
4727 void
__htmlParseContent(void * ctxt)4728 __htmlParseContent(void *ctxt) {
4729     if (ctxt != NULL)
4730 	htmlParseContentInternal((htmlParserCtxtPtr) ctxt);
4731 }
4732 
4733 /**
4734  * htmlParseDocument:
4735  * @ctxt:  an HTML parser context
4736  *
4737  * Parse an HTML document and invoke the SAX handlers. This is useful
4738  * if you're only interested in custom SAX callbacks. If you want a
4739  * document tree, use htmlCtxtParseDocument.
4740  *
4741  * Returns 0, -1 in case of error.
4742  */
4743 
4744 int
htmlParseDocument(htmlParserCtxtPtr ctxt)4745 htmlParseDocument(htmlParserCtxtPtr ctxt) {
4746     xmlDtdPtr dtd;
4747 
4748     if ((ctxt == NULL) || (ctxt->input == NULL))
4749 	return(-1);
4750 
4751     /*
4752      * Document locator is unused. Only for backward compatibility.
4753      */
4754     if ((ctxt->sax) && (ctxt->sax->setDocumentLocator)) {
4755         xmlSAXLocator copy = xmlDefaultSAXLocator;
4756         ctxt->sax->setDocumentLocator(ctxt->userData, &copy);
4757     }
4758 
4759     xmlDetectEncoding(ctxt);
4760 
4761     /*
4762      * This is wrong but matches long-standing behavior. In most cases,
4763      * a document starting with an XML declaration will specify UTF-8.
4764      */
4765     if (((ctxt->input->flags & XML_INPUT_HAS_ENCODING) == 0) &&
4766         (xmlStrncmp(ctxt->input->cur, BAD_CAST "<?xm", 4) == 0))
4767         xmlSwitchEncoding(ctxt, XML_CHAR_ENCODING_UTF8);
4768 
4769     /*
4770      * Wipe out everything which is before the first '<'
4771      */
4772     SKIP_BLANKS;
4773     if (CUR == 0) {
4774 	htmlParseErr(ctxt, XML_ERR_DOCUMENT_EMPTY,
4775 	             "Document is empty\n", NULL, NULL);
4776     }
4777 
4778     if ((ctxt->sax) && (ctxt->sax->startDocument) && (!ctxt->disableSAX))
4779 	ctxt->sax->startDocument(ctxt->userData);
4780 
4781     /*
4782      * Parse possible comments and PIs before any content
4783      */
4784     while (((CUR == '<') && (NXT(1) == '!') &&
4785             (NXT(2) == '-') && (NXT(3) == '-')) ||
4786 	   ((CUR == '<') && (NXT(1) == '?'))) {
4787         htmlParseComment(ctxt);
4788         htmlParsePI(ctxt);
4789 	SKIP_BLANKS;
4790     }
4791 
4792 
4793     /*
4794      * Then possibly doc type declaration(s) and more Misc
4795      * (doctypedecl Misc*)?
4796      */
4797     if ((CUR == '<') && (NXT(1) == '!') &&
4798 	(UPP(2) == 'D') && (UPP(3) == 'O') &&
4799 	(UPP(4) == 'C') && (UPP(5) == 'T') &&
4800 	(UPP(6) == 'Y') && (UPP(7) == 'P') &&
4801 	(UPP(8) == 'E')) {
4802 	htmlParseDocTypeDecl(ctxt);
4803     }
4804     SKIP_BLANKS;
4805 
4806     /*
4807      * Parse possible comments and PIs before any content
4808      */
4809     while ((PARSER_STOPPED(ctxt) == 0) &&
4810            (((CUR == '<') && (NXT(1) == '!') &&
4811              (NXT(2) == '-') && (NXT(3) == '-')) ||
4812 	    ((CUR == '<') && (NXT(1) == '?')))) {
4813         htmlParseComment(ctxt);
4814         htmlParsePI(ctxt);
4815 	SKIP_BLANKS;
4816     }
4817 
4818     /*
4819      * Time to start parsing the tree itself
4820      */
4821     htmlParseContentInternal(ctxt);
4822 
4823     /*
4824      * autoclose
4825      */
4826     if (CUR == 0)
4827 	htmlAutoCloseOnEnd(ctxt);
4828 
4829 
4830     /*
4831      * SAX: end of the document processing.
4832      */
4833     if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
4834         ctxt->sax->endDocument(ctxt->userData);
4835 
4836     if ((!(ctxt->options & HTML_PARSE_NODEFDTD)) && (ctxt->myDoc != NULL)) {
4837 	dtd = xmlGetIntSubset(ctxt->myDoc);
4838 	if (dtd == NULL) {
4839 	    ctxt->myDoc->intSubset =
4840 		xmlCreateIntSubset(ctxt->myDoc, BAD_CAST "html",
4841 		    BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN",
4842 		    BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd");
4843             if (ctxt->myDoc->intSubset == NULL)
4844                 htmlErrMemory(ctxt);
4845         }
4846     }
4847     if (! ctxt->wellFormed) return(-1);
4848     return(0);
4849 }
4850 
4851 
4852 /************************************************************************
4853  *									*
4854  *			Parser contexts handling			*
4855  *									*
4856  ************************************************************************/
4857 
4858 /**
4859  * htmlInitParserCtxt:
4860  * @ctxt:  an HTML parser context
4861  * @sax:  SAX handler
4862  * @userData:  user data
4863  *
4864  * Initialize a parser context
4865  *
4866  * Returns 0 in case of success and -1 in case of error
4867  */
4868 
4869 static int
htmlInitParserCtxt(htmlParserCtxtPtr ctxt,const htmlSAXHandler * sax,void * userData)4870 htmlInitParserCtxt(htmlParserCtxtPtr ctxt, const htmlSAXHandler *sax,
4871                    void *userData)
4872 {
4873     if (ctxt == NULL) return(-1);
4874     memset(ctxt, 0, sizeof(htmlParserCtxt));
4875 
4876     ctxt->dict = xmlDictCreate();
4877     if (ctxt->dict == NULL)
4878 	return(-1);
4879 
4880     if (ctxt->sax == NULL)
4881         ctxt->sax = (htmlSAXHandler *) xmlMalloc(sizeof(htmlSAXHandler));
4882     if (ctxt->sax == NULL)
4883 	return(-1);
4884     if (sax == NULL) {
4885         memset(ctxt->sax, 0, sizeof(htmlSAXHandler));
4886         xmlSAX2InitHtmlDefaultSAXHandler(ctxt->sax);
4887         ctxt->userData = ctxt;
4888     } else {
4889         memcpy(ctxt->sax, sax, sizeof(htmlSAXHandler));
4890         ctxt->userData = userData ? userData : ctxt;
4891     }
4892 
4893     /* Allocate the Input stack */
4894     ctxt->inputTab = (htmlParserInputPtr *)
4895                       xmlMalloc(5 * sizeof(htmlParserInputPtr));
4896     if (ctxt->inputTab == NULL)
4897 	return(-1);
4898     ctxt->inputNr = 0;
4899     ctxt->inputMax = 5;
4900     ctxt->input = NULL;
4901     ctxt->version = NULL;
4902     ctxt->encoding = NULL;
4903     ctxt->standalone = -1;
4904     ctxt->instate = XML_PARSER_START;
4905 
4906     /* Allocate the Node stack */
4907     ctxt->nodeTab = (htmlNodePtr *) xmlMalloc(10 * sizeof(htmlNodePtr));
4908     if (ctxt->nodeTab == NULL)
4909 	return(-1);
4910     ctxt->nodeNr = 0;
4911     ctxt->nodeMax = 10;
4912     ctxt->node = NULL;
4913 
4914     /* Allocate the Name stack */
4915     ctxt->nameTab = (const xmlChar **) xmlMalloc(10 * sizeof(xmlChar *));
4916     if (ctxt->nameTab == NULL)
4917 	return(-1);
4918     ctxt->nameNr = 0;
4919     ctxt->nameMax = 10;
4920     ctxt->name = NULL;
4921 
4922     ctxt->nodeInfoTab = NULL;
4923     ctxt->nodeInfoNr  = 0;
4924     ctxt->nodeInfoMax = 0;
4925 
4926     ctxt->myDoc = NULL;
4927     ctxt->wellFormed = 1;
4928     ctxt->replaceEntities = 0;
4929     ctxt->linenumbers = xmlLineNumbersDefaultValue;
4930     ctxt->keepBlanks = xmlKeepBlanksDefaultValue;
4931     ctxt->html = 1;
4932     ctxt->vctxt.flags = XML_VCTXT_USE_PCTXT;
4933     ctxt->vctxt.userData = ctxt;
4934     ctxt->vctxt.error = xmlParserValidityError;
4935     ctxt->vctxt.warning = xmlParserValidityWarning;
4936     ctxt->record_info = 0;
4937     ctxt->validate = 0;
4938     ctxt->checkIndex = 0;
4939     ctxt->catalogs = NULL;
4940     xmlInitNodeInfoSeq(&ctxt->node_seq);
4941     return(0);
4942 }
4943 
4944 /**
4945  * htmlFreeParserCtxt:
4946  * @ctxt:  an HTML parser context
4947  *
4948  * Free all the memory used by a parser context. However the parsed
4949  * document in ctxt->myDoc is not freed.
4950  */
4951 
4952 void
htmlFreeParserCtxt(htmlParserCtxtPtr ctxt)4953 htmlFreeParserCtxt(htmlParserCtxtPtr ctxt)
4954 {
4955     xmlFreeParserCtxt(ctxt);
4956 }
4957 
4958 /**
4959  * htmlNewParserCtxt:
4960  *
4961  * Allocate and initialize a new HTML parser context.
4962  *
4963  * This can be used to parse HTML documents into DOM trees with
4964  * functions like xmlCtxtReadFile or xmlCtxtReadMemory.
4965  *
4966  * See htmlCtxtUseOptions for parser options.
4967  *
4968  * See xmlCtxtSetErrorHandler for advanced error handling.
4969  *
4970  * See xmlNewInputURL, xmlNewInputMemory, xmlNewInputIO and similar
4971  * functions for advanced input control.
4972  *
4973  * See htmlNewSAXParserCtxt for custom SAX parsers.
4974  *
4975  * Returns the htmlParserCtxtPtr or NULL in case of allocation error
4976  */
4977 
4978 htmlParserCtxtPtr
htmlNewParserCtxt(void)4979 htmlNewParserCtxt(void)
4980 {
4981     return(htmlNewSAXParserCtxt(NULL, NULL));
4982 }
4983 
4984 /**
4985  * htmlNewSAXParserCtxt:
4986  * @sax:  SAX handler
4987  * @userData:  user data
4988  *
4989  * Allocate and initialize a new HTML SAX parser context. If userData
4990  * is NULL, the parser context will be passed as user data.
4991  *
4992  * Available since 2.11.0. If you want support older versions,
4993  * it's best to invoke htmlNewParserCtxt and set ctxt->sax with
4994  * struct assignment.
4995  *
4996  * Also see htmlNewParserCtxt.
4997  *
4998  * Returns the htmlParserCtxtPtr or NULL in case of allocation error
4999  */
5000 
5001 htmlParserCtxtPtr
htmlNewSAXParserCtxt(const htmlSAXHandler * sax,void * userData)5002 htmlNewSAXParserCtxt(const htmlSAXHandler *sax, void *userData)
5003 {
5004     xmlParserCtxtPtr ctxt;
5005 
5006     xmlInitParser();
5007 
5008     ctxt = (xmlParserCtxtPtr) xmlMalloc(sizeof(xmlParserCtxt));
5009     if (ctxt == NULL)
5010 	return(NULL);
5011     memset(ctxt, 0, sizeof(xmlParserCtxt));
5012     if (htmlInitParserCtxt(ctxt, sax, userData) < 0) {
5013         htmlFreeParserCtxt(ctxt);
5014 	return(NULL);
5015     }
5016     return(ctxt);
5017 }
5018 
5019 static htmlParserCtxtPtr
htmlCreateMemoryParserCtxtInternal(const char * url,const char * buffer,size_t size,const char * encoding)5020 htmlCreateMemoryParserCtxtInternal(const char *url,
5021                                    const char *buffer, size_t size,
5022                                    const char *encoding) {
5023     xmlParserCtxtPtr ctxt;
5024     xmlParserInputPtr input;
5025 
5026     if (buffer == NULL)
5027 	return(NULL);
5028 
5029     ctxt = htmlNewParserCtxt();
5030     if (ctxt == NULL)
5031 	return(NULL);
5032 
5033     input = xmlNewInputMemory(ctxt, url, buffer, size, encoding, 0);
5034     if (input == NULL) {
5035 	xmlFreeParserCtxt(ctxt);
5036         return(NULL);
5037     }
5038 
5039     inputPush(ctxt, input);
5040 
5041     return(ctxt);
5042 }
5043 
5044 /**
5045  * htmlCreateMemoryParserCtxt:
5046  * @buffer:  a pointer to a char array
5047  * @size:  the size of the array
5048  *
5049  * DEPRECATED: Use htmlNewParserCtxt and htmlCtxtReadMemory.
5050  *
5051  * Create a parser context for an HTML in-memory document. The input
5052  * buffer must not contain any terminating null bytes.
5053  *
5054  * Returns the new parser context or NULL
5055  */
5056 htmlParserCtxtPtr
htmlCreateMemoryParserCtxt(const char * buffer,int size)5057 htmlCreateMemoryParserCtxt(const char *buffer, int size) {
5058     if (size <= 0)
5059 	return(NULL);
5060 
5061     return(htmlCreateMemoryParserCtxtInternal(NULL, buffer, size, NULL));
5062 }
5063 
5064 /**
5065  * htmlCreateDocParserCtxt:
5066  * @str:  a pointer to an array of xmlChar
5067  * @encoding:  encoding (optional)
5068  *
5069  * Create a parser context for a null-terminated string.
5070  *
5071  * Returns the new parser context or NULL if a memory allocation failed.
5072  */
5073 static htmlParserCtxtPtr
htmlCreateDocParserCtxt(const xmlChar * str,const char * url,const char * encoding)5074 htmlCreateDocParserCtxt(const xmlChar *str, const char *url,
5075                         const char *encoding) {
5076     xmlParserCtxtPtr ctxt;
5077     xmlParserInputPtr input;
5078 
5079     if (str == NULL)
5080 	return(NULL);
5081 
5082     ctxt = htmlNewParserCtxt();
5083     if (ctxt == NULL)
5084 	return(NULL);
5085 
5086     input = xmlNewInputString(ctxt, url, (const char *) str, encoding, 0);
5087     if (input == NULL) {
5088 	xmlFreeParserCtxt(ctxt);
5089 	return(NULL);
5090     }
5091 
5092     inputPush(ctxt, input);
5093 
5094     return(ctxt);
5095 }
5096 
5097 #ifdef LIBXML_PUSH_ENABLED
5098 /************************************************************************
5099  *									*
5100  *	Progressive parsing interfaces				*
5101  *									*
5102  ************************************************************************/
5103 
5104 /**
5105  * htmlParseLookupSequence:
5106  * @ctxt:  an HTML parser context
5107  * @first:  the first char to lookup
5108  * @next:  the next char to lookup or zero
5109  * @third:  the next char to lookup or zero
5110  * @ignoreattrval: skip over attribute values
5111  *
5112  * Try to find if a sequence (first, next, third) or  just (first next) or
5113  * (first) is available in the input stream.
5114  * This function has a side effect of (possibly) incrementing ctxt->checkIndex
5115  * to avoid rescanning sequences of bytes, it DOES change the state of the
5116  * parser, do not use liberally.
5117  * This is basically similar to xmlParseLookupSequence()
5118  *
5119  * Returns the index to the current parsing point if the full sequence
5120  *      is available, -1 otherwise.
5121  */
5122 static int
htmlParseLookupSequence(htmlParserCtxtPtr ctxt,xmlChar first,xmlChar next,xmlChar third,int ignoreattrval)5123 htmlParseLookupSequence(htmlParserCtxtPtr ctxt, xmlChar first,
5124                         xmlChar next, xmlChar third, int ignoreattrval)
5125 {
5126     size_t base, len;
5127     htmlParserInputPtr in;
5128     const xmlChar *buf;
5129     int quote;
5130 
5131     in = ctxt->input;
5132     if (in == NULL)
5133         return (-1);
5134 
5135     base = ctxt->checkIndex;
5136     quote = ctxt->endCheckState;
5137 
5138     buf = in->cur;
5139     len = in->end - in->cur;
5140 
5141     /* take into account the sequence length */
5142     if (third)
5143         len -= 2;
5144     else if (next)
5145         len--;
5146     for (; base < len; base++) {
5147         if (base >= INT_MAX / 2) {
5148             ctxt->checkIndex = 0;
5149             ctxt->endCheckState = 0;
5150             return (base - 2);
5151         }
5152         if (ignoreattrval) {
5153             if (quote) {
5154                 if (buf[base] == quote)
5155                     quote = 0;
5156                 continue;
5157             }
5158             if (buf[base] == '"' || buf[base] == '\'') {
5159                 quote = buf[base];
5160                 continue;
5161             }
5162         }
5163         if (buf[base] == first) {
5164             if (third != 0) {
5165                 if ((buf[base + 1] != next) || (buf[base + 2] != third))
5166                     continue;
5167             } else if (next != 0) {
5168                 if (buf[base + 1] != next)
5169                     continue;
5170             }
5171             ctxt->checkIndex = 0;
5172             ctxt->endCheckState = 0;
5173             return (base);
5174         }
5175     }
5176     ctxt->checkIndex = base;
5177     ctxt->endCheckState = quote;
5178     return (-1);
5179 }
5180 
5181 /**
5182  * htmlParseLookupCommentEnd:
5183  * @ctxt: an HTML parser context
5184  *
5185  * Try to find a comment end tag in the input stream
5186  * The search includes "-->" as well as WHATWG-recommended incorrectly-closed tags.
5187  * (See https://html.spec.whatwg.org/multipage/parsing.html#parse-error-incorrectly-closed-comment)
5188  * This function has a side effect of (possibly) incrementing ctxt->checkIndex
5189  * to avoid rescanning sequences of bytes, it DOES change the state of the
5190  * parser, do not use liberally.
5191  * This wraps to htmlParseLookupSequence()
5192  *
5193  * Returns the index to the current parsing point if the full sequence is available, -1 otherwise.
5194  */
5195 static int
htmlParseLookupCommentEnd(htmlParserCtxtPtr ctxt)5196 htmlParseLookupCommentEnd(htmlParserCtxtPtr ctxt)
5197 {
5198     int mark = 0;
5199     int offset;
5200 
5201     while (1) {
5202 	mark = htmlParseLookupSequence(ctxt, '-', '-', 0, 0);
5203 	if (mark < 0)
5204             break;
5205         if ((NXT(mark+2) == '>') ||
5206 	    ((NXT(mark+2) == '!') && (NXT(mark+3) == '>'))) {
5207             ctxt->checkIndex = 0;
5208 	    break;
5209 	}
5210         offset = (NXT(mark+2) == '!') ? 3 : 2;
5211         if (mark + offset >= ctxt->input->end - ctxt->input->cur) {
5212 	    ctxt->checkIndex = mark;
5213             return(-1);
5214         }
5215 	ctxt->checkIndex = mark + 1;
5216     }
5217     return mark;
5218 }
5219 
5220 
5221 /**
5222  * htmlParseTryOrFinish:
5223  * @ctxt:  an HTML parser context
5224  * @terminate:  last chunk indicator
5225  *
5226  * Try to progress on parsing
5227  *
5228  * Returns zero if no parsing was possible
5229  */
5230 static int
htmlParseTryOrFinish(htmlParserCtxtPtr ctxt,int terminate)5231 htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) {
5232     int ret = 0;
5233     htmlParserInputPtr in;
5234     ptrdiff_t avail = 0;
5235     xmlChar cur, next;
5236 
5237     htmlParserNodeInfo node_info;
5238 
5239     while (PARSER_STOPPED(ctxt) == 0) {
5240 
5241 	in = ctxt->input;
5242 	if (in == NULL) break;
5243 	avail = in->end - in->cur;
5244 	if ((avail == 0) && (terminate)) {
5245 	    htmlAutoCloseOnEnd(ctxt);
5246 	    if ((ctxt->nameNr == 0) && (ctxt->instate != XML_PARSER_EOF)) {
5247 		/*
5248 		 * SAX: end of the document processing.
5249 		 */
5250 		ctxt->instate = XML_PARSER_EOF;
5251 		if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
5252 		    ctxt->sax->endDocument(ctxt->userData);
5253 	    }
5254 	}
5255         if (avail < 1)
5256 	    goto done;
5257         /*
5258          * This is done to make progress and avoid an infinite loop
5259          * if a parsing attempt was aborted by hitting a NUL byte. After
5260          * changing htmlCurrentChar, this probably isn't necessary anymore.
5261          * We should consider removing this check.
5262          */
5263 	cur = in->cur[0];
5264 	if (cur == 0) {
5265 	    SKIP(1);
5266 	    continue;
5267 	}
5268 
5269         switch (ctxt->instate) {
5270             case XML_PARSER_EOF:
5271 	        /*
5272 		 * Document parsing is done !
5273 		 */
5274 	        goto done;
5275             case XML_PARSER_START:
5276                 /*
5277                  * This is wrong but matches long-standing behavior. In most
5278                  * cases, a document starting with an XML declaration will
5279                  * specify UTF-8.
5280                  */
5281                 if (((ctxt->input->flags & XML_INPUT_HAS_ENCODING) == 0) &&
5282                     (xmlStrncmp(ctxt->input->cur, BAD_CAST "<?xm", 4) == 0)) {
5283                     xmlSwitchEncoding(ctxt, XML_CHAR_ENCODING_UTF8);
5284                 }
5285 
5286 	        /*
5287 		 * Very first chars read from the document flow.
5288 		 */
5289 		cur = in->cur[0];
5290 		if (IS_BLANK_CH(cur)) {
5291 		    SKIP_BLANKS;
5292                     avail = in->end - in->cur;
5293 		}
5294                 if ((ctxt->sax) && (ctxt->sax->setDocumentLocator)) {
5295                     xmlSAXLocator copy = xmlDefaultSAXLocator;
5296                     ctxt->sax->setDocumentLocator(ctxt->userData, &copy);
5297                 }
5298 		if ((ctxt->sax) && (ctxt->sax->startDocument) &&
5299 	            (!ctxt->disableSAX))
5300 		    ctxt->sax->startDocument(ctxt->userData);
5301 
5302 		cur = in->cur[0];
5303 		next = in->cur[1];
5304 		if ((cur == '<') && (next == '!') &&
5305 		    (UPP(2) == 'D') && (UPP(3) == 'O') &&
5306 		    (UPP(4) == 'C') && (UPP(5) == 'T') &&
5307 		    (UPP(6) == 'Y') && (UPP(7) == 'P') &&
5308 		    (UPP(8) == 'E')) {
5309 		    if ((!terminate) &&
5310 		        (htmlParseLookupSequence(ctxt, '>', 0, 0, 1) < 0))
5311 			goto done;
5312 		    htmlParseDocTypeDecl(ctxt);
5313 		    ctxt->instate = XML_PARSER_PROLOG;
5314                 } else {
5315 		    ctxt->instate = XML_PARSER_MISC;
5316 		}
5317 		break;
5318             case XML_PARSER_MISC:
5319 		SKIP_BLANKS;
5320                 avail = in->end - in->cur;
5321 		/*
5322 		 * no chars in buffer
5323 		 */
5324 		if (avail < 1)
5325 		    goto done;
5326 		/*
5327 		 * not enough chars in buffer
5328 		 */
5329 		if (avail < 2) {
5330 		    if (!terminate)
5331 			goto done;
5332 		    else
5333 			next = ' ';
5334 		} else {
5335 		    next = in->cur[1];
5336 		}
5337 		cur = in->cur[0];
5338 	        if ((cur == '<') && (next == '!') &&
5339 		    (in->cur[2] == '-') && (in->cur[3] == '-')) {
5340 		    if ((!terminate) && (htmlParseLookupCommentEnd(ctxt) < 0))
5341 			goto done;
5342 		    htmlParseComment(ctxt);
5343 		    ctxt->instate = XML_PARSER_MISC;
5344 	        } else if ((cur == '<') && (next == '?')) {
5345 		    if ((!terminate) &&
5346 		        (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
5347 			goto done;
5348 		    htmlParsePI(ctxt);
5349 		    ctxt->instate = XML_PARSER_MISC;
5350 		} else if ((cur == '<') && (next == '!') &&
5351 		    (UPP(2) == 'D') && (UPP(3) == 'O') &&
5352 		    (UPP(4) == 'C') && (UPP(5) == 'T') &&
5353 		    (UPP(6) == 'Y') && (UPP(7) == 'P') &&
5354 		    (UPP(8) == 'E')) {
5355 		    if ((!terminate) &&
5356 		        (htmlParseLookupSequence(ctxt, '>', 0, 0, 1) < 0))
5357 			goto done;
5358 		    htmlParseDocTypeDecl(ctxt);
5359 		    ctxt->instate = XML_PARSER_PROLOG;
5360 		} else if ((cur == '<') && (next == '!') &&
5361 		           (avail < 9)) {
5362 		    goto done;
5363 		} else {
5364 		    ctxt->instate = XML_PARSER_CONTENT;
5365 		}
5366 		break;
5367             case XML_PARSER_PROLOG:
5368 		SKIP_BLANKS;
5369                 avail = in->end - in->cur;
5370 		if (avail < 2)
5371 		    goto done;
5372 		cur = in->cur[0];
5373 		next = in->cur[1];
5374 		if ((cur == '<') && (next == '!') &&
5375 		    (in->cur[2] == '-') && (in->cur[3] == '-')) {
5376 		    if ((!terminate) && (htmlParseLookupCommentEnd(ctxt) < 0))
5377 			goto done;
5378 		    htmlParseComment(ctxt);
5379 		    ctxt->instate = XML_PARSER_PROLOG;
5380 	        } else if ((cur == '<') && (next == '?')) {
5381 		    if ((!terminate) &&
5382 		        (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
5383 			goto done;
5384 		    htmlParsePI(ctxt);
5385 		    ctxt->instate = XML_PARSER_PROLOG;
5386 		} else if ((cur == '<') && (next == '!') &&
5387 		           (avail < 4)) {
5388 		    goto done;
5389 		} else {
5390 		    ctxt->instate = XML_PARSER_CONTENT;
5391 		}
5392 		break;
5393             case XML_PARSER_EPILOG:
5394                 avail = in->end - in->cur;
5395 		if (avail < 1)
5396 		    goto done;
5397 		cur = in->cur[0];
5398 		if (IS_BLANK_CH(cur)) {
5399 		    htmlParseCharData(ctxt);
5400 		    goto done;
5401 		}
5402 		if (avail < 2)
5403 		    goto done;
5404 		next = in->cur[1];
5405 	        if ((cur == '<') && (next == '!') &&
5406 		    (in->cur[2] == '-') && (in->cur[3] == '-')) {
5407 		    if ((!terminate) && (htmlParseLookupCommentEnd(ctxt) < 0))
5408 			goto done;
5409 		    htmlParseComment(ctxt);
5410 		    ctxt->instate = XML_PARSER_EPILOG;
5411 	        } else if ((cur == '<') && (next == '?')) {
5412 		    if ((!terminate) &&
5413 		        (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
5414 			goto done;
5415 		    htmlParsePI(ctxt);
5416 		    ctxt->instate = XML_PARSER_EPILOG;
5417 		} else if ((cur == '<') && (next == '!') &&
5418 		           (avail < 4)) {
5419 		    goto done;
5420 		} else {
5421 		    ctxt->errNo = XML_ERR_DOCUMENT_END;
5422 		    ctxt->wellFormed = 0;
5423 		    ctxt->instate = XML_PARSER_EOF;
5424 		    if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
5425 			ctxt->sax->endDocument(ctxt->userData);
5426 		    goto done;
5427 		}
5428 		break;
5429             case XML_PARSER_START_TAG: {
5430 	        const xmlChar *name;
5431 		int failed;
5432 		const htmlElemDesc * info;
5433 
5434 		/*
5435 		 * no chars in buffer
5436 		 */
5437 		if (avail < 1)
5438 		    goto done;
5439 		/*
5440 		 * not enough chars in buffer
5441 		 */
5442 		if (avail < 2) {
5443 		    if (!terminate)
5444 			goto done;
5445 		    else
5446 			next = ' ';
5447 		} else {
5448 		    next = in->cur[1];
5449 		}
5450 		cur = in->cur[0];
5451 	        if (cur != '<') {
5452 		    ctxt->instate = XML_PARSER_CONTENT;
5453 		    break;
5454 		}
5455 		if (next == '/') {
5456 		    ctxt->instate = XML_PARSER_END_TAG;
5457 		    ctxt->checkIndex = 0;
5458 		    break;
5459 		}
5460 		if ((!terminate) &&
5461 		    (htmlParseLookupSequence(ctxt, '>', 0, 0, 1) < 0))
5462 		    goto done;
5463 
5464                 /* Capture start position */
5465 	        if (ctxt->record_info) {
5466 	             node_info.begin_pos = ctxt->input->consumed +
5467 	                                (CUR_PTR - ctxt->input->base);
5468 	             node_info.begin_line = ctxt->input->line;
5469 	        }
5470 
5471 
5472 		failed = htmlParseStartTag(ctxt);
5473 		name = ctxt->name;
5474 		if ((failed == -1) ||
5475 		    (name == NULL)) {
5476 		    if (CUR == '>')
5477 			NEXT;
5478 		    break;
5479 		}
5480 
5481 		/*
5482 		 * Lookup the info for that element.
5483 		 */
5484 		info = htmlTagLookup(name);
5485 		if (info == NULL) {
5486 		    htmlParseErr(ctxt, XML_HTML_UNKNOWN_TAG,
5487 		                 "Tag %s invalid\n", name, NULL);
5488 		}
5489 
5490 		/*
5491 		 * Check for an Empty Element labeled the XML/SGML way
5492 		 */
5493 		if ((CUR == '/') && (NXT(1) == '>')) {
5494 		    SKIP(2);
5495 		    if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
5496 			ctxt->sax->endElement(ctxt->userData, name);
5497 		    htmlnamePop(ctxt);
5498 		    ctxt->instate = XML_PARSER_CONTENT;
5499 		    break;
5500 		}
5501 
5502 		if (CUR == '>') {
5503 		    NEXT;
5504 		} else {
5505 		    htmlParseErr(ctxt, XML_ERR_GT_REQUIRED,
5506 		                 "Couldn't find end of Start Tag %s\n",
5507 				 name, NULL);
5508 
5509 		    /*
5510 		     * end of parsing of this node.
5511 		     */
5512 		    if (xmlStrEqual(name, ctxt->name)) {
5513 			nodePop(ctxt);
5514 			htmlnamePop(ctxt);
5515 		    }
5516 
5517 		    if (ctxt->record_info)
5518 		        htmlNodeInfoPush(ctxt, &node_info);
5519 
5520 		    ctxt->instate = XML_PARSER_CONTENT;
5521 		    break;
5522 		}
5523 
5524 		/*
5525 		 * Check for an Empty Element from DTD definition
5526 		 */
5527 		if ((info != NULL) && (info->empty)) {
5528 		    if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
5529 			ctxt->sax->endElement(ctxt->userData, name);
5530 		    htmlnamePop(ctxt);
5531 		}
5532 
5533                 if (ctxt->record_info)
5534 	            htmlNodeInfoPush(ctxt, &node_info);
5535 
5536 		ctxt->instate = XML_PARSER_CONTENT;
5537                 break;
5538 	    }
5539             case XML_PARSER_CONTENT: {
5540 		xmlChar chr[2] = { 0, 0 };
5541 
5542                 /*
5543 		 * Handle preparsed entities and charRef
5544 		 */
5545 		if ((avail == 1) && (terminate)) {
5546 		    cur = in->cur[0];
5547 		    if ((cur != '<') && (cur != '&')) {
5548 			if (ctxt->sax != NULL) {
5549                             chr[0] = cur;
5550 			    if (IS_BLANK_CH(cur)) {
5551 				if (ctxt->keepBlanks) {
5552 				    if (ctxt->sax->characters != NULL)
5553 					ctxt->sax->characters(
5554 						ctxt->userData, chr, 1);
5555 				} else {
5556 				    if (ctxt->sax->ignorableWhitespace != NULL)
5557 					ctxt->sax->ignorableWhitespace(
5558 						ctxt->userData, chr, 1);
5559 				}
5560 			    } else {
5561 				htmlCheckParagraph(ctxt);
5562 				if (ctxt->sax->characters != NULL)
5563 				    ctxt->sax->characters(
5564 					    ctxt->userData, chr, 1);
5565 			    }
5566 			}
5567 			ctxt->checkIndex = 0;
5568 			in->cur++;
5569 			break;
5570 		    }
5571 		}
5572 		if (avail < 2)
5573 		    goto done;
5574 		cur = in->cur[0];
5575 		next = in->cur[1];
5576 		if ((xmlStrEqual(ctxt->name, BAD_CAST"script")) ||
5577 		    (xmlStrEqual(ctxt->name, BAD_CAST"style"))) {
5578 		    /*
5579 		     * Handle SCRIPT/STYLE separately
5580 		     */
5581 		    if (!terminate) {
5582 		        int idx;
5583 			xmlChar val;
5584 
5585 			idx = htmlParseLookupSequence(ctxt, '<', '/', 0, 0);
5586 			if (idx < 0)
5587 			    goto done;
5588 		        val = in->cur[idx + 2];
5589 			if (val == 0) { /* bad cut of input */
5590                             /*
5591                              * FIXME: htmlParseScript checks for additional
5592                              * characters after '</'.
5593                              */
5594                             ctxt->checkIndex = idx;
5595 			    goto done;
5596                         }
5597 		    }
5598 		    htmlParseScript(ctxt);
5599 		    if ((cur == '<') && (next == '/')) {
5600 			ctxt->instate = XML_PARSER_END_TAG;
5601 			ctxt->checkIndex = 0;
5602 			break;
5603 		    }
5604 		} else if ((cur == '<') && (next == '!')) {
5605                     if (avail < 4)
5606                         goto done;
5607                     /*
5608                      * Sometimes DOCTYPE arrives in the middle of the document
5609                      */
5610                     if ((UPP(2) == 'D') && (UPP(3) == 'O') &&
5611                         (UPP(4) == 'C') && (UPP(5) == 'T') &&
5612                         (UPP(6) == 'Y') && (UPP(7) == 'P') &&
5613                         (UPP(8) == 'E')) {
5614                         if ((!terminate) &&
5615                             (htmlParseLookupSequence(ctxt, '>', 0, 0, 1) < 0))
5616                             goto done;
5617                         htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
5618                                      "Misplaced DOCTYPE declaration\n",
5619                                      BAD_CAST "DOCTYPE" , NULL);
5620                         htmlParseDocTypeDecl(ctxt);
5621                     } else if ((in->cur[2] == '-') && (in->cur[3] == '-')) {
5622                         if ((!terminate) &&
5623                             (htmlParseLookupCommentEnd(ctxt) < 0))
5624                             goto done;
5625                         htmlParseComment(ctxt);
5626                         ctxt->instate = XML_PARSER_CONTENT;
5627                     } else {
5628                         if ((!terminate) &&
5629                             (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
5630                             goto done;
5631                         htmlSkipBogusComment(ctxt);
5632                     }
5633                 } else if ((cur == '<') && (next == '?')) {
5634                     if ((!terminate) &&
5635                         (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
5636                         goto done;
5637                     htmlParsePI(ctxt);
5638                     ctxt->instate = XML_PARSER_CONTENT;
5639                 } else if ((cur == '<') && (next == '/')) {
5640                     ctxt->instate = XML_PARSER_END_TAG;
5641                     ctxt->checkIndex = 0;
5642                     break;
5643                 } else if ((cur == '<') && IS_ASCII_LETTER(next)) {
5644                     if ((!terminate) && (next == 0))
5645                         goto done;
5646                     ctxt->instate = XML_PARSER_START_TAG;
5647                     ctxt->checkIndex = 0;
5648                     break;
5649                 } else if (cur == '<') {
5650                     if ((ctxt->sax != NULL) && (!ctxt->disableSAX) &&
5651                         (ctxt->sax->characters != NULL))
5652                         ctxt->sax->characters(ctxt->userData,
5653                                               BAD_CAST "<", 1);
5654                     NEXT;
5655                 } else {
5656                     /*
5657                      * check that the text sequence is complete
5658                      * before handing out the data to the parser
5659                      * to avoid problems with erroneous end of
5660                      * data detection.
5661                      */
5662                     if ((!terminate) &&
5663                         (htmlParseLookupSequence(ctxt, '<', 0, 0, 0) < 0))
5664                         goto done;
5665                     ctxt->checkIndex = 0;
5666                     while ((PARSER_STOPPED(ctxt) == 0) &&
5667                            (cur != '<') && (in->cur < in->end)) {
5668                         if (cur == '&') {
5669                             htmlParseReference(ctxt);
5670                         } else {
5671                             htmlParseCharData(ctxt);
5672                         }
5673                         cur = in->cur[0];
5674                     }
5675 		}
5676 
5677 		break;
5678 	    }
5679             case XML_PARSER_END_TAG:
5680 		if (avail < 2)
5681 		    goto done;
5682 		if ((!terminate) &&
5683 		    (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
5684 		    goto done;
5685 		htmlParseEndTag(ctxt);
5686 		if (ctxt->nameNr == 0) {
5687 		    ctxt->instate = XML_PARSER_EPILOG;
5688 		} else {
5689 		    ctxt->instate = XML_PARSER_CONTENT;
5690 		}
5691 		ctxt->checkIndex = 0;
5692 	        break;
5693 	    default:
5694 		htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5695 			     "HPP: internal error\n", NULL, NULL);
5696 		ctxt->instate = XML_PARSER_EOF;
5697 		break;
5698 	}
5699     }
5700 done:
5701     if ((avail == 0) && (terminate)) {
5702 	htmlAutoCloseOnEnd(ctxt);
5703 	if ((ctxt->nameNr == 0) && (ctxt->instate != XML_PARSER_EOF)) {
5704 	    /*
5705 	     * SAX: end of the document processing.
5706 	     */
5707 	    ctxt->instate = XML_PARSER_EOF;
5708 	    if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
5709 		ctxt->sax->endDocument(ctxt->userData);
5710 	}
5711     }
5712     if ((!(ctxt->options & HTML_PARSE_NODEFDTD)) && (ctxt->myDoc != NULL) &&
5713 	((terminate) || (ctxt->instate == XML_PARSER_EOF) ||
5714 	 (ctxt->instate == XML_PARSER_EPILOG))) {
5715 	xmlDtdPtr dtd;
5716 	dtd = xmlGetIntSubset(ctxt->myDoc);
5717 	if (dtd == NULL) {
5718 	    ctxt->myDoc->intSubset =
5719 		xmlCreateIntSubset(ctxt->myDoc, BAD_CAST "html",
5720 		    BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN",
5721 		    BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd");
5722             if (ctxt->myDoc->intSubset == NULL)
5723                 htmlErrMemory(ctxt);
5724         }
5725     }
5726     return(ret);
5727 }
5728 
5729 /**
5730  * htmlParseChunk:
5731  * @ctxt:  an HTML parser context
5732  * @chunk:  chunk of memory
5733  * @size:  size of chunk in bytes
5734  * @terminate:  last chunk indicator
5735  *
5736  * Parse a chunk of memory in push parser mode.
5737  *
5738  * Assumes that the parser context was initialized with
5739  * htmlCreatePushParserCtxt.
5740  *
5741  * The last chunk, which will often be empty, must be marked with
5742  * the @terminate flag. With the default SAX callbacks, the resulting
5743  * document will be available in ctxt->myDoc. This pointer will not
5744  * be freed by the library.
5745  *
5746  * If the document isn't well-formed, ctxt->myDoc is set to NULL.
5747  *
5748  * Returns an xmlParserErrors code (0 on success).
5749  */
5750 int
htmlParseChunk(htmlParserCtxtPtr ctxt,const char * chunk,int size,int terminate)5751 htmlParseChunk(htmlParserCtxtPtr ctxt, const char *chunk, int size,
5752               int terminate) {
5753     if ((ctxt == NULL) || (ctxt->input == NULL))
5754 	return(XML_ERR_ARGUMENT);
5755     if (PARSER_STOPPED(ctxt) != 0)
5756         return(ctxt->errNo);
5757     if ((size > 0) && (chunk != NULL) && (ctxt->input != NULL) &&
5758         (ctxt->input->buf != NULL))  {
5759 	size_t pos = ctxt->input->cur - ctxt->input->base;
5760 	int res;
5761 
5762 	res = xmlParserInputBufferPush(ctxt->input->buf, size, chunk);
5763         xmlBufUpdateInput(ctxt->input->buf->buffer, ctxt->input, pos);
5764 	if (res < 0) {
5765             htmlParseErr(ctxt, ctxt->input->buf->error,
5766                          "xmlParserInputBufferPush failed", NULL, NULL);
5767             xmlHaltParser(ctxt);
5768 	    return (ctxt->errNo);
5769 	}
5770     }
5771     htmlParseTryOrFinish(ctxt, terminate);
5772     if (terminate) {
5773 	if (ctxt->instate != XML_PARSER_EOF) {
5774 	    if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
5775 		ctxt->sax->endDocument(ctxt->userData);
5776 	}
5777 	ctxt->instate = XML_PARSER_EOF;
5778     }
5779     return((xmlParserErrors) ctxt->errNo);
5780 }
5781 
5782 /************************************************************************
5783  *									*
5784  *			User entry points				*
5785  *									*
5786  ************************************************************************/
5787 
5788 /**
5789  * htmlCreatePushParserCtxt:
5790  * @sax:  a SAX handler (optional)
5791  * @user_data:  The user data returned on SAX callbacks (optional)
5792  * @chunk:  a pointer to an array of chars (optional)
5793  * @size:  number of chars in the array
5794  * @filename:  only used for error reporting (optional)
5795  * @enc:  encoding (deprecated, pass XML_CHAR_ENCODING_NONE)
5796  *
5797  * Create a parser context for using the HTML parser in push mode.
5798  *
5799  * Returns the new parser context or NULL if a memory allocation
5800  * failed.
5801  */
5802 htmlParserCtxtPtr
htmlCreatePushParserCtxt(htmlSAXHandlerPtr sax,void * user_data,const char * chunk,int size,const char * filename,xmlCharEncoding enc)5803 htmlCreatePushParserCtxt(htmlSAXHandlerPtr sax, void *user_data,
5804                          const char *chunk, int size, const char *filename,
5805 			 xmlCharEncoding enc) {
5806     htmlParserCtxtPtr ctxt;
5807     htmlParserInputPtr input;
5808     const char *encoding;
5809 
5810     ctxt = htmlNewSAXParserCtxt(sax, user_data);
5811     if (ctxt == NULL)
5812 	return(NULL);
5813 
5814     encoding = xmlGetCharEncodingName(enc);
5815     input = xmlNewInputPush(ctxt, filename, chunk, size, encoding);
5816     if (input == NULL) {
5817 	htmlFreeParserCtxt(ctxt);
5818 	return(NULL);
5819     }
5820     inputPush(ctxt, input);
5821 
5822     return(ctxt);
5823 }
5824 #endif /* LIBXML_PUSH_ENABLED */
5825 
5826 /**
5827  * htmlSAXParseDoc:
5828  * @cur:  a pointer to an array of xmlChar
5829  * @encoding:  a free form C string describing the HTML document encoding, or NULL
5830  * @sax:  the SAX handler block
5831  * @userData: if using SAX, this pointer will be provided on callbacks.
5832  *
5833  * DEPRECATED: Use htmlNewSAXParserCtxt and htmlCtxtReadDoc.
5834  *
5835  * Parse an HTML in-memory document. If sax is not NULL, use the SAX callbacks
5836  * to handle parse events. If sax is NULL, fallback to the default DOM
5837  * behavior and return a tree.
5838  *
5839  * Returns the resulting document tree unless SAX is NULL or the document is
5840  *     not well formed.
5841  */
5842 
5843 htmlDocPtr
htmlSAXParseDoc(const xmlChar * cur,const char * encoding,htmlSAXHandlerPtr sax,void * userData)5844 htmlSAXParseDoc(const xmlChar *cur, const char *encoding,
5845                 htmlSAXHandlerPtr sax, void *userData) {
5846     htmlDocPtr ret;
5847     htmlParserCtxtPtr ctxt;
5848 
5849     if (cur == NULL)
5850         return(NULL);
5851 
5852     ctxt = htmlCreateDocParserCtxt(cur, NULL, encoding);
5853     if (ctxt == NULL)
5854         return(NULL);
5855 
5856     if (sax != NULL) {
5857         *ctxt->sax = *sax;
5858         ctxt->userData = userData;
5859     }
5860 
5861     htmlParseDocument(ctxt);
5862     ret = ctxt->myDoc;
5863     htmlFreeParserCtxt(ctxt);
5864 
5865     return(ret);
5866 }
5867 
5868 /**
5869  * htmlParseDoc:
5870  * @cur:  a pointer to an array of xmlChar
5871  * @encoding:  the encoding (optional)
5872  *
5873  * DEPRECATED: Use htmlReadDoc.
5874  *
5875  * Parse an HTML in-memory document and build a tree.
5876  *
5877  * This function uses deprecated global parser options.
5878  *
5879  * Returns the resulting document tree
5880  */
5881 
5882 htmlDocPtr
htmlParseDoc(const xmlChar * cur,const char * encoding)5883 htmlParseDoc(const xmlChar *cur, const char *encoding) {
5884     return(htmlSAXParseDoc(cur, encoding, NULL, NULL));
5885 }
5886 
5887 
5888 /**
5889  * htmlCreateFileParserCtxt:
5890  * @filename:  the filename
5891  * @encoding:  optional encoding
5892  *
5893  * DEPRECATED: Use htmlNewParserCtxt and htmlCtxtReadFile.
5894  *
5895  * Create a parser context to read from a file.
5896  *
5897  * A non-NULL encoding overrides encoding declarations in the document.
5898  *
5899  * Automatic support for ZLIB/Compress compressed document is provided
5900  * by default if found at compile-time.
5901  *
5902  * Returns the new parser context or NULL if a memory allocation failed.
5903  */
5904 htmlParserCtxtPtr
htmlCreateFileParserCtxt(const char * filename,const char * encoding)5905 htmlCreateFileParserCtxt(const char *filename, const char *encoding)
5906 {
5907     htmlParserCtxtPtr ctxt;
5908     htmlParserInputPtr input;
5909 
5910     if (filename == NULL)
5911         return(NULL);
5912 
5913     ctxt = htmlNewParserCtxt();
5914     if (ctxt == NULL) {
5915 	return(NULL);
5916     }
5917 
5918     input = xmlNewInputURL(ctxt, filename, NULL, encoding, 0);
5919     if (input == NULL) {
5920 	xmlFreeParserCtxt(ctxt);
5921 	return(NULL);
5922     }
5923     inputPush(ctxt, input);
5924 
5925     return(ctxt);
5926 }
5927 
5928 /**
5929  * htmlSAXParseFile:
5930  * @filename:  the filename
5931  * @encoding:  encoding (optional)
5932  * @sax:  the SAX handler block
5933  * @userData: if using SAX, this pointer will be provided on callbacks.
5934  *
5935  * DEPRECATED: Use htmlNewSAXParserCtxt and htmlCtxtReadFile.
5936  *
5937  * parse an HTML file and build a tree. Automatic support for ZLIB/Compress
5938  * compressed document is provided by default if found at compile-time.
5939  * It use the given SAX function block to handle the parsing callback.
5940  * If sax is NULL, fallback to the default DOM tree building routines.
5941  *
5942  * Returns the resulting document tree unless SAX is NULL or the document is
5943  *     not well formed.
5944  */
5945 
5946 htmlDocPtr
htmlSAXParseFile(const char * filename,const char * encoding,htmlSAXHandlerPtr sax,void * userData)5947 htmlSAXParseFile(const char *filename, const char *encoding, htmlSAXHandlerPtr sax,
5948                  void *userData) {
5949     htmlDocPtr ret;
5950     htmlParserCtxtPtr ctxt;
5951     htmlSAXHandlerPtr oldsax = NULL;
5952 
5953     ctxt = htmlCreateFileParserCtxt(filename, encoding);
5954     if (ctxt == NULL) return(NULL);
5955     if (sax != NULL) {
5956 	oldsax = ctxt->sax;
5957         ctxt->sax = sax;
5958         ctxt->userData = userData;
5959     }
5960 
5961     htmlParseDocument(ctxt);
5962 
5963     ret = ctxt->myDoc;
5964     if (sax != NULL) {
5965         ctxt->sax = oldsax;
5966         ctxt->userData = NULL;
5967     }
5968     htmlFreeParserCtxt(ctxt);
5969 
5970     return(ret);
5971 }
5972 
5973 /**
5974  * htmlParseFile:
5975  * @filename:  the filename
5976  * @encoding:  encoding (optional)
5977  *
5978  * Parse an HTML file and build a tree.
5979  *
5980  * See xmlNewInputURL for details.
5981  *
5982  * Returns the resulting document tree
5983  */
5984 
5985 htmlDocPtr
htmlParseFile(const char * filename,const char * encoding)5986 htmlParseFile(const char *filename, const char *encoding) {
5987     return(htmlSAXParseFile(filename, encoding, NULL, NULL));
5988 }
5989 
5990 /**
5991  * htmlHandleOmittedElem:
5992  * @val:  int 0 or 1
5993  *
5994  * Set and return the previous value for handling HTML omitted tags.
5995  *
5996  * Returns the last value for 0 for no handling, 1 for auto insertion.
5997  */
5998 
5999 int
htmlHandleOmittedElem(int val)6000 htmlHandleOmittedElem(int val) {
6001     int old = htmlOmittedDefaultValue;
6002 
6003     htmlOmittedDefaultValue = val;
6004     return(old);
6005 }
6006 
6007 /**
6008  * htmlElementAllowedHere:
6009  * @parent: HTML parent element
6010  * @elt: HTML element
6011  *
6012  * Checks whether an HTML element may be a direct child of a parent element.
6013  * Note - doesn't check for deprecated elements
6014  *
6015  * Returns 1 if allowed; 0 otherwise.
6016  */
6017 int
htmlElementAllowedHere(const htmlElemDesc * parent,const xmlChar * elt)6018 htmlElementAllowedHere(const htmlElemDesc* parent, const xmlChar* elt) {
6019   const char** p ;
6020 
6021   if ( ! elt || ! parent || ! parent->subelts )
6022 	return 0 ;
6023 
6024   for ( p = parent->subelts; *p; ++p )
6025     if ( !xmlStrcmp((const xmlChar *)*p, elt) )
6026       return 1 ;
6027 
6028   return 0 ;
6029 }
6030 /**
6031  * htmlElementStatusHere:
6032  * @parent: HTML parent element
6033  * @elt: HTML element
6034  *
6035  * Checks whether an HTML element may be a direct child of a parent element.
6036  * and if so whether it is valid or deprecated.
6037  *
6038  * Returns one of HTML_VALID, HTML_DEPRECATED, HTML_INVALID
6039  */
6040 htmlStatus
htmlElementStatusHere(const htmlElemDesc * parent,const htmlElemDesc * elt)6041 htmlElementStatusHere(const htmlElemDesc* parent, const htmlElemDesc* elt) {
6042   if ( ! parent || ! elt )
6043     return HTML_INVALID ;
6044   if ( ! htmlElementAllowedHere(parent, (const xmlChar*) elt->name ) )
6045     return HTML_INVALID ;
6046 
6047   return ( elt->dtd == 0 ) ? HTML_VALID : HTML_DEPRECATED ;
6048 }
6049 /**
6050  * htmlAttrAllowed:
6051  * @elt: HTML element
6052  * @attr: HTML attribute
6053  * @legacy: whether to allow deprecated attributes
6054  *
6055  * Checks whether an attribute is valid for an element
6056  * Has full knowledge of Required and Deprecated attributes
6057  *
6058  * Returns one of HTML_REQUIRED, HTML_VALID, HTML_DEPRECATED, HTML_INVALID
6059  */
6060 htmlStatus
htmlAttrAllowed(const htmlElemDesc * elt,const xmlChar * attr,int legacy)6061 htmlAttrAllowed(const htmlElemDesc* elt, const xmlChar* attr, int legacy) {
6062   const char** p ;
6063 
6064   if ( !elt || ! attr )
6065 	return HTML_INVALID ;
6066 
6067   if ( elt->attrs_req )
6068     for ( p = elt->attrs_req; *p; ++p)
6069       if ( !xmlStrcmp((const xmlChar*)*p, attr) )
6070         return HTML_REQUIRED ;
6071 
6072   if ( elt->attrs_opt )
6073     for ( p = elt->attrs_opt; *p; ++p)
6074       if ( !xmlStrcmp((const xmlChar*)*p, attr) )
6075         return HTML_VALID ;
6076 
6077   if ( legacy && elt->attrs_depr )
6078     for ( p = elt->attrs_depr; *p; ++p)
6079       if ( !xmlStrcmp((const xmlChar*)*p, attr) )
6080         return HTML_DEPRECATED ;
6081 
6082   return HTML_INVALID ;
6083 }
6084 /**
6085  * htmlNodeStatus:
6086  * @node: an htmlNodePtr in a tree
6087  * @legacy: whether to allow deprecated elements (YES is faster here
6088  *	for Element nodes)
6089  *
6090  * Checks whether the tree node is valid.  Experimental (the author
6091  *     only uses the HTML enhancements in a SAX parser)
6092  *
6093  * Return: for Element nodes, a return from htmlElementAllowedHere (if
6094  *	legacy allowed) or htmlElementStatusHere (otherwise).
6095  *	for Attribute nodes, a return from htmlAttrAllowed
6096  *	for other nodes, HTML_NA (no checks performed)
6097  */
6098 htmlStatus
htmlNodeStatus(htmlNodePtr node,int legacy)6099 htmlNodeStatus(htmlNodePtr node, int legacy) {
6100   if ( ! node )
6101     return HTML_INVALID ;
6102 
6103   switch ( node->type ) {
6104     case XML_ELEMENT_NODE:
6105       return legacy
6106 	? ( htmlElementAllowedHere (
6107 		htmlTagLookup(node->parent->name) , node->name
6108 		) ? HTML_VALID : HTML_INVALID )
6109 	: htmlElementStatusHere(
6110 		htmlTagLookup(node->parent->name) ,
6111 		htmlTagLookup(node->name) )
6112 	;
6113     case XML_ATTRIBUTE_NODE:
6114       return htmlAttrAllowed(
6115 	htmlTagLookup(node->parent->name) , node->name, legacy) ;
6116     default: return HTML_NA ;
6117   }
6118 }
6119 /************************************************************************
6120  *									*
6121  *	New set (2.6.0) of simpler and more flexible APIs		*
6122  *									*
6123  ************************************************************************/
/**
 * DICT_FREE:
 * @str:  a string
 *
 * Free a string if it is not owned by the "dict" dictionary in the
 * current scope. A NULL @str is ignored, and a NULL "dict" means the
 * string is always freed.
 *
 * The expansion is wrapped in do { } while (0) so that it behaves as a
 * single statement (safe under an unbraced if/else); callers must
 * terminate the invocation with a semicolon.
 */
#define DICT_FREE(str)						\
    do {							\
	if ((str) && ((!dict) ||				\
	    (xmlDictOwns(dict, (const xmlChar *)(str)) == 0)))	\
	    xmlFree((char *)(str));				\
    } while (0)
6135 
6136 /**
6137  * htmlCtxtReset:
6138  * @ctxt: an HTML parser context
6139  *
6140  * Reset a parser context
6141  */
6142 void
htmlCtxtReset(htmlParserCtxtPtr ctxt)6143 htmlCtxtReset(htmlParserCtxtPtr ctxt)
6144 {
6145     xmlParserInputPtr input;
6146     xmlDictPtr dict;
6147 
6148     if (ctxt == NULL)
6149         return;
6150 
6151     dict = ctxt->dict;
6152 
6153     while ((input = inputPop(ctxt)) != NULL) { /* Non consuming */
6154         xmlFreeInputStream(input);
6155     }
6156     ctxt->inputNr = 0;
6157     ctxt->input = NULL;
6158 
6159     ctxt->spaceNr = 0;
6160     if (ctxt->spaceTab != NULL) {
6161 	ctxt->spaceTab[0] = -1;
6162 	ctxt->space = &ctxt->spaceTab[0];
6163     } else {
6164 	ctxt->space = NULL;
6165     }
6166 
6167 
6168     ctxt->nodeNr = 0;
6169     ctxt->node = NULL;
6170 
6171     ctxt->nameNr = 0;
6172     ctxt->name = NULL;
6173 
6174     ctxt->nsNr = 0;
6175 
6176     DICT_FREE(ctxt->version);
6177     ctxt->version = NULL;
6178     DICT_FREE(ctxt->encoding);
6179     ctxt->encoding = NULL;
6180     DICT_FREE(ctxt->extSubURI);
6181     ctxt->extSubURI = NULL;
6182     DICT_FREE(ctxt->extSubSystem);
6183     ctxt->extSubSystem = NULL;
6184     if (ctxt->myDoc != NULL)
6185         xmlFreeDoc(ctxt->myDoc);
6186     ctxt->myDoc = NULL;
6187 
6188     ctxt->standalone = -1;
6189     ctxt->hasExternalSubset = 0;
6190     ctxt->hasPErefs = 0;
6191     ctxt->html = 1;
6192     ctxt->instate = XML_PARSER_START;
6193 
6194     ctxt->wellFormed = 1;
6195     ctxt->nsWellFormed = 1;
6196     ctxt->disableSAX = 0;
6197     ctxt->valid = 1;
6198     ctxt->vctxt.userData = ctxt;
6199     ctxt->vctxt.flags = XML_VCTXT_USE_PCTXT;
6200     ctxt->vctxt.error = xmlParserValidityError;
6201     ctxt->vctxt.warning = xmlParserValidityWarning;
6202     ctxt->record_info = 0;
6203     ctxt->checkIndex = 0;
6204     ctxt->endCheckState = 0;
6205     ctxt->inSubset = 0;
6206     ctxt->errNo = XML_ERR_OK;
6207     ctxt->depth = 0;
6208     ctxt->catalogs = NULL;
6209     xmlInitNodeInfoSeq(&ctxt->node_seq);
6210 
6211     if (ctxt->attsDefault != NULL) {
6212         xmlHashFree(ctxt->attsDefault, xmlHashDefaultDeallocator);
6213         ctxt->attsDefault = NULL;
6214     }
6215     if (ctxt->attsSpecial != NULL) {
6216         xmlHashFree(ctxt->attsSpecial, NULL);
6217         ctxt->attsSpecial = NULL;
6218     }
6219 
6220     ctxt->nbErrors = 0;
6221     ctxt->nbWarnings = 0;
6222     if (ctxt->lastError.code != XML_ERR_OK)
6223         xmlResetError(&ctxt->lastError);
6224 }
6225 
6226 /**
6227  * htmlCtxtUseOptions:
6228  * @ctxt: an HTML parser context
6229  * @options:  a combination of htmlParserOption(s)
6230  *
6231  * Applies the options to the parser context
6232  *
6233  * Returns 0 in case of success, the set of unknown or unimplemented options
6234  *         in case of error.
6235  */
6236 int
htmlCtxtUseOptions(htmlParserCtxtPtr ctxt,int options)6237 htmlCtxtUseOptions(htmlParserCtxtPtr ctxt, int options)
6238 {
6239     if (ctxt == NULL)
6240         return(-1);
6241 
6242     if (options & HTML_PARSE_NOWARNING) {
6243         ctxt->sax->warning = NULL;
6244         ctxt->vctxt.warning = NULL;
6245         options -= XML_PARSE_NOWARNING;
6246 	ctxt->options |= XML_PARSE_NOWARNING;
6247     }
6248     if (options & HTML_PARSE_NOERROR) {
6249         ctxt->sax->error = NULL;
6250         ctxt->vctxt.error = NULL;
6251         ctxt->sax->fatalError = NULL;
6252         options -= XML_PARSE_NOERROR;
6253 	ctxt->options |= XML_PARSE_NOERROR;
6254     }
6255     if (options & HTML_PARSE_PEDANTIC) {
6256         ctxt->pedantic = 1;
6257         options -= XML_PARSE_PEDANTIC;
6258 	ctxt->options |= XML_PARSE_PEDANTIC;
6259     } else
6260         ctxt->pedantic = 0;
6261     if (options & XML_PARSE_NOBLANKS) {
6262         ctxt->keepBlanks = 0;
6263         ctxt->sax->ignorableWhitespace = xmlSAX2IgnorableWhitespace;
6264         options -= XML_PARSE_NOBLANKS;
6265 	ctxt->options |= XML_PARSE_NOBLANKS;
6266     } else
6267         ctxt->keepBlanks = 1;
6268     if (options & HTML_PARSE_RECOVER) {
6269         ctxt->recovery = 1;
6270 	options -= HTML_PARSE_RECOVER;
6271     } else
6272         ctxt->recovery = 0;
6273     if (options & HTML_PARSE_COMPACT) {
6274 	ctxt->options |= HTML_PARSE_COMPACT;
6275         options -= HTML_PARSE_COMPACT;
6276     }
6277     if (options & XML_PARSE_HUGE) {
6278 	ctxt->options |= XML_PARSE_HUGE;
6279         options -= XML_PARSE_HUGE;
6280     }
6281     if (options & HTML_PARSE_NODEFDTD) {
6282 	ctxt->options |= HTML_PARSE_NODEFDTD;
6283         options -= HTML_PARSE_NODEFDTD;
6284     }
6285     if (options & HTML_PARSE_IGNORE_ENC) {
6286 	ctxt->options |= HTML_PARSE_IGNORE_ENC;
6287         options -= HTML_PARSE_IGNORE_ENC;
6288     }
6289     if (options & HTML_PARSE_NOIMPLIED) {
6290         ctxt->options |= HTML_PARSE_NOIMPLIED;
6291         options -= HTML_PARSE_NOIMPLIED;
6292     }
6293     ctxt->dictNames = 0;
6294     ctxt->linenumbers = 1;
6295     return (options);
6296 }
6297 
6298 /**
6299  * htmlCtxtParseDocument:
6300  * @ctxt:  an HTML parser context
6301  *
6302  * Parse an HTML document and return the resulting document tree.
6303  *
6304  * Returns the resulting document tree or NULL
6305  */
6306 htmlDocPtr
htmlCtxtParseDocument(htmlParserCtxtPtr ctxt,xmlParserInputPtr input)6307 htmlCtxtParseDocument(htmlParserCtxtPtr ctxt, xmlParserInputPtr input)
6308 {
6309     htmlDocPtr ret;
6310 
6311     if ((ctxt == NULL) || (input == NULL))
6312         return(NULL);
6313 
6314     /* assert(ctxt->inputNr == 0); */
6315     while (ctxt->inputNr > 0)
6316         xmlFreeInputStream(inputPop(ctxt));
6317 
6318     if (inputPush(ctxt, input) < 0) {
6319         xmlFreeInputStream(input);
6320         return(NULL);
6321     }
6322 
6323     ctxt->html = 1;
6324     htmlParseDocument(ctxt);
6325 
6326     if (ctxt->errNo != XML_ERR_NO_MEMORY) {
6327         ret = ctxt->myDoc;
6328     } else {
6329         ret = NULL;
6330         xmlFreeDoc(ctxt->myDoc);
6331     }
6332     ctxt->myDoc = NULL;
6333 
6334     /* assert(ctxt->inputNr == 1); */
6335     while (ctxt->inputNr > 0)
6336         xmlFreeInputStream(inputPop(ctxt));
6337 
6338     return(ret);
6339 }
6340 
6341 /**
6342  * htmlReadDoc:
6343  * @str:  a pointer to a zero terminated string
6344  * @url:  only used for error reporting (optoinal)
6345  * @encoding:  the document encoding (optional)
6346  * @options:  a combination of htmlParserOptions
6347  *
6348  * Convenience function to parse an HTML document from a zero-terminated
6349  * string.
6350  *
6351  * See htmlCtxtReadDoc for details.
6352  *
6353  * Returns the resulting document tree.
6354  */
6355 htmlDocPtr
htmlReadDoc(const xmlChar * str,const char * url,const char * encoding,int options)6356 htmlReadDoc(const xmlChar *str, const char *url, const char *encoding,
6357             int options)
6358 {
6359     htmlParserCtxtPtr ctxt;
6360     xmlParserInputPtr input;
6361     htmlDocPtr doc;
6362 
6363     ctxt = htmlNewParserCtxt();
6364     if (ctxt == NULL)
6365         return(NULL);
6366 
6367     htmlCtxtUseOptions(ctxt, options);
6368 
6369     input = xmlNewInputString(ctxt, url, (const char *) str, encoding,
6370                               XML_INPUT_BUF_STATIC);
6371 
6372     doc = htmlCtxtParseDocument(ctxt, input);
6373 
6374     htmlFreeParserCtxt(ctxt);
6375     return(doc);
6376 }
6377 
6378 /**
6379  * htmlReadFile:
6380  * @filename:  a file or URL
6381  * @encoding:  the document encoding (optional)
6382  * @options:  a combination of htmlParserOptions
6383  *
6384  * Convenience function to parse an HTML file from the filesystem,
6385  * the network or a global user-defined resource loader.
6386  *
6387  * See htmlCtxtReadFile for details.
6388  *
6389  * Returns the resulting document tree.
6390  */
6391 htmlDocPtr
htmlReadFile(const char * filename,const char * encoding,int options)6392 htmlReadFile(const char *filename, const char *encoding, int options)
6393 {
6394     htmlParserCtxtPtr ctxt;
6395     xmlParserInputPtr input;
6396     htmlDocPtr doc;
6397 
6398     ctxt = htmlNewParserCtxt();
6399     if (ctxt == NULL)
6400         return(NULL);
6401 
6402     htmlCtxtUseOptions(ctxt, options);
6403 
6404     input = xmlNewInputURL(ctxt, filename, NULL, encoding, 0);
6405 
6406     doc = htmlCtxtParseDocument(ctxt, input);
6407 
6408     htmlFreeParserCtxt(ctxt);
6409     return(doc);
6410 }
6411 
6412 /**
6413  * htmlReadMemory:
6414  * @buffer:  a pointer to a char array
6415  * @size:  the size of the array
6416  * @url:  only used for error reporting (optional)
6417  * @encoding:  the document encoding, or NULL
6418  * @options:  a combination of htmlParserOption(s)
6419  *
6420  * Convenience function to parse an HTML document from memory.
6421  * The input buffer must not contain any terminating null bytes.
6422  *
6423  * See htmlCtxtReadMemory for details.
6424  *
6425  * Returns the resulting document tree
6426  */
6427 htmlDocPtr
htmlReadMemory(const char * buffer,int size,const char * url,const char * encoding,int options)6428 htmlReadMemory(const char *buffer, int size, const char *url,
6429                const char *encoding, int options)
6430 {
6431     htmlParserCtxtPtr ctxt;
6432     xmlParserInputPtr input;
6433     htmlDocPtr doc;
6434 
6435     if (size < 0)
6436 	return(NULL);
6437 
6438     ctxt = htmlNewParserCtxt();
6439     if (ctxt == NULL)
6440         return(NULL);
6441 
6442     htmlCtxtUseOptions(ctxt, options);
6443 
6444     input = xmlNewInputMemory(ctxt, url, buffer, size, encoding,
6445                               XML_INPUT_BUF_STATIC);
6446 
6447     doc = htmlCtxtParseDocument(ctxt, input);
6448 
6449     htmlFreeParserCtxt(ctxt);
6450     return(doc);
6451 }
6452 
6453 /**
6454  * htmlReadFd:
6455  * @fd:  an open file descriptor
6456  * @url:  only used for error reporting (optional)
6457  * @encoding:  the document encoding, or NULL
6458  * @options:  a combination of htmlParserOptions
6459  *
6460  * Convenience function to parse an HTML document from a
6461  * file descriptor.
6462  *
6463  * NOTE that the file descriptor will not be closed when the
6464  * context is freed or reset.
6465  *
6466  * See htmlCtxtReadFd for details.
6467  *
6468  * Returns the resulting document tree
6469  */
6470 htmlDocPtr
htmlReadFd(int fd,const char * url,const char * encoding,int options)6471 htmlReadFd(int fd, const char *url, const char *encoding, int options)
6472 {
6473     htmlParserCtxtPtr ctxt;
6474     xmlParserInputPtr input;
6475     htmlDocPtr doc;
6476 
6477     ctxt = htmlNewParserCtxt();
6478     if (ctxt == NULL)
6479         return(NULL);
6480 
6481     htmlCtxtUseOptions(ctxt, options);
6482 
6483     input = xmlNewInputFd(ctxt, url, fd, encoding, 0);
6484     input->buf->closecallback = NULL;
6485 
6486     doc = htmlCtxtParseDocument(ctxt, input);
6487 
6488     htmlFreeParserCtxt(ctxt);
6489     return(doc);
6490 }
6491 
6492 /**
6493  * htmlReadIO:
6494  * @ioread:  an I/O read function
6495  * @ioclose:  an I/O close function (optional)
6496  * @ioctx:  an I/O handler
6497  * @url:  only used for error reporting (optional)
6498  * @encoding:  the document encoding (optional)
6499  * @options:  a combination of htmlParserOption(s)
6500  *
6501  * Convenience function to parse an HTML document from I/O functions
6502  * and context.
6503  *
6504  * See htmlCtxtReadIO for details.
6505  *
6506  * Returns the resulting document tree
6507  */
6508 htmlDocPtr
htmlReadIO(xmlInputReadCallback ioread,xmlInputCloseCallback ioclose,void * ioctx,const char * url,const char * encoding,int options)6509 htmlReadIO(xmlInputReadCallback ioread, xmlInputCloseCallback ioclose,
6510           void *ioctx, const char *url, const char *encoding, int options)
6511 {
6512     htmlParserCtxtPtr ctxt;
6513     xmlParserInputPtr input;
6514     htmlDocPtr doc;
6515 
6516     ctxt = htmlNewParserCtxt();
6517     if (ctxt == NULL)
6518         return (NULL);
6519 
6520     htmlCtxtUseOptions(ctxt, options);
6521 
6522     input = xmlNewInputIO(ctxt, url, ioread, ioclose, ioctx, encoding, 0);
6523 
6524     doc = htmlCtxtParseDocument(ctxt, input);
6525 
6526     htmlFreeParserCtxt(ctxt);
6527     return(doc);
6528 }
6529 
6530 /**
6531  * htmlCtxtReadDoc:
6532  * @ctxt:  an HTML parser context
6533  * @str:  a pointer to a zero terminated string
6534  * @URL:  only used for error reporting (optional)
6535  * @encoding:  the document encoding (optional)
6536  * @options:  a combination of htmlParserOptions
6537  *
6538  * Parse an HTML in-memory document and build a tree.
6539  *
6540  * See htmlCtxtUseOptions for details.
6541  *
6542  * Returns the resulting document tree
6543  */
6544 htmlDocPtr
htmlCtxtReadDoc(htmlParserCtxtPtr ctxt,const xmlChar * str,const char * URL,const char * encoding,int options)6545 htmlCtxtReadDoc(htmlParserCtxtPtr ctxt, const xmlChar *str,
6546                 const char *URL, const char *encoding, int options)
6547 {
6548     xmlParserInputPtr input;
6549 
6550     if (ctxt == NULL)
6551         return (NULL);
6552 
6553     htmlCtxtReset(ctxt);
6554     htmlCtxtUseOptions(ctxt, options);
6555 
6556     input = xmlNewInputString(ctxt, URL, (const char *) str, encoding, 0);
6557 
6558     return(htmlCtxtParseDocument(ctxt, input));
6559 }
6560 
6561 /**
6562  * htmlCtxtReadFile:
6563  * @ctxt:  an HTML parser context
6564  * @filename:  a file or URL
6565  * @encoding:  the document encoding (optional)
6566  * @options:  a combination of htmlParserOptions
6567  *
6568  * Parse an HTML file from the filesystem, the network or a
6569  * user-defined resource loader.
6570  *
6571  * See xmlNewInputURL and htmlCtxtUseOptions for details.
6572  *
6573  * Returns the resulting document tree
6574  */
6575 htmlDocPtr
htmlCtxtReadFile(htmlParserCtxtPtr ctxt,const char * filename,const char * encoding,int options)6576 htmlCtxtReadFile(htmlParserCtxtPtr ctxt, const char *filename,
6577                 const char *encoding, int options)
6578 {
6579     xmlParserInputPtr input;
6580 
6581     if (ctxt == NULL)
6582         return (NULL);
6583 
6584     htmlCtxtReset(ctxt);
6585     htmlCtxtUseOptions(ctxt, options);
6586 
6587     input = xmlNewInputURL(ctxt, filename, NULL, encoding, 0);
6588 
6589     return(htmlCtxtParseDocument(ctxt, input));
6590 }
6591 
6592 /**
6593  * htmlCtxtReadMemory:
6594  * @ctxt:  an HTML parser context
6595  * @buffer:  a pointer to a char array
6596  * @size:  the size of the array
6597  * @URL:  only used for error reporting (optional)
6598  * @encoding:  the document encoding (optinal)
6599  * @options:  a combination of htmlParserOptions
6600  *
6601  * Parse an HTML in-memory document and build a tree. The input buffer must
6602  * not contain any terminating null bytes.
6603  *
6604  * See htmlCtxtUseOptions for details.
6605  *
6606  * Returns the resulting document tree
6607  */
6608 htmlDocPtr
htmlCtxtReadMemory(htmlParserCtxtPtr ctxt,const char * buffer,int size,const char * URL,const char * encoding,int options)6609 htmlCtxtReadMemory(htmlParserCtxtPtr ctxt, const char *buffer, int size,
6610                   const char *URL, const char *encoding, int options)
6611 {
6612     xmlParserInputPtr input;
6613 
6614     if ((ctxt == NULL) || (size < 0))
6615         return (NULL);
6616 
6617     htmlCtxtReset(ctxt);
6618     htmlCtxtUseOptions(ctxt, options);
6619 
6620     input = xmlNewInputMemory(ctxt, URL, buffer, size, encoding,
6621                               XML_INPUT_BUF_STATIC);
6622 
6623     return(htmlCtxtParseDocument(ctxt, input));
6624 }
6625 
6626 /**
6627  * htmlCtxtReadFd:
6628  * @ctxt:  an HTML parser context
6629  * @fd:  an open file descriptor
6630  * @URL:  only used for error reporting (optional)
6631  * @encoding:  the document encoding (optinal)
6632  * @options:  a combination of htmlParserOptions
6633  *
6634  * Parse an HTML from a file descriptor and build a tree.
6635  *
6636  * See htmlCtxtUseOptions for details.
6637  *
6638  * NOTE that the file descriptor will not be closed when the
6639  * context is freed or reset.
6640  *
6641  * Returns the resulting document tree
6642  */
6643 htmlDocPtr
htmlCtxtReadFd(htmlParserCtxtPtr ctxt,int fd,const char * URL,const char * encoding,int options)6644 htmlCtxtReadFd(htmlParserCtxtPtr ctxt, int fd,
6645               const char *URL, const char *encoding, int options)
6646 {
6647     xmlParserInputPtr input;
6648 
6649     if (ctxt == NULL)
6650         return(NULL);
6651 
6652     htmlCtxtReset(ctxt);
6653     htmlCtxtUseOptions(ctxt, options);
6654 
6655     input = xmlNewInputFd(ctxt, URL, fd, encoding, 0);
6656     input->buf->closecallback = NULL;
6657 
6658     return(htmlCtxtParseDocument(ctxt, input));
6659 }
6660 
6661 /**
6662  * htmlCtxtReadIO:
6663  * @ctxt:  an HTML parser context
6664  * @ioread:  an I/O read function
6665  * @ioclose:  an I/O close function
6666  * @ioctx:  an I/O handler
6667  * @URL:  the base URL to use for the document
6668  * @encoding:  the document encoding, or NULL
6669  * @options:  a combination of htmlParserOption(s)
6670  *
6671  * Parse an HTML document from I/O functions and source and build a tree.
6672  *
6673  * See xmlNewInputIO and htmlCtxtUseOptions for details.
6674  *
6675  * Returns the resulting document tree
6676  */
6677 htmlDocPtr
htmlCtxtReadIO(htmlParserCtxtPtr ctxt,xmlInputReadCallback ioread,xmlInputCloseCallback ioclose,void * ioctx,const char * URL,const char * encoding,int options)6678 htmlCtxtReadIO(htmlParserCtxtPtr ctxt, xmlInputReadCallback ioread,
6679               xmlInputCloseCallback ioclose, void *ioctx,
6680 	      const char *URL,
6681               const char *encoding, int options)
6682 {
6683     xmlParserInputPtr input;
6684 
6685     if (ctxt == NULL)
6686         return (NULL);
6687 
6688     htmlCtxtReset(ctxt);
6689     htmlCtxtUseOptions(ctxt, options);
6690 
6691     input = xmlNewInputIO(ctxt, URL, ioread, ioclose, ioctx, encoding, 0);
6692 
6693     return(htmlCtxtParseDocument(ctxt, input));
6694 }
6695 
6696 #endif /* LIBXML_HTML_ENABLED */
6697