1 /*
2 * HTMLparser.c : an HTML parser
3 *
4 * References:
5 * HTML Living Standard
6 * https://html.spec.whatwg.org/multipage/parsing.html
7 *
8 * Tokenization now conforms to HTML5. Tree construction still follows
9 * a custom, non-standard implementation. See:
10 *
11 * https://gitlab.gnome.org/GNOME/libxml2/-/issues/211
12 *
13 * See Copyright for the status of this software.
14 *
15 * [email protected]
16 */
17
18 #define IN_LIBXML
19 #include "libxml.h"
20 #ifdef LIBXML_HTML_ENABLED
21
22 #include <string.h>
23 #include <ctype.h>
24 #include <stdlib.h>
25
26 #include <libxml/HTMLparser.h>
27 #include <libxml/xmlmemory.h>
28 #include <libxml/tree.h>
29 #include <libxml/parser.h>
30 #include <libxml/parserInternals.h>
31 #include <libxml/xmlerror.h>
32 #include <libxml/HTMLtree.h>
33 #include <libxml/entities.h>
34 #include <libxml/encoding.h>
35 #include <libxml/xmlIO.h>
36 #include <libxml/uri.h>
37
38 #include "private/buf.h"
39 #include "private/dict.h"
40 #include "private/enc.h"
41 #include "private/error.h"
42 #include "private/html.h"
43 #include "private/io.h"
44 #include "private/parser.h"
45 #include "private/tree.h"
46
/*
 * Size limits used by the HTML parser.
 * NOTE(review): units and exact users should be confirmed at the use sites
 * before changing any of these values.
 */
#define HTML_MAX_NAMELEN 1000
#define HTML_PARSER_BIG_BUFFER_SIZE 1000
#define HTML_PARSER_BUFFER_SIZE 100

/*
 * IS_WS_HTML: true for space (0x20), tab (0x09), line feed (0x0A),
 * form feed (0x0C) and carriage return (0x0D). Vertical tab (0x0B) lies
 * inside the tested range and is excluded explicitly.
 * The argument is evaluated several times; pass a side-effect free expression.
 */
#define IS_WS_HTML(c) \
    (((c) == 0x20) || \
     (((c) >= 0x09) && ((c) <= 0x0D) && ((c) != 0x0B)))

/*
 * IS_HEX_DIGIT: true for '0'-'9', 'a'-'f' and 'A'-'F'. OR-ing with 0x20
 * maps the ASCII uppercase letters onto their lowercase counterparts so a
 * single range test covers both cases.
 */
#define IS_HEX_DIGIT(c) \
    ((IS_ASCII_DIGIT(c)) || \
     ((((c) | 0x20) >= 'a') && (((c) | 0x20) <= 'f')))

/* IS_UPPER: true for the ASCII uppercase letters 'A'-'Z' only. */
#define IS_UPPER(c) \
    (((c) >= 'A') && ((c) <= 'Z'))

/* IS_ALNUM: true when the character is an ASCII letter or an ASCII digit. */
#define IS_ALNUM(c) \
    (IS_ASCII_LETTER(c) || IS_ASCII_DIGIT(c))
64
/*
 * A 64-bit set of ASCII code points stored as two words, tested with
 * htmlMaskMatch(): bit (c & 31) of element [c / 32] is set when character c
 * belongs to the set. Element [0] therefore covers 0x00-0x1F and element [1]
 * covers 0x20-0x3F (hence the "- 32" in the initializers below). Characters
 * >= 64 are never members.
 */
typedef const unsigned htmlAsciiMask[2];

/* Double quote. */
static htmlAsciiMask MASK_DQ = {
    0,
    1u << ('"' - 32),
};
/* Single quote. */
static htmlAsciiMask MASK_SQ = {
    0,
    1u << ('\'' - 32),
};
/* Greater-than sign. */
static htmlAsciiMask MASK_GT = {
    0,
    1u << ('>' - 32),
};
/* Dash. */
static htmlAsciiMask MASK_DASH = {
    0,
    1u << ('-' - 32),
};
/* Tab, LF, FF, CR, space (the same set as IS_WS_HTML) or '>'. */
static htmlAsciiMask MASK_WS_GT = {
    1u << 0x09 | 1u << 0x0A | 1u << 0x0C | 1u << 0x0D,
    1u << (' ' - 32) | 1u << ('>' - 32),
};
/* Double quote or '>'. */
static htmlAsciiMask MASK_DQ_GT = {
    0,
    1u << ('"' - 32) | 1u << ('>' - 32),
};
/* Single quote or '>'. */
static htmlAsciiMask MASK_SQ_GT = {
    0,
    1u << ('\'' - 32) | 1u << ('>' - 32),
};
95
/*
 * NOTE(review): judging by the name, the default for the handling of
 * omitted elements (enabled = 1) -- confirm against the code that reads it.
 */
static int htmlOmittedDefaultValue = 1;

/* Forward declaration of a static function defined elsewhere in this file. */
static int
htmlParseElementInternal(htmlParserCtxtPtr ctxt);
100
101 /************************************************************************
102 * *
103 * Some factorized error routines *
104 * *
105 ************************************************************************/
106
/**
 * htmlErrMemory:
 * @ctxt:  an HTML parser context
 *
 * Report a memory allocation failure. This is a thin wrapper that forwards
 * the context to xmlCtxtErrMemory().
 */
static void
htmlErrMemory(xmlParserCtxtPtr ctxt)
{
    xmlCtxtErrMemory(ctxt);
}
119
/**
 * htmlParseErr:
 * @ctxt:  an HTML parser context
 * @error:  the error number
 * @msg:  the error message, used as a printf-style format string
 * @str1:  first string argument, may be NULL
 * @str2:  second string argument, may be NULL
 *
 * Report an HTML parser error through xmlCtxtErr() with domain
 * XML_FROM_HTML and level XML_ERR_ERROR. @str1 and @str2 are passed twice:
 * once as the string fields of the error and once as the arguments
 * for @msg.
 */
static void LIBXML_ATTR_FORMAT(3,0)
htmlParseErr(xmlParserCtxtPtr ctxt, xmlParserErrors error,
             const char *msg, const xmlChar *str1, const xmlChar *str2)
{
    xmlCtxtErr(ctxt, NULL, XML_FROM_HTML, error, XML_ERR_ERROR,
               str1, str2, NULL, 0, msg, str1, str2);
}
137
138 /************************************************************************
139 * *
140 * Parser stacks related functions and macros *
141 * *
142 ************************************************************************/
143
144 /**
145 * htmlnamePush:
146 * @ctxt: an HTML parser context
147 * @value: the element name
148 *
149 * Pushes a new element name on top of the name stack
150 *
151 * Returns -1 in case of error, the index in the stack otherwise
152 */
153 static int
htmlnamePush(htmlParserCtxtPtr ctxt,const xmlChar * value)154 htmlnamePush(htmlParserCtxtPtr ctxt, const xmlChar * value)
155 {
156 if ((ctxt->html < 3) && (xmlStrEqual(value, BAD_CAST "head")))
157 ctxt->html = 3;
158 if ((ctxt->html < 10) && (xmlStrEqual(value, BAD_CAST "body")))
159 ctxt->html = 10;
160 if (ctxt->nameNr >= ctxt->nameMax) {
161 size_t newSize = ctxt->nameMax * 2;
162 const xmlChar **tmp;
163
164 tmp = xmlRealloc((xmlChar **) ctxt->nameTab,
165 newSize * sizeof(ctxt->nameTab[0]));
166 if (tmp == NULL) {
167 htmlErrMemory(ctxt);
168 return (-1);
169 }
170 ctxt->nameTab = tmp;
171 ctxt->nameMax = newSize;
172 }
173 ctxt->nameTab[ctxt->nameNr] = value;
174 ctxt->name = value;
175 return (ctxt->nameNr++);
176 }
177 /**
178 * htmlnamePop:
179 * @ctxt: an HTML parser context
180 *
181 * Pops the top element name from the name stack
182 *
183 * Returns the name just removed
184 */
185 static const xmlChar *
htmlnamePop(htmlParserCtxtPtr ctxt)186 htmlnamePop(htmlParserCtxtPtr ctxt)
187 {
188 const xmlChar *ret;
189
190 if (ctxt->nameNr <= 0)
191 return (NULL);
192 ctxt->nameNr--;
193 if (ctxt->nameNr < 0)
194 return (NULL);
195 if (ctxt->nameNr > 0)
196 ctxt->name = ctxt->nameTab[ctxt->nameNr - 1];
197 else
198 ctxt->name = NULL;
199 ret = ctxt->nameTab[ctxt->nameNr];
200 ctxt->nameTab[ctxt->nameNr] = NULL;
201 return (ret);
202 }
203
204 /**
205 * htmlNodeInfoPush:
206 * @ctxt: an HTML parser context
207 * @value: the node info
208 *
209 * Pushes a new element name on top of the node info stack
210 *
211 * Returns 0 in case of error, the index in the stack otherwise
212 */
213 static int
htmlNodeInfoPush(htmlParserCtxtPtr ctxt,htmlParserNodeInfo * value)214 htmlNodeInfoPush(htmlParserCtxtPtr ctxt, htmlParserNodeInfo *value)
215 {
216 if (ctxt->nodeInfoNr >= ctxt->nodeInfoMax) {
217 if (ctxt->nodeInfoMax == 0)
218 ctxt->nodeInfoMax = 5;
219 ctxt->nodeInfoMax *= 2;
220 ctxt->nodeInfoTab = (htmlParserNodeInfo *)
221 xmlRealloc((htmlParserNodeInfo *)ctxt->nodeInfoTab,
222 ctxt->nodeInfoMax *
223 sizeof(ctxt->nodeInfoTab[0]));
224 if (ctxt->nodeInfoTab == NULL) {
225 htmlErrMemory(ctxt);
226 return (0);
227 }
228 }
229 ctxt->nodeInfoTab[ctxt->nodeInfoNr] = *value;
230 ctxt->nodeInfo = &ctxt->nodeInfoTab[ctxt->nodeInfoNr];
231 return (ctxt->nodeInfoNr++);
232 }
233
234 /**
235 * htmlNodeInfoPop:
236 * @ctxt: an HTML parser context
237 *
238 * Pops the top element name from the node info stack
239 *
240 * Returns 0 in case of error, the pointer to NodeInfo otherwise
241 */
242 static htmlParserNodeInfo *
htmlNodeInfoPop(htmlParserCtxtPtr ctxt)243 htmlNodeInfoPop(htmlParserCtxtPtr ctxt)
244 {
245 if (ctxt->nodeInfoNr <= 0)
246 return (NULL);
247 ctxt->nodeInfoNr--;
248 if (ctxt->nodeInfoNr < 0)
249 return (NULL);
250 if (ctxt->nodeInfoNr > 0)
251 ctxt->nodeInfo = &ctxt->nodeInfoTab[ctxt->nodeInfoNr - 1];
252 else
253 ctxt->nodeInfo = NULL;
254 return &ctxt->nodeInfoTab[ctxt->nodeInfoNr];
255 }
256
257 /*
258 * Macros for accessing the content. Those should be used only by the parser,
259 * and not exported.
260 *
261 * Dirty macros, i.e. one need to make assumption on the context to use them
262 *
263 * CUR_PTR return the current pointer to the xmlChar to be parsed.
264 * CUR returns the current xmlChar value, i.e. a 8 bit value if compiled
265 * in ISO-Latin or UTF-8, and the current 16 bit value if compiled
266 * in UNICODE mode. This should be used internally by the parser
267 * only to compare to ASCII values otherwise it would break when
268 * running with UTF-8 encoding.
 * NXT(n) returns the n'th next xmlChar. Same as CUR, it should be used only
 * to compare on ASCII based substring.
271 * UPP(n) returns the n'th next xmlChar converted to uppercase. Same as CUR
272 * it should be used only to compare on ASCII based substring.
273 * SKIP(n) Skip n xmlChar, and must also be used only to skip ASCII defined
274 * strings without newlines within the parser.
275 *
276 * Clean macros, not dependent of an ASCII context, expect UTF-8 encoding
277 *
278 * COPY(to) copy one char to *to, increment CUR_PTR and to accordingly
279 */
280
/*
 * All macros below expect a variable named "ctxt" (the parser context) to
 * be in scope at the point of use.
 */

/* Current input byte passed through toupper(). */
#define UPPER (toupper(*ctxt->input->cur))

/*
 * Advance the input position and the column counter by val. The line
 * counter is not touched, so the skipped bytes must not contain newlines.
 * The expansion is an unparenthesized comma expression.
 */
#define SKIP(val) ctxt->input->cur += (val),ctxt->input->col+=(val)

/* Input byte at offset val from the current position (no bounds check). */
#define NXT(val) ctxt->input->cur[(val)]

/* Same as NXT(val), passed through toupper(). */
#define UPP(val) (toupper(ctxt->input->cur[(val)]))

/* Current position in, and start of, the input buffer. */
#define CUR_PTR ctxt->input->cur
#define BASE_PTR ctxt->input->base

/*
 * Outside of progressive (push) parsing, call xmlParserShrink() once more
 * than 2 * INPUT_CHUNK bytes precede the current position and fewer than
 * 2 * INPUT_CHUNK bytes remain after it.
 * Expands to an unbraced "if" statement that already ends with ';'.
 */
#define SHRINK \
    if ((!PARSER_PROGRESSIVE(ctxt)) && \
        (ctxt->input->cur - ctxt->input->base > 2 * INPUT_CHUNK) && \
        (ctxt->input->end - ctxt->input->cur < 2 * INPUT_CHUNK)) \
        xmlParserShrink(ctxt);

/*
 * Outside of progressive (push) parsing, call xmlParserGrow() when fewer
 * than INPUT_CHUNK bytes remain after the current position.
 * Expands to an unbraced "if" statement that already ends with ';'.
 */
#define GROW \
    if ((!PARSER_PROGRESSIVE(ctxt)) && \
        (ctxt->input->end - ctxt->input->cur < INPUT_CHUNK)) \
        xmlParserGrow(ctxt);

/* Skip whitespace at the current position, see htmlSkipBlankChars(). */
#define SKIP_BLANKS htmlSkipBlankChars(ctxt)

/* Imported from XML */

/* Current input byte. */
#define CUR (*ctxt->input->cur)
308
309 /**
310 * htmlFindEncoding:
311 * @the HTML parser context
312 *
313 * Ty to find and encoding in the current data available in the input
314 * buffer this is needed to try to switch to the proper encoding when
315 * one face a character error.
316 * That's an heuristic, since it's operating outside of parsing it could
317 * try to use a meta which had been commented out, that's the reason it
318 * should only be used in case of error, not as a default.
319 *
320 * Returns an encoding string or NULL if not found, the string need to
321 * be freed
322 */
323 static xmlChar *
htmlFindEncoding(xmlParserCtxtPtr ctxt)324 htmlFindEncoding(xmlParserCtxtPtr ctxt) {
325 const xmlChar *start, *cur, *end;
326 xmlChar *ret;
327
328 if ((ctxt == NULL) || (ctxt->input == NULL) ||
329 (ctxt->input->flags & XML_INPUT_HAS_ENCODING))
330 return(NULL);
331 if ((ctxt->input->cur == NULL) || (ctxt->input->end == NULL))
332 return(NULL);
333
334 start = ctxt->input->cur;
335 end = ctxt->input->end;
336 /* we also expect the input buffer to be zero terminated */
337 if (*end != 0)
338 return(NULL);
339
340 cur = xmlStrcasestr(start, BAD_CAST "HTTP-EQUIV");
341 if (cur == NULL)
342 return(NULL);
343 cur = xmlStrcasestr(cur, BAD_CAST "CONTENT");
344 if (cur == NULL)
345 return(NULL);
346 cur = xmlStrcasestr(cur, BAD_CAST "CHARSET=");
347 if (cur == NULL)
348 return(NULL);
349 cur += 8;
350 start = cur;
351 while ((IS_ALNUM(*cur)) ||
352 (*cur == '-') || (*cur == '_') || (*cur == ':') || (*cur == '/'))
353 cur++;
354 if (cur == start)
355 return(NULL);
356 ret = xmlStrndup(start, cur - start);
357 if (ret == NULL)
358 htmlErrMemory(ctxt);
359 return(ret);
360 }
361
362 static int
htmlMaskMatch(htmlAsciiMask mask,unsigned c)363 htmlMaskMatch(htmlAsciiMask mask, unsigned c) {
364 if (c >= 64)
365 return(0);
366 return((mask[c/32] >> (c & 31)) & 1);
367 }
368
/**
 * htmlValidateUtf8:
 * @ctxt:  the HTML parser context
 * @str:  pointer to the first byte of the sequence, at least one byte
 *        must be readable
 * @len:  number of bytes available at @str
 *
 * Validate a single multi-byte UTF-8 sequence. Any first byte below 0xC2
 * is rejected, which includes plain ASCII, so callers are presumably
 * expected to handle ASCII themselves.
 *
 * The first invalid sequence seen on an input raises
 * XML_ERR_INVALID_ENCODING and sets XML_INPUT_ENCODING_ERROR on that input;
 * while the flag is set, further invalid sequences are not reported again.
 *
 * Returns the length of the sequence (2, 3 or 4) if it is valid, 0 if
 * @len is too short to hold the sequence announced by the first byte,
 * and -1 if the bytes are invalid.
 */
static int
htmlValidateUtf8(xmlParserCtxtPtr ctxt, const xmlChar *str, size_t len) {
    unsigned c = str[0];
    int size;

    if (c < 0xC2) {
        /*
         * Not a valid lead byte of a multi-byte sequence: ASCII,
         * a continuation byte, or the overlong leads 0xC0/0xC1.
         */
        goto invalid;
    } else if (c < 0xE0) {
        /* two-byte sequence: one continuation byte 10xxxxxx */
        if (len < 2)
            goto incomplete;
        if ((str[1] & 0xC0) != 0x80)
            goto invalid;
        size = 2;
    } else if (c < 0xF0) {
        unsigned v;

        if (len < 3)
            goto incomplete;

        /* pack the three bytes as 0x00LLB1B2 (lead, byte 1, byte 2) */
        v = str[1] << 8 | str[2]; /* hint to generate 16-bit load */
        v |= c << 16;

        /*
         * 1st test: both trailing bytes must be continuation bytes.
         * 2nd test: lead 0xE0 with second byte below 0xA0 (overlong).
         * 3rd test: lead 0xED with second byte 0xA0 or above
         *           (surrogates U+D800..U+DFFF).
         * The last two look at the low nibble of the lead byte and at
         * bit 5 of the second byte.
         */
        if (((v & 0x00C0C0) != 0x008080) ||
            ((v & 0x0F2000) == 0x000000) ||
            ((v & 0x0F2000) == 0x0D2000))
            goto invalid;

        size = 3;
    } else {
        unsigned v;

        if (len < 4)
            goto incomplete;

        /* pack the four bytes in big-endian order */
        v = c << 24 | str[1] << 16 | str[2] << 8 | str[3];

        /*
         * All three trailing bytes must be continuation bytes. The range
         * check rejects overlong forms (below F0 90 ..) and everything
         * above U+10FFFF (F4 90 .. and up, including leads 0xF5-0xFF).
         */
        if (((v & 0x00C0C0C0) != 0x00808080) ||
            (v < 0xF0900000) || (v >= 0xF4900000))
            goto invalid;

        size = 4;
    }

    return(size);

incomplete:
    /* not enough bytes available to decide */
    return(0);

invalid:
    /* Only report the first error */
    if ((ctxt->input->flags & XML_INPUT_ENCODING_ERROR) == 0) {
        htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
                     "Invalid bytes in character encoding", NULL, NULL);
        ctxt->input->flags |= XML_INPUT_ENCODING_ERROR;
    }

    return(-1);
}
427
428 /**
429 * htmlSkipBlankChars:
430 * @ctxt: the HTML parser context
431 *
432 * skip all blanks character found at that point in the input streams.
433 *
434 * Returns the number of space chars skipped
435 */
436
437 static int
htmlSkipBlankChars(xmlParserCtxtPtr ctxt)438 htmlSkipBlankChars(xmlParserCtxtPtr ctxt) {
439 const xmlChar *cur = ctxt->input->cur;
440 size_t avail = ctxt->input->end - cur;
441 int res = 0;
442 int line = ctxt->input->line;
443 int col = ctxt->input->col;
444
445 while (!PARSER_STOPPED(ctxt)) {
446 if (avail == 0) {
447 ctxt->input->cur = cur;
448 GROW;
449 cur = ctxt->input->cur;
450 avail = ctxt->input->end - cur;
451
452 if (avail == 0)
453 break;
454 }
455
456 if (*cur == '\n') {
457 line++;
458 col = 1;
459 } else if (IS_WS_HTML(*cur)) {
460 col++;
461 } else {
462 break;
463 }
464
465 cur += 1;
466 avail -= 1;
467
468 if (res < INT_MAX)
469 res++;
470 }
471
472 ctxt->input->cur = cur;
473 ctxt->input->line = line;
474 ctxt->input->col = col;
475
476 if (res > 8)
477 GROW;
478
479 return(res);
480 }
481
482
483
484 /************************************************************************
485 * *
486 * The list of HTML elements and their properties *
487 * *
488 ************************************************************************/
489
490 /*
491 * Start Tag: 1 means the start tag can be omitted
492 * End Tag: 1 means the end tag can be omitted
493 * 2 means it's forbidden (empty elements)
494 * 3 means the tag is stylistic and should be closed easily
495 * Depr: this element is deprecated
496 * DTD: 1 means that this element is valid only in the Loose DTD
497 * 2 means that this element is valid only in the Frameset DTD
498 *
499 * Name,Start Tag,End Tag,Save End,Empty,Deprecated,DTD,inline,Description
500 */
501
/*
 * Values for the last field of the html40ElementTable entries below; an
 * entry with 0 in that field uses none of these modes.
 * NOTE(review): the names match the content modes of the HTML tokenizer
 * (RCDATA, RAWTEXT, PLAINTEXT, script data) -- confirm at the use sites.
 */
#define DATA_RCDATA 1       /* "textarea", "title" */
#define DATA_RAWTEXT 2      /* "iframe", "noembed", "noframes", "style", "xmp" */
#define DATA_PLAINTEXT 3    /* "plaintext" */
#define DATA_SCRIPT 4       /* "script" */
#define DATA_SCRIPT_ESC1 5  /* not assigned to any element in the table */
#define DATA_SCRIPT_ESC2 6  /* not assigned to any element in the table */
508
509 static const htmlElemDesc
510 html40ElementTable[] = {
511 { "a", 0, 0, 0, 0, 0, 0, 1, "anchor ",
512 NULL, NULL, NULL, NULL, NULL,
513 0
514 },
515 { "abbr", 0, 0, 0, 0, 0, 0, 1, "abbreviated form",
516 NULL, NULL, NULL, NULL, NULL,
517 0
518 },
519 { "acronym", 0, 0, 0, 0, 0, 0, 1, "",
520 NULL, NULL, NULL, NULL, NULL,
521 0
522 },
523 { "address", 0, 0, 0, 0, 0, 0, 0, "information on author ",
524 NULL, NULL, NULL, NULL, NULL,
525 0
526 },
527 { "applet", 0, 0, 0, 0, 1, 1, 2, "java applet ",
528 NULL, NULL, NULL, NULL, NULL,
529 0
530 },
531 { "area", 0, 2, 2, 1, 0, 0, 0, "client-side image map area ",
532 NULL, NULL, NULL, NULL, NULL,
533 0
534 },
535 { "b", 0, 3, 0, 0, 0, 0, 1, "bold text style",
536 NULL, NULL, NULL, NULL, NULL,
537 0
538 },
539 { "base", 0, 2, 2, 1, 0, 0, 0, "document base uri ",
540 NULL, NULL, NULL, NULL, NULL,
541 0
542 },
543 { "basefont", 0, 2, 2, 1, 1, 1, 1, "base font size " ,
544 NULL, NULL, NULL, NULL, NULL,
545 0
546 },
547 { "bdo", 0, 0, 0, 0, 0, 0, 1, "i18n bidi over-ride ",
548 NULL, NULL, NULL, NULL, NULL,
549 0
550 },
551 { "big", 0, 3, 0, 0, 0, 0, 1, "large text style",
552 NULL, NULL, NULL, NULL, NULL,
553 0
554 },
555 { "blockquote", 0, 0, 0, 0, 0, 0, 0, "long quotation ",
556 NULL, NULL, NULL, NULL, NULL,
557 0
558 },
559 { "body", 1, 1, 0, 0, 0, 0, 0, "document body ",
560 NULL, NULL, NULL, NULL, NULL,
561 0
562 },
563 { "br", 0, 2, 2, 1, 0, 0, 1, "forced line break ",
564 NULL, NULL, NULL, NULL, NULL,
565 0
566 },
567 { "button", 0, 0, 0, 0, 0, 0, 2, "push button ",
568 NULL, NULL, NULL, NULL, NULL,
569 0
570 },
571 { "caption", 0, 0, 0, 0, 0, 0, 0, "table caption ",
572 NULL, NULL, NULL, NULL, NULL,
573 0
574 },
575 { "center", 0, 3, 0, 0, 1, 1, 0, "shorthand for div align=center ",
576 NULL, NULL, NULL, NULL, NULL,
577 0
578 },
579 { "cite", 0, 0, 0, 0, 0, 0, 1, "citation",
580 NULL, NULL, NULL, NULL, NULL,
581 0
582 },
583 { "code", 0, 0, 0, 0, 0, 0, 1, "computer code fragment",
584 NULL, NULL, NULL, NULL, NULL,
585 0
586 },
587 { "col", 0, 2, 2, 1, 0, 0, 0, "table column ",
588 NULL, NULL, NULL, NULL, NULL,
589 0
590 },
591 { "colgroup", 0, 1, 0, 0, 0, 0, 0, "table column group ",
592 NULL, NULL, NULL, NULL, NULL,
593 0
594 },
595 { "dd", 0, 1, 0, 0, 0, 0, 0, "definition description ",
596 NULL, NULL, NULL, NULL, NULL,
597 0
598 },
599 { "del", 0, 0, 0, 0, 0, 0, 2, "deleted text ",
600 NULL, NULL, NULL, NULL, NULL,
601 0
602 },
603 { "dfn", 0, 0, 0, 0, 0, 0, 1, "instance definition",
604 NULL, NULL, NULL, NULL, NULL,
605 0
606 },
607 { "dir", 0, 0, 0, 0, 1, 1, 0, "directory list",
608 NULL, NULL, NULL, NULL, NULL,
609 0
610 },
611 { "div", 0, 0, 0, 0, 0, 0, 0, "generic language/style container",
612 NULL, NULL, NULL, NULL, NULL,
613 0
614 },
615 { "dl", 0, 0, 0, 0, 0, 0, 0, "definition list ",
616 NULL, NULL, NULL, NULL, NULL,
617 0
618 },
619 { "dt", 0, 1, 0, 0, 0, 0, 0, "definition term ",
620 NULL, NULL, NULL, NULL, NULL,
621 0
622 },
623 { "em", 0, 3, 0, 0, 0, 0, 1, "emphasis",
624 NULL, NULL, NULL, NULL, NULL,
625 0
626 },
627 { "embed", 0, 1, 0, 0, 1, 1, 1, "generic embedded object ",
628 NULL, NULL, NULL, NULL, NULL,
629 0
630 },
631 { "fieldset", 0, 0, 0, 0, 0, 0, 0, "form control group ",
632 NULL, NULL, NULL, NULL, NULL,
633 0
634 },
635 { "font", 0, 3, 0, 0, 1, 1, 1, "local change to font ",
636 NULL, NULL, NULL, NULL, NULL,
637 0
638 },
639 { "form", 0, 0, 0, 0, 0, 0, 0, "interactive form ",
640 NULL, NULL, NULL, NULL, NULL,
641 0
642 },
643 { "frame", 0, 2, 2, 1, 0, 2, 0, "subwindow " ,
644 NULL, NULL, NULL, NULL, NULL,
645 0
646 },
647 { "frameset", 0, 0, 0, 0, 0, 2, 0, "window subdivision" ,
648 NULL, NULL, NULL, NULL, NULL,
649 0
650 },
651 { "h1", 0, 0, 0, 0, 0, 0, 0, "heading ",
652 NULL, NULL, NULL, NULL, NULL,
653 0
654 },
655 { "h2", 0, 0, 0, 0, 0, 0, 0, "heading ",
656 NULL, NULL, NULL, NULL, NULL,
657 0
658 },
659 { "h3", 0, 0, 0, 0, 0, 0, 0, "heading ",
660 NULL, NULL, NULL, NULL, NULL,
661 0
662 },
663 { "h4", 0, 0, 0, 0, 0, 0, 0, "heading ",
664 NULL, NULL, NULL, NULL, NULL,
665 0
666 },
667 { "h5", 0, 0, 0, 0, 0, 0, 0, "heading ",
668 NULL, NULL, NULL, NULL, NULL,
669 0
670 },
671 { "h6", 0, 0, 0, 0, 0, 0, 0, "heading ",
672 NULL, NULL, NULL, NULL, NULL,
673 0
674 },
675 { "head", 1, 1, 0, 0, 0, 0, 0, "document head ",
676 NULL, NULL, NULL, NULL, NULL,
677 0
678 },
679 { "hr", 0, 2, 2, 1, 0, 0, 0, "horizontal rule " ,
680 NULL, NULL, NULL, NULL, NULL,
681 0
682 },
683 { "html", 1, 1, 0, 0, 0, 0, 0, "document root element ",
684 NULL, NULL, NULL, NULL, NULL,
685 0
686 },
687 { "i", 0, 3, 0, 0, 0, 0, 1, "italic text style",
688 NULL, NULL, NULL, NULL, NULL,
689 0
690 },
691 { "iframe", 0, 0, 0, 0, 0, 1, 2, "inline subwindow ",
692 NULL, NULL, NULL, NULL, NULL,
693 DATA_RAWTEXT
694 },
695 { "img", 0, 2, 2, 1, 0, 0, 1, "embedded image ",
696 NULL, NULL, NULL, NULL, NULL,
697 0
698 },
699 { "input", 0, 2, 2, 1, 0, 0, 1, "form control ",
700 NULL, NULL, NULL, NULL, NULL,
701 0
702 },
703 { "ins", 0, 0, 0, 0, 0, 0, 2, "inserted text",
704 NULL, NULL, NULL, NULL, NULL,
705 0
706 },
707 { "isindex", 0, 2, 2, 1, 1, 1, 0, "single line prompt ",
708 NULL, NULL, NULL, NULL, NULL,
709 0
710 },
711 { "kbd", 0, 0, 0, 0, 0, 0, 1, "text to be entered by the user",
712 NULL, NULL, NULL, NULL, NULL,
713 0
714 },
715 { "label", 0, 0, 0, 0, 0, 0, 1, "form field label text ",
716 NULL, NULL, NULL, NULL, NULL,
717 0
718 },
719 { "legend", 0, 0, 0, 0, 0, 0, 0, "fieldset legend ",
720 NULL, NULL, NULL, NULL, NULL,
721 0
722 },
723 { "li", 0, 1, 1, 0, 0, 0, 0, "list item ",
724 NULL, NULL, NULL, NULL, NULL,
725 0
726 },
727 { "link", 0, 2, 2, 1, 0, 0, 0, "a media-independent link ",
728 NULL, NULL, NULL, NULL, NULL,
729 0
730 },
731 { "map", 0, 0, 0, 0, 0, 0, 2, "client-side image map ",
732 NULL, NULL, NULL, NULL, NULL,
733 0
734 },
735 { "menu", 0, 0, 0, 0, 1, 1, 0, "menu list ",
736 NULL, NULL, NULL, NULL, NULL,
737 0
738 },
739 { "meta", 0, 2, 2, 1, 0, 0, 0, "generic metainformation ",
740 NULL, NULL, NULL, NULL, NULL,
741 0
742 },
743 { "noembed", 0, 0, 0, 0, 0, 0, 0, "",
744 NULL, NULL, NULL, NULL, NULL,
745 DATA_RAWTEXT
746 },
747 { "noframes", 0, 0, 0, 0, 0, 2, 0, "alternate content container for non frame-based rendering ",
748 NULL, NULL, NULL, NULL, NULL,
749 DATA_RAWTEXT
750 },
751 { "noscript", 0, 0, 0, 0, 0, 0, 0, "alternate content container for non script-based rendering ",
752 NULL, NULL, NULL, NULL, NULL,
753 0
754 },
755 { "object", 0, 0, 0, 0, 0, 0, 2, "generic embedded object ",
756 NULL, NULL, NULL, NULL, NULL,
757 0
758 },
759 { "ol", 0, 0, 0, 0, 0, 0, 0, "ordered list ",
760 NULL, NULL, NULL, NULL, NULL,
761 0
762 },
763 { "optgroup", 0, 0, 0, 0, 0, 0, 0, "option group ",
764 NULL, NULL, NULL, NULL, NULL,
765 0
766 },
767 { "option", 0, 1, 0, 0, 0, 0, 0, "selectable choice " ,
768 NULL, NULL, NULL, NULL, NULL,
769 0
770 },
771 { "p", 0, 1, 0, 0, 0, 0, 0, "paragraph ",
772 NULL, NULL, NULL, NULL, NULL,
773 0
774 },
775 { "param", 0, 2, 2, 1, 0, 0, 0, "named property value ",
776 NULL, NULL, NULL, NULL, NULL,
777 0
778 },
779 { "plaintext", 0, 0, 0, 0, 0, 0, 0, "",
780 NULL, NULL, NULL, NULL, NULL,
781 DATA_PLAINTEXT
782 },
783 { "pre", 0, 0, 0, 0, 0, 0, 0, "preformatted text ",
784 NULL, NULL, NULL, NULL, NULL,
785 0
786 },
787 { "q", 0, 0, 0, 0, 0, 0, 1, "short inline quotation ",
788 NULL, NULL, NULL, NULL, NULL,
789 0
790 },
791 { "s", 0, 3, 0, 0, 1, 1, 1, "strike-through text style",
792 NULL, NULL, NULL, NULL, NULL,
793 0
794 },
795 { "samp", 0, 0, 0, 0, 0, 0, 1, "sample program output, scripts, etc.",
796 NULL, NULL, NULL, NULL, NULL,
797 0
798 },
799 { "script", 0, 0, 0, 0, 0, 0, 2, "script statements ",
800 NULL, NULL, NULL, NULL, NULL,
801 DATA_SCRIPT
802 },
803 { "select", 0, 0, 0, 0, 0, 0, 1, "option selector ",
804 NULL, NULL, NULL, NULL, NULL,
805 0
806 },
807 { "small", 0, 3, 0, 0, 0, 0, 1, "small text style",
808 NULL, NULL, NULL, NULL, NULL,
809 0
810 },
811 { "span", 0, 0, 0, 0, 0, 0, 1, "generic language/style container ",
812 NULL, NULL, NULL, NULL, NULL,
813 0
814 },
815 { "strike", 0, 3, 0, 0, 1, 1, 1, "strike-through text",
816 NULL, NULL, NULL, NULL, NULL,
817 0
818 },
819 { "strong", 0, 3, 0, 0, 0, 0, 1, "strong emphasis",
820 NULL, NULL, NULL, NULL, NULL,
821 0
822 },
823 { "style", 0, 0, 0, 0, 0, 0, 0, "style info ",
824 NULL, NULL, NULL, NULL, NULL,
825 DATA_RAWTEXT
826 },
827 { "sub", 0, 3, 0, 0, 0, 0, 1, "subscript",
828 NULL, NULL, NULL, NULL, NULL,
829 0
830 },
831 { "sup", 0, 3, 0, 0, 0, 0, 1, "superscript ",
832 NULL, NULL, NULL, NULL, NULL,
833 0
834 },
835 { "table", 0, 0, 0, 0, 0, 0, 0, "",
836 NULL, NULL, NULL, NULL, NULL,
837 0
838 },
839 { "tbody", 1, 0, 0, 0, 0, 0, 0, "table body ",
840 NULL, NULL, NULL, NULL, NULL,
841 0
842 },
843 { "td", 0, 0, 0, 0, 0, 0, 0, "table data cell",
844 NULL, NULL, NULL, NULL, NULL,
845 0
846 },
847 { "textarea", 0, 0, 0, 0, 0, 0, 1, "multi-line text field ",
848 NULL, NULL, NULL, NULL, NULL,
849 DATA_RCDATA
850 },
851 { "tfoot", 0, 1, 0, 0, 0, 0, 0, "table footer ",
852 NULL, NULL, NULL, NULL, NULL,
853 0
854 },
855 { "th", 0, 1, 0, 0, 0, 0, 0, "table header cell",
856 NULL, NULL, NULL, NULL, NULL,
857 0
858 },
859 { "thead", 0, 1, 0, 0, 0, 0, 0, "table header ",
860 NULL, NULL, NULL, NULL, NULL,
861 0
862 },
863 { "title", 0, 0, 0, 0, 0, 0, 0, "document title ",
864 NULL, NULL, NULL, NULL, NULL,
865 DATA_RCDATA
866 },
867 { "tr", 0, 0, 0, 0, 0, 0, 0, "table row ",
868 NULL, NULL, NULL, NULL, NULL,
869 0
870 },
871 { "tt", 0, 3, 0, 0, 0, 0, 1, "teletype or monospaced text style",
872 NULL, NULL, NULL, NULL, NULL,
873 0
874 },
875 { "u", 0, 3, 0, 0, 1, 1, 1, "underlined text style",
876 NULL, NULL, NULL, NULL, NULL,
877 0
878 },
879 { "ul", 0, 0, 0, 0, 0, 0, 0, "unordered list ",
880 NULL, NULL, NULL, NULL, NULL,
881 0
882 },
883 { "var", 0, 0, 0, 0, 0, 0, 1, "instance of a variable or program argument",
884 NULL, NULL, NULL, NULL, NULL,
885 0
886 },
887 { "xmp", 0, 0, 0, 0, 0, 0, 1, "",
888 NULL, NULL, NULL, NULL, NULL,
889 DATA_RAWTEXT
890 }
891 };
892
/*
 * One (open element, start tag) pair of the htmlStartClose table.
 * NOTE(review): read as "a start tag named newTag implies the end of an
 * open element named oldTag" -- confirm against the lookup code.
 */
typedef struct {
    const char *oldTag;     /* name of the element that gets closed */
    const char *newTag;     /* name of the start tag causing the close */
} htmlStartCloseEntry;
897
898 /*
899 * start tags that imply the end of current element
900 */
901 static const htmlStartCloseEntry htmlStartClose[] = {
902 { "a", "a" },
903 { "a", "fieldset" },
904 { "a", "table" },
905 { "a", "td" },
906 { "a", "th" },
907 { "address", "dd" },
908 { "address", "dl" },
909 { "address", "dt" },
910 { "address", "form" },
911 { "address", "li" },
912 { "address", "ul" },
913 { "b", "center" },
914 { "b", "p" },
915 { "b", "td" },
916 { "b", "th" },
917 { "big", "p" },
918 { "caption", "col" },
919 { "caption", "colgroup" },
920 { "caption", "tbody" },
921 { "caption", "tfoot" },
922 { "caption", "thead" },
923 { "caption", "tr" },
924 { "col", "col" },
925 { "col", "colgroup" },
926 { "col", "tbody" },
927 { "col", "tfoot" },
928 { "col", "thead" },
929 { "col", "tr" },
930 { "colgroup", "colgroup" },
931 { "colgroup", "tbody" },
932 { "colgroup", "tfoot" },
933 { "colgroup", "thead" },
934 { "colgroup", "tr" },
935 { "dd", "dt" },
936 { "dir", "dd" },
937 { "dir", "dl" },
938 { "dir", "dt" },
939 { "dir", "form" },
940 { "dir", "ul" },
941 { "dl", "form" },
942 { "dl", "li" },
943 { "dt", "dd" },
944 { "dt", "dl" },
945 { "font", "center" },
946 { "font", "td" },
947 { "font", "th" },
948 { "form", "form" },
949 { "h1", "fieldset" },
950 { "h1", "form" },
951 { "h1", "li" },
952 { "h1", "p" },
953 { "h1", "table" },
954 { "h2", "fieldset" },
955 { "h2", "form" },
956 { "h2", "li" },
957 { "h2", "p" },
958 { "h2", "table" },
959 { "h3", "fieldset" },
960 { "h3", "form" },
961 { "h3", "li" },
962 { "h3", "p" },
963 { "h3", "table" },
964 { "h4", "fieldset" },
965 { "h4", "form" },
966 { "h4", "li" },
967 { "h4", "p" },
968 { "h4", "table" },
969 { "h5", "fieldset" },
970 { "h5", "form" },
971 { "h5", "li" },
972 { "h5", "p" },
973 { "h5", "table" },
974 { "h6", "fieldset" },
975 { "h6", "form" },
976 { "h6", "li" },
977 { "h6", "p" },
978 { "h6", "table" },
979 { "head", "a" },
980 { "head", "abbr" },
981 { "head", "acronym" },
982 { "head", "address" },
983 { "head", "b" },
984 { "head", "bdo" },
985 { "head", "big" },
986 { "head", "blockquote" },
987 { "head", "body" },
988 { "head", "br" },
989 { "head", "center" },
990 { "head", "cite" },
991 { "head", "code" },
992 { "head", "dd" },
993 { "head", "dfn" },
994 { "head", "dir" },
995 { "head", "div" },
996 { "head", "dl" },
997 { "head", "dt" },
998 { "head", "em" },
999 { "head", "fieldset" },
1000 { "head", "font" },
1001 { "head", "form" },
1002 { "head", "frameset" },
1003 { "head", "h1" },
1004 { "head", "h2" },
1005 { "head", "h3" },
1006 { "head", "h4" },
1007 { "head", "h5" },
1008 { "head", "h6" },
1009 { "head", "hr" },
1010 { "head", "i" },
1011 { "head", "iframe" },
1012 { "head", "img" },
1013 { "head", "kbd" },
1014 { "head", "li" },
1015 { "head", "listing" },
1016 { "head", "map" },
1017 { "head", "menu" },
1018 { "head", "ol" },
1019 { "head", "p" },
1020 { "head", "pre" },
1021 { "head", "q" },
1022 { "head", "s" },
1023 { "head", "samp" },
1024 { "head", "small" },
1025 { "head", "span" },
1026 { "head", "strike" },
1027 { "head", "strong" },
1028 { "head", "sub" },
1029 { "head", "sup" },
1030 { "head", "table" },
1031 { "head", "tt" },
1032 { "head", "u" },
1033 { "head", "ul" },
1034 { "head", "var" },
1035 { "head", "xmp" },
1036 { "hr", "form" },
1037 { "i", "center" },
1038 { "i", "p" },
1039 { "i", "td" },
1040 { "i", "th" },
1041 { "legend", "fieldset" },
1042 { "li", "li" },
1043 { "link", "body" },
1044 { "link", "frameset" },
1045 { "listing", "dd" },
1046 { "listing", "dl" },
1047 { "listing", "dt" },
1048 { "listing", "fieldset" },
1049 { "listing", "form" },
1050 { "listing", "li" },
1051 { "listing", "table" },
1052 { "listing", "ul" },
1053 { "menu", "dd" },
1054 { "menu", "dl" },
1055 { "menu", "dt" },
1056 { "menu", "form" },
1057 { "menu", "ul" },
1058 { "ol", "form" },
1059 { "option", "optgroup" },
1060 { "option", "option" },
1061 { "p", "address" },
1062 { "p", "blockquote" },
1063 { "p", "body" },
1064 { "p", "caption" },
1065 { "p", "center" },
1066 { "p", "col" },
1067 { "p", "colgroup" },
1068 { "p", "dd" },
1069 { "p", "dir" },
1070 { "p", "div" },
1071 { "p", "dl" },
1072 { "p", "dt" },
1073 { "p", "fieldset" },
1074 { "p", "form" },
1075 { "p", "frameset" },
1076 { "p", "h1" },
1077 { "p", "h2" },
1078 { "p", "h3" },
1079 { "p", "h4" },
1080 { "p", "h5" },
1081 { "p", "h6" },
1082 { "p", "head" },
1083 { "p", "hr" },
1084 { "p", "li" },
1085 { "p", "listing" },
1086 { "p", "menu" },
1087 { "p", "ol" },
1088 { "p", "p" },
1089 { "p", "pre" },
1090 { "p", "table" },
1091 { "p", "tbody" },
1092 { "p", "td" },
1093 { "p", "tfoot" },
1094 { "p", "th" },
1095 { "p", "title" },
1096 { "p", "tr" },
1097 { "p", "ul" },
1098 { "p", "xmp" },
1099 { "pre", "dd" },
1100 { "pre", "dl" },
1101 { "pre", "dt" },
1102 { "pre", "fieldset" },
1103 { "pre", "form" },
1104 { "pre", "li" },
1105 { "pre", "table" },
1106 { "pre", "ul" },
1107 { "s", "p" },
1108 { "script", "noscript" },
1109 { "small", "p" },
1110 { "span", "td" },
1111 { "span", "th" },
1112 { "strike", "p" },
1113 { "style", "body" },
1114 { "style", "frameset" },
1115 { "tbody", "tbody" },
1116 { "tbody", "tfoot" },
1117 { "td", "tbody" },
1118 { "td", "td" },
1119 { "td", "tfoot" },
1120 { "td", "th" },
1121 { "td", "tr" },
1122 { "tfoot", "tbody" },
1123 { "th", "tbody" },
1124 { "th", "td" },
1125 { "th", "tfoot" },
1126 { "th", "th" },
1127 { "th", "tr" },
1128 { "thead", "tbody" },
1129 { "thead", "tfoot" },
1130 { "title", "body" },
1131 { "title", "frameset" },
1132 { "tr", "tbody" },
1133 { "tr", "tfoot" },
1134 { "tr", "tr" },
1135 { "tt", "p" },
1136 { "u", "p" },
1137 { "u", "td" },
1138 { "u", "th" },
1139 { "ul", "address" },
1140 { "ul", "form" },
1141 { "ul", "menu" },
1142 { "ul", "pre" },
1143 { "xmp", "dd" },
1144 { "xmp", "dl" },
1145 { "xmp", "dt" },
1146 { "xmp", "fieldset" },
1147 { "xmp", "form" },
1148 { "xmp", "li" },
1149 { "xmp", "table" },
1150 { "xmp", "ul" }
1151 };
1152
/*
 * The list of HTML elements which are supposed not to have
 * CDATA content and where a p element will be implied
 *
 * The array is terminated by a NULL entry.
 *
 * TODO: extend that list by reading the HTML SGML DTD on
 *       implied paragraph
 */
static const char *const htmlNoContentElements[] = {
    "html",
    "head",
    NULL
};
1165
/*
 * The list of HTML attributes which are of content %Script;
 * NOTE: when adding ones, check htmlIsScriptAttribute() since
 *       it assumes the name starts with 'on'
 *
 * Unlike htmlNoContentElements, this array has no NULL terminator; its
 * length has to be derived with sizeof.
 */
static const char *const htmlScriptAttributes[] = {
    "onclick",
    "ondblclick",
    "onmousedown",
    "onmouseup",
    "onmouseover",
    "onmousemove",
    "onmouseout",
    "onkeypress",
    "onkeydown",
    "onkeyup",
    "onload",
    "onunload",
    "onfocus",
    "onblur",
    "onsubmit",
    "onreset",
    "onchange",
    "onselect"
};
1191
/*
 * This table is used by the htmlparser to know what to do with
 * broken html pages. By assigning different priorities to different
 * elements the parser can decide how to handle extra endtags.
 * Endtags are only allowed to close elements with lower or equal
 * priority.
 */

typedef struct {
    const char *name;   /* element name, NULL in the terminating entry */
    int priority;       /* end tag priority of that element */
} elementPriority;

/*
 * Searched linearly by htmlGetEndPriority(). The last entry has a NULL
 * name; its priority is what every element not listed here gets.
 */
static const elementPriority htmlEndPriority[] = {
    {"div",   150},
    {"td",    160},
    {"th",    160},
    {"tr",    170},
    {"thead", 180},
    {"tbody", 180},
    {"tfoot", 180},
    {"table", 190},
    {"head",  200},
    {"body",  200},
    {"html",  220},
    {NULL,    100} /* Default priority */
};
1219
1220 /************************************************************************
1221 * *
1222 * functions to handle HTML specific data *
1223 * *
1224 ************************************************************************/
1225
1226 static void
htmlParserFinishElementParsing(htmlParserCtxtPtr ctxt)1227 htmlParserFinishElementParsing(htmlParserCtxtPtr ctxt) {
1228 /*
1229 * Capture end position and add node
1230 */
1231 if ( ctxt->node != NULL && ctxt->record_info ) {
1232 ctxt->nodeInfo->end_pos = ctxt->input->consumed +
1233 (CUR_PTR - ctxt->input->base);
1234 ctxt->nodeInfo->end_line = ctxt->input->line;
1235 ctxt->nodeInfo->node = ctxt->node;
1236 xmlParserAddNodeInfo(ctxt, ctxt->nodeInfo);
1237 htmlNodeInfoPop(ctxt);
1238 }
1239 }
1240
/**
 * htmlInitAutoClose:
 *
 * DEPRECATED: This function does nothing.
 */
void
htmlInitAutoClose(void) {
    /* Intentionally empty. */
}
1249
1250 static int
htmlCompareTags(const void * key,const void * member)1251 htmlCompareTags(const void *key, const void *member) {
1252 const xmlChar *tag = (const xmlChar *) key;
1253 const htmlElemDesc *desc = (const htmlElemDesc *) member;
1254
1255 return(xmlStrcasecmp(tag, BAD_CAST desc->name));
1256 }
1257
1258 /**
1259 * htmlTagLookup:
1260 * @tag: The tag name in lowercase
1261 *
1262 * Lookup the HTML tag in the ElementTable
1263 *
1264 * Returns the related htmlElemDescPtr or NULL if not found.
1265 */
1266 const htmlElemDesc *
htmlTagLookup(const xmlChar * tag)1267 htmlTagLookup(const xmlChar *tag) {
1268 if (tag == NULL)
1269 return(NULL);
1270
1271 return((const htmlElemDesc *) bsearch(tag, html40ElementTable,
1272 sizeof(html40ElementTable) / sizeof(htmlElemDesc),
1273 sizeof(htmlElemDesc), htmlCompareTags));
1274 }
1275
1276 /**
1277 * htmlGetEndPriority:
1278 * @name: The name of the element to look up the priority for.
1279 *
1280 * Return value: The "endtag" priority.
1281 **/
1282 static int
htmlGetEndPriority(const xmlChar * name)1283 htmlGetEndPriority (const xmlChar *name) {
1284 int i = 0;
1285
1286 while ((htmlEndPriority[i].name != NULL) &&
1287 (!xmlStrEqual((const xmlChar *)htmlEndPriority[i].name, name)))
1288 i++;
1289
1290 return(htmlEndPriority[i].priority);
1291 }
1292
1293
1294 static int
htmlCompareStartClose(const void * vkey,const void * member)1295 htmlCompareStartClose(const void *vkey, const void *member) {
1296 const htmlStartCloseEntry *key = (const htmlStartCloseEntry *) vkey;
1297 const htmlStartCloseEntry *entry = (const htmlStartCloseEntry *) member;
1298 int ret;
1299
1300 ret = strcmp(key->oldTag, entry->oldTag);
1301 if (ret == 0)
1302 ret = strcmp(key->newTag, entry->newTag);
1303
1304 return(ret);
1305 }
1306
1307 /**
1308 * htmlCheckAutoClose:
1309 * @newtag: The new tag name
1310 * @oldtag: The old tag name
1311 *
1312 * Checks whether the new tag is one of the registered valid tags for
1313 * closing old.
1314 *
1315 * Returns 0 if no, 1 if yes.
1316 */
1317 static int
htmlCheckAutoClose(const xmlChar * newtag,const xmlChar * oldtag)1318 htmlCheckAutoClose(const xmlChar * newtag, const xmlChar * oldtag)
1319 {
1320 htmlStartCloseEntry key;
1321 void *res;
1322
1323 key.oldTag = (const char *) oldtag;
1324 key.newTag = (const char *) newtag;
1325 res = bsearch(&key, htmlStartClose,
1326 sizeof(htmlStartClose) / sizeof(htmlStartCloseEntry),
1327 sizeof(htmlStartCloseEntry), htmlCompareStartClose);
1328 return(res != NULL);
1329 }
1330
1331 /**
1332 * htmlAutoCloseOnClose:
1333 * @ctxt: an HTML parser context
1334 * @newtag: The new tag name
1335 * @force: force the tag closure
1336 *
1337 * The HTML DTD allows an ending tag to implicitly close other tags.
1338 */
1339 static void
htmlAutoCloseOnClose(htmlParserCtxtPtr ctxt,const xmlChar * newtag)1340 htmlAutoCloseOnClose(htmlParserCtxtPtr ctxt, const xmlChar * newtag)
1341 {
1342 const htmlElemDesc *info;
1343 int i, priority;
1344
1345 if (ctxt->options & HTML_PARSE_HTML5)
1346 return;
1347
1348 priority = htmlGetEndPriority(newtag);
1349
1350 for (i = (ctxt->nameNr - 1); i >= 0; i--) {
1351
1352 if (xmlStrEqual(newtag, ctxt->nameTab[i]))
1353 break;
1354 /*
1355 * A misplaced endtag can only close elements with lower
1356 * or equal priority, so if we find an element with higher
1357 * priority before we find an element with
1358 * matching name, we just ignore this endtag
1359 */
1360 if (htmlGetEndPriority(ctxt->nameTab[i]) > priority)
1361 return;
1362 }
1363 if (i < 0)
1364 return;
1365
1366 while (!xmlStrEqual(newtag, ctxt->name)) {
1367 info = htmlTagLookup(ctxt->name);
1368 if ((info != NULL) && (info->endTag == 3)) {
1369 htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,
1370 "Opening and ending tag mismatch: %s and %s\n",
1371 newtag, ctxt->name);
1372 }
1373 htmlParserFinishElementParsing(ctxt);
1374 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
1375 ctxt->sax->endElement(ctxt->userData, ctxt->name);
1376 htmlnamePop(ctxt);
1377 }
1378 }
1379
1380 /**
1381 * htmlAutoCloseOnEnd:
1382 * @ctxt: an HTML parser context
1383 *
1384 * Close all remaining tags at the end of the stream
1385 */
1386 static void
htmlAutoCloseOnEnd(htmlParserCtxtPtr ctxt)1387 htmlAutoCloseOnEnd(htmlParserCtxtPtr ctxt)
1388 {
1389 int i;
1390
1391 if (ctxt->options & HTML_PARSE_HTML5)
1392 return;
1393
1394 if (ctxt->nameNr == 0)
1395 return;
1396 for (i = (ctxt->nameNr - 1); i >= 0; i--) {
1397 htmlParserFinishElementParsing(ctxt);
1398 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
1399 ctxt->sax->endElement(ctxt->userData, ctxt->name);
1400 htmlnamePop(ctxt);
1401 }
1402 }
1403
1404 /**
1405 * htmlAutoClose:
1406 * @ctxt: an HTML parser context
1407 * @newtag: The new tag name or NULL
1408 *
1409 * The HTML DTD allows a tag to implicitly close other tags.
1410 * The list is kept in htmlStartClose array. This function is
1411 * called when a new tag has been detected and generates the
1412 * appropriates closes if possible/needed.
1413 * If newtag is NULL this mean we are at the end of the resource
1414 * and we should check
1415 */
1416 static void
htmlAutoClose(htmlParserCtxtPtr ctxt,const xmlChar * newtag)1417 htmlAutoClose(htmlParserCtxtPtr ctxt, const xmlChar * newtag)
1418 {
1419 if (ctxt->options & HTML_PARSE_HTML5)
1420 return;
1421
1422 if (newtag == NULL)
1423 return;
1424
1425 while ((ctxt->name != NULL) &&
1426 (htmlCheckAutoClose(newtag, ctxt->name))) {
1427 htmlParserFinishElementParsing(ctxt);
1428 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
1429 ctxt->sax->endElement(ctxt->userData, ctxt->name);
1430 htmlnamePop(ctxt);
1431 }
1432 }
1433
1434 /**
1435 * htmlAutoCloseTag:
1436 * @doc: the HTML document
1437 * @name: The tag name
1438 * @elem: the HTML element
1439 *
1440 * DEPRECATED: Internal function, don't use.
1441 *
1442 * The HTML DTD allows a tag to implicitly close other tags.
1443 * The list is kept in htmlStartClose array. This function checks
1444 * if the element or one of it's children would autoclose the
1445 * given tag.
1446 *
1447 * Returns 1 if autoclose, 0 otherwise
1448 */
1449 int
htmlAutoCloseTag(htmlDocPtr doc,const xmlChar * name,htmlNodePtr elem)1450 htmlAutoCloseTag(htmlDocPtr doc, const xmlChar *name, htmlNodePtr elem) {
1451 htmlNodePtr child;
1452
1453 if (elem == NULL) return(1);
1454 if (xmlStrEqual(name, elem->name)) return(0);
1455 if (htmlCheckAutoClose(elem->name, name)) return(1);
1456 child = elem->children;
1457 while (child != NULL) {
1458 if (htmlAutoCloseTag(doc, name, child)) return(1);
1459 child = child->next;
1460 }
1461 return(0);
1462 }
1463
1464 /**
1465 * htmlIsAutoClosed:
1466 * @doc: the HTML document
1467 * @elem: the HTML element
1468 *
1469 * DEPRECATED: Internal function, don't use.
1470 *
1471 * The HTML DTD allows a tag to implicitly close other tags.
1472 * The list is kept in htmlStartClose array. This function checks
1473 * if a tag is autoclosed by one of it's child
1474 *
1475 * Returns 1 if autoclosed, 0 otherwise
1476 */
1477 int
htmlIsAutoClosed(htmlDocPtr doc,htmlNodePtr elem)1478 htmlIsAutoClosed(htmlDocPtr doc, htmlNodePtr elem) {
1479 htmlNodePtr child;
1480
1481 if (elem == NULL) return(1);
1482 child = elem->children;
1483 while (child != NULL) {
1484 if (htmlAutoCloseTag(doc, elem->name, child)) return(1);
1485 child = child->next;
1486 }
1487 return(0);
1488 }
1489
1490 /**
1491 * htmlCheckImplied:
1492 * @ctxt: an HTML parser context
1493 * @newtag: The new tag name
1494 *
1495 * The HTML DTD allows a tag to exists only implicitly
1496 * called when a new tag has been detected and generates the
1497 * appropriates implicit tags if missing
1498 */
1499 static void
htmlCheckImplied(htmlParserCtxtPtr ctxt,const xmlChar * newtag)1500 htmlCheckImplied(htmlParserCtxtPtr ctxt, const xmlChar *newtag) {
1501 int i;
1502
1503 if (ctxt->options & (HTML_PARSE_NOIMPLIED | HTML_PARSE_HTML5))
1504 return;
1505 if (!htmlOmittedDefaultValue)
1506 return;
1507 if (xmlStrEqual(newtag, BAD_CAST"html"))
1508 return;
1509 if (ctxt->nameNr <= 0) {
1510 htmlnamePush(ctxt, BAD_CAST"html");
1511 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1512 ctxt->sax->startElement(ctxt->userData, BAD_CAST"html", NULL);
1513 }
1514 if ((xmlStrEqual(newtag, BAD_CAST"body")) || (xmlStrEqual(newtag, BAD_CAST"head")))
1515 return;
1516 if ((ctxt->nameNr <= 1) &&
1517 ((xmlStrEqual(newtag, BAD_CAST"script")) ||
1518 (xmlStrEqual(newtag, BAD_CAST"style")) ||
1519 (xmlStrEqual(newtag, BAD_CAST"meta")) ||
1520 (xmlStrEqual(newtag, BAD_CAST"link")) ||
1521 (xmlStrEqual(newtag, BAD_CAST"title")) ||
1522 (xmlStrEqual(newtag, BAD_CAST"base")))) {
1523 if (ctxt->html >= 3) {
1524 /* we already saw or generated an <head> before */
1525 return;
1526 }
1527 /*
1528 * dropped OBJECT ... i you put it first BODY will be
1529 * assumed !
1530 */
1531 htmlnamePush(ctxt, BAD_CAST"head");
1532 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1533 ctxt->sax->startElement(ctxt->userData, BAD_CAST"head", NULL);
1534 } else if ((!xmlStrEqual(newtag, BAD_CAST"noframes")) &&
1535 (!xmlStrEqual(newtag, BAD_CAST"frame")) &&
1536 (!xmlStrEqual(newtag, BAD_CAST"frameset"))) {
1537 if (ctxt->html >= 10) {
1538 /* we already saw or generated a <body> before */
1539 return;
1540 }
1541 for (i = 0;i < ctxt->nameNr;i++) {
1542 if (xmlStrEqual(ctxt->nameTab[i], BAD_CAST"body")) {
1543 return;
1544 }
1545 if (xmlStrEqual(ctxt->nameTab[i], BAD_CAST"head")) {
1546 return;
1547 }
1548 }
1549
1550 htmlnamePush(ctxt, BAD_CAST"body");
1551 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1552 ctxt->sax->startElement(ctxt->userData, BAD_CAST"body", NULL);
1553 }
1554 }
1555
1556 /**
1557 * htmlCheckParagraph
1558 * @ctxt: an HTML parser context
1559 *
1560 * Check whether a p element need to be implied before inserting
1561 * characters in the current element.
1562 *
1563 * Returns 1 if a paragraph has been inserted, 0 if not and -1
1564 * in case of error.
1565 */
1566
1567 static int
htmlCheckParagraph(htmlParserCtxtPtr ctxt)1568 htmlCheckParagraph(htmlParserCtxtPtr ctxt) {
1569 const xmlChar *tag;
1570 int i;
1571
1572 if (ctxt == NULL)
1573 return(-1);
1574 if (ctxt->options & HTML_PARSE_HTML5)
1575 return(0);
1576
1577 tag = ctxt->name;
1578 if (tag == NULL) {
1579 htmlAutoClose(ctxt, BAD_CAST"p");
1580 htmlCheckImplied(ctxt, BAD_CAST"p");
1581 htmlnamePush(ctxt, BAD_CAST"p");
1582 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1583 ctxt->sax->startElement(ctxt->userData, BAD_CAST"p", NULL);
1584 return(1);
1585 }
1586 if (!htmlOmittedDefaultValue)
1587 return(0);
1588 for (i = 0; htmlNoContentElements[i] != NULL; i++) {
1589 if (xmlStrEqual(tag, BAD_CAST htmlNoContentElements[i])) {
1590 htmlAutoClose(ctxt, BAD_CAST"p");
1591 htmlCheckImplied(ctxt, BAD_CAST"p");
1592 htmlnamePush(ctxt, BAD_CAST"p");
1593 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1594 ctxt->sax->startElement(ctxt->userData, BAD_CAST"p", NULL);
1595 return(1);
1596 }
1597 }
1598 return(0);
1599 }
1600
1601 /**
1602 * htmlIsScriptAttribute:
1603 * @name: an attribute name
1604 *
1605 * Check if an attribute is of content type Script
1606 *
1607 * Returns 1 is the attribute is a script 0 otherwise
1608 */
1609 int
htmlIsScriptAttribute(const xmlChar * name)1610 htmlIsScriptAttribute(const xmlChar *name) {
1611 unsigned int i;
1612
1613 if (name == NULL)
1614 return(0);
1615 /*
1616 * all script attributes start with 'on'
1617 */
1618 if ((name[0] != 'o') || (name[1] != 'n'))
1619 return(0);
1620 for (i = 0;
1621 i < sizeof(htmlScriptAttributes)/sizeof(htmlScriptAttributes[0]);
1622 i++) {
1623 if (xmlStrEqual(name, (const xmlChar *) htmlScriptAttributes[i]))
1624 return(1);
1625 }
1626 return(0);
1627 }
1628
1629 /************************************************************************
1630 * *
1631 * The list of HTML predefined entities *
1632 * *
1633 ************************************************************************/
1634
1635
1636 static const htmlEntityDesc html40EntitiesTable[] = {
1637 /*
1638 * the 4 absolute ones, plus apostrophe.
1639 */
1640 { 34, "quot", "quotation mark = APL quote, U+0022 ISOnum" },
1641 { 38, "amp", "ampersand, U+0026 ISOnum" },
1642 { 39, "apos", "single quote" },
1643 { 60, "lt", "less-than sign, U+003C ISOnum" },
1644 { 62, "gt", "greater-than sign, U+003E ISOnum" },
1645
1646 /*
1647 * A bunch still in the 128-255 range
1648 * Replacing them depend really on the charset used.
1649 */
1650 { 160, "nbsp", "no-break space = non-breaking space, U+00A0 ISOnum" },
1651 { 161, "iexcl","inverted exclamation mark, U+00A1 ISOnum" },
1652 { 162, "cent", "cent sign, U+00A2 ISOnum" },
1653 { 163, "pound","pound sign, U+00A3 ISOnum" },
1654 { 164, "curren","currency sign, U+00A4 ISOnum" },
1655 { 165, "yen", "yen sign = yuan sign, U+00A5 ISOnum" },
1656 { 166, "brvbar","broken bar = broken vertical bar, U+00A6 ISOnum" },
1657 { 167, "sect", "section sign, U+00A7 ISOnum" },
1658 { 168, "uml", "diaeresis = spacing diaeresis, U+00A8 ISOdia" },
1659 { 169, "copy", "copyright sign, U+00A9 ISOnum" },
1660 { 170, "ordf", "feminine ordinal indicator, U+00AA ISOnum" },
1661 { 171, "laquo","left-pointing double angle quotation mark = left pointing guillemet, U+00AB ISOnum" },
1662 { 172, "not", "not sign, U+00AC ISOnum" },
1663 { 173, "shy", "soft hyphen = discretionary hyphen, U+00AD ISOnum" },
1664 { 174, "reg", "registered sign = registered trade mark sign, U+00AE ISOnum" },
1665 { 175, "macr", "macron = spacing macron = overline = APL overbar, U+00AF ISOdia" },
1666 { 176, "deg", "degree sign, U+00B0 ISOnum" },
1667 { 177, "plusmn","plus-minus sign = plus-or-minus sign, U+00B1 ISOnum" },
1668 { 178, "sup2", "superscript two = superscript digit two = squared, U+00B2 ISOnum" },
1669 { 179, "sup3", "superscript three = superscript digit three = cubed, U+00B3 ISOnum" },
1670 { 180, "acute","acute accent = spacing acute, U+00B4 ISOdia" },
1671 { 181, "micro","micro sign, U+00B5 ISOnum" },
1672 { 182, "para", "pilcrow sign = paragraph sign, U+00B6 ISOnum" },
1673 { 183, "middot","middle dot = Georgian comma Greek middle dot, U+00B7 ISOnum" },
1674 { 184, "cedil","cedilla = spacing cedilla, U+00B8 ISOdia" },
1675 { 185, "sup1", "superscript one = superscript digit one, U+00B9 ISOnum" },
1676 { 186, "ordm", "masculine ordinal indicator, U+00BA ISOnum" },
1677 { 187, "raquo","right-pointing double angle quotation mark right pointing guillemet, U+00BB ISOnum" },
1678 { 188, "frac14","vulgar fraction one quarter = fraction one quarter, U+00BC ISOnum" },
1679 { 189, "frac12","vulgar fraction one half = fraction one half, U+00BD ISOnum" },
1680 { 190, "frac34","vulgar fraction three quarters = fraction three quarters, U+00BE ISOnum" },
1681 { 191, "iquest","inverted question mark = turned question mark, U+00BF ISOnum" },
1682 { 192, "Agrave","latin capital letter A with grave = latin capital letter A grave, U+00C0 ISOlat1" },
1683 { 193, "Aacute","latin capital letter A with acute, U+00C1 ISOlat1" },
1684 { 194, "Acirc","latin capital letter A with circumflex, U+00C2 ISOlat1" },
1685 { 195, "Atilde","latin capital letter A with tilde, U+00C3 ISOlat1" },
1686 { 196, "Auml", "latin capital letter A with diaeresis, U+00C4 ISOlat1" },
1687 { 197, "Aring","latin capital letter A with ring above = latin capital letter A ring, U+00C5 ISOlat1" },
1688 { 198, "AElig","latin capital letter AE = latin capital ligature AE, U+00C6 ISOlat1" },
1689 { 199, "Ccedil","latin capital letter C with cedilla, U+00C7 ISOlat1" },
1690 { 200, "Egrave","latin capital letter E with grave, U+00C8 ISOlat1" },
1691 { 201, "Eacute","latin capital letter E with acute, U+00C9 ISOlat1" },
1692 { 202, "Ecirc","latin capital letter E with circumflex, U+00CA ISOlat1" },
1693 { 203, "Euml", "latin capital letter E with diaeresis, U+00CB ISOlat1" },
1694 { 204, "Igrave","latin capital letter I with grave, U+00CC ISOlat1" },
1695 { 205, "Iacute","latin capital letter I with acute, U+00CD ISOlat1" },
1696 { 206, "Icirc","latin capital letter I with circumflex, U+00CE ISOlat1" },
1697 { 207, "Iuml", "latin capital letter I with diaeresis, U+00CF ISOlat1" },
1698 { 208, "ETH", "latin capital letter ETH, U+00D0 ISOlat1" },
1699 { 209, "Ntilde","latin capital letter N with tilde, U+00D1 ISOlat1" },
1700 { 210, "Ograve","latin capital letter O with grave, U+00D2 ISOlat1" },
1701 { 211, "Oacute","latin capital letter O with acute, U+00D3 ISOlat1" },
1702 { 212, "Ocirc","latin capital letter O with circumflex, U+00D4 ISOlat1" },
1703 { 213, "Otilde","latin capital letter O with tilde, U+00D5 ISOlat1" },
1704 { 214, "Ouml", "latin capital letter O with diaeresis, U+00D6 ISOlat1" },
1705 { 215, "times","multiplication sign, U+00D7 ISOnum" },
1706 { 216, "Oslash","latin capital letter O with stroke latin capital letter O slash, U+00D8 ISOlat1" },
1707 { 217, "Ugrave","latin capital letter U with grave, U+00D9 ISOlat1" },
1708 { 218, "Uacute","latin capital letter U with acute, U+00DA ISOlat1" },
1709 { 219, "Ucirc","latin capital letter U with circumflex, U+00DB ISOlat1" },
1710 { 220, "Uuml", "latin capital letter U with diaeresis, U+00DC ISOlat1" },
1711 { 221, "Yacute","latin capital letter Y with acute, U+00DD ISOlat1" },
1712 { 222, "THORN","latin capital letter THORN, U+00DE ISOlat1" },
1713 { 223, "szlig","latin small letter sharp s = ess-zed, U+00DF ISOlat1" },
1714 { 224, "agrave","latin small letter a with grave = latin small letter a grave, U+00E0 ISOlat1" },
1715 { 225, "aacute","latin small letter a with acute, U+00E1 ISOlat1" },
1716 { 226, "acirc","latin small letter a with circumflex, U+00E2 ISOlat1" },
1717 { 227, "atilde","latin small letter a with tilde, U+00E3 ISOlat1" },
1718 { 228, "auml", "latin small letter a with diaeresis, U+00E4 ISOlat1" },
1719 { 229, "aring","latin small letter a with ring above = latin small letter a ring, U+00E5 ISOlat1" },
1720 { 230, "aelig","latin small letter ae = latin small ligature ae, U+00E6 ISOlat1" },
1721 { 231, "ccedil","latin small letter c with cedilla, U+00E7 ISOlat1" },
1722 { 232, "egrave","latin small letter e with grave, U+00E8 ISOlat1" },
1723 { 233, "eacute","latin small letter e with acute, U+00E9 ISOlat1" },
1724 { 234, "ecirc","latin small letter e with circumflex, U+00EA ISOlat1" },
1725 { 235, "euml", "latin small letter e with diaeresis, U+00EB ISOlat1" },
1726 { 236, "igrave","latin small letter i with grave, U+00EC ISOlat1" },
1727 { 237, "iacute","latin small letter i with acute, U+00ED ISOlat1" },
1728 { 238, "icirc","latin small letter i with circumflex, U+00EE ISOlat1" },
1729 { 239, "iuml", "latin small letter i with diaeresis, U+00EF ISOlat1" },
1730 { 240, "eth", "latin small letter eth, U+00F0 ISOlat1" },
1731 { 241, "ntilde","latin small letter n with tilde, U+00F1 ISOlat1" },
1732 { 242, "ograve","latin small letter o with grave, U+00F2 ISOlat1" },
1733 { 243, "oacute","latin small letter o with acute, U+00F3 ISOlat1" },
1734 { 244, "ocirc","latin small letter o with circumflex, U+00F4 ISOlat1" },
1735 { 245, "otilde","latin small letter o with tilde, U+00F5 ISOlat1" },
1736 { 246, "ouml", "latin small letter o with diaeresis, U+00F6 ISOlat1" },
1737 { 247, "divide","division sign, U+00F7 ISOnum" },
1738 { 248, "oslash","latin small letter o with stroke, = latin small letter o slash, U+00F8 ISOlat1" },
1739 { 249, "ugrave","latin small letter u with grave, U+00F9 ISOlat1" },
1740 { 250, "uacute","latin small letter u with acute, U+00FA ISOlat1" },
1741 { 251, "ucirc","latin small letter u with circumflex, U+00FB ISOlat1" },
1742 { 252, "uuml", "latin small letter u with diaeresis, U+00FC ISOlat1" },
1743 { 253, "yacute","latin small letter y with acute, U+00FD ISOlat1" },
1744 { 254, "thorn","latin small letter thorn with, U+00FE ISOlat1" },
1745 { 255, "yuml", "latin small letter y with diaeresis, U+00FF ISOlat1" },
1746
1747 { 338, "OElig","latin capital ligature OE, U+0152 ISOlat2" },
1748 { 339, "oelig","latin small ligature oe, U+0153 ISOlat2" },
1749 { 352, "Scaron","latin capital letter S with caron, U+0160 ISOlat2" },
1750 { 353, "scaron","latin small letter s with caron, U+0161 ISOlat2" },
1751 { 376, "Yuml", "latin capital letter Y with diaeresis, U+0178 ISOlat2" },
1752
1753 /*
1754 * Anything below should really be kept as entities references
1755 */
1756 { 402, "fnof", "latin small f with hook = function = florin, U+0192 ISOtech" },
1757
1758 { 710, "circ", "modifier letter circumflex accent, U+02C6 ISOpub" },
1759 { 732, "tilde","small tilde, U+02DC ISOdia" },
1760
1761 { 913, "Alpha","greek capital letter alpha, U+0391" },
1762 { 914, "Beta", "greek capital letter beta, U+0392" },
1763 { 915, "Gamma","greek capital letter gamma, U+0393 ISOgrk3" },
1764 { 916, "Delta","greek capital letter delta, U+0394 ISOgrk3" },
1765 { 917, "Epsilon","greek capital letter epsilon, U+0395" },
1766 { 918, "Zeta", "greek capital letter zeta, U+0396" },
1767 { 919, "Eta", "greek capital letter eta, U+0397" },
1768 { 920, "Theta","greek capital letter theta, U+0398 ISOgrk3" },
1769 { 921, "Iota", "greek capital letter iota, U+0399" },
1770 { 922, "Kappa","greek capital letter kappa, U+039A" },
1771 { 923, "Lambda", "greek capital letter lambda, U+039B ISOgrk3" },
1772 { 924, "Mu", "greek capital letter mu, U+039C" },
1773 { 925, "Nu", "greek capital letter nu, U+039D" },
1774 { 926, "Xi", "greek capital letter xi, U+039E ISOgrk3" },
1775 { 927, "Omicron","greek capital letter omicron, U+039F" },
1776 { 928, "Pi", "greek capital letter pi, U+03A0 ISOgrk3" },
1777 { 929, "Rho", "greek capital letter rho, U+03A1" },
1778 { 931, "Sigma","greek capital letter sigma, U+03A3 ISOgrk3" },
1779 { 932, "Tau", "greek capital letter tau, U+03A4" },
1780 { 933, "Upsilon","greek capital letter upsilon, U+03A5 ISOgrk3" },
1781 { 934, "Phi", "greek capital letter phi, U+03A6 ISOgrk3" },
1782 { 935, "Chi", "greek capital letter chi, U+03A7" },
1783 { 936, "Psi", "greek capital letter psi, U+03A8 ISOgrk3" },
1784 { 937, "Omega","greek capital letter omega, U+03A9 ISOgrk3" },
1785
1786 { 945, "alpha","greek small letter alpha, U+03B1 ISOgrk3" },
1787 { 946, "beta", "greek small letter beta, U+03B2 ISOgrk3" },
1788 { 947, "gamma","greek small letter gamma, U+03B3 ISOgrk3" },
1789 { 948, "delta","greek small letter delta, U+03B4 ISOgrk3" },
1790 { 949, "epsilon","greek small letter epsilon, U+03B5 ISOgrk3" },
1791 { 950, "zeta", "greek small letter zeta, U+03B6 ISOgrk3" },
1792 { 951, "eta", "greek small letter eta, U+03B7 ISOgrk3" },
1793 { 952, "theta","greek small letter theta, U+03B8 ISOgrk3" },
1794 { 953, "iota", "greek small letter iota, U+03B9 ISOgrk3" },
1795 { 954, "kappa","greek small letter kappa, U+03BA ISOgrk3" },
1796 { 955, "lambda","greek small letter lambda, U+03BB ISOgrk3" },
1797 { 956, "mu", "greek small letter mu, U+03BC ISOgrk3" },
1798 { 957, "nu", "greek small letter nu, U+03BD ISOgrk3" },
1799 { 958, "xi", "greek small letter xi, U+03BE ISOgrk3" },
1800 { 959, "omicron","greek small letter omicron, U+03BF NEW" },
1801 { 960, "pi", "greek small letter pi, U+03C0 ISOgrk3" },
1802 { 961, "rho", "greek small letter rho, U+03C1 ISOgrk3" },
1803 { 962, "sigmaf","greek small letter final sigma, U+03C2 ISOgrk3" },
1804 { 963, "sigma","greek small letter sigma, U+03C3 ISOgrk3" },
1805 { 964, "tau", "greek small letter tau, U+03C4 ISOgrk3" },
1806 { 965, "upsilon","greek small letter upsilon, U+03C5 ISOgrk3" },
1807 { 966, "phi", "greek small letter phi, U+03C6 ISOgrk3" },
1808 { 967, "chi", "greek small letter chi, U+03C7 ISOgrk3" },
1809 { 968, "psi", "greek small letter psi, U+03C8 ISOgrk3" },
1810 { 969, "omega","greek small letter omega, U+03C9 ISOgrk3" },
1811 { 977, "thetasym","greek small letter theta symbol, U+03D1 NEW" },
1812 { 978, "upsih","greek upsilon with hook symbol, U+03D2 NEW" },
1813 { 982, "piv", "greek pi symbol, U+03D6 ISOgrk3" },
1814
1815 { 8194, "ensp", "en space, U+2002 ISOpub" },
1816 { 8195, "emsp", "em space, U+2003 ISOpub" },
1817 { 8201, "thinsp","thin space, U+2009 ISOpub" },
1818 { 8204, "zwnj", "zero width non-joiner, U+200C NEW RFC 2070" },
1819 { 8205, "zwj", "zero width joiner, U+200D NEW RFC 2070" },
1820 { 8206, "lrm", "left-to-right mark, U+200E NEW RFC 2070" },
1821 { 8207, "rlm", "right-to-left mark, U+200F NEW RFC 2070" },
1822 { 8211, "ndash","en dash, U+2013 ISOpub" },
1823 { 8212, "mdash","em dash, U+2014 ISOpub" },
1824 { 8216, "lsquo","left single quotation mark, U+2018 ISOnum" },
1825 { 8217, "rsquo","right single quotation mark, U+2019 ISOnum" },
1826 { 8218, "sbquo","single low-9 quotation mark, U+201A NEW" },
1827 { 8220, "ldquo","left double quotation mark, U+201C ISOnum" },
1828 { 8221, "rdquo","right double quotation mark, U+201D ISOnum" },
1829 { 8222, "bdquo","double low-9 quotation mark, U+201E NEW" },
1830 { 8224, "dagger","dagger, U+2020 ISOpub" },
1831 { 8225, "Dagger","double dagger, U+2021 ISOpub" },
1832
1833 { 8226, "bull", "bullet = black small circle, U+2022 ISOpub" },
1834 { 8230, "hellip","horizontal ellipsis = three dot leader, U+2026 ISOpub" },
1835
1836 { 8240, "permil","per mille sign, U+2030 ISOtech" },
1837
1838 { 8242, "prime","prime = minutes = feet, U+2032 ISOtech" },
1839 { 8243, "Prime","double prime = seconds = inches, U+2033 ISOtech" },
1840
1841 { 8249, "lsaquo","single left-pointing angle quotation mark, U+2039 ISO proposed" },
1842 { 8250, "rsaquo","single right-pointing angle quotation mark, U+203A ISO proposed" },
1843
1844 { 8254, "oline","overline = spacing overscore, U+203E NEW" },
1845 { 8260, "frasl","fraction slash, U+2044 NEW" },
1846
1847 { 8364, "euro", "euro sign, U+20AC NEW" },
1848
1849 { 8465, "image","blackletter capital I = imaginary part, U+2111 ISOamso" },
1850 { 8472, "weierp","script capital P = power set = Weierstrass p, U+2118 ISOamso" },
1851 { 8476, "real", "blackletter capital R = real part symbol, U+211C ISOamso" },
1852 { 8482, "trade","trade mark sign, U+2122 ISOnum" },
1853 { 8501, "alefsym","alef symbol = first transfinite cardinal, U+2135 NEW" },
1854 { 8592, "larr", "leftwards arrow, U+2190 ISOnum" },
1855 { 8593, "uarr", "upwards arrow, U+2191 ISOnum" },
1856 { 8594, "rarr", "rightwards arrow, U+2192 ISOnum" },
1857 { 8595, "darr", "downwards arrow, U+2193 ISOnum" },
1858 { 8596, "harr", "left right arrow, U+2194 ISOamsa" },
1859 { 8629, "crarr","downwards arrow with corner leftwards = carriage return, U+21B5 NEW" },
1860 { 8656, "lArr", "leftwards double arrow, U+21D0 ISOtech" },
1861 { 8657, "uArr", "upwards double arrow, U+21D1 ISOamsa" },
1862 { 8658, "rArr", "rightwards double arrow, U+21D2 ISOtech" },
1863 { 8659, "dArr", "downwards double arrow, U+21D3 ISOamsa" },
1864 { 8660, "hArr", "left right double arrow, U+21D4 ISOamsa" },
1865
1866 { 8704, "forall","for all, U+2200 ISOtech" },
1867 { 8706, "part", "partial differential, U+2202 ISOtech" },
1868 { 8707, "exist","there exists, U+2203 ISOtech" },
1869 { 8709, "empty","empty set = null set = diameter, U+2205 ISOamso" },
1870 { 8711, "nabla","nabla = backward difference, U+2207 ISOtech" },
1871 { 8712, "isin", "element of, U+2208 ISOtech" },
1872 { 8713, "notin","not an element of, U+2209 ISOtech" },
1873 { 8715, "ni", "contains as member, U+220B ISOtech" },
1874 { 8719, "prod", "n-ary product = product sign, U+220F ISOamsb" },
1875 { 8721, "sum", "n-ary summation, U+2211 ISOamsb" },
1876 { 8722, "minus","minus sign, U+2212 ISOtech" },
1877 { 8727, "lowast","asterisk operator, U+2217 ISOtech" },
1878 { 8730, "radic","square root = radical sign, U+221A ISOtech" },
1879 { 8733, "prop", "proportional to, U+221D ISOtech" },
1880 { 8734, "infin","infinity, U+221E ISOtech" },
1881 { 8736, "ang", "angle, U+2220 ISOamso" },
1882 { 8743, "and", "logical and = wedge, U+2227 ISOtech" },
1883 { 8744, "or", "logical or = vee, U+2228 ISOtech" },
1884 { 8745, "cap", "intersection = cap, U+2229 ISOtech" },
1885 { 8746, "cup", "union = cup, U+222A ISOtech" },
1886 { 8747, "int", "integral, U+222B ISOtech" },
1887 { 8756, "there4","therefore, U+2234 ISOtech" },
1888 { 8764, "sim", "tilde operator = varies with = similar to, U+223C ISOtech" },
1889 { 8773, "cong", "approximately equal to, U+2245 ISOtech" },
1890 { 8776, "asymp","almost equal to = asymptotic to, U+2248 ISOamsr" },
1891 { 8800, "ne", "not equal to, U+2260 ISOtech" },
1892 { 8801, "equiv","identical to, U+2261 ISOtech" },
1893 { 8804, "le", "less-than or equal to, U+2264 ISOtech" },
1894 { 8805, "ge", "greater-than or equal to, U+2265 ISOtech" },
1895 { 8834, "sub", "subset of, U+2282 ISOtech" },
1896 { 8835, "sup", "superset of, U+2283 ISOtech" },
1897 { 8836, "nsub", "not a subset of, U+2284 ISOamsn" },
1898 { 8838, "sube", "subset of or equal to, U+2286 ISOtech" },
1899 { 8839, "supe", "superset of or equal to, U+2287 ISOtech" },
1900 { 8853, "oplus","circled plus = direct sum, U+2295 ISOamsb" },
1901 { 8855, "otimes","circled times = vector product, U+2297 ISOamsb" },
1902 { 8869, "perp", "up tack = orthogonal to = perpendicular, U+22A5 ISOtech" },
1903 { 8901, "sdot", "dot operator, U+22C5 ISOamsb" },
1904 { 8968, "lceil","left ceiling = apl upstile, U+2308 ISOamsc" },
1905 { 8969, "rceil","right ceiling, U+2309 ISOamsc" },
1906 { 8970, "lfloor","left floor = apl downstile, U+230A ISOamsc" },
1907 { 8971, "rfloor","right floor, U+230B ISOamsc" },
1908 { 9001, "lang", "left-pointing angle bracket = bra, U+2329 ISOtech" },
1909 { 9002, "rang", "right-pointing angle bracket = ket, U+232A ISOtech" },
1910 { 9674, "loz", "lozenge, U+25CA ISOpub" },
1911
1912 { 9824, "spades","black spade suit, U+2660 ISOpub" },
1913 { 9827, "clubs","black club suit = shamrock, U+2663 ISOpub" },
1914 { 9829, "hearts","black heart suit = valentine, U+2665 ISOpub" },
1915 { 9830, "diams","black diamond suit, U+2666 ISOpub" },
1916
1917 };
1918
1919 /************************************************************************
1920 * *
1921 * Commodity functions to handle entities *
1922 * *
1923 ************************************************************************/
1924
1925 /**
1926 * htmlEntityLookup:
1927 * @name: the entity name
1928 *
1929 * Lookup the given entity in EntitiesTable
1930 *
1931 * TODO: the linear scan is really ugly, an hash table is really needed.
1932 *
1933 * Returns the associated htmlEntityDescPtr if found, NULL otherwise.
1934 */
1935 const htmlEntityDesc *
htmlEntityLookup(const xmlChar * name)1936 htmlEntityLookup(const xmlChar *name) {
1937 unsigned int i;
1938
1939 for (i = 0;i < (sizeof(html40EntitiesTable)/
1940 sizeof(html40EntitiesTable[0]));i++) {
1941 if (xmlStrEqual(name, BAD_CAST html40EntitiesTable[i].name)) {
1942 return((htmlEntityDescPtr) &html40EntitiesTable[i]);
1943 }
1944 }
1945 return(NULL);
1946 }
1947
1948 static int
htmlCompareEntityDesc(const void * vkey,const void * vdesc)1949 htmlCompareEntityDesc(const void *vkey, const void *vdesc) {
1950 const unsigned *key = vkey;
1951 const htmlEntityDesc *desc = vdesc;
1952
1953 return((int) *key - (int) desc->value);
1954 }
1955
1956 /**
1957 * htmlEntityValueLookup:
1958 * @value: the entity's unicode value
1959 *
1960 * Lookup the given entity in EntitiesTable
1961 *
1962 * TODO: the linear scan is really ugly, an hash table is really needed.
1963 *
1964 * Returns the associated htmlEntityDescPtr if found, NULL otherwise.
1965 */
1966 const htmlEntityDesc *
htmlEntityValueLookup(unsigned int value)1967 htmlEntityValueLookup(unsigned int value) {
1968 const htmlEntityDesc *desc;
1969 size_t nmemb;
1970
1971 nmemb = sizeof(html40EntitiesTable) / sizeof(html40EntitiesTable[0]);
1972 desc = bsearch(&value, html40EntitiesTable, nmemb, sizeof(htmlEntityDesc),
1973 htmlCompareEntityDesc);
1974
1975 return(desc);
1976 }
1977
1978 /**
1979 * UTF8ToHtml:
1980 * @out: a pointer to an array of bytes to store the result
1981 * @outlen: the length of @out
1982 * @in: a pointer to an array of UTF-8 chars
1983 * @inlen: the length of @in
1984 *
1985 * Take a block of UTF-8 chars in and try to convert it to an ASCII
1986 * plus HTML entities block of chars out.
1987 *
1988 * Returns 0 if success, -2 if the transcoding fails, or -1 otherwise
1989 * The value of @inlen after return is the number of octets consumed
1990 * as the return value is positive, else unpredictable.
1991 * The value of @outlen after return is the number of octets consumed.
1992 */
1993 int
UTF8ToHtml(unsigned char * out,int * outlen,const unsigned char * in,int * inlen)1994 UTF8ToHtml(unsigned char* out, int *outlen,
1995 const unsigned char* in, int *inlen) {
1996 const unsigned char* instart = in;
1997 const unsigned char* inend;
1998 unsigned char* outstart = out;
1999 unsigned char* outend;
2000 int ret = XML_ENC_ERR_SPACE;
2001
2002 if ((out == NULL) || (outlen == NULL) || (inlen == NULL))
2003 return(XML_ENC_ERR_INTERNAL);
2004
2005 if (in == NULL) {
2006 /*
2007 * initialization nothing to do
2008 */
2009 *outlen = 0;
2010 *inlen = 0;
2011 return(XML_ENC_ERR_SUCCESS);
2012 }
2013
2014 inend = in + *inlen;
2015 outend = out + *outlen;
2016 while (in < inend) {
2017 const htmlEntityDesc *ent;
2018 const char *cp;
2019 char nbuf[16];
2020 unsigned c, d;
2021 int seqlen, len, i;
2022
2023 d = *in;
2024
2025 if (d < 0x80) {
2026 if (out >= outend)
2027 goto done;
2028 *out++ = d;
2029 in += 1;
2030 continue;
2031 }
2032
2033 if (d < 0xE0) { c = d & 0x1F; seqlen = 2; }
2034 else if (d < 0xF0) { c = d & 0x0F; seqlen = 3; }
2035 else { c = d & 0x07; seqlen = 4; }
2036
2037 if (inend - in < seqlen)
2038 break;
2039
2040 for (i = 1; i < seqlen; i++) {
2041 d = in[i];
2042 c <<= 6;
2043 c |= d & 0x3F;
2044 }
2045
2046 /*
2047 * Try to lookup a predefined HTML entity for it
2048 */
2049 ent = htmlEntityValueLookup(c);
2050
2051 if (ent == NULL) {
2052 snprintf(nbuf, sizeof(nbuf), "#%u", c);
2053 cp = nbuf;
2054 } else {
2055 cp = ent->name;
2056 }
2057
2058 len = strlen(cp);
2059 if (outend - out < len + 2)
2060 goto done;
2061
2062 *out++ = '&';
2063 memcpy(out, cp, len);
2064 out += len;
2065 *out++ = ';';
2066
2067 in += seqlen;
2068 }
2069
2070 ret = out - outstart;
2071
2072 done:
2073 *outlen = out - outstart;
2074 *inlen = in - instart;
2075 return(ret);
2076 }
2077
2078 /**
2079 * htmlEncodeEntities:
2080 * @out: a pointer to an array of bytes to store the result
2081 * @outlen: the length of @out
2082 * @in: a pointer to an array of UTF-8 chars
2083 * @inlen: the length of @in
2084 * @quoteChar: the quote character to escape (' or ") or zero.
2085 *
2086 * Take a block of UTF-8 chars in and try to convert it to an ASCII
2087 * plus HTML entities block of chars out.
2088 *
2089 * Returns 0 if success, -2 if the transcoding fails, or -1 otherwise
2090 * The value of @inlen after return is the number of octets consumed
2091 * as the return value is positive, else unpredictable.
2092 * The value of @outlen after return is the number of octets consumed.
2093 */
2094 int
htmlEncodeEntities(unsigned char * out,int * outlen,const unsigned char * in,int * inlen,int quoteChar)2095 htmlEncodeEntities(unsigned char* out, int *outlen,
2096 const unsigned char* in, int *inlen, int quoteChar) {
2097 const unsigned char* processed = in;
2098 const unsigned char* outend;
2099 const unsigned char* outstart = out;
2100 const unsigned char* instart = in;
2101 const unsigned char* inend;
2102 unsigned int c, d;
2103 int trailing;
2104
2105 if ((out == NULL) || (outlen == NULL) || (inlen == NULL) || (in == NULL))
2106 return(-1);
2107 outend = out + (*outlen);
2108 inend = in + (*inlen);
2109 while (in < inend) {
2110 d = *in++;
2111 if (d < 0x80) { c= d; trailing= 0; }
2112 else if (d < 0xC0) {
2113 /* trailing byte in leading position */
2114 *outlen = out - outstart;
2115 *inlen = processed - instart;
2116 return(-2);
2117 } else if (d < 0xE0) { c= d & 0x1F; trailing= 1; }
2118 else if (d < 0xF0) { c= d & 0x0F; trailing= 2; }
2119 else if (d < 0xF8) { c= d & 0x07; trailing= 3; }
2120 else {
2121 /* no chance for this in Ascii */
2122 *outlen = out - outstart;
2123 *inlen = processed - instart;
2124 return(-2);
2125 }
2126
2127 if (inend - in < trailing)
2128 break;
2129
2130 while (trailing--) {
2131 if (((d= *in++) & 0xC0) != 0x80) {
2132 *outlen = out - outstart;
2133 *inlen = processed - instart;
2134 return(-2);
2135 }
2136 c <<= 6;
2137 c |= d & 0x3F;
2138 }
2139
2140 /* assertion: c is a single UTF-4 value */
2141 if ((c < 0x80) && (c != (unsigned int) quoteChar) &&
2142 (c != '&') && (c != '<') && (c != '>')) {
2143 if (out >= outend)
2144 break;
2145 *out++ = c;
2146 } else {
2147 const htmlEntityDesc * ent;
2148 const char *cp;
2149 char nbuf[16];
2150 int len;
2151
2152 /*
2153 * Try to lookup a predefined HTML entity for it
2154 */
2155 ent = htmlEntityValueLookup(c);
2156 if (ent == NULL) {
2157 snprintf(nbuf, sizeof(nbuf), "#%u", c);
2158 cp = nbuf;
2159 }
2160 else
2161 cp = ent->name;
2162 len = strlen(cp);
2163 if (outend - out < len + 2)
2164 break;
2165 *out++ = '&';
2166 memcpy(out, cp, len);
2167 out += len;
2168 *out++ = ';';
2169 }
2170 processed = in;
2171 }
2172 *outlen = out - outstart;
2173 *inlen = processed - instart;
2174 return(0);
2175 }
2176
2177 /************************************************************************
2178 * *
2179 * Commodity functions, cleanup needed ? *
2180 * *
2181 ************************************************************************/
/*
 * All tags allowing PCDATA from the HTML 4.01 loose DTD.
 * areBlanks() scans this list to decide whether whitespace-only text
 * has to be kept.
 * NOTE: it might be more appropriate to integrate this information
 * into the html40ElementTable array but I don't want to risk any
 * binary incompatibility
 */
static const char *allowPCData[] = {
    "a", "abbr", "acronym", "address", "applet", "b", "bdo", "big",
    "blockquote", "body", "button", "caption", "center", "cite", "code",
    "dd", "del", "dfn", "div", "dt", "em", "font", "form", "h1", "h2",
    "h3", "h4", "h5", "h6", "i", "iframe", "ins", "kbd", "label", "legend",
    "li", "noframes", "noscript", "object", "p", "pre", "q", "s", "samp",
    "small", "span", "strike", "strong", "td", "th", "tt", "u", "var"
};
2196
2197 /**
2198 * areBlanks:
2199 * @ctxt: an HTML parser context
2200 * @str: a xmlChar *
2201 * @len: the size of @str
2202 *
2203 * Is this a sequence of blank chars that one can ignore ?
2204 *
2205 * Returns 1 if ignorable 0 if whitespace, -1 otherwise.
2206 */
2207
areBlanks(htmlParserCtxtPtr ctxt,const xmlChar * str,int len)2208 static int areBlanks(htmlParserCtxtPtr ctxt, const xmlChar *str, int len) {
2209 unsigned int i;
2210 int j;
2211 xmlNodePtr lastChild;
2212 xmlDtdPtr dtd;
2213
2214 for (j = 0;j < len;j++)
2215 if (!(IS_WS_HTML(str[j]))) return(-1);
2216
2217 if (CUR == 0) return(1);
2218 if (CUR != '<') return(0);
2219 if (ctxt->name == NULL)
2220 return(1);
2221 if (xmlStrEqual(ctxt->name, BAD_CAST"html"))
2222 return(1);
2223 if (xmlStrEqual(ctxt->name, BAD_CAST"head"))
2224 return(1);
2225
2226 /* Only strip CDATA children of the body tag for strict HTML DTDs */
2227 if (xmlStrEqual(ctxt->name, BAD_CAST "body") && ctxt->myDoc != NULL) {
2228 dtd = xmlGetIntSubset(ctxt->myDoc);
2229 if (dtd != NULL && dtd->ExternalID != NULL) {
2230 if (!xmlStrcasecmp(dtd->ExternalID, BAD_CAST "-//W3C//DTD HTML 4.01//EN") ||
2231 !xmlStrcasecmp(dtd->ExternalID, BAD_CAST "-//W3C//DTD HTML 4//EN"))
2232 return(1);
2233 }
2234 }
2235
2236 if (ctxt->node == NULL) return(0);
2237 lastChild = xmlGetLastChild(ctxt->node);
2238 while ((lastChild) && (lastChild->type == XML_COMMENT_NODE))
2239 lastChild = lastChild->prev;
2240 if (lastChild == NULL) {
2241 if ((ctxt->node->type != XML_ELEMENT_NODE) &&
2242 (ctxt->node->content != NULL)) return(0);
2243 /* keep ws in constructs like ...<b> </b>...
2244 for all tags "b" allowing PCDATA */
2245 for ( i = 0; i < sizeof(allowPCData)/sizeof(allowPCData[0]); i++ ) {
2246 if ( xmlStrEqual(ctxt->name, BAD_CAST allowPCData[i]) ) {
2247 return(0);
2248 }
2249 }
2250 } else if (xmlNodeIsText(lastChild)) {
2251 return(0);
2252 } else {
2253 /* keep ws in constructs like <p><b>xy</b> <i>z</i><p>
2254 for all tags "p" allowing PCDATA */
2255 for ( i = 0; i < sizeof(allowPCData)/sizeof(allowPCData[0]); i++ ) {
2256 if ( xmlStrEqual(lastChild->name, BAD_CAST allowPCData[i]) ) {
2257 return(0);
2258 }
2259 }
2260 }
2261 return(1);
2262 }
2263
2264 /**
2265 * htmlNewDocNoDtD:
2266 * @URI: URI for the dtd, or NULL
2267 * @ExternalID: the external ID of the DTD, or NULL
2268 *
2269 * Creates a new HTML document without a DTD node if @URI and @ExternalID
2270 * are NULL
2271 *
2272 * Returns a new document, do not initialize the DTD if not provided
2273 */
2274 htmlDocPtr
htmlNewDocNoDtD(const xmlChar * URI,const xmlChar * ExternalID)2275 htmlNewDocNoDtD(const xmlChar *URI, const xmlChar *ExternalID) {
2276 xmlDocPtr cur;
2277
2278 /*
2279 * Allocate a new document and fill the fields.
2280 */
2281 cur = (xmlDocPtr) xmlMalloc(sizeof(xmlDoc));
2282 if (cur == NULL)
2283 return(NULL);
2284 memset(cur, 0, sizeof(xmlDoc));
2285
2286 cur->type = XML_HTML_DOCUMENT_NODE;
2287 cur->version = NULL;
2288 cur->intSubset = NULL;
2289 cur->doc = cur;
2290 cur->name = NULL;
2291 cur->children = NULL;
2292 cur->extSubset = NULL;
2293 cur->oldNs = NULL;
2294 cur->encoding = NULL;
2295 cur->standalone = 1;
2296 cur->compression = 0;
2297 cur->ids = NULL;
2298 cur->refs = NULL;
2299 cur->_private = NULL;
2300 cur->charset = XML_CHAR_ENCODING_UTF8;
2301 cur->properties = XML_DOC_HTML | XML_DOC_USERBUILT;
2302 if ((ExternalID != NULL) ||
2303 (URI != NULL)) {
2304 xmlDtdPtr intSubset;
2305
2306 intSubset = xmlCreateIntSubset(cur, BAD_CAST "html", ExternalID, URI);
2307 if (intSubset == NULL) {
2308 xmlFree(cur);
2309 return(NULL);
2310 }
2311 }
2312 if ((xmlRegisterCallbacks) && (xmlRegisterNodeDefaultValue))
2313 xmlRegisterNodeDefaultValue((xmlNodePtr)cur);
2314 return(cur);
2315 }
2316
2317 /**
2318 * htmlNewDoc:
2319 * @URI: URI for the dtd, or NULL
2320 * @ExternalID: the external ID of the DTD, or NULL
2321 *
2322 * Creates a new HTML document
2323 *
2324 * Returns a new document
2325 */
2326 htmlDocPtr
htmlNewDoc(const xmlChar * URI,const xmlChar * ExternalID)2327 htmlNewDoc(const xmlChar *URI, const xmlChar *ExternalID) {
2328 if ((URI == NULL) && (ExternalID == NULL))
2329 return(htmlNewDocNoDtD(
2330 BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd",
2331 BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN"));
2332
2333 return(htmlNewDocNoDtD(URI, ExternalID));
2334 }
2335
2336
2337 /************************************************************************
2338 * *
2339 * The parser itself *
2340 * Relates to http://www.w3.org/TR/html40 *
2341 * *
2342 ************************************************************************/
2343
2344 /************************************************************************
2345 * *
2346 * The parser itself *
2347 * *
2348 ************************************************************************/
2349
/**
 * htmlParseHTMLName:
 * @ctxt:  an HTML parser context
 * @attr:  nonzero when parsing an attribute name, in which case '='
 *         also terminates the name
 *
 * Parse an HTML tag or attribute name starting at the current input
 * position. ASCII upper-case letters are converted to lowercase since
 * HTML names are not case-sensitive. NUL bytes and invalid UTF-8 are
 * replaced with U+FFFD. The whole name is consumed from the input, but
 * only as much of it as fits in HTML_PARSER_BUFFER_SIZE bytes is kept.
 *
 * Returns the dictionary entry for the name. Its name member is NULL if
 * the dictionary lookup failed, which is reported as a memory error.
 */

static xmlHashedString
htmlParseHTMLName(htmlParserCtxtPtr ctxt, int attr) {
    xmlHashedString ret;
    xmlChar buf[HTML_PARSER_BUFFER_SIZE];
    const xmlChar *in;
    size_t avail;
    /* When set, the input buffer is never grown below. */
    int eof = PARSER_PROGRESSIVE(ctxt);
    int nbchar = 0;
    int stop = attr ? '=' : ' ';

    in = ctxt->input->cur;
    avail = ctxt->input->end - in;

    while (1) {
        int c, size;

        if ((!eof) && (avail < 32)) {
            size_t oldAvail = avail;

            /*
             * Publish the position before shrinking/growing, then
             * reload the local pointers afterwards.
             */
            ctxt->input->cur = in;

            SHRINK;
            xmlParserGrow(ctxt);

            in = ctxt->input->cur;
            avail = ctxt->input->end - in;

            /* Growing yielded nothing new: stop trying. */
            if (oldAvail == avail)
                eof = 1;
        }

        if (avail == 0)
            break;

        c = *in;
        size = 1;

        /*
         * Terminators are only honored after at least one byte was
         * stored, so the first character always becomes part of the name.
         */
        if ((nbchar != 0) &&
            ((c == '/') || (c == '>') || (c == stop) ||
             (IS_WS_HTML(c))))
            break;

        if (c == 0) {
            /* NUL byte: store U+FFFD if it still fits */
            if (nbchar + 3 <= HTML_PARSER_BUFFER_SIZE) {
                buf[nbchar++] = 0xEF;
                buf[nbchar++] = 0xBF;
                buf[nbchar++] = 0xBD;
            }
        } else if (c < 0x80) {
            if (nbchar < HTML_PARSER_BUFFER_SIZE) {
                if (IS_UPPER(c))
                    c += 0x20;
                buf[nbchar++] = c;
            }
        } else {
            size = htmlValidateUtf8(ctxt, in, avail);

            if (size > 0) {
                if (nbchar + size <= HTML_PARSER_BUFFER_SIZE) {
                    memcpy(buf + nbchar, in, size);
                    nbchar += size;
                }
            } else {
                /* Invalid sequence: consume one byte, store U+FFFD */
                size = 1;

                if (nbchar + 3 <= HTML_PARSER_BUFFER_SIZE) {
                    buf[nbchar++] = 0xEF;
                    buf[nbchar++] = 0xBF;
                    buf[nbchar++] = 0xBD;
                }
            }
        }

        in += size;
        avail -= size;
    }

    ctxt->input->cur = in;

    SHRINK;

    ret = xmlDictLookupHashed(ctxt->dict, buf, nbchar);
    if (ret.name == NULL)
        htmlErrMemory(ctxt);

    return(ret);
}
2447
/*
 * Replacement code points for numeric character references in the
 * range 0x80..0x9F, indexed by (code point - 0x80). Used by
 * htmlCodePointToUtf8().
 * NOTE(review): the values look like the Windows-1252 mapping that the
 * HTML standard prescribes for this range; confirm against the spec.
 */
static const short htmlC1Remap[32] = {
    0x20AC, 0x0081, 0x201A, 0x0192, 0x201E, 0x2026, 0x2020, 0x2021,
    0x02C6, 0x2030, 0x0160, 0x2039, 0x0152, 0x008D, 0x017D, 0x008F,
    0x0090, 0x2018, 0x2019, 0x201C, 0x201D, 0x2022, 0x2013, 0x2014,
    0x02DC, 0x2122, 0x0161, 0x203A, 0x0153, 0x009D, 0x017E, 0x0178
};
2454
2455 static const xmlChar *
htmlCodePointToUtf8(int c,xmlChar * out,int * osize)2456 htmlCodePointToUtf8(int c, xmlChar *out, int *osize) {
2457 int i = 0;
2458 int bits, hi;
2459
2460 if ((c >= 0x80) && (c < 0xA0)) {
2461 c = htmlC1Remap[c - 0x80];
2462 } else if ((c <= 0) ||
2463 ((c >= 0xD800) && (c < 0xE000)) ||
2464 (c > 0x10FFFF)) {
2465 c = 0xFFFD;
2466 }
2467
2468 if (c < 0x80) { bits = 0; hi = 0x00; }
2469 else if (c < 0x800) { bits = 6; hi = 0xC0; }
2470 else if (c < 0x10000) { bits = 12; hi = 0xE0; }
2471 else { bits = 18; hi = 0xF0; }
2472
2473 out[i++] = (c >> bits) | hi;
2474
2475 while (bits > 0) {
2476 bits -= 6;
2477 out[i++] = ((c >> bits) & 0x3F) | 0x80;
2478 }
2479
2480 *osize = i;
2481 return(out);
2482 }
2483
2484 #include "html5ent.inc"
2485
/*
 * Flag bits stored in the first byte of an entity table entry, as
 * interpreted by htmlFindEntityPrefix(). The remaining low bits hold
 * the length of the entry's name fragment.
 */
/* Entry may also match when the name is not followed by ';' */
#define ENT_F_SEMICOLON 0x80u
/* Entry is followed by an (offset, count) pair describing a subtable */
#define ENT_F_SUBTABLE  0x40u
/* Mask covering all flag bits */
#define ENT_F_ALL       0xC0u
2489
/*
 * htmlFindEntityPrefix:
 * @string:  input following the '&'
 * @slen:  number of bytes available at @string
 * @isAttr:  nonzero when parsing an attribute value
 * @nlen:  receives the number of bytes of @string that were matched,
 *         including a terminating ';' if present
 * @rlen:  receives the length byte stored in front of the replacement
 *
 * Find the longest entity name that is a prefix of @string, using the
 * tables from html5ent.inc. A name matches when it is followed by ';'.
 * Entries flagged ENT_F_SEMICOLON also match without ';', except that
 * inside attribute values the next character must then be neither
 * alphanumeric nor '='.
 *
 * @nlen and @rlen are only written when a match is found.
 *
 * Returns a pointer to the replacement bytes, or NULL if nothing matched.
 */
static const xmlChar *
htmlFindEntityPrefix(const xmlChar *string, size_t slen, int isAttr,
                     int *nlen, int *rlen) {
    const xmlChar *match = NULL;
    unsigned left, right;
    /*
     * NOTE(review): string[0] is read before slen is checked; this
     * presumably relies on the input buffer being readable at its end.
     * Confirm against the callers.
     */
    int first = string[0];
    size_t matchLen = 0;
    /* Offset in @string of the fragment compared next */
    size_t soff = 1;

    if (slen < 2)
        return(NULL);
    if (!IS_ASCII_LETTER(first))
        return(NULL);

    /*
     * Look up range by first character
     * (three bytes per letter: 16-bit little-endian start index, count)
     */
    first &= 63;
    left = htmlEntAlpha[first*3] | htmlEntAlpha[first*3+1] << 8;
    right = left + htmlEntAlpha[first*3+2];

    /*
     * Binary search
     */
    while (left < right) {
        const xmlChar *bytes;
        unsigned mid;
        size_t len;
        int cmp;

        mid = left + (right - left) / 2;
        bytes = htmlEntStrings + htmlEntValues[mid];
        /* bytes[0]: flags plus fragment length, bytes[1..len]: fragment */
        len = bytes[0] & ~ENT_F_ALL;

        cmp = string[soff] - bytes[1];

        if (cmp == 0) {
            /*
             * NOTE(review): slen is the total length of @string while
             * the comparison starts at soff + 1. Once soff > 1 the
             * compared span can extend past slen, so this seems to
             * depend on strncmp stopping at a NUL terminator. Confirm.
             */
            if (slen < len) {
                cmp = strncmp((const char *) string + soff + 1,
                              (const char *) bytes + 2,
                              slen - 1);
                /* Prefix can never match */
                if (cmp == 0)
                    break;
            } else {
                cmp = strncmp((const char *) string + soff + 1,
                              (const char *) bytes + 2,
                              len - 1);
            }
        }

        if (cmp < 0) {
            right = mid;
        } else if (cmp > 0) {
            left = mid + 1;
        } else {
            /* Character following the matched fragment, 0 at the end */
            int term = soff + len < slen ? string[soff + len] : 0;
            int isAlnum, isTerm;

            isAlnum = IS_ALNUM(term);
            isTerm = ((term == ';') ||
                      ((bytes[0] & ENT_F_SEMICOLON) &&
                       ((!isAttr) ||
                        ((!isAlnum) && (term != '=')))));

            /* Remember this match; a longer one may still replace it */
            if (isTerm) {
                match = bytes + len + 1;
                matchLen = soff + len;
                if (term == ';')
                    matchLen += 1;
            }

            if (bytes[0] & ENT_F_SUBTABLE) {
                /* Skip the (offset, count) pair in front of the replacement */
                if (isTerm)
                    match += 2;

                /* Continue with the subtable for longer names */
                if ((isAlnum) && (soff + len < slen)) {
                    left = mid + bytes[len + 1];
                    right = left + bytes[len + 2];
                    soff += len;
                    continue;
                }
            }

            break;
        }
    }

    if (match == NULL)
        return(NULL);

    *nlen = matchLen;
    *rlen = match[0];
    return(match + 1);
}
2585
/**
 * htmlParseData:
 * @ctxt:  an HTML parser context
 * @mask:  mask of terminating characters
 * @comment:  true if parsing a comment
 * @refs:  true if references are allowed
 * @maxLength:  maximum output length
 *
 * Parse data until terminator is reached.
 *
 * The input is consumed in chunks. A chunk ends at a terminator, at the
 * end of the input, or at something that needs a replacement: a
 * character reference (if @refs is set), a NUL byte, a CR or an invalid
 * UTF-8 sequence. Each chunk and its replacement are appended to a
 * scratch buffer kept in ctxt->spaceTab / ctxt->spaceMax across calls.
 *
 * If @comment is false, the terminating character is left in the input.
 * If @comment is true, a character matching @mask only terminates the
 * data when it starts "-->" or "--!>" (or a truncated form of these at
 * the end of the input), and that terminator is consumed.
 *
 * Returns the parsed string or NULL in case of errors. The result is a
 * fresh allocation owned by the caller.
 */

static xmlChar *
htmlParseData(htmlParserCtxtPtr ctxt, htmlAsciiMask mask,
              int comment, int refs, int maxLength) {
    xmlParserInputPtr input = ctxt->input;
    xmlChar *ret = NULL;
    xmlChar *buffer;
    xmlChar utf8Char[4];
    size_t buffer_size;
    size_t used;
    /* When set, the input buffer is never grown below. */
    int eof = PARSER_PROGRESSIVE(ctxt);
    int line, col;
    /* -1 until the end is found, then the terminator bytes to skip */
    int termSkip = -1;

    /* Reuse the scratch buffer of a previous call if there is one */
    used = 0;
    buffer_size = ctxt->spaceMax;
    buffer = (xmlChar *) ctxt->spaceTab;
    if (buffer == NULL) {
        buffer_size = 500;
        buffer = xmlMalloc(buffer_size + 1);
        if (buffer == NULL) {
            htmlErrMemory(ctxt);
            return(NULL);
        }
    }

    line = input->line;
    col = input->col;

    /* One iteration per chunk */
    while (!PARSER_STOPPED(ctxt)) {
        const xmlChar *chunk, *in, *repl;
        size_t avail, chunkSize, extraSize;
        int replSize;
        /* Input bytes to drop after the chunk (replaced by repl) */
        int skip = 0;
        /* Base (10 or 16) while inside a numeric reference, else 0 */
        int ncr = 0;
        /* Bytes of the numeric reference seen so far */
        int ncrSize = 0;
        /* Code point accumulated from the numeric reference */
        int cp = 0;

        chunk = input->cur;
        avail = input->end - chunk;
        in = chunk;

        repl = BAD_CAST "";
        replSize = 0;

        /* One iteration per character of the chunk */
        while (!PARSER_STOPPED(ctxt)) {
            size_t j;
            int cur, size;

            if ((!eof) && (avail <= 64)) {
                size_t oldAvail = avail;
                size_t off = in - chunk;

                input->cur = in;

                xmlParserGrow(ctxt);

                /* Re-derive both pointers from the updated input */
                in = input->cur;
                chunk = in - off;
                input->cur = chunk;
                avail = input->end - in;

                if (oldAvail == avail)
                    eof = 1;
            }

            if (avail == 0) {
                termSkip = 0;
                break;
            }

            cur = *in;
            size = 1;
            col += 1;

            if (htmlMaskMatch(mask, cur)) {
                if (comment) {
                    /*
                     * Accept "-->" and "--!>" as well as their
                     * prefixes when the input ends in the middle.
                     */
                    if (avail < 2) {
                        termSkip = 1;
                    } else if (in[1] == '-') {
                        if  (avail < 3) {
                            termSkip = 2;
                        } else if (in[2] == '>') {
                            termSkip = 3;
                        } else if (in[2] == '!') {
                            if (avail < 4)
                                termSkip = 3;
                            else if (in[3] == '>')
                                termSkip = 4;
                        }
                    }

                    if (termSkip >= 0)
                        break;
                } else {
                    termSkip = 0;
                    break;
                }
            }

            if (ncr) {
                /* Inside a numeric reference: accumulate digits */
                int lc = cur | 0x20;
                int digit;

                if ((cur >= '0') && (cur <= '9')) {
                    digit = cur - '0';
                } else if ((ncr == 16) && (lc >= 'a') && (lc <= 'f')) {
                    digit = (lc - 'a') + 10;
                } else {
                    /* End of the reference; a ';' belongs to it */
                    if (cur == ';') {
                        in += 1;
                        size += 1;
                        ncrSize += 1;
                    }
                    goto next_chunk;
                }

                /* Clamp so that cp cannot overflow */
                cp = cp * ncr + digit;
                if (cp >= 0x110000)
                    cp = 0x110000;

                ncrSize += 1;

                goto next_char;
            }

            switch (cur) {
            case '&':
                if (!refs)
                    break;

                j = 1;

                if ((j < avail) && (in[j] == '#')) {
                    /*
                     * Start a numeric reference only if at least one
                     * digit follows. size/ncrSize cover "&#" or "&#x".
                     */
                    j += 1;
                    if (j < avail) {
                        if ((in[j] | 0x20) == 'x') {
                            j += 1;
                            if ((j < avail) && (IS_HEX_DIGIT(in[j]))) {
                                ncr = 16;
                                size = 3;
                                ncrSize = 3;
                                cp = 0;
                            }
                        } else if (IS_ASCII_DIGIT(in[j])) {
                            ncr = 10;
                            size = 2;
                            ncrSize = 2;
                            cp = 0;
                        }
                    }
                } else {
                    repl = htmlFindEntityPrefix(in + j,
                                                avail - j,
                                                /* isAttr */ 1,
                                                &skip, &replSize);
                    if (repl != NULL) {
                        /* Also skip the '&' */
                        skip += 1;
                        goto next_chunk;
                    }

                    skip = 0;
                }

                break;

            case '\0':
                /* NUL is replaced with U+FFFD */
                skip = 1;
                repl = BAD_CAST "\xEF\xBF\xBD";
                replSize = 3;
                goto next_chunk;

            case '\n':
                line += 1;
                col = 1;
                break;

            case '\r':
                /*
                 * Drop the CR. Unless a LF follows, emit a LF in its
                 * place.
                 * NOTE(review): in[1] is read without checking avail;
                 * presumably the buffer is NUL-terminated. Confirm.
                 */
                skip = 1;
                if (in[1] != 0x0A) {
                    repl = BAD_CAST "\x0A";
                    replSize = 1;
                }
                goto next_chunk;

            default:
                if (cur < 0x80)
                    break;

                /*
                 * First non-ASCII byte without a known encoding:
                 * switch to the encoding found by htmlFindEncoding(),
                 * or to ISO-8859-1 if it finds none, then start the
                 * chunk over from the input state.
                 */
                if ((input->flags & XML_INPUT_HAS_ENCODING) == 0) {
                    xmlChar * guess;

                    guess = htmlFindEncoding(ctxt);
                    if (guess == NULL) {
                        xmlSwitchEncoding(ctxt, XML_CHAR_ENCODING_8859_1);
                    } else {
                        xmlSwitchEncodingName(ctxt, (const char *) guess);
                        xmlFree(guess);
                    }
                    input->flags |= XML_INPUT_HAS_ENCODING;

                    goto restart;
                }

                size = htmlValidateUtf8(ctxt, in, avail);

                /* Invalid sequence: replace one byte with U+FFFD */
                if (size <= 0) {
                    skip = 1;
                    repl = BAD_CAST "\xEF\xBF\xBD";
                    replSize = 3;
                    goto next_chunk;
                }

                break;
            }

next_char:
            in += size;
            avail -= size;
        }

next_chunk:
        if (ncrSize > 0) {
            /*
             * Rewind to the '&' so that the reference is not part of
             * the chunk, and replace it with the encoded code point.
             */
            skip = ncrSize;
            in -= ncrSize;

            repl = htmlCodePointToUtf8(cp, utf8Char, &replSize);
        }

        chunkSize = in - chunk;
        extraSize = chunkSize + replSize;

        if (extraSize > maxLength - used) {
            htmlParseErr(ctxt, XML_ERR_RESOURCE_LIMIT,
                         "value too long\n", NULL, NULL);
            goto error;
        }

        if (extraSize > buffer_size - used) {
            size_t newSize = (used + extraSize) * 2;
            xmlChar *tmp = (xmlChar *) xmlRealloc(buffer, newSize + 1);

            if (tmp == NULL) {
                htmlErrMemory(ctxt);
                goto error;
            }
            buffer = tmp;
            buffer_size = newSize;
        }

        if (chunkSize > 0) {
            input->cur += chunkSize;
            memcpy(buffer + used, chunk, chunkSize);
            used += chunkSize;
        }

        input->cur += skip;
        if (replSize > 0) {
            memcpy(buffer + used, repl, replSize);
            used += replSize;
        }

        SHRINK;

        if (termSkip >= 0)
            break;

restart:
        ;
    }

    /* Consume the comment terminator, if any */
    if (termSkip > 0) {
        input->cur += termSkip;
        col += termSkip;
    }

    input->line = line;
    input->col = col;

    ret = xmlMalloc(used + 1);
    if (ret == NULL) {
        htmlErrMemory(ctxt);
    } else {
        memcpy(ret, buffer, used);
        ret[used] = 0;
    }

error:
    /* Keep the scratch buffer for the next call, also on failure */
    ctxt->spaceTab = (void *) buffer;
    ctxt->spaceMax = buffer_size;

    return(ret);
}
2891
2892 /**
2893 * htmlParseEntityRef:
2894 * @ctxt: an HTML parser context
2895 * @str: location to store the entity name
2896 *
2897 * DEPRECATED: Internal function, don't use.
2898 *
2899 * Returns NULL.
2900 */
2901 const htmlEntityDesc *
htmlParseEntityRef(htmlParserCtxtPtr ctxt ATTRIBUTE_UNUSED,const xmlChar ** str ATTRIBUTE_UNUSED)2902 htmlParseEntityRef(htmlParserCtxtPtr ctxt ATTRIBUTE_UNUSED,
2903 const xmlChar **str ATTRIBUTE_UNUSED) {
2904 return(NULL);
2905 }
2906
2907 /**
2908 * htmlParseAttValue:
2909 * @ctxt: an HTML parser context
2910 *
2911 * parse a value for an attribute
2912 * Note: the parser won't do substitution of entities here, this
2913 * will be handled later in xmlStringGetNodeList, unless it was
2914 * asked for ctxt->replaceEntities != 0
2915 *
2916 * Returns the AttValue parsed or NULL.
2917 */
2918
2919 static xmlChar *
htmlParseAttValue(htmlParserCtxtPtr ctxt)2920 htmlParseAttValue(htmlParserCtxtPtr ctxt) {
2921 xmlChar *ret = NULL;
2922 int maxLength = (ctxt->options & HTML_PARSE_HUGE) ?
2923 XML_MAX_HUGE_LENGTH :
2924 XML_MAX_TEXT_LENGTH;
2925
2926 if (CUR == '"') {
2927 SKIP(1);
2928 ret = htmlParseData(ctxt, MASK_DQ, 0, 1, maxLength);
2929 if (CUR == '"')
2930 SKIP(1);
2931 } else if (CUR == '\'') {
2932 SKIP(1);
2933 ret = htmlParseData(ctxt, MASK_SQ, 0, 1, maxLength);
2934 if (CUR == '\'')
2935 SKIP(1);
2936 } else {
2937 ret = htmlParseData(ctxt, MASK_WS_GT, 0, 1, maxLength);
2938 }
2939 return(ret);
2940 }
2941
2942 static void
htmlCharDataSAXCallback(htmlParserCtxtPtr ctxt,const xmlChar * buf,int size,int mode)2943 htmlCharDataSAXCallback(htmlParserCtxtPtr ctxt, const xmlChar *buf,
2944 int size, int mode) {
2945 if ((ctxt->sax == NULL) || (ctxt->disableSAX))
2946 return;
2947
2948 if ((mode == 0) || (mode == DATA_RCDATA) ||
2949 (ctxt->sax->cdataBlock == NULL)) {
2950 int blank = areBlanks(ctxt, buf, size);
2951
2952 if ((mode == 0) && (blank > 0) && (!ctxt->keepBlanks)) {
2953 if (ctxt->sax->ignorableWhitespace != NULL)
2954 ctxt->sax->ignorableWhitespace(ctxt->userData,
2955 buf, size);
2956 } else {
2957 if ((mode == 0) && (blank < 0))
2958 htmlCheckParagraph(ctxt);
2959
2960 if (ctxt->sax->characters != NULL)
2961 ctxt->sax->characters(ctxt->userData, buf, size);
2962 }
2963 } else {
2964 /*
2965 * Insert as CDATA, which is the same as HTML_PRESERVE_NODE
2966 */
2967 ctxt->sax->cdataBlock(ctxt->userData, buf, size);
2968 }
2969 }
2970
/**
 * htmlParseCharData:
 * @ctxt:  an HTML parser context
 *
 * Parse character data and references.
 *
 * The parsing mode is taken from ctxt->endCheckState. In mode 0 data
 * ends at the next '<'. In the other modes, except DATA_PLAINTEXT, it
 * ends at an end tag matching ctxt->name. Character references are
 * only expanded in mode 0 and DATA_RCDATA. The data is reported in
 * chunks through htmlCharDataSAXCallback().
 *
 * Returns 1 if the matching end tag was found in a non-zero mode, in
 * which case ctxt->endCheckState is reset to 0. Returns 0 otherwise,
 * with the possibly updated mode stored in ctxt->endCheckState.
 */

static int
htmlParseCharData(htmlParserCtxtPtr ctxt) {
    xmlParserInputPtr input = ctxt->input;
    xmlChar utf8Char[4];
    /* Set when the matching end tag was seen */
    int complete = 0;
    /* Set when parsing must stop after the current chunk */
    int done = 0;
    int mode;
    /* When set, the input buffer is never grown below. */
    int eof = PARSER_PROGRESSIVE(ctxt);
    int line, col;

    mode = ctxt->endCheckState;

    line = input->line;
    col = input->col;

    /* One iteration per chunk */
    while (!PARSER_STOPPED(ctxt)) {
        const xmlChar *chunk, *in, *repl;
        size_t avail;
        int replSize;
        /* Input bytes to drop after the chunk (replaced by repl) */
        int skip = 0;
        /* Base (10 or 16) while inside a numeric reference, else 0 */
        int ncr = 0;
        /* Bytes of the numeric reference seen so far */
        int ncrSize = 0;
        /* Code point accumulated from the numeric reference */
        int cp = 0;

        chunk = input->cur;
        avail = input->end - chunk;
        in = chunk;

        repl = BAD_CAST "";
        replSize = 0;

        /* One iteration per character of the chunk */
        while (!PARSER_STOPPED(ctxt)) {
            size_t j;
            int cur, size;

            if (avail <= 64) {
                if (!eof) {
                    size_t oldAvail = avail;
                    size_t off = in - chunk;

                    input->cur = in;

                    xmlParserGrow(ctxt);

                    /* Re-derive both pointers from the updated input */
                    in = input->cur;
                    chunk = in - off;
                    input->cur = chunk;
                    avail = input->end - in;

                    if (oldAvail == avail)
                        eof = 1;
                }

                if (avail == 0) {
                    done = 1;
                    break;
                }
            }

            /* Accelerator */
            if (!ncr) {
                while (avail > 0) {
                    /*
                     * 256-bit set of bytes needing the slow path:
                     * NUL, LF, CR, '&', '-', '<' and everything from
                     * 0x80 upwards.
                     */
                    static const unsigned mask[8] = {
                        0x00002401, 0x10002040,
                        0x00000000, 0x00000000,
                        0xFFFFFFFF, 0xFFFFFFFF,
                        0xFFFFFFFF, 0xFFFFFFFF
                    };
                    cur = *in;
                    if ((1u << (cur & 0x1F)) & mask[cur >> 5])
                        break;
                    col += 1;
                    in += 1;
                    avail -= 1;
                }

                /* Go back to the top to refill or to detect the end */
                if ((!eof) && (avail <= 64))
                    continue;
                if (avail == 0)
                    continue;
            }

            cur = *in;
            size = 1;
            col += 1;

            if (ncr) {
                /* Inside a numeric reference: accumulate digits */
                int lc = cur | 0x20;
                int digit;

                if ((cur >= '0') && (cur <= '9')) {
                    digit = cur - '0';
                } else if ((ncr == 16) && (lc >= 'a') && (lc <= 'f')) {
                    digit = (lc - 'a') + 10;
                } else {
                    /* End of the reference; a ';' belongs to it */
                    if (cur == ';') {
                        in += 1;
                        size += 1;
                        ncrSize += 1;
                    }
                    goto next_chunk;
                }

                /* Clamp so that cp cannot overflow */
                cp = cp * ncr + digit;
                if (cp >= 0x110000)
                    cp = 0x110000;

                ncrSize += 1;

                goto next_char;
            }

            switch (cur) {
            case '<':
                /* In mode 0 any '<' ends the character data */
                if (mode == 0) {
                    done = 1;
                    goto next_chunk;
                }
                if (mode == DATA_PLAINTEXT)
                    break;

                j = 1;
                if (j < avail) {
                    if ((mode == DATA_SCRIPT) && (in[j] == '!')) {
                        /* Check for comment start */

                        j += 1;
                        if ((j < avail) && (in[j] == '-')) {
                            j += 1;
                            if ((j < avail) && (in[j] == '-'))
                                mode = DATA_SCRIPT_ESC1;
                        }
                    } else {
                        int i = 0;
                        int solidus = 0;

                        /* Check for tag */

                        if (in[j] == '/') {
                            j += 1;
                            solidus = 1;
                        }

                        if ((solidus) || (mode == DATA_SCRIPT_ESC1)) {
                            /*
                             * Compare with the current element name,
                             * lowercasing the input on the fly
                             */
                            while ((j < avail) &&
                                   (ctxt->name[i] != 0) &&
                                   (ctxt->name[i] == (in[j] | 0x20))) {
                                i += 1;
                                j += 1;
                            }

                            if ((ctxt->name[i] == 0) && (j < avail)) {
                                int c = in[j];

                                if ((c == '>') || (c == '/') ||
                                    (IS_WS_HTML(c))) {
                                    /*
                                     * In ESC1 a start tag enters ESC2.
                                     * In ESC2 a tag returns to ESC1.
                                     * Otherwise this is the end tag.
                                     */
                                    if ((mode == DATA_SCRIPT_ESC1) &&
                                        (!solidus)) {
                                        mode = DATA_SCRIPT_ESC2;
                                    } else if (mode == DATA_SCRIPT_ESC2) {
                                        mode = DATA_SCRIPT_ESC1;
                                    } else {
                                        complete = 1;
                                        done = 1;
                                        goto next_chunk;
                                    }
                                }
                            }
                        }
                    }
                }

                /*
                 * With PARSER_PROGRESSIVE set, report the data up to
                 * and including this '<' and stop.
                 */
                if ((mode != 0) && (PARSER_PROGRESSIVE(ctxt))) {
                    in += 1;
                    done = 1;
                    goto next_chunk;
                }

                break;

            case '-':
                if ((mode != DATA_SCRIPT_ESC1) && (mode != DATA_SCRIPT_ESC2))
                    break;

                /* Check for comment end */

                j = 1;
                if ((j < avail) && (in[j] == '-')) {
                    j += 1;
                    if ((j < avail) && (in[j] == '>'))
                        mode = DATA_SCRIPT;
                }

                break;

            case '&':
                /* References are only expanded in mode 0 and RCDATA */
                if ((mode != 0) && (mode != DATA_RCDATA))
                    break;

                j = 1;

                if ((j < avail) && (in[j] == '#')) {
                    /*
                     * Start a numeric reference only if at least one
                     * digit follows. size/ncrSize cover "&#" or "&#x".
                     */
                    j += 1;
                    if (j < avail) {
                        if ((in[j] | 0x20) == 'x') {
                            j += 1;
                            if ((j < avail) && (IS_HEX_DIGIT(in[j]))) {
                                ncr = 16;
                                size = 3;
                                ncrSize = 3;
                                cp = 0;
                            }
                        } else if (IS_ASCII_DIGIT(in[j])) {
                            ncr = 10;
                            size = 2;
                            ncrSize = 2;
                            cp = 0;
                        }
                    }
                } else {
                    repl = htmlFindEntityPrefix(in + j,
                                                avail - j,
                                                /* isAttr */ 0,
                                                &skip, &replSize);
                    if (repl != NULL) {
                        /* Also skip the '&' */
                        skip += 1;
                        goto next_chunk;
                    }

                    skip = 0;
                }

                break;

            case '\0':
                /* NUL is replaced with U+FFFD */
                skip = 1;
                repl = BAD_CAST "\xEF\xBF\xBD";
                replSize = 3;
                goto next_chunk;

            case '\n':
                line += 1;
                col = 1;
                break;

            case '\r':
                /*
                 * Drop the CR. Unless a LF follows, emit a LF in its
                 * place.
                 * NOTE(review): in[1] is read without checking avail;
                 * presumably the buffer is NUL-terminated. Confirm.
                 */
                skip = 1;
                if (in[1] != 0x0A) {
                    repl = BAD_CAST "\x0A";
                    replSize = 1;
                }
                goto next_chunk;

            default:
                if (cur < 0x80)
                    break;

                /*
                 * First non-ASCII byte without a known encoding:
                 * switch to the encoding found by htmlFindEncoding(),
                 * or to ISO-8859-1 if it finds none, then start the
                 * chunk over from the input state.
                 */
                if ((input->flags & XML_INPUT_HAS_ENCODING) == 0) {
                    xmlChar * guess;

                    guess = htmlFindEncoding(ctxt);
                    if (guess == NULL) {
                        xmlSwitchEncoding(ctxt, XML_CHAR_ENCODING_8859_1);
                    } else {
                        xmlSwitchEncodingName(ctxt, (const char *) guess);
                        xmlFree(guess);
                    }
                    input->flags |= XML_INPUT_HAS_ENCODING;

                    goto restart;
                }

                size = htmlValidateUtf8(ctxt, in, avail);

                /* Invalid sequence: replace one byte with U+FFFD */
                if (size <= 0) {
                    skip = 1;
                    repl = BAD_CAST "\xEF\xBF\xBD";
                    replSize = 3;
                    goto next_chunk;
                }

                break;
            }

next_char:
            in += size;
            avail -= size;
        }

next_chunk:
        if (ncrSize > 0) {
            /*
             * Rewind to the '&' so that the reference is not part of
             * the chunk, and replace it with the encoded code point.
             */
            skip = ncrSize;
            in -= ncrSize;

            repl = htmlCodePointToUtf8(cp, utf8Char, &replSize);
        }

        /* Report the literal chunk, then the replacement, if any */
        if (in > chunk) {
            input->cur += in - chunk;
            htmlCharDataSAXCallback(ctxt, chunk, in - chunk, mode);
        }

        input->cur += skip;
        if (replSize > 0)
            htmlCharDataSAXCallback(ctxt, repl, replSize, mode);

        SHRINK;

        if (done)
            break;

restart:
        ;
    }

    input->line = line;
    input->col = col;

    if (complete)
        ctxt->endCheckState = 0;
    else
        ctxt->endCheckState = mode;

    return(complete);
}
3304
3305 /**
3306 * htmlParseComment:
3307 * @ctxt: an HTML parser context
3308 * @bogus: true if this is a bogus comment
3309 *
3310 * Parse an HTML comment
3311 */
3312 static void
htmlParseComment(htmlParserCtxtPtr ctxt,int bogus)3313 htmlParseComment(htmlParserCtxtPtr ctxt, int bogus) {
3314 const xmlChar *comment = BAD_CAST "";
3315 xmlChar *buf = NULL;
3316 int maxLength = (ctxt->options & HTML_PARSE_HUGE) ?
3317 XML_MAX_HUGE_LENGTH :
3318 XML_MAX_TEXT_LENGTH;
3319
3320 if (bogus) {
3321 buf = htmlParseData(ctxt, MASK_GT, 0, 0, maxLength);
3322 if (CUR == '>')
3323 SKIP(1);
3324 comment = buf;
3325 } else {
3326 if (CUR == '>') {
3327 SKIP(1);
3328 } else if ((CUR == '-') && (NXT(1) == '>')) {
3329 SKIP(2);
3330 } else {
3331 buf = htmlParseData(ctxt, MASK_DASH, 1, 0, maxLength);
3332 comment = buf;
3333 }
3334 }
3335
3336 if (comment == NULL)
3337 return;
3338
3339 if ((ctxt->sax != NULL) && (ctxt->sax->comment != NULL) &&
3340 (!ctxt->disableSAX))
3341 ctxt->sax->comment(ctxt->userData, comment);
3342
3343 xmlFree(buf);
3344 }
3345
3346 /**
3347 * htmlParseCharRef:
3348 * @ctxt: an HTML parser context
3349 *
3350 * DEPRECATED: Internal function, don't use.
3351 *
3352 * Returns 0
3353 */
3354 int
htmlParseCharRef(htmlParserCtxtPtr ctxt ATTRIBUTE_UNUSED)3355 htmlParseCharRef(htmlParserCtxtPtr ctxt ATTRIBUTE_UNUSED) {
3356 return(0);
3357 }
3358
3359
3360 /**
3361 * htmlParseDoctypeLiteral:
3362 * @ctxt: an HTML parser context
3363 *
3364 * Parse a DOCTYPE SYTSTEM or PUBLIC literal.
3365 *
3366 * Returns the literal or NULL in case of error.
3367 */
3368
3369 static xmlChar *
htmlParseDoctypeLiteral(htmlParserCtxtPtr ctxt)3370 htmlParseDoctypeLiteral(htmlParserCtxtPtr ctxt) {
3371 xmlChar *ret;
3372 int maxLength = (ctxt->options & HTML_PARSE_HUGE) ?
3373 XML_MAX_TEXT_LENGTH :
3374 XML_MAX_NAME_LENGTH;
3375
3376 if (CUR == '"') {
3377 SKIP(1);
3378 ret = htmlParseData(ctxt, MASK_DQ_GT, 0, 0, maxLength);
3379 if (CUR == '"')
3380 SKIP(1);
3381 } else if (CUR == '\'') {
3382 SKIP(1);
3383 ret = htmlParseData(ctxt, MASK_SQ_GT, 0, 0, maxLength);
3384 if (CUR == '\'')
3385 SKIP(1);
3386 } else {
3387 return(NULL);
3388 }
3389
3390 return(ret);
3391 }
3392
3393 static void
htmlSkipBogusDoctype(htmlParserCtxtPtr ctxt)3394 htmlSkipBogusDoctype(htmlParserCtxtPtr ctxt) {
3395 const xmlChar *in;
3396 size_t avail;
3397 int eof = PARSER_PROGRESSIVE(ctxt);
3398 int line, col;
3399
3400 line = ctxt->input->line;
3401 col = ctxt->input->col;
3402
3403 in = ctxt->input->cur;
3404 avail = ctxt->input->end - in;
3405
3406 while (!PARSER_STOPPED(ctxt)) {
3407 int cur;
3408
3409 if ((!eof) && (avail <= 64)) {
3410 size_t oldAvail = avail;
3411
3412 ctxt->input->cur = in;
3413
3414 xmlParserGrow(ctxt);
3415
3416 in = ctxt->input->cur;
3417 avail = ctxt->input->end - in;
3418
3419 if (oldAvail == avail)
3420 eof = 1;
3421 }
3422
3423 if (avail == 0)
3424 break;
3425
3426 col += 1;
3427
3428 cur = *in;
3429 if (cur == '>') {
3430 in += 1;
3431 break;
3432 } else if (cur == 0x0A) {
3433 line += 1;
3434 col = 1;
3435 }
3436
3437 in += 1;
3438 avail -= 1;
3439
3440 SHRINK;
3441 }
3442
3443 ctxt->input->cur = in;
3444 ctxt->input->line = line;
3445 ctxt->input->col = col;
3446 }
3447
3448 /**
3449 * htmlParseDocTypeDecl:
3450 * @ctxt: an HTML parser context
3451 *
3452 * Parse a DOCTYPE declaration.
3453 */
3454
3455 static void
htmlParseDocTypeDecl(htmlParserCtxtPtr ctxt)3456 htmlParseDocTypeDecl(htmlParserCtxtPtr ctxt) {
3457 xmlChar *name = NULL;
3458 xmlChar *publicId = NULL;
3459 xmlChar *URI = NULL;
3460 int maxLength = (ctxt->options & HTML_PARSE_HUGE) ?
3461 XML_MAX_TEXT_LENGTH :
3462 XML_MAX_NAME_LENGTH;
3463
3464 /*
3465 * We know that '<!DOCTYPE' has been detected.
3466 */
3467 SKIP(9);
3468
3469 SKIP_BLANKS;
3470
3471 if ((ctxt->input->cur < ctxt->input->end) && (CUR != '>')) {
3472 name = htmlParseData(ctxt, MASK_WS_GT, 0, 0, maxLength);
3473
3474 if ((ctxt->options & HTML_PARSE_HTML5) && (name != NULL)) {
3475 xmlChar *cur;
3476
3477 for (cur = name; *cur; cur++) {
3478 if (IS_UPPER(*cur))
3479 *cur += 0x20;
3480 }
3481 }
3482
3483 SKIP_BLANKS;
3484 }
3485
3486 /*
3487 * Check for SystemID and publicId
3488 */
3489 if ((UPPER == 'P') && (UPP(1) == 'U') &&
3490 (UPP(2) == 'B') && (UPP(3) == 'L') &&
3491 (UPP(4) == 'I') && (UPP(5) == 'C')) {
3492 SKIP(6);
3493 SKIP_BLANKS;
3494 publicId = htmlParseDoctypeLiteral(ctxt);
3495 if (publicId == NULL)
3496 goto bogus;
3497 SKIP_BLANKS;
3498 URI = htmlParseDoctypeLiteral(ctxt);
3499 } else if ((UPPER == 'S') && (UPP(1) == 'Y') &&
3500 (UPP(2) == 'S') && (UPP(3) == 'T') &&
3501 (UPP(4) == 'E') && (UPP(5) == 'M')) {
3502 SKIP(6);
3503 SKIP_BLANKS;
3504 URI = htmlParseDoctypeLiteral(ctxt);
3505 }
3506
3507 bogus:
3508 htmlSkipBogusDoctype(ctxt);
3509
3510 /*
3511 * Create or update the document accordingly to the DOCTYPE
3512 */
3513 if ((ctxt->sax != NULL) && (ctxt->sax->internalSubset != NULL) &&
3514 (!ctxt->disableSAX))
3515 ctxt->sax->internalSubset(ctxt->userData, name, publicId, URI);
3516
3517 xmlFree(name);
3518 xmlFree(URI);
3519 xmlFree(publicId);
3520 }
3521
3522 /**
3523 * htmlParseAttribute:
3524 * @ctxt: an HTML parser context
3525 * @value: a xmlChar ** used to store the value of the attribute
3526 *
3527 * parse an attribute
3528 *
3529 * [41] Attribute ::= Name Eq AttValue
3530 *
3531 * [25] Eq ::= S? '=' S?
3532 *
3533 * With namespace:
3534 *
3535 * [NS 11] Attribute ::= QName Eq AttValue
3536 *
3537 * Also the case QName == xmlns:??? is handled independently as a namespace
3538 * definition.
3539 *
3540 * Returns the attribute name, and the value in *value.
3541 */
3542
3543 static xmlHashedString
htmlParseAttribute(htmlParserCtxtPtr ctxt,xmlChar ** value)3544 htmlParseAttribute(htmlParserCtxtPtr ctxt, xmlChar **value) {
3545 xmlHashedString hname;
3546 xmlChar *val = NULL;
3547
3548 *value = NULL;
3549 hname = htmlParseHTMLName(ctxt, 1);
3550 if (hname.name == NULL)
3551 return(hname);
3552
3553 /*
3554 * read the value
3555 */
3556 SKIP_BLANKS;
3557 if (CUR == '=') {
3558 SKIP(1);
3559 SKIP_BLANKS;
3560 val = htmlParseAttValue(ctxt);
3561 }
3562
3563 *value = val;
3564 return(hname);
3565 }
3566
3567 /**
3568 * htmlCheckEncoding:
3569 * @ctxt: an HTML parser context
3570 * @attvalue: the attribute value
3571 *
3572 * Checks an http-equiv attribute from a Meta tag to detect
3573 * the encoding
3574 * If a new encoding is detected the parser is switched to decode
3575 * it and pass UTF8
3576 */
3577 static void
htmlCheckEncoding(htmlParserCtxtPtr ctxt,const xmlChar * attvalue)3578 htmlCheckEncoding(htmlParserCtxtPtr ctxt, const xmlChar *attvalue) {
3579 const xmlChar *encoding;
3580 xmlChar *copy;
3581
3582 if (!attvalue)
3583 return;
3584
3585 encoding = xmlStrcasestr(attvalue, BAD_CAST"charset");
3586 if (encoding != NULL) {
3587 encoding += 7;
3588 }
3589 /*
3590 * skip blank
3591 */
3592 if (encoding && IS_WS_HTML(*encoding))
3593 encoding = xmlStrcasestr(attvalue, BAD_CAST"=");
3594 if (encoding && *encoding == '=') {
3595 encoding ++;
3596 copy = xmlStrdup(encoding);
3597 if (copy == NULL)
3598 htmlErrMemory(ctxt);
3599 xmlSetDeclaredEncoding(ctxt, copy);
3600 }
3601 }
3602
3603 /**
3604 * htmlCheckMeta:
3605 * @ctxt: an HTML parser context
3606 * @atts: the attributes values
3607 *
3608 * Checks an attributes from a Meta tag
3609 */
3610 static void
htmlCheckMeta(htmlParserCtxtPtr ctxt,const xmlChar ** atts)3611 htmlCheckMeta(htmlParserCtxtPtr ctxt, const xmlChar **atts) {
3612 int i;
3613 const xmlChar *att, *value;
3614 int http = 0;
3615 const xmlChar *content = NULL;
3616
3617 if ((ctxt == NULL) || (atts == NULL))
3618 return;
3619
3620 i = 0;
3621 att = atts[i++];
3622 while (att != NULL) {
3623 value = atts[i++];
3624 if (value != NULL) {
3625 if ((!xmlStrcasecmp(att, BAD_CAST "http-equiv")) &&
3626 (!xmlStrcasecmp(value, BAD_CAST "Content-Type"))) {
3627 http = 1;
3628 } else if (!xmlStrcasecmp(att, BAD_CAST "charset")) {
3629 xmlChar *copy;
3630
3631 copy = xmlStrdup(value);
3632 if (copy == NULL)
3633 htmlErrMemory(ctxt);
3634 xmlSetDeclaredEncoding(ctxt, copy);
3635 } else if (!xmlStrcasecmp(att, BAD_CAST "content")) {
3636 content = value;
3637 }
3638 }
3639 att = atts[i++];
3640 }
3641 if ((http) && (content != NULL))
3642 htmlCheckEncoding(ctxt, content);
3643
3644 }
3645
3646 /**
3647 * htmlAttrHashInsert:
3648 * @ctxt: parser context
3649 * @size: size of the hash table
3650 * @name: attribute name
3651 * @hashValue: hash value of name
3652 * @aindex: attribute index (this is a multiple of 5)
3653 *
3654 * Inserts a new attribute into the hash table.
3655 *
3656 * Returns INT_MAX if no existing attribute was found, the attribute
3657 * index if an attribute was found, -1 if a memory allocation failed.
3658 */
3659 static int
htmlAttrHashInsert(xmlParserCtxtPtr ctxt,unsigned size,const xmlChar * name,unsigned hashValue,int aindex)3660 htmlAttrHashInsert(xmlParserCtxtPtr ctxt, unsigned size, const xmlChar *name,
3661 unsigned hashValue, int aindex) {
3662 xmlAttrHashBucket *table = ctxt->attrHash;
3663 xmlAttrHashBucket *bucket;
3664 unsigned hindex;
3665
3666 hindex = hashValue & (size - 1);
3667 bucket = &table[hindex];
3668
3669 while (bucket->index >= 0) {
3670 const xmlChar **atts = &ctxt->atts[bucket->index];
3671
3672 if (name == atts[0])
3673 return(bucket->index);
3674
3675 hindex++;
3676 bucket++;
3677 if (hindex >= size) {
3678 hindex = 0;
3679 bucket = table;
3680 }
3681 }
3682
3683 bucket->index = aindex;
3684
3685 return(INT_MAX);
3686 }
3687
3688 /**
3689 * htmlParseStartTag:
3690 * @ctxt: an HTML parser context
3691 *
3692 * parse a start of tag either for rule element or
3693 * EmptyElement. In both case we don't parse the tag closing chars.
3694 *
3695 * [40] STag ::= '<' Name (S Attribute)* S? '>'
3696 *
3697 * [44] EmptyElemTag ::= '<' Name (S Attribute)* S? '/>'
3698 *
3699 * With namespace:
3700 *
3701 * [NS 8] STag ::= '<' QName (S Attribute)* S? '>'
3702 *
3703 * [NS 10] EmptyElement ::= '<' QName (S Attribute)* S? '/>'
3704 *
3705 * Returns 0 in case of success, -1 in case of error and 1 if discarded
3706 */
3707
3708 static void
htmlParseStartTag(htmlParserCtxtPtr ctxt)3709 htmlParseStartTag(htmlParserCtxtPtr ctxt) {
3710 const xmlChar *name;
3711 const xmlChar *attname;
3712 xmlChar *attvalue;
3713 const xmlChar **atts;
3714 int nbatts = 0;
3715 int maxatts;
3716 int meta = 0;
3717 int i;
3718 int discardtag = 0;
3719
3720 ctxt->endCheckState = 0;
3721
3722 SKIP(1);
3723
3724 atts = ctxt->atts;
3725 maxatts = ctxt->maxatts;
3726
3727 GROW;
3728 name = htmlParseHTMLName(ctxt, 0).name;
3729 if (name == NULL)
3730 return;
3731 if (xmlStrEqual(name, BAD_CAST"meta"))
3732 meta = 1;
3733
3734 if ((ctxt->options & HTML_PARSE_HTML5) == 0) {
3735 /*
3736 * Check for auto-closure of HTML elements.
3737 */
3738 htmlAutoClose(ctxt, name);
3739
3740 /*
3741 * Check for implied HTML elements.
3742 */
3743 htmlCheckImplied(ctxt, name);
3744
3745 /*
3746 * Avoid html at any level > 0, head at any level != 1
3747 * or any attempt to recurse body
3748 */
3749 if ((ctxt->nameNr > 0) && (xmlStrEqual(name, BAD_CAST"html"))) {
3750 htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
3751 "htmlParseStartTag: misplaced <html> tag\n",
3752 name, NULL);
3753 discardtag = 1;
3754 ctxt->depth++;
3755 }
3756 if ((ctxt->nameNr != 1) &&
3757 (xmlStrEqual(name, BAD_CAST"head"))) {
3758 htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
3759 "htmlParseStartTag: misplaced <head> tag\n",
3760 name, NULL);
3761 discardtag = 1;
3762 ctxt->depth++;
3763 }
3764 if (xmlStrEqual(name, BAD_CAST"body")) {
3765 int indx;
3766 for (indx = 0;indx < ctxt->nameNr;indx++) {
3767 if (xmlStrEqual(ctxt->nameTab[indx], BAD_CAST"body")) {
3768 htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
3769 "htmlParseStartTag: misplaced <body> tag\n",
3770 name, NULL);
3771 discardtag = 1;
3772 ctxt->depth++;
3773 }
3774 }
3775 }
3776 }
3777
3778 /*
3779 * Now parse the attributes, it ends up with the ending
3780 *
3781 * (S Attribute)* S?
3782 */
3783 SKIP_BLANKS;
3784 while ((ctxt->input->cur < ctxt->input->end) &&
3785 (CUR != '>') &&
3786 ((CUR != '/') || (NXT(1) != '>')) &&
3787 (PARSER_STOPPED(ctxt) == 0)) {
3788 xmlHashedString hattname;
3789
3790 /* unexpected-solidus-in-tag */
3791 if (CUR == '/') {
3792 SKIP(1);
3793 SKIP_BLANKS;
3794 continue;
3795 }
3796 GROW;
3797 hattname = htmlParseAttribute(ctxt, &attvalue);
3798 attname = hattname.name;
3799
3800 if (attname != NULL) {
3801 /*
3802 * Add the pair to atts
3803 */
3804 if (nbatts + 4 > maxatts) {
3805 const xmlChar **tmp;
3806 unsigned *utmp;
3807 size_t newSize = maxatts ? maxatts * 2 : 22;
3808
3809 tmp = xmlMalloc(newSize * sizeof(tmp[0]));
3810 if (tmp == NULL) {
3811 htmlErrMemory(ctxt);
3812 if (attvalue != NULL)
3813 xmlFree(attvalue);
3814 goto failed;
3815 }
3816
3817 utmp = xmlRealloc(ctxt->attallocs,
3818 newSize / 2 * sizeof(utmp[0]));
3819 if (utmp == NULL) {
3820 htmlErrMemory(ctxt);
3821 if (attvalue != NULL)
3822 xmlFree(attvalue);
3823 xmlFree(tmp);
3824 goto failed;
3825 }
3826
3827 if (maxatts > 0)
3828 memcpy(tmp, atts, maxatts * sizeof(tmp[0]));
3829 xmlFree(atts);
3830
3831 atts = tmp;
3832 maxatts = newSize;
3833 ctxt->atts = atts;
3834 ctxt->attallocs = utmp;
3835 ctxt->maxatts = maxatts;
3836 }
3837
3838 ctxt->attallocs[nbatts/2] = hattname.hashValue;
3839 atts[nbatts++] = attname;
3840 atts[nbatts++] = attvalue;
3841 }
3842 else {
3843 if (attvalue != NULL)
3844 xmlFree(attvalue);
3845 }
3846
3847 failed:
3848 SKIP_BLANKS;
3849 }
3850
3851 if (ctxt->input->cur >= ctxt->input->end) {
3852 discardtag = 1;
3853 goto done;
3854 }
3855
3856 /*
3857 * Verify that attribute names are unique.
3858 */
3859 if (nbatts > 2) {
3860 unsigned attrHashSize;
3861 int j, k;
3862
3863 attrHashSize = 4;
3864 while (attrHashSize / 2 < (unsigned) nbatts / 2)
3865 attrHashSize *= 2;
3866
3867 if (attrHashSize > ctxt->attrHashMax) {
3868 xmlAttrHashBucket *tmp;
3869
3870 tmp = xmlRealloc(ctxt->attrHash, attrHashSize * sizeof(tmp[0]));
3871 if (tmp == NULL) {
3872 htmlErrMemory(ctxt);
3873 goto done;
3874 }
3875
3876 ctxt->attrHash = tmp;
3877 ctxt->attrHashMax = attrHashSize;
3878 }
3879
3880 memset(ctxt->attrHash, -1, attrHashSize * sizeof(ctxt->attrHash[0]));
3881
3882 for (i = 0, j = 0, k = 0; i < nbatts; i += 2, k++) {
3883 unsigned hashValue;
3884 int res;
3885
3886 attname = atts[i];
3887 hashValue = ctxt->attallocs[k] | 0x80000000;
3888
3889 res = htmlAttrHashInsert(ctxt, attrHashSize, attname,
3890 hashValue, j);
3891 if (res < 0)
3892 continue;
3893
3894 if (res == INT_MAX) {
3895 atts[j] = atts[i];
3896 atts[j+1] = atts[i+1];
3897 j += 2;
3898 } else {
3899 xmlFree((xmlChar *) atts[i+1]);
3900 }
3901 }
3902
3903 nbatts = j;
3904 }
3905
3906 if (nbatts > 0) {
3907 atts[nbatts] = NULL;
3908 atts[nbatts + 1] = NULL;
3909
3910 /*
3911 * Handle specific association to the META tag
3912 */
3913 if (meta)
3914 htmlCheckMeta(ctxt, atts);
3915 }
3916
3917 /*
3918 * SAX: Start of Element !
3919 */
3920 if (!discardtag) {
3921 if (ctxt->options & HTML_PARSE_HTML5) {
3922 if (ctxt->nameNr > 0)
3923 htmlnamePop(ctxt);
3924 }
3925
3926 htmlnamePush(ctxt, name);
3927 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL)) {
3928 if (nbatts != 0)
3929 ctxt->sax->startElement(ctxt->userData, name, atts);
3930 else
3931 ctxt->sax->startElement(ctxt->userData, name, NULL);
3932 }
3933 }
3934
3935 done:
3936 if (atts != NULL) {
3937 for (i = 1;i < nbatts;i += 2) {
3938 if (atts[i] != NULL)
3939 xmlFree((xmlChar *) atts[i]);
3940 }
3941 }
3942 }
3943
3944 /**
3945 * htmlParseEndTag:
3946 * @ctxt: an HTML parser context
3947 *
3948 * parse an end of tag
3949 *
3950 * [42] ETag ::= '</' Name S? '>'
3951 *
3952 * With namespace
3953 *
3954 * [NS 9] ETag ::= '</' QName S? '>'
3955 *
3956 * Returns 1 if the current level should be closed.
3957 */
3958
3959 static void
htmlParseEndTag(htmlParserCtxtPtr ctxt)3960 htmlParseEndTag(htmlParserCtxtPtr ctxt)
3961 {
3962 const xmlChar *name;
3963 const xmlChar *oldname;
3964 int i;
3965
3966 ctxt->endCheckState = 0;
3967
3968 SKIP(2);
3969
3970 if (CUR == '>') {
3971 SKIP(1);
3972 return;
3973 }
3974
3975 if (!IS_ASCII_LETTER(CUR)) {
3976 htmlParseComment(ctxt, /* bogus */ 1);
3977 return;
3978 }
3979
3980 name = htmlParseHTMLName(ctxt, 0).name;
3981 if (name == NULL)
3982 return;
3983
3984 /*
3985 * Parse and ignore attributes.
3986 */
3987 SKIP_BLANKS;
3988 while ((ctxt->input->cur < ctxt->input->end) &&
3989 (CUR != '>') &&
3990 ((CUR != '/') || (NXT(1) != '>')) &&
3991 (ctxt->instate != XML_PARSER_EOF)) {
3992 xmlChar *attvalue = NULL;
3993
3994 /* unexpected-solidus-in-tag */
3995 if (CUR == '/') {
3996 SKIP(1);
3997 SKIP_BLANKS;
3998 continue;
3999 }
4000 GROW;
4001 htmlParseAttribute(ctxt, &attvalue);
4002 if (attvalue != NULL)
4003 xmlFree(attvalue);
4004
4005 SKIP_BLANKS;
4006 }
4007
4008 if (CUR == '>') {
4009 SKIP(1);
4010 } else if ((CUR == '/') && (NXT(1) == '>')) {
4011 SKIP(2);
4012 } else {
4013 return;
4014 }
4015
4016 if (ctxt->options & HTML_PARSE_HTML5) {
4017 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4018 ctxt->sax->endElement(ctxt->userData, name);
4019 return;
4020 }
4021
4022 /*
4023 * if we ignored misplaced tags in htmlParseStartTag don't pop them
4024 * out now.
4025 */
4026 if ((ctxt->depth > 0) &&
4027 (xmlStrEqual(name, BAD_CAST "html") ||
4028 xmlStrEqual(name, BAD_CAST "body") ||
4029 xmlStrEqual(name, BAD_CAST "head"))) {
4030 ctxt->depth--;
4031 return;
4032 }
4033
4034 /*
4035 * If the name read is not one of the element in the parsing stack
4036 * then return, it's just an error.
4037 */
4038 for (i = (ctxt->nameNr - 1); i >= 0; i--) {
4039 if (xmlStrEqual(name, ctxt->nameTab[i]))
4040 break;
4041 }
4042 if (i < 0) {
4043 htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,
4044 "Unexpected end tag : %s\n", name, NULL);
4045 return;
4046 }
4047
4048
4049 /*
4050 * Check for auto-closure of HTML elements.
4051 */
4052
4053 htmlAutoCloseOnClose(ctxt, name);
4054
4055 /*
4056 * Well formedness constraints, opening and closing must match.
4057 * With the exception that the autoclose may have popped stuff out
4058 * of the stack.
4059 */
4060 if ((ctxt->name != NULL) && (!xmlStrEqual(ctxt->name, name))) {
4061 htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,
4062 "Opening and ending tag mismatch: %s and %s\n",
4063 name, ctxt->name);
4064 }
4065
4066 /*
4067 * SAX: End of Tag
4068 */
4069 oldname = ctxt->name;
4070 if ((oldname != NULL) && (xmlStrEqual(oldname, name))) {
4071 htmlParserFinishElementParsing(ctxt);
4072 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4073 ctxt->sax->endElement(ctxt->userData, name);
4074 htmlnamePop(ctxt);
4075 }
4076 }
4077
4078 /**
4079 * htmlParseContent:
4080 * @ctxt: an HTML parser context
4081 *
4082 * Parse a content: comment, sub-element, reference or text.
4083 * New version for non recursive htmlParseElementInternal
4084 */
4085
4086 static void
htmlParseContent(htmlParserCtxtPtr ctxt)4087 htmlParseContent(htmlParserCtxtPtr ctxt) {
4088 while ((PARSER_STOPPED(ctxt) == 0) &&
4089 (ctxt->input->cur < ctxt->input->end)) {
4090 int mode;
4091
4092 GROW;
4093 mode = ctxt->endCheckState;
4094
4095 if ((mode == 0) && (CUR == '<')) {
4096 if (NXT(1) == '/') {
4097 htmlParseEndTag(ctxt);
4098 } else if (NXT(1) == '!') {
4099 /*
4100 * Sometimes DOCTYPE arrives in the middle of the document
4101 */
4102 if ((UPP(2) == 'D') && (UPP(3) == 'O') &&
4103 (UPP(4) == 'C') && (UPP(5) == 'T') &&
4104 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
4105 (UPP(8) == 'E')) {
4106 htmlParseDocTypeDecl(ctxt);
4107 } else if ((NXT(2) == '-') && (NXT(3) == '-')) {
4108 SKIP(4);
4109 htmlParseComment(ctxt, /* bogus */ 0);
4110 } else {
4111 SKIP(2);
4112 htmlParseComment(ctxt, /* bogus */ 1);
4113 }
4114 } else if (NXT(1) == '?') {
4115 SKIP(1);
4116 htmlParseComment(ctxt, /* bogus */ 1);
4117 } else if (IS_ASCII_LETTER(NXT(1))) {
4118 htmlParseElementInternal(ctxt);
4119 } else {
4120 htmlCheckParagraph(ctxt);
4121 if ((ctxt->sax != NULL) && (!ctxt->disableSAX) &&
4122 (ctxt->sax->characters != NULL))
4123 ctxt->sax->characters(ctxt->userData, BAD_CAST "<", 1);
4124 SKIP(1);
4125 }
4126 } else {
4127 htmlParseCharData(ctxt);
4128 }
4129
4130 SHRINK;
4131 GROW;
4132 }
4133
4134 if (ctxt->input->cur >= ctxt->input->end)
4135 htmlAutoCloseOnEnd(ctxt);
4136 }
4137
4138 /**
4139 * htmlParseElementInternal:
4140 * @ctxt: an HTML parser context
4141 *
4142 * parse an HTML element, new version, non recursive
4143 *
4144 * [39] element ::= EmptyElemTag | STag content ETag
4145 *
4146 * [41] Attribute ::= Name Eq AttValue
4147 */
4148
4149 static int
htmlParseElementInternal(htmlParserCtxtPtr ctxt)4150 htmlParseElementInternal(htmlParserCtxtPtr ctxt) {
4151 const xmlChar *name;
4152 const htmlElemDesc * info;
4153 htmlParserNodeInfo node_info = { NULL, 0, 0, 0, 0 };
4154
4155 if ((ctxt == NULL) || (ctxt->input == NULL))
4156 return(0);
4157
4158 /* Capture start position */
4159 if (ctxt->record_info) {
4160 node_info.begin_pos = ctxt->input->consumed +
4161 (CUR_PTR - ctxt->input->base);
4162 node_info.begin_line = ctxt->input->line;
4163 }
4164
4165 htmlParseStartTag(ctxt);
4166 name = ctxt->name;
4167 if (name == NULL)
4168 return(0);
4169
4170 if (ctxt->record_info)
4171 htmlNodeInfoPush(ctxt, &node_info);
4172
4173 /*
4174 * Check for an Empty Element labeled the XML/SGML way
4175 */
4176 if ((CUR == '/') && (NXT(1) == '>')) {
4177 SKIP(2);
4178 htmlParserFinishElementParsing(ctxt);
4179 if ((ctxt->options & HTML_PARSE_HTML5) == 0) {
4180 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4181 ctxt->sax->endElement(ctxt->userData, name);
4182 }
4183 htmlnamePop(ctxt);
4184 return(0);
4185 }
4186
4187 if (CUR != '>')
4188 return(0);
4189 SKIP(1);
4190
4191 /*
4192 * Lookup the info for that element.
4193 */
4194 info = htmlTagLookup(name);
4195
4196 /*
4197 * Check for an Empty Element from DTD definition
4198 */
4199 if ((info != NULL) && (info->empty)) {
4200 htmlParserFinishElementParsing(ctxt);
4201 if ((ctxt->options & HTML_PARSE_HTML5) == 0) {
4202 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4203 ctxt->sax->endElement(ctxt->userData, name);
4204 }
4205 htmlnamePop(ctxt);
4206 return(0);
4207 }
4208
4209 if (info != NULL)
4210 ctxt->endCheckState = info->dataMode;
4211
4212 return(1);
4213 }
4214
4215 /**
4216 * htmlParseElement:
4217 * @ctxt: an HTML parser context
4218 *
4219 * DEPRECATED: Internal function, don't use.
4220 *
4221 * parse an HTML element, this is highly recursive
4222 * this is kept for compatibility with previous code versions
4223 *
4224 * [39] element ::= EmptyElemTag | STag content ETag
4225 *
4226 * [41] Attribute ::= Name Eq AttValue
4227 */
4228
4229 void
htmlParseElement(htmlParserCtxtPtr ctxt)4230 htmlParseElement(htmlParserCtxtPtr ctxt) {
4231 const xmlChar *oldptr;
4232 int depth;
4233
4234 if ((ctxt == NULL) || (ctxt->input == NULL))
4235 return;
4236
4237 if (htmlParseElementInternal(ctxt) == 0)
4238 return;
4239
4240 /*
4241 * Parse the content of the element:
4242 */
4243 depth = ctxt->nameNr;
4244 while (CUR != 0) {
4245 oldptr = ctxt->input->cur;
4246 htmlParseContent(ctxt);
4247 if (oldptr==ctxt->input->cur) break;
4248 if (ctxt->nameNr < depth) break;
4249 }
4250
4251 if (CUR == 0) {
4252 htmlAutoCloseOnEnd(ctxt);
4253 }
4254 }
4255
4256 xmlNodePtr
htmlCtxtParseContentInternal(htmlParserCtxtPtr ctxt,xmlParserInputPtr input)4257 htmlCtxtParseContentInternal(htmlParserCtxtPtr ctxt, xmlParserInputPtr input) {
4258 xmlNodePtr root;
4259 xmlNodePtr list = NULL;
4260 xmlChar *rootName = BAD_CAST "#root";
4261
4262 root = xmlNewDocNode(ctxt->myDoc, NULL, rootName, NULL);
4263 if (root == NULL) {
4264 htmlErrMemory(ctxt);
4265 return(NULL);
4266 }
4267
4268 if (xmlPushInput(ctxt, input) < 0) {
4269 xmlFreeNode(root);
4270 return(NULL);
4271 }
4272
4273 htmlnamePush(ctxt, rootName);
4274 nodePush(ctxt, root);
4275
4276 htmlParseContent(ctxt);
4277
4278 /* TODO: Use xmlCtxtIsCatastrophicError */
4279 if (ctxt->errNo != XML_ERR_NO_MEMORY) {
4280 xmlNodePtr cur;
4281
4282 /*
4283 * Unlink newly created node list.
4284 */
4285 list = root->children;
4286 root->children = NULL;
4287 root->last = NULL;
4288 for (cur = list; cur != NULL; cur = cur->next)
4289 cur->parent = NULL;
4290 }
4291
4292 nodePop(ctxt);
4293 htmlnamePop(ctxt);
4294
4295 /* xmlPopInput would free the stream */
4296 inputPop(ctxt);
4297
4298 xmlFreeNode(root);
4299 return(list);
4300 }
4301
4302 /**
4303 * htmlParseDocument:
4304 * @ctxt: an HTML parser context
4305 *
4306 * Parse an HTML document and invoke the SAX handlers. This is useful
4307 * if you're only interested in custom SAX callbacks. If you want a
4308 * document tree, use htmlCtxtParseDocument.
4309 *
4310 * Returns 0, -1 in case of error.
4311 */
4312
4313 int
htmlParseDocument(htmlParserCtxtPtr ctxt)4314 htmlParseDocument(htmlParserCtxtPtr ctxt) {
4315 xmlDtdPtr dtd;
4316
4317 if ((ctxt == NULL) || (ctxt->input == NULL))
4318 return(-1);
4319
4320 if ((ctxt->sax) && (ctxt->sax->setDocumentLocator)) {
4321 ctxt->sax->setDocumentLocator(ctxt->userData,
4322 (xmlSAXLocator *) &xmlDefaultSAXLocator);
4323 }
4324
4325 xmlDetectEncoding(ctxt);
4326
4327 /*
4328 * This is wrong but matches long-standing behavior. In most cases,
4329 * a document starting with an XML declaration will specify UTF-8.
4330 */
4331 if (((ctxt->input->flags & XML_INPUT_HAS_ENCODING) == 0) &&
4332 (xmlStrncmp(ctxt->input->cur, BAD_CAST "<?xm", 4) == 0))
4333 xmlSwitchEncoding(ctxt, XML_CHAR_ENCODING_UTF8);
4334
4335 /*
4336 * Wipe out everything which is before the first '<'
4337 */
4338 SKIP_BLANKS;
4339
4340 if ((ctxt->sax) && (ctxt->sax->startDocument) && (!ctxt->disableSAX))
4341 ctxt->sax->startDocument(ctxt->userData);
4342
4343 /*
4344 * Parse possible comments and PIs before any content
4345 */
4346 while (CUR == '<') {
4347 if ((NXT(1) == '!') && (NXT(2) == '-') && (NXT(3) == '-')) {
4348 SKIP(4);
4349 htmlParseComment(ctxt, /* bogus */ 0);
4350 } else if (NXT(1) == '?') {
4351 SKIP(1);
4352 htmlParseComment(ctxt, /* bogus */ 1);
4353 } else {
4354 break;
4355 }
4356 SKIP_BLANKS;
4357 }
4358
4359 /*
4360 * Then possibly doc type declaration(s) and more Misc
4361 * (doctypedecl Misc*)?
4362 */
4363 if ((CUR == '<') && (NXT(1) == '!') &&
4364 (UPP(2) == 'D') && (UPP(3) == 'O') &&
4365 (UPP(4) == 'C') && (UPP(5) == 'T') &&
4366 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
4367 (UPP(8) == 'E')) {
4368 ctxt->instate = XML_PARSER_MISC;
4369 htmlParseDocTypeDecl(ctxt);
4370 }
4371 SKIP_BLANKS;
4372
4373 /*
4374 * Parse possible comments and PIs before any content
4375 */
4376 ctxt->instate = XML_PARSER_PROLOG;
4377 while (CUR == '<') {
4378 if ((NXT(1) == '!') && (NXT(2) == '-') && (NXT(3) == '-')) {
4379 SKIP(4);
4380 htmlParseComment(ctxt, /* bogus */ 0);
4381 } else if (NXT(1) == '?') {
4382 SKIP(1);
4383 htmlParseComment(ctxt, /* bogus */ 1);
4384 } else {
4385 break;
4386 }
4387 SKIP_BLANKS;
4388 }
4389
4390 /*
4391 * Time to start parsing the tree itself
4392 */
4393 ctxt->instate = XML_PARSER_CONTENT;
4394 htmlParseContent(ctxt);
4395
4396 /*
4397 * autoclose
4398 */
4399 if (CUR == 0)
4400 htmlAutoCloseOnEnd(ctxt);
4401
4402
4403 /*
4404 * SAX: end of the document processing.
4405 */
4406 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
4407 ctxt->sax->endDocument(ctxt->userData);
4408
4409 if ((!(ctxt->options & HTML_PARSE_NODEFDTD)) && (ctxt->myDoc != NULL)) {
4410 dtd = xmlGetIntSubset(ctxt->myDoc);
4411 if (dtd == NULL) {
4412 ctxt->myDoc->intSubset =
4413 xmlCreateIntSubset(ctxt->myDoc, BAD_CAST "html",
4414 BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN",
4415 BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd");
4416 if (ctxt->myDoc->intSubset == NULL)
4417 htmlErrMemory(ctxt);
4418 }
4419 }
4420 if (! ctxt->wellFormed) return(-1);
4421 return(0);
4422 }
4423
4424
4425 /************************************************************************
4426 * *
4427 * Parser contexts handling *
4428 * *
4429 ************************************************************************/
4430
4431 /**
4432 * htmlInitParserCtxt:
4433 * @ctxt: an HTML parser context
4434 * @sax: SAX handler
4435 * @userData: user data
4436 *
4437 * Initialize a parser context
4438 *
4439 * Returns 0 in case of success and -1 in case of error
4440 */
4441
4442 static int
htmlInitParserCtxt(htmlParserCtxtPtr ctxt,const htmlSAXHandler * sax,void * userData)4443 htmlInitParserCtxt(htmlParserCtxtPtr ctxt, const htmlSAXHandler *sax,
4444 void *userData)
4445 {
4446 if (ctxt == NULL) return(-1);
4447 memset(ctxt, 0, sizeof(htmlParserCtxt));
4448
4449 ctxt->dict = xmlDictCreate();
4450 if (ctxt->dict == NULL)
4451 return(-1);
4452
4453 if (ctxt->sax == NULL)
4454 ctxt->sax = (htmlSAXHandler *) xmlMalloc(sizeof(htmlSAXHandler));
4455 if (ctxt->sax == NULL)
4456 return(-1);
4457 if (sax == NULL) {
4458 memset(ctxt->sax, 0, sizeof(htmlSAXHandler));
4459 xmlSAX2InitHtmlDefaultSAXHandler(ctxt->sax);
4460 ctxt->userData = ctxt;
4461 } else {
4462 memcpy(ctxt->sax, sax, sizeof(htmlSAXHandler));
4463 ctxt->userData = userData ? userData : ctxt;
4464 }
4465
4466 /* Allocate the Input stack */
4467 ctxt->inputTab = (htmlParserInputPtr *)
4468 xmlMalloc(5 * sizeof(htmlParserInputPtr));
4469 if (ctxt->inputTab == NULL)
4470 return(-1);
4471 ctxt->inputNr = 0;
4472 ctxt->inputMax = 5;
4473 ctxt->input = NULL;
4474 ctxt->version = NULL;
4475 ctxt->encoding = NULL;
4476 ctxt->standalone = -1;
4477 ctxt->instate = XML_PARSER_START;
4478
4479 /* Allocate the Node stack */
4480 ctxt->nodeTab = (htmlNodePtr *) xmlMalloc(10 * sizeof(htmlNodePtr));
4481 if (ctxt->nodeTab == NULL)
4482 return(-1);
4483 ctxt->nodeNr = 0;
4484 ctxt->nodeMax = 10;
4485 ctxt->node = NULL;
4486
4487 /* Allocate the Name stack */
4488 ctxt->nameTab = (const xmlChar **) xmlMalloc(10 * sizeof(xmlChar *));
4489 if (ctxt->nameTab == NULL)
4490 return(-1);
4491 ctxt->nameNr = 0;
4492 ctxt->nameMax = 10;
4493 ctxt->name = NULL;
4494
4495 ctxt->nodeInfoTab = NULL;
4496 ctxt->nodeInfoNr = 0;
4497 ctxt->nodeInfoMax = 0;
4498
4499 ctxt->myDoc = NULL;
4500 ctxt->wellFormed = 1;
4501 ctxt->replaceEntities = 0;
4502 ctxt->linenumbers = xmlLineNumbersDefaultValue;
4503 ctxt->keepBlanks = xmlKeepBlanksDefaultValue;
4504 ctxt->html = 1;
4505 ctxt->vctxt.flags = XML_VCTXT_USE_PCTXT;
4506 ctxt->vctxt.userData = ctxt;
4507 ctxt->vctxt.error = xmlParserValidityError;
4508 ctxt->vctxt.warning = xmlParserValidityWarning;
4509 ctxt->record_info = 0;
4510 ctxt->validate = 0;
4511 ctxt->checkIndex = 0;
4512 ctxt->catalogs = NULL;
4513 xmlInitNodeInfoSeq(&ctxt->node_seq);
4514 return(0);
4515 }
4516
4517 /**
4518 * htmlFreeParserCtxt:
4519 * @ctxt: an HTML parser context
4520 *
4521 * Free all the memory used by a parser context. However the parsed
4522 * document in ctxt->myDoc is not freed.
4523 */
4524
4525 void
htmlFreeParserCtxt(htmlParserCtxtPtr ctxt)4526 htmlFreeParserCtxt(htmlParserCtxtPtr ctxt)
4527 {
4528 xmlFreeParserCtxt(ctxt);
4529 }
4530
4531 /**
4532 * htmlNewParserCtxt:
4533 *
4534 * Allocate and initialize a new HTML parser context.
4535 *
4536 * This can be used to parse HTML documents into DOM trees with
4537 * functions like xmlCtxtReadFile or xmlCtxtReadMemory.
4538 *
4539 * See htmlCtxtUseOptions for parser options.
4540 *
4541 * See xmlCtxtSetErrorHandler for advanced error handling.
4542 *
4543 * See htmlNewSAXParserCtxt for custom SAX parsers.
4544 *
4545 * Returns the htmlParserCtxtPtr or NULL in case of allocation error
4546 */
4547
4548 htmlParserCtxtPtr
htmlNewParserCtxt(void)4549 htmlNewParserCtxt(void)
4550 {
4551 return(htmlNewSAXParserCtxt(NULL, NULL));
4552 }
4553
4554 /**
4555 * htmlNewSAXParserCtxt:
4556 * @sax: SAX handler
4557 * @userData: user data
4558 *
4559 * Allocate and initialize a new HTML SAX parser context. If userData
4560 * is NULL, the parser context will be passed as user data.
4561 *
4562 * Available since 2.11.0. If you want support older versions,
4563 * it's best to invoke htmlNewParserCtxt and set ctxt->sax with
4564 * struct assignment.
4565 *
4566 * Also see htmlNewParserCtxt.
4567 *
4568 * Returns the htmlParserCtxtPtr or NULL in case of allocation error
4569 */
4570
4571 htmlParserCtxtPtr
htmlNewSAXParserCtxt(const htmlSAXHandler * sax,void * userData)4572 htmlNewSAXParserCtxt(const htmlSAXHandler *sax, void *userData)
4573 {
4574 xmlParserCtxtPtr ctxt;
4575
4576 xmlInitParser();
4577
4578 ctxt = (xmlParserCtxtPtr) xmlMalloc(sizeof(xmlParserCtxt));
4579 if (ctxt == NULL)
4580 return(NULL);
4581 memset(ctxt, 0, sizeof(xmlParserCtxt));
4582 if (htmlInitParserCtxt(ctxt, sax, userData) < 0) {
4583 htmlFreeParserCtxt(ctxt);
4584 return(NULL);
4585 }
4586 return(ctxt);
4587 }
4588
4589 static htmlParserCtxtPtr
htmlCreateMemoryParserCtxtInternal(const char * url,const char * buffer,size_t size,const char * encoding)4590 htmlCreateMemoryParserCtxtInternal(const char *url,
4591 const char *buffer, size_t size,
4592 const char *encoding) {
4593 xmlParserCtxtPtr ctxt;
4594 xmlParserInputPtr input;
4595
4596 if (buffer == NULL)
4597 return(NULL);
4598
4599 ctxt = htmlNewParserCtxt();
4600 if (ctxt == NULL)
4601 return(NULL);
4602
4603 input = xmlCtxtNewInputFromMemory(ctxt, url, buffer, size, encoding, 0);
4604 if (input == NULL) {
4605 xmlFreeParserCtxt(ctxt);
4606 return(NULL);
4607 }
4608
4609 if (inputPush(ctxt, input) < 0) {
4610 xmlFreeInputStream(input);
4611 xmlFreeParserCtxt(ctxt);
4612 return(NULL);
4613 }
4614
4615 return(ctxt);
4616 }
4617
4618 /**
4619 * htmlCreateMemoryParserCtxt:
4620 * @buffer: a pointer to a char array
4621 * @size: the size of the array
4622 *
4623 * DEPRECATED: Use htmlNewParserCtxt and htmlCtxtReadMemory.
4624 *
4625 * Create a parser context for an HTML in-memory document. The input
4626 * buffer must not contain any terminating null bytes.
4627 *
4628 * Returns the new parser context or NULL
4629 */
4630 htmlParserCtxtPtr
htmlCreateMemoryParserCtxt(const char * buffer,int size)4631 htmlCreateMemoryParserCtxt(const char *buffer, int size) {
4632 if (size <= 0)
4633 return(NULL);
4634
4635 return(htmlCreateMemoryParserCtxtInternal(NULL, buffer, size, NULL));
4636 }
4637
4638 /**
4639 * htmlCreateDocParserCtxt:
4640 * @str: a pointer to an array of xmlChar
4641 * @encoding: encoding (optional)
4642 *
4643 * Create a parser context for a null-terminated string.
4644 *
4645 * Returns the new parser context or NULL if a memory allocation failed.
4646 */
4647 static htmlParserCtxtPtr
htmlCreateDocParserCtxt(const xmlChar * str,const char * url,const char * encoding)4648 htmlCreateDocParserCtxt(const xmlChar *str, const char *url,
4649 const char *encoding) {
4650 xmlParserCtxtPtr ctxt;
4651 xmlParserInputPtr input;
4652
4653 if (str == NULL)
4654 return(NULL);
4655
4656 ctxt = htmlNewParserCtxt();
4657 if (ctxt == NULL)
4658 return(NULL);
4659
4660 input = xmlCtxtNewInputFromString(ctxt, url, (const char *) str,
4661 encoding, 0);
4662 if (input == NULL) {
4663 xmlFreeParserCtxt(ctxt);
4664 return(NULL);
4665 }
4666
4667 if (inputPush(ctxt, input) < 0) {
4668 xmlFreeInputStream(input);
4669 xmlFreeParserCtxt(ctxt);
4670 return(NULL);
4671 }
4672
4673 return(ctxt);
4674 }
4675
4676 #ifdef LIBXML_PUSH_ENABLED
4677 /************************************************************************
4678 * *
4679 * Progressive parsing interfaces *
4680 * *
4681 ************************************************************************/
4682
/*
 * Scanner states used by htmlParseLookupGt while looking for the '>'
 * that ends a tag. LSTATE_TAG_NAME must stay 0 because the scanner
 * resets ctxt->endCheckState to 0 when it is done.
 */
enum xmlLookupStates {
    LSTATE_TAG_NAME = 0,            /* inside the tag name */
    LSTATE_BEFORE_ATTR_NAME = 1,    /* whitespace before an attribute */
    LSTATE_ATTR_NAME = 2,           /* inside an attribute name */
    LSTATE_AFTER_ATTR_NAME = 3,     /* whitespace after an attribute name */
    LSTATE_BEFORE_ATTR_VALUE = 4,   /* after '=', before the value */
    LSTATE_ATTR_VALUE_DQUOTED = 5,  /* inside a "..." value */
    LSTATE_ATTR_VALUE_SQUOTED = 6,  /* inside a '...' value */
    LSTATE_ATTR_VALUE_UNQUOTED = 7  /* inside an unquoted value */
};
4693
4694 /**
4695 * htmlParseLookupGt:
4696 * @ctxt: an HTML parser context
4697 *
4698 * Check whether there's enough data in the input buffer to finish parsing
4699 * a tag. This has to take quotes into account.
4700 */
4701 static int
htmlParseLookupGt(xmlParserCtxtPtr ctxt)4702 htmlParseLookupGt(xmlParserCtxtPtr ctxt) {
4703 const xmlChar *cur;
4704 const xmlChar *end = ctxt->input->end;
4705 int state = ctxt->endCheckState;
4706 size_t index;
4707
4708 if (ctxt->checkIndex == 0)
4709 cur = ctxt->input->cur + 2; /* Skip '<a' or '</' */
4710 else
4711 cur = ctxt->input->cur + ctxt->checkIndex;
4712
4713 while (cur < end) {
4714 int c = *cur++;
4715
4716 if (state != LSTATE_ATTR_VALUE_SQUOTED &&
4717 state != LSTATE_ATTR_VALUE_DQUOTED) {
4718 if (c == '/' &&
4719 state != LSTATE_BEFORE_ATTR_VALUE &&
4720 state != LSTATE_ATTR_VALUE_UNQUOTED) {
4721 state = LSTATE_BEFORE_ATTR_NAME;
4722 continue;
4723 } else if (c == '>') {
4724 ctxt->checkIndex = 0;
4725 ctxt->endCheckState = 0;
4726 return(0);
4727 }
4728 }
4729
4730 switch (state) {
4731 case LSTATE_TAG_NAME:
4732 if (IS_WS_HTML(c))
4733 state = LSTATE_BEFORE_ATTR_NAME;
4734 break;
4735
4736 case LSTATE_BEFORE_ATTR_NAME:
4737 if (!IS_WS_HTML(c))
4738 state = LSTATE_ATTR_NAME;
4739 break;
4740
4741 case LSTATE_ATTR_NAME:
4742 if (c == '=')
4743 state = LSTATE_BEFORE_ATTR_VALUE;
4744 else if (IS_WS_HTML(c))
4745 state = LSTATE_AFTER_ATTR_NAME;
4746 break;
4747
4748 case LSTATE_AFTER_ATTR_NAME:
4749 if (c == '=')
4750 state = LSTATE_BEFORE_ATTR_VALUE;
4751 else if (!IS_WS_HTML(c))
4752 state = LSTATE_ATTR_NAME;
4753 break;
4754
4755 case LSTATE_BEFORE_ATTR_VALUE:
4756 if (c == '"')
4757 state = LSTATE_ATTR_VALUE_DQUOTED;
4758 else if (c == '\'')
4759 state = LSTATE_ATTR_VALUE_SQUOTED;
4760 else if (!IS_WS_HTML(c))
4761 state = LSTATE_ATTR_VALUE_UNQUOTED;
4762 break;
4763
4764 case LSTATE_ATTR_VALUE_DQUOTED:
4765 if (c == '"')
4766 state = LSTATE_BEFORE_ATTR_NAME;
4767 break;
4768
4769 case LSTATE_ATTR_VALUE_SQUOTED:
4770 if (c == '\'')
4771 state = LSTATE_BEFORE_ATTR_NAME;
4772 break;
4773
4774 case LSTATE_ATTR_VALUE_UNQUOTED:
4775 if (IS_WS_HTML(c))
4776 state = LSTATE_BEFORE_ATTR_NAME;
4777 break;
4778 }
4779 }
4780
4781 index = cur - ctxt->input->cur;
4782 if (index > LONG_MAX) {
4783 ctxt->checkIndex = 0;
4784 ctxt->endCheckState = 0;
4785 return(0);
4786 }
4787 ctxt->checkIndex = index;
4788 ctxt->endCheckState = state;
4789 return(-1);
4790 }
4791
4792 /**
4793 * htmlParseLookupString:
4794 * @ctxt: an XML parser context
4795 * @startDelta: delta to apply at the start
4796 * @str: string
4797 * @strLen: length of string
4798 *
4799 * Check whether the input buffer contains a string.
4800 */
4801 static int
htmlParseLookupString(xmlParserCtxtPtr ctxt,size_t startDelta,const char * str,size_t strLen,size_t extraLen)4802 htmlParseLookupString(xmlParserCtxtPtr ctxt, size_t startDelta,
4803 const char *str, size_t strLen, size_t extraLen) {
4804 const xmlChar *end = ctxt->input->end;
4805 const xmlChar *cur, *term;
4806 size_t index, rescan;
4807 int ret;
4808
4809 if (ctxt->checkIndex == 0) {
4810 cur = ctxt->input->cur + startDelta;
4811 } else {
4812 cur = ctxt->input->cur + ctxt->checkIndex;
4813 }
4814
4815 term = BAD_CAST strstr((const char *) cur, str);
4816 if ((term != NULL) &&
4817 ((size_t) (ctxt->input->end - term) >= extraLen + 1)) {
4818 ctxt->checkIndex = 0;
4819
4820 if (term - ctxt->input->cur > INT_MAX / 2)
4821 ret = INT_MAX / 2;
4822 else
4823 ret = term - ctxt->input->cur;
4824
4825 return(ret);
4826 }
4827
4828 /* Rescan (strLen + extraLen - 1) characters. */
4829 rescan = strLen + extraLen - 1;
4830 if ((size_t) (end - cur) <= rescan)
4831 end = cur;
4832 else
4833 end -= rescan;
4834 index = end - ctxt->input->cur;
4835 if (index > INT_MAX / 2) {
4836 ctxt->checkIndex = 0;
4837 ret = INT_MAX / 2;
4838 } else {
4839 ctxt->checkIndex = index;
4840 ret = -1;
4841 }
4842
4843 return(ret);
4844 }
4845
4846 /**
4847 * htmlParseLookupCommentEnd:
4848 * @ctxt: an HTML parser context
4849 *
4850 * Try to find a comment end tag in the input stream
4851 * The search includes "-->" as well as WHATWG-recommended incorrectly-closed tags.
4852 * (See https://html.spec.whatwg.org/multipage/parsing.html#parse-error-incorrectly-closed-comment)
4853 * This function has a side effect of (possibly) incrementing ctxt->checkIndex
4854 * to avoid rescanning sequences of bytes, it DOES change the state of the
4855 * parser, do not use liberally.
4856 *
4857 * Returns the index to the current parsing point if the full sequence is available, -1 otherwise.
4858 */
4859 static int
htmlParseLookupCommentEnd(htmlParserCtxtPtr ctxt)4860 htmlParseLookupCommentEnd(htmlParserCtxtPtr ctxt)
4861 {
4862 int mark = 0;
4863 int offset;
4864
4865 while (1) {
4866 mark = htmlParseLookupString(ctxt, 2, "--", 2, 0);
4867 if (mark < 0)
4868 break;
4869 if ((NXT(mark+2) == '>') ||
4870 ((NXT(mark+2) == '!') && (NXT(mark+3) == '>'))) {
4871 ctxt->checkIndex = 0;
4872 break;
4873 }
4874 offset = (NXT(mark+2) == '!') ? 3 : 2;
4875 if (mark + offset >= ctxt->input->end - ctxt->input->cur) {
4876 ctxt->checkIndex = mark;
4877 return(-1);
4878 }
4879 ctxt->checkIndex = mark + 1;
4880 }
4881 return mark;
4882 }
4883
4884
4885 /**
4886 * htmlParseTryOrFinish:
4887 * @ctxt: an HTML parser context
4888 * @terminate: last chunk indicator
4889 *
4890 * Try to progress on parsing
4891 *
4892 * Returns zero if no parsing was possible
4893 */
4894 static int
htmlParseTryOrFinish(htmlParserCtxtPtr ctxt,int terminate)4895 htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) {
4896 int ret = 0;
4897 htmlParserInputPtr in;
4898 ptrdiff_t avail = 0;
4899 int cur;
4900
4901 htmlParserNodeInfo node_info;
4902
4903 while (PARSER_STOPPED(ctxt) == 0) {
4904
4905 in = ctxt->input;
4906 if (in == NULL) break;
4907 avail = in->end - in->cur;
4908 if ((avail == 0) && (terminate)) {
4909 htmlAutoCloseOnEnd(ctxt);
4910 if ((ctxt->nameNr == 0) && (ctxt->instate != XML_PARSER_EOF)) {
4911 /*
4912 * SAX: end of the document processing.
4913 */
4914 ctxt->instate = XML_PARSER_EOF;
4915 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
4916 ctxt->sax->endDocument(ctxt->userData);
4917 }
4918 }
4919 if (avail < 1)
4920 goto done;
4921 cur = in->cur[0];
4922
4923 switch (ctxt->instate) {
4924 case XML_PARSER_EOF:
4925 /*
4926 * Document parsing is done !
4927 */
4928 goto done;
4929 case XML_PARSER_START:
4930 /*
4931 * This is wrong but matches long-standing behavior. In most
4932 * cases, a document starting with an XML declaration will
4933 * specify UTF-8.
4934 */
4935 if (((ctxt->input->flags & XML_INPUT_HAS_ENCODING) == 0) &&
4936 (xmlStrncmp(ctxt->input->cur, BAD_CAST "<?xm", 4) == 0)) {
4937 xmlSwitchEncoding(ctxt, XML_CHAR_ENCODING_UTF8);
4938 }
4939
4940 if ((ctxt->sax) && (ctxt->sax->setDocumentLocator)) {
4941 ctxt->sax->setDocumentLocator(ctxt->userData,
4942 (xmlSAXLocator *) &xmlDefaultSAXLocator);
4943 }
4944 if ((ctxt->sax) && (ctxt->sax->startDocument) &&
4945 (!ctxt->disableSAX))
4946 ctxt->sax->startDocument(ctxt->userData);
4947
4948 /* Allow callback to modify state */
4949 if (ctxt->instate == XML_PARSER_START)
4950 ctxt->instate = XML_PARSER_MISC;
4951 break;
4952 case XML_PARSER_START_TAG: {
4953 const xmlChar *name;
4954 int next;
4955 const htmlElemDesc * info;
4956
4957 /*
4958 * not enough chars in buffer
4959 */
4960 if (avail < 2)
4961 goto done;
4962 cur = in->cur[0];
4963 next = in->cur[1];
4964 if (cur != '<') {
4965 ctxt->instate = XML_PARSER_CONTENT;
4966 break;
4967 }
4968 if (next == '/') {
4969 ctxt->instate = XML_PARSER_END_TAG;
4970 ctxt->checkIndex = 0;
4971 break;
4972 }
4973 if ((!terminate) &&
4974 (htmlParseLookupGt(ctxt) < 0))
4975 goto done;
4976
4977 /* Capture start position */
4978 if (ctxt->record_info) {
4979 node_info.begin_pos = ctxt->input->consumed +
4980 (CUR_PTR - ctxt->input->base);
4981 node_info.begin_line = ctxt->input->line;
4982 }
4983
4984
4985 htmlParseStartTag(ctxt);
4986 name = ctxt->name;
4987 if (name == NULL)
4988 break;
4989
4990 /*
4991 * Check for an Empty Element labeled the XML/SGML way
4992 */
4993 if ((CUR == '/') && (NXT(1) == '>')) {
4994 SKIP(2);
4995 htmlParserFinishElementParsing(ctxt);
4996 if ((ctxt->options & HTML_PARSE_HTML5) == 0) {
4997 if ((ctxt->sax != NULL) &&
4998 (ctxt->sax->endElement != NULL))
4999 ctxt->sax->endElement(ctxt->userData, name);
5000 }
5001 htmlnamePop(ctxt);
5002 ctxt->instate = XML_PARSER_CONTENT;
5003 break;
5004 }
5005
5006 if (CUR != '>')
5007 break;
5008 SKIP(1);
5009
5010 /*
5011 * Lookup the info for that element.
5012 */
5013 info = htmlTagLookup(name);
5014
5015 /*
5016 * Check for an Empty Element from DTD definition
5017 */
5018 if ((info != NULL) && (info->empty)) {
5019 htmlParserFinishElementParsing(ctxt);
5020 if ((ctxt->options & HTML_PARSE_HTML5) == 0) {
5021 if ((ctxt->sax != NULL) &&
5022 (ctxt->sax->endElement != NULL))
5023 ctxt->sax->endElement(ctxt->userData, name);
5024 }
5025 htmlnamePop(ctxt);
5026 }
5027
5028 if (info != NULL)
5029 ctxt->endCheckState = info->dataMode;
5030
5031 if (ctxt->record_info)
5032 htmlNodeInfoPush(ctxt, &node_info);
5033
5034 ctxt->instate = XML_PARSER_CONTENT;
5035 break;
5036 }
5037 case XML_PARSER_MISC:
5038 case XML_PARSER_PROLOG:
5039 case XML_PARSER_CONTENT:
5040 case XML_PARSER_EPILOG: {
5041 int mode;
5042
5043 if ((ctxt->instate == XML_PARSER_MISC) ||
5044 (ctxt->instate == XML_PARSER_PROLOG)) {
5045 SKIP_BLANKS;
5046 avail = in->end - in->cur;
5047 }
5048
5049 if (avail < 1)
5050 goto done;
5051 cur = in->cur[0];
5052 mode = ctxt->endCheckState;
5053
5054 if (mode != 0) {
5055 while ((PARSER_STOPPED(ctxt) == 0) &&
5056 (in->cur < in->end)) {
5057 size_t extra;
5058
5059 extra = strlen((const char *) ctxt->name) + 2;
5060
5061 if ((!terminate) &&
5062 (htmlParseLookupString(ctxt, 0, "<", 1,
5063 extra) < 0))
5064 goto done;
5065 ctxt->checkIndex = 0;
5066
5067 if (htmlParseCharData(ctxt))
5068 break;
5069 }
5070
5071 break;
5072 } else if (cur == '<') {
5073 int next;
5074
5075 if (avail < 2) {
5076 if (!terminate)
5077 goto done;
5078 next = ' ';
5079 } else {
5080 next = in->cur[1];
5081 }
5082
5083 if (next == '!') {
5084 if ((!terminate) && (avail < 4))
5085 goto done;
5086 if ((in->cur[2] == '-') && (in->cur[3] == '-')) {
5087 if ((!terminate) &&
5088 (htmlParseLookupCommentEnd(ctxt) < 0))
5089 goto done;
5090 SKIP(4);
5091 htmlParseComment(ctxt, /* bogus */ 0);
5092 break;
5093 }
5094
5095 if ((!terminate) && (avail < 9))
5096 goto done;
5097 if ((UPP(2) == 'D') && (UPP(3) == 'O') &&
5098 (UPP(4) == 'C') && (UPP(5) == 'T') &&
5099 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
5100 (UPP(8) == 'E')) {
5101 if ((!terminate) &&
5102 (htmlParseLookupString(ctxt, 9, ">", 1,
5103 0) < 0))
5104 goto done;
5105 htmlParseDocTypeDecl(ctxt);
5106 if (ctxt->instate == XML_PARSER_MISC)
5107 ctxt->instate = XML_PARSER_PROLOG;
5108 } else {
5109 if ((!terminate) &&
5110 (htmlParseLookupString(ctxt, 2, ">", 1, 0) < 0))
5111 goto done;
5112 SKIP(2);
5113 htmlParseComment(ctxt, /* bogus */ 1);
5114 }
5115 } else if (next == '?') {
5116 if ((!terminate) &&
5117 (htmlParseLookupString(ctxt, 2, ">", 1, 0) < 0))
5118 goto done;
5119 SKIP(1);
5120 htmlParseComment(ctxt, /* bogus */ 1);
5121 } else if (next == '/') {
5122 ctxt->instate = XML_PARSER_END_TAG;
5123 ctxt->checkIndex = 0;
5124 break;
5125 } else if (IS_ASCII_LETTER(next)) {
5126 if ((!terminate) && (next == 0))
5127 goto done;
5128 ctxt->instate = XML_PARSER_START_TAG;
5129 ctxt->checkIndex = 0;
5130 break;
5131 } else {
5132 ctxt->instate = XML_PARSER_CONTENT;
5133 htmlCheckParagraph(ctxt);
5134 if ((ctxt->sax != NULL) && (!ctxt->disableSAX) &&
5135 (ctxt->sax->characters != NULL))
5136 ctxt->sax->characters(ctxt->userData,
5137 BAD_CAST "<", 1);
5138 SKIP(1);
5139 }
5140 } else {
5141 /*
5142 * check that the text sequence is complete
5143 * before handing out the data to the parser
5144 * to avoid problems with erroneous end of
5145 * data detection.
5146 */
5147 if ((!terminate) &&
5148 (htmlParseLookupString(ctxt, 0, "<", 1, 0) < 0))
5149 goto done;
5150 ctxt->checkIndex = 0;
5151 htmlParseCharData(ctxt);
5152 }
5153
5154 break;
5155 }
5156 case XML_PARSER_END_TAG:
5157 if ((terminate) && (avail == 2)) {
5158 htmlCheckParagraph(ctxt);
5159 if ((ctxt->sax != NULL) && (!ctxt->disableSAX) &&
5160 (ctxt->sax->characters != NULL))
5161 ctxt->sax->characters(ctxt->userData,
5162 BAD_CAST "</", 2);
5163 goto done;
5164 }
5165 if ((!terminate) &&
5166 (htmlParseLookupGt(ctxt) < 0))
5167 goto done;
5168 htmlParseEndTag(ctxt);
5169 if (ctxt->nameNr == 0) {
5170 ctxt->instate = XML_PARSER_EPILOG;
5171 } else {
5172 ctxt->instate = XML_PARSER_CONTENT;
5173 }
5174 ctxt->checkIndex = 0;
5175 break;
5176 default:
5177 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5178 "HPP: internal error\n", NULL, NULL);
5179 ctxt->instate = XML_PARSER_EOF;
5180 break;
5181 }
5182 }
5183 done:
5184 if ((avail == 0) && (terminate)) {
5185 htmlAutoCloseOnEnd(ctxt);
5186 if ((ctxt->nameNr == 0) && (ctxt->instate != XML_PARSER_EOF)) {
5187 /*
5188 * SAX: end of the document processing.
5189 */
5190 ctxt->instate = XML_PARSER_EOF;
5191 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
5192 ctxt->sax->endDocument(ctxt->userData);
5193 }
5194 }
5195 if ((!(ctxt->options & HTML_PARSE_NODEFDTD)) && (ctxt->myDoc != NULL) &&
5196 ((terminate) || (ctxt->instate == XML_PARSER_EOF) ||
5197 (ctxt->instate == XML_PARSER_EPILOG))) {
5198 xmlDtdPtr dtd;
5199 dtd = xmlGetIntSubset(ctxt->myDoc);
5200 if (dtd == NULL) {
5201 ctxt->myDoc->intSubset =
5202 xmlCreateIntSubset(ctxt->myDoc, BAD_CAST "html",
5203 BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN",
5204 BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd");
5205 if (ctxt->myDoc->intSubset == NULL)
5206 htmlErrMemory(ctxt);
5207 }
5208 }
5209 return(ret);
5210 }
5211
5212 /**
5213 * htmlParseChunk:
5214 * @ctxt: an HTML parser context
5215 * @chunk: chunk of memory
5216 * @size: size of chunk in bytes
5217 * @terminate: last chunk indicator
5218 *
5219 * Parse a chunk of memory in push parser mode.
5220 *
5221 * Assumes that the parser context was initialized with
5222 * htmlCreatePushParserCtxt.
5223 *
5224 * The last chunk, which will often be empty, must be marked with
5225 * the @terminate flag. With the default SAX callbacks, the resulting
5226 * document will be available in ctxt->myDoc. This pointer will not
5227 * be freed by the library.
5228 *
5229 * If the document isn't well-formed, ctxt->myDoc is set to NULL.
5230 *
5231 * Returns an xmlParserErrors code (0 on success).
5232 */
5233 int
htmlParseChunk(htmlParserCtxtPtr ctxt,const char * chunk,int size,int terminate)5234 htmlParseChunk(htmlParserCtxtPtr ctxt, const char *chunk, int size,
5235 int terminate) {
5236 if ((ctxt == NULL) || (ctxt->input == NULL))
5237 return(XML_ERR_ARGUMENT);
5238 if (PARSER_STOPPED(ctxt) != 0)
5239 return(ctxt->errNo);
5240 if ((size > 0) && (chunk != NULL) && (ctxt->input != NULL) &&
5241 (ctxt->input->buf != NULL)) {
5242 size_t pos = ctxt->input->cur - ctxt->input->base;
5243 int res;
5244
5245 res = xmlParserInputBufferPush(ctxt->input->buf, size, chunk);
5246 xmlBufUpdateInput(ctxt->input->buf->buffer, ctxt->input, pos);
5247 if (res < 0) {
5248 htmlParseErr(ctxt, ctxt->input->buf->error,
5249 "xmlParserInputBufferPush failed", NULL, NULL);
5250 xmlHaltParser(ctxt);
5251 return (ctxt->errNo);
5252 }
5253 }
5254 htmlParseTryOrFinish(ctxt, terminate);
5255 if (terminate) {
5256 if (ctxt->instate != XML_PARSER_EOF) {
5257 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
5258 ctxt->sax->endDocument(ctxt->userData);
5259 }
5260 ctxt->instate = XML_PARSER_EOF;
5261 }
5262 return((xmlParserErrors) ctxt->errNo);
5263 }
5264
5265 /************************************************************************
5266 * *
5267 * User entry points *
5268 * *
5269 ************************************************************************/
5270
5271 /**
5272 * htmlCreatePushParserCtxt:
5273 * @sax: a SAX handler (optional)
5274 * @user_data: The user data returned on SAX callbacks (optional)
5275 * @chunk: a pointer to an array of chars (optional)
5276 * @size: number of chars in the array
5277 * @filename: only used for error reporting (optional)
5278 * @enc: encoding (deprecated, pass XML_CHAR_ENCODING_NONE)
5279 *
5280 * Create a parser context for using the HTML parser in push mode.
5281 *
5282 * Returns the new parser context or NULL if a memory allocation
5283 * failed.
5284 */
5285 htmlParserCtxtPtr
htmlCreatePushParserCtxt(htmlSAXHandlerPtr sax,void * user_data,const char * chunk,int size,const char * filename,xmlCharEncoding enc)5286 htmlCreatePushParserCtxt(htmlSAXHandlerPtr sax, void *user_data,
5287 const char *chunk, int size, const char *filename,
5288 xmlCharEncoding enc) {
5289 htmlParserCtxtPtr ctxt;
5290 htmlParserInputPtr input;
5291 const char *encoding;
5292
5293 ctxt = htmlNewSAXParserCtxt(sax, user_data);
5294 if (ctxt == NULL)
5295 return(NULL);
5296
5297 encoding = xmlGetCharEncodingName(enc);
5298 input = xmlNewPushInput(filename, chunk, size);
5299 if (input == NULL) {
5300 htmlFreeParserCtxt(ctxt);
5301 return(NULL);
5302 }
5303
5304 if (inputPush(ctxt, input) < 0) {
5305 xmlFreeInputStream(input);
5306 xmlFreeParserCtxt(ctxt);
5307 return(NULL);
5308 }
5309
5310 if (encoding != NULL)
5311 xmlSwitchEncodingName(ctxt, encoding);
5312
5313 return(ctxt);
5314 }
5315 #endif /* LIBXML_PUSH_ENABLED */
5316
5317 /**
5318 * htmlSAXParseDoc:
5319 * @cur: a pointer to an array of xmlChar
5320 * @encoding: a free form C string describing the HTML document encoding, or NULL
5321 * @sax: the SAX handler block
5322 * @userData: if using SAX, this pointer will be provided on callbacks.
5323 *
5324 * DEPRECATED: Use htmlNewSAXParserCtxt and htmlCtxtReadDoc.
5325 *
5326 * Parse an HTML in-memory document. If sax is not NULL, use the SAX callbacks
5327 * to handle parse events. If sax is NULL, fallback to the default DOM
5328 * behavior and return a tree.
5329 *
5330 * Returns the resulting document tree unless SAX is NULL or the document is
5331 * not well formed.
5332 */
5333
5334 htmlDocPtr
htmlSAXParseDoc(const xmlChar * cur,const char * encoding,htmlSAXHandlerPtr sax,void * userData)5335 htmlSAXParseDoc(const xmlChar *cur, const char *encoding,
5336 htmlSAXHandlerPtr sax, void *userData) {
5337 htmlDocPtr ret;
5338 htmlParserCtxtPtr ctxt;
5339
5340 if (cur == NULL)
5341 return(NULL);
5342
5343 ctxt = htmlCreateDocParserCtxt(cur, NULL, encoding);
5344 if (ctxt == NULL)
5345 return(NULL);
5346
5347 if (sax != NULL) {
5348 *ctxt->sax = *sax;
5349 ctxt->userData = userData;
5350 }
5351
5352 htmlParseDocument(ctxt);
5353 ret = ctxt->myDoc;
5354 htmlFreeParserCtxt(ctxt);
5355
5356 return(ret);
5357 }
5358
5359 /**
5360 * htmlParseDoc:
5361 * @cur: a pointer to an array of xmlChar
5362 * @encoding: the encoding (optional)
5363 *
5364 * DEPRECATED: Use htmlReadDoc.
5365 *
5366 * Parse an HTML in-memory document and build a tree.
5367 *
5368 * This function uses deprecated global parser options.
5369 *
5370 * Returns the resulting document tree
5371 */
5372
5373 htmlDocPtr
htmlParseDoc(const xmlChar * cur,const char * encoding)5374 htmlParseDoc(const xmlChar *cur, const char *encoding) {
5375 return(htmlSAXParseDoc(cur, encoding, NULL, NULL));
5376 }
5377
5378
5379 /**
5380 * htmlCreateFileParserCtxt:
5381 * @filename: the filename
5382 * @encoding: optional encoding
5383 *
5384 * DEPRECATED: Use htmlNewParserCtxt and htmlCtxtReadFile.
5385 *
5386 * Create a parser context to read from a file.
5387 *
5388 * A non-NULL encoding overrides encoding declarations in the document.
5389 *
5390 * Automatic support for ZLIB/Compress compressed document is provided
5391 * by default if found at compile-time.
5392 *
5393 * Returns the new parser context or NULL if a memory allocation failed.
5394 */
5395 htmlParserCtxtPtr
htmlCreateFileParserCtxt(const char * filename,const char * encoding)5396 htmlCreateFileParserCtxt(const char *filename, const char *encoding)
5397 {
5398 htmlParserCtxtPtr ctxt;
5399 htmlParserInputPtr input;
5400
5401 if (filename == NULL)
5402 return(NULL);
5403
5404 ctxt = htmlNewParserCtxt();
5405 if (ctxt == NULL) {
5406 return(NULL);
5407 }
5408
5409 input = xmlCtxtNewInputFromUrl(ctxt, filename, NULL, encoding, 0);
5410 if (input == NULL) {
5411 xmlFreeParserCtxt(ctxt);
5412 return(NULL);
5413 }
5414 if (inputPush(ctxt, input) < 0) {
5415 xmlFreeInputStream(input);
5416 xmlFreeParserCtxt(ctxt);
5417 return(NULL);
5418 }
5419
5420 return(ctxt);
5421 }
5422
5423 /**
5424 * htmlSAXParseFile:
5425 * @filename: the filename
5426 * @encoding: encoding (optional)
5427 * @sax: the SAX handler block
5428 * @userData: if using SAX, this pointer will be provided on callbacks.
5429 *
5430 * DEPRECATED: Use htmlNewSAXParserCtxt and htmlCtxtReadFile.
5431 *
5432 * parse an HTML file and build a tree. Automatic support for ZLIB/Compress
5433 * compressed document is provided by default if found at compile-time.
5434 * It use the given SAX function block to handle the parsing callback.
5435 * If sax is NULL, fallback to the default DOM tree building routines.
5436 *
5437 * Returns the resulting document tree unless SAX is NULL or the document is
5438 * not well formed.
5439 */
5440
5441 htmlDocPtr
htmlSAXParseFile(const char * filename,const char * encoding,htmlSAXHandlerPtr sax,void * userData)5442 htmlSAXParseFile(const char *filename, const char *encoding, htmlSAXHandlerPtr sax,
5443 void *userData) {
5444 htmlDocPtr ret;
5445 htmlParserCtxtPtr ctxt;
5446 htmlSAXHandlerPtr oldsax = NULL;
5447
5448 ctxt = htmlCreateFileParserCtxt(filename, encoding);
5449 if (ctxt == NULL) return(NULL);
5450 if (sax != NULL) {
5451 oldsax = ctxt->sax;
5452 ctxt->sax = sax;
5453 ctxt->userData = userData;
5454 }
5455
5456 htmlParseDocument(ctxt);
5457
5458 ret = ctxt->myDoc;
5459 if (sax != NULL) {
5460 ctxt->sax = oldsax;
5461 ctxt->userData = NULL;
5462 }
5463 htmlFreeParserCtxt(ctxt);
5464
5465 return(ret);
5466 }
5467
5468 /**
5469 * htmlParseFile:
5470 * @filename: the filename
5471 * @encoding: encoding (optional)
5472 *
5473 * Parse an HTML file and build a tree.
5474 *
5475 * Returns the resulting document tree
5476 */
5477
5478 htmlDocPtr
htmlParseFile(const char * filename,const char * encoding)5479 htmlParseFile(const char *filename, const char *encoding) {
5480 return(htmlSAXParseFile(filename, encoding, NULL, NULL));
5481 }
5482
5483 /**
5484 * htmlHandleOmittedElem:
5485 * @val: int 0 or 1
5486 *
5487 * DEPRECATED: Use HTML_PARSE_NOIMPLIED
5488 *
5489 * Set and return the previous value for handling HTML omitted tags.
5490 *
5491 * Returns the last value for 0 for no handling, 1 for auto insertion.
5492 */
5493
5494 int
htmlHandleOmittedElem(int val)5495 htmlHandleOmittedElem(int val) {
5496 int old = htmlOmittedDefaultValue;
5497
5498 htmlOmittedDefaultValue = val;
5499 return(old);
5500 }
5501
5502 /**
5503 * htmlElementAllowedHere:
5504 * @parent: HTML parent element
5505 * @elt: HTML element
5506 *
5507 * DEPRECATED: Don't use.
5508 *
5509 * Returns 1
5510 */
5511 int
htmlElementAllowedHere(const htmlElemDesc * parent ATTRIBUTE_UNUSED,const xmlChar * elt ATTRIBUTE_UNUSED)5512 htmlElementAllowedHere(const htmlElemDesc* parent ATTRIBUTE_UNUSED,
5513 const xmlChar* elt ATTRIBUTE_UNUSED) {
5514 return(1);
5515 }
5516
5517 /**
5518 * htmlElementStatusHere:
5519 * @parent: HTML parent element
5520 * @elt: HTML element
5521 *
5522 * DEPRECATED: Don't use.
5523 *
5524 * Returns HTML_VALID
5525 */
5526 htmlStatus
htmlElementStatusHere(const htmlElemDesc * parent ATTRIBUTE_UNUSED,const htmlElemDesc * elt ATTRIBUTE_UNUSED)5527 htmlElementStatusHere(const htmlElemDesc* parent ATTRIBUTE_UNUSED,
5528 const htmlElemDesc* elt ATTRIBUTE_UNUSED) {
5529 return(HTML_VALID);
5530 }
5531
5532 /**
5533 * htmlAttrAllowed:
5534 * @elt: HTML element
5535 * @attr: HTML attribute
5536 * @legacy: whether to allow deprecated attributes
5537 *
5538 * DEPRECATED: Don't use.
5539 *
5540 * Returns HTML_VALID
5541 */
5542 htmlStatus
htmlAttrAllowed(const htmlElemDesc * elt ATTRIBUTE_UNUSED,const xmlChar * attr ATTRIBUTE_UNUSED,int legacy ATTRIBUTE_UNUSED)5543 htmlAttrAllowed(const htmlElemDesc* elt ATTRIBUTE_UNUSED,
5544 const xmlChar* attr ATTRIBUTE_UNUSED,
5545 int legacy ATTRIBUTE_UNUSED) {
5546 return(HTML_VALID);
5547 }
5548
5549 /**
5550 * htmlNodeStatus:
5551 * @node: an htmlNodePtr in a tree
5552 * @legacy: whether to allow deprecated elements (YES is faster here
5553 * for Element nodes)
5554 *
5555 * DEPRECATED: Don't use.
5556 *
5557 * Returns HTML_VALID
5558 */
5559 htmlStatus
htmlNodeStatus(htmlNodePtr node ATTRIBUTE_UNUSED,int legacy ATTRIBUTE_UNUSED)5560 htmlNodeStatus(htmlNodePtr node ATTRIBUTE_UNUSED,
5561 int legacy ATTRIBUTE_UNUSED) {
5562 return(HTML_VALID);
5563 }
5564
5565 /************************************************************************
5566 * *
5567 * New set (2.6.0) of simpler and more flexible APIs *
5568 * *
5569 ************************************************************************/
/**
 * DICT_FREE:
 * @str:  a string
 *
 * Free a string if it is not owned by the "dict" dictionary in the
 * current scope. A variable named dict must be visible where the macro
 * is used; a NULL dict means that no string is dictionary-owned.
 * A NULL @str is ignored.
 *
 * The expansion is wrapped in do { } while (0) so that the macro
 * behaves as a single statement, e.g. in front of an else branch.
 * Uses must be terminated with a semicolon.
 */
#define DICT_FREE(str)                                                  \
    do {                                                                \
        if ((str) && ((!dict) ||                                        \
            (xmlDictOwns(dict, (const xmlChar *)(str)) == 0)))          \
            xmlFree((char *)(str));                                     \
    } while (0)
5581
5582 /**
5583 * htmlCtxtReset:
5584 * @ctxt: an HTML parser context
5585 *
5586 * Reset a parser context
5587 */
5588 void
htmlCtxtReset(htmlParserCtxtPtr ctxt)5589 htmlCtxtReset(htmlParserCtxtPtr ctxt)
5590 {
5591 xmlParserInputPtr input;
5592 xmlDictPtr dict;
5593
5594 if (ctxt == NULL)
5595 return;
5596
5597 dict = ctxt->dict;
5598
5599 while ((input = inputPop(ctxt)) != NULL) { /* Non consuming */
5600 xmlFreeInputStream(input);
5601 }
5602 ctxt->inputNr = 0;
5603 ctxt->input = NULL;
5604
5605 ctxt->spaceNr = 0;
5606 if (ctxt->spaceTab != NULL) {
5607 ctxt->spaceTab[0] = -1;
5608 ctxt->space = &ctxt->spaceTab[0];
5609 } else {
5610 ctxt->space = NULL;
5611 }
5612
5613
5614 ctxt->nodeNr = 0;
5615 ctxt->node = NULL;
5616
5617 ctxt->nameNr = 0;
5618 ctxt->name = NULL;
5619
5620 ctxt->nsNr = 0;
5621
5622 DICT_FREE(ctxt->version);
5623 ctxt->version = NULL;
5624 DICT_FREE(ctxt->encoding);
5625 ctxt->encoding = NULL;
5626 DICT_FREE(ctxt->extSubURI);
5627 ctxt->extSubURI = NULL;
5628 DICT_FREE(ctxt->extSubSystem);
5629 ctxt->extSubSystem = NULL;
5630
5631 if (ctxt->directory != NULL) {
5632 xmlFree(ctxt->directory);
5633 ctxt->directory = NULL;
5634 }
5635
5636 if (ctxt->myDoc != NULL)
5637 xmlFreeDoc(ctxt->myDoc);
5638 ctxt->myDoc = NULL;
5639
5640 ctxt->standalone = -1;
5641 ctxt->hasExternalSubset = 0;
5642 ctxt->hasPErefs = 0;
5643 ctxt->html = 1;
5644 ctxt->instate = XML_PARSER_START;
5645
5646 ctxt->wellFormed = 1;
5647 ctxt->nsWellFormed = 1;
5648 ctxt->disableSAX = 0;
5649 ctxt->valid = 1;
5650 ctxt->vctxt.userData = ctxt;
5651 ctxt->vctxt.flags = XML_VCTXT_USE_PCTXT;
5652 ctxt->vctxt.error = xmlParserValidityError;
5653 ctxt->vctxt.warning = xmlParserValidityWarning;
5654 ctxt->record_info = 0;
5655 ctxt->checkIndex = 0;
5656 ctxt->endCheckState = 0;
5657 ctxt->inSubset = 0;
5658 ctxt->errNo = XML_ERR_OK;
5659 ctxt->depth = 0;
5660 ctxt->catalogs = NULL;
5661 xmlInitNodeInfoSeq(&ctxt->node_seq);
5662
5663 if (ctxt->attsDefault != NULL) {
5664 xmlHashFree(ctxt->attsDefault, xmlHashDefaultDeallocator);
5665 ctxt->attsDefault = NULL;
5666 }
5667 if (ctxt->attsSpecial != NULL) {
5668 xmlHashFree(ctxt->attsSpecial, NULL);
5669 ctxt->attsSpecial = NULL;
5670 }
5671
5672 ctxt->nbErrors = 0;
5673 ctxt->nbWarnings = 0;
5674 if (ctxt->lastError.code != XML_ERR_OK)
5675 xmlResetError(&ctxt->lastError);
5676 }
5677
5678 static int
htmlCtxtSetOptionsInternal(xmlParserCtxtPtr ctxt,int options,int keepMask)5679 htmlCtxtSetOptionsInternal(xmlParserCtxtPtr ctxt, int options, int keepMask)
5680 {
5681 int allMask;
5682
5683 if (ctxt == NULL)
5684 return(-1);
5685
5686 allMask = HTML_PARSE_RECOVER |
5687 HTML_PARSE_HTML5 |
5688 HTML_PARSE_NODEFDTD |
5689 HTML_PARSE_NOERROR |
5690 HTML_PARSE_NOWARNING |
5691 HTML_PARSE_PEDANTIC |
5692 HTML_PARSE_NOBLANKS |
5693 HTML_PARSE_NONET |
5694 HTML_PARSE_NOIMPLIED |
5695 HTML_PARSE_COMPACT |
5696 HTML_PARSE_HUGE |
5697 HTML_PARSE_IGNORE_ENC |
5698 HTML_PARSE_BIG_LINES;
5699
5700 ctxt->options = (ctxt->options & keepMask) | (options & allMask);
5701
5702 /*
5703 * For some options, struct members are historically the source
5704 * of truth. See xmlCtxtSetOptionsInternal.
5705 */
5706 ctxt->keepBlanks = (options & HTML_PARSE_NOBLANKS) ? 0 : 1;
5707
5708 /*
5709 * Changing SAX callbacks is a bad idea. This should be fixed.
5710 */
5711 if (options & HTML_PARSE_NOBLANKS) {
5712 ctxt->sax->ignorableWhitespace = xmlSAX2IgnorableWhitespace;
5713 }
5714 if (options & HTML_PARSE_HUGE) {
5715 if (ctxt->dict != NULL)
5716 xmlDictSetLimit(ctxt->dict, 0);
5717 }
5718
5719 /*
5720 * It would be useful to allow this feature.
5721 */
5722 ctxt->dictNames = 0;
5723
5724 ctxt->linenumbers = 1;
5725
5726 return(options & ~allMask);
5727 }
5728
5729 /**
5730 * htmlCtxtSetOptions:
5731 * @ctxt: an HTML parser context
 * @options:  a bitmask of htmlParserOption values
5733 *
5734 * Applies the options to the parser context. Unset options are
5735 * cleared.
5736 *
5737 * Available since 2.14.0. With older versions, you can use
5738 * htmlCtxtUseOptions.
5739 *
5740 * HTML_PARSE_RECOVER
5741 *
5742 * No effect as of 2.14.0.
5743 *
5744 * HTML_PARSE_HTML5
5745 *
5746 * Make the tokenizer emit a SAX callback for each token. This results
5747 * in unbalanced invocations of startElement and endElement.
5748 *
5749 * For now, this is only usable with custom SAX callbacks.
5750 *
5751 * HTML_PARSE_NODEFDTD
5752 *
5753 * Do not default to a doctype if none was found.
5754 *
5755 * HTML_PARSE_NOERROR
5756 *
5757 * Disable error and warning reports to the error handlers.
5758 * Errors are still accessible with xmlCtxtGetLastError.
5759 *
5760 * HTML_PARSE_NOWARNING
5761 *
5762 * Disable warning reports.
5763 *
5764 * HTML_PARSE_PEDANTIC
5765 *
5766 * No effect.
5767 *
5768 * HTML_PARSE_NOBLANKS
5769 *
5770 * Remove some text nodes containing only whitespace from the
5771 * result document. Which nodes are removed depends on a conservative
5772 * heuristic. The reindenting feature of the serialization code relies
5773 * on this option to be set when parsing. Use of this option is
5774 * DISCOURAGED.
5775 *
5776 * HTML_PARSE_NONET
5777 *
5778 * No effect.
5779 *
5780 * HTML_PARSE_NOIMPLIED
5781 *
5782 * Do not add implied html, head or body elements.
5783 *
5784 * HTML_PARSE_COMPACT
5785 *
5786 * Store small strings directly in the node struct to save
5787 * memory.
5788 *
5789 * HTML_PARSE_HUGE
5790 *
5791 * Relax some internal limits.
5792 *
 * Available since 2.14.0. With older versions, XML_PARSE_HUGE can be
 * used instead.
5795 *
5796 * Maximum size of text nodes, tags, comments, CDATA sections
5797 *
5798 * normal: 10M
5799 * huge: 1B
5800 *
5801 * Maximum size of names, system literals, pubid literals
5802 *
5803 * normal: 50K
5804 * huge: 10M
5805 *
5806 * Maximum nesting depth of elements
5807 *
5808 * normal: 256
5809 * huge: 2048
5810 *
5811 * HTML_PARSE_IGNORE_ENC
5812 *
5813 * Ignore the encoding in the HTML declaration. This option is
5814 * mostly unneeded these days. The only effect is to enforce
5815 * UTF-8 decoding of ASCII-like data.
5816 *
5817 * HTML_PARSE_BIG_LINES
5818 *
5819 * Enable reporting of line numbers larger than 65535.
5820 *
5821 * Available since 2.14.0.
5822 *
5823 * Returns 0 in case of success, the set of unknown or unimplemented options
5824 * in case of error.
5825 */
5826 int
htmlCtxtSetOptions(xmlParserCtxtPtr ctxt,int options)5827 htmlCtxtSetOptions(xmlParserCtxtPtr ctxt, int options)
5828 {
5829 return(htmlCtxtSetOptionsInternal(ctxt, options, 0));
5830 }
5831
5832 /**
5833 * htmlCtxtUseOptions:
5834 * @ctxt: an HTML parser context
5835 * @options: a combination of htmlParserOption(s)
5836 *
5837 * DEPRECATED: Use htmlCtxtSetOptions.
5838 *
5839 * Applies the options to the parser context. The following options
5840 * are never cleared and can only be enabled:
5841 *
5842 * HTML_PARSE_NODEFDTD
5843 * HTML_PARSE_NOERROR
5844 * HTML_PARSE_NOWARNING
5845 * HTML_PARSE_NOIMPLIED
5846 * HTML_PARSE_COMPACT
5847 * HTML_PARSE_HUGE
5848 * HTML_PARSE_IGNORE_ENC
5849 * HTML_PARSE_BIG_LINES
5850 *
5851 * Returns 0 in case of success, the set of unknown or unimplemented options
5852 * in case of error.
5853 */
5854 int
htmlCtxtUseOptions(htmlParserCtxtPtr ctxt,int options)5855 htmlCtxtUseOptions(htmlParserCtxtPtr ctxt, int options)
5856 {
5857 int keepMask;
5858
5859 /*
5860 * For historic reasons, some options can only be enabled.
5861 */
5862 keepMask = HTML_PARSE_NODEFDTD |
5863 HTML_PARSE_NOERROR |
5864 HTML_PARSE_NOWARNING |
5865 HTML_PARSE_NOIMPLIED |
5866 HTML_PARSE_COMPACT |
5867 HTML_PARSE_HUGE |
5868 HTML_PARSE_IGNORE_ENC |
5869 HTML_PARSE_BIG_LINES;
5870
5871 return(htmlCtxtSetOptionsInternal(ctxt, options, keepMask));
5872 }
5873
5874 /**
5875 * htmlCtxtParseDocument:
5876 * @ctxt: an HTML parser context
5877 * @input: parser input
5878 *
5879 * Parse an HTML document and return the resulting document tree.
5880 *
5881 * Available since 2.13.0.
5882 *
5883 * Returns the resulting document tree or NULL
5884 */
5885 htmlDocPtr
htmlCtxtParseDocument(htmlParserCtxtPtr ctxt,xmlParserInputPtr input)5886 htmlCtxtParseDocument(htmlParserCtxtPtr ctxt, xmlParserInputPtr input)
5887 {
5888 htmlDocPtr ret;
5889
5890 if ((ctxt == NULL) || (input == NULL))
5891 return(NULL);
5892
5893 /* assert(ctxt->inputNr == 0); */
5894 while (ctxt->inputNr > 0)
5895 xmlFreeInputStream(inputPop(ctxt));
5896
5897 if (inputPush(ctxt, input) < 0) {
5898 xmlFreeInputStream(input);
5899 return(NULL);
5900 }
5901
5902 ctxt->html = 1;
5903 htmlParseDocument(ctxt);
5904
5905 if (ctxt->errNo != XML_ERR_NO_MEMORY) {
5906 ret = ctxt->myDoc;
5907 } else {
5908 ret = NULL;
5909 xmlFreeDoc(ctxt->myDoc);
5910 }
5911 ctxt->myDoc = NULL;
5912
5913 /* assert(ctxt->inputNr == 1); */
5914 while (ctxt->inputNr > 0)
5915 xmlFreeInputStream(inputPop(ctxt));
5916
5917 return(ret);
5918 }
5919
5920 /**
5921 * htmlReadDoc:
5922 * @str: a pointer to a zero terminated string
 * @url:  only used for error reporting (optional)
5924 * @encoding: the document encoding (optional)
5925 * @options: a combination of htmlParserOptions
5926 *
5927 * Convenience function to parse an HTML document from a zero-terminated
5928 * string.
5929 *
5930 * See htmlCtxtReadDoc for details.
5931 *
5932 * Returns the resulting document tree.
5933 */
5934 htmlDocPtr
htmlReadDoc(const xmlChar * str,const char * url,const char * encoding,int options)5935 htmlReadDoc(const xmlChar *str, const char *url, const char *encoding,
5936 int options)
5937 {
5938 htmlParserCtxtPtr ctxt;
5939 xmlParserInputPtr input;
5940 htmlDocPtr doc;
5941
5942 ctxt = htmlNewParserCtxt();
5943 if (ctxt == NULL)
5944 return(NULL);
5945
5946 htmlCtxtUseOptions(ctxt, options);
5947
5948 input = xmlCtxtNewInputFromString(ctxt, url, (const char *) str, encoding,
5949 XML_INPUT_BUF_STATIC);
5950
5951 doc = htmlCtxtParseDocument(ctxt, input);
5952
5953 htmlFreeParserCtxt(ctxt);
5954 return(doc);
5955 }
5956
5957 /**
5958 * htmlReadFile:
5959 * @filename: a file or URL
5960 * @encoding: the document encoding (optional)
5961 * @options: a combination of htmlParserOptions
5962 *
5963 * Convenience function to parse an HTML file from the filesystem,
5964 * the network or a global user-defined resource loader.
5965 *
5966 * See htmlCtxtReadFile for details.
5967 *
5968 * Returns the resulting document tree.
5969 */
5970 htmlDocPtr
htmlReadFile(const char * filename,const char * encoding,int options)5971 htmlReadFile(const char *filename, const char *encoding, int options)
5972 {
5973 htmlParserCtxtPtr ctxt;
5974 xmlParserInputPtr input;
5975 htmlDocPtr doc;
5976
5977 ctxt = htmlNewParserCtxt();
5978 if (ctxt == NULL)
5979 return(NULL);
5980
5981 htmlCtxtUseOptions(ctxt, options);
5982
5983 input = xmlCtxtNewInputFromUrl(ctxt, filename, NULL, encoding, 0);
5984
5985 doc = htmlCtxtParseDocument(ctxt, input);
5986
5987 htmlFreeParserCtxt(ctxt);
5988 return(doc);
5989 }
5990
5991 /**
5992 * htmlReadMemory:
5993 * @buffer: a pointer to a char array
5994 * @size: the size of the array
5995 * @url: only used for error reporting (optional)
5996 * @encoding: the document encoding, or NULL
5997 * @options: a combination of htmlParserOption(s)
5998 *
5999 * Convenience function to parse an HTML document from memory.
6000 * The input buffer must not contain any terminating null bytes.
6001 *
6002 * See htmlCtxtReadMemory for details.
6003 *
6004 * Returns the resulting document tree
6005 */
6006 htmlDocPtr
htmlReadMemory(const char * buffer,int size,const char * url,const char * encoding,int options)6007 htmlReadMemory(const char *buffer, int size, const char *url,
6008 const char *encoding, int options)
6009 {
6010 htmlParserCtxtPtr ctxt;
6011 xmlParserInputPtr input;
6012 htmlDocPtr doc;
6013
6014 if (size < 0)
6015 return(NULL);
6016
6017 ctxt = htmlNewParserCtxt();
6018 if (ctxt == NULL)
6019 return(NULL);
6020
6021 htmlCtxtUseOptions(ctxt, options);
6022
6023 input = xmlCtxtNewInputFromMemory(ctxt, url, buffer, size, encoding,
6024 XML_INPUT_BUF_STATIC);
6025
6026 doc = htmlCtxtParseDocument(ctxt, input);
6027
6028 htmlFreeParserCtxt(ctxt);
6029 return(doc);
6030 }
6031
6032 /**
6033 * htmlReadFd:
6034 * @fd: an open file descriptor
6035 * @url: only used for error reporting (optional)
6036 * @encoding: the document encoding, or NULL
6037 * @options: a combination of htmlParserOptions
6038 *
6039 * Convenience function to parse an HTML document from a
6040 * file descriptor.
6041 *
6042 * NOTE that the file descriptor will not be closed when the
6043 * context is freed or reset.
6044 *
6045 * See htmlCtxtReadFd for details.
6046 *
6047 * Returns the resulting document tree
6048 */
6049 htmlDocPtr
htmlReadFd(int fd,const char * url,const char * encoding,int options)6050 htmlReadFd(int fd, const char *url, const char *encoding, int options)
6051 {
6052 htmlParserCtxtPtr ctxt;
6053 xmlParserInputPtr input;
6054 htmlDocPtr doc;
6055
6056 ctxt = htmlNewParserCtxt();
6057 if (ctxt == NULL)
6058 return(NULL);
6059
6060 htmlCtxtUseOptions(ctxt, options);
6061
6062 input = xmlCtxtNewInputFromFd(ctxt, url, fd, encoding, 0);
6063
6064 doc = htmlCtxtParseDocument(ctxt, input);
6065
6066 htmlFreeParserCtxt(ctxt);
6067 return(doc);
6068 }
6069
6070 /**
6071 * htmlReadIO:
6072 * @ioread: an I/O read function
6073 * @ioclose: an I/O close function (optional)
6074 * @ioctx: an I/O handler
6075 * @url: only used for error reporting (optional)
6076 * @encoding: the document encoding (optional)
6077 * @options: a combination of htmlParserOption(s)
6078 *
6079 * Convenience function to parse an HTML document from I/O functions
6080 * and context.
6081 *
6082 * See htmlCtxtReadIO for details.
6083 *
6084 * Returns the resulting document tree
6085 */
6086 htmlDocPtr
htmlReadIO(xmlInputReadCallback ioread,xmlInputCloseCallback ioclose,void * ioctx,const char * url,const char * encoding,int options)6087 htmlReadIO(xmlInputReadCallback ioread, xmlInputCloseCallback ioclose,
6088 void *ioctx, const char *url, const char *encoding, int options)
6089 {
6090 htmlParserCtxtPtr ctxt;
6091 xmlParserInputPtr input;
6092 htmlDocPtr doc;
6093
6094 ctxt = htmlNewParserCtxt();
6095 if (ctxt == NULL)
6096 return (NULL);
6097
6098 htmlCtxtUseOptions(ctxt, options);
6099
6100 input = xmlCtxtNewInputFromIO(ctxt, url, ioread, ioclose, ioctx,
6101 encoding, 0);
6102
6103 doc = htmlCtxtParseDocument(ctxt, input);
6104
6105 htmlFreeParserCtxt(ctxt);
6106 return(doc);
6107 }
6108
6109 /**
6110 * htmlCtxtReadDoc:
6111 * @ctxt: an HTML parser context
6112 * @str: a pointer to a zero terminated string
6113 * @URL: only used for error reporting (optional)
6114 * @encoding: the document encoding (optional)
6115 * @options: a combination of htmlParserOptions
6116 *
6117 * Parse an HTML in-memory document and build a tree.
6118 *
6119 * See htmlCtxtUseOptions for details.
6120 *
6121 * Returns the resulting document tree
6122 */
6123 htmlDocPtr
htmlCtxtReadDoc(htmlParserCtxtPtr ctxt,const xmlChar * str,const char * URL,const char * encoding,int options)6124 htmlCtxtReadDoc(htmlParserCtxtPtr ctxt, const xmlChar *str,
6125 const char *URL, const char *encoding, int options)
6126 {
6127 xmlParserInputPtr input;
6128
6129 if (ctxt == NULL)
6130 return (NULL);
6131
6132 htmlCtxtReset(ctxt);
6133 htmlCtxtUseOptions(ctxt, options);
6134
6135 input = xmlCtxtNewInputFromString(ctxt, URL, (const char *) str,
6136 encoding, 0);
6137
6138 return(htmlCtxtParseDocument(ctxt, input));
6139 }
6140
6141 /**
6142 * htmlCtxtReadFile:
6143 * @ctxt: an HTML parser context
6144 * @filename: a file or URL
6145 * @encoding: the document encoding (optional)
6146 * @options: a combination of htmlParserOptions
6147 *
6148 * Parse an HTML file from the filesystem, the network or a
6149 * user-defined resource loader.
6150 *
6151 * See htmlCtxtUseOptions for details.
6152 *
6153 * Returns the resulting document tree
6154 */
6155 htmlDocPtr
htmlCtxtReadFile(htmlParserCtxtPtr ctxt,const char * filename,const char * encoding,int options)6156 htmlCtxtReadFile(htmlParserCtxtPtr ctxt, const char *filename,
6157 const char *encoding, int options)
6158 {
6159 xmlParserInputPtr input;
6160
6161 if (ctxt == NULL)
6162 return (NULL);
6163
6164 htmlCtxtReset(ctxt);
6165 htmlCtxtUseOptions(ctxt, options);
6166
6167 input = xmlCtxtNewInputFromUrl(ctxt, filename, NULL, encoding, 0);
6168
6169 return(htmlCtxtParseDocument(ctxt, input));
6170 }
6171
6172 /**
6173 * htmlCtxtReadMemory:
6174 * @ctxt: an HTML parser context
6175 * @buffer: a pointer to a char array
6176 * @size: the size of the array
6177 * @URL: only used for error reporting (optional)
 * @encoding:  the document encoding (optional)
6179 * @options: a combination of htmlParserOptions
6180 *
6181 * Parse an HTML in-memory document and build a tree. The input buffer must
6182 * not contain any terminating null bytes.
6183 *
6184 * See htmlCtxtUseOptions for details.
6185 *
6186 * Returns the resulting document tree
6187 */
6188 htmlDocPtr
htmlCtxtReadMemory(htmlParserCtxtPtr ctxt,const char * buffer,int size,const char * URL,const char * encoding,int options)6189 htmlCtxtReadMemory(htmlParserCtxtPtr ctxt, const char *buffer, int size,
6190 const char *URL, const char *encoding, int options)
6191 {
6192 xmlParserInputPtr input;
6193
6194 if ((ctxt == NULL) || (size < 0))
6195 return (NULL);
6196
6197 htmlCtxtReset(ctxt);
6198 htmlCtxtUseOptions(ctxt, options);
6199
6200 input = xmlCtxtNewInputFromMemory(ctxt, URL, buffer, size, encoding,
6201 XML_INPUT_BUF_STATIC);
6202
6203 return(htmlCtxtParseDocument(ctxt, input));
6204 }
6205
6206 /**
6207 * htmlCtxtReadFd:
6208 * @ctxt: an HTML parser context
6209 * @fd: an open file descriptor
6210 * @URL: only used for error reporting (optional)
 * @encoding:  the document encoding (optional)
6212 * @options: a combination of htmlParserOptions
6213 *
 * Parse an HTML document from a file descriptor and build a tree.
6215 *
6216 * See htmlCtxtUseOptions for details.
6217 *
6218 * NOTE that the file descriptor will not be closed when the
6219 * context is freed or reset.
6220 *
6221 * Returns the resulting document tree
6222 */
6223 htmlDocPtr
htmlCtxtReadFd(htmlParserCtxtPtr ctxt,int fd,const char * URL,const char * encoding,int options)6224 htmlCtxtReadFd(htmlParserCtxtPtr ctxt, int fd,
6225 const char *URL, const char *encoding, int options)
6226 {
6227 xmlParserInputPtr input;
6228
6229 if (ctxt == NULL)
6230 return(NULL);
6231
6232 htmlCtxtReset(ctxt);
6233 htmlCtxtUseOptions(ctxt, options);
6234
6235 input = xmlCtxtNewInputFromFd(ctxt, URL, fd, encoding, 0);
6236
6237 return(htmlCtxtParseDocument(ctxt, input));
6238 }
6239
6240 /**
6241 * htmlCtxtReadIO:
6242 * @ctxt: an HTML parser context
6243 * @ioread: an I/O read function
6244 * @ioclose: an I/O close function
6245 * @ioctx: an I/O handler
6246 * @URL: the base URL to use for the document
6247 * @encoding: the document encoding, or NULL
6248 * @options: a combination of htmlParserOption(s)
6249 *
6250 * Parse an HTML document from I/O functions and source and build a tree.
6251 *
6252 * See htmlCtxtUseOptions for details.
6253 *
6254 * Returns the resulting document tree
6255 */
6256 htmlDocPtr
htmlCtxtReadIO(htmlParserCtxtPtr ctxt,xmlInputReadCallback ioread,xmlInputCloseCallback ioclose,void * ioctx,const char * URL,const char * encoding,int options)6257 htmlCtxtReadIO(htmlParserCtxtPtr ctxt, xmlInputReadCallback ioread,
6258 xmlInputCloseCallback ioclose, void *ioctx,
6259 const char *URL,
6260 const char *encoding, int options)
6261 {
6262 xmlParserInputPtr input;
6263
6264 if (ctxt == NULL)
6265 return (NULL);
6266
6267 htmlCtxtReset(ctxt);
6268 htmlCtxtUseOptions(ctxt, options);
6269
6270 input = xmlCtxtNewInputFromIO(ctxt, URL, ioread, ioclose, ioctx,
6271 encoding, 0);
6272
6273 return(htmlCtxtParseDocument(ctxt, input));
6274 }
6275
6276 #endif /* LIBXML_HTML_ENABLED */
6277