1 /*
2 * HTMLtree.c : implementation of access function for an HTML tree.
3 *
4 * See Copyright for the status of this software.
5 *
6 * [email protected]
7 */
8
9
10 #define IN_LIBXML
11 #include "libxml.h"
12 #ifdef LIBXML_HTML_ENABLED
13
14 #include <string.h> /* for memset() only ! */
15 #include <ctype.h>
16 #include <stdlib.h>
17
18 #include <libxml/xmlmemory.h>
19 #include <libxml/HTMLparser.h>
20 #include <libxml/HTMLtree.h>
21 #include <libxml/entities.h>
22 #include <libxml/xmlerror.h>
23 #include <libxml/parserInternals.h>
24 #include <libxml/uri.h>
25
26 #include "private/buf.h"
27 #include "private/error.h"
28 #include "private/io.h"
29 #include "private/save.h"
30
31 /************************************************************************
32 * *
33 * Getting/Setting encoding meta tags *
34 * *
35 ************************************************************************/
36
37 /**
38 * htmlGetMetaEncoding:
39 * @doc: the document
40 *
41 * Encoding definition lookup in the Meta tags
42 *
43 * Returns the current encoding as flagged in the HTML source
44 */
45 const xmlChar *
htmlGetMetaEncoding(htmlDocPtr doc)46 htmlGetMetaEncoding(htmlDocPtr doc) {
47 htmlNodePtr cur;
48 const xmlChar *content;
49 const xmlChar *encoding;
50
51 if (doc == NULL)
52 return(NULL);
53 cur = doc->children;
54
55 /*
56 * Search the html
57 */
58 while (cur != NULL) {
59 if ((cur->type == XML_ELEMENT_NODE) && (cur->name != NULL)) {
60 if (xmlStrEqual(cur->name, BAD_CAST"html"))
61 break;
62 if (xmlStrEqual(cur->name, BAD_CAST"head"))
63 goto found_head;
64 if (xmlStrEqual(cur->name, BAD_CAST"meta"))
65 goto found_meta;
66 }
67 cur = cur->next;
68 }
69 if (cur == NULL)
70 return(NULL);
71 cur = cur->children;
72
73 /*
74 * Search the head
75 */
76 while (cur != NULL) {
77 if ((cur->type == XML_ELEMENT_NODE) && (cur->name != NULL)) {
78 if (xmlStrEqual(cur->name, BAD_CAST"head"))
79 break;
80 if (xmlStrEqual(cur->name, BAD_CAST"meta"))
81 goto found_meta;
82 }
83 cur = cur->next;
84 }
85 if (cur == NULL)
86 return(NULL);
87 found_head:
88 cur = cur->children;
89
90 /*
91 * Search the meta elements
92 */
93 found_meta:
94 while (cur != NULL) {
95 if ((cur->type == XML_ELEMENT_NODE) && (cur->name != NULL)) {
96 if (xmlStrEqual(cur->name, BAD_CAST"meta")) {
97 xmlAttrPtr attr = cur->properties;
98 int http;
99 const xmlChar *value;
100
101 content = NULL;
102 http = 0;
103 while (attr != NULL) {
104 if ((attr->children != NULL) &&
105 (attr->children->type == XML_TEXT_NODE) &&
106 (attr->children->next == NULL)) {
107 value = attr->children->content;
108 if ((!xmlStrcasecmp(attr->name, BAD_CAST"http-equiv"))
109 && (!xmlStrcasecmp(value, BAD_CAST"Content-Type")))
110 http = 1;
111 else if ((value != NULL)
112 && (!xmlStrcasecmp(attr->name, BAD_CAST"content")))
113 content = value;
114 if ((http != 0) && (content != NULL))
115 goto found_content;
116 }
117 attr = attr->next;
118 }
119 }
120 }
121 cur = cur->next;
122 }
123 return(NULL);
124
125 found_content:
126 encoding = xmlStrstr(content, BAD_CAST"charset=");
127 if (encoding == NULL)
128 encoding = xmlStrstr(content, BAD_CAST"Charset=");
129 if (encoding == NULL)
130 encoding = xmlStrstr(content, BAD_CAST"CHARSET=");
131 if (encoding != NULL) {
132 encoding += 8;
133 } else {
134 encoding = xmlStrstr(content, BAD_CAST"charset =");
135 if (encoding == NULL)
136 encoding = xmlStrstr(content, BAD_CAST"Charset =");
137 if (encoding == NULL)
138 encoding = xmlStrstr(content, BAD_CAST"CHARSET =");
139 if (encoding != NULL)
140 encoding += 9;
141 }
142 if (encoding != NULL) {
143 while ((*encoding == ' ') || (*encoding == '\t')) encoding++;
144 }
145 return(encoding);
146 }
147
148 /**
149 * htmlSetMetaEncoding:
150 * @doc: the document
151 * @encoding: the encoding string
152 *
153 * Sets the current encoding in the Meta tags
154 * NOTE: this will not change the document content encoding, just
155 * the META flag associated.
156 *
157 * Returns 0 in case of success and -1 in case of error
158 */
159 int
htmlSetMetaEncoding(htmlDocPtr doc,const xmlChar * encoding)160 htmlSetMetaEncoding(htmlDocPtr doc, const xmlChar *encoding) {
161 htmlNodePtr cur, meta = NULL, head = NULL;
162 const xmlChar *content = NULL;
163 char newcontent[100];
164
165 newcontent[0] = 0;
166
167 if (doc == NULL)
168 return(-1);
169
170 /* html isn't a real encoding it's just libxml2 way to get entities */
171 if (!xmlStrcasecmp(encoding, BAD_CAST "html"))
172 return(-1);
173
174 if (encoding != NULL) {
175 snprintf(newcontent, sizeof(newcontent), "text/html; charset=%s",
176 (char *)encoding);
177 newcontent[sizeof(newcontent) - 1] = 0;
178 }
179
180 cur = doc->children;
181
182 /*
183 * Search the html
184 */
185 while (cur != NULL) {
186 if ((cur->type == XML_ELEMENT_NODE) && (cur->name != NULL)) {
187 if (xmlStrcasecmp(cur->name, BAD_CAST"html") == 0)
188 break;
189 if (xmlStrcasecmp(cur->name, BAD_CAST"head") == 0)
190 goto found_head;
191 if (xmlStrcasecmp(cur->name, BAD_CAST"meta") == 0)
192 goto found_meta;
193 }
194 cur = cur->next;
195 }
196 if (cur == NULL)
197 return(-1);
198 cur = cur->children;
199
200 /*
201 * Search the head
202 */
203 while (cur != NULL) {
204 if ((cur->type == XML_ELEMENT_NODE) && (cur->name != NULL)) {
205 if (xmlStrcasecmp(cur->name, BAD_CAST"head") == 0)
206 break;
207 if (xmlStrcasecmp(cur->name, BAD_CAST"meta") == 0) {
208 head = cur->parent;
209 goto found_meta;
210 }
211 }
212 cur = cur->next;
213 }
214 if (cur == NULL)
215 return(-1);
216 found_head:
217 head = cur;
218 if (cur->children == NULL)
219 goto create;
220 cur = cur->children;
221
222 found_meta:
223 /*
224 * Search and update all the remaining the meta elements carrying
225 * encoding information
226 */
227 while (cur != NULL) {
228 if ((cur->type == XML_ELEMENT_NODE) && (cur->name != NULL)) {
229 if (xmlStrcasecmp(cur->name, BAD_CAST"meta") == 0) {
230 xmlAttrPtr attr = cur->properties;
231 int http;
232 const xmlChar *value;
233
234 content = NULL;
235 http = 0;
236 while (attr != NULL) {
237 if ((attr->children != NULL) &&
238 (attr->children->type == XML_TEXT_NODE) &&
239 (attr->children->next == NULL)) {
240 value = attr->children->content;
241 if ((!xmlStrcasecmp(attr->name, BAD_CAST"http-equiv"))
242 && (!xmlStrcasecmp(value, BAD_CAST"Content-Type")))
243 http = 1;
244 else
245 {
246 if ((value != NULL) &&
247 (!xmlStrcasecmp(attr->name, BAD_CAST"content")))
248 content = value;
249 }
250 if ((http != 0) && (content != NULL))
251 break;
252 }
253 attr = attr->next;
254 }
255 if ((http != 0) && (content != NULL)) {
256 meta = cur;
257 break;
258 }
259
260 }
261 }
262 cur = cur->next;
263 }
264 create:
265 if (meta == NULL) {
266 if ((encoding != NULL) && (head != NULL)) {
267 /*
268 * Create a new Meta element with the right attributes
269 */
270
271 meta = xmlNewDocNode(doc, NULL, BAD_CAST"meta", NULL);
272 if (head->children == NULL)
273 xmlAddChild(head, meta);
274 else
275 xmlAddPrevSibling(head->children, meta);
276 xmlNewProp(meta, BAD_CAST"http-equiv", BAD_CAST"Content-Type");
277 xmlNewProp(meta, BAD_CAST"content", BAD_CAST newcontent);
278 }
279 } else {
280 /* remove the meta tag if NULL is passed */
281 if (encoding == NULL) {
282 xmlUnlinkNode(meta);
283 xmlFreeNode(meta);
284 }
285 /* change the document only if there is a real encoding change */
286 else if (xmlStrcasestr(content, encoding) == NULL) {
287 xmlSetProp(meta, BAD_CAST"content", BAD_CAST newcontent);
288 }
289 }
290
291
292 return(0);
293 }
294
/**
 * htmlBooleanAttrs:
 *
 * These are the HTML attributes which will be output
 * in minimized form, i.e. <option selected="selected"> will be
 * output as <option selected>, as per XSLT 1.0 16.2 "HTML Output Method"
 *
 * The table is terminated by a NULL entry; htmlIsBooleanAttr() walks it
 * until that sentinel and compares names case-insensitively.
 */
static const char* const htmlBooleanAttrs[] = {
  "checked", "compact", "declare", "defer", "disabled", "ismap",
  "multiple", "nohref", "noresize", "noshade", "nowrap", "readonly",
  "selected", NULL
};
308
309
310 /**
311 * htmlIsBooleanAttr:
312 * @name: the name of the attribute to check
313 *
314 * DEPRECATED: Internal function, don't use.
315 *
316 * Determine if a given attribute is a boolean attribute.
317 *
318 * returns: false if the attribute is not boolean, true otherwise.
319 */
320 int
htmlIsBooleanAttr(const xmlChar * name)321 htmlIsBooleanAttr(const xmlChar *name)
322 {
323 int i = 0;
324
325 while (htmlBooleanAttrs[i] != NULL) {
326 if (xmlStrcasecmp((const xmlChar *)htmlBooleanAttrs[i], name) == 0)
327 return 1;
328 i++;
329 }
330 return 0;
331 }
332
333 #ifdef LIBXML_OUTPUT_ENABLED
334 /************************************************************************
335 * *
336 * Output error handlers *
337 * *
338 ************************************************************************/
339
340 /**
341 * htmlSaveErr:
342 * @code: the error number
343 * @node: the location of the error.
344 * @extra: extra information
345 *
346 * Handle an out of memory condition
347 */
348 static void
htmlSaveErr(int code,xmlNodePtr node,const char * extra)349 htmlSaveErr(int code, xmlNodePtr node, const char *extra)
350 {
351 const char *msg = NULL;
352 int res;
353
354 switch(code) {
355 case XML_SAVE_NOT_UTF8:
356 msg = "string is not in UTF-8\n";
357 break;
358 case XML_SAVE_CHAR_INVALID:
359 msg = "invalid character value\n";
360 break;
361 case XML_SAVE_UNKNOWN_ENCODING:
362 msg = "unknown encoding %s\n";
363 break;
364 case XML_SAVE_NO_DOCTYPE:
365 msg = "HTML has no DOCTYPE\n";
366 break;
367 default:
368 msg = "unexpected error number\n";
369 }
370
371 res = xmlRaiseError(NULL, NULL, NULL, NULL, node,
372 XML_FROM_OUTPUT, code, XML_ERR_ERROR, NULL, 0,
373 extra, NULL, NULL, 0, 0,
374 msg, extra);
375 if (res < 0)
376 xmlRaiseMemoryError(NULL, NULL, NULL, XML_FROM_OUTPUT, NULL);
377 }
378
379 /************************************************************************
380 * *
381 * Dumping HTML tree content to a simple buffer *
382 * *
383 ************************************************************************/
384
385 static xmlCharEncodingHandler *
htmlFindOutputEncoder(const char * encoding)386 htmlFindOutputEncoder(const char *encoding) {
387 xmlCharEncodingHandler *handler = NULL;
388
389 if (encoding != NULL) {
390 int res;
391
392 res = xmlOpenCharEncodingHandler(encoding, /* output */ 1,
393 &handler);
394 if (res != XML_ERR_OK)
395 htmlSaveErr(XML_SAVE_UNKNOWN_ENCODING, NULL, encoding);
396 } else {
397 /*
398 * Fallback to HTML when the encoding is unspecified
399 */
400 xmlOpenCharEncodingHandler("HTML", /* output */ 1, &handler);
401 }
402
403 return(handler);
404 }
405
406 /**
407 * htmlBufNodeDumpFormat:
408 * @buf: the xmlBufPtr output
409 * @doc: the document
410 * @cur: the current node
411 * @format: should formatting spaces been added
412 *
413 * Dump an HTML node, recursive behaviour,children are printed too.
414 *
415 * Returns the number of byte written or -1 in case of error
416 */
417 static size_t
htmlBufNodeDumpFormat(xmlBufPtr buf,xmlDocPtr doc,xmlNodePtr cur,int format)418 htmlBufNodeDumpFormat(xmlBufPtr buf, xmlDocPtr doc, xmlNodePtr cur,
419 int format) {
420 size_t use;
421 size_t ret;
422 xmlOutputBufferPtr outbuf;
423
424 if (cur == NULL) {
425 return ((size_t) -1);
426 }
427 if (buf == NULL) {
428 return ((size_t) -1);
429 }
430 outbuf = (xmlOutputBufferPtr) xmlMalloc(sizeof(xmlOutputBuffer));
431 if (outbuf == NULL)
432 return ((size_t) -1);
433 memset(outbuf, 0, sizeof(xmlOutputBuffer));
434 outbuf->buffer = buf;
435 outbuf->encoder = NULL;
436 outbuf->writecallback = NULL;
437 outbuf->closecallback = NULL;
438 outbuf->context = NULL;
439 outbuf->written = 0;
440
441 use = xmlBufUse(buf);
442 htmlNodeDumpFormatOutput(outbuf, doc, cur, NULL, format);
443 if (outbuf->error)
444 ret = (size_t) -1;
445 else
446 ret = xmlBufUse(buf) - use;
447 xmlFree(outbuf);
448 return (ret);
449 }
450
451 /**
452 * htmlNodeDump:
453 * @buf: the HTML buffer output
454 * @doc: the document
455 * @cur: the current node
456 *
457 * Dump an HTML node, recursive behaviour,children are printed too,
458 * and formatting returns are added.
459 *
460 * Returns the number of byte written or -1 in case of error
461 */
462 int
htmlNodeDump(xmlBufferPtr buf,xmlDocPtr doc,xmlNodePtr cur)463 htmlNodeDump(xmlBufferPtr buf, xmlDocPtr doc, xmlNodePtr cur) {
464 xmlBufPtr buffer;
465 size_t ret1;
466 int ret2;
467
468 if ((buf == NULL) || (cur == NULL))
469 return(-1);
470
471 xmlInitParser();
472 buffer = xmlBufFromBuffer(buf);
473 if (buffer == NULL)
474 return(-1);
475
476 ret1 = htmlBufNodeDumpFormat(buffer, doc, cur, 1);
477
478 ret2 = xmlBufBackToBuffer(buffer, buf);
479
480 if ((ret1 == (size_t) -1) || (ret2 < 0))
481 return(-1);
482 return(ret1 > INT_MAX ? INT_MAX : ret1);
483 }
484
485 /**
486 * htmlNodeDumpFileFormat:
487 * @out: the FILE pointer
488 * @doc: the document
489 * @cur: the current node
490 * @encoding: the document encoding
491 * @format: should formatting spaces been added
492 *
493 * Dump an HTML node, recursive behaviour,children are printed too.
494 *
495 * TODO: if encoding == NULL try to save in the doc encoding
496 *
497 * returns: the number of byte written or -1 in case of failure.
498 */
499 int
htmlNodeDumpFileFormat(FILE * out,xmlDocPtr doc,xmlNodePtr cur,const char * encoding,int format)500 htmlNodeDumpFileFormat(FILE *out, xmlDocPtr doc,
501 xmlNodePtr cur, const char *encoding, int format) {
502 xmlOutputBufferPtr buf;
503 xmlCharEncodingHandlerPtr handler;
504 int ret;
505
506 xmlInitParser();
507
508 /*
509 * save the content to a temp buffer.
510 */
511 handler = htmlFindOutputEncoder(encoding);
512 buf = xmlOutputBufferCreateFile(out, handler);
513 if (buf == NULL)
514 return(0);
515
516 htmlNodeDumpFormatOutput(buf, doc, cur, NULL, format);
517
518 ret = xmlOutputBufferClose(buf);
519 return(ret);
520 }
521
522 /**
523 * htmlNodeDumpFile:
524 * @out: the FILE pointer
525 * @doc: the document
526 * @cur: the current node
527 *
528 * Dump an HTML node, recursive behaviour,children are printed too,
529 * and formatting returns are added.
530 */
531 void
htmlNodeDumpFile(FILE * out,xmlDocPtr doc,xmlNodePtr cur)532 htmlNodeDumpFile(FILE *out, xmlDocPtr doc, xmlNodePtr cur) {
533 htmlNodeDumpFileFormat(out, doc, cur, NULL, 1);
534 }
535
536 /**
537 * htmlDocDumpMemoryFormat:
538 * @cur: the document
539 * @mem: OUT: the memory pointer
540 * @size: OUT: the memory length
541 * @format: should formatting spaces been added
542 *
543 * Dump an HTML document in memory and return the xmlChar * and it's size.
544 * It's up to the caller to free the memory.
545 */
546 void
htmlDocDumpMemoryFormat(xmlDocPtr cur,xmlChar ** mem,int * size,int format)547 htmlDocDumpMemoryFormat(xmlDocPtr cur, xmlChar**mem, int *size, int format) {
548 xmlOutputBufferPtr buf;
549 xmlCharEncodingHandlerPtr handler = NULL;
550 const char *encoding;
551
552 xmlInitParser();
553
554 if ((mem == NULL) || (size == NULL))
555 return;
556 *mem = NULL;
557 *size = 0;
558 if (cur == NULL)
559 return;
560
561 encoding = (const char *) htmlGetMetaEncoding(cur);
562 handler = htmlFindOutputEncoder(encoding);
563 buf = xmlAllocOutputBuffer(handler);
564 if (buf == NULL)
565 return;
566
567 htmlDocContentDumpFormatOutput(buf, cur, NULL, format);
568
569 xmlOutputBufferFlush(buf);
570
571 if (!buf->error) {
572 if (buf->conv != NULL) {
573 *size = xmlBufUse(buf->conv);
574 *mem = xmlStrndup(xmlBufContent(buf->conv), *size);
575 } else {
576 *size = xmlBufUse(buf->buffer);
577 *mem = xmlStrndup(xmlBufContent(buf->buffer), *size);
578 }
579 }
580
581 xmlOutputBufferClose(buf);
582 }
583
584 /**
585 * htmlDocDumpMemory:
586 * @cur: the document
587 * @mem: OUT: the memory pointer
588 * @size: OUT: the memory length
589 *
590 * Dump an HTML document in memory and return the xmlChar * and it's size.
591 * It's up to the caller to free the memory.
592 */
593 void
htmlDocDumpMemory(xmlDocPtr cur,xmlChar ** mem,int * size)594 htmlDocDumpMemory(xmlDocPtr cur, xmlChar**mem, int *size) {
595 htmlDocDumpMemoryFormat(cur, mem, size, 1);
596 }
597
598
599 /************************************************************************
600 * *
601 * Dumping HTML tree content to an I/O output buffer *
602 * *
603 ************************************************************************/
604
605 /**
606 * htmlDtdDumpOutput:
607 * @buf: the HTML buffer output
608 * @doc: the document
609 * @encoding: the encoding string
610 *
611 * TODO: check whether encoding is needed
612 *
613 * Dump the HTML document DTD, if any.
614 */
615 static void
htmlDtdDumpOutput(xmlOutputBufferPtr buf,xmlDocPtr doc,const char * encoding ATTRIBUTE_UNUSED)616 htmlDtdDumpOutput(xmlOutputBufferPtr buf, xmlDocPtr doc,
617 const char *encoding ATTRIBUTE_UNUSED) {
618 xmlDtdPtr cur = doc->intSubset;
619
620 if (cur == NULL) {
621 htmlSaveErr(XML_SAVE_NO_DOCTYPE, (xmlNodePtr) doc, NULL);
622 return;
623 }
624 xmlOutputBufferWriteString(buf, "<!DOCTYPE ");
625 xmlOutputBufferWriteString(buf, (const char *)cur->name);
626 if (cur->ExternalID != NULL) {
627 xmlOutputBufferWriteString(buf, " PUBLIC ");
628 xmlOutputBufferWriteQuotedString(buf, cur->ExternalID);
629 if (cur->SystemID != NULL) {
630 xmlOutputBufferWriteString(buf, " ");
631 xmlOutputBufferWriteQuotedString(buf, cur->SystemID);
632 }
633 } else if (cur->SystemID != NULL &&
634 xmlStrcmp(cur->SystemID, BAD_CAST "about:legacy-compat")) {
635 xmlOutputBufferWriteString(buf, " SYSTEM ");
636 xmlOutputBufferWriteQuotedString(buf, cur->SystemID);
637 }
638 xmlOutputBufferWriteString(buf, ">\n");
639 }
640
641 /**
642 * htmlAttrDumpOutput:
643 * @buf: the HTML buffer output
644 * @doc: the document
645 * @cur: the attribute pointer
646 *
647 * Dump an HTML attribute
648 */
649 static void
htmlAttrDumpOutput(xmlOutputBufferPtr buf,xmlDocPtr doc,xmlAttrPtr cur)650 htmlAttrDumpOutput(xmlOutputBufferPtr buf, xmlDocPtr doc, xmlAttrPtr cur) {
651 xmlChar *value;
652
653 /*
654 * The html output method should not escape a & character
655 * occurring in an attribute value immediately followed by
656 * a { character (see Section B.7.1 of the HTML 4.0 Recommendation).
657 * This is implemented in xmlEncodeEntitiesReentrant
658 */
659
660 if (cur == NULL) {
661 return;
662 }
663 xmlOutputBufferWriteString(buf, " ");
664 if ((cur->ns != NULL) && (cur->ns->prefix != NULL)) {
665 xmlOutputBufferWriteString(buf, (const char *)cur->ns->prefix);
666 xmlOutputBufferWriteString(buf, ":");
667 }
668 xmlOutputBufferWriteString(buf, (const char *)cur->name);
669 if ((cur->children != NULL) && (!htmlIsBooleanAttr(cur->name))) {
670 value = xmlNodeListGetString(doc, cur->children, 0);
671 if (value) {
672 xmlOutputBufferWriteString(buf, "=");
673 if ((cur->ns == NULL) && (cur->parent != NULL) &&
674 (cur->parent->ns == NULL) &&
675 ((!xmlStrcasecmp(cur->name, BAD_CAST "href")) ||
676 (!xmlStrcasecmp(cur->name, BAD_CAST "action")) ||
677 (!xmlStrcasecmp(cur->name, BAD_CAST "src")) ||
678 ((!xmlStrcasecmp(cur->name, BAD_CAST "name")) &&
679 (!xmlStrcasecmp(cur->parent->name, BAD_CAST "a"))))) {
680 xmlChar *escaped;
681 xmlChar *tmp = value;
682
683 while (IS_BLANK_CH(*tmp)) tmp++;
684
685 /*
686 * Angle brackets are technically illegal in URIs, but they're
687 * used in server side includes, for example. Curly brackets
688 * are illegal as well and often used in templates.
689 * Don't escape non-whitespace, printable ASCII chars for
690 * improved interoperability. Only escape space, control
691 * and non-ASCII chars.
692 */
693 escaped = xmlURIEscapeStr(tmp,
694 BAD_CAST "\"#$%&+,/:;<=>?@[\\]^`{|}");
695 if (escaped != NULL) {
696 xmlOutputBufferWriteQuotedString(buf, escaped);
697 xmlFree(escaped);
698 } else {
699 buf->error = XML_ERR_NO_MEMORY;
700 }
701 } else {
702 xmlOutputBufferWriteQuotedString(buf, value);
703 }
704 xmlFree(value);
705 } else {
706 buf->error = XML_ERR_NO_MEMORY;
707 }
708 }
709 }
710
/**
 * htmlNodeDumpFormatOutput:
 * @buf:  the HTML buffer output
 * @doc:  the document
 * @cur:  the current node
 * @encoding:  the encoding string (unused)
 * @format:  should formatting spaces been added
 *
 * Dump an HTML node and all of its descendants to @buf.
 *
 * The subtree is walked iteratively: the outer loop emits the opening
 * part of the current node and descends into its children, the inner
 * loop moves on to the next sibling or climbs back to the parent while
 * emitting end tags. The walk stops when it is back at the start node.
 * The function only recurses for element nodes whose parent pointer
 * does not match the node they were reached from.
 *
 * Nothing is written if @cur or @buf is NULL. If a text node cannot be
 * escaped, the error of @buf is set to XML_ERR_NO_MEMORY and the dump
 * is abandoned.
 */
void
htmlNodeDumpFormatOutput(xmlOutputBufferPtr buf, xmlDocPtr doc,
                         xmlNodePtr cur, const char *encoding ATTRIBUTE_UNUSED,
                         int format) {
    xmlNodePtr root, parent;
    xmlAttrPtr attr;
    const htmlElemDesc * info;

    xmlInitParser();

    if ((cur == NULL) || (buf == NULL)) {
        return;
    }

    /*
     * root marks where the walk ends; parent tracks the node the walk
     * descended from, independently of the nodes' own parent pointers.
     */
    root = cur;
    parent = cur->parent;
    while (1) {
        switch (cur->type) {
        case XML_HTML_DOCUMENT_NODE:
        case XML_DOCUMENT_NODE:
            if (((xmlDocPtr) cur)->intSubset != NULL) {
                htmlDtdDumpOutput(buf, (xmlDocPtr) cur, NULL);
            }
            if (cur->children != NULL) {
                /* Always validate cur->parent when descending. */
                if (cur->parent == parent) {
                    parent = cur;
                    cur = cur->children;
                    continue;
                }
            } else {
                xmlOutputBufferWriteString(buf, "\n");
            }
            break;

        case XML_ELEMENT_NODE:
            /*
             * Some users like lxml are known to pass nodes with a corrupted
             * tree structure. Fall back to a recursive call to handle this
             * case.
             */
            if ((cur->parent != parent) && (cur->children != NULL)) {
                htmlNodeDumpFormatOutput(buf, doc, cur, encoding, format);
                break;
            }

            /*
             * Get specific HTML info for that node.
             */
            if (cur->ns == NULL)
                info = htmlTagLookup(cur->name);
            else
                info = NULL;

            /* Start tag: name, namespace declarations, attributes */
            xmlOutputBufferWriteString(buf, "<");
            if ((cur->ns != NULL) && (cur->ns->prefix != NULL)) {
                xmlOutputBufferWriteString(buf, (const char *)cur->ns->prefix);
                xmlOutputBufferWriteString(buf, ":");
            }
            xmlOutputBufferWriteString(buf, (const char *)cur->name);
            if (cur->nsDef)
                xmlNsListDumpOutput(buf, cur->nsDef);
            attr = cur->properties;
            while (attr != NULL) {
                htmlAttrDumpOutput(buf, doc, attr);
                attr = attr->next;
            }

            if ((info != NULL) && (info->empty)) {
                /* Element flagged empty: no content and no end tag */
                xmlOutputBufferWriteString(buf, ">");
            } else if (cur->children == NULL) {
                /*
                 * Childless element: the end tag is left out when the
                 * tag description has saveEndTag set, except for html
                 * and body which always get one.
                 */
                if ((info != NULL) && (info->saveEndTag != 0) &&
                    (xmlStrcmp(BAD_CAST info->name, BAD_CAST "html")) &&
                    (xmlStrcmp(BAD_CAST info->name, BAD_CAST "body"))) {
                    xmlOutputBufferWriteString(buf, ">");
                } else {
                    xmlOutputBufferWriteString(buf, "></");
                    if ((cur->ns != NULL) && (cur->ns->prefix != NULL)) {
                        xmlOutputBufferWriteString(buf,
                                (const char *)cur->ns->prefix);
                        xmlOutputBufferWriteString(buf, ":");
                    }
                    xmlOutputBufferWriteString(buf, (const char *)cur->name);
                    xmlOutputBufferWriteString(buf, ">");
                }
            } else {
                /*
                 * Element with children: close the start tag and descend.
                 * The end tag is written by the climbing loop below.
                 */
                xmlOutputBufferWriteString(buf, ">");
                if ((format) && (info != NULL) && (!info->isinline) &&
                    (cur->children->type != HTML_TEXT_NODE) &&
                    (cur->children->type != HTML_ENTITY_REF_NODE) &&
                    (cur->children != cur->last) &&
                    (cur->name != NULL) &&
                    (cur->name[0] != 'p')) /* p, pre, param */
                    xmlOutputBufferWriteString(buf, "\n");
                parent = cur;
                cur = cur->children;
                continue;
            }

            /*
             * Formatting newline after a non-inline element, unless the
             * next sibling is text or an entity reference.
             */
            if ((format) && (cur->next != NULL) &&
                (info != NULL) && (!info->isinline)) {
                if ((cur->next->type != HTML_TEXT_NODE) &&
                    (cur->next->type != HTML_ENTITY_REF_NODE) &&
                    (parent != NULL) &&
                    (parent->name != NULL) &&
                    (parent->name[0] != 'p')) /* p, pre, param */
                    xmlOutputBufferWriteString(buf, "\n");
            }

            break;

        case XML_ATTRIBUTE_NODE:
            htmlAttrDumpOutput(buf, doc, (xmlAttrPtr) cur);
            break;

        case HTML_TEXT_NODE:
            if (cur->content == NULL)
                break;
            /*
             * Text is escaped unless the node is named
             * xmlStringTextNoenc or sits inside script/style.
             * NOTE(review): the first clause of the name test looks
             * redundant, the test seems to reduce to
             * "name != xmlStringTextNoenc" - confirm before simplifying.
             */
            if (((cur->name == (const xmlChar *)xmlStringText) ||
                 (cur->name != (const xmlChar *)xmlStringTextNoenc)) &&
                ((parent == NULL) ||
                 ((xmlStrcasecmp(parent->name, BAD_CAST "script")) &&
                  (xmlStrcasecmp(parent->name, BAD_CAST "style"))))) {
                xmlChar *buffer;

                buffer = xmlEncodeEntitiesReentrant(doc, cur->content);
                if (buffer == NULL) {
                    /* Abandons the whole dump, not just this node */
                    buf->error = XML_ERR_NO_MEMORY;
                    return;
                }
                xmlOutputBufferWriteString(buf, (const char *)buffer);
                xmlFree(buffer);
            } else {
                xmlOutputBufferWriteString(buf, (const char *)cur->content);
            }
            break;

        case HTML_COMMENT_NODE:
            if (cur->content != NULL) {
                xmlOutputBufferWriteString(buf, "<!--");
                xmlOutputBufferWriteString(buf, (const char *)cur->content);
                xmlOutputBufferWriteString(buf, "-->");
            }
            break;

        case HTML_PI_NODE:
            /* Processing instructions are closed with ">" only */
            if (cur->name != NULL) {
                xmlOutputBufferWriteString(buf, "<?");
                xmlOutputBufferWriteString(buf, (const char *)cur->name);
                if (cur->content != NULL) {
                    xmlOutputBufferWriteString(buf, " ");
                    xmlOutputBufferWriteString(buf,
                            (const char *)cur->content);
                }
                xmlOutputBufferWriteString(buf, ">");
            }
            break;

        case HTML_ENTITY_REF_NODE:
            xmlOutputBufferWriteString(buf, "&");
            xmlOutputBufferWriteString(buf, (const char *)cur->name);
            xmlOutputBufferWriteString(buf, ";");
            break;

        case HTML_PRESERVE_NODE:
            /* Content is written verbatim, without escaping */
            if (cur->content != NULL) {
                xmlOutputBufferWriteString(buf, (const char *)cur->content);
            }
            break;

        default:
            break;
        }

        /*
         * The current node is finished: go to its next sibling, or climb
         * to the parent, closing it, until a sibling exists or the start
         * node is reached.
         */
        while (1) {
            if (cur == root)
                return;
            if (cur->next != NULL) {
                cur = cur->next;
                break;
            }

            cur = parent;
            /* cur->parent was validated when descending. */
            parent = cur->parent;

            if ((cur->type == XML_HTML_DOCUMENT_NODE) ||
                (cur->type == XML_DOCUMENT_NODE)) {
                xmlOutputBufferWriteString(buf, "\n");
            } else {
                /* The tag description is only needed for formatting here */
                if ((format) && (cur->ns == NULL))
                    info = htmlTagLookup(cur->name);
                else
                    info = NULL;

                if ((format) && (info != NULL) && (!info->isinline) &&
                    (cur->last->type != HTML_TEXT_NODE) &&
                    (cur->last->type != HTML_ENTITY_REF_NODE) &&
                    (cur->children != cur->last) &&
                    (cur->name != NULL) &&
                    (cur->name[0] != 'p')) /* p, pre, param */
                    xmlOutputBufferWriteString(buf, "\n");

                /* End tag of the element being left */
                xmlOutputBufferWriteString(buf, "</");
                if ((cur->ns != NULL) && (cur->ns->prefix != NULL)) {
                    xmlOutputBufferWriteString(buf, (const char *)cur->ns->prefix);
                    xmlOutputBufferWriteString(buf, ":");
                }
                xmlOutputBufferWriteString(buf, (const char *)cur->name);
                xmlOutputBufferWriteString(buf, ">");

                if ((format) && (info != NULL) && (!info->isinline) &&
                    (cur->next != NULL)) {
                    if ((cur->next->type != HTML_TEXT_NODE) &&
                        (cur->next->type != HTML_ENTITY_REF_NODE) &&
                        (parent != NULL) &&
                        (parent->name != NULL) &&
                        (parent->name[0] != 'p')) /* p, pre, param */
                        xmlOutputBufferWriteString(buf, "\n");
                }
            }
        }
    }
}
945
946 /**
947 * htmlNodeDumpOutput:
948 * @buf: the HTML buffer output
949 * @doc: the document
950 * @cur: the current node
951 * @encoding: the encoding string (unused)
952 *
953 * Dump an HTML node, recursive behaviour,children are printed too,
954 * and formatting returns/spaces are added.
955 */
956 void
htmlNodeDumpOutput(xmlOutputBufferPtr buf,xmlDocPtr doc,xmlNodePtr cur,const char * encoding ATTRIBUTE_UNUSED)957 htmlNodeDumpOutput(xmlOutputBufferPtr buf, xmlDocPtr doc,
958 xmlNodePtr cur, const char *encoding ATTRIBUTE_UNUSED) {
959 htmlNodeDumpFormatOutput(buf, doc, cur, NULL, 1);
960 }
961
962 /**
963 * htmlDocContentDumpFormatOutput:
964 * @buf: the HTML buffer output
965 * @cur: the document
966 * @encoding: the encoding string (unused)
967 * @format: should formatting spaces been added
968 *
969 * Dump an HTML document.
970 */
971 void
htmlDocContentDumpFormatOutput(xmlOutputBufferPtr buf,xmlDocPtr cur,const char * encoding ATTRIBUTE_UNUSED,int format)972 htmlDocContentDumpFormatOutput(xmlOutputBufferPtr buf, xmlDocPtr cur,
973 const char *encoding ATTRIBUTE_UNUSED,
974 int format) {
975 int type = 0;
976 if (cur) {
977 type = cur->type;
978 cur->type = XML_HTML_DOCUMENT_NODE;
979 }
980 htmlNodeDumpFormatOutput(buf, cur, (xmlNodePtr) cur, NULL, format);
981 if (cur)
982 cur->type = (xmlElementType) type;
983 }
984
985 /**
986 * htmlDocContentDumpOutput:
987 * @buf: the HTML buffer output
988 * @cur: the document
989 * @encoding: the encoding string (unused)
990 *
991 * Dump an HTML document. Formatting return/spaces are added.
992 */
993 void
htmlDocContentDumpOutput(xmlOutputBufferPtr buf,xmlDocPtr cur,const char * encoding ATTRIBUTE_UNUSED)994 htmlDocContentDumpOutput(xmlOutputBufferPtr buf, xmlDocPtr cur,
995 const char *encoding ATTRIBUTE_UNUSED) {
996 htmlNodeDumpFormatOutput(buf, cur, (xmlNodePtr) cur, NULL, 1);
997 }
998
999 /************************************************************************
1000 * *
1001 * Saving functions front-ends *
1002 * *
1003 ************************************************************************/
1004
1005 /**
1006 * htmlDocDump:
1007 * @f: the FILE*
1008 * @cur: the document
1009 *
1010 * Dump an HTML document to an open FILE.
1011 *
1012 * returns: the number of byte written or -1 in case of failure.
1013 */
1014 int
htmlDocDump(FILE * f,xmlDocPtr cur)1015 htmlDocDump(FILE *f, xmlDocPtr cur) {
1016 xmlOutputBufferPtr buf;
1017 xmlCharEncodingHandlerPtr handler = NULL;
1018 const char *encoding;
1019 int ret;
1020
1021 xmlInitParser();
1022
1023 if ((cur == NULL) || (f == NULL)) {
1024 return(-1);
1025 }
1026
1027 encoding = (const char *) htmlGetMetaEncoding(cur);
1028 handler = htmlFindOutputEncoder(encoding);
1029 buf = xmlOutputBufferCreateFile(f, handler);
1030 if (buf == NULL)
1031 return(-1);
1032 htmlDocContentDumpOutput(buf, cur, NULL);
1033
1034 ret = xmlOutputBufferClose(buf);
1035 return(ret);
1036 }
1037
1038 /**
1039 * htmlSaveFile:
1040 * @filename: the filename (or URL)
1041 * @cur: the document
1042 *
1043 * Dump an HTML document to a file. If @filename is "-" the stdout file is
1044 * used.
1045 * returns: the number of byte written or -1 in case of failure.
1046 */
1047 int
htmlSaveFile(const char * filename,xmlDocPtr cur)1048 htmlSaveFile(const char *filename, xmlDocPtr cur) {
1049 xmlOutputBufferPtr buf;
1050 xmlCharEncodingHandlerPtr handler = NULL;
1051 const char *encoding;
1052 int ret;
1053
1054 if ((cur == NULL) || (filename == NULL))
1055 return(-1);
1056
1057 xmlInitParser();
1058
1059 encoding = (const char *) htmlGetMetaEncoding(cur);
1060 handler = htmlFindOutputEncoder(encoding);
1061 buf = xmlOutputBufferCreateFilename(filename, handler, cur->compression);
1062 if (buf == NULL)
1063 return(0);
1064
1065 htmlDocContentDumpOutput(buf, cur, NULL);
1066
1067 ret = xmlOutputBufferClose(buf);
1068 return(ret);
1069 }
1070
1071 /**
1072 * htmlSaveFileFormat:
1073 * @filename: the filename
1074 * @cur: the document
1075 * @format: should formatting spaces been added
1076 * @encoding: the document encoding
1077 *
1078 * Dump an HTML document to a file using a given encoding.
1079 *
1080 * returns: the number of byte written or -1 in case of failure.
1081 */
1082 int
htmlSaveFileFormat(const char * filename,xmlDocPtr cur,const char * encoding,int format)1083 htmlSaveFileFormat(const char *filename, xmlDocPtr cur,
1084 const char *encoding, int format) {
1085 xmlOutputBufferPtr buf;
1086 xmlCharEncodingHandlerPtr handler = NULL;
1087 int ret;
1088
1089 if ((cur == NULL) || (filename == NULL))
1090 return(-1);
1091
1092 xmlInitParser();
1093
1094 handler = htmlFindOutputEncoder(encoding);
1095 if (handler != NULL)
1096 htmlSetMetaEncoding(cur, (const xmlChar *) handler->name);
1097 else
1098 htmlSetMetaEncoding(cur, (const xmlChar *) "UTF-8");
1099
1100 /*
1101 * save the content to a temp buffer.
1102 */
1103 buf = xmlOutputBufferCreateFilename(filename, handler, 0);
1104 if (buf == NULL)
1105 return(0);
1106
1107 htmlDocContentDumpFormatOutput(buf, cur, encoding, format);
1108
1109 ret = xmlOutputBufferClose(buf);
1110 return(ret);
1111 }
1112
1113 /**
1114 * htmlSaveFileEnc:
1115 * @filename: the filename
1116 * @cur: the document
1117 * @encoding: the document encoding
1118 *
1119 * Dump an HTML document to a file using a given encoding
1120 * and formatting returns/spaces are added.
1121 *
1122 * returns: the number of byte written or -1 in case of failure.
1123 */
1124 int
htmlSaveFileEnc(const char * filename,xmlDocPtr cur,const char * encoding)1125 htmlSaveFileEnc(const char *filename, xmlDocPtr cur, const char *encoding) {
1126 return(htmlSaveFileFormat(filename, cur, encoding, 1));
1127 }
1128
1129 #endif /* LIBXML_OUTPUT_ENABLED */
1130
1131 #endif /* LIBXML_HTML_ENABLED */
1132