xref: /aosp_15_r20/external/libxml2/HTMLtree.c (revision 7c5688314b92172186c154356a6374bf7684c3ca)
/*
 * HTMLtree.c : implementation of access functions for an HTML tree.
 *
 * See Copyright for the status of this software.
 *
 * daniel@veillard.com
 */
8 
9 
10 #define IN_LIBXML
11 #include "libxml.h"
12 #ifdef LIBXML_HTML_ENABLED
13 
14 #include <string.h> /* for memset() only ! */
15 #include <ctype.h>
16 #include <stdlib.h>
17 
18 #include <libxml/xmlmemory.h>
19 #include <libxml/HTMLparser.h>
20 #include <libxml/HTMLtree.h>
21 #include <libxml/entities.h>
22 #include <libxml/xmlerror.h>
23 #include <libxml/parserInternals.h>
24 #include <libxml/uri.h>
25 
26 #include "private/buf.h"
27 #include "private/error.h"
28 #include "private/io.h"
29 #include "private/save.h"
30 
31 /************************************************************************
32  *									*
33  *		Getting/Setting encoding meta tags			*
34  *									*
35  ************************************************************************/
36 
37 /**
38  * htmlGetMetaEncoding:
39  * @doc:  the document
40  *
41  * Encoding definition lookup in the Meta tags
42  *
43  * Returns the current encoding as flagged in the HTML source
44  */
45 const xmlChar *
htmlGetMetaEncoding(htmlDocPtr doc)46 htmlGetMetaEncoding(htmlDocPtr doc) {
47     htmlNodePtr cur;
48     const xmlChar *content;
49     const xmlChar *encoding;
50 
51     if (doc == NULL)
52 	return(NULL);
53     cur = doc->children;
54 
55     /*
56      * Search the html
57      */
58     while (cur != NULL) {
59 	if ((cur->type == XML_ELEMENT_NODE) && (cur->name != NULL)) {
60 	    if (xmlStrEqual(cur->name, BAD_CAST"html"))
61 		break;
62 	    if (xmlStrEqual(cur->name, BAD_CAST"head"))
63 		goto found_head;
64 	    if (xmlStrEqual(cur->name, BAD_CAST"meta"))
65 		goto found_meta;
66 	}
67 	cur = cur->next;
68     }
69     if (cur == NULL)
70 	return(NULL);
71     cur = cur->children;
72 
73     /*
74      * Search the head
75      */
76     while (cur != NULL) {
77 	if ((cur->type == XML_ELEMENT_NODE) && (cur->name != NULL)) {
78 	    if (xmlStrEqual(cur->name, BAD_CAST"head"))
79 		break;
80 	    if (xmlStrEqual(cur->name, BAD_CAST"meta"))
81 		goto found_meta;
82 	}
83 	cur = cur->next;
84     }
85     if (cur == NULL)
86 	return(NULL);
87 found_head:
88     cur = cur->children;
89 
90     /*
91      * Search the meta elements
92      */
93 found_meta:
94     while (cur != NULL) {
95 	if ((cur->type == XML_ELEMENT_NODE) && (cur->name != NULL)) {
96 	    if (xmlStrEqual(cur->name, BAD_CAST"meta")) {
97 		xmlAttrPtr attr = cur->properties;
98 		int http;
99 		const xmlChar *value;
100 
101 		content = NULL;
102 		http = 0;
103 		while (attr != NULL) {
104 		    if ((attr->children != NULL) &&
105 		        (attr->children->type == XML_TEXT_NODE) &&
106 		        (attr->children->next == NULL)) {
107 			value = attr->children->content;
108 			if ((!xmlStrcasecmp(attr->name, BAD_CAST"http-equiv"))
109 			 && (!xmlStrcasecmp(value, BAD_CAST"Content-Type")))
110 			    http = 1;
111 			else if ((value != NULL)
112 			 && (!xmlStrcasecmp(attr->name, BAD_CAST"content")))
113 			    content = value;
114 			if ((http != 0) && (content != NULL))
115 			    goto found_content;
116 		    }
117 		    attr = attr->next;
118 		}
119 	    }
120 	}
121 	cur = cur->next;
122     }
123     return(NULL);
124 
125 found_content:
126     encoding = xmlStrstr(content, BAD_CAST"charset=");
127     if (encoding == NULL)
128 	encoding = xmlStrstr(content, BAD_CAST"Charset=");
129     if (encoding == NULL)
130 	encoding = xmlStrstr(content, BAD_CAST"CHARSET=");
131     if (encoding != NULL) {
132 	encoding += 8;
133     } else {
134 	encoding = xmlStrstr(content, BAD_CAST"charset =");
135 	if (encoding == NULL)
136 	    encoding = xmlStrstr(content, BAD_CAST"Charset =");
137 	if (encoding == NULL)
138 	    encoding = xmlStrstr(content, BAD_CAST"CHARSET =");
139 	if (encoding != NULL)
140 	    encoding += 9;
141     }
142     if (encoding != NULL) {
143 	while ((*encoding == ' ') || (*encoding == '\t')) encoding++;
144     }
145     return(encoding);
146 }
147 
148 /**
149  * htmlSetMetaEncoding:
150  * @doc:  the document
151  * @encoding:  the encoding string
152  *
153  * Sets the current encoding in the Meta tags
154  * NOTE: this will not change the document content encoding, just
155  * the META flag associated.
156  *
157  * Returns 0 in case of success and -1 in case of error
158  */
159 int
htmlSetMetaEncoding(htmlDocPtr doc,const xmlChar * encoding)160 htmlSetMetaEncoding(htmlDocPtr doc, const xmlChar *encoding) {
161     htmlNodePtr cur, meta = NULL, head = NULL;
162     const xmlChar *content = NULL;
163     char newcontent[100];
164 
165     newcontent[0] = 0;
166 
167     if (doc == NULL)
168 	return(-1);
169 
170     /* html isn't a real encoding it's just libxml2 way to get entities */
171     if (!xmlStrcasecmp(encoding, BAD_CAST "html"))
172         return(-1);
173 
174     if (encoding != NULL) {
175 	snprintf(newcontent, sizeof(newcontent), "text/html; charset=%s",
176                 (char *)encoding);
177 	newcontent[sizeof(newcontent) - 1] = 0;
178     }
179 
180     cur = doc->children;
181 
182     /*
183      * Search the html
184      */
185     while (cur != NULL) {
186 	if ((cur->type == XML_ELEMENT_NODE) && (cur->name != NULL)) {
187 	    if (xmlStrcasecmp(cur->name, BAD_CAST"html") == 0)
188 		break;
189 	    if (xmlStrcasecmp(cur->name, BAD_CAST"head") == 0)
190 		goto found_head;
191 	    if (xmlStrcasecmp(cur->name, BAD_CAST"meta") == 0)
192 		goto found_meta;
193 	}
194 	cur = cur->next;
195     }
196     if (cur == NULL)
197 	return(-1);
198     cur = cur->children;
199 
200     /*
201      * Search the head
202      */
203     while (cur != NULL) {
204 	if ((cur->type == XML_ELEMENT_NODE) && (cur->name != NULL)) {
205 	    if (xmlStrcasecmp(cur->name, BAD_CAST"head") == 0)
206 		break;
207 	    if (xmlStrcasecmp(cur->name, BAD_CAST"meta") == 0) {
208                 head = cur->parent;
209 		goto found_meta;
210             }
211 	}
212 	cur = cur->next;
213     }
214     if (cur == NULL)
215 	return(-1);
216 found_head:
217     head = cur;
218     if (cur->children == NULL)
219         goto create;
220     cur = cur->children;
221 
222 found_meta:
223     /*
224      * Search and update all the remaining the meta elements carrying
225      * encoding information
226      */
227     while (cur != NULL) {
228 	if ((cur->type == XML_ELEMENT_NODE) && (cur->name != NULL)) {
229 	    if (xmlStrcasecmp(cur->name, BAD_CAST"meta") == 0) {
230 		xmlAttrPtr attr = cur->properties;
231 		int http;
232 		const xmlChar *value;
233 
234 		content = NULL;
235 		http = 0;
236 		while (attr != NULL) {
237 		    if ((attr->children != NULL) &&
238 		        (attr->children->type == XML_TEXT_NODE) &&
239 		        (attr->children->next == NULL)) {
240 			value = attr->children->content;
241 			if ((!xmlStrcasecmp(attr->name, BAD_CAST"http-equiv"))
242 			 && (!xmlStrcasecmp(value, BAD_CAST"Content-Type")))
243 			    http = 1;
244 			else
245                         {
246                            if ((value != NULL) &&
247                                (!xmlStrcasecmp(attr->name, BAD_CAST"content")))
248 			       content = value;
249                         }
250 		        if ((http != 0) && (content != NULL))
251 			    break;
252 		    }
253 		    attr = attr->next;
254 		}
255 		if ((http != 0) && (content != NULL)) {
256 		    meta = cur;
257 		    break;
258 		}
259 
260 	    }
261 	}
262 	cur = cur->next;
263     }
264 create:
265     if (meta == NULL) {
266         if ((encoding != NULL) && (head != NULL)) {
267             /*
268              * Create a new Meta element with the right attributes
269              */
270 
271             meta = xmlNewDocNode(doc, NULL, BAD_CAST"meta", NULL);
272             if (head->children == NULL)
273                 xmlAddChild(head, meta);
274             else
275                 xmlAddPrevSibling(head->children, meta);
276             xmlNewProp(meta, BAD_CAST"http-equiv", BAD_CAST"Content-Type");
277             xmlNewProp(meta, BAD_CAST"content", BAD_CAST newcontent);
278         }
279     } else {
280         /* remove the meta tag if NULL is passed */
281         if (encoding == NULL) {
282             xmlUnlinkNode(meta);
283             xmlFreeNode(meta);
284         }
285         /* change the document only if there is a real encoding change */
286         else if (xmlStrcasestr(content, encoding) == NULL) {
287             xmlSetProp(meta, BAD_CAST"content", BAD_CAST newcontent);
288         }
289     }
290 
291 
292     return(0);
293 }
294 
/**
 * htmlBooleanAttrs:
 *
 * NULL-terminated list of the HTML attributes which will be output
 * in minimized form, i.e. <option selected="selected"> will be
 * output as <option selected>, as per XSLT 1.0 16.2 "HTML Output Method"
 *
 */
static const char* const htmlBooleanAttrs[] = {
    "checked",
    "compact",
    "declare",
    "defer",
    "disabled",
    "ismap",
    "multiple",
    "nohref",
    "noresize",
    "noshade",
    "nowrap",
    "readonly",
    "selected",
    NULL
};
308 
309 
310 /**
311  * htmlIsBooleanAttr:
312  * @name:  the name of the attribute to check
313  *
314  * DEPRECATED: Internal function, don't use.
315  *
316  * Determine if a given attribute is a boolean attribute.
317  *
318  * returns: false if the attribute is not boolean, true otherwise.
319  */
320 int
htmlIsBooleanAttr(const xmlChar * name)321 htmlIsBooleanAttr(const xmlChar *name)
322 {
323     int i = 0;
324 
325     while (htmlBooleanAttrs[i] != NULL) {
326         if (xmlStrcasecmp((const xmlChar *)htmlBooleanAttrs[i], name) == 0)
327             return 1;
328         i++;
329     }
330     return 0;
331 }
332 
333 #ifdef LIBXML_OUTPUT_ENABLED
334 /************************************************************************
335  *									*
336  *			Output error handlers				*
337  *									*
338  ************************************************************************/
339 
340 /**
341  * htmlSaveErr:
342  * @code:  the error number
343  * @node:  the location of the error.
344  * @extra:  extra information
345  *
346  * Handle an out of memory condition
347  */
348 static void
htmlSaveErr(int code,xmlNodePtr node,const char * extra)349 htmlSaveErr(int code, xmlNodePtr node, const char *extra)
350 {
351     const char *msg = NULL;
352     int res;
353 
354     switch(code) {
355         case XML_SAVE_NOT_UTF8:
356 	    msg = "string is not in UTF-8\n";
357 	    break;
358 	case XML_SAVE_CHAR_INVALID:
359 	    msg = "invalid character value\n";
360 	    break;
361 	case XML_SAVE_UNKNOWN_ENCODING:
362 	    msg = "unknown encoding %s\n";
363 	    break;
364 	case XML_SAVE_NO_DOCTYPE:
365 	    msg = "HTML has no DOCTYPE\n";
366 	    break;
367 	default:
368 	    msg = "unexpected error number\n";
369     }
370 
371     res = xmlRaiseError(NULL, NULL, NULL, NULL, node,
372                         XML_FROM_OUTPUT, code, XML_ERR_ERROR, NULL, 0,
373                         extra, NULL, NULL, 0, 0,
374                         msg, extra);
375     if (res < 0)
376         xmlRaiseMemoryError(NULL, NULL, NULL, XML_FROM_OUTPUT, NULL);
377 }
378 
379 /************************************************************************
380  *									*
381  *		Dumping HTML tree content to a simple buffer		*
382  *									*
383  ************************************************************************/
384 
385 static xmlCharEncodingHandler *
htmlFindOutputEncoder(const char * encoding)386 htmlFindOutputEncoder(const char *encoding) {
387     xmlCharEncodingHandler *handler = NULL;
388 
389     if (encoding != NULL) {
390         int res;
391 
392         res = xmlOpenCharEncodingHandler(encoding, /* output */ 1,
393                                          &handler);
394         if (res != XML_ERR_OK)
395             htmlSaveErr(XML_SAVE_UNKNOWN_ENCODING, NULL, encoding);
396     } else {
397         /*
398          * Fallback to HTML when the encoding is unspecified
399          */
400         xmlOpenCharEncodingHandler("HTML", /* output */ 1, &handler);
401     }
402 
403     return(handler);
404 }
405 
406 /**
407  * htmlBufNodeDumpFormat:
408  * @buf:  the xmlBufPtr output
409  * @doc:  the document
410  * @cur:  the current node
411  * @format:  should formatting spaces been added
412  *
413  * Dump an HTML node, recursive behaviour,children are printed too.
414  *
415  * Returns the number of byte written or -1 in case of error
416  */
417 static size_t
htmlBufNodeDumpFormat(xmlBufPtr buf,xmlDocPtr doc,xmlNodePtr cur,int format)418 htmlBufNodeDumpFormat(xmlBufPtr buf, xmlDocPtr doc, xmlNodePtr cur,
419 	           int format) {
420     size_t use;
421     size_t ret;
422     xmlOutputBufferPtr outbuf;
423 
424     if (cur == NULL) {
425 	return ((size_t) -1);
426     }
427     if (buf == NULL) {
428 	return ((size_t) -1);
429     }
430     outbuf = (xmlOutputBufferPtr) xmlMalloc(sizeof(xmlOutputBuffer));
431     if (outbuf == NULL)
432 	return ((size_t) -1);
433     memset(outbuf, 0, sizeof(xmlOutputBuffer));
434     outbuf->buffer = buf;
435     outbuf->encoder = NULL;
436     outbuf->writecallback = NULL;
437     outbuf->closecallback = NULL;
438     outbuf->context = NULL;
439     outbuf->written = 0;
440 
441     use = xmlBufUse(buf);
442     htmlNodeDumpFormatOutput(outbuf, doc, cur, NULL, format);
443     if (outbuf->error)
444         ret = (size_t) -1;
445     else
446         ret = xmlBufUse(buf) - use;
447     xmlFree(outbuf);
448     return (ret);
449 }
450 
451 /**
452  * htmlNodeDump:
453  * @buf:  the HTML buffer output
454  * @doc:  the document
455  * @cur:  the current node
456  *
457  * Dump an HTML node, recursive behaviour,children are printed too,
458  * and formatting returns are added.
459  *
460  * Returns the number of byte written or -1 in case of error
461  */
462 int
htmlNodeDump(xmlBufferPtr buf,xmlDocPtr doc,xmlNodePtr cur)463 htmlNodeDump(xmlBufferPtr buf, xmlDocPtr doc, xmlNodePtr cur) {
464     xmlBufPtr buffer;
465     size_t ret1;
466     int ret2;
467 
468     if ((buf == NULL) || (cur == NULL))
469         return(-1);
470 
471     xmlInitParser();
472     buffer = xmlBufFromBuffer(buf);
473     if (buffer == NULL)
474         return(-1);
475 
476     ret1 = htmlBufNodeDumpFormat(buffer, doc, cur, 1);
477 
478     ret2 = xmlBufBackToBuffer(buffer, buf);
479 
480     if ((ret1 == (size_t) -1) || (ret2 < 0))
481         return(-1);
482     return(ret1 > INT_MAX ? INT_MAX : ret1);
483 }
484 
485 /**
486  * htmlNodeDumpFileFormat:
487  * @out:  the FILE pointer
488  * @doc:  the document
489  * @cur:  the current node
490  * @encoding: the document encoding
491  * @format:  should formatting spaces been added
492  *
493  * Dump an HTML node, recursive behaviour,children are printed too.
494  *
495  * TODO: if encoding == NULL try to save in the doc encoding
496  *
497  * returns: the number of byte written or -1 in case of failure.
498  */
499 int
htmlNodeDumpFileFormat(FILE * out,xmlDocPtr doc,xmlNodePtr cur,const char * encoding,int format)500 htmlNodeDumpFileFormat(FILE *out, xmlDocPtr doc,
501 	               xmlNodePtr cur, const char *encoding, int format) {
502     xmlOutputBufferPtr buf;
503     xmlCharEncodingHandlerPtr handler;
504     int ret;
505 
506     xmlInitParser();
507 
508     /*
509      * save the content to a temp buffer.
510      */
511     handler = htmlFindOutputEncoder(encoding);
512     buf = xmlOutputBufferCreateFile(out, handler);
513     if (buf == NULL)
514         return(0);
515 
516     htmlNodeDumpFormatOutput(buf, doc, cur, NULL, format);
517 
518     ret = xmlOutputBufferClose(buf);
519     return(ret);
520 }
521 
522 /**
523  * htmlNodeDumpFile:
524  * @out:  the FILE pointer
525  * @doc:  the document
526  * @cur:  the current node
527  *
528  * Dump an HTML node, recursive behaviour,children are printed too,
529  * and formatting returns are added.
530  */
531 void
htmlNodeDumpFile(FILE * out,xmlDocPtr doc,xmlNodePtr cur)532 htmlNodeDumpFile(FILE *out, xmlDocPtr doc, xmlNodePtr cur) {
533     htmlNodeDumpFileFormat(out, doc, cur, NULL, 1);
534 }
535 
536 /**
537  * htmlDocDumpMemoryFormat:
538  * @cur:  the document
539  * @mem:  OUT: the memory pointer
540  * @size:  OUT: the memory length
541  * @format:  should formatting spaces been added
542  *
543  * Dump an HTML document in memory and return the xmlChar * and it's size.
544  * It's up to the caller to free the memory.
545  */
546 void
htmlDocDumpMemoryFormat(xmlDocPtr cur,xmlChar ** mem,int * size,int format)547 htmlDocDumpMemoryFormat(xmlDocPtr cur, xmlChar**mem, int *size, int format) {
548     xmlOutputBufferPtr buf;
549     xmlCharEncodingHandlerPtr handler = NULL;
550     const char *encoding;
551 
552     xmlInitParser();
553 
554     if ((mem == NULL) || (size == NULL))
555         return;
556     *mem = NULL;
557     *size = 0;
558     if (cur == NULL)
559 	return;
560 
561     encoding = (const char *) htmlGetMetaEncoding(cur);
562     handler = htmlFindOutputEncoder(encoding);
563     buf = xmlAllocOutputBuffer(handler);
564     if (buf == NULL)
565 	return;
566 
567     htmlDocContentDumpFormatOutput(buf, cur, NULL, format);
568 
569     xmlOutputBufferFlush(buf);
570 
571     if (!buf->error) {
572         if (buf->conv != NULL) {
573             *size = xmlBufUse(buf->conv);
574             *mem = xmlStrndup(xmlBufContent(buf->conv), *size);
575         } else {
576             *size = xmlBufUse(buf->buffer);
577             *mem = xmlStrndup(xmlBufContent(buf->buffer), *size);
578         }
579     }
580 
581     xmlOutputBufferClose(buf);
582 }
583 
584 /**
585  * htmlDocDumpMemory:
586  * @cur:  the document
587  * @mem:  OUT: the memory pointer
588  * @size:  OUT: the memory length
589  *
590  * Dump an HTML document in memory and return the xmlChar * and it's size.
591  * It's up to the caller to free the memory.
592  */
593 void
htmlDocDumpMemory(xmlDocPtr cur,xmlChar ** mem,int * size)594 htmlDocDumpMemory(xmlDocPtr cur, xmlChar**mem, int *size) {
595 	htmlDocDumpMemoryFormat(cur, mem, size, 1);
596 }
597 
598 
599 /************************************************************************
600  *									*
601  *		Dumping HTML tree content to an I/O output buffer	*
602  *									*
603  ************************************************************************/
604 
605 /**
606  * htmlDtdDumpOutput:
607  * @buf:  the HTML buffer output
608  * @doc:  the document
609  * @encoding:  the encoding string
610  *
611  * TODO: check whether encoding is needed
612  *
613  * Dump the HTML document DTD, if any.
614  */
615 static void
htmlDtdDumpOutput(xmlOutputBufferPtr buf,xmlDocPtr doc,const char * encoding ATTRIBUTE_UNUSED)616 htmlDtdDumpOutput(xmlOutputBufferPtr buf, xmlDocPtr doc,
617 	          const char *encoding ATTRIBUTE_UNUSED) {
618     xmlDtdPtr cur = doc->intSubset;
619 
620     if (cur == NULL) {
621 	htmlSaveErr(XML_SAVE_NO_DOCTYPE, (xmlNodePtr) doc, NULL);
622 	return;
623     }
624     xmlOutputBufferWriteString(buf, "<!DOCTYPE ");
625     xmlOutputBufferWriteString(buf, (const char *)cur->name);
626     if (cur->ExternalID != NULL) {
627 	xmlOutputBufferWriteString(buf, " PUBLIC ");
628 	xmlOutputBufferWriteQuotedString(buf, cur->ExternalID);
629 	if (cur->SystemID != NULL) {
630 	    xmlOutputBufferWriteString(buf, " ");
631 	    xmlOutputBufferWriteQuotedString(buf, cur->SystemID);
632 	}
633     } else if (cur->SystemID != NULL &&
634 	       xmlStrcmp(cur->SystemID, BAD_CAST "about:legacy-compat")) {
635 	xmlOutputBufferWriteString(buf, " SYSTEM ");
636 	xmlOutputBufferWriteQuotedString(buf, cur->SystemID);
637     }
638     xmlOutputBufferWriteString(buf, ">\n");
639 }
640 
641 /**
642  * htmlAttrDumpOutput:
643  * @buf:  the HTML buffer output
644  * @doc:  the document
645  * @cur:  the attribute pointer
646  *
647  * Dump an HTML attribute
648  */
649 static void
htmlAttrDumpOutput(xmlOutputBufferPtr buf,xmlDocPtr doc,xmlAttrPtr cur)650 htmlAttrDumpOutput(xmlOutputBufferPtr buf, xmlDocPtr doc, xmlAttrPtr cur) {
651     xmlChar *value;
652 
653     /*
654      * The html output method should not escape a & character
655      * occurring in an attribute value immediately followed by
656      * a { character (see Section B.7.1 of the HTML 4.0 Recommendation).
657      * This is implemented in xmlEncodeEntitiesReentrant
658      */
659 
660     if (cur == NULL) {
661 	return;
662     }
663     xmlOutputBufferWriteString(buf, " ");
664     if ((cur->ns != NULL) && (cur->ns->prefix != NULL)) {
665         xmlOutputBufferWriteString(buf, (const char *)cur->ns->prefix);
666 	xmlOutputBufferWriteString(buf, ":");
667     }
668     xmlOutputBufferWriteString(buf, (const char *)cur->name);
669     if ((cur->children != NULL) && (!htmlIsBooleanAttr(cur->name))) {
670 	value = xmlNodeListGetString(doc, cur->children, 0);
671 	if (value) {
672 	    xmlOutputBufferWriteString(buf, "=");
673 	    if ((cur->ns == NULL) && (cur->parent != NULL) &&
674 		(cur->parent->ns == NULL) &&
675 		((!xmlStrcasecmp(cur->name, BAD_CAST "href")) ||
676 	         (!xmlStrcasecmp(cur->name, BAD_CAST "action")) ||
677 		 (!xmlStrcasecmp(cur->name, BAD_CAST "src")) ||
678 		 ((!xmlStrcasecmp(cur->name, BAD_CAST "name")) &&
679 		  (!xmlStrcasecmp(cur->parent->name, BAD_CAST "a"))))) {
680 		xmlChar *escaped;
681 		xmlChar *tmp = value;
682 
683 		while (IS_BLANK_CH(*tmp)) tmp++;
684 
685 		/*
686                  * Angle brackets are technically illegal in URIs, but they're
687                  * used in server side includes, for example. Curly brackets
688                  * are illegal as well and often used in templates.
689                  * Don't escape non-whitespace, printable ASCII chars for
690                  * improved interoperability. Only escape space, control
691                  * and non-ASCII chars.
692 		 */
693 		escaped = xmlURIEscapeStr(tmp,
694                         BAD_CAST "\"#$%&+,/:;<=>?@[\\]^`{|}");
695 		if (escaped != NULL) {
696 		    xmlOutputBufferWriteQuotedString(buf, escaped);
697 		    xmlFree(escaped);
698 		} else {
699                     buf->error = XML_ERR_NO_MEMORY;
700 		}
701 	    } else {
702 		xmlOutputBufferWriteQuotedString(buf, value);
703 	    }
704 	    xmlFree(value);
705 	} else  {
706             buf->error = XML_ERR_NO_MEMORY;
707 	}
708     }
709 }
710 
711 /**
712  * htmlNodeDumpFormatOutput:
713  * @buf:  the HTML buffer output
714  * @doc:  the document
715  * @cur:  the current node
716  * @encoding:  the encoding string (unused)
717  * @format:  should formatting spaces been added
718  *
719  * Dump an HTML node, recursive behaviour,children are printed too.
720  */
721 void
htmlNodeDumpFormatOutput(xmlOutputBufferPtr buf,xmlDocPtr doc,xmlNodePtr cur,const char * encoding ATTRIBUTE_UNUSED,int format)722 htmlNodeDumpFormatOutput(xmlOutputBufferPtr buf, xmlDocPtr doc,
723 	                 xmlNodePtr cur, const char *encoding ATTRIBUTE_UNUSED,
724                          int format) {
725     xmlNodePtr root, parent;
726     xmlAttrPtr attr;
727     const htmlElemDesc * info;
728 
729     xmlInitParser();
730 
731     if ((cur == NULL) || (buf == NULL)) {
732 	return;
733     }
734 
735     root = cur;
736     parent = cur->parent;
737     while (1) {
738         switch (cur->type) {
739         case XML_HTML_DOCUMENT_NODE:
740         case XML_DOCUMENT_NODE:
741             if (((xmlDocPtr) cur)->intSubset != NULL) {
742                 htmlDtdDumpOutput(buf, (xmlDocPtr) cur, NULL);
743             }
744             if (cur->children != NULL) {
745                 /* Always validate cur->parent when descending. */
746                 if (cur->parent == parent) {
747                     parent = cur;
748                     cur = cur->children;
749                     continue;
750                 }
751             } else {
752                 xmlOutputBufferWriteString(buf, "\n");
753             }
754             break;
755 
756         case XML_ELEMENT_NODE:
757             /*
758              * Some users like lxml are known to pass nodes with a corrupted
759              * tree structure. Fall back to a recursive call to handle this
760              * case.
761              */
762             if ((cur->parent != parent) && (cur->children != NULL)) {
763                 htmlNodeDumpFormatOutput(buf, doc, cur, encoding, format);
764                 break;
765             }
766 
767             /*
768              * Get specific HTML info for that node.
769              */
770             if (cur->ns == NULL)
771                 info = htmlTagLookup(cur->name);
772             else
773                 info = NULL;
774 
775             xmlOutputBufferWriteString(buf, "<");
776             if ((cur->ns != NULL) && (cur->ns->prefix != NULL)) {
777                 xmlOutputBufferWriteString(buf, (const char *)cur->ns->prefix);
778                 xmlOutputBufferWriteString(buf, ":");
779             }
780             xmlOutputBufferWriteString(buf, (const char *)cur->name);
781             if (cur->nsDef)
782                 xmlNsListDumpOutput(buf, cur->nsDef);
783             attr = cur->properties;
784             while (attr != NULL) {
785                 htmlAttrDumpOutput(buf, doc, attr);
786                 attr = attr->next;
787             }
788 
789             if ((info != NULL) && (info->empty)) {
790                 xmlOutputBufferWriteString(buf, ">");
791             } else if (cur->children == NULL) {
792                 if ((info != NULL) && (info->saveEndTag != 0) &&
793                     (xmlStrcmp(BAD_CAST info->name, BAD_CAST "html")) &&
794                     (xmlStrcmp(BAD_CAST info->name, BAD_CAST "body"))) {
795                     xmlOutputBufferWriteString(buf, ">");
796                 } else {
797                     xmlOutputBufferWriteString(buf, "></");
798                     if ((cur->ns != NULL) && (cur->ns->prefix != NULL)) {
799                         xmlOutputBufferWriteString(buf,
800                                 (const char *)cur->ns->prefix);
801                         xmlOutputBufferWriteString(buf, ":");
802                     }
803                     xmlOutputBufferWriteString(buf, (const char *)cur->name);
804                     xmlOutputBufferWriteString(buf, ">");
805                 }
806             } else {
807                 xmlOutputBufferWriteString(buf, ">");
808                 if ((format) && (info != NULL) && (!info->isinline) &&
809                     (cur->children->type != HTML_TEXT_NODE) &&
810                     (cur->children->type != HTML_ENTITY_REF_NODE) &&
811                     (cur->children != cur->last) &&
812                     (cur->name != NULL) &&
813                     (cur->name[0] != 'p')) /* p, pre, param */
814                     xmlOutputBufferWriteString(buf, "\n");
815                 parent = cur;
816                 cur = cur->children;
817                 continue;
818             }
819 
820             if ((format) && (cur->next != NULL) &&
821                 (info != NULL) && (!info->isinline)) {
822                 if ((cur->next->type != HTML_TEXT_NODE) &&
823                     (cur->next->type != HTML_ENTITY_REF_NODE) &&
824                     (parent != NULL) &&
825                     (parent->name != NULL) &&
826                     (parent->name[0] != 'p')) /* p, pre, param */
827                     xmlOutputBufferWriteString(buf, "\n");
828             }
829 
830             break;
831 
832         case XML_ATTRIBUTE_NODE:
833             htmlAttrDumpOutput(buf, doc, (xmlAttrPtr) cur);
834             break;
835 
836         case HTML_TEXT_NODE:
837             if (cur->content == NULL)
838                 break;
839             if (((cur->name == (const xmlChar *)xmlStringText) ||
840                  (cur->name != (const xmlChar *)xmlStringTextNoenc)) &&
841                 ((parent == NULL) ||
842                  ((xmlStrcasecmp(parent->name, BAD_CAST "script")) &&
843                   (xmlStrcasecmp(parent->name, BAD_CAST "style"))))) {
844                 xmlChar *buffer;
845 
846                 buffer = xmlEncodeEntitiesReentrant(doc, cur->content);
847                 if (buffer == NULL) {
848                     buf->error = XML_ERR_NO_MEMORY;
849                     return;
850                 }
851                 xmlOutputBufferWriteString(buf, (const char *)buffer);
852                 xmlFree(buffer);
853             } else {
854                 xmlOutputBufferWriteString(buf, (const char *)cur->content);
855             }
856             break;
857 
858         case HTML_COMMENT_NODE:
859             if (cur->content != NULL) {
860                 xmlOutputBufferWriteString(buf, "<!--");
861                 xmlOutputBufferWriteString(buf, (const char *)cur->content);
862                 xmlOutputBufferWriteString(buf, "-->");
863             }
864             break;
865 
866         case HTML_PI_NODE:
867             if (cur->name != NULL) {
868                 xmlOutputBufferWriteString(buf, "<?");
869                 xmlOutputBufferWriteString(buf, (const char *)cur->name);
870                 if (cur->content != NULL) {
871                     xmlOutputBufferWriteString(buf, " ");
872                     xmlOutputBufferWriteString(buf,
873                             (const char *)cur->content);
874                 }
875                 xmlOutputBufferWriteString(buf, ">");
876             }
877             break;
878 
879         case HTML_ENTITY_REF_NODE:
880             xmlOutputBufferWriteString(buf, "&");
881             xmlOutputBufferWriteString(buf, (const char *)cur->name);
882             xmlOutputBufferWriteString(buf, ";");
883             break;
884 
885         case HTML_PRESERVE_NODE:
886             if (cur->content != NULL) {
887                 xmlOutputBufferWriteString(buf, (const char *)cur->content);
888             }
889             break;
890 
891         default:
892             break;
893         }
894 
895         while (1) {
896             if (cur == root)
897                 return;
898             if (cur->next != NULL) {
899                 cur = cur->next;
900                 break;
901             }
902 
903             cur = parent;
904             /* cur->parent was validated when descending. */
905             parent = cur->parent;
906 
907             if ((cur->type == XML_HTML_DOCUMENT_NODE) ||
908                 (cur->type == XML_DOCUMENT_NODE)) {
909                 xmlOutputBufferWriteString(buf, "\n");
910             } else {
911                 if ((format) && (cur->ns == NULL))
912                     info = htmlTagLookup(cur->name);
913                 else
914                     info = NULL;
915 
916                 if ((format) && (info != NULL) && (!info->isinline) &&
917                     (cur->last->type != HTML_TEXT_NODE) &&
918                     (cur->last->type != HTML_ENTITY_REF_NODE) &&
919                     (cur->children != cur->last) &&
920                     (cur->name != NULL) &&
921                     (cur->name[0] != 'p')) /* p, pre, param */
922                     xmlOutputBufferWriteString(buf, "\n");
923 
924                 xmlOutputBufferWriteString(buf, "</");
925                 if ((cur->ns != NULL) && (cur->ns->prefix != NULL)) {
926                     xmlOutputBufferWriteString(buf, (const char *)cur->ns->prefix);
927                     xmlOutputBufferWriteString(buf, ":");
928                 }
929                 xmlOutputBufferWriteString(buf, (const char *)cur->name);
930                 xmlOutputBufferWriteString(buf, ">");
931 
932                 if ((format) && (info != NULL) && (!info->isinline) &&
933                     (cur->next != NULL)) {
934                     if ((cur->next->type != HTML_TEXT_NODE) &&
935                         (cur->next->type != HTML_ENTITY_REF_NODE) &&
936                         (parent != NULL) &&
937                         (parent->name != NULL) &&
938                         (parent->name[0] != 'p')) /* p, pre, param */
939                         xmlOutputBufferWriteString(buf, "\n");
940                 }
941             }
942         }
943     }
944 }
945 
946 /**
947  * htmlNodeDumpOutput:
948  * @buf:  the HTML buffer output
949  * @doc:  the document
950  * @cur:  the current node
951  * @encoding:  the encoding string (unused)
952  *
953  * Dump an HTML node, recursive behaviour,children are printed too,
954  * and formatting returns/spaces are added.
955  */
956 void
htmlNodeDumpOutput(xmlOutputBufferPtr buf,xmlDocPtr doc,xmlNodePtr cur,const char * encoding ATTRIBUTE_UNUSED)957 htmlNodeDumpOutput(xmlOutputBufferPtr buf, xmlDocPtr doc,
958 	           xmlNodePtr cur, const char *encoding ATTRIBUTE_UNUSED) {
959     htmlNodeDumpFormatOutput(buf, doc, cur, NULL, 1);
960 }
961 
962 /**
963  * htmlDocContentDumpFormatOutput:
964  * @buf:  the HTML buffer output
965  * @cur:  the document
966  * @encoding:  the encoding string (unused)
967  * @format:  should formatting spaces been added
968  *
969  * Dump an HTML document.
970  */
971 void
htmlDocContentDumpFormatOutput(xmlOutputBufferPtr buf,xmlDocPtr cur,const char * encoding ATTRIBUTE_UNUSED,int format)972 htmlDocContentDumpFormatOutput(xmlOutputBufferPtr buf, xmlDocPtr cur,
973 	                       const char *encoding ATTRIBUTE_UNUSED,
974                                int format) {
975     int type = 0;
976     if (cur) {
977         type = cur->type;
978         cur->type = XML_HTML_DOCUMENT_NODE;
979     }
980     htmlNodeDumpFormatOutput(buf, cur, (xmlNodePtr) cur, NULL, format);
981     if (cur)
982         cur->type = (xmlElementType) type;
983 }
984 
985 /**
986  * htmlDocContentDumpOutput:
987  * @buf:  the HTML buffer output
988  * @cur:  the document
989  * @encoding:  the encoding string (unused)
990  *
991  * Dump an HTML document. Formatting return/spaces are added.
992  */
993 void
htmlDocContentDumpOutput(xmlOutputBufferPtr buf,xmlDocPtr cur,const char * encoding ATTRIBUTE_UNUSED)994 htmlDocContentDumpOutput(xmlOutputBufferPtr buf, xmlDocPtr cur,
995 	                 const char *encoding ATTRIBUTE_UNUSED) {
996     htmlNodeDumpFormatOutput(buf, cur, (xmlNodePtr) cur, NULL, 1);
997 }
998 
999 /************************************************************************
1000  *									*
1001  *		Saving functions front-ends				*
1002  *									*
1003  ************************************************************************/
1004 
1005 /**
1006  * htmlDocDump:
1007  * @f:  the FILE*
1008  * @cur:  the document
1009  *
1010  * Dump an HTML document to an open FILE.
1011  *
1012  * returns: the number of byte written or -1 in case of failure.
1013  */
1014 int
htmlDocDump(FILE * f,xmlDocPtr cur)1015 htmlDocDump(FILE *f, xmlDocPtr cur) {
1016     xmlOutputBufferPtr buf;
1017     xmlCharEncodingHandlerPtr handler = NULL;
1018     const char *encoding;
1019     int ret;
1020 
1021     xmlInitParser();
1022 
1023     if ((cur == NULL) || (f == NULL)) {
1024 	return(-1);
1025     }
1026 
1027     encoding = (const char *) htmlGetMetaEncoding(cur);
1028     handler = htmlFindOutputEncoder(encoding);
1029     buf = xmlOutputBufferCreateFile(f, handler);
1030     if (buf == NULL)
1031         return(-1);
1032     htmlDocContentDumpOutput(buf, cur, NULL);
1033 
1034     ret = xmlOutputBufferClose(buf);
1035     return(ret);
1036 }
1037 
1038 /**
1039  * htmlSaveFile:
1040  * @filename:  the filename (or URL)
1041  * @cur:  the document
1042  *
1043  * Dump an HTML document to a file. If @filename is "-" the stdout file is
1044  * used.
1045  * returns: the number of byte written or -1 in case of failure.
1046  */
1047 int
htmlSaveFile(const char * filename,xmlDocPtr cur)1048 htmlSaveFile(const char *filename, xmlDocPtr cur) {
1049     xmlOutputBufferPtr buf;
1050     xmlCharEncodingHandlerPtr handler = NULL;
1051     const char *encoding;
1052     int ret;
1053 
1054     if ((cur == NULL) || (filename == NULL))
1055         return(-1);
1056 
1057     xmlInitParser();
1058 
1059     encoding = (const char *) htmlGetMetaEncoding(cur);
1060     handler = htmlFindOutputEncoder(encoding);
1061     buf = xmlOutputBufferCreateFilename(filename, handler, cur->compression);
1062     if (buf == NULL)
1063         return(0);
1064 
1065     htmlDocContentDumpOutput(buf, cur, NULL);
1066 
1067     ret = xmlOutputBufferClose(buf);
1068     return(ret);
1069 }
1070 
1071 /**
1072  * htmlSaveFileFormat:
1073  * @filename:  the filename
1074  * @cur:  the document
1075  * @format:  should formatting spaces been added
1076  * @encoding: the document encoding
1077  *
1078  * Dump an HTML document to a file using a given encoding.
1079  *
1080  * returns: the number of byte written or -1 in case of failure.
1081  */
1082 int
htmlSaveFileFormat(const char * filename,xmlDocPtr cur,const char * encoding,int format)1083 htmlSaveFileFormat(const char *filename, xmlDocPtr cur,
1084 	           const char *encoding, int format) {
1085     xmlOutputBufferPtr buf;
1086     xmlCharEncodingHandlerPtr handler = NULL;
1087     int ret;
1088 
1089     if ((cur == NULL) || (filename == NULL))
1090         return(-1);
1091 
1092     xmlInitParser();
1093 
1094     handler = htmlFindOutputEncoder(encoding);
1095     if (handler != NULL)
1096         htmlSetMetaEncoding(cur, (const xmlChar *) handler->name);
1097     else
1098 	htmlSetMetaEncoding(cur, (const xmlChar *) "UTF-8");
1099 
1100     /*
1101      * save the content to a temp buffer.
1102      */
1103     buf = xmlOutputBufferCreateFilename(filename, handler, 0);
1104     if (buf == NULL)
1105         return(0);
1106 
1107     htmlDocContentDumpFormatOutput(buf, cur, encoding, format);
1108 
1109     ret = xmlOutputBufferClose(buf);
1110     return(ret);
1111 }
1112 
1113 /**
1114  * htmlSaveFileEnc:
1115  * @filename:  the filename
1116  * @cur:  the document
1117  * @encoding: the document encoding
1118  *
1119  * Dump an HTML document to a file using a given encoding
1120  * and formatting returns/spaces are added.
1121  *
1122  * returns: the number of byte written or -1 in case of failure.
1123  */
1124 int
htmlSaveFileEnc(const char * filename,xmlDocPtr cur,const char * encoding)1125 htmlSaveFileEnc(const char *filename, xmlDocPtr cur, const char *encoding) {
1126     return(htmlSaveFileFormat(filename, cur, encoding, 1));
1127 }
1128 
1129 #endif /* LIBXML_OUTPUT_ENABLED */
1130 
1131 #endif /* LIBXML_HTML_ENABLED */
1132