1// Copyright 2021 The Go Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style
3// license that can be found in the LICENSE file.
4
5package markdown
6
7import (
8	"bytes"
9	"strconv"
10	"strings"
11	"unicode"
12)
13
14type HTMLBlock struct {
15	Position
16	Text []string
17}
18
19func (b *HTMLBlock) PrintHTML(buf *bytes.Buffer) {
20	for _, s := range b.Text {
21		buf.WriteString(s)
22		buf.WriteString("\n")
23	}
24}
25
26func (b *HTMLBlock) printMarkdown(buf *bytes.Buffer, s mdState) {
27	if s.prefix1 != "" {
28		buf.WriteString(s.prefix1)
29	} else {
30		buf.WriteString(s.prefix)
31	}
32	b.PrintHTML(buf)
33}
34
35type htmlBuilder struct {
36	endBlank bool
37	text     []string
38	endFunc  func(string) bool
39}
40
41func (c *htmlBuilder) extend(p *parseState, s line) (line, bool) {
42	if c.endBlank && s.isBlank() {
43		return s, false
44	}
45	t := s.string()
46	c.text = append(c.text, t)
47	if c.endFunc != nil && c.endFunc(t) {
48		return line{}, false
49	}
50	return line{}, true
51}
52
53func (c *htmlBuilder) build(p buildState) Block {
54	return &HTMLBlock{
55		p.pos(),
56		c.text,
57	}
58}
59
60func newHTML(p *parseState, s line) (line, bool) {
61	peek := s
62	if p.startHTML(&peek) {
63		return line{}, true
64	}
65	return s, false
66}
67
68func (p *parseState) startHTML(s *line) bool {
69	tt := *s
70	tt.trimSpace(0, 3, false)
71	if tt.peek() != '<' {
72		return false
73	}
74	t := tt.string()
75
76	var end string
77	switch {
78	case strings.HasPrefix(t, "<!--"):
79		end = "-->"
80	case strings.HasPrefix(t, "<?"):
81		end = "?>"
82	case strings.HasPrefix(t, "<![CDATA["):
83		end = "]]>"
84	case strings.HasPrefix(t, "<!") && len(t) >= 3 && isLetter(t[2]):
85		if 'a' <= t[2] && t[2] <= 'z' {
86			// Goldmark and the Dingus only accept <!UPPER> not <!lower>.
87			p.corner = true
88		}
89		end = ">"
90	}
91	if end != "" {
92		b := &htmlBuilder{endFunc: func(s string) bool { return strings.Contains(s, end) }}
93		p.addBlock(b)
94		b.text = append(b.text, s.string())
95		if b.endFunc(t) {
96			p.closeBlock()
97		}
98		return true
99	}
100
101	// case 6
102	i := 1
103	if i < len(t) && t[i] == '/' {
104		i++
105	}
106	buf := make([]byte, 0, 16)
107	for ; i < len(t) && len(buf) < 16; i++ {
108		c := t[i]
109		if 'A' <= c && c <= 'Z' {
110			c += 'a' - 'A'
111		}
112		if !('a' <= c && c <= 'z') && !('0' <= c && c <= '9') {
113			break
114		}
115		buf = append(buf, c)
116	}
117	var sep byte
118	if i < len(t) {
119		switch t[i] {
120		default:
121			goto Next
122		case ' ', '\t', '>':
123			// ok
124			sep = t[i]
125		case '/':
126			if i+1 >= len(t) || t[i+1] != '>' {
127				goto Next
128			}
129		}
130	}
131
132	if len(buf) == 0 {
133		goto Next
134	}
135	{
136		c := buf[0]
137		var ok bool
138		for _, name := range htmlTags {
139			if name[0] == c && len(name) == len(buf) && name == string(buf) {
140				if sep == '\t' {
141					// Goldmark recognizes space here but not tab.
142					// testdata/extra.txt 143.md
143					p.corner = true
144				}
145				ok = true
146				break
147			}
148		}
149		if !ok {
150			goto Next
151		}
152	}
153
154	{
155		b := &htmlBuilder{endBlank: true}
156		p.addBlock(b)
157		b.text = append(b.text, s.string())
158		return true
159	}
160
161Next:
162	// case 1
163	if len(t) > 1 && t[1] != '/' && (i >= len(t) || t[i] == ' ' || t[i] == '\t' || t[i] == '>') {
164		switch string(buf) {
165		case "pre", "script", "style", "textarea":
166			b := &htmlBuilder{endFunc: hasEndPre}
167			p.addBlock(b)
168			b.text = append(b.text, s.string())
169			if hasEndPre(t) {
170				p.closeBlock()
171			}
172			return true
173		}
174	}
175
176	// case 7
177	if p.para() == nil {
178		if _, e, ok := parseHTMLOpenTag(p, t, 0); ok && skipSpace(t, e) == len(t) {
179			if e != len(t) {
180				// Goldmark disallows trailing space
181				p.corner = true
182			}
183			b := &htmlBuilder{endBlank: true}
184			p.addBlock(b)
185			b.text = append(b.text, s.string())
186			return true
187		}
188		if _, e, ok := parseHTMLClosingTag(p, t, 0); ok && skipSpace(t, e) == len(t) {
189			b := &htmlBuilder{endBlank: true}
190			p.addBlock(b)
191			b.text = append(b.text, s.string())
192			return true
193		}
194	}
195
196	return false
197}
198
// hasEndPre reports whether s contains one of the closing tags
// </pre>, </script>, </style> or </textarea>.
// The tag name is matched without regard to ASCII case and must be
// followed immediately by '>'.
func hasEndPre(s string) bool {
	for i := 0; i+1 < len(s); i++ {
		if s[i] != '<' || s[i+1] != '/' {
			continue
		}
		// Scan the candidate name with a separate index. Advancing i here
		// would make the outer loop step over the byte that stopped the
		// scan, so a real closing tag starting at that byte (as in
		// "</</pre>" or "</x</pre>") would be missed.
		var name [8]byte // len("textarea") == 8, the longest name of interest
		n := 0
		j := i + 2
		for ; j < len(s) && n < len(name); j++ {
			c := s[j]
			if 'A' <= c && c <= 'Z' {
				c += 'a' - 'A'
			}
			if c < 'a' || 'z' < c {
				break
			}
			name[n] = c
			n++
		}
		if j < len(s) && s[j] == '>' {
			switch string(name[:n]) {
			case "pre", "script", "style", "textarea":
				return true
			}
		}
	}
	return false
}
223
224func parseHTMLTag(p *parseState, s string, i int) (Inline, int, bool) {
225	// “An HTML tag consists of an open tag, a closing tag, an HTML comment,
226	// a processing instruction, a declaration, or a CDATA section.”
227	if i+3 <= len(s) && s[i] == '<' {
228		switch s[i+1] {
229		default:
230			return parseHTMLOpenTag(p, s, i)
231		case '/':
232			return parseHTMLClosingTag(p, s, i)
233		case '!':
234			switch s[i+2] {
235			case '-':
236				return parseHTMLComment(s, i)
237			case '[':
238				return parseHTMLCDATA(s, i)
239			default:
240				return parseHTMLDecl(p, s, i)
241			}
242		case '?':
243			return parseHTMLProcInst(s, i)
244		}
245	}
246	return nil, 0, false
247}
248
249func parseHTMLOpenTag(p *parseState, s string, i int) (Inline, int, bool) {
250	if i >= len(s) || s[i] != '<' {
251		return nil, 0, false
252	}
253	// “An open tag consists of a < character, a tag name, zero or more attributes,
254	// optional spaces, tabs, and up to one line ending, an optional / character, and a > character.”
255	if name, j, ok := parseTagName(s, i+1); ok {
256		switch name {
257		case "pre", "script", "style", "textarea":
258			// Goldmark treats these as starting a new HTMLBlock
259			// and ending the paragraph they appear in.
260			p.corner = true
261		}
262		for {
263			if j >= len(s) || s[j] != ' ' && s[j] != '\t' && s[j] != '\n' && s[j] != '/' && s[j] != '>' {
264				return nil, 0, false
265			}
266			_, k, ok := parseAttr(p, s, j)
267			if !ok {
268				break
269			}
270			j = k
271		}
272		k := skipSpace(s, j)
273		if k != j {
274			// Goldmark mishandles spaces before >.
275			p.corner = true
276		}
277		j = k
278		if j < len(s) && s[j] == '/' {
279			j++
280		}
281		if j < len(s) && s[j] == '>' {
282			return &HTMLTag{s[i : j+1]}, j + 1, true
283		}
284	}
285	return nil, 0, false
286}
287
288func parseHTMLClosingTag(p *parseState, s string, i int) (Inline, int, bool) {
289	// “A closing tag consists of the string </, a tag name,
290	// optional spaces, tabs, and up to one line ending, and the character >.”
291	if i+2 >= len(s) || s[i] != '<' || s[i+1] != '/' {
292		return nil, 0, false
293	}
294	if skipSpace(s, i+2) != i+2 {
295		// Goldmark allows spaces here but the spec and the Dingus do not.
296		p.corner = true
297	}
298
299	if _, j, ok := parseTagName(s, i+2); ok {
300		j = skipSpace(s, j)
301		if j < len(s) && s[j] == '>' {
302			return &HTMLTag{s[i : j+1]}, j + 1, true
303		}
304	}
305	return nil, 0, false
306}
307
308func parseTagName(s string, i int) (string, int, bool) {
309	// “A tag name consists of an ASCII letter followed by zero or more ASCII letters, digits, or hyphens (-).”
310	if i < len(s) && isLetter(s[i]) {
311		j := i + 1
312		for j < len(s) && isLDH(s[j]) {
313			j++
314		}
315		return s[i:j], j, true
316	}
317	return "", 0, false
318}
319
320func parseAttr(p *parseState, s string, i int) (string, int, bool) {
321	// “An attribute consists of spaces, tabs, and up to one line ending,
322	// an attribute name, and an optional attribute value specification.”
323	i = skipSpace(s, i)
324	if _, j, ok := parseAttrName(s, i); ok {
325		if _, k, ok := parseAttrValueSpec(p, s, j); ok {
326			j = k
327		}
328		return s[i:j], j, true
329	}
330	return "", 0, false
331}
332
333func parseAttrName(s string, i int) (string, int, bool) {
334	// “An attribute name consists of an ASCII letter, _, or :,
335	// followed by zero or more ASCII letters, digits, _, ., :, or -.”
336	if i+1 < len(s) && (isLetter(s[i]) || s[i] == '_' || s[i] == ':') {
337		j := i + 1
338		for j < len(s) && (isLDH(s[j]) || s[j] == '_' || s[j] == '.' || s[j] == ':') {
339			j++
340		}
341		return s[i:j], j, true
342	}
343	return "", 0, false
344}
345
346func parseAttrValueSpec(p *parseState, s string, i int) (string, int, bool) {
347	// “An attribute value specification consists of
348	// optional spaces, tabs, and up to one line ending,
349	// a = character,
350	// optional spaces, tabs, and up to one line ending,
351	// and an attribute value.”
352	i = skipSpace(s, i)
353	if i+1 < len(s) && s[i] == '=' {
354		i = skipSpace(s, i+1)
355		if _, j, ok := parseAttrValue(s, i); ok {
356			p.corner = p.corner || strings.Contains(s[i:j], "\ufffd")
357			return s[i:j], j, true
358		}
359	}
360	return "", 0, false
361}
362
// parseAttrValue parses an attribute value starting at s[i]. The value is
// either quoted with ' or " (any bytes up to the next matching quote, no
// escape processing) or unquoted (a non-empty run of bytes other than
// space, tab, newline, ", ', =, <, > and `).
// It returns the value text including any quotes, the index just past it,
// and whether a value was found. An opening quote with no matching close
// quote is not a value.
func parseAttrValue(s string, i int) (string, int, bool) {
	if i < len(s) {
		if quote := s[i]; quote == '\'' || quote == '"' {
			if n := strings.IndexByte(s[i+1:], quote); n >= 0 {
				end := i + n + 2 // opening quote + n bytes + closing quote
				return s[i:end], end, true
			}
			// Unterminated: fall through. The unquoted scan below stops
			// immediately on the quote byte, so the result is a failure.
		}
	}

	end := i
scan:
	for end < len(s) {
		switch s[end] {
		case ' ', '\t', '\n', '"', '\'', '=', '<', '>', '`':
			break scan
		}
		end++
	}
	if end <= i {
		return "", 0, false
	}
	return s[i:end], end, true
}
391
392func parseHTMLComment(s string, i int) (Inline, int, bool) {
393	// “An HTML comment consists of <!-- + text + -->,
394	// where text does not start with > or ->,
395	// does not end with -, and does not contain --.”
396	if !strings.HasPrefix(s[i:], "<!-->") &&
397		!strings.HasPrefix(s[i:], "<!--->") {
398		if x, end, ok := parseHTMLMarker(s, i, "<!--", "-->"); ok {
399			if t := x.(*HTMLTag).Text; !strings.Contains(t[len("<!--"):len(t)-len("->")], "--") {
400				return x, end, ok
401			}
402		}
403	}
404	return nil, 0, false
405}
406
407func parseHTMLCDATA(s string, i int) (Inline, int, bool) {
408	// “A CDATA section consists of the string <![CDATA[,
409	// a string of characters not including the string ]]>, and the string ]]>.”
410	return parseHTMLMarker(s, i, "<![CDATA[", "]]>")
411}
412
413func parseHTMLDecl(p *parseState, s string, i int) (Inline, int, bool) {
414	// “A declaration consists of the string <!, an ASCII letter,
415	// zero or more characters not including the character >, and the character >.”
416	if i+2 < len(s) && isLetter(s[i+2]) {
417		if 'a' <= s[i+2] && s[i+2] <= 'z' {
418			p.corner = true // goldmark requires uppercase
419		}
420		return parseHTMLMarker(s, i, "<!", ">")
421	}
422	return nil, 0, false
423}
424
425func parseHTMLProcInst(s string, i int) (Inline, int, bool) {
426	// “A processing instruction consists of the string <?,
427	// a string of characters not including the string ?>, and the string ?>.”
428	return parseHTMLMarker(s, i, "<?", "?>")
429}
430
431func parseHTMLMarker(s string, i int, prefix, suffix string) (Inline, int, bool) {
432	if strings.HasPrefix(s[i:], prefix) {
433		if j := strings.Index(s[i+len(prefix):], suffix); j >= 0 {
434			end := i + len(prefix) + j + len(suffix)
435			return &HTMLTag{s[i:end]}, end, true
436		}
437	}
438	return nil, 0, false
439}
440
441func parseHTMLEntity(_ *parseState, s string, i int) (Inline, int, int, bool) {
442	start := i
443	if i+1 < len(s) && s[i+1] == '#' {
444		i += 2
445		var r, end int
446		if i < len(s) && (s[i] == 'x' || s[i] == 'X') {
447			// hex
448			i++
449			j := i
450			for j < len(s) && isHexDigit(s[j]) {
451				j++
452			}
453			if j-i < 1 || j-i > 6 || j >= len(s) || s[j] != ';' {
454				return nil, 0, 0, false
455			}
456			r64, _ := strconv.ParseInt(s[i:j], 16, 0)
457			r = int(r64)
458			end = j + 1
459		} else {
460			// decimal
461			j := i
462			for j < len(s) && isDigit(s[j]) {
463				j++
464			}
465			if j-i < 1 || j-i > 7 || j >= len(s) || s[j] != ';' {
466				return nil, 0, 0, false
467			}
468			r, _ = strconv.Atoi(s[i:j])
469			end = j + 1
470		}
471		if r > unicode.MaxRune || r == 0 {
472			r = unicode.ReplacementChar
473		}
474		return &Plain{string(rune(r))}, start, end, true
475	}
476
477	// Max name in list is 32 bytes. Try for 64 for good measure.
478	for j := i + 1; j < len(s) && j-i < 64; j++ {
479		if s[j] == '&' { // Stop possible quadratic search on &&&&&&&.
480			break
481		}
482		if s[j] == ';' {
483			if r, ok := htmlEntity[s[i:j+1]]; ok {
484				return &Plain{r}, start, j + 1, true
485			}
486			break
487		}
488	}
489
490	return nil, 0, 0, false
491}
492
// An HTMLTag is an inline element holding raw HTML source text.
type HTMLTag struct {
	Text string // the tag exactly as it appeared in the input
}

// Inline is an empty marker method.
func (*HTMLTag) Inline() {}

// PrintHTML writes the tag's text to buf as-is, with no escaping.
func (x *HTMLTag) PrintHTML(buf *bytes.Buffer) {
	buf.WriteString(x.Text)
}

// printMarkdown writes the tag's text to buf, exactly as PrintHTML does.
func (x *HTMLTag) printMarkdown(buf *bytes.Buffer) {
	buf.WriteString(x.Text)
}

// PrintText writes nothing: the tag contributes no plain text.
func (x *HTMLTag) PrintText(buf *bytes.Buffer) {}
508