// html.go - OpenGrok cross reference for /aosp_15_r20/prebuilts/go/linux-x86/src/cmd/vendor/rsc.io/markdown/html.go

// Copyright 2021 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

package markdown

import (
	"bytes"
	"strconv"
	"strings"
	"unicode"
)

// An HTMLBlock is a block of raw HTML text.
// Its PrintHTML method writes each entry of Text followed by a newline.
type HTMLBlock struct {
	Position
	// Text holds the lines of the block, one entry per line,
	// in the order they were collected by the htmlBuilder.
	Text []string
}

// PrintHTML appends every line of the block to buf,
// terminating each one with a newline.
func (b *HTMLBlock) PrintHTML(buf *bytes.Buffer) {
	for i := range b.Text {
		buf.WriteString(b.Text[i])
		buf.WriteByte('\n')
	}
}

// printMarkdown writes one prefix from s to buf (s.prefix1 when it is
// non-empty, otherwise s.prefix) and then the block's lines exactly as
// PrintHTML emits them.
func (b *HTMLBlock) printMarkdown(buf *bytes.Buffer, s mdState) {
	lead := s.prefix
	if s.prefix1 != "" {
		lead = s.prefix1
	}
	buf.WriteString(lead)
	b.PrintHTML(buf)
}

// An htmlBuilder accumulates the lines of an HTML block while it is open
// and produces an HTMLBlock from them in build.
type htmlBuilder struct {
	// endBlank, if set, makes extend stop the block at a blank line
	// (the blank line itself is not added to text).
	endBlank bool
	// text is the list of lines collected so far.
	text     []string
	// endFunc, if non-nil, is called on each added line; when it returns
	// true that line is the last line of the block.
	endFunc  func(string) bool
}

// extend offers the line s to the open HTML block.
//
// When the block ends at blank lines and s is blank, s is handed back
// unconsumed and false is returned. Otherwise the line is recorded and
// consumed (an empty line is returned); the boolean result is false only
// if endFunc is set and reports that this line terminates the block.
func (c *htmlBuilder) extend(p *parseState, s line) (line, bool) {
	if c.endBlank {
		if s.isBlank() {
			return s, false
		}
	}
	text := s.string()
	c.text = append(c.text, text)
	stillOpen := c.endFunc == nil || !c.endFunc(text)
	return line{}, stillOpen
}

// build returns the finished HTMLBlock, positioned at p.pos() and
// holding the lines gathered by the builder.
func (c *htmlBuilder) build(p buildState) Block {
	return &HTMLBlock{
		Position: p.pos(),
		Text:     c.text,
	}
}

// newHTML tries to open an HTML block at line s.
// On success the line is consumed (an empty line and true are returned);
// otherwise s is returned untouched together with false.
func newHTML(p *parseState, s line) (line, bool) {
	probe := s
	if !p.startHTML(&probe) {
		return s, false
	}
	return line{}, true
}

// startHTML reports whether the line *s starts an HTML block.
// If it does, startHTML adds an htmlBuilder block to p, seeds it with the
// full text of the line, and returns true; a block whose end condition is
// already satisfied on this first line is closed immediately.
// If it does not, no block is added and false is returned.
//
// The line is examined through a local copy, so *s itself is not modified.
// The "case N" comments below number the kinds of HTML block start that
// are recognized (presumably the CommonMark HTML block start conditions;
// confirm against the spec).
// p.corner is set for inputs that the inline comments describe as handled
// differently by other implementations.
func (p *parseState) startHTML(s *line) bool {
	// Work on a copy with leading indentation trimmed
	// (presumably up to three spaces; see line.trimSpace).
	tt := *s
	tt.trimSpace(0, 3, false)
	if tt.peek() != '<' {
		return false
	}
	t := tt.string()

	// Blocks that run until a fixed terminator string appears on a line:
	// comments, processing instructions, CDATA sections, and declarations.
	var end string
	switch {
	case strings.HasPrefix(t, "<!--"):
		end = "-->"
	case strings.HasPrefix(t, "<?"):
		end = "?>"
	case strings.HasPrefix(t, "<![CDATA["):
		end = "]]>"
	case strings.HasPrefix(t, "<!") && len(t) >= 3 && isLetter(t[2]):
		if 'a' <= t[2] && t[2] <= 'z' {
			// Goldmark and the Dingus only accept <!UPPER> not <!lower>.
			p.corner = true
		}
		end = ">"
	}
	if end != "" {
		b := &htmlBuilder{endFunc: func(s string) bool { return strings.Contains(s, end) }}
		p.addBlock(b)
		b.text = append(b.text, s.string())
		// The terminator may already be present on the opening line.
		if b.endFunc(t) {
			p.closeBlock()
		}
		return true
	}

	// case 6
	// Extract the lowercased tag name after "<" or "</" into buf
	// (at most 16 bytes of ASCII letters and digits); i ends up at the
	// first byte after the name.
	i := 1
	if i < len(t) && t[i] == '/' {
		i++
	}
	buf := make([]byte, 0, 16)
	for ; i < len(t) && len(buf) < 16; i++ {
		c := t[i]
		if 'A' <= c && c <= 'Z' {
			c += 'a' - 'A'
		}
		if !('a' <= c && c <= 'z') && !('0' <= c && c <= '9') {
			break
		}
		buf = append(buf, c)
	}
	// The name must be followed by end of line, a space, a tab, ">",
	// or "/>"; anything else rules out case 6.
	var sep byte
	if i < len(t) {
		switch t[i] {
		default:
			goto Next
		case ' ', '\t', '>':
			// ok
			sep = t[i]
		case '/':
			if i+1 >= len(t) || t[i+1] != '>' {
				goto Next
			}
		}
	}

	if len(buf) == 0 {
		goto Next
	}
	{
		// The name must be one of the known tags listed in htmlTags.
		c := buf[0]
		var ok bool
		for _, name := range htmlTags {
			if name[0] == c && len(name) == len(buf) && name == string(buf) {
				if sep == '\t' {
					// Goldmark recognizes space here but not tab.
					// testdata/extra.txt 143.md
					p.corner = true
				}
				ok = true
				break
			}
		}
		if !ok {
			goto Next
		}
	}

	{
		// A case 6 block continues until a blank line.
		b := &htmlBuilder{endBlank: true}
		p.addBlock(b)
		b.text = append(b.text, s.string())
		return true
	}

Next:
	// case 1
	// An opening (not closing) pre, script, style, or textarea tag whose
	// name is followed by end of line, space, tab, or ">". buf and i are
	// reused from the name scan above. The block runs until a line
	// containing a matching kind of end tag, as reported by hasEndPre.
	if len(t) > 1 && t[1] != '/' && (i >= len(t) || t[i] == ' ' || t[i] == '\t' || t[i] == '>') {
		switch string(buf) {
		case "pre", "script", "style", "textarea":
			b := &htmlBuilder{endFunc: hasEndPre}
			p.addBlock(b)
			b.text = append(b.text, s.string())
			if hasEndPre(t) {
				p.closeBlock()
			}
			return true
		}
	}

	// case 7
	// A complete open tag or closing tag followed only by whitespace.
	// This form is considered only when p.para() is nil, that is
	// (presumably) when there is no open paragraph to interrupt.
	if p.para() == nil {
		if _, e, ok := parseHTMLOpenTag(p, t, 0); ok && skipSpace(t, e) == len(t) {
			if e != len(t) {
				// Goldmark disallows trailing space
				p.corner = true
			}
			b := &htmlBuilder{endBlank: true}
			p.addBlock(b)
			b.text = append(b.text, s.string())
			return true
		}
		if _, e, ok := parseHTMLClosingTag(p, t, 0); ok && skipSpace(t, e) == len(t) {
			b := &htmlBuilder{endBlank: true}
			p.addBlock(b)
			b.text = append(b.text, s.string())
			return true
		}
	}

	return false
}

// hasEndPre reports whether s contains a closing tag for one of the
// elements pre, script, style, or textarea. The element name is matched
// without regard to ASCII case, and the tag must be written exactly as
// "</name>", with nothing between the name and the ">".
func hasEndPre(s string) bool {
	// Longest name of interest is "textarea" (8 bytes); a longer run of
	// letters can never match, so the scan stops there.
	const maxName = 8

	for i := 0; i+1 < len(s); i++ {
		if s[i] != '<' || s[i+1] != '/' {
			continue
		}
		// Scan the letters after "</" with a separate index. Advancing
		// the outer index here would let a failed candidate swallow the
		// '<' that begins a later, valid closing tag (as in "</pre</pre>"
		// or "</</style>").
		start := i + 2
		j := start
		for j < len(s) && j-start < maxName {
			c := s[j]
			if !('a' <= c && c <= 'z' || 'A' <= c && c <= 'Z') {
				break
			}
			j++
		}
		if j >= len(s) || s[j] != '>' {
			continue
		}
		// name contains only ASCII letters, so EqualFold amounts to an
		// ASCII case-insensitive comparison and needs no allocation.
		name := s[start:j]
		switch {
		case strings.EqualFold(name, "pre"),
			strings.EqualFold(name, "script"),
			strings.EqualFold(name, "style"),
			strings.EqualFold(name, "textarea"):
			return true
		}
	}
	return false
}

// parseHTMLTag parses an inline HTML tag starting at s[i].
// It requires at least three bytes beginning with '<' and then dispatches
// on the following bytes to the matching specialized parser.
// It returns the parsed inline, the index just past it, and true;
// or nil, 0, false when nothing matches.
func parseHTMLTag(p *parseState, s string, i int) (Inline, int, bool) {
	// “An HTML tag consists of an open tag, a closing tag, an HTML comment,
	// a processing instruction, a declaration, or a CDATA section.”
	if i+3 > len(s) || s[i] != '<' {
		return nil, 0, false
	}
	switch s[i+1] {
	case '/':
		return parseHTMLClosingTag(p, s, i)
	case '?':
		return parseHTMLProcInst(s, i)
	case '!':
		// "<!-" is a comment, "<![" is CDATA, anything else a declaration.
		if s[i+2] == '-' {
			return parseHTMLComment(s, i)
		}
		if s[i+2] == '[' {
			return parseHTMLCDATA(s, i)
		}
		return parseHTMLDecl(p, s, i)
	}
	return parseHTMLOpenTag(p, s, i)
}

// parseHTMLOpenTag parses an open tag starting at s[i].
// On success it returns an *HTMLTag holding the tag's text, the index
// just past the closing '>', and true; otherwise nil, 0, false.
// It may set p.corner for inputs that Goldmark treats differently.
func parseHTMLOpenTag(p *parseState, s string, i int) (Inline, int, bool) {
	if i >= len(s) || s[i] != '<' {
		return nil, 0, false
	}
	// “An open tag consists of a < character, a tag name, zero or more attributes,
	// optional spaces, tabs, and up to one line ending, an optional / character, and a > character.”
	name, pos, ok := parseTagName(s, i+1)
	if !ok {
		return nil, 0, false
	}
	if name == "pre" || name == "script" || name == "style" || name == "textarea" {
		// Goldmark treats these as starting a new HTMLBlock
		// and ending the paragraph they appear in.
		p.corner = true
	}

	// Consume attributes for as long as they parse. Before each attempt
	// the tag must continue with whitespace, '/', or '>'.
	for {
		if pos >= len(s) {
			return nil, 0, false
		}
		switch s[pos] {
		case ' ', '\t', '\n', '/', '>':
			// acceptable continuation
		default:
			return nil, 0, false
		}
		_, next, isAttr := parseAttr(p, s, pos)
		if !isAttr {
			break
		}
		pos = next
	}

	if afterSpace := skipSpace(s, pos); afterSpace != pos {
		// Goldmark mishandles spaces before >.
		p.corner = true
		pos = afterSpace
	}
	if pos < len(s) && s[pos] == '/' {
		pos++
	}
	if pos >= len(s) || s[pos] != '>' {
		return nil, 0, false
	}
	return &HTMLTag{s[i : pos+1]}, pos + 1, true
}

// parseHTMLClosingTag parses a closing tag starting at s[i].
// On success it returns an *HTMLTag holding the tag's text, the index
// just past the '>', and true; otherwise nil, 0, false.
func parseHTMLClosingTag(p *parseState, s string, i int) (Inline, int, bool) {
	// “A closing tag consists of the string </, a tag name,
	// optional spaces, tabs, and up to one line ending, and the character >.”
	if i+2 >= len(s) || s[i] != '<' || s[i+1] != '/' {
		return nil, 0, false
	}
	nameStart := i + 2
	if skipSpace(s, nameStart) != nameStart {
		// Goldmark allows spaces here but the spec and the Dingus do not.
		p.corner = true
	}

	_, pos, ok := parseTagName(s, nameStart)
	if !ok {
		return nil, 0, false
	}
	pos = skipSpace(s, pos)
	if pos >= len(s) || s[pos] != '>' {
		return nil, 0, false
	}
	return &HTMLTag{s[i : pos+1]}, pos + 1, true
}

// parseTagName parses a tag name starting at s[i].
// It returns the name, the index just past it, and true;
// or "", 0, false if s[i] does not begin a tag name.
func parseTagName(s string, i int) (string, int, bool) {
	// “A tag name consists of an ASCII letter followed by zero or more ASCII letters, digits, or hyphens (-).”
	if i >= len(s) || !isLetter(s[i]) {
		return "", 0, false
	}
	end := i + 1
	for ; end < len(s) && isLDH(s[end]); end++ {
	}
	return s[i:end], end, true
}

// parseAttr parses one attribute starting at s[i], after skipping
// leading whitespace. It returns the attribute text (name plus optional
// value specification, without the leading whitespace), the index just
// past it, and true; or "", 0, false if no attribute name is found.
func parseAttr(p *parseState, s string, i int) (string, int, bool) {
	// “An attribute consists of spaces, tabs, and up to one line ending,
	// an attribute name, and an optional attribute value specification.”
	start := skipSpace(s, i)
	_, end, ok := parseAttrName(s, start)
	if !ok {
		return "", 0, false
	}
	// The value specification is optional; extend the attribute only
	// when one is present.
	if _, valueEnd, hasValue := parseAttrValueSpec(p, s, end); hasValue {
		end = valueEnd
	}
	return s[start:end], end, true
}

// parseAttrName parses an attribute name starting at s[i].
// It returns the name, the index just past it, and true; or "", 0, false.
// A name is recognized only when at least one more byte follows s[i].
func parseAttrName(s string, i int) (string, int, bool) {
	// “An attribute name consists of an ASCII letter, _, or :,
	// followed by zero or more ASCII letters, digits, _, ., :, or -.”
	if i+1 >= len(s) {
		return "", 0, false
	}
	if first := s[i]; !isLetter(first) && first != '_' && first != ':' {
		return "", 0, false
	}
	end := i + 1
	for end < len(s) {
		c := s[end]
		if !isLDH(c) && c != '_' && c != '.' && c != ':' {
			break
		}
		end++
	}
	return s[i:end], end, true
}

// parseAttrValueSpec parses an attribute value specification ("=" and a
// value, each optionally preceded by whitespace) starting at s[i].
// It returns the value text, the index just past it, and true;
// or "", 0, false. p.corner is set when the value contains U+FFFD.
func parseAttrValueSpec(p *parseState, s string, i int) (string, int, bool) {
	// “An attribute value specification consists of
	// optional spaces, tabs, and up to one line ending,
	// a = character,
	// optional spaces, tabs, and up to one line ending,
	// and an attribute value.”
	eq := skipSpace(s, i)
	if eq+1 >= len(s) || s[eq] != '=' {
		return "", 0, false
	}
	valueStart := skipSpace(s, eq+1)
	_, valueEnd, ok := parseAttrValue(s, valueStart)
	if !ok {
		return "", 0, false
	}
	value := s[valueStart:valueEnd]
	if !p.corner && strings.Contains(value, "\ufffd") {
		p.corner = true
	}
	return value, valueEnd, true
}

// parseAttrValue parses an attribute value starting at s[i].
// It returns the value (including any surrounding quotes), the index
// just past it, and true; or "", 0, false if there is no value.
func parseAttrValue(s string, i int) (string, int, bool) {
	// “An attribute value consists of
	// an unquoted attribute value,
	// a single-quoted attribute value,
	// or a double-quoted attribute value.”
	// TODO: No escaping???
	if i >= len(s) {
		return "", 0, false
	}

	// “A single-quoted attribute value consists of ',
	// zero or more characters not including ', and a final '.”
	// “A double-quoted attribute value consists of ",
	// zero or more characters not including ", and a final ".”
	if quote := s[i]; quote == '\'' || quote == '"' {
		if n := strings.IndexByte(s[i+1:], quote); n >= 0 {
			end := i + n + 2 // opening quote + n bytes + closing quote
			return s[i:end], end, true
		}
		// Unterminated quote: fall through; the unquoted form cannot
		// start with a quote, so the result below is a failure.
	}

	// “An unquoted attribute value is a nonempty string of characters
	// not including spaces, tabs, line endings, ", ', =, <, >, or `.”
	n := strings.IndexAny(s[i:], " \t\n\"'=<>`")
	if n < 0 {
		// No delimiter: the value runs to the end of s.
		n = len(s) - i
	}
	if n == 0 {
		return "", 0, false
	}
	end := i + n
	return s[i:end], end, true
}

// parseHTMLComment parses an HTML comment starting at s[i].
// It returns the comment as an *HTMLTag, the index just past "-->",
// and true; or nil, 0, false if the text is not a valid comment.
func parseHTMLComment(s string, i int) (Inline, int, bool) {
	// “An HTML comment consists of <!-- + text + -->,
	// where text does not start with > or ->,
	// does not end with -, and does not contain --.”
	rest := s[i:]
	if strings.HasPrefix(rest, "<!-->") || strings.HasPrefix(rest, "<!--->") {
		return nil, 0, false
	}
	x, end, ok := parseHTMLMarker(s, i, "<!--", "-->")
	if !ok {
		return nil, 0, false
	}
	// Look at the comment text plus the first '-' of the terminator:
	// a "--" there means the text either contains "--" or ends with '-'.
	text := x.(*HTMLTag).Text
	inner := text[len("<!--") : len(text)-len("->")]
	if strings.Contains(inner, "--") {
		return nil, 0, false
	}
	return x, end, true
}

// parseHTMLCDATA parses a CDATA section starting at s[i].
// It delegates to parseHTMLMarker with the "<![CDATA[" prefix and "]]>"
// suffix, so the section ends at the first "]]>" after the prefix.
// It returns the section as an inline, the index just past it, and true;
// or nil, 0, false if s[i:] is not a complete CDATA section.
func parseHTMLCDATA(s string, i int) (Inline, int, bool) {
	// “A CDATA section consists of the string <![CDATA[,
	// a string of characters not including the string ]]>, and the string ]]>.”
	return parseHTMLMarker(s, i, "<![CDATA[", "]]>")
}

// parseHTMLDecl parses a declaration ("<!" + letter ... ">") starting
// at s[i]. It returns the declaration as an inline, the index just past
// the '>', and true; or nil, 0, false. p.corner is set when the letter
// after "<!" is lowercase.
func parseHTMLDecl(p *parseState, s string, i int) (Inline, int, bool) {
	// “A declaration consists of the string <!, an ASCII letter,
	// zero or more characters not including the character >, and the character >.”
	if i+2 >= len(s) || !isLetter(s[i+2]) {
		return nil, 0, false
	}
	if first := s[i+2]; 'a' <= first && first <= 'z' {
		p.corner = true // goldmark requires uppercase
	}
	return parseHTMLMarker(s, i, "<!", ">")
}

// parseHTMLProcInst parses a processing instruction starting at s[i].
// It delegates to parseHTMLMarker with the "<?" prefix and "?>" suffix,
// so the instruction ends at the first "?>" after the prefix.
// It returns the instruction as an inline, the index just past it, and
// true; or nil, 0, false if s[i:] is not a complete processing instruction.
func parseHTMLProcInst(s string, i int) (Inline, int, bool) {
	// “A processing instruction consists of the string <?,
	// a string of characters not including the string ?>, and the string ?>.”
	return parseHTMLMarker(s, i, "<?", "?>")
}

// parseHTMLMarker parses a construct at s[i] that begins with prefix and
// ends at the first occurrence of suffix following the prefix.
// It returns the whole construct as an *HTMLTag, the index just past the
// suffix, and true; or nil, 0, false if either delimiter is missing.
func parseHTMLMarker(s string, i int, prefix, suffix string) (Inline, int, bool) {
	if !strings.HasPrefix(s[i:], prefix) {
		return nil, 0, false
	}
	bodyStart := i + len(prefix)
	n := strings.Index(s[bodyStart:], suffix)
	if n < 0 {
		return nil, 0, false
	}
	end := bodyStart + n + len(suffix)
	return &HTMLTag{s[i:end]}, end, true
}

// parseHTMLEntity parses a character reference starting at s[i].
// It does not itself check s[i]; NOTE(review): callers presumably
// guarantee it is '&' — confirm.
//
// Numeric references ("&#" + 1–7 decimal digits, or "&#x"/"&#X" + 1–6
// hex digits, then ';') yield the corresponding character, with 0 and
// values above unicode.MaxRune replaced by U+FFFD. Named references are
// looked up in htmlEntity using the text through the ';'.
//
// It returns a *Plain with the decoded text, the start index i, the
// index just past the ';', and true; or nil, 0, 0, false.
func parseHTMLEntity(_ *parseState, s string, i int) (Inline, int, int, bool) {
	start := i

	if i+1 >= len(s) || s[i+1] != '#' {
		// Named reference.
		// Max name in list is 32 bytes. Try for 64 for good measure.
		for j := i + 1; j < len(s) && j-i < 64; j++ {
			switch s[j] {
			case '&': // Stop possible quadratic search on &&&&&&&.
				return nil, 0, 0, false
			case ';':
				if r, ok := htmlEntity[s[i:j+1]]; ok {
					return &Plain{r}, start, j + 1, true
				}
				return nil, 0, 0, false
			}
		}
		return nil, 0, 0, false
	}

	// Numeric reference: decimal by default, hex after an x or X.
	digits := i + 2
	hex := digits < len(s) && (s[digits] == 'x' || s[digits] == 'X')
	base, maxDigits := 10, 7
	if hex {
		digits++
		base, maxDigits = 16, 6
	}

	j := digits
	for j < len(s) {
		if hex {
			if !isHexDigit(s[j]) {
				break
			}
		} else if !isDigit(s[j]) {
			break
		}
		j++
	}
	if n := j - digits; n < 1 || n > maxDigits || j >= len(s) || s[j] != ';' {
		return nil, 0, 0, false
	}

	// The digits were validated above and are few enough to fit in an
	// int, so the parse error can safely be ignored.
	r64, _ := strconv.ParseInt(s[digits:j], base, 0)
	r := int(r64)
	if r == 0 || r > unicode.MaxRune {
		r = unicode.ReplacementChar
	}
	return &Plain{string(rune(r))}, start, j + 1, true
}

// An HTMLTag is an inline piece of raw HTML, as produced by the
// parseHTML* functions in this file: an open or closing tag, comment,
// processing instruction, declaration, or CDATA section.
type HTMLTag struct {
	// Text is the exact source text of the construct, delimiters included.
	Text string
}

// Inline does nothing; it is a marker method
// (presumably what lets *HTMLTag be used as an Inline value).
func (*HTMLTag) Inline() {}

// PrintHTML writes the tag's text to buf unchanged, with no escaping.
func (x *HTMLTag) PrintHTML(buf *bytes.Buffer) {
	buf.WriteString(x.Text)
}

// printMarkdown writes the tag to buf exactly as PrintHTML does:
// the Markdown form of raw inline HTML is the HTML text itself.
func (x *HTMLTag) printMarkdown(buf *bytes.Buffer) {
	x.PrintHTML(buf)
}

// PrintText writes nothing to buf: an HTML tag has no plain-text output.
func (x *HTMLTag) PrintText(buf *bytes.Buffer) {}