1// Copyright 2011 The Go Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style
3// license that can be found in the LICENSE file.
4
5package template
6
7import (
8	"bytes"
9	"strings"
10)
11
12// transitionFunc is the array of context transition functions for text nodes.
13// A transition function takes a context and template text input, and returns
14// the updated context and the number of bytes consumed from the front of the
15// input.
16var transitionFunc = [...]func(context, []byte) (context, int){
17	stateText:           tText,
18	stateTag:            tTag,
19	stateAttrName:       tAttrName,
20	stateAfterName:      tAfterName,
21	stateBeforeValue:    tBeforeValue,
22	stateHTMLCmt:        tHTMLCmt,
23	stateRCDATA:         tSpecialTagEnd,
24	stateAttr:           tAttr,
25	stateURL:            tURL,
26	stateSrcset:         tURL,
27	stateJS:             tJS,
28	stateJSDqStr:        tJSDelimited,
29	stateJSSqStr:        tJSDelimited,
30	stateJSRegexp:       tJSDelimited,
31	stateJSTmplLit:      tJSTmpl,
32	stateJSBlockCmt:     tBlockCmt,
33	stateJSLineCmt:      tLineCmt,
34	stateJSHTMLOpenCmt:  tLineCmt,
35	stateJSHTMLCloseCmt: tLineCmt,
36	stateCSS:            tCSS,
37	stateCSSDqStr:       tCSSStr,
38	stateCSSSqStr:       tCSSStr,
39	stateCSSDqURL:       tCSSStr,
40	stateCSSSqURL:       tCSSStr,
41	stateCSSURL:         tCSSStr,
42	stateCSSBlockCmt:    tBlockCmt,
43	stateCSSLineCmt:     tLineCmt,
44	stateError:          tError,
45}
46
// commentStart and commentEnd are the byte sequences that open and close an
// HTML comment.
var (
	commentStart = []byte("<!--")
	commentEnd   = []byte("-->")
)
49
50// tText is the context transition function for the text state.
51func tText(c context, s []byte) (context, int) {
52	k := 0
53	for {
54		i := k + bytes.IndexByte(s[k:], '<')
55		if i < k || i+1 == len(s) {
56			return c, len(s)
57		} else if i+4 <= len(s) && bytes.Equal(commentStart, s[i:i+4]) {
58			return context{state: stateHTMLCmt}, i + 4
59		}
60		i++
61		end := false
62		if s[i] == '/' {
63			if i+1 == len(s) {
64				return c, len(s)
65			}
66			end, i = true, i+1
67		}
68		j, e := eatTagName(s, i)
69		if j != i {
70			if end {
71				e = elementNone
72			}
73			// We've found an HTML tag.
74			return context{state: stateTag, element: e}, j
75		}
76		k = j
77	}
78}
79
80var elementContentType = [...]state{
81	elementNone:     stateText,
82	elementScript:   stateJS,
83	elementStyle:    stateCSS,
84	elementTextarea: stateRCDATA,
85	elementTitle:    stateRCDATA,
86}
87
88// tTag is the context transition function for the tag state.
89func tTag(c context, s []byte) (context, int) {
90	// Find the attribute name.
91	i := eatWhiteSpace(s, 0)
92	if i == len(s) {
93		return c, len(s)
94	}
95	if s[i] == '>' {
96		return context{
97			state:   elementContentType[c.element],
98			element: c.element,
99		}, i + 1
100	}
101	j, err := eatAttrName(s, i)
102	if err != nil {
103		return context{state: stateError, err: err}, len(s)
104	}
105	state, attr := stateTag, attrNone
106	if i == j {
107		return context{
108			state: stateError,
109			err:   errorf(ErrBadHTML, nil, 0, "expected space, attr name, or end of tag, but got %q", s[i:]),
110		}, len(s)
111	}
112
113	attrName := strings.ToLower(string(s[i:j]))
114	if c.element == elementScript && attrName == "type" {
115		attr = attrScriptType
116	} else {
117		switch attrType(attrName) {
118		case contentTypeURL:
119			attr = attrURL
120		case contentTypeCSS:
121			attr = attrStyle
122		case contentTypeJS:
123			attr = attrScript
124		case contentTypeSrcset:
125			attr = attrSrcset
126		}
127	}
128
129	if j == len(s) {
130		state = stateAttrName
131	} else {
132		state = stateAfterName
133	}
134	return context{state: state, element: c.element, attr: attr}, j
135}
136
137// tAttrName is the context transition function for stateAttrName.
138func tAttrName(c context, s []byte) (context, int) {
139	i, err := eatAttrName(s, 0)
140	if err != nil {
141		return context{state: stateError, err: err}, len(s)
142	} else if i != len(s) {
143		c.state = stateAfterName
144	}
145	return c, i
146}
147
148// tAfterName is the context transition function for stateAfterName.
149func tAfterName(c context, s []byte) (context, int) {
150	// Look for the start of the value.
151	i := eatWhiteSpace(s, 0)
152	if i == len(s) {
153		return c, len(s)
154	} else if s[i] != '=' {
155		// Occurs due to tag ending '>', and valueless attribute.
156		c.state = stateTag
157		return c, i
158	}
159	c.state = stateBeforeValue
160	// Consume the "=".
161	return c, i + 1
162}
163
164var attrStartStates = [...]state{
165	attrNone:       stateAttr,
166	attrScript:     stateJS,
167	attrScriptType: stateAttr,
168	attrStyle:      stateCSS,
169	attrURL:        stateURL,
170	attrSrcset:     stateSrcset,
171}
172
173// tBeforeValue is the context transition function for stateBeforeValue.
174func tBeforeValue(c context, s []byte) (context, int) {
175	i := eatWhiteSpace(s, 0)
176	if i == len(s) {
177		return c, len(s)
178	}
179	// Find the attribute delimiter.
180	delim := delimSpaceOrTagEnd
181	switch s[i] {
182	case '\'':
183		delim, i = delimSingleQuote, i+1
184	case '"':
185		delim, i = delimDoubleQuote, i+1
186	}
187	c.state, c.delim = attrStartStates[c.attr], delim
188	return c, i
189}
190
191// tHTMLCmt is the context transition function for stateHTMLCmt.
192func tHTMLCmt(c context, s []byte) (context, int) {
193	if i := bytes.Index(s, commentEnd); i != -1 {
194		return context{}, i + 3
195	}
196	return c, len(s)
197}
198
199// specialTagEndMarkers maps element types to the character sequence that
200// case-insensitively signals the end of the special tag body.
201var specialTagEndMarkers = [...][]byte{
202	elementScript:   []byte("script"),
203	elementStyle:    []byte("style"),
204	elementTextarea: []byte("textarea"),
205	elementTitle:    []byte("title"),
206}
207
// specialTagEndPrefix is the "</" that indexTagEnd looks for before a tag
// name.
var specialTagEndPrefix = []byte("</")

// tagEndSeparators lists the bytes that must directly follow the tag name for
// indexTagEnd to accept it as the end of a special tag.
var tagEndSeparators = []byte("> \t\n\f/")
212
213// tSpecialTagEnd is the context transition function for raw text and RCDATA
214// element states.
215func tSpecialTagEnd(c context, s []byte) (context, int) {
216	if c.element != elementNone {
217		// script end tags ("</script") within script literals are ignored, so that
218		// we can properly escape them.
219		if c.element == elementScript && (isInScriptLiteral(c.state) || isComment(c.state)) {
220			return c, len(s)
221		}
222		if i := indexTagEnd(s, specialTagEndMarkers[c.element]); i != -1 {
223			return context{}, i
224		}
225	}
226	return c, len(s)
227}
228
229// indexTagEnd finds the index of a special tag end in a case insensitive way, or returns -1
230func indexTagEnd(s []byte, tag []byte) int {
231	res := 0
232	plen := len(specialTagEndPrefix)
233	for len(s) > 0 {
234		// Try to find the tag end prefix first
235		i := bytes.Index(s, specialTagEndPrefix)
236		if i == -1 {
237			return i
238		}
239		s = s[i+plen:]
240		// Try to match the actual tag if there is still space for it
241		if len(tag) <= len(s) && bytes.EqualFold(tag, s[:len(tag)]) {
242			s = s[len(tag):]
243			// Check the tag is followed by a proper separator
244			if len(s) > 0 && bytes.IndexByte(tagEndSeparators, s[0]) != -1 {
245				return res + i
246			}
247			res += len(tag)
248		}
249		res += i + plen
250	}
251	return -1
252}
253
254// tAttr is the context transition function for the attribute state.
255func tAttr(c context, s []byte) (context, int) {
256	return c, len(s)
257}
258
259// tURL is the context transition function for the URL state.
260func tURL(c context, s []byte) (context, int) {
261	if bytes.ContainsAny(s, "#?") {
262		c.urlPart = urlPartQueryOrFrag
263	} else if len(s) != eatWhiteSpace(s, 0) && c.urlPart == urlPartNone {
264		// HTML5 uses "Valid URL potentially surrounded by spaces" for
265		// attrs: https://www.w3.org/TR/html5/index.html#attributes-1
266		c.urlPart = urlPartPreQuery
267	}
268	return c, len(s)
269}
270
271// tJS is the context transition function for the JS state.
272func tJS(c context, s []byte) (context, int) {
273	i := bytes.IndexAny(s, "\"`'/{}<-#")
274	if i == -1 {
275		// Entire input is non string, comment, regexp tokens.
276		c.jsCtx = nextJSCtx(s, c.jsCtx)
277		return c, len(s)
278	}
279	c.jsCtx = nextJSCtx(s[:i], c.jsCtx)
280	switch s[i] {
281	case '"':
282		c.state, c.jsCtx = stateJSDqStr, jsCtxRegexp
283	case '\'':
284		c.state, c.jsCtx = stateJSSqStr, jsCtxRegexp
285	case '`':
286		c.state, c.jsCtx = stateJSTmplLit, jsCtxRegexp
287	case '/':
288		switch {
289		case i+1 < len(s) && s[i+1] == '/':
290			c.state, i = stateJSLineCmt, i+1
291		case i+1 < len(s) && s[i+1] == '*':
292			c.state, i = stateJSBlockCmt, i+1
293		case c.jsCtx == jsCtxRegexp:
294			c.state = stateJSRegexp
295		case c.jsCtx == jsCtxDivOp:
296			c.jsCtx = jsCtxRegexp
297		default:
298			return context{
299				state: stateError,
300				err:   errorf(ErrSlashAmbig, nil, 0, "'/' could start a division or regexp: %.32q", s[i:]),
301			}, len(s)
302		}
303	// ECMAScript supports HTML style comments for legacy reasons, see Appendix
304	// B.1.1 "HTML-like Comments". The handling of these comments is somewhat
305	// confusing. Multi-line comments are not supported, i.e. anything on lines
306	// between the opening and closing tokens is not considered a comment, but
307	// anything following the opening or closing token, on the same line, is
308	// ignored. As such we simply treat any line prefixed with "<!--" or "-->"
309	// as if it were actually prefixed with "//" and move on.
310	case '<':
311		if i+3 < len(s) && bytes.Equal(commentStart, s[i:i+4]) {
312			c.state, i = stateJSHTMLOpenCmt, i+3
313		}
314	case '-':
315		if i+2 < len(s) && bytes.Equal(commentEnd, s[i:i+3]) {
316			c.state, i = stateJSHTMLCloseCmt, i+2
317		}
318	// ECMAScript also supports "hashbang" comment lines, see Section 12.5.
319	case '#':
320		if i+1 < len(s) && s[i+1] == '!' {
321			c.state, i = stateJSLineCmt, i+1
322		}
323	case '{':
324		// We only care about tracking brace depth if we are inside of a
325		// template literal.
326		if len(c.jsBraceDepth) == 0 {
327			return c, i + 1
328		}
329		c.jsBraceDepth[len(c.jsBraceDepth)-1]++
330	case '}':
331		if len(c.jsBraceDepth) == 0 {
332			return c, i + 1
333		}
334		// There are no cases where a brace can be escaped in the JS context
335		// that are not syntax errors, it seems. Because of this we can just
336		// count "\}" as "}" and move on, the script is already broken as
337		// fully fledged parsers will just fail anyway.
338		c.jsBraceDepth[len(c.jsBraceDepth)-1]--
339		if c.jsBraceDepth[len(c.jsBraceDepth)-1] >= 0 {
340			return c, i + 1
341		}
342		c.jsBraceDepth = c.jsBraceDepth[:len(c.jsBraceDepth)-1]
343		c.state = stateJSTmplLit
344	default:
345		panic("unreachable")
346	}
347	return c, i + 1
348}
349
350func tJSTmpl(c context, s []byte) (context, int) {
351	var k int
352	for {
353		i := k + bytes.IndexAny(s[k:], "`\\$")
354		if i < k {
355			break
356		}
357		switch s[i] {
358		case '\\':
359			i++
360			if i == len(s) {
361				return context{
362					state: stateError,
363					err:   errorf(ErrPartialEscape, nil, 0, "unfinished escape sequence in JS string: %q", s),
364				}, len(s)
365			}
366		case '$':
367			if len(s) >= i+2 && s[i+1] == '{' {
368				c.jsBraceDepth = append(c.jsBraceDepth, 0)
369				c.state = stateJS
370				return c, i + 2
371			}
372		case '`':
373			// end
374			c.state = stateJS
375			return c, i + 1
376		}
377		k = i + 1
378	}
379
380	return c, len(s)
381}
382
383// tJSDelimited is the context transition function for the JS string and regexp
384// states.
385func tJSDelimited(c context, s []byte) (context, int) {
386	specials := `\"`
387	switch c.state {
388	case stateJSSqStr:
389		specials = `\'`
390	case stateJSRegexp:
391		specials = `\/[]`
392	}
393
394	k, inCharset := 0, false
395	for {
396		i := k + bytes.IndexAny(s[k:], specials)
397		if i < k {
398			break
399		}
400		switch s[i] {
401		case '\\':
402			i++
403			if i == len(s) {
404				return context{
405					state: stateError,
406					err:   errorf(ErrPartialEscape, nil, 0, "unfinished escape sequence in JS string: %q", s),
407				}, len(s)
408			}
409		case '[':
410			inCharset = true
411		case ']':
412			inCharset = false
413		case '/':
414			// If "</script" appears in a regex literal, the '/' should not
415			// close the regex literal, and it will later be escaped to
416			// "\x3C/script" in escapeText.
417			if i > 0 && i+7 <= len(s) && bytes.Equal(bytes.ToLower(s[i-1:i+7]), []byte("</script")) {
418				i++
419			} else if !inCharset {
420				c.state, c.jsCtx = stateJS, jsCtxDivOp
421				return c, i + 1
422			}
423		default:
424			// end delimiter
425			if !inCharset {
426				c.state, c.jsCtx = stateJS, jsCtxDivOp
427				return c, i + 1
428			}
429		}
430		k = i + 1
431	}
432
433	if inCharset {
434		// This can be fixed by making context richer if interpolation
435		// into charsets is desired.
436		return context{
437			state: stateError,
438			err:   errorf(ErrPartialCharset, nil, 0, "unfinished JS regexp charset: %q", s),
439		}, len(s)
440	}
441
442	return c, len(s)
443}
444
// blockCommentEnd is the byte sequence that tBlockCmt treats as the end of a
// /*comment*/.
var blockCommentEnd = []byte("*/")
446
447// tBlockCmt is the context transition function for /*comment*/ states.
448func tBlockCmt(c context, s []byte) (context, int) {
449	i := bytes.Index(s, blockCommentEnd)
450	if i == -1 {
451		return c, len(s)
452	}
453	switch c.state {
454	case stateJSBlockCmt:
455		c.state = stateJS
456	case stateCSSBlockCmt:
457		c.state = stateCSS
458	default:
459		panic(c.state.String())
460	}
461	return c, i + 2
462}
463
464// tLineCmt is the context transition function for //comment states, and the JS HTML-like comment state.
465func tLineCmt(c context, s []byte) (context, int) {
466	var lineTerminators string
467	var endState state
468	switch c.state {
469	case stateJSLineCmt, stateJSHTMLOpenCmt, stateJSHTMLCloseCmt:
470		lineTerminators, endState = "\n\r\u2028\u2029", stateJS
471	case stateCSSLineCmt:
472		lineTerminators, endState = "\n\f\r", stateCSS
473		// Line comments are not part of any published CSS standard but
474		// are supported by the 4 major browsers.
475		// This defines line comments as
476		//     LINECOMMENT ::= "//" [^\n\f\d]*
477		// since https://www.w3.org/TR/css3-syntax/#SUBTOK-nl defines
478		// newlines:
479		//     nl ::= #xA | #xD #xA | #xD | #xC
480	default:
481		panic(c.state.String())
482	}
483
484	i := bytes.IndexAny(s, lineTerminators)
485	if i == -1 {
486		return c, len(s)
487	}
488	c.state = endState
489	// Per section 7.4 of EcmaScript 5 : https://es5.github.io/#x7.4
490	// "However, the LineTerminator at the end of the line is not
491	// considered to be part of the single-line comment; it is
492	// recognized separately by the lexical grammar and becomes part
493	// of the stream of input elements for the syntactic grammar."
494	return c, i
495}
496
497// tCSS is the context transition function for the CSS state.
498func tCSS(c context, s []byte) (context, int) {
499	// CSS quoted strings are almost never used except for:
500	// (1) URLs as in background: "/foo.png"
501	// (2) Multiword font-names as in font-family: "Times New Roman"
502	// (3) List separators in content values as in inline-lists:
503	//    <style>
504	//    ul.inlineList { list-style: none; padding:0 }
505	//    ul.inlineList > li { display: inline }
506	//    ul.inlineList > li:before { content: ", " }
507	//    ul.inlineList > li:first-child:before { content: "" }
508	//    </style>
509	//    <ul class=inlineList><li>One<li>Two<li>Three</ul>
510	// (4) Attribute value selectors as in a[href="http://example.com/"]
511	//
512	// We conservatively treat all strings as URLs, but make some
513	// allowances to avoid confusion.
514	//
515	// In (1), our conservative assumption is justified.
516	// In (2), valid font names do not contain ':', '?', or '#', so our
517	// conservative assumption is fine since we will never transition past
518	// urlPartPreQuery.
519	// In (3), our protocol heuristic should not be tripped, and there
520	// should not be non-space content after a '?' or '#', so as long as
521	// we only %-encode RFC 3986 reserved characters we are ok.
522	// In (4), we should URL escape for URL attributes, and for others we
523	// have the attribute name available if our conservative assumption
524	// proves problematic for real code.
525
526	k := 0
527	for {
528		i := k + bytes.IndexAny(s[k:], `("'/`)
529		if i < k {
530			return c, len(s)
531		}
532		switch s[i] {
533		case '(':
534			// Look for url to the left.
535			p := bytes.TrimRight(s[:i], "\t\n\f\r ")
536			if endsWithCSSKeyword(p, "url") {
537				j := len(s) - len(bytes.TrimLeft(s[i+1:], "\t\n\f\r "))
538				switch {
539				case j != len(s) && s[j] == '"':
540					c.state, j = stateCSSDqURL, j+1
541				case j != len(s) && s[j] == '\'':
542					c.state, j = stateCSSSqURL, j+1
543				default:
544					c.state = stateCSSURL
545				}
546				return c, j
547			}
548		case '/':
549			if i+1 < len(s) {
550				switch s[i+1] {
551				case '/':
552					c.state = stateCSSLineCmt
553					return c, i + 2
554				case '*':
555					c.state = stateCSSBlockCmt
556					return c, i + 2
557				}
558			}
559		case '"':
560			c.state = stateCSSDqStr
561			return c, i + 1
562		case '\'':
563			c.state = stateCSSSqStr
564			return c, i + 1
565		}
566		k = i + 1
567	}
568}
569
570// tCSSStr is the context transition function for the CSS string and URL states.
571func tCSSStr(c context, s []byte) (context, int) {
572	var endAndEsc string
573	switch c.state {
574	case stateCSSDqStr, stateCSSDqURL:
575		endAndEsc = `\"`
576	case stateCSSSqStr, stateCSSSqURL:
577		endAndEsc = `\'`
578	case stateCSSURL:
579		// Unquoted URLs end with a newline or close parenthesis.
580		// The below includes the wc (whitespace character) and nl.
581		endAndEsc = "\\\t\n\f\r )"
582	default:
583		panic(c.state.String())
584	}
585
586	k := 0
587	for {
588		i := k + bytes.IndexAny(s[k:], endAndEsc)
589		if i < k {
590			c, nread := tURL(c, decodeCSS(s[k:]))
591			return c, k + nread
592		}
593		if s[i] == '\\' {
594			i++
595			if i == len(s) {
596				return context{
597					state: stateError,
598					err:   errorf(ErrPartialEscape, nil, 0, "unfinished escape sequence in CSS string: %q", s),
599				}, len(s)
600			}
601		} else {
602			c.state = stateCSS
603			return c, i + 1
604		}
605		c, _ = tURL(c, decodeCSS(s[:i+1]))
606		k = i + 1
607	}
608}
609
610// tError is the context transition function for the error state.
611func tError(c context, s []byte) (context, int) {
612	return c, len(s)
613}
614
615// eatAttrName returns the largest j such that s[i:j] is an attribute name.
616// It returns an error if s[i:] does not look like it begins with an
617// attribute name, such as encountering a quote mark without a preceding
618// equals sign.
619func eatAttrName(s []byte, i int) (int, *Error) {
620	for j := i; j < len(s); j++ {
621		switch s[j] {
622		case ' ', '\t', '\n', '\f', '\r', '=', '>':
623			return j, nil
624		case '\'', '"', '<':
625			// These result in a parse warning in HTML5 and are
626			// indicative of serious problems if seen in an attr
627			// name in a template.
628			return -1, errorf(ErrBadHTML, nil, 0, "%q in attribute name: %.32q", s[j:j+1], s)
629		default:
630			// No-op.
631		}
632	}
633	return len(s), nil
634}
635
636var elementNameMap = map[string]element{
637	"script":   elementScript,
638	"style":    elementStyle,
639	"textarea": elementTextarea,
640	"title":    elementTitle,
641}
642
// asciiAlpha reports whether c is an ASCII letter.
func asciiAlpha(c byte) bool {
	// Setting bit 0x20 folds 'A'-'Z' onto 'a'-'z' and maps no other byte
	// into that range, so a single range check covers both cases.
	lower := c | 0x20
	return 'a' <= lower && lower <= 'z'
}
647
648// asciiAlphaNum reports whether c is an ASCII letter or digit.
649func asciiAlphaNum(c byte) bool {
650	return asciiAlpha(c) || '0' <= c && c <= '9'
651}
652
653// eatTagName returns the largest j such that s[i:j] is a tag name and the tag type.
654func eatTagName(s []byte, i int) (int, element) {
655	if i == len(s) || !asciiAlpha(s[i]) {
656		return i, elementNone
657	}
658	j := i + 1
659	for j < len(s) {
660		x := s[j]
661		if asciiAlphaNum(x) {
662			j++
663			continue
664		}
665		// Allow "x-y" or "x:y" but not "x-", "-y", or "x--y".
666		if (x == ':' || x == '-') && j+1 < len(s) && asciiAlphaNum(s[j+1]) {
667			j += 2
668			continue
669		}
670		break
671	}
672	return j, elementNameMap[strings.ToLower(string(s[i:j]))]
673}
674
// eatWhiteSpace returns the largest j such that s[i:j] is white space, where
// white space means space, tab, newline, form feed or carriage return.
func eatWhiteSpace(s []byte, i int) int {
	for j := i; j < len(s); j++ {
		if b := s[j]; b != ' ' && b != '\t' && b != '\n' && b != '\f' && b != '\r' {
			return j
		}
	}
	return len(s)
}
687