// Copyright 2011 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

package template

import (
	"bytes"
	"strings"
)

// transitionFunc is the array of context transition functions for text nodes.
// A transition function takes a context and template text input, and returns
// the updated context and the number of bytes consumed from the front of the
// input.
var transitionFunc = [...]func(context, []byte) (context, int){
	stateText:           tText,
	stateTag:            tTag,
	stateAttrName:       tAttrName,
	stateAfterName:      tAfterName,
	stateBeforeValue:    tBeforeValue,
	stateHTMLCmt:        tHTMLCmt,
	stateRCDATA:         tSpecialTagEnd,
	stateAttr:           tAttr,
	stateURL:            tURL,
	stateSrcset:         tURL,
	stateJS:             tJS,
	stateJSDqStr:        tJSDelimited,
	stateJSSqStr:        tJSDelimited,
	stateJSRegexp:       tJSDelimited,
	stateJSTmplLit:      tJSTmpl,
	stateJSBlockCmt:     tBlockCmt,
	stateJSLineCmt:      tLineCmt,
	stateJSHTMLOpenCmt:  tLineCmt,
	stateJSHTMLCloseCmt: tLineCmt,
	stateCSS:            tCSS,
	stateCSSDqStr:       tCSSStr,
	stateCSSSqStr:       tCSSStr,
	stateCSSDqURL:       tCSSStr,
	stateCSSSqURL:       tCSSStr,
	stateCSSURL:         tCSSStr,
	stateCSSBlockCmt:    tBlockCmt,
	stateCSSLineCmt:     tLineCmt,
	stateError:          tError,
}

// commentStart and commentEnd are the HTML comment delimiters. They are also
// used by tJS to recognize HTML-like comments inside JavaScript.
var commentStart = []byte("<!--")
var commentEnd = []byte("-->")

// tText is the context transition function for the text state.
//
// It scans for '<' and returns as soon as it finds either the start of an
// HTML comment or a tag name. If neither is found, the whole input is
// consumed and the context is returned unchanged.
func tText(c context, s []byte) (context, int) {
	k := 0
	for {
		i := k + bytes.IndexByte(s[k:], '<')
		// i < k means IndexByte returned -1 (no '<' left); i+1 == len(s)
		// means the '<' is the last byte, so nothing can follow it.
		if i < k || i+1 == len(s) {
			return c, len(s)
		} else if i+4 <= len(s) && bytes.Equal(commentStart, s[i:i+4]) {
			return context{state: stateHTMLCmt}, i + 4
		}
		i++
		// end records whether this is a closing tag ("</name").
		end := false
		if s[i] == '/' {
			if i+1 == len(s) {
				return c, len(s)
			}
			end, i = true, i+1
		}
		j, e := eatTagName(s, i)
		if j != i {
			// Closing tags never select a special element content type.
			if end {
				e = elementNone
			}
			// We've found an HTML tag.
			return context{state: stateTag, element: e}, j
		}
		// No tag name followed the '<'; resume scanning from there.
		k = j
	}
}

// elementContentType maps an element to the state entered for the element's
// body once its start tag has been closed by '>'.
var elementContentType = [...]state{
	elementNone:     stateText,
	elementScript:   stateJS,
	elementStyle:    stateCSS,
	elementTextarea: stateRCDATA,
	elementTitle:    stateRCDATA,
}

// tTag is the context transition function for the tag state.
//
// After skipping white space it either closes the tag on '>' or consumes one
// attribute name, classifying the attribute so that its value is later
// handled in the matching state.
func tTag(c context, s []byte) (context, int) {
	// Find the attribute name.
	i := eatWhiteSpace(s, 0)
	if i == len(s) {
		return c, len(s)
	}
	if s[i] == '>' {
		return context{
			state:   elementContentType[c.element],
			element: c.element,
		}, i + 1
	}
	j, err := eatAttrName(s, i)
	if err != nil {
		return context{state: stateError, err: err}, len(s)
	}
	state, attr := stateTag, attrNone
	// An empty attribute name means s[i] is something that can neither start
	// an attribute nor end the tag.
	if i == j {
		return context{
			state: stateError,
			err:   errorf(ErrBadHTML, nil, 0, "expected space, attr name, or end of tag, but got %q", s[i:]),
		}, len(s)
	}

	attrName := strings.ToLower(string(s[i:j]))
	if c.element == elementScript && attrName == "type" {
		attr = attrScriptType
	} else {
		switch attrType(attrName) {
		case contentTypeURL:
			attr = attrURL
		case contentTypeCSS:
			attr = attrStyle
		case contentTypeJS:
			attr = attrScript
		case contentTypeSrcset:
			attr = attrSrcset
		}
	}

	// If the name runs to the end of the input it may continue in the next
	// text node, so stay in stateAttrName; otherwise the name is complete.
	if j == len(s) {
		state = stateAttrName
	} else {
		state = stateAfterName
	}
	return context{state: state, element: c.element, attr: attr}, j
}

// tAttrName is the context transition function for stateAttrName.
//
// It consumes the remainder of an attribute name and moves to stateAfterName
// unless the name extends to the end of the input.
func tAttrName(c context, s []byte) (context, int) {
	i, err := eatAttrName(s, 0)
	if err != nil {
		return context{state: stateError, err: err}, len(s)
	} else if i != len(s) {
		c.state = stateAfterName
	}
	return c, i
}

// tAfterName is the context transition function for stateAfterName.
func tAfterName(c context, s []byte) (context, int) {
	// Look for the start of the value.
	i := eatWhiteSpace(s, 0)
	if i == len(s) {
		return c, len(s)
	} else if s[i] != '=' {
		// Occurs due to tag ending '>', and valueless attribute.
		c.state = stateTag
		return c, i
	}
	c.state = stateBeforeValue
	// Consume the "=".
	return c, i + 1
}

// attrStartStates maps an attribute type to the state in which the
// attribute's value is processed.
var attrStartStates = [...]state{
	attrNone:       stateAttr,
	attrScript:     stateJS,
	attrScriptType: stateAttr,
	attrStyle:      stateCSS,
	attrURL:        stateURL,
	attrSrcset:     stateSrcset,
}

// tBeforeValue is the context transition function for stateBeforeValue.
//
// It skips white space, records which delimiter (single quote, double quote,
// or none) encloses the value, and enters the value state selected by the
// attribute type.
func tBeforeValue(c context, s []byte) (context, int) {
	i := eatWhiteSpace(s, 0)
	if i == len(s) {
		return c, len(s)
	}
	// Find the attribute delimiter.
	delim := delimSpaceOrTagEnd
	switch s[i] {
	case '\'':
		delim, i = delimSingleQuote, i+1
	case '"':
		delim, i = delimDoubleQuote, i+1
	}
	c.state, c.delim = attrStartStates[c.attr], delim
	return c, i
}

// tHTMLCmt is the context transition function for stateHTMLCmt.
//
// On finding "-->" it returns the zero context and consumes through the end
// of the terminator; otherwise the whole input is consumed.
func tHTMLCmt(c context, s []byte) (context, int) {
	if i := bytes.Index(s, commentEnd); i != -1 {
		return context{}, i + 3
	}
	return c, len(s)
}

// specialTagEndMarkers maps element types to the character sequence that
// case-insensitively signals the end of the special tag body.
var specialTagEndMarkers = [...][]byte{
	elementScript:   []byte("script"),
	elementStyle:    []byte("style"),
	elementTextarea: []byte("textarea"),
	elementTitle:    []byte("title"),
}

var (
	// specialTagEndPrefix is the byte sequence that opens an end tag.
	specialTagEndPrefix = []byte("</")
	// tagEndSeparators lists the bytes that may directly follow the tag name
	// for indexTagEnd to accept it as an end tag.
	tagEndSeparators = []byte("> \t\n\f/")
)

// tSpecialTagEnd is the context transition function for raw text and RCDATA
// element states.
//
// When the end tag for c.element is found, it returns the zero context and
// consumes the input up to, but not including, the "</" of that end tag.
func tSpecialTagEnd(c context, s []byte) (context, int) {
	if c.element != elementNone {
		// script end tags ("</script") within script literals are ignored, so that
		// we can properly escape them.
		if c.element == elementScript && (isInScriptLiteral(c.state) || isComment(c.state)) {
			return c, len(s)
		}
		if i := indexTagEnd(s, specialTagEndMarkers[c.element]); i != -1 {
			return context{}, i
		}
	}
	return c, len(s)
}

// indexTagEnd finds the index of a special tag end in a case insensitive way, or returns -1
//
// The returned index is that of the "</" prefix. A match requires the tag
// name to be followed by at least one more byte, and that byte must be one of
// tagEndSeparators.
func indexTagEnd(s []byte, tag []byte) int {
	// res counts the bytes already sliced off the front of s, so that the
	// result is expressed relative to the original input.
	res := 0
	plen := len(specialTagEndPrefix)
	for len(s) > 0 {
		// Try to find the tag end prefix first
		i := bytes.Index(s, specialTagEndPrefix)
		if i == -1 {
			return i
		}
		s = s[i+plen:]
		// Try to match the actual tag if there is still space for it
		if len(tag) <= len(s) && bytes.EqualFold(tag, s[:len(tag)]) {
			s = s[len(tag):]
			// Check the tag is followed by a proper separator
			if len(s) > 0 && bytes.IndexByte(tagEndSeparators, s[0]) != -1 {
				return res + i
			}
			res += len(tag)
		}
		res += i + plen
	}
	return -1
}

// tAttr is the context transition function for the attribute state.
// It consumes the whole input without changing the context.
func tAttr(c context, s []byte) (context, int) {
	return c, len(s)
}

// tURL is the context transition function for the URL state.
//
// It always consumes the whole input and only updates c.urlPart.
func tURL(c context, s []byte) (context, int) {
	if bytes.ContainsAny(s, "#?") {
		c.urlPart = urlPartQueryOrFrag
	} else if len(s) != eatWhiteSpace(s, 0) && c.urlPart == urlPartNone {
		// HTML5 uses "Valid URL potentially surrounded by spaces" for
		// attrs: https://www.w3.org/TR/html5/index.html#attributes-1
		c.urlPart = urlPartPreQuery
	}
	return c, len(s)
}

// tJS is the context transition function for the JS state.
//
// It advances to the next byte that may open a string, template literal,
// regexp, comment, or brace, updates c.jsCtx from the text skipped over, and
// consumes through that token's opening bytes.
func tJS(c context, s []byte) (context, int) {
	i := bytes.IndexAny(s, "\"`'/{}<-#")
	if i == -1 {
		// Entire input is non string, comment, regexp tokens.
		c.jsCtx = nextJSCtx(s, c.jsCtx)
		return c, len(s)
	}
	c.jsCtx = nextJSCtx(s[:i], c.jsCtx)
	switch s[i] {
	case '"':
		c.state, c.jsCtx = stateJSDqStr, jsCtxRegexp
	case '\'':
		c.state, c.jsCtx = stateJSSqStr, jsCtxRegexp
	case '`':
		c.state, c.jsCtx = stateJSTmplLit, jsCtxRegexp
	case '/':
		switch {
		case i+1 < len(s) && s[i+1] == '/':
			c.state, i = stateJSLineCmt, i+1
		case i+1 < len(s) && s[i+1] == '*':
			c.state, i = stateJSBlockCmt, i+1
		case c.jsCtx == jsCtxRegexp:
			c.state = stateJSRegexp
		case c.jsCtx == jsCtxDivOp:
			c.jsCtx = jsCtxRegexp
		default:
			return context{
				state: stateError,
				err:   errorf(ErrSlashAmbig, nil, 0, "'/' could start a division or regexp: %.32q", s[i:]),
			}, len(s)
		}
	// ECMAScript supports HTML style comments for legacy reasons, see Appendix
	// B.1.1 "HTML-like Comments". The handling of these comments is somewhat
	// confusing. Multi-line comments are not supported, i.e. anything on lines
	// between the opening and closing tokens is not considered a comment, but
	// anything following the opening or closing token, on the same line, is
	// ignored. As such we simply treat any line prefixed with "<!--" or "-->"
	// as if it were actually prefixed with "//" and move on.
	case '<':
		if i+3 < len(s) && bytes.Equal(commentStart, s[i:i+4]) {
			c.state, i = stateJSHTMLOpenCmt, i+3
		}
	case '-':
		if i+2 < len(s) && bytes.Equal(commentEnd, s[i:i+3]) {
			c.state, i = stateJSHTMLCloseCmt, i+2
		}
	// ECMAScript also supports "hashbang" comment lines, see Section 12.5.
	case '#':
		if i+1 < len(s) && s[i+1] == '!' {
			c.state, i = stateJSLineCmt, i+1
		}
	case '{':
		// We only care about tracking brace depth if we are inside of a
		// template literal.
		if len(c.jsBraceDepth) == 0 {
			return c, i + 1
		}
		c.jsBraceDepth[len(c.jsBraceDepth)-1]++
	case '}':
		if len(c.jsBraceDepth) == 0 {
			return c, i + 1
		}
		// There are no cases where a brace can be escaped in the JS context
		// that are not syntax errors, it seems. Because of this we can just
		// count "\}" as "}" and move on, the script is already broken as
		// fully fledged parsers will just fail anyway.
		c.jsBraceDepth[len(c.jsBraceDepth)-1]--
		if c.jsBraceDepth[len(c.jsBraceDepth)-1] >= 0 {
			return c, i + 1
		}
		// The innermost counter went negative: this '}' closes the "${" that
		// pushed it, so pop the counter and resume the template literal.
		c.jsBraceDepth = c.jsBraceDepth[:len(c.jsBraceDepth)-1]
		c.state = stateJSTmplLit
	default:
		panic("unreachable")
	}
	return c, i + 1
}

// tJSTmpl is the context transition function for stateJSTmplLit.
//
// It scans for a closing backtick or a "${" sequence, skipping any byte that
// follows a backslash. A backtick returns to stateJS; "${" pushes a new brace
// depth counter onto c.jsBraceDepth and also enters stateJS.
func tJSTmpl(c context, s []byte) (context, int) {
	var k int
	for {
		i := k + bytes.IndexAny(s[k:], "`\\$")
		if i < k {
			break
		}
		switch s[i] {
		case '\\':
			i++
			if i == len(s) {
				return context{
					state: stateError,
					err:   errorf(ErrPartialEscape, nil, 0, "unfinished escape sequence in JS string: %q", s),
				}, len(s)
			}
		case '$':
			if len(s) >= i+2 && s[i+1] == '{' {
				c.jsBraceDepth = append(c.jsBraceDepth, 0)
				c.state = stateJS
				return c, i + 2
			}
		case '`':
			// end
			c.state = stateJS
			return c, i + 1
		}
		k = i + 1
	}

	return c, len(s)
}

// tJSDelimited is the context transition function for the JS string and regexp
// states.
385func tJSDelimited(c context, s []byte) (context, int) { 386 specials := `\"` 387 switch c.state { 388 case stateJSSqStr: 389 specials = `\'` 390 case stateJSRegexp: 391 specials = `\/[]` 392 } 393 394 k, inCharset := 0, false 395 for { 396 i := k + bytes.IndexAny(s[k:], specials) 397 if i < k { 398 break 399 } 400 switch s[i] { 401 case '\\': 402 i++ 403 if i == len(s) { 404 return context{ 405 state: stateError, 406 err: errorf(ErrPartialEscape, nil, 0, "unfinished escape sequence in JS string: %q", s), 407 }, len(s) 408 } 409 case '[': 410 inCharset = true 411 case ']': 412 inCharset = false 413 case '/': 414 // If "</script" appears in a regex literal, the '/' should not 415 // close the regex literal, and it will later be escaped to 416 // "\x3C/script" in escapeText. 417 if i > 0 && i+7 <= len(s) && bytes.Equal(bytes.ToLower(s[i-1:i+7]), []byte("</script")) { 418 i++ 419 } else if !inCharset { 420 c.state, c.jsCtx = stateJS, jsCtxDivOp 421 return c, i + 1 422 } 423 default: 424 // end delimiter 425 if !inCharset { 426 c.state, c.jsCtx = stateJS, jsCtxDivOp 427 return c, i + 1 428 } 429 } 430 k = i + 1 431 } 432 433 if inCharset { 434 // This can be fixed by making context richer if interpolation 435 // into charsets is desired. 436 return context{ 437 state: stateError, 438 err: errorf(ErrPartialCharset, nil, 0, "unfinished JS regexp charset: %q", s), 439 }, len(s) 440 } 441 442 return c, len(s) 443} 444 445var blockCommentEnd = []byte("*/") 446 447// tBlockCmt is the context transition function for /*comment*/ states. 448func tBlockCmt(c context, s []byte) (context, int) { 449 i := bytes.Index(s, blockCommentEnd) 450 if i == -1 { 451 return c, len(s) 452 } 453 switch c.state { 454 case stateJSBlockCmt: 455 c.state = stateJS 456 case stateCSSBlockCmt: 457 c.state = stateCSS 458 default: 459 panic(c.state.String()) 460 } 461 return c, i + 2 462} 463 464// tLineCmt is the context transition function for //comment states, and the JS HTML-like comment state. 
465func tLineCmt(c context, s []byte) (context, int) { 466 var lineTerminators string 467 var endState state 468 switch c.state { 469 case stateJSLineCmt, stateJSHTMLOpenCmt, stateJSHTMLCloseCmt: 470 lineTerminators, endState = "\n\r\u2028\u2029", stateJS 471 case stateCSSLineCmt: 472 lineTerminators, endState = "\n\f\r", stateCSS 473 // Line comments are not part of any published CSS standard but 474 // are supported by the 4 major browsers. 475 // This defines line comments as 476 // LINECOMMENT ::= "//" [^\n\f\d]* 477 // since https://www.w3.org/TR/css3-syntax/#SUBTOK-nl defines 478 // newlines: 479 // nl ::= #xA | #xD #xA | #xD | #xC 480 default: 481 panic(c.state.String()) 482 } 483 484 i := bytes.IndexAny(s, lineTerminators) 485 if i == -1 { 486 return c, len(s) 487 } 488 c.state = endState 489 // Per section 7.4 of EcmaScript 5 : https://es5.github.io/#x7.4 490 // "However, the LineTerminator at the end of the line is not 491 // considered to be part of the single-line comment; it is 492 // recognized separately by the lexical grammar and becomes part 493 // of the stream of input elements for the syntactic grammar." 494 return c, i 495} 496 497// tCSS is the context transition function for the CSS state. 
498func tCSS(c context, s []byte) (context, int) { 499 // CSS quoted strings are almost never used except for: 500 // (1) URLs as in background: "/foo.png" 501 // (2) Multiword font-names as in font-family: "Times New Roman" 502 // (3) List separators in content values as in inline-lists: 503 // <style> 504 // ul.inlineList { list-style: none; padding:0 } 505 // ul.inlineList > li { display: inline } 506 // ul.inlineList > li:before { content: ", " } 507 // ul.inlineList > li:first-child:before { content: "" } 508 // </style> 509 // <ul class=inlineList><li>One<li>Two<li>Three</ul> 510 // (4) Attribute value selectors as in a[href="http://example.com/"] 511 // 512 // We conservatively treat all strings as URLs, but make some 513 // allowances to avoid confusion. 514 // 515 // In (1), our conservative assumption is justified. 516 // In (2), valid font names do not contain ':', '?', or '#', so our 517 // conservative assumption is fine since we will never transition past 518 // urlPartPreQuery. 519 // In (3), our protocol heuristic should not be tripped, and there 520 // should not be non-space content after a '?' or '#', so as long as 521 // we only %-encode RFC 3986 reserved characters we are ok. 522 // In (4), we should URL escape for URL attributes, and for others we 523 // have the attribute name available if our conservative assumption 524 // proves problematic for real code. 525 526 k := 0 527 for { 528 i := k + bytes.IndexAny(s[k:], `("'/`) 529 if i < k { 530 return c, len(s) 531 } 532 switch s[i] { 533 case '(': 534 // Look for url to the left. 
535 p := bytes.TrimRight(s[:i], "\t\n\f\r ") 536 if endsWithCSSKeyword(p, "url") { 537 j := len(s) - len(bytes.TrimLeft(s[i+1:], "\t\n\f\r ")) 538 switch { 539 case j != len(s) && s[j] == '"': 540 c.state, j = stateCSSDqURL, j+1 541 case j != len(s) && s[j] == '\'': 542 c.state, j = stateCSSSqURL, j+1 543 default: 544 c.state = stateCSSURL 545 } 546 return c, j 547 } 548 case '/': 549 if i+1 < len(s) { 550 switch s[i+1] { 551 case '/': 552 c.state = stateCSSLineCmt 553 return c, i + 2 554 case '*': 555 c.state = stateCSSBlockCmt 556 return c, i + 2 557 } 558 } 559 case '"': 560 c.state = stateCSSDqStr 561 return c, i + 1 562 case '\'': 563 c.state = stateCSSSqStr 564 return c, i + 1 565 } 566 k = i + 1 567 } 568} 569 570// tCSSStr is the context transition function for the CSS string and URL states. 571func tCSSStr(c context, s []byte) (context, int) { 572 var endAndEsc string 573 switch c.state { 574 case stateCSSDqStr, stateCSSDqURL: 575 endAndEsc = `\"` 576 case stateCSSSqStr, stateCSSSqURL: 577 endAndEsc = `\'` 578 case stateCSSURL: 579 // Unquoted URLs end with a newline or close parenthesis. 580 // The below includes the wc (whitespace character) and nl. 581 endAndEsc = "\\\t\n\f\r )" 582 default: 583 panic(c.state.String()) 584 } 585 586 k := 0 587 for { 588 i := k + bytes.IndexAny(s[k:], endAndEsc) 589 if i < k { 590 c, nread := tURL(c, decodeCSS(s[k:])) 591 return c, k + nread 592 } 593 if s[i] == '\\' { 594 i++ 595 if i == len(s) { 596 return context{ 597 state: stateError, 598 err: errorf(ErrPartialEscape, nil, 0, "unfinished escape sequence in CSS string: %q", s), 599 }, len(s) 600 } 601 } else { 602 c.state = stateCSS 603 return c, i + 1 604 } 605 c, _ = tURL(c, decodeCSS(s[:i+1])) 606 k = i + 1 607 } 608} 609 610// tError is the context transition function for the error state. 611func tError(c context, s []byte) (context, int) { 612 return c, len(s) 613} 614 615// eatAttrName returns the largest j such that s[i:j] is an attribute name. 
616// It returns an error if s[i:] does not look like it begins with an 617// attribute name, such as encountering a quote mark without a preceding 618// equals sign. 619func eatAttrName(s []byte, i int) (int, *Error) { 620 for j := i; j < len(s); j++ { 621 switch s[j] { 622 case ' ', '\t', '\n', '\f', '\r', '=', '>': 623 return j, nil 624 case '\'', '"', '<': 625 // These result in a parse warning in HTML5 and are 626 // indicative of serious problems if seen in an attr 627 // name in a template. 628 return -1, errorf(ErrBadHTML, nil, 0, "%q in attribute name: %.32q", s[j:j+1], s) 629 default: 630 // No-op. 631 } 632 } 633 return len(s), nil 634} 635 636var elementNameMap = map[string]element{ 637 "script": elementScript, 638 "style": elementStyle, 639 "textarea": elementTextarea, 640 "title": elementTitle, 641} 642 643// asciiAlpha reports whether c is an ASCII letter. 644func asciiAlpha(c byte) bool { 645 return 'A' <= c && c <= 'Z' || 'a' <= c && c <= 'z' 646} 647 648// asciiAlphaNum reports whether c is an ASCII letter or digit. 649func asciiAlphaNum(c byte) bool { 650 return asciiAlpha(c) || '0' <= c && c <= '9' 651} 652 653// eatTagName returns the largest j such that s[i:j] is a tag name and the tag type. 654func eatTagName(s []byte, i int) (int, element) { 655 if i == len(s) || !asciiAlpha(s[i]) { 656 return i, elementNone 657 } 658 j := i + 1 659 for j < len(s) { 660 x := s[j] 661 if asciiAlphaNum(x) { 662 j++ 663 continue 664 } 665 // Allow "x-y" or "x:y" but not "x-", "-y", or "x--y". 666 if (x == ':' || x == '-') && j+1 < len(s) && asciiAlphaNum(s[j+1]) { 667 j += 2 668 continue 669 } 670 break 671 } 672 return j, elementNameMap[strings.ToLower(string(s[i:j]))] 673} 674 675// eatWhiteSpace returns the largest j such that s[i:j] is white space. 676func eatWhiteSpace(s []byte, i int) int { 677 for j := i; j < len(s); j++ { 678 switch s[j] { 679 case ' ', '\t', '\n', '\f', '\r': 680 // No-op. 681 default: 682 return j 683 } 684 } 685 return len(s) 686} 687