1// Copyright 2021 The Go Authors. All rights reserved. 2// Use of this source code is governed by a BSD-style 3// license that can be found in the LICENSE file. 4 5package markdown 6 7import ( 8 "bytes" 9 "strconv" 10 "strings" 11 "unicode" 12) 13 14type HTMLBlock struct { 15 Position 16 Text []string 17} 18 19func (b *HTMLBlock) PrintHTML(buf *bytes.Buffer) { 20 for _, s := range b.Text { 21 buf.WriteString(s) 22 buf.WriteString("\n") 23 } 24} 25 26func (b *HTMLBlock) printMarkdown(buf *bytes.Buffer, s mdState) { 27 if s.prefix1 != "" { 28 buf.WriteString(s.prefix1) 29 } else { 30 buf.WriteString(s.prefix) 31 } 32 b.PrintHTML(buf) 33} 34 35type htmlBuilder struct { 36 endBlank bool 37 text []string 38 endFunc func(string) bool 39} 40 41func (c *htmlBuilder) extend(p *parseState, s line) (line, bool) { 42 if c.endBlank && s.isBlank() { 43 return s, false 44 } 45 t := s.string() 46 c.text = append(c.text, t) 47 if c.endFunc != nil && c.endFunc(t) { 48 return line{}, false 49 } 50 return line{}, true 51} 52 53func (c *htmlBuilder) build(p buildState) Block { 54 return &HTMLBlock{ 55 p.pos(), 56 c.text, 57 } 58} 59 60func newHTML(p *parseState, s line) (line, bool) { 61 peek := s 62 if p.startHTML(&peek) { 63 return line{}, true 64 } 65 return s, false 66} 67 68func (p *parseState) startHTML(s *line) bool { 69 tt := *s 70 tt.trimSpace(0, 3, false) 71 if tt.peek() != '<' { 72 return false 73 } 74 t := tt.string() 75 76 var end string 77 switch { 78 case strings.HasPrefix(t, "<!--"): 79 end = "-->" 80 case strings.HasPrefix(t, "<?"): 81 end = "?>" 82 case strings.HasPrefix(t, "<![CDATA["): 83 end = "]]>" 84 case strings.HasPrefix(t, "<!") && len(t) >= 3 && isLetter(t[2]): 85 if 'a' <= t[2] && t[2] <= 'z' { 86 // Goldmark and the Dingus only accept <!UPPER> not <!lower>. 87 p.corner = true 88 } 89 end = ">" 90 } 91 if end != "" { 92 b := &htmlBuilder{endFunc: func(s string) bool { return strings.Contains(s, end) }} 93 p.addBlock(b) 94 b.text = append(b.text, s.string()) 95 if b.endFunc(t) { 96 p.closeBlock() 97 } 98 return true 99 } 100 101 // case 6 102 i := 1 103 if i < len(t) && t[i] == '/' { 104 i++ 105 } 106 buf := make([]byte, 0, 16) 107 for ; i < len(t) && len(buf) < 16; i++ { 108 c := t[i] 109 if 'A' <= c && c <= 'Z' { 110 c += 'a' - 'A' 111 } 112 if !('a' <= c && c <= 'z') && !('0' <= c && c <= '9') { 113 break 114 } 115 buf = append(buf, c) 116 } 117 var sep byte 118 if i < len(t) { 119 switch t[i] { 120 default: 121 goto Next 122 case ' ', '\t', '>': 123 // ok 124 sep = t[i] 125 case '/': 126 if i+1 >= len(t) || t[i+1] != '>' { 127 goto Next 128 } 129 } 130 } 131 132 if len(buf) == 0 { 133 goto Next 134 } 135 { 136 c := buf[0] 137 var ok bool 138 for _, name := range htmlTags { 139 if name[0] == c && len(name) == len(buf) && name == string(buf) { 140 if sep == '\t' { 141 // Goldmark recognizes space here but not tab. 142 // testdata/extra.txt 143.md 143 p.corner = true 144 } 145 ok = true 146 break 147 } 148 } 149 if !ok { 150 goto Next 151 } 152 } 153 154 { 155 b := &htmlBuilder{endBlank: true} 156 p.addBlock(b) 157 b.text = append(b.text, s.string()) 158 return true 159 } 160 161Next: 162 // case 1 163 if len(t) > 1 && t[1] != '/' && (i >= len(t) || t[i] == ' ' || t[i] == '\t' || t[i] == '>') { 164 switch string(buf) { 165 case "pre", "script", "style", "textarea": 166 b := &htmlBuilder{endFunc: hasEndPre} 167 p.addBlock(b) 168 b.text = append(b.text, s.string()) 169 if hasEndPre(t) { 170 p.closeBlock() 171 } 172 return true 173 } 174 } 175 176 // case 7 177 if p.para() == nil { 178 if _, e, ok := parseHTMLOpenTag(p, t, 0); ok && skipSpace(t, e) == len(t) { 179 if e != len(t) { 180 // Goldmark disallows trailing space 181 p.corner = true 182 } 183 b := &htmlBuilder{endBlank: true} 184 p.addBlock(b) 185 b.text = append(b.text, s.string()) 186 return true 187 } 188 if _, e, ok := parseHTMLClosingTag(p, t, 0); ok && skipSpace(t, e) == len(t) { 189 b := &htmlBuilder{endBlank: true} 190 p.addBlock(b) 191 b.text = append(b.text, s.string()) 192 return true 193 } 194 } 195 196 return false 197} 198 199func hasEndPre(s string) bool { 200 for i := 0; i < len(s); i++ { 201 if s[i] == '<' && i+1 < len(s) && s[i+1] == '/' { 202 buf := make([]byte, 0, 8) 203 for i += 2; i < len(s) && len(buf) < 8; i++ { 204 c := s[i] 205 if 'A' <= c && c <= 'Z' { 206 c += 'a' - 'A' 207 } 208 if c < 'a' || 'z' < c { 209 break 210 } 211 buf = append(buf, c) 212 } 213 if i < len(s) && s[i] == '>' { 214 switch string(buf) { 215 case "pre", "script", "style", "textarea": 216 return true 217 } 218 } 219 } 220 } 221 return false 222} 223 224func parseHTMLTag(p *parseState, s string, i int) (Inline, int, bool) { 225 // “An HTML tag consists of an open tag, a closing tag, an HTML comment, 226 // a processing instruction, a declaration, or a CDATA section.” 227 if i+3 <= len(s) && s[i] == '<' { 228 switch s[i+1] { 229 default: 230 return parseHTMLOpenTag(p, s, i) 231 case '/': 232 return parseHTMLClosingTag(p, s, i) 233 case '!': 234 switch s[i+2] { 235 case '-': 236 return parseHTMLComment(s, i) 237 case '[': 238 return parseHTMLCDATA(s, i) 239 default: 240 return parseHTMLDecl(p, s, i) 241 } 242 case '?': 243 return parseHTMLProcInst(s, i) 244 } 245 } 246 return nil, 0, false 247} 248 249func parseHTMLOpenTag(p *parseState, s string, i int) (Inline, int, bool) { 250 if i >= len(s) || s[i] != '<' { 251 return nil, 0, false 252 } 253 // “An open tag consists of a < character, a tag name, zero or more attributes, 254 // optional spaces, tabs, and up to one line ending, an optional / character, and a > character.” 255 if name, j, ok := parseTagName(s, i+1); ok { 256 switch name { 257 case "pre", "script", "style", "textarea": 258 // Goldmark treats these as starting a new HTMLBlock 259 // and ending the paragraph they appear in. 260 p.corner = true 261 } 262 for { 263 if j >= len(s) || s[j] != ' ' && s[j] != '\t' && s[j] != '\n' && s[j] != '/' && s[j] != '>' { 264 return nil, 0, false 265 } 266 _, k, ok := parseAttr(p, s, j) 267 if !ok { 268 break 269 } 270 j = k 271 } 272 k := skipSpace(s, j) 273 if k != j { 274 // Goldmark mishandles spaces before >. 275 p.corner = true 276 } 277 j = k 278 if j < len(s) && s[j] == '/' { 279 j++ 280 } 281 if j < len(s) && s[j] == '>' { 282 return &HTMLTag{s[i : j+1]}, j + 1, true 283 } 284 } 285 return nil, 0, false 286} 287 288func parseHTMLClosingTag(p *parseState, s string, i int) (Inline, int, bool) { 289 // “A closing tag consists of the string </, a tag name, 290 // optional spaces, tabs, and up to one line ending, and the character >.” 291 if i+2 >= len(s) || s[i] != '<' || s[i+1] != '/' { 292 return nil, 0, false 293 } 294 if skipSpace(s, i+2) != i+2 { 295 // Goldmark allows spaces here but the spec and the Dingus do not. 296 p.corner = true 297 } 298 299 if _, j, ok := parseTagName(s, i+2); ok { 300 j = skipSpace(s, j) 301 if j < len(s) && s[j] == '>' { 302 return &HTMLTag{s[i : j+1]}, j + 1, true 303 } 304 } 305 return nil, 0, false 306} 307 308func parseTagName(s string, i int) (string, int, bool) { 309 // “A tag name consists of an ASCII letter followed by zero or more ASCII letters, digits, or hyphens (-).” 310 if i < len(s) && isLetter(s[i]) { 311 j := i + 1 312 for j < len(s) && isLDH(s[j]) { 313 j++ 314 } 315 return s[i:j], j, true 316 } 317 return "", 0, false 318} 319 320func parseAttr(p *parseState, s string, i int) (string, int, bool) { 321 // “An attribute consists of spaces, tabs, and up to one line ending, 322 // an attribute name, and an optional attribute value specification.” 323 i = skipSpace(s, i) 324 if _, j, ok := parseAttrName(s, i); ok { 325 if _, k, ok := parseAttrValueSpec(p, s, j); ok { 326 j = k 327 } 328 return s[i:j], j, true 329 } 330 return "", 0, false 331} 332 333func parseAttrName(s string, i int) (string, int, bool) { 334 // “An attribute name consists of an ASCII letter, _, or :, 335 // followed by zero or more ASCII letters, digits, _, ., :, or -.” 336 if i+1 < len(s) && (isLetter(s[i]) || s[i] == '_' || s[i] == ':') { 337 j := i + 1 338 for j < len(s) && (isLDH(s[j]) || s[j] == '_' || s[j] == '.' || s[j] == ':') { 339 j++ 340 } 341 return s[i:j], j, true 342 } 343 return "", 0, false 344} 345 346func parseAttrValueSpec(p *parseState, s string, i int) (string, int, bool) { 347 // “An attribute value specification consists of 348 // optional spaces, tabs, and up to one line ending, 349 // a = character, 350 // optional spaces, tabs, and up to one line ending, 351 // and an attribute value.” 352 i = skipSpace(s, i) 353 if i+1 < len(s) && s[i] == '=' { 354 i = skipSpace(s, i+1) 355 if _, j, ok := parseAttrValue(s, i); ok { 356 p.corner = p.corner || strings.Contains(s[i:j], "\ufffd") 357 return s[i:j], j, true 358 } 359 } 360 return "", 0, false 361} 362 363func parseAttrValue(s string, i int) (string, int, bool) { 364 // “An attribute value consists of 365 // an unquoted attribute value, 366 // a single-quoted attribute value, 367 // or a double-quoted attribute value.” 368 // TODO: No escaping??? 369 if i < len(s) && (s[i] == '\'' || s[i] == '"') { 370 // “A single-quoted attribute value consists of ', 371 // zero or more characters not including ', and a final '.” 372 // “A double-quoted attribute value consists of ", 373 // zero or more characters not including ", and a final ".” 374 if j := strings.IndexByte(s[i+1:], s[i]); j >= 0 { 375 end := i + 1 + j + 1 376 return s[i:end], end, true 377 } 378 } 379 380 // “An unquoted attribute value is a nonempty string of characters 381 // not including spaces, tabs, line endings, ", ', =, <, >, or `.” 382 j := i 383 for j < len(s) && strings.IndexByte(" \t\n\"'=<>`", s[j]) < 0 { 384 j++ 385 } 386 if j > i { 387 return s[i:j], j, true 388 } 389 return "", 0, false 390} 391 392func parseHTMLComment(s string, i int) (Inline, int, bool) { 393 // “An HTML comment consists of <!-- + text + -->, 394 // where text does not start with > or ->, 395 // does not end with -, and does not contain --.” 396 if !strings.HasPrefix(s[i:], "<!-->") && 397 !strings.HasPrefix(s[i:], "<!--->") { 398 if x, end, ok := parseHTMLMarker(s, i, "<!--", "-->"); ok { 399 if t := x.(*HTMLTag).Text; !strings.Contains(t[len("<!--"):len(t)-len("->")], "--") { 400 return x, end, ok 401 } 402 } 403 } 404 return nil, 0, false 405} 406 407func parseHTMLCDATA(s string, i int) (Inline, int, bool) { 408 // “A CDATA section consists of the string <![CDATA[, 409 // a string of characters not including the string ]]>, and the string ]]>.” 410 return parseHTMLMarker(s, i, "<![CDATA[", "]]>") 411} 412 413func parseHTMLDecl(p *parseState, s string, i int) (Inline, int, bool) { 414 // “A declaration consists of the string <!, an ASCII letter, 415 // zero or more characters not including the character >, and the character >.” 416 if i+2 < len(s) && isLetter(s[i+2]) { 417 if 'a' <= s[i+2] && s[i+2] <= 'z' { 418 p.corner = true // goldmark requires uppercase 419 } 420 return parseHTMLMarker(s, i, "<!", ">") 421 } 422 return nil, 0, false 423} 424 425func parseHTMLProcInst(s string, i int) (Inline, int, bool) { 426 // “A processing instruction consists of the string <?, 427 // a string of characters not including the string ?>, and the string ?>.” 428 return parseHTMLMarker(s, i, "<?", "?>") 429} 430 431func parseHTMLMarker(s string, i int, prefix, suffix string) (Inline, int, bool) { 432 if strings.HasPrefix(s[i:], prefix) { 433 if j := strings.Index(s[i+len(prefix):], suffix); j >= 0 { 434 end := i + len(prefix) + j + len(suffix) 435 return &HTMLTag{s[i:end]}, end, true 436 } 437 } 438 return nil, 0, false 439} 440 441func parseHTMLEntity(_ *parseState, s string, i int) (Inline, int, int, bool) { 442 start := i 443 if i+1 < len(s) && s[i+1] == '#' { 444 i += 2 445 var r, end int 446 if i < len(s) && (s[i] == 'x' || s[i] == 'X') { 447 // hex 448 i++ 449 j := i 450 for j < len(s) && isHexDigit(s[j]) { 451 j++ 452 } 453 if j-i < 1 || j-i > 6 || j >= len(s) || s[j] != ';' { 454 return nil, 0, 0, false 455 } 456 r64, _ := strconv.ParseInt(s[i:j], 16, 0) 457 r = int(r64) 458 end = j + 1 459 } else { 460 // decimal 461 j := i 462 for j < len(s) && isDigit(s[j]) { 463 j++ 464 } 465 if j-i < 1 || j-i > 7 || j >= len(s) || s[j] != ';' { 466 return nil, 0, 0, false 467 } 468 r, _ = strconv.Atoi(s[i:j]) 469 end = j + 1 470 } 471 if r > unicode.MaxRune || r == 0 { 472 r = unicode.ReplacementChar 473 } 474 return &Plain{string(rune(r))}, start, end, true 475 } 476 477 // Max name in list is 32 bytes. Try for 64 for good measure. 478 for j := i + 1; j < len(s) && j-i < 64; j++ { 479 if s[j] == '&' { // Stop possible quadratic search on &&&&&&&. 480 break 481 } 482 if s[j] == ';' { 483 if r, ok := htmlEntity[s[i:j+1]]; ok { 484 return &Plain{r}, start, j + 1, true 485 } 486 break 487 } 488 } 489 490 return nil, 0, 0, false 491} 492 493type HTMLTag struct { 494 Text string 495} 496 497func (*HTMLTag) Inline() {} 498 499func (x *HTMLTag) PrintHTML(buf *bytes.Buffer) { 500 buf.WriteString(x.Text) 501} 502 503func (x *HTMLTag) printMarkdown(buf *bytes.Buffer) { 504 x.PrintHTML(buf) 505} 506 507func (x *HTMLTag) PrintText(buf *bytes.Buffer) {} 508