1*4947cdc7SCole Faust// Copyright 2017 The Bazel Authors. All rights reserved. 2*4947cdc7SCole Faust// Use of this source code is governed by a BSD-style 3*4947cdc7SCole Faust// license that can be found in the LICENSE file. 4*4947cdc7SCole Faust 5*4947cdc7SCole Faustpackage syntax 6*4947cdc7SCole Faust 7*4947cdc7SCole Faust// Starlark quoted string utilities. 8*4947cdc7SCole Faust 9*4947cdc7SCole Faustimport ( 10*4947cdc7SCole Faust "fmt" 11*4947cdc7SCole Faust "strconv" 12*4947cdc7SCole Faust "strings" 13*4947cdc7SCole Faust "unicode" 14*4947cdc7SCole Faust "unicode/utf8" 15*4947cdc7SCole Faust) 16*4947cdc7SCole Faust 17*4947cdc7SCole Faust// unesc maps single-letter chars following \ to their actual values. 18*4947cdc7SCole Faustvar unesc = [256]byte{ 19*4947cdc7SCole Faust 'a': '\a', 20*4947cdc7SCole Faust 'b': '\b', 21*4947cdc7SCole Faust 'f': '\f', 22*4947cdc7SCole Faust 'n': '\n', 23*4947cdc7SCole Faust 'r': '\r', 24*4947cdc7SCole Faust 't': '\t', 25*4947cdc7SCole Faust 'v': '\v', 26*4947cdc7SCole Faust '\\': '\\', 27*4947cdc7SCole Faust '\'': '\'', 28*4947cdc7SCole Faust '"': '"', 29*4947cdc7SCole Faust} 30*4947cdc7SCole Faust 31*4947cdc7SCole Faust// esc maps escape-worthy bytes to the char that should follow \. 32*4947cdc7SCole Faustvar esc = [256]byte{ 33*4947cdc7SCole Faust '\a': 'a', 34*4947cdc7SCole Faust '\b': 'b', 35*4947cdc7SCole Faust '\f': 'f', 36*4947cdc7SCole Faust '\n': 'n', 37*4947cdc7SCole Faust '\r': 'r', 38*4947cdc7SCole Faust '\t': 't', 39*4947cdc7SCole Faust '\v': 'v', 40*4947cdc7SCole Faust '\\': '\\', 41*4947cdc7SCole Faust '\'': '\'', 42*4947cdc7SCole Faust '"': '"', 43*4947cdc7SCole Faust} 44*4947cdc7SCole Faust 45*4947cdc7SCole Faust// unquote unquotes the quoted string, returning the actual 46*4947cdc7SCole Faust// string value, whether the original was triple-quoted, 47*4947cdc7SCole Faust// whether it was a byte string, and an error describing invalid input. 
func unquote(quoted string) (s string, triple, isByte bool, err error) {
	// Accepted shape: an optional "r" (raw) prefix, then an optional "b"
	// (bytes) prefix, then a body delimited by a matching pair of ' or "
	// characters, or by the same quote character tripled on both sides.
	// On error the returned string is empty; triple and isByte hold
	// whatever had been determined before the error was detected.

	// Check for raw prefix: means don't interpret the inner \.
	raw := false
	if strings.HasPrefix(quoted, "r") {
		raw = true
		quoted = quoted[1:]
	}
	// Check for bytes prefix.
	if strings.HasPrefix(quoted, "b") {
		isByte = true
		quoted = quoted[1:]
	}

	if len(quoted) < 2 {
		return "", triple, isByte, fmt.Errorf("string literal too short")
	}

	quote := quoted[0]
	if (quote != '"' && quote != '\'') || quote != quoted[len(quoted)-1] {
		return "", triple, isByte, fmt.Errorf("string literal has invalid quotes")
	}

	// Check for triple quoted string.
	if len(quoted) >= 6 && quoted[1] == quote && quoted[2] == quote && quoted[:3] == quoted[len(quoted)-3:] {
		triple = true
		quoted = quoted[3 : len(quoted)-3]
	} else {
		quoted = quoted[1 : len(quoted)-1]
	}

	// Now quoted is the quoted data, but no quotes.
	// If we're in raw mode or there are no escapes or
	// carriage returns, we're done.
	special := "\\\r"
	if raw {
		// Raw literals keep backslashes verbatim; only carriage
		// returns are still rewritten.
		special = "\r"
	}
	if !strings.ContainsAny(quoted, special) {
		return quoted, triple, isByte, nil
	}

	// Otherwise process quoted string.
	// Each iteration processes one escape sequence (or carriage return)
	// along with the plain text leading up to it.
	var buf strings.Builder
	// Every rewrite below emits no more bytes than it consumes,
	// so the input length is an upper bound on the output length.
	buf.Grow(len(quoted))
	for {
		// Copy the plain prefix before the next special character.
		i := strings.IndexAny(quoted, special)
		if i < 0 {
			i = len(quoted)
		}
		buf.WriteString(quoted[:i])
		quoted = quoted[i:]

		if len(quoted) == 0 {
			break
		}

		// Process carriage return: both "\r\n" and a lone "\r" become "\n".
		if quoted[0] == '\r' {
			buf.WriteByte('\n')
			if len(quoted) > 1 && quoted[1] == '\n' {
				quoted = quoted[2:]
			} else {
				quoted = quoted[1:]
			}
			continue
		}

		// Process escape sequence; quoted[0] is the backslash.
		if len(quoted) == 1 {
			return "", triple, isByte, fmt.Errorf(`truncated escape sequence \`)
		}

		switch quoted[1] {
		default:
			// One-char escape.
			// Escapes are allowed for both kinds of quotation
			// mark, not just the kind in use.
			c, ok := unescapeChar(quoted[1])
			if !ok {
				// In Starlark, like Go, a backslash must escape something.
				// (Python still treats unnecessary backslashes literally,
				// but since 3.6 has emitted a deprecation warning.)
				// Decode the whole rune so that a non-ASCII character
				// is reported intact rather than as its first byte.
				r, _ := utf8.DecodeRuneInString(quoted[1:])
				return "", triple, isByte, fmt.Errorf("invalid escape sequence \\%c", r)
			}
			buf.WriteByte(c)
			quoted = quoted[2:]

		case '\n':
			// Ignore the escape and the line break.
			quoted = quoted[2:]

		case '0', '1', '2', '3', '4', '5', '6', '7':
			// Octal escape, up to 3 digits, \OOO.
			n := int(quoted[1] - '0')
			quoted = quoted[2:]
			for i := 1; i < 3; i++ {
				if len(quoted) == 0 || quoted[0] < '0' || '7' < quoted[0] {
					break
				}
				n = n*8 + int(quoted[0]-'0')
				quoted = quoted[1:]
			}
			if !isByte && n > 127 {
				return "", triple, isByte, fmt.Errorf(`non-ASCII octal escape \%o (use \u%04X for the UTF-8 encoding of U+%04X)`, n, n, n)
			}
			if n >= 256 {
				// NOTE: Python silently discards the high bit,
				// so that '\541' == '\141' == 'a'.
				// Let's see if we can avoid doing that in BUILD files.
				return "", triple, isByte, fmt.Errorf(`invalid escape sequence \%03o`, n)
			}
			buf.WriteByte(byte(n))

		case 'x':
			// Hexadecimal escape, exactly 2 digits, \xXX.
			// Restricted to [0-127] unless this is a bytes literal.
			if len(quoted) < 4 {
				return "", triple, isByte, fmt.Errorf(`truncated escape sequence %s`, quoted)
			}
			n, err1 := strconv.ParseUint(quoted[2:4], 16, 0)
			if err1 != nil {
				return "", triple, isByte, fmt.Errorf(`invalid escape sequence %s`, quoted[:4])
			}
			if !isByte && n > 127 {
				return "", triple, isByte, fmt.Errorf(`non-ASCII hex escape %s (use \u%04X for the UTF-8 encoding of U+%04X)`,
					quoted[:4], n, n)
			}
			buf.WriteByte(byte(n))
			quoted = quoted[4:]

		case 'u', 'U':
			// Unicode code point, 4 (\uXXXX) or 8 (\UXXXXXXXX) hex digits.
			// sz is the length of the whole escape, backslash included.
			sz := 6
			if quoted[1] == 'U' {
				sz = 10
			}
			if len(quoted) < sz {
				return "", triple, isByte, fmt.Errorf(`truncated escape sequence %s`, quoted)
			}
			n, err1 := strconv.ParseUint(quoted[2:sz], 16, 0)
			if err1 != nil {
				return "", triple, isByte, fmt.Errorf(`invalid escape sequence %s`, quoted[:sz])
			}
			if n > unicode.MaxRune {
				// Report the real upper bound, not the offending value.
				return "", triple, isByte, fmt.Errorf(`code point out of range: %s (max \U%08x)`,
					quoted[:sz], unicode.MaxRune)
			}
			// As in Go, surrogates are disallowed.
			if 0xD800 <= n && n < 0xE000 {
				return "", triple, isByte, fmt.Errorf(`invalid Unicode code point U+%04X`, n)
			}
			buf.WriteRune(rune(n))
			quoted = quoted[sz:]
		}
	}

	return buf.String(), triple, isByte, nil
}

// unescapeChar returns the byte denoted by the one-character escape \c
// and true, or 0 and false if c does not name a one-character escape.
func unescapeChar(c byte) (byte, bool) {
	switch c {
	case 'a':
		return '\a', true
	case 'b':
		return '\b', true
	case 'f':
		return '\f', true
	case 'n':
		return '\n', true
	case 'r':
		return '\r', true
	case 't':
		return '\t', true
	case 'v':
		return '\v', true
	case '\\', '\'', '"':
		// These characters stand for themselves.
		return c, true
	}
	return 0, false
}

// indexByte returns the index of the first instance of b in s, or else -1.
func indexByte(s string, b byte) int {
	return strings.IndexByte(s, b)
}

// Quote returns a Starlark literal that denotes s.
// If b, it returns a bytes literal.
//
// The result is always delimited by double quotes. Double quotes and
// backslashes are backslash-escaped, printable runes are copied as-is,
// and everything else is written as a \a-style, \xXX, \uXXXX or
// \UXXXXXXXX escape. Bytes that are not valid UTF-8 are written as \xXX
// even when b is false, in which case the result is not a legal
// string literal.
func Quote(s string, b bool) string {
	const hex = "0123456789abcdef"
	var runeTmp [utf8.UTFMax]byte

	buf := make([]byte, 0, 3*len(s)/2)
	if b {
		buf = append(buf, 'b')
	}
	buf = append(buf, '"')
	for width := 0; len(s) > 0; s = s[width:] {
		r := rune(s[0])
		width = 1
		if r >= utf8.RuneSelf {
			r, width = utf8.DecodeRuneInString(s)
		}
		if width == 1 && r == utf8.RuneError {
			// String (!b) literals accept \xXX escapes only for ASCII,
			// but we must use them here to represent invalid bytes.
			// The result is not a legal literal.
			buf = append(buf, `\x`...)
			buf = append(buf, hex[s[0]>>4])
			buf = append(buf, hex[s[0]&0xF])
			continue
		}
		if r == '"' || r == '\\' { // always backslashed
			buf = append(buf, '\\')
			buf = append(buf, byte(r))
			continue
		}
		if strconv.IsPrint(r) {
			n := utf8.EncodeRune(runeTmp[:], r)
			buf = append(buf, runeTmp[:n]...)
			continue
		}
		switch r {
		case '\a':
			buf = append(buf, `\a`...)
		case '\b':
			buf = append(buf, `\b`...)
		case '\f':
			buf = append(buf, `\f`...)
		case '\n':
			buf = append(buf, `\n`...)
		case '\r':
			buf = append(buf, `\r`...)
		case '\t':
			buf = append(buf, `\t`...)
		case '\v':
			buf = append(buf, `\v`...)
		default:
			switch {
			case r < ' ' || r == 0x7f:
				// Remaining ASCII control characters.
				buf = append(buf, `\x`...)
				buf = append(buf, hex[byte(r)>>4])
				buf = append(buf, hex[byte(r)&0xF])
			case r > utf8.MaxRune:
				r = 0xFFFD
				fallthrough
			case r < 0x10000:
				// Four hex digits, most significant nibble first.
				buf = append(buf, `\u`...)
				for shift := 12; shift >= 0; shift -= 4 {
					buf = append(buf, hex[r>>uint(shift)&0xF])
				}
			default:
				// Eight hex digits, most significant nibble first.
				buf = append(buf, `\U`...)
				for shift := 28; shift >= 0; shift -= 4 {
					buf = append(buf, hex[r>>uint(shift)&0xF])
				}
			}
		}
	}
	buf = append(buf, '"')
	return string(buf)
}