xref: /aosp_15_r20/external/starlark-go/syntax/quote.go (revision 4947cdc739c985f6d86941e22894f5cefe7c9e9a)
// Copyright 2017 The Bazel Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
4*4947cdc7SCole Faust
5*4947cdc7SCole Faustpackage syntax
6*4947cdc7SCole Faust
// Starlark quoted string utilities.
8*4947cdc7SCole Faust
9*4947cdc7SCole Faustimport (
10*4947cdc7SCole Faust	"fmt"
11*4947cdc7SCole Faust	"strconv"
12*4947cdc7SCole Faust	"strings"
13*4947cdc7SCole Faust	"unicode"
14*4947cdc7SCole Faust	"unicode/utf8"
15*4947cdc7SCole Faust)
16*4947cdc7SCole Faust
// unesc maps the single character that follows a backslash in a string
// literal to the byte that the escape denotes. Entries for characters
// that are not one-char escapes are zero.
var unesc = [256]byte{
	'a':  '\a',
	'b':  '\b',
	'f':  '\f',
	'n':  '\n',
	'r':  '\r',
	't':  '\t',
	'v':  '\v',
	'\\': '\\',
	'\'': '\'',
	'"':  '"',
}

// esc is the inverse of unesc: it maps an escape-worthy byte to the
// character that should follow the backslash. Entries for bytes that
// have no one-char escape are zero.
var esc = [256]byte{
	'\a': 'a',
	'\b': 'b',
	'\f': 'f',
	'\n': 'n',
	'\r': 'r',
	'\t': 't',
	'\v': 'v',
	'\\': '\\',
	'\'': '\'',
	'"':  '"',
}

// unquote decodes the quoted Starlark literal, which consists of an
// optional r (raw) prefix, an optional b (bytes) prefix, and text
// enclosed in matching single, double, or triple quotation marks.
//
// It returns the decoded value s, whether the literal was triple-quoted,
// whether it had the b prefix, and an error describing invalid input.
// When err is non-nil, s is empty.
//
// Decoding rules:
//   - A carriage return, optionally followed by a newline, becomes a
//     single newline. This applies to raw literals too.
//   - In a raw literal, backslashes are copied through unchanged.
//   - Otherwise a backslash must introduce a valid escape: a one-char
//     escape from unesc, a backslash-newline (which is dropped), an
//     octal escape of up to three digits, \xXX, \uXXXX or \UXXXXXXXX.
//   - Octal and hex escapes above 127 are accepted only in bytes
//     literals; octal escapes of 256 or more are always rejected.
//   - \u and \U escapes must denote a code point no greater than
//     unicode.MaxRune that is not a surrogate.
func unquote(quoted string) (s string, triple, isByte bool, err error) {
	// Check for raw prefix: means don't interpret the inner \.
	raw := false
	if strings.HasPrefix(quoted, "r") {
		raw = true
		quoted = quoted[1:]
	}
	// Check for bytes prefix.
	if strings.HasPrefix(quoted, "b") {
		isByte = true
		quoted = quoted[1:]
	}

	if len(quoted) < 2 {
		err = fmt.Errorf("string literal too short")
		return
	}

	// The first byte must be a quotation mark and the last byte must match it.
	if quoted[0] != '"' && quoted[0] != '\'' || quoted[0] != quoted[len(quoted)-1] {
		err = fmt.Errorf("string literal has invalid quotes")
		return
	}

	// Check for triple quoted string.
	quote := quoted[0]
	if len(quoted) >= 6 && quoted[1] == quote && quoted[2] == quote && quoted[:3] == quoted[len(quoted)-3:] {
		triple = true
		quoted = quoted[3 : len(quoted)-3]
	} else {
		quoted = quoted[1 : len(quoted)-1]
	}

	// Now quoted is the quoted data, but no quotes.
	// If we're in raw mode or there are no escapes or
	// carriage returns, we're done.
	var unquoteChars string
	if raw {
		unquoteChars = "\r"
	} else {
		unquoteChars = "\\\r"
	}
	if !strings.ContainsAny(quoted, unquoteChars) {
		s = quoted
		return
	}

	// Otherwise process quoted string.
	// Each iteration processes one escape sequence (or carriage return)
	// along with the plain text leading up to it.
	buf := new(strings.Builder)
	for {
		// Copy the plain text before the next special character.
		i := strings.IndexAny(quoted, unquoteChars)
		if i < 0 {
			i = len(quoted)
		}
		buf.WriteString(quoted[:i])
		quoted = quoted[i:]

		if len(quoted) == 0 {
			break
		}

		// Process carriage return: CR and CR LF both become LF.
		if quoted[0] == '\r' {
			buf.WriteByte('\n')
			if len(quoted) > 1 && quoted[1] == '\n' {
				quoted = quoted[2:]
			} else {
				quoted = quoted[1:]
			}
			continue
		}

		// Process escape sequence; quoted[0] is a backslash here.
		if len(quoted) == 1 {
			err = fmt.Errorf(`truncated escape sequence \`)
			return
		}

		switch quoted[1] {
		default:
			// In Starlark, like Go, a backslash must escape something.
			// (Python still treats unnecessary backslashes literally,
			// but since 3.6 has emitted a deprecation warning.)
			err = fmt.Errorf("invalid escape sequence \\%c", quoted[1])
			return

		case '\n':
			// Ignore the escape and the line break.
			quoted = quoted[2:]

		case 'a', 'b', 'f', 'n', 'r', 't', 'v', '\\', '\'', '"':
			// One-char escape.
			// Escapes are allowed for both kinds of quotation
			// mark, not just the kind in use.
			buf.WriteByte(unesc[quoted[1]])
			quoted = quoted[2:]

		case '0', '1', '2', '3', '4', '5', '6', '7':
			// Octal escape, up to 3 digits, \OOO.
			n := int(quoted[1] - '0')
			quoted = quoted[2:]
			for digits := 1; digits < 3; digits++ {
				if len(quoted) == 0 || quoted[0] < '0' || '7' < quoted[0] {
					break
				}
				n = n*8 + int(quoted[0]-'0')
				quoted = quoted[1:]
			}
			if !isByte && n > 127 {
				err = fmt.Errorf(`non-ASCII octal escape \%o (use \u%04X for the UTF-8 encoding of U+%04X)`, n, n, n)
				return
			}
			if n >= 256 {
				// NOTE: Python silently discards the high bit,
				// so that '\541' == '\141' == 'a'.
				// Let's see if we can avoid doing that in BUILD files.
				err = fmt.Errorf(`invalid escape sequence \%03o`, n)
				return
			}
			buf.WriteByte(byte(n))

		case 'x':
			// Hexadecimal escape, exactly 2 digits, \xXX.
			// Values above 127 are permitted only in bytes literals.
			if len(quoted) < 4 {
				err = fmt.Errorf(`truncated escape sequence %s`, quoted)
				return
			}
			n, err1 := strconv.ParseUint(quoted[2:4], 16, 0)
			if err1 != nil {
				err = fmt.Errorf(`invalid escape sequence %s`, quoted[:4])
				return
			}
			if !isByte && n > 127 {
				err = fmt.Errorf(`non-ASCII hex escape %s (use \u%04X for the UTF-8 encoding of U+%04X)`,
					quoted[:4], n, n)
				return
			}
			buf.WriteByte(byte(n))
			quoted = quoted[4:]

		case 'u', 'U':
			// Unicode code point, 4 (\uXXXX) or 8 (\UXXXXXXXX) hex digits.
			// sz is the length of the whole escape including the \u or \U.
			sz := 6
			if quoted[1] == 'U' {
				sz = 10
			}
			if len(quoted) < sz {
				err = fmt.Errorf(`truncated escape sequence %s`, quoted)
				return
			}
			n, err1 := strconv.ParseUint(quoted[2:sz], 16, 0)
			if err1 != nil {
				err = fmt.Errorf(`invalid escape sequence %s`, quoted[:sz])
				return
			}
			if n > unicode.MaxRune {
				// Report the actual upper limit, not the offending value.
				err = fmt.Errorf(`code point out of range: %s (max \U%08x)`,
					quoted[:sz], unicode.MaxRune)
				return
			}
			// As in Go, surrogates are disallowed.
			if 0xD800 <= n && n < 0xE000 {
				err = fmt.Errorf(`invalid Unicode code point U+%04X`, n)
				return
			}
			buf.WriteRune(rune(n))
			quoted = quoted[sz:]
		}
	}

	s = buf.String()
	return
}
223*4947cdc7SCole Faust
// indexByte returns the index of the first instance of b in s,
// or -1 if b is not present in s.
func indexByte(s string, b byte) int {
	return strings.IndexByte(s, b)
}
233*4947cdc7SCole Faust
// Quote returns a double-quoted Starlark literal that denotes s.
// If b is true, the result has a b prefix and is a bytes literal.
//
// The input is scanned one UTF-8 encoded rune at a time:
//   - A byte that is not part of a valid UTF-8 encoding is written as
//     \xXX. For a string (!b) literal the result is then not a legal
//     literal, because such literals accept \xXX only for ASCII.
//   - The double quote and the backslash are always backslash-escaped.
//   - Runes for which strconv.IsPrint reports true are copied verbatim.
//   - The control characters \a \b \f \n \r \t \v use their one-char
//     escapes; other bytes below space, and 0x7f, are written as \xXX.
//   - Any remaining rune is written as \uXXXX if it is below 0x10000,
//     and as \UXXXXXXXX otherwise.
func Quote(s string, b bool) string {
	const hex = "0123456789abcdef"

	buf := make([]byte, 0, 3*len(s)/2)
	if b {
		buf = append(buf, 'b')
	}
	buf = append(buf, '"')
	for width := 0; len(s) > 0; s = s[width:] {
		r := rune(s[0])
		width = 1
		if r >= utf8.RuneSelf {
			r, width = utf8.DecodeRuneInString(s)
		}
		if width == 1 && r == utf8.RuneError {
			// String (!b) literals accept \xXX escapes only for ASCII,
			// but we must use them here to represent invalid bytes.
			// The result is not a legal literal.
			buf = append(buf, '\\', 'x', hex[s[0]>>4], hex[s[0]&0xF])
			continue
		}
		if r == '"' || r == '\\' { // always backslashed
			buf = append(buf, '\\', byte(r))
			continue
		}
		if strconv.IsPrint(r) {
			// r was decoded from s[:width], so those bytes are
			// already its UTF-8 encoding.
			buf = append(buf, s[:width]...)
			continue
		}
		switch r {
		case '\a':
			buf = append(buf, `\a`...)
		case '\b':
			buf = append(buf, `\b`...)
		case '\f':
			buf = append(buf, `\f`...)
		case '\n':
			buf = append(buf, `\n`...)
		case '\r':
			buf = append(buf, `\r`...)
		case '\t':
			buf = append(buf, `\t`...)
		case '\v':
			buf = append(buf, `\v`...)
		default:
			switch {
			case r < ' ' || r == 0x7f:
				buf = append(buf, '\\', 'x', hex[byte(r)>>4], hex[byte(r)&0xF])
			case r > utf8.MaxRune:
				// Out-of-range values are reported as U+FFFD.
				r = 0xFFFD
				fallthrough
			case r < 0x10000:
				// Four hex digits, most significant nibble first.
				buf = append(buf, `\u`...)
				for shift := 12; shift >= 0; shift -= 4 {
					buf = append(buf, hex[r>>uint(shift)&0xF])
				}
			default:
				// Eight hex digits, most significant nibble first.
				buf = append(buf, `\U`...)
				for shift := 28; shift >= 0; shift -= 4 {
					buf = append(buf, hex[r>>uint(shift)&0xF])
				}
			}
		}
	}
	buf = append(buf, '"')
	return string(buf)
}
310