xref: /aosp_15_r20/external/golang-protobuf/internal/encoding/json/decode.go (revision 1c12ee1efe575feb122dbf939ff15148a3b3e8f2)
// Copyright 2018 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
4*1c12ee1eSDan Willemsen
5*1c12ee1eSDan Willemsenpackage json
6*1c12ee1eSDan Willemsen
7*1c12ee1eSDan Willemsenimport (
8*1c12ee1eSDan Willemsen	"bytes"
9*1c12ee1eSDan Willemsen	"fmt"
10*1c12ee1eSDan Willemsen	"io"
11*1c12ee1eSDan Willemsen	"regexp"
12*1c12ee1eSDan Willemsen	"unicode/utf8"
13*1c12ee1eSDan Willemsen
14*1c12ee1eSDan Willemsen	"google.golang.org/protobuf/internal/errors"
15*1c12ee1eSDan Willemsen)
16*1c12ee1eSDan Willemsen
// call identifies which of the Decoder's public reading methods ran most
// recently.
type call uint8

const (
	// readCall is the zero value, so a fresh Decoder behaves as if Read was
	// the last method invoked.
	readCall = call(iota)
	// peekCall marks that the last method invoked was Peek.
	peekCall
)

// unexpectedFmt is the message format used when a token appears somewhere it
// is not allowed; the single verb receives the raw text of the token.
const unexpectedFmt = "unexpected token %s"
26*1c12ee1eSDan Willemsen
27*1c12ee1eSDan Willemsen// ErrUnexpectedEOF means that EOF was encountered in the middle of the input.
28*1c12ee1eSDan Willemsenvar ErrUnexpectedEOF = errors.New("%v", io.ErrUnexpectedEOF)
29*1c12ee1eSDan Willemsen
30*1c12ee1eSDan Willemsen// Decoder is a token-based JSON decoder.
31*1c12ee1eSDan Willemsentype Decoder struct {
32*1c12ee1eSDan Willemsen	// lastCall is last method called, either readCall or peekCall.
33*1c12ee1eSDan Willemsen	// Initial value is readCall.
34*1c12ee1eSDan Willemsen	lastCall call
35*1c12ee1eSDan Willemsen
36*1c12ee1eSDan Willemsen	// lastToken contains the last read token.
37*1c12ee1eSDan Willemsen	lastToken Token
38*1c12ee1eSDan Willemsen
39*1c12ee1eSDan Willemsen	// lastErr contains the last read error.
40*1c12ee1eSDan Willemsen	lastErr error
41*1c12ee1eSDan Willemsen
42*1c12ee1eSDan Willemsen	// openStack is a stack containing ObjectOpen and ArrayOpen values. The
43*1c12ee1eSDan Willemsen	// top of stack represents the object or the array the current value is
44*1c12ee1eSDan Willemsen	// directly located in.
45*1c12ee1eSDan Willemsen	openStack []Kind
46*1c12ee1eSDan Willemsen
47*1c12ee1eSDan Willemsen	// orig is used in reporting line and column.
48*1c12ee1eSDan Willemsen	orig []byte
49*1c12ee1eSDan Willemsen	// in contains the unconsumed input.
50*1c12ee1eSDan Willemsen	in []byte
51*1c12ee1eSDan Willemsen}
52*1c12ee1eSDan Willemsen
53*1c12ee1eSDan Willemsen// NewDecoder returns a Decoder to read the given []byte.
54*1c12ee1eSDan Willemsenfunc NewDecoder(b []byte) *Decoder {
55*1c12ee1eSDan Willemsen	return &Decoder{orig: b, in: b}
56*1c12ee1eSDan Willemsen}
57*1c12ee1eSDan Willemsen
58*1c12ee1eSDan Willemsen// Peek looks ahead and returns the next token kind without advancing a read.
59*1c12ee1eSDan Willemsenfunc (d *Decoder) Peek() (Token, error) {
60*1c12ee1eSDan Willemsen	defer func() { d.lastCall = peekCall }()
61*1c12ee1eSDan Willemsen	if d.lastCall == readCall {
62*1c12ee1eSDan Willemsen		d.lastToken, d.lastErr = d.Read()
63*1c12ee1eSDan Willemsen	}
64*1c12ee1eSDan Willemsen	return d.lastToken, d.lastErr
65*1c12ee1eSDan Willemsen}
66*1c12ee1eSDan Willemsen
67*1c12ee1eSDan Willemsen// Read returns the next JSON token.
68*1c12ee1eSDan Willemsen// It will return an error if there is no valid token.
69*1c12ee1eSDan Willemsenfunc (d *Decoder) Read() (Token, error) {
70*1c12ee1eSDan Willemsen	const scalar = Null | Bool | Number | String
71*1c12ee1eSDan Willemsen
72*1c12ee1eSDan Willemsen	defer func() { d.lastCall = readCall }()
73*1c12ee1eSDan Willemsen	if d.lastCall == peekCall {
74*1c12ee1eSDan Willemsen		return d.lastToken, d.lastErr
75*1c12ee1eSDan Willemsen	}
76*1c12ee1eSDan Willemsen
77*1c12ee1eSDan Willemsen	tok, err := d.parseNext()
78*1c12ee1eSDan Willemsen	if err != nil {
79*1c12ee1eSDan Willemsen		return Token{}, err
80*1c12ee1eSDan Willemsen	}
81*1c12ee1eSDan Willemsen
82*1c12ee1eSDan Willemsen	switch tok.kind {
83*1c12ee1eSDan Willemsen	case EOF:
84*1c12ee1eSDan Willemsen		if len(d.openStack) != 0 ||
85*1c12ee1eSDan Willemsen			d.lastToken.kind&scalar|ObjectClose|ArrayClose == 0 {
86*1c12ee1eSDan Willemsen			return Token{}, ErrUnexpectedEOF
87*1c12ee1eSDan Willemsen		}
88*1c12ee1eSDan Willemsen
89*1c12ee1eSDan Willemsen	case Null:
90*1c12ee1eSDan Willemsen		if !d.isValueNext() {
91*1c12ee1eSDan Willemsen			return Token{}, d.newSyntaxError(tok.pos, unexpectedFmt, tok.RawString())
92*1c12ee1eSDan Willemsen		}
93*1c12ee1eSDan Willemsen
94*1c12ee1eSDan Willemsen	case Bool, Number:
95*1c12ee1eSDan Willemsen		if !d.isValueNext() {
96*1c12ee1eSDan Willemsen			return Token{}, d.newSyntaxError(tok.pos, unexpectedFmt, tok.RawString())
97*1c12ee1eSDan Willemsen		}
98*1c12ee1eSDan Willemsen
99*1c12ee1eSDan Willemsen	case String:
100*1c12ee1eSDan Willemsen		if d.isValueNext() {
101*1c12ee1eSDan Willemsen			break
102*1c12ee1eSDan Willemsen		}
103*1c12ee1eSDan Willemsen		// This string token should only be for a field name.
104*1c12ee1eSDan Willemsen		if d.lastToken.kind&(ObjectOpen|comma) == 0 {
105*1c12ee1eSDan Willemsen			return Token{}, d.newSyntaxError(tok.pos, unexpectedFmt, tok.RawString())
106*1c12ee1eSDan Willemsen		}
107*1c12ee1eSDan Willemsen		if len(d.in) == 0 {
108*1c12ee1eSDan Willemsen			return Token{}, ErrUnexpectedEOF
109*1c12ee1eSDan Willemsen		}
110*1c12ee1eSDan Willemsen		if c := d.in[0]; c != ':' {
111*1c12ee1eSDan Willemsen			return Token{}, d.newSyntaxError(d.currPos(), `unexpected character %s, missing ":" after field name`, string(c))
112*1c12ee1eSDan Willemsen		}
113*1c12ee1eSDan Willemsen		tok.kind = Name
114*1c12ee1eSDan Willemsen		d.consume(1)
115*1c12ee1eSDan Willemsen
116*1c12ee1eSDan Willemsen	case ObjectOpen, ArrayOpen:
117*1c12ee1eSDan Willemsen		if !d.isValueNext() {
118*1c12ee1eSDan Willemsen			return Token{}, d.newSyntaxError(tok.pos, unexpectedFmt, tok.RawString())
119*1c12ee1eSDan Willemsen		}
120*1c12ee1eSDan Willemsen		d.openStack = append(d.openStack, tok.kind)
121*1c12ee1eSDan Willemsen
122*1c12ee1eSDan Willemsen	case ObjectClose:
123*1c12ee1eSDan Willemsen		if len(d.openStack) == 0 ||
124*1c12ee1eSDan Willemsen			d.lastToken.kind == comma ||
125*1c12ee1eSDan Willemsen			d.openStack[len(d.openStack)-1] != ObjectOpen {
126*1c12ee1eSDan Willemsen			return Token{}, d.newSyntaxError(tok.pos, unexpectedFmt, tok.RawString())
127*1c12ee1eSDan Willemsen		}
128*1c12ee1eSDan Willemsen		d.openStack = d.openStack[:len(d.openStack)-1]
129*1c12ee1eSDan Willemsen
130*1c12ee1eSDan Willemsen	case ArrayClose:
131*1c12ee1eSDan Willemsen		if len(d.openStack) == 0 ||
132*1c12ee1eSDan Willemsen			d.lastToken.kind == comma ||
133*1c12ee1eSDan Willemsen			d.openStack[len(d.openStack)-1] != ArrayOpen {
134*1c12ee1eSDan Willemsen			return Token{}, d.newSyntaxError(tok.pos, unexpectedFmt, tok.RawString())
135*1c12ee1eSDan Willemsen		}
136*1c12ee1eSDan Willemsen		d.openStack = d.openStack[:len(d.openStack)-1]
137*1c12ee1eSDan Willemsen
138*1c12ee1eSDan Willemsen	case comma:
139*1c12ee1eSDan Willemsen		if len(d.openStack) == 0 ||
140*1c12ee1eSDan Willemsen			d.lastToken.kind&(scalar|ObjectClose|ArrayClose) == 0 {
141*1c12ee1eSDan Willemsen			return Token{}, d.newSyntaxError(tok.pos, unexpectedFmt, tok.RawString())
142*1c12ee1eSDan Willemsen		}
143*1c12ee1eSDan Willemsen	}
144*1c12ee1eSDan Willemsen
145*1c12ee1eSDan Willemsen	// Update d.lastToken only after validating token to be in the right sequence.
146*1c12ee1eSDan Willemsen	d.lastToken = tok
147*1c12ee1eSDan Willemsen
148*1c12ee1eSDan Willemsen	if d.lastToken.kind == comma {
149*1c12ee1eSDan Willemsen		return d.Read()
150*1c12ee1eSDan Willemsen	}
151*1c12ee1eSDan Willemsen	return tok, nil
152*1c12ee1eSDan Willemsen}
153*1c12ee1eSDan Willemsen
// errRegexp extracts the offending text for an "invalid value" error message:
// either a run of up to 32 identifier/number-like characters at the start of
// the input, or failing that a single character.
var errRegexp = regexp.MustCompile(
	`^([-+._a-zA-Z0-9]{1,32}|.)`,
)
156*1c12ee1eSDan Willemsen
157*1c12ee1eSDan Willemsen// parseNext parses for the next JSON token. It returns a Token object for
158*1c12ee1eSDan Willemsen// different types, except for Name. It does not handle whether the next token
159*1c12ee1eSDan Willemsen// is in a valid sequence or not.
160*1c12ee1eSDan Willemsenfunc (d *Decoder) parseNext() (Token, error) {
161*1c12ee1eSDan Willemsen	// Trim leading spaces.
162*1c12ee1eSDan Willemsen	d.consume(0)
163*1c12ee1eSDan Willemsen
164*1c12ee1eSDan Willemsen	in := d.in
165*1c12ee1eSDan Willemsen	if len(in) == 0 {
166*1c12ee1eSDan Willemsen		return d.consumeToken(EOF, 0), nil
167*1c12ee1eSDan Willemsen	}
168*1c12ee1eSDan Willemsen
169*1c12ee1eSDan Willemsen	switch in[0] {
170*1c12ee1eSDan Willemsen	case 'n':
171*1c12ee1eSDan Willemsen		if n := matchWithDelim("null", in); n != 0 {
172*1c12ee1eSDan Willemsen			return d.consumeToken(Null, n), nil
173*1c12ee1eSDan Willemsen		}
174*1c12ee1eSDan Willemsen
175*1c12ee1eSDan Willemsen	case 't':
176*1c12ee1eSDan Willemsen		if n := matchWithDelim("true", in); n != 0 {
177*1c12ee1eSDan Willemsen			return d.consumeBoolToken(true, n), nil
178*1c12ee1eSDan Willemsen		}
179*1c12ee1eSDan Willemsen
180*1c12ee1eSDan Willemsen	case 'f':
181*1c12ee1eSDan Willemsen		if n := matchWithDelim("false", in); n != 0 {
182*1c12ee1eSDan Willemsen			return d.consumeBoolToken(false, n), nil
183*1c12ee1eSDan Willemsen		}
184*1c12ee1eSDan Willemsen
185*1c12ee1eSDan Willemsen	case '-', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9':
186*1c12ee1eSDan Willemsen		if n, ok := parseNumber(in); ok {
187*1c12ee1eSDan Willemsen			return d.consumeToken(Number, n), nil
188*1c12ee1eSDan Willemsen		}
189*1c12ee1eSDan Willemsen
190*1c12ee1eSDan Willemsen	case '"':
191*1c12ee1eSDan Willemsen		s, n, err := d.parseString(in)
192*1c12ee1eSDan Willemsen		if err != nil {
193*1c12ee1eSDan Willemsen			return Token{}, err
194*1c12ee1eSDan Willemsen		}
195*1c12ee1eSDan Willemsen		return d.consumeStringToken(s, n), nil
196*1c12ee1eSDan Willemsen
197*1c12ee1eSDan Willemsen	case '{':
198*1c12ee1eSDan Willemsen		return d.consumeToken(ObjectOpen, 1), nil
199*1c12ee1eSDan Willemsen
200*1c12ee1eSDan Willemsen	case '}':
201*1c12ee1eSDan Willemsen		return d.consumeToken(ObjectClose, 1), nil
202*1c12ee1eSDan Willemsen
203*1c12ee1eSDan Willemsen	case '[':
204*1c12ee1eSDan Willemsen		return d.consumeToken(ArrayOpen, 1), nil
205*1c12ee1eSDan Willemsen
206*1c12ee1eSDan Willemsen	case ']':
207*1c12ee1eSDan Willemsen		return d.consumeToken(ArrayClose, 1), nil
208*1c12ee1eSDan Willemsen
209*1c12ee1eSDan Willemsen	case ',':
210*1c12ee1eSDan Willemsen		return d.consumeToken(comma, 1), nil
211*1c12ee1eSDan Willemsen	}
212*1c12ee1eSDan Willemsen	return Token{}, d.newSyntaxError(d.currPos(), "invalid value %s", errRegexp.Find(in))
213*1c12ee1eSDan Willemsen}
214*1c12ee1eSDan Willemsen
215*1c12ee1eSDan Willemsen// newSyntaxError returns an error with line and column information useful for
216*1c12ee1eSDan Willemsen// syntax errors.
217*1c12ee1eSDan Willemsenfunc (d *Decoder) newSyntaxError(pos int, f string, x ...interface{}) error {
218*1c12ee1eSDan Willemsen	e := errors.New(f, x...)
219*1c12ee1eSDan Willemsen	line, column := d.Position(pos)
220*1c12ee1eSDan Willemsen	return errors.New("syntax error (line %d:%d): %v", line, column, e)
221*1c12ee1eSDan Willemsen}
222*1c12ee1eSDan Willemsen
223*1c12ee1eSDan Willemsen// Position returns line and column number of given index of the original input.
224*1c12ee1eSDan Willemsen// It will panic if index is out of range.
225*1c12ee1eSDan Willemsenfunc (d *Decoder) Position(idx int) (line int, column int) {
226*1c12ee1eSDan Willemsen	b := d.orig[:idx]
227*1c12ee1eSDan Willemsen	line = bytes.Count(b, []byte("\n")) + 1
228*1c12ee1eSDan Willemsen	if i := bytes.LastIndexByte(b, '\n'); i >= 0 {
229*1c12ee1eSDan Willemsen		b = b[i+1:]
230*1c12ee1eSDan Willemsen	}
231*1c12ee1eSDan Willemsen	column = utf8.RuneCount(b) + 1 // ignore multi-rune characters
232*1c12ee1eSDan Willemsen	return line, column
233*1c12ee1eSDan Willemsen}
234*1c12ee1eSDan Willemsen
235*1c12ee1eSDan Willemsen// currPos returns the current index position of d.in from d.orig.
236*1c12ee1eSDan Willemsenfunc (d *Decoder) currPos() int {
237*1c12ee1eSDan Willemsen	return len(d.orig) - len(d.in)
238*1c12ee1eSDan Willemsen}
239*1c12ee1eSDan Willemsen
240*1c12ee1eSDan Willemsen// matchWithDelim matches s with the input b and verifies that the match
241*1c12ee1eSDan Willemsen// terminates with a delimiter of some form (e.g., r"[^-+_.a-zA-Z0-9]").
242*1c12ee1eSDan Willemsen// As a special case, EOF is considered a delimiter. It returns the length of s
243*1c12ee1eSDan Willemsen// if there is a match, else 0.
244*1c12ee1eSDan Willemsenfunc matchWithDelim(s string, b []byte) int {
245*1c12ee1eSDan Willemsen	if !bytes.HasPrefix(b, []byte(s)) {
246*1c12ee1eSDan Willemsen		return 0
247*1c12ee1eSDan Willemsen	}
248*1c12ee1eSDan Willemsen
249*1c12ee1eSDan Willemsen	n := len(s)
250*1c12ee1eSDan Willemsen	if n < len(b) && isNotDelim(b[n]) {
251*1c12ee1eSDan Willemsen		return 0
252*1c12ee1eSDan Willemsen	}
253*1c12ee1eSDan Willemsen	return n
254*1c12ee1eSDan Willemsen}
255*1c12ee1eSDan Willemsen
// isNotDelim reports whether c could continue an identifier- or number-like
// run: an ASCII letter, an ASCII digit, or one of '-', '+', '.', '_'.
// Every other byte counts as a delimiter.
func isNotDelim(c byte) bool {
	switch {
	case 'a' <= c && c <= 'z',
		'A' <= c && c <= 'Z',
		'0' <= c && c <= '9':
		return true
	}
	switch c {
	case '-', '+', '.', '_':
		return true
	}
	return false
}
263*1c12ee1eSDan Willemsen
264*1c12ee1eSDan Willemsen// consume consumes n bytes of input and any subsequent whitespace.
265*1c12ee1eSDan Willemsenfunc (d *Decoder) consume(n int) {
266*1c12ee1eSDan Willemsen	d.in = d.in[n:]
267*1c12ee1eSDan Willemsen	for len(d.in) > 0 {
268*1c12ee1eSDan Willemsen		switch d.in[0] {
269*1c12ee1eSDan Willemsen		case ' ', '\n', '\r', '\t':
270*1c12ee1eSDan Willemsen			d.in = d.in[1:]
271*1c12ee1eSDan Willemsen		default:
272*1c12ee1eSDan Willemsen			return
273*1c12ee1eSDan Willemsen		}
274*1c12ee1eSDan Willemsen	}
275*1c12ee1eSDan Willemsen}
276*1c12ee1eSDan Willemsen
277*1c12ee1eSDan Willemsen// isValueNext returns true if next type should be a JSON value: Null,
278*1c12ee1eSDan Willemsen// Number, String or Bool.
279*1c12ee1eSDan Willemsenfunc (d *Decoder) isValueNext() bool {
280*1c12ee1eSDan Willemsen	if len(d.openStack) == 0 {
281*1c12ee1eSDan Willemsen		return d.lastToken.kind == 0
282*1c12ee1eSDan Willemsen	}
283*1c12ee1eSDan Willemsen
284*1c12ee1eSDan Willemsen	start := d.openStack[len(d.openStack)-1]
285*1c12ee1eSDan Willemsen	switch start {
286*1c12ee1eSDan Willemsen	case ObjectOpen:
287*1c12ee1eSDan Willemsen		return d.lastToken.kind&Name != 0
288*1c12ee1eSDan Willemsen	case ArrayOpen:
289*1c12ee1eSDan Willemsen		return d.lastToken.kind&(ArrayOpen|comma) != 0
290*1c12ee1eSDan Willemsen	}
291*1c12ee1eSDan Willemsen	panic(fmt.Sprintf(
292*1c12ee1eSDan Willemsen		"unreachable logic in Decoder.isValueNext, lastToken.kind: %v, openStack: %v",
293*1c12ee1eSDan Willemsen		d.lastToken.kind, start))
294*1c12ee1eSDan Willemsen}
295*1c12ee1eSDan Willemsen
296*1c12ee1eSDan Willemsen// consumeToken constructs a Token for given Kind with raw value derived from
297*1c12ee1eSDan Willemsen// current d.in and given size, and consumes the given size-length of it.
298*1c12ee1eSDan Willemsenfunc (d *Decoder) consumeToken(kind Kind, size int) Token {
299*1c12ee1eSDan Willemsen	tok := Token{
300*1c12ee1eSDan Willemsen		kind: kind,
301*1c12ee1eSDan Willemsen		raw:  d.in[:size],
302*1c12ee1eSDan Willemsen		pos:  len(d.orig) - len(d.in),
303*1c12ee1eSDan Willemsen	}
304*1c12ee1eSDan Willemsen	d.consume(size)
305*1c12ee1eSDan Willemsen	return tok
306*1c12ee1eSDan Willemsen}
307*1c12ee1eSDan Willemsen
308*1c12ee1eSDan Willemsen// consumeBoolToken constructs a Token for a Bool kind with raw value derived from
309*1c12ee1eSDan Willemsen// current d.in and given size.
310*1c12ee1eSDan Willemsenfunc (d *Decoder) consumeBoolToken(b bool, size int) Token {
311*1c12ee1eSDan Willemsen	tok := Token{
312*1c12ee1eSDan Willemsen		kind: Bool,
313*1c12ee1eSDan Willemsen		raw:  d.in[:size],
314*1c12ee1eSDan Willemsen		pos:  len(d.orig) - len(d.in),
315*1c12ee1eSDan Willemsen		boo:  b,
316*1c12ee1eSDan Willemsen	}
317*1c12ee1eSDan Willemsen	d.consume(size)
318*1c12ee1eSDan Willemsen	return tok
319*1c12ee1eSDan Willemsen}
320*1c12ee1eSDan Willemsen
321*1c12ee1eSDan Willemsen// consumeStringToken constructs a Token for a String kind with raw value derived
322*1c12ee1eSDan Willemsen// from current d.in and given size.
323*1c12ee1eSDan Willemsenfunc (d *Decoder) consumeStringToken(s string, size int) Token {
324*1c12ee1eSDan Willemsen	tok := Token{
325*1c12ee1eSDan Willemsen		kind: String,
326*1c12ee1eSDan Willemsen		raw:  d.in[:size],
327*1c12ee1eSDan Willemsen		pos:  len(d.orig) - len(d.in),
328*1c12ee1eSDan Willemsen		str:  s,
329*1c12ee1eSDan Willemsen	}
330*1c12ee1eSDan Willemsen	d.consume(size)
331*1c12ee1eSDan Willemsen	return tok
332*1c12ee1eSDan Willemsen}
333*1c12ee1eSDan Willemsen
334*1c12ee1eSDan Willemsen// Clone returns a copy of the Decoder for use in reading ahead the next JSON
335*1c12ee1eSDan Willemsen// object, array or other values without affecting current Decoder.
336*1c12ee1eSDan Willemsenfunc (d *Decoder) Clone() *Decoder {
337*1c12ee1eSDan Willemsen	ret := *d
338*1c12ee1eSDan Willemsen	ret.openStack = append([]Kind(nil), ret.openStack...)
339*1c12ee1eSDan Willemsen	return &ret
340*1c12ee1eSDan Willemsen}
341