xref: /aosp_15_r20/external/golang-protobuf/internal/encoding/text/decode.go (revision 1c12ee1efe575feb122dbf939ff15148a3b3e8f2)
1*1c12ee1eSDan Willemsen// Copyright 2018 The Go Authors. All rights reserved.
2*1c12ee1eSDan Willemsen// Use of this source code is governed by a BSD-style
3*1c12ee1eSDan Willemsen// license that can be found in the LICENSE file.
4*1c12ee1eSDan Willemsen
5*1c12ee1eSDan Willemsenpackage text
6*1c12ee1eSDan Willemsen
7*1c12ee1eSDan Willemsenimport (
8*1c12ee1eSDan Willemsen	"bytes"
9*1c12ee1eSDan Willemsen	"fmt"
10*1c12ee1eSDan Willemsen	"io"
11*1c12ee1eSDan Willemsen	"strconv"
12*1c12ee1eSDan Willemsen	"unicode/utf8"
13*1c12ee1eSDan Willemsen
14*1c12ee1eSDan Willemsen	"google.golang.org/protobuf/internal/errors"
15*1c12ee1eSDan Willemsen)
16*1c12ee1eSDan Willemsen
// Decoder is a token-based textproto decoder.
//
// Tokens are obtained through Read and Peek. The Decoder remembers the result
// of the most recent parse so that a Peek followed by a Read returns the same
// token without parsing the input a second time.
type Decoder struct {
	// lastCall is last method called, either readCall or peekCall.
	// Initial value is readCall.
	lastCall call

	// lastToken contains the last read token.
	// Its kind is what Read hands to parseNext to decide which tokens may
	// legally follow.
	lastToken Token

	// lastErr contains the last read error.
	// It is only assigned by Peek, which replays it on the following Read.
	lastErr error

	// openStack is a stack containing the byte characters for MessageOpen and
	// ListOpen kinds. The top of stack represents the message or the list that
	// the current token is nested in. An empty stack means the current token is
	// at the top level message. The characters '{' and '<' both represent the
	// MessageOpen kind.
	openStack []byte

	// orig is used in reporting line and column.
	// The current byte offset is always len(orig) - len(in).
	orig []byte
	// in contains the unconsumed input.
	in []byte
}
41*1c12ee1eSDan Willemsen
42*1c12ee1eSDan Willemsen// NewDecoder returns a Decoder to read the given []byte.
43*1c12ee1eSDan Willemsenfunc NewDecoder(b []byte) *Decoder {
44*1c12ee1eSDan Willemsen	return &Decoder{orig: b, in: b}
45*1c12ee1eSDan Willemsen}
46*1c12ee1eSDan Willemsen
// ErrUnexpectedEOF means that EOF was encountered in the middle of the input.
//
// It is created by formatting io.ErrUnexpectedEOF with %v through this
// module's internal errors package.
// NOTE(review): whether errors.Is(err, io.ErrUnexpectedEOF) matches depends on
// how internal/errors.New treats error arguments; confirm before relying on it.
var ErrUnexpectedEOF = errors.New("%v", io.ErrUnexpectedEOF)
49*1c12ee1eSDan Willemsen
// call specifies which Decoder method was invoked.
type call uint8

const (
	// readCall records that Read was the last method invoked. It is the zero
	// value of call, so a newly created Decoder starts in this state.
	readCall call = iota
	// peekCall records that Peek was the last method invoked, meaning the
	// Decoder holds a token (or error) that the next Read must return.
	peekCall
)
57*1c12ee1eSDan Willemsen
58*1c12ee1eSDan Willemsen// Peek looks ahead and returns the next token and error without advancing a read.
59*1c12ee1eSDan Willemsenfunc (d *Decoder) Peek() (Token, error) {
60*1c12ee1eSDan Willemsen	defer func() { d.lastCall = peekCall }()
61*1c12ee1eSDan Willemsen	if d.lastCall == readCall {
62*1c12ee1eSDan Willemsen		d.lastToken, d.lastErr = d.Read()
63*1c12ee1eSDan Willemsen	}
64*1c12ee1eSDan Willemsen	return d.lastToken, d.lastErr
65*1c12ee1eSDan Willemsen}
66*1c12ee1eSDan Willemsen
67*1c12ee1eSDan Willemsen// Read returns the next token.
68*1c12ee1eSDan Willemsen// It will return an error if there is no valid token.
69*1c12ee1eSDan Willemsenfunc (d *Decoder) Read() (Token, error) {
70*1c12ee1eSDan Willemsen	defer func() { d.lastCall = readCall }()
71*1c12ee1eSDan Willemsen	if d.lastCall == peekCall {
72*1c12ee1eSDan Willemsen		return d.lastToken, d.lastErr
73*1c12ee1eSDan Willemsen	}
74*1c12ee1eSDan Willemsen
75*1c12ee1eSDan Willemsen	tok, err := d.parseNext(d.lastToken.kind)
76*1c12ee1eSDan Willemsen	if err != nil {
77*1c12ee1eSDan Willemsen		return Token{}, err
78*1c12ee1eSDan Willemsen	}
79*1c12ee1eSDan Willemsen
80*1c12ee1eSDan Willemsen	switch tok.kind {
81*1c12ee1eSDan Willemsen	case comma, semicolon:
82*1c12ee1eSDan Willemsen		tok, err = d.parseNext(tok.kind)
83*1c12ee1eSDan Willemsen		if err != nil {
84*1c12ee1eSDan Willemsen			return Token{}, err
85*1c12ee1eSDan Willemsen		}
86*1c12ee1eSDan Willemsen	}
87*1c12ee1eSDan Willemsen	d.lastToken = tok
88*1c12ee1eSDan Willemsen	return tok, nil
89*1c12ee1eSDan Willemsen}
90*1c12ee1eSDan Willemsen
// Format strings handed to newSyntaxError by parseNext. Each one takes the
// offending input byte as its single %q argument.
const (
	// mismatchedFmt is used when a message is closed with the other message
	// close character than the one matching its opener (see otherCloseChar).
	mismatchedFmt = "mismatched close character %q"
	// unexpectedFmt is used when a byte other than a list close or a comma
	// follows a value inside a list.
	unexpectedFmt = "unexpected character %q"
)
95*1c12ee1eSDan Willemsen
// parseNext parses the next Token based on given last kind.
//
// Leading whitespace and comments are skipped first. The set of tokens that
// may follow is then chosen from lastKind and, where it matters, from the kind
// of construct the decoder is currently nested in (see currentOpenKind).
//
// comma and semicolon are returned as tokens of their own; Read reacts by
// calling parseNext again with that separator kind as lastKind.
//
// It returns ErrUnexpectedEOF when the input ends where more tokens are
// required, and a syntax error for a mismatched close character or a byte
// that is not allowed inside a list. It panics if lastKind, combined with the
// current nesting, is a state that none of the cases below handles.
func (d *Decoder) parseNext(lastKind Kind) (Token, error) {
	// Trim leading spaces.
	d.consume(0)
	isEOF := false
	if len(d.in) == 0 {
		isEOF = true
	}

	switch lastKind {
	case EOF:
		// Once EOF has been returned, keep returning EOF.
		return d.consumeToken(EOF, 0, 0), nil

	case bof:
		// Start of top level message. Next token can be EOF or Name.
		if isEOF {
			return d.consumeToken(EOF, 0, 0), nil
		}
		return d.parseFieldName()

	case Name:
		// Next token can be MessageOpen, ListOpen or Scalar.
		if isEOF {
			return Token{}, ErrUnexpectedEOF
		}
		switch ch := d.in[0]; ch {
		case '{', '<':
			// Remember which opener was used so the matching closer can be
			// checked later.
			d.pushOpenStack(ch)
			return d.consumeToken(MessageOpen, 1, 0), nil
		case '[':
			d.pushOpenStack(ch)
			return d.consumeToken(ListOpen, 1, 0), nil
		default:
			return d.parseScalar()
		}

	case Scalar:
		// What may follow a scalar depends on what encloses it.
		openKind, closeCh := d.currentOpenKind()
		switch openKind {
		case bof:
			// Top level message.
			// Next token can be EOF, comma, semicolon or Name.
			if isEOF {
				return d.consumeToken(EOF, 0, 0), nil
			}
			switch d.in[0] {
			case ',':
				return d.consumeToken(comma, 1, 0), nil
			case ';':
				return d.consumeToken(semicolon, 1, 0), nil
			default:
				return d.parseFieldName()
			}

		case MessageOpen:
			// Next token can be MessageClose, comma, semicolon or Name.
			if isEOF {
				return Token{}, ErrUnexpectedEOF
			}
			switch ch := d.in[0]; ch {
			case closeCh:
				d.popOpenStack()
				return d.consumeToken(MessageClose, 1, 0), nil
			case otherCloseChar[closeCh]:
				// '}' closing a '<' message, or '>' closing a '{' message.
				return Token{}, d.newSyntaxError(mismatchedFmt, ch)
			case ',':
				return d.consumeToken(comma, 1, 0), nil
			case ';':
				return d.consumeToken(semicolon, 1, 0), nil
			default:
				return d.parseFieldName()
			}

		case ListOpen:
			// Next token can be ListClose or comma.
			if isEOF {
				return Token{}, ErrUnexpectedEOF
			}
			switch ch := d.in[0]; ch {
			case ']':
				d.popOpenStack()
				return d.consumeToken(ListClose, 1, 0), nil
			case ',':
				return d.consumeToken(comma, 1, 0), nil
			default:
				return Token{}, d.newSyntaxError(unexpectedFmt, ch)
			}
		}

	case MessageOpen:
		// Next token can be MessageClose or Name.
		if isEOF {
			return Token{}, ErrUnexpectedEOF
		}
		// Only the close character is needed here; the open kind is ignored.
		_, closeCh := d.currentOpenKind()
		switch ch := d.in[0]; ch {
		case closeCh:
			d.popOpenStack()
			return d.consumeToken(MessageClose, 1, 0), nil
		case otherCloseChar[closeCh]:
			return Token{}, d.newSyntaxError(mismatchedFmt, ch)
		default:
			return d.parseFieldName()
		}

	case MessageClose:
		// The closed message has already been popped, so this reports the
		// construct that contained it.
		openKind, closeCh := d.currentOpenKind()
		switch openKind {
		case bof:
			// Top level message.
			// Next token can be EOF, comma, semicolon or Name.
			if isEOF {
				return d.consumeToken(EOF, 0, 0), nil
			}
			switch ch := d.in[0]; ch {
			case ',':
				return d.consumeToken(comma, 1, 0), nil
			case ';':
				return d.consumeToken(semicolon, 1, 0), nil
			default:
				return d.parseFieldName()
			}

		case MessageOpen:
			// Next token can be MessageClose, comma, semicolon or Name.
			if isEOF {
				return Token{}, ErrUnexpectedEOF
			}
			switch ch := d.in[0]; ch {
			case closeCh:
				d.popOpenStack()
				return d.consumeToken(MessageClose, 1, 0), nil
			case otherCloseChar[closeCh]:
				return Token{}, d.newSyntaxError(mismatchedFmt, ch)
			case ',':
				return d.consumeToken(comma, 1, 0), nil
			case ';':
				return d.consumeToken(semicolon, 1, 0), nil
			default:
				return d.parseFieldName()
			}

		case ListOpen:
			// Next token can be ListClose or comma.
			if isEOF {
				return Token{}, ErrUnexpectedEOF
			}
			switch ch := d.in[0]; ch {
			case closeCh:
				// closeCh is ']' here because the enclosing construct is a list.
				d.popOpenStack()
				return d.consumeToken(ListClose, 1, 0), nil
			case ',':
				return d.consumeToken(comma, 1, 0), nil
			default:
				return Token{}, d.newSyntaxError(unexpectedFmt, ch)
			}
		}

	case ListOpen:
		// Next token can be ListClose, MessageOpen or Scalar.
		if isEOF {
			return Token{}, ErrUnexpectedEOF
		}
		switch ch := d.in[0]; ch {
		case ']':
			d.popOpenStack()
			return d.consumeToken(ListClose, 1, 0), nil
		case '{', '<':
			d.pushOpenStack(ch)
			return d.consumeToken(MessageOpen, 1, 0), nil
		default:
			return d.parseScalar()
		}

	case ListClose:
		// The closed list has already been popped, so this reports the
		// construct that contained it.
		openKind, closeCh := d.currentOpenKind()
		switch openKind {
		case bof:
			// Top level message.
			// Next token can be EOF, comma, semicolon or Name.
			if isEOF {
				return d.consumeToken(EOF, 0, 0), nil
			}
			switch ch := d.in[0]; ch {
			case ',':
				return d.consumeToken(comma, 1, 0), nil
			case ';':
				return d.consumeToken(semicolon, 1, 0), nil
			default:
				return d.parseFieldName()
			}

		case MessageOpen:
			// Next token can be MessageClose, comma, semicolon or Name.
			if isEOF {
				return Token{}, ErrUnexpectedEOF
			}
			switch ch := d.in[0]; ch {
			case closeCh:
				d.popOpenStack()
				return d.consumeToken(MessageClose, 1, 0), nil
			case otherCloseChar[closeCh]:
				return Token{}, d.newSyntaxError(mismatchedFmt, ch)
			case ',':
				return d.consumeToken(comma, 1, 0), nil
			case ';':
				return d.consumeToken(semicolon, 1, 0), nil
			default:
				return d.parseFieldName()
			}

		default:
			// It is not possible to have this case. Let it panic below.
		}

	case comma, semicolon:
		// A separator was just consumed; what follows depends on nesting.
		openKind, closeCh := d.currentOpenKind()
		switch openKind {
		case bof:
			// Top level message. Next token can be EOF or Name.
			if isEOF {
				return d.consumeToken(EOF, 0, 0), nil
			}
			return d.parseFieldName()

		case MessageOpen:
			// Next token can be MessageClose or Name.
			if isEOF {
				return Token{}, ErrUnexpectedEOF
			}
			switch ch := d.in[0]; ch {
			case closeCh:
				d.popOpenStack()
				return d.consumeToken(MessageClose, 1, 0), nil
			case otherCloseChar[closeCh]:
				return Token{}, d.newSyntaxError(mismatchedFmt, ch)
			default:
				return d.parseFieldName()
			}

		case ListOpen:
			if lastKind == semicolon {
				// It is not possible to have this case, as the logic here
				// never produces a semicolon Token when inside a list.
				// Let it panic below.
				break
			}
			// Next token can be MessageOpen or Scalar.
			if isEOF {
				return Token{}, ErrUnexpectedEOF
			}
			switch ch := d.in[0]; ch {
			case '{', '<':
				d.pushOpenStack(ch)
				return d.consumeToken(MessageOpen, 1, 0), nil
			default:
				return d.parseScalar()
			}
		}
	}

	// Reaching this point means no case above returned, which indicates a bug
	// in this state machine rather than bad input. Report where it happened.
	line, column := d.Position(len(d.orig) - len(d.in))
	panic(fmt.Sprintf("Decoder.parseNext: bug at handling line %d:%d with lastKind=%v", line, column, lastKind))
}
360*1c12ee1eSDan Willemsen
// otherCloseChar maps each message close character to the other message close
// character. parseNext looks up the expected closer here to recognise a
// message that was opened with one bracket style and closed with the other.
var otherCloseChar = map[byte]byte{
	'}': '>',
	'>': '}',
}
365*1c12ee1eSDan Willemsen
366*1c12ee1eSDan Willemsen// currentOpenKind indicates whether current position is inside a message, list
367*1c12ee1eSDan Willemsen// or top-level message by returning MessageOpen, ListOpen or bof respectively.
368*1c12ee1eSDan Willemsen// If the returned kind is either a MessageOpen or ListOpen, it also returns the
369*1c12ee1eSDan Willemsen// corresponding closing character.
370*1c12ee1eSDan Willemsenfunc (d *Decoder) currentOpenKind() (Kind, byte) {
371*1c12ee1eSDan Willemsen	if len(d.openStack) == 0 {
372*1c12ee1eSDan Willemsen		return bof, 0
373*1c12ee1eSDan Willemsen	}
374*1c12ee1eSDan Willemsen	openCh := d.openStack[len(d.openStack)-1]
375*1c12ee1eSDan Willemsen	switch openCh {
376*1c12ee1eSDan Willemsen	case '{':
377*1c12ee1eSDan Willemsen		return MessageOpen, '}'
378*1c12ee1eSDan Willemsen	case '<':
379*1c12ee1eSDan Willemsen		return MessageOpen, '>'
380*1c12ee1eSDan Willemsen	case '[':
381*1c12ee1eSDan Willemsen		return ListOpen, ']'
382*1c12ee1eSDan Willemsen	}
383*1c12ee1eSDan Willemsen	panic(fmt.Sprintf("Decoder: openStack contains invalid byte %c", openCh))
384*1c12ee1eSDan Willemsen}
385*1c12ee1eSDan Willemsen
386*1c12ee1eSDan Willemsenfunc (d *Decoder) pushOpenStack(ch byte) {
387*1c12ee1eSDan Willemsen	d.openStack = append(d.openStack, ch)
388*1c12ee1eSDan Willemsen}
389*1c12ee1eSDan Willemsen
390*1c12ee1eSDan Willemsenfunc (d *Decoder) popOpenStack() {
391*1c12ee1eSDan Willemsen	d.openStack = d.openStack[:len(d.openStack)-1]
392*1c12ee1eSDan Willemsen}
393*1c12ee1eSDan Willemsen
394*1c12ee1eSDan Willemsen// parseFieldName parses field name and separator.
395*1c12ee1eSDan Willemsenfunc (d *Decoder) parseFieldName() (tok Token, err error) {
396*1c12ee1eSDan Willemsen	defer func() {
397*1c12ee1eSDan Willemsen		if err == nil && d.tryConsumeChar(':') {
398*1c12ee1eSDan Willemsen			tok.attrs |= hasSeparator
399*1c12ee1eSDan Willemsen		}
400*1c12ee1eSDan Willemsen	}()
401*1c12ee1eSDan Willemsen
402*1c12ee1eSDan Willemsen	// Extension or Any type URL.
403*1c12ee1eSDan Willemsen	if d.in[0] == '[' {
404*1c12ee1eSDan Willemsen		return d.parseTypeName()
405*1c12ee1eSDan Willemsen	}
406*1c12ee1eSDan Willemsen
407*1c12ee1eSDan Willemsen	// Identifier.
408*1c12ee1eSDan Willemsen	if size := parseIdent(d.in, false); size > 0 {
409*1c12ee1eSDan Willemsen		return d.consumeToken(Name, size, uint8(IdentName)), nil
410*1c12ee1eSDan Willemsen	}
411*1c12ee1eSDan Willemsen
412*1c12ee1eSDan Willemsen	// Field number. Identify if input is a valid number that is not negative
413*1c12ee1eSDan Willemsen	// and is decimal integer within 32-bit range.
414*1c12ee1eSDan Willemsen	if num := parseNumber(d.in); num.size > 0 {
415*1c12ee1eSDan Willemsen		str := num.string(d.in)
416*1c12ee1eSDan Willemsen		if !num.neg && num.kind == numDec {
417*1c12ee1eSDan Willemsen			if _, err := strconv.ParseInt(str, 10, 32); err == nil {
418*1c12ee1eSDan Willemsen				return d.consumeToken(Name, num.size, uint8(FieldNumber)), nil
419*1c12ee1eSDan Willemsen			}
420*1c12ee1eSDan Willemsen		}
421*1c12ee1eSDan Willemsen		return Token{}, d.newSyntaxError("invalid field number: %s", str)
422*1c12ee1eSDan Willemsen	}
423*1c12ee1eSDan Willemsen
424*1c12ee1eSDan Willemsen	return Token{}, d.newSyntaxError("invalid field name: %s", errId(d.in))
425*1c12ee1eSDan Willemsen}
426*1c12ee1eSDan Willemsen
427*1c12ee1eSDan Willemsen// parseTypeName parses Any type URL or extension field name. The name is
428*1c12ee1eSDan Willemsen// enclosed in [ and ] characters. The C++ parser does not handle many legal URL
429*1c12ee1eSDan Willemsen// strings. This implementation is more liberal and allows for the pattern
430*1c12ee1eSDan Willemsen// ^[-_a-zA-Z0-9]+([./][-_a-zA-Z0-9]+)*`). Whitespaces and comments are allowed
431*1c12ee1eSDan Willemsen// in between [ ], '.', '/' and the sub names.
432*1c12ee1eSDan Willemsenfunc (d *Decoder) parseTypeName() (Token, error) {
433*1c12ee1eSDan Willemsen	startPos := len(d.orig) - len(d.in)
434*1c12ee1eSDan Willemsen	// Use alias s to advance first in order to use d.in for error handling.
435*1c12ee1eSDan Willemsen	// Caller already checks for [ as first character.
436*1c12ee1eSDan Willemsen	s := consume(d.in[1:], 0)
437*1c12ee1eSDan Willemsen	if len(s) == 0 {
438*1c12ee1eSDan Willemsen		return Token{}, ErrUnexpectedEOF
439*1c12ee1eSDan Willemsen	}
440*1c12ee1eSDan Willemsen
441*1c12ee1eSDan Willemsen	var name []byte
442*1c12ee1eSDan Willemsen	for len(s) > 0 && isTypeNameChar(s[0]) {
443*1c12ee1eSDan Willemsen		name = append(name, s[0])
444*1c12ee1eSDan Willemsen		s = s[1:]
445*1c12ee1eSDan Willemsen	}
446*1c12ee1eSDan Willemsen	s = consume(s, 0)
447*1c12ee1eSDan Willemsen
448*1c12ee1eSDan Willemsen	var closed bool
449*1c12ee1eSDan Willemsen	for len(s) > 0 && !closed {
450*1c12ee1eSDan Willemsen		switch {
451*1c12ee1eSDan Willemsen		case s[0] == ']':
452*1c12ee1eSDan Willemsen			s = s[1:]
453*1c12ee1eSDan Willemsen			closed = true
454*1c12ee1eSDan Willemsen
455*1c12ee1eSDan Willemsen		case s[0] == '/', s[0] == '.':
456*1c12ee1eSDan Willemsen			if len(name) > 0 && (name[len(name)-1] == '/' || name[len(name)-1] == '.') {
457*1c12ee1eSDan Willemsen				return Token{}, d.newSyntaxError("invalid type URL/extension field name: %s",
458*1c12ee1eSDan Willemsen					d.orig[startPos:len(d.orig)-len(s)+1])
459*1c12ee1eSDan Willemsen			}
460*1c12ee1eSDan Willemsen			name = append(name, s[0])
461*1c12ee1eSDan Willemsen			s = s[1:]
462*1c12ee1eSDan Willemsen			s = consume(s, 0)
463*1c12ee1eSDan Willemsen			for len(s) > 0 && isTypeNameChar(s[0]) {
464*1c12ee1eSDan Willemsen				name = append(name, s[0])
465*1c12ee1eSDan Willemsen				s = s[1:]
466*1c12ee1eSDan Willemsen			}
467*1c12ee1eSDan Willemsen			s = consume(s, 0)
468*1c12ee1eSDan Willemsen
469*1c12ee1eSDan Willemsen		default:
470*1c12ee1eSDan Willemsen			return Token{}, d.newSyntaxError(
471*1c12ee1eSDan Willemsen				"invalid type URL/extension field name: %s", d.orig[startPos:len(d.orig)-len(s)+1])
472*1c12ee1eSDan Willemsen		}
473*1c12ee1eSDan Willemsen	}
474*1c12ee1eSDan Willemsen
475*1c12ee1eSDan Willemsen	if !closed {
476*1c12ee1eSDan Willemsen		return Token{}, ErrUnexpectedEOF
477*1c12ee1eSDan Willemsen	}
478*1c12ee1eSDan Willemsen
479*1c12ee1eSDan Willemsen	// First character cannot be '.'. Last character cannot be '.' or '/'.
480*1c12ee1eSDan Willemsen	size := len(name)
481*1c12ee1eSDan Willemsen	if size == 0 || name[0] == '.' || name[size-1] == '.' || name[size-1] == '/' {
482*1c12ee1eSDan Willemsen		return Token{}, d.newSyntaxError("invalid type URL/extension field name: %s",
483*1c12ee1eSDan Willemsen			d.orig[startPos:len(d.orig)-len(s)])
484*1c12ee1eSDan Willemsen	}
485*1c12ee1eSDan Willemsen
486*1c12ee1eSDan Willemsen	d.in = s
487*1c12ee1eSDan Willemsen	endPos := len(d.orig) - len(d.in)
488*1c12ee1eSDan Willemsen	d.consume(0)
489*1c12ee1eSDan Willemsen
490*1c12ee1eSDan Willemsen	return Token{
491*1c12ee1eSDan Willemsen		kind:  Name,
492*1c12ee1eSDan Willemsen		attrs: uint8(TypeName),
493*1c12ee1eSDan Willemsen		pos:   startPos,
494*1c12ee1eSDan Willemsen		raw:   d.orig[startPos:endPos],
495*1c12ee1eSDan Willemsen		str:   string(name),
496*1c12ee1eSDan Willemsen	}, nil
497*1c12ee1eSDan Willemsen}
498*1c12ee1eSDan Willemsen
// isTypeNameChar reports whether b may appear inside a segment of a type URL
// or extension name: an ASCII letter, an ASCII digit, '-' or '_'.
func isTypeNameChar(b byte) bool {
	switch {
	case b == '-', b == '_':
		return true
	case '0' <= b && b <= '9':
		return true
	case 'a' <= b && b <= 'z':
		return true
	case 'A' <= b && b <= 'Z':
		return true
	}
	return false
}
505*1c12ee1eSDan Willemsen
// isWhiteSpace reports whether b is a space, newline, carriage return or tab.
func isWhiteSpace(b byte) bool {
	return b == ' ' || b == '\n' || b == '\r' || b == '\t'
}
514*1c12ee1eSDan Willemsen
515*1c12ee1eSDan Willemsen// parseIdent parses an unquoted proto identifier and returns size.
516*1c12ee1eSDan Willemsen// If allowNeg is true, it allows '-' to be the first character in the
517*1c12ee1eSDan Willemsen// identifier. This is used when parsing literal values like -infinity, etc.
518*1c12ee1eSDan Willemsen// Regular expression matches an identifier: `^[_a-zA-Z][_a-zA-Z0-9]*`
519*1c12ee1eSDan Willemsenfunc parseIdent(input []byte, allowNeg bool) int {
520*1c12ee1eSDan Willemsen	var size int
521*1c12ee1eSDan Willemsen
522*1c12ee1eSDan Willemsen	s := input
523*1c12ee1eSDan Willemsen	if len(s) == 0 {
524*1c12ee1eSDan Willemsen		return 0
525*1c12ee1eSDan Willemsen	}
526*1c12ee1eSDan Willemsen
527*1c12ee1eSDan Willemsen	if allowNeg && s[0] == '-' {
528*1c12ee1eSDan Willemsen		s = s[1:]
529*1c12ee1eSDan Willemsen		size++
530*1c12ee1eSDan Willemsen		if len(s) == 0 {
531*1c12ee1eSDan Willemsen			return 0
532*1c12ee1eSDan Willemsen		}
533*1c12ee1eSDan Willemsen	}
534*1c12ee1eSDan Willemsen
535*1c12ee1eSDan Willemsen	switch {
536*1c12ee1eSDan Willemsen	case s[0] == '_',
537*1c12ee1eSDan Willemsen		'a' <= s[0] && s[0] <= 'z',
538*1c12ee1eSDan Willemsen		'A' <= s[0] && s[0] <= 'Z':
539*1c12ee1eSDan Willemsen		s = s[1:]
540*1c12ee1eSDan Willemsen		size++
541*1c12ee1eSDan Willemsen	default:
542*1c12ee1eSDan Willemsen		return 0
543*1c12ee1eSDan Willemsen	}
544*1c12ee1eSDan Willemsen
545*1c12ee1eSDan Willemsen	for len(s) > 0 && (s[0] == '_' ||
546*1c12ee1eSDan Willemsen		'a' <= s[0] && s[0] <= 'z' ||
547*1c12ee1eSDan Willemsen		'A' <= s[0] && s[0] <= 'Z' ||
548*1c12ee1eSDan Willemsen		'0' <= s[0] && s[0] <= '9') {
549*1c12ee1eSDan Willemsen		s = s[1:]
550*1c12ee1eSDan Willemsen		size++
551*1c12ee1eSDan Willemsen	}
552*1c12ee1eSDan Willemsen
553*1c12ee1eSDan Willemsen	if len(s) > 0 && !isDelim(s[0]) {
554*1c12ee1eSDan Willemsen		return 0
555*1c12ee1eSDan Willemsen	}
556*1c12ee1eSDan Willemsen
557*1c12ee1eSDan Willemsen	return size
558*1c12ee1eSDan Willemsen}
559*1c12ee1eSDan Willemsen
560*1c12ee1eSDan Willemsen// parseScalar parses for a string, literal or number value.
561*1c12ee1eSDan Willemsenfunc (d *Decoder) parseScalar() (Token, error) {
562*1c12ee1eSDan Willemsen	if d.in[0] == '"' || d.in[0] == '\'' {
563*1c12ee1eSDan Willemsen		return d.parseStringValue()
564*1c12ee1eSDan Willemsen	}
565*1c12ee1eSDan Willemsen
566*1c12ee1eSDan Willemsen	if tok, ok := d.parseLiteralValue(); ok {
567*1c12ee1eSDan Willemsen		return tok, nil
568*1c12ee1eSDan Willemsen	}
569*1c12ee1eSDan Willemsen
570*1c12ee1eSDan Willemsen	if tok, ok := d.parseNumberValue(); ok {
571*1c12ee1eSDan Willemsen		return tok, nil
572*1c12ee1eSDan Willemsen	}
573*1c12ee1eSDan Willemsen
574*1c12ee1eSDan Willemsen	return Token{}, d.newSyntaxError("invalid scalar value: %s", errId(d.in))
575*1c12ee1eSDan Willemsen}
576*1c12ee1eSDan Willemsen
577*1c12ee1eSDan Willemsen// parseLiteralValue parses a literal value. A literal value is used for
578*1c12ee1eSDan Willemsen// bools, special floats and enums. This function simply identifies that the
579*1c12ee1eSDan Willemsen// field value is a literal.
580*1c12ee1eSDan Willemsenfunc (d *Decoder) parseLiteralValue() (Token, bool) {
581*1c12ee1eSDan Willemsen	size := parseIdent(d.in, true)
582*1c12ee1eSDan Willemsen	if size == 0 {
583*1c12ee1eSDan Willemsen		return Token{}, false
584*1c12ee1eSDan Willemsen	}
585*1c12ee1eSDan Willemsen	return d.consumeToken(Scalar, size, literalValue), true
586*1c12ee1eSDan Willemsen}
587*1c12ee1eSDan Willemsen
588*1c12ee1eSDan Willemsen// consumeToken constructs a Token for given Kind from d.in and consumes given
589*1c12ee1eSDan Willemsen// size-length from it.
590*1c12ee1eSDan Willemsenfunc (d *Decoder) consumeToken(kind Kind, size int, attrs uint8) Token {
591*1c12ee1eSDan Willemsen	// Important to compute raw and pos before consuming.
592*1c12ee1eSDan Willemsen	tok := Token{
593*1c12ee1eSDan Willemsen		kind:  kind,
594*1c12ee1eSDan Willemsen		attrs: attrs,
595*1c12ee1eSDan Willemsen		pos:   len(d.orig) - len(d.in),
596*1c12ee1eSDan Willemsen		raw:   d.in[:size],
597*1c12ee1eSDan Willemsen	}
598*1c12ee1eSDan Willemsen	d.consume(size)
599*1c12ee1eSDan Willemsen	return tok
600*1c12ee1eSDan Willemsen}
601*1c12ee1eSDan Willemsen
602*1c12ee1eSDan Willemsen// newSyntaxError returns a syntax error with line and column information for
603*1c12ee1eSDan Willemsen// current position.
604*1c12ee1eSDan Willemsenfunc (d *Decoder) newSyntaxError(f string, x ...interface{}) error {
605*1c12ee1eSDan Willemsen	e := errors.New(f, x...)
606*1c12ee1eSDan Willemsen	line, column := d.Position(len(d.orig) - len(d.in))
607*1c12ee1eSDan Willemsen	return errors.New("syntax error (line %d:%d): %v", line, column, e)
608*1c12ee1eSDan Willemsen}
609*1c12ee1eSDan Willemsen
610*1c12ee1eSDan Willemsen// Position returns line and column number of given index of the original input.
611*1c12ee1eSDan Willemsen// It will panic if index is out of range.
612*1c12ee1eSDan Willemsenfunc (d *Decoder) Position(idx int) (line int, column int) {
613*1c12ee1eSDan Willemsen	b := d.orig[:idx]
614*1c12ee1eSDan Willemsen	line = bytes.Count(b, []byte("\n")) + 1
615*1c12ee1eSDan Willemsen	if i := bytes.LastIndexByte(b, '\n'); i >= 0 {
616*1c12ee1eSDan Willemsen		b = b[i+1:]
617*1c12ee1eSDan Willemsen	}
618*1c12ee1eSDan Willemsen	column = utf8.RuneCount(b) + 1 // ignore multi-rune characters
619*1c12ee1eSDan Willemsen	return line, column
620*1c12ee1eSDan Willemsen}
621*1c12ee1eSDan Willemsen
622*1c12ee1eSDan Willemsenfunc (d *Decoder) tryConsumeChar(c byte) bool {
623*1c12ee1eSDan Willemsen	if len(d.in) > 0 && d.in[0] == c {
624*1c12ee1eSDan Willemsen		d.consume(1)
625*1c12ee1eSDan Willemsen		return true
626*1c12ee1eSDan Willemsen	}
627*1c12ee1eSDan Willemsen	return false
628*1c12ee1eSDan Willemsen}
629*1c12ee1eSDan Willemsen
630*1c12ee1eSDan Willemsen// consume consumes n bytes of input and any subsequent whitespace or comments.
631*1c12ee1eSDan Willemsenfunc (d *Decoder) consume(n int) {
632*1c12ee1eSDan Willemsen	d.in = consume(d.in, n)
633*1c12ee1eSDan Willemsen	return
634*1c12ee1eSDan Willemsen}
635*1c12ee1eSDan Willemsen
// consume drops the first n bytes of b and then any run of whitespace
// (space, newline, carriage return, tab) and '#' comments, returning what is
// left. A comment extends to the end of its line; if a comment is not
// terminated by a newline the result is nil.
// It panics if n is larger than len(b).
func consume(b []byte, n int) []byte {
	rest := b[n:]
	for len(rest) > 0 {
		c := rest[0]
		if c == '#' {
			eol := bytes.IndexByte(rest, '\n')
			if eol < 0 {
				// The comment runs to the end of the input.
				return nil
			}
			rest = rest[eol+1:]
			continue
		}
		if c != ' ' && c != '\n' && c != '\r' && c != '\t' {
			break
		}
		rest = rest[1:]
	}
	return rest
}
655*1c12ee1eSDan Willemsen
656*1c12ee1eSDan Willemsen// errId extracts a byte sequence that looks like an invalid ID
657*1c12ee1eSDan Willemsen// (for the purposes of error reporting).
658*1c12ee1eSDan Willemsenfunc errId(seq []byte) []byte {
659*1c12ee1eSDan Willemsen	const maxLen = 32
660*1c12ee1eSDan Willemsen	for i := 0; i < len(seq); {
661*1c12ee1eSDan Willemsen		if i > maxLen {
662*1c12ee1eSDan Willemsen			return append(seq[:i:i], "…"...)
663*1c12ee1eSDan Willemsen		}
664*1c12ee1eSDan Willemsen		r, size := utf8.DecodeRune(seq[i:])
665*1c12ee1eSDan Willemsen		if r > utf8.RuneSelf || (r != '/' && isDelim(byte(r))) {
666*1c12ee1eSDan Willemsen			if i == 0 {
667*1c12ee1eSDan Willemsen				// Either the first byte is invalid UTF-8 or a
668*1c12ee1eSDan Willemsen				// delimiter, or the first rune is non-ASCII.
669*1c12ee1eSDan Willemsen				// Return it as-is.
670*1c12ee1eSDan Willemsen				i = size
671*1c12ee1eSDan Willemsen			}
672*1c12ee1eSDan Willemsen			return seq[:i:i]
673*1c12ee1eSDan Willemsen		}
674*1c12ee1eSDan Willemsen		i += size
675*1c12ee1eSDan Willemsen	}
676*1c12ee1eSDan Willemsen	// No delimiter found.
677*1c12ee1eSDan Willemsen	return seq
678*1c12ee1eSDan Willemsen}
679*1c12ee1eSDan Willemsen
// isDelim reports whether c is a delimiter character, i.e. anything other
// than an ASCII letter, an ASCII digit, '-', '+', '.' or '_'.
func isDelim(c byte) bool {
	switch {
	case c == '-', c == '+', c == '.', c == '_':
		return false
	case 'a' <= c && c <= 'z':
		return false
	case 'A' <= c && c <= 'Z':
		return false
	case '0' <= c && c <= '9':
		return false
	}
	return true
}
687