1*1c12ee1eSDan Willemsen// Copyright 2018 The Go Authors. All rights reserved. 2*1c12ee1eSDan Willemsen// Use of this source code is governed by a BSD-style 3*1c12ee1eSDan Willemsen// license that can be found in the LICENSE file. 4*1c12ee1eSDan Willemsen 5*1c12ee1eSDan Willemsenpackage json 6*1c12ee1eSDan Willemsen 7*1c12ee1eSDan Willemsenimport ( 8*1c12ee1eSDan Willemsen "bytes" 9*1c12ee1eSDan Willemsen "fmt" 10*1c12ee1eSDan Willemsen "io" 11*1c12ee1eSDan Willemsen "regexp" 12*1c12ee1eSDan Willemsen "unicode/utf8" 13*1c12ee1eSDan Willemsen 14*1c12ee1eSDan Willemsen "google.golang.org/protobuf/internal/errors" 15*1c12ee1eSDan Willemsen) 16*1c12ee1eSDan Willemsen 17*1c12ee1eSDan Willemsen// call specifies which Decoder method was invoked. 18*1c12ee1eSDan Willemsentype call uint8 19*1c12ee1eSDan Willemsen 20*1c12ee1eSDan Willemsenconst ( 21*1c12ee1eSDan Willemsen readCall call = iota 22*1c12ee1eSDan Willemsen peekCall 23*1c12ee1eSDan Willemsen) 24*1c12ee1eSDan Willemsen 25*1c12ee1eSDan Willemsenconst unexpectedFmt = "unexpected token %s" 26*1c12ee1eSDan Willemsen 27*1c12ee1eSDan Willemsen// ErrUnexpectedEOF means that EOF was encountered in the middle of the input. 28*1c12ee1eSDan Willemsenvar ErrUnexpectedEOF = errors.New("%v", io.ErrUnexpectedEOF) 29*1c12ee1eSDan Willemsen 30*1c12ee1eSDan Willemsen// Decoder is a token-based JSON decoder. 31*1c12ee1eSDan Willemsentype Decoder struct { 32*1c12ee1eSDan Willemsen // lastCall is last method called, either readCall or peekCall. 33*1c12ee1eSDan Willemsen // Initial value is readCall. 34*1c12ee1eSDan Willemsen lastCall call 35*1c12ee1eSDan Willemsen 36*1c12ee1eSDan Willemsen // lastToken contains the last read token. 37*1c12ee1eSDan Willemsen lastToken Token 38*1c12ee1eSDan Willemsen 39*1c12ee1eSDan Willemsen // lastErr contains the last read error. 40*1c12ee1eSDan Willemsen lastErr error 41*1c12ee1eSDan Willemsen 42*1c12ee1eSDan Willemsen // openStack is a stack containing ObjectOpen and ArrayOpen values. The 43*1c12ee1eSDan Willemsen // top of stack represents the object or the array the current value is 44*1c12ee1eSDan Willemsen // directly located in. 45*1c12ee1eSDan Willemsen openStack []Kind 46*1c12ee1eSDan Willemsen 47*1c12ee1eSDan Willemsen // orig is used in reporting line and column. 48*1c12ee1eSDan Willemsen orig []byte 49*1c12ee1eSDan Willemsen // in contains the unconsumed input. 50*1c12ee1eSDan Willemsen in []byte 51*1c12ee1eSDan Willemsen} 52*1c12ee1eSDan Willemsen 53*1c12ee1eSDan Willemsen// NewDecoder returns a Decoder to read the given []byte. 54*1c12ee1eSDan Willemsenfunc NewDecoder(b []byte) *Decoder { 55*1c12ee1eSDan Willemsen return &Decoder{orig: b, in: b} 56*1c12ee1eSDan Willemsen} 57*1c12ee1eSDan Willemsen 58*1c12ee1eSDan Willemsen// Peek looks ahead and returns the next token kind without advancing a read. 59*1c12ee1eSDan Willemsenfunc (d *Decoder) Peek() (Token, error) { 60*1c12ee1eSDan Willemsen defer func() { d.lastCall = peekCall }() 61*1c12ee1eSDan Willemsen if d.lastCall == readCall { 62*1c12ee1eSDan Willemsen d.lastToken, d.lastErr = d.Read() 63*1c12ee1eSDan Willemsen } 64*1c12ee1eSDan Willemsen return d.lastToken, d.lastErr 65*1c12ee1eSDan Willemsen} 66*1c12ee1eSDan Willemsen 67*1c12ee1eSDan Willemsen// Read returns the next JSON token. 68*1c12ee1eSDan Willemsen// It will return an error if there is no valid token. 69*1c12ee1eSDan Willemsenfunc (d *Decoder) Read() (Token, error) { 70*1c12ee1eSDan Willemsen const scalar = Null | Bool | Number | String 71*1c12ee1eSDan Willemsen 72*1c12ee1eSDan Willemsen defer func() { d.lastCall = readCall }() 73*1c12ee1eSDan Willemsen if d.lastCall == peekCall { 74*1c12ee1eSDan Willemsen return d.lastToken, d.lastErr 75*1c12ee1eSDan Willemsen } 76*1c12ee1eSDan Willemsen 77*1c12ee1eSDan Willemsen tok, err := d.parseNext() 78*1c12ee1eSDan Willemsen if err != nil { 79*1c12ee1eSDan Willemsen return Token{}, err 80*1c12ee1eSDan Willemsen } 81*1c12ee1eSDan Willemsen 82*1c12ee1eSDan Willemsen switch tok.kind { 83*1c12ee1eSDan Willemsen case EOF: 84*1c12ee1eSDan Willemsen if len(d.openStack) != 0 || 85*1c12ee1eSDan Willemsen d.lastToken.kind&scalar|ObjectClose|ArrayClose == 0 { 86*1c12ee1eSDan Willemsen return Token{}, ErrUnexpectedEOF 87*1c12ee1eSDan Willemsen } 88*1c12ee1eSDan Willemsen 89*1c12ee1eSDan Willemsen case Null: 90*1c12ee1eSDan Willemsen if !d.isValueNext() { 91*1c12ee1eSDan Willemsen return Token{}, d.newSyntaxError(tok.pos, unexpectedFmt, tok.RawString()) 92*1c12ee1eSDan Willemsen } 93*1c12ee1eSDan Willemsen 94*1c12ee1eSDan Willemsen case Bool, Number: 95*1c12ee1eSDan Willemsen if !d.isValueNext() { 96*1c12ee1eSDan Willemsen return Token{}, d.newSyntaxError(tok.pos, unexpectedFmt, tok.RawString()) 97*1c12ee1eSDan Willemsen } 98*1c12ee1eSDan Willemsen 99*1c12ee1eSDan Willemsen case String: 100*1c12ee1eSDan Willemsen if d.isValueNext() { 101*1c12ee1eSDan Willemsen break 102*1c12ee1eSDan Willemsen } 103*1c12ee1eSDan Willemsen // This string token should only be for a field name. 104*1c12ee1eSDan Willemsen if d.lastToken.kind&(ObjectOpen|comma) == 0 { 105*1c12ee1eSDan Willemsen return Token{}, d.newSyntaxError(tok.pos, unexpectedFmt, tok.RawString()) 106*1c12ee1eSDan Willemsen } 107*1c12ee1eSDan Willemsen if len(d.in) == 0 { 108*1c12ee1eSDan Willemsen return Token{}, ErrUnexpectedEOF 109*1c12ee1eSDan Willemsen } 110*1c12ee1eSDan Willemsen if c := d.in[0]; c != ':' { 111*1c12ee1eSDan Willemsen return Token{}, d.newSyntaxError(d.currPos(), `unexpected character %s, missing ":" after field name`, string(c)) 112*1c12ee1eSDan Willemsen } 113*1c12ee1eSDan Willemsen tok.kind = Name 114*1c12ee1eSDan Willemsen d.consume(1) 115*1c12ee1eSDan Willemsen 116*1c12ee1eSDan Willemsen case ObjectOpen, ArrayOpen: 117*1c12ee1eSDan Willemsen if !d.isValueNext() { 118*1c12ee1eSDan Willemsen return Token{}, d.newSyntaxError(tok.pos, unexpectedFmt, tok.RawString()) 119*1c12ee1eSDan Willemsen } 120*1c12ee1eSDan Willemsen d.openStack = append(d.openStack, tok.kind) 121*1c12ee1eSDan Willemsen 122*1c12ee1eSDan Willemsen case ObjectClose: 123*1c12ee1eSDan Willemsen if len(d.openStack) == 0 || 124*1c12ee1eSDan Willemsen d.lastToken.kind == comma || 125*1c12ee1eSDan Willemsen d.openStack[len(d.openStack)-1] != ObjectOpen { 126*1c12ee1eSDan Willemsen return Token{}, d.newSyntaxError(tok.pos, unexpectedFmt, tok.RawString()) 127*1c12ee1eSDan Willemsen } 128*1c12ee1eSDan Willemsen d.openStack = d.openStack[:len(d.openStack)-1] 129*1c12ee1eSDan Willemsen 130*1c12ee1eSDan Willemsen case ArrayClose: 131*1c12ee1eSDan Willemsen if len(d.openStack) == 0 || 132*1c12ee1eSDan Willemsen d.lastToken.kind == comma || 133*1c12ee1eSDan Willemsen d.openStack[len(d.openStack)-1] != ArrayOpen { 134*1c12ee1eSDan Willemsen return Token{}, d.newSyntaxError(tok.pos, unexpectedFmt, tok.RawString()) 135*1c12ee1eSDan Willemsen } 136*1c12ee1eSDan Willemsen d.openStack = d.openStack[:len(d.openStack)-1] 137*1c12ee1eSDan Willemsen 138*1c12ee1eSDan Willemsen case comma: 139*1c12ee1eSDan Willemsen if len(d.openStack) == 0 || 140*1c12ee1eSDan Willemsen d.lastToken.kind&(scalar|ObjectClose|ArrayClose) == 0 { 141*1c12ee1eSDan Willemsen return Token{}, d.newSyntaxError(tok.pos, unexpectedFmt, tok.RawString()) 142*1c12ee1eSDan Willemsen } 143*1c12ee1eSDan Willemsen } 144*1c12ee1eSDan Willemsen 145*1c12ee1eSDan Willemsen // Update d.lastToken only after validating token to be in the right sequence. 146*1c12ee1eSDan Willemsen d.lastToken = tok 147*1c12ee1eSDan Willemsen 148*1c12ee1eSDan Willemsen if d.lastToken.kind == comma { 149*1c12ee1eSDan Willemsen return d.Read() 150*1c12ee1eSDan Willemsen } 151*1c12ee1eSDan Willemsen return tok, nil 152*1c12ee1eSDan Willemsen} 153*1c12ee1eSDan Willemsen 154*1c12ee1eSDan Willemsen// Any sequence that looks like a non-delimiter (for error reporting). 155*1c12ee1eSDan Willemsenvar errRegexp = regexp.MustCompile(`^([-+._a-zA-Z0-9]{1,32}|.)`) 156*1c12ee1eSDan Willemsen 157*1c12ee1eSDan Willemsen// parseNext parses for the next JSON token. It returns a Token object for 158*1c12ee1eSDan Willemsen// different types, except for Name. It does not handle whether the next token 159*1c12ee1eSDan Willemsen// is in a valid sequence or not. 160*1c12ee1eSDan Willemsenfunc (d *Decoder) parseNext() (Token, error) { 161*1c12ee1eSDan Willemsen // Trim leading spaces. 162*1c12ee1eSDan Willemsen d.consume(0) 163*1c12ee1eSDan Willemsen 164*1c12ee1eSDan Willemsen in := d.in 165*1c12ee1eSDan Willemsen if len(in) == 0 { 166*1c12ee1eSDan Willemsen return d.consumeToken(EOF, 0), nil 167*1c12ee1eSDan Willemsen } 168*1c12ee1eSDan Willemsen 169*1c12ee1eSDan Willemsen switch in[0] { 170*1c12ee1eSDan Willemsen case 'n': 171*1c12ee1eSDan Willemsen if n := matchWithDelim("null", in); n != 0 { 172*1c12ee1eSDan Willemsen return d.consumeToken(Null, n), nil 173*1c12ee1eSDan Willemsen } 174*1c12ee1eSDan Willemsen 175*1c12ee1eSDan Willemsen case 't': 176*1c12ee1eSDan Willemsen if n := matchWithDelim("true", in); n != 0 { 177*1c12ee1eSDan Willemsen return d.consumeBoolToken(true, n), nil 178*1c12ee1eSDan Willemsen } 179*1c12ee1eSDan Willemsen 180*1c12ee1eSDan Willemsen case 'f': 181*1c12ee1eSDan Willemsen if n := matchWithDelim("false", in); n != 0 { 182*1c12ee1eSDan Willemsen return d.consumeBoolToken(false, n), nil 183*1c12ee1eSDan Willemsen } 184*1c12ee1eSDan Willemsen 185*1c12ee1eSDan Willemsen case '-', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9': 186*1c12ee1eSDan Willemsen if n, ok := parseNumber(in); ok { 187*1c12ee1eSDan Willemsen return d.consumeToken(Number, n), nil 188*1c12ee1eSDan Willemsen } 189*1c12ee1eSDan Willemsen 190*1c12ee1eSDan Willemsen case '"': 191*1c12ee1eSDan Willemsen s, n, err := d.parseString(in) 192*1c12ee1eSDan Willemsen if err != nil { 193*1c12ee1eSDan Willemsen return Token{}, err 194*1c12ee1eSDan Willemsen } 195*1c12ee1eSDan Willemsen return d.consumeStringToken(s, n), nil 196*1c12ee1eSDan Willemsen 197*1c12ee1eSDan Willemsen case '{': 198*1c12ee1eSDan Willemsen return d.consumeToken(ObjectOpen, 1), nil 199*1c12ee1eSDan Willemsen 200*1c12ee1eSDan Willemsen case '}': 201*1c12ee1eSDan Willemsen return d.consumeToken(ObjectClose, 1), nil 202*1c12ee1eSDan Willemsen 203*1c12ee1eSDan Willemsen case '[': 204*1c12ee1eSDan Willemsen return d.consumeToken(ArrayOpen, 1), nil 205*1c12ee1eSDan Willemsen 206*1c12ee1eSDan Willemsen case ']': 207*1c12ee1eSDan Willemsen return d.consumeToken(ArrayClose, 1), nil 208*1c12ee1eSDan Willemsen 209*1c12ee1eSDan Willemsen case ',': 210*1c12ee1eSDan Willemsen return d.consumeToken(comma, 1), nil 211*1c12ee1eSDan Willemsen } 212*1c12ee1eSDan Willemsen return Token{}, d.newSyntaxError(d.currPos(), "invalid value %s", errRegexp.Find(in)) 213*1c12ee1eSDan Willemsen} 214*1c12ee1eSDan Willemsen 215*1c12ee1eSDan Willemsen// newSyntaxError returns an error with line and column information useful for 216*1c12ee1eSDan Willemsen// syntax errors. 217*1c12ee1eSDan Willemsenfunc (d *Decoder) newSyntaxError(pos int, f string, x ...interface{}) error { 218*1c12ee1eSDan Willemsen e := errors.New(f, x...) 219*1c12ee1eSDan Willemsen line, column := d.Position(pos) 220*1c12ee1eSDan Willemsen return errors.New("syntax error (line %d:%d): %v", line, column, e) 221*1c12ee1eSDan Willemsen} 222*1c12ee1eSDan Willemsen 223*1c12ee1eSDan Willemsen// Position returns line and column number of given index of the original input. 224*1c12ee1eSDan Willemsen// It will panic if index is out of range. 225*1c12ee1eSDan Willemsenfunc (d *Decoder) Position(idx int) (line int, column int) { 226*1c12ee1eSDan Willemsen b := d.orig[:idx] 227*1c12ee1eSDan Willemsen line = bytes.Count(b, []byte("\n")) + 1 228*1c12ee1eSDan Willemsen if i := bytes.LastIndexByte(b, '\n'); i >= 0 { 229*1c12ee1eSDan Willemsen b = b[i+1:] 230*1c12ee1eSDan Willemsen } 231*1c12ee1eSDan Willemsen column = utf8.RuneCount(b) + 1 // ignore multi-rune characters 232*1c12ee1eSDan Willemsen return line, column 233*1c12ee1eSDan Willemsen} 234*1c12ee1eSDan Willemsen 235*1c12ee1eSDan Willemsen// currPos returns the current index position of d.in from d.orig. 236*1c12ee1eSDan Willemsenfunc (d *Decoder) currPos() int { 237*1c12ee1eSDan Willemsen return len(d.orig) - len(d.in) 238*1c12ee1eSDan Willemsen} 239*1c12ee1eSDan Willemsen 240*1c12ee1eSDan Willemsen// matchWithDelim matches s with the input b and verifies that the match 241*1c12ee1eSDan Willemsen// terminates with a delimiter of some form (e.g., r"[^-+_.a-zA-Z0-9]"). 242*1c12ee1eSDan Willemsen// As a special case, EOF is considered a delimiter. It returns the length of s 243*1c12ee1eSDan Willemsen// if there is a match, else 0. 244*1c12ee1eSDan Willemsenfunc matchWithDelim(s string, b []byte) int { 245*1c12ee1eSDan Willemsen if !bytes.HasPrefix(b, []byte(s)) { 246*1c12ee1eSDan Willemsen return 0 247*1c12ee1eSDan Willemsen } 248*1c12ee1eSDan Willemsen 249*1c12ee1eSDan Willemsen n := len(s) 250*1c12ee1eSDan Willemsen if n < len(b) && isNotDelim(b[n]) { 251*1c12ee1eSDan Willemsen return 0 252*1c12ee1eSDan Willemsen } 253*1c12ee1eSDan Willemsen return n 254*1c12ee1eSDan Willemsen} 255*1c12ee1eSDan Willemsen 256*1c12ee1eSDan Willemsen// isNotDelim returns true if given byte is a not delimiter character. 257*1c12ee1eSDan Willemsenfunc isNotDelim(c byte) bool { 258*1c12ee1eSDan Willemsen return (c == '-' || c == '+' || c == '.' || c == '_' || 259*1c12ee1eSDan Willemsen ('a' <= c && c <= 'z') || 260*1c12ee1eSDan Willemsen ('A' <= c && c <= 'Z') || 261*1c12ee1eSDan Willemsen ('0' <= c && c <= '9')) 262*1c12ee1eSDan Willemsen} 263*1c12ee1eSDan Willemsen 264*1c12ee1eSDan Willemsen// consume consumes n bytes of input and any subsequent whitespace. 265*1c12ee1eSDan Willemsenfunc (d *Decoder) consume(n int) { 266*1c12ee1eSDan Willemsen d.in = d.in[n:] 267*1c12ee1eSDan Willemsen for len(d.in) > 0 { 268*1c12ee1eSDan Willemsen switch d.in[0] { 269*1c12ee1eSDan Willemsen case ' ', '\n', '\r', '\t': 270*1c12ee1eSDan Willemsen d.in = d.in[1:] 271*1c12ee1eSDan Willemsen default: 272*1c12ee1eSDan Willemsen return 273*1c12ee1eSDan Willemsen } 274*1c12ee1eSDan Willemsen } 275*1c12ee1eSDan Willemsen} 276*1c12ee1eSDan Willemsen 277*1c12ee1eSDan Willemsen// isValueNext returns true if next type should be a JSON value: Null, 278*1c12ee1eSDan Willemsen// Number, String or Bool. 279*1c12ee1eSDan Willemsenfunc (d *Decoder) isValueNext() bool { 280*1c12ee1eSDan Willemsen if len(d.openStack) == 0 { 281*1c12ee1eSDan Willemsen return d.lastToken.kind == 0 282*1c12ee1eSDan Willemsen } 283*1c12ee1eSDan Willemsen 284*1c12ee1eSDan Willemsen start := d.openStack[len(d.openStack)-1] 285*1c12ee1eSDan Willemsen switch start { 286*1c12ee1eSDan Willemsen case ObjectOpen: 287*1c12ee1eSDan Willemsen return d.lastToken.kind&Name != 0 288*1c12ee1eSDan Willemsen case ArrayOpen: 289*1c12ee1eSDan Willemsen return d.lastToken.kind&(ArrayOpen|comma) != 0 290*1c12ee1eSDan Willemsen } 291*1c12ee1eSDan Willemsen panic(fmt.Sprintf( 292*1c12ee1eSDan Willemsen "unreachable logic in Decoder.isValueNext, lastToken.kind: %v, openStack: %v", 293*1c12ee1eSDan Willemsen d.lastToken.kind, start)) 294*1c12ee1eSDan Willemsen} 295*1c12ee1eSDan Willemsen 296*1c12ee1eSDan Willemsen// consumeToken constructs a Token for given Kind with raw value derived from 297*1c12ee1eSDan Willemsen// current d.in and given size, and consumes the given size-length of it. 298*1c12ee1eSDan Willemsenfunc (d *Decoder) consumeToken(kind Kind, size int) Token { 299*1c12ee1eSDan Willemsen tok := Token{ 300*1c12ee1eSDan Willemsen kind: kind, 301*1c12ee1eSDan Willemsen raw: d.in[:size], 302*1c12ee1eSDan Willemsen pos: len(d.orig) - len(d.in), 303*1c12ee1eSDan Willemsen } 304*1c12ee1eSDan Willemsen d.consume(size) 305*1c12ee1eSDan Willemsen return tok 306*1c12ee1eSDan Willemsen} 307*1c12ee1eSDan Willemsen 308*1c12ee1eSDan Willemsen// consumeBoolToken constructs a Token for a Bool kind with raw value derived from 309*1c12ee1eSDan Willemsen// current d.in and given size. 310*1c12ee1eSDan Willemsenfunc (d *Decoder) consumeBoolToken(b bool, size int) Token { 311*1c12ee1eSDan Willemsen tok := Token{ 312*1c12ee1eSDan Willemsen kind: Bool, 313*1c12ee1eSDan Willemsen raw: d.in[:size], 314*1c12ee1eSDan Willemsen pos: len(d.orig) - len(d.in), 315*1c12ee1eSDan Willemsen boo: b, 316*1c12ee1eSDan Willemsen } 317*1c12ee1eSDan Willemsen d.consume(size) 318*1c12ee1eSDan Willemsen return tok 319*1c12ee1eSDan Willemsen} 320*1c12ee1eSDan Willemsen 321*1c12ee1eSDan Willemsen// consumeStringToken constructs a Token for a String kind with raw value derived 322*1c12ee1eSDan Willemsen// from current d.in and given size. 323*1c12ee1eSDan Willemsenfunc (d *Decoder) consumeStringToken(s string, size int) Token { 324*1c12ee1eSDan Willemsen tok := Token{ 325*1c12ee1eSDan Willemsen kind: String, 326*1c12ee1eSDan Willemsen raw: d.in[:size], 327*1c12ee1eSDan Willemsen pos: len(d.orig) - len(d.in), 328*1c12ee1eSDan Willemsen str: s, 329*1c12ee1eSDan Willemsen } 330*1c12ee1eSDan Willemsen d.consume(size) 331*1c12ee1eSDan Willemsen return tok 332*1c12ee1eSDan Willemsen} 333*1c12ee1eSDan Willemsen 334*1c12ee1eSDan Willemsen// Clone returns a copy of the Decoder for use in reading ahead the next JSON 335*1c12ee1eSDan Willemsen// object, array or other values without affecting current Decoder. 336*1c12ee1eSDan Willemsenfunc (d *Decoder) Clone() *Decoder { 337*1c12ee1eSDan Willemsen ret := *d 338*1c12ee1eSDan Willemsen ret.openStack = append([]Kind(nil), ret.openStack...) 339*1c12ee1eSDan Willemsen return &ret 340*1c12ee1eSDan Willemsen} 341