1// Copyright 2016 The Go Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style
3// license that can be found in the LICENSE file.
4
5// This file implements scanner, a lexical tokenizer for
6// Go source. After initialization, consecutive calls of
7// next advance the scanner one token at a time.
8//
9// This file, source.go, tokens.go, and token_string.go are self-contained
10// (`go tool compile scanner.go source.go tokens.go token_string.go` compiles)
11// and thus could be made into their own package.
12
13package syntax
14
15import (
16	"fmt"
17	"io"
18	"unicode"
19	"unicode/utf8"
20)
21
22// The mode flags below control which comments are reported
23// by calling the error handler. If no flag is set, comments
24// are ignored.
25const (
26	comments   uint = 1 << iota // call handler for all comments
27	directives                  // call handler for directives only
28)
29
30type scanner struct {
31	source
32	mode   uint
33	nlsemi bool // if set '\n' and EOF translate to ';'
34
35	// current token, valid after calling next()
36	line, col uint
37	blank     bool // line is blank up to col
38	tok       token
39	lit       string   // valid if tok is _Name, _Literal, or _Semi ("semicolon", "newline", or "EOF"); may be malformed if bad is true
40	bad       bool     // valid if tok is _Literal, true if a syntax error occurred, lit may be malformed
41	kind      LitKind  // valid if tok is _Literal
42	op        Operator // valid if tok is _Operator, _Star, _AssignOp, or _IncOp
43	prec      int      // valid if tok is _Operator, _Star, _AssignOp, or _IncOp
44}
45
46func (s *scanner) init(src io.Reader, errh func(line, col uint, msg string), mode uint) {
47	s.source.init(src, errh)
48	s.mode = mode
49	s.nlsemi = false
50}
51
52// errorf reports an error at the most recently read character position.
53func (s *scanner) errorf(format string, args ...interface{}) {
54	s.error(fmt.Sprintf(format, args...))
55}
56
57// errorAtf reports an error at a byte column offset relative to the current token start.
58func (s *scanner) errorAtf(offset int, format string, args ...interface{}) {
59	s.errh(s.line, s.col+uint(offset), fmt.Sprintf(format, args...))
60}
61
62// setLit sets the scanner state for a recognized _Literal token.
63func (s *scanner) setLit(kind LitKind, ok bool) {
64	s.nlsemi = true
65	s.tok = _Literal
66	s.lit = string(s.segment())
67	s.bad = !ok
68	s.kind = kind
69}
70
71// next advances the scanner by reading the next token.
72//
73// If a read, source encoding, or lexical error occurs, next calls
74// the installed error handler with the respective error position
75// and message. The error message is guaranteed to be non-empty and
76// never starts with a '/'. The error handler must exist.
77//
78// If the scanner mode includes the comments flag and a comment
79// (including comments containing directives) is encountered, the
80// error handler is also called with each comment position and text
81// (including opening /* or // and closing */, but without a newline
82// at the end of line comments). Comment text always starts with a /
83// which can be used to distinguish these handler calls from errors.
84//
85// If the scanner mode includes the directives (but not the comments)
86// flag, only comments containing a //line, /*line, or //go: directive
87// are reported, in the same way as regular comments.
88func (s *scanner) next() {
89	nlsemi := s.nlsemi
90	s.nlsemi = false
91
92redo:
93	// skip white space
94	s.stop()
95	startLine, startCol := s.pos()
96	for s.ch == ' ' || s.ch == '\t' || s.ch == '\n' && !nlsemi || s.ch == '\r' {
97		s.nextch()
98	}
99
100	// token start
101	s.line, s.col = s.pos()
102	s.blank = s.line > startLine || startCol == colbase
103	s.start()
104	if isLetter(s.ch) || s.ch >= utf8.RuneSelf && s.atIdentChar(true) {
105		s.nextch()
106		s.ident()
107		return
108	}
109
110	switch s.ch {
111	case -1:
112		if nlsemi {
113			s.lit = "EOF"
114			s.tok = _Semi
115			break
116		}
117		s.tok = _EOF
118
119	case '\n':
120		s.nextch()
121		s.lit = "newline"
122		s.tok = _Semi
123
124	case '0', '1', '2', '3', '4', '5', '6', '7', '8', '9':
125		s.number(false)
126
127	case '"':
128		s.stdString()
129
130	case '`':
131		s.rawString()
132
133	case '\'':
134		s.rune()
135
136	case '(':
137		s.nextch()
138		s.tok = _Lparen
139
140	case '[':
141		s.nextch()
142		s.tok = _Lbrack
143
144	case '{':
145		s.nextch()
146		s.tok = _Lbrace
147
148	case ',':
149		s.nextch()
150		s.tok = _Comma
151
152	case ';':
153		s.nextch()
154		s.lit = "semicolon"
155		s.tok = _Semi
156
157	case ')':
158		s.nextch()
159		s.nlsemi = true
160		s.tok = _Rparen
161
162	case ']':
163		s.nextch()
164		s.nlsemi = true
165		s.tok = _Rbrack
166
167	case '}':
168		s.nextch()
169		s.nlsemi = true
170		s.tok = _Rbrace
171
172	case ':':
173		s.nextch()
174		if s.ch == '=' {
175			s.nextch()
176			s.tok = _Define
177			break
178		}
179		s.tok = _Colon
180
181	case '.':
182		s.nextch()
183		if isDecimal(s.ch) {
184			s.number(true)
185			break
186		}
187		if s.ch == '.' {
188			s.nextch()
189			if s.ch == '.' {
190				s.nextch()
191				s.tok = _DotDotDot
192				break
193			}
194			s.rewind() // now s.ch holds 1st '.'
195			s.nextch() // consume 1st '.' again
196		}
197		s.tok = _Dot
198
199	case '+':
200		s.nextch()
201		s.op, s.prec = Add, precAdd
202		if s.ch != '+' {
203			goto assignop
204		}
205		s.nextch()
206		s.nlsemi = true
207		s.tok = _IncOp
208
209	case '-':
210		s.nextch()
211		s.op, s.prec = Sub, precAdd
212		if s.ch != '-' {
213			goto assignop
214		}
215		s.nextch()
216		s.nlsemi = true
217		s.tok = _IncOp
218
219	case '*':
220		s.nextch()
221		s.op, s.prec = Mul, precMul
222		// don't goto assignop - want _Star token
223		if s.ch == '=' {
224			s.nextch()
225			s.tok = _AssignOp
226			break
227		}
228		s.tok = _Star
229
230	case '/':
231		s.nextch()
232		if s.ch == '/' {
233			s.nextch()
234			s.lineComment()
235			goto redo
236		}
237		if s.ch == '*' {
238			s.nextch()
239			s.fullComment()
240			if line, _ := s.pos(); line > s.line && nlsemi {
241				// A multi-line comment acts like a newline;
242				// it translates to a ';' if nlsemi is set.
243				s.lit = "newline"
244				s.tok = _Semi
245				break
246			}
247			goto redo
248		}
249		s.op, s.prec = Div, precMul
250		goto assignop
251
252	case '%':
253		s.nextch()
254		s.op, s.prec = Rem, precMul
255		goto assignop
256
257	case '&':
258		s.nextch()
259		if s.ch == '&' {
260			s.nextch()
261			s.op, s.prec = AndAnd, precAndAnd
262			s.tok = _Operator
263			break
264		}
265		s.op, s.prec = And, precMul
266		if s.ch == '^' {
267			s.nextch()
268			s.op = AndNot
269		}
270		goto assignop
271
272	case '|':
273		s.nextch()
274		if s.ch == '|' {
275			s.nextch()
276			s.op, s.prec = OrOr, precOrOr
277			s.tok = _Operator
278			break
279		}
280		s.op, s.prec = Or, precAdd
281		goto assignop
282
283	case '^':
284		s.nextch()
285		s.op, s.prec = Xor, precAdd
286		goto assignop
287
288	case '<':
289		s.nextch()
290		if s.ch == '=' {
291			s.nextch()
292			s.op, s.prec = Leq, precCmp
293			s.tok = _Operator
294			break
295		}
296		if s.ch == '<' {
297			s.nextch()
298			s.op, s.prec = Shl, precMul
299			goto assignop
300		}
301		if s.ch == '-' {
302			s.nextch()
303			s.tok = _Arrow
304			break
305		}
306		s.op, s.prec = Lss, precCmp
307		s.tok = _Operator
308
309	case '>':
310		s.nextch()
311		if s.ch == '=' {
312			s.nextch()
313			s.op, s.prec = Geq, precCmp
314			s.tok = _Operator
315			break
316		}
317		if s.ch == '>' {
318			s.nextch()
319			s.op, s.prec = Shr, precMul
320			goto assignop
321		}
322		s.op, s.prec = Gtr, precCmp
323		s.tok = _Operator
324
325	case '=':
326		s.nextch()
327		if s.ch == '=' {
328			s.nextch()
329			s.op, s.prec = Eql, precCmp
330			s.tok = _Operator
331			break
332		}
333		s.tok = _Assign
334
335	case '!':
336		s.nextch()
337		if s.ch == '=' {
338			s.nextch()
339			s.op, s.prec = Neq, precCmp
340			s.tok = _Operator
341			break
342		}
343		s.op, s.prec = Not, 0
344		s.tok = _Operator
345
346	case '~':
347		s.nextch()
348		s.op, s.prec = Tilde, 0
349		s.tok = _Operator
350
351	default:
352		s.errorf("invalid character %#U", s.ch)
353		s.nextch()
354		goto redo
355	}
356
357	return
358
359assignop:
360	if s.ch == '=' {
361		s.nextch()
362		s.tok = _AssignOp
363		return
364	}
365	s.tok = _Operator
366}
367
368func (s *scanner) ident() {
369	// accelerate common case (7bit ASCII)
370	for isLetter(s.ch) || isDecimal(s.ch) {
371		s.nextch()
372	}
373
374	// general case
375	if s.ch >= utf8.RuneSelf {
376		for s.atIdentChar(false) {
377			s.nextch()
378		}
379	}
380
381	// possibly a keyword
382	lit := s.segment()
383	if len(lit) >= 2 {
384		if tok := keywordMap[hash(lit)]; tok != 0 && tokStrFast(tok) == string(lit) {
385			s.nlsemi = contains(1<<_Break|1<<_Continue|1<<_Fallthrough|1<<_Return, tok)
386			s.tok = tok
387			return
388		}
389	}
390
391	s.nlsemi = true
392	s.lit = string(lit)
393	s.tok = _Name
394}
395
396// tokStrFast is a faster version of token.String, which assumes that tok
397// is one of the valid tokens - and can thus skip bounds checks.
398func tokStrFast(tok token) string {
399	return _token_name[_token_index[tok-1]:_token_index[tok]]
400}
401
402func (s *scanner) atIdentChar(first bool) bool {
403	switch {
404	case unicode.IsLetter(s.ch) || s.ch == '_':
405		// ok
406	case unicode.IsDigit(s.ch):
407		if first {
408			s.errorf("identifier cannot begin with digit %#U", s.ch)
409		}
410	case s.ch >= utf8.RuneSelf:
411		s.errorf("invalid character %#U in identifier", s.ch)
412	default:
413		return false
414	}
415	return true
416}
417
418// hash is a perfect hash function for keywords.
419// It assumes that s has at least length 2.
420func hash(s []byte) uint {
421	return (uint(s[0])<<4 ^ uint(s[1]) + uint(len(s))) & uint(len(keywordMap)-1)
422}
423
424var keywordMap [1 << 6]token // size must be power of two
425
426func init() {
427	// populate keywordMap
428	for tok := _Break; tok <= _Var; tok++ {
429		h := hash([]byte(tok.String()))
430		if keywordMap[h] != 0 {
431			panic("imperfect hash")
432		}
433		keywordMap[h] = tok
434	}
435}
436
437func lower(ch rune) rune     { return ('a' - 'A') | ch } // returns lower-case ch iff ch is ASCII letter
438func isLetter(ch rune) bool  { return 'a' <= lower(ch) && lower(ch) <= 'z' || ch == '_' }
439func isDecimal(ch rune) bool { return '0' <= ch && ch <= '9' }
440func isHex(ch rune) bool     { return '0' <= ch && ch <= '9' || 'a' <= lower(ch) && lower(ch) <= 'f' }
441
442// digits accepts the sequence { digit | '_' }.
443// If base <= 10, digits accepts any decimal digit but records
444// the index (relative to the literal start) of a digit >= base
445// in *invalid, if *invalid < 0.
446// digits returns a bitset describing whether the sequence contained
447// digits (bit 0 is set), or separators '_' (bit 1 is set).
448func (s *scanner) digits(base int, invalid *int) (digsep int) {
449	if base <= 10 {
450		max := rune('0' + base)
451		for isDecimal(s.ch) || s.ch == '_' {
452			ds := 1
453			if s.ch == '_' {
454				ds = 2
455			} else if s.ch >= max && *invalid < 0 {
456				_, col := s.pos()
457				*invalid = int(col - s.col) // record invalid rune index
458			}
459			digsep |= ds
460			s.nextch()
461		}
462	} else {
463		for isHex(s.ch) || s.ch == '_' {
464			ds := 1
465			if s.ch == '_' {
466				ds = 2
467			}
468			digsep |= ds
469			s.nextch()
470		}
471	}
472	return
473}
474
475func (s *scanner) number(seenPoint bool) {
476	ok := true
477	kind := IntLit
478	base := 10        // number base
479	prefix := rune(0) // one of 0 (decimal), '0' (0-octal), 'x', 'o', or 'b'
480	digsep := 0       // bit 0: digit present, bit 1: '_' present
481	invalid := -1     // index of invalid digit in literal, or < 0
482
483	// integer part
484	if !seenPoint {
485		if s.ch == '0' {
486			s.nextch()
487			switch lower(s.ch) {
488			case 'x':
489				s.nextch()
490				base, prefix = 16, 'x'
491			case 'o':
492				s.nextch()
493				base, prefix = 8, 'o'
494			case 'b':
495				s.nextch()
496				base, prefix = 2, 'b'
497			default:
498				base, prefix = 8, '0'
499				digsep = 1 // leading 0
500			}
501		}
502		digsep |= s.digits(base, &invalid)
503		if s.ch == '.' {
504			if prefix == 'o' || prefix == 'b' {
505				s.errorf("invalid radix point in %s literal", baseName(base))
506				ok = false
507			}
508			s.nextch()
509			seenPoint = true
510		}
511	}
512
513	// fractional part
514	if seenPoint {
515		kind = FloatLit
516		digsep |= s.digits(base, &invalid)
517	}
518
519	if digsep&1 == 0 && ok {
520		s.errorf("%s literal has no digits", baseName(base))
521		ok = false
522	}
523
524	// exponent
525	if e := lower(s.ch); e == 'e' || e == 'p' {
526		if ok {
527			switch {
528			case e == 'e' && prefix != 0 && prefix != '0':
529				s.errorf("%q exponent requires decimal mantissa", s.ch)
530				ok = false
531			case e == 'p' && prefix != 'x':
532				s.errorf("%q exponent requires hexadecimal mantissa", s.ch)
533				ok = false
534			}
535		}
536		s.nextch()
537		kind = FloatLit
538		if s.ch == '+' || s.ch == '-' {
539			s.nextch()
540		}
541		digsep = s.digits(10, nil) | digsep&2 // don't lose sep bit
542		if digsep&1 == 0 && ok {
543			s.errorf("exponent has no digits")
544			ok = false
545		}
546	} else if prefix == 'x' && kind == FloatLit && ok {
547		s.errorf("hexadecimal mantissa requires a 'p' exponent")
548		ok = false
549	}
550
551	// suffix 'i'
552	if s.ch == 'i' {
553		kind = ImagLit
554		s.nextch()
555	}
556
557	s.setLit(kind, ok) // do this now so we can use s.lit below
558
559	if kind == IntLit && invalid >= 0 && ok {
560		s.errorAtf(invalid, "invalid digit %q in %s literal", s.lit[invalid], baseName(base))
561		ok = false
562	}
563
564	if digsep&2 != 0 && ok {
565		if i := invalidSep(s.lit); i >= 0 {
566			s.errorAtf(i, "'_' must separate successive digits")
567			ok = false
568		}
569	}
570
571	s.bad = !ok // correct s.bad
572}
573
574func baseName(base int) string {
575	switch base {
576	case 2:
577		return "binary"
578	case 8:
579		return "octal"
580	case 10:
581		return "decimal"
582	case 16:
583		return "hexadecimal"
584	}
585	panic("invalid base")
586}
587
588// invalidSep returns the index of the first invalid separator in x, or -1.
589func invalidSep(x string) int {
590	x1 := ' ' // prefix char, we only care if it's 'x'
591	d := '.'  // digit, one of '_', '0' (a digit), or '.' (anything else)
592	i := 0
593
594	// a prefix counts as a digit
595	if len(x) >= 2 && x[0] == '0' {
596		x1 = lower(rune(x[1]))
597		if x1 == 'x' || x1 == 'o' || x1 == 'b' {
598			d = '0'
599			i = 2
600		}
601	}
602
603	// mantissa and exponent
604	for ; i < len(x); i++ {
605		p := d // previous digit
606		d = rune(x[i])
607		switch {
608		case d == '_':
609			if p != '0' {
610				return i
611			}
612		case isDecimal(d) || x1 == 'x' && isHex(d):
613			d = '0'
614		default:
615			if p == '_' {
616				return i - 1
617			}
618			d = '.'
619		}
620	}
621	if d == '_' {
622		return len(x) - 1
623	}
624
625	return -1
626}
627
628func (s *scanner) rune() {
629	ok := true
630	s.nextch()
631
632	n := 0
633	for ; ; n++ {
634		if s.ch == '\'' {
635			if ok {
636				if n == 0 {
637					s.errorf("empty rune literal or unescaped '")
638					ok = false
639				} else if n != 1 {
640					s.errorAtf(0, "more than one character in rune literal")
641					ok = false
642				}
643			}
644			s.nextch()
645			break
646		}
647		if s.ch == '\\' {
648			s.nextch()
649			if !s.escape('\'') {
650				ok = false
651			}
652			continue
653		}
654		if s.ch == '\n' {
655			if ok {
656				s.errorf("newline in rune literal")
657				ok = false
658			}
659			break
660		}
661		if s.ch < 0 {
662			if ok {
663				s.errorAtf(0, "rune literal not terminated")
664				ok = false
665			}
666			break
667		}
668		s.nextch()
669	}
670
671	s.setLit(RuneLit, ok)
672}
673
674func (s *scanner) stdString() {
675	ok := true
676	s.nextch()
677
678	for {
679		if s.ch == '"' {
680			s.nextch()
681			break
682		}
683		if s.ch == '\\' {
684			s.nextch()
685			if !s.escape('"') {
686				ok = false
687			}
688			continue
689		}
690		if s.ch == '\n' {
691			s.errorf("newline in string")
692			ok = false
693			break
694		}
695		if s.ch < 0 {
696			s.errorAtf(0, "string not terminated")
697			ok = false
698			break
699		}
700		s.nextch()
701	}
702
703	s.setLit(StringLit, ok)
704}
705
706func (s *scanner) rawString() {
707	ok := true
708	s.nextch()
709
710	for {
711		if s.ch == '`' {
712			s.nextch()
713			break
714		}
715		if s.ch < 0 {
716			s.errorAtf(0, "string not terminated")
717			ok = false
718			break
719		}
720		s.nextch()
721	}
722	// We leave CRs in the string since they are part of the
723	// literal (even though they are not part of the literal
724	// value).
725
726	s.setLit(StringLit, ok)
727}
728
729func (s *scanner) comment(text string) {
730	s.errorAtf(0, "%s", text)
731}
732
733func (s *scanner) skipLine() {
734	// don't consume '\n' - needed for nlsemi logic
735	for s.ch >= 0 && s.ch != '\n' {
736		s.nextch()
737	}
738}
739
740func (s *scanner) lineComment() {
741	// opening has already been consumed
742
743	if s.mode&comments != 0 {
744		s.skipLine()
745		s.comment(string(s.segment()))
746		return
747	}
748
749	// are we saving directives? or is this definitely not a directive?
750	if s.mode&directives == 0 || (s.ch != 'g' && s.ch != 'l') {
751		s.stop()
752		s.skipLine()
753		return
754	}
755
756	// recognize go: or line directives
757	prefix := "go:"
758	if s.ch == 'l' {
759		prefix = "line "
760	}
761	for _, m := range prefix {
762		if s.ch != m {
763			s.stop()
764			s.skipLine()
765			return
766		}
767		s.nextch()
768	}
769
770	// directive text
771	s.skipLine()
772	s.comment(string(s.segment()))
773}
774
775func (s *scanner) skipComment() bool {
776	for s.ch >= 0 {
777		for s.ch == '*' {
778			s.nextch()
779			if s.ch == '/' {
780				s.nextch()
781				return true
782			}
783		}
784		s.nextch()
785	}
786	s.errorAtf(0, "comment not terminated")
787	return false
788}
789
790func (s *scanner) fullComment() {
791	/* opening has already been consumed */
792
793	if s.mode&comments != 0 {
794		if s.skipComment() {
795			s.comment(string(s.segment()))
796		}
797		return
798	}
799
800	if s.mode&directives == 0 || s.ch != 'l' {
801		s.stop()
802		s.skipComment()
803		return
804	}
805
806	// recognize line directive
807	const prefix = "line "
808	for _, m := range prefix {
809		if s.ch != m {
810			s.stop()
811			s.skipComment()
812			return
813		}
814		s.nextch()
815	}
816
817	// directive text
818	if s.skipComment() {
819		s.comment(string(s.segment()))
820	}
821}
822
823func (s *scanner) escape(quote rune) bool {
824	var n int
825	var base, max uint32
826
827	switch s.ch {
828	case quote, 'a', 'b', 'f', 'n', 'r', 't', 'v', '\\':
829		s.nextch()
830		return true
831	case '0', '1', '2', '3', '4', '5', '6', '7':
832		n, base, max = 3, 8, 255
833	case 'x':
834		s.nextch()
835		n, base, max = 2, 16, 255
836	case 'u':
837		s.nextch()
838		n, base, max = 4, 16, unicode.MaxRune
839	case 'U':
840		s.nextch()
841		n, base, max = 8, 16, unicode.MaxRune
842	default:
843		if s.ch < 0 {
844			return true // complain in caller about EOF
845		}
846		s.errorf("unknown escape")
847		return false
848	}
849
850	var x uint32
851	for i := n; i > 0; i-- {
852		if s.ch < 0 {
853			return true // complain in caller about EOF
854		}
855		d := base
856		if isDecimal(s.ch) {
857			d = uint32(s.ch) - '0'
858		} else if 'a' <= lower(s.ch) && lower(s.ch) <= 'f' {
859			d = uint32(lower(s.ch)) - 'a' + 10
860		}
861		if d >= base {
862			s.errorf("invalid character %q in %s escape", s.ch, baseName(int(base)))
863			return false
864		}
865		// d < base
866		x = x*base + d
867		s.nextch()
868	}
869
870	if x > max && base == 8 {
871		s.errorf("octal escape value %d > 255", x)
872		return false
873	}
874
875	if x > max || 0xD800 <= x && x < 0xE000 /* surrogate range */ {
876		s.errorf("escape is invalid Unicode code point %#U", x)
877		return false
878	}
879
880	return true
881}
882