licenseclassifier/commentparser/comment_parser.go

// Copyright 2017 Google Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// Package commentparser does a basic parse over a source file and returns all
// of the comments from the code. This is useful for when you want to analyze
// text written in comments (like copyright notices) but not in the code
// itself.
package commentparser

import (
	"bytes"
	"strings"
	"unicode/utf8"

	"github.com/google/licenseclassifier/commentparser/language"
)

// Message formats for running out of input in the middle of a construct. Each
// takes a single integer argument (%d), presumably the line number — TODO
// confirm.
// NOTE(review): none of these is referenced in this file; check for users
// elsewhere in the package before removing them.
const (
	eofInString            = "%d:EOF in string"
	eofInSingleLineComment = "%d:EOF in single line comment"
	eofInMultilineComment  = "%d:EOF in multiline comment"
)

// Parse lexes contents as source code written in lang and returns the comments
// found in it, in the order they appear. Empty input yields nil.
func Parse(contents []byte, lang language.Language) Comments {
	if len(contents) == 0 {
		return nil
	}

	// The lexer expects the input to end with a newline; add one if the file
	// does not already have it.
	text := string(contents)
	if text[len(text)-1] != '\n' {
		text += "\n"
	}

	lexer := input{
		s:    text,
		lang: lang,
		pos:  position{line: 1, lineRune: []int{0}},
	}
	lexer.lex()
	return lexer.comments
}

// Comment is either a single line or multiline comment in a source code file.
// A single line comment has StartLine equal to EndLine. The lines are 1-based.
type Comment struct {
	StartLine int    // Line on which the comment begins.
	EndLine   int    // Line on which the comment ends.
	Text      string // Comment body, without the begin and end markers.
}

// Comments allows us to treat a slice of comments as a unit.
type Comments []*Comment

// ChunkIterator returns a read-only channel on which the comments are
// delivered, in order, grouped into chunks of neighbouring comments. A comment
// joins the chunk of the comment before it when its StartLine is no more than
// one greater than that comment's StartLine, or exactly two greater (one line
// in between) provided both comments are single line. Anything further apart
// starts a new chunk.
//
// The chunks are produced by a goroutine which closes the channel once every
// comment has been sent. The channel is unbuffered, so the caller must drain
// it; otherwise the goroutine stays blocked on its next send.
func (c Comments) ChunkIterator() <-chan Comments {
	ch := make(chan Comments)
	go func() {
		defer close(ch)

		// sameChunk reports whether cur continues the chunk that prev is in.
		sameChunk := func(prev, cur *Comment) bool {
			switch gap := cur.StartLine - prev.StartLine; {
			case gap > 2:
				return false
			case gap == 2:
				// A one line gap is only bridged between two single line
				// comments.
				return cur.StartLine == cur.EndLine && prev.StartLine == prev.EndLine
			default:
				return true
			}
		}

		var chunk Comments
		for idx, cur := range c {
			if idx > 0 && !sameChunk(c[idx-1], cur) {
				ch <- chunk
				// Start a fresh slice so the chunk already handed to the
				// receiver is never appended to.
				chunk = nil
			}
			chunk = append(chunk, cur)
		}
		if len(chunk) > 0 {
			ch <- chunk
		}
	}()
	return ch
}

// StartLine is the line number (1-based) the first part of the comment block
// starts on. It is 0 when there are no comments.
func (c Comments) StartLine() int {
	if len(c) == 0 {
		return 0
	}
	return c[0].StartLine
}

// String creates a string out of the text of the comments, one comment after
// another separated by newlines. Comment begin and end markers are removed.
func (c Comments) String() string {
	texts := make([]string, len(c))
	for idx, cmt := range c {
		texts[idx] = cmt.Text
	}
	return strings.Join(texts, "\n")
}

// position records the location of a lexeme.
//
// lineRune is maintained as a stack by readRune and unreadRune: reading a
// newline pushes a fresh 0 entry, un-reading a newline pops it, and any other
// rune increments or decrements the last entry. The last entry is therefore
// the rune column on the current line.
type position struct {
	line     int   // Line number of input: 1-based
	lineRune []int // Rune offset from beginning of line: 0-based
}

// input holds the current state of the lexer. Parse creates one per call and
// lex fills in comments as it advances offset through s.
type input struct {
	s        string            // Entire input.
	lang     language.Language // Source code language.
	offset   int               // Offset into input, in bytes.
	pos      position          // Current position in the input.
	comments Comments          // Comments in the source file.
}

// lex scans the whole input and appends every comment it finds to i.comments.
//
// Each iteration looks at the next rune:
//   - a quote character starts a string literal, which is skipped so that
//     comment markers inside it are not misread (a Python triple-quoted string
//     whose opening quotes end at column 3 is additionally recorded as a
//     comment);
//   - anything else is tested for a multiline and then a single line comment
//     start marker.
//
// One rune is then consumed unconditionally before the next iteration. lex
// returns early, without recording the construct in progress, if the input
// ends inside a string or comment.
func (i *input) lex() {
	for {
		c, ok := i.peekRune()
		if !ok {
			break
		}

		switch c {
		case '"', '\'', '`': // String
			// Ignore strings because they could contain comment
			// start or end sequences which we need to ignore.
			if i.lang == language.HTML {
				// Quotes in HTML-like files aren't meaningful,
				// because it's basically plain text
				break
			}

			// ok says whether c is treated as a string delimiter in this
			// language; hasEscape is used below to decide whether a
			// backslash escapes the rune after it. This ok shadows the
			// outer one for the rest of the case.
			ok, hasEscape := i.lang.QuoteCharacter(c)
			if !ok {
				break
			}

			var content bytes.Buffer
			isDocString := false
			quote := string(c)
			if i.lang == language.Python {
				// A successful match consumes all three quote runes, so a
				// column of 3 afterwards means they began at column 0.
				if c == '\'' && i.match("'''") {
					quote = "'''"
					// Assume module-level docstrings start at the
					// beginning of a line.  Function docstrings not
					// supported.
					if i.pos.lineRune[len(i.pos.lineRune)-1] == 3 {
						isDocString = true
					}
				} else if c == '"' && i.match(`"""`) {
					quote = `"""`
					if i.pos.lineRune[len(i.pos.lineRune)-1] == 3 {
						isDocString = true
					}
				} else {
					i.readRune() // Eat quote.
				}
			} else {
				i.readRune() // Eat quote.
			}

			// Skip to the closing quote, collecting the text only when it
			// is a docstring.
			startLine := i.pos.line
			for {
				c, ok = i.peekRune()
				if !ok {
					return
				}
				if hasEscape && c == '\\' {
					// The backslash is dropped; the escaped rune is
					// consumed by the readRune below, so an escaped quote
					// cannot end the string.
					i.readRune() // Eat escape.
				} else if i.match(quote) {
					// match has already consumed the closing quote.
					break
				} else if (i.lang == language.JavaScript || i.lang == language.Perl) && c == '\n' {
					// JavaScript and Perl allow you to
					// specify regexes without quotes, but
					// which contain quotes. So treat the
					// newline as terminating the string.
					break
				}
				c := i.readRune()
				if isDocString {
					content.WriteRune(c)
				}
				if i.eof() {
					return
				}
			}
			if isDocString {
				i.comments = append(i.comments, &Comment{
					StartLine: startLine,
					EndLine:   i.pos.line,
					Text:      content.String(),
				})
			}
		default:
			startLine := i.pos.line
			var comment bytes.Buffer
			if ok, start, end := i.multiLineComment(); ok { // Multiline comment
				// The start marker has been consumed by the match above.
				// nesting counts start markers seen inside the comment
				// that still await their end marker.
				nesting := 0
				startLine := i.pos.line
				for {
					if i.eof() {
						return
					}
					// NOTE(review): a rune is always consumed before the
					// end marker is tested, so an end marker that directly
					// follows the start marker is not recognised here.
					c := i.readRune()
					comment.WriteRune(c)
					if i.lang.NestedComments() && i.match(start) {
						// Allows nested comments.
						comment.WriteString(start)
						nesting++
					}
					if i.match(end) {
						if nesting > 0 {
							// Closes a nested comment: keep the marker
							// as part of the text and carry on.
							comment.WriteString(end)
							nesting--
						} else {
							break
						}
					}
				}
				i.comments = append(i.comments, &Comment{
					StartLine: startLine,
					EndLine:   i.pos.line,
					Text:      comment.String(),
				})
			} else if i.singleLineComment() { // Single line comment
				// Collect everything up to, but not including, the newline.
				for {
					if i.eof() {
						return
					}
					c = i.readRune()
					if c == '\n' {
						// Put the newline back so that EndLine below is
						// still the comment's own line; the readRune at
						// the bottom of the loop consumes it.
						i.unreadRune(c)
						break
					}
					comment.WriteRune(c)
				}
				i.comments = append(i.comments, &Comment{
					StartLine: startLine,
					EndLine:   i.pos.line,
					Text:      comment.String(),
				})
			}
		}

		// NOTE(review): this runs after every case. When a string or
		// multiline comment was just closed, its end delimiter has already
		// been consumed by match, so the rune following the delimiter is
		// skipped here without being examined.
		i.readRune() // Ignore non-comments.
	}
}

// singleLineComment reports whether the input is positioned at a single line
// comment start marker for the current language, consuming the marker if so.
// SQL input is also tested against the MySQL marker and Objective-C input
// against the Matlab marker.
func (i *input) singleLineComment() bool {
	if i.match(i.lang.SingleLineCommentStart()) {
		return true
	}

	switch i.lang {
	case language.SQL:
		return i.match(language.MySQL.SingleLineCommentStart())
	case language.ObjectiveC:
		return i.match(language.Matlab.SingleLineCommentStart())
	}
	return false
}

// multiLineComment reports whether the input is positioned at a multiline
// comment start marker for the current language, consuming the marker if so.
// On success it also returns the start marker that matched and the end marker
// that goes with it. SQL input is also tested against the MySQL markers and
// Objective-C input against the Matlab markers.
func (i *input) multiLineComment() (bool, string, string) {
	if start := i.lang.MultilineCommentStart(); i.match(start) {
		return true, start, i.lang.MultilineCommentEnd()
	}

	switch i.lang {
	case language.SQL:
		if start := language.MySQL.MultilineCommentStart(); i.match(start) {
			return true, start, language.MySQL.MultilineCommentEnd()
		}
	case language.ObjectiveC:
		if start := language.Matlab.MultilineCommentStart(); i.match(start) {
			return true, start, language.Matlab.MultilineCommentEnd()
		}
	}
	return false, "", ""
}

// match reports whether the upcoming input is exactly s. On success the
// matched runes are consumed. On a mismatch every rune consumed so far is
// pushed back, leaving the input where it was. An empty s never matches.
//
// If the input runs out before all of s has been compared, match returns
// false and the runes consumed up to that point are not pushed back.
func (i *input) match(s string) bool {
	if len(s) == 0 {
		return false
	}

	var consumed []rune
	for rest := s; rest != "" && !i.eof(); {
		want, width := utf8.DecodeRuneInString(rest)
		got, ok := i.peekRune()
		if !ok || got != want {
			// Mismatch: rewind in reverse order of reading.
			for k := len(consumed) - 1; k >= 0; k-- {
				i.unreadRune(consumed[k])
			}
			return false
		}
		consumed = append(consumed, got)
		rest = rest[width:]
		i.readRune() // Step past the rune just matched.
	}
	return string(consumed) == s
}

// eof reports whether the read offset is at or beyond the end of the input.
func (i *input) eof() bool {
	return i.offset >= len(i.s)
}

// peekRune decodes the rune at the current offset without advancing. The
// boolean is false, and the rune 0, when the input is exhausted.
func (i *input) peekRune() (rune, bool) {
	if !i.eof() {
		next, _ := utf8.DecodeRuneInString(i.s[i.offset:])
		return next, true
	}
	return 0, false
}

// readRune decodes the rune at the current offset, advances past it and
// returns it. The position is updated as well: a newline moves to the next
// line and opens a new column counter at 0, any other rune bumps the column
// counter of the current line.
func (i *input) readRune() rune {
	r, width := utf8.DecodeRuneInString(i.s[i.offset:])
	switch r {
	case '\n':
		i.pos.line++
		i.pos.lineRune = append(i.pos.lineRune, 0)
	default:
		i.pos.lineRune[len(i.pos.lineRune)-1]++
	}
	i.offset += width
	return r
}

// unreadRune undoes a readRune of c: the offset moves back by the encoded
// width of c and the position is restored. For a newline the line number
// drops by one and the column counter of the abandoned line is discarded (or
// reset to 0 if it is the only one); for any other rune the current column
// counter is decremented.
func (i *input) unreadRune(c rune) {
	var scratch [utf8.UTFMax]byte
	i.offset -= utf8.EncodeRune(scratch[:], c)

	last := len(i.pos.lineRune) - 1
	if c != '\n' {
		i.pos.lineRune[last]--
		return
	}

	i.pos.line--
	if last > 0 {
		i.pos.lineRune = i.pos.lineRune[:last]
		return
	}
	i.pos.lineRune[last] = 0
}