xref: /aosp_15_r20/external/licenseclassifier/commentparser/comment_parser.go (revision 46c4c49da23cae783fa41bf46525a6505638499a)
1// Copyright 2017 Google Inc.
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7//     http://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14
15// Package commentparser does a basic parse over a source file and returns all
16// of the comments from the code. This is useful for when you want to analyze
17// text written in comments (like copyright notices) but not in the code
18// itself.
19package commentparser
20
21import (
22	"bytes"
23	"strings"
24	"unicode/utf8"
25
26	"github.com/google/licenseclassifier/commentparser/language"
27)
28
// Message formats for the three ways the input can end in the middle of a
// token: inside a string, a single line comment, or a multiline comment.
// Each format takes one integer argument (%d).
//
// NOTE(review): the integer is presumably a line number, and nothing in the
// code shown here references these constants — confirm against callers
// (e.g. tests) before changing or removing them.
const (
	eofInString            = "%d:EOF in string"
	eofInSingleLineComment = "%d:EOF in single line comment"
	eofInMultilineComment  = "%d:EOF in multiline comment"
)
34
35// Parse parses the input data and returns the comments.
36func Parse(contents []byte, lang language.Language) Comments {
37	if len(contents) == 0 {
38		return nil
39	}
40
41	c := string(contents)
42	if !strings.HasSuffix(c, "\n") {
43		// Force a terminating newline if one isn't present.
44		c += "\n"
45	}
46	i := &input{
47		s:      c,
48		lang:   lang,
49		offset: 0,
50		pos:    position{line: 1, lineRune: []int{0}},
51	}
52	i.lex()
53	return i.comments
54}
55
// Comment is either a single line or multiline comment in a source code file.
// A single line comment has StartLine equal to EndLine. The lines are 1-based.
type Comment struct {
	StartLine int    // Line on which the comment begins (1-based).
	EndLine   int    // Line on which the comment ends (1-based).
	Text      string // Text of the comment as collected by the lexer.
}
63
// Comments allows us to treat a slice of comments as a unit. It is the type
// returned by Parse and carries the helper methods that operate on a whole
// group of comments at once.
type Comments []*Comment
66
67// ChunkIterator returns a read-only channel and generates the comments in a
68// goroutine, then closes the channel.
69func (c Comments) ChunkIterator() <-chan Comments {
70	ch := make(chan Comments)
71	go func() {
72		defer close(ch)
73
74		if len(c) == 0 {
75			return
76		}
77
78		prevChunk := c[0]
79		for index := 0; index < len(c); index++ {
80			var chunk Comments
81			for ; index < len(c); index++ {
82				if c[index].StartLine > prevChunk.StartLine+1 {
83					break
84				}
85				if c[index].StartLine == prevChunk.StartLine+2 {
86					if c[index].StartLine != c[index].EndLine || prevChunk.StartLine != prevChunk.EndLine {
87						break
88					}
89				}
90				chunk = append(chunk, c[index])
91				prevChunk = c[index]
92			}
93			if len(chunk) == 0 {
94				break
95			}
96
97			ch <- chunk
98			if index >= len(c) {
99				break
100			}
101
102			prevChunk = c[index]
103			index--
104		}
105	}()
106	return ch
107}
108
109// StartLine is the line number (1-based) the first part of the comment block
110// starts on.
111func (c Comments) StartLine() int {
112	if len(c) == 0 {
113		return 0
114	}
115	return c[0].StartLine
116}
117
118// String creates a string out of the text of the comments. Comment begin and
119// end markers are removed.
120func (c Comments) String() string {
121	var s []string
122	for _, cmt := range c {
123		s = append(s, cmt.Text)
124	}
125	return strings.Join(s, "\n")
126}
127
// position records the location of a lexeme.
//
// lineRune is maintained as a stack with one entry per line entered so far:
// reading a newline pushes a fresh 0 and reading any other rune increments
// the last entry, so the last entry is always the rune offset within the
// current line. Keeping the earlier entries lets a newline be un-read by
// simply popping the stack.
type position struct {
	line     int   // Line number of input: 1-based
	lineRune []int // Per-line rune counts; last entry is the 0-based rune offset in the current line
}
133
// input holds the current state of the lexer: the text being scanned, how
// far the scan has progressed, and the comments collected so far.
type input struct {
	s        string            // Entire input.
	lang     language.Language // Source code language.
	offset   int               // Byte offset into s of the next unread rune.
	pos      position          // Current line/column position in the input.
	comments Comments          // Comments found so far, in source order.
}
142
// lex scans the whole input and appends every comment it finds to
// i.comments.
//
// Each iteration of the outer loop inspects the next rune. Quote characters
// start a string literal, which is skipped so that comment markers inside it
// are not mistaken for comments; the one exception is a Python triple-quoted
// string that starts at the beginning of a line, which is recorded as a
// comment (a docstring). Any other rune is tested for a multiline and then a
// single line comment start. Whatever the outcome, one more rune is consumed
// at the bottom of the loop before the next iteration.
//
// If the input ends in the middle of a string or comment, lex returns
// immediately and the unfinished token is not recorded.
func (i *input) lex() {
	for {
		c, ok := i.peekRune()
		if !ok {
			break
		}

		switch c {
		case '"', '\'', '`': // String
			// Ignore strings because they could contain comment
			// start or end sequences which we need to ignore.
			if i.lang == language.HTML {
				// Quotes in HTML-like files aren't meaningful,
				// because it's basically plain text
				break
			}

			// Not every language treats every quote character as a
			// string delimiter; when this one is not, leave the
			// switch and let the rune be skipped as ordinary text.
			ok, hasEscape := i.lang.QuoteCharacter(c)
			if !ok {
				break
			}

			var content bytes.Buffer
			isDocString := false
			quote := string(c)
			if i.lang == language.Python {
				// match consumes the three quote runes on success,
				// so a rune offset of exactly 3 afterwards means the
				// opening quotes began at the start of the line.
				if c == '\'' && i.match("'''") {
					quote = "'''"
					// Assume module-level docstrings start at the
					// beginning of a line.  Function docstrings not
					// supported.
					if i.pos.lineRune[len(i.pos.lineRune)-1] == 3 {
						isDocString = true
					}
				} else if c == '"' && i.match(`"""`) {
					quote = `"""`
					if i.pos.lineRune[len(i.pos.lineRune)-1] == 3 {
						isDocString = true
					}
				} else {
					i.readRune() // Eat quote.
				}
			} else {
				i.readRune() // Eat quote.
			}

			// The opening quote has been consumed; scan up to the
			// matching closing quote.
			startLine := i.pos.line
			for {
				c, ok = i.peekRune()
				if !ok {
					return
				}
				if hasEscape && c == '\\' {
					// Drop the backslash; the escaped rune itself is
					// consumed by the readRune below, so an escaped
					// quote cannot terminate the string.
					i.readRune() // Eat escape.
				} else if i.match(quote) {
					// match has consumed the closing quote.
					break
				} else if (i.lang == language.JavaScript || i.lang == language.Perl) && c == '\n' {
					// JavaScript and Perl allow you to
					// specify regexes without quotes, but
					// which contain quotes. So treat the
					// newline as terminating the string.
					break
				}
				c := i.readRune()
				if isDocString {
					content.WriteRune(c)
				}
				if i.eof() {
					return
				}
			}
			if isDocString {
				i.comments = append(i.comments, &Comment{
					StartLine: startLine,
					EndLine:   i.pos.line,
					Text:      content.String(),
				})
			}
		default:
			startLine := i.pos.line
			var comment bytes.Buffer
			// multiLineComment consumes the start marker when it matches.
			if ok, start, end := i.multiLineComment(); ok { // Multiline comment
				nesting := 0
				// Shadows the outer startLine; taken after the start
				// marker has been consumed.
				startLine := i.pos.line
				for {
					if i.eof() {
						return
					}
					// One rune of the body is always consumed before
					// the start/end markers are tested.
					c := i.readRune()
					comment.WriteRune(c)
					if i.lang.NestedComments() && i.match(start) {
						// Allows nested comments.
						comment.WriteString(start)
						nesting++
					}
					if i.match(end) {
						if nesting > 0 {
							// End of an inner comment: keep its marker
							// as part of the text and keep scanning.
							comment.WriteString(end)
							nesting--
						} else {
							break
						}
					}
				}
				i.comments = append(i.comments, &Comment{
					StartLine: startLine,
					EndLine:   i.pos.line,
					Text:      comment.String(),
				})
			} else if i.singleLineComment() { // Single line comment
				for {
					if i.eof() {
						return
					}
					c = i.readRune()
					if c == '\n' {
						// Push the newline back so that EndLine is
						// still the comment's own line; the newline is
						// consumed by the readRune at the bottom of
						// the outer loop.
						i.unreadRune(c)
						break
					}
					comment.WriteRune(c)
				}
				i.comments = append(i.comments, &Comment{
					StartLine: startLine,
					EndLine:   i.pos.line,
					Text:      comment.String(),
				})
			}
		}

		i.readRune() // Ignore non-comments.
	}
}
276
277// singleLineComment returns 'true' if we've run across a single line comment
278// in the given language.
279func (i *input) singleLineComment() bool {
280	if i.match(i.lang.SingleLineCommentStart()) {
281		return true
282	}
283
284	if i.lang == language.SQL {
285		return i.match(language.MySQL.SingleLineCommentStart())
286	} else if i.lang == language.ObjectiveC {
287		return i.match(language.Matlab.SingleLineCommentStart())
288	}
289
290	return false
291}
292
293// multiLineComment returns 'true' if we've run across a multiline comment in
294// the given language.
295func (i *input) multiLineComment() (bool, string, string) {
296	if s := i.lang.MultilineCommentStart(); i.match(s) {
297		return true, s, i.lang.MultilineCommentEnd()
298	}
299
300	if i.lang == language.SQL {
301		if s := language.MySQL.MultilineCommentStart(); i.match(s) {
302			return true, s, language.MySQL.MultilineCommentEnd()
303		}
304	} else if i.lang == language.ObjectiveC {
305		if s := language.Matlab.MultilineCommentStart(); i.match(s) {
306			return true, s, language.Matlab.MultilineCommentEnd()
307		}
308	}
309
310	return false, "", ""
311}
312
313// match returns 'true' if the next tokens in the stream match the given
314// string.
315func (i *input) match(s string) bool {
316	if s == "" {
317		return false
318	}
319	saved := s
320	var read []rune
321	for len(s) > 0 && !i.eof() {
322		r, size := utf8.DecodeRuneInString(s)
323		if c, ok := i.peekRune(); ok && c == r {
324			read = append(read, c)
325		} else {
326			// No match. Push the tokens we read back onto the stack.
327			for idx := len(read) - 1; idx >= 0; idx-- {
328				i.unreadRune(read[idx])
329			}
330			return false
331		}
332		s = s[size:]
333		i.readRune() // Eat token.
334	}
335	return string(read) == saved
336}
337
338// eof reports whether the input has reached the end of the file.
339func (i *input) eof() bool {
340	return len(i.s) <= i.offset
341}
342
343// peekRune returns the next rune in the input without consuming it.
344func (i *input) peekRune() (rune, bool) {
345	if i.eof() {
346		return rune(0), false
347	}
348	r, _ := utf8.DecodeRuneInString(i.s[i.offset:])
349	return r, true
350}
351
352// readRune consumes and returns the next rune in the input.
353func (i *input) readRune() rune {
354	r, size := utf8.DecodeRuneInString(i.s[i.offset:])
355	if r == '\n' {
356		i.pos.line++
357		i.pos.lineRune = append(i.pos.lineRune, 0)
358	} else {
359		i.pos.lineRune[len(i.pos.lineRune)-1]++
360	}
361	i.offset += size
362	return r
363}
364
365// unreadRune winds the lexer's state back to before the rune was read.
366func (i *input) unreadRune(c rune) {
367	p := make([]byte, utf8.UTFMax)
368	size := utf8.EncodeRune(p, c)
369	i.offset -= size
370	if c == '\n' {
371		i.pos.line--
372		if len(i.pos.lineRune) > 1 {
373			i.pos.lineRune = i.pos.lineRune[:len(i.pos.lineRune)-1]
374		} else {
375			i.pos.lineRune[len(i.pos.lineRune)-1] = 0
376		}
377	} else {
378		i.pos.lineRune[len(i.pos.lineRune)-1]--
379	}
380}
381