1// Copyright 2017 Google Inc. 2// 3// Licensed under the Apache License, Version 2.0 (the "License"); 4// you may not use this file except in compliance with the License. 5// You may obtain a copy of the License at 6// 7// http://www.apache.org/licenses/LICENSE-2.0 8// 9// Unless required by applicable law or agreed to in writing, software 10// distributed under the License is distributed on an "AS IS" BASIS, 11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12// See the License for the specific language governing permissions and 13// limitations under the License. 14 15// Package commentparser does a basic parse over a source file and returns all 16// of the comments from the code. This is useful for when you want to analyze 17// text written in comments (like copyright notices) but not in the code 18// itself. 19package commentparser 20 21import ( 22 "bytes" 23 "strings" 24 "unicode/utf8" 25 26 "github.com/google/licenseclassifier/commentparser/language" 27) 28 29const ( 30 eofInString = "%d:EOF in string" 31 eofInSingleLineComment = "%d:EOF in single line comment" 32 eofInMultilineComment = "%d:EOF in multiline comment" 33) 34 35// Parse parses the input data and returns the comments. 36func Parse(contents []byte, lang language.Language) Comments { 37 if len(contents) == 0 { 38 return nil 39 } 40 41 c := string(contents) 42 if !strings.HasSuffix(c, "\n") { 43 // Force a terminating newline if one isn't present. 44 c += "\n" 45 } 46 i := &input{ 47 s: c, 48 lang: lang, 49 offset: 0, 50 pos: position{line: 1, lineRune: []int{0}}, 51 } 52 i.lex() 53 return i.comments 54} 55 56// Comment is either a single line or multiline comment in a source code file. 57// A single line comment has StartLine equal to EndLine. The lines are 1-based. 58type Comment struct { 59 StartLine int 60 EndLine int 61 Text string 62} 63 64// Comments allows us to treat a slice of comments as a unit. 65type Comments []*Comment 66 67// ChunkIterator returns a read-only channel and generates the comments in a 68// goroutine, then closes the channel. 69func (c Comments) ChunkIterator() <-chan Comments { 70 ch := make(chan Comments) 71 go func() { 72 defer close(ch) 73 74 if len(c) == 0 { 75 return 76 } 77 78 prevChunk := c[0] 79 for index := 0; index < len(c); index++ { 80 var chunk Comments 81 for ; index < len(c); index++ { 82 if c[index].StartLine > prevChunk.StartLine+1 { 83 break 84 } 85 if c[index].StartLine == prevChunk.StartLine+2 { 86 if c[index].StartLine != c[index].EndLine || prevChunk.StartLine != prevChunk.EndLine { 87 break 88 } 89 } 90 chunk = append(chunk, c[index]) 91 prevChunk = c[index] 92 } 93 if len(chunk) == 0 { 94 break 95 } 96 97 ch <- chunk 98 if index >= len(c) { 99 break 100 } 101 102 prevChunk = c[index] 103 index-- 104 } 105 }() 106 return ch 107} 108 109// StartLine is the line number (1-based) the first part of the comment block 110// starts on. 111func (c Comments) StartLine() int { 112 if len(c) == 0 { 113 return 0 114 } 115 return c[0].StartLine 116} 117 118// String creates a string out of the text of the comments. Comment begin and 119// end markers are removed. 120func (c Comments) String() string { 121 var s []string 122 for _, cmt := range c { 123 s = append(s, cmt.Text) 124 } 125 return strings.Join(s, "\n") 126} 127 128// position records the location of a lexeme. 129type position struct { 130 line int // Line number of input: 1-based 131 lineRune []int // Rune offset from beginning of line: 0-based 132} 133 134// input holds the current state of the lexer. 135type input struct { 136 s string // Entire input. 137 lang language.Language // Source code language. 138 offset int // Offset into input. 139 pos position // Current position in the input. 140 comments Comments // Comments in the source file. 141} 142 143// lex is called to obtain the comments. 144func (i *input) lex() { 145 for { 146 c, ok := i.peekRune() 147 if !ok { 148 break 149 } 150 151 switch c { 152 case '"', '\'', '`': // String 153 // Ignore strings because they could contain comment 154 // start or end sequences which we need to ignore. 155 if i.lang == language.HTML { 156 // Quotes in HTML-like files aren't meaningful, 157 // because it's basically plain text 158 break 159 } 160 161 ok, hasEscape := i.lang.QuoteCharacter(c) 162 if !ok { 163 break 164 } 165 166 var content bytes.Buffer 167 isDocString := false 168 quote := string(c) 169 if i.lang == language.Python { 170 if c == '\'' && i.match("'''") { 171 quote = "'''" 172 // Assume module-level docstrings start at the 173 // beginning of a line. Function docstrings not 174 // supported. 175 if i.pos.lineRune[len(i.pos.lineRune)-1] == 3 { 176 isDocString = true 177 } 178 } else if c == '"' && i.match(`"""`) { 179 quote = `"""` 180 if i.pos.lineRune[len(i.pos.lineRune)-1] == 3 { 181 isDocString = true 182 } 183 } else { 184 i.readRune() // Eat quote. 185 } 186 } else { 187 i.readRune() // Eat quote. 188 } 189 190 startLine := i.pos.line 191 for { 192 c, ok = i.peekRune() 193 if !ok { 194 return 195 } 196 if hasEscape && c == '\\' { 197 i.readRune() // Eat escape. 198 } else if i.match(quote) { 199 break 200 } else if (i.lang == language.JavaScript || i.lang == language.Perl) && c == '\n' { 201 // JavaScript and Perl allow you to 202 // specify regexes without quotes, but 203 // which contain quotes. So treat the 204 // newline as terminating the string. 205 break 206 } 207 c := i.readRune() 208 if isDocString { 209 content.WriteRune(c) 210 } 211 if i.eof() { 212 return 213 } 214 } 215 if isDocString { 216 i.comments = append(i.comments, &Comment{ 217 StartLine: startLine, 218 EndLine: i.pos.line, 219 Text: content.String(), 220 }) 221 } 222 default: 223 startLine := i.pos.line 224 var comment bytes.Buffer 225 if ok, start, end := i.multiLineComment(); ok { // Multiline comment 226 nesting := 0 227 startLine := i.pos.line 228 for { 229 if i.eof() { 230 return 231 } 232 c := i.readRune() 233 comment.WriteRune(c) 234 if i.lang.NestedComments() && i.match(start) { 235 // Allows nested comments. 236 comment.WriteString(start) 237 nesting++ 238 } 239 if i.match(end) { 240 if nesting > 0 { 241 comment.WriteString(end) 242 nesting-- 243 } else { 244 break 245 } 246 } 247 } 248 i.comments = append(i.comments, &Comment{ 249 StartLine: startLine, 250 EndLine: i.pos.line, 251 Text: comment.String(), 252 }) 253 } else if i.singleLineComment() { // Single line comment 254 for { 255 if i.eof() { 256 return 257 } 258 c = i.readRune() 259 if c == '\n' { 260 i.unreadRune(c) 261 break 262 } 263 comment.WriteRune(c) 264 } 265 i.comments = append(i.comments, &Comment{ 266 StartLine: startLine, 267 EndLine: i.pos.line, 268 Text: comment.String(), 269 }) 270 } 271 } 272 273 i.readRune() // Ignore non-comments. 274 } 275} 276 277// singleLineComment returns 'true' if we've run across a single line comment 278// in the given language. 279func (i *input) singleLineComment() bool { 280 if i.match(i.lang.SingleLineCommentStart()) { 281 return true 282 } 283 284 if i.lang == language.SQL { 285 return i.match(language.MySQL.SingleLineCommentStart()) 286 } else if i.lang == language.ObjectiveC { 287 return i.match(language.Matlab.SingleLineCommentStart()) 288 } 289 290 return false 291} 292 293// multiLineComment returns 'true' if we've run across a multiline comment in 294// the given language. 295func (i *input) multiLineComment() (bool, string, string) { 296 if s := i.lang.MultilineCommentStart(); i.match(s) { 297 return true, s, i.lang.MultilineCommentEnd() 298 } 299 300 if i.lang == language.SQL { 301 if s := language.MySQL.MultilineCommentStart(); i.match(s) { 302 return true, s, language.MySQL.MultilineCommentEnd() 303 } 304 } else if i.lang == language.ObjectiveC { 305 if s := language.Matlab.MultilineCommentStart(); i.match(s) { 306 return true, s, language.Matlab.MultilineCommentEnd() 307 } 308 } 309 310 return false, "", "" 311} 312 313// match returns 'true' if the next tokens in the stream match the given 314// string. 315func (i *input) match(s string) bool { 316 if s == "" { 317 return false 318 } 319 saved := s 320 var read []rune 321 for len(s) > 0 && !i.eof() { 322 r, size := utf8.DecodeRuneInString(s) 323 if c, ok := i.peekRune(); ok && c == r { 324 read = append(read, c) 325 } else { 326 // No match. Push the tokens we read back onto the stack. 327 for idx := len(read) - 1; idx >= 0; idx-- { 328 i.unreadRune(read[idx]) 329 } 330 return false 331 } 332 s = s[size:] 333 i.readRune() // Eat token. 334 } 335 return string(read) == saved 336} 337 338// eof reports whether the input has reached the end of the file. 339func (i *input) eof() bool { 340 return len(i.s) <= i.offset 341} 342 343// peekRune returns the next rune in the input without consuming it. 344func (i *input) peekRune() (rune, bool) { 345 if i.eof() { 346 return rune(0), false 347 } 348 r, _ := utf8.DecodeRuneInString(i.s[i.offset:]) 349 return r, true 350} 351 352// readRune consumes and returns the next rune in the input. 353func (i *input) readRune() rune { 354 r, size := utf8.DecodeRuneInString(i.s[i.offset:]) 355 if r == '\n' { 356 i.pos.line++ 357 i.pos.lineRune = append(i.pos.lineRune, 0) 358 } else { 359 i.pos.lineRune[len(i.pos.lineRune)-1]++ 360 } 361 i.offset += size 362 return r 363} 364 365// unreadRune winds the lexer's state back to before the rune was read. 366func (i *input) unreadRune(c rune) { 367 p := make([]byte, utf8.UTFMax) 368 size := utf8.EncodeRune(p, c) 369 i.offset -= size 370 if c == '\n' { 371 i.pos.line-- 372 if len(i.pos.lineRune) > 1 { 373 i.pos.lineRune = i.pos.lineRune[:len(i.pos.lineRune)-1] 374 } else { 375 i.pos.lineRune[len(i.pos.lineRune)-1] = 0 376 } 377 } else { 378 i.pos.lineRune[len(i.pos.lineRune)-1]-- 379 } 380} 381