xref: /aosp_15_r20/external/licenseclassifier/commentparser/language/language.go (revision 46c4c49da23cae783fa41bf46525a6505638499a)
1*46c4c49dSIbrahim Kanouche// Copyright 2017 Google Inc.
2*46c4c49dSIbrahim Kanouche//
3*46c4c49dSIbrahim Kanouche// Licensed under the Apache License, Version 2.0 (the "License");
4*46c4c49dSIbrahim Kanouche// you may not use this file except in compliance with the License.
5*46c4c49dSIbrahim Kanouche// You may obtain a copy of the License at
6*46c4c49dSIbrahim Kanouche//
7*46c4c49dSIbrahim Kanouche//     http://www.apache.org/licenses/LICENSE-2.0
8*46c4c49dSIbrahim Kanouche//
9*46c4c49dSIbrahim Kanouche// Unless required by applicable law or agreed to in writing, software
10*46c4c49dSIbrahim Kanouche// distributed under the License is distributed on an "AS IS" BASIS,
11*46c4c49dSIbrahim Kanouche// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12*46c4c49dSIbrahim Kanouche// See the License for the specific language governing permissions and
13*46c4c49dSIbrahim Kanouche// limitations under the License.
14*46c4c49dSIbrahim Kanouche
15*46c4c49dSIbrahim Kanouche// Package language contains methods and information about the different
16*46c4c49dSIbrahim Kanouche// programming languages the comment parser supports.
17*46c4c49dSIbrahim Kanouchepackage language
18*46c4c49dSIbrahim Kanouche
19*46c4c49dSIbrahim Kanoucheimport (
20*46c4c49dSIbrahim Kanouche	"path/filepath"
21*46c4c49dSIbrahim Kanouche	"strings"
22*46c4c49dSIbrahim Kanouche)
23*46c4c49dSIbrahim Kanouche
// Language is the programming language we're grabbing the comments from.
type Language int
26*46c4c49dSIbrahim Kanouche
// Languages we can retrieve comments from.
//
// The values are assigned sequentially by iota, so each constant's numeric
// value depends on its position in this list. Unknown is the zero value.
const (
	Unknown Language = iota
	AppleScript
	Assembly
	BLIF // Berkeley Logic Interchange Format
	Batch
	C
	Clif
	Clojure
	CMake
	CSharp
	Dart
	EDIF // Electronic Design Interchange Format
	Elixir
	Flex
	Fortran
	GLSLF // OpenGL Shading Language
	Go
	HTML
	Haskell
	Java
	JavaScript
	Kotlin
	LEF // Library Exchange Format
	Lisp
	Markdown
	Matlab
	MySQL
	NinjaBuild
	ObjectiveC
	Perl
	Python
	R
	Ruby
	Rust
	SDC  // Synopsys Design Constraints
	SDF  // Standard Delay Format
	SPEF // Standard Parasitics Exchange Format
	SQL
	SWIG
	Shader
	Shell
	Swift
	SystemVerilog
	TCL
	TypeScript
	Verilog
	XDC // Xilinx Design Constraint files
	Yacc
	Yaml
)
79*46c4c49dSIbrahim Kanouche
// style is the comment style that a language uses. Languages that share the
// same comment delimiters share a style.
type style int
82*46c4c49dSIbrahim Kanouche
// Comment styles. The trailing comment on each style shows the comment
// delimiters it stands for.
const (
	unknown     style = iota
	applescript       // -- ... and (* ... *)
	batch             // @REM
	bcpl              // // ... and /* ... */
	cmake             // # ... and #[[ ... ]]
	fortran           // ! ...
	hash              // # ...
	haskell           // -- ... and {- ... -}
	html              // <!-- ... -->
	lisp              // ; ...
	matlab            // % ... and %{ ... %}
	mysql             // # ... and /* ... */
	ruby              // # ... and =begin ... =end
	shell             // # ...
	sql               // -- ... and /* ... */
)
101*46c4c49dSIbrahim Kanouche
102*46c4c49dSIbrahim Kanouche// ClassifyLanguage determines what language the source code was written in. It
103*46c4c49dSIbrahim Kanouche// does this by looking at the file's extension.
104*46c4c49dSIbrahim Kanouchefunc ClassifyLanguage(filename string) Language {
105*46c4c49dSIbrahim Kanouche	ext := strings.ToLower(filepath.Ext(filename))
106*46c4c49dSIbrahim Kanouche	if len(ext) == 0 || ext[0] != '.' {
107*46c4c49dSIbrahim Kanouche		return Unknown
108*46c4c49dSIbrahim Kanouche	}
109*46c4c49dSIbrahim Kanouche
110*46c4c49dSIbrahim Kanouche	switch ext[1:] { // Skip the '.'.
111*46c4c49dSIbrahim Kanouche	case "applescript":
112*46c4c49dSIbrahim Kanouche		return AppleScript
113*46c4c49dSIbrahim Kanouche	case "bat":
114*46c4c49dSIbrahim Kanouche		return Batch
115*46c4c49dSIbrahim Kanouche	case "blif", "eblif":
116*46c4c49dSIbrahim Kanouche		return BLIF
117*46c4c49dSIbrahim Kanouche	case "c", "cc", "cpp", "c++", "h", "hh", "hpp":
118*46c4c49dSIbrahim Kanouche		return C
119*46c4c49dSIbrahim Kanouche	case "clif":
120*46c4c49dSIbrahim Kanouche		return Clif
121*46c4c49dSIbrahim Kanouche	case "cmake":
122*46c4c49dSIbrahim Kanouche		return CMake
123*46c4c49dSIbrahim Kanouche	case "cs":
124*46c4c49dSIbrahim Kanouche		return CSharp
125*46c4c49dSIbrahim Kanouche	case "dart":
126*46c4c49dSIbrahim Kanouche		return Dart
127*46c4c49dSIbrahim Kanouche	case "ex", "exs":
128*46c4c49dSIbrahim Kanouche		return Elixir
129*46c4c49dSIbrahim Kanouche	case "f", "f90", "f95":
130*46c4c49dSIbrahim Kanouche		return Fortran
131*46c4c49dSIbrahim Kanouche	case "glslf":
132*46c4c49dSIbrahim Kanouche		return GLSLF
133*46c4c49dSIbrahim Kanouche	case "go":
134*46c4c49dSIbrahim Kanouche		return Go
135*46c4c49dSIbrahim Kanouche	case "hs":
136*46c4c49dSIbrahim Kanouche		return Haskell
137*46c4c49dSIbrahim Kanouche	case "html", "htm", "ng", "sgml":
138*46c4c49dSIbrahim Kanouche		return HTML
139*46c4c49dSIbrahim Kanouche	case "java":
140*46c4c49dSIbrahim Kanouche		return Java
141*46c4c49dSIbrahim Kanouche	case "js":
142*46c4c49dSIbrahim Kanouche		return JavaScript
143*46c4c49dSIbrahim Kanouche	case "kt":
144*46c4c49dSIbrahim Kanouche		return Kotlin
145*46c4c49dSIbrahim Kanouche	case "l":
146*46c4c49dSIbrahim Kanouche		return Flex
147*46c4c49dSIbrahim Kanouche	case "lef":
148*46c4c49dSIbrahim Kanouche		return LEF
149*46c4c49dSIbrahim Kanouche	case "lisp", "el", "clj":
150*46c4c49dSIbrahim Kanouche		return Lisp
151*46c4c49dSIbrahim Kanouche	case "m", "mm":
152*46c4c49dSIbrahim Kanouche		return ObjectiveC
153*46c4c49dSIbrahim Kanouche	case "md":
154*46c4c49dSIbrahim Kanouche		return Markdown
155*46c4c49dSIbrahim Kanouche	case "gn":
156*46c4c49dSIbrahim Kanouche		return NinjaBuild
157*46c4c49dSIbrahim Kanouche	case "pl", "pm":
158*46c4c49dSIbrahim Kanouche		return Perl
159*46c4c49dSIbrahim Kanouche	case "py", "pi":
160*46c4c49dSIbrahim Kanouche		return Python
161*46c4c49dSIbrahim Kanouche	case "r":
162*46c4c49dSIbrahim Kanouche		return R
163*46c4c49dSIbrahim Kanouche	case "rb":
164*46c4c49dSIbrahim Kanouche		return Ruby
165*46c4c49dSIbrahim Kanouche	case "rs":
166*46c4c49dSIbrahim Kanouche		return Rust
167*46c4c49dSIbrahim Kanouche	case "s":
168*46c4c49dSIbrahim Kanouche		return Assembly
169*46c4c49dSIbrahim Kanouche	case "sdf":
170*46c4c49dSIbrahim Kanouche		return SDF
171*46c4c49dSIbrahim Kanouche	case "sh":
172*46c4c49dSIbrahim Kanouche		return Shell
173*46c4c49dSIbrahim Kanouche	case "shader":
174*46c4c49dSIbrahim Kanouche		return Shader
175*46c4c49dSIbrahim Kanouche	case "sql":
176*46c4c49dSIbrahim Kanouche		return SQL
177*46c4c49dSIbrahim Kanouche	case "swift":
178*46c4c49dSIbrahim Kanouche		return Swift
179*46c4c49dSIbrahim Kanouche	case "swig":
180*46c4c49dSIbrahim Kanouche		return SWIG
181*46c4c49dSIbrahim Kanouche	case "sv", "svh":
182*46c4c49dSIbrahim Kanouche		return SystemVerilog
183*46c4c49dSIbrahim Kanouche	case "tcl", "sdc", "xdc":
184*46c4c49dSIbrahim Kanouche		return TCL
185*46c4c49dSIbrahim Kanouche	case "ts", "tsx":
186*46c4c49dSIbrahim Kanouche		return TypeScript
187*46c4c49dSIbrahim Kanouche	case "v", "vh":
188*46c4c49dSIbrahim Kanouche		return Verilog
189*46c4c49dSIbrahim Kanouche	case "y":
190*46c4c49dSIbrahim Kanouche		return Yacc
191*46c4c49dSIbrahim Kanouche	case "yaml":
192*46c4c49dSIbrahim Kanouche		return Yaml
193*46c4c49dSIbrahim Kanouche	}
194*46c4c49dSIbrahim Kanouche	return Unknown
195*46c4c49dSIbrahim Kanouche}
196*46c4c49dSIbrahim Kanouche
197*46c4c49dSIbrahim Kanouche// commentStyle returns the language's comment style.
198*46c4c49dSIbrahim Kanouchefunc (lang Language) commentStyle() style {
199*46c4c49dSIbrahim Kanouche	switch lang {
200*46c4c49dSIbrahim Kanouche	case Assembly, C, CSharp, Dart, Flex, GLSLF, Go, Java, JavaScript, Kotlin, ObjectiveC, Rust, Shader, Swift, SWIG, TypeScript, Yacc, Verilog, SystemVerilog, SDF, SPEF:
201*46c4c49dSIbrahim Kanouche		return bcpl
202*46c4c49dSIbrahim Kanouche	case Batch:
203*46c4c49dSIbrahim Kanouche		return batch
204*46c4c49dSIbrahim Kanouche	case BLIF, TCL:
205*46c4c49dSIbrahim Kanouche		return hash
206*46c4c49dSIbrahim Kanouche	case CMake:
207*46c4c49dSIbrahim Kanouche		return cmake
208*46c4c49dSIbrahim Kanouche	case Fortran:
209*46c4c49dSIbrahim Kanouche		return fortran
210*46c4c49dSIbrahim Kanouche	case Haskell:
211*46c4c49dSIbrahim Kanouche		return haskell
212*46c4c49dSIbrahim Kanouche	case HTML, Markdown:
213*46c4c49dSIbrahim Kanouche		return html
214*46c4c49dSIbrahim Kanouche	case Clojure, Lisp:
215*46c4c49dSIbrahim Kanouche		return lisp
216*46c4c49dSIbrahim Kanouche	case Ruby:
217*46c4c49dSIbrahim Kanouche		return ruby
218*46c4c49dSIbrahim Kanouche	case Clif, Elixir, NinjaBuild, Perl, Python, R, Shell, Yaml:
219*46c4c49dSIbrahim Kanouche		return shell
220*46c4c49dSIbrahim Kanouche	case Matlab:
221*46c4c49dSIbrahim Kanouche		return matlab
222*46c4c49dSIbrahim Kanouche	case MySQL:
223*46c4c49dSIbrahim Kanouche		return mysql
224*46c4c49dSIbrahim Kanouche	case SQL:
225*46c4c49dSIbrahim Kanouche		return sql
226*46c4c49dSIbrahim Kanouche	}
227*46c4c49dSIbrahim Kanouche	return unknown
228*46c4c49dSIbrahim Kanouche}
229*46c4c49dSIbrahim Kanouche
230*46c4c49dSIbrahim Kanouche// SingleLineCommentStart returns the starting string of a single line comment
231*46c4c49dSIbrahim Kanouche// for the given language. There is no equivalent "End" method, because it's
232*46c4c49dSIbrahim Kanouche// the end of line.
233*46c4c49dSIbrahim Kanouchefunc (lang Language) SingleLineCommentStart() string {
234*46c4c49dSIbrahim Kanouche	switch lang.commentStyle() {
235*46c4c49dSIbrahim Kanouche	case applescript, haskell, sql:
236*46c4c49dSIbrahim Kanouche		return "--"
237*46c4c49dSIbrahim Kanouche	case batch:
238*46c4c49dSIbrahim Kanouche		return "@REM"
239*46c4c49dSIbrahim Kanouche	case bcpl:
240*46c4c49dSIbrahim Kanouche		return "//"
241*46c4c49dSIbrahim Kanouche	case fortran:
242*46c4c49dSIbrahim Kanouche		return "!"
243*46c4c49dSIbrahim Kanouche	case lisp:
244*46c4c49dSIbrahim Kanouche		return ";"
245*46c4c49dSIbrahim Kanouche	case matlab:
246*46c4c49dSIbrahim Kanouche		return "%"
247*46c4c49dSIbrahim Kanouche	case shell, ruby, cmake, mysql, hash:
248*46c4c49dSIbrahim Kanouche		return "#"
249*46c4c49dSIbrahim Kanouche	}
250*46c4c49dSIbrahim Kanouche	return ""
251*46c4c49dSIbrahim Kanouche}
252*46c4c49dSIbrahim Kanouche
253*46c4c49dSIbrahim Kanouche// MultilineCommentStart returns the starting string of a multiline comment for
254*46c4c49dSIbrahim Kanouche// the given language.
255*46c4c49dSIbrahim Kanouchefunc (lang Language) MultilineCommentStart() string {
256*46c4c49dSIbrahim Kanouche	switch lang.commentStyle() {
257*46c4c49dSIbrahim Kanouche	case applescript:
258*46c4c49dSIbrahim Kanouche		return "(*"
259*46c4c49dSIbrahim Kanouche	case bcpl, mysql:
260*46c4c49dSIbrahim Kanouche		if lang != Rust {
261*46c4c49dSIbrahim Kanouche			return "/*"
262*46c4c49dSIbrahim Kanouche		}
263*46c4c49dSIbrahim Kanouche	case cmake:
264*46c4c49dSIbrahim Kanouche		return "#[["
265*46c4c49dSIbrahim Kanouche	case haskell:
266*46c4c49dSIbrahim Kanouche		return "{-"
267*46c4c49dSIbrahim Kanouche	case html:
268*46c4c49dSIbrahim Kanouche		return "<!--"
269*46c4c49dSIbrahim Kanouche	case matlab:
270*46c4c49dSIbrahim Kanouche		return "%{"
271*46c4c49dSIbrahim Kanouche	case ruby:
272*46c4c49dSIbrahim Kanouche		return "=begin"
273*46c4c49dSIbrahim Kanouche	}
274*46c4c49dSIbrahim Kanouche	return ""
275*46c4c49dSIbrahim Kanouche}
276*46c4c49dSIbrahim Kanouche
277*46c4c49dSIbrahim Kanouche// MultilineCommentEnd returns the ending string of a multiline comment for the
278*46c4c49dSIbrahim Kanouche// given language.
279*46c4c49dSIbrahim Kanouchefunc (lang Language) MultilineCommentEnd() string {
280*46c4c49dSIbrahim Kanouche	switch lang.commentStyle() {
281*46c4c49dSIbrahim Kanouche	case applescript:
282*46c4c49dSIbrahim Kanouche		return "*)"
283*46c4c49dSIbrahim Kanouche	case bcpl, mysql:
284*46c4c49dSIbrahim Kanouche		if lang != Rust {
285*46c4c49dSIbrahim Kanouche			return "*/"
286*46c4c49dSIbrahim Kanouche		}
287*46c4c49dSIbrahim Kanouche	case cmake:
288*46c4c49dSIbrahim Kanouche		return "]]"
289*46c4c49dSIbrahim Kanouche	case haskell:
290*46c4c49dSIbrahim Kanouche		return "-}"
291*46c4c49dSIbrahim Kanouche	case html:
292*46c4c49dSIbrahim Kanouche		return "-->"
293*46c4c49dSIbrahim Kanouche	case matlab:
294*46c4c49dSIbrahim Kanouche		return "%}"
295*46c4c49dSIbrahim Kanouche	case ruby:
296*46c4c49dSIbrahim Kanouche		return "=end"
297*46c4c49dSIbrahim Kanouche	}
298*46c4c49dSIbrahim Kanouche	return ""
299*46c4c49dSIbrahim Kanouche}
300*46c4c49dSIbrahim Kanouche
301*46c4c49dSIbrahim Kanouche// QuoteCharacter returns 'true' if the character is considered the beginning
302*46c4c49dSIbrahim Kanouche// of a string in the given language. The second return value is true if the
303*46c4c49dSIbrahim Kanouche// string allows for escaping.
304*46c4c49dSIbrahim Kanouchefunc (lang Language) QuoteCharacter(quote rune) (ok bool, escape bool) {
305*46c4c49dSIbrahim Kanouche	switch quote {
306*46c4c49dSIbrahim Kanouche	case '"', '\'':
307*46c4c49dSIbrahim Kanouche		return true, true
308*46c4c49dSIbrahim Kanouche	case '`':
309*46c4c49dSIbrahim Kanouche		if lang == Go {
310*46c4c49dSIbrahim Kanouche			return true, false
311*46c4c49dSIbrahim Kanouche		}
312*46c4c49dSIbrahim Kanouche	}
313*46c4c49dSIbrahim Kanouche	return false, false
314*46c4c49dSIbrahim Kanouche}
315*46c4c49dSIbrahim Kanouche
316*46c4c49dSIbrahim Kanouche// NestedComments returns true if the language allows for nested multiline comments.
317*46c4c49dSIbrahim Kanouchefunc (lang Language) NestedComments() bool {
318*46c4c49dSIbrahim Kanouche	return lang == Swift
319*46c4c49dSIbrahim Kanouche}
320