xref: /aosp_15_r20/external/licenseclassifier/classifier.go (revision 46c4c49da23cae783fa41bf46525a6505638499a)
1*46c4c49dSIbrahim Kanouche// Copyright 2017 Google Inc.
2*46c4c49dSIbrahim Kanouche//
3*46c4c49dSIbrahim Kanouche// Licensed under the Apache License, Version 2.0 (the "License");
4*46c4c49dSIbrahim Kanouche// you may not use this file except in compliance with the License.
5*46c4c49dSIbrahim Kanouche// You may obtain a copy of the License at
6*46c4c49dSIbrahim Kanouche//
7*46c4c49dSIbrahim Kanouche//     http://www.apache.org/licenses/LICENSE-2.0
8*46c4c49dSIbrahim Kanouche//
9*46c4c49dSIbrahim Kanouche// Unless required by applicable law or agreed to in writing, software
10*46c4c49dSIbrahim Kanouche// distributed under the License is distributed on an "AS IS" BASIS,
11*46c4c49dSIbrahim Kanouche// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12*46c4c49dSIbrahim Kanouche// See the License for the specific language governing permissions and
13*46c4c49dSIbrahim Kanouche// limitations under the License.
14*46c4c49dSIbrahim Kanouche
15*46c4c49dSIbrahim Kanouche// Package licenseclassifier provides methods to identify the open source
16*46c4c49dSIbrahim Kanouche// license that most closely matches an unknown license.
17*46c4c49dSIbrahim Kanouchepackage licenseclassifier
18*46c4c49dSIbrahim Kanouche
19*46c4c49dSIbrahim Kanoucheimport (
20*46c4c49dSIbrahim Kanouche	"archive/tar"
21*46c4c49dSIbrahim Kanouche	"bytes"
22*46c4c49dSIbrahim Kanouche	"compress/gzip"
23*46c4c49dSIbrahim Kanouche	"fmt"
24*46c4c49dSIbrahim Kanouche	"html"
25*46c4c49dSIbrahim Kanouche	"io"
26*46c4c49dSIbrahim Kanouche	"math"
27*46c4c49dSIbrahim Kanouche	"regexp"
28*46c4c49dSIbrahim Kanouche	"sort"
29*46c4c49dSIbrahim Kanouche	"strings"
30*46c4c49dSIbrahim Kanouche	"sync"
31*46c4c49dSIbrahim Kanouche	"unicode"
32*46c4c49dSIbrahim Kanouche
33*46c4c49dSIbrahim Kanouche	"github.com/google/licenseclassifier/stringclassifier"
34*46c4c49dSIbrahim Kanouche	"github.com/google/licenseclassifier/stringclassifier/searchset"
35*46c4c49dSIbrahim Kanouche)
36*46c4c49dSIbrahim Kanouche
// DefaultConfidenceThreshold is the minimum confidence we're willing to accept
// in order to say that a match is good. It is written as a fraction, so 0.80
// stands for 80%.
const DefaultConfidenceThreshold = 0.80
40*46c4c49dSIbrahim Kanouche
var (
	// Normalizers is a list of functions that get applied to the strings
	// before they are registered with the string classifier. The list is
	// handed to stringclassifier.New, and normalizeText applies the entries
	// in slice order, feeding each function the output of the previous one,
	// so the order of the entries matters.
	Normalizers = []stringclassifier.NormalizeFunc{
		html.UnescapeString,
		removeShebangLine,
		RemoveNonWords,
		NormalizeEquivalentWords,
		NormalizePunctuation,
		strings.ToLower,
		removeIgnorableTexts,
		stringclassifier.FlattenWhitespace,
		strings.TrimSpace,
	}

	// commonLicenseWords are words that are common to all known licenses.
	// If an unknown text doesn't have at least one of these, then we can
	// ignore it. Every pattern is case-insensitive and anchored on word
	// boundaries, so only whole words count.
	commonLicenseWords = []*regexp.Regexp{
		regexp.MustCompile(`(?i)\bcode\b`),
		regexp.MustCompile(`(?i)\blicense\b`),
		regexp.MustCompile(`(?i)\boriginal\b`),
		regexp.MustCompile(`(?i)\brights\b`),
		regexp.MustCompile(`(?i)\bsoftware\b`),
		regexp.MustCompile(`(?i)\bterms\b`),
		regexp.MustCompile(`(?i)\bversion\b`),
		regexp.MustCompile(`(?i)\bwork\b`),
	}
)
70*46c4c49dSIbrahim Kanouche
// License is a classifier pre-loaded with known open source licenses.
type License struct {
	// c is the underlying string classifier. The known licenses are
	// registered with it and all matching is delegated to it.
	c *stringclassifier.Classifier

	// Threshold is the lowest confidence percentage acceptable for the
	// classifier.
	Threshold float64

	// archive is a function that must return the contents of the license archive.
	// When archive is nil, ReadLicenseFile(LicenseArchive) is used to retrieve the
	// contents.
	archive func() ([]byte, error)
}
84*46c4c49dSIbrahim Kanouche
// OptionFunc sets an option on a License struct. New calls each OptionFunc
// with the classifier under construction and aborts if one returns an error.
type OptionFunc func(l *License) error
87*46c4c49dSIbrahim Kanouche
88*46c4c49dSIbrahim Kanouche// Archive is an OptionFunc to specify the location of the license archive file.
89*46c4c49dSIbrahim Kanouchefunc Archive(f string) OptionFunc {
90*46c4c49dSIbrahim Kanouche	return func(l *License) error {
91*46c4c49dSIbrahim Kanouche		l.archive = func() ([]byte, error) { return ReadLicenseFile(f) }
92*46c4c49dSIbrahim Kanouche		return nil
93*46c4c49dSIbrahim Kanouche	}
94*46c4c49dSIbrahim Kanouche}
95*46c4c49dSIbrahim Kanouche
96*46c4c49dSIbrahim Kanouche// ArchiveBytes is an OptionFunc that provides the contents of the license archive file.
97*46c4c49dSIbrahim Kanouche// The caller must not overwrite the contents of b as it is not copied.
98*46c4c49dSIbrahim Kanouchefunc ArchiveBytes(b []byte) OptionFunc {
99*46c4c49dSIbrahim Kanouche	return func(l *License) error {
100*46c4c49dSIbrahim Kanouche		l.archive = func() ([]byte, error) { return b, nil }
101*46c4c49dSIbrahim Kanouche		return nil
102*46c4c49dSIbrahim Kanouche	}
103*46c4c49dSIbrahim Kanouche}
104*46c4c49dSIbrahim Kanouche
105*46c4c49dSIbrahim Kanouche// ArchiveFunc is an OptionFunc that provides a function that must return the contents
106*46c4c49dSIbrahim Kanouche// of the license archive file.
107*46c4c49dSIbrahim Kanouchefunc ArchiveFunc(f func() ([]byte, error)) OptionFunc {
108*46c4c49dSIbrahim Kanouche	return func(l *License) error {
109*46c4c49dSIbrahim Kanouche		l.archive = f
110*46c4c49dSIbrahim Kanouche		return nil
111*46c4c49dSIbrahim Kanouche	}
112*46c4c49dSIbrahim Kanouche}
113*46c4c49dSIbrahim Kanouche
114*46c4c49dSIbrahim Kanouche// New creates a license classifier and pre-loads it with known open source licenses.
115*46c4c49dSIbrahim Kanouchefunc New(threshold float64, options ...OptionFunc) (*License, error) {
116*46c4c49dSIbrahim Kanouche	classifier := &License{
117*46c4c49dSIbrahim Kanouche		c:         stringclassifier.New(threshold, Normalizers...),
118*46c4c49dSIbrahim Kanouche		Threshold: threshold,
119*46c4c49dSIbrahim Kanouche	}
120*46c4c49dSIbrahim Kanouche
121*46c4c49dSIbrahim Kanouche	for _, o := range options {
122*46c4c49dSIbrahim Kanouche		err := o(classifier)
123*46c4c49dSIbrahim Kanouche		if err != nil {
124*46c4c49dSIbrahim Kanouche			return nil, fmt.Errorf("error setting option %v: %v", o, err)
125*46c4c49dSIbrahim Kanouche		}
126*46c4c49dSIbrahim Kanouche	}
127*46c4c49dSIbrahim Kanouche
128*46c4c49dSIbrahim Kanouche	if err := classifier.registerLicenses(); err != nil {
129*46c4c49dSIbrahim Kanouche		return nil, fmt.Errorf("cannot register licenses from archive: %v", err)
130*46c4c49dSIbrahim Kanouche	}
131*46c4c49dSIbrahim Kanouche	return classifier, nil
132*46c4c49dSIbrahim Kanouche}
133*46c4c49dSIbrahim Kanouche
134*46c4c49dSIbrahim Kanouche// NewWithForbiddenLicenses creates a license classifier and pre-loads it with
135*46c4c49dSIbrahim Kanouche// known open source licenses which are forbidden.
136*46c4c49dSIbrahim Kanouchefunc NewWithForbiddenLicenses(threshold float64, options ...OptionFunc) (*License, error) {
137*46c4c49dSIbrahim Kanouche	opts := []OptionFunc{Archive(ForbiddenLicenseArchive)}
138*46c4c49dSIbrahim Kanouche	opts = append(opts, options...)
139*46c4c49dSIbrahim Kanouche	return New(threshold, opts...)
140*46c4c49dSIbrahim Kanouche}
141*46c4c49dSIbrahim Kanouche
142*46c4c49dSIbrahim Kanouche// WithinConfidenceThreshold returns true if the confidence value is above or
143*46c4c49dSIbrahim Kanouche// equal to the confidence threshold.
144*46c4c49dSIbrahim Kanouchefunc (c *License) WithinConfidenceThreshold(conf float64) bool {
145*46c4c49dSIbrahim Kanouche	return conf > c.Threshold || math.Abs(conf-c.Threshold) < math.SmallestNonzeroFloat64
146*46c4c49dSIbrahim Kanouche}
147*46c4c49dSIbrahim Kanouche
148*46c4c49dSIbrahim Kanouche// NearestMatch returns the "nearest" match to the given set of known licenses.
149*46c4c49dSIbrahim Kanouche// Returned are the name of the license, and a confidence percentage indicating
150*46c4c49dSIbrahim Kanouche// how confident the classifier is in the result.
151*46c4c49dSIbrahim Kanouchefunc (c *License) NearestMatch(contents string) *stringclassifier.Match {
152*46c4c49dSIbrahim Kanouche	if !c.hasCommonLicenseWords(contents) {
153*46c4c49dSIbrahim Kanouche		return nil
154*46c4c49dSIbrahim Kanouche	}
155*46c4c49dSIbrahim Kanouche	m := c.c.NearestMatch(contents)
156*46c4c49dSIbrahim Kanouche	m.Name = strings.TrimSuffix(m.Name, ".header")
157*46c4c49dSIbrahim Kanouche	return m
158*46c4c49dSIbrahim Kanouche}
159*46c4c49dSIbrahim Kanouche
160*46c4c49dSIbrahim Kanouche// MultipleMatch matches all licenses within an unknown text.
161*46c4c49dSIbrahim Kanouchefunc (c *License) MultipleMatch(contents string, includeHeaders bool) stringclassifier.Matches {
162*46c4c49dSIbrahim Kanouche	norm := normalizeText(contents)
163*46c4c49dSIbrahim Kanouche	if !c.hasCommonLicenseWords(norm) {
164*46c4c49dSIbrahim Kanouche		return nil
165*46c4c49dSIbrahim Kanouche	}
166*46c4c49dSIbrahim Kanouche
167*46c4c49dSIbrahim Kanouche	m := make(map[stringclassifier.Match]bool)
168*46c4c49dSIbrahim Kanouche	var matches stringclassifier.Matches
169*46c4c49dSIbrahim Kanouche	for _, v := range c.c.MultipleMatch(norm) {
170*46c4c49dSIbrahim Kanouche		if !c.WithinConfidenceThreshold(v.Confidence) {
171*46c4c49dSIbrahim Kanouche			continue
172*46c4c49dSIbrahim Kanouche		}
173*46c4c49dSIbrahim Kanouche
174*46c4c49dSIbrahim Kanouche		if !includeHeaders && strings.HasSuffix(v.Name, ".header") {
175*46c4c49dSIbrahim Kanouche			continue
176*46c4c49dSIbrahim Kanouche		}
177*46c4c49dSIbrahim Kanouche
178*46c4c49dSIbrahim Kanouche		v.Name = strings.TrimSuffix(v.Name, ".header")
179*46c4c49dSIbrahim Kanouche		if re, ok := forbiddenRegexps[v.Name]; ok && !re.MatchString(norm) {
180*46c4c49dSIbrahim Kanouche			continue
181*46c4c49dSIbrahim Kanouche		}
182*46c4c49dSIbrahim Kanouche		if _, ok := m[*v]; !ok {
183*46c4c49dSIbrahim Kanouche			m[*v] = true
184*46c4c49dSIbrahim Kanouche			matches = append(matches, v)
185*46c4c49dSIbrahim Kanouche		}
186*46c4c49dSIbrahim Kanouche	}
187*46c4c49dSIbrahim Kanouche	sort.Sort(matches)
188*46c4c49dSIbrahim Kanouche	return matches
189*46c4c49dSIbrahim Kanouche}
190*46c4c49dSIbrahim Kanouche
191*46c4c49dSIbrahim Kanouchefunc normalizeText(s string) string {
192*46c4c49dSIbrahim Kanouche	for _, n := range Normalizers {
193*46c4c49dSIbrahim Kanouche		s = n(s)
194*46c4c49dSIbrahim Kanouche	}
195*46c4c49dSIbrahim Kanouche	return s
196*46c4c49dSIbrahim Kanouche}
197*46c4c49dSIbrahim Kanouche
198*46c4c49dSIbrahim Kanouche// hasCommonLicenseWords returns true if the unknown text has at least one word
199*46c4c49dSIbrahim Kanouche// that's common to all licenses.
200*46c4c49dSIbrahim Kanouchefunc (c *License) hasCommonLicenseWords(s string) bool {
201*46c4c49dSIbrahim Kanouche	for _, re := range commonLicenseWords {
202*46c4c49dSIbrahim Kanouche		if re.MatchString(s) {
203*46c4c49dSIbrahim Kanouche			return true
204*46c4c49dSIbrahim Kanouche		}
205*46c4c49dSIbrahim Kanouche	}
206*46c4c49dSIbrahim Kanouche	return false
207*46c4c49dSIbrahim Kanouche}
208*46c4c49dSIbrahim Kanouche
// archivedValue is one license as read from the license archive by
// registerLicenses, held until it is added to the classifier.
type archivedValue struct {
	// name is the archive entry name with any ".txt" suffix removed.
	name string
	// normalized is the content of that entry, the normalized license text.
	normalized string
	// set is the search set deserialized from the entry that follows it.
	set *searchset.SearchSet
}
214*46c4c49dSIbrahim Kanouche
215*46c4c49dSIbrahim Kanouche// registerLicenses loads all known licenses and adds them to c as known values
216*46c4c49dSIbrahim Kanouche// for comparison. The allocated space after ingesting the 'licenses.db'
217*46c4c49dSIbrahim Kanouche// archive is ~167M.
218*46c4c49dSIbrahim Kanouchefunc (c *License) registerLicenses() error {
219*46c4c49dSIbrahim Kanouche	var contents []byte
220*46c4c49dSIbrahim Kanouche	var err error
221*46c4c49dSIbrahim Kanouche	if c.archive == nil {
222*46c4c49dSIbrahim Kanouche		contents, err = ReadLicenseFile(LicenseArchive)
223*46c4c49dSIbrahim Kanouche	} else {
224*46c4c49dSIbrahim Kanouche		contents, err = c.archive()
225*46c4c49dSIbrahim Kanouche	}
226*46c4c49dSIbrahim Kanouche	if err != nil {
227*46c4c49dSIbrahim Kanouche		return err
228*46c4c49dSIbrahim Kanouche	}
229*46c4c49dSIbrahim Kanouche
230*46c4c49dSIbrahim Kanouche	reader := bytes.NewReader(contents)
231*46c4c49dSIbrahim Kanouche	gr, err := gzip.NewReader(reader)
232*46c4c49dSIbrahim Kanouche	if err != nil {
233*46c4c49dSIbrahim Kanouche		return err
234*46c4c49dSIbrahim Kanouche	}
235*46c4c49dSIbrahim Kanouche	defer gr.Close()
236*46c4c49dSIbrahim Kanouche
237*46c4c49dSIbrahim Kanouche	tr := tar.NewReader(gr)
238*46c4c49dSIbrahim Kanouche
239*46c4c49dSIbrahim Kanouche	var muVals sync.Mutex
240*46c4c49dSIbrahim Kanouche	var vals []archivedValue
241*46c4c49dSIbrahim Kanouche	for i := 0; ; i++ {
242*46c4c49dSIbrahim Kanouche		hdr, err := tr.Next()
243*46c4c49dSIbrahim Kanouche		if err == io.EOF {
244*46c4c49dSIbrahim Kanouche			break
245*46c4c49dSIbrahim Kanouche		}
246*46c4c49dSIbrahim Kanouche		if err != nil {
247*46c4c49dSIbrahim Kanouche			return err
248*46c4c49dSIbrahim Kanouche		}
249*46c4c49dSIbrahim Kanouche
250*46c4c49dSIbrahim Kanouche		name := strings.TrimSuffix(hdr.Name, ".txt")
251*46c4c49dSIbrahim Kanouche
252*46c4c49dSIbrahim Kanouche		// Read normalized value.
253*46c4c49dSIbrahim Kanouche		var b bytes.Buffer
254*46c4c49dSIbrahim Kanouche		if _, err := io.Copy(&b, tr); err != nil {
255*46c4c49dSIbrahim Kanouche			return err
256*46c4c49dSIbrahim Kanouche		}
257*46c4c49dSIbrahim Kanouche		normalized := b.String()
258*46c4c49dSIbrahim Kanouche		b.Reset()
259*46c4c49dSIbrahim Kanouche
260*46c4c49dSIbrahim Kanouche		// Read precomputed hashes.
261*46c4c49dSIbrahim Kanouche		hdr, err = tr.Next()
262*46c4c49dSIbrahim Kanouche		if err != nil {
263*46c4c49dSIbrahim Kanouche			return err
264*46c4c49dSIbrahim Kanouche		}
265*46c4c49dSIbrahim Kanouche
266*46c4c49dSIbrahim Kanouche		if _, err := io.Copy(&b, tr); err != nil {
267*46c4c49dSIbrahim Kanouche			return err
268*46c4c49dSIbrahim Kanouche		}
269*46c4c49dSIbrahim Kanouche
270*46c4c49dSIbrahim Kanouche		var set searchset.SearchSet
271*46c4c49dSIbrahim Kanouche		searchset.Deserialize(&b, &set)
272*46c4c49dSIbrahim Kanouche
273*46c4c49dSIbrahim Kanouche		muVals.Lock()
274*46c4c49dSIbrahim Kanouche		vals = append(vals, archivedValue{name, normalized, &set})
275*46c4c49dSIbrahim Kanouche		muVals.Unlock()
276*46c4c49dSIbrahim Kanouche	}
277*46c4c49dSIbrahim Kanouche
278*46c4c49dSIbrahim Kanouche	for _, v := range vals {
279*46c4c49dSIbrahim Kanouche		if err = c.c.AddPrecomputedValue(v.name, v.normalized, v.set); err != nil {
280*46c4c49dSIbrahim Kanouche			return err
281*46c4c49dSIbrahim Kanouche		}
282*46c4c49dSIbrahim Kanouche	}
283*46c4c49dSIbrahim Kanouche	return nil
284*46c4c49dSIbrahim Kanouche}
285*46c4c49dSIbrahim Kanouche
// endOfLicenseText is text commonly associated with the end of a license. We
// can remove text that occurs after it.
var endOfLicenseText = []string{
	"END OF TERMS AND CONDITIONS",
}

// TrimExtraneousTrailingText cuts s off right after the last occurrence of an
// end-of-license marker, dropping trailing text that is not part of the
// license proper. The marker itself is kept. The markers are tried in order
// and the first one found in s decides the cut; s is returned unchanged when
// none is present. The search is case-sensitive.
func TrimExtraneousTrailingText(s string) string {
	for _, marker := range endOfLicenseText {
		pos := strings.LastIndex(s, marker)
		if pos < 0 {
			continue
		}
		return s[:pos+len(marker)]
	}
	return s
}
302*46c4c49dSIbrahim Kanouche
// copyrightRE matches a copyright line: the word "Copyright", an optional
// "©" or "(c)", one or more years, an optional "by", the holder (the only
// capture group), and an optional "All rights reserved" up to the line end.
var copyrightRE = regexp.MustCompile(`(?m)(?i:Copyright)\s+(?i:©\s+|\(c\)\s+)?(?:\d{2,4})(?:[-,]\s*\d{2,4})*,?\s*(?i:by)?\s*(.*?(?i:\s+Inc\.)?)[.,]?\s*(?i:All rights reserved\.?)?\s*$`)

// CopyrightHolder finds a copyright notification, if it exists, and returns
// the copyright holder. Only the first notification in contents is used; the
// empty string is returned when there is none.
func CopyrightHolder(contents string) string {
	groups := copyrightRE.FindStringSubmatch(contents)
	if len(groups) != 2 {
		return ""
	}
	return groups[1]
}
314*46c4c49dSIbrahim Kanouche
315*46c4c49dSIbrahim Kanouchevar publicDomainRE = regexp.MustCompile("(?i)(this file )?is( in the)? public domain")
316*46c4c49dSIbrahim Kanouche
317*46c4c49dSIbrahim Kanouche// HasPublicDomainNotice performs a simple regex over the contents to see if a
318*46c4c49dSIbrahim Kanouche// public domain notice is in there. As you can imagine, this isn't 100%
319*46c4c49dSIbrahim Kanouche// definitive, but can be useful if a license match isn't found.
320*46c4c49dSIbrahim Kanouchefunc (c *License) HasPublicDomainNotice(contents string) bool {
321*46c4c49dSIbrahim Kanouche	return publicDomainRE.FindString(contents) != ""
322*46c4c49dSIbrahim Kanouche}
323*46c4c49dSIbrahim Kanouche
// ignorableTexts is a list of lines at the start of the string we can remove
// to get a cleaner match. Each expression is matched against a single line
// with surrounding whitespace trimmed.
var ignorableTexts = []*regexp.Regexp{
	regexp.MustCompile(`(?i)^(?:the )?mit license(?: \(mit\))?$`),
	regexp.MustCompile(`(?i)^(?:new )?bsd license$`),
	regexp.MustCompile(`(?i)^copyright and permission notice$`),
	regexp.MustCompile(`(?i)^copyright (\(c\) )?(\[yyyy\]|\d{4})[,.]? .*$`),
	regexp.MustCompile(`(?i)^(all|some) rights reserved\.?$`),
	regexp.MustCompile(`(?i)^@license$`),
	regexp.MustCompile(`^\s*$`),
}

// removeIgnorableTexts removes common text, which is not important for
// classification, that shows up before the body of the license.
//
// Leading lines are dropped for as long as they match one of ignorableTexts;
// the first line that matches none of them, and everything after it, is kept
// untouched. Trailing newlines of s are collapsed and the result always ends
// with exactly one "\n", so an input made up solely of ignorable lines yields
// "\n".
func removeIgnorableTexts(s string) string {
	lines := strings.Split(strings.TrimRight(s, "\n"), "\n")
	start := 0
	for start < len(lines) && matchesIgnorableText(strings.TrimSpace(lines[start])) {
		start++
	}
	// start never exceeds len(lines), so the slice expression is always valid
	// and simply becomes empty when every line was ignorable.
	return strings.Join(lines[start:], "\n") + "\n"
}

// matchesIgnorableText reports whether line matches any of ignorableTexts.
func matchesIgnorableText(line string) bool {
	for _, re := range ignorableTexts {
		if re.MatchString(line) {
			return true
		}
	}
	return false
}
360*46c4c49dSIbrahim Kanouche
// removeShebangLine removes the '#!...' line if it's the first line in the
// file. Note that if it's the only line in a comment, it won't be removed:
// a string without any newline is returned unchanged.
func removeShebangLine(s string) string {
	if !strings.HasPrefix(s, "#!") {
		return s
	}
	eol := strings.Index(s, "\n")
	if eol < 0 {
		// The shebang is the only line.
		return s
	}
	return s[eol+1:]
}
371*46c4c49dSIbrahim Kanouche
// isDecorative reports whether s contains no letter and no digit at all, i.e.
// it is made up purely of other characters. The empty string is decorative.
func isDecorative(s string) bool {
	isWordRune := func(r rune) bool {
		return unicode.IsLetter(r) || unicode.IsDigit(r)
	}
	return strings.IndexFunc(s, isWordRune) < 0
}
382*46c4c49dSIbrahim Kanouche
// nonWords matches a run of one or more ASCII punctuation characters.
var nonWords = regexp.MustCompile("[[:punct:]]+")

// RemoveNonWords removes non-words from the string: every run of punctuation
// is replaced by a single space.
func RemoveNonWords(s string) string {
	const blank = " "
	return nonWords.ReplaceAllLiteralString(s, blank)
}
389*46c4c49dSIbrahim Kanouche
// interchangeablePunctuation is punctuation that can be normalized. The
// entries are applied in the order listed.
var interchangeablePunctuation = []struct {
	interchangeable *regexp.Regexp
	substitute      string
}{
	// Hyphen, Dash, En Dash, and Em Dash.
	{regexp.MustCompile(`[-‒–—]`), "-"},
	// Single, Double, Curly Single, and Curly Double.
	{regexp.MustCompile("['\"`‘’“”]"), "'"},
	// Copyright.
	{regexp.MustCompile("©"), "(c)"},
	// Hyphen-separated words.
	{regexp.MustCompile(`(\S)-\s+(\S)`), "${1}-${2}"},
	// Currency and Section. (Different copies of the CDDL use each marker.)
	{regexp.MustCompile("[§¤]"), "(s)"},
	// Middle Dot
	{regexp.MustCompile("·"), "*"},
}

// NormalizePunctuation rewrites interchangeable punctuation in s to one
// canonical form: the dash variants become "-", the quote variants become
// "'", "©" becomes "(c)", "§" and "¤" become "(s)", the middle dot becomes
// "*", and whitespace following a hyphen that is attached to a word is
// removed.
func NormalizePunctuation(s string) string {
	for i := range interchangeablePunctuation {
		rule := &interchangeablePunctuation[i]
		s = rule.interchangeable.ReplaceAllString(s, rule.substitute)
	}
	return s
}
416*46c4c49dSIbrahim Kanouche
// interchangeableWords are words we can substitute for a normalized form
// without changing the meaning of the license. See
// https://spdx.org/spdx-license-list/matching-guidelines for the list.
//
// The patterns are case-insensitive and are not anchored on word boundaries,
// and the entries are applied in the order listed.
// NOTE(review): because of that, a pattern also matches inside longer words,
// e.g. "Fulfil" matches the start of "Fulfill" — confirm this is intended
// before changing any entry, since stored normalized texts depend on it.
var interchangeableWords = []struct {
	interchangeable *regexp.Regexp
	substitute      string
}{
	{regexp.MustCompile("(?i)Acknowledgment"), "Acknowledgement"},
	{regexp.MustCompile("(?i)Analogue"), "Analog"},
	{regexp.MustCompile("(?i)Analyse"), "Analyze"},
	{regexp.MustCompile("(?i)Artefact"), "Artifact"},
	{regexp.MustCompile("(?i)Authorisation"), "Authorization"},
	{regexp.MustCompile("(?i)Authorised"), "Authorized"},
	{regexp.MustCompile("(?i)Calibre"), "Caliber"},
	{regexp.MustCompile("(?i)Cancelled"), "Canceled"},
	{regexp.MustCompile("(?i)Capitalisations"), "Capitalizations"},
	{regexp.MustCompile("(?i)Catalogue"), "Catalog"},
	{regexp.MustCompile("(?i)Categorise"), "Categorize"},
	{regexp.MustCompile("(?i)Centre"), "Center"},
	{regexp.MustCompile("(?i)Emphasised"), "Emphasized"},
	{regexp.MustCompile("(?i)Favour"), "Favor"},
	{regexp.MustCompile("(?i)Favourite"), "Favorite"},
	{regexp.MustCompile("(?i)Fulfil"), "Fulfill"},
	{regexp.MustCompile("(?i)Fulfilment"), "Fulfillment"},
	{regexp.MustCompile("(?i)Initialise"), "Initialize"},
	{regexp.MustCompile("(?i)Judgment"), "Judgement"},
	{regexp.MustCompile("(?i)Labelling"), "Labeling"},
	{regexp.MustCompile("(?i)Labour"), "Labor"},
	{regexp.MustCompile("(?i)Licence"), "License"},
	{regexp.MustCompile("(?i)Maximise"), "Maximize"},
	{regexp.MustCompile("(?i)Modelled"), "Modeled"},
	{regexp.MustCompile("(?i)Modelling"), "Modeling"},
	{regexp.MustCompile("(?i)Offence"), "Offense"},
	{regexp.MustCompile("(?i)Optimise"), "Optimize"},
	{regexp.MustCompile("(?i)Organisation"), "Organization"},
	{regexp.MustCompile("(?i)Organise"), "Organize"},
	{regexp.MustCompile("(?i)Practise"), "Practice"},
	{regexp.MustCompile("(?i)Programme"), "Program"},
	{regexp.MustCompile("(?i)Realise"), "Realize"},
	{regexp.MustCompile("(?i)Recognise"), "Recognize"},
	{regexp.MustCompile("(?i)Signalling"), "Signaling"},
	{regexp.MustCompile("(?i)Sub[- ]license"), "Sublicense"},
	{regexp.MustCompile("(?i)Utilisation"), "Utilization"},
	{regexp.MustCompile("(?i)Whilst"), "While"},
	{regexp.MustCompile("(?i)Wilful"), "Wilfull"},
	{regexp.MustCompile("(?i)Non-commercial"), "Noncommercial"},
	{regexp.MustCompile("(?i)Per cent"), "Percent"},
}

// NormalizeEquivalentWords replaces every spelling variant listed in
// interchangeableWords with its canonical form. The canonical form is
// inserted exactly as written in the table, whatever the case of the text it
// replaces.
func NormalizeEquivalentWords(s string) string {
	for i := range interchangeableWords {
		word := &interchangeableWords[i]
		s = word.interchangeable.ReplaceAllString(s, word.substitute)
	}
	return s
}
472*46c4c49dSIbrahim Kanouche}
473