// Copyright 2017 Google Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// Package licenseclassifier provides methods to identify the open source
// license that most closely matches an unknown license.
17*46c4c49dSIbrahim Kanouchepackage licenseclassifier 18*46c4c49dSIbrahim Kanouche 19*46c4c49dSIbrahim Kanoucheimport ( 20*46c4c49dSIbrahim Kanouche "archive/tar" 21*46c4c49dSIbrahim Kanouche "bytes" 22*46c4c49dSIbrahim Kanouche "compress/gzip" 23*46c4c49dSIbrahim Kanouche "fmt" 24*46c4c49dSIbrahim Kanouche "html" 25*46c4c49dSIbrahim Kanouche "io" 26*46c4c49dSIbrahim Kanouche "math" 27*46c4c49dSIbrahim Kanouche "regexp" 28*46c4c49dSIbrahim Kanouche "sort" 29*46c4c49dSIbrahim Kanouche "strings" 30*46c4c49dSIbrahim Kanouche "sync" 31*46c4c49dSIbrahim Kanouche "unicode" 32*46c4c49dSIbrahim Kanouche 33*46c4c49dSIbrahim Kanouche "github.com/google/licenseclassifier/stringclassifier" 34*46c4c49dSIbrahim Kanouche "github.com/google/licenseclassifier/stringclassifier/searchset" 35*46c4c49dSIbrahim Kanouche) 36*46c4c49dSIbrahim Kanouche 37*46c4c49dSIbrahim Kanouche// DefaultConfidenceThreshold is the minimum confidence percentage we're willing to accept in order 38*46c4c49dSIbrahim Kanouche// to say that a match is good. 39*46c4c49dSIbrahim Kanoucheconst DefaultConfidenceThreshold = 0.80 40*46c4c49dSIbrahim Kanouche 41*46c4c49dSIbrahim Kanouchevar ( 42*46c4c49dSIbrahim Kanouche // Normalizers is a list of functions that get applied to the strings 43*46c4c49dSIbrahim Kanouche // before they are registered with the string classifier. 
44*46c4c49dSIbrahim Kanouche Normalizers = []stringclassifier.NormalizeFunc{ 45*46c4c49dSIbrahim Kanouche html.UnescapeString, 46*46c4c49dSIbrahim Kanouche removeShebangLine, 47*46c4c49dSIbrahim Kanouche RemoveNonWords, 48*46c4c49dSIbrahim Kanouche NormalizeEquivalentWords, 49*46c4c49dSIbrahim Kanouche NormalizePunctuation, 50*46c4c49dSIbrahim Kanouche strings.ToLower, 51*46c4c49dSIbrahim Kanouche removeIgnorableTexts, 52*46c4c49dSIbrahim Kanouche stringclassifier.FlattenWhitespace, 53*46c4c49dSIbrahim Kanouche strings.TrimSpace, 54*46c4c49dSIbrahim Kanouche } 55*46c4c49dSIbrahim Kanouche 56*46c4c49dSIbrahim Kanouche // commonLicenseWords are words that are common to all known licenses. 57*46c4c49dSIbrahim Kanouche // If an unknown text doesn't have at least one of these, then we can 58*46c4c49dSIbrahim Kanouche // ignore it. 59*46c4c49dSIbrahim Kanouche commonLicenseWords = []*regexp.Regexp{ 60*46c4c49dSIbrahim Kanouche regexp.MustCompile(`(?i)\bcode\b`), 61*46c4c49dSIbrahim Kanouche regexp.MustCompile(`(?i)\blicense\b`), 62*46c4c49dSIbrahim Kanouche regexp.MustCompile(`(?i)\boriginal\b`), 63*46c4c49dSIbrahim Kanouche regexp.MustCompile(`(?i)\brights\b`), 64*46c4c49dSIbrahim Kanouche regexp.MustCompile(`(?i)\bsoftware\b`), 65*46c4c49dSIbrahim Kanouche regexp.MustCompile(`(?i)\bterms\b`), 66*46c4c49dSIbrahim Kanouche regexp.MustCompile(`(?i)\bversion\b`), 67*46c4c49dSIbrahim Kanouche regexp.MustCompile(`(?i)\bwork\b`), 68*46c4c49dSIbrahim Kanouche } 69*46c4c49dSIbrahim Kanouche) 70*46c4c49dSIbrahim Kanouche 71*46c4c49dSIbrahim Kanouche// License is a classifier pre-loaded with known open source licenses. 72*46c4c49dSIbrahim Kanouchetype License struct { 73*46c4c49dSIbrahim Kanouche c *stringclassifier.Classifier 74*46c4c49dSIbrahim Kanouche 75*46c4c49dSIbrahim Kanouche // Threshold is the lowest confidence percentage acceptable for the 76*46c4c49dSIbrahim Kanouche // classifier. 
77*46c4c49dSIbrahim Kanouche Threshold float64 78*46c4c49dSIbrahim Kanouche 79*46c4c49dSIbrahim Kanouche // archive is a function that must return the contents of the license archive. 80*46c4c49dSIbrahim Kanouche // When archive is nil, ReadLicenseFile(LicenseFile) is used to retrieve the 81*46c4c49dSIbrahim Kanouche // contents. 82*46c4c49dSIbrahim Kanouche archive func() ([]byte, error) 83*46c4c49dSIbrahim Kanouche} 84*46c4c49dSIbrahim Kanouche 85*46c4c49dSIbrahim Kanouche// OptionFunc set options on a License struct. 86*46c4c49dSIbrahim Kanouchetype OptionFunc func(l *License) error 87*46c4c49dSIbrahim Kanouche 88*46c4c49dSIbrahim Kanouche// Archive is an OptionFunc to specify the location of the license archive file. 89*46c4c49dSIbrahim Kanouchefunc Archive(f string) OptionFunc { 90*46c4c49dSIbrahim Kanouche return func(l *License) error { 91*46c4c49dSIbrahim Kanouche l.archive = func() ([]byte, error) { return ReadLicenseFile(f) } 92*46c4c49dSIbrahim Kanouche return nil 93*46c4c49dSIbrahim Kanouche } 94*46c4c49dSIbrahim Kanouche} 95*46c4c49dSIbrahim Kanouche 96*46c4c49dSIbrahim Kanouche// ArchiveBytes is an OptionFunc that provides the contents of the license archive file. 97*46c4c49dSIbrahim Kanouche// The caller must not overwrite the contents of b as it is not copied. 98*46c4c49dSIbrahim Kanouchefunc ArchiveBytes(b []byte) OptionFunc { 99*46c4c49dSIbrahim Kanouche return func(l *License) error { 100*46c4c49dSIbrahim Kanouche l.archive = func() ([]byte, error) { return b, nil } 101*46c4c49dSIbrahim Kanouche return nil 102*46c4c49dSIbrahim Kanouche } 103*46c4c49dSIbrahim Kanouche} 104*46c4c49dSIbrahim Kanouche 105*46c4c49dSIbrahim Kanouche// ArchiveFunc is an OptionFunc that provides a function that must return the contents 106*46c4c49dSIbrahim Kanouche// of the license archive file. 
107*46c4c49dSIbrahim Kanouchefunc ArchiveFunc(f func() ([]byte, error)) OptionFunc { 108*46c4c49dSIbrahim Kanouche return func(l *License) error { 109*46c4c49dSIbrahim Kanouche l.archive = f 110*46c4c49dSIbrahim Kanouche return nil 111*46c4c49dSIbrahim Kanouche } 112*46c4c49dSIbrahim Kanouche} 113*46c4c49dSIbrahim Kanouche 114*46c4c49dSIbrahim Kanouche// New creates a license classifier and pre-loads it with known open source licenses. 115*46c4c49dSIbrahim Kanouchefunc New(threshold float64, options ...OptionFunc) (*License, error) { 116*46c4c49dSIbrahim Kanouche classifier := &License{ 117*46c4c49dSIbrahim Kanouche c: stringclassifier.New(threshold, Normalizers...), 118*46c4c49dSIbrahim Kanouche Threshold: threshold, 119*46c4c49dSIbrahim Kanouche } 120*46c4c49dSIbrahim Kanouche 121*46c4c49dSIbrahim Kanouche for _, o := range options { 122*46c4c49dSIbrahim Kanouche err := o(classifier) 123*46c4c49dSIbrahim Kanouche if err != nil { 124*46c4c49dSIbrahim Kanouche return nil, fmt.Errorf("error setting option %v: %v", o, err) 125*46c4c49dSIbrahim Kanouche } 126*46c4c49dSIbrahim Kanouche } 127*46c4c49dSIbrahim Kanouche 128*46c4c49dSIbrahim Kanouche if err := classifier.registerLicenses(); err != nil { 129*46c4c49dSIbrahim Kanouche return nil, fmt.Errorf("cannot register licenses from archive: %v", err) 130*46c4c49dSIbrahim Kanouche } 131*46c4c49dSIbrahim Kanouche return classifier, nil 132*46c4c49dSIbrahim Kanouche} 133*46c4c49dSIbrahim Kanouche 134*46c4c49dSIbrahim Kanouche// NewWithForbiddenLicenses creates a license classifier and pre-loads it with 135*46c4c49dSIbrahim Kanouche// known open source licenses which are forbidden. 136*46c4c49dSIbrahim Kanouchefunc NewWithForbiddenLicenses(threshold float64, options ...OptionFunc) (*License, error) { 137*46c4c49dSIbrahim Kanouche opts := []OptionFunc{Archive(ForbiddenLicenseArchive)} 138*46c4c49dSIbrahim Kanouche opts = append(opts, options...) 139*46c4c49dSIbrahim Kanouche return New(threshold, opts...) 
140*46c4c49dSIbrahim Kanouche} 141*46c4c49dSIbrahim Kanouche 142*46c4c49dSIbrahim Kanouche// WithinConfidenceThreshold returns true if the confidence value is above or 143*46c4c49dSIbrahim Kanouche// equal to the confidence threshold. 144*46c4c49dSIbrahim Kanouchefunc (c *License) WithinConfidenceThreshold(conf float64) bool { 145*46c4c49dSIbrahim Kanouche return conf > c.Threshold || math.Abs(conf-c.Threshold) < math.SmallestNonzeroFloat64 146*46c4c49dSIbrahim Kanouche} 147*46c4c49dSIbrahim Kanouche 148*46c4c49dSIbrahim Kanouche// NearestMatch returns the "nearest" match to the given set of known licenses. 149*46c4c49dSIbrahim Kanouche// Returned are the name of the license, and a confidence percentage indicating 150*46c4c49dSIbrahim Kanouche// how confident the classifier is in the result. 151*46c4c49dSIbrahim Kanouchefunc (c *License) NearestMatch(contents string) *stringclassifier.Match { 152*46c4c49dSIbrahim Kanouche if !c.hasCommonLicenseWords(contents) { 153*46c4c49dSIbrahim Kanouche return nil 154*46c4c49dSIbrahim Kanouche } 155*46c4c49dSIbrahim Kanouche m := c.c.NearestMatch(contents) 156*46c4c49dSIbrahim Kanouche m.Name = strings.TrimSuffix(m.Name, ".header") 157*46c4c49dSIbrahim Kanouche return m 158*46c4c49dSIbrahim Kanouche} 159*46c4c49dSIbrahim Kanouche 160*46c4c49dSIbrahim Kanouche// MultipleMatch matches all licenses within an unknown text. 
161*46c4c49dSIbrahim Kanouchefunc (c *License) MultipleMatch(contents string, includeHeaders bool) stringclassifier.Matches { 162*46c4c49dSIbrahim Kanouche norm := normalizeText(contents) 163*46c4c49dSIbrahim Kanouche if !c.hasCommonLicenseWords(norm) { 164*46c4c49dSIbrahim Kanouche return nil 165*46c4c49dSIbrahim Kanouche } 166*46c4c49dSIbrahim Kanouche 167*46c4c49dSIbrahim Kanouche m := make(map[stringclassifier.Match]bool) 168*46c4c49dSIbrahim Kanouche var matches stringclassifier.Matches 169*46c4c49dSIbrahim Kanouche for _, v := range c.c.MultipleMatch(norm) { 170*46c4c49dSIbrahim Kanouche if !c.WithinConfidenceThreshold(v.Confidence) { 171*46c4c49dSIbrahim Kanouche continue 172*46c4c49dSIbrahim Kanouche } 173*46c4c49dSIbrahim Kanouche 174*46c4c49dSIbrahim Kanouche if !includeHeaders && strings.HasSuffix(v.Name, ".header") { 175*46c4c49dSIbrahim Kanouche continue 176*46c4c49dSIbrahim Kanouche } 177*46c4c49dSIbrahim Kanouche 178*46c4c49dSIbrahim Kanouche v.Name = strings.TrimSuffix(v.Name, ".header") 179*46c4c49dSIbrahim Kanouche if re, ok := forbiddenRegexps[v.Name]; ok && !re.MatchString(norm) { 180*46c4c49dSIbrahim Kanouche continue 181*46c4c49dSIbrahim Kanouche } 182*46c4c49dSIbrahim Kanouche if _, ok := m[*v]; !ok { 183*46c4c49dSIbrahim Kanouche m[*v] = true 184*46c4c49dSIbrahim Kanouche matches = append(matches, v) 185*46c4c49dSIbrahim Kanouche } 186*46c4c49dSIbrahim Kanouche } 187*46c4c49dSIbrahim Kanouche sort.Sort(matches) 188*46c4c49dSIbrahim Kanouche return matches 189*46c4c49dSIbrahim Kanouche} 190*46c4c49dSIbrahim Kanouche 191*46c4c49dSIbrahim Kanouchefunc normalizeText(s string) string { 192*46c4c49dSIbrahim Kanouche for _, n := range Normalizers { 193*46c4c49dSIbrahim Kanouche s = n(s) 194*46c4c49dSIbrahim Kanouche } 195*46c4c49dSIbrahim Kanouche return s 196*46c4c49dSIbrahim Kanouche} 197*46c4c49dSIbrahim Kanouche 198*46c4c49dSIbrahim Kanouche// hasCommonLicenseWords returns true if the unknown text has at least one word 199*46c4c49dSIbrahim 
Kanouche// that's common to all licenses. 200*46c4c49dSIbrahim Kanouchefunc (c *License) hasCommonLicenseWords(s string) bool { 201*46c4c49dSIbrahim Kanouche for _, re := range commonLicenseWords { 202*46c4c49dSIbrahim Kanouche if re.MatchString(s) { 203*46c4c49dSIbrahim Kanouche return true 204*46c4c49dSIbrahim Kanouche } 205*46c4c49dSIbrahim Kanouche } 206*46c4c49dSIbrahim Kanouche return false 207*46c4c49dSIbrahim Kanouche} 208*46c4c49dSIbrahim Kanouche 209*46c4c49dSIbrahim Kanouchetype archivedValue struct { 210*46c4c49dSIbrahim Kanouche name string 211*46c4c49dSIbrahim Kanouche normalized string 212*46c4c49dSIbrahim Kanouche set *searchset.SearchSet 213*46c4c49dSIbrahim Kanouche} 214*46c4c49dSIbrahim Kanouche 215*46c4c49dSIbrahim Kanouche// registerLicenses loads all known licenses and adds them to c as known values 216*46c4c49dSIbrahim Kanouche// for comparison. The allocated space after ingesting the 'licenses.db' 217*46c4c49dSIbrahim Kanouche// archive is ~167M. 218*46c4c49dSIbrahim Kanouchefunc (c *License) registerLicenses() error { 219*46c4c49dSIbrahim Kanouche var contents []byte 220*46c4c49dSIbrahim Kanouche var err error 221*46c4c49dSIbrahim Kanouche if c.archive == nil { 222*46c4c49dSIbrahim Kanouche contents, err = ReadLicenseFile(LicenseArchive) 223*46c4c49dSIbrahim Kanouche } else { 224*46c4c49dSIbrahim Kanouche contents, err = c.archive() 225*46c4c49dSIbrahim Kanouche } 226*46c4c49dSIbrahim Kanouche if err != nil { 227*46c4c49dSIbrahim Kanouche return err 228*46c4c49dSIbrahim Kanouche } 229*46c4c49dSIbrahim Kanouche 230*46c4c49dSIbrahim Kanouche reader := bytes.NewReader(contents) 231*46c4c49dSIbrahim Kanouche gr, err := gzip.NewReader(reader) 232*46c4c49dSIbrahim Kanouche if err != nil { 233*46c4c49dSIbrahim Kanouche return err 234*46c4c49dSIbrahim Kanouche } 235*46c4c49dSIbrahim Kanouche defer gr.Close() 236*46c4c49dSIbrahim Kanouche 237*46c4c49dSIbrahim Kanouche tr := tar.NewReader(gr) 238*46c4c49dSIbrahim Kanouche 239*46c4c49dSIbrahim 
Kanouche var muVals sync.Mutex 240*46c4c49dSIbrahim Kanouche var vals []archivedValue 241*46c4c49dSIbrahim Kanouche for i := 0; ; i++ { 242*46c4c49dSIbrahim Kanouche hdr, err := tr.Next() 243*46c4c49dSIbrahim Kanouche if err == io.EOF { 244*46c4c49dSIbrahim Kanouche break 245*46c4c49dSIbrahim Kanouche } 246*46c4c49dSIbrahim Kanouche if err != nil { 247*46c4c49dSIbrahim Kanouche return err 248*46c4c49dSIbrahim Kanouche } 249*46c4c49dSIbrahim Kanouche 250*46c4c49dSIbrahim Kanouche name := strings.TrimSuffix(hdr.Name, ".txt") 251*46c4c49dSIbrahim Kanouche 252*46c4c49dSIbrahim Kanouche // Read normalized value. 253*46c4c49dSIbrahim Kanouche var b bytes.Buffer 254*46c4c49dSIbrahim Kanouche if _, err := io.Copy(&b, tr); err != nil { 255*46c4c49dSIbrahim Kanouche return err 256*46c4c49dSIbrahim Kanouche } 257*46c4c49dSIbrahim Kanouche normalized := b.String() 258*46c4c49dSIbrahim Kanouche b.Reset() 259*46c4c49dSIbrahim Kanouche 260*46c4c49dSIbrahim Kanouche // Read precomputed hashes. 261*46c4c49dSIbrahim Kanouche hdr, err = tr.Next() 262*46c4c49dSIbrahim Kanouche if err != nil { 263*46c4c49dSIbrahim Kanouche return err 264*46c4c49dSIbrahim Kanouche } 265*46c4c49dSIbrahim Kanouche 266*46c4c49dSIbrahim Kanouche if _, err := io.Copy(&b, tr); err != nil { 267*46c4c49dSIbrahim Kanouche return err 268*46c4c49dSIbrahim Kanouche } 269*46c4c49dSIbrahim Kanouche 270*46c4c49dSIbrahim Kanouche var set searchset.SearchSet 271*46c4c49dSIbrahim Kanouche searchset.Deserialize(&b, &set) 272*46c4c49dSIbrahim Kanouche 273*46c4c49dSIbrahim Kanouche muVals.Lock() 274*46c4c49dSIbrahim Kanouche vals = append(vals, archivedValue{name, normalized, &set}) 275*46c4c49dSIbrahim Kanouche muVals.Unlock() 276*46c4c49dSIbrahim Kanouche } 277*46c4c49dSIbrahim Kanouche 278*46c4c49dSIbrahim Kanouche for _, v := range vals { 279*46c4c49dSIbrahim Kanouche if err = c.c.AddPrecomputedValue(v.name, v.normalized, v.set); err != nil { 280*46c4c49dSIbrahim Kanouche return err 281*46c4c49dSIbrahim Kanouche } 
282*46c4c49dSIbrahim Kanouche } 283*46c4c49dSIbrahim Kanouche return nil 284*46c4c49dSIbrahim Kanouche} 285*46c4c49dSIbrahim Kanouche 286*46c4c49dSIbrahim Kanouche// endOfLicenseText is text commonly associated with the end of a license. We 287*46c4c49dSIbrahim Kanouche// can remove text that occurs after it. 288*46c4c49dSIbrahim Kanouchevar endOfLicenseText = []string{ 289*46c4c49dSIbrahim Kanouche "END OF TERMS AND CONDITIONS", 290*46c4c49dSIbrahim Kanouche} 291*46c4c49dSIbrahim Kanouche 292*46c4c49dSIbrahim Kanouche// TrimExtraneousTrailingText removes text after an obvious end of the license 293*46c4c49dSIbrahim Kanouche// and does not include substantive text of the license. 294*46c4c49dSIbrahim Kanouchefunc TrimExtraneousTrailingText(s string) string { 295*46c4c49dSIbrahim Kanouche for _, e := range endOfLicenseText { 296*46c4c49dSIbrahim Kanouche if i := strings.LastIndex(s, e); i != -1 { 297*46c4c49dSIbrahim Kanouche return s[:i+len(e)] 298*46c4c49dSIbrahim Kanouche } 299*46c4c49dSIbrahim Kanouche } 300*46c4c49dSIbrahim Kanouche return s 301*46c4c49dSIbrahim Kanouche} 302*46c4c49dSIbrahim Kanouche 303*46c4c49dSIbrahim Kanouchevar copyrightRE = regexp.MustCompile(`(?m)(?i:Copyright)\s+(?i:©\s+|\(c\)\s+)?(?:\d{2,4})(?:[-,]\s*\d{2,4})*,?\s*(?i:by)?\s*(.*?(?i:\s+Inc\.)?)[.,]?\s*(?i:All rights reserved\.?)?\s*$`) 304*46c4c49dSIbrahim Kanouche 305*46c4c49dSIbrahim Kanouche// CopyrightHolder finds a copyright notification, if it exists, and returns 306*46c4c49dSIbrahim Kanouche// the copyright holder. 
307*46c4c49dSIbrahim Kanouchefunc CopyrightHolder(contents string) string { 308*46c4c49dSIbrahim Kanouche matches := copyrightRE.FindStringSubmatch(contents) 309*46c4c49dSIbrahim Kanouche if len(matches) == 2 { 310*46c4c49dSIbrahim Kanouche return matches[1] 311*46c4c49dSIbrahim Kanouche } 312*46c4c49dSIbrahim Kanouche return "" 313*46c4c49dSIbrahim Kanouche} 314*46c4c49dSIbrahim Kanouche 315*46c4c49dSIbrahim Kanouchevar publicDomainRE = regexp.MustCompile("(?i)(this file )?is( in the)? public domain") 316*46c4c49dSIbrahim Kanouche 317*46c4c49dSIbrahim Kanouche// HasPublicDomainNotice performs a simple regex over the contents to see if a 318*46c4c49dSIbrahim Kanouche// public domain notice is in there. As you can imagine, this isn't 100% 319*46c4c49dSIbrahim Kanouche// definitive, but can be useful if a license match isn't found. 320*46c4c49dSIbrahim Kanouchefunc (c *License) HasPublicDomainNotice(contents string) bool { 321*46c4c49dSIbrahim Kanouche return publicDomainRE.FindString(contents) != "" 322*46c4c49dSIbrahim Kanouche} 323*46c4c49dSIbrahim Kanouche 324*46c4c49dSIbrahim Kanouche// ignorableTexts is a list of lines at the start of the string we can remove 325*46c4c49dSIbrahim Kanouche// to get a cleaner match. 326*46c4c49dSIbrahim Kanouchevar ignorableTexts = []*regexp.Regexp{ 327*46c4c49dSIbrahim Kanouche regexp.MustCompile(`(?i)^(?:the )?mit license(?: \(mit\))?$`), 328*46c4c49dSIbrahim Kanouche regexp.MustCompile(`(?i)^(?:new )?bsd license$`), 329*46c4c49dSIbrahim Kanouche regexp.MustCompile(`(?i)^copyright and permission notice$`), 330*46c4c49dSIbrahim Kanouche regexp.MustCompile(`(?i)^copyright (\(c\) )?(\[yyyy\]|\d{4})[,.]? 
.*$`), 331*46c4c49dSIbrahim Kanouche regexp.MustCompile(`(?i)^(all|some) rights reserved\.?$`), 332*46c4c49dSIbrahim Kanouche regexp.MustCompile(`(?i)^@license$`), 333*46c4c49dSIbrahim Kanouche regexp.MustCompile(`^\s*$`), 334*46c4c49dSIbrahim Kanouche} 335*46c4c49dSIbrahim Kanouche 336*46c4c49dSIbrahim Kanouche// removeIgnorableTexts removes common text, which is not important for 337*46c4c49dSIbrahim Kanouche// classification, that shows up before the body of the license. 338*46c4c49dSIbrahim Kanouchefunc removeIgnorableTexts(s string) string { 339*46c4c49dSIbrahim Kanouche lines := strings.Split(strings.TrimRight(s, "\n"), "\n") 340*46c4c49dSIbrahim Kanouche var start int 341*46c4c49dSIbrahim Kanouche for ; start < len(lines); start++ { 342*46c4c49dSIbrahim Kanouche line := strings.TrimSpace(lines[start]) 343*46c4c49dSIbrahim Kanouche var matches bool 344*46c4c49dSIbrahim Kanouche for _, re := range ignorableTexts { 345*46c4c49dSIbrahim Kanouche if re.MatchString(line) { 346*46c4c49dSIbrahim Kanouche matches = true 347*46c4c49dSIbrahim Kanouche break 348*46c4c49dSIbrahim Kanouche } 349*46c4c49dSIbrahim Kanouche } 350*46c4c49dSIbrahim Kanouche if !matches { 351*46c4c49dSIbrahim Kanouche break 352*46c4c49dSIbrahim Kanouche } 353*46c4c49dSIbrahim Kanouche } 354*46c4c49dSIbrahim Kanouche end := len(lines) 355*46c4c49dSIbrahim Kanouche if start > end { 356*46c4c49dSIbrahim Kanouche return "\n" 357*46c4c49dSIbrahim Kanouche } 358*46c4c49dSIbrahim Kanouche return strings.Join(lines[start:end], "\n") + "\n" 359*46c4c49dSIbrahim Kanouche} 360*46c4c49dSIbrahim Kanouche 361*46c4c49dSIbrahim Kanouche// removeShebangLine removes the '#!...' line if it's the first line in the 362*46c4c49dSIbrahim Kanouche// file. Note that if it's the only line in a comment, it won't be removed. 
func removeShebangLine(s string) string {
	if !strings.HasPrefix(s, "#!") {
		return s
	}
	// A shebang with no newline after it is the only line; leave it alone.
	nl := strings.IndexByte(s, '\n')
	if nl < 0 {
		return s
	}
	return s[nl+1:]
}

// isDecorative returns true if the line is made up purely of non-letter and
// non-digit characters. An empty string is therefore decorative.
func isDecorative(s string) bool {
	for _, c := range s {
		if unicode.IsLetter(c) || unicode.IsDigit(c) {
			return false
		}
	}
	return true
}

// nonWords matches one or more consecutive ASCII punctuation characters.
var nonWords = regexp.MustCompile("[[:punct:]]+")

// RemoveNonWords removes non-words from the string. Each run of punctuation
// is replaced by a single space.
func RemoveNonWords(s string) string {
	return nonWords.ReplaceAllString(s, " ")
}

// interchangeablePunctuation is punctuation that can be normalized.
var interchangeablePunctuation = []struct {
	interchangeable *regexp.Regexp
	substitute      string
}{
	// Hyphen, Dash, En Dash, and Em Dash.
	{regexp.MustCompile(`[-‒–—]`), "-"},
	// Single, Double, Curly Single, and Curly Double.
	{regexp.MustCompile("['\"`‘’“”]"), "'"},
	// Copyright.
	{regexp.MustCompile("©"), "(c)"},
	// Hyphen-separated words.
	{regexp.MustCompile(`(\S)-\s+(\S)`), "${1}-${2}"},
	// Currency and Section. (Different copies of the CDDL use each marker.)
	{regexp.MustCompile("[§¤]"), "(s)"},
	// Middle Dot
	{regexp.MustCompile("·"), "*"},
}

// NormalizePunctuation takes all hyphens and quotes and normalizes them.
//
// The substitutions in interchangeablePunctuation are applied in order, so
// later rules see the output of earlier ones (for example, a dash is turned
// into "-" before hyphen-separated words are re-joined).
func NormalizePunctuation(s string) string {
	for _, p := range interchangeablePunctuation {
		s = p.interchangeable.ReplaceAllString(s, p.substitute)
	}
	return s
}

// interchangeableWords are words we can substitute for a normalized form
// without changing the meaning of the license. See
// https://spdx.org/spdx-license-list/matching-guidelines for the list.
var interchangeableWords = []struct {
	interchangeable *regexp.Regexp
	substitute      string
}{
	// Patterns are case-insensitive and unanchored, so they also match
	// inside longer words, and earlier entries run before later ones.
	//
	// NOTE(review): "Fulfil" and "Wilful" also match inside their own
	// replacements ("Fulfill" becomes "Fulfilll", "Wilfull" becomes
	// "Wilfulll"), and "Favourite" / "Fulfilment" can never match because
	// the shorter entry before them has already rewritten the text. The
	// table is left unchanged because pre-normalized license data may have
	// been generated with it; confirm before changing.
	{regexp.MustCompile("(?i)Acknowledgment"), "Acknowledgement"},
	{regexp.MustCompile("(?i)Analogue"), "Analog"},
	{regexp.MustCompile("(?i)Analyse"), "Analyze"},
	{regexp.MustCompile("(?i)Artefact"), "Artifact"},
	{regexp.MustCompile("(?i)Authorisation"), "Authorization"},
	{regexp.MustCompile("(?i)Authorised"), "Authorized"},
	{regexp.MustCompile("(?i)Calibre"), "Caliber"},
	{regexp.MustCompile("(?i)Cancelled"), "Canceled"},
	{regexp.MustCompile("(?i)Capitalisations"), "Capitalizations"},
	{regexp.MustCompile("(?i)Catalogue"), "Catalog"},
	{regexp.MustCompile("(?i)Categorise"), "Categorize"},
	{regexp.MustCompile("(?i)Centre"), "Center"},
	{regexp.MustCompile("(?i)Emphasised"), "Emphasized"},
	{regexp.MustCompile("(?i)Favour"), "Favor"},
	{regexp.MustCompile("(?i)Favourite"), "Favorite"},
	{regexp.MustCompile("(?i)Fulfil"), "Fulfill"},
	{regexp.MustCompile("(?i)Fulfilment"), "Fulfillment"},
	{regexp.MustCompile("(?i)Initialise"), "Initialize"},
	{regexp.MustCompile("(?i)Judgment"), "Judgement"},
	{regexp.MustCompile("(?i)Labelling"), "Labeling"},
	{regexp.MustCompile("(?i)Labour"), "Labor"},
	{regexp.MustCompile("(?i)Licence"), "License"},
	{regexp.MustCompile("(?i)Maximise"), "Maximize"},
	{regexp.MustCompile("(?i)Modelled"), "Modeled"},
	{regexp.MustCompile("(?i)Modelling"), "Modeling"},
	{regexp.MustCompile("(?i)Offence"), "Offense"},
	{regexp.MustCompile("(?i)Optimise"), "Optimize"},
	{regexp.MustCompile("(?i)Organisation"), "Organization"},
	{regexp.MustCompile("(?i)Organise"), "Organize"},
	{regexp.MustCompile("(?i)Practise"), "Practice"},
	{regexp.MustCompile("(?i)Programme"), "Program"},
	{regexp.MustCompile("(?i)Realise"), "Realize"},
	{regexp.MustCompile("(?i)Recognise"), "Recognize"},
	{regexp.MustCompile("(?i)Signalling"), "Signaling"},
	{regexp.MustCompile("(?i)Sub[- ]license"), "Sublicense"},
	{regexp.MustCompile("(?i)Utilisation"), "Utilization"},
	{regexp.MustCompile("(?i)Whilst"), "While"},
	{regexp.MustCompile("(?i)Wilful"), "Wilfull"},
	{regexp.MustCompile("(?i)Non-commercial"), "Noncommercial"},
	{regexp.MustCompile("(?i)Per cent"), "Percent"},
}

// NormalizeEquivalentWords normalizes equivalent words that are interchangeable.
//
// Every entry of interchangeableWords is applied to s in order. A match is
// replaced by the entry's capitalized substitute regardless of the case of
// the matched text.
func NormalizeEquivalentWords(s string) string {
	for _, w := range interchangeableWords {
		s = w.interchangeable.ReplaceAllString(s, w.substitute)
	}
	return s
}