1*46c4c49dSIbrahim Kanouche// Copyright 2020 Google Inc. 2*46c4c49dSIbrahim Kanouche// 3*46c4c49dSIbrahim Kanouche// Licensed under the Apache License, Version 2.0 (the "License"); 4*46c4c49dSIbrahim Kanouche// you may not use this file except in compliance with the License. 5*46c4c49dSIbrahim Kanouche// You may obtain a copy of the License at 6*46c4c49dSIbrahim Kanouche// 7*46c4c49dSIbrahim Kanouche// http://www.apache.org/licenses/LICENSE-2.0 8*46c4c49dSIbrahim Kanouche// 9*46c4c49dSIbrahim Kanouche// Unless required by applicable law or agreed to in writing, software 10*46c4c49dSIbrahim Kanouche// distributed under the License is distributed on an "AS IS" BASIS, 11*46c4c49dSIbrahim Kanouche// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12*46c4c49dSIbrahim Kanouche// See the License for the specific language governing permissions and 13*46c4c49dSIbrahim Kanouche// limitations under the License. 14*46c4c49dSIbrahim Kanouche 15*46c4c49dSIbrahim Kanouchepackage classifier 16*46c4c49dSIbrahim Kanouche 17*46c4c49dSIbrahim Kanoucheimport ( 18*46c4c49dSIbrahim Kanouche "strings" 19*46c4c49dSIbrahim Kanouche "unicode" 20*46c4c49dSIbrahim Kanouche 21*46c4c49dSIbrahim Kanouche "github.com/davecgh/go-spew/spew" 22*46c4c49dSIbrahim Kanouche "github.com/sergi/go-diff/diffmatchpatch" 23*46c4c49dSIbrahim Kanouche) 24*46c4c49dSIbrahim Kanouche 25*46c4c49dSIbrahim Kanouche// return values for the distance function that explain why a diff 26*46c4c49dSIbrahim Kanouche// is not an acceptable match for the source document. 27*46c4c49dSIbrahim Kanoucheconst ( 28*46c4c49dSIbrahim Kanouche versionChange = -1 29*46c4c49dSIbrahim Kanouche introducedPhraseChange = -2 30*46c4c49dSIbrahim Kanouche lesserGPLChange = -3 31*46c4c49dSIbrahim Kanouche) 32*46c4c49dSIbrahim Kanouche 33*46c4c49dSIbrahim Kanouche// score computes a metric of similarity between the known and unknown 34*46c4c49dSIbrahim Kanouche// document, including the offsets into the unknown that yield the content 35*46c4c49dSIbrahim Kanouche// generating the computed similarity. 36*46c4c49dSIbrahim Kanouchefunc (c *Classifier) score(id string, unknown, known *indexedDocument, unknownStart, unknownEnd int) (float64, int, int) { 37*46c4c49dSIbrahim Kanouche if c.tc.traceScoring(known.s.origin) { 38*46c4c49dSIbrahim Kanouche c.tc.trace("Scoring %s: [%d-%d]", known.s.origin, unknownStart, unknownEnd) 39*46c4c49dSIbrahim Kanouche } 40*46c4c49dSIbrahim Kanouche 41*46c4c49dSIbrahim Kanouche knownLength := known.size() 42*46c4c49dSIbrahim Kanouche diffs := docDiff(id, unknown, unknownStart, unknownEnd, known, 0, knownLength) 43*46c4c49dSIbrahim Kanouche 44*46c4c49dSIbrahim Kanouche start, end := diffRange(known.Norm, diffs) 45*46c4c49dSIbrahim Kanouche distance := scoreDiffs(id, diffs[start:end]) 46*46c4c49dSIbrahim Kanouche 47*46c4c49dSIbrahim Kanouche if c.tc.traceScoring(known.s.origin) { 48*46c4c49dSIbrahim Kanouche c.tc.trace("Diffs against %s:\n%s", known.s.origin, spew.Sdump(diffs[start:end])) 49*46c4c49dSIbrahim Kanouche } 50*46c4c49dSIbrahim Kanouche 51*46c4c49dSIbrahim Kanouche if distance < 0 { 52*46c4c49dSIbrahim Kanouche // If the distance is negative, this indicates an unacceptable diff so we return a zero-confidence match. 53*46c4c49dSIbrahim Kanouche if c.tc.traceScoring(known.s.origin) { 54*46c4c49dSIbrahim Kanouche c.tc.trace("Distance result %v, rejected match", distance) 55*46c4c49dSIbrahim Kanouche } 56*46c4c49dSIbrahim Kanouche return 0.0, 0, 0 57*46c4c49dSIbrahim Kanouche } 58*46c4c49dSIbrahim Kanouche 59*46c4c49dSIbrahim Kanouche // Applying the diffRange-generated offsets provides the run of text from the 60*46c4c49dSIbrahim Kanouche // target most closely correlated to the source. This is the process for 61*46c4c49dSIbrahim Kanouche // compensating for errors from the searchset. With the reduced text, we 62*46c4c49dSIbrahim Kanouche // compute the final confidence score and exact text locations for the 63*46c4c49dSIbrahim Kanouche // matched content. 64*46c4c49dSIbrahim Kanouche // The diff slice consists of three regions: an optional deletion diff for 65*46c4c49dSIbrahim Kanouche // target text before the source, n elements that represent the diff between 66*46c4c49dSIbrahim Kanouche // the source and target, and an optional deletion diff for text after the 67*46c4c49dSIbrahim Kanouche // target. The helper functions identify the portions of the slice 68*46c4c49dSIbrahim Kanouche // corresponding to those regions. This results in a more accurate 69*46c4c49dSIbrahim Kanouche // confidence score and better position detection of the source in the 70*46c4c49dSIbrahim Kanouche // target. 71*46c4c49dSIbrahim Kanouche conf, so, eo := confidencePercentage(knownLength, distance), textLength(diffs[:start]), textLength(diffs[end:]) 72*46c4c49dSIbrahim Kanouche 73*46c4c49dSIbrahim Kanouche if c.tc.traceScoring(known.s.origin) { 74*46c4c49dSIbrahim Kanouche c.tc.trace("Score result: %v [%d-%d]", conf, so, eo) 75*46c4c49dSIbrahim Kanouche } 76*46c4c49dSIbrahim Kanouche return conf, so, eo 77*46c4c49dSIbrahim Kanouche} 78*46c4c49dSIbrahim Kanouche 79*46c4c49dSIbrahim Kanouche// confidencePercentage computes a confidence match score for the lengths, 80*46c4c49dSIbrahim Kanouche// handling the cases where source and target lengths differ. 81*46c4c49dSIbrahim Kanouchefunc confidencePercentage(klen, distance int) float64 { 82*46c4c49dSIbrahim Kanouche // No text is matched at 100% confidence (avoid divide by zero). 83*46c4c49dSIbrahim Kanouche if klen == 0 { 84*46c4c49dSIbrahim Kanouche return 1.0 85*46c4c49dSIbrahim Kanouche } 86*46c4c49dSIbrahim Kanouche 87*46c4c49dSIbrahim Kanouche // Return a computed fractional match against the known text. 88*46c4c49dSIbrahim Kanouche return 1.0 - float64(distance)/float64(klen) 89*46c4c49dSIbrahim Kanouche} 90*46c4c49dSIbrahim Kanouche 91*46c4c49dSIbrahim Kanouche// diffLevenshteinWord computes word-based Levenshtein count. 92*46c4c49dSIbrahim Kanouchefunc diffLevenshteinWord(diffs []diffmatchpatch.Diff) int { 93*46c4c49dSIbrahim Kanouche levenshtein := 0 94*46c4c49dSIbrahim Kanouche insertions := 0 95*46c4c49dSIbrahim Kanouche deletions := 0 96*46c4c49dSIbrahim Kanouche 97*46c4c49dSIbrahim Kanouche for _, aDiff := range diffs { 98*46c4c49dSIbrahim Kanouche switch aDiff.Type { 99*46c4c49dSIbrahim Kanouche case diffmatchpatch.DiffInsert: 100*46c4c49dSIbrahim Kanouche insertions += wordLen(aDiff.Text) 101*46c4c49dSIbrahim Kanouche case diffmatchpatch.DiffDelete: 102*46c4c49dSIbrahim Kanouche deletions += wordLen(aDiff.Text) 103*46c4c49dSIbrahim Kanouche case diffmatchpatch.DiffEqual: 104*46c4c49dSIbrahim Kanouche // A deletion and an insertion is one substitution. 105*46c4c49dSIbrahim Kanouche levenshtein += max(insertions, deletions) 106*46c4c49dSIbrahim Kanouche insertions = 0 107*46c4c49dSIbrahim Kanouche deletions = 0 108*46c4c49dSIbrahim Kanouche } 109*46c4c49dSIbrahim Kanouche } 110*46c4c49dSIbrahim Kanouche 111*46c4c49dSIbrahim Kanouche levenshtein += max(insertions, deletions) 112*46c4c49dSIbrahim Kanouche return levenshtein 113*46c4c49dSIbrahim Kanouche} 114*46c4c49dSIbrahim Kanouche 115*46c4c49dSIbrahim Kanouchefunc isVersionNumber(in string) bool { 116*46c4c49dSIbrahim Kanouche for _, r := range in { 117*46c4c49dSIbrahim Kanouche if !unicode.IsDigit(r) && r != '.' { 118*46c4c49dSIbrahim Kanouche return false 119*46c4c49dSIbrahim Kanouche } 120*46c4c49dSIbrahim Kanouche } 121*46c4c49dSIbrahim Kanouche return true 122*46c4c49dSIbrahim Kanouche} 123*46c4c49dSIbrahim Kanouche 124*46c4c49dSIbrahim Kanouche// scoreDiffs returns a score rating the acceptability of these diffs. A 125*46c4c49dSIbrahim Kanouche// negative value means that the changes represented by the diff are not an 126*46c4c49dSIbrahim Kanouche// acceptable transformation since it would change the underlying license. A 127*46c4c49dSIbrahim Kanouche// positive value indicates the Levenshtein word distance. 128*46c4c49dSIbrahim Kanouchefunc scoreDiffs(id string, diffs []diffmatchpatch.Diff) int { 129*46c4c49dSIbrahim Kanouche // We make a pass looking for unacceptable substitutions 130*46c4c49dSIbrahim Kanouche // Delete diffs are always ordered before insert diffs. This is leveraged to 131*46c4c49dSIbrahim Kanouche // analyze a change by checking an insert against the delete text that was 132*46c4c49dSIbrahim Kanouche // previously cached. 133*46c4c49dSIbrahim Kanouche prevText := "" 134*46c4c49dSIbrahim Kanouche prevDelete := "" 135*46c4c49dSIbrahim Kanouche for i, diff := range diffs { 136*46c4c49dSIbrahim Kanouche text := diff.Text 137*46c4c49dSIbrahim Kanouche switch diff.Type { 138*46c4c49dSIbrahim Kanouche case diffmatchpatch.DiffInsert: 139*46c4c49dSIbrahim Kanouche num := text 140*46c4c49dSIbrahim Kanouche if i := strings.Index(num, " "); i != -1 { 141*46c4c49dSIbrahim Kanouche num = num[0:i] 142*46c4c49dSIbrahim Kanouche } 143*46c4c49dSIbrahim Kanouche if isVersionNumber(num) && strings.HasSuffix(prevText, "version") { 144*46c4c49dSIbrahim Kanouche if !strings.HasSuffix(prevText, "the standard version") && !strings.HasSuffix(prevText, "the contributor version") { 145*46c4c49dSIbrahim Kanouche return versionChange 146*46c4c49dSIbrahim Kanouche } 147*46c4c49dSIbrahim Kanouche } 148*46c4c49dSIbrahim Kanouche // There are certain phrases that can't be introduced to make a license 149*46c4c49dSIbrahim Kanouche // hit. TODO: would like to generate this programmatically. Most of 150*46c4c49dSIbrahim Kanouche // these are words or phrases that appear in a single/small number of 151*46c4c49dSIbrahim Kanouche // licenses. Can we leverage frequency analysis to identify these 152*46c4c49dSIbrahim Kanouche // interesting words/phrases and auto-extract them? 153*46c4c49dSIbrahim Kanouche 154*46c4c49dSIbrahim Kanouche inducedPhrases := map[string][]string{ 155*46c4c49dSIbrahim Kanouche "AGPL": {"affero"}, 156*46c4c49dSIbrahim Kanouche "Atmel": {"atmel"}, 157*46c4c49dSIbrahim Kanouche "Apache": {"apache"}, 158*46c4c49dSIbrahim Kanouche "BSD": {"bsd"}, 159*46c4c49dSIbrahim Kanouche "BSD-3-Clause-Attribution": {"acknowledgment"}, 160*46c4c49dSIbrahim Kanouche "bzip2": {"seward"}, 161*46c4c49dSIbrahim Kanouche "GPL-2.0-with-GCC-exception": {"gcc linking exception"}, 162*46c4c49dSIbrahim Kanouche "GPL-2.0-with-autoconf-exception": {"autoconf exception"}, 163*46c4c49dSIbrahim Kanouche "GPL-2.0-with-bison-exception": {"bison exception"}, 164*46c4c49dSIbrahim Kanouche "GPL-2.0-with-classpath-exception": {"class path exception"}, 165*46c4c49dSIbrahim Kanouche "GPL-2.0-with-font-exception": {"font exception"}, 166*46c4c49dSIbrahim Kanouche "LGPL-2.0": {"library"}, 167*46c4c49dSIbrahim Kanouche "ImageMagick": {"imagemagick"}, 168*46c4c49dSIbrahim Kanouche "PHP": {"php"}, 169*46c4c49dSIbrahim Kanouche "SISSL": {"sun standards"}, 170*46c4c49dSIbrahim Kanouche "SGI-B": {"silicon graphics"}, 171*46c4c49dSIbrahim Kanouche "SunPro": {"sunpro"}, 172*46c4c49dSIbrahim Kanouche "X11": {"x consortium"}, 173*46c4c49dSIbrahim Kanouche } 174*46c4c49dSIbrahim Kanouche 175*46c4c49dSIbrahim Kanouche for k, ps := range inducedPhrases { 176*46c4c49dSIbrahim Kanouche if strings.HasPrefix(LicenseName(id), k) { 177*46c4c49dSIbrahim Kanouche for _, p := range ps { 178*46c4c49dSIbrahim Kanouche if strings.Index(text, p) != -1 { 179*46c4c49dSIbrahim Kanouche // Check to make sure there isn't a corresponding diff for this 180*46c4c49dSIbrahim Kanouche // insert that also contains the text. This prevents against diff 181*46c4c49dSIbrahim Kanouche // blocks that are too big and force a false hit on this check, 182*46c4c49dSIbrahim Kanouche // which usually happens with URLs since they are stored in one 183*46c4c49dSIbrahim Kanouche // token but can happen in other cases as well. We don't look just 184*46c4c49dSIbrahim Kanouche // for delete diffs because the subsequent text may reference the 185*46c4c49dSIbrahim Kanouche // content in case a URL was truncated. 186*46c4c49dSIbrahim Kanouche if i+1 < len(diffs) && strings.Index(diffs[i+1].Text, p) != -1 { 187*46c4c49dSIbrahim Kanouche continue 188*46c4c49dSIbrahim Kanouche } 189*46c4c49dSIbrahim Kanouche return introducedPhraseChange 190*46c4c49dSIbrahim Kanouche } 191*46c4c49dSIbrahim Kanouche } 192*46c4c49dSIbrahim Kanouche } 193*46c4c49dSIbrahim Kanouche } 194*46c4c49dSIbrahim Kanouche 195*46c4c49dSIbrahim Kanouche // Ignore changes between "library" and "lesser" in a GNU context as they 196*46c4c49dSIbrahim Kanouche // changed the terms, but look for introductions of Lesser that would 197*46c4c49dSIbrahim Kanouche // otherwise disqualify a match. 198*46c4c49dSIbrahim Kanouche if text == "lesser" && strings.HasSuffix(prevText, "gnu") && prevDelete != "library" { 199*46c4c49dSIbrahim Kanouche // The LGPL 3.0 doesn't have a standard header, so people tend to craft 200*46c4c49dSIbrahim Kanouche // their own. As a result, sometimes the warranty clause refers to the 201*46c4c49dSIbrahim Kanouche // GPL instead of the LGPL. This is fine from a licensing perspective, 202*46c4c49dSIbrahim Kanouche // but we need to tweak matching to ignore that particular case. In 203*46c4c49dSIbrahim Kanouche // other circumstances, inserting or removing the word Lesser in the 204*46c4c49dSIbrahim Kanouche // GPL context is not an acceptable change. There is also a reference to 205*46c4c49dSIbrahim Kanouche // it when suggesting to use the LGPL. 206*46c4c49dSIbrahim Kanouche if !strings.Contains(prevText, "warranty") && !strings.Contains(prevText, "is covered by the gnu") { 207*46c4c49dSIbrahim Kanouche return lesserGPLChange 208*46c4c49dSIbrahim Kanouche } 209*46c4c49dSIbrahim Kanouche } 210*46c4c49dSIbrahim Kanouche case diffmatchpatch.DiffEqual: 211*46c4c49dSIbrahim Kanouche prevText = text 212*46c4c49dSIbrahim Kanouche prevDelete = "" 213*46c4c49dSIbrahim Kanouche 214*46c4c49dSIbrahim Kanouche case diffmatchpatch.DiffDelete: 215*46c4c49dSIbrahim Kanouche // Avoid substitution in most cases. The two exceptions are with usage 216*46c4c49dSIbrahim Kanouche // statements that are talking about *another* license, and don't affect 217*46c4c49dSIbrahim Kanouche // the detection of the current license. 218*46c4c49dSIbrahim Kanouche if (text == "lesser" || text == "library") && strings.HasSuffix(prevText, "gnu") { 219*46c4c49dSIbrahim Kanouche // Same as above to avoid matching GPL instead of LGPL here. 220*46c4c49dSIbrahim Kanouche if !strings.Contains(prevText, "warranty") && !strings.Contains(prevText, "is covered by the gnu") { 221*46c4c49dSIbrahim Kanouche return lesserGPLChange 222*46c4c49dSIbrahim Kanouche } 223*46c4c49dSIbrahim Kanouche } 224*46c4c49dSIbrahim Kanouche prevDelete = text 225*46c4c49dSIbrahim Kanouche } 226*46c4c49dSIbrahim Kanouche } 227*46c4c49dSIbrahim Kanouche return diffLevenshteinWord(diffs) 228*46c4c49dSIbrahim Kanouche} 229