1// Copyright 2020 Google Inc. 2// 3// Licensed under the Apache License, Version 2.0 (the "License"); 4// you may not use this file except in compliance with the License. 5// You may obtain a copy of the License at 6// 7// http://www.apache.org/licenses/LICENSE-2.0 8// 9// Unless required by applicable law or agreed to in writing, software 10// distributed under the License is distributed on an "AS IS" BASIS, 11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12// See the License for the specific language governing permissions and 13// limitations under the License. 14 15package classifier 16 17import ( 18 "fmt" 19 "strings" 20 "testing" 21 22 "github.com/sergi/go-diff/diffmatchpatch" 23) 24 25func TestLevenshteinDiff(t *testing.T) { 26 tests := []struct { 27 name string 28 diffs []diffmatchpatch.Diff 29 expected int 30 }{ 31 { 32 name: "identical text", 33 diffs: []diffmatchpatch.Diff{ 34 { 35 Type: diffmatchpatch.DiffEqual, 36 Text: "equivalent text", 37 }, 38 }, 39 expected: 0, 40 }, 41 { 42 name: "changed text", 43 // Adjacent inverse changes get scored with the maximum of the 2 change scores 44 diffs: []diffmatchpatch.Diff{ 45 { 46 Type: diffmatchpatch.DiffDelete, 47 Text: "removed words", 48 }, 49 { 50 Type: diffmatchpatch.DiffInsert, 51 Text: "inserted text here", 52 }, 53 }, 54 expected: 3, 55 }, 56 { 57 name: "inserted text", 58 diffs: []diffmatchpatch.Diff{ 59 { 60 Type: diffmatchpatch.DiffEqual, 61 Text: "identical words", 62 }, 63 { 64 Type: diffmatchpatch.DiffInsert, 65 Text: "inserted", 66 }, 67 }, 68 expected: 1, 69 }, 70 { 71 name: "deleted text", 72 diffs: []diffmatchpatch.Diff{ 73 { 74 Type: diffmatchpatch.DiffDelete, 75 Text: "many extraneous deleted words", 76 }, 77 { 78 Type: diffmatchpatch.DiffEqual, 79 Text: "before the equivalent text", 80 }, 81 }, 82 expected: 4, 83 }, 84 } 85 86 for _, test := range tests { 87 t.Run(test.name, func(t *testing.T) { 88 if got := diffLevenshteinWord(test.diffs); got != test.expected { 89 t.Errorf("got %d wanted %d", got, test.expected) 90 } 91 }) 92 } 93} 94 95func TestScoreDiffs(t *testing.T) { 96 tests := []struct { 97 name string 98 license string 99 diffs []diffmatchpatch.Diff 100 expected int 101 }{ 102 { 103 name: "identical text", 104 license: "License/MIT/license.txt", 105 diffs: nil, 106 expected: 0, 107 }, 108 { 109 name: "acceptable change", 110 license: "License/MIT/license.txt", 111 diffs: []diffmatchpatch.Diff{ 112 { 113 Type: diffmatchpatch.DiffEqual, 114 Text: "license", 115 }, 116 { 117 Type: diffmatchpatch.DiffInsert, 118 Text: "as needed", 119 }, 120 { 121 Type: diffmatchpatch.DiffDelete, 122 Text: "when necessary", 123 }, 124 }, 125 expected: 2, 126 }, 127 { 128 name: "version change", 129 license: "License/MIT/license.txt", 130 diffs: []diffmatchpatch.Diff{ 131 { 132 Type: diffmatchpatch.DiffEqual, 133 Text: "version", 134 }, 135 { 136 Type: diffmatchpatch.DiffInsert, 137 Text: "2", 138 }, 139 }, 140 expected: versionChange, 141 }, 142 { 143 name: "license name change by deletion", 144 license: "License/MIT/license.txt", 145 diffs: []diffmatchpatch.Diff{ 146 { 147 Type: diffmatchpatch.DiffEqual, 148 Text: "gnu", 149 }, 150 { 151 Type: diffmatchpatch.DiffDelete, 152 Text: "lesser", 153 }, 154 }, 155 expected: lesserGPLChange, 156 }, 157 { 158 name: "license name change by insertion", 159 license: "License/MIT/license.txt", 160 diffs: []diffmatchpatch.Diff{ 161 { 162 Type: diffmatchpatch.DiffEqual, 163 Text: "gnu", 164 }, 165 { 166 Type: diffmatchpatch.DiffInsert, 167 Text: "lesser", 168 }, 169 }, 170 expected: lesserGPLChange, 171 }, 172 { 173 name: "license name change by name insertion", 174 license: "License/ImageMagick/license.txt", 175 diffs: []diffmatchpatch.Diff{ 176 { 177 Type: diffmatchpatch.DiffEqual, 178 Text: "license", 179 }, 180 { 181 Type: diffmatchpatch.DiffInsert, 182 Text: "imagemagick", 183 }, 184 }, 185 expected: introducedPhraseChange, 186 }, 187 } 188 189 for _, test := range tests { 190 t.Run(test.name, func(t *testing.T) { 191 if got := scoreDiffs(test.license, test.diffs); got != test.expected { 192 t.Errorf("got %d, want %d", got, test.expected) 193 } 194 }) 195 } 196} 197 198func TestConfidencePercentage(t *testing.T) { 199 tests := []struct { 200 name string 201 klen, distance int 202 expected float64 203 }{ 204 { 205 name: "empty text", 206 klen: 0, 207 distance: 0, 208 expected: 1.0, 209 }, 210 { 211 name: "99% match", 212 klen: 100, 213 distance: 1, 214 expected: 0.99, 215 }, 216 } 217 218 for _, test := range tests { 219 t.Run(test.name, func(t *testing.T) { 220 if got := confidencePercentage(test.klen, test.distance); got != test.expected { 221 t.Errorf("got %v want %v", got, test.expected) 222 } 223 }) 224 } 225} 226 227func TestScore(t *testing.T) { 228 tests := []struct { 229 name string 230 known, unknown string 231 expectedConf float64 232 expectedStart, expectedEnd int 233 }{ 234 { 235 name: "identical text", 236 known: "here is some sample text", 237 unknown: "here is some sample text", 238 expectedConf: 1.00, 239 expectedStart: 0, 240 expectedEnd: 0, 241 }, 242 { 243 name: "close match with matching sizes", 244 known: "here is some sample text", 245 unknown: "here is different sample text", 246 expectedConf: .8, 247 expectedStart: 0, 248 expectedEnd: 0, 249 }, 250 { 251 name: "close match with different sizes", 252 known: "here is some sample text", 253 unknown: "padding before here is different sample text", 254 expectedConf: .8, 255 expectedStart: 2, 256 expectedEnd: 0, 257 }, 258 { 259 name: "no match due to unacceptable diff", 260 known: "here is some sample text for version 2 of the license", 261 unknown: "padding before here is different sample text for version 3 of the licenses", 262 expectedConf: 0.0, 263 expectedStart: 0, 264 expectedEnd: 0, 265 }, 266 } 267 268 for _, test := range tests { 269 t.Run(test.name, func(t *testing.T) { 270 var trace strings.Builder 271 c := NewClassifier(.8) 272 c.SetTraceConfiguration(&TraceConfiguration{ 273 TraceLicenses: "*", 274 TracePhases: "*", 275 Tracer: func(f string, args ...interface{}) { 276 trace.WriteString(fmt.Sprintf(f, args...)) 277 }, 278 }) 279 c.AddContent("", "known", "", []byte(test.known)) 280 kd := c.getIndexedDocument("", "known", "") 281 ud := c.createTargetIndexedDocument([]byte(test.unknown)) 282 // The name for the test needs to look like an asset path so we prepend 283 // the directory. 284 conf, so, eo := c.score("License/"+test.name, ud, kd, 0, ud.size()) 285 286 success := true 287 if conf != test.expectedConf { 288 t.Errorf("conf: got %v want %v", conf, test.expectedConf) 289 success = false 290 } 291 if so != test.expectedStart { 292 t.Errorf("start offset: got %v want %v", so, test.expectedStart) 293 success = false 294 } 295 if eo != test.expectedEnd { 296 t.Errorf("end offset: got %v want %v", so, test.expectedEnd) 297 success = false 298 } 299 300 if !success { 301 t.Errorf("Trace:\n%s", trace.String()) 302 } 303 }) 304 } 305} 306