// Copyright 2020 Google Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package classifier

import (
	"fmt"
	"reflect"
	"strings"
	"testing"

	"github.com/davecgh/go-spew/spew"
	"github.com/google/go-cmp/cmp"
)

// hundredLicenseText is a baseline for debugging precise ordering issues to
// demonstrate that the token-run assembly process can identify maximally
// fragmented entries successfully.
var hundredLicenseText = `
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100`

// prefixMissingText exercises the error margin detection case at the
// beginning of a body of text (the first 20 tokens are absent).
var prefixMissingText = `
21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100`

// suffixMissingText exercises the error margin detection case at the end of
// a body of text (the last 20 tokens are absent).
var suffixMissingText = `
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80`

// fragmentedText is worst-case fragmentation (every fifth token corrupted)
// that requires maximum reassembly with full error tolerance.
var fragmentedText = `
1 2 3 4 X 6 7 8 9 X 11 12 13 14 X 16 17 18 19 X 21 22 23 24 X 26 27 28 29 X 31 32 33 34 X 36 37 38 39 X 41 42 43 44 X 46 47 48 49 X 51 52 53 54 X 56 57 58 59 X 61 62 63 64 X 66 67 68 69 X 71 72 73 74 X 76 77 78 79 X 81 82 83 84 X 86 87 88 89 X 91 92 93 94 X 96 97 98 99 X`

// bigChunkText has a gap of maximal length to ensure that reassembly works.
var bigChunkText = `
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 X X X X X X X X X X X X X X X X X X X X 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100`

// fiftyLicenseText leverages repeated ordered tokens to exercise reassembly
// options of repeated text in a worst-case situation.
48var fiftyLicenseText = ` 491 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50` 50 51var repeatedWithBreakagesText = ` 521 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 X X X X X X X X X X X X X X X X X 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 X X X` 53 54func TestSearchSet_New(t *testing.T) { 55 tests := []struct { 56 description string 57 text string 58 q int 59 want *searchSet 60 }{ 61 { 62 description: "Empty string", 63 text: "", 64 q: 4, 65 want: &searchSet{ 66 Tokens: nil, 67 Hashes: make(hash), 68 Checksums: nil, 69 ChecksumRanges: nil, 70 }, 71 }, 72 { 73 description: "Small string", 74 text: "Hello world", 75 q: 4, 76 want: &searchSet{ 77 Tokens: []indexedToken{ 78 {Line: 1, ID: 1}, 79 {Line: 1, ID: 2}, 80 }, 81 Hashes: hash{1957950203: tokenRanges{&tokenRange{Start: 0, End: 2}}}, 82 Checksums: []uint32{1957950203}, 83 ChecksumRanges: tokenRanges{&tokenRange{Start: 0, End: 2}}, 84 nodes: []*node{{1957950203, &tokenRange{Start: 0, End: 2}}}, 85 q: 2, 86 }, 87 }, 88 } 89 90 for _, tt := range tests { 91 var trace strings.Builder 92 c := NewClassifier(.8) // This value doesn't affect the test. 
93 c.SetTraceConfiguration(&TraceConfiguration{ 94 TraceLicenses: "*", 95 TracePhases: "*", 96 Tracer: func(f string, args ...interface{}) { 97 trace.WriteString(fmt.Sprintf(f, args...)) 98 }, 99 }) 100 c.AddContent("", "text", "", []byte(tt.text)) 101 if got := newSearchSet(c.getIndexedDocument("", "text", ""), tt.q); !reflect.DeepEqual(got, tt.want) { 102 t.Errorf("New(%q) = %+v, want %+v", tt.description, spew.Sdump(got), spew.Sdump(tt.want)) 103 t.Errorf("Trace:\n%s", trace.String()) 104 } 105 } 106} 107func TestFindPotentialMatches(t *testing.T) { 108 tests := []struct { 109 name string 110 src string 111 target string 112 confidence float64 113 expectedHits int 114 }{ 115 { 116 name: "maximally fragmented", 117 src: hundredLicenseText, 118 target: fragmentedText, 119 confidence: .8, 120 expectedHits: 1, 121 }, 122 { 123 name: "prefix missing", 124 src: hundredLicenseText, 125 target: prefixMissingText, 126 confidence: .8, 127 expectedHits: 1, 128 }, 129 { 130 name: "suffix missing", 131 src: hundredLicenseText, 132 target: suffixMissingText, 133 confidence: .8, 134 expectedHits: 1, 135 }, 136 { 137 name: "maximum-length error", 138 src: hundredLicenseText, 139 target: bigChunkText, 140 confidence: .8, 141 expectedHits: 1, 142 }, 143 } 144 145 for _, test := range tests { 146 t.Run(test.name, func(t *testing.T) { 147 var trace strings.Builder 148 c := NewClassifier(test.confidence) 149 c.SetTraceConfiguration(&TraceConfiguration{ 150 TraceLicenses: "*", 151 TracePhases: "*", 152 Tracer: func(f string, args ...interface{}) { 153 trace.WriteString(fmt.Sprintf(f, args...)) 154 }, 155 }) 156 c.AddContent("", "source", "", []byte(test.src)) 157 158 doc := c.createTargetIndexedDocument([]byte(test.target)) 159 doc.generateSearchSet(c.q) 160 hits := c.findPotentialMatches(c.getIndexedDocument("", "source", "").s, doc.s, test.confidence) 161 if actual := len(hits); actual != test.expectedHits { 162 t.Errorf("got %d hits, wanted %d", actual, test.expectedHits) 163 
t.Errorf("Trace:\n%s", trace.String()) 164 } 165 }) 166 } 167} 168 169func TestFuseRanges(t *testing.T) { 170 // This test verifies that target ordering doesn't affect how ranges 171 // get fused. The data that is input for fuseRanges is not always 172 // ordered deterministically since it's the product of a map iteration. 173 // This was a bug that occurred during development. 174 tests := []struct { 175 name string 176 in matchRanges 177 out matchRanges 178 conf float64 179 size int 180 }{ 181 { 182 name: "in target order", 183 conf: .8, 184 size: 100, 185 in: matchRanges{ 186 &matchRange{ 187 SrcStart: 50, 188 SrcEnd: 93, 189 TargetStart: 0, 190 TargetEnd: 43, 191 TokensClaimed: 43, 192 }, 193 &matchRange{ 194 SrcStart: 0, 195 SrcEnd: 43, 196 TargetStart: 0, 197 TargetEnd: 43, 198 TokensClaimed: 43, 199 }, 200 &matchRange{ 201 SrcStart: 10, 202 SrcEnd: 47, 203 TargetStart: 60, 204 TargetEnd: 97, 205 TokensClaimed: 37, 206 }, 207 &matchRange{ 208 SrcStart: 60, 209 SrcEnd: 97, 210 TargetStart: 60, 211 TargetEnd: 97, 212 TokensClaimed: 37, 213 }, 214 }, 215 out: matchRanges{ 216 &matchRange{ 217 SrcStart: 0, 218 SrcEnd: 97, 219 TargetStart: 0, 220 TargetEnd: 97, 221 TokensClaimed: 80, 222 }, 223 }, 224 }, 225 { 226 name: "not in-target order", 227 conf: .8, 228 size: 100, 229 in: matchRanges{ 230 &matchRange{ 231 SrcStart: 0, 232 SrcEnd: 43, 233 TargetStart: 0, 234 TargetEnd: 43, 235 TokensClaimed: 43, 236 }, 237 &matchRange{ 238 SrcStart: 50, 239 SrcEnd: 93, 240 TargetStart: 0, 241 TargetEnd: 43, 242 TokensClaimed: 43, 243 }, 244 &matchRange{ 245 SrcStart: 60, 246 SrcEnd: 97, 247 TargetStart: 60, 248 TargetEnd: 97, 249 TokensClaimed: 37, 250 }, 251 &matchRange{ 252 SrcStart: 10, 253 SrcEnd: 47, 254 TargetStart: 60, 255 TargetEnd: 97, 256 TokensClaimed: 37, 257 }, 258 }, 259 out: matchRanges{ 260 &matchRange{ 261 SrcStart: 0, 262 SrcEnd: 97, 263 TargetStart: 0, 264 TargetEnd: 97, 265 TokensClaimed: 80, 266 }, 267 }, 268 }, 269 } 270 for _, test := range tests { 271 
t.Run(test.name, func(t *testing.T) { 272 var trace strings.Builder 273 c := NewClassifier(.8) 274 c.SetTraceConfiguration(&TraceConfiguration{ 275 TraceLicenses: "*", 276 TracePhases: "*", 277 Tracer: func(f string, args ...interface{}) { 278 trace.WriteString(fmt.Sprintf(f, args...)) 279 }, 280 }) 281 runs := c.detectRuns(test.name, test.in, 100, 100, test.conf, 4) 282 actual := c.fuseRanges(test.name, test.in, test.conf, test.size, runs, 100) 283 if !cmp.Equal(actual, test.out) { 284 t.Errorf("%v: %v", test.name, cmp.Diff(actual, test.out)) 285 t.Errorf("Trace:\n%s", trace.String()) 286 } 287 }) 288 } 289} 290 291func TestDetectRuns(t *testing.T) { 292 tests := []struct { 293 name string 294 matched matchRanges 295 targetLength, subsetLength, q int 296 threshold float64 297 expected []matchRange 298 }{ 299 { 300 // For an exact match on 100 accurate tokens, the first q-gram 301 // is the only possible location hit we can return. 302 name: "precise matching on perfect runs", 303 threshold: 1.0, 304 targetLength: 100, 305 subsetLength: 100, 306 q: 4, 307 matched: matchRanges{ 308 &matchRange{TargetStart: 0, TargetEnd: 100}, 309 }, 310 expected: []matchRange{ 311 {SrcStart: 0, SrcEnd: 4}, 312 }, 313 }, 314 { 315 // For an 80% match on 100 accurate tokens, the first 20 token 316 // positions represent possible matches. 
317 name: "approximate matching on perfect runs", 318 threshold: 0.8, 319 targetLength: 100, 320 subsetLength: 100, 321 q: 4, 322 matched: matchRanges{ 323 &matchRange{TargetStart: 0, TargetEnd: 100}, 324 }, 325 expected: []matchRange{ 326 {SrcStart: 0, SrcEnd: 24}, 327 }, 328 }, 329 { 330 name: "multiple runs in a single target", 331 threshold: 0.8, 332 targetLength: 100, 333 subsetLength: 10, 334 q: 4, 335 matched: matchRanges{ 336 &matchRange{TargetStart: 0, TargetEnd: 10}, 337 &matchRange{TargetStart: 20, TargetEnd: 25}, 338 &matchRange{TargetStart: 50, TargetEnd: 60}, 339 &matchRange{TargetStart: 70, TargetEnd: 77}, 340 }, 341 expected: []matchRange{ 342 // Runs end on 4-gram boundaries 343 {SrcStart: 0, SrcEnd: 6}, 344 // The run starts early because of error tolerance 345 {SrcStart: 48, SrcEnd: 56}, 346 }, 347 }, 348 { 349 name: "bridge broken runs in a single target", 350 threshold: 0.8, 351 targetLength: 100, 352 subsetLength: 10, 353 q: 4, 354 matched: matchRanges{ 355 &matchRange{TargetStart: 20, TargetEnd: 25}, 356 &matchRange{TargetStart: 26, TargetEnd: 30}, 357 &matchRange{TargetStart: 60, TargetEnd: 67}, 358 &matchRange{TargetStart: 68, TargetEnd: 72}, 359 }, 360 expected: []matchRange{ 361 // Runs end on 4-gram boundaries and start early because 362 // of error tolerance. 363 {SrcStart: 19, SrcEnd: 25}, 364 {SrcStart: 59, SrcEnd: 67}, 365 }, 366 }, 367 } 368 369 for _, test := range tests { 370 t.Run(test.name, func(t *testing.T) { 371 var trace strings.Builder 372 c := NewClassifier(.8) 373 c.SetTraceConfiguration(&TraceConfiguration{ 374 TraceLicenses: "*", 375 TracePhases: "*", 376 Tracer: func(f string, args ...interface{}) { 377 trace.WriteString(fmt.Sprintf(f, args...)) 378 }, 379 }) 380 if got := c.detectRuns(test.name, test.matched, test.targetLength, test.subsetLength, test.threshold, test.q); !cmp.Equal(got, test.expected) { 381 t.Errorf(cmp.Diff(got, test.expected)) 382 t.Errorf("Trace:\n%s", trace.String()) 383 } 384 385 }) 386 } 387} 388