// Copyright 2020 Google Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package classifier

import (
	"fmt"
	"reflect"
	"strings"
	"testing"

	"github.com/davecgh/go-spew/spew"
	"github.com/google/go-cmp/cmp"
)

// hundredLicenseText is a baseline for debugging precise ordering issues; it
// demonstrates that the token-run assembly process can successfully identify
// maximally fragmented entries.
var hundredLicenseText = `
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100`

// prefixMissingText exercises the error margin detection case at the beginning of a body of text.
var prefixMissingText = `
21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100`

// suffixMissingText exercises the error margin detection case at the end of a body of text.
var suffixMissingText = `
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80`

// fragmentedText is worst-case fragmentation that requires maximum reassembly with full error tolerance.
var fragmentedText = `
1 2 3 4 X 6 7 8 9 X 11 12 13 14 X 16 17 18 19 X 21 22 23 24 X 26 27 28 29 X 31 32 33 34 X 36 37 38 39 X 41 42 43 44 X 46 47 48 49 X 51 52 53 54 X 56 57 58 59 X 61 62 63 64 X 66 67 68 69 X 71 72 73 74 X 76 77 78 79 X 81 82 83 84 X 86 87 88 89 X 91 92 93 94 X 96 97 98 99 X`

// bigChunkText has a gap of maximal length to ensure that reassembly works.
var bigChunkText = `
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 X X X X X X X X X X X X X X X X X X X X 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100`

// fiftyLicenseText leverages repeated ordered tokens to exercise reassembly of repeated text in a worst-case situation.
var fiftyLicenseText = `
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50`

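// repeatedWithBreakagesText combines repeated token runs with broken tokens
// (X) to exercise reassembly of repeated text in the presence of errors.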
var repeatedWithBreakagesText = `
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 X X X X X X X X X X X X X X X X X 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 X X X`

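// TestSearchSet_New verifies that newSearchSet builds the expected token,
// hash, and checksum data for empty and small inputs.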
func TestSearchSet_New(t *testing.T) {
	tests := []struct {
		description string
		text        string
		q           int
		want        *searchSet
	}{
		{
			description: "Empty string",
			text:        "",
			q:           4,
			want: &searchSet{
				Tokens:         nil,
				Hashes:         make(hash),
				Checksums:      nil,
				ChecksumRanges: nil,
			},
		},
		{
			description: "Small string",
			text:        "Hello world",
			q:           4,
			want: &searchSet{
				Tokens: []indexedToken{
					{Line: 1, ID: 1},
					{Line: 1, ID: 2},
				},
				Hashes:         hash{1957950203: tokenRanges{&tokenRange{Start: 0, End: 2}}},
				Checksums:      []uint32{1957950203},
				ChecksumRanges: tokenRanges{&tokenRange{Start: 0, End: 2}},
				nodes:          []*node{{1957950203, &tokenRange{Start: 0, End: 2}}},
				q:              2,
			},
		},
	}

	for _, tt := range tests {
		var trace strings.Builder
		c := NewClassifier(.8) // This value doesn't affect the test.
		c.SetTraceConfiguration(&TraceConfiguration{
			TraceLicenses: "*",
			TracePhases:   "*",
			Tracer: func(f string, args ...interface{}) {
				trace.WriteString(fmt.Sprintf(f, args...))
			},
		})
		c.AddContent("", "text", "", []byte(tt.text))
		if got := newSearchSet(c.getIndexedDocument("", "text", ""), tt.q); !reflect.DeepEqual(got, tt.want) {
			t.Errorf("New(%q) = %+v, want %+v", tt.description, spew.Sdump(got), spew.Sdump(tt.want))
			t.Errorf("Trace:\n%s", trace.String())
		}
	}
}
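
// TestFindPotentialMatches verifies that findPotentialMatches locates the
// source text in targets that are fragmented, truncated, or contain large
// gaps, reporting the expected number of hits for each case.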
func TestFindPotentialMatches(t *testing.T) {
	tests := []struct {
		name         string
		src          string
		target       string
		confidence   float64
		expectedHits int
	}{
		{
			name:         "maximally fragmented",
			src:          hundredLicenseText,
			target:       fragmentedText,
			confidence:   .8,
			expectedHits: 1,
		},
		{
			name:         "prefix missing",
			src:          hundredLicenseText,
			target:       prefixMissingText,
			confidence:   .8,
			expectedHits: 1,
		},
		{
			name:         "suffix missing",
			src:          hundredLicenseText,
			target:       suffixMissingText,
			confidence:   .8,
			expectedHits: 1,
		},
		{
			name:         "maximum-length error",
			src:          hundredLicenseText,
			target:       bigChunkText,
			confidence:   .8,
			expectedHits: 1,
		},
	}

	for _, test := range tests {
		t.Run(test.name, func(t *testing.T) {
			var trace strings.Builder
			c := NewClassifier(test.confidence)
			c.SetTraceConfiguration(&TraceConfiguration{
				TraceLicenses: "*",
				TracePhases:   "*",
				Tracer: func(f string, args ...interface{}) {
					trace.WriteString(fmt.Sprintf(f, args...))
				},
			})
			c.AddContent("", "source", "", []byte(test.src))

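			// Index the target text on its own and build its search set; the
			// target is compared against the indexed source document rather
			// than being added to the classifier's corpus.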
			doc := c.createTargetIndexedDocument([]byte(test.target))
			doc.generateSearchSet(c.q)
			hits := c.findPotentialMatches(c.getIndexedDocument("", "source", "").s, doc.s, test.confidence)
			if actual := len(hits); actual != test.expectedHits {
				t.Errorf("got %d hits, wanted %d", actual, test.expectedHits)
				t.Errorf("Trace:\n%s", trace.String())
			}
		})
	}
}

func TestFuseRanges(t *testing.T) {
	// This test verifies that target ordering doesn't affect how ranges
	// get fused. The data that is input to fuseRanges is not always
	// ordered deterministically since it's the product of a map iteration.
	// This was a bug that occurred during development.
	tests := []struct {
		name string
		in   matchRanges
		out  matchRanges
		conf float64
		size int
	}{
		{
			name: "in target order",
			conf: .8,
			size: 100,
			in: matchRanges{
				&matchRange{
					SrcStart:      50,
					SrcEnd:        93,
					TargetStart:   0,
					TargetEnd:     43,
					TokensClaimed: 43,
				},
				&matchRange{
					SrcStart:      0,
					SrcEnd:        43,
					TargetStart:   0,
					TargetEnd:     43,
					TokensClaimed: 43,
				},
				&matchRange{
					SrcStart:      10,
					SrcEnd:        47,
					TargetStart:   60,
					TargetEnd:     97,
					TokensClaimed: 37,
				},
				&matchRange{
					SrcStart:      60,
					SrcEnd:        97,
					TargetStart:   60,
					TargetEnd:     97,
					TokensClaimed: 37,
				},
			},
			out: matchRanges{
				&matchRange{
					SrcStart:      0,
					SrcEnd:        97,
					TargetStart:   0,
					TargetEnd:     97,
					TokensClaimed: 80,
				},
			},
		},
		{
			name: "not in-target order",
			conf: .8,
			size: 100,
			in: matchRanges{
				&matchRange{
					SrcStart:      0,
					SrcEnd:        43,
					TargetStart:   0,
					TargetEnd:     43,
					TokensClaimed: 43,
				},
				&matchRange{
					SrcStart:      50,
					SrcEnd:        93,
					TargetStart:   0,
					TargetEnd:     43,
					TokensClaimed: 43,
				},
				&matchRange{
					SrcStart:      60,
					SrcEnd:        97,
					TargetStart:   60,
					TargetEnd:     97,
					TokensClaimed: 37,
				},
				&matchRange{
					SrcStart:      10,
					SrcEnd:        47,
					TargetStart:   60,
					TargetEnd:     97,
					TokensClaimed: 37,
				},
			},
			out: matchRanges{
				&matchRange{
					SrcStart:      0,
					SrcEnd:        97,
					TargetStart:   0,
					TargetEnd:     97,
					TokensClaimed: 80,
				},
			},
		},
	}
	for _, test := range tests {
		t.Run(test.name, func(t *testing.T) {
			var trace strings.Builder
			c := NewClassifier(.8)
			c.SetTraceConfiguration(&TraceConfiguration{
				TraceLicenses: "*",
				TracePhases:   "*",
				Tracer: func(f string, args ...interface{}) {
					trace.WriteString(fmt.Sprintf(f, args...))
				},
			})
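			// detectRuns is called first because fuseRanges consumes the
			// candidate runs it produces when fusing the input ranges.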
			runs := c.detectRuns(test.name, test.in, 100, 100, test.conf, 4)
			actual := c.fuseRanges(test.name, test.in, test.conf, test.size, runs, 100)
			if !cmp.Equal(actual, test.out) {
				t.Errorf("%v: %v", test.name, cmp.Diff(actual, test.out))
				t.Errorf("Trace:\n%s", trace.String())
			}
		})
	}
}

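// TestDetectRuns verifies that detectRuns turns q-gram match ranges into the
// expected candidate source windows for exact, approximate, multiple, and
// broken runs.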
func TestDetectRuns(t *testing.T) {
	tests := []struct {
		name                          string
		matched                       matchRanges
		targetLength, subsetLength, q int
		threshold                     float64
		expected                      []matchRange
	}{
		{
			// For an exact match on 100 accurate tokens, the first q-gram
			// is the only possible location hit we can return.
			name:         "precise matching on perfect runs",
			threshold:    1.0,
			targetLength: 100,
			subsetLength: 100,
			q:            4,
			matched: matchRanges{
				&matchRange{TargetStart: 0, TargetEnd: 100},
			},
			expected: []matchRange{
				{SrcStart: 0, SrcEnd: 4},
			},
		},
		{
			// For an 80% match on 100 accurate tokens, the first 20 token
			// positions represent possible matches.
			name:         "approximate matching on perfect runs",
			threshold:    0.8,
			targetLength: 100,
			subsetLength: 100,
			q:            4,
			matched: matchRanges{
				&matchRange{TargetStart: 0, TargetEnd: 100},
			},
			expected: []matchRange{
				{SrcStart: 0, SrcEnd: 24},
			},
		},
		{
			name:         "multiple runs in a single target",
			threshold:    0.8,
			targetLength: 100,
			subsetLength: 10,
			q:            4,
			matched: matchRanges{
				&matchRange{TargetStart: 0, TargetEnd: 10},
				&matchRange{TargetStart: 20, TargetEnd: 25},
				&matchRange{TargetStart: 50, TargetEnd: 60},
				&matchRange{TargetStart: 70, TargetEnd: 77},
			},
			expected: []matchRange{
				// Runs end on 4-gram boundaries.
				{SrcStart: 0, SrcEnd: 6},
				// The run starts early because of error tolerance.
				{SrcStart: 48, SrcEnd: 56},
			},
		},
		{
			name:         "bridge broken runs in a single target",
			threshold:    0.8,
			targetLength: 100,
			subsetLength: 10,
			q:            4,
			matched: matchRanges{
				&matchRange{TargetStart: 20, TargetEnd: 25},
				&matchRange{TargetStart: 26, TargetEnd: 30},
				&matchRange{TargetStart: 60, TargetEnd: 67},
				&matchRange{TargetStart: 68, TargetEnd: 72},
			},
			expected: []matchRange{
				// Runs end on 4-gram boundaries and start early because
				// of error tolerance.
				{SrcStart: 19, SrcEnd: 25},
				{SrcStart: 59, SrcEnd: 67},
			},
		},
	}

	for _, test := range tests {
		t.Run(test.name, func(t *testing.T) {
			var trace strings.Builder
			c := NewClassifier(.8)
			c.SetTraceConfiguration(&TraceConfiguration{
				TraceLicenses: "*",
				TracePhases:   "*",
				Tracer: func(f string, args ...interface{}) {
					trace.WriteString(fmt.Sprintf(f, args...))
				},
			})
			if got := c.detectRuns(test.name, test.matched, test.targetLength, test.subsetLength, test.threshold, test.q); !cmp.Equal(got, test.expected) {
				t.Errorf("%s", cmp.Diff(got, test.expected))
				t.Errorf("Trace:\n%s", trace.String())
			}
		})
	}
}