xref: /aosp_15_r20/external/licenseclassifier/v2/searchset_test.go (revision 46c4c49da23cae783fa41bf46525a6505638499a)
1*46c4c49dSIbrahim Kanouche// Copyright 2020 Google Inc.
2*46c4c49dSIbrahim Kanouche//
3*46c4c49dSIbrahim Kanouche// Licensed under the Apache License, Version 2.0 (the "License");
4*46c4c49dSIbrahim Kanouche// you may not use this file except in compliance with the License.
5*46c4c49dSIbrahim Kanouche// You may obtain a copy of the License at
6*46c4c49dSIbrahim Kanouche//
7*46c4c49dSIbrahim Kanouche//     http://www.apache.org/licenses/LICENSE-2.0
8*46c4c49dSIbrahim Kanouche//
9*46c4c49dSIbrahim Kanouche// Unless required by applicable law or agreed to in writing, software
10*46c4c49dSIbrahim Kanouche// distributed under the License is distributed on an "AS IS" BASIS,
11*46c4c49dSIbrahim Kanouche// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12*46c4c49dSIbrahim Kanouche// See the License for the specific language governing permissions and
13*46c4c49dSIbrahim Kanouche// limitations under the License.
14*46c4c49dSIbrahim Kanouche
15*46c4c49dSIbrahim Kanouchepackage classifier
16*46c4c49dSIbrahim Kanouche
17*46c4c49dSIbrahim Kanoucheimport (
18*46c4c49dSIbrahim Kanouche	"fmt"
19*46c4c49dSIbrahim Kanouche	"reflect"
20*46c4c49dSIbrahim Kanouche	"strings"
21*46c4c49dSIbrahim Kanouche	"testing"
22*46c4c49dSIbrahim Kanouche
23*46c4c49dSIbrahim Kanouche	"github.com/davecgh/go-spew/spew"
24*46c4c49dSIbrahim Kanouche	"github.com/google/go-cmp/cmp"
25*46c4c49dSIbrahim Kanouche)
26*46c4c49dSIbrahim Kanouche
27*46c4c49dSIbrahim Kanouche// hundredLicenseText is a baseline for debugging precise ordering issues to demonstrate that the token-run assembly process can identify maximally fragmented entries successfully.
28*46c4c49dSIbrahim Kanouchevar hundredLicenseText = `
29*46c4c49dSIbrahim Kanouche1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100`
30*46c4c49dSIbrahim Kanouche
31*46c4c49dSIbrahim Kanouche// prefixMissingText exercises the error margin detection case at the beginning of a body of text.
32*46c4c49dSIbrahim Kanouchevar prefixMissingText = `
33*46c4c49dSIbrahim Kanouche21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100`
34*46c4c49dSIbrahim Kanouche
35*46c4c49dSIbrahim Kanouche// suffixMissingText exercises the error margin detection case at the beginning of a body of text.
36*46c4c49dSIbrahim Kanouchevar suffixMissingText = `
37*46c4c49dSIbrahim Kanouche1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80`
38*46c4c49dSIbrahim Kanouche
39*46c4c49dSIbrahim Kanouche// fragmentedText is worst-case fragmentation that requires maximum reassembly with full error tolerance.
40*46c4c49dSIbrahim Kanouchevar fragmentedText = `
41*46c4c49dSIbrahim Kanouche1 2 3 4 X 6 7 8 9 X 11 12 13 14 X 16 17 18 19 X 21 22 23 24 X 26 27 28 29 X 31 32 33 34 X 36 37 38 39 X 41 42 43 44 X 46 47 48 49 X 51 52 53 54 X 56 57 58 59 X 61 62 63 64 X 66 67 68 69 X 71 72 73 74 X 76 77 78 79 X 81 82 83 84 X 86 87 88 89 X 91 92 93 94 X 96 97 98 99 X`
42*46c4c49dSIbrahim Kanouche
43*46c4c49dSIbrahim Kanouche// bigChunkText has a gap of maximal length to ensure that reassembly works.
44*46c4c49dSIbrahim Kanouchevar bigChunkText = `
45*46c4c49dSIbrahim Kanouche1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 X X X X X X X X X X X X X X X X X X X X 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100`
46*46c4c49dSIbrahim Kanouche
47*46c4c49dSIbrahim Kanouche// The 50 license text leverages repeated ordered tokens to exercise reassembly options of repeated text in a worst-case situation.
48*46c4c49dSIbrahim Kanouchevar fiftyLicenseText = `
49*46c4c49dSIbrahim Kanouche1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50`
50*46c4c49dSIbrahim Kanouche
51*46c4c49dSIbrahim Kanouchevar repeatedWithBreakagesText = `
52*46c4c49dSIbrahim Kanouche1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 X X X X X X X X X X X X X X X X X 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 X X X`
53*46c4c49dSIbrahim Kanouche
54*46c4c49dSIbrahim Kanouchefunc TestSearchSet_New(t *testing.T) {
55*46c4c49dSIbrahim Kanouche	tests := []struct {
56*46c4c49dSIbrahim Kanouche		description string
57*46c4c49dSIbrahim Kanouche		text        string
58*46c4c49dSIbrahim Kanouche		q           int
59*46c4c49dSIbrahim Kanouche		want        *searchSet
60*46c4c49dSIbrahim Kanouche	}{
61*46c4c49dSIbrahim Kanouche		{
62*46c4c49dSIbrahim Kanouche			description: "Empty string",
63*46c4c49dSIbrahim Kanouche			text:        "",
64*46c4c49dSIbrahim Kanouche			q:           4,
65*46c4c49dSIbrahim Kanouche			want: &searchSet{
66*46c4c49dSIbrahim Kanouche				Tokens:         nil,
67*46c4c49dSIbrahim Kanouche				Hashes:         make(hash),
68*46c4c49dSIbrahim Kanouche				Checksums:      nil,
69*46c4c49dSIbrahim Kanouche				ChecksumRanges: nil,
70*46c4c49dSIbrahim Kanouche			},
71*46c4c49dSIbrahim Kanouche		},
72*46c4c49dSIbrahim Kanouche		{
73*46c4c49dSIbrahim Kanouche			description: "Small string",
74*46c4c49dSIbrahim Kanouche			text:        "Hello world",
75*46c4c49dSIbrahim Kanouche			q:           4,
76*46c4c49dSIbrahim Kanouche			want: &searchSet{
77*46c4c49dSIbrahim Kanouche				Tokens: []indexedToken{
78*46c4c49dSIbrahim Kanouche					{Line: 1, ID: 1},
79*46c4c49dSIbrahim Kanouche					{Line: 1, ID: 2},
80*46c4c49dSIbrahim Kanouche				},
81*46c4c49dSIbrahim Kanouche				Hashes:         hash{1957950203: tokenRanges{&tokenRange{Start: 0, End: 2}}},
82*46c4c49dSIbrahim Kanouche				Checksums:      []uint32{1957950203},
83*46c4c49dSIbrahim Kanouche				ChecksumRanges: tokenRanges{&tokenRange{Start: 0, End: 2}},
84*46c4c49dSIbrahim Kanouche				nodes:          []*node{{1957950203, &tokenRange{Start: 0, End: 2}}},
85*46c4c49dSIbrahim Kanouche				q:              2,
86*46c4c49dSIbrahim Kanouche			},
87*46c4c49dSIbrahim Kanouche		},
88*46c4c49dSIbrahim Kanouche	}
89*46c4c49dSIbrahim Kanouche
90*46c4c49dSIbrahim Kanouche	for _, tt := range tests {
91*46c4c49dSIbrahim Kanouche		var trace strings.Builder
92*46c4c49dSIbrahim Kanouche		c := NewClassifier(.8) // This value doesn't affect the test.
93*46c4c49dSIbrahim Kanouche		c.SetTraceConfiguration(&TraceConfiguration{
94*46c4c49dSIbrahim Kanouche			TraceLicenses: "*",
95*46c4c49dSIbrahim Kanouche			TracePhases:   "*",
96*46c4c49dSIbrahim Kanouche			Tracer: func(f string, args ...interface{}) {
97*46c4c49dSIbrahim Kanouche				trace.WriteString(fmt.Sprintf(f, args...))
98*46c4c49dSIbrahim Kanouche			},
99*46c4c49dSIbrahim Kanouche		})
100*46c4c49dSIbrahim Kanouche		c.AddContent("", "text", "", []byte(tt.text))
101*46c4c49dSIbrahim Kanouche		if got := newSearchSet(c.getIndexedDocument("", "text", ""), tt.q); !reflect.DeepEqual(got, tt.want) {
102*46c4c49dSIbrahim Kanouche			t.Errorf("New(%q) = %+v, want %+v", tt.description, spew.Sdump(got), spew.Sdump(tt.want))
103*46c4c49dSIbrahim Kanouche			t.Errorf("Trace:\n%s", trace.String())
104*46c4c49dSIbrahim Kanouche		}
105*46c4c49dSIbrahim Kanouche	}
106*46c4c49dSIbrahim Kanouche}
107*46c4c49dSIbrahim Kanouchefunc TestFindPotentialMatches(t *testing.T) {
108*46c4c49dSIbrahim Kanouche	tests := []struct {
109*46c4c49dSIbrahim Kanouche		name         string
110*46c4c49dSIbrahim Kanouche		src          string
111*46c4c49dSIbrahim Kanouche		target       string
112*46c4c49dSIbrahim Kanouche		confidence   float64
113*46c4c49dSIbrahim Kanouche		expectedHits int
114*46c4c49dSIbrahim Kanouche	}{
115*46c4c49dSIbrahim Kanouche		{
116*46c4c49dSIbrahim Kanouche			name:         "maximally fragmented",
117*46c4c49dSIbrahim Kanouche			src:          hundredLicenseText,
118*46c4c49dSIbrahim Kanouche			target:       fragmentedText,
119*46c4c49dSIbrahim Kanouche			confidence:   .8,
120*46c4c49dSIbrahim Kanouche			expectedHits: 1,
121*46c4c49dSIbrahim Kanouche		},
122*46c4c49dSIbrahim Kanouche		{
123*46c4c49dSIbrahim Kanouche			name:         "prefix missing",
124*46c4c49dSIbrahim Kanouche			src:          hundredLicenseText,
125*46c4c49dSIbrahim Kanouche			target:       prefixMissingText,
126*46c4c49dSIbrahim Kanouche			confidence:   .8,
127*46c4c49dSIbrahim Kanouche			expectedHits: 1,
128*46c4c49dSIbrahim Kanouche		},
129*46c4c49dSIbrahim Kanouche		{
130*46c4c49dSIbrahim Kanouche			name:         "suffix missing",
131*46c4c49dSIbrahim Kanouche			src:          hundredLicenseText,
132*46c4c49dSIbrahim Kanouche			target:       suffixMissingText,
133*46c4c49dSIbrahim Kanouche			confidence:   .8,
134*46c4c49dSIbrahim Kanouche			expectedHits: 1,
135*46c4c49dSIbrahim Kanouche		},
136*46c4c49dSIbrahim Kanouche		{
137*46c4c49dSIbrahim Kanouche			name:         "maximum-length error",
138*46c4c49dSIbrahim Kanouche			src:          hundredLicenseText,
139*46c4c49dSIbrahim Kanouche			target:       bigChunkText,
140*46c4c49dSIbrahim Kanouche			confidence:   .8,
141*46c4c49dSIbrahim Kanouche			expectedHits: 1,
142*46c4c49dSIbrahim Kanouche		},
143*46c4c49dSIbrahim Kanouche	}
144*46c4c49dSIbrahim Kanouche
145*46c4c49dSIbrahim Kanouche	for _, test := range tests {
146*46c4c49dSIbrahim Kanouche		t.Run(test.name, func(t *testing.T) {
147*46c4c49dSIbrahim Kanouche			var trace strings.Builder
148*46c4c49dSIbrahim Kanouche			c := NewClassifier(test.confidence)
149*46c4c49dSIbrahim Kanouche			c.SetTraceConfiguration(&TraceConfiguration{
150*46c4c49dSIbrahim Kanouche				TraceLicenses: "*",
151*46c4c49dSIbrahim Kanouche				TracePhases:   "*",
152*46c4c49dSIbrahim Kanouche				Tracer: func(f string, args ...interface{}) {
153*46c4c49dSIbrahim Kanouche					trace.WriteString(fmt.Sprintf(f, args...))
154*46c4c49dSIbrahim Kanouche				},
155*46c4c49dSIbrahim Kanouche			})
156*46c4c49dSIbrahim Kanouche			c.AddContent("", "source", "", []byte(test.src))
157*46c4c49dSIbrahim Kanouche
158*46c4c49dSIbrahim Kanouche			doc := c.createTargetIndexedDocument([]byte(test.target))
159*46c4c49dSIbrahim Kanouche			doc.generateSearchSet(c.q)
160*46c4c49dSIbrahim Kanouche			hits := c.findPotentialMatches(c.getIndexedDocument("", "source", "").s, doc.s, test.confidence)
161*46c4c49dSIbrahim Kanouche			if actual := len(hits); actual != test.expectedHits {
162*46c4c49dSIbrahim Kanouche				t.Errorf("got %d hits, wanted %d", actual, test.expectedHits)
163*46c4c49dSIbrahim Kanouche				t.Errorf("Trace:\n%s", trace.String())
164*46c4c49dSIbrahim Kanouche			}
165*46c4c49dSIbrahim Kanouche		})
166*46c4c49dSIbrahim Kanouche	}
167*46c4c49dSIbrahim Kanouche}
168*46c4c49dSIbrahim Kanouche
169*46c4c49dSIbrahim Kanouchefunc TestFuseRanges(t *testing.T) {
170*46c4c49dSIbrahim Kanouche	// This test verifies that target ordering doesn't affect how ranges
171*46c4c49dSIbrahim Kanouche	// get fused. The data that is input for fuseRanges is not always
172*46c4c49dSIbrahim Kanouche	// ordered deterministically since it's the product of a map iteration.
173*46c4c49dSIbrahim Kanouche	// This was a bug that occurred during development.
174*46c4c49dSIbrahim Kanouche	tests := []struct {
175*46c4c49dSIbrahim Kanouche		name string
176*46c4c49dSIbrahim Kanouche		in   matchRanges
177*46c4c49dSIbrahim Kanouche		out  matchRanges
178*46c4c49dSIbrahim Kanouche		conf float64
179*46c4c49dSIbrahim Kanouche		size int
180*46c4c49dSIbrahim Kanouche	}{
181*46c4c49dSIbrahim Kanouche		{
182*46c4c49dSIbrahim Kanouche			name: "in target order",
183*46c4c49dSIbrahim Kanouche			conf: .8,
184*46c4c49dSIbrahim Kanouche			size: 100,
185*46c4c49dSIbrahim Kanouche			in: matchRanges{
186*46c4c49dSIbrahim Kanouche				&matchRange{
187*46c4c49dSIbrahim Kanouche					SrcStart:      50,
188*46c4c49dSIbrahim Kanouche					SrcEnd:        93,
189*46c4c49dSIbrahim Kanouche					TargetStart:   0,
190*46c4c49dSIbrahim Kanouche					TargetEnd:     43,
191*46c4c49dSIbrahim Kanouche					TokensClaimed: 43,
192*46c4c49dSIbrahim Kanouche				},
193*46c4c49dSIbrahim Kanouche				&matchRange{
194*46c4c49dSIbrahim Kanouche					SrcStart:      0,
195*46c4c49dSIbrahim Kanouche					SrcEnd:        43,
196*46c4c49dSIbrahim Kanouche					TargetStart:   0,
197*46c4c49dSIbrahim Kanouche					TargetEnd:     43,
198*46c4c49dSIbrahim Kanouche					TokensClaimed: 43,
199*46c4c49dSIbrahim Kanouche				},
200*46c4c49dSIbrahim Kanouche				&matchRange{
201*46c4c49dSIbrahim Kanouche					SrcStart:      10,
202*46c4c49dSIbrahim Kanouche					SrcEnd:        47,
203*46c4c49dSIbrahim Kanouche					TargetStart:   60,
204*46c4c49dSIbrahim Kanouche					TargetEnd:     97,
205*46c4c49dSIbrahim Kanouche					TokensClaimed: 37,
206*46c4c49dSIbrahim Kanouche				},
207*46c4c49dSIbrahim Kanouche				&matchRange{
208*46c4c49dSIbrahim Kanouche					SrcStart:      60,
209*46c4c49dSIbrahim Kanouche					SrcEnd:        97,
210*46c4c49dSIbrahim Kanouche					TargetStart:   60,
211*46c4c49dSIbrahim Kanouche					TargetEnd:     97,
212*46c4c49dSIbrahim Kanouche					TokensClaimed: 37,
213*46c4c49dSIbrahim Kanouche				},
214*46c4c49dSIbrahim Kanouche			},
215*46c4c49dSIbrahim Kanouche			out: matchRanges{
216*46c4c49dSIbrahim Kanouche				&matchRange{
217*46c4c49dSIbrahim Kanouche					SrcStart:      0,
218*46c4c49dSIbrahim Kanouche					SrcEnd:        97,
219*46c4c49dSIbrahim Kanouche					TargetStart:   0,
220*46c4c49dSIbrahim Kanouche					TargetEnd:     97,
221*46c4c49dSIbrahim Kanouche					TokensClaimed: 80,
222*46c4c49dSIbrahim Kanouche				},
223*46c4c49dSIbrahim Kanouche			},
224*46c4c49dSIbrahim Kanouche		},
225*46c4c49dSIbrahim Kanouche		{
226*46c4c49dSIbrahim Kanouche			name: "not in-target order",
227*46c4c49dSIbrahim Kanouche			conf: .8,
228*46c4c49dSIbrahim Kanouche			size: 100,
229*46c4c49dSIbrahim Kanouche			in: matchRanges{
230*46c4c49dSIbrahim Kanouche				&matchRange{
231*46c4c49dSIbrahim Kanouche					SrcStart:      0,
232*46c4c49dSIbrahim Kanouche					SrcEnd:        43,
233*46c4c49dSIbrahim Kanouche					TargetStart:   0,
234*46c4c49dSIbrahim Kanouche					TargetEnd:     43,
235*46c4c49dSIbrahim Kanouche					TokensClaimed: 43,
236*46c4c49dSIbrahim Kanouche				},
237*46c4c49dSIbrahim Kanouche				&matchRange{
238*46c4c49dSIbrahim Kanouche					SrcStart:      50,
239*46c4c49dSIbrahim Kanouche					SrcEnd:        93,
240*46c4c49dSIbrahim Kanouche					TargetStart:   0,
241*46c4c49dSIbrahim Kanouche					TargetEnd:     43,
242*46c4c49dSIbrahim Kanouche					TokensClaimed: 43,
243*46c4c49dSIbrahim Kanouche				},
244*46c4c49dSIbrahim Kanouche				&matchRange{
245*46c4c49dSIbrahim Kanouche					SrcStart:      60,
246*46c4c49dSIbrahim Kanouche					SrcEnd:        97,
247*46c4c49dSIbrahim Kanouche					TargetStart:   60,
248*46c4c49dSIbrahim Kanouche					TargetEnd:     97,
249*46c4c49dSIbrahim Kanouche					TokensClaimed: 37,
250*46c4c49dSIbrahim Kanouche				},
251*46c4c49dSIbrahim Kanouche				&matchRange{
252*46c4c49dSIbrahim Kanouche					SrcStart:      10,
253*46c4c49dSIbrahim Kanouche					SrcEnd:        47,
254*46c4c49dSIbrahim Kanouche					TargetStart:   60,
255*46c4c49dSIbrahim Kanouche					TargetEnd:     97,
256*46c4c49dSIbrahim Kanouche					TokensClaimed: 37,
257*46c4c49dSIbrahim Kanouche				},
258*46c4c49dSIbrahim Kanouche			},
259*46c4c49dSIbrahim Kanouche			out: matchRanges{
260*46c4c49dSIbrahim Kanouche				&matchRange{
261*46c4c49dSIbrahim Kanouche					SrcStart:      0,
262*46c4c49dSIbrahim Kanouche					SrcEnd:        97,
263*46c4c49dSIbrahim Kanouche					TargetStart:   0,
264*46c4c49dSIbrahim Kanouche					TargetEnd:     97,
265*46c4c49dSIbrahim Kanouche					TokensClaimed: 80,
266*46c4c49dSIbrahim Kanouche				},
267*46c4c49dSIbrahim Kanouche			},
268*46c4c49dSIbrahim Kanouche		},
269*46c4c49dSIbrahim Kanouche	}
270*46c4c49dSIbrahim Kanouche	for _, test := range tests {
271*46c4c49dSIbrahim Kanouche		t.Run(test.name, func(t *testing.T) {
272*46c4c49dSIbrahim Kanouche			var trace strings.Builder
273*46c4c49dSIbrahim Kanouche			c := NewClassifier(.8)
274*46c4c49dSIbrahim Kanouche			c.SetTraceConfiguration(&TraceConfiguration{
275*46c4c49dSIbrahim Kanouche				TraceLicenses: "*",
276*46c4c49dSIbrahim Kanouche				TracePhases:   "*",
277*46c4c49dSIbrahim Kanouche				Tracer: func(f string, args ...interface{}) {
278*46c4c49dSIbrahim Kanouche					trace.WriteString(fmt.Sprintf(f, args...))
279*46c4c49dSIbrahim Kanouche				},
280*46c4c49dSIbrahim Kanouche			})
281*46c4c49dSIbrahim Kanouche			runs := c.detectRuns(test.name, test.in, 100, 100, test.conf, 4)
282*46c4c49dSIbrahim Kanouche			actual := c.fuseRanges(test.name, test.in, test.conf, test.size, runs, 100)
283*46c4c49dSIbrahim Kanouche			if !cmp.Equal(actual, test.out) {
284*46c4c49dSIbrahim Kanouche				t.Errorf("%v: %v", test.name, cmp.Diff(actual, test.out))
285*46c4c49dSIbrahim Kanouche				t.Errorf("Trace:\n%s", trace.String())
286*46c4c49dSIbrahim Kanouche			}
287*46c4c49dSIbrahim Kanouche		})
288*46c4c49dSIbrahim Kanouche	}
289*46c4c49dSIbrahim Kanouche}
290*46c4c49dSIbrahim Kanouche
291*46c4c49dSIbrahim Kanouchefunc TestDetectRuns(t *testing.T) {
292*46c4c49dSIbrahim Kanouche	tests := []struct {
293*46c4c49dSIbrahim Kanouche		name                          string
294*46c4c49dSIbrahim Kanouche		matched                       matchRanges
295*46c4c49dSIbrahim Kanouche		targetLength, subsetLength, q int
296*46c4c49dSIbrahim Kanouche		threshold                     float64
297*46c4c49dSIbrahim Kanouche		expected                      []matchRange
298*46c4c49dSIbrahim Kanouche	}{
299*46c4c49dSIbrahim Kanouche		{
300*46c4c49dSIbrahim Kanouche			// For an exact match on 100 accurate tokens, the first q-gram
301*46c4c49dSIbrahim Kanouche			// is the only possible location hit we can return.
302*46c4c49dSIbrahim Kanouche			name:         "precise matching on perfect runs",
303*46c4c49dSIbrahim Kanouche			threshold:    1.0,
304*46c4c49dSIbrahim Kanouche			targetLength: 100,
305*46c4c49dSIbrahim Kanouche			subsetLength: 100,
306*46c4c49dSIbrahim Kanouche			q:            4,
307*46c4c49dSIbrahim Kanouche			matched: matchRanges{
308*46c4c49dSIbrahim Kanouche				&matchRange{TargetStart: 0, TargetEnd: 100},
309*46c4c49dSIbrahim Kanouche			},
310*46c4c49dSIbrahim Kanouche			expected: []matchRange{
311*46c4c49dSIbrahim Kanouche				{SrcStart: 0, SrcEnd: 4},
312*46c4c49dSIbrahim Kanouche			},
313*46c4c49dSIbrahim Kanouche		},
314*46c4c49dSIbrahim Kanouche		{
315*46c4c49dSIbrahim Kanouche			// For an 80% match on 100 accurate tokens, the first 20 token
316*46c4c49dSIbrahim Kanouche			// positions represent possible matches.
317*46c4c49dSIbrahim Kanouche			name:         "approximate matching on perfect runs",
318*46c4c49dSIbrahim Kanouche			threshold:    0.8,
319*46c4c49dSIbrahim Kanouche			targetLength: 100,
320*46c4c49dSIbrahim Kanouche			subsetLength: 100,
321*46c4c49dSIbrahim Kanouche			q:            4,
322*46c4c49dSIbrahim Kanouche			matched: matchRanges{
323*46c4c49dSIbrahim Kanouche				&matchRange{TargetStart: 0, TargetEnd: 100},
324*46c4c49dSIbrahim Kanouche			},
325*46c4c49dSIbrahim Kanouche			expected: []matchRange{
326*46c4c49dSIbrahim Kanouche				{SrcStart: 0, SrcEnd: 24},
327*46c4c49dSIbrahim Kanouche			},
328*46c4c49dSIbrahim Kanouche		},
329*46c4c49dSIbrahim Kanouche		{
330*46c4c49dSIbrahim Kanouche			name:         "multiple runs in a single target",
331*46c4c49dSIbrahim Kanouche			threshold:    0.8,
332*46c4c49dSIbrahim Kanouche			targetLength: 100,
333*46c4c49dSIbrahim Kanouche			subsetLength: 10,
334*46c4c49dSIbrahim Kanouche			q:            4,
335*46c4c49dSIbrahim Kanouche			matched: matchRanges{
336*46c4c49dSIbrahim Kanouche				&matchRange{TargetStart: 0, TargetEnd: 10},
337*46c4c49dSIbrahim Kanouche				&matchRange{TargetStart: 20, TargetEnd: 25},
338*46c4c49dSIbrahim Kanouche				&matchRange{TargetStart: 50, TargetEnd: 60},
339*46c4c49dSIbrahim Kanouche				&matchRange{TargetStart: 70, TargetEnd: 77},
340*46c4c49dSIbrahim Kanouche			},
341*46c4c49dSIbrahim Kanouche			expected: []matchRange{
342*46c4c49dSIbrahim Kanouche				// Runs end on 4-gram boundaries
343*46c4c49dSIbrahim Kanouche				{SrcStart: 0, SrcEnd: 6},
344*46c4c49dSIbrahim Kanouche				// The run starts early because of error tolerance
345*46c4c49dSIbrahim Kanouche				{SrcStart: 48, SrcEnd: 56},
346*46c4c49dSIbrahim Kanouche			},
347*46c4c49dSIbrahim Kanouche		},
348*46c4c49dSIbrahim Kanouche		{
349*46c4c49dSIbrahim Kanouche			name:         "bridge broken runs in a single target",
350*46c4c49dSIbrahim Kanouche			threshold:    0.8,
351*46c4c49dSIbrahim Kanouche			targetLength: 100,
352*46c4c49dSIbrahim Kanouche			subsetLength: 10,
353*46c4c49dSIbrahim Kanouche			q:            4,
354*46c4c49dSIbrahim Kanouche			matched: matchRanges{
355*46c4c49dSIbrahim Kanouche				&matchRange{TargetStart: 20, TargetEnd: 25},
356*46c4c49dSIbrahim Kanouche				&matchRange{TargetStart: 26, TargetEnd: 30},
357*46c4c49dSIbrahim Kanouche				&matchRange{TargetStart: 60, TargetEnd: 67},
358*46c4c49dSIbrahim Kanouche				&matchRange{TargetStart: 68, TargetEnd: 72},
359*46c4c49dSIbrahim Kanouche			},
360*46c4c49dSIbrahim Kanouche			expected: []matchRange{
361*46c4c49dSIbrahim Kanouche				// Runs end on 4-gram boundaries and start early because
362*46c4c49dSIbrahim Kanouche				// of error tolerance.
363*46c4c49dSIbrahim Kanouche				{SrcStart: 19, SrcEnd: 25},
364*46c4c49dSIbrahim Kanouche				{SrcStart: 59, SrcEnd: 67},
365*46c4c49dSIbrahim Kanouche			},
366*46c4c49dSIbrahim Kanouche		},
367*46c4c49dSIbrahim Kanouche	}
368*46c4c49dSIbrahim Kanouche
369*46c4c49dSIbrahim Kanouche	for _, test := range tests {
370*46c4c49dSIbrahim Kanouche		t.Run(test.name, func(t *testing.T) {
371*46c4c49dSIbrahim Kanouche			var trace strings.Builder
372*46c4c49dSIbrahim Kanouche			c := NewClassifier(.8)
373*46c4c49dSIbrahim Kanouche			c.SetTraceConfiguration(&TraceConfiguration{
374*46c4c49dSIbrahim Kanouche				TraceLicenses: "*",
375*46c4c49dSIbrahim Kanouche				TracePhases:   "*",
376*46c4c49dSIbrahim Kanouche				Tracer: func(f string, args ...interface{}) {
377*46c4c49dSIbrahim Kanouche					trace.WriteString(fmt.Sprintf(f, args...))
378*46c4c49dSIbrahim Kanouche				},
379*46c4c49dSIbrahim Kanouche			})
380*46c4c49dSIbrahim Kanouche			if got := c.detectRuns(test.name, test.matched, test.targetLength, test.subsetLength, test.threshold, test.q); !cmp.Equal(got, test.expected) {
381*46c4c49dSIbrahim Kanouche				t.Errorf(cmp.Diff(got, test.expected))
382*46c4c49dSIbrahim Kanouche				t.Errorf("Trace:\n%s", trace.String())
383*46c4c49dSIbrahim Kanouche			}
384*46c4c49dSIbrahim Kanouche
385*46c4c49dSIbrahim Kanouche		})
386*46c4c49dSIbrahim Kanouche	}
387*46c4c49dSIbrahim Kanouche}
388