xref: /aosp_15_r20/external/licenseclassifier/v2/scoring_test.go (revision 46c4c49da23cae783fa41bf46525a6505638499a)
1// Copyright 2020 Google Inc.
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7//     http://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14
15package classifier
16
17import (
18	"fmt"
19	"strings"
20	"testing"
21
22	"github.com/sergi/go-diff/diffmatchpatch"
23)
24
25func TestLevenshteinDiff(t *testing.T) {
26	tests := []struct {
27		name     string
28		diffs    []diffmatchpatch.Diff
29		expected int
30	}{
31		{
32			name: "identical text",
33			diffs: []diffmatchpatch.Diff{
34				{
35					Type: diffmatchpatch.DiffEqual,
36					Text: "equivalent text",
37				},
38			},
39			expected: 0,
40		},
41		{
42			name: "changed text",
43			// Adjacent inverse changes get scored with the maximum of the 2 change scores
44			diffs: []diffmatchpatch.Diff{
45				{
46					Type: diffmatchpatch.DiffDelete,
47					Text: "removed words",
48				},
49				{
50					Type: diffmatchpatch.DiffInsert,
51					Text: "inserted text here",
52				},
53			},
54			expected: 3,
55		},
56		{
57			name: "inserted text",
58			diffs: []diffmatchpatch.Diff{
59				{
60					Type: diffmatchpatch.DiffEqual,
61					Text: "identical words",
62				},
63				{
64					Type: diffmatchpatch.DiffInsert,
65					Text: "inserted",
66				},
67			},
68			expected: 1,
69		},
70		{
71			name: "deleted text",
72			diffs: []diffmatchpatch.Diff{
73				{
74					Type: diffmatchpatch.DiffDelete,
75					Text: "many extraneous deleted words",
76				},
77				{
78					Type: diffmatchpatch.DiffEqual,
79					Text: "before the equivalent text",
80				},
81			},
82			expected: 4,
83		},
84	}
85
86	for _, test := range tests {
87		t.Run(test.name, func(t *testing.T) {
88			if got := diffLevenshteinWord(test.diffs); got != test.expected {
89				t.Errorf("got %d wanted %d", got, test.expected)
90			}
91		})
92	}
93}
94
95func TestScoreDiffs(t *testing.T) {
96	tests := []struct {
97		name     string
98		license  string
99		diffs    []diffmatchpatch.Diff
100		expected int
101	}{
102		{
103			name:     "identical text",
104			license:  "License/MIT/license.txt",
105			diffs:    nil,
106			expected: 0,
107		},
108		{
109			name:    "acceptable change",
110			license: "License/MIT/license.txt",
111			diffs: []diffmatchpatch.Diff{
112				{
113					Type: diffmatchpatch.DiffEqual,
114					Text: "license",
115				},
116				{
117					Type: diffmatchpatch.DiffInsert,
118					Text: "as needed",
119				},
120				{
121					Type: diffmatchpatch.DiffDelete,
122					Text: "when necessary",
123				},
124			},
125			expected: 2,
126		},
127		{
128			name:    "version change",
129			license: "License/MIT/license.txt",
130			diffs: []diffmatchpatch.Diff{
131				{
132					Type: diffmatchpatch.DiffEqual,
133					Text: "version",
134				},
135				{
136					Type: diffmatchpatch.DiffInsert,
137					Text: "2",
138				},
139			},
140			expected: versionChange,
141		},
142		{
143			name:    "license name change by deletion",
144			license: "License/MIT/license.txt",
145			diffs: []diffmatchpatch.Diff{
146				{
147					Type: diffmatchpatch.DiffEqual,
148					Text: "gnu",
149				},
150				{
151					Type: diffmatchpatch.DiffDelete,
152					Text: "lesser",
153				},
154			},
155			expected: lesserGPLChange,
156		},
157		{
158			name:    "license name change by insertion",
159			license: "License/MIT/license.txt",
160			diffs: []diffmatchpatch.Diff{
161				{
162					Type: diffmatchpatch.DiffEqual,
163					Text: "gnu",
164				},
165				{
166					Type: diffmatchpatch.DiffInsert,
167					Text: "lesser",
168				},
169			},
170			expected: lesserGPLChange,
171		},
172		{
173			name:    "license name change by name insertion",
174			license: "License/ImageMagick/license.txt",
175			diffs: []diffmatchpatch.Diff{
176				{
177					Type: diffmatchpatch.DiffEqual,
178					Text: "license",
179				},
180				{
181					Type: diffmatchpatch.DiffInsert,
182					Text: "imagemagick",
183				},
184			},
185			expected: introducedPhraseChange,
186		},
187	}
188
189	for _, test := range tests {
190		t.Run(test.name, func(t *testing.T) {
191			if got := scoreDiffs(test.license, test.diffs); got != test.expected {
192				t.Errorf("got %d, want %d", got, test.expected)
193			}
194		})
195	}
196}
197
198func TestConfidencePercentage(t *testing.T) {
199	tests := []struct {
200		name           string
201		klen, distance int
202		expected       float64
203	}{
204		{
205			name:     "empty text",
206			klen:     0,
207			distance: 0,
208			expected: 1.0,
209		},
210		{
211			name:     "99% match",
212			klen:     100,
213			distance: 1,
214			expected: 0.99,
215		},
216	}
217
218	for _, test := range tests {
219		t.Run(test.name, func(t *testing.T) {
220			if got := confidencePercentage(test.klen, test.distance); got != test.expected {
221				t.Errorf("got %v want %v", got, test.expected)
222			}
223		})
224	}
225}
226
227func TestScore(t *testing.T) {
228	tests := []struct {
229		name                       string
230		known, unknown             string
231		expectedConf               float64
232		expectedStart, expectedEnd int
233	}{
234		{
235			name:          "identical text",
236			known:         "here is some sample text",
237			unknown:       "here is some sample text",
238			expectedConf:  1.00,
239			expectedStart: 0,
240			expectedEnd:   0,
241		},
242		{
243			name:          "close match with matching sizes",
244			known:         "here is some sample text",
245			unknown:       "here is different sample text",
246			expectedConf:  .8,
247			expectedStart: 0,
248			expectedEnd:   0,
249		},
250		{
251			name:          "close match with different sizes",
252			known:         "here is some sample text",
253			unknown:       "padding before here is different sample text",
254			expectedConf:  .8,
255			expectedStart: 2,
256			expectedEnd:   0,
257		},
258		{
259			name:          "no match due to unacceptable diff",
260			known:         "here is some sample text for version 2 of the license",
261			unknown:       "padding before here is different sample text for version 3 of the licenses",
262			expectedConf:  0.0,
263			expectedStart: 0,
264			expectedEnd:   0,
265		},
266	}
267
268	for _, test := range tests {
269		t.Run(test.name, func(t *testing.T) {
270			var trace strings.Builder
271			c := NewClassifier(.8)
272			c.SetTraceConfiguration(&TraceConfiguration{
273				TraceLicenses: "*",
274				TracePhases:   "*",
275				Tracer: func(f string, args ...interface{}) {
276					trace.WriteString(fmt.Sprintf(f, args...))
277				},
278			})
279			c.AddContent("", "known", "", []byte(test.known))
280			kd := c.getIndexedDocument("", "known", "")
281			ud := c.createTargetIndexedDocument([]byte(test.unknown))
282			// The name for the test needs to look like an asset path so we prepend
283			// the directory.
284			conf, so, eo := c.score("License/"+test.name, ud, kd, 0, ud.size())
285
286			success := true
287			if conf != test.expectedConf {
288				t.Errorf("conf: got %v want %v", conf, test.expectedConf)
289				success = false
290			}
291			if so != test.expectedStart {
292				t.Errorf("start offset: got %v want %v", so, test.expectedStart)
293				success = false
294			}
295			if eo != test.expectedEnd {
296				t.Errorf("end offset: got %v want %v", so, test.expectedEnd)
297				success = false
298			}
299
300			if !success {
301				t.Errorf("Trace:\n%s", trace.String())
302			}
303		})
304	}
305}
306