xref: /aosp_15_r20/external/licenseclassifier/classifier_test.go (revision 46c4c49da23cae783fa41bf46525a6505638499a)
// Copyright 2017 Google Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
14*46c4c49dSIbrahim Kanouche
15*46c4c49dSIbrahim Kanouchepackage licenseclassifier
16*46c4c49dSIbrahim Kanouche
17*46c4c49dSIbrahim Kanoucheimport (
18*46c4c49dSIbrahim Kanouche	"bytes"
19*46c4c49dSIbrahim Kanouche	"log"
20*46c4c49dSIbrahim Kanouche	"os"
21*46c4c49dSIbrahim Kanouche	"path/filepath"
22*46c4c49dSIbrahim Kanouche	"strings"
23*46c4c49dSIbrahim Kanouche	"testing"
24*46c4c49dSIbrahim Kanouche
25*46c4c49dSIbrahim Kanouche	"github.com/google/licenseclassifier/stringclassifier"
26*46c4c49dSIbrahim Kanouche)
27*46c4c49dSIbrahim Kanouche
28*46c4c49dSIbrahim Kanouchevar (
29*46c4c49dSIbrahim Kanouche	agpl30, agpl30Header, apache20, bsd3, gpl20, ccbync20 string
30*46c4c49dSIbrahim Kanouche	classifier                                            *License
31*46c4c49dSIbrahim Kanouche)
32*46c4c49dSIbrahim Kanouche
33*46c4c49dSIbrahim Kanouchefunc TestMain(m *testing.M) {
34*46c4c49dSIbrahim Kanouche	a30, err := ReadLicenseFile("AGPL-3.0.txt")
35*46c4c49dSIbrahim Kanouche	if err != nil {
36*46c4c49dSIbrahim Kanouche		log.Fatalf("error reading contents of AGPL-3.0.txt: %v", err)
37*46c4c49dSIbrahim Kanouche	}
38*46c4c49dSIbrahim Kanouche	a30h, err := ReadLicenseFile("AGPL-3.0.header.txt")
39*46c4c49dSIbrahim Kanouche	if err != nil {
40*46c4c49dSIbrahim Kanouche		log.Fatalf("error reading contents of AGPL-3.0.header.txt: %v", err)
41*46c4c49dSIbrahim Kanouche	}
42*46c4c49dSIbrahim Kanouche	a20, err := ReadLicenseFile("Apache-2.0.txt")
43*46c4c49dSIbrahim Kanouche	if err != nil {
44*46c4c49dSIbrahim Kanouche		log.Fatalf("error reading contents of Apache-2.0.txt: %v", err)
45*46c4c49dSIbrahim Kanouche	}
46*46c4c49dSIbrahim Kanouche	b3, err := ReadLicenseFile("BSD-3-Clause.txt")
47*46c4c49dSIbrahim Kanouche	if err != nil {
48*46c4c49dSIbrahim Kanouche		log.Fatalf("error reading contents of BSD-3-Clause.txt: %v", err)
49*46c4c49dSIbrahim Kanouche	}
50*46c4c49dSIbrahim Kanouche	g2, err := ReadLicenseFile("GPL-2.0.txt")
51*46c4c49dSIbrahim Kanouche	if err != nil {
52*46c4c49dSIbrahim Kanouche		log.Fatalf("error reading contents of GPL-2.0.txt: %v", err)
53*46c4c49dSIbrahim Kanouche	}
54*46c4c49dSIbrahim Kanouche	cc20, err := ReadLicenseFile("CC-BY-NC-2.0.txt")
55*46c4c49dSIbrahim Kanouche	if err != nil {
56*46c4c49dSIbrahim Kanouche		log.Fatalf("error reading contents of CC-BY-NC-2.0.txt: %v", err)
57*46c4c49dSIbrahim Kanouche	}
58*46c4c49dSIbrahim Kanouche
59*46c4c49dSIbrahim Kanouche	agpl30 = TrimExtraneousTrailingText(string(a30))
60*46c4c49dSIbrahim Kanouche	agpl30Header = TrimExtraneousTrailingText(string(a30h))
61*46c4c49dSIbrahim Kanouche	apache20 = TrimExtraneousTrailingText(string(a20))
62*46c4c49dSIbrahim Kanouche	bsd3 = TrimExtraneousTrailingText(string(b3))
63*46c4c49dSIbrahim Kanouche	gpl20 = TrimExtraneousTrailingText(string(g2))
64*46c4c49dSIbrahim Kanouche	ccbync20 = TrimExtraneousTrailingText(string(cc20))
65*46c4c49dSIbrahim Kanouche
66*46c4c49dSIbrahim Kanouche	classifier, err = New(DefaultConfidenceThreshold)
67*46c4c49dSIbrahim Kanouche	if err != nil {
68*46c4c49dSIbrahim Kanouche		log.Fatalf("cannot create license classifier: %v", err)
69*46c4c49dSIbrahim Kanouche	}
70*46c4c49dSIbrahim Kanouche	os.Exit(m.Run())
71*46c4c49dSIbrahim Kanouche}
72*46c4c49dSIbrahim Kanouche
73*46c4c49dSIbrahim Kanouchefunc TestClassifier_NearestMatch(t *testing.T) {
74*46c4c49dSIbrahim Kanouche	tests := []struct {
75*46c4c49dSIbrahim Kanouche		description    string
76*46c4c49dSIbrahim Kanouche		filename       string
77*46c4c49dSIbrahim Kanouche		extraText      string
78*46c4c49dSIbrahim Kanouche		wantLicense    string
79*46c4c49dSIbrahim Kanouche		wantConfidence float64
80*46c4c49dSIbrahim Kanouche	}{
81*46c4c49dSIbrahim Kanouche		{
82*46c4c49dSIbrahim Kanouche			description:    "AGPL 3.0 license",
83*46c4c49dSIbrahim Kanouche			filename:       "AGPL-3.0.txt",
84*46c4c49dSIbrahim Kanouche			wantLicense:    "AGPL-3.0",
85*46c4c49dSIbrahim Kanouche			wantConfidence: 1.0,
86*46c4c49dSIbrahim Kanouche		},
87*46c4c49dSIbrahim Kanouche		{
88*46c4c49dSIbrahim Kanouche			description:    "Apache 2.0 license",
89*46c4c49dSIbrahim Kanouche			filename:       "Apache-2.0.txt",
90*46c4c49dSIbrahim Kanouche			wantLicense:    "Apache-2.0",
91*46c4c49dSIbrahim Kanouche			wantConfidence: 1.0,
92*46c4c49dSIbrahim Kanouche		},
93*46c4c49dSIbrahim Kanouche		{
94*46c4c49dSIbrahim Kanouche			description:    "GPL 2.0 license",
95*46c4c49dSIbrahim Kanouche			filename:       "GPL-2.0.txt",
96*46c4c49dSIbrahim Kanouche			wantLicense:    "GPL-2.0",
97*46c4c49dSIbrahim Kanouche			wantConfidence: 1.0,
98*46c4c49dSIbrahim Kanouche		},
99*46c4c49dSIbrahim Kanouche		{
100*46c4c49dSIbrahim Kanouche			description:    "BSD 3 Clause license with extra text",
101*46c4c49dSIbrahim Kanouche			filename:       "BSD-3-Clause.txt",
102*46c4c49dSIbrahim Kanouche			extraText:      "New BSD License\nCopyright © 1998 Yoyodyne, Inc.\n",
103*46c4c49dSIbrahim Kanouche			wantLicense:    "BSD-3-Clause",
104*46c4c49dSIbrahim Kanouche			wantConfidence: 0.94,
105*46c4c49dSIbrahim Kanouche		},
106*46c4c49dSIbrahim Kanouche	}
107*46c4c49dSIbrahim Kanouche
108*46c4c49dSIbrahim Kanouche	classifier.Threshold = DefaultConfidenceThreshold
109*46c4c49dSIbrahim Kanouche	for _, tt := range tests {
110*46c4c49dSIbrahim Kanouche		content, err := ReadLicenseFile(tt.filename)
111*46c4c49dSIbrahim Kanouche		if err != nil {
112*46c4c49dSIbrahim Kanouche			t.Errorf("error reading contents of %q license: %v", tt.wantLicense, err)
113*46c4c49dSIbrahim Kanouche			continue
114*46c4c49dSIbrahim Kanouche		}
115*46c4c49dSIbrahim Kanouche
116*46c4c49dSIbrahim Kanouche		m := classifier.NearestMatch(tt.extraText + TrimExtraneousTrailingText(string(content)))
117*46c4c49dSIbrahim Kanouche		if got, want := m.Name, tt.wantLicense; got != want {
118*46c4c49dSIbrahim Kanouche			t.Errorf("NearestMatch(%q) = %q, want %q", tt.description, got, want)
119*46c4c49dSIbrahim Kanouche		}
120*46c4c49dSIbrahim Kanouche		if got, want := m.Confidence, tt.wantConfidence; got < want {
121*46c4c49dSIbrahim Kanouche			t.Errorf("NearestMatch(%q) = %v, want %v", tt.description, got, want)
122*46c4c49dSIbrahim Kanouche		}
123*46c4c49dSIbrahim Kanouche	}
124*46c4c49dSIbrahim Kanouche}
125*46c4c49dSIbrahim Kanouche
126*46c4c49dSIbrahim Kanouchefunc TestClassifier_MultipleMatch(t *testing.T) {
127*46c4c49dSIbrahim Kanouche	tests := []struct {
128*46c4c49dSIbrahim Kanouche		description string
129*46c4c49dSIbrahim Kanouche		text        string
130*46c4c49dSIbrahim Kanouche		want        stringclassifier.Matches
131*46c4c49dSIbrahim Kanouche	}{
132*46c4c49dSIbrahim Kanouche		{
133*46c4c49dSIbrahim Kanouche			description: "Two licenses",
134*46c4c49dSIbrahim Kanouche			text:        "Copyright (c) 2016 Yoyodyne, Inc.\n" + apache20 + strings.Repeat("-", 80) + "\n" + bsd3,
135*46c4c49dSIbrahim Kanouche			want: stringclassifier.Matches{
136*46c4c49dSIbrahim Kanouche				{
137*46c4c49dSIbrahim Kanouche					Name:       "Apache-2.0",
138*46c4c49dSIbrahim Kanouche					Confidence: 1.0,
139*46c4c49dSIbrahim Kanouche				},
140*46c4c49dSIbrahim Kanouche				{
141*46c4c49dSIbrahim Kanouche					Name:       "BSD-3-Clause",
142*46c4c49dSIbrahim Kanouche					Confidence: 1.0,
143*46c4c49dSIbrahim Kanouche				},
144*46c4c49dSIbrahim Kanouche			},
145*46c4c49dSIbrahim Kanouche		},
146*46c4c49dSIbrahim Kanouche		{
147*46c4c49dSIbrahim Kanouche			description: "Two licenses: partial match",
148*46c4c49dSIbrahim Kanouche			text: "Copyright (c) 2016 Yoyodyne, Inc.\n" +
149*46c4c49dSIbrahim Kanouche				string(apache20[:len(apache20)/2-1]) + string(apache20[len(apache20)/2+7:]) + strings.Repeat("-", 80) + "\n" +
150*46c4c49dSIbrahim Kanouche				string(bsd3[:len(bsd3)/2]) + "intervening stuff" + string(bsd3[len(bsd3)/2:]),
151*46c4c49dSIbrahim Kanouche			want: stringclassifier.Matches{
152*46c4c49dSIbrahim Kanouche				{
153*46c4c49dSIbrahim Kanouche					Name:       "Apache-2.0",
154*46c4c49dSIbrahim Kanouche					Confidence: 0.99,
155*46c4c49dSIbrahim Kanouche				},
156*46c4c49dSIbrahim Kanouche				{
157*46c4c49dSIbrahim Kanouche					Name:       "BSD-3-Clause",
158*46c4c49dSIbrahim Kanouche					Confidence: 0.98,
159*46c4c49dSIbrahim Kanouche				},
160*46c4c49dSIbrahim Kanouche			},
161*46c4c49dSIbrahim Kanouche		},
162*46c4c49dSIbrahim Kanouche		{
163*46c4c49dSIbrahim Kanouche			description: "Two licenses: one forbidden the other okay",
164*46c4c49dSIbrahim Kanouche			text:        "Copyright (c) 2016 Yoyodyne, Inc.\n" + apache20 + strings.Repeat("-", 80) + "\n" + ccbync20,
165*46c4c49dSIbrahim Kanouche			want: stringclassifier.Matches{
166*46c4c49dSIbrahim Kanouche				{
167*46c4c49dSIbrahim Kanouche					Name:       "Apache-2.0",
168*46c4c49dSIbrahim Kanouche					Confidence: 0.99,
169*46c4c49dSIbrahim Kanouche				},
170*46c4c49dSIbrahim Kanouche				{
171*46c4c49dSIbrahim Kanouche					Name:       "CC-BY-NC-2.0",
172*46c4c49dSIbrahim Kanouche					Confidence: 1.0,
173*46c4c49dSIbrahim Kanouche				},
174*46c4c49dSIbrahim Kanouche			},
175*46c4c49dSIbrahim Kanouche		},
176*46c4c49dSIbrahim Kanouche		{
177*46c4c49dSIbrahim Kanouche			description: "Two licenses without any space between them.",
178*46c4c49dSIbrahim Kanouche			text:        apache20 + "." + bsd3,
179*46c4c49dSIbrahim Kanouche			want: stringclassifier.Matches{
180*46c4c49dSIbrahim Kanouche				{
181*46c4c49dSIbrahim Kanouche					Name:       "Apache-2.0",
182*46c4c49dSIbrahim Kanouche					Confidence: 1.0,
183*46c4c49dSIbrahim Kanouche				},
184*46c4c49dSIbrahim Kanouche				{
185*46c4c49dSIbrahim Kanouche					Name:       "BSD-3-Clause",
186*46c4c49dSIbrahim Kanouche					Confidence: 1.0,
187*46c4c49dSIbrahim Kanouche				},
188*46c4c49dSIbrahim Kanouche			},
189*46c4c49dSIbrahim Kanouche		},
190*46c4c49dSIbrahim Kanouche	}
191*46c4c49dSIbrahim Kanouche
192*46c4c49dSIbrahim Kanouche	classifier.Threshold = 0.95
193*46c4c49dSIbrahim Kanouche	defer func() {
194*46c4c49dSIbrahim Kanouche		classifier.Threshold = DefaultConfidenceThreshold
195*46c4c49dSIbrahim Kanouche	}()
196*46c4c49dSIbrahim Kanouche	for _, tt := range tests {
197*46c4c49dSIbrahim Kanouche		m := classifier.MultipleMatch(tt.text, false)
198*46c4c49dSIbrahim Kanouche		if len(m) != len(tt.want) {
199*46c4c49dSIbrahim Kanouche			t.Fatalf("MultipleMatch(%q) number matches: %v, want %v", tt.description, len(m), len(tt.want))
200*46c4c49dSIbrahim Kanouche			continue
201*46c4c49dSIbrahim Kanouche		}
202*46c4c49dSIbrahim Kanouche
203*46c4c49dSIbrahim Kanouche		for i := 0; i < len(m); i++ {
204*46c4c49dSIbrahim Kanouche			w := tt.want[i]
205*46c4c49dSIbrahim Kanouche			if got, want := m[i].Name, w.Name; got != want {
206*46c4c49dSIbrahim Kanouche				t.Errorf("MultipleMatch(%q) = %q, want %q", tt.description, got, want)
207*46c4c49dSIbrahim Kanouche			}
208*46c4c49dSIbrahim Kanouche			if got, want := m[i].Confidence, w.Confidence; got < want {
209*46c4c49dSIbrahim Kanouche				t.Errorf("MultipleMatch(%q) = %v, want %v", tt.description, got, want)
210*46c4c49dSIbrahim Kanouche			}
211*46c4c49dSIbrahim Kanouche		}
212*46c4c49dSIbrahim Kanouche	}
213*46c4c49dSIbrahim Kanouche}
214*46c4c49dSIbrahim Kanouche
215*46c4c49dSIbrahim Kanouchefunc TestClassifier_MultipleMatch_Headers(t *testing.T) {
216*46c4c49dSIbrahim Kanouche	tests := []struct {
217*46c4c49dSIbrahim Kanouche		description string
218*46c4c49dSIbrahim Kanouche		text        string
219*46c4c49dSIbrahim Kanouche		want        stringclassifier.Matches
220*46c4c49dSIbrahim Kanouche	}{
221*46c4c49dSIbrahim Kanouche		{
222*46c4c49dSIbrahim Kanouche			description: "AGPL-3.0 header",
223*46c4c49dSIbrahim Kanouche			text:        "Copyright (c) 2016 Yoyodyne, Inc.\n" + agpl30Header,
224*46c4c49dSIbrahim Kanouche			want: stringclassifier.Matches{
225*46c4c49dSIbrahim Kanouche				{
226*46c4c49dSIbrahim Kanouche					Name:       "AGPL-3.0",
227*46c4c49dSIbrahim Kanouche					Confidence: 1.0,
228*46c4c49dSIbrahim Kanouche					Offset:     0,
229*46c4c49dSIbrahim Kanouche				},
230*46c4c49dSIbrahim Kanouche			},
231*46c4c49dSIbrahim Kanouche		},
232*46c4c49dSIbrahim Kanouche		{
233*46c4c49dSIbrahim Kanouche			description: "Modified LGPL-2.1 header",
234*46c4c49dSIbrahim Kanouche			text: `Common Widget code.
235*46c4c49dSIbrahim Kanouche
236*46c4c49dSIbrahim KanoucheCopyright (C) 2013-2015 Yoyodyne, Inc.
237*46c4c49dSIbrahim Kanouche
238*46c4c49dSIbrahim KanoucheThis library is free software; you can redistribute it and/or
239*46c4c49dSIbrahim Kanouchemodify it under the terms of the GNU Lesser General Public
240*46c4c49dSIbrahim KanoucheLicense as published by the Free Software Foundation; either
241*46c4c49dSIbrahim Kanoucheversion 2.1 of the License, or (at your option) any later version (but not!).
242*46c4c49dSIbrahim Kanouche
243*46c4c49dSIbrahim KanoucheThis library is distributed in the hope that it will be useful,
244*46c4c49dSIbrahim Kanouchebut WITHOUT ANY WARRANTY; without even the implied warranty of
245*46c4c49dSIbrahim KanoucheMERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
246*46c4c49dSIbrahim KanoucheLesser General Public License for more details.
247*46c4c49dSIbrahim Kanouche
248*46c4c49dSIbrahim KanoucheYou should have received a copy of the GNU Lesser General Public
249*46c4c49dSIbrahim KanoucheLicense along with this library; if not, write to the Free Software
250*46c4c49dSIbrahim KanoucheFoundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
251*46c4c49dSIbrahim Kanouche`,
252*46c4c49dSIbrahim Kanouche			want: stringclassifier.Matches{
253*46c4c49dSIbrahim Kanouche				{
254*46c4c49dSIbrahim Kanouche					Name:       "LGPL-2.1",
255*46c4c49dSIbrahim Kanouche					Confidence: 0.97,
256*46c4c49dSIbrahim Kanouche					Offset:     197,
257*46c4c49dSIbrahim Kanouche				},
258*46c4c49dSIbrahim Kanouche			},
259*46c4c49dSIbrahim Kanouche		},
260*46c4c49dSIbrahim Kanouche	}
261*46c4c49dSIbrahim Kanouche
262*46c4c49dSIbrahim Kanouche	classifier.Threshold = 0.90
263*46c4c49dSIbrahim Kanouche	defer func() {
264*46c4c49dSIbrahim Kanouche		classifier.Threshold = DefaultConfidenceThreshold
265*46c4c49dSIbrahim Kanouche	}()
266*46c4c49dSIbrahim Kanouche	for _, tt := range tests {
267*46c4c49dSIbrahim Kanouche		m := classifier.MultipleMatch(tt.text, true)
268*46c4c49dSIbrahim Kanouche		if len(m) != len(tt.want) {
269*46c4c49dSIbrahim Kanouche			t.Errorf("MultipleMatch(%q) number matches: %v, want %v", tt.description, len(m), len(tt.want))
270*46c4c49dSIbrahim Kanouche			continue
271*46c4c49dSIbrahim Kanouche		}
272*46c4c49dSIbrahim Kanouche
273*46c4c49dSIbrahim Kanouche		for i := 0; i < len(m); i++ {
274*46c4c49dSIbrahim Kanouche			w := tt.want[i]
275*46c4c49dSIbrahim Kanouche			if got, want := m[i].Name, w.Name; got != want {
276*46c4c49dSIbrahim Kanouche				t.Errorf("MultipleMatch(%q) = %q, want %q", tt.description, got, want)
277*46c4c49dSIbrahim Kanouche			}
278*46c4c49dSIbrahim Kanouche			if got, want := m[i].Confidence, w.Confidence; got < want {
279*46c4c49dSIbrahim Kanouche				t.Errorf("MultipleMatch(%q) = %v, want %v", tt.description, got, want)
280*46c4c49dSIbrahim Kanouche			}
281*46c4c49dSIbrahim Kanouche		}
282*46c4c49dSIbrahim Kanouche	}
283*46c4c49dSIbrahim Kanouche}
284*46c4c49dSIbrahim Kanouche
285*46c4c49dSIbrahim Kanouchefunc TestClassifier_CopyrightHolder(t *testing.T) {
286*46c4c49dSIbrahim Kanouche	tests := []struct {
287*46c4c49dSIbrahim Kanouche		copyright string
288*46c4c49dSIbrahim Kanouche		want      string
289*46c4c49dSIbrahim Kanouche	}{
290*46c4c49dSIbrahim Kanouche		{
291*46c4c49dSIbrahim Kanouche			copyright: "Copyright 2008 Yoyodyne Inc. All Rights Reserved.",
292*46c4c49dSIbrahim Kanouche			want:      "Yoyodyne Inc.",
293*46c4c49dSIbrahim Kanouche		},
294*46c4c49dSIbrahim Kanouche		{
295*46c4c49dSIbrahim Kanouche			copyright: "Copyright 2010-2016 Yoyodyne, Inc.",
296*46c4c49dSIbrahim Kanouche			want:      "Yoyodyne, Inc.",
297*46c4c49dSIbrahim Kanouche		},
298*46c4c49dSIbrahim Kanouche		{
299*46c4c49dSIbrahim Kanouche			copyright: "Copyright 2010, 2011, 2012 Yoyodyne, Inc., All rights reserved.",
300*46c4c49dSIbrahim Kanouche			want:      "Yoyodyne, Inc.",
301*46c4c49dSIbrahim Kanouche		},
302*46c4c49dSIbrahim Kanouche		{
303*46c4c49dSIbrahim Kanouche			copyright: "Copyright (c) 2015 Yoyodyne, Inc. All rights reserved.",
304*46c4c49dSIbrahim Kanouche			want:      "Yoyodyne, Inc.",
305*46c4c49dSIbrahim Kanouche		},
306*46c4c49dSIbrahim Kanouche		{
307*46c4c49dSIbrahim Kanouche			copyright: "Copyright © 1998 by Yoyodyne, Inc., San Narciso, CA, US.",
308*46c4c49dSIbrahim Kanouche			want:      "Yoyodyne, Inc., San Narciso, CA, US",
309*46c4c49dSIbrahim Kanouche		},
310*46c4c49dSIbrahim Kanouche		{
311*46c4c49dSIbrahim Kanouche			copyright: "Copyright (c) 2015 The Algonquin Round Table. All rights reserved.",
312*46c4c49dSIbrahim Kanouche			want:      "The Algonquin Round Table",
313*46c4c49dSIbrahim Kanouche		},
314*46c4c49dSIbrahim Kanouche		{
315*46c4c49dSIbrahim Kanouche			copyright: "Copyright 2016, The Android Open Source Project",
316*46c4c49dSIbrahim Kanouche			want:      "The Android Open Source Project",
317*46c4c49dSIbrahim Kanouche		},
318*46c4c49dSIbrahim Kanouche		{
319*46c4c49dSIbrahim Kanouche			copyright: `---------------------------------------------------------
320*46c4c49dSIbrahim Kanouchefoo.c:
321*46c4c49dSIbrahim KanoucheCopyright 2016, The Android Open Source Project
322*46c4c49dSIbrahim Kanouche`,
323*46c4c49dSIbrahim Kanouche			want: "The Android Open Source Project",
324*46c4c49dSIbrahim Kanouche		},
325*46c4c49dSIbrahim Kanouche	}
326*46c4c49dSIbrahim Kanouche
327*46c4c49dSIbrahim Kanouche	for _, tt := range tests {
328*46c4c49dSIbrahim Kanouche		got := CopyrightHolder(tt.copyright)
329*46c4c49dSIbrahim Kanouche		if got != tt.want {
330*46c4c49dSIbrahim Kanouche			t.Errorf("CopyrightHolder(%q) = %q, want %q", tt.copyright, got, tt.want)
331*46c4c49dSIbrahim Kanouche		}
332*46c4c49dSIbrahim Kanouche	}
333*46c4c49dSIbrahim Kanouche}
334*46c4c49dSIbrahim Kanouche
335*46c4c49dSIbrahim Kanouchefunc TestClassifier_WithinConfidenceThreshold(t *testing.T) {
336*46c4c49dSIbrahim Kanouche	tests := []struct {
337*46c4c49dSIbrahim Kanouche		description string
338*46c4c49dSIbrahim Kanouche		text        string
339*46c4c49dSIbrahim Kanouche		confDef     bool
340*46c4c49dSIbrahim Kanouche		conf99      bool
341*46c4c49dSIbrahim Kanouche		conf93      bool
342*46c4c49dSIbrahim Kanouche		conf5       bool
343*46c4c49dSIbrahim Kanouche	}{
344*46c4c49dSIbrahim Kanouche		{
345*46c4c49dSIbrahim Kanouche			description: "Apache 2.0",
346*46c4c49dSIbrahim Kanouche			text:        apache20,
347*46c4c49dSIbrahim Kanouche			confDef:     true,
348*46c4c49dSIbrahim Kanouche			conf99:      true,
349*46c4c49dSIbrahim Kanouche			conf93:      true,
350*46c4c49dSIbrahim Kanouche			conf5:       true,
351*46c4c49dSIbrahim Kanouche		},
352*46c4c49dSIbrahim Kanouche		{
353*46c4c49dSIbrahim Kanouche			description: "GPL 2.0",
354*46c4c49dSIbrahim Kanouche			text:        gpl20,
355*46c4c49dSIbrahim Kanouche			confDef:     true,
356*46c4c49dSIbrahim Kanouche			conf99:      true,
357*46c4c49dSIbrahim Kanouche			conf93:      true,
358*46c4c49dSIbrahim Kanouche			conf5:       true,
359*46c4c49dSIbrahim Kanouche		},
360*46c4c49dSIbrahim Kanouche		{
361*46c4c49dSIbrahim Kanouche			description: "BSD 3 Clause license with extra text",
362*46c4c49dSIbrahim Kanouche			text:        "New BSD License\nCopyright © 1998 Yoyodyne, Inc.\n" + bsd3,
363*46c4c49dSIbrahim Kanouche			confDef:     true,
364*46c4c49dSIbrahim Kanouche			conf99:      true,
365*46c4c49dSIbrahim Kanouche			conf93:      true,
366*46c4c49dSIbrahim Kanouche			conf5:       true,
367*46c4c49dSIbrahim Kanouche		},
368*46c4c49dSIbrahim Kanouche		{
369*46c4c49dSIbrahim Kanouche			description: "Very low confidence",
370*46c4c49dSIbrahim Kanouche			text:        strings.Repeat("Random text is random, but not a license\n", 40),
371*46c4c49dSIbrahim Kanouche			confDef:     false,
372*46c4c49dSIbrahim Kanouche			conf99:      false,
373*46c4c49dSIbrahim Kanouche			conf93:      false,
374*46c4c49dSIbrahim Kanouche			conf5:       true,
375*46c4c49dSIbrahim Kanouche		},
376*46c4c49dSIbrahim Kanouche	}
377*46c4c49dSIbrahim Kanouche
378*46c4c49dSIbrahim Kanouche	defer func() {
379*46c4c49dSIbrahim Kanouche		classifier.Threshold = DefaultConfidenceThreshold
380*46c4c49dSIbrahim Kanouche	}()
381*46c4c49dSIbrahim Kanouche	for _, tt := range tests {
382*46c4c49dSIbrahim Kanouche		t.Run(tt.description, func(t *testing.T) {
383*46c4c49dSIbrahim Kanouche			classifier.Threshold = DefaultConfidenceThreshold
384*46c4c49dSIbrahim Kanouche			m := classifier.NearestMatch(tt.text)
385*46c4c49dSIbrahim Kanouche			if got := classifier.WithinConfidenceThreshold(m.Confidence); got != tt.confDef {
386*46c4c49dSIbrahim Kanouche				t.Errorf("WithinConfidenceThreshold() at %v returned wrong result; got %v, want %v", classifier.Threshold, got, tt.confDef)
387*46c4c49dSIbrahim Kanouche			}
388*46c4c49dSIbrahim Kanouche
389*46c4c49dSIbrahim Kanouche			classifier.Threshold = 0.99
390*46c4c49dSIbrahim Kanouche			m = classifier.NearestMatch(tt.text)
391*46c4c49dSIbrahim Kanouche			if got := classifier.WithinConfidenceThreshold(m.Confidence); got != tt.conf99 {
392*46c4c49dSIbrahim Kanouche				t.Errorf("WithinConfidenceThreshold(%q) = %v, want %v", tt.description, got, tt.conf99)
393*46c4c49dSIbrahim Kanouche			}
394*46c4c49dSIbrahim Kanouche
395*46c4c49dSIbrahim Kanouche			classifier.Threshold = 0.93
396*46c4c49dSIbrahim Kanouche			m = classifier.NearestMatch(tt.text)
397*46c4c49dSIbrahim Kanouche			if got := classifier.WithinConfidenceThreshold(m.Confidence); got != tt.conf93 {
398*46c4c49dSIbrahim Kanouche				t.Errorf("WithinConfidenceThreshold(%q) = %v, want %v", tt.description, got, tt.conf93)
399*46c4c49dSIbrahim Kanouche			}
400*46c4c49dSIbrahim Kanouche
401*46c4c49dSIbrahim Kanouche			classifier.Threshold = 0.05
402*46c4c49dSIbrahim Kanouche			m = classifier.NearestMatch(tt.text)
403*46c4c49dSIbrahim Kanouche			if got := classifier.WithinConfidenceThreshold(m.Confidence); got != tt.conf5 {
404*46c4c49dSIbrahim Kanouche				t.Errorf("WithinConfidenceThreshold(%q) = %v, want %v", tt.description, got, tt.conf5)
405*46c4c49dSIbrahim Kanouche			}
406*46c4c49dSIbrahim Kanouche		})
407*46c4c49dSIbrahim Kanouche	}
408*46c4c49dSIbrahim Kanouche}
409*46c4c49dSIbrahim Kanouche
410*46c4c49dSIbrahim Kanouchefunc TestRemoveIgnorableText(t *testing.T) {
411*46c4c49dSIbrahim Kanouche	const want = `Lorem ipsum dolor sit amet, pellentesque wisi tortor duis, amet adipiscing bibendum elit aliquam
412*46c4c49dSIbrahim Kanoucheleo. Mattis commodo sed accumsan at in.
413*46c4c49dSIbrahim Kanouche`
414*46c4c49dSIbrahim Kanouche
415*46c4c49dSIbrahim Kanouche	tests := []struct {
416*46c4c49dSIbrahim Kanouche		original string
417*46c4c49dSIbrahim Kanouche		want     string
418*46c4c49dSIbrahim Kanouche	}{
419*46c4c49dSIbrahim Kanouche		{"MIT License\n", "\n"},
420*46c4c49dSIbrahim Kanouche		{"The MIT License\n", "\n"},
421*46c4c49dSIbrahim Kanouche		{"The MIT License (MIT)\n", "\n"},
422*46c4c49dSIbrahim Kanouche		{"BSD License\n", "\n"},
423*46c4c49dSIbrahim Kanouche		{"New BSD License\n", "\n"},
424*46c4c49dSIbrahim Kanouche		{"COPYRIGHT AND PERMISSION NOTICE\n", "\n"},
425*46c4c49dSIbrahim Kanouche		{"Copyright (c) 2016, Yoyodyne, Inc.\n", "\n"},
426*46c4c49dSIbrahim Kanouche		{"All rights reserved.\n", "\n"},
427*46c4c49dSIbrahim Kanouche		{"Some rights reserved.\n", "\n"},
428*46c4c49dSIbrahim Kanouche		{"@license\n", "\n"},
429*46c4c49dSIbrahim Kanouche
430*46c4c49dSIbrahim Kanouche		// Now with wanted texts.
431*46c4c49dSIbrahim Kanouche		{
432*46c4c49dSIbrahim Kanouche			original: `The MIT License
433*46c4c49dSIbrahim Kanouche
434*46c4c49dSIbrahim KanoucheCopyright (c) 2016, Yoyodyne, Inc.
435*46c4c49dSIbrahim KanoucheAll rights reserved.
436*46c4c49dSIbrahim Kanouche` + want,
437*46c4c49dSIbrahim Kanouche			want: strings.ToLower(want),
438*46c4c49dSIbrahim Kanouche		},
439*46c4c49dSIbrahim Kanouche	}
440*46c4c49dSIbrahim Kanouche
441*46c4c49dSIbrahim Kanouche	for _, tt := range tests {
442*46c4c49dSIbrahim Kanouche		if got := removeIgnorableTexts(strings.ToLower(tt.original)); got != tt.want {
443*46c4c49dSIbrahim Kanouche			t.Errorf("Mismatch(%q) =>\n%s\nwant:\n%s", tt.original, got, tt.want)
444*46c4c49dSIbrahim Kanouche		}
445*46c4c49dSIbrahim Kanouche	}
446*46c4c49dSIbrahim Kanouche}
447*46c4c49dSIbrahim Kanouche
448*46c4c49dSIbrahim Kanouchefunc TestRemoveShebangLine(t *testing.T) {
449*46c4c49dSIbrahim Kanouche	tests := []struct {
450*46c4c49dSIbrahim Kanouche		original string
451*46c4c49dSIbrahim Kanouche		want     string
452*46c4c49dSIbrahim Kanouche	}{
453*46c4c49dSIbrahim Kanouche		{
454*46c4c49dSIbrahim Kanouche			original: "",
455*46c4c49dSIbrahim Kanouche			want:     "",
456*46c4c49dSIbrahim Kanouche		},
457*46c4c49dSIbrahim Kanouche		{
458*46c4c49dSIbrahim Kanouche			original: "#!/usr/bin/env python -C",
459*46c4c49dSIbrahim Kanouche			want:     "#!/usr/bin/env python -C",
460*46c4c49dSIbrahim Kanouche		},
461*46c4c49dSIbrahim Kanouche		{
462*46c4c49dSIbrahim Kanouche			original: `#!/usr/bin/env python -C
463*46c4c49dSIbrahim Kanouche# First line of license text.
464*46c4c49dSIbrahim Kanouche# Second line of license text.
465*46c4c49dSIbrahim Kanouche`,
466*46c4c49dSIbrahim Kanouche			want: `# First line of license text.
467*46c4c49dSIbrahim Kanouche# Second line of license text.
468*46c4c49dSIbrahim Kanouche`,
469*46c4c49dSIbrahim Kanouche		},
470*46c4c49dSIbrahim Kanouche		{
471*46c4c49dSIbrahim Kanouche			original: `# First line of license text.
472*46c4c49dSIbrahim Kanouche# Second line of license text.
473*46c4c49dSIbrahim Kanouche`,
474*46c4c49dSIbrahim Kanouche			want: `# First line of license text.
475*46c4c49dSIbrahim Kanouche# Second line of license text.
476*46c4c49dSIbrahim Kanouche`,
477*46c4c49dSIbrahim Kanouche		},
478*46c4c49dSIbrahim Kanouche	}
479*46c4c49dSIbrahim Kanouche
480*46c4c49dSIbrahim Kanouche	for _, tt := range tests {
481*46c4c49dSIbrahim Kanouche		got := removeShebangLine(tt.original)
482*46c4c49dSIbrahim Kanouche		if got != tt.want {
483*46c4c49dSIbrahim Kanouche			t.Errorf("RemoveShebangLine(%q) =>\n%s\nwant:\n%s", tt.original, got, tt.want)
484*46c4c49dSIbrahim Kanouche		}
485*46c4c49dSIbrahim Kanouche	}
486*46c4c49dSIbrahim Kanouche}
487*46c4c49dSIbrahim Kanouche
488*46c4c49dSIbrahim Kanouchefunc TestRemoveNonWords(t *testing.T) {
489*46c4c49dSIbrahim Kanouche	tests := []struct {
490*46c4c49dSIbrahim Kanouche		original string
491*46c4c49dSIbrahim Kanouche		want     string
492*46c4c49dSIbrahim Kanouche	}{
493*46c4c49dSIbrahim Kanouche		{
494*46c4c49dSIbrahim Kanouche			original: `# # Hello
495*46c4c49dSIbrahim Kanouche## World
496*46c4c49dSIbrahim Kanouche`,
497*46c4c49dSIbrahim Kanouche			want: ` Hello World `,
498*46c4c49dSIbrahim Kanouche		},
499*46c4c49dSIbrahim Kanouche		{
500*46c4c49dSIbrahim Kanouche			original: ` * This text has a bulleted list:
501*46c4c49dSIbrahim Kanouche * * item 1
502*46c4c49dSIbrahim Kanouche * * item 2`,
503*46c4c49dSIbrahim Kanouche			want: ` This text has a bulleted list item 1 item 2`,
504*46c4c49dSIbrahim Kanouche		},
505*46c4c49dSIbrahim Kanouche		{
506*46c4c49dSIbrahim Kanouche			original: `
507*46c4c49dSIbrahim Kanouche
508*46c4c49dSIbrahim Kanouche * This text has a bulleted list:
509*46c4c49dSIbrahim Kanouche * * item 1
510*46c4c49dSIbrahim Kanouche * * item 2`,
511*46c4c49dSIbrahim Kanouche			want: ` This text has a bulleted list item 1 item 2`,
512*46c4c49dSIbrahim Kanouche		},
513*46c4c49dSIbrahim Kanouche		{
514*46c4c49dSIbrahim Kanouche			original: `// This text has a bulleted list:
515*46c4c49dSIbrahim Kanouche// 1. item 1
516*46c4c49dSIbrahim Kanouche// 2. item 2`,
517*46c4c49dSIbrahim Kanouche			want: ` This text has a bulleted list 1 item 1 2 item 2`,
518*46c4c49dSIbrahim Kanouche		},
519*46c4c49dSIbrahim Kanouche		{
520*46c4c49dSIbrahim Kanouche			original: `// «Copyright (c) 1998 Yoyodyne, Inc.»
521*46c4c49dSIbrahim Kanouche// This text has a bulleted list:
522*46c4c49dSIbrahim Kanouche// 1. item 1
523*46c4c49dSIbrahim Kanouche// 2. item 2
524*46c4c49dSIbrahim Kanouche`,
525*46c4c49dSIbrahim Kanouche			want: ` «Copyright c 1998 Yoyodyne Inc » This text has a bulleted list 1 item 1 2 item 2 `,
526*46c4c49dSIbrahim Kanouche		},
527*46c4c49dSIbrahim Kanouche		{
528*46c4c49dSIbrahim Kanouche			original: `*
529*46c4c49dSIbrahim Kanouche * This is the first line we want.
530*46c4c49dSIbrahim Kanouche * This is the second line we want.
531*46c4c49dSIbrahim Kanouche * This is the third line we want.
532*46c4c49dSIbrahim Kanouche * This is the last line we want.
533*46c4c49dSIbrahim Kanouche`,
534*46c4c49dSIbrahim Kanouche			want: ` This is the first line we want This is the second line we want This is the third line we want This is the last line we want `,
535*46c4c49dSIbrahim Kanouche		},
536*46c4c49dSIbrahim Kanouche		{
537*46c4c49dSIbrahim Kanouche			original: `===---------------------------------------------===
538*46c4c49dSIbrahim Kanouche***
539*46c4c49dSIbrahim Kanouche* This is the first line we want.
540*46c4c49dSIbrahim Kanouche* This is the second line we want.
541*46c4c49dSIbrahim Kanouche* This is the third line we want.
542*46c4c49dSIbrahim Kanouche* This is the last line we want.
543*46c4c49dSIbrahim Kanouche***
544*46c4c49dSIbrahim Kanouche===---------------------------------------------===
545*46c4c49dSIbrahim Kanouche`,
546*46c4c49dSIbrahim Kanouche			want: ` This is the first line we want This is the second line we want This is the third line we want This is the last line we want `,
547*46c4c49dSIbrahim Kanouche		},
548*46c4c49dSIbrahim Kanouche		{
549*46c4c49dSIbrahim Kanouche			original: strings.Repeat("-", 80),
550*46c4c49dSIbrahim Kanouche			want:     " ",
551*46c4c49dSIbrahim Kanouche		},
552*46c4c49dSIbrahim Kanouche		{
553*46c4c49dSIbrahim Kanouche			original: strings.Repeat("=", 80),
554*46c4c49dSIbrahim Kanouche			want:     " ",
555*46c4c49dSIbrahim Kanouche		},
556*46c4c49dSIbrahim Kanouche		{
557*46c4c49dSIbrahim Kanouche			original: "/*\n",
558*46c4c49dSIbrahim Kanouche			want:     " ",
559*46c4c49dSIbrahim Kanouche		},
560*46c4c49dSIbrahim Kanouche		{
561*46c4c49dSIbrahim Kanouche			original: "/*\n * precursor text\n */\n",
562*46c4c49dSIbrahim Kanouche			want:     " precursor text ",
563*46c4c49dSIbrahim Kanouche		},
564*46c4c49dSIbrahim Kanouche		// Test for b/63540492.
565*46c4c49dSIbrahim Kanouche		{
566*46c4c49dSIbrahim Kanouche			original: " */\n",
567*46c4c49dSIbrahim Kanouche			want:     " ",
568*46c4c49dSIbrahim Kanouche		},
569*46c4c49dSIbrahim Kanouche		{
570*46c4c49dSIbrahim Kanouche			original: "",
571*46c4c49dSIbrahim Kanouche			want:     "",
572*46c4c49dSIbrahim Kanouche		},
573*46c4c49dSIbrahim Kanouche	}
574*46c4c49dSIbrahim Kanouche
575*46c4c49dSIbrahim Kanouche	for _, tt := range tests {
576*46c4c49dSIbrahim Kanouche		if got := stringclassifier.FlattenWhitespace(RemoveNonWords(tt.original)); got != tt.want {
577*46c4c49dSIbrahim Kanouche			t.Errorf("Mismatch(%q) => %v, want %v", tt.original, got, tt.want)
578*46c4c49dSIbrahim Kanouche		}
579*46c4c49dSIbrahim Kanouche	}
580*46c4c49dSIbrahim Kanouche}
581*46c4c49dSIbrahim Kanouche
582*46c4c49dSIbrahim Kanouchefunc TestNormalizePunctuation(t *testing.T) {
583*46c4c49dSIbrahim Kanouche	tests := []struct {
584*46c4c49dSIbrahim Kanouche		original string
585*46c4c49dSIbrahim Kanouche		want     string
586*46c4c49dSIbrahim Kanouche	}{
587*46c4c49dSIbrahim Kanouche		// Hyphens and dashes.
588*46c4c49dSIbrahim Kanouche		{"—", "-"},
589*46c4c49dSIbrahim Kanouche		{"-", "-"},
590*46c4c49dSIbrahim Kanouche		{"‒", "-"},
591*46c4c49dSIbrahim Kanouche		{"–", "-"},
592*46c4c49dSIbrahim Kanouche		{"—", "-"},
593*46c4c49dSIbrahim Kanouche
594*46c4c49dSIbrahim Kanouche		// Quotes.
595*46c4c49dSIbrahim Kanouche		{"'", "'"},
596*46c4c49dSIbrahim Kanouche		{`"`, "'"},
597*46c4c49dSIbrahim Kanouche		{"‘", "'"},
598*46c4c49dSIbrahim Kanouche		{"", "'"},
599*46c4c49dSIbrahim Kanouche		{"“", "'"},
600*46c4c49dSIbrahim Kanouche		{"", "'"},
601*46c4c49dSIbrahim Kanouche		{" ” ", " ' "},
602*46c4c49dSIbrahim Kanouche
603*46c4c49dSIbrahim Kanouche		// Backtick.
604*46c4c49dSIbrahim Kanouche		{"`", "'"},
605*46c4c49dSIbrahim Kanouche
606*46c4c49dSIbrahim Kanouche		// Copyright mark.
607*46c4c49dSIbrahim Kanouche		{"©", "(c)"},
608*46c4c49dSIbrahim Kanouche
609*46c4c49dSIbrahim Kanouche		// Hyphen-separated words.
610*46c4c49dSIbrahim Kanouche		{"general- purpose, non- compliant", "general-purpose, non-compliant"},
611*46c4c49dSIbrahim Kanouche
612*46c4c49dSIbrahim Kanouche		// Section.
613*46c4c49dSIbrahim Kanouche		{"§", "(s)"},
614*46c4c49dSIbrahim Kanouche		{"¤", "(s)"},
615*46c4c49dSIbrahim Kanouche	}
616*46c4c49dSIbrahim Kanouche
617*46c4c49dSIbrahim Kanouche	for _, tt := range tests {
618*46c4c49dSIbrahim Kanouche		if got := NormalizePunctuation(tt.original); got != tt.want {
619*46c4c49dSIbrahim Kanouche			t.Errorf("Mismatch => %v, want %v", got, tt.want)
620*46c4c49dSIbrahim Kanouche		}
621*46c4c49dSIbrahim Kanouche	}
622*46c4c49dSIbrahim Kanouche}
623*46c4c49dSIbrahim Kanouche
624*46c4c49dSIbrahim Kanouchefunc TestNormalizeEquivalentWords(t *testing.T) {
625*46c4c49dSIbrahim Kanouche	tests := []struct {
626*46c4c49dSIbrahim Kanouche		original string
627*46c4c49dSIbrahim Kanouche		want     string
628*46c4c49dSIbrahim Kanouche	}{
629*46c4c49dSIbrahim Kanouche		{"acknowledgment", "Acknowledgement"},
630*46c4c49dSIbrahim Kanouche		{"ANalogue", "Analog"},
631*46c4c49dSIbrahim Kanouche		{"AnAlyse", "Analyze"},
632*46c4c49dSIbrahim Kanouche		{"ArtefacT", "Artifact"},
633*46c4c49dSIbrahim Kanouche		{"authorisation", "Authorization"},
634*46c4c49dSIbrahim Kanouche		{"AuthoriSed", "Authorized"},
635*46c4c49dSIbrahim Kanouche		{"CalIbre", "Caliber"},
636*46c4c49dSIbrahim Kanouche		{"CanCelled", "Canceled"},
637*46c4c49dSIbrahim Kanouche		{"CapitaliSations", "Capitalizations"},
638*46c4c49dSIbrahim Kanouche		{"CatalogUe", "Catalog"},
639*46c4c49dSIbrahim Kanouche		{"CategoriSe", "Categorize"},
640*46c4c49dSIbrahim Kanouche		{"CentRE", "Center"},
641*46c4c49dSIbrahim Kanouche		{"EmphasiSed", "Emphasized"},
642*46c4c49dSIbrahim Kanouche		{"FavoUr", "Favor"},
643*46c4c49dSIbrahim Kanouche		{"FavoUrite", "Favorite"},
644*46c4c49dSIbrahim Kanouche		{"FulfiL", "Fulfill"},
645*46c4c49dSIbrahim Kanouche		{"FulfiLment", "Fulfillment"},
646*46c4c49dSIbrahim Kanouche		{"InitialiSe", "Initialize"},
647*46c4c49dSIbrahim Kanouche		{"JudGMent", "Judgement"},
648*46c4c49dSIbrahim Kanouche		{"LabelLing", "Labeling"},
649*46c4c49dSIbrahim Kanouche		{"LaboUr", "Labor"},
650*46c4c49dSIbrahim Kanouche		{"LicenCe", "License"},
651*46c4c49dSIbrahim Kanouche		{"MaximiSe", "Maximize"},
652*46c4c49dSIbrahim Kanouche		{"ModelLed", "Modeled"},
653*46c4c49dSIbrahim Kanouche		{"ModeLling", "Modeling"},
654*46c4c49dSIbrahim Kanouche		{"OffenCe", "Offense"},
655*46c4c49dSIbrahim Kanouche		{"OptimiSe", "Optimize"},
656*46c4c49dSIbrahim Kanouche		{"OrganiSation", "Organization"},
657*46c4c49dSIbrahim Kanouche		{"OrganiSe", "Organize"},
658*46c4c49dSIbrahim Kanouche		{"PractiSe", "Practice"},
659*46c4c49dSIbrahim Kanouche		{"ProgramME", "Program"},
660*46c4c49dSIbrahim Kanouche		{"RealiSe", "Realize"},
661*46c4c49dSIbrahim Kanouche		{"RecogniSe", "Recognize"},
662*46c4c49dSIbrahim Kanouche		{"SignalLing", "Signaling"},
663*46c4c49dSIbrahim Kanouche		{"sub-license", "Sublicense"},
664*46c4c49dSIbrahim Kanouche		{"sub license", "Sublicense"},
665*46c4c49dSIbrahim Kanouche		{"UtiliSation", "Utilization"},
666*46c4c49dSIbrahim Kanouche		{"WhilST", "While"},
667*46c4c49dSIbrahim Kanouche		{"WilfuL", "Wilfull"},
668*46c4c49dSIbrahim Kanouche		{"Non-coMMercial", "Noncommercial"},
669*46c4c49dSIbrahim Kanouche		{"Per Cent", "Percent"},
670*46c4c49dSIbrahim Kanouche	}
671*46c4c49dSIbrahim Kanouche
672*46c4c49dSIbrahim Kanouche	for _, tt := range tests {
673*46c4c49dSIbrahim Kanouche		if got := NormalizeEquivalentWords(tt.original); got != tt.want {
674*46c4c49dSIbrahim Kanouche			t.Errorf("Mismatch => %v, want %v", got, tt.want)
675*46c4c49dSIbrahim Kanouche		}
676*46c4c49dSIbrahim Kanouche	}
677*46c4c49dSIbrahim Kanouche}
678*46c4c49dSIbrahim Kanouche
679*46c4c49dSIbrahim Kanouchefunc TestTrimExtraneousTrailingText(t *testing.T) {
680*46c4c49dSIbrahim Kanouche	tests := []struct {
681*46c4c49dSIbrahim Kanouche		original string
682*46c4c49dSIbrahim Kanouche		want     string
683*46c4c49dSIbrahim Kanouche	}{
684*46c4c49dSIbrahim Kanouche		{
685*46c4c49dSIbrahim Kanouche			original: `12. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING WILL
686*46c4c49dSIbrahim Kanouche    ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR REDISTRIBUTE
687*46c4c49dSIbrahim Kanouche    THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY
688*46c4c49dSIbrahim Kanouche    GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE
689*46c4c49dSIbrahim Kanouche    USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF
690*46c4c49dSIbrahim Kanouche    DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD
691*46c4c49dSIbrahim Kanouche    PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS),
692*46c4c49dSIbrahim Kanouche    EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF
693*46c4c49dSIbrahim Kanouche    SUCH DAMAGES.
694*46c4c49dSIbrahim Kanouche
695*46c4c49dSIbrahim Kanouche        END OF TERMS AND CONDITIONS
696*46c4c49dSIbrahim Kanouche
697*46c4c49dSIbrahim Kanouche    How to Apply These Terms to Your New Programs
698*46c4c49dSIbrahim Kanouche
699*46c4c49dSIbrahim Kanouche    If you develop a new program, and you want it to be of the greatest
700*46c4c49dSIbrahim Kanouche    possible use to the public, the best way to achieve this is to make it free
701*46c4c49dSIbrahim Kanouche    software which everyone can redistribute and change under these terms.
702*46c4c49dSIbrahim Kanouche`,
703*46c4c49dSIbrahim Kanouche			want: `12. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING WILL
704*46c4c49dSIbrahim Kanouche    ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR REDISTRIBUTE
705*46c4c49dSIbrahim Kanouche    THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY
706*46c4c49dSIbrahim Kanouche    GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE
707*46c4c49dSIbrahim Kanouche    USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF
708*46c4c49dSIbrahim Kanouche    DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD
709*46c4c49dSIbrahim Kanouche    PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS),
710*46c4c49dSIbrahim Kanouche    EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF
711*46c4c49dSIbrahim Kanouche    SUCH DAMAGES.
712*46c4c49dSIbrahim Kanouche
713*46c4c49dSIbrahim Kanouche        END OF TERMS AND CONDITIONS`,
714*46c4c49dSIbrahim Kanouche		},
715*46c4c49dSIbrahim Kanouche	}
716*46c4c49dSIbrahim Kanouche
717*46c4c49dSIbrahim Kanouche	for _, tt := range tests {
718*46c4c49dSIbrahim Kanouche		if got := TrimExtraneousTrailingText(tt.original); got != tt.want {
719*46c4c49dSIbrahim Kanouche			t.Errorf("Mismatch => %q, want %q", got, tt.want)
720*46c4c49dSIbrahim Kanouche		}
721*46c4c49dSIbrahim Kanouche	}
722*46c4c49dSIbrahim Kanouche}
723*46c4c49dSIbrahim Kanouche
724*46c4c49dSIbrahim Kanouchefunc TestCommonLicenseWords(t *testing.T) {
725*46c4c49dSIbrahim Kanouche	files, err := ReadLicenseDir()
726*46c4c49dSIbrahim Kanouche	if err != nil {
727*46c4c49dSIbrahim Kanouche		t.Fatalf("error: cannot read licenses directory: %v", err)
728*46c4c49dSIbrahim Kanouche	}
729*46c4c49dSIbrahim Kanouche	if files == nil {
730*46c4c49dSIbrahim Kanouche		t.Fatal("error: cannot get licenses from license directory")
731*46c4c49dSIbrahim Kanouche	}
732*46c4c49dSIbrahim Kanouche
733*46c4c49dSIbrahim Kanouche	for _, file := range files {
734*46c4c49dSIbrahim Kanouche		if filepath.Ext(file.Name()) != ".txt" {
735*46c4c49dSIbrahim Kanouche			continue
736*46c4c49dSIbrahim Kanouche		}
737*46c4c49dSIbrahim Kanouche		text, err := ReadLicenseFile(file.Name())
738*46c4c49dSIbrahim Kanouche		if err != nil {
739*46c4c49dSIbrahim Kanouche			t.Fatalf("error reading contents of %q: %v", file.Name(), err)
740*46c4c49dSIbrahim Kanouche		}
741*46c4c49dSIbrahim Kanouche
742*46c4c49dSIbrahim Kanouche		if got := classifier.hasCommonLicenseWords(string(text)); !got {
743*46c4c49dSIbrahim Kanouche			t.Errorf("Mismatch(%q) => false, want true", file.Name())
744*46c4c49dSIbrahim Kanouche		}
745*46c4c49dSIbrahim Kanouche	}
746*46c4c49dSIbrahim Kanouche
747*46c4c49dSIbrahim Kanouche	text := strings.Repeat("Þetta er ekki leyfi.\n", 80)
748*46c4c49dSIbrahim Kanouche	if got := classifier.hasCommonLicenseWords(text); got {
749*46c4c49dSIbrahim Kanouche		t.Error("Mismatch => true, want false")
750*46c4c49dSIbrahim Kanouche	}
751*46c4c49dSIbrahim Kanouche}
752*46c4c49dSIbrahim Kanouche
753*46c4c49dSIbrahim Kanouchefunc TestLicenseMatchQuality(t *testing.T) {
754*46c4c49dSIbrahim Kanouche	files, err := ReadLicenseDir()
755*46c4c49dSIbrahim Kanouche	if err != nil {
756*46c4c49dSIbrahim Kanouche		t.Fatalf("error: cannot read licenses directory: %v", err)
757*46c4c49dSIbrahim Kanouche	}
758*46c4c49dSIbrahim Kanouche
759*46c4c49dSIbrahim Kanouche	classifier.Threshold = 1.0
760*46c4c49dSIbrahim Kanouche	defer func() {
761*46c4c49dSIbrahim Kanouche		classifier.Threshold = DefaultConfidenceThreshold
762*46c4c49dSIbrahim Kanouche	}()
763*46c4c49dSIbrahim Kanouche	for _, file := range files {
764*46c4c49dSIbrahim Kanouche		if filepath.Ext(file.Name()) != ".txt" {
765*46c4c49dSIbrahim Kanouche			continue
766*46c4c49dSIbrahim Kanouche		}
767*46c4c49dSIbrahim Kanouche		name := strings.TrimSuffix(file.Name(), ".txt")
768*46c4c49dSIbrahim Kanouche
769*46c4c49dSIbrahim Kanouche		contents, err := ReadLicenseFile(file.Name())
770*46c4c49dSIbrahim Kanouche		if err != nil {
771*46c4c49dSIbrahim Kanouche			t.Fatalf("error reading contents of %q: %v", file.Name(), err)
772*46c4c49dSIbrahim Kanouche		}
773*46c4c49dSIbrahim Kanouche
774*46c4c49dSIbrahim Kanouche		m := classifier.NearestMatch(TrimExtraneousTrailingText(string(contents)))
775*46c4c49dSIbrahim Kanouche		if m == nil {
776*46c4c49dSIbrahim Kanouche			t.Errorf("Couldn't match %q", name)
777*46c4c49dSIbrahim Kanouche			continue
778*46c4c49dSIbrahim Kanouche		}
779*46c4c49dSIbrahim Kanouche
780*46c4c49dSIbrahim Kanouche		if !classifier.WithinConfidenceThreshold(m.Confidence) {
781*46c4c49dSIbrahim Kanouche			t.Errorf("ConfidenceMatch(%q) => %v, want %v", name, m.Confidence, 0.99)
782*46c4c49dSIbrahim Kanouche		}
783*46c4c49dSIbrahim Kanouche		want := strings.TrimSuffix(name, ".header")
784*46c4c49dSIbrahim Kanouche		if want != m.Name {
785*46c4c49dSIbrahim Kanouche			t.Errorf("LicenseMatch(%q) => %v, want %v", name, m.Name, want)
786*46c4c49dSIbrahim Kanouche		}
787*46c4c49dSIbrahim Kanouche	}
788*46c4c49dSIbrahim Kanouche}
789*46c4c49dSIbrahim Kanouche
790*46c4c49dSIbrahim Kanouchefunc BenchmarkClassifier(b *testing.B) {
791*46c4c49dSIbrahim Kanouche	contents := apache20[:len(apache20)/2] + "hello" + apache20[len(apache20)/2:]
792*46c4c49dSIbrahim Kanouche
793*46c4c49dSIbrahim Kanouche	b.ResetTimer()
794*46c4c49dSIbrahim Kanouche	for i := 0; i < b.N; i++ {
795*46c4c49dSIbrahim Kanouche		classifier, err := New(DefaultConfidenceThreshold)
796*46c4c49dSIbrahim Kanouche		if err != nil {
797*46c4c49dSIbrahim Kanouche			b.Errorf("Cannot create classifier: %v", err)
798*46c4c49dSIbrahim Kanouche			continue
799*46c4c49dSIbrahim Kanouche		}
800*46c4c49dSIbrahim Kanouche		classifier.NearestMatch(contents)
801*46c4c49dSIbrahim Kanouche	}
802*46c4c49dSIbrahim Kanouche}
803*46c4c49dSIbrahim Kanouche
804*46c4c49dSIbrahim Kanouchefunc TestNew(t *testing.T) {
805*46c4c49dSIbrahim Kanouche	tests := []struct {
806*46c4c49dSIbrahim Kanouche		desc        string
807*46c4c49dSIbrahim Kanouche		options     []OptionFunc
808*46c4c49dSIbrahim Kanouche		wantArchive func() []byte
809*46c4c49dSIbrahim Kanouche		wantErr     bool
810*46c4c49dSIbrahim Kanouche	}{
811*46c4c49dSIbrahim Kanouche		{
812*46c4c49dSIbrahim Kanouche			desc:        "no options, use default",
813*46c4c49dSIbrahim Kanouche			options:     []OptionFunc{},
814*46c4c49dSIbrahim Kanouche			wantArchive: nil,
815*46c4c49dSIbrahim Kanouche		},
816*46c4c49dSIbrahim Kanouche		{
817*46c4c49dSIbrahim Kanouche			desc:    "specify ForbiddenLicenseArchive",
818*46c4c49dSIbrahim Kanouche			options: []OptionFunc{Archive(ForbiddenLicenseArchive)},
819*46c4c49dSIbrahim Kanouche			wantArchive: func() []byte {
820*46c4c49dSIbrahim Kanouche				b, _ := ReadLicenseFile(ForbiddenLicenseArchive)
821*46c4c49dSIbrahim Kanouche				return b
822*46c4c49dSIbrahim Kanouche			},
823*46c4c49dSIbrahim Kanouche		},
824*46c4c49dSIbrahim Kanouche		{
825*46c4c49dSIbrahim Kanouche			desc:        "file doesn't exist results in error",
826*46c4c49dSIbrahim Kanouche			options:     []OptionFunc{Archive("doesnotexist")},
827*46c4c49dSIbrahim Kanouche			wantArchive: func() []byte { return nil },
828*46c4c49dSIbrahim Kanouche			wantErr:     true,
829*46c4c49dSIbrahim Kanouche		},
830*46c4c49dSIbrahim Kanouche		{
831*46c4c49dSIbrahim Kanouche			desc:        "raw bytes archive",
832*46c4c49dSIbrahim Kanouche			options:     []OptionFunc{ArchiveBytes([]byte("not a gzipped file"))},
833*46c4c49dSIbrahim Kanouche			wantArchive: func() []byte { return []byte("not a gzipped file") },
834*46c4c49dSIbrahim Kanouche			wantErr:     true,
835*46c4c49dSIbrahim Kanouche		},
836*46c4c49dSIbrahim Kanouche		{
837*46c4c49dSIbrahim Kanouche			desc: "function archive",
838*46c4c49dSIbrahim Kanouche			options: []OptionFunc{ArchiveFunc(func() ([]byte, error) {
839*46c4c49dSIbrahim Kanouche				return []byte("not a gzipped file"), nil
840*46c4c49dSIbrahim Kanouche			})},
841*46c4c49dSIbrahim Kanouche			wantArchive: func() []byte { return []byte("not a gzipped file") },
842*46c4c49dSIbrahim Kanouche			wantErr:     true,
843*46c4c49dSIbrahim Kanouche		},
844*46c4c49dSIbrahim Kanouche	}
845*46c4c49dSIbrahim Kanouche	for _, tt := range tests {
846*46c4c49dSIbrahim Kanouche		t.Run(tt.desc, func(t *testing.T) {
847*46c4c49dSIbrahim Kanouche			c, err := New(0.5, tt.options...)
848*46c4c49dSIbrahim Kanouche			if tt.wantErr != (err != nil) {
849*46c4c49dSIbrahim Kanouche				t.Fatalf("unexpected error: %v", err)
850*46c4c49dSIbrahim Kanouche			}
851*46c4c49dSIbrahim Kanouche			if err == nil {
852*46c4c49dSIbrahim Kanouche				if tt.wantArchive == nil {
853*46c4c49dSIbrahim Kanouche					if c.archive != nil {
854*46c4c49dSIbrahim Kanouche						t.Errorf("wanted default archive, but got specified archive")
855*46c4c49dSIbrahim Kanouche					}
856*46c4c49dSIbrahim Kanouche				} else {
857*46c4c49dSIbrahim Kanouche					got, _ := c.archive()
858*46c4c49dSIbrahim Kanouche					want := tt.wantArchive()
859*46c4c49dSIbrahim Kanouche					if !bytes.Equal(got, want) {
860*46c4c49dSIbrahim Kanouche						t.Errorf("archives did not match; got %d bytes, wanted %d", len(got), len(want))
861*46c4c49dSIbrahim Kanouche					}
862*46c4c49dSIbrahim Kanouche				}
863*46c4c49dSIbrahim Kanouche			}
864*46c4c49dSIbrahim Kanouche		})
865*46c4c49dSIbrahim Kanouche	}
866*46c4c49dSIbrahim Kanouche
867*46c4c49dSIbrahim Kanouche}
868