xref: /aosp_15_r20/external/licenseclassifier/classifier_test.go (revision 46c4c49da23cae783fa41bf46525a6505638499a)
1// Copyright 2017 Google Inc.
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7//     http://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14
15package licenseclassifier
16
17import (
18	"bytes"
19	"log"
20	"os"
21	"path/filepath"
22	"strings"
23	"testing"
24
25	"github.com/google/licenseclassifier/stringclassifier"
26)
27
// Shared fixtures for the tests in this file. TestMain assigns every one of
// them before any test runs: the strings hold license texts that have been
// passed through TrimExtraneousTrailingText, and classifier is built with
// DefaultConfidenceThreshold. Several tests temporarily change
// classifier.Threshold and restore it afterwards.
var (
	agpl30, agpl30Header, apache20, bsd3, gpl20, ccbync20 string
	classifier                                            *License
)
32
33func TestMain(m *testing.M) {
34	a30, err := ReadLicenseFile("AGPL-3.0.txt")
35	if err != nil {
36		log.Fatalf("error reading contents of AGPL-3.0.txt: %v", err)
37	}
38	a30h, err := ReadLicenseFile("AGPL-3.0.header.txt")
39	if err != nil {
40		log.Fatalf("error reading contents of AGPL-3.0.header.txt: %v", err)
41	}
42	a20, err := ReadLicenseFile("Apache-2.0.txt")
43	if err != nil {
44		log.Fatalf("error reading contents of Apache-2.0.txt: %v", err)
45	}
46	b3, err := ReadLicenseFile("BSD-3-Clause.txt")
47	if err != nil {
48		log.Fatalf("error reading contents of BSD-3-Clause.txt: %v", err)
49	}
50	g2, err := ReadLicenseFile("GPL-2.0.txt")
51	if err != nil {
52		log.Fatalf("error reading contents of GPL-2.0.txt: %v", err)
53	}
54	cc20, err := ReadLicenseFile("CC-BY-NC-2.0.txt")
55	if err != nil {
56		log.Fatalf("error reading contents of CC-BY-NC-2.0.txt: %v", err)
57	}
58
59	agpl30 = TrimExtraneousTrailingText(string(a30))
60	agpl30Header = TrimExtraneousTrailingText(string(a30h))
61	apache20 = TrimExtraneousTrailingText(string(a20))
62	bsd3 = TrimExtraneousTrailingText(string(b3))
63	gpl20 = TrimExtraneousTrailingText(string(g2))
64	ccbync20 = TrimExtraneousTrailingText(string(cc20))
65
66	classifier, err = New(DefaultConfidenceThreshold)
67	if err != nil {
68		log.Fatalf("cannot create license classifier: %v", err)
69	}
70	os.Exit(m.Run())
71}
72
73func TestClassifier_NearestMatch(t *testing.T) {
74	tests := []struct {
75		description    string
76		filename       string
77		extraText      string
78		wantLicense    string
79		wantConfidence float64
80	}{
81		{
82			description:    "AGPL 3.0 license",
83			filename:       "AGPL-3.0.txt",
84			wantLicense:    "AGPL-3.0",
85			wantConfidence: 1.0,
86		},
87		{
88			description:    "Apache 2.0 license",
89			filename:       "Apache-2.0.txt",
90			wantLicense:    "Apache-2.0",
91			wantConfidence: 1.0,
92		},
93		{
94			description:    "GPL 2.0 license",
95			filename:       "GPL-2.0.txt",
96			wantLicense:    "GPL-2.0",
97			wantConfidence: 1.0,
98		},
99		{
100			description:    "BSD 3 Clause license with extra text",
101			filename:       "BSD-3-Clause.txt",
102			extraText:      "New BSD License\nCopyright © 1998 Yoyodyne, Inc.\n",
103			wantLicense:    "BSD-3-Clause",
104			wantConfidence: 0.94,
105		},
106	}
107
108	classifier.Threshold = DefaultConfidenceThreshold
109	for _, tt := range tests {
110		content, err := ReadLicenseFile(tt.filename)
111		if err != nil {
112			t.Errorf("error reading contents of %q license: %v", tt.wantLicense, err)
113			continue
114		}
115
116		m := classifier.NearestMatch(tt.extraText + TrimExtraneousTrailingText(string(content)))
117		if got, want := m.Name, tt.wantLicense; got != want {
118			t.Errorf("NearestMatch(%q) = %q, want %q", tt.description, got, want)
119		}
120		if got, want := m.Confidence, tt.wantConfidence; got < want {
121			t.Errorf("NearestMatch(%q) = %v, want %v", tt.description, got, want)
122		}
123	}
124}
125
126func TestClassifier_MultipleMatch(t *testing.T) {
127	tests := []struct {
128		description string
129		text        string
130		want        stringclassifier.Matches
131	}{
132		{
133			description: "Two licenses",
134			text:        "Copyright (c) 2016 Yoyodyne, Inc.\n" + apache20 + strings.Repeat("-", 80) + "\n" + bsd3,
135			want: stringclassifier.Matches{
136				{
137					Name:       "Apache-2.0",
138					Confidence: 1.0,
139				},
140				{
141					Name:       "BSD-3-Clause",
142					Confidence: 1.0,
143				},
144			},
145		},
146		{
147			description: "Two licenses: partial match",
148			text: "Copyright (c) 2016 Yoyodyne, Inc.\n" +
149				string(apache20[:len(apache20)/2-1]) + string(apache20[len(apache20)/2+7:]) + strings.Repeat("-", 80) + "\n" +
150				string(bsd3[:len(bsd3)/2]) + "intervening stuff" + string(bsd3[len(bsd3)/2:]),
151			want: stringclassifier.Matches{
152				{
153					Name:       "Apache-2.0",
154					Confidence: 0.99,
155				},
156				{
157					Name:       "BSD-3-Clause",
158					Confidence: 0.98,
159				},
160			},
161		},
162		{
163			description: "Two licenses: one forbidden the other okay",
164			text:        "Copyright (c) 2016 Yoyodyne, Inc.\n" + apache20 + strings.Repeat("-", 80) + "\n" + ccbync20,
165			want: stringclassifier.Matches{
166				{
167					Name:       "Apache-2.0",
168					Confidence: 0.99,
169				},
170				{
171					Name:       "CC-BY-NC-2.0",
172					Confidence: 1.0,
173				},
174			},
175		},
176		{
177			description: "Two licenses without any space between them.",
178			text:        apache20 + "." + bsd3,
179			want: stringclassifier.Matches{
180				{
181					Name:       "Apache-2.0",
182					Confidence: 1.0,
183				},
184				{
185					Name:       "BSD-3-Clause",
186					Confidence: 1.0,
187				},
188			},
189		},
190	}
191
192	classifier.Threshold = 0.95
193	defer func() {
194		classifier.Threshold = DefaultConfidenceThreshold
195	}()
196	for _, tt := range tests {
197		m := classifier.MultipleMatch(tt.text, false)
198		if len(m) != len(tt.want) {
199			t.Fatalf("MultipleMatch(%q) number matches: %v, want %v", tt.description, len(m), len(tt.want))
200			continue
201		}
202
203		for i := 0; i < len(m); i++ {
204			w := tt.want[i]
205			if got, want := m[i].Name, w.Name; got != want {
206				t.Errorf("MultipleMatch(%q) = %q, want %q", tt.description, got, want)
207			}
208			if got, want := m[i].Confidence, w.Confidence; got < want {
209				t.Errorf("MultipleMatch(%q) = %v, want %v", tt.description, got, want)
210			}
211		}
212	}
213}
214
215func TestClassifier_MultipleMatch_Headers(t *testing.T) {
216	tests := []struct {
217		description string
218		text        string
219		want        stringclassifier.Matches
220	}{
221		{
222			description: "AGPL-3.0 header",
223			text:        "Copyright (c) 2016 Yoyodyne, Inc.\n" + agpl30Header,
224			want: stringclassifier.Matches{
225				{
226					Name:       "AGPL-3.0",
227					Confidence: 1.0,
228					Offset:     0,
229				},
230			},
231		},
232		{
233			description: "Modified LGPL-2.1 header",
234			text: `Common Widget code.
235
236Copyright (C) 2013-2015 Yoyodyne, Inc.
237
238This library is free software; you can redistribute it and/or
239modify it under the terms of the GNU Lesser General Public
240License as published by the Free Software Foundation; either
241version 2.1 of the License, or (at your option) any later version (but not!).
242
243This library is distributed in the hope that it will be useful,
244but WITHOUT ANY WARRANTY; without even the implied warranty of
245MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
246Lesser General Public License for more details.
247
248You should have received a copy of the GNU Lesser General Public
249License along with this library; if not, write to the Free Software
250Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
251`,
252			want: stringclassifier.Matches{
253				{
254					Name:       "LGPL-2.1",
255					Confidence: 0.97,
256					Offset:     197,
257				},
258			},
259		},
260	}
261
262	classifier.Threshold = 0.90
263	defer func() {
264		classifier.Threshold = DefaultConfidenceThreshold
265	}()
266	for _, tt := range tests {
267		m := classifier.MultipleMatch(tt.text, true)
268		if len(m) != len(tt.want) {
269			t.Errorf("MultipleMatch(%q) number matches: %v, want %v", tt.description, len(m), len(tt.want))
270			continue
271		}
272
273		for i := 0; i < len(m); i++ {
274			w := tt.want[i]
275			if got, want := m[i].Name, w.Name; got != want {
276				t.Errorf("MultipleMatch(%q) = %q, want %q", tt.description, got, want)
277			}
278			if got, want := m[i].Confidence, w.Confidence; got < want {
279				t.Errorf("MultipleMatch(%q) = %v, want %v", tt.description, got, want)
280			}
281		}
282	}
283}
284
285func TestClassifier_CopyrightHolder(t *testing.T) {
286	tests := []struct {
287		copyright string
288		want      string
289	}{
290		{
291			copyright: "Copyright 2008 Yoyodyne Inc. All Rights Reserved.",
292			want:      "Yoyodyne Inc.",
293		},
294		{
295			copyright: "Copyright 2010-2016 Yoyodyne, Inc.",
296			want:      "Yoyodyne, Inc.",
297		},
298		{
299			copyright: "Copyright 2010, 2011, 2012 Yoyodyne, Inc., All rights reserved.",
300			want:      "Yoyodyne, Inc.",
301		},
302		{
303			copyright: "Copyright (c) 2015 Yoyodyne, Inc. All rights reserved.",
304			want:      "Yoyodyne, Inc.",
305		},
306		{
307			copyright: "Copyright © 1998 by Yoyodyne, Inc., San Narciso, CA, US.",
308			want:      "Yoyodyne, Inc., San Narciso, CA, US",
309		},
310		{
311			copyright: "Copyright (c) 2015 The Algonquin Round Table. All rights reserved.",
312			want:      "The Algonquin Round Table",
313		},
314		{
315			copyright: "Copyright 2016, The Android Open Source Project",
316			want:      "The Android Open Source Project",
317		},
318		{
319			copyright: `---------------------------------------------------------
320foo.c:
321Copyright 2016, The Android Open Source Project
322`,
323			want: "The Android Open Source Project",
324		},
325	}
326
327	for _, tt := range tests {
328		got := CopyrightHolder(tt.copyright)
329		if got != tt.want {
330			t.Errorf("CopyrightHolder(%q) = %q, want %q", tt.copyright, got, tt.want)
331		}
332	}
333}
334
335func TestClassifier_WithinConfidenceThreshold(t *testing.T) {
336	tests := []struct {
337		description string
338		text        string
339		confDef     bool
340		conf99      bool
341		conf93      bool
342		conf5       bool
343	}{
344		{
345			description: "Apache 2.0",
346			text:        apache20,
347			confDef:     true,
348			conf99:      true,
349			conf93:      true,
350			conf5:       true,
351		},
352		{
353			description: "GPL 2.0",
354			text:        gpl20,
355			confDef:     true,
356			conf99:      true,
357			conf93:      true,
358			conf5:       true,
359		},
360		{
361			description: "BSD 3 Clause license with extra text",
362			text:        "New BSD License\nCopyright © 1998 Yoyodyne, Inc.\n" + bsd3,
363			confDef:     true,
364			conf99:      true,
365			conf93:      true,
366			conf5:       true,
367		},
368		{
369			description: "Very low confidence",
370			text:        strings.Repeat("Random text is random, but not a license\n", 40),
371			confDef:     false,
372			conf99:      false,
373			conf93:      false,
374			conf5:       true,
375		},
376	}
377
378	defer func() {
379		classifier.Threshold = DefaultConfidenceThreshold
380	}()
381	for _, tt := range tests {
382		t.Run(tt.description, func(t *testing.T) {
383			classifier.Threshold = DefaultConfidenceThreshold
384			m := classifier.NearestMatch(tt.text)
385			if got := classifier.WithinConfidenceThreshold(m.Confidence); got != tt.confDef {
386				t.Errorf("WithinConfidenceThreshold() at %v returned wrong result; got %v, want %v", classifier.Threshold, got, tt.confDef)
387			}
388
389			classifier.Threshold = 0.99
390			m = classifier.NearestMatch(tt.text)
391			if got := classifier.WithinConfidenceThreshold(m.Confidence); got != tt.conf99 {
392				t.Errorf("WithinConfidenceThreshold(%q) = %v, want %v", tt.description, got, tt.conf99)
393			}
394
395			classifier.Threshold = 0.93
396			m = classifier.NearestMatch(tt.text)
397			if got := classifier.WithinConfidenceThreshold(m.Confidence); got != tt.conf93 {
398				t.Errorf("WithinConfidenceThreshold(%q) = %v, want %v", tt.description, got, tt.conf93)
399			}
400
401			classifier.Threshold = 0.05
402			m = classifier.NearestMatch(tt.text)
403			if got := classifier.WithinConfidenceThreshold(m.Confidence); got != tt.conf5 {
404				t.Errorf("WithinConfidenceThreshold(%q) = %v, want %v", tt.description, got, tt.conf5)
405			}
406		})
407	}
408}
409
410func TestRemoveIgnorableText(t *testing.T) {
411	const want = `Lorem ipsum dolor sit amet, pellentesque wisi tortor duis, amet adipiscing bibendum elit aliquam
412leo. Mattis commodo sed accumsan at in.
413`
414
415	tests := []struct {
416		original string
417		want     string
418	}{
419		{"MIT License\n", "\n"},
420		{"The MIT License\n", "\n"},
421		{"The MIT License (MIT)\n", "\n"},
422		{"BSD License\n", "\n"},
423		{"New BSD License\n", "\n"},
424		{"COPYRIGHT AND PERMISSION NOTICE\n", "\n"},
425		{"Copyright (c) 2016, Yoyodyne, Inc.\n", "\n"},
426		{"All rights reserved.\n", "\n"},
427		{"Some rights reserved.\n", "\n"},
428		{"@license\n", "\n"},
429
430		// Now with wanted texts.
431		{
432			original: `The MIT License
433
434Copyright (c) 2016, Yoyodyne, Inc.
435All rights reserved.
436` + want,
437			want: strings.ToLower(want),
438		},
439	}
440
441	for _, tt := range tests {
442		if got := removeIgnorableTexts(strings.ToLower(tt.original)); got != tt.want {
443			t.Errorf("Mismatch(%q) =>\n%s\nwant:\n%s", tt.original, got, tt.want)
444		}
445	}
446}
447
448func TestRemoveShebangLine(t *testing.T) {
449	tests := []struct {
450		original string
451		want     string
452	}{
453		{
454			original: "",
455			want:     "",
456		},
457		{
458			original: "#!/usr/bin/env python -C",
459			want:     "#!/usr/bin/env python -C",
460		},
461		{
462			original: `#!/usr/bin/env python -C
463# First line of license text.
464# Second line of license text.
465`,
466			want: `# First line of license text.
467# Second line of license text.
468`,
469		},
470		{
471			original: `# First line of license text.
472# Second line of license text.
473`,
474			want: `# First line of license text.
475# Second line of license text.
476`,
477		},
478	}
479
480	for _, tt := range tests {
481		got := removeShebangLine(tt.original)
482		if got != tt.want {
483			t.Errorf("RemoveShebangLine(%q) =>\n%s\nwant:\n%s", tt.original, got, tt.want)
484		}
485	}
486}
487
488func TestRemoveNonWords(t *testing.T) {
489	tests := []struct {
490		original string
491		want     string
492	}{
493		{
494			original: `# # Hello
495## World
496`,
497			want: ` Hello World `,
498		},
499		{
500			original: ` * This text has a bulleted list:
501 * * item 1
502 * * item 2`,
503			want: ` This text has a bulleted list item 1 item 2`,
504		},
505		{
506			original: `
507
508 * This text has a bulleted list:
509 * * item 1
510 * * item 2`,
511			want: ` This text has a bulleted list item 1 item 2`,
512		},
513		{
514			original: `// This text has a bulleted list:
515// 1. item 1
516// 2. item 2`,
517			want: ` This text has a bulleted list 1 item 1 2 item 2`,
518		},
519		{
520			original: `// «Copyright (c) 1998 Yoyodyne, Inc.»
521// This text has a bulleted list:
522// 1. item 1
523// 2. item 2
524`,
525			want: ` «Copyright c 1998 Yoyodyne Inc » This text has a bulleted list 1 item 1 2 item 2 `,
526		},
527		{
528			original: `*
529 * This is the first line we want.
530 * This is the second line we want.
531 * This is the third line we want.
532 * This is the last line we want.
533`,
534			want: ` This is the first line we want This is the second line we want This is the third line we want This is the last line we want `,
535		},
536		{
537			original: `===---------------------------------------------===
538***
539* This is the first line we want.
540* This is the second line we want.
541* This is the third line we want.
542* This is the last line we want.
543***
544===---------------------------------------------===
545`,
546			want: ` This is the first line we want This is the second line we want This is the third line we want This is the last line we want `,
547		},
548		{
549			original: strings.Repeat("-", 80),
550			want:     " ",
551		},
552		{
553			original: strings.Repeat("=", 80),
554			want:     " ",
555		},
556		{
557			original: "/*\n",
558			want:     " ",
559		},
560		{
561			original: "/*\n * precursor text\n */\n",
562			want:     " precursor text ",
563		},
564		// Test for b/63540492.
565		{
566			original: " */\n",
567			want:     " ",
568		},
569		{
570			original: "",
571			want:     "",
572		},
573	}
574
575	for _, tt := range tests {
576		if got := stringclassifier.FlattenWhitespace(RemoveNonWords(tt.original)); got != tt.want {
577			t.Errorf("Mismatch(%q) => %v, want %v", tt.original, got, tt.want)
578		}
579	}
580}
581
582func TestNormalizePunctuation(t *testing.T) {
583	tests := []struct {
584		original string
585		want     string
586	}{
587		// Hyphens and dashes.
588		{"—", "-"},
589		{"-", "-"},
590		{"‒", "-"},
591		{"–", "-"},
592		{"—", "-"},
593
594		// Quotes.
595		{"'", "'"},
596		{`"`, "'"},
597		{"‘", "'"},
598		{"", "'"},
599		{"“", "'"},
600		{"", "'"},
601		{" ” ", " ' "},
602
603		// Backtick.
604		{"`", "'"},
605
606		// Copyright mark.
607		{"©", "(c)"},
608
609		// Hyphen-separated words.
610		{"general- purpose, non- compliant", "general-purpose, non-compliant"},
611
612		// Section.
613		{"§", "(s)"},
614		{"¤", "(s)"},
615	}
616
617	for _, tt := range tests {
618		if got := NormalizePunctuation(tt.original); got != tt.want {
619			t.Errorf("Mismatch => %v, want %v", got, tt.want)
620		}
621	}
622}
623
624func TestNormalizeEquivalentWords(t *testing.T) {
625	tests := []struct {
626		original string
627		want     string
628	}{
629		{"acknowledgment", "Acknowledgement"},
630		{"ANalogue", "Analog"},
631		{"AnAlyse", "Analyze"},
632		{"ArtefacT", "Artifact"},
633		{"authorisation", "Authorization"},
634		{"AuthoriSed", "Authorized"},
635		{"CalIbre", "Caliber"},
636		{"CanCelled", "Canceled"},
637		{"CapitaliSations", "Capitalizations"},
638		{"CatalogUe", "Catalog"},
639		{"CategoriSe", "Categorize"},
640		{"CentRE", "Center"},
641		{"EmphasiSed", "Emphasized"},
642		{"FavoUr", "Favor"},
643		{"FavoUrite", "Favorite"},
644		{"FulfiL", "Fulfill"},
645		{"FulfiLment", "Fulfillment"},
646		{"InitialiSe", "Initialize"},
647		{"JudGMent", "Judgement"},
648		{"LabelLing", "Labeling"},
649		{"LaboUr", "Labor"},
650		{"LicenCe", "License"},
651		{"MaximiSe", "Maximize"},
652		{"ModelLed", "Modeled"},
653		{"ModeLling", "Modeling"},
654		{"OffenCe", "Offense"},
655		{"OptimiSe", "Optimize"},
656		{"OrganiSation", "Organization"},
657		{"OrganiSe", "Organize"},
658		{"PractiSe", "Practice"},
659		{"ProgramME", "Program"},
660		{"RealiSe", "Realize"},
661		{"RecogniSe", "Recognize"},
662		{"SignalLing", "Signaling"},
663		{"sub-license", "Sublicense"},
664		{"sub license", "Sublicense"},
665		{"UtiliSation", "Utilization"},
666		{"WhilST", "While"},
667		{"WilfuL", "Wilfull"},
668		{"Non-coMMercial", "Noncommercial"},
669		{"Per Cent", "Percent"},
670	}
671
672	for _, tt := range tests {
673		if got := NormalizeEquivalentWords(tt.original); got != tt.want {
674			t.Errorf("Mismatch => %v, want %v", got, tt.want)
675		}
676	}
677}
678
679func TestTrimExtraneousTrailingText(t *testing.T) {
680	tests := []struct {
681		original string
682		want     string
683	}{
684		{
685			original: `12. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING WILL
686    ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR REDISTRIBUTE
687    THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY
688    GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE
689    USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF
690    DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD
691    PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS),
692    EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF
693    SUCH DAMAGES.
694
695        END OF TERMS AND CONDITIONS
696
697    How to Apply These Terms to Your New Programs
698
699    If you develop a new program, and you want it to be of the greatest
700    possible use to the public, the best way to achieve this is to make it free
701    software which everyone can redistribute and change under these terms.
702`,
703			want: `12. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING WILL
704    ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR REDISTRIBUTE
705    THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY
706    GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE
707    USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF
708    DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD
709    PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS),
710    EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF
711    SUCH DAMAGES.
712
713        END OF TERMS AND CONDITIONS`,
714		},
715	}
716
717	for _, tt := range tests {
718		if got := TrimExtraneousTrailingText(tt.original); got != tt.want {
719			t.Errorf("Mismatch => %q, want %q", got, tt.want)
720		}
721	}
722}
723
724func TestCommonLicenseWords(t *testing.T) {
725	files, err := ReadLicenseDir()
726	if err != nil {
727		t.Fatalf("error: cannot read licenses directory: %v", err)
728	}
729	if files == nil {
730		t.Fatal("error: cannot get licenses from license directory")
731	}
732
733	for _, file := range files {
734		if filepath.Ext(file.Name()) != ".txt" {
735			continue
736		}
737		text, err := ReadLicenseFile(file.Name())
738		if err != nil {
739			t.Fatalf("error reading contents of %q: %v", file.Name(), err)
740		}
741
742		if got := classifier.hasCommonLicenseWords(string(text)); !got {
743			t.Errorf("Mismatch(%q) => false, want true", file.Name())
744		}
745	}
746
747	text := strings.Repeat("Þetta er ekki leyfi.\n", 80)
748	if got := classifier.hasCommonLicenseWords(text); got {
749		t.Error("Mismatch => true, want false")
750	}
751}
752
753func TestLicenseMatchQuality(t *testing.T) {
754	files, err := ReadLicenseDir()
755	if err != nil {
756		t.Fatalf("error: cannot read licenses directory: %v", err)
757	}
758
759	classifier.Threshold = 1.0
760	defer func() {
761		classifier.Threshold = DefaultConfidenceThreshold
762	}()
763	for _, file := range files {
764		if filepath.Ext(file.Name()) != ".txt" {
765			continue
766		}
767		name := strings.TrimSuffix(file.Name(), ".txt")
768
769		contents, err := ReadLicenseFile(file.Name())
770		if err != nil {
771			t.Fatalf("error reading contents of %q: %v", file.Name(), err)
772		}
773
774		m := classifier.NearestMatch(TrimExtraneousTrailingText(string(contents)))
775		if m == nil {
776			t.Errorf("Couldn't match %q", name)
777			continue
778		}
779
780		if !classifier.WithinConfidenceThreshold(m.Confidence) {
781			t.Errorf("ConfidenceMatch(%q) => %v, want %v", name, m.Confidence, 0.99)
782		}
783		want := strings.TrimSuffix(name, ".header")
784		if want != m.Name {
785			t.Errorf("LicenseMatch(%q) => %v, want %v", name, m.Name, want)
786		}
787	}
788}
789
790func BenchmarkClassifier(b *testing.B) {
791	contents := apache20[:len(apache20)/2] + "hello" + apache20[len(apache20)/2:]
792
793	b.ResetTimer()
794	for i := 0; i < b.N; i++ {
795		classifier, err := New(DefaultConfidenceThreshold)
796		if err != nil {
797			b.Errorf("Cannot create classifier: %v", err)
798			continue
799		}
800		classifier.NearestMatch(contents)
801	}
802}
803
804func TestNew(t *testing.T) {
805	tests := []struct {
806		desc        string
807		options     []OptionFunc
808		wantArchive func() []byte
809		wantErr     bool
810	}{
811		{
812			desc:        "no options, use default",
813			options:     []OptionFunc{},
814			wantArchive: nil,
815		},
816		{
817			desc:    "specify ForbiddenLicenseArchive",
818			options: []OptionFunc{Archive(ForbiddenLicenseArchive)},
819			wantArchive: func() []byte {
820				b, _ := ReadLicenseFile(ForbiddenLicenseArchive)
821				return b
822			},
823		},
824		{
825			desc:        "file doesn't exist results in error",
826			options:     []OptionFunc{Archive("doesnotexist")},
827			wantArchive: func() []byte { return nil },
828			wantErr:     true,
829		},
830		{
831			desc:        "raw bytes archive",
832			options:     []OptionFunc{ArchiveBytes([]byte("not a gzipped file"))},
833			wantArchive: func() []byte { return []byte("not a gzipped file") },
834			wantErr:     true,
835		},
836		{
837			desc: "function archive",
838			options: []OptionFunc{ArchiveFunc(func() ([]byte, error) {
839				return []byte("not a gzipped file"), nil
840			})},
841			wantArchive: func() []byte { return []byte("not a gzipped file") },
842			wantErr:     true,
843		},
844	}
845	for _, tt := range tests {
846		t.Run(tt.desc, func(t *testing.T) {
847			c, err := New(0.5, tt.options...)
848			if tt.wantErr != (err != nil) {
849				t.Fatalf("unexpected error: %v", err)
850			}
851			if err == nil {
852				if tt.wantArchive == nil {
853					if c.archive != nil {
854						t.Errorf("wanted default archive, but got specified archive")
855					}
856				} else {
857					got, _ := c.archive()
858					want := tt.wantArchive()
859					if !bytes.Equal(got, want) {
860						t.Errorf("archives did not match; got %d bytes, wanted %d", len(got), len(want))
861					}
862				}
863			}
864		})
865	}
866
867}
868