// xref: /aosp_15_r20/external/licenseclassifier/v2/tokenizer_test.go (revision 46c4c49da23cae783fa41bf46525a6505638499a)
// Copyright 2020 Google Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package classifier

import (
	"bytes"
	"io"
	"strings"
	"testing"

	"github.com/google/go-cmp/cmp"
	"github.com/google/go-cmp/cmp/cmpopts"
)
26
27func TestCleanupToken(t *testing.T) {
28	tests := []struct {
29		input  string
30		output string
31	}{{
32		input:  "cleanup!",
33		output: "cleanup",
34	},
35		{
36			input:  "12345",
37			output: "12345",
38		},
39		{
40			input:  "r1@zx42-",
41			output: "rzx",
42		},
43		{
44			input:  "12345,",
45			output: "12345",
46		},
47		{
48			input:  "12345-6789",
49			output: "12345-6789",
50		},
51		{
52			input:  "1(a)",
53			output: "1",
54		},
55		{
56			input:  "1.2.3",
57			output: "1.2.3",
58		},
59	}
60	for _, test := range tests {
61		if got := cleanupToken(0, test.input, true); got != test.output {
62			t.Errorf("%q: got %q want %q", test.input, got, test.output)
63		}
64	}
65}
66
67func TestTokenize(t *testing.T) {
68	tests := []struct {
69		name   string
70		input  string
71		output *indexedDocument
72	}{
73		{name: "hyphenization recovery",
74			input: `basket-
75ball`,
76			output: &indexedDocument{
77				Tokens: []indexedToken{
78					{
79						ID:   1,
80						Line: 1,
81					},
82				},
83				Norm: "basketball",
84			},
85		},
86		{
87			name: "basic scenario",
88			input: `The AWESOME Project LICENSE
89
90Modifi-
91cations prohibited
92
93Copyright 1996-2002, 2006 by A. Developer
94
95Introduction
96
97The AWESOME Project`,
98			output: &indexedDocument{
99				Tokens: []indexedToken{
100					{
101						ID:   1,
102						Line: 1,
103					},
104					{
105						ID:   2,
106						Line: 1,
107					},
108					{
109						ID:   3,
110						Line: 1,
111					},
112					{
113						ID:   4,
114						Line: 1,
115					},
116					{
117						ID:   5,
118						Line: 3,
119					},
120					{
121						ID:   6,
122						Line: 4,
123					},
124					{
125						ID:   7,
126						Line: 8,
127					},
128					{
129						ID:   1,
130						Line: 10,
131					},
132					{
133						ID:   2,
134						Line: 10,
135					},
136					{
137						ID:   3,
138						Line: 10,
139					},
140				},
141				Matches: Matches{&Match{Name: "Copyright", Confidence: 1.0, MatchType: "Copyright", StartLine: 6, EndLine: 6}},
142				Norm:    "the awesome project license modifications prohibited introduction the awesome project",
143			},
144		},
145	}
146	for _, test := range tests {
147		t.Run(test.name, func(t *testing.T) {
148			d, err := tokenizeStream(bytes.NewReader([]byte(test.input)), true, newDictionary(), true)
149			if err != nil {
150				t.Errorf("%s failed: got unexpected error %v", test.name, err)
151			}
152			if diff := cmp.Diff(d, test.output, cmpopts.IgnoreUnexported(indexedDocument{})); diff != "" {
153				t.Errorf("%s failed:\nDiff(+got,-want): %s", test.name, diff)
154			}
155		})
156	}
157}
158
// mockReader is an io.Reader test double that asserts tokenizeStream's
// buffering behavior: each Read call must present a buffer whose length
// matches the next entry in schedule, and the buffer is filled entirely
// with the byte 'a'.
type mockReader struct {
	t        *testing.T // test handle used to fail on unexpected reads
	schedule []int      // expected len(buf) for each successive Read call
	cur      int        // index into schedule of the next expected Read
}
164
165func (m *mockReader) Read(buf []byte) (int, error) {
166	if m.cur > len(m.schedule) {
167		m.t.Fatal("Unexpected read on mock")
168	}
169
170	if m.cur == len(m.schedule) {
171		return 0, io.EOF
172	}
173
174	if len(buf) != m.schedule[m.cur] {
175		m.t.Fatalf("step %d: got %d, want %d", m.cur, len(buf), m.schedule[m.cur])
176	}
177	m.cur++
178
179	for i := range buf {
180		buf[i] = 'a'
181	}
182
183	return len(buf), nil
184}
185
186func TestTokenizerBuffering(t *testing.T) {
187	dict := newDictionary()
188	mr := mockReader{
189		t:        t,
190		schedule: []int{1024, 1020, 1020},
191	}
192	d, err := tokenizeStream(&mr, true, dict, true)
193	if err != nil {
194		t.Errorf("Read returned unexpected error: %v", err)
195	}
196
197	// Do a basic test to make sure the data returned is sound
198	if len(d.Tokens) != 1 {
199		t.Errorf("Got %d tokens, expected 1", len(d.Tokens))
200	}
201
202	if len(d.Norm) != 3064 {
203		t.Errorf("Got %d bytes, expected 3064", len(d.Norm))
204	}
205}
206
207func TestTokenizer(t *testing.T) {
208	// This test focuses primarily on the textual content extracted and does not look
209	// at the other parts of the document.
210	tests := []struct {
211		name   string
212		input  string
213		output string
214	}{
215		{
216			name:   "Basic Tokens",
217			input:  "Here are some words. ",
218			output: "here are some words",
219		},
220		{
221			name:   "skips bullet headers",
222			input:  "* item the first\n· item the second",
223			output: "item the first item the second",
224		},
225		{
226			name:   "preserves version numbers but not header numbers",
227			input:  "sample rules\n1. Python 2.7.8 is a version of the language.",
228			output: "sample rules python 2.7.8 is a version of the language",
229		},
230		{
231			name:   "preserves version numbers across line breaks",
232			input:  "Python version\n2.7.8 is a version of the language.",
233			output: "python version 2.7.8 is a version of the language",
234		},
235		{
236			name:   "preserves punctuation",
237			input:  "Bill, Larry, and Sergey agree precision is critical!",
238			output: "bill larry and sergey agree precision is critical",
239		},
240		{
241			name:   "ignores comment characters and bullet formatting",
242			input:  "/* * item the first",
243			output: "item the first",
244		},
245		{
246			name:   "produces blank line as needed",
247			input:  "/* *",
248			output: "",
249		},
250		{
251			name:   "clobbers header looking thing as appropriate",
252			input:  " iv. this is a test",
253			output: "this is a test",
254		},
255		{
256			name:   "clobbers header looking thing as appropriate even in comment",
257			input:  "/* 1.2.3. this is a test",
258			output: "this is a test",
259		},
260		{
261			name:   "preserve version number (not a header, but header-looking) not at beginning of sentence",
262			input:  "This is version 1.1.",
263			output: "this is version 1.1",
264		},
265		{
266			name:   "copyright inside a comment",
267			input:  " /* Copyright (c) 1998-2008 The OpenSSL Project. All rights reserved",
268			output: "",
269		},
270		{
271			name: "FTL copyright text",
272			input: `The FreeType Project LICENSE
273
2742006-Jan-27
2752006-01-27
276
277Copyright 1996-2002, 2006 by David Turner, Robert Wilhelm, and Werner Lemberg
278
279Introduction
280
281The FreeType Project`,
282			output: "the freetype project license introduction the freetype project",
283		},
284		{
285			name: "Separated text",
286			input: `distribution and modifi287				       cation follow.`,
288			output: "distribution and modification follow",
289		},
290		{
291			name:   "preserve internal references, even on line break",
292			input:  "(ii) should be preserved as (ii) is preserved",
293			output: "ii should be preserved as ii is preserved",
294		},
295	}
296
297	for _, test := range tests {
298		t.Run(test.name, func(t *testing.T) {
299			dict := newDictionary()
300			d, err := tokenizeStream(bytes.NewReader([]byte(test.input)), true, dict, true)
301			if err != nil {
302				t.Errorf("%s failed: got unexpected error %v", test.name, err)
303			}
304			var b strings.Builder
305			for _, tok := range d.Tokens {
306				b.WriteString(dict.getWord(tok.ID))
307				b.WriteString(" ")
308			}
309			actual := strings.TrimSpace(b.String())
310			if actual != test.output {
311				t.Errorf("Tokenize(%q): got %q want %q", test.name, actual, test.output)
312			}
313		})
314	}
315}
316