// xref: /aosp_15_r20/external/licenseclassifier/v2/tokenizer_test.go (revision 46c4c49da23cae783fa41bf46525a6505638499a)

// Copyright 2020 Google Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
14*46c4c49dSIbrahim Kanouche
15*46c4c49dSIbrahim Kanouchepackage classifier
16*46c4c49dSIbrahim Kanouche
17*46c4c49dSIbrahim Kanoucheimport (
18*46c4c49dSIbrahim Kanouche	"bytes"
19*46c4c49dSIbrahim Kanouche	"io"
20*46c4c49dSIbrahim Kanouche	"strings"
21*46c4c49dSIbrahim Kanouche	"testing"
22*46c4c49dSIbrahim Kanouche
23*46c4c49dSIbrahim Kanouche	"github.com/google/go-cmp/cmp"
24*46c4c49dSIbrahim Kanouche	"github.com/google/go-cmp/cmp/cmpopts"
25*46c4c49dSIbrahim Kanouche)
26*46c4c49dSIbrahim Kanouche
27*46c4c49dSIbrahim Kanouchefunc TestCleanupToken(t *testing.T) {
28*46c4c49dSIbrahim Kanouche	tests := []struct {
29*46c4c49dSIbrahim Kanouche		input  string
30*46c4c49dSIbrahim Kanouche		output string
31*46c4c49dSIbrahim Kanouche	}{{
32*46c4c49dSIbrahim Kanouche		input:  "cleanup!",
33*46c4c49dSIbrahim Kanouche		output: "cleanup",
34*46c4c49dSIbrahim Kanouche	},
35*46c4c49dSIbrahim Kanouche		{
36*46c4c49dSIbrahim Kanouche			input:  "12345",
37*46c4c49dSIbrahim Kanouche			output: "12345",
38*46c4c49dSIbrahim Kanouche		},
39*46c4c49dSIbrahim Kanouche		{
40*46c4c49dSIbrahim Kanouche			input:  "r1@zx42-",
41*46c4c49dSIbrahim Kanouche			output: "rzx",
42*46c4c49dSIbrahim Kanouche		},
43*46c4c49dSIbrahim Kanouche		{
44*46c4c49dSIbrahim Kanouche			input:  "12345,",
45*46c4c49dSIbrahim Kanouche			output: "12345",
46*46c4c49dSIbrahim Kanouche		},
47*46c4c49dSIbrahim Kanouche		{
48*46c4c49dSIbrahim Kanouche			input:  "12345-6789",
49*46c4c49dSIbrahim Kanouche			output: "12345-6789",
50*46c4c49dSIbrahim Kanouche		},
51*46c4c49dSIbrahim Kanouche		{
52*46c4c49dSIbrahim Kanouche			input:  "1(a)",
53*46c4c49dSIbrahim Kanouche			output: "1",
54*46c4c49dSIbrahim Kanouche		},
55*46c4c49dSIbrahim Kanouche		{
56*46c4c49dSIbrahim Kanouche			input:  "1.2.3",
57*46c4c49dSIbrahim Kanouche			output: "1.2.3",
58*46c4c49dSIbrahim Kanouche		},
59*46c4c49dSIbrahim Kanouche	}
60*46c4c49dSIbrahim Kanouche	for _, test := range tests {
61*46c4c49dSIbrahim Kanouche		if got := cleanupToken(0, test.input, true); got != test.output {
62*46c4c49dSIbrahim Kanouche			t.Errorf("%q: got %q want %q", test.input, got, test.output)
63*46c4c49dSIbrahim Kanouche		}
64*46c4c49dSIbrahim Kanouche	}
65*46c4c49dSIbrahim Kanouche}
66*46c4c49dSIbrahim Kanouche
67*46c4c49dSIbrahim Kanouchefunc TestTokenize(t *testing.T) {
68*46c4c49dSIbrahim Kanouche	tests := []struct {
69*46c4c49dSIbrahim Kanouche		name   string
70*46c4c49dSIbrahim Kanouche		input  string
71*46c4c49dSIbrahim Kanouche		output *indexedDocument
72*46c4c49dSIbrahim Kanouche	}{
73*46c4c49dSIbrahim Kanouche		{name: "hyphenization recovery",
74*46c4c49dSIbrahim Kanouche			input: `basket-
75*46c4c49dSIbrahim Kanoucheball`,
76*46c4c49dSIbrahim Kanouche			output: &indexedDocument{
77*46c4c49dSIbrahim Kanouche				Tokens: []indexedToken{
78*46c4c49dSIbrahim Kanouche					{
79*46c4c49dSIbrahim Kanouche						ID:   1,
80*46c4c49dSIbrahim Kanouche						Line: 1,
81*46c4c49dSIbrahim Kanouche					},
82*46c4c49dSIbrahim Kanouche				},
83*46c4c49dSIbrahim Kanouche				Norm: "basketball",
84*46c4c49dSIbrahim Kanouche			},
85*46c4c49dSIbrahim Kanouche		},
86*46c4c49dSIbrahim Kanouche		{
87*46c4c49dSIbrahim Kanouche			name: "basic scenario",
88*46c4c49dSIbrahim Kanouche			input: `The AWESOME Project LICENSE
89*46c4c49dSIbrahim Kanouche
90*46c4c49dSIbrahim KanoucheModifi-
91*46c4c49dSIbrahim Kanouchecations prohibited
92*46c4c49dSIbrahim Kanouche
93*46c4c49dSIbrahim KanoucheCopyright 1996-2002, 2006 by A. Developer
94*46c4c49dSIbrahim Kanouche
95*46c4c49dSIbrahim KanoucheIntroduction
96*46c4c49dSIbrahim Kanouche
97*46c4c49dSIbrahim KanoucheThe AWESOME Project`,
98*46c4c49dSIbrahim Kanouche			output: &indexedDocument{
99*46c4c49dSIbrahim Kanouche				Tokens: []indexedToken{
100*46c4c49dSIbrahim Kanouche					{
101*46c4c49dSIbrahim Kanouche						ID:   1,
102*46c4c49dSIbrahim Kanouche						Line: 1,
103*46c4c49dSIbrahim Kanouche					},
104*46c4c49dSIbrahim Kanouche					{
105*46c4c49dSIbrahim Kanouche						ID:   2,
106*46c4c49dSIbrahim Kanouche						Line: 1,
107*46c4c49dSIbrahim Kanouche					},
108*46c4c49dSIbrahim Kanouche					{
109*46c4c49dSIbrahim Kanouche						ID:   3,
110*46c4c49dSIbrahim Kanouche						Line: 1,
111*46c4c49dSIbrahim Kanouche					},
112*46c4c49dSIbrahim Kanouche					{
113*46c4c49dSIbrahim Kanouche						ID:   4,
114*46c4c49dSIbrahim Kanouche						Line: 1,
115*46c4c49dSIbrahim Kanouche					},
116*46c4c49dSIbrahim Kanouche					{
117*46c4c49dSIbrahim Kanouche						ID:   5,
118*46c4c49dSIbrahim Kanouche						Line: 3,
119*46c4c49dSIbrahim Kanouche					},
120*46c4c49dSIbrahim Kanouche					{
121*46c4c49dSIbrahim Kanouche						ID:   6,
122*46c4c49dSIbrahim Kanouche						Line: 4,
123*46c4c49dSIbrahim Kanouche					},
124*46c4c49dSIbrahim Kanouche					{
125*46c4c49dSIbrahim Kanouche						ID:   7,
126*46c4c49dSIbrahim Kanouche						Line: 8,
127*46c4c49dSIbrahim Kanouche					},
128*46c4c49dSIbrahim Kanouche					{
129*46c4c49dSIbrahim Kanouche						ID:   1,
130*46c4c49dSIbrahim Kanouche						Line: 10,
131*46c4c49dSIbrahim Kanouche					},
132*46c4c49dSIbrahim Kanouche					{
133*46c4c49dSIbrahim Kanouche						ID:   2,
134*46c4c49dSIbrahim Kanouche						Line: 10,
135*46c4c49dSIbrahim Kanouche					},
136*46c4c49dSIbrahim Kanouche					{
137*46c4c49dSIbrahim Kanouche						ID:   3,
138*46c4c49dSIbrahim Kanouche						Line: 10,
139*46c4c49dSIbrahim Kanouche					},
140*46c4c49dSIbrahim Kanouche				},
141*46c4c49dSIbrahim Kanouche				Matches: Matches{&Match{Name: "Copyright", Confidence: 1.0, MatchType: "Copyright", StartLine: 6, EndLine: 6}},
142*46c4c49dSIbrahim Kanouche				Norm:    "the awesome project license modifications prohibited introduction the awesome project",
143*46c4c49dSIbrahim Kanouche			},
144*46c4c49dSIbrahim Kanouche		},
145*46c4c49dSIbrahim Kanouche	}
146*46c4c49dSIbrahim Kanouche	for _, test := range tests {
147*46c4c49dSIbrahim Kanouche		t.Run(test.name, func(t *testing.T) {
148*46c4c49dSIbrahim Kanouche			d, err := tokenizeStream(bytes.NewReader([]byte(test.input)), true, newDictionary(), true)
149*46c4c49dSIbrahim Kanouche			if err != nil {
150*46c4c49dSIbrahim Kanouche				t.Errorf("%s failed: got unexpected error %v", test.name, err)
151*46c4c49dSIbrahim Kanouche			}
152*46c4c49dSIbrahim Kanouche			if diff := cmp.Diff(d, test.output, cmpopts.IgnoreUnexported(indexedDocument{})); diff != "" {
153*46c4c49dSIbrahim Kanouche				t.Errorf("%s failed:\nDiff(+got,-want): %s", test.name, diff)
154*46c4c49dSIbrahim Kanouche			}
155*46c4c49dSIbrahim Kanouche		})
156*46c4c49dSIbrahim Kanouche	}
157*46c4c49dSIbrahim Kanouche}
158*46c4c49dSIbrahim Kanouche
// mockReader is an io.Reader test double that checks the size of the buffer
// passed to each Read call against a fixed schedule.
type mockReader struct {
	t        *testing.T // test that is failed when a read deviates from the schedule
	schedule []int      // expected len(buf) for each successive Read call
	cur      int        // index into schedule of the next expected Read
}

// Read verifies that len(buf) equals the next scheduled size, fills buf
// completely with 'a' bytes and returns len(buf). Once every scheduled read
// has been consumed it returns (0, io.EOF), and keeps doing so on any further
// call. A buffer of the wrong size fails the test via t.Fatalf.
func (m *mockReader) Read(buf []byte) (int, error) {
	// cur only advances while it is below len(schedule), so a single
	// end-of-schedule check covers every exhausted state.
	if m.cur >= len(m.schedule) {
		return 0, io.EOF
	}

	if want := m.schedule[m.cur]; len(buf) != want {
		m.t.Fatalf("step %d: got %d, want %d", m.cur, len(buf), want)
	}
	m.cur++

	for i := range buf {
		buf[i] = 'a'
	}

	return len(buf), nil
}
185*46c4c49dSIbrahim Kanouche
186*46c4c49dSIbrahim Kanouchefunc TestTokenizerBuffering(t *testing.T) {
187*46c4c49dSIbrahim Kanouche	dict := newDictionary()
188*46c4c49dSIbrahim Kanouche	mr := mockReader{
189*46c4c49dSIbrahim Kanouche		t:        t,
190*46c4c49dSIbrahim Kanouche		schedule: []int{1024, 1020, 1020},
191*46c4c49dSIbrahim Kanouche	}
192*46c4c49dSIbrahim Kanouche	d, err := tokenizeStream(&mr, true, dict, true)
193*46c4c49dSIbrahim Kanouche	if err != nil {
194*46c4c49dSIbrahim Kanouche		t.Errorf("Read returned unexpected error: %v", err)
195*46c4c49dSIbrahim Kanouche	}
196*46c4c49dSIbrahim Kanouche
197*46c4c49dSIbrahim Kanouche	// Do a basic test to make sure the data returned is sound
198*46c4c49dSIbrahim Kanouche	if len(d.Tokens) != 1 {
199*46c4c49dSIbrahim Kanouche		t.Errorf("Got %d tokens, expected 1", len(d.Tokens))
200*46c4c49dSIbrahim Kanouche	}
201*46c4c49dSIbrahim Kanouche
202*46c4c49dSIbrahim Kanouche	if len(d.Norm) != 3064 {
203*46c4c49dSIbrahim Kanouche		t.Errorf("Got %d bytes, expected 3064", len(d.Norm))
204*46c4c49dSIbrahim Kanouche	}
205*46c4c49dSIbrahim Kanouche}
206*46c4c49dSIbrahim Kanouche
207*46c4c49dSIbrahim Kanouchefunc TestTokenizer(t *testing.T) {
208*46c4c49dSIbrahim Kanouche	// This test focuses primarily on the textual content extracted and does not look
209*46c4c49dSIbrahim Kanouche	// at the other parts of the document.
210*46c4c49dSIbrahim Kanouche	tests := []struct {
211*46c4c49dSIbrahim Kanouche		name   string
212*46c4c49dSIbrahim Kanouche		input  string
213*46c4c49dSIbrahim Kanouche		output string
214*46c4c49dSIbrahim Kanouche	}{
215*46c4c49dSIbrahim Kanouche		{
216*46c4c49dSIbrahim Kanouche			name:   "Basic Tokens",
217*46c4c49dSIbrahim Kanouche			input:  "Here are some words. ",
218*46c4c49dSIbrahim Kanouche			output: "here are some words",
219*46c4c49dSIbrahim Kanouche		},
220*46c4c49dSIbrahim Kanouche		{
221*46c4c49dSIbrahim Kanouche			name:   "skips bullet headers",
222*46c4c49dSIbrahim Kanouche			input:  "* item the first\n· item the second",
223*46c4c49dSIbrahim Kanouche			output: "item the first item the second",
224*46c4c49dSIbrahim Kanouche		},
225*46c4c49dSIbrahim Kanouche		{
226*46c4c49dSIbrahim Kanouche			name:   "preserves version numbers but not header numbers",
227*46c4c49dSIbrahim Kanouche			input:  "sample rules\n1. Python 2.7.8 is a version of the language.",
228*46c4c49dSIbrahim Kanouche			output: "sample rules python 2.7.8 is a version of the language",
229*46c4c49dSIbrahim Kanouche		},
230*46c4c49dSIbrahim Kanouche		{
231*46c4c49dSIbrahim Kanouche			name:   "preserves version numbers across line breaks",
232*46c4c49dSIbrahim Kanouche			input:  "Python version\n2.7.8 is a version of the language.",
233*46c4c49dSIbrahim Kanouche			output: "python version 2.7.8 is a version of the language",
234*46c4c49dSIbrahim Kanouche		},
235*46c4c49dSIbrahim Kanouche		{
236*46c4c49dSIbrahim Kanouche			name:   "preserves punctuation",
237*46c4c49dSIbrahim Kanouche			input:  "Bill, Larry, and Sergey agree precision is critical!",
238*46c4c49dSIbrahim Kanouche			output: "bill larry and sergey agree precision is critical",
239*46c4c49dSIbrahim Kanouche		},
240*46c4c49dSIbrahim Kanouche		{
241*46c4c49dSIbrahim Kanouche			name:   "ignores comment characters and bullet formatting",
242*46c4c49dSIbrahim Kanouche			input:  "/* * item the first",
243*46c4c49dSIbrahim Kanouche			output: "item the first",
244*46c4c49dSIbrahim Kanouche		},
245*46c4c49dSIbrahim Kanouche		{
246*46c4c49dSIbrahim Kanouche			name:   "produces blank line as needed",
247*46c4c49dSIbrahim Kanouche			input:  "/* *",
248*46c4c49dSIbrahim Kanouche			output: "",
249*46c4c49dSIbrahim Kanouche		},
250*46c4c49dSIbrahim Kanouche		{
251*46c4c49dSIbrahim Kanouche			name:   "clobbers header looking thing as appropriate",
252*46c4c49dSIbrahim Kanouche			input:  " iv. this is a test",
253*46c4c49dSIbrahim Kanouche			output: "this is a test",
254*46c4c49dSIbrahim Kanouche		},
255*46c4c49dSIbrahim Kanouche		{
256*46c4c49dSIbrahim Kanouche			name:   "clobbers header looking thing as appropriate even in comment",
257*46c4c49dSIbrahim Kanouche			input:  "/* 1.2.3. this is a test",
258*46c4c49dSIbrahim Kanouche			output: "this is a test",
259*46c4c49dSIbrahim Kanouche		},
260*46c4c49dSIbrahim Kanouche		{
261*46c4c49dSIbrahim Kanouche			name:   "preserve version number (not a header, but header-looking) not at beginning of sentence",
262*46c4c49dSIbrahim Kanouche			input:  "This is version 1.1.",
263*46c4c49dSIbrahim Kanouche			output: "this is version 1.1",
264*46c4c49dSIbrahim Kanouche		},
265*46c4c49dSIbrahim Kanouche		{
266*46c4c49dSIbrahim Kanouche			name:   "copyright inside a comment",
267*46c4c49dSIbrahim Kanouche			input:  " /* Copyright (c) 1998-2008 The OpenSSL Project. All rights reserved",
268*46c4c49dSIbrahim Kanouche			output: "",
269*46c4c49dSIbrahim Kanouche		},
270*46c4c49dSIbrahim Kanouche		{
271*46c4c49dSIbrahim Kanouche			name: "FTL copyright text",
272*46c4c49dSIbrahim Kanouche			input: `The FreeType Project LICENSE
273*46c4c49dSIbrahim Kanouche
274*46c4c49dSIbrahim Kanouche2006-Jan-27
275*46c4c49dSIbrahim Kanouche2006-01-27
276*46c4c49dSIbrahim Kanouche
277*46c4c49dSIbrahim KanoucheCopyright 1996-2002, 2006 by David Turner, Robert Wilhelm, and Werner Lemberg
278*46c4c49dSIbrahim Kanouche
279*46c4c49dSIbrahim KanoucheIntroduction
280*46c4c49dSIbrahim Kanouche
281*46c4c49dSIbrahim KanoucheThe FreeType Project`,
282*46c4c49dSIbrahim Kanouche			output: "the freetype project license introduction the freetype project",
283*46c4c49dSIbrahim Kanouche		},
284*46c4c49dSIbrahim Kanouche		{
285*46c4c49dSIbrahim Kanouche			name: "Separated text",
286*46c4c49dSIbrahim Kanouche			input: `distribution and modifi287*46c4c49dSIbrahim Kanouche				       cation follow.`,
288*46c4c49dSIbrahim Kanouche			output: "distribution and modification follow",
289*46c4c49dSIbrahim Kanouche		},
290*46c4c49dSIbrahim Kanouche		{
291*46c4c49dSIbrahim Kanouche			name:   "preserve internal references, even on line break",
292*46c4c49dSIbrahim Kanouche			input:  "(ii) should be preserved as (ii) is preserved",
293*46c4c49dSIbrahim Kanouche			output: "ii should be preserved as ii is preserved",
294*46c4c49dSIbrahim Kanouche		},
295*46c4c49dSIbrahim Kanouche	}
296*46c4c49dSIbrahim Kanouche
297*46c4c49dSIbrahim Kanouche	for _, test := range tests {
298*46c4c49dSIbrahim Kanouche		t.Run(test.name, func(t *testing.T) {
299*46c4c49dSIbrahim Kanouche			dict := newDictionary()
300*46c4c49dSIbrahim Kanouche			d, err := tokenizeStream(bytes.NewReader([]byte(test.input)), true, dict, true)
301*46c4c49dSIbrahim Kanouche			if err != nil {
302*46c4c49dSIbrahim Kanouche				t.Errorf("%s failed: got unexpected error %v", test.name, err)
303*46c4c49dSIbrahim Kanouche			}
304*46c4c49dSIbrahim Kanouche			var b strings.Builder
305*46c4c49dSIbrahim Kanouche			for _, tok := range d.Tokens {
306*46c4c49dSIbrahim Kanouche				b.WriteString(dict.getWord(tok.ID))
307*46c4c49dSIbrahim Kanouche				b.WriteString(" ")
308*46c4c49dSIbrahim Kanouche			}
309*46c4c49dSIbrahim Kanouche			actual := strings.TrimSpace(b.String())
310*46c4c49dSIbrahim Kanouche			if actual != test.output {
311*46c4c49dSIbrahim Kanouche				t.Errorf("Tokenize(%q): got %q want %q", test.name, actual, test.output)
312*46c4c49dSIbrahim Kanouche			}
313*46c4c49dSIbrahim Kanouche		})
314*46c4c49dSIbrahim Kanouche	}
315*46c4c49dSIbrahim Kanouche}
316