// Copyright 2020 Google Inc. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package classifier import ( "bytes" "io" "strings" "testing" "github.com/google/go-cmp/cmp" "github.com/google/go-cmp/cmp/cmpopts" ) func TestCleanupToken(t *testing.T) { tests := []struct { input string output string }{{ input: "cleanup!", output: "cleanup", }, { input: "12345", output: "12345", }, { input: "r1@zx42-", output: "rzx", }, { input: "12345,", output: "12345", }, { input: "12345-6789", output: "12345-6789", }, { input: "1(a)", output: "1", }, { input: "1.2.3", output: "1.2.3", }, } for _, test := range tests { if got := cleanupToken(0, test.input, true); got != test.output { t.Errorf("%q: got %q want %q", test.input, got, test.output) } } } func TestTokenize(t *testing.T) { tests := []struct { name string input string output *indexedDocument }{ {name: "hyphenization recovery", input: `basket- ball`, output: &indexedDocument{ Tokens: []indexedToken{ { ID: 1, Line: 1, }, }, Norm: "basketball", }, }, { name: "basic scenario", input: `The AWESOME Project LICENSE Modifi- cations prohibited Copyright 1996-2002, 2006 by A. Developer Introduction The AWESOME Project`, output: &indexedDocument{ Tokens: []indexedToken{ { ID: 1, Line: 1, }, { ID: 2, Line: 1, }, { ID: 3, Line: 1, }, { ID: 4, Line: 1, }, { ID: 5, Line: 3, }, { ID: 6, Line: 4, }, { ID: 7, Line: 8, }, { ID: 1, Line: 10, }, { ID: 2, Line: 10, }, { ID: 3, Line: 10, }, }, Matches: Matches{&Match{Name: "Copyright", Confidence: 1.0, MatchType: "Copyright", StartLine: 6, EndLine: 6}}, Norm: "the awesome project license modifications prohibited introduction the awesome project", }, }, } for _, test := range tests { t.Run(test.name, func(t *testing.T) { d, err := tokenizeStream(bytes.NewReader([]byte(test.input)), true, newDictionary(), true) if err != nil { t.Errorf("%s failed: got unexpected error %v", test.name, err) } if diff := cmp.Diff(d, test.output, cmpopts.IgnoreUnexported(indexedDocument{})); diff != "" { t.Errorf("%s failed:\nDiff(+got,-want): %s", test.name, diff) } }) } } type mockReader struct { t *testing.T schedule []int cur int } func (m *mockReader) Read(buf []byte) (int, error) { if m.cur > len(m.schedule) { m.t.Fatal("Unexpected read on mock") } if m.cur == len(m.schedule) { return 0, io.EOF } if len(buf) != m.schedule[m.cur] { m.t.Fatalf("step %d: got %d, want %d", m.cur, len(buf), m.schedule[m.cur]) } m.cur++ for i := range buf { buf[i] = 'a' } return len(buf), nil } func TestTokenizerBuffering(t *testing.T) { dict := newDictionary() mr := mockReader{ t: t, schedule: []int{1024, 1020, 1020}, } d, err := tokenizeStream(&mr, true, dict, true) if err != nil { t.Errorf("Read returned unexpected error: %v", err) } // Do a basic test to make sure the data returned is sound if len(d.Tokens) != 1 { t.Errorf("Got %d tokens, expected 1", len(d.Tokens)) } if len(d.Norm) != 3064 { t.Errorf("Got %d bytes, expected 3064", len(d.Norm)) } } func TestTokenizer(t *testing.T) { // This test focuses primarily on the textual content extracted and does not look // at the other parts of the document. tests := []struct { name string input string output string }{ { name: "Basic Tokens", input: "Here are some words. ", output: "here are some words", }, { name: "skips bullet headers", input: "* item the first\n· item the second", output: "item the first item the second", }, { name: "preserves version numbers but not header numbers", input: "sample rules\n1. Python 2.7.8 is a version of the language.", output: "sample rules python 2.7.8 is a version of the language", }, { name: "preserves version numbers across line breaks", input: "Python version\n2.7.8 is a version of the language.", output: "python version 2.7.8 is a version of the language", }, { name: "preserves punctuation", input: "Bill, Larry, and Sergey agree precision is critical!", output: "bill larry and sergey agree precision is critical", }, { name: "ignores comment characters and bullet formatting", input: "/* * item the first", output: "item the first", }, { name: "produces blank line as needed", input: "/* *", output: "", }, { name: "clobbers header looking thing as appropriate", input: " iv. this is a test", output: "this is a test", }, { name: "clobbers header looking thing as appropriate even in comment", input: "/* 1.2.3. this is a test", output: "this is a test", }, { name: "preserve version number (not a header, but header-looking) not at beginning of sentence", input: "This is version 1.1.", output: "this is version 1.1", }, { name: "copyright inside a comment", input: " /* Copyright (c) 1998-2008 The OpenSSL Project. All rights reserved", output: "", }, { name: "FTL copyright text", input: `The FreeType Project LICENSE 2006-Jan-27 2006-01-27 Copyright 1996-2002, 2006 by David Turner, Robert Wilhelm, and Werner Lemberg Introduction The FreeType Project`, output: "the freetype project license introduction the freetype project", }, { name: "Separated text", input: `distribution and modifi‐ cation follow.`, output: "distribution and modification follow", }, { name: "preserve internal references, even on line break", input: "(ii) should be preserved as (ii) is preserved", output: "ii should be preserved as ii is preserved", }, } for _, test := range tests { t.Run(test.name, func(t *testing.T) { dict := newDictionary() d, err := tokenizeStream(bytes.NewReader([]byte(test.input)), true, dict, true) if err != nil { t.Errorf("%s failed: got unexpected error %v", test.name, err) } var b strings.Builder for _, tok := range d.Tokens { b.WriteString(dict.getWord(tok.ID)) b.WriteString(" ") } actual := strings.TrimSpace(b.String()) if actual != test.output { t.Errorf("Tokenize(%q): got %q want %q", test.name, actual, test.output) } }) } }