1*46c4c49dSIbrahim Kanouche// Copyright 2020 Google Inc. 2*46c4c49dSIbrahim Kanouche// 3*46c4c49dSIbrahim Kanouche// Licensed under the Apache License, Version 2.0 (the "License"); 4*46c4c49dSIbrahim Kanouche// you may not use this file except in compliance with the License. 5*46c4c49dSIbrahim Kanouche// You may obtain a copy of the License at 6*46c4c49dSIbrahim Kanouche// 7*46c4c49dSIbrahim Kanouche// http://www.apache.org/licenses/LICENSE-2.0 8*46c4c49dSIbrahim Kanouche// 9*46c4c49dSIbrahim Kanouche// Unless required by applicable law or agreed to in writing, software 10*46c4c49dSIbrahim Kanouche// distributed under the License is distributed on an "AS IS" BASIS, 11*46c4c49dSIbrahim Kanouche// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12*46c4c49dSIbrahim Kanouche// See the License for the specific language governing permissions and 13*46c4c49dSIbrahim Kanouche// limitations under the License. 14*46c4c49dSIbrahim Kanouche 15*46c4c49dSIbrahim Kanouchepackage classifier 16*46c4c49dSIbrahim Kanouche 17*46c4c49dSIbrahim Kanoucheimport ( 18*46c4c49dSIbrahim Kanouche "bytes" 19*46c4c49dSIbrahim Kanouche "io" 20*46c4c49dSIbrahim Kanouche "strings" 21*46c4c49dSIbrahim Kanouche "testing" 22*46c4c49dSIbrahim Kanouche 23*46c4c49dSIbrahim Kanouche "github.com/google/go-cmp/cmp" 24*46c4c49dSIbrahim Kanouche "github.com/google/go-cmp/cmp/cmpopts" 25*46c4c49dSIbrahim Kanouche) 26*46c4c49dSIbrahim Kanouche 27*46c4c49dSIbrahim Kanouchefunc TestCleanupToken(t *testing.T) { 28*46c4c49dSIbrahim Kanouche tests := []struct { 29*46c4c49dSIbrahim Kanouche input string 30*46c4c49dSIbrahim Kanouche output string 31*46c4c49dSIbrahim Kanouche }{{ 32*46c4c49dSIbrahim Kanouche input: "cleanup!", 33*46c4c49dSIbrahim Kanouche output: "cleanup", 34*46c4c49dSIbrahim Kanouche }, 35*46c4c49dSIbrahim Kanouche { 36*46c4c49dSIbrahim Kanouche input: "12345", 37*46c4c49dSIbrahim Kanouche output: "12345", 38*46c4c49dSIbrahim Kanouche }, 39*46c4c49dSIbrahim Kanouche { 40*46c4c49dSIbrahim Kanouche input: "r1@zx42-", 41*46c4c49dSIbrahim Kanouche output: "rzx", 42*46c4c49dSIbrahim Kanouche }, 43*46c4c49dSIbrahim Kanouche { 44*46c4c49dSIbrahim Kanouche input: "12345,", 45*46c4c49dSIbrahim Kanouche output: "12345", 46*46c4c49dSIbrahim Kanouche }, 47*46c4c49dSIbrahim Kanouche { 48*46c4c49dSIbrahim Kanouche input: "12345-6789", 49*46c4c49dSIbrahim Kanouche output: "12345-6789", 50*46c4c49dSIbrahim Kanouche }, 51*46c4c49dSIbrahim Kanouche { 52*46c4c49dSIbrahim Kanouche input: "1(a)", 53*46c4c49dSIbrahim Kanouche output: "1", 54*46c4c49dSIbrahim Kanouche }, 55*46c4c49dSIbrahim Kanouche { 56*46c4c49dSIbrahim Kanouche input: "1.2.3", 57*46c4c49dSIbrahim Kanouche output: "1.2.3", 58*46c4c49dSIbrahim Kanouche }, 59*46c4c49dSIbrahim Kanouche } 60*46c4c49dSIbrahim Kanouche for _, test := range tests { 61*46c4c49dSIbrahim Kanouche if got := cleanupToken(0, test.input, true); got != test.output { 62*46c4c49dSIbrahim Kanouche t.Errorf("%q: got %q want %q", test.input, got, test.output) 63*46c4c49dSIbrahim Kanouche } 64*46c4c49dSIbrahim Kanouche } 65*46c4c49dSIbrahim Kanouche} 66*46c4c49dSIbrahim Kanouche 67*46c4c49dSIbrahim Kanouchefunc TestTokenize(t *testing.T) { 68*46c4c49dSIbrahim Kanouche tests := []struct { 69*46c4c49dSIbrahim Kanouche name string 70*46c4c49dSIbrahim Kanouche input string 71*46c4c49dSIbrahim Kanouche output *indexedDocument 72*46c4c49dSIbrahim Kanouche }{ 73*46c4c49dSIbrahim Kanouche {name: "hyphenization recovery", 74*46c4c49dSIbrahim Kanouche input: `basket- 75*46c4c49dSIbrahim Kanoucheball`, 76*46c4c49dSIbrahim Kanouche output: &indexedDocument{ 77*46c4c49dSIbrahim Kanouche Tokens: []indexedToken{ 78*46c4c49dSIbrahim Kanouche { 79*46c4c49dSIbrahim Kanouche ID: 1, 80*46c4c49dSIbrahim Kanouche Line: 1, 81*46c4c49dSIbrahim Kanouche }, 82*46c4c49dSIbrahim Kanouche }, 83*46c4c49dSIbrahim Kanouche Norm: "basketball", 84*46c4c49dSIbrahim Kanouche }, 85*46c4c49dSIbrahim Kanouche }, 86*46c4c49dSIbrahim Kanouche { 87*46c4c49dSIbrahim Kanouche name: "basic scenario", 88*46c4c49dSIbrahim Kanouche input: `The AWESOME Project LICENSE 89*46c4c49dSIbrahim Kanouche 90*46c4c49dSIbrahim KanoucheModifi- 91*46c4c49dSIbrahim Kanouchecations prohibited 92*46c4c49dSIbrahim Kanouche 93*46c4c49dSIbrahim KanoucheCopyright 1996-2002, 2006 by A. Developer 94*46c4c49dSIbrahim Kanouche 95*46c4c49dSIbrahim KanoucheIntroduction 96*46c4c49dSIbrahim Kanouche 97*46c4c49dSIbrahim KanoucheThe AWESOME Project`, 98*46c4c49dSIbrahim Kanouche output: &indexedDocument{ 99*46c4c49dSIbrahim Kanouche Tokens: []indexedToken{ 100*46c4c49dSIbrahim Kanouche { 101*46c4c49dSIbrahim Kanouche ID: 1, 102*46c4c49dSIbrahim Kanouche Line: 1, 103*46c4c49dSIbrahim Kanouche }, 104*46c4c49dSIbrahim Kanouche { 105*46c4c49dSIbrahim Kanouche ID: 2, 106*46c4c49dSIbrahim Kanouche Line: 1, 107*46c4c49dSIbrahim Kanouche }, 108*46c4c49dSIbrahim Kanouche { 109*46c4c49dSIbrahim Kanouche ID: 3, 110*46c4c49dSIbrahim Kanouche Line: 1, 111*46c4c49dSIbrahim Kanouche }, 112*46c4c49dSIbrahim Kanouche { 113*46c4c49dSIbrahim Kanouche ID: 4, 114*46c4c49dSIbrahim Kanouche Line: 1, 115*46c4c49dSIbrahim Kanouche }, 116*46c4c49dSIbrahim Kanouche { 117*46c4c49dSIbrahim Kanouche ID: 5, 118*46c4c49dSIbrahim Kanouche Line: 3, 119*46c4c49dSIbrahim Kanouche }, 120*46c4c49dSIbrahim Kanouche { 121*46c4c49dSIbrahim Kanouche ID: 6, 122*46c4c49dSIbrahim Kanouche Line: 4, 123*46c4c49dSIbrahim Kanouche }, 124*46c4c49dSIbrahim Kanouche { 125*46c4c49dSIbrahim Kanouche ID: 7, 126*46c4c49dSIbrahim Kanouche Line: 8, 127*46c4c49dSIbrahim Kanouche }, 128*46c4c49dSIbrahim Kanouche { 129*46c4c49dSIbrahim Kanouche ID: 1, 130*46c4c49dSIbrahim Kanouche Line: 10, 131*46c4c49dSIbrahim Kanouche }, 132*46c4c49dSIbrahim Kanouche { 133*46c4c49dSIbrahim Kanouche ID: 2, 134*46c4c49dSIbrahim Kanouche Line: 10, 135*46c4c49dSIbrahim Kanouche }, 136*46c4c49dSIbrahim Kanouche { 137*46c4c49dSIbrahim Kanouche ID: 3, 138*46c4c49dSIbrahim Kanouche Line: 10, 139*46c4c49dSIbrahim Kanouche }, 140*46c4c49dSIbrahim Kanouche }, 141*46c4c49dSIbrahim Kanouche Matches: Matches{&Match{Name: "Copyright", Confidence: 1.0, MatchType: "Copyright", StartLine: 6, EndLine: 6}}, 142*46c4c49dSIbrahim Kanouche Norm: "the awesome project license modifications prohibited introduction the awesome project", 143*46c4c49dSIbrahim Kanouche }, 144*46c4c49dSIbrahim Kanouche }, 145*46c4c49dSIbrahim Kanouche } 146*46c4c49dSIbrahim Kanouche for _, test := range tests { 147*46c4c49dSIbrahim Kanouche t.Run(test.name, func(t *testing.T) { 148*46c4c49dSIbrahim Kanouche d, err := tokenizeStream(bytes.NewReader([]byte(test.input)), true, newDictionary(), true) 149*46c4c49dSIbrahim Kanouche if err != nil { 150*46c4c49dSIbrahim Kanouche t.Errorf("%s failed: got unexpected error %v", test.name, err) 151*46c4c49dSIbrahim Kanouche } 152*46c4c49dSIbrahim Kanouche if diff := cmp.Diff(d, test.output, cmpopts.IgnoreUnexported(indexedDocument{})); diff != "" { 153*46c4c49dSIbrahim Kanouche t.Errorf("%s failed:\nDiff(+got,-want): %s", test.name, diff) 154*46c4c49dSIbrahim Kanouche } 155*46c4c49dSIbrahim Kanouche }) 156*46c4c49dSIbrahim Kanouche } 157*46c4c49dSIbrahim Kanouche} 158*46c4c49dSIbrahim Kanouche 159*46c4c49dSIbrahim Kanouchetype mockReader struct { 160*46c4c49dSIbrahim Kanouche t *testing.T 161*46c4c49dSIbrahim Kanouche schedule []int 162*46c4c49dSIbrahim Kanouche cur int 163*46c4c49dSIbrahim Kanouche} 164*46c4c49dSIbrahim Kanouche 165*46c4c49dSIbrahim Kanouchefunc (m *mockReader) Read(buf []byte) (int, error) { 166*46c4c49dSIbrahim Kanouche if m.cur > len(m.schedule) { 167*46c4c49dSIbrahim Kanouche m.t.Fatal("Unexpected read on mock") 168*46c4c49dSIbrahim Kanouche } 169*46c4c49dSIbrahim Kanouche 170*46c4c49dSIbrahim Kanouche if m.cur == len(m.schedule) { 171*46c4c49dSIbrahim Kanouche return 0, io.EOF 172*46c4c49dSIbrahim Kanouche } 173*46c4c49dSIbrahim Kanouche 174*46c4c49dSIbrahim Kanouche if len(buf) != m.schedule[m.cur] { 175*46c4c49dSIbrahim Kanouche m.t.Fatalf("step %d: got %d, want %d", m.cur, len(buf), m.schedule[m.cur]) 176*46c4c49dSIbrahim Kanouche } 177*46c4c49dSIbrahim Kanouche m.cur++ 178*46c4c49dSIbrahim Kanouche 179*46c4c49dSIbrahim Kanouche for i := range buf { 180*46c4c49dSIbrahim Kanouche buf[i] = 'a' 181*46c4c49dSIbrahim Kanouche } 182*46c4c49dSIbrahim Kanouche 183*46c4c49dSIbrahim Kanouche return len(buf), nil 184*46c4c49dSIbrahim Kanouche} 185*46c4c49dSIbrahim Kanouche 186*46c4c49dSIbrahim Kanouchefunc TestTokenizerBuffering(t *testing.T) { 187*46c4c49dSIbrahim Kanouche dict := newDictionary() 188*46c4c49dSIbrahim Kanouche mr := mockReader{ 189*46c4c49dSIbrahim Kanouche t: t, 190*46c4c49dSIbrahim Kanouche schedule: []int{1024, 1020, 1020}, 191*46c4c49dSIbrahim Kanouche } 192*46c4c49dSIbrahim Kanouche d, err := tokenizeStream(&mr, true, dict, true) 193*46c4c49dSIbrahim Kanouche if err != nil { 194*46c4c49dSIbrahim Kanouche t.Errorf("Read returned unexpected error: %v", err) 195*46c4c49dSIbrahim Kanouche } 196*46c4c49dSIbrahim Kanouche 197*46c4c49dSIbrahim Kanouche // Do a basic test to make sure the data returned is sound 198*46c4c49dSIbrahim Kanouche if len(d.Tokens) != 1 { 199*46c4c49dSIbrahim Kanouche t.Errorf("Got %d tokens, expected 1", len(d.Tokens)) 200*46c4c49dSIbrahim Kanouche } 201*46c4c49dSIbrahim Kanouche 202*46c4c49dSIbrahim Kanouche if len(d.Norm) != 3064 { 203*46c4c49dSIbrahim Kanouche t.Errorf("Got %d bytes, expected 3064", len(d.Norm)) 204*46c4c49dSIbrahim Kanouche } 205*46c4c49dSIbrahim Kanouche} 206*46c4c49dSIbrahim Kanouche 207*46c4c49dSIbrahim Kanouchefunc TestTokenizer(t *testing.T) { 208*46c4c49dSIbrahim Kanouche // This test focuses primarily on the textual content extracted and does not look 209*46c4c49dSIbrahim Kanouche // at the other parts of the document. 210*46c4c49dSIbrahim Kanouche tests := []struct { 211*46c4c49dSIbrahim Kanouche name string 212*46c4c49dSIbrahim Kanouche input string 213*46c4c49dSIbrahim Kanouche output string 214*46c4c49dSIbrahim Kanouche }{ 215*46c4c49dSIbrahim Kanouche { 216*46c4c49dSIbrahim Kanouche name: "Basic Tokens", 217*46c4c49dSIbrahim Kanouche input: "Here are some words. ", 218*46c4c49dSIbrahim Kanouche output: "here are some words", 219*46c4c49dSIbrahim Kanouche }, 220*46c4c49dSIbrahim Kanouche { 221*46c4c49dSIbrahim Kanouche name: "skips bullet headers", 222*46c4c49dSIbrahim Kanouche input: "* item the first\n· item the second", 223*46c4c49dSIbrahim Kanouche output: "item the first item the second", 224*46c4c49dSIbrahim Kanouche }, 225*46c4c49dSIbrahim Kanouche { 226*46c4c49dSIbrahim Kanouche name: "preserves version numbers but not header numbers", 227*46c4c49dSIbrahim Kanouche input: "sample rules\n1. Python 2.7.8 is a version of the language.", 228*46c4c49dSIbrahim Kanouche output: "sample rules python 2.7.8 is a version of the language", 229*46c4c49dSIbrahim Kanouche }, 230*46c4c49dSIbrahim Kanouche { 231*46c4c49dSIbrahim Kanouche name: "preserves version numbers across line breaks", 232*46c4c49dSIbrahim Kanouche input: "Python version\n2.7.8 is a version of the language.", 233*46c4c49dSIbrahim Kanouche output: "python version 2.7.8 is a version of the language", 234*46c4c49dSIbrahim Kanouche }, 235*46c4c49dSIbrahim Kanouche { 236*46c4c49dSIbrahim Kanouche name: "preserves punctuation", 237*46c4c49dSIbrahim Kanouche input: "Bill, Larry, and Sergey agree precision is critical!", 238*46c4c49dSIbrahim Kanouche output: "bill larry and sergey agree precision is critical", 239*46c4c49dSIbrahim Kanouche }, 240*46c4c49dSIbrahim Kanouche { 241*46c4c49dSIbrahim Kanouche name: "ignores comment characters and bullet formatting", 242*46c4c49dSIbrahim Kanouche input: "/* * item the first", 243*46c4c49dSIbrahim Kanouche output: "item the first", 244*46c4c49dSIbrahim Kanouche }, 245*46c4c49dSIbrahim Kanouche { 246*46c4c49dSIbrahim Kanouche name: "produces blank line as needed", 247*46c4c49dSIbrahim Kanouche input: "/* *", 248*46c4c49dSIbrahim Kanouche output: "", 249*46c4c49dSIbrahim Kanouche }, 250*46c4c49dSIbrahim Kanouche { 251*46c4c49dSIbrahim Kanouche name: "clobbers header looking thing as appropriate", 252*46c4c49dSIbrahim Kanouche input: " iv. this is a test", 253*46c4c49dSIbrahim Kanouche output: "this is a test", 254*46c4c49dSIbrahim Kanouche }, 255*46c4c49dSIbrahim Kanouche { 256*46c4c49dSIbrahim Kanouche name: "clobbers header looking thing as appropriate even in comment", 257*46c4c49dSIbrahim Kanouche input: "/* 1.2.3. this is a test", 258*46c4c49dSIbrahim Kanouche output: "this is a test", 259*46c4c49dSIbrahim Kanouche }, 260*46c4c49dSIbrahim Kanouche { 261*46c4c49dSIbrahim Kanouche name: "preserve version number (not a header, but header-looking) not at beginning of sentence", 262*46c4c49dSIbrahim Kanouche input: "This is version 1.1.", 263*46c4c49dSIbrahim Kanouche output: "this is version 1.1", 264*46c4c49dSIbrahim Kanouche }, 265*46c4c49dSIbrahim Kanouche { 266*46c4c49dSIbrahim Kanouche name: "copyright inside a comment", 267*46c4c49dSIbrahim Kanouche input: " /* Copyright (c) 1998-2008 The OpenSSL Project. All rights reserved", 268*46c4c49dSIbrahim Kanouche output: "", 269*46c4c49dSIbrahim Kanouche }, 270*46c4c49dSIbrahim Kanouche { 271*46c4c49dSIbrahim Kanouche name: "FTL copyright text", 272*46c4c49dSIbrahim Kanouche input: `The FreeType Project LICENSE 273*46c4c49dSIbrahim Kanouche 274*46c4c49dSIbrahim Kanouche2006-Jan-27 275*46c4c49dSIbrahim Kanouche2006-01-27 276*46c4c49dSIbrahim Kanouche 277*46c4c49dSIbrahim KanoucheCopyright 1996-2002, 2006 by David Turner, Robert Wilhelm, and Werner Lemberg 278*46c4c49dSIbrahim Kanouche 279*46c4c49dSIbrahim KanoucheIntroduction 280*46c4c49dSIbrahim Kanouche 281*46c4c49dSIbrahim KanoucheThe FreeType Project`, 282*46c4c49dSIbrahim Kanouche output: "the freetype project license introduction the freetype project", 283*46c4c49dSIbrahim Kanouche }, 284*46c4c49dSIbrahim Kanouche { 285*46c4c49dSIbrahim Kanouche name: "Separated text", 286*46c4c49dSIbrahim Kanouche input: `distribution and modifi‐ 287*46c4c49dSIbrahim Kanouche cation follow.`, 288*46c4c49dSIbrahim Kanouche output: "distribution and modification follow", 289*46c4c49dSIbrahim Kanouche }, 290*46c4c49dSIbrahim Kanouche { 291*46c4c49dSIbrahim Kanouche name: "preserve internal references, even on line break", 292*46c4c49dSIbrahim Kanouche input: "(ii) should be preserved as (ii) is preserved", 293*46c4c49dSIbrahim Kanouche output: "ii should be preserved as ii is preserved", 294*46c4c49dSIbrahim Kanouche }, 295*46c4c49dSIbrahim Kanouche } 296*46c4c49dSIbrahim Kanouche 297*46c4c49dSIbrahim Kanouche for _, test := range tests { 298*46c4c49dSIbrahim Kanouche t.Run(test.name, func(t *testing.T) { 299*46c4c49dSIbrahim Kanouche dict := newDictionary() 300*46c4c49dSIbrahim Kanouche d, err := tokenizeStream(bytes.NewReader([]byte(test.input)), true, dict, true) 301*46c4c49dSIbrahim Kanouche if err != nil { 302*46c4c49dSIbrahim Kanouche t.Errorf("%s failed: got unexpected error %v", test.name, err) 303*46c4c49dSIbrahim Kanouche } 304*46c4c49dSIbrahim Kanouche var b strings.Builder 305*46c4c49dSIbrahim Kanouche for _, tok := range d.Tokens { 306*46c4c49dSIbrahim Kanouche b.WriteString(dict.getWord(tok.ID)) 307*46c4c49dSIbrahim Kanouche b.WriteString(" ") 308*46c4c49dSIbrahim Kanouche } 309*46c4c49dSIbrahim Kanouche actual := strings.TrimSpace(b.String()) 310*46c4c49dSIbrahim Kanouche if actual != test.output { 311*46c4c49dSIbrahim Kanouche t.Errorf("Tokenize(%q): got %q want %q", test.name, actual, test.output) 312*46c4c49dSIbrahim Kanouche } 313*46c4c49dSIbrahim Kanouche }) 314*46c4c49dSIbrahim Kanouche } 315*46c4c49dSIbrahim Kanouche} 316