// Copyright 2020 Google Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package classifier

import (
	"bytes"
	"io"
	"strings"
	"testing"

	"github.com/google/go-cmp/cmp"
	"github.com/google/go-cmp/cmp/cmpopts"
)

func TestCleanupToken(t *testing.T) {
	tests := []struct {
		input  string
		output string
	}{{
		input:  "cleanup!",
		output: "cleanup",
	},
		{
			input:  "12345",
			output: "12345",
		},
		{
			input:  "r1@zx42-",
			output: "rzx",
		},
		{
			input:  "12345,",
			output: "12345",
		},
		{
			input:  "12345-6789",
			output: "12345-6789",
		},
		{
			input:  "1(a)",
			output: "1",
		},
		{
			input:  "1.2.3",
			output: "1.2.3",
		},
	}
	for _, test := range tests {
		if got := cleanupToken(0, test.input, true); got != test.output {
			t.Errorf("%q: got %q want %q", test.input, got, test.output)
		}
	}
}

func TestTokenize(t *testing.T) {
	tests := []struct {
		name   string
		input  string
		output *indexedDocument
	}{
		{name: "hyphenization recovery",
			input: `basket-
ball`,
			output: &indexedDocument{
				Tokens: []indexedToken{
					{
						ID:   1,
						Line: 1,
					},
				},
				Norm: "basketball",
			},
		},
		{
			name: "basic scenario",
			input: `The AWESOME Project LICENSE

Modifi-
cations prohibited

Copyright 1996-2002, 2006 by A. Developer

Introduction

The AWESOME Project`,
			output: &indexedDocument{
				Tokens: []indexedToken{
					{
						ID:   1,
						Line: 1,
					},
					{
						ID:   2,
						Line: 1,
					},
					{
						ID:   3,
						Line: 1,
					},
					{
						ID:   4,
						Line: 1,
					},
					{
						ID:   5,
						Line: 3,
					},
					{
						ID:   6,
						Line: 4,
					},
					{
						ID:   7,
						Line: 8,
					},
					{
						ID:   1,
						Line: 10,
					},
					{
						ID:   2,
						Line: 10,
					},
					{
						ID:   3,
						Line: 10,
					},
				},
				Matches: Matches{&Match{Name: "Copyright", Confidence: 1.0, MatchType: "Copyright", StartLine: 6, EndLine: 6}},
				Norm:    "the awesome project license modifications prohibited introduction the awesome project",
			},
		},
	}
	for _, test := range tests {
		t.Run(test.name, func(t *testing.T) {
			d, err := tokenizeStream(bytes.NewReader([]byte(test.input)), true, newDictionary(), true)
			if err != nil {
				t.Errorf("%s failed: got unexpected error %v", test.name, err)
			}
			if diff := cmp.Diff(d, test.output, cmpopts.IgnoreUnexported(indexedDocument{})); diff != "" {
				t.Errorf("%s failed:\nDiff(-got,+want): %s", test.name, diff)
			}
		})
	}
}

// mockReader feeds the tokenizer reads of exactly the sizes listed in
// schedule, failing the test if the tokenizer ever asks for a buffer of an
// unexpected length. Each read is filled entirely with 'a' bytes, and io.EOF
// is returned once the schedule is exhausted.
type mockReader struct {
	t        *testing.T
	schedule []int
	cur      int
}

func (m *mockReader) Read(buf []byte) (int, error) {
	if m.cur > len(m.schedule) {
		m.t.Fatal("Unexpected read on mock")
	}

	if m.cur == len(m.schedule) {
		return 0, io.EOF
	}

	if len(buf) != m.schedule[m.cur] {
		m.t.Fatalf("step %d: got %d, want %d", m.cur, len(buf), m.schedule[m.cur])
	}
	m.cur++

	for i := range buf {
		buf[i] = 'a'
	}

	return len(buf), nil
}

func TestTokenizerBuffering(t *testing.T) {
	dict := newDictionary()
	mr := mockReader{
		t:        t,
		schedule: []int{1024, 1020, 1020},
	}
	d, err := tokenizeStream(&mr, true, dict, true)
	if err != nil {
		t.Errorf("Read returned unexpected error: %v", err)
	}

	// Do a basic test to make sure the data returned is sound: the scheduled
	// reads supply 1024+1020+1020 = 3064 'a' bytes with no whitespace, so they
	// should normalize to a single 3064-byte token.
	if len(d.Tokens) != 1 {
		t.Errorf("Got %d tokens, expected 1", len(d.Tokens))
	}

	if len(d.Norm) != 3064 {
		t.Errorf("Got %d bytes, expected 3064", len(d.Norm))
	}
}

func TestTokenizer(t *testing.T) {
	// This test focuses primarily on the textual content extracted and does not look
	// at the other parts of the document.
	tests := []struct {
		name   string
		input  string
		output string
	}{
		{
			name:   "Basic Tokens",
			input:  "Here are some words. ",
			output: "here are some words",
		},
		{
			name:   "skips bullet headers",
			input:  "* item the first\n· item the second",
			output: "item the first item the second",
		},
		{
			name:   "preserves version numbers but not header numbers",
			input:  "sample rules\n1. Python 2.7.8 is a version of the language.",
			output: "sample rules python 2.7.8 is a version of the language",
		},
		{
			name:   "preserves version numbers across line breaks",
			input:  "Python version\n2.7.8 is a version of the language.",
			output: "python version 2.7.8 is a version of the language",
		},
		{
			name:   "preserves punctuation",
			input:  "Bill, Larry, and Sergey agree precision is critical!",
			output: "bill larry and sergey agree precision is critical",
		},
		{
			name:   "ignores comment characters and bullet formatting",
			input:  "/* * item the first",
			output: "item the first",
		},
		{
			name:   "produces blank line as needed",
			input:  "/* *",
			output: "",
		},
		{
			name:   "clobbers header looking thing as appropriate",
			input:  " iv. this is a test",
			output: "this is a test",
		},
		{
			name:   "clobbers header looking thing as appropriate even in comment",
			input:  "/* 1.2.3. this is a test",
			output: "this is a test",
		},
		{
			name:   "preserve version number (not a header, but header-looking) not at beginning of sentence",
			input:  "This is version 1.1.",
			output: "this is version 1.1",
		},
		{
			name:   "copyright inside a comment",
			input:  " /* Copyright (c) 1998-2008 The OpenSSL Project. All rights reserved",
			output: "",
		},
		{
			name: "FTL copyright text",
			input: `The FreeType Project LICENSE

2006-Jan-27
2006-01-27

Copyright 1996-2002, 2006 by David Turner, Robert Wilhelm, and Werner Lemberg

Introduction

The FreeType Project`,
			output: "the freetype project license introduction the freetype project",
		},
		{
			name: "Separated text",
			input: `distribution and modifi‐
 cation follow.`,
			output: "distribution and modification follow",
		},
		{
			name:   "preserve internal references, even on line break",
			input:  "(ii) should be preserved as (ii) is preserved",
			output: "ii should be preserved as ii is preserved",
		},
	}

	for _, test := range tests {
		t.Run(test.name, func(t *testing.T) {
			dict := newDictionary()
			d, err := tokenizeStream(bytes.NewReader([]byte(test.input)), true, dict, true)
			if err != nil {
				t.Errorf("%s failed: got unexpected error %v", test.name, err)
			}
			var b strings.Builder
			for _, tok := range d.Tokens {
				b.WriteString(dict.getWord(tok.ID))
				b.WriteString(" ")
			}
			actual := strings.TrimSpace(b.String())
			if actual != test.output {
				t.Errorf("Tokenize(%q): got %q want %q", test.name, actual, test.output)
			}
		})
	}
}
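
// BenchmarkTokenizeStream is an illustrative sketch, not part of the original
// suite. It assumes only the package-internal helpers already exercised above
// (newDictionary and tokenizeStream) and, mirroring the tests, builds a fresh
// dictionary per iteration while tokenizing a small fixed input.
func BenchmarkTokenizeStream(b *testing.B) {
	input := []byte("The AWESOME Project LICENSE\n\nModifications prohibited\n")
	for i := 0; i < b.N; i++ {
		dict := newDictionary()
		if _, err := tokenizeStream(bytes.NewReader(input), true, dict, true); err != nil {
			b.Fatalf("tokenizeStream returned unexpected error: %v", err)
		}
	}
}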