/*
 * Copyright (C) 2018 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include "utils/tokenizer.h"

#include <memory>
#include <string>
#include <vector>

#include "gmock/gmock.h"
#include "gtest/gtest.h"

namespace libtextclassifier3 {
namespace {

using testing::ElementsAreArray;

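// Subclass of Tokenizer that re-exports the protected FindTokenizationRange()
// method so the tests below can call it directly.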
class TestingTokenizer : public Tokenizer {
 public:
  TestingTokenizer(
      const TokenizationType type, const UniLib* unilib,
      const std::vector<const TokenizationCodepointRange*>& codepoint_ranges,
      const std::vector<const CodepointRange*>&
          internal_tokenizer_codepoint_ranges,
      const bool split_on_script_change,
      const bool icu_preserve_whitespace_tokens,
      const bool preserve_floating_numbers)
      : Tokenizer(type, unilib, codepoint_ranges,
                  internal_tokenizer_codepoint_ranges, split_on_script_change,
                  icu_preserve_whitespace_tokens, preserve_floating_numbers) {}

  using Tokenizer::FindTokenizationRange;
};

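// Test helper that serializes object-API (T-suffixed) config structs into
// flatbuffers, keeps the resulting buffers alive, and constructs a
// TestingTokenizer on top of them.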
class TestingTokenizerProxy {
 public:
  TestingTokenizerProxy(
      TokenizationType type,
      const std::vector<TokenizationCodepointRangeT>& codepoint_range_configs,
      const std::vector<CodepointRangeT>& internal_codepoint_range_configs,
      const bool split_on_script_change,
      const bool icu_preserve_whitespace_tokens,
      const bool preserve_floating_numbers)
      : INIT_UNILIB_FOR_TESTING(unilib_) {
    const int num_configs = codepoint_range_configs.size();
    std::vector<const TokenizationCodepointRange*> configs_fb;
    configs_fb.reserve(num_configs);
    const int num_internal_configs = internal_codepoint_range_configs.size();
    std::vector<const CodepointRange*> internal_configs_fb;
    internal_configs_fb.reserve(num_internal_configs);
    buffers_.reserve(num_configs + num_internal_configs);
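    // Serialize each object-API config into its own flatbuffer and collect a
    // pointer to its root. The detached buffers are kept in buffers_ so the
    // root pointers stay valid for the lifetime of the proxy.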
    for (int i = 0; i < num_configs; i++) {
      flatbuffers::FlatBufferBuilder builder;
      builder.Finish(CreateTokenizationCodepointRange(
          builder, &codepoint_range_configs[i]));
      buffers_.push_back(builder.Release());
      configs_fb.push_back(flatbuffers::GetRoot<TokenizationCodepointRange>(
          buffers_.back().data()));
    }
    for (int i = 0; i < num_internal_configs; i++) {
      flatbuffers::FlatBufferBuilder builder;
      builder.Finish(
          CreateCodepointRange(builder, &internal_codepoint_range_configs[i]));
      buffers_.push_back(builder.Release());
      internal_configs_fb.push_back(
          flatbuffers::GetRoot<CodepointRange>(buffers_.back().data()));
    }
    tokenizer_ = std::unique_ptr<TestingTokenizer>(new TestingTokenizer(
        type, &unilib_, configs_fb, internal_configs_fb, split_on_script_change,
        icu_preserve_whitespace_tokens, preserve_floating_numbers));
  }

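  // Returns the role configured for codepoint |c|, or DEFAULT_ROLE if |c|
  // falls outside all configured ranges.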
  TokenizationCodepointRange_::Role TestFindTokenizationRole(int c) const {
    const TokenizationCodepointRangeT* range =
        tokenizer_->FindTokenizationRange(c);
    if (range != nullptr) {
      return range->role;
    } else {
      return TokenizationCodepointRange_::Role_DEFAULT_ROLE;
    }
  }

  std::vector<Token> Tokenize(const std::string& utf8_text) const {
    return tokenizer_->Tokenize(utf8_text);
  }

 private:
  UniLib unilib_;
  std::vector<flatbuffers::DetachedBuffer> buffers_;
  std::unique_ptr<TestingTokenizer> tokenizer_;
};

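// Checks that codepoint range lookup treats ranges as [start, end): the start
// codepoint is included, the end codepoint is excluded.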
TEST(TokenizerTest, FindTokenizationRange) {
  std::vector<TokenizationCodepointRangeT> configs;
  TokenizationCodepointRangeT* config;

  configs.emplace_back();
  config = &configs.back();
  config->start = 0;
  config->end = 10;
  config->role = TokenizationCodepointRange_::Role_TOKEN_SEPARATOR;

  configs.emplace_back();
  config = &configs.back();
  config->start = 32;
  config->end = 33;
  config->role = TokenizationCodepointRange_::Role_WHITESPACE_SEPARATOR;

  configs.emplace_back();
  config = &configs.back();
  config->start = 1234;
  config->end = 12345;
  config->role = TokenizationCodepointRange_::Role_TOKEN_SEPARATOR;

  TestingTokenizerProxy tokenizer(TokenizationType_INTERNAL_TOKENIZER, configs,
                                  {}, /*split_on_script_change=*/false,
                                  /*icu_preserve_whitespace_tokens=*/false,
                                  /*preserve_floating_numbers=*/false);

  // Test hits to the first group.
  EXPECT_EQ(tokenizer.TestFindTokenizationRole(0),
            TokenizationCodepointRange_::Role_TOKEN_SEPARATOR);
  EXPECT_EQ(tokenizer.TestFindTokenizationRole(5),
            TokenizationCodepointRange_::Role_TOKEN_SEPARATOR);
  EXPECT_EQ(tokenizer.TestFindTokenizationRole(10),
            TokenizationCodepointRange_::Role_DEFAULT_ROLE);

  // Test a hit to the second group.
  EXPECT_EQ(tokenizer.TestFindTokenizationRole(31),
            TokenizationCodepointRange_::Role_DEFAULT_ROLE);
  EXPECT_EQ(tokenizer.TestFindTokenizationRole(32),
            TokenizationCodepointRange_::Role_WHITESPACE_SEPARATOR);
  EXPECT_EQ(tokenizer.TestFindTokenizationRole(33),
            TokenizationCodepointRange_::Role_DEFAULT_ROLE);

  // Test hits to the third group.
  EXPECT_EQ(tokenizer.TestFindTokenizationRole(1233),
            TokenizationCodepointRange_::Role_DEFAULT_ROLE);
  EXPECT_EQ(tokenizer.TestFindTokenizationRole(1234),
            TokenizationCodepointRange_::Role_TOKEN_SEPARATOR);
  EXPECT_EQ(tokenizer.TestFindTokenizationRole(12344),
            TokenizationCodepointRange_::Role_TOKEN_SEPARATOR);
  EXPECT_EQ(tokenizer.TestFindTokenizationRole(12345),
            TokenizationCodepointRange_::Role_DEFAULT_ROLE);

  // Test a hit outside.
  EXPECT_EQ(tokenizer.TestFindTokenizationRole(99),
            TokenizationCodepointRange_::Role_DEFAULT_ROLE);
}

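// Simplest configuration: only the ASCII space splits tokens, and the
// separator itself is dropped from the output. Token boundaries are
// [start, end) codepoint offsets into the input.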
TEST(TokenizerTest, TokenizeOnSpace) {
  std::vector<TokenizationCodepointRangeT> configs;
  TokenizationCodepointRangeT* config;

  configs.emplace_back();
  config = &configs.back();
  // Space character.
  config->start = 32;
  config->end = 33;
  config->role = TokenizationCodepointRange_::Role_WHITESPACE_SEPARATOR;

  TestingTokenizerProxy tokenizer(TokenizationType_INTERNAL_TOKENIZER, configs,
                                  {},
                                  /*split_on_script_change=*/false,
                                  /*icu_preserve_whitespace_tokens=*/false,
                                  /*preserve_floating_numbers=*/false);
  std::vector<Token> tokens = tokenizer.Tokenize("Hello world!");

  EXPECT_THAT(tokens,
              ElementsAreArray({Token("Hello", 0, 5), Token("world!", 6, 12)}));
}

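// With split_on_script_change enabled, a token boundary is also inserted
// wherever the script changes between neighboring codepoints: the Hangul
// codepoints fall outside the configured ranges, so they differ in script
// from the Latin/common range configured with script_id 1.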
TEST(TokenizerTest, TokenizeOnSpaceAndScriptChange) {
  std::vector<TokenizationCodepointRangeT> configs;
  TokenizationCodepointRangeT* config;

  // Latin.
  configs.emplace_back();
  config = &configs.back();
  config->start = 0;
  config->end = 32;
  config->role = TokenizationCodepointRange_::Role_DEFAULT_ROLE;
  config->script_id = 1;
  configs.emplace_back();
  config = &configs.back();
  config->start = 32;
  config->end = 33;
  config->role = TokenizationCodepointRange_::Role_WHITESPACE_SEPARATOR;
  config->script_id = 1;
  configs.emplace_back();
  config = &configs.back();
  config->start = 33;
  config->end = 0x77F + 1;
  config->role = TokenizationCodepointRange_::Role_DEFAULT_ROLE;
  config->script_id = 1;

  TestingTokenizerProxy tokenizer(TokenizationType_INTERNAL_TOKENIZER, configs,
                                  {},
                                  /*split_on_script_change=*/true,
                                  /*icu_preserve_whitespace_tokens=*/false,
                                  /*preserve_floating_numbers=*/false);
  EXPECT_THAT(tokenizer.Tokenize("앨라배마 주 전화(123) 456-789웹사이트"),
              std::vector<Token>({Token("앨라배마", 0, 4), Token("주", 5, 6),
                                  Token("전화", 7, 10), Token("(123)", 10, 15),
                                  Token("456-789", 16, 23),
                                  Token("웹사이트", 23, 28)}));
}

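// Internal tokenizer configured with explicit Unicode block ranges: CJK and
// Thai codepoints are marked TOKEN_SEPARATOR and so come out as
// single-codepoint tokens, while Latin text still splits only on spaces.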
TEST(TokenizerTest, TokenizeComplex) {
  std::vector<TokenizationCodepointRangeT> configs;
  TokenizationCodepointRangeT* config;

  // Source: http://www.unicode.org/Public/10.0.0/ucd/Blocks-10.0.0d1.txt
  // Latin - Cyrillic.
  // 0000..007F; Basic Latin
  // 0080..00FF; Latin-1 Supplement
  // 0100..017F; Latin Extended-A
  // 0180..024F; Latin Extended-B
  // 0250..02AF; IPA Extensions
  // 02B0..02FF; Spacing Modifier Letters
  // 0300..036F; Combining Diacritical Marks
  // 0370..03FF; Greek and Coptic
  // 0400..04FF; Cyrillic
  // 0500..052F; Cyrillic Supplement
  // 0530..058F; Armenian
  // 0590..05FF; Hebrew
  // 0600..06FF; Arabic
  // 0700..074F; Syriac
  // 0750..077F; Arabic Supplement
  configs.emplace_back();
  config = &configs.back();
  config->start = 0;
  config->end = 32;
  config->role = TokenizationCodepointRange_::Role_DEFAULT_ROLE;
  configs.emplace_back();
  config = &configs.back();
  config->start = 32;
  config->end = 33;
  config->role = TokenizationCodepointRange_::Role_WHITESPACE_SEPARATOR;
  configs.emplace_back();
  config = &configs.back();
  config->start = 33;
  config->end = 0x77F + 1;
  config->role = TokenizationCodepointRange_::Role_DEFAULT_ROLE;

  // CJK
  // 2E80..2EFF; CJK Radicals Supplement
  // 3000..303F; CJK Symbols and Punctuation
  // 3040..309F; Hiragana
  // 30A0..30FF; Katakana
  // 3100..312F; Bopomofo
  // 3130..318F; Hangul Compatibility Jamo
  // 3190..319F; Kanbun
  // 31A0..31BF; Bopomofo Extended
  // 31C0..31EF; CJK Strokes
  // 31F0..31FF; Katakana Phonetic Extensions
  // 3200..32FF; Enclosed CJK Letters and Months
  // 3300..33FF; CJK Compatibility
  // 3400..4DBF; CJK Unified Ideographs Extension A
  // 4DC0..4DFF; Yijing Hexagram Symbols
  // 4E00..9FFF; CJK Unified Ideographs
  // A000..A48F; Yi Syllables
  // A490..A4CF; Yi Radicals
  // A4D0..A4FF; Lisu
  // A500..A63F; Vai
  // F900..FAFF; CJK Compatibility Ideographs
  // FE30..FE4F; CJK Compatibility Forms
  // 20000..2A6DF; CJK Unified Ideographs Extension B
  // 2A700..2B73F; CJK Unified Ideographs Extension C
  // 2B740..2B81F; CJK Unified Ideographs Extension D
  // 2B820..2CEAF; CJK Unified Ideographs Extension E
  // 2CEB0..2EBEF; CJK Unified Ideographs Extension F
  // 2F800..2FA1F; CJK Compatibility Ideographs Supplement
  configs.emplace_back();
  config = &configs.back();
  config->start = 0x2E80;
  config->end = 0x2EFF + 1;
  config->role = TokenizationCodepointRange_::Role_TOKEN_SEPARATOR;
  configs.emplace_back();
  config = &configs.back();
  config->start = 0x3000;
  config->end = 0xA63F + 1;
  config->role = TokenizationCodepointRange_::Role_TOKEN_SEPARATOR;
  configs.emplace_back();
  config = &configs.back();
  config->start = 0xF900;
  config->end = 0xFAFF + 1;
  config->role = TokenizationCodepointRange_::Role_TOKEN_SEPARATOR;
  configs.emplace_back();
  config = &configs.back();
  config->start = 0xFE30;
  config->end = 0xFE4F + 1;
  config->role = TokenizationCodepointRange_::Role_TOKEN_SEPARATOR;
  configs.emplace_back();
  config = &configs.back();
  config->start = 0x20000;
  config->end = 0x2A6DF + 1;
  config->role = TokenizationCodepointRange_::Role_TOKEN_SEPARATOR;
  configs.emplace_back();
  config = &configs.back();
  config->start = 0x2A700;
  config->end = 0x2B73F + 1;
  config->role = TokenizationCodepointRange_::Role_TOKEN_SEPARATOR;
  configs.emplace_back();
  config = &configs.back();
  config->start = 0x2B740;
  config->end = 0x2B81F + 1;
  config->role = TokenizationCodepointRange_::Role_TOKEN_SEPARATOR;
  configs.emplace_back();
  config = &configs.back();
  config->start = 0x2B820;
  config->end = 0x2CEAF + 1;
  config->role = TokenizationCodepointRange_::Role_TOKEN_SEPARATOR;
  configs.emplace_back();
  config = &configs.back();
  config->start = 0x2CEB0;
  config->end = 0x2EBEF + 1;
  config->role = TokenizationCodepointRange_::Role_TOKEN_SEPARATOR;
  configs.emplace_back();
  config = &configs.back();
  config->start = 0x2F800;
  config->end = 0x2FA1F + 1;
  config->role = TokenizationCodepointRange_::Role_TOKEN_SEPARATOR;

  // Thai.
  // 0E00..0E7F; Thai
  configs.emplace_back();
  config = &configs.back();
  config->start = 0x0E00;
  config->end = 0x0E7F + 1;
  config->role = TokenizationCodepointRange_::Role_TOKEN_SEPARATOR;

  TestingTokenizerProxy tokenizer(TokenizationType_INTERNAL_TOKENIZER, configs,
                                  {},
                                  /*split_on_script_change=*/false,
                                  /*icu_preserve_whitespace_tokens=*/false,
                                  /*preserve_floating_numbers=*/false);
  std::vector<Token> tokens;

  tokens = tokenizer.Tokenize(
      "問少目木輸走猶術権自京門録球変。細開括省用掲情結傍走愛明氷。");
  EXPECT_EQ(tokens.size(), 30);

  tokens = tokenizer.Tokenize("問少目 hello 木輸ยามきゃ");
  // clang-format off
  EXPECT_THAT(
      tokens,
      ElementsAreArray({Token("問", 0, 1),
                        Token("少", 1, 2),
                        Token("目", 2, 3),
                        Token("hello", 4, 9),
                        Token("木", 10, 11),
                        Token("輸", 11, 12),
                        Token("ย", 12, 13),
                        Token("า", 13, 14),
                        Token("ม", 14, 15),
                        Token("き", 15, 16),
                        Token("ゃ", 16, 17)}));
  // clang-format on
}

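// The tests below exercise the ICU-backed tokenizer and are compiled only
// where an ICU implementation is available under the guards below.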
#if defined(TC3_TEST_ICU) || defined(__APPLE__)
TEST(TokenizerTest, ICUTokenizeWithWhitespaces) {
  TestingTokenizerProxy tokenizer(TokenizationType_ICU, {}, {},
                                  /*split_on_script_change=*/false,
                                  /*icu_preserve_whitespace_tokens=*/true,
                                  /*preserve_floating_numbers=*/false);
  std::vector<Token> tokens = tokenizer.Tokenize("พระบาท สมเด็จ พระ ปร มิ");
  // clang-format off
  ASSERT_EQ(tokens,
            std::vector<Token>({Token("พระบาท", 0, 6),
                                Token(" ", 6, 7),
                                Token("สมเด็จ", 7, 13),
                                Token(" ", 13, 14),
                                Token("พระ", 14, 17),
                                Token(" ", 17, 18),
                                Token("ปร", 18, 20),
                                Token(" ", 20, 21),
                                Token("มิ", 21, 23)}));
  // clang-format on
}

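// With whitespace preservation on, ICU tokenization emits each punctuation
// character as its own token and keeps the spaces between them.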
TEST(TokenizerTest, ICUTokenizePunctuation) {
  TestingTokenizerProxy tokenizer(TokenizationType_ICU, {}, {},
                                  /*split_on_script_change=*/false,
                                  /*icu_preserve_whitespace_tokens=*/true,
                                  /*preserve_floating_numbers=*/false);
  std::vector<Token> tokens =
      tokenizer.Tokenize("The interval is: -(12, 138*)");
  // clang-format off
  ASSERT_EQ(
      tokens,
      std::vector<Token>({Token("The", 0, 3),
                          Token(" ", 3, 4),
                          Token("interval", 4, 12),
                          Token(" ", 12, 13),
                          Token("is", 13, 15),
                          Token(":", 15, 16),
                          Token(" ", 16, 17),
                          Token("-", 17, 18),
                          Token("(", 18, 19),
                          Token("12", 19, 21),
                          Token(",", 21, 22),
                          Token(" ", 22, 23),
                          Token("138", 23, 26),
                          Token("*", 26, 27),
                          Token(")", 27, 28)}));
  // clang-format on
}

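// ICU keeps digit-dot-digit sequences together as one token, including ones
// written with the small full stop U+FE52 ("﹒").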
TEST(TokenizerTest, ICUTokenizeWithNumbers) {
  TestingTokenizerProxy tokenizer(TokenizationType_ICU, {}, {},
                                  /*split_on_script_change=*/false,
                                  /*icu_preserve_whitespace_tokens=*/true,
                                  /*preserve_floating_numbers=*/false);
  std::vector<Token> tokens = tokenizer.Tokenize("3.1 3﹒2 3.3");
  // clang-format off
  ASSERT_EQ(tokens,
            std::vector<Token>({Token("3.1", 0, 3),
                                Token(" ", 3, 4),
                                Token("3﹒2", 4, 7),
                                Token(" ", 7, 8),
                                Token("3.3", 8, 11)}));
  // clang-format on
}
#endif

#if defined(TC3_TEST_ICU)
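// Thai is written without spaces between words; ICU's dictionary-based break
// iteration is expected to find the word boundaries.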
TEST(TokenizerTest, ICUTokenize) {
  TestingTokenizerProxy tokenizer(TokenizationType_ICU, {}, {},
                                  /*split_on_script_change=*/false,
                                  /*icu_preserve_whitespace_tokens=*/false,
                                  /*preserve_floating_numbers=*/false);
  std::vector<Token> tokens = tokenizer.Tokenize("พระบาทสมเด็จพระปรมิ");
  // clang-format off
  ASSERT_EQ(tokens,
            std::vector<Token>({Token("พระบาท", 0, 6),
                                Token("สมเด็จ", 6, 12),
                                Token("พระ", 12, 15),
                                Token("ปร", 15, 17),
                                Token("มิ", 17, 19)}));
  // clang-format on
}

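// Mixed tokenization: codepoints covered by the internal ranges (here 0..591,
// i.e. the Latin blocks) are handled by the internal tokenizer, while the
// rest of the input falls back to ICU.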
TEST(TokenizerTest, MixedTokenize) {
  std::vector<TokenizationCodepointRangeT> configs;
  TokenizationCodepointRangeT* config;

  configs.emplace_back();
  config = &configs.back();
  config->start = 32;
  config->end = 33;
  config->role = TokenizationCodepointRange_::Role_WHITESPACE_SEPARATOR;

  std::vector<CodepointRangeT> internal_configs;
  CodepointRangeT* internal_config;

  internal_configs.emplace_back();
  internal_config = &internal_configs.back();
  internal_config->start = 0;
  internal_config->end = 128;

  internal_configs.emplace_back();
  internal_config = &internal_configs.back();
  internal_config->start = 128;
  internal_config->end = 256;

  internal_configs.emplace_back();
  internal_config = &internal_configs.back();
  internal_config->start = 256;
  internal_config->end = 384;

  internal_configs.emplace_back();
  internal_config = &internal_configs.back();
  internal_config->start = 384;
  internal_config->end = 592;

  TestingTokenizerProxy tokenizer(TokenizationType_MIXED, configs,
                                  internal_configs,
                                  /*split_on_script_change=*/false,
                                  /*icu_preserve_whitespace_tokens=*/false,
                                  /*preserve_floating_numbers=*/false);

  std::vector<Token> tokens = tokenizer.Tokenize(
      "こんにちはJapanese-ląnguagę text 你好世界 http://www.google.com/");
  ASSERT_EQ(
      tokens,
      // clang-format off
      std::vector<Token>({Token("こんにちは", 0, 5),
                          Token("Japanese-ląnguagę", 5, 22),
                          Token("text", 23, 27),
                          Token("你好", 28, 30),
                          Token("世界", 30, 32),
                          Token("http://www.google.com/", 33, 55)}));
  // clang-format on
}

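// The same input splits differently depending on split_on_script_change:
// Hangul and digits stay fused in one token when it is off and are separated
// when it is on.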
TEST(TokenizerTest, InternalTokenizeOnScriptChange) {
  std::vector<TokenizationCodepointRangeT> configs;
  TokenizationCodepointRangeT* config;

  configs.emplace_back();
  config = &configs.back();
  config->start = 0;
  config->end = 256;
  config->role = TokenizationCodepointRange_::Role_DEFAULT_ROLE;

  {
    TestingTokenizerProxy tokenizer(TokenizationType_INTERNAL_TOKENIZER,
                                    configs, {},
                                    /*split_on_script_change=*/false,
                                    /*icu_preserve_whitespace_tokens=*/false,
                                    /*preserve_floating_numbers=*/false);

    EXPECT_EQ(tokenizer.Tokenize("앨라배마123웹사이트"),
              std::vector<Token>({Token("앨라배마123웹사이트", 0, 11)}));
  }

  {
    TestingTokenizerProxy tokenizer(TokenizationType_INTERNAL_TOKENIZER,
                                    configs, {},
                                    /*split_on_script_change=*/true,
                                    /*icu_preserve_whitespace_tokens=*/false,
                                    /*preserve_floating_numbers=*/false);
    EXPECT_EQ(tokenizer.Tokenize("앨라배마123웹사이트"),
              std::vector<Token>({Token("앨라배마", 0, 4), Token("123", 4, 7),
                                  Token("웹사이트", 7, 11)}));
  }
}
#endif

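// LETTER_DIGIT tokenization splits the input into runs of letters, runs of
// digits, runs of whitespace, and single punctuation characters. With
// preserve_floating_numbers set, a digit-dot-digit sequence such as "3.14"
// stays one token.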
TEST(TokenizerTest, LetterDigitTokenize) {
  TestingTokenizerProxy tokenizer(TokenizationType_LETTER_DIGIT, {}, {},
                                  /*split_on_script_change=*/false,
                                  /*icu_preserve_whitespace_tokens=*/false,
                                  /*preserve_floating_numbers=*/true);
  std::vector<Token> tokens = tokenizer.Tokenize("7% -3.14 68.9#? 7% $99 .18.");
  ASSERT_EQ(tokens,
            std::vector<Token>(
                {Token("7", 0, 1), Token("%", 1, 2), Token(" ", 2, 3),
                 Token("-", 3, 4), Token("3.14", 4, 8), Token(" ", 8, 9),
                 Token("68.9", 9, 13), Token("#", 13, 14), Token("?", 14, 15),
                 Token(" ", 15, 16), Token("7", 16, 17), Token("%", 17, 18),
                 Token(" ", 18, 19), Token("$", 19, 20), Token("99", 20, 22),
                 Token(" ", 22, 23), Token(".", 23, 24), Token("18", 24, 26),
                 Token(".", 26, 27)}));
}

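// Token offsets count Unicode codepoints, not bytes: "pércént" spans seven
// codepoints even though its UTF-8 encoding is longer.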
TEST(TokenizerTest, LetterDigitTokenizeUnicode) {
  TestingTokenizerProxy tokenizer(TokenizationType_LETTER_DIGIT, {}, {},
                                  /*split_on_script_change=*/false,
                                  /*icu_preserve_whitespace_tokens=*/false,
                                  /*preserve_floating_numbers=*/true);
  std::vector<Token> tokens = tokenizer.Tokenize("2 pércént 3パーセント");
  ASSERT_EQ(tokens, std::vector<Token>({Token("2", 0, 1), Token(" ", 1, 2),
                                        Token("pércént", 2, 9),
                                        Token(" ", 9, 10), Token("3", 10, 11),
                                        Token("パーセント", 11, 16)}));
}

TEST(TokenizerTest, LetterDigitTokenizeWithDots) {
  TestingTokenizerProxy tokenizer(TokenizationType_LETTER_DIGIT, {}, {},
                                  /*split_on_script_change=*/false,
                                  /*icu_preserve_whitespace_tokens=*/false,
                                  /*preserve_floating_numbers=*/true);
  std::vector<Token> tokens = tokenizer.Tokenize("3 3﹒2 3.3%");
  ASSERT_EQ(tokens,
            std::vector<Token>({Token("3", 0, 1), Token(" ", 1, 2),
                                Token("3﹒2", 2, 5), Token(" ", 5, 6),
                                Token("3.3", 6, 9), Token("%", 9, 10)}));
}

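// With preserve_floating_numbers off, "3.2" breaks apart into digit, dot and
// digit tokens, which also keeps dotted dates such as "15.12.2019" split.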
TEST(TokenizerTest, LetterDigitTokenizeDoNotPreserveFloatingNumbers) {
  TestingTokenizerProxy tokenizer(TokenizationType_LETTER_DIGIT, {}, {},
                                  /*split_on_script_change=*/false,
                                  /*icu_preserve_whitespace_tokens=*/false,
                                  /*preserve_floating_numbers=*/false);
  std::vector<Token> tokens = tokenizer.Tokenize("15.12.2019 january's 3.2");
  ASSERT_EQ(tokens,
            std::vector<Token>(
                {Token("15", 0, 2), Token(".", 2, 3), Token("12", 3, 5),
                 Token(".", 5, 6), Token("2019", 6, 10), Token(" ", 10, 11),
                 Token("january", 11, 18), Token("'", 18, 19),
                 Token("s", 19, 20), Token(" ", 20, 21), Token("3", 21, 22),
                 Token(".", 22, 23), Token("2", 23, 24)}));
}

TEST(TokenizerTest, LetterDigitTokenizeStrangeStringFloatingNumbers) {
  TestingTokenizerProxy tokenizer(TokenizationType_LETTER_DIGIT, {}, {},
                                  /*split_on_script_change=*/false,
                                  /*icu_preserve_whitespace_tokens=*/false,
                                  /*preserve_floating_numbers=*/false);
  std::vector<Token> tokens = tokenizer.Tokenize("The+2345++the +íí+");
  ASSERT_EQ(tokens,
            std::vector<Token>({Token("The", 0, 3), Token("+", 3, 4),
                                Token("2345", 4, 8), Token("+", 8, 9),
                                Token("+", 9, 10), Token("the", 10, 13),
                                Token(" ", 13, 14), Token("+", 14, 15),
                                Token("íí", 15, 17), Token("+", 17, 18)}));
}

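// A run of consecutive spaces is emitted as one whitespace token covering the
// whole run.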
TEST(TokenizerTest, LetterDigitTokenizeWhitespacesInSameToken) {
  TestingTokenizerProxy tokenizer(TokenizationType_LETTER_DIGIT, {}, {},
                                  /*split_on_script_change=*/false,
                                  /*icu_preserve_whitespace_tokens=*/false,
                                  /*preserve_floating_numbers=*/false);
  std::vector<Token> tokens = tokenizer.Tokenize("2 3  4   5");
  ASSERT_EQ(tokens, std::vector<Token>({Token("2", 0, 1), Token(" ", 1, 2),
                                        Token("3", 2, 3), Token("  ", 3, 5),
                                        Token("4", 5, 6), Token("   ", 6, 9),
                                        Token("5", 9, 10)}));
}

}  // namespace
}  // namespace libtextclassifier3