/*
 * Copyright (C) 2018 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include "utils/tokenizer.h"

#include <memory>
#include <string>
#include <vector>

#include "gmock/gmock.h"
#include "gtest/gtest.h"

namespace libtextclassifier3 {
namespace {

using testing::ElementsAreArray;

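// Test-only subclass that re-exports the protected
// Tokenizer::FindTokenizationRange so the tests below can call it directly.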
class TestingTokenizer : public Tokenizer {
 public:
  TestingTokenizer(
      const TokenizationType type, const UniLib* unilib,
      const std::vector<const TokenizationCodepointRange*>& codepoint_ranges,
      const std::vector<const CodepointRange*>&
          internal_tokenizer_codepoint_ranges,
      const bool split_on_script_change,
      const bool icu_preserve_whitespace_tokens,
      const bool preserve_floating_numbers)
      : Tokenizer(type, unilib, codepoint_ranges,
                  internal_tokenizer_codepoint_ranges, split_on_script_change,
                  icu_preserve_whitespace_tokens, preserve_floating_numbers) {}

  using Tokenizer::FindTokenizationRange;
};

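// Convenience wrapper around TestingTokenizer: serializes the given
// object-API configs (TokenizationCodepointRangeT / CodepointRangeT) into
// flatbuffers and constructs the tokenizer over the serialized data.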
class TestingTokenizerProxy {
 public:
  TestingTokenizerProxy(
      TokenizationType type,
      const std::vector<TokenizationCodepointRangeT>& codepoint_range_configs,
      const std::vector<CodepointRangeT>& internal_codepoint_range_configs,
      const bool split_on_script_change,
      const bool icu_preserve_whitespace_tokens,
      const bool preserve_floating_numbers)
      : INIT_UNILIB_FOR_TESTING(unilib_) {
    const int num_configs = codepoint_range_configs.size();
    std::vector<const TokenizationCodepointRange*> configs_fb;
    configs_fb.reserve(num_configs);
    const int num_internal_configs = internal_codepoint_range_configs.size();
    std::vector<const CodepointRange*> internal_configs_fb;
    internal_configs_fb.reserve(num_internal_configs);
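    // Serialize each config into its own flatbuffer; buffers_ owns the
    // serialized data while configs_fb / internal_configs_fb hold root
    // pointers into it.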
    buffers_.reserve(num_configs + num_internal_configs);
    for (int i = 0; i < num_configs; i++) {
      flatbuffers::FlatBufferBuilder builder;
      builder.Finish(CreateTokenizationCodepointRange(
          builder, &codepoint_range_configs[i]));
      buffers_.push_back(builder.Release());
      configs_fb.push_back(flatbuffers::GetRoot<TokenizationCodepointRange>(
          buffers_.back().data()));
    }
    for (int i = 0; i < num_internal_configs; i++) {
      flatbuffers::FlatBufferBuilder builder;
      builder.Finish(
          CreateCodepointRange(builder, &internal_codepoint_range_configs[i]));
      buffers_.push_back(builder.Release());
      internal_configs_fb.push_back(
          flatbuffers::GetRoot<CodepointRange>(buffers_.back().data()));
    }
    tokenizer_ = std::unique_ptr<TestingTokenizer>(new TestingTokenizer(
        type, &unilib_, configs_fb, internal_configs_fb, split_on_script_change,
        icu_preserve_whitespace_tokens, preserve_floating_numbers));
  }

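  // Returns the role of the range that contains codepoint |c|, or
  // DEFAULT_ROLE if no configured range contains it.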
  TokenizationCodepointRange_::Role TestFindTokenizationRole(int c) const {
    const TokenizationCodepointRangeT* range =
        tokenizer_->FindTokenizationRange(c);
    if (range != nullptr) {
      return range->role;
    } else {
      return TokenizationCodepointRange_::Role_DEFAULT_ROLE;
    }
  }
93 
Tokenize(const std::string & utf8_text) const94   std::vector<Token> Tokenize(const std::string& utf8_text) const {
95     return tokenizer_->Tokenize(utf8_text);
96   }
97 
98  private:
99   UniLib unilib_;
100   std::vector<flatbuffers::DetachedBuffer> buffers_;
101   std::unique_ptr<TestingTokenizer> tokenizer_;
102 };
103 
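// Verifies codepoint-to-role lookup, including that the configured ranges are
// half-open, i.e. [start, end).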
TEST(TokenizerTest, FindTokenizationRange) {
  std::vector<TokenizationCodepointRangeT> configs;
  TokenizationCodepointRangeT* config;

  configs.emplace_back();
  config = &configs.back();
  config->start = 0;
  config->end = 10;
  config->role = TokenizationCodepointRange_::Role_TOKEN_SEPARATOR;

  configs.emplace_back();
  config = &configs.back();
  config->start = 32;
  config->end = 33;
  config->role = TokenizationCodepointRange_::Role_WHITESPACE_SEPARATOR;

  configs.emplace_back();
  config = &configs.back();
  config->start = 1234;
  config->end = 12345;
  config->role = TokenizationCodepointRange_::Role_TOKEN_SEPARATOR;

  TestingTokenizerProxy tokenizer(TokenizationType_INTERNAL_TOKENIZER, configs,
                                  {}, /*split_on_script_change=*/false,
                                  /*icu_preserve_whitespace_tokens=*/false,
                                  /*preserve_floating_numbers=*/false);

  // Test hits to the first group.
  EXPECT_EQ(tokenizer.TestFindTokenizationRole(0),
            TokenizationCodepointRange_::Role_TOKEN_SEPARATOR);
  EXPECT_EQ(tokenizer.TestFindTokenizationRole(5),
            TokenizationCodepointRange_::Role_TOKEN_SEPARATOR);
  EXPECT_EQ(tokenizer.TestFindTokenizationRole(10),
            TokenizationCodepointRange_::Role_DEFAULT_ROLE);

  // Test a hit to the second group and both of its boundaries.
  EXPECT_EQ(tokenizer.TestFindTokenizationRole(31),
            TokenizationCodepointRange_::Role_DEFAULT_ROLE);
  EXPECT_EQ(tokenizer.TestFindTokenizationRole(32),
            TokenizationCodepointRange_::Role_WHITESPACE_SEPARATOR);
  EXPECT_EQ(tokenizer.TestFindTokenizationRole(33),
            TokenizationCodepointRange_::Role_DEFAULT_ROLE);

  // Test hits to the third group.
  EXPECT_EQ(tokenizer.TestFindTokenizationRole(1233),
            TokenizationCodepointRange_::Role_DEFAULT_ROLE);
  EXPECT_EQ(tokenizer.TestFindTokenizationRole(1234),
            TokenizationCodepointRange_::Role_TOKEN_SEPARATOR);
  EXPECT_EQ(tokenizer.TestFindTokenizationRole(12344),
            TokenizationCodepointRange_::Role_TOKEN_SEPARATOR);
  EXPECT_EQ(tokenizer.TestFindTokenizationRole(12345),
            TokenizationCodepointRange_::Role_DEFAULT_ROLE);

  // Test a codepoint that lies between the configured ranges.
  EXPECT_EQ(tokenizer.TestFindTokenizationRole(99),
            TokenizationCodepointRange_::Role_DEFAULT_ROLE);
}

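// Splitting on the space character only; punctuation stays attached to the
// preceding token.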
TEST(TokenizerTest, TokenizeOnSpace) {
  std::vector<TokenizationCodepointRangeT> configs;
  TokenizationCodepointRangeT* config;

  configs.emplace_back();
  config = &configs.back();
  // Space character.
  config->start = 32;
  config->end = 33;
  config->role = TokenizationCodepointRange_::Role_WHITESPACE_SEPARATOR;

  TestingTokenizerProxy tokenizer(TokenizationType_INTERNAL_TOKENIZER, configs,
                                  {},
                                  /*split_on_script_change=*/false,
                                  /*icu_preserve_whitespace_tokens=*/false,
                                  /*preserve_floating_numbers=*/false);
  std::vector<Token> tokens = tokenizer.Tokenize("Hello world!");

  EXPECT_THAT(tokens,
              ElementsAreArray({Token("Hello", 0, 5), Token("world!", 6, 12)}));
}

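// With split_on_script_change=true a token boundary is also inserted at every
// script change. Note that token offsets are in Unicode codepoints, not
// bytes.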
TEST(TokenizerTest, TokenizeOnSpaceAndScriptChange) {
  std::vector<TokenizationCodepointRangeT> configs;
  TokenizationCodepointRangeT* config;

  // Latin.
  configs.emplace_back();
  config = &configs.back();
  config->start = 0;
  config->end = 32;
  config->role = TokenizationCodepointRange_::Role_DEFAULT_ROLE;
  config->script_id = 1;
  configs.emplace_back();
  config = &configs.back();
  config->start = 32;
  config->end = 33;
  config->role = TokenizationCodepointRange_::Role_WHITESPACE_SEPARATOR;
  config->script_id = 1;
  configs.emplace_back();
  config = &configs.back();
  config->start = 33;
  config->end = 0x77F + 1;
  config->role = TokenizationCodepointRange_::Role_DEFAULT_ROLE;
  config->script_id = 1;

  TestingTokenizerProxy tokenizer(TokenizationType_INTERNAL_TOKENIZER, configs,
                                  {},
                                  /*split_on_script_change=*/true,
                                  /*icu_preserve_whitespace_tokens=*/false,
                                  /*preserve_floating_numbers=*/false);
  EXPECT_THAT(tokenizer.Tokenize("앨라배마 주 전화(123) 456-789웹사이트"),
              std::vector<Token>({Token("앨라배마", 0, 4), Token("주", 5, 6),
                                  Token("전화", 7, 10), Token("(123)", 10, 15),
                                  Token("456-789", 16, 23),
                                  Token("웹사이트", 23, 28)}));
}

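// A realistic internal-tokenizer setup: the CJK and Thai ranges are marked
// TOKEN_SEPARATOR and therefore produce one token per codepoint, while Latin
// text splits only on whitespace.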
TEST(TokenizerTest, TokenizeComplex) {
  std::vector<TokenizationCodepointRangeT> configs;
  TokenizationCodepointRangeT* config;

  // Source: http://www.unicode.org/Public/10.0.0/ucd/Blocks-10.0.0d1.txt
  // Latin - Cyrillic.
  //   0000..007F; Basic Latin
  //   0080..00FF; Latin-1 Supplement
  //   0100..017F; Latin Extended-A
  //   0180..024F; Latin Extended-B
  //   0250..02AF; IPA Extensions
  //   02B0..02FF; Spacing Modifier Letters
  //   0300..036F; Combining Diacritical Marks
  //   0370..03FF; Greek and Coptic
  //   0400..04FF; Cyrillic
  //   0500..052F; Cyrillic Supplement
  //   0530..058F; Armenian
  //   0590..05FF; Hebrew
  //   0600..06FF; Arabic
  //   0700..074F; Syriac
  //   0750..077F; Arabic Supplement
  configs.emplace_back();
  config = &configs.back();
  config->start = 0;
  config->end = 32;
  config->role = TokenizationCodepointRange_::Role_DEFAULT_ROLE;
  configs.emplace_back();
  config = &configs.back();
  config->start = 32;
  config->end = 33;
  config->role = TokenizationCodepointRange_::Role_WHITESPACE_SEPARATOR;
  configs.emplace_back();
  config = &configs.back();
  config->start = 33;
  config->end = 0x77F + 1;
  config->role = TokenizationCodepointRange_::Role_DEFAULT_ROLE;

  // CJK
  // 2E80..2EFF; CJK Radicals Supplement
  // 3000..303F; CJK Symbols and Punctuation
  // 3040..309F; Hiragana
  // 30A0..30FF; Katakana
  // 3100..312F; Bopomofo
  // 3130..318F; Hangul Compatibility Jamo
  // 3190..319F; Kanbun
  // 31A0..31BF; Bopomofo Extended
  // 31C0..31EF; CJK Strokes
  // 31F0..31FF; Katakana Phonetic Extensions
  // 3200..32FF; Enclosed CJK Letters and Months
  // 3300..33FF; CJK Compatibility
  // 3400..4DBF; CJK Unified Ideographs Extension A
  // 4DC0..4DFF; Yijing Hexagram Symbols
  // 4E00..9FFF; CJK Unified Ideographs
  // A000..A48F; Yi Syllables
  // A490..A4CF; Yi Radicals
  // A4D0..A4FF; Lisu
  // A500..A63F; Vai
  // F900..FAFF; CJK Compatibility Ideographs
  // FE30..FE4F; CJK Compatibility Forms
  // 20000..2A6DF; CJK Unified Ideographs Extension B
  // 2A700..2B73F; CJK Unified Ideographs Extension C
  // 2B740..2B81F; CJK Unified Ideographs Extension D
  // 2B820..2CEAF; CJK Unified Ideographs Extension E
  // 2CEB0..2EBEF; CJK Unified Ideographs Extension F
  // 2F800..2FA1F; CJK Compatibility Ideographs Supplement
  configs.emplace_back();
  config = &configs.back();
  config->start = 0x2E80;
  config->end = 0x2EFF + 1;
  config->role = TokenizationCodepointRange_::Role_TOKEN_SEPARATOR;
  configs.emplace_back();
  config = &configs.back();
  config->start = 0x3000;
  config->end = 0xA63F + 1;
  config->role = TokenizationCodepointRange_::Role_TOKEN_SEPARATOR;
  configs.emplace_back();
  config = &configs.back();
  config->start = 0xF900;
  config->end = 0xFAFF + 1;
  config->role = TokenizationCodepointRange_::Role_TOKEN_SEPARATOR;
  configs.emplace_back();
  config = &configs.back();
  config->start = 0xFE30;
  config->end = 0xFE4F + 1;
  config->role = TokenizationCodepointRange_::Role_TOKEN_SEPARATOR;
  configs.emplace_back();
  config = &configs.back();
  config->start = 0x20000;
  config->end = 0x2A6DF + 1;
  config->role = TokenizationCodepointRange_::Role_TOKEN_SEPARATOR;
  configs.emplace_back();
  config = &configs.back();
  config->start = 0x2A700;
  config->end = 0x2B73F + 1;
  config->role = TokenizationCodepointRange_::Role_TOKEN_SEPARATOR;
  configs.emplace_back();
  config = &configs.back();
  config->start = 0x2B740;
  config->end = 0x2B81F + 1;
  config->role = TokenizationCodepointRange_::Role_TOKEN_SEPARATOR;
  configs.emplace_back();
  config = &configs.back();
  config->start = 0x2B820;
  config->end = 0x2CEAF + 1;
  config->role = TokenizationCodepointRange_::Role_TOKEN_SEPARATOR;
  configs.emplace_back();
  config = &configs.back();
  config->start = 0x2CEB0;
  config->end = 0x2EBEF + 1;
  config->role = TokenizationCodepointRange_::Role_TOKEN_SEPARATOR;
  configs.emplace_back();
  config = &configs.back();
  config->start = 0x2F800;
  config->end = 0x2FA1F + 1;
  config->role = TokenizationCodepointRange_::Role_TOKEN_SEPARATOR;

  // Thai.
  // 0E00..0E7F; Thai
  configs.emplace_back();
  config = &configs.back();
  config->start = 0x0E00;
  config->end = 0x0E7F + 1;
  config->role = TokenizationCodepointRange_::Role_TOKEN_SEPARATOR;

  TestingTokenizerProxy tokenizer(TokenizationType_INTERNAL_TOKENIZER, configs,
                                  {},
                                  /*split_on_script_change=*/false,
                                  /*icu_preserve_whitespace_tokens=*/false,
                                  /*preserve_floating_numbers=*/false);
  std::vector<Token> tokens;

  tokens = tokenizer.Tokenize(
      "問少目木輸走猶術権自京門録球変。細開括省用掲情結傍走愛明氷。");
  EXPECT_EQ(tokens.size(), 30);

  tokens = tokenizer.Tokenize("問少目 hello 木輸ยามきゃ");
  // clang-format off
  EXPECT_THAT(
      tokens,
      ElementsAreArray({Token("問", 0, 1),
                        Token("少", 1, 2),
                        Token("目", 2, 3),
                        Token("hello", 4, 9),
                        Token("木", 10, 11),
                        Token("輸", 11, 12),
                        Token("ย", 12, 13),
                        Token("า", 13, 14),
                        Token("ม", 14, 15),
                        Token("き", 15, 16),
                        Token("ゃ", 16, 17)}));
  // clang-format on
}

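// The ICU-based tests below run only when an ICU (or Apple) UniLib backend is
// compiled in.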
#if defined(TC3_TEST_ICU) || defined(__APPLE__)
TEST(TokenizerTest, ICUTokenizeWithWhitespaces) {
  TestingTokenizerProxy tokenizer(TokenizationType_ICU, {}, {},
                                  /*split_on_script_change=*/false,
                                  /*icu_preserve_whitespace_tokens=*/true,
                                  /*preserve_floating_numbers=*/false);
  std::vector<Token> tokens = tokenizer.Tokenize("พระบาท สมเด็จ พระ ปร มิ");
  // clang-format off
  ASSERT_EQ(tokens,
            std::vector<Token>({Token("พระบาท", 0, 6),
                                Token(" ", 6, 7),
                                Token("สมเด็จ", 7, 13),
                                Token(" ", 13, 14),
                                Token("พระ", 14, 17),
                                Token(" ", 17, 18),
                                Token("ปร", 18, 20),
                                Token(" ", 20, 21),
                                Token("มิ", 21, 23)}));
  // clang-format on
}

TEST(TokenizerTest, ICUTokenizePunctuation) {
  TestingTokenizerProxy tokenizer(TokenizationType_ICU, {}, {},
                                  /*split_on_script_change=*/false,
                                  /*icu_preserve_whitespace_tokens=*/true,
                                  /*preserve_floating_numbers=*/false);
  std::vector<Token> tokens =
      tokenizer.Tokenize("The interval is: -(12, 138*)");
  // clang-format off
  ASSERT_EQ(tokens,
            std::vector<Token>({Token("The", 0, 3),
                                Token(" ", 3, 4),
                                Token("interval", 4, 12),
                                Token(" ", 12, 13),
                                Token("is", 13, 15),
                                Token(":", 15, 16),
                                Token(" ", 16, 17),
                                Token("-", 17, 18),
                                Token("(", 18, 19),
                                Token("12", 19, 21),
                                Token(",", 21, 22),
                                Token(" ", 22, 23),
                                Token("138", 23, 26),
                                Token("*", 26, 27),
                                Token(")", 27, 28)}));
  // clang-format on
}

TEST(TokenizerTest, ICUTokenizeWithNumbers) {
  TestingTokenizerProxy tokenizer(TokenizationType_ICU, {}, {},
                                  /*split_on_script_change=*/false,
                                  /*icu_preserve_whitespace_tokens=*/true,
                                  /*preserve_floating_numbers=*/false);
  std::vector<Token> tokens = tokenizer.Tokenize("3.1 3﹒2 3.3");
  // clang-format off
  ASSERT_EQ(tokens,
            std::vector<Token>({Token("3.1", 0, 3),
                                Token(" ", 3, 4),
                                Token("3﹒2", 4, 7),
                                Token(" ", 7, 8),
                                Token("3.3", 8, 11)}));
  // clang-format on
}
#endif

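// The remaining tests are compiled only with full ICU support.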
#if defined(TC3_TEST_ICU)
TEST(TokenizerTest, ICUTokenize) {
  TestingTokenizerProxy tokenizer(TokenizationType_ICU, {}, {},
                                  /*split_on_script_change=*/false,
                                  /*icu_preserve_whitespace_tokens=*/false,
                                  /*preserve_floating_numbers=*/false);
  std::vector<Token> tokens = tokenizer.Tokenize("พระบาทสมเด็จพระปรมิ");
  // clang-format off
  ASSERT_EQ(tokens,
            std::vector<Token>({Token("พระบาท", 0, 6),
                                Token("สมเด็จ", 6, 12),
                                Token("พระ", 12, 15),
                                Token("ปร", 15, 17),
                                Token("มิ", 17, 19)}));
  // clang-format on
}

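// MIXED tokenization: codepoints covered by the internal ranges (here 0..591,
// i.e. Latin) go through the internal tokenizer, everything else is handled
// by ICU.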
TEST(TokenizerTest, MixedTokenize) {
  std::vector<TokenizationCodepointRangeT> configs;
  TokenizationCodepointRangeT* config;

  configs.emplace_back();
  config = &configs.back();
  config->start = 32;
  config->end = 33;
  config->role = TokenizationCodepointRange_::Role_WHITESPACE_SEPARATOR;

  std::vector<CodepointRangeT> internal_configs;
  CodepointRangeT* internal_config;

  internal_configs.emplace_back();
  internal_config = &internal_configs.back();
  internal_config->start = 0;
  internal_config->end = 128;

  internal_configs.emplace_back();
  internal_config = &internal_configs.back();
  internal_config->start = 128;
  internal_config->end = 256;

  internal_configs.emplace_back();
  internal_config = &internal_configs.back();
  internal_config->start = 256;
  internal_config->end = 384;

  internal_configs.emplace_back();
  internal_config = &internal_configs.back();
  internal_config->start = 384;
  internal_config->end = 592;

  TestingTokenizerProxy tokenizer(TokenizationType_MIXED, configs,
                                  internal_configs,
                                  /*split_on_script_change=*/false,
                                  /*icu_preserve_whitespace_tokens=*/false,
                                  /*preserve_floating_numbers=*/false);

  std::vector<Token> tokens = tokenizer.Tokenize(
      "こんにちはJapanese-ląnguagę text 你好世界 http://www.google.com/");
  ASSERT_EQ(
      tokens,
      // clang-format off
      std::vector<Token>({Token("こんにちは", 0, 5),
                          Token("Japanese-ląnguagę", 5, 22),
                          Token("text", 23, 27),
                          Token("你好", 28, 30),
                          Token("世界", 30, 32),
                          Token("http://www.google.com/", 33, 55)}));
  // clang-format on
}

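// The same input is one token without script-change splitting, but splits at
// the Hangul/digit boundaries when split_on_script_change is true.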
TEST(TokenizerTest, InternalTokenizeOnScriptChange) {
  std::vector<TokenizationCodepointRangeT> configs;
  TokenizationCodepointRangeT* config;

  configs.emplace_back();
  config = &configs.back();
  config->start = 0;
  config->end = 256;
  config->role = TokenizationCodepointRange_::Role_DEFAULT_ROLE;

  {
    TestingTokenizerProxy tokenizer(TokenizationType_INTERNAL_TOKENIZER,
                                    configs, {},
                                    /*split_on_script_change=*/false,
                                    /*icu_preserve_whitespace_tokens=*/false,
                                    /*preserve_floating_numbers=*/false);

    EXPECT_EQ(tokenizer.Tokenize("앨라배마123웹사이트"),
              std::vector<Token>({Token("앨라배마123웹사이트", 0, 11)}));
  }

  {
    TestingTokenizerProxy tokenizer(TokenizationType_INTERNAL_TOKENIZER,
                                    configs, {},
                                    /*split_on_script_change=*/true,
                                    /*icu_preserve_whitespace_tokens=*/false,
                                    /*preserve_floating_numbers=*/false);
    EXPECT_EQ(tokenizer.Tokenize("앨라배마123웹사이트"),
              std::vector<Token>({Token("앨라배마", 0, 4), Token("123", 4, 7),
                                  Token("웹사이트", 7, 11)}));
  }
}
#endif

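// LETTER_DIGIT tokenization: consecutive letters, consecutive digits, and
// runs of whitespace group into single tokens; any other character becomes a
// token of its own. With preserve_floating_numbers=true, digit sequences
// joined by a decimal dot (e.g. "3.14") stay together.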
TEST(TokenizerTest, LetterDigitTokenize) {
  TestingTokenizerProxy tokenizer(TokenizationType_LETTER_DIGIT, {}, {},
                                  /*split_on_script_change=*/false,
                                  /*icu_preserve_whitespace_tokens=*/false,
                                  /*preserve_floating_numbers=*/true);
  std::vector<Token> tokens = tokenizer.Tokenize("7% -3.14 68.9#? 7% $99 .18.");
  ASSERT_EQ(tokens,
            std::vector<Token>(
                {Token("7", 0, 1), Token("%", 1, 2), Token(" ", 2, 3),
                 Token("-", 3, 4), Token("3.14", 4, 8), Token(" ", 8, 9),
                 Token("68.9", 9, 13), Token("#", 13, 14), Token("?", 14, 15),
                 Token(" ", 15, 16), Token("7", 16, 17), Token("%", 17, 18),
                 Token(" ", 18, 19), Token("$", 19, 20), Token("99", 20, 22),
                 Token(" ", 22, 23), Token(".", 23, 24), Token("18", 24, 26),
                 Token(".", 26, 27)}));
}

TEST(TokenizerTest, LetterDigitTokenizeUnicode) {
  TestingTokenizerProxy tokenizer(TokenizationType_LETTER_DIGIT, {}, {},
                                  /*split_on_script_change=*/false,
                                  /*icu_preserve_whitespace_tokens=*/false,
                                  /*preserve_floating_numbers=*/true);
  std::vector<Token> tokens = tokenizer.Tokenize("2 pércént 3パーセント");
  ASSERT_EQ(tokens, std::vector<Token>({Token("2", 0, 1), Token(" ", 1, 2),
                                        Token("pércént", 2, 9),
                                        Token(" ", 9, 10), Token("3", 10, 11),
                                        Token("パーセント", 11, 16)}));
}

TEST(TokenizerTest, LetterDigitTokenizeWithDots) {
  TestingTokenizerProxy tokenizer(TokenizationType_LETTER_DIGIT, {}, {},
                                  /*split_on_script_change=*/false,
                                  /*icu_preserve_whitespace_tokens=*/false,
                                  /*preserve_floating_numbers=*/true);
  std::vector<Token> tokens = tokenizer.Tokenize("3 3﹒2 3.3%");
  ASSERT_EQ(tokens,
            std::vector<Token>({Token("3", 0, 1), Token(" ", 1, 2),
                                Token("3﹒2", 2, 5), Token(" ", 5, 6),
                                Token("3.3", 6, 9), Token("%", 9, 10)}));
}

TEST(TokenizerTest, LetterDigitTokenizeDoNotPreserveFloatingNumbers) {
  TestingTokenizerProxy tokenizer(TokenizationType_LETTER_DIGIT, {}, {},
                                  /*split_on_script_change=*/false,
                                  /*icu_preserve_whitespace_tokens=*/false,
                                  /*preserve_floating_numbers=*/false);
  std::vector<Token> tokens = tokenizer.Tokenize("15.12.2019 january's 3.2");
  ASSERT_EQ(tokens,
            std::vector<Token>(
                {Token("15", 0, 2), Token(".", 2, 3), Token("12", 3, 5),
                 Token(".", 5, 6), Token("2019", 6, 10), Token(" ", 10, 11),
                 Token("january", 11, 18), Token("'", 18, 19),
                 Token("s", 19, 20), Token(" ", 20, 21), Token("3", 21, 22),
                 Token(".", 22, 23), Token("2", 23, 24)}));
}

TEST(TokenizerTest, LetterDigitTokenizeStrangeStringFloatingNumbers) {
  TestingTokenizerProxy tokenizer(TokenizationType_LETTER_DIGIT, {}, {},
                                  /*split_on_script_change=*/false,
                                  /*icu_preserve_whitespace_tokens=*/false,
                                  /*preserve_floating_numbers=*/false);
  std::vector<Token> tokens = tokenizer.Tokenize("The+2345++the +íí+");
  ASSERT_EQ(tokens,
            std::vector<Token>({Token("The", 0, 3), Token("+", 3, 4),
                                Token("2345", 4, 8), Token("+", 8, 9),
                                Token("+", 9, 10), Token("the", 10, 13),
                                Token(" ", 13, 14), Token("+", 14, 15),
                                Token("íí", 15, 17), Token("+", 17, 18)}));
}

TEST(TokenizerTest, LetterDigitTokenizeWhitespacesInSameToken) {
  TestingTokenizerProxy tokenizer(TokenizationType_LETTER_DIGIT, {}, {},
                                  /*split_on_script_change=*/false,
                                  /*icu_preserve_whitespace_tokens=*/false,
                                  /*preserve_floating_numbers=*/false);
  std::vector<Token> tokens = tokenizer.Tokenize("2 3  4   5");
  ASSERT_EQ(tokens, std::vector<Token>({Token("2", 0, 1), Token(" ", 1, 2),
                                        Token("3", 2, 3), Token("  ", 3, 5),
                                        Token("4", 5, 6), Token("   ", 6, 9),
                                        Token("5", 9, 10)}));
}

}  // namespace
}  // namespace libtextclassifier3