xref: /aosp_15_r20/external/libtextclassifier/native/utils/tokenizer_test.cc (revision 993b0882672172b81d12fad7a7ac0c3e5c824a12)
1*993b0882SAndroid Build Coastguard Worker /*
2*993b0882SAndroid Build Coastguard Worker  * Copyright (C) 2018 The Android Open Source Project
3*993b0882SAndroid Build Coastguard Worker  *
4*993b0882SAndroid Build Coastguard Worker  * Licensed under the Apache License, Version 2.0 (the "License");
5*993b0882SAndroid Build Coastguard Worker  * you may not use this file except in compliance with the License.
6*993b0882SAndroid Build Coastguard Worker  * You may obtain a copy of the License at
7*993b0882SAndroid Build Coastguard Worker  *
8*993b0882SAndroid Build Coastguard Worker  *      http://www.apache.org/licenses/LICENSE-2.0
9*993b0882SAndroid Build Coastguard Worker  *
10*993b0882SAndroid Build Coastguard Worker  * Unless required by applicable law or agreed to in writing, software
11*993b0882SAndroid Build Coastguard Worker  * distributed under the License is distributed on an "AS IS" BASIS,
12*993b0882SAndroid Build Coastguard Worker  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13*993b0882SAndroid Build Coastguard Worker  * See the License for the specific language governing permissions and
14*993b0882SAndroid Build Coastguard Worker  * limitations under the License.
15*993b0882SAndroid Build Coastguard Worker  */
16*993b0882SAndroid Build Coastguard Worker 
17*993b0882SAndroid Build Coastguard Worker #include "utils/tokenizer.h"
18*993b0882SAndroid Build Coastguard Worker 
19*993b0882SAndroid Build Coastguard Worker #include <vector>
20*993b0882SAndroid Build Coastguard Worker 
21*993b0882SAndroid Build Coastguard Worker #include "gmock/gmock.h"
22*993b0882SAndroid Build Coastguard Worker #include "gtest/gtest.h"
23*993b0882SAndroid Build Coastguard Worker 
24*993b0882SAndroid Build Coastguard Worker namespace libtextclassifier3 {
25*993b0882SAndroid Build Coastguard Worker namespace {
26*993b0882SAndroid Build Coastguard Worker 
27*993b0882SAndroid Build Coastguard Worker using testing::ElementsAreArray;
28*993b0882SAndroid Build Coastguard Worker 
29*993b0882SAndroid Build Coastguard Worker class TestingTokenizer : public Tokenizer {
30*993b0882SAndroid Build Coastguard Worker  public:
TestingTokenizer(const TokenizationType type,const UniLib * unilib,const std::vector<const TokenizationCodepointRange * > & codepoint_ranges,const std::vector<const CodepointRange * > & internal_tokenizer_codepoint_ranges,const bool split_on_script_change,const bool icu_preserve_whitespace_tokens,const bool preserve_floating_numbers)31*993b0882SAndroid Build Coastguard Worker   TestingTokenizer(
32*993b0882SAndroid Build Coastguard Worker       const TokenizationType type, const UniLib* unilib,
33*993b0882SAndroid Build Coastguard Worker       const std::vector<const TokenizationCodepointRange*>& codepoint_ranges,
34*993b0882SAndroid Build Coastguard Worker       const std::vector<const CodepointRange*>&
35*993b0882SAndroid Build Coastguard Worker           internal_tokenizer_codepoint_ranges,
36*993b0882SAndroid Build Coastguard Worker       const bool split_on_script_change,
37*993b0882SAndroid Build Coastguard Worker       const bool icu_preserve_whitespace_tokens,
38*993b0882SAndroid Build Coastguard Worker       const bool preserve_floating_numbers)
39*993b0882SAndroid Build Coastguard Worker       : Tokenizer(type, unilib, codepoint_ranges,
40*993b0882SAndroid Build Coastguard Worker                   internal_tokenizer_codepoint_ranges, split_on_script_change,
41*993b0882SAndroid Build Coastguard Worker                   icu_preserve_whitespace_tokens, preserve_floating_numbers) {}
42*993b0882SAndroid Build Coastguard Worker 
43*993b0882SAndroid Build Coastguard Worker   using Tokenizer::FindTokenizationRange;
44*993b0882SAndroid Build Coastguard Worker };
45*993b0882SAndroid Build Coastguard Worker 
46*993b0882SAndroid Build Coastguard Worker class TestingTokenizerProxy {
47*993b0882SAndroid Build Coastguard Worker  public:
TestingTokenizerProxy(TokenizationType type,const std::vector<TokenizationCodepointRangeT> & codepoint_range_configs,const std::vector<CodepointRangeT> & internal_codepoint_range_configs,const bool split_on_script_change,const bool icu_preserve_whitespace_tokens,const bool preserve_floating_numbers)48*993b0882SAndroid Build Coastguard Worker   TestingTokenizerProxy(
49*993b0882SAndroid Build Coastguard Worker       TokenizationType type,
50*993b0882SAndroid Build Coastguard Worker       const std::vector<TokenizationCodepointRangeT>& codepoint_range_configs,
51*993b0882SAndroid Build Coastguard Worker       const std::vector<CodepointRangeT>& internal_codepoint_range_configs,
52*993b0882SAndroid Build Coastguard Worker       const bool split_on_script_change,
53*993b0882SAndroid Build Coastguard Worker       const bool icu_preserve_whitespace_tokens,
54*993b0882SAndroid Build Coastguard Worker       const bool preserve_floating_numbers)
55*993b0882SAndroid Build Coastguard Worker       : INIT_UNILIB_FOR_TESTING(unilib_) {
56*993b0882SAndroid Build Coastguard Worker     const int num_configs = codepoint_range_configs.size();
57*993b0882SAndroid Build Coastguard Worker     std::vector<const TokenizationCodepointRange*> configs_fb;
58*993b0882SAndroid Build Coastguard Worker     configs_fb.reserve(num_configs);
59*993b0882SAndroid Build Coastguard Worker     const int num_internal_configs = internal_codepoint_range_configs.size();
60*993b0882SAndroid Build Coastguard Worker     std::vector<const CodepointRange*> internal_configs_fb;
61*993b0882SAndroid Build Coastguard Worker     internal_configs_fb.reserve(num_internal_configs);
62*993b0882SAndroid Build Coastguard Worker     buffers_.reserve(num_configs + num_internal_configs);
63*993b0882SAndroid Build Coastguard Worker     for (int i = 0; i < num_configs; i++) {
64*993b0882SAndroid Build Coastguard Worker       flatbuffers::FlatBufferBuilder builder;
65*993b0882SAndroid Build Coastguard Worker       builder.Finish(CreateTokenizationCodepointRange(
66*993b0882SAndroid Build Coastguard Worker           builder, &codepoint_range_configs[i]));
67*993b0882SAndroid Build Coastguard Worker       buffers_.push_back(builder.Release());
68*993b0882SAndroid Build Coastguard Worker       configs_fb.push_back(flatbuffers::GetRoot<TokenizationCodepointRange>(
69*993b0882SAndroid Build Coastguard Worker           buffers_.back().data()));
70*993b0882SAndroid Build Coastguard Worker     }
71*993b0882SAndroid Build Coastguard Worker     for (int i = 0; i < num_internal_configs; i++) {
72*993b0882SAndroid Build Coastguard Worker       flatbuffers::FlatBufferBuilder builder;
73*993b0882SAndroid Build Coastguard Worker       builder.Finish(
74*993b0882SAndroid Build Coastguard Worker           CreateCodepointRange(builder, &internal_codepoint_range_configs[i]));
75*993b0882SAndroid Build Coastguard Worker       buffers_.push_back(builder.Release());
76*993b0882SAndroid Build Coastguard Worker       internal_configs_fb.push_back(
77*993b0882SAndroid Build Coastguard Worker           flatbuffers::GetRoot<CodepointRange>(buffers_.back().data()));
78*993b0882SAndroid Build Coastguard Worker     }
79*993b0882SAndroid Build Coastguard Worker     tokenizer_ = std::unique_ptr<TestingTokenizer>(new TestingTokenizer(
80*993b0882SAndroid Build Coastguard Worker         type, &unilib_, configs_fb, internal_configs_fb, split_on_script_change,
81*993b0882SAndroid Build Coastguard Worker         icu_preserve_whitespace_tokens, preserve_floating_numbers));
82*993b0882SAndroid Build Coastguard Worker   }
83*993b0882SAndroid Build Coastguard Worker 
TestFindTokenizationRole(int c) const84*993b0882SAndroid Build Coastguard Worker   TokenizationCodepointRange_::Role TestFindTokenizationRole(int c) const {
85*993b0882SAndroid Build Coastguard Worker     const TokenizationCodepointRangeT* range =
86*993b0882SAndroid Build Coastguard Worker         tokenizer_->FindTokenizationRange(c);
87*993b0882SAndroid Build Coastguard Worker     if (range != nullptr) {
88*993b0882SAndroid Build Coastguard Worker       return range->role;
89*993b0882SAndroid Build Coastguard Worker     } else {
90*993b0882SAndroid Build Coastguard Worker       return TokenizationCodepointRange_::Role_DEFAULT_ROLE;
91*993b0882SAndroid Build Coastguard Worker     }
92*993b0882SAndroid Build Coastguard Worker   }
93*993b0882SAndroid Build Coastguard Worker 
Tokenize(const std::string & utf8_text) const94*993b0882SAndroid Build Coastguard Worker   std::vector<Token> Tokenize(const std::string& utf8_text) const {
95*993b0882SAndroid Build Coastguard Worker     return tokenizer_->Tokenize(utf8_text);
96*993b0882SAndroid Build Coastguard Worker   }
97*993b0882SAndroid Build Coastguard Worker 
98*993b0882SAndroid Build Coastguard Worker  private:
99*993b0882SAndroid Build Coastguard Worker   UniLib unilib_;
100*993b0882SAndroid Build Coastguard Worker   std::vector<flatbuffers::DetachedBuffer> buffers_;
101*993b0882SAndroid Build Coastguard Worker   std::unique_ptr<TestingTokenizer> tokenizer_;
102*993b0882SAndroid Build Coastguard Worker };
103*993b0882SAndroid Build Coastguard Worker 
TEST(TokenizerTest,FindTokenizationRange)104*993b0882SAndroid Build Coastguard Worker TEST(TokenizerTest, FindTokenizationRange) {
105*993b0882SAndroid Build Coastguard Worker   std::vector<TokenizationCodepointRangeT> configs;
106*993b0882SAndroid Build Coastguard Worker   TokenizationCodepointRangeT* config;
107*993b0882SAndroid Build Coastguard Worker 
108*993b0882SAndroid Build Coastguard Worker   configs.emplace_back();
109*993b0882SAndroid Build Coastguard Worker   config = &configs.back();
110*993b0882SAndroid Build Coastguard Worker   config->start = 0;
111*993b0882SAndroid Build Coastguard Worker   config->end = 10;
112*993b0882SAndroid Build Coastguard Worker   config->role = TokenizationCodepointRange_::Role_TOKEN_SEPARATOR;
113*993b0882SAndroid Build Coastguard Worker 
114*993b0882SAndroid Build Coastguard Worker   configs.emplace_back();
115*993b0882SAndroid Build Coastguard Worker   config = &configs.back();
116*993b0882SAndroid Build Coastguard Worker   config->start = 32;
117*993b0882SAndroid Build Coastguard Worker   config->end = 33;
118*993b0882SAndroid Build Coastguard Worker   config->role = TokenizationCodepointRange_::Role_WHITESPACE_SEPARATOR;
119*993b0882SAndroid Build Coastguard Worker 
120*993b0882SAndroid Build Coastguard Worker   configs.emplace_back();
121*993b0882SAndroid Build Coastguard Worker   config = &configs.back();
122*993b0882SAndroid Build Coastguard Worker   config->start = 1234;
123*993b0882SAndroid Build Coastguard Worker   config->end = 12345;
124*993b0882SAndroid Build Coastguard Worker   config->role = TokenizationCodepointRange_::Role_TOKEN_SEPARATOR;
125*993b0882SAndroid Build Coastguard Worker 
126*993b0882SAndroid Build Coastguard Worker   TestingTokenizerProxy tokenizer(TokenizationType_INTERNAL_TOKENIZER, configs,
127*993b0882SAndroid Build Coastguard Worker                                   {}, /*split_on_script_change=*/false,
128*993b0882SAndroid Build Coastguard Worker                                   /*icu_preserve_whitespace_tokens=*/false,
129*993b0882SAndroid Build Coastguard Worker                                   /*preserve_floating_numbers=*/false);
130*993b0882SAndroid Build Coastguard Worker 
131*993b0882SAndroid Build Coastguard Worker   // Test hits to the first group.
132*993b0882SAndroid Build Coastguard Worker   EXPECT_EQ(tokenizer.TestFindTokenizationRole(0),
133*993b0882SAndroid Build Coastguard Worker             TokenizationCodepointRange_::Role_TOKEN_SEPARATOR);
134*993b0882SAndroid Build Coastguard Worker   EXPECT_EQ(tokenizer.TestFindTokenizationRole(5),
135*993b0882SAndroid Build Coastguard Worker             TokenizationCodepointRange_::Role_TOKEN_SEPARATOR);
136*993b0882SAndroid Build Coastguard Worker   EXPECT_EQ(tokenizer.TestFindTokenizationRole(10),
137*993b0882SAndroid Build Coastguard Worker             TokenizationCodepointRange_::Role_DEFAULT_ROLE);
138*993b0882SAndroid Build Coastguard Worker 
139*993b0882SAndroid Build Coastguard Worker   // Test a hit to the second group.
140*993b0882SAndroid Build Coastguard Worker   EXPECT_EQ(tokenizer.TestFindTokenizationRole(31),
141*993b0882SAndroid Build Coastguard Worker             TokenizationCodepointRange_::Role_DEFAULT_ROLE);
142*993b0882SAndroid Build Coastguard Worker   EXPECT_EQ(tokenizer.TestFindTokenizationRole(32),
143*993b0882SAndroid Build Coastguard Worker             TokenizationCodepointRange_::Role_WHITESPACE_SEPARATOR);
144*993b0882SAndroid Build Coastguard Worker   EXPECT_EQ(tokenizer.TestFindTokenizationRole(33),
145*993b0882SAndroid Build Coastguard Worker             TokenizationCodepointRange_::Role_DEFAULT_ROLE);
146*993b0882SAndroid Build Coastguard Worker 
147*993b0882SAndroid Build Coastguard Worker   // Test hits to the third group.
148*993b0882SAndroid Build Coastguard Worker   EXPECT_EQ(tokenizer.TestFindTokenizationRole(1233),
149*993b0882SAndroid Build Coastguard Worker             TokenizationCodepointRange_::Role_DEFAULT_ROLE);
150*993b0882SAndroid Build Coastguard Worker   EXPECT_EQ(tokenizer.TestFindTokenizationRole(1234),
151*993b0882SAndroid Build Coastguard Worker             TokenizationCodepointRange_::Role_TOKEN_SEPARATOR);
152*993b0882SAndroid Build Coastguard Worker   EXPECT_EQ(tokenizer.TestFindTokenizationRole(12344),
153*993b0882SAndroid Build Coastguard Worker             TokenizationCodepointRange_::Role_TOKEN_SEPARATOR);
154*993b0882SAndroid Build Coastguard Worker   EXPECT_EQ(tokenizer.TestFindTokenizationRole(12345),
155*993b0882SAndroid Build Coastguard Worker             TokenizationCodepointRange_::Role_DEFAULT_ROLE);
156*993b0882SAndroid Build Coastguard Worker 
157*993b0882SAndroid Build Coastguard Worker   // Test a hit outside.
158*993b0882SAndroid Build Coastguard Worker   EXPECT_EQ(tokenizer.TestFindTokenizationRole(99),
159*993b0882SAndroid Build Coastguard Worker             TokenizationCodepointRange_::Role_DEFAULT_ROLE);
160*993b0882SAndroid Build Coastguard Worker }
161*993b0882SAndroid Build Coastguard Worker 
// With the space character as the only configured range (whitespace
// separator role), "Hello world!" is expected to yield two tokens.
TEST(TokenizerTest, TokenizeOnSpace) {
  std::vector<TokenizationCodepointRangeT> configs(1);
  TokenizationCodepointRangeT& space = configs.front();
  // Codepoint 32 is the space character.
  space.start = 32;
  space.end = 33;
  space.role = TokenizationCodepointRange_::Role_WHITESPACE_SEPARATOR;

  TestingTokenizerProxy tokenizer(TokenizationType_INTERNAL_TOKENIZER, configs,
                                  {},
                                  /*split_on_script_change=*/false,
                                  /*icu_preserve_whitespace_tokens=*/false,
                                  /*preserve_floating_numbers=*/false);

  EXPECT_THAT(tokenizer.Tokenize("Hello world!"),
              ElementsAreArray({Token("Hello", 0, 5), Token("world!", 6, 12)}));
}
183*993b0882SAndroid Build Coastguard Worker 
// Enables split_on_script_change and configures codepoints 0..0x77F with
// script_id 1, with the space character as a whitespace separator. The Korean
// characters in the input lie outside the configured ranges; the expected
// tokens are split at the spaces and where Korean text meets text from the
// configured ranges.
TEST(TokenizerTest, TokenizeOnSpaceAndScriptChange) {
  std::vector<TokenizationCodepointRangeT> configs;
  // Appends one range config with the given bounds, role and script id.
  const auto add_range = [&configs](int start, int end,
                                    TokenizationCodepointRange_::Role role,
                                    int script_id) {
    configs.emplace_back();
    TokenizationCodepointRangeT& range = configs.back();
    range.start = start;
    range.end = end;
    range.role = role;
    range.script_id = script_id;
  };

  // Latin.
  add_range(0, 32, TokenizationCodepointRange_::Role_DEFAULT_ROLE, 1);
  add_range(32, 33, TokenizationCodepointRange_::Role_WHITESPACE_SEPARATOR, 1);
  add_range(33, 0x77F + 1, TokenizationCodepointRange_::Role_DEFAULT_ROLE, 1);

  TestingTokenizerProxy tokenizer(TokenizationType_INTERNAL_TOKENIZER, configs,
                                  {},
                                  /*split_on_script_change=*/true,
                                  /*icu_preserve_whitespace_tokens=*/false,
                                  /*preserve_floating_numbers=*/false);

  const std::vector<Token> expected_tokens(
      {Token("앨라배마", 0, 4), Token("주", 5, 6), Token("전화", 7, 10),
       Token("(123)", 10, 15), Token("456-789", 16, 23),
       Token("웹사이트", 23, 28)});
  EXPECT_THAT(tokenizer.Tokenize("앨라배마 주 전화(123) 456-789웹사이트"),
              expected_tokens);
}
219*993b0882SAndroid Build Coastguard Worker 
TEST(TokenizerTest,TokenizeComplex)220*993b0882SAndroid Build Coastguard Worker TEST(TokenizerTest, TokenizeComplex) {
221*993b0882SAndroid Build Coastguard Worker   std::vector<TokenizationCodepointRangeT> configs;
222*993b0882SAndroid Build Coastguard Worker   TokenizationCodepointRangeT* config;
223*993b0882SAndroid Build Coastguard Worker 
224*993b0882SAndroid Build Coastguard Worker   // Source: http://www.unicode.org/Public/10.0.0/ucd/Blocks-10.0.0d1.txt
225*993b0882SAndroid Build Coastguard Worker   // Latin - cyrilic.
226*993b0882SAndroid Build Coastguard Worker   //   0000..007F; Basic Latin
227*993b0882SAndroid Build Coastguard Worker   //   0080..00FF; Latin-1 Supplement
228*993b0882SAndroid Build Coastguard Worker   //   0100..017F; Latin Extended-A
229*993b0882SAndroid Build Coastguard Worker   //   0180..024F; Latin Extended-B
230*993b0882SAndroid Build Coastguard Worker   //   0250..02AF; IPA Extensions
231*993b0882SAndroid Build Coastguard Worker   //   02B0..02FF; Spacing Modifier Letters
232*993b0882SAndroid Build Coastguard Worker   //   0300..036F; Combining Diacritical Marks
233*993b0882SAndroid Build Coastguard Worker   //   0370..03FF; Greek and Coptic
234*993b0882SAndroid Build Coastguard Worker   //   0400..04FF; Cyrillic
235*993b0882SAndroid Build Coastguard Worker   //   0500..052F; Cyrillic Supplement
236*993b0882SAndroid Build Coastguard Worker   //   0530..058F; Armenian
237*993b0882SAndroid Build Coastguard Worker   //   0590..05FF; Hebrew
238*993b0882SAndroid Build Coastguard Worker   //   0600..06FF; Arabic
239*993b0882SAndroid Build Coastguard Worker   //   0700..074F; Syriac
240*993b0882SAndroid Build Coastguard Worker   //   0750..077F; Arabic Supplement
241*993b0882SAndroid Build Coastguard Worker   configs.emplace_back();
242*993b0882SAndroid Build Coastguard Worker   config = &configs.back();
243*993b0882SAndroid Build Coastguard Worker   config->start = 0;
244*993b0882SAndroid Build Coastguard Worker   config->end = 32;
245*993b0882SAndroid Build Coastguard Worker   config->role = TokenizationCodepointRange_::Role_DEFAULT_ROLE;
246*993b0882SAndroid Build Coastguard Worker   configs.emplace_back();
247*993b0882SAndroid Build Coastguard Worker   config = &configs.back();
248*993b0882SAndroid Build Coastguard Worker   config->start = 32;
249*993b0882SAndroid Build Coastguard Worker   config->end = 33;
250*993b0882SAndroid Build Coastguard Worker   config->role = TokenizationCodepointRange_::Role_WHITESPACE_SEPARATOR;
251*993b0882SAndroid Build Coastguard Worker   configs.emplace_back();
252*993b0882SAndroid Build Coastguard Worker   config = &configs.back();
253*993b0882SAndroid Build Coastguard Worker   config->start = 33;
254*993b0882SAndroid Build Coastguard Worker   config->end = 0x77F + 1;
255*993b0882SAndroid Build Coastguard Worker   config->role = TokenizationCodepointRange_::Role_DEFAULT_ROLE;
256*993b0882SAndroid Build Coastguard Worker 
257*993b0882SAndroid Build Coastguard Worker   // CJK
258*993b0882SAndroid Build Coastguard Worker   // 2E80..2EFF; CJK Radicals Supplement
259*993b0882SAndroid Build Coastguard Worker   // 3000..303F; CJK Symbols and Punctuation
260*993b0882SAndroid Build Coastguard Worker   // 3040..309F; Hiragana
261*993b0882SAndroid Build Coastguard Worker   // 30A0..30FF; Katakana
262*993b0882SAndroid Build Coastguard Worker   // 3100..312F; Bopomofo
263*993b0882SAndroid Build Coastguard Worker   // 3130..318F; Hangul Compatibility Jamo
264*993b0882SAndroid Build Coastguard Worker   // 3190..319F; Kanbun
265*993b0882SAndroid Build Coastguard Worker   // 31A0..31BF; Bopomofo Extended
266*993b0882SAndroid Build Coastguard Worker   // 31C0..31EF; CJK Strokes
267*993b0882SAndroid Build Coastguard Worker   // 31F0..31FF; Katakana Phonetic Extensions
268*993b0882SAndroid Build Coastguard Worker   // 3200..32FF; Enclosed CJK Letters and Months
269*993b0882SAndroid Build Coastguard Worker   // 3300..33FF; CJK Compatibility
270*993b0882SAndroid Build Coastguard Worker   // 3400..4DBF; CJK Unified Ideographs Extension A
271*993b0882SAndroid Build Coastguard Worker   // 4DC0..4DFF; Yijing Hexagram Symbols
272*993b0882SAndroid Build Coastguard Worker   // 4E00..9FFF; CJK Unified Ideographs
273*993b0882SAndroid Build Coastguard Worker   // A000..A48F; Yi Syllables
274*993b0882SAndroid Build Coastguard Worker   // A490..A4CF; Yi Radicals
275*993b0882SAndroid Build Coastguard Worker   // A4D0..A4FF; Lisu
276*993b0882SAndroid Build Coastguard Worker   // A500..A63F; Vai
277*993b0882SAndroid Build Coastguard Worker   // F900..FAFF; CJK Compatibility Ideographs
278*993b0882SAndroid Build Coastguard Worker   // FE30..FE4F; CJK Compatibility Forms
279*993b0882SAndroid Build Coastguard Worker   // 20000..2A6DF; CJK Unified Ideographs Extension B
280*993b0882SAndroid Build Coastguard Worker   // 2A700..2B73F; CJK Unified Ideographs Extension C
281*993b0882SAndroid Build Coastguard Worker   // 2B740..2B81F; CJK Unified Ideographs Extension D
282*993b0882SAndroid Build Coastguard Worker   // 2B820..2CEAF; CJK Unified Ideographs Extension E
283*993b0882SAndroid Build Coastguard Worker   // 2CEB0..2EBEF; CJK Unified Ideographs Extension F
284*993b0882SAndroid Build Coastguard Worker   // 2F800..2FA1F; CJK Compatibility Ideographs Supplement
285*993b0882SAndroid Build Coastguard Worker   configs.emplace_back();
286*993b0882SAndroid Build Coastguard Worker   config = &configs.back();
287*993b0882SAndroid Build Coastguard Worker   config->start = 0x2E80;
288*993b0882SAndroid Build Coastguard Worker   config->end = 0x2EFF + 1;
289*993b0882SAndroid Build Coastguard Worker   config->role = TokenizationCodepointRange_::Role_TOKEN_SEPARATOR;
290*993b0882SAndroid Build Coastguard Worker   configs.emplace_back();
291*993b0882SAndroid Build Coastguard Worker   config = &configs.back();
292*993b0882SAndroid Build Coastguard Worker   config->start = 0x3000;
293*993b0882SAndroid Build Coastguard Worker   config->end = 0xA63F + 1;
294*993b0882SAndroid Build Coastguard Worker   config->role = TokenizationCodepointRange_::Role_TOKEN_SEPARATOR;
295*993b0882SAndroid Build Coastguard Worker   configs.emplace_back();
296*993b0882SAndroid Build Coastguard Worker   config = &configs.back();
297*993b0882SAndroid Build Coastguard Worker   config->start = 0xF900;
298*993b0882SAndroid Build Coastguard Worker   config->end = 0xFAFF + 1;
299*993b0882SAndroid Build Coastguard Worker   config->role = TokenizationCodepointRange_::Role_TOKEN_SEPARATOR;
300*993b0882SAndroid Build Coastguard Worker   configs.emplace_back();
301*993b0882SAndroid Build Coastguard Worker   config = &configs.back();
302*993b0882SAndroid Build Coastguard Worker   config->start = 0xFE30;
303*993b0882SAndroid Build Coastguard Worker   config->end = 0xFE4F + 1;
304*993b0882SAndroid Build Coastguard Worker   config->role = TokenizationCodepointRange_::Role_TOKEN_SEPARATOR;
305*993b0882SAndroid Build Coastguard Worker   configs.emplace_back();
306*993b0882SAndroid Build Coastguard Worker   config = &configs.back();
307*993b0882SAndroid Build Coastguard Worker   config->start = 0x20000;
308*993b0882SAndroid Build Coastguard Worker   config->end = 0x2A6DF + 1;
309*993b0882SAndroid Build Coastguard Worker   config->role = TokenizationCodepointRange_::Role_TOKEN_SEPARATOR;
310*993b0882SAndroid Build Coastguard Worker   configs.emplace_back();
311*993b0882SAndroid Build Coastguard Worker   config = &configs.back();
312*993b0882SAndroid Build Coastguard Worker   config->start = 0x2A700;
313*993b0882SAndroid Build Coastguard Worker   config->end = 0x2B73F + 1;
314*993b0882SAndroid Build Coastguard Worker   config->role = TokenizationCodepointRange_::Role_TOKEN_SEPARATOR;
315*993b0882SAndroid Build Coastguard Worker   configs.emplace_back();
316*993b0882SAndroid Build Coastguard Worker   config = &configs.back();
317*993b0882SAndroid Build Coastguard Worker   config->start = 0x2B740;
318*993b0882SAndroid Build Coastguard Worker   config->end = 0x2B81F + 1;
319*993b0882SAndroid Build Coastguard Worker   config->role = TokenizationCodepointRange_::Role_TOKEN_SEPARATOR;
320*993b0882SAndroid Build Coastguard Worker   configs.emplace_back();
321*993b0882SAndroid Build Coastguard Worker   config = &configs.back();
322*993b0882SAndroid Build Coastguard Worker   config->start = 0x2B820;
323*993b0882SAndroid Build Coastguard Worker   config->end = 0x2CEAF + 1;
324*993b0882SAndroid Build Coastguard Worker   config->role = TokenizationCodepointRange_::Role_TOKEN_SEPARATOR;
325*993b0882SAndroid Build Coastguard Worker   configs.emplace_back();
326*993b0882SAndroid Build Coastguard Worker   config = &configs.back();
327*993b0882SAndroid Build Coastguard Worker   config->start = 0x2CEB0;
328*993b0882SAndroid Build Coastguard Worker   config->end = 0x2EBEF + 1;
329*993b0882SAndroid Build Coastguard Worker   config->role = TokenizationCodepointRange_::Role_TOKEN_SEPARATOR;
330*993b0882SAndroid Build Coastguard Worker   configs.emplace_back();
331*993b0882SAndroid Build Coastguard Worker   config = &configs.back();
332*993b0882SAndroid Build Coastguard Worker   config->start = 0x2F800;
333*993b0882SAndroid Build Coastguard Worker   config->end = 0x2FA1F + 1;
334*993b0882SAndroid Build Coastguard Worker   config->role = TokenizationCodepointRange_::Role_TOKEN_SEPARATOR;
335*993b0882SAndroid Build Coastguard Worker 
336*993b0882SAndroid Build Coastguard Worker   // Thai.
337*993b0882SAndroid Build Coastguard Worker   // 0E00..0E7F; Thai
338*993b0882SAndroid Build Coastguard Worker   configs.emplace_back();
339*993b0882SAndroid Build Coastguard Worker   config = &configs.back();
340*993b0882SAndroid Build Coastguard Worker   config->start = 0x0E00;
341*993b0882SAndroid Build Coastguard Worker   config->end = 0x0E7F + 1;
342*993b0882SAndroid Build Coastguard Worker   config->role = TokenizationCodepointRange_::Role_TOKEN_SEPARATOR;
343*993b0882SAndroid Build Coastguard Worker 
344*993b0882SAndroid Build Coastguard Worker   TestingTokenizerProxy tokenizer(TokenizationType_INTERNAL_TOKENIZER, configs,
345*993b0882SAndroid Build Coastguard Worker                                   {},
346*993b0882SAndroid Build Coastguard Worker                                   /*split_on_script_change=*/false,
347*993b0882SAndroid Build Coastguard Worker                                   /*icu_preserve_whitespace_tokens=*/false,
348*993b0882SAndroid Build Coastguard Worker                                   /*preserve_floating_numbers=*/false);
349*993b0882SAndroid Build Coastguard Worker   std::vector<Token> tokens;
350*993b0882SAndroid Build Coastguard Worker 
351*993b0882SAndroid Build Coastguard Worker   tokens = tokenizer.Tokenize(
352*993b0882SAndroid Build Coastguard Worker       "問少目木輸走猶術権自京門録球変。細開括省用掲情結傍走愛明氷。");
353*993b0882SAndroid Build Coastguard Worker   EXPECT_EQ(tokens.size(), 30);
354*993b0882SAndroid Build Coastguard Worker 
355*993b0882SAndroid Build Coastguard Worker   tokens = tokenizer.Tokenize("問少目 hello 木輸ยามきゃ");
356*993b0882SAndroid Build Coastguard Worker   // clang-format off
357*993b0882SAndroid Build Coastguard Worker   EXPECT_THAT(
358*993b0882SAndroid Build Coastguard Worker       tokens,
359*993b0882SAndroid Build Coastguard Worker       ElementsAreArray({Token("問", 0, 1),
360*993b0882SAndroid Build Coastguard Worker                         Token("少", 1, 2),
361*993b0882SAndroid Build Coastguard Worker                         Token("目", 2, 3),
362*993b0882SAndroid Build Coastguard Worker                         Token("hello", 4, 9),
363*993b0882SAndroid Build Coastguard Worker                         Token("木", 10, 11),
364*993b0882SAndroid Build Coastguard Worker                         Token("輸", 11, 12),
365*993b0882SAndroid Build Coastguard Worker                         Token("ย", 12, 13),
366*993b0882SAndroid Build Coastguard Worker                         Token("า", 13, 14),
367*993b0882SAndroid Build Coastguard Worker                         Token("ม", 14, 15),
368*993b0882SAndroid Build Coastguard Worker                         Token("き", 15, 16),
369*993b0882SAndroid Build Coastguard Worker                         Token("ゃ", 16, 17)}));
370*993b0882SAndroid Build Coastguard Worker   // clang-format on
371*993b0882SAndroid Build Coastguard Worker }
372*993b0882SAndroid Build Coastguard Worker 
373*993b0882SAndroid Build Coastguard Worker #if defined(TC3_TEST_ICU) || defined(__APPLE__)
// ICU tokenization with icu_preserve_whitespace_tokens enabled: the expected
// output contains each space in the Thai input as a token of its own, between
// the word tokens.
TEST(TokenizerTest, ICUTokenizeWithWhitespaces) {
  TestingTokenizerProxy tokenizer(TokenizationType_ICU, {}, {},
                                  /*split_on_script_change=*/false,
                                  /*icu_preserve_whitespace_tokens=*/true,
                                  /*preserve_floating_numbers=*/false);

  // clang-format off
  const std::vector<Token> expected_tokens({Token("พระบาท", 0, 6),
                                            Token(" ", 6, 7),
                                            Token("สมเด็จ", 7, 13),
                                            Token(" ", 13, 14),
                                            Token("พระ", 14, 17),
                                            Token(" ", 17, 18),
                                            Token("ปร", 18, 20),
                                            Token(" ", 20, 21),
                                            Token("มิ", 21, 23)});
  // clang-format on

  const std::vector<Token> tokens =
      tokenizer.Tokenize("พระบาท สมเด็จ พระ ปร มิ");
  ASSERT_EQ(tokens, expected_tokens);
}
393*993b0882SAndroid Build Coastguard Worker 
// Runs the ICU tokenizer (whitespace tokens preserved) on an ASCII sentence
// with punctuation. Each punctuation character (':', '-', '(', ',', '*', ')')
// is expected as a separate single-character token, while words and digit
// runs stay whole.
TEST(TokenizerTest, ICUTokenizePunctuation) {
  TestingTokenizerProxy tokenizer(TokenizationType_ICU, {}, {},
                                  /*split_on_script_change=*/false,
                                  /*icu_preserve_whitespace_tokens=*/true,
                                  /*preserve_floating_numbers=*/false);

  const std::vector<Token> actual =
      tokenizer.Tokenize("The interval is: -(12, 138*)");

  // clang-format off
  const std::vector<Token> expected = {
      Token("The", 0, 3),
      Token(" ", 3, 4),
      Token("interval", 4, 12),
      Token(" ", 12, 13),
      Token("is", 13, 15),
      Token(":", 15, 16),
      Token(" ", 16, 17),
      Token("-", 17, 18),
      Token("(", 18, 19),
      Token("12", 19, 21),
      Token(",", 21, 22),
      Token(" ", 22, 23),
      Token("138", 23, 26),
      Token("*", 26, 27),
      Token(")", 27, 28),
  };
  // clang-format on

  ASSERT_EQ(actual, expected);
}
421*993b0882SAndroid Build Coastguard Worker 
// Runs the ICU tokenizer (whitespace tokens preserved) on three decimal
// numbers. Each number is expected as a single token, including "3﹒2", whose
// separator is a non-ASCII dot character rather than '.'.
TEST(TokenizerTest, ICUTokenizeWithNumbers) {
  TestingTokenizerProxy tokenizer(TokenizationType_ICU, {}, {},
                                  /*split_on_script_change=*/false,
                                  /*icu_preserve_whitespace_tokens=*/true,
                                  /*preserve_floating_numbers=*/false);

  const std::vector<Token> actual = tokenizer.Tokenize("3.1 3﹒2 3.3");

  // clang-format off
  const std::vector<Token> expected = {
      Token("3.1", 0, 3),
      Token(" ", 3, 4),
      Token("3﹒2", 4, 7),
      Token(" ", 7, 8),
      Token("3.3", 8, 11),
  };
  // clang-format on

  ASSERT_EQ(actual, expected);
}
437*993b0882SAndroid Build Coastguard Worker #endif
438*993b0882SAndroid Build Coastguard Worker 
439*993b0882SAndroid Build Coastguard Worker #if defined(TC3_TEST_ICU)
// Runs the ICU tokenizer on Thai text written without any spaces. The
// expectation is five word tokens whose offsets are contiguous, i.e. each
// token starts where the previous one ends.
TEST(TokenizerTest, ICUTokenize) {
  TestingTokenizerProxy tokenizer(TokenizationType_ICU, {}, {},
                                  /*split_on_script_change=*/false,
                                  /*icu_preserve_whitespace_tokens=*/false,
                                  /*preserve_floating_numbers=*/false);

  const std::vector<Token> actual = tokenizer.Tokenize("พระบาทสมเด็จพระปรมิ");

  // clang-format off
  const std::vector<Token> expected = {
      Token("พระบาท", 0, 6),
      Token("สมเด็จ", 6, 12),
      Token("พระ", 12, 15),
      Token("ปร", 15, 17),
      Token("มิ", 17, 19),
  };
  // clang-format on

  ASSERT_EQ(actual, expected);
}
455*993b0882SAndroid Build Coastguard Worker 
TEST(TokenizerTest,MixedTokenize)456*993b0882SAndroid Build Coastguard Worker TEST(TokenizerTest, MixedTokenize) {
457*993b0882SAndroid Build Coastguard Worker   std::vector<TokenizationCodepointRangeT> configs;
458*993b0882SAndroid Build Coastguard Worker   TokenizationCodepointRangeT* config;
459*993b0882SAndroid Build Coastguard Worker 
460*993b0882SAndroid Build Coastguard Worker   configs.emplace_back();
461*993b0882SAndroid Build Coastguard Worker   config = &configs.back();
462*993b0882SAndroid Build Coastguard Worker   config->start = 32;
463*993b0882SAndroid Build Coastguard Worker   config->end = 33;
464*993b0882SAndroid Build Coastguard Worker   config->role = TokenizationCodepointRange_::Role_WHITESPACE_SEPARATOR;
465*993b0882SAndroid Build Coastguard Worker 
466*993b0882SAndroid Build Coastguard Worker   std::vector<CodepointRangeT> internal_configs;
467*993b0882SAndroid Build Coastguard Worker   CodepointRangeT* interal_config;
468*993b0882SAndroid Build Coastguard Worker 
469*993b0882SAndroid Build Coastguard Worker   internal_configs.emplace_back();
470*993b0882SAndroid Build Coastguard Worker   interal_config = &internal_configs.back();
471*993b0882SAndroid Build Coastguard Worker   interal_config->start = 0;
472*993b0882SAndroid Build Coastguard Worker   interal_config->end = 128;
473*993b0882SAndroid Build Coastguard Worker 
474*993b0882SAndroid Build Coastguard Worker   internal_configs.emplace_back();
475*993b0882SAndroid Build Coastguard Worker   interal_config = &internal_configs.back();
476*993b0882SAndroid Build Coastguard Worker   interal_config->start = 128;
477*993b0882SAndroid Build Coastguard Worker   interal_config->end = 256;
478*993b0882SAndroid Build Coastguard Worker 
479*993b0882SAndroid Build Coastguard Worker   internal_configs.emplace_back();
480*993b0882SAndroid Build Coastguard Worker   interal_config = &internal_configs.back();
481*993b0882SAndroid Build Coastguard Worker   interal_config->start = 256;
482*993b0882SAndroid Build Coastguard Worker   interal_config->end = 384;
483*993b0882SAndroid Build Coastguard Worker 
484*993b0882SAndroid Build Coastguard Worker   internal_configs.emplace_back();
485*993b0882SAndroid Build Coastguard Worker   interal_config = &internal_configs.back();
486*993b0882SAndroid Build Coastguard Worker   interal_config->start = 384;
487*993b0882SAndroid Build Coastguard Worker   interal_config->end = 592;
488*993b0882SAndroid Build Coastguard Worker 
489*993b0882SAndroid Build Coastguard Worker   TestingTokenizerProxy tokenizer(TokenizationType_MIXED, configs,
490*993b0882SAndroid Build Coastguard Worker                                   internal_configs,
491*993b0882SAndroid Build Coastguard Worker                                   /*split_on_script_change=*/false,
492*993b0882SAndroid Build Coastguard Worker                                   /*icu_preserve_whitespace_tokens=*/false,
493*993b0882SAndroid Build Coastguard Worker                                   /*preserve_floating_numbers=*/false);
494*993b0882SAndroid Build Coastguard Worker 
495*993b0882SAndroid Build Coastguard Worker   std::vector<Token> tokens = tokenizer.Tokenize(
496*993b0882SAndroid Build Coastguard Worker       "こんにちはJapanese-ląnguagę text 你好世界 http://www.google.com/");
497*993b0882SAndroid Build Coastguard Worker   ASSERT_EQ(
498*993b0882SAndroid Build Coastguard Worker       tokens,
499*993b0882SAndroid Build Coastguard Worker       // clang-format off
500*993b0882SAndroid Build Coastguard Worker       std::vector<Token>({Token("こんにちは", 0, 5),
501*993b0882SAndroid Build Coastguard Worker                           Token("Japanese-ląnguagę", 5, 22),
502*993b0882SAndroid Build Coastguard Worker                           Token("text", 23, 27),
503*993b0882SAndroid Build Coastguard Worker                           Token("你好", 28, 30),
504*993b0882SAndroid Build Coastguard Worker                           Token("世界", 30, 32),
505*993b0882SAndroid Build Coastguard Worker                           Token("http://www.google.com/", 33, 55)}));
506*993b0882SAndroid Build Coastguard Worker   // clang-format on
507*993b0882SAndroid Build Coastguard Worker }
508*993b0882SAndroid Build Coastguard Worker 
// Compares the internal tokenizer with split_on_script_change off and on,
// using the same Korean + digits + Korean input. With the flag off the whole
// string is expected as one token. With it on, the string is expected to
// split into the leading Korean run, the digit run and the trailing Korean
// run.
TEST(TokenizerTest, InternalTokenizeOnScriptChange) {
  // Single tokenization range covering 0..256 with the default role.
  std::vector<TokenizationCodepointRangeT> configs;
  configs.emplace_back();
  configs.back().start = 0;
  configs.back().end = 256;
  configs.back().role = TokenizationCodepointRange_::Role_DEFAULT_ROLE;

  // Script changes ignored: one token spanning the entire input.
  TestingTokenizerProxy no_split_tokenizer(
      TokenizationType_INTERNAL_TOKENIZER, configs, {},
      /*split_on_script_change=*/false,
      /*icu_preserve_whitespace_tokens=*/false,
      /*preserve_floating_numbers=*/false);
  const std::vector<Token> expected_no_split = {
      Token("앨라배마123웹사이트", 0, 11)};
  EXPECT_EQ(no_split_tokenizer.Tokenize("앨라배마123웹사이트"),
            expected_no_split);

  // Script changes honoured: a token boundary at every script switch.
  TestingTokenizerProxy split_tokenizer(
      TokenizationType_INTERNAL_TOKENIZER, configs, {},
      /*split_on_script_change=*/true,
      /*icu_preserve_whitespace_tokens=*/false,
      /*preserve_floating_numbers=*/false);
  const std::vector<Token> expected_split = {
      Token("앨라배마", 0, 4), Token("123", 4, 7), Token("웹사이트", 7, 11)};
  EXPECT_EQ(split_tokenizer.Tokenize("앨라배마123웹사이트"), expected_split);
}
541*993b0882SAndroid Build Coastguard Worker #endif
542*993b0882SAndroid Build Coastguard Worker 
// Letter/digit tokenization with preserve_floating_numbers enabled. The
// expectation keeps "3.14" and "68.9" as single tokens, while the symbols
// ('%', '-', '#', '?', '$'), the spaces, and the dots around "18" in ".18."
// are each separate tokens.
TEST(TokenizerTest, LetterDigitTokenize) {
  TestingTokenizerProxy tokenizer(TokenizationType_LETTER_DIGIT, {}, {},
                                  /*split_on_script_change=*/false,
                                  /*icu_preserve_whitespace_tokens=*/false,
                                  /*preserve_floating_numbers=*/true);

  const std::vector<Token> actual =
      tokenizer.Tokenize("7% -3.14 68.9#? 7% $99 .18.");

  // clang-format off
  const std::vector<Token> expected = {
      Token("7", 0, 1),
      Token("%", 1, 2),
      Token(" ", 2, 3),
      Token("-", 3, 4),
      Token("3.14", 4, 8),
      Token(" ", 8, 9),
      Token("68.9", 9, 13),
      Token("#", 13, 14),
      Token("?", 14, 15),
      Token(" ", 15, 16),
      Token("7", 16, 17),
      Token("%", 17, 18),
      Token(" ", 18, 19),
      Token("$", 19, 20),
      Token("99", 20, 22),
      Token(" ", 22, 23),
      Token(".", 23, 24),
      Token("18", 24, 26),
      Token(".", 26, 27),
  };
  // clang-format on

  ASSERT_EQ(actual, expected);
}
559*993b0882SAndroid Build Coastguard Worker 
// Letter/digit tokenization on non-ASCII letters. The accented Latin word and
// the Katakana word are each expected as one token, and the digit directly
// preceding the Katakana is expected as a token of its own.
TEST(TokenizerTest, LetterDigitTokenizeUnicode) {
  TestingTokenizerProxy tokenizer(TokenizationType_LETTER_DIGIT, {}, {},
                                  /*split_on_script_change=*/false,
                                  /*icu_preserve_whitespace_tokens=*/false,
                                  /*preserve_floating_numbers=*/true);

  const std::vector<Token> actual = tokenizer.Tokenize("2 pércént 3パーセント");

  // clang-format off
  const std::vector<Token> expected = {
      Token("2", 0, 1),
      Token(" ", 1, 2),
      Token("pércént", 2, 9),
      Token(" ", 9, 10),
      Token("3", 10, 11),
      Token("パーセント", 11, 16),
  };
  // clang-format on

  ASSERT_EQ(actual, expected);
}
571*993b0882SAndroid Build Coastguard Worker 
// Letter/digit tokenization with preserve_floating_numbers enabled on numbers
// using two different dot characters. Both "3﹒2" (non-ASCII dot) and "3.3"
// are expected as single tokens, with the trailing '%' split off.
TEST(TokenizerTest, LetterDigitTokenizeWithDots) {
  TestingTokenizerProxy tokenizer(TokenizationType_LETTER_DIGIT, {}, {},
                                  /*split_on_script_change=*/false,
                                  /*icu_preserve_whitespace_tokens=*/false,
                                  /*preserve_floating_numbers=*/true);

  const std::vector<Token> actual = tokenizer.Tokenize("3 3﹒2 3.3%");

  // clang-format off
  const std::vector<Token> expected = {
      Token("3", 0, 1),
      Token(" ", 1, 2),
      Token("3﹒2", 2, 5),
      Token(" ", 5, 6),
      Token("3.3", 6, 9),
      Token("%", 9, 10),
  };
  // clang-format on

  ASSERT_EQ(actual, expected);
}
583*993b0882SAndroid Build Coastguard Worker 
// Letter/digit tokenization with preserve_floating_numbers disabled. Every
// dot is expected as its own token, so "15.12.2019" and "3.2" are broken into
// digit runs and dots. The apostrophe in "january's" is likewise expected as
// a separate token.
TEST(TokenizerTest, LetterDigitTokenizeDoNotPreserveFloatingNumbers) {
  TestingTokenizerProxy tokenizer(TokenizationType_LETTER_DIGIT, {}, {},
                                  /*split_on_script_change=*/false,
                                  /*icu_preserve_whitespace_tokens=*/false,
                                  /*preserve_floating_numbers=*/false);

  const std::vector<Token> actual =
      tokenizer.Tokenize("15.12.2019 january's 3.2");

  // clang-format off
  const std::vector<Token> expected = {
      Token("15", 0, 2),
      Token(".", 2, 3),
      Token("12", 3, 5),
      Token(".", 5, 6),
      Token("2019", 6, 10),
      Token(" ", 10, 11),
      Token("january", 11, 18),
      Token("'", 18, 19),
      Token("s", 19, 20),
      Token(" ", 20, 21),
      Token("3", 21, 22),
      Token(".", 22, 23),
      Token("2", 23, 24),
  };
  // clang-format on

  ASSERT_EQ(actual, expected);
}
598*993b0882SAndroid Build Coastguard Worker 
// Letter/digit tokenization (preserve_floating_numbers disabled) on a string
// where '+' signs are glued to words and numbers. Each '+' is expected as its
// own token, including the two adjacent ones in "++".
TEST(TokenizerTest, LetterDigitTokenizeStrangeStringFloatingNumbers) {
  TestingTokenizerProxy tokenizer(TokenizationType_LETTER_DIGIT, {}, {},
                                  /*split_on_script_change=*/false,
                                  /*icu_preserve_whitespace_tokens=*/false,
                                  /*preserve_floating_numbers=*/false);

  const std::vector<Token> actual = tokenizer.Tokenize("The+2345++the +íí+");

  // clang-format off
  const std::vector<Token> expected = {
      Token("The", 0, 3),
      Token("+", 3, 4),
      Token("2345", 4, 8),
      Token("+", 8, 9),
      Token("+", 9, 10),
      Token("the", 10, 13),
      Token(" ", 13, 14),
      Token("+", 14, 15),
      Token("íí", 15, 17),
      Token("+", 17, 18),
  };
  // clang-format on

  ASSERT_EQ(actual, expected);
}
612*993b0882SAndroid Build Coastguard Worker 
// Letter/digit tokenization on digits separated by one, two and three spaces.
// Each run of consecutive spaces is expected to be a single token rather than
// one token per space.
TEST(TokenizerTest, LetterDigitTokenizeWhitespcesInSameToken) {
  TestingTokenizerProxy tokenizer(TokenizationType_LETTER_DIGIT, {}, {},
                                  /*split_on_script_change=*/false,
                                  /*icu_preserve_whitespace_tokens=*/false,
                                  /*preserve_floating_numbers=*/false);

  const std::vector<Token> actual = tokenizer.Tokenize("2 3  4   5");

  // clang-format off
  const std::vector<Token> expected = {
      Token("2", 0, 1),
      Token(" ", 1, 2),
      Token("3", 2, 3),
      Token("  ", 3, 5),
      Token("4", 5, 6),
      Token("   ", 6, 9),
      Token("5", 9, 10),
  };
  // clang-format on

  ASSERT_EQ(actual, expected);
}
624*993b0882SAndroid Build Coastguard Worker 
625*993b0882SAndroid Build Coastguard Worker }  // namespace
626*993b0882SAndroid Build Coastguard Worker }  // namespace libtextclassifier3
627