1*993b0882SAndroid Build Coastguard Worker /*
2*993b0882SAndroid Build Coastguard Worker * Copyright (C) 2018 The Android Open Source Project
3*993b0882SAndroid Build Coastguard Worker *
4*993b0882SAndroid Build Coastguard Worker * Licensed under the Apache License, Version 2.0 (the "License");
5*993b0882SAndroid Build Coastguard Worker * you may not use this file except in compliance with the License.
6*993b0882SAndroid Build Coastguard Worker * You may obtain a copy of the License at
7*993b0882SAndroid Build Coastguard Worker *
8*993b0882SAndroid Build Coastguard Worker * http://www.apache.org/licenses/LICENSE-2.0
9*993b0882SAndroid Build Coastguard Worker *
10*993b0882SAndroid Build Coastguard Worker * Unless required by applicable law or agreed to in writing, software
11*993b0882SAndroid Build Coastguard Worker * distributed under the License is distributed on an "AS IS" BASIS,
12*993b0882SAndroid Build Coastguard Worker * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13*993b0882SAndroid Build Coastguard Worker * See the License for the specific language governing permissions and
14*993b0882SAndroid Build Coastguard Worker * limitations under the License.
15*993b0882SAndroid Build Coastguard Worker */
16*993b0882SAndroid Build Coastguard Worker
17*993b0882SAndroid Build Coastguard Worker #include "utils/tokenizer.h"
18*993b0882SAndroid Build Coastguard Worker
19*993b0882SAndroid Build Coastguard Worker #include <vector>
20*993b0882SAndroid Build Coastguard Worker
21*993b0882SAndroid Build Coastguard Worker #include "gmock/gmock.h"
22*993b0882SAndroid Build Coastguard Worker #include "gtest/gtest.h"
23*993b0882SAndroid Build Coastguard Worker
24*993b0882SAndroid Build Coastguard Worker namespace libtextclassifier3 {
25*993b0882SAndroid Build Coastguard Worker namespace {
26*993b0882SAndroid Build Coastguard Worker
27*993b0882SAndroid Build Coastguard Worker using testing::ElementsAreArray;
28*993b0882SAndroid Build Coastguard Worker
29*993b0882SAndroid Build Coastguard Worker class TestingTokenizer : public Tokenizer {
30*993b0882SAndroid Build Coastguard Worker public:
TestingTokenizer(const TokenizationType type,const UniLib * unilib,const std::vector<const TokenizationCodepointRange * > & codepoint_ranges,const std::vector<const CodepointRange * > & internal_tokenizer_codepoint_ranges,const bool split_on_script_change,const bool icu_preserve_whitespace_tokens,const bool preserve_floating_numbers)31*993b0882SAndroid Build Coastguard Worker TestingTokenizer(
32*993b0882SAndroid Build Coastguard Worker const TokenizationType type, const UniLib* unilib,
33*993b0882SAndroid Build Coastguard Worker const std::vector<const TokenizationCodepointRange*>& codepoint_ranges,
34*993b0882SAndroid Build Coastguard Worker const std::vector<const CodepointRange*>&
35*993b0882SAndroid Build Coastguard Worker internal_tokenizer_codepoint_ranges,
36*993b0882SAndroid Build Coastguard Worker const bool split_on_script_change,
37*993b0882SAndroid Build Coastguard Worker const bool icu_preserve_whitespace_tokens,
38*993b0882SAndroid Build Coastguard Worker const bool preserve_floating_numbers)
39*993b0882SAndroid Build Coastguard Worker : Tokenizer(type, unilib, codepoint_ranges,
40*993b0882SAndroid Build Coastguard Worker internal_tokenizer_codepoint_ranges, split_on_script_change,
41*993b0882SAndroid Build Coastguard Worker icu_preserve_whitespace_tokens, preserve_floating_numbers) {}
42*993b0882SAndroid Build Coastguard Worker
43*993b0882SAndroid Build Coastguard Worker using Tokenizer::FindTokenizationRange;
44*993b0882SAndroid Build Coastguard Worker };
45*993b0882SAndroid Build Coastguard Worker
46*993b0882SAndroid Build Coastguard Worker class TestingTokenizerProxy {
47*993b0882SAndroid Build Coastguard Worker public:
TestingTokenizerProxy(TokenizationType type,const std::vector<TokenizationCodepointRangeT> & codepoint_range_configs,const std::vector<CodepointRangeT> & internal_codepoint_range_configs,const bool split_on_script_change,const bool icu_preserve_whitespace_tokens,const bool preserve_floating_numbers)48*993b0882SAndroid Build Coastguard Worker TestingTokenizerProxy(
49*993b0882SAndroid Build Coastguard Worker TokenizationType type,
50*993b0882SAndroid Build Coastguard Worker const std::vector<TokenizationCodepointRangeT>& codepoint_range_configs,
51*993b0882SAndroid Build Coastguard Worker const std::vector<CodepointRangeT>& internal_codepoint_range_configs,
52*993b0882SAndroid Build Coastguard Worker const bool split_on_script_change,
53*993b0882SAndroid Build Coastguard Worker const bool icu_preserve_whitespace_tokens,
54*993b0882SAndroid Build Coastguard Worker const bool preserve_floating_numbers)
55*993b0882SAndroid Build Coastguard Worker : INIT_UNILIB_FOR_TESTING(unilib_) {
56*993b0882SAndroid Build Coastguard Worker const int num_configs = codepoint_range_configs.size();
57*993b0882SAndroid Build Coastguard Worker std::vector<const TokenizationCodepointRange*> configs_fb;
58*993b0882SAndroid Build Coastguard Worker configs_fb.reserve(num_configs);
59*993b0882SAndroid Build Coastguard Worker const int num_internal_configs = internal_codepoint_range_configs.size();
60*993b0882SAndroid Build Coastguard Worker std::vector<const CodepointRange*> internal_configs_fb;
61*993b0882SAndroid Build Coastguard Worker internal_configs_fb.reserve(num_internal_configs);
62*993b0882SAndroid Build Coastguard Worker buffers_.reserve(num_configs + num_internal_configs);
63*993b0882SAndroid Build Coastguard Worker for (int i = 0; i < num_configs; i++) {
64*993b0882SAndroid Build Coastguard Worker flatbuffers::FlatBufferBuilder builder;
65*993b0882SAndroid Build Coastguard Worker builder.Finish(CreateTokenizationCodepointRange(
66*993b0882SAndroid Build Coastguard Worker builder, &codepoint_range_configs[i]));
67*993b0882SAndroid Build Coastguard Worker buffers_.push_back(builder.Release());
68*993b0882SAndroid Build Coastguard Worker configs_fb.push_back(flatbuffers::GetRoot<TokenizationCodepointRange>(
69*993b0882SAndroid Build Coastguard Worker buffers_.back().data()));
70*993b0882SAndroid Build Coastguard Worker }
71*993b0882SAndroid Build Coastguard Worker for (int i = 0; i < num_internal_configs; i++) {
72*993b0882SAndroid Build Coastguard Worker flatbuffers::FlatBufferBuilder builder;
73*993b0882SAndroid Build Coastguard Worker builder.Finish(
74*993b0882SAndroid Build Coastguard Worker CreateCodepointRange(builder, &internal_codepoint_range_configs[i]));
75*993b0882SAndroid Build Coastguard Worker buffers_.push_back(builder.Release());
76*993b0882SAndroid Build Coastguard Worker internal_configs_fb.push_back(
77*993b0882SAndroid Build Coastguard Worker flatbuffers::GetRoot<CodepointRange>(buffers_.back().data()));
78*993b0882SAndroid Build Coastguard Worker }
79*993b0882SAndroid Build Coastguard Worker tokenizer_ = std::unique_ptr<TestingTokenizer>(new TestingTokenizer(
80*993b0882SAndroid Build Coastguard Worker type, &unilib_, configs_fb, internal_configs_fb, split_on_script_change,
81*993b0882SAndroid Build Coastguard Worker icu_preserve_whitespace_tokens, preserve_floating_numbers));
82*993b0882SAndroid Build Coastguard Worker }
83*993b0882SAndroid Build Coastguard Worker
TestFindTokenizationRole(int c) const84*993b0882SAndroid Build Coastguard Worker TokenizationCodepointRange_::Role TestFindTokenizationRole(int c) const {
85*993b0882SAndroid Build Coastguard Worker const TokenizationCodepointRangeT* range =
86*993b0882SAndroid Build Coastguard Worker tokenizer_->FindTokenizationRange(c);
87*993b0882SAndroid Build Coastguard Worker if (range != nullptr) {
88*993b0882SAndroid Build Coastguard Worker return range->role;
89*993b0882SAndroid Build Coastguard Worker } else {
90*993b0882SAndroid Build Coastguard Worker return TokenizationCodepointRange_::Role_DEFAULT_ROLE;
91*993b0882SAndroid Build Coastguard Worker }
92*993b0882SAndroid Build Coastguard Worker }
93*993b0882SAndroid Build Coastguard Worker
Tokenize(const std::string & utf8_text) const94*993b0882SAndroid Build Coastguard Worker std::vector<Token> Tokenize(const std::string& utf8_text) const {
95*993b0882SAndroid Build Coastguard Worker return tokenizer_->Tokenize(utf8_text);
96*993b0882SAndroid Build Coastguard Worker }
97*993b0882SAndroid Build Coastguard Worker
98*993b0882SAndroid Build Coastguard Worker private:
99*993b0882SAndroid Build Coastguard Worker UniLib unilib_;
100*993b0882SAndroid Build Coastguard Worker std::vector<flatbuffers::DetachedBuffer> buffers_;
101*993b0882SAndroid Build Coastguard Worker std::unique_ptr<TestingTokenizer> tokenizer_;
102*993b0882SAndroid Build Coastguard Worker };
103*993b0882SAndroid Build Coastguard Worker
// Verifies the codepoint -> role lookup: a codepoint inside a configured range
// gets that range's role, the `end` value itself is expected to fall outside
// the range, and anything not covered gets the default role.
TEST(TokenizerTest, FindTokenizationRange) {
  using Role = TokenizationCodepointRange_::Role;
  const Role kDefault = TokenizationCodepointRange_::Role_DEFAULT_ROLE;
  const Role kTokenSeparator =
      TokenizationCodepointRange_::Role_TOKEN_SEPARATOR;
  const Role kWhitespaceSeparator =
      TokenizationCodepointRange_::Role_WHITESPACE_SEPARATOR;

  std::vector<TokenizationCodepointRangeT> configs;
  // Appends one range config with the given bounds and role.
  const auto add_range = [&configs](int start, int end, Role role) {
    configs.emplace_back();
    TokenizationCodepointRangeT& range = configs.back();
    range.start = start;
    range.end = end;
    range.role = role;
  };
  add_range(0, 10, kTokenSeparator);
  add_range(32, 33, kWhitespaceSeparator);
  add_range(1234, 12345, kTokenSeparator);

  TestingTokenizerProxy tokenizer(TokenizationType_INTERNAL_TOKENIZER, configs,
                                  {}, /*split_on_script_change=*/false,
                                  /*icu_preserve_whitespace_tokens=*/false,
                                  /*preserve_floating_numbers=*/false);

  // First range: both ends and an interior point.
  EXPECT_EQ(tokenizer.TestFindTokenizationRole(0), kTokenSeparator);
  EXPECT_EQ(tokenizer.TestFindTokenizationRole(5), kTokenSeparator);
  EXPECT_EQ(tokenizer.TestFindTokenizationRole(10), kDefault);

  // Second range: a single codepoint and its two neighbours.
  EXPECT_EQ(tokenizer.TestFindTokenizationRole(31), kDefault);
  EXPECT_EQ(tokenizer.TestFindTokenizationRole(32), kWhitespaceSeparator);
  EXPECT_EQ(tokenizer.TestFindTokenizationRole(33), kDefault);

  // Third range: just below, first, last and just past the range.
  EXPECT_EQ(tokenizer.TestFindTokenizationRole(1233), kDefault);
  EXPECT_EQ(tokenizer.TestFindTokenizationRole(1234), kTokenSeparator);
  EXPECT_EQ(tokenizer.TestFindTokenizationRole(12344), kTokenSeparator);
  EXPECT_EQ(tokenizer.TestFindTokenizationRole(12345), kDefault);

  // A codepoint that lies between the configured ranges.
  EXPECT_EQ(tokenizer.TestFindTokenizationRole(99), kDefault);
}
161*993b0882SAndroid Build Coastguard Worker
// With the space character as the only configured (whitespace) separator,
// "Hello world!" is expected to split into two tokens and the space itself is
// not expected to appear as a token.
TEST(TokenizerTest, TokenizeOnSpace) {
  std::vector<TokenizationCodepointRangeT> configs;
  configs.emplace_back();
  TokenizationCodepointRangeT& space_range = configs.back();
  // Codepoint 32 is the space character.
  space_range.start = 32;
  space_range.end = 33;
  space_range.role = TokenizationCodepointRange_::Role_WHITESPACE_SEPARATOR;

  TestingTokenizerProxy tokenizer(TokenizationType_INTERNAL_TOKENIZER, configs,
                                  {},
                                  /*split_on_script_change=*/false,
                                  /*icu_preserve_whitespace_tokens=*/false,
                                  /*preserve_floating_numbers=*/false);

  const std::vector<Token> expected = {Token("Hello", 0, 5),
                                       Token("world!", 6, 12)};
  const std::vector<Token> actual = tokenizer.Tokenize("Hello world!");
  EXPECT_THAT(actual, ElementsAreArray(expected));
}
183*993b0882SAndroid Build Coastguard Worker
// Tokenizes mixed Hangul/ASCII text with split_on_script_change enabled. The
// expected tokens are split at spaces and also between Hangul and ASCII runs
// that have no whitespace between them (e.g. "전화" / "(123)").
TEST(TokenizerTest, TokenizeOnSpaceAndScriptChange) {
  using Role = TokenizationCodepointRange_::Role;

  std::vector<TokenizationCodepointRangeT> configs;
  // Appends one range config; every range in this test uses script_id 1.
  const auto add_range = [&configs](int start, int end, Role role) {
    configs.emplace_back();
    TokenizationCodepointRangeT& range = configs.back();
    range.start = start;
    range.end = end;
    range.role = role;
    range.script_id = 1;
  };

  // Three adjacent ranges covering codepoints 0 through 0x77F; only the space
  // character (32) is a separator.
  add_range(0, 32, TokenizationCodepointRange_::Role_DEFAULT_ROLE);
  add_range(32, 33, TokenizationCodepointRange_::Role_WHITESPACE_SEPARATOR);
  add_range(33, 0x77F + 1, TokenizationCodepointRange_::Role_DEFAULT_ROLE);

  TestingTokenizerProxy tokenizer(TokenizationType_INTERNAL_TOKENIZER, configs,
                                  {},
                                  /*split_on_script_change=*/true,
                                  /*icu_preserve_whitespace_tokens=*/false,
                                  /*preserve_floating_numbers=*/false);

  const std::vector<Token> expected_tokens = {
      Token("앨라배마", 0, 4), Token("주", 5, 6),
      Token("전화", 7, 10),    Token("(123)", 10, 15),
      Token("456-789", 16, 23), Token("웹사이트", 23, 28)};
  EXPECT_THAT(tokenizer.Tokenize("앨라배마 주 전화(123) 456-789웹사이트"),
              expected_tokens);
}
219*993b0882SAndroid Build Coastguard Worker
// Tokenizes text that mixes CJK, Latin and Thai using a realistic set of
// codepoint ranges: space is a whitespace separator, CJK and Thai codepoints
// are token separators, and everything else configured has the default role.
TEST(TokenizerTest, TokenizeComplex) {
  using Role = TokenizationCodepointRange_::Role;
  const Role kDefault = TokenizationCodepointRange_::Role_DEFAULT_ROLE;
  const Role kWhitespace =
      TokenizationCodepointRange_::Role_WHITESPACE_SEPARATOR;
  const Role kSeparator = TokenizationCodepointRange_::Role_TOKEN_SEPARATOR;

  // One row per configured range; `end` is written as last codepoint + 1.
  struct RangeSpec {
    int start;
    int end;
    Role role;
  };

  // Block names below are taken from
  // http://www.unicode.org/Public/10.0.0/ucd/Blocks-10.0.0d1.txt
  const RangeSpec kRangeSpecs[] = {
      // Latin - Cyrillic (and neighbours), split around the space character:
      //   0000..007F; Basic Latin
      //   0080..00FF; Latin-1 Supplement
      //   0100..017F; Latin Extended-A
      //   0180..024F; Latin Extended-B
      //   0250..02AF; IPA Extensions
      //   02B0..02FF; Spacing Modifier Letters
      //   0300..036F; Combining Diacritical Marks
      //   0370..03FF; Greek and Coptic
      //   0400..04FF; Cyrillic
      //   0500..052F; Cyrillic Supplement
      //   0530..058F; Armenian
      //   0590..05FF; Hebrew
      //   0600..06FF; Arabic
      //   0700..074F; Syriac
      //   0750..077F; Arabic Supplement
      {0, 32, kDefault},
      {32, 33, kWhitespace},
      {33, 0x77F + 1, kDefault},

      // CJK.
      //   2E80..2EFF; CJK Radicals Supplement
      {0x2E80, 0x2EFF + 1, kSeparator},
      //   3000..303F; CJK Symbols and Punctuation
      //   3040..309F; Hiragana
      //   30A0..30FF; Katakana
      //   3100..312F; Bopomofo
      //   3130..318F; Hangul Compatibility Jamo
      //   3190..319F; Kanbun
      //   31A0..31BF; Bopomofo Extended
      //   31C0..31EF; CJK Strokes
      //   31F0..31FF; Katakana Phonetic Extensions
      //   3200..32FF; Enclosed CJK Letters and Months
      //   3300..33FF; CJK Compatibility
      //   3400..4DBF; CJK Unified Ideographs Extension A
      //   4DC0..4DFF; Yijing Hexagram Symbols
      //   4E00..9FFF; CJK Unified Ideographs
      //   A000..A48F; Yi Syllables
      //   A490..A4CF; Yi Radicals
      //   A4D0..A4FF; Lisu
      //   A500..A63F; Vai
      {0x3000, 0xA63F + 1, kSeparator},
      //   F900..FAFF; CJK Compatibility Ideographs
      {0xF900, 0xFAFF + 1, kSeparator},
      //   FE30..FE4F; CJK Compatibility Forms
      {0xFE30, 0xFE4F + 1, kSeparator},
      //   20000..2A6DF; CJK Unified Ideographs Extension B
      {0x20000, 0x2A6DF + 1, kSeparator},
      //   2A700..2B73F; CJK Unified Ideographs Extension C
      {0x2A700, 0x2B73F + 1, kSeparator},
      //   2B740..2B81F; CJK Unified Ideographs Extension D
      {0x2B740, 0x2B81F + 1, kSeparator},
      //   2B820..2CEAF; CJK Unified Ideographs Extension E
      {0x2B820, 0x2CEAF + 1, kSeparator},
      //   2CEB0..2EBEF; CJK Unified Ideographs Extension F
      {0x2CEB0, 0x2EBEF + 1, kSeparator},
      //   2F800..2FA1F; CJK Compatibility Ideographs Supplement
      {0x2F800, 0x2FA1F + 1, kSeparator},

      // Thai.
      //   0E00..0E7F; Thai
      {0x0E00, 0x0E7F + 1, kSeparator},
  };

  // Materialize the table in the same order as it is listed above.
  std::vector<TokenizationCodepointRangeT> configs;
  for (const RangeSpec& spec : kRangeSpecs) {
    configs.emplace_back();
    TokenizationCodepointRangeT& range = configs.back();
    range.start = spec.start;
    range.end = spec.end;
    range.role = spec.role;
  }

  TestingTokenizerProxy tokenizer(TokenizationType_INTERNAL_TOKENIZER, configs,
                                  {},
                                  /*split_on_script_change=*/false,
                                  /*icu_preserve_whitespace_tokens=*/false,
                                  /*preserve_floating_numbers=*/false);

  // Pure CJK text: one token per character is expected (30 in total).
  const std::vector<Token> cjk_tokens = tokenizer.Tokenize(
      "問少目木輸走猶術権自京門録球変。細開括省用掲情結傍走愛明氷。");
  EXPECT_EQ(cjk_tokens.size(), 30);

  // Mixed text: CJK, Thai and kana characters become single-character tokens
  // while the Latin word stays whole.
  const std::vector<Token> mixed_tokens =
      tokenizer.Tokenize("問少目 hello 木輸ยามきゃ");
  // clang-format off
  const std::vector<Token> expected_mixed = {Token("問", 0, 1),
                                             Token("少", 1, 2),
                                             Token("目", 2, 3),
                                             Token("hello", 4, 9),
                                             Token("木", 10, 11),
                                             Token("輸", 11, 12),
                                             Token("ย", 12, 13),
                                             Token("า", 13, 14),
                                             Token("ม", 14, 15),
                                             Token("き", 15, 16),
                                             Token("ゃ", 16, 17)};
  // clang-format on
  EXPECT_THAT(mixed_tokens, ElementsAreArray(expected_mixed));
}
372*993b0882SAndroid Build Coastguard Worker
373*993b0882SAndroid Build Coastguard Worker #if defined(TC3_TEST_ICU) || defined(__APPLE__)
// ICU tokenization of Thai text with icu_preserve_whitespace_tokens enabled:
// every space is expected to come back as its own token between the words.
TEST(TokenizerTest, ICUTokenizeWithWhitespaces) {
  TestingTokenizerProxy tokenizer(TokenizationType_ICU, {}, {},
                                  /*split_on_script_change=*/false,
                                  /*icu_preserve_whitespace_tokens=*/true,
                                  /*preserve_floating_numbers=*/false);

  // clang-format off
  const std::vector<Token> expected = {Token("พระบาท", 0, 6),
                                       Token(" ", 6, 7),
                                       Token("สมเด็จ", 7, 13),
                                       Token(" ", 13, 14),
                                       Token("พระ", 14, 17),
                                       Token(" ", 17, 18),
                                       Token("ปร", 18, 20),
                                       Token(" ", 20, 21),
                                       Token("มิ", 21, 23)};
  // clang-format on

  const std::vector<Token> tokens = tokenizer.Tokenize("พระบาท สมเด็จ พระ ปร มิ");
  ASSERT_EQ(tokens, expected);
}
393*993b0882SAndroid Build Coastguard Worker
// ICU tokenization of ASCII text containing punctuation, with whitespace
// tokens preserved: each punctuation character and each space is expected to
// be a separate token.
TEST(TokenizerTest, ICUTokenizePunctuation) {
  TestingTokenizerProxy tokenizer(TokenizationType_ICU, {}, {},
                                  /*split_on_script_change=*/false,
                                  /*icu_preserve_whitespace_tokens=*/true,
                                  /*preserve_floating_numbers=*/false);

  // clang-format off
  const std::vector<Token> expected = {Token("The", 0, 3),
                                       Token(" ", 3, 4),
                                       Token("interval", 4, 12),
                                       Token(" ", 12, 13),
                                       Token("is", 13, 15),
                                       Token(":", 15, 16),
                                       Token(" ", 16, 17),
                                       Token("-", 17, 18),
                                       Token("(", 18, 19),
                                       Token("12", 19, 21),
                                       Token(",", 21, 22),
                                       Token(" ", 22, 23),
                                       Token("138", 23, 26),
                                       Token("*", 26, 27),
                                       Token(")", 27, 28)};
  // clang-format on

  const std::vector<Token> tokens =
      tokenizer.Tokenize("The interval is: -(12, 138*)");
  ASSERT_EQ(tokens, expected);
}
421*993b0882SAndroid Build Coastguard Worker
// Checks ICU tokenization of decimal-looking numbers with whitespace tokens
// preserved: digits joined by '.' or by '﹒' are expected to stay together in
// one token, with the separating spaces emitted as tokens of their own.
TEST(TokenizerTest, ICUTokenizeWithNumbers) {
  TestingTokenizerProxy icu_tokenizer(TokenizationType_ICU, {}, {},
                                      /*split_on_script_change=*/false,
                                      /*icu_preserve_whitespace_tokens=*/true,
                                      /*preserve_floating_numbers=*/false);

  // clang-format off
  const std::vector<Token> expected_tokens = {
      Token("3.1", 0, 3),
      Token(" ", 3, 4),
      Token("3﹒2", 4, 7),
      Token(" ", 7, 8),
      Token("3.3", 8, 11),
  };
  // clang-format on

  const std::vector<Token> actual_tokens =
      icu_tokenizer.Tokenize("3.1 3﹒2 3.3");
  ASSERT_EQ(actual_tokens, expected_tokens);
}
437*993b0882SAndroid Build Coastguard Worker #endif
438*993b0882SAndroid Build Coastguard Worker
439*993b0882SAndroid Build Coastguard Worker #if defined(TC3_TEST_ICU)
// Checks ICU tokenization of unspaced Thai text with whitespace tokens
// disabled: the input is expected to be segmented into the five tokens
// listed below, with contiguous codepoint spans.
TEST(TokenizerTest, ICUTokenize) {
  TestingTokenizerProxy icu_tokenizer(TokenizationType_ICU, {}, {},
                                      /*split_on_script_change=*/false,
                                      /*icu_preserve_whitespace_tokens=*/false,
                                      /*preserve_floating_numbers=*/false);

  // clang-format off
  const std::vector<Token> expected_tokens = {
      Token("พระบาท", 0, 6),
      Token("สมเด็จ", 6, 12),
      Token("พระ", 12, 15),
      Token("ปร", 15, 17),
      Token("มิ", 17, 19),
  };
  // clang-format on

  const std::vector<Token> actual_tokens =
      icu_tokenizer.Tokenize("พระบาทสมเด็จพระปรมิ");
  ASSERT_EQ(actual_tokens, expected_tokens);
}
455*993b0882SAndroid Build Coastguard Worker
// Checks TokenizationType_MIXED on text combining Japanese, accented Latin,
// Chinese and a URL. The tokenizer is configured with one whitespace-separator
// range (start=32, end=33) and four internal-tokenizer codepoint ranges that
// together run from 0 to 592. No whitespace tokens are expected in the output.
TEST(TokenizerTest, MixedTokenize) {
  // Single separator range: the space character.
  std::vector<TokenizationCodepointRangeT> separator_ranges;
  separator_ranges.emplace_back();
  TokenizationCodepointRangeT& space_range = separator_ranges.back();
  space_range.start = 32;
  space_range.end = 33;
  space_range.role = TokenizationCodepointRange_::Role_WHITESPACE_SEPARATOR;

  // {start, end} pairs handed to the tokenizer as internal tokenizer ranges.
  const int kInternalRangeBounds[][2] = {
      {0, 128},
      {128, 256},
      {256, 384},
      {384, 592},
  };
  std::vector<CodepointRangeT> internal_ranges;
  for (const auto& bounds : kInternalRangeBounds) {
    internal_ranges.emplace_back();
    CodepointRangeT& range = internal_ranges.back();
    range.start = bounds[0];
    range.end = bounds[1];
  }

  TestingTokenizerProxy mixed_tokenizer(
      TokenizationType_MIXED, separator_ranges, internal_ranges,
      /*split_on_script_change=*/false,
      /*icu_preserve_whitespace_tokens=*/false,
      /*preserve_floating_numbers=*/false);

  // clang-format off
  const std::vector<Token> expected_tokens = {
      Token("こんにちは", 0, 5),
      Token("Japanese-ląnguagę", 5, 22),
      Token("text", 23, 27),
      Token("你好", 28, 30),
      Token("世界", 30, 32),
      Token("http://www.google.com/", 33, 55),
  };
  // clang-format on

  const std::vector<Token> actual_tokens = mixed_tokenizer.Tokenize(
      "こんにちはJapanese-ląnguagę text 你好世界 http://www.google.com/");
  ASSERT_EQ(actual_tokens, expected_tokens);
}
508*993b0882SAndroid Build Coastguard Worker
// Checks the effect of split_on_script_change for the internal tokenizer on
// Korean text with embedded digits: with the flag off the whole input is
// expected as one token, with the flag on it is expected to split into the
// Korean / digit / Korean runs.
TEST(TokenizerTest, InternalTokenizeOnScriptChange) {
  const char kKoreanWithDigits[] = "앨라배마123웹사이트";

  // One range (start=0, end=256) with the default role.
  std::vector<TokenizationCodepointRangeT> codepoint_ranges;
  codepoint_ranges.emplace_back();
  TokenizationCodepointRangeT& default_range = codepoint_ranges.back();
  default_range.start = 0;
  default_range.end = 256;
  default_range.role = TokenizationCodepointRange_::Role_DEFAULT_ROLE;

  // Script-change splitting disabled.
  TestingTokenizerProxy unsplit_tokenizer(
      TokenizationType_INTERNAL_TOKENIZER, codepoint_ranges, {},
      /*split_on_script_change=*/false,
      /*icu_preserve_whitespace_tokens=*/false,
      /*preserve_floating_numbers=*/false);
  const std::vector<Token> expected_unsplit = {
      Token("앨라배마123웹사이트", 0, 11)};
  EXPECT_EQ(unsplit_tokenizer.Tokenize(kKoreanWithDigits), expected_unsplit);

  // Script-change splitting enabled.
  TestingTokenizerProxy split_tokenizer(
      TokenizationType_INTERNAL_TOKENIZER, codepoint_ranges, {},
      /*split_on_script_change=*/true,
      /*icu_preserve_whitespace_tokens=*/false,
      /*preserve_floating_numbers=*/false);
  const std::vector<Token> expected_split = {
      Token("앨라배마", 0, 4), Token("123", 4, 7), Token("웹사이트", 7, 11)};
  EXPECT_EQ(split_tokenizer.Tokenize(kKoreanWithDigits), expected_split);
}
541*993b0882SAndroid Build Coastguard Worker #endif
542*993b0882SAndroid Build Coastguard Worker
// Checks LETTER_DIGIT tokenization with floating numbers preserved: "3.14"
// and "68.9" are expected as single tokens, whereas the dots around "18" in
// ".18." are expected as separate tokens, as are symbols and spaces.
TEST(TokenizerTest, LetterDigitTokenize) {
  TestingTokenizerProxy letter_digit_tokenizer(
      TokenizationType_LETTER_DIGIT, {}, {},
      /*split_on_script_change=*/false,
      /*icu_preserve_whitespace_tokens=*/false,
      /*preserve_floating_numbers=*/true);

  // clang-format off
  const std::vector<Token> expected_tokens = {
      Token("7", 0, 1),
      Token("%", 1, 2),
      Token(" ", 2, 3),
      Token("-", 3, 4),
      Token("3.14", 4, 8),
      Token(" ", 8, 9),
      Token("68.9", 9, 13),
      Token("#", 13, 14),
      Token("?", 14, 15),
      Token(" ", 15, 16),
      Token("7", 16, 17),
      Token("%", 17, 18),
      Token(" ", 18, 19),
      Token("$", 19, 20),
      Token("99", 20, 22),
      Token(" ", 22, 23),
      Token(".", 23, 24),
      Token("18", 24, 26),
      Token(".", 26, 27),
  };
  // clang-format on

  const std::vector<Token> actual_tokens =
      letter_digit_tokenizer.Tokenize("7% -3.14 68.9#? 7% $99 .18.");
  ASSERT_EQ(actual_tokens, expected_tokens);
}
559*993b0882SAndroid Build Coastguard Worker
// Checks LETTER_DIGIT tokenization of non-ASCII letters: the accented Latin
// word and the katakana word are each expected as one token, and the digit
// directly preceding the katakana is expected as a token of its own.
TEST(TokenizerTest, LetterDigitTokenizeUnicode) {
  TestingTokenizerProxy letter_digit_tokenizer(
      TokenizationType_LETTER_DIGIT, {}, {},
      /*split_on_script_change=*/false,
      /*icu_preserve_whitespace_tokens=*/false,
      /*preserve_floating_numbers=*/true);

  // clang-format off
  const std::vector<Token> expected_tokens = {
      Token("2", 0, 1),
      Token(" ", 1, 2),
      Token("pércént", 2, 9),
      Token(" ", 9, 10),
      Token("3", 10, 11),
      Token("パーセント", 11, 16),
  };
  // clang-format on

  const std::vector<Token> actual_tokens =
      letter_digit_tokenizer.Tokenize("2 pércént 3パーセント");
  ASSERT_EQ(actual_tokens, expected_tokens);
}
571*993b0882SAndroid Build Coastguard Worker
// Checks LETTER_DIGIT tokenization with floating numbers preserved for two
// dot variants: both "3﹒2" and "3.3" are expected as single tokens, with the
// trailing '%' split off.
TEST(TokenizerTest, LetterDigitTokenizeWithDots) {
  TestingTokenizerProxy letter_digit_tokenizer(
      TokenizationType_LETTER_DIGIT, {}, {},
      /*split_on_script_change=*/false,
      /*icu_preserve_whitespace_tokens=*/false,
      /*preserve_floating_numbers=*/true);

  // clang-format off
  const std::vector<Token> expected_tokens = {
      Token("3", 0, 1),
      Token(" ", 1, 2),
      Token("3﹒2", 2, 5),
      Token(" ", 5, 6),
      Token("3.3", 6, 9),
      Token("%", 9, 10),
  };
  // clang-format on

  const std::vector<Token> actual_tokens =
      letter_digit_tokenizer.Tokenize("3 3﹒2 3.3%");
  ASSERT_EQ(actual_tokens, expected_tokens);
}
583*993b0882SAndroid Build Coastguard Worker
// Checks LETTER_DIGIT tokenization with floating-number preservation turned
// off: every '.' (in the date and in "3.2") and the apostrophe are expected
// as standalone tokens between the digit / letter runs.
TEST(TokenizerTest, LetterDigitTokenizeDoNotPreserveFloatingNumbers) {
  TestingTokenizerProxy letter_digit_tokenizer(
      TokenizationType_LETTER_DIGIT, {}, {},
      /*split_on_script_change=*/false,
      /*icu_preserve_whitespace_tokens=*/false,
      /*preserve_floating_numbers=*/false);

  // clang-format off
  const std::vector<Token> expected_tokens = {
      Token("15", 0, 2),
      Token(".", 2, 3),
      Token("12", 3, 5),
      Token(".", 5, 6),
      Token("2019", 6, 10),
      Token(" ", 10, 11),
      Token("january", 11, 18),
      Token("'", 18, 19),
      Token("s", 19, 20),
      Token(" ", 20, 21),
      Token("3", 21, 22),
      Token(".", 22, 23),
      Token("2", 23, 24),
  };
  // clang-format on

  const std::vector<Token> actual_tokens =
      letter_digit_tokenizer.Tokenize("15.12.2019 january's 3.2");
  ASSERT_EQ(actual_tokens, expected_tokens);
}
598*993b0882SAndroid Build Coastguard Worker
// Checks LETTER_DIGIT tokenization of text glued together with '+' signs:
// each '+' (including the two adjacent ones) is expected as its own token,
// separating the letter and digit runs.
TEST(TokenizerTest, LetterDigitTokenizeStrangeStringFloatingNumbers) {
  TestingTokenizerProxy letter_digit_tokenizer(
      TokenizationType_LETTER_DIGIT, {}, {},
      /*split_on_script_change=*/false,
      /*icu_preserve_whitespace_tokens=*/false,
      /*preserve_floating_numbers=*/false);

  // clang-format off
  const std::vector<Token> expected_tokens = {
      Token("The", 0, 3),
      Token("+", 3, 4),
      Token("2345", 4, 8),
      Token("+", 8, 9),
      Token("+", 9, 10),
      Token("the", 10, 13),
      Token(" ", 13, 14),
      Token("+", 14, 15),
      Token("íí", 15, 17),
      Token("+", 17, 18),
  };
  // clang-format on

  const std::vector<Token> actual_tokens =
      letter_digit_tokenizer.Tokenize("The+2345++the +íí+");
  ASSERT_EQ(actual_tokens, expected_tokens);
}
612*993b0882SAndroid Build Coastguard Worker
// Checks that LETTER_DIGIT tokenization groups a run of consecutive spaces
// into a single whitespace token. The input separates the digits with runs of
// one, two and three spaces, so the whitespace tokens must cover one, two and
// three codepoints respectively.
//
// The run lengths in the input literal and in the expected token values are
// significant: they have to agree with the [start, end) spans (e.g. the token
// spanning 3..5 holds two spaces). Do not let a formatter or copy/paste
// collapse them.
//
// Note: the test name keeps its historical spelling ("Whitespces") so that
// existing test filters continue to match.
TEST(TokenizerTest, LetterDigitTokenizeWhitespcesInSameToken) {
  TestingTokenizerProxy tokenizer(TokenizationType_LETTER_DIGIT, {}, {},
                                  /*split_on_script_change=*/false,
                                  /*icu_preserve_whitespace_tokens=*/false,
                                  /*preserve_floating_numbers=*/false);

  // 10 codepoints: '2', 1 space, '3', 2 spaces, '4', 3 spaces, '5'.
  std::vector<Token> tokens = tokenizer.Tokenize("2 3  4   5");

  // clang-format off
  ASSERT_EQ(tokens,
            std::vector<Token>({Token("2", 0, 1),
                                Token(" ", 1, 2),
                                Token("3", 2, 3),
                                Token("  ", 3, 5),
                                Token("4", 5, 6),
                                Token("   ", 6, 9),
                                Token("5", 9, 10)}));
  // clang-format on
}
624*993b0882SAndroid Build Coastguard Worker
625*993b0882SAndroid Build Coastguard Worker } // namespace
626*993b0882SAndroid Build Coastguard Worker } // namespace libtextclassifier3
627