/*
 * Copyright (C) 2018 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
16*993b0882SAndroid Build Coastguard Worker
#include "utils/tokenizer.h"

#include <algorithm>
#include <utility>

#include "utils/base/logging.h"
#include "utils/base/macros.h"
#include "utils/strings/utf8.h"
#include "utils/utf8/unicodetext.h"
#include "absl/strings/string_view.h"
26*993b0882SAndroid Build Coastguard Worker
27*993b0882SAndroid Build Coastguard Worker namespace libtextclassifier3 {
28*993b0882SAndroid Build Coastguard Worker
Tokenizer(const TokenizationType type,const UniLib * unilib,const std::vector<const TokenizationCodepointRange * > & codepoint_ranges,const std::vector<const CodepointRange * > & internal_tokenizer_codepoint_ranges,const bool split_on_script_change,const bool icu_preserve_whitespace_tokens,const bool preserve_floating_numbers)29*993b0882SAndroid Build Coastguard Worker Tokenizer::Tokenizer(
30*993b0882SAndroid Build Coastguard Worker const TokenizationType type, const UniLib* unilib,
31*993b0882SAndroid Build Coastguard Worker const std::vector<const TokenizationCodepointRange*>& codepoint_ranges,
32*993b0882SAndroid Build Coastguard Worker const std::vector<const CodepointRange*>&
33*993b0882SAndroid Build Coastguard Worker internal_tokenizer_codepoint_ranges,
34*993b0882SAndroid Build Coastguard Worker const bool split_on_script_change,
35*993b0882SAndroid Build Coastguard Worker const bool icu_preserve_whitespace_tokens,
36*993b0882SAndroid Build Coastguard Worker const bool preserve_floating_numbers)
37*993b0882SAndroid Build Coastguard Worker : type_(type),
38*993b0882SAndroid Build Coastguard Worker unilib_(unilib),
39*993b0882SAndroid Build Coastguard Worker split_on_script_change_(split_on_script_change),
40*993b0882SAndroid Build Coastguard Worker icu_preserve_whitespace_tokens_(icu_preserve_whitespace_tokens),
41*993b0882SAndroid Build Coastguard Worker preserve_floating_numbers_(preserve_floating_numbers) {
42*993b0882SAndroid Build Coastguard Worker for (const TokenizationCodepointRange* range : codepoint_ranges) {
43*993b0882SAndroid Build Coastguard Worker codepoint_ranges_.emplace_back(range->UnPack());
44*993b0882SAndroid Build Coastguard Worker }
45*993b0882SAndroid Build Coastguard Worker
46*993b0882SAndroid Build Coastguard Worker std::stable_sort(
47*993b0882SAndroid Build Coastguard Worker codepoint_ranges_.begin(), codepoint_ranges_.end(),
48*993b0882SAndroid Build Coastguard Worker [](const std::unique_ptr<const TokenizationCodepointRangeT>& a,
49*993b0882SAndroid Build Coastguard Worker const std::unique_ptr<const TokenizationCodepointRangeT>& b) {
50*993b0882SAndroid Build Coastguard Worker return a->start < b->start;
51*993b0882SAndroid Build Coastguard Worker });
52*993b0882SAndroid Build Coastguard Worker
53*993b0882SAndroid Build Coastguard Worker SortCodepointRanges(internal_tokenizer_codepoint_ranges,
54*993b0882SAndroid Build Coastguard Worker &internal_tokenizer_codepoint_ranges_);
55*993b0882SAndroid Build Coastguard Worker if (type_ == TokenizationType_MIXED && split_on_script_change) {
56*993b0882SAndroid Build Coastguard Worker TC3_LOG(ERROR) << "The option `split_on_script_change` is unavailable for "
57*993b0882SAndroid Build Coastguard Worker "the selected tokenizer type (mixed).";
58*993b0882SAndroid Build Coastguard Worker }
59*993b0882SAndroid Build Coastguard Worker }
60*993b0882SAndroid Build Coastguard Worker
FindTokenizationRange(int codepoint) const61*993b0882SAndroid Build Coastguard Worker const TokenizationCodepointRangeT* Tokenizer::FindTokenizationRange(
62*993b0882SAndroid Build Coastguard Worker int codepoint) const {
63*993b0882SAndroid Build Coastguard Worker auto it = std::lower_bound(
64*993b0882SAndroid Build Coastguard Worker codepoint_ranges_.begin(), codepoint_ranges_.end(), codepoint,
65*993b0882SAndroid Build Coastguard Worker [](const std::unique_ptr<const TokenizationCodepointRangeT>& range,
66*993b0882SAndroid Build Coastguard Worker int codepoint) {
67*993b0882SAndroid Build Coastguard Worker // This function compares range with the codepoint for the purpose of
68*993b0882SAndroid Build Coastguard Worker // finding the first greater or equal range. Because of the use of
69*993b0882SAndroid Build Coastguard Worker // std::lower_bound it needs to return true when range < codepoint;
70*993b0882SAndroid Build Coastguard Worker // the first time it will return false the lower bound is found and
71*993b0882SAndroid Build Coastguard Worker // returned.
72*993b0882SAndroid Build Coastguard Worker //
73*993b0882SAndroid Build Coastguard Worker // It might seem weird that the condition is range.end <= codepoint
74*993b0882SAndroid Build Coastguard Worker // here but when codepoint == range.end it means it's actually just
75*993b0882SAndroid Build Coastguard Worker // outside of the range, thus the range is less than the codepoint.
76*993b0882SAndroid Build Coastguard Worker return range->end <= codepoint;
77*993b0882SAndroid Build Coastguard Worker });
78*993b0882SAndroid Build Coastguard Worker if (it != codepoint_ranges_.end() && (*it)->start <= codepoint &&
79*993b0882SAndroid Build Coastguard Worker (*it)->end > codepoint) {
80*993b0882SAndroid Build Coastguard Worker return it->get();
81*993b0882SAndroid Build Coastguard Worker } else {
82*993b0882SAndroid Build Coastguard Worker return nullptr;
83*993b0882SAndroid Build Coastguard Worker }
84*993b0882SAndroid Build Coastguard Worker }
85*993b0882SAndroid Build Coastguard Worker
GetScriptAndRole(char32 codepoint,TokenizationCodepointRange_::Role * role,int * script) const86*993b0882SAndroid Build Coastguard Worker void Tokenizer::GetScriptAndRole(char32 codepoint,
87*993b0882SAndroid Build Coastguard Worker TokenizationCodepointRange_::Role* role,
88*993b0882SAndroid Build Coastguard Worker int* script) const {
89*993b0882SAndroid Build Coastguard Worker const TokenizationCodepointRangeT* range = FindTokenizationRange(codepoint);
90*993b0882SAndroid Build Coastguard Worker if (range) {
91*993b0882SAndroid Build Coastguard Worker *role = range->role;
92*993b0882SAndroid Build Coastguard Worker *script = range->script_id;
93*993b0882SAndroid Build Coastguard Worker } else {
94*993b0882SAndroid Build Coastguard Worker *role = TokenizationCodepointRange_::Role_DEFAULT_ROLE;
95*993b0882SAndroid Build Coastguard Worker *script = kUnknownScript;
96*993b0882SAndroid Build Coastguard Worker }
97*993b0882SAndroid Build Coastguard Worker }
98*993b0882SAndroid Build Coastguard Worker
Tokenize(absl::string_view text) const99*993b0882SAndroid Build Coastguard Worker std::vector<Token> Tokenizer::Tokenize(absl::string_view text) const {
100*993b0882SAndroid Build Coastguard Worker UnicodeText text_unicode = UTF8ToUnicodeText(text, /*do_copy=*/false);
101*993b0882SAndroid Build Coastguard Worker return Tokenize(text_unicode);
102*993b0882SAndroid Build Coastguard Worker }
103*993b0882SAndroid Build Coastguard Worker
Tokenize(const UnicodeText & text_unicode) const104*993b0882SAndroid Build Coastguard Worker std::vector<Token> Tokenizer::Tokenize(const UnicodeText& text_unicode) const {
105*993b0882SAndroid Build Coastguard Worker switch (type_) {
106*993b0882SAndroid Build Coastguard Worker case TokenizationType_INTERNAL_TOKENIZER:
107*993b0882SAndroid Build Coastguard Worker return InternalTokenize(text_unicode);
108*993b0882SAndroid Build Coastguard Worker case TokenizationType_ICU:
109*993b0882SAndroid Build Coastguard Worker TC3_FALLTHROUGH_INTENDED;
110*993b0882SAndroid Build Coastguard Worker case TokenizationType_MIXED: {
111*993b0882SAndroid Build Coastguard Worker std::vector<Token> result;
112*993b0882SAndroid Build Coastguard Worker if (!ICUTokenize(text_unicode, &result)) {
113*993b0882SAndroid Build Coastguard Worker return {};
114*993b0882SAndroid Build Coastguard Worker }
115*993b0882SAndroid Build Coastguard Worker if (type_ == TokenizationType_MIXED) {
116*993b0882SAndroid Build Coastguard Worker InternalRetokenize(text_unicode, &result);
117*993b0882SAndroid Build Coastguard Worker }
118*993b0882SAndroid Build Coastguard Worker return result;
119*993b0882SAndroid Build Coastguard Worker }
120*993b0882SAndroid Build Coastguard Worker case TokenizationType_LETTER_DIGIT: {
121*993b0882SAndroid Build Coastguard Worker std::vector<Token> result;
122*993b0882SAndroid Build Coastguard Worker if (!NumberTokenize(text_unicode, &result)) {
123*993b0882SAndroid Build Coastguard Worker return {};
124*993b0882SAndroid Build Coastguard Worker }
125*993b0882SAndroid Build Coastguard Worker return result;
126*993b0882SAndroid Build Coastguard Worker }
127*993b0882SAndroid Build Coastguard Worker default:
128*993b0882SAndroid Build Coastguard Worker TC3_LOG(ERROR) << "Unknown tokenization type specified. Using internal.";
129*993b0882SAndroid Build Coastguard Worker return InternalTokenize(text_unicode);
130*993b0882SAndroid Build Coastguard Worker }
131*993b0882SAndroid Build Coastguard Worker }
132*993b0882SAndroid Build Coastguard Worker
AppendCodepointToToken(UnicodeText::const_iterator it,Token * token)133*993b0882SAndroid Build Coastguard Worker void AppendCodepointToToken(UnicodeText::const_iterator it, Token* token) {
134*993b0882SAndroid Build Coastguard Worker token->value += std::string(
135*993b0882SAndroid Build Coastguard Worker it.utf8_data(), it.utf8_data() + GetNumBytesForUTF8Char(it.utf8_data()));
136*993b0882SAndroid Build Coastguard Worker }
137*993b0882SAndroid Build Coastguard Worker
InternalTokenize(const UnicodeText & text_unicode) const138*993b0882SAndroid Build Coastguard Worker std::vector<Token> Tokenizer::InternalTokenize(
139*993b0882SAndroid Build Coastguard Worker const UnicodeText& text_unicode) const {
140*993b0882SAndroid Build Coastguard Worker std::vector<Token> result;
141*993b0882SAndroid Build Coastguard Worker Token new_token("", 0, 0);
142*993b0882SAndroid Build Coastguard Worker int codepoint_index = 0;
143*993b0882SAndroid Build Coastguard Worker
144*993b0882SAndroid Build Coastguard Worker int last_script = kInvalidScript;
145*993b0882SAndroid Build Coastguard Worker for (auto it = text_unicode.begin(); it != text_unicode.end();
146*993b0882SAndroid Build Coastguard Worker ++it, ++codepoint_index) {
147*993b0882SAndroid Build Coastguard Worker TokenizationCodepointRange_::Role role;
148*993b0882SAndroid Build Coastguard Worker int script;
149*993b0882SAndroid Build Coastguard Worker GetScriptAndRole(*it, &role, &script);
150*993b0882SAndroid Build Coastguard Worker
151*993b0882SAndroid Build Coastguard Worker if (role & TokenizationCodepointRange_::Role_SPLIT_BEFORE ||
152*993b0882SAndroid Build Coastguard Worker (split_on_script_change_ && last_script != kInvalidScript &&
153*993b0882SAndroid Build Coastguard Worker last_script != script)) {
154*993b0882SAndroid Build Coastguard Worker if (!new_token.value.empty()) {
155*993b0882SAndroid Build Coastguard Worker result.push_back(new_token);
156*993b0882SAndroid Build Coastguard Worker }
157*993b0882SAndroid Build Coastguard Worker new_token = Token("", codepoint_index, codepoint_index);
158*993b0882SAndroid Build Coastguard Worker }
159*993b0882SAndroid Build Coastguard Worker if (!(role & TokenizationCodepointRange_::Role_DISCARD_CODEPOINT)) {
160*993b0882SAndroid Build Coastguard Worker new_token.end += 1;
161*993b0882SAndroid Build Coastguard Worker AppendCodepointToToken(it, &new_token);
162*993b0882SAndroid Build Coastguard Worker }
163*993b0882SAndroid Build Coastguard Worker if (role & TokenizationCodepointRange_::Role_SPLIT_AFTER) {
164*993b0882SAndroid Build Coastguard Worker if (!new_token.value.empty()) {
165*993b0882SAndroid Build Coastguard Worker result.push_back(new_token);
166*993b0882SAndroid Build Coastguard Worker }
167*993b0882SAndroid Build Coastguard Worker new_token = Token("", codepoint_index + 1, codepoint_index + 1);
168*993b0882SAndroid Build Coastguard Worker }
169*993b0882SAndroid Build Coastguard Worker
170*993b0882SAndroid Build Coastguard Worker last_script = script;
171*993b0882SAndroid Build Coastguard Worker }
172*993b0882SAndroid Build Coastguard Worker if (!new_token.value.empty()) {
173*993b0882SAndroid Build Coastguard Worker result.push_back(new_token);
174*993b0882SAndroid Build Coastguard Worker }
175*993b0882SAndroid Build Coastguard Worker
176*993b0882SAndroid Build Coastguard Worker return result;
177*993b0882SAndroid Build Coastguard Worker }
178*993b0882SAndroid Build Coastguard Worker
TokenizeSubstring(const UnicodeText & unicode_text,CodepointSpan span,std::vector<Token> * result) const179*993b0882SAndroid Build Coastguard Worker void Tokenizer::TokenizeSubstring(const UnicodeText& unicode_text,
180*993b0882SAndroid Build Coastguard Worker CodepointSpan span,
181*993b0882SAndroid Build Coastguard Worker std::vector<Token>* result) const {
182*993b0882SAndroid Build Coastguard Worker if (span.first < 0) {
183*993b0882SAndroid Build Coastguard Worker // There is no span to tokenize.
184*993b0882SAndroid Build Coastguard Worker return;
185*993b0882SAndroid Build Coastguard Worker }
186*993b0882SAndroid Build Coastguard Worker
187*993b0882SAndroid Build Coastguard Worker // Extract the substring.
188*993b0882SAndroid Build Coastguard Worker UnicodeText text = UnicodeText::Substring(unicode_text, span.first,
189*993b0882SAndroid Build Coastguard Worker span.second, /*do_copy=*/false);
190*993b0882SAndroid Build Coastguard Worker
191*993b0882SAndroid Build Coastguard Worker // Run the tokenizer and update the token bounds to reflect the offset of the
192*993b0882SAndroid Build Coastguard Worker // substring.
193*993b0882SAndroid Build Coastguard Worker std::vector<Token> tokens = InternalTokenize(text);
194*993b0882SAndroid Build Coastguard Worker
195*993b0882SAndroid Build Coastguard Worker // Avoids progressive capacity increases in the for loop.
196*993b0882SAndroid Build Coastguard Worker result->reserve(result->size() + tokens.size());
197*993b0882SAndroid Build Coastguard Worker for (Token& token : tokens) {
198*993b0882SAndroid Build Coastguard Worker token.start += span.first;
199*993b0882SAndroid Build Coastguard Worker token.end += span.first;
200*993b0882SAndroid Build Coastguard Worker result->emplace_back(std::move(token));
201*993b0882SAndroid Build Coastguard Worker }
202*993b0882SAndroid Build Coastguard Worker }
203*993b0882SAndroid Build Coastguard Worker
InternalRetokenize(const UnicodeText & unicode_text,std::vector<Token> * tokens) const204*993b0882SAndroid Build Coastguard Worker void Tokenizer::InternalRetokenize(const UnicodeText& unicode_text,
205*993b0882SAndroid Build Coastguard Worker std::vector<Token>* tokens) const {
206*993b0882SAndroid Build Coastguard Worker std::vector<Token> result;
207*993b0882SAndroid Build Coastguard Worker CodepointSpan span(-1, -1);
208*993b0882SAndroid Build Coastguard Worker for (Token& token : *tokens) {
209*993b0882SAndroid Build Coastguard Worker const UnicodeText unicode_token_value =
210*993b0882SAndroid Build Coastguard Worker UTF8ToUnicodeText(token.value, /*do_copy=*/false);
211*993b0882SAndroid Build Coastguard Worker bool should_retokenize = true;
212*993b0882SAndroid Build Coastguard Worker for (const int codepoint : unicode_token_value) {
213*993b0882SAndroid Build Coastguard Worker if (!IsCodepointInRanges(codepoint,
214*993b0882SAndroid Build Coastguard Worker internal_tokenizer_codepoint_ranges_)) {
215*993b0882SAndroid Build Coastguard Worker should_retokenize = false;
216*993b0882SAndroid Build Coastguard Worker break;
217*993b0882SAndroid Build Coastguard Worker }
218*993b0882SAndroid Build Coastguard Worker }
219*993b0882SAndroid Build Coastguard Worker
220*993b0882SAndroid Build Coastguard Worker if (should_retokenize) {
221*993b0882SAndroid Build Coastguard Worker if (span.first < 0) {
222*993b0882SAndroid Build Coastguard Worker span.first = token.start;
223*993b0882SAndroid Build Coastguard Worker }
224*993b0882SAndroid Build Coastguard Worker span.second = token.end;
225*993b0882SAndroid Build Coastguard Worker } else {
226*993b0882SAndroid Build Coastguard Worker TokenizeSubstring(unicode_text, span, &result);
227*993b0882SAndroid Build Coastguard Worker span.first = -1;
228*993b0882SAndroid Build Coastguard Worker result.emplace_back(std::move(token));
229*993b0882SAndroid Build Coastguard Worker }
230*993b0882SAndroid Build Coastguard Worker }
231*993b0882SAndroid Build Coastguard Worker TokenizeSubstring(unicode_text, span, &result);
232*993b0882SAndroid Build Coastguard Worker
233*993b0882SAndroid Build Coastguard Worker *tokens = std::move(result);
234*993b0882SAndroid Build Coastguard Worker }
235*993b0882SAndroid Build Coastguard Worker
ICUTokenize(const UnicodeText & context_unicode,std::vector<Token> * result) const236*993b0882SAndroid Build Coastguard Worker bool Tokenizer::ICUTokenize(const UnicodeText& context_unicode,
237*993b0882SAndroid Build Coastguard Worker std::vector<Token>* result) const {
238*993b0882SAndroid Build Coastguard Worker std::unique_ptr<UniLib::BreakIterator> break_iterator =
239*993b0882SAndroid Build Coastguard Worker unilib_->CreateBreakIterator(context_unicode);
240*993b0882SAndroid Build Coastguard Worker if (!break_iterator) {
241*993b0882SAndroid Build Coastguard Worker return false;
242*993b0882SAndroid Build Coastguard Worker }
243*993b0882SAndroid Build Coastguard Worker const int context_unicode_size = context_unicode.size_codepoints();
244*993b0882SAndroid Build Coastguard Worker int last_unicode_index = 0;
245*993b0882SAndroid Build Coastguard Worker int unicode_index = 0;
246*993b0882SAndroid Build Coastguard Worker auto token_begin_it = context_unicode.begin();
247*993b0882SAndroid Build Coastguard Worker while ((unicode_index = break_iterator->Next()) !=
248*993b0882SAndroid Build Coastguard Worker UniLib::BreakIterator::kDone) {
249*993b0882SAndroid Build Coastguard Worker const int token_length = unicode_index - last_unicode_index;
250*993b0882SAndroid Build Coastguard Worker if (token_length + last_unicode_index > context_unicode_size) {
251*993b0882SAndroid Build Coastguard Worker return false;
252*993b0882SAndroid Build Coastguard Worker }
253*993b0882SAndroid Build Coastguard Worker
254*993b0882SAndroid Build Coastguard Worker auto token_end_it = token_begin_it;
255*993b0882SAndroid Build Coastguard Worker std::advance(token_end_it, token_length);
256*993b0882SAndroid Build Coastguard Worker TC3_CHECK(token_end_it <= context_unicode.end());
257*993b0882SAndroid Build Coastguard Worker
258*993b0882SAndroid Build Coastguard Worker // Determine if the whole token is whitespace.
259*993b0882SAndroid Build Coastguard Worker bool is_whitespace = true;
260*993b0882SAndroid Build Coastguard Worker for (auto char_it = token_begin_it; char_it < token_end_it; ++char_it) {
261*993b0882SAndroid Build Coastguard Worker if (!unilib_->IsWhitespace(*char_it)) {
262*993b0882SAndroid Build Coastguard Worker is_whitespace = false;
263*993b0882SAndroid Build Coastguard Worker break;
264*993b0882SAndroid Build Coastguard Worker }
265*993b0882SAndroid Build Coastguard Worker }
266*993b0882SAndroid Build Coastguard Worker
267*993b0882SAndroid Build Coastguard Worker const std::string token =
268*993b0882SAndroid Build Coastguard Worker context_unicode.UTF8Substring(token_begin_it, token_end_it);
269*993b0882SAndroid Build Coastguard Worker
270*993b0882SAndroid Build Coastguard Worker if (!is_whitespace || icu_preserve_whitespace_tokens_) {
271*993b0882SAndroid Build Coastguard Worker result->push_back(Token(token, last_unicode_index, unicode_index,
272*993b0882SAndroid Build Coastguard Worker /*is_padding=*/false, is_whitespace));
273*993b0882SAndroid Build Coastguard Worker }
274*993b0882SAndroid Build Coastguard Worker
275*993b0882SAndroid Build Coastguard Worker last_unicode_index = unicode_index;
276*993b0882SAndroid Build Coastguard Worker token_begin_it = token_end_it;
277*993b0882SAndroid Build Coastguard Worker }
278*993b0882SAndroid Build Coastguard Worker
279*993b0882SAndroid Build Coastguard Worker return true;
280*993b0882SAndroid Build Coastguard Worker }
281*993b0882SAndroid Build Coastguard Worker
NumberTokenize(const UnicodeText & text_unicode,std::vector<Token> * result) const282*993b0882SAndroid Build Coastguard Worker bool Tokenizer::NumberTokenize(const UnicodeText& text_unicode,
283*993b0882SAndroid Build Coastguard Worker std::vector<Token>* result) const {
284*993b0882SAndroid Build Coastguard Worker Token new_token("", 0, 0);
285*993b0882SAndroid Build Coastguard Worker NumberTokenType current_token_type = NOT_SET;
286*993b0882SAndroid Build Coastguard Worker int codepoint_index = 0;
287*993b0882SAndroid Build Coastguard Worker
288*993b0882SAndroid Build Coastguard Worker auto PushToken = [&new_token, result]() {
289*993b0882SAndroid Build Coastguard Worker if (!new_token.value.empty()) {
290*993b0882SAndroid Build Coastguard Worker result->push_back(new_token);
291*993b0882SAndroid Build Coastguard Worker }
292*993b0882SAndroid Build Coastguard Worker };
293*993b0882SAndroid Build Coastguard Worker
294*993b0882SAndroid Build Coastguard Worker auto MaybeResetTokenAndAddChar =
295*993b0882SAndroid Build Coastguard Worker [&new_token, PushToken, ¤t_token_type](
296*993b0882SAndroid Build Coastguard Worker int codepoint_index, NumberTokenType token_type,
297*993b0882SAndroid Build Coastguard Worker UnicodeText::const_iterator it, bool is_whitespace = false) {
298*993b0882SAndroid Build Coastguard Worker if (current_token_type != token_type) {
299*993b0882SAndroid Build Coastguard Worker PushToken();
300*993b0882SAndroid Build Coastguard Worker new_token = Token("", codepoint_index, codepoint_index,
301*993b0882SAndroid Build Coastguard Worker /*is_padding=*/false, is_whitespace);
302*993b0882SAndroid Build Coastguard Worker }
303*993b0882SAndroid Build Coastguard Worker new_token.end += 1;
304*993b0882SAndroid Build Coastguard Worker AppendCodepointToToken(it, &new_token);
305*993b0882SAndroid Build Coastguard Worker current_token_type = token_type;
306*993b0882SAndroid Build Coastguard Worker };
307*993b0882SAndroid Build Coastguard Worker
308*993b0882SAndroid Build Coastguard Worker auto FinishTokenAndAddSeparator =
309*993b0882SAndroid Build Coastguard Worker [&new_token, result, ¤t_token_type, PushToken](
310*993b0882SAndroid Build Coastguard Worker int codepoint_index, UnicodeText::const_iterator it) {
311*993b0882SAndroid Build Coastguard Worker PushToken();
312*993b0882SAndroid Build Coastguard Worker
313*993b0882SAndroid Build Coastguard Worker result->emplace_back("", codepoint_index, codepoint_index + 1);
314*993b0882SAndroid Build Coastguard Worker AppendCodepointToToken(it, &result->back());
315*993b0882SAndroid Build Coastguard Worker
316*993b0882SAndroid Build Coastguard Worker new_token = Token("", codepoint_index + 1, codepoint_index + 1);
317*993b0882SAndroid Build Coastguard Worker current_token_type = NOT_SET;
318*993b0882SAndroid Build Coastguard Worker };
319*993b0882SAndroid Build Coastguard Worker
320*993b0882SAndroid Build Coastguard Worker for (auto it = text_unicode.begin(); it != text_unicode.end();
321*993b0882SAndroid Build Coastguard Worker ++it, ++codepoint_index) {
322*993b0882SAndroid Build Coastguard Worker if (unilib_->IsDigit(*it)) {
323*993b0882SAndroid Build Coastguard Worker MaybeResetTokenAndAddChar(codepoint_index, NUMERICAL, it);
324*993b0882SAndroid Build Coastguard Worker } else if (unilib_->IsLetter(*it)) {
325*993b0882SAndroid Build Coastguard Worker MaybeResetTokenAndAddChar(codepoint_index, TERM, it);
326*993b0882SAndroid Build Coastguard Worker } else if (unilib_->IsWhitespace(*it)) {
327*993b0882SAndroid Build Coastguard Worker MaybeResetTokenAndAddChar(codepoint_index, WHITESPACE, it,
328*993b0882SAndroid Build Coastguard Worker /*is_whitespace=*/true);
329*993b0882SAndroid Build Coastguard Worker } else if (unilib_->IsDot(*it) && preserve_floating_numbers_) {
330*993b0882SAndroid Build Coastguard Worker auto it_next = std::next(it);
331*993b0882SAndroid Build Coastguard Worker if (current_token_type == NUMERICAL && it_next != text_unicode.end() &&
332*993b0882SAndroid Build Coastguard Worker unilib_->IsDigit(*it_next)) {
333*993b0882SAndroid Build Coastguard Worker new_token.end += 1;
334*993b0882SAndroid Build Coastguard Worker AppendCodepointToToken(it, &new_token);
335*993b0882SAndroid Build Coastguard Worker } else {
336*993b0882SAndroid Build Coastguard Worker // If the current token is not a number or dot at the end or followed
337*993b0882SAndroid Build Coastguard Worker // by a non digit => separate token
338*993b0882SAndroid Build Coastguard Worker FinishTokenAndAddSeparator(codepoint_index, it);
339*993b0882SAndroid Build Coastguard Worker }
340*993b0882SAndroid Build Coastguard Worker } else {
341*993b0882SAndroid Build Coastguard Worker FinishTokenAndAddSeparator(codepoint_index, it);
342*993b0882SAndroid Build Coastguard Worker }
343*993b0882SAndroid Build Coastguard Worker }
344*993b0882SAndroid Build Coastguard Worker PushToken();
345*993b0882SAndroid Build Coastguard Worker
346*993b0882SAndroid Build Coastguard Worker return true;
347*993b0882SAndroid Build Coastguard Worker }
348*993b0882SAndroid Build Coastguard Worker
349*993b0882SAndroid Build Coastguard Worker } // namespace libtextclassifier3
350