xref: /aosp_15_r20/external/libtextclassifier/native/utils/tokenizer.cc (revision 993b0882672172b81d12fad7a7ac0c3e5c824a12)
1*993b0882SAndroid Build Coastguard Worker /*
2*993b0882SAndroid Build Coastguard Worker  * Copyright (C) 2018 The Android Open Source Project
3*993b0882SAndroid Build Coastguard Worker  *
4*993b0882SAndroid Build Coastguard Worker  * Licensed under the Apache License, Version 2.0 (the "License");
5*993b0882SAndroid Build Coastguard Worker  * you may not use this file except in compliance with the License.
6*993b0882SAndroid Build Coastguard Worker  * You may obtain a copy of the License at
7*993b0882SAndroid Build Coastguard Worker  *
8*993b0882SAndroid Build Coastguard Worker  *      http://www.apache.org/licenses/LICENSE-2.0
9*993b0882SAndroid Build Coastguard Worker  *
10*993b0882SAndroid Build Coastguard Worker  * Unless required by applicable law or agreed to in writing, software
11*993b0882SAndroid Build Coastguard Worker  * distributed under the License is distributed on an "AS IS" BASIS,
12*993b0882SAndroid Build Coastguard Worker  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13*993b0882SAndroid Build Coastguard Worker  * See the License for the specific language governing permissions and
14*993b0882SAndroid Build Coastguard Worker  * limitations under the License.
15*993b0882SAndroid Build Coastguard Worker  */
16*993b0882SAndroid Build Coastguard Worker 
17*993b0882SAndroid Build Coastguard Worker #include "utils/tokenizer.h"
18*993b0882SAndroid Build Coastguard Worker 
19*993b0882SAndroid Build Coastguard Worker #include <algorithm>
20*993b0882SAndroid Build Coastguard Worker 
21*993b0882SAndroid Build Coastguard Worker #include "utils/base/logging.h"
22*993b0882SAndroid Build Coastguard Worker #include "utils/base/macros.h"
23*993b0882SAndroid Build Coastguard Worker #include "utils/strings/utf8.h"
24*993b0882SAndroid Build Coastguard Worker #include "utils/utf8/unicodetext.h"
25*993b0882SAndroid Build Coastguard Worker #include "absl/strings/string_view.h"
26*993b0882SAndroid Build Coastguard Worker 
27*993b0882SAndroid Build Coastguard Worker namespace libtextclassifier3 {
28*993b0882SAndroid Build Coastguard Worker 
Tokenizer(const TokenizationType type,const UniLib * unilib,const std::vector<const TokenizationCodepointRange * > & codepoint_ranges,const std::vector<const CodepointRange * > & internal_tokenizer_codepoint_ranges,const bool split_on_script_change,const bool icu_preserve_whitespace_tokens,const bool preserve_floating_numbers)29*993b0882SAndroid Build Coastguard Worker Tokenizer::Tokenizer(
30*993b0882SAndroid Build Coastguard Worker     const TokenizationType type, const UniLib* unilib,
31*993b0882SAndroid Build Coastguard Worker     const std::vector<const TokenizationCodepointRange*>& codepoint_ranges,
32*993b0882SAndroid Build Coastguard Worker     const std::vector<const CodepointRange*>&
33*993b0882SAndroid Build Coastguard Worker         internal_tokenizer_codepoint_ranges,
34*993b0882SAndroid Build Coastguard Worker     const bool split_on_script_change,
35*993b0882SAndroid Build Coastguard Worker     const bool icu_preserve_whitespace_tokens,
36*993b0882SAndroid Build Coastguard Worker     const bool preserve_floating_numbers)
37*993b0882SAndroid Build Coastguard Worker     : type_(type),
38*993b0882SAndroid Build Coastguard Worker       unilib_(unilib),
39*993b0882SAndroid Build Coastguard Worker       split_on_script_change_(split_on_script_change),
40*993b0882SAndroid Build Coastguard Worker       icu_preserve_whitespace_tokens_(icu_preserve_whitespace_tokens),
41*993b0882SAndroid Build Coastguard Worker       preserve_floating_numbers_(preserve_floating_numbers) {
42*993b0882SAndroid Build Coastguard Worker   for (const TokenizationCodepointRange* range : codepoint_ranges) {
43*993b0882SAndroid Build Coastguard Worker     codepoint_ranges_.emplace_back(range->UnPack());
44*993b0882SAndroid Build Coastguard Worker   }
45*993b0882SAndroid Build Coastguard Worker 
46*993b0882SAndroid Build Coastguard Worker   std::stable_sort(
47*993b0882SAndroid Build Coastguard Worker       codepoint_ranges_.begin(), codepoint_ranges_.end(),
48*993b0882SAndroid Build Coastguard Worker       [](const std::unique_ptr<const TokenizationCodepointRangeT>& a,
49*993b0882SAndroid Build Coastguard Worker          const std::unique_ptr<const TokenizationCodepointRangeT>& b) {
50*993b0882SAndroid Build Coastguard Worker         return a->start < b->start;
51*993b0882SAndroid Build Coastguard Worker       });
52*993b0882SAndroid Build Coastguard Worker 
53*993b0882SAndroid Build Coastguard Worker   SortCodepointRanges(internal_tokenizer_codepoint_ranges,
54*993b0882SAndroid Build Coastguard Worker                       &internal_tokenizer_codepoint_ranges_);
55*993b0882SAndroid Build Coastguard Worker   if (type_ == TokenizationType_MIXED && split_on_script_change) {
56*993b0882SAndroid Build Coastguard Worker     TC3_LOG(ERROR) << "The option `split_on_script_change` is unavailable for "
57*993b0882SAndroid Build Coastguard Worker                       "the selected tokenizer type (mixed).";
58*993b0882SAndroid Build Coastguard Worker   }
59*993b0882SAndroid Build Coastguard Worker }
60*993b0882SAndroid Build Coastguard Worker 
FindTokenizationRange(int codepoint) const61*993b0882SAndroid Build Coastguard Worker const TokenizationCodepointRangeT* Tokenizer::FindTokenizationRange(
62*993b0882SAndroid Build Coastguard Worker     int codepoint) const {
63*993b0882SAndroid Build Coastguard Worker   auto it = std::lower_bound(
64*993b0882SAndroid Build Coastguard Worker       codepoint_ranges_.begin(), codepoint_ranges_.end(), codepoint,
65*993b0882SAndroid Build Coastguard Worker       [](const std::unique_ptr<const TokenizationCodepointRangeT>& range,
66*993b0882SAndroid Build Coastguard Worker          int codepoint) {
67*993b0882SAndroid Build Coastguard Worker         // This function compares range with the codepoint for the purpose of
68*993b0882SAndroid Build Coastguard Worker         // finding the first greater or equal range. Because of the use of
69*993b0882SAndroid Build Coastguard Worker         // std::lower_bound it needs to return true when range < codepoint;
70*993b0882SAndroid Build Coastguard Worker         // the first time it will return false the lower bound is found and
71*993b0882SAndroid Build Coastguard Worker         // returned.
72*993b0882SAndroid Build Coastguard Worker         //
73*993b0882SAndroid Build Coastguard Worker         // It might seem weird that the condition is range.end <= codepoint
74*993b0882SAndroid Build Coastguard Worker         // here but when codepoint == range.end it means it's actually just
75*993b0882SAndroid Build Coastguard Worker         // outside of the range, thus the range is less than the codepoint.
76*993b0882SAndroid Build Coastguard Worker         return range->end <= codepoint;
77*993b0882SAndroid Build Coastguard Worker       });
78*993b0882SAndroid Build Coastguard Worker   if (it != codepoint_ranges_.end() && (*it)->start <= codepoint &&
79*993b0882SAndroid Build Coastguard Worker       (*it)->end > codepoint) {
80*993b0882SAndroid Build Coastguard Worker     return it->get();
81*993b0882SAndroid Build Coastguard Worker   } else {
82*993b0882SAndroid Build Coastguard Worker     return nullptr;
83*993b0882SAndroid Build Coastguard Worker   }
84*993b0882SAndroid Build Coastguard Worker }
85*993b0882SAndroid Build Coastguard Worker 
GetScriptAndRole(char32 codepoint,TokenizationCodepointRange_::Role * role,int * script) const86*993b0882SAndroid Build Coastguard Worker void Tokenizer::GetScriptAndRole(char32 codepoint,
87*993b0882SAndroid Build Coastguard Worker                                  TokenizationCodepointRange_::Role* role,
88*993b0882SAndroid Build Coastguard Worker                                  int* script) const {
89*993b0882SAndroid Build Coastguard Worker   const TokenizationCodepointRangeT* range = FindTokenizationRange(codepoint);
90*993b0882SAndroid Build Coastguard Worker   if (range) {
91*993b0882SAndroid Build Coastguard Worker     *role = range->role;
92*993b0882SAndroid Build Coastguard Worker     *script = range->script_id;
93*993b0882SAndroid Build Coastguard Worker   } else {
94*993b0882SAndroid Build Coastguard Worker     *role = TokenizationCodepointRange_::Role_DEFAULT_ROLE;
95*993b0882SAndroid Build Coastguard Worker     *script = kUnknownScript;
96*993b0882SAndroid Build Coastguard Worker   }
97*993b0882SAndroid Build Coastguard Worker }
98*993b0882SAndroid Build Coastguard Worker 
Tokenize(absl::string_view text) const99*993b0882SAndroid Build Coastguard Worker std::vector<Token> Tokenizer::Tokenize(absl::string_view text) const {
100*993b0882SAndroid Build Coastguard Worker   UnicodeText text_unicode = UTF8ToUnicodeText(text, /*do_copy=*/false);
101*993b0882SAndroid Build Coastguard Worker   return Tokenize(text_unicode);
102*993b0882SAndroid Build Coastguard Worker }
103*993b0882SAndroid Build Coastguard Worker 
Tokenize(const UnicodeText & text_unicode) const104*993b0882SAndroid Build Coastguard Worker std::vector<Token> Tokenizer::Tokenize(const UnicodeText& text_unicode) const {
105*993b0882SAndroid Build Coastguard Worker   switch (type_) {
106*993b0882SAndroid Build Coastguard Worker     case TokenizationType_INTERNAL_TOKENIZER:
107*993b0882SAndroid Build Coastguard Worker       return InternalTokenize(text_unicode);
108*993b0882SAndroid Build Coastguard Worker     case TokenizationType_ICU:
109*993b0882SAndroid Build Coastguard Worker       TC3_FALLTHROUGH_INTENDED;
110*993b0882SAndroid Build Coastguard Worker     case TokenizationType_MIXED: {
111*993b0882SAndroid Build Coastguard Worker       std::vector<Token> result;
112*993b0882SAndroid Build Coastguard Worker       if (!ICUTokenize(text_unicode, &result)) {
113*993b0882SAndroid Build Coastguard Worker         return {};
114*993b0882SAndroid Build Coastguard Worker       }
115*993b0882SAndroid Build Coastguard Worker       if (type_ == TokenizationType_MIXED) {
116*993b0882SAndroid Build Coastguard Worker         InternalRetokenize(text_unicode, &result);
117*993b0882SAndroid Build Coastguard Worker       }
118*993b0882SAndroid Build Coastguard Worker       return result;
119*993b0882SAndroid Build Coastguard Worker     }
120*993b0882SAndroid Build Coastguard Worker     case TokenizationType_LETTER_DIGIT: {
121*993b0882SAndroid Build Coastguard Worker       std::vector<Token> result;
122*993b0882SAndroid Build Coastguard Worker       if (!NumberTokenize(text_unicode, &result)) {
123*993b0882SAndroid Build Coastguard Worker         return {};
124*993b0882SAndroid Build Coastguard Worker       }
125*993b0882SAndroid Build Coastguard Worker       return result;
126*993b0882SAndroid Build Coastguard Worker     }
127*993b0882SAndroid Build Coastguard Worker     default:
128*993b0882SAndroid Build Coastguard Worker       TC3_LOG(ERROR) << "Unknown tokenization type specified. Using internal.";
129*993b0882SAndroid Build Coastguard Worker       return InternalTokenize(text_unicode);
130*993b0882SAndroid Build Coastguard Worker   }
131*993b0882SAndroid Build Coastguard Worker }
132*993b0882SAndroid Build Coastguard Worker 
AppendCodepointToToken(UnicodeText::const_iterator it,Token * token)133*993b0882SAndroid Build Coastguard Worker void AppendCodepointToToken(UnicodeText::const_iterator it, Token* token) {
134*993b0882SAndroid Build Coastguard Worker   token->value += std::string(
135*993b0882SAndroid Build Coastguard Worker       it.utf8_data(), it.utf8_data() + GetNumBytesForUTF8Char(it.utf8_data()));
136*993b0882SAndroid Build Coastguard Worker }
137*993b0882SAndroid Build Coastguard Worker 
InternalTokenize(const UnicodeText & text_unicode) const138*993b0882SAndroid Build Coastguard Worker std::vector<Token> Tokenizer::InternalTokenize(
139*993b0882SAndroid Build Coastguard Worker     const UnicodeText& text_unicode) const {
140*993b0882SAndroid Build Coastguard Worker   std::vector<Token> result;
141*993b0882SAndroid Build Coastguard Worker   Token new_token("", 0, 0);
142*993b0882SAndroid Build Coastguard Worker   int codepoint_index = 0;
143*993b0882SAndroid Build Coastguard Worker 
144*993b0882SAndroid Build Coastguard Worker   int last_script = kInvalidScript;
145*993b0882SAndroid Build Coastguard Worker   for (auto it = text_unicode.begin(); it != text_unicode.end();
146*993b0882SAndroid Build Coastguard Worker        ++it, ++codepoint_index) {
147*993b0882SAndroid Build Coastguard Worker     TokenizationCodepointRange_::Role role;
148*993b0882SAndroid Build Coastguard Worker     int script;
149*993b0882SAndroid Build Coastguard Worker     GetScriptAndRole(*it, &role, &script);
150*993b0882SAndroid Build Coastguard Worker 
151*993b0882SAndroid Build Coastguard Worker     if (role & TokenizationCodepointRange_::Role_SPLIT_BEFORE ||
152*993b0882SAndroid Build Coastguard Worker         (split_on_script_change_ && last_script != kInvalidScript &&
153*993b0882SAndroid Build Coastguard Worker          last_script != script)) {
154*993b0882SAndroid Build Coastguard Worker       if (!new_token.value.empty()) {
155*993b0882SAndroid Build Coastguard Worker         result.push_back(new_token);
156*993b0882SAndroid Build Coastguard Worker       }
157*993b0882SAndroid Build Coastguard Worker       new_token = Token("", codepoint_index, codepoint_index);
158*993b0882SAndroid Build Coastguard Worker     }
159*993b0882SAndroid Build Coastguard Worker     if (!(role & TokenizationCodepointRange_::Role_DISCARD_CODEPOINT)) {
160*993b0882SAndroid Build Coastguard Worker       new_token.end += 1;
161*993b0882SAndroid Build Coastguard Worker       AppendCodepointToToken(it, &new_token);
162*993b0882SAndroid Build Coastguard Worker     }
163*993b0882SAndroid Build Coastguard Worker     if (role & TokenizationCodepointRange_::Role_SPLIT_AFTER) {
164*993b0882SAndroid Build Coastguard Worker       if (!new_token.value.empty()) {
165*993b0882SAndroid Build Coastguard Worker         result.push_back(new_token);
166*993b0882SAndroid Build Coastguard Worker       }
167*993b0882SAndroid Build Coastguard Worker       new_token = Token("", codepoint_index + 1, codepoint_index + 1);
168*993b0882SAndroid Build Coastguard Worker     }
169*993b0882SAndroid Build Coastguard Worker 
170*993b0882SAndroid Build Coastguard Worker     last_script = script;
171*993b0882SAndroid Build Coastguard Worker   }
172*993b0882SAndroid Build Coastguard Worker   if (!new_token.value.empty()) {
173*993b0882SAndroid Build Coastguard Worker     result.push_back(new_token);
174*993b0882SAndroid Build Coastguard Worker   }
175*993b0882SAndroid Build Coastguard Worker 
176*993b0882SAndroid Build Coastguard Worker   return result;
177*993b0882SAndroid Build Coastguard Worker }
178*993b0882SAndroid Build Coastguard Worker 
TokenizeSubstring(const UnicodeText & unicode_text,CodepointSpan span,std::vector<Token> * result) const179*993b0882SAndroid Build Coastguard Worker void Tokenizer::TokenizeSubstring(const UnicodeText& unicode_text,
180*993b0882SAndroid Build Coastguard Worker                                   CodepointSpan span,
181*993b0882SAndroid Build Coastguard Worker                                   std::vector<Token>* result) const {
182*993b0882SAndroid Build Coastguard Worker   if (span.first < 0) {
183*993b0882SAndroid Build Coastguard Worker     // There is no span to tokenize.
184*993b0882SAndroid Build Coastguard Worker     return;
185*993b0882SAndroid Build Coastguard Worker   }
186*993b0882SAndroid Build Coastguard Worker 
187*993b0882SAndroid Build Coastguard Worker   // Extract the substring.
188*993b0882SAndroid Build Coastguard Worker   UnicodeText text = UnicodeText::Substring(unicode_text, span.first,
189*993b0882SAndroid Build Coastguard Worker                                             span.second, /*do_copy=*/false);
190*993b0882SAndroid Build Coastguard Worker 
191*993b0882SAndroid Build Coastguard Worker   // Run the tokenizer and update the token bounds to reflect the offset of the
192*993b0882SAndroid Build Coastguard Worker   // substring.
193*993b0882SAndroid Build Coastguard Worker   std::vector<Token> tokens = InternalTokenize(text);
194*993b0882SAndroid Build Coastguard Worker 
195*993b0882SAndroid Build Coastguard Worker   // Avoids progressive capacity increases in the for loop.
196*993b0882SAndroid Build Coastguard Worker   result->reserve(result->size() + tokens.size());
197*993b0882SAndroid Build Coastguard Worker   for (Token& token : tokens) {
198*993b0882SAndroid Build Coastguard Worker     token.start += span.first;
199*993b0882SAndroid Build Coastguard Worker     token.end += span.first;
200*993b0882SAndroid Build Coastguard Worker     result->emplace_back(std::move(token));
201*993b0882SAndroid Build Coastguard Worker   }
202*993b0882SAndroid Build Coastguard Worker }
203*993b0882SAndroid Build Coastguard Worker 
InternalRetokenize(const UnicodeText & unicode_text,std::vector<Token> * tokens) const204*993b0882SAndroid Build Coastguard Worker void Tokenizer::InternalRetokenize(const UnicodeText& unicode_text,
205*993b0882SAndroid Build Coastguard Worker                                    std::vector<Token>* tokens) const {
206*993b0882SAndroid Build Coastguard Worker   std::vector<Token> result;
207*993b0882SAndroid Build Coastguard Worker   CodepointSpan span(-1, -1);
208*993b0882SAndroid Build Coastguard Worker   for (Token& token : *tokens) {
209*993b0882SAndroid Build Coastguard Worker     const UnicodeText unicode_token_value =
210*993b0882SAndroid Build Coastguard Worker         UTF8ToUnicodeText(token.value, /*do_copy=*/false);
211*993b0882SAndroid Build Coastguard Worker     bool should_retokenize = true;
212*993b0882SAndroid Build Coastguard Worker     for (const int codepoint : unicode_token_value) {
213*993b0882SAndroid Build Coastguard Worker       if (!IsCodepointInRanges(codepoint,
214*993b0882SAndroid Build Coastguard Worker                                internal_tokenizer_codepoint_ranges_)) {
215*993b0882SAndroid Build Coastguard Worker         should_retokenize = false;
216*993b0882SAndroid Build Coastguard Worker         break;
217*993b0882SAndroid Build Coastguard Worker       }
218*993b0882SAndroid Build Coastguard Worker     }
219*993b0882SAndroid Build Coastguard Worker 
220*993b0882SAndroid Build Coastguard Worker     if (should_retokenize) {
221*993b0882SAndroid Build Coastguard Worker       if (span.first < 0) {
222*993b0882SAndroid Build Coastguard Worker         span.first = token.start;
223*993b0882SAndroid Build Coastguard Worker       }
224*993b0882SAndroid Build Coastguard Worker       span.second = token.end;
225*993b0882SAndroid Build Coastguard Worker     } else {
226*993b0882SAndroid Build Coastguard Worker       TokenizeSubstring(unicode_text, span, &result);
227*993b0882SAndroid Build Coastguard Worker       span.first = -1;
228*993b0882SAndroid Build Coastguard Worker       result.emplace_back(std::move(token));
229*993b0882SAndroid Build Coastguard Worker     }
230*993b0882SAndroid Build Coastguard Worker   }
231*993b0882SAndroid Build Coastguard Worker   TokenizeSubstring(unicode_text, span, &result);
232*993b0882SAndroid Build Coastguard Worker 
233*993b0882SAndroid Build Coastguard Worker   *tokens = std::move(result);
234*993b0882SAndroid Build Coastguard Worker }
235*993b0882SAndroid Build Coastguard Worker 
ICUTokenize(const UnicodeText & context_unicode,std::vector<Token> * result) const236*993b0882SAndroid Build Coastguard Worker bool Tokenizer::ICUTokenize(const UnicodeText& context_unicode,
237*993b0882SAndroid Build Coastguard Worker                             std::vector<Token>* result) const {
238*993b0882SAndroid Build Coastguard Worker   std::unique_ptr<UniLib::BreakIterator> break_iterator =
239*993b0882SAndroid Build Coastguard Worker       unilib_->CreateBreakIterator(context_unicode);
240*993b0882SAndroid Build Coastguard Worker   if (!break_iterator) {
241*993b0882SAndroid Build Coastguard Worker     return false;
242*993b0882SAndroid Build Coastguard Worker   }
243*993b0882SAndroid Build Coastguard Worker   const int context_unicode_size = context_unicode.size_codepoints();
244*993b0882SAndroid Build Coastguard Worker   int last_unicode_index = 0;
245*993b0882SAndroid Build Coastguard Worker   int unicode_index = 0;
246*993b0882SAndroid Build Coastguard Worker   auto token_begin_it = context_unicode.begin();
247*993b0882SAndroid Build Coastguard Worker   while ((unicode_index = break_iterator->Next()) !=
248*993b0882SAndroid Build Coastguard Worker          UniLib::BreakIterator::kDone) {
249*993b0882SAndroid Build Coastguard Worker     const int token_length = unicode_index - last_unicode_index;
250*993b0882SAndroid Build Coastguard Worker     if (token_length + last_unicode_index > context_unicode_size) {
251*993b0882SAndroid Build Coastguard Worker       return false;
252*993b0882SAndroid Build Coastguard Worker     }
253*993b0882SAndroid Build Coastguard Worker 
254*993b0882SAndroid Build Coastguard Worker     auto token_end_it = token_begin_it;
255*993b0882SAndroid Build Coastguard Worker     std::advance(token_end_it, token_length);
256*993b0882SAndroid Build Coastguard Worker     TC3_CHECK(token_end_it <= context_unicode.end());
257*993b0882SAndroid Build Coastguard Worker 
258*993b0882SAndroid Build Coastguard Worker     // Determine if the whole token is whitespace.
259*993b0882SAndroid Build Coastguard Worker     bool is_whitespace = true;
260*993b0882SAndroid Build Coastguard Worker     for (auto char_it = token_begin_it; char_it < token_end_it; ++char_it) {
261*993b0882SAndroid Build Coastguard Worker       if (!unilib_->IsWhitespace(*char_it)) {
262*993b0882SAndroid Build Coastguard Worker         is_whitespace = false;
263*993b0882SAndroid Build Coastguard Worker         break;
264*993b0882SAndroid Build Coastguard Worker       }
265*993b0882SAndroid Build Coastguard Worker     }
266*993b0882SAndroid Build Coastguard Worker 
267*993b0882SAndroid Build Coastguard Worker     const std::string token =
268*993b0882SAndroid Build Coastguard Worker         context_unicode.UTF8Substring(token_begin_it, token_end_it);
269*993b0882SAndroid Build Coastguard Worker 
270*993b0882SAndroid Build Coastguard Worker     if (!is_whitespace || icu_preserve_whitespace_tokens_) {
271*993b0882SAndroid Build Coastguard Worker       result->push_back(Token(token, last_unicode_index, unicode_index,
272*993b0882SAndroid Build Coastguard Worker                               /*is_padding=*/false, is_whitespace));
273*993b0882SAndroid Build Coastguard Worker     }
274*993b0882SAndroid Build Coastguard Worker 
275*993b0882SAndroid Build Coastguard Worker     last_unicode_index = unicode_index;
276*993b0882SAndroid Build Coastguard Worker     token_begin_it = token_end_it;
277*993b0882SAndroid Build Coastguard Worker   }
278*993b0882SAndroid Build Coastguard Worker 
279*993b0882SAndroid Build Coastguard Worker   return true;
280*993b0882SAndroid Build Coastguard Worker }
281*993b0882SAndroid Build Coastguard Worker 
NumberTokenize(const UnicodeText & text_unicode,std::vector<Token> * result) const282*993b0882SAndroid Build Coastguard Worker bool Tokenizer::NumberTokenize(const UnicodeText& text_unicode,
283*993b0882SAndroid Build Coastguard Worker                                std::vector<Token>* result) const {
284*993b0882SAndroid Build Coastguard Worker   Token new_token("", 0, 0);
285*993b0882SAndroid Build Coastguard Worker   NumberTokenType current_token_type = NOT_SET;
286*993b0882SAndroid Build Coastguard Worker   int codepoint_index = 0;
287*993b0882SAndroid Build Coastguard Worker 
288*993b0882SAndroid Build Coastguard Worker   auto PushToken = [&new_token, result]() {
289*993b0882SAndroid Build Coastguard Worker     if (!new_token.value.empty()) {
290*993b0882SAndroid Build Coastguard Worker       result->push_back(new_token);
291*993b0882SAndroid Build Coastguard Worker     }
292*993b0882SAndroid Build Coastguard Worker   };
293*993b0882SAndroid Build Coastguard Worker 
294*993b0882SAndroid Build Coastguard Worker   auto MaybeResetTokenAndAddChar =
295*993b0882SAndroid Build Coastguard Worker       [&new_token, PushToken, &current_token_type](
296*993b0882SAndroid Build Coastguard Worker           int codepoint_index, NumberTokenType token_type,
297*993b0882SAndroid Build Coastguard Worker           UnicodeText::const_iterator it, bool is_whitespace = false) {
298*993b0882SAndroid Build Coastguard Worker         if (current_token_type != token_type) {
299*993b0882SAndroid Build Coastguard Worker           PushToken();
300*993b0882SAndroid Build Coastguard Worker           new_token = Token("", codepoint_index, codepoint_index,
301*993b0882SAndroid Build Coastguard Worker                             /*is_padding=*/false, is_whitespace);
302*993b0882SAndroid Build Coastguard Worker         }
303*993b0882SAndroid Build Coastguard Worker         new_token.end += 1;
304*993b0882SAndroid Build Coastguard Worker         AppendCodepointToToken(it, &new_token);
305*993b0882SAndroid Build Coastguard Worker         current_token_type = token_type;
306*993b0882SAndroid Build Coastguard Worker       };
307*993b0882SAndroid Build Coastguard Worker 
308*993b0882SAndroid Build Coastguard Worker   auto FinishTokenAndAddSeparator =
309*993b0882SAndroid Build Coastguard Worker       [&new_token, result, &current_token_type, PushToken](
310*993b0882SAndroid Build Coastguard Worker           int codepoint_index, UnicodeText::const_iterator it) {
311*993b0882SAndroid Build Coastguard Worker         PushToken();
312*993b0882SAndroid Build Coastguard Worker 
313*993b0882SAndroid Build Coastguard Worker         result->emplace_back("", codepoint_index, codepoint_index + 1);
314*993b0882SAndroid Build Coastguard Worker         AppendCodepointToToken(it, &result->back());
315*993b0882SAndroid Build Coastguard Worker 
316*993b0882SAndroid Build Coastguard Worker         new_token = Token("", codepoint_index + 1, codepoint_index + 1);
317*993b0882SAndroid Build Coastguard Worker         current_token_type = NOT_SET;
318*993b0882SAndroid Build Coastguard Worker       };
319*993b0882SAndroid Build Coastguard Worker 
320*993b0882SAndroid Build Coastguard Worker   for (auto it = text_unicode.begin(); it != text_unicode.end();
321*993b0882SAndroid Build Coastguard Worker        ++it, ++codepoint_index) {
322*993b0882SAndroid Build Coastguard Worker     if (unilib_->IsDigit(*it)) {
323*993b0882SAndroid Build Coastguard Worker       MaybeResetTokenAndAddChar(codepoint_index, NUMERICAL, it);
324*993b0882SAndroid Build Coastguard Worker     } else if (unilib_->IsLetter(*it)) {
325*993b0882SAndroid Build Coastguard Worker       MaybeResetTokenAndAddChar(codepoint_index, TERM, it);
326*993b0882SAndroid Build Coastguard Worker     } else if (unilib_->IsWhitespace(*it)) {
327*993b0882SAndroid Build Coastguard Worker       MaybeResetTokenAndAddChar(codepoint_index, WHITESPACE, it,
328*993b0882SAndroid Build Coastguard Worker                                 /*is_whitespace=*/true);
329*993b0882SAndroid Build Coastguard Worker     } else if (unilib_->IsDot(*it) && preserve_floating_numbers_) {
330*993b0882SAndroid Build Coastguard Worker       auto it_next = std::next(it);
331*993b0882SAndroid Build Coastguard Worker       if (current_token_type == NUMERICAL && it_next != text_unicode.end() &&
332*993b0882SAndroid Build Coastguard Worker           unilib_->IsDigit(*it_next)) {
333*993b0882SAndroid Build Coastguard Worker         new_token.end += 1;
334*993b0882SAndroid Build Coastguard Worker         AppendCodepointToToken(it, &new_token);
335*993b0882SAndroid Build Coastguard Worker       } else {
336*993b0882SAndroid Build Coastguard Worker         // If the current token is not a number or dot at the end or followed
337*993b0882SAndroid Build Coastguard Worker         // by a non digit => separate token
338*993b0882SAndroid Build Coastguard Worker         FinishTokenAndAddSeparator(codepoint_index, it);
339*993b0882SAndroid Build Coastguard Worker       }
340*993b0882SAndroid Build Coastguard Worker     } else {
341*993b0882SAndroid Build Coastguard Worker       FinishTokenAndAddSeparator(codepoint_index, it);
342*993b0882SAndroid Build Coastguard Worker     }
343*993b0882SAndroid Build Coastguard Worker   }
344*993b0882SAndroid Build Coastguard Worker   PushToken();
345*993b0882SAndroid Build Coastguard Worker 
346*993b0882SAndroid Build Coastguard Worker   return true;
347*993b0882SAndroid Build Coastguard Worker }
348*993b0882SAndroid Build Coastguard Worker 
349*993b0882SAndroid Build Coastguard Worker }  // namespace libtextclassifier3
350