xref: /aosp_15_r20/external/libtextclassifier/native/utils/token-feature-extractor.cc (revision 993b0882672172b81d12fad7a7ac0c3e5c824a12)
/*
 * Copyright (C) 2018 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
16*993b0882SAndroid Build Coastguard Worker 
17*993b0882SAndroid Build Coastguard Worker #include "utils/token-feature-extractor.h"
18*993b0882SAndroid Build Coastguard Worker 
19*993b0882SAndroid Build Coastguard Worker #include <cctype>
20*993b0882SAndroid Build Coastguard Worker #include <string>
21*993b0882SAndroid Build Coastguard Worker 
22*993b0882SAndroid Build Coastguard Worker #include "utils/base/logging.h"
23*993b0882SAndroid Build Coastguard Worker #include "utils/hash/farmhash.h"
24*993b0882SAndroid Build Coastguard Worker #include "utils/strings/stringpiece.h"
25*993b0882SAndroid Build Coastguard Worker #include "utils/utf8/unicodetext.h"
26*993b0882SAndroid Build Coastguard Worker 
27*993b0882SAndroid Build Coastguard Worker namespace libtextclassifier3 {
28*993b0882SAndroid Build Coastguard Worker 
29*993b0882SAndroid Build Coastguard Worker namespace {
30*993b0882SAndroid Build Coastguard Worker 
RemapTokenAscii(const std::string & token,const TokenFeatureExtractorOptions & options)31*993b0882SAndroid Build Coastguard Worker std::string RemapTokenAscii(const std::string& token,
32*993b0882SAndroid Build Coastguard Worker                             const TokenFeatureExtractorOptions& options) {
33*993b0882SAndroid Build Coastguard Worker   if (!options.remap_digits && !options.lowercase_tokens) {
34*993b0882SAndroid Build Coastguard Worker     return token;
35*993b0882SAndroid Build Coastguard Worker   }
36*993b0882SAndroid Build Coastguard Worker 
37*993b0882SAndroid Build Coastguard Worker   std::string copy = token;
38*993b0882SAndroid Build Coastguard Worker   for (int i = 0; i < token.size(); ++i) {
39*993b0882SAndroid Build Coastguard Worker     if (options.remap_digits && isdigit(copy[i])) {
40*993b0882SAndroid Build Coastguard Worker       copy[i] = '0';
41*993b0882SAndroid Build Coastguard Worker     }
42*993b0882SAndroid Build Coastguard Worker     if (options.lowercase_tokens) {
43*993b0882SAndroid Build Coastguard Worker       copy[i] = tolower(copy[i]);
44*993b0882SAndroid Build Coastguard Worker     }
45*993b0882SAndroid Build Coastguard Worker   }
46*993b0882SAndroid Build Coastguard Worker   return copy;
47*993b0882SAndroid Build Coastguard Worker }
48*993b0882SAndroid Build Coastguard Worker 
RemapTokenUnicode(const std::string & token,const TokenFeatureExtractorOptions & options,const UniLib & unilib,UnicodeText * remapped)49*993b0882SAndroid Build Coastguard Worker void RemapTokenUnicode(const std::string& token,
50*993b0882SAndroid Build Coastguard Worker                        const TokenFeatureExtractorOptions& options,
51*993b0882SAndroid Build Coastguard Worker                        const UniLib& unilib, UnicodeText* remapped) {
52*993b0882SAndroid Build Coastguard Worker   if (!options.remap_digits && !options.lowercase_tokens) {
53*993b0882SAndroid Build Coastguard Worker     // Leave remapped untouched.
54*993b0882SAndroid Build Coastguard Worker     return;
55*993b0882SAndroid Build Coastguard Worker   }
56*993b0882SAndroid Build Coastguard Worker 
57*993b0882SAndroid Build Coastguard Worker   UnicodeText word = UTF8ToUnicodeText(token, /*do_copy=*/false);
58*993b0882SAndroid Build Coastguard Worker   remapped->clear();
59*993b0882SAndroid Build Coastguard Worker   for (auto it = word.begin(); it != word.end(); ++it) {
60*993b0882SAndroid Build Coastguard Worker     if (options.remap_digits && unilib.IsDigit(*it)) {
61*993b0882SAndroid Build Coastguard Worker       remapped->push_back('0');
62*993b0882SAndroid Build Coastguard Worker     } else if (options.lowercase_tokens) {
63*993b0882SAndroid Build Coastguard Worker       remapped->push_back(unilib.ToLower(*it));
64*993b0882SAndroid Build Coastguard Worker     } else {
65*993b0882SAndroid Build Coastguard Worker       remapped->push_back(*it);
66*993b0882SAndroid Build Coastguard Worker     }
67*993b0882SAndroid Build Coastguard Worker   }
68*993b0882SAndroid Build Coastguard Worker }
69*993b0882SAndroid Build Coastguard Worker 
70*993b0882SAndroid Build Coastguard Worker }  // namespace
71*993b0882SAndroid Build Coastguard Worker 
TokenFeatureExtractor(const TokenFeatureExtractorOptions & options,const UniLib * unilib)72*993b0882SAndroid Build Coastguard Worker TokenFeatureExtractor::TokenFeatureExtractor(
73*993b0882SAndroid Build Coastguard Worker     const TokenFeatureExtractorOptions& options, const UniLib* unilib)
74*993b0882SAndroid Build Coastguard Worker     : options_(options), unilib_(*unilib) {
75*993b0882SAndroid Build Coastguard Worker   for (const std::string& pattern : options.regexp_features) {
76*993b0882SAndroid Build Coastguard Worker     regex_patterns_.push_back(std::unique_ptr<UniLib::RegexPattern>(
77*993b0882SAndroid Build Coastguard Worker         unilib_.CreateRegexPattern(UTF8ToUnicodeText(
78*993b0882SAndroid Build Coastguard Worker             pattern.c_str(), pattern.size(), /*do_copy=*/false))));
79*993b0882SAndroid Build Coastguard Worker   }
80*993b0882SAndroid Build Coastguard Worker }
81*993b0882SAndroid Build Coastguard Worker 
Extract(const Token & token,bool is_in_span,std::vector<int> * sparse_features,std::vector<float> * dense_features) const82*993b0882SAndroid Build Coastguard Worker bool TokenFeatureExtractor::Extract(const Token& token, bool is_in_span,
83*993b0882SAndroid Build Coastguard Worker                                     std::vector<int>* sparse_features,
84*993b0882SAndroid Build Coastguard Worker                                     std::vector<float>* dense_features) const {
85*993b0882SAndroid Build Coastguard Worker   if (!dense_features) {
86*993b0882SAndroid Build Coastguard Worker     return false;
87*993b0882SAndroid Build Coastguard Worker   }
88*993b0882SAndroid Build Coastguard Worker   if (sparse_features) {
89*993b0882SAndroid Build Coastguard Worker     *sparse_features = ExtractCharactergramFeatures(token);
90*993b0882SAndroid Build Coastguard Worker   }
91*993b0882SAndroid Build Coastguard Worker   *dense_features = ExtractDenseFeatures(token, is_in_span);
92*993b0882SAndroid Build Coastguard Worker   return true;
93*993b0882SAndroid Build Coastguard Worker }
94*993b0882SAndroid Build Coastguard Worker 
ExtractCharactergramFeatures(const Token & token) const95*993b0882SAndroid Build Coastguard Worker std::vector<int> TokenFeatureExtractor::ExtractCharactergramFeatures(
96*993b0882SAndroid Build Coastguard Worker     const Token& token) const {
97*993b0882SAndroid Build Coastguard Worker   if (options_.unicode_aware_features) {
98*993b0882SAndroid Build Coastguard Worker     return ExtractCharactergramFeaturesUnicode(token);
99*993b0882SAndroid Build Coastguard Worker   } else {
100*993b0882SAndroid Build Coastguard Worker     return ExtractCharactergramFeaturesAscii(token);
101*993b0882SAndroid Build Coastguard Worker   }
102*993b0882SAndroid Build Coastguard Worker }
103*993b0882SAndroid Build Coastguard Worker 
ExtractDenseFeatures(const Token & token,bool is_in_span) const104*993b0882SAndroid Build Coastguard Worker std::vector<float> TokenFeatureExtractor::ExtractDenseFeatures(
105*993b0882SAndroid Build Coastguard Worker     const Token& token, bool is_in_span) const {
106*993b0882SAndroid Build Coastguard Worker   std::vector<float> dense_features;
107*993b0882SAndroid Build Coastguard Worker 
108*993b0882SAndroid Build Coastguard Worker   if (options_.extract_case_feature) {
109*993b0882SAndroid Build Coastguard Worker     if (options_.unicode_aware_features) {
110*993b0882SAndroid Build Coastguard Worker       UnicodeText token_unicode =
111*993b0882SAndroid Build Coastguard Worker           UTF8ToUnicodeText(token.value, /*do_copy=*/false);
112*993b0882SAndroid Build Coastguard Worker       if (!token.value.empty() && unilib_.IsUpper(*token_unicode.begin())) {
113*993b0882SAndroid Build Coastguard Worker         dense_features.push_back(1.0);
114*993b0882SAndroid Build Coastguard Worker       } else {
115*993b0882SAndroid Build Coastguard Worker         dense_features.push_back(-1.0);
116*993b0882SAndroid Build Coastguard Worker       }
117*993b0882SAndroid Build Coastguard Worker     } else {
118*993b0882SAndroid Build Coastguard Worker       if (!token.value.empty() && isupper(*token.value.begin())) {
119*993b0882SAndroid Build Coastguard Worker         dense_features.push_back(1.0);
120*993b0882SAndroid Build Coastguard Worker       } else {
121*993b0882SAndroid Build Coastguard Worker         dense_features.push_back(-1.0);
122*993b0882SAndroid Build Coastguard Worker       }
123*993b0882SAndroid Build Coastguard Worker     }
124*993b0882SAndroid Build Coastguard Worker   }
125*993b0882SAndroid Build Coastguard Worker 
126*993b0882SAndroid Build Coastguard Worker   if (options_.extract_selection_mask_feature) {
127*993b0882SAndroid Build Coastguard Worker     if (is_in_span) {
128*993b0882SAndroid Build Coastguard Worker       dense_features.push_back(1.0);
129*993b0882SAndroid Build Coastguard Worker     } else {
130*993b0882SAndroid Build Coastguard Worker       if (options_.unicode_aware_features) {
131*993b0882SAndroid Build Coastguard Worker         dense_features.push_back(-1.0);
132*993b0882SAndroid Build Coastguard Worker       } else {
133*993b0882SAndroid Build Coastguard Worker         dense_features.push_back(0.0);
134*993b0882SAndroid Build Coastguard Worker       }
135*993b0882SAndroid Build Coastguard Worker     }
136*993b0882SAndroid Build Coastguard Worker   }
137*993b0882SAndroid Build Coastguard Worker 
138*993b0882SAndroid Build Coastguard Worker   // Add regexp features.
139*993b0882SAndroid Build Coastguard Worker   if (!regex_patterns_.empty()) {
140*993b0882SAndroid Build Coastguard Worker     UnicodeText token_unicode =
141*993b0882SAndroid Build Coastguard Worker         UTF8ToUnicodeText(token.value, /*do_copy=*/false);
142*993b0882SAndroid Build Coastguard Worker     for (int i = 0; i < regex_patterns_.size(); ++i) {
143*993b0882SAndroid Build Coastguard Worker       if (!regex_patterns_[i].get()) {
144*993b0882SAndroid Build Coastguard Worker         dense_features.push_back(-1.0);
145*993b0882SAndroid Build Coastguard Worker         continue;
146*993b0882SAndroid Build Coastguard Worker       }
147*993b0882SAndroid Build Coastguard Worker       auto matcher = regex_patterns_[i]->Matcher(token_unicode);
148*993b0882SAndroid Build Coastguard Worker       int status;
149*993b0882SAndroid Build Coastguard Worker       if (matcher->Matches(&status)) {
150*993b0882SAndroid Build Coastguard Worker         dense_features.push_back(1.0);
151*993b0882SAndroid Build Coastguard Worker       } else {
152*993b0882SAndroid Build Coastguard Worker         dense_features.push_back(-1.0);
153*993b0882SAndroid Build Coastguard Worker       }
154*993b0882SAndroid Build Coastguard Worker     }
155*993b0882SAndroid Build Coastguard Worker   }
156*993b0882SAndroid Build Coastguard Worker 
157*993b0882SAndroid Build Coastguard Worker   return dense_features;
158*993b0882SAndroid Build Coastguard Worker }
159*993b0882SAndroid Build Coastguard Worker 
HashToken(StringPiece token) const160*993b0882SAndroid Build Coastguard Worker int TokenFeatureExtractor::HashToken(StringPiece token) const {
161*993b0882SAndroid Build Coastguard Worker   if (options_.allowed_chargrams.empty()) {
162*993b0882SAndroid Build Coastguard Worker     return tc3farmhash::Fingerprint64(token) % options_.num_buckets;
163*993b0882SAndroid Build Coastguard Worker   } else {
164*993b0882SAndroid Build Coastguard Worker     // Padding and out-of-vocabulary tokens have extra buckets reserved because
165*993b0882SAndroid Build Coastguard Worker     // they are special and important tokens, and we don't want them to share
166*993b0882SAndroid Build Coastguard Worker     // embedding with other charactergrams.
167*993b0882SAndroid Build Coastguard Worker     // TODO(zilka): Experimentally verify.
168*993b0882SAndroid Build Coastguard Worker     const int kNumExtraBuckets = 2;
169*993b0882SAndroid Build Coastguard Worker     const std::string token_string = token.ToString();
170*993b0882SAndroid Build Coastguard Worker     if (token_string == "<PAD>") {
171*993b0882SAndroid Build Coastguard Worker       return 1;
172*993b0882SAndroid Build Coastguard Worker     } else if (options_.allowed_chargrams.find(token_string) ==
173*993b0882SAndroid Build Coastguard Worker                options_.allowed_chargrams.end()) {
174*993b0882SAndroid Build Coastguard Worker       return 0;  // Out-of-vocabulary.
175*993b0882SAndroid Build Coastguard Worker     } else {
176*993b0882SAndroid Build Coastguard Worker       return (tc3farmhash::Fingerprint64(token) %
177*993b0882SAndroid Build Coastguard Worker               (options_.num_buckets - kNumExtraBuckets)) +
178*993b0882SAndroid Build Coastguard Worker              kNumExtraBuckets;
179*993b0882SAndroid Build Coastguard Worker     }
180*993b0882SAndroid Build Coastguard Worker   }
181*993b0882SAndroid Build Coastguard Worker }
182*993b0882SAndroid Build Coastguard Worker 
ExtractCharactergramFeaturesAscii(const Token & token) const183*993b0882SAndroid Build Coastguard Worker std::vector<int> TokenFeatureExtractor::ExtractCharactergramFeaturesAscii(
184*993b0882SAndroid Build Coastguard Worker     const Token& token) const {
185*993b0882SAndroid Build Coastguard Worker   std::vector<int> result;
186*993b0882SAndroid Build Coastguard Worker   if (token.is_padding || token.value.empty()) {
187*993b0882SAndroid Build Coastguard Worker     result.push_back(HashToken("<PAD>"));
188*993b0882SAndroid Build Coastguard Worker   } else {
189*993b0882SAndroid Build Coastguard Worker     const std::string word = RemapTokenAscii(token.value, options_);
190*993b0882SAndroid Build Coastguard Worker 
191*993b0882SAndroid Build Coastguard Worker     // Trim words that are over max_word_length characters.
192*993b0882SAndroid Build Coastguard Worker     const int max_word_length = options_.max_word_length;
193*993b0882SAndroid Build Coastguard Worker     std::string feature_word;
194*993b0882SAndroid Build Coastguard Worker     if (word.size() > max_word_length) {
195*993b0882SAndroid Build Coastguard Worker       feature_word =
196*993b0882SAndroid Build Coastguard Worker           "^" + word.substr(0, max_word_length / 2) + "\1" +
197*993b0882SAndroid Build Coastguard Worker           word.substr(word.size() - max_word_length / 2, max_word_length / 2) +
198*993b0882SAndroid Build Coastguard Worker           "$";
199*993b0882SAndroid Build Coastguard Worker     } else {
200*993b0882SAndroid Build Coastguard Worker       // Add a prefix and suffix to the word.
201*993b0882SAndroid Build Coastguard Worker       feature_word = "^" + word + "$";
202*993b0882SAndroid Build Coastguard Worker     }
203*993b0882SAndroid Build Coastguard Worker 
204*993b0882SAndroid Build Coastguard Worker     // Upper-bound the number of charactergram extracted to avoid resizing.
205*993b0882SAndroid Build Coastguard Worker     result.reserve(options_.chargram_orders.size() * feature_word.size());
206*993b0882SAndroid Build Coastguard Worker 
207*993b0882SAndroid Build Coastguard Worker     if (options_.chargram_orders.empty()) {
208*993b0882SAndroid Build Coastguard Worker       result.push_back(HashToken(feature_word));
209*993b0882SAndroid Build Coastguard Worker     } else {
210*993b0882SAndroid Build Coastguard Worker       // Generate the character-grams.
211*993b0882SAndroid Build Coastguard Worker       for (int chargram_order : options_.chargram_orders) {
212*993b0882SAndroid Build Coastguard Worker         if (chargram_order == 1) {
213*993b0882SAndroid Build Coastguard Worker           for (int i = 1; i < feature_word.size() - 1; ++i) {
214*993b0882SAndroid Build Coastguard Worker             result.push_back(
215*993b0882SAndroid Build Coastguard Worker                 HashToken(StringPiece(feature_word, /*offset=*/i, /*len=*/1)));
216*993b0882SAndroid Build Coastguard Worker           }
217*993b0882SAndroid Build Coastguard Worker         } else {
218*993b0882SAndroid Build Coastguard Worker           for (int i = 0;
219*993b0882SAndroid Build Coastguard Worker                i < static_cast<int>(feature_word.size()) - chargram_order + 1;
220*993b0882SAndroid Build Coastguard Worker                ++i) {
221*993b0882SAndroid Build Coastguard Worker             result.push_back(HashToken(StringPiece(feature_word, /*offset=*/i,
222*993b0882SAndroid Build Coastguard Worker                                                    /*len=*/chargram_order)));
223*993b0882SAndroid Build Coastguard Worker           }
224*993b0882SAndroid Build Coastguard Worker         }
225*993b0882SAndroid Build Coastguard Worker       }
226*993b0882SAndroid Build Coastguard Worker     }
227*993b0882SAndroid Build Coastguard Worker   }
228*993b0882SAndroid Build Coastguard Worker   return result;
229*993b0882SAndroid Build Coastguard Worker }
230*993b0882SAndroid Build Coastguard Worker 
ExtractCharactergramFeaturesUnicode(const Token & token) const231*993b0882SAndroid Build Coastguard Worker std::vector<int> TokenFeatureExtractor::ExtractCharactergramFeaturesUnicode(
232*993b0882SAndroid Build Coastguard Worker     const Token& token) const {
233*993b0882SAndroid Build Coastguard Worker   std::vector<int> result;
234*993b0882SAndroid Build Coastguard Worker   if (token.is_padding || token.value.empty()) {
235*993b0882SAndroid Build Coastguard Worker     result.push_back(HashToken("<PAD>"));
236*993b0882SAndroid Build Coastguard Worker   } else {
237*993b0882SAndroid Build Coastguard Worker     UnicodeText word = UTF8ToUnicodeText(token.value, /*do_copy=*/false);
238*993b0882SAndroid Build Coastguard Worker     RemapTokenUnicode(token.value, options_, unilib_, &word);
239*993b0882SAndroid Build Coastguard Worker 
240*993b0882SAndroid Build Coastguard Worker     // Trim the word if needed by finding a left-cut point and right-cut point.
241*993b0882SAndroid Build Coastguard Worker     auto left_cut = word.begin();
242*993b0882SAndroid Build Coastguard Worker     auto right_cut = word.end();
243*993b0882SAndroid Build Coastguard Worker     for (int i = 0; i < options_.max_word_length / 2; i++) {
244*993b0882SAndroid Build Coastguard Worker       if (left_cut < right_cut) {
245*993b0882SAndroid Build Coastguard Worker         ++left_cut;
246*993b0882SAndroid Build Coastguard Worker       }
247*993b0882SAndroid Build Coastguard Worker       if (left_cut < right_cut) {
248*993b0882SAndroid Build Coastguard Worker         --right_cut;
249*993b0882SAndroid Build Coastguard Worker       }
250*993b0882SAndroid Build Coastguard Worker     }
251*993b0882SAndroid Build Coastguard Worker 
252*993b0882SAndroid Build Coastguard Worker     std::string feature_word;
253*993b0882SAndroid Build Coastguard Worker     if (left_cut == right_cut) {
254*993b0882SAndroid Build Coastguard Worker       feature_word = "^" + word.UTF8Substring(word.begin(), word.end()) + "$";
255*993b0882SAndroid Build Coastguard Worker     } else {
256*993b0882SAndroid Build Coastguard Worker       // clang-format off
257*993b0882SAndroid Build Coastguard Worker       feature_word = "^" +
258*993b0882SAndroid Build Coastguard Worker                      word.UTF8Substring(word.begin(), left_cut) +
259*993b0882SAndroid Build Coastguard Worker                      "\1" +
260*993b0882SAndroid Build Coastguard Worker                      word.UTF8Substring(right_cut, word.end()) +
261*993b0882SAndroid Build Coastguard Worker                      "$";
262*993b0882SAndroid Build Coastguard Worker       // clang-format on
263*993b0882SAndroid Build Coastguard Worker     }
264*993b0882SAndroid Build Coastguard Worker 
265*993b0882SAndroid Build Coastguard Worker     const UnicodeText feature_word_unicode =
266*993b0882SAndroid Build Coastguard Worker         UTF8ToUnicodeText(feature_word, /*do_copy=*/false);
267*993b0882SAndroid Build Coastguard Worker 
268*993b0882SAndroid Build Coastguard Worker     // Upper-bound the number of charactergram extracted to avoid resizing.
269*993b0882SAndroid Build Coastguard Worker     result.reserve(options_.chargram_orders.size() * feature_word.size());
270*993b0882SAndroid Build Coastguard Worker 
271*993b0882SAndroid Build Coastguard Worker     if (options_.chargram_orders.empty()) {
272*993b0882SAndroid Build Coastguard Worker       result.push_back(HashToken(feature_word));
273*993b0882SAndroid Build Coastguard Worker     } else {
274*993b0882SAndroid Build Coastguard Worker       // Generate the character-grams.
275*993b0882SAndroid Build Coastguard Worker       for (int chargram_order : options_.chargram_orders) {
276*993b0882SAndroid Build Coastguard Worker         UnicodeText::const_iterator it_start = feature_word_unicode.begin();
277*993b0882SAndroid Build Coastguard Worker         UnicodeText::const_iterator it_end = feature_word_unicode.end();
278*993b0882SAndroid Build Coastguard Worker         if (chargram_order == 1) {
279*993b0882SAndroid Build Coastguard Worker           ++it_start;
280*993b0882SAndroid Build Coastguard Worker           --it_end;
281*993b0882SAndroid Build Coastguard Worker         }
282*993b0882SAndroid Build Coastguard Worker 
283*993b0882SAndroid Build Coastguard Worker         UnicodeText::const_iterator it_chargram_start = it_start;
284*993b0882SAndroid Build Coastguard Worker         UnicodeText::const_iterator it_chargram_end = it_start;
285*993b0882SAndroid Build Coastguard Worker         bool chargram_is_complete = true;
286*993b0882SAndroid Build Coastguard Worker         for (int i = 0; i < chargram_order; ++i) {
287*993b0882SAndroid Build Coastguard Worker           if (it_chargram_end == it_end) {
288*993b0882SAndroid Build Coastguard Worker             chargram_is_complete = false;
289*993b0882SAndroid Build Coastguard Worker             break;
290*993b0882SAndroid Build Coastguard Worker           }
291*993b0882SAndroid Build Coastguard Worker           ++it_chargram_end;
292*993b0882SAndroid Build Coastguard Worker         }
293*993b0882SAndroid Build Coastguard Worker         if (!chargram_is_complete) {
294*993b0882SAndroid Build Coastguard Worker           continue;
295*993b0882SAndroid Build Coastguard Worker         }
296*993b0882SAndroid Build Coastguard Worker 
297*993b0882SAndroid Build Coastguard Worker         for (; it_chargram_end <= it_end;
298*993b0882SAndroid Build Coastguard Worker              ++it_chargram_start, ++it_chargram_end) {
299*993b0882SAndroid Build Coastguard Worker           const int length_bytes =
300*993b0882SAndroid Build Coastguard Worker               it_chargram_end.utf8_data() - it_chargram_start.utf8_data();
301*993b0882SAndroid Build Coastguard Worker           result.push_back(HashToken(
302*993b0882SAndroid Build Coastguard Worker               StringPiece(it_chargram_start.utf8_data(), length_bytes)));
303*993b0882SAndroid Build Coastguard Worker         }
304*993b0882SAndroid Build Coastguard Worker       }
305*993b0882SAndroid Build Coastguard Worker     }
306*993b0882SAndroid Build Coastguard Worker   }
307*993b0882SAndroid Build Coastguard Worker   return result;
308*993b0882SAndroid Build Coastguard Worker }
309*993b0882SAndroid Build Coastguard Worker 
310*993b0882SAndroid Build Coastguard Worker }  // namespace libtextclassifier3
311