xref: /aosp_15_r20/external/libtextclassifier/native/utils/tokenizer-utils.cc (revision 993b0882672172b81d12fad7a7ac0c3e5c824a12)
/*
 * Copyright (C) 2018 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
16*993b0882SAndroid Build Coastguard Worker 
17*993b0882SAndroid Build Coastguard Worker #include "utils/tokenizer-utils.h"
18*993b0882SAndroid Build Coastguard Worker 
19*993b0882SAndroid Build Coastguard Worker #include <iterator>
20*993b0882SAndroid Build Coastguard Worker 
21*993b0882SAndroid Build Coastguard Worker #include "utils/codepoint-range.h"
22*993b0882SAndroid Build Coastguard Worker #include "utils/strings/utf8.h"
23*993b0882SAndroid Build Coastguard Worker #include "utils/utf8/unicodetext.h"
24*993b0882SAndroid Build Coastguard Worker #include "utils/utf8/unilib-common.h"
25*993b0882SAndroid Build Coastguard Worker #include "absl/container/flat_hash_set.h"
26*993b0882SAndroid Build Coastguard Worker 
27*993b0882SAndroid Build Coastguard Worker namespace libtextclassifier3 {
28*993b0882SAndroid Build Coastguard Worker 
29*993b0882SAndroid Build Coastguard Worker using libtextclassifier3::Token;
30*993b0882SAndroid Build Coastguard Worker 
TokenizeOnSpace(const std::string & text)31*993b0882SAndroid Build Coastguard Worker std::vector<Token> TokenizeOnSpace(const std::string& text) {
32*993b0882SAndroid Build Coastguard Worker   return TokenizeOnDelimiters(text, {' '});
33*993b0882SAndroid Build Coastguard Worker }
34*993b0882SAndroid Build Coastguard Worker 
TokenizeOnDelimiters(const std::string & text,const absl::flat_hash_set<char32> & delimiters,bool create_tokens_for_non_space_delimiters)35*993b0882SAndroid Build Coastguard Worker std::vector<Token> TokenizeOnDelimiters(
36*993b0882SAndroid Build Coastguard Worker     const std::string& text, const absl::flat_hash_set<char32>& delimiters,
37*993b0882SAndroid Build Coastguard Worker     bool create_tokens_for_non_space_delimiters) {
38*993b0882SAndroid Build Coastguard Worker   return TokenizeWithFilter(text, [&](char32 codepoint) {
39*993b0882SAndroid Build Coastguard Worker     bool to_split = delimiters.find(codepoint) != delimiters.end();
40*993b0882SAndroid Build Coastguard Worker     bool to_keep =
41*993b0882SAndroid Build Coastguard Worker         (create_tokens_for_non_space_delimiters) ? codepoint != ' ' : false;
42*993b0882SAndroid Build Coastguard Worker     return FilterResult{to_split, to_keep};
43*993b0882SAndroid Build Coastguard Worker   });
44*993b0882SAndroid Build Coastguard Worker }
45*993b0882SAndroid Build Coastguard Worker 
TokenizeOnWhiteSpacePunctuationAndChineseLetter(const absl::string_view text)46*993b0882SAndroid Build Coastguard Worker std::vector<Token> TokenizeOnWhiteSpacePunctuationAndChineseLetter(
47*993b0882SAndroid Build Coastguard Worker     const absl::string_view text) {
48*993b0882SAndroid Build Coastguard Worker   return TokenizeWithFilter(text, [](char32 codepoint) {
49*993b0882SAndroid Build Coastguard Worker     bool is_whitespace = IsWhitespace(codepoint);
50*993b0882SAndroid Build Coastguard Worker     bool to_split =
51*993b0882SAndroid Build Coastguard Worker         is_whitespace || IsPunctuation(codepoint) || IsChineseLetter(codepoint);
52*993b0882SAndroid Build Coastguard Worker     bool to_keep = !is_whitespace;
53*993b0882SAndroid Build Coastguard Worker     return FilterResult{to_split, to_keep};
54*993b0882SAndroid Build Coastguard Worker   });
55*993b0882SAndroid Build Coastguard Worker }
56*993b0882SAndroid Build Coastguard Worker }  // namespace  libtextclassifier3
57