1*993b0882SAndroid Build Coastguard Worker /*
2*993b0882SAndroid Build Coastguard Worker * Copyright (C) 2018 The Android Open Source Project
3*993b0882SAndroid Build Coastguard Worker *
4*993b0882SAndroid Build Coastguard Worker * Licensed under the Apache License, Version 2.0 (the "License");
5*993b0882SAndroid Build Coastguard Worker * you may not use this file except in compliance with the License.
6*993b0882SAndroid Build Coastguard Worker * You may obtain a copy of the License at
7*993b0882SAndroid Build Coastguard Worker *
8*993b0882SAndroid Build Coastguard Worker * http://www.apache.org/licenses/LICENSE-2.0
9*993b0882SAndroid Build Coastguard Worker *
10*993b0882SAndroid Build Coastguard Worker * Unless required by applicable law or agreed to in writing, software
11*993b0882SAndroid Build Coastguard Worker * distributed under the License is distributed on an "AS IS" BASIS,
12*993b0882SAndroid Build Coastguard Worker * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13*993b0882SAndroid Build Coastguard Worker * See the License for the specific language governing permissions and
14*993b0882SAndroid Build Coastguard Worker * limitations under the License.
15*993b0882SAndroid Build Coastguard Worker */
16*993b0882SAndroid Build Coastguard Worker
17*993b0882SAndroid Build Coastguard Worker #include "utils/tokenizer-utils.h"
18*993b0882SAndroid Build Coastguard Worker
19*993b0882SAndroid Build Coastguard Worker #include <iterator>
20*993b0882SAndroid Build Coastguard Worker
21*993b0882SAndroid Build Coastguard Worker #include "utils/codepoint-range.h"
22*993b0882SAndroid Build Coastguard Worker #include "utils/strings/utf8.h"
23*993b0882SAndroid Build Coastguard Worker #include "utils/utf8/unicodetext.h"
24*993b0882SAndroid Build Coastguard Worker #include "utils/utf8/unilib-common.h"
25*993b0882SAndroid Build Coastguard Worker #include "absl/container/flat_hash_set.h"
26*993b0882SAndroid Build Coastguard Worker
27*993b0882SAndroid Build Coastguard Worker namespace libtextclassifier3 {
28*993b0882SAndroid Build Coastguard Worker
29*993b0882SAndroid Build Coastguard Worker using libtextclassifier3::Token;
30*993b0882SAndroid Build Coastguard Worker
TokenizeOnSpace(const std::string & text)31*993b0882SAndroid Build Coastguard Worker std::vector<Token> TokenizeOnSpace(const std::string& text) {
32*993b0882SAndroid Build Coastguard Worker return TokenizeOnDelimiters(text, {' '});
33*993b0882SAndroid Build Coastguard Worker }
34*993b0882SAndroid Build Coastguard Worker
TokenizeOnDelimiters(const std::string & text,const absl::flat_hash_set<char32> & delimiters,bool create_tokens_for_non_space_delimiters)35*993b0882SAndroid Build Coastguard Worker std::vector<Token> TokenizeOnDelimiters(
36*993b0882SAndroid Build Coastguard Worker const std::string& text, const absl::flat_hash_set<char32>& delimiters,
37*993b0882SAndroid Build Coastguard Worker bool create_tokens_for_non_space_delimiters) {
38*993b0882SAndroid Build Coastguard Worker return TokenizeWithFilter(text, [&](char32 codepoint) {
39*993b0882SAndroid Build Coastguard Worker bool to_split = delimiters.find(codepoint) != delimiters.end();
40*993b0882SAndroid Build Coastguard Worker bool to_keep =
41*993b0882SAndroid Build Coastguard Worker (create_tokens_for_non_space_delimiters) ? codepoint != ' ' : false;
42*993b0882SAndroid Build Coastguard Worker return FilterResult{to_split, to_keep};
43*993b0882SAndroid Build Coastguard Worker });
44*993b0882SAndroid Build Coastguard Worker }
45*993b0882SAndroid Build Coastguard Worker
TokenizeOnWhiteSpacePunctuationAndChineseLetter(const absl::string_view text)46*993b0882SAndroid Build Coastguard Worker std::vector<Token> TokenizeOnWhiteSpacePunctuationAndChineseLetter(
47*993b0882SAndroid Build Coastguard Worker const absl::string_view text) {
48*993b0882SAndroid Build Coastguard Worker return TokenizeWithFilter(text, [](char32 codepoint) {
49*993b0882SAndroid Build Coastguard Worker bool is_whitespace = IsWhitespace(codepoint);
50*993b0882SAndroid Build Coastguard Worker bool to_split =
51*993b0882SAndroid Build Coastguard Worker is_whitespace || IsPunctuation(codepoint) || IsChineseLetter(codepoint);
52*993b0882SAndroid Build Coastguard Worker bool to_keep = !is_whitespace;
53*993b0882SAndroid Build Coastguard Worker return FilterResult{to_split, to_keep};
54*993b0882SAndroid Build Coastguard Worker });
55*993b0882SAndroid Build Coastguard Worker }
56*993b0882SAndroid Build Coastguard Worker } // namespace libtextclassifier3
57