1*993b0882SAndroid Build Coastguard Worker /*
2*993b0882SAndroid Build Coastguard Worker * Copyright (C) 2018 The Android Open Source Project
3*993b0882SAndroid Build Coastguard Worker *
4*993b0882SAndroid Build Coastguard Worker * Licensed under the Apache License, Version 2.0 (the "License");
5*993b0882SAndroid Build Coastguard Worker * you may not use this file except in compliance with the License.
6*993b0882SAndroid Build Coastguard Worker * You may obtain a copy of the License at
7*993b0882SAndroid Build Coastguard Worker *
8*993b0882SAndroid Build Coastguard Worker * http://www.apache.org/licenses/LICENSE-2.0
9*993b0882SAndroid Build Coastguard Worker *
10*993b0882SAndroid Build Coastguard Worker * Unless required by applicable law or agreed to in writing, software
11*993b0882SAndroid Build Coastguard Worker * distributed under the License is distributed on an "AS IS" BASIS,
12*993b0882SAndroid Build Coastguard Worker * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13*993b0882SAndroid Build Coastguard Worker * See the License for the specific language governing permissions and
14*993b0882SAndroid Build Coastguard Worker * limitations under the License.
15*993b0882SAndroid Build Coastguard Worker */
16*993b0882SAndroid Build Coastguard Worker
17*993b0882SAndroid Build Coastguard Worker // Utilities for tests.
18*993b0882SAndroid Build Coastguard Worker
19*993b0882SAndroid Build Coastguard Worker #ifndef LIBTEXTCLASSIFIER_UTILS_TOKENIZER_UTILS_H_
20*993b0882SAndroid Build Coastguard Worker #define LIBTEXTCLASSIFIER_UTILS_TOKENIZER_UTILS_H_
21*993b0882SAndroid Build Coastguard Worker
22*993b0882SAndroid Build Coastguard Worker #include <string>
23*993b0882SAndroid Build Coastguard Worker
24*993b0882SAndroid Build Coastguard Worker #include "annotator/types.h"
25*993b0882SAndroid Build Coastguard Worker #include "utils/codepoint-range.h"
26*993b0882SAndroid Build Coastguard Worker #include "utils/strings/utf8.h"
27*993b0882SAndroid Build Coastguard Worker #include "utils/utf8/unicodetext.h"
28*993b0882SAndroid Build Coastguard Worker #include "absl/container/flat_hash_set.h"
29*993b0882SAndroid Build Coastguard Worker #include "absl/strings/string_view.h"
30*993b0882SAndroid Build Coastguard Worker
31*993b0882SAndroid Build Coastguard Worker namespace libtextclassifier3 {
32*993b0882SAndroid Build Coastguard Worker
// Decision produced by a tokenization filter function for a single codepoint
// (see TokenizeWithFilter).
// Both flags default to false, so a default-constructed FilterResult means
// "do not split on this codepoint" instead of holding indeterminate values.
struct FilterResult {
  // Whether to split the text on this codepoint.
  bool to_split = false;
  // If the codepoint is used to split the text, whether to also output the
  // codepoint itself as a token.
  bool to_keep = false;
};
40*993b0882SAndroid Build Coastguard Worker
41*993b0882SAndroid Build Coastguard Worker // Returns a list of Tokens for a given input string, by tokenizing on space.
42*993b0882SAndroid Build Coastguard Worker std::vector<Token> TokenizeOnSpace(const std::string& text);
43*993b0882SAndroid Build Coastguard Worker
44*993b0882SAndroid Build Coastguard Worker // Returns a list of Tokens for a given input string, by tokenizing on the
45*993b0882SAndroid Build Coastguard Worker // given set of delimiter codepoints.
46*993b0882SAndroid Build Coastguard Worker // If create_tokens_for_non_space_delimiters is true, create tokens for
47*993b0882SAndroid Build Coastguard Worker // delimiters which are not white spaces. For example "This, is" -> {"This",
48*993b0882SAndroid Build Coastguard Worker // ",", "is"}.
49*993b0882SAndroid Build Coastguard Worker std::vector<Token> TokenizeOnDelimiters(
50*993b0882SAndroid Build Coastguard Worker const std::string& text, const absl::flat_hash_set<char32>& delimiters,
51*993b0882SAndroid Build Coastguard Worker bool create_tokens_for_non_space_delimiters = false);
52*993b0882SAndroid Build Coastguard Worker
53*993b0882SAndroid Build Coastguard Worker // This replicates how the original bert_tokenizer from the tflite-support
54*993b0882SAndroid Build Coastguard Worker // library pretokenize text by using regex_split with these default regexes.
55*993b0882SAndroid Build Coastguard Worker // It splits the text on spaces, punctuations and chinese characters and
56*993b0882SAndroid Build Coastguard Worker // output all the tokens except spaces.
57*993b0882SAndroid Build Coastguard Worker // So far, the only difference between this and the original implementation
58*993b0882SAndroid Build Coastguard Worker // we are aware of is that the original regexes has 8 ranges of chinese
59*993b0882SAndroid Build Coastguard Worker // unicodes. We have all these 8 ranges plus two extra ranges.
60*993b0882SAndroid Build Coastguard Worker std::vector<Token> TokenizeOnWhiteSpacePunctuationAndChineseLetter(
61*993b0882SAndroid Build Coastguard Worker const absl::string_view text);
62*993b0882SAndroid Build Coastguard Worker
63*993b0882SAndroid Build Coastguard Worker // Returns a list of Tokens for a given input string, by tokenizing on the
64*993b0882SAndroid Build Coastguard Worker // given filter function. Caller can control which codepoint to split and
65*993b0882SAndroid Build Coastguard Worker // whether a delimiter should be output as a token.
66*993b0882SAndroid Build Coastguard Worker template <typename FilterFn>
TokenizeWithFilter(const absl::string_view input,FilterFn filter)67*993b0882SAndroid Build Coastguard Worker std::vector<Token> TokenizeWithFilter(const absl::string_view input,
68*993b0882SAndroid Build Coastguard Worker FilterFn filter) {
69*993b0882SAndroid Build Coastguard Worker const UnicodeText input_unicode = UTF8ToUnicodeText(input, /*do_copy=*/false);
70*993b0882SAndroid Build Coastguard Worker std::vector<Token> tokens;
71*993b0882SAndroid Build Coastguard Worker UnicodeText::const_iterator start_it = input_unicode.begin();
72*993b0882SAndroid Build Coastguard Worker int token_start_codepoint = 0;
73*993b0882SAndroid Build Coastguard Worker int codepoint_idx = 0;
74*993b0882SAndroid Build Coastguard Worker
75*993b0882SAndroid Build Coastguard Worker for (auto it = input_unicode.begin(); it != input_unicode.end(); ++it) {
76*993b0882SAndroid Build Coastguard Worker const char32 code_point = *it;
77*993b0882SAndroid Build Coastguard Worker FilterResult filter_result = filter(code_point);
78*993b0882SAndroid Build Coastguard Worker if (filter_result.to_split) {
79*993b0882SAndroid Build Coastguard Worker const std::string token_text = UnicodeText::UTF8Substring(start_it, it);
80*993b0882SAndroid Build Coastguard Worker if (!token_text.empty()) {
81*993b0882SAndroid Build Coastguard Worker tokens.push_back(
82*993b0882SAndroid Build Coastguard Worker Token{token_text, token_start_codepoint, codepoint_idx});
83*993b0882SAndroid Build Coastguard Worker }
84*993b0882SAndroid Build Coastguard Worker if (filter_result.to_keep) {
85*993b0882SAndroid Build Coastguard Worker const std::string delimiter =
86*993b0882SAndroid Build Coastguard Worker UnicodeText::UTF8Substring(it, std::next(it));
87*993b0882SAndroid Build Coastguard Worker tokens.push_back(Token{delimiter, codepoint_idx, codepoint_idx + 1});
88*993b0882SAndroid Build Coastguard Worker }
89*993b0882SAndroid Build Coastguard Worker start_it = std::next(it);
90*993b0882SAndroid Build Coastguard Worker token_start_codepoint = codepoint_idx + 1;
91*993b0882SAndroid Build Coastguard Worker }
92*993b0882SAndroid Build Coastguard Worker codepoint_idx++;
93*993b0882SAndroid Build Coastguard Worker }
94*993b0882SAndroid Build Coastguard Worker // Flush the last token if any.
95*993b0882SAndroid Build Coastguard Worker if (start_it != input_unicode.end()) {
96*993b0882SAndroid Build Coastguard Worker const std::string token_text =
97*993b0882SAndroid Build Coastguard Worker UnicodeText::UTF8Substring(start_it, input_unicode.end());
98*993b0882SAndroid Build Coastguard Worker tokens.push_back(Token{token_text, token_start_codepoint, codepoint_idx});
99*993b0882SAndroid Build Coastguard Worker }
100*993b0882SAndroid Build Coastguard Worker return tokens;
101*993b0882SAndroid Build Coastguard Worker }
102*993b0882SAndroid Build Coastguard Worker
103*993b0882SAndroid Build Coastguard Worker } // namespace libtextclassifier3
104*993b0882SAndroid Build Coastguard Worker
105*993b0882SAndroid Build Coastguard Worker #endif // LIBTEXTCLASSIFIER_UTILS_TOKENIZER_UTILS_H_
106