1*993b0882SAndroid Build Coastguard Worker /* 2*993b0882SAndroid Build Coastguard Worker * Copyright (C) 2018 The Android Open Source Project 3*993b0882SAndroid Build Coastguard Worker * 4*993b0882SAndroid Build Coastguard Worker * Licensed under the Apache License, Version 2.0 (the "License"); 5*993b0882SAndroid Build Coastguard Worker * you may not use this file except in compliance with the License. 6*993b0882SAndroid Build Coastguard Worker * You may obtain a copy of the License at 7*993b0882SAndroid Build Coastguard Worker * 8*993b0882SAndroid Build Coastguard Worker * http://www.apache.org/licenses/LICENSE-2.0 9*993b0882SAndroid Build Coastguard Worker * 10*993b0882SAndroid Build Coastguard Worker * Unless required by applicable law or agreed to in writing, software 11*993b0882SAndroid Build Coastguard Worker * distributed under the License is distributed on an "AS IS" BASIS, 12*993b0882SAndroid Build Coastguard Worker * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13*993b0882SAndroid Build Coastguard Worker * See the License for the specific language governing permissions and 14*993b0882SAndroid Build Coastguard Worker * limitations under the License. 15*993b0882SAndroid Build Coastguard Worker */ 16*993b0882SAndroid Build Coastguard Worker 17*993b0882SAndroid Build Coastguard Worker #ifndef LIBTEXTCLASSIFIER_UTILS_TOKENIZER_H_ 18*993b0882SAndroid Build Coastguard Worker #define LIBTEXTCLASSIFIER_UTILS_TOKENIZER_H_ 19*993b0882SAndroid Build Coastguard Worker 20*993b0882SAndroid Build Coastguard Worker #include <string> 21*993b0882SAndroid Build Coastguard Worker #include <vector> 22*993b0882SAndroid Build Coastguard Worker 23*993b0882SAndroid Build Coastguard Worker #include "annotator/types.h" 24*993b0882SAndroid Build Coastguard Worker #include "utils/base/integral_types.h" 25*993b0882SAndroid Build Coastguard Worker #include "utils/codepoint-range.h" 26*993b0882SAndroid Build Coastguard Worker #include "utils/tokenizer_generated.h" 27*993b0882SAndroid Build Coastguard Worker #include "utils/utf8/unicodetext.h" 28*993b0882SAndroid Build Coastguard Worker #include "utils/utf8/unilib.h" 29*993b0882SAndroid Build Coastguard Worker #include "absl/strings/string_view.h" 30*993b0882SAndroid Build Coastguard Worker 31*993b0882SAndroid Build Coastguard Worker namespace libtextclassifier3 { 32*993b0882SAndroid Build Coastguard Worker 33*993b0882SAndroid Build Coastguard Worker const int kInvalidScript = -1; 34*993b0882SAndroid Build Coastguard Worker const int kUnknownScript = -2; 35*993b0882SAndroid Build Coastguard Worker 36*993b0882SAndroid Build Coastguard Worker // Tokenizer splits the input string into a sequence of tokens, according to 37*993b0882SAndroid Build Coastguard Worker // the configuration. 38*993b0882SAndroid Build Coastguard Worker class Tokenizer { 39*993b0882SAndroid Build Coastguard Worker public: 40*993b0882SAndroid Build Coastguard Worker // `codepoint_ranges`: Codepoint ranges that determine how different 41*993b0882SAndroid Build Coastguard Worker // codepoints are tokenized. The ranges must not overlap. 42*993b0882SAndroid Build Coastguard Worker // `internal_tokenizer_codepoint_ranges`: Codepoint ranges that define which 43*993b0882SAndroid Build Coastguard Worker // tokens should be re-tokenized with the internal tokenizer in the mixed 44*993b0882SAndroid Build Coastguard Worker // tokenization mode. 45*993b0882SAndroid Build Coastguard Worker // `split_on_script_change`: Whether to consider a change of codepoint script 46*993b0882SAndroid Build Coastguard Worker // in a sequence of characters as a token boundary. If True, will treat 47*993b0882SAndroid Build Coastguard Worker // script change as a token boundary. 48*993b0882SAndroid Build Coastguard Worker // `icu_preserve_whitespace_tokens`: If true, will include empty tokens in the 49*993b0882SAndroid Build Coastguard Worker // output (in the ICU tokenization mode). 50*993b0882SAndroid Build Coastguard Worker // `preserve_floating_numbers`: If true (default), will keep dots between 51*993b0882SAndroid Build Coastguard Worker // digits together, not making separate tokens (in the LETTER_DIGIT 52*993b0882SAndroid Build Coastguard Worker // tokenization mode). 53*993b0882SAndroid Build Coastguard Worker Tokenizer( 54*993b0882SAndroid Build Coastguard Worker const TokenizationType type, const UniLib* unilib, 55*993b0882SAndroid Build Coastguard Worker const std::vector<const TokenizationCodepointRange*>& codepoint_ranges, 56*993b0882SAndroid Build Coastguard Worker const std::vector<const CodepointRange*>& 57*993b0882SAndroid Build Coastguard Worker internal_tokenizer_codepoint_ranges, 58*993b0882SAndroid Build Coastguard Worker const bool split_on_script_change, 59*993b0882SAndroid Build Coastguard Worker const bool icu_preserve_whitespace_tokens, 60*993b0882SAndroid Build Coastguard Worker const bool preserve_floating_numbers); 61*993b0882SAndroid Build Coastguard Worker Tokenizer(const TokenizationType type,const UniLib * unilib,const std::vector<const TokenizationCodepointRange * > & codepoint_ranges,const std::vector<const CodepointRange * > & internal_tokenizer_codepoint_ranges,const bool split_on_script_change,const bool icu_preserve_whitespace_tokens)62*993b0882SAndroid Build Coastguard Worker Tokenizer( 63*993b0882SAndroid Build Coastguard Worker const TokenizationType type, const UniLib* unilib, 64*993b0882SAndroid Build Coastguard Worker const std::vector<const TokenizationCodepointRange*>& codepoint_ranges, 65*993b0882SAndroid Build Coastguard Worker const std::vector<const CodepointRange*>& 66*993b0882SAndroid Build Coastguard Worker internal_tokenizer_codepoint_ranges, 67*993b0882SAndroid Build Coastguard Worker const bool split_on_script_change, 68*993b0882SAndroid Build Coastguard Worker const bool icu_preserve_whitespace_tokens) 69*993b0882SAndroid Build Coastguard Worker : Tokenizer(type, unilib, codepoint_ranges, 70*993b0882SAndroid Build Coastguard Worker internal_tokenizer_codepoint_ranges, split_on_script_change, 71*993b0882SAndroid Build Coastguard Worker icu_preserve_whitespace_tokens, 72*993b0882SAndroid Build Coastguard Worker /*preserve_floating_numbers=*/true) {} 73*993b0882SAndroid Build Coastguard Worker Tokenizer(const std::vector<const TokenizationCodepointRange * > & codepoint_ranges,const bool split_on_script_change)74*993b0882SAndroid Build Coastguard Worker Tokenizer( 75*993b0882SAndroid Build Coastguard Worker const std::vector<const TokenizationCodepointRange*>& codepoint_ranges, 76*993b0882SAndroid Build Coastguard Worker const bool split_on_script_change) 77*993b0882SAndroid Build Coastguard Worker : Tokenizer(TokenizationType_INTERNAL_TOKENIZER, /*unilib=*/nullptr, 78*993b0882SAndroid Build Coastguard Worker codepoint_ranges, /*internal_tokenizer_codepoint_ranges=*/{}, 79*993b0882SAndroid Build Coastguard Worker split_on_script_change, 80*993b0882SAndroid Build Coastguard Worker /*icu_preserve_whitespace_tokens=*/false, 81*993b0882SAndroid Build Coastguard Worker /*preserve_floating_numbers=*/true) {} 82*993b0882SAndroid Build Coastguard Worker 83*993b0882SAndroid Build Coastguard Worker // Describes the type of tokens used in the NumberTokenizer. 84*993b0882SAndroid Build Coastguard Worker enum NumberTokenType { 85*993b0882SAndroid Build Coastguard Worker INVALID_TOKEN_TYPE, 86*993b0882SAndroid Build Coastguard Worker NUMERICAL, 87*993b0882SAndroid Build Coastguard Worker TERM, 88*993b0882SAndroid Build Coastguard Worker WHITESPACE, 89*993b0882SAndroid Build Coastguard Worker SEPARATOR, 90*993b0882SAndroid Build Coastguard Worker NOT_SET 91*993b0882SAndroid Build Coastguard Worker }; 92*993b0882SAndroid Build Coastguard Worker 93*993b0882SAndroid Build Coastguard Worker // Tokenizes the input string using the selected tokenization method. 94*993b0882SAndroid Build Coastguard Worker std::vector<Token> Tokenize(absl::string_view text) const; 95*993b0882SAndroid Build Coastguard Worker 96*993b0882SAndroid Build Coastguard Worker // Same as above but takes UnicodeText. 97*993b0882SAndroid Build Coastguard Worker std::vector<Token> Tokenize(const UnicodeText& text_unicode) const; 98*993b0882SAndroid Build Coastguard Worker 99*993b0882SAndroid Build Coastguard Worker protected: 100*993b0882SAndroid Build Coastguard Worker // Finds the tokenization codepoint range config for given codepoint. 101*993b0882SAndroid Build Coastguard Worker // Internally uses binary search so should be O(log(# of codepoint_ranges)). 102*993b0882SAndroid Build Coastguard Worker const TokenizationCodepointRangeT* FindTokenizationRange(int codepoint) const; 103*993b0882SAndroid Build Coastguard Worker 104*993b0882SAndroid Build Coastguard Worker // Finds the role and script for given codepoint. If not found, DEFAULT_ROLE 105*993b0882SAndroid Build Coastguard Worker // and kUnknownScript are assigned. 106*993b0882SAndroid Build Coastguard Worker void GetScriptAndRole(char32 codepoint, 107*993b0882SAndroid Build Coastguard Worker TokenizationCodepointRange_::Role* role, 108*993b0882SAndroid Build Coastguard Worker int* script) const; 109*993b0882SAndroid Build Coastguard Worker 110*993b0882SAndroid Build Coastguard Worker // Tokenizes a substring of the unicode string, appending the resulting tokens 111*993b0882SAndroid Build Coastguard Worker // to the output vector. The resulting tokens have bounds relative to the full 112*993b0882SAndroid Build Coastguard Worker // string. Does nothing if the start of the span is negative. 113*993b0882SAndroid Build Coastguard Worker void TokenizeSubstring(const UnicodeText& unicode_text, CodepointSpan span, 114*993b0882SAndroid Build Coastguard Worker std::vector<Token>* result) const; 115*993b0882SAndroid Build Coastguard Worker 116*993b0882SAndroid Build Coastguard Worker std::vector<Token> InternalTokenize(const UnicodeText& text_unicode) const; 117*993b0882SAndroid Build Coastguard Worker 118*993b0882SAndroid Build Coastguard Worker // Takes the result of ICU tokenization and retokenizes stretches of tokens 119*993b0882SAndroid Build Coastguard Worker // made of a specific subset of characters using the internal tokenizer. 120*993b0882SAndroid Build Coastguard Worker void InternalRetokenize(const UnicodeText& unicode_text, 121*993b0882SAndroid Build Coastguard Worker std::vector<Token>* tokens) const; 122*993b0882SAndroid Build Coastguard Worker 123*993b0882SAndroid Build Coastguard Worker // Tokenizes the input text using ICU tokenizer. 124*993b0882SAndroid Build Coastguard Worker bool ICUTokenize(const UnicodeText& context_unicode, 125*993b0882SAndroid Build Coastguard Worker std::vector<Token>* result) const; 126*993b0882SAndroid Build Coastguard Worker 127*993b0882SAndroid Build Coastguard Worker // Tokenizes the input in number, word and separator tokens. 128*993b0882SAndroid Build Coastguard Worker bool NumberTokenize(const UnicodeText& text_unicode, 129*993b0882SAndroid Build Coastguard Worker std::vector<Token>* result) const; 130*993b0882SAndroid Build Coastguard Worker 131*993b0882SAndroid Build Coastguard Worker private: 132*993b0882SAndroid Build Coastguard Worker const TokenizationType type_; 133*993b0882SAndroid Build Coastguard Worker 134*993b0882SAndroid Build Coastguard Worker const UniLib* unilib_; 135*993b0882SAndroid Build Coastguard Worker 136*993b0882SAndroid Build Coastguard Worker // Codepoint ranges that determine how different codepoints are tokenized. 137*993b0882SAndroid Build Coastguard Worker // The ranges must not overlap. 138*993b0882SAndroid Build Coastguard Worker std::vector<std::unique_ptr<const TokenizationCodepointRangeT>> 139*993b0882SAndroid Build Coastguard Worker codepoint_ranges_; 140*993b0882SAndroid Build Coastguard Worker 141*993b0882SAndroid Build Coastguard Worker // Codepoint ranges that define which tokens (consisting of which codepoints) 142*993b0882SAndroid Build Coastguard Worker // should be re-tokenized with the internal tokenizer in the mixed 143*993b0882SAndroid Build Coastguard Worker // tokenization mode. 144*993b0882SAndroid Build Coastguard Worker // NOTE: Must be sorted. 145*993b0882SAndroid Build Coastguard Worker std::vector<CodepointRangeStruct> internal_tokenizer_codepoint_ranges_; 146*993b0882SAndroid Build Coastguard Worker 147*993b0882SAndroid Build Coastguard Worker // If true, tokens will be additionally split when the codepoint's script_id 148*993b0882SAndroid Build Coastguard Worker // changes. 149*993b0882SAndroid Build Coastguard Worker const bool split_on_script_change_; 150*993b0882SAndroid Build Coastguard Worker 151*993b0882SAndroid Build Coastguard Worker const bool icu_preserve_whitespace_tokens_; 152*993b0882SAndroid Build Coastguard Worker const bool preserve_floating_numbers_; 153*993b0882SAndroid Build Coastguard Worker }; 154*993b0882SAndroid Build Coastguard Worker 155*993b0882SAndroid Build Coastguard Worker } // namespace libtextclassifier3 156*993b0882SAndroid Build Coastguard Worker 157*993b0882SAndroid Build Coastguard Worker #endif // LIBTEXTCLASSIFIER_UTILS_TOKENIZER_H_ 158