1 /* 2 * Copyright (C) 2018 The Android Open Source Project 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 17 #ifndef LIBTEXTCLASSIFIER_UTILS_TOKENIZER_H_ 18 #define LIBTEXTCLASSIFIER_UTILS_TOKENIZER_H_ 19 20 #include <string> 21 #include <vector> 22 23 #include "annotator/types.h" 24 #include "utils/base/integral_types.h" 25 #include "utils/codepoint-range.h" 26 #include "utils/tokenizer_generated.h" 27 #include "utils/utf8/unicodetext.h" 28 #include "utils/utf8/unilib.h" 29 #include "absl/strings/string_view.h" 30 31 namespace libtextclassifier3 { 32 33 const int kInvalidScript = -1; 34 const int kUnknownScript = -2; 35 36 // Tokenizer splits the input string into a sequence of tokens, according to 37 // the configuration. 38 class Tokenizer { 39 public: 40 // `codepoint_ranges`: Codepoint ranges that determine how different 41 // codepoints are tokenized. The ranges must not overlap. 42 // `internal_tokenizer_codepoint_ranges`: Codepoint ranges that define which 43 // tokens should be re-tokenized with the internal tokenizer in the mixed 44 // tokenization mode. 45 // `split_on_script_change`: Whether to consider a change of codepoint script 46 // in a sequence of characters as a token boundary. If True, will treat 47 // script change as a token boundary. 48 // `icu_preserve_whitespace_tokens`: If true, will include empty tokens in the 49 // output (in the ICU tokenization mode). 50 // `preserve_floating_numbers`: If true (default), will keep dots between 51 // digits together, not making separate tokens (in the LETTER_DIGIT 52 // tokenization mode). 53 Tokenizer( 54 const TokenizationType type, const UniLib* unilib, 55 const std::vector<const TokenizationCodepointRange*>& codepoint_ranges, 56 const std::vector<const CodepointRange*>& 57 internal_tokenizer_codepoint_ranges, 58 const bool split_on_script_change, 59 const bool icu_preserve_whitespace_tokens, 60 const bool preserve_floating_numbers); 61 Tokenizer(const TokenizationType type,const UniLib * unilib,const std::vector<const TokenizationCodepointRange * > & codepoint_ranges,const std::vector<const CodepointRange * > & internal_tokenizer_codepoint_ranges,const bool split_on_script_change,const bool icu_preserve_whitespace_tokens)62 Tokenizer( 63 const TokenizationType type, const UniLib* unilib, 64 const std::vector<const TokenizationCodepointRange*>& codepoint_ranges, 65 const std::vector<const CodepointRange*>& 66 internal_tokenizer_codepoint_ranges, 67 const bool split_on_script_change, 68 const bool icu_preserve_whitespace_tokens) 69 : Tokenizer(type, unilib, codepoint_ranges, 70 internal_tokenizer_codepoint_ranges, split_on_script_change, 71 icu_preserve_whitespace_tokens, 72 /*preserve_floating_numbers=*/true) {} 73 Tokenizer(const std::vector<const TokenizationCodepointRange * > & codepoint_ranges,const bool split_on_script_change)74 Tokenizer( 75 const std::vector<const TokenizationCodepointRange*>& codepoint_ranges, 76 const bool split_on_script_change) 77 : Tokenizer(TokenizationType_INTERNAL_TOKENIZER, /*unilib=*/nullptr, 78 codepoint_ranges, /*internal_tokenizer_codepoint_ranges=*/{}, 79 split_on_script_change, 80 /*icu_preserve_whitespace_tokens=*/false, 81 /*preserve_floating_numbers=*/true) {} 82 83 // Describes the type of tokens used in the NumberTokenizer. 84 enum NumberTokenType { 85 INVALID_TOKEN_TYPE, 86 NUMERICAL, 87 TERM, 88 WHITESPACE, 89 SEPARATOR, 90 NOT_SET 91 }; 92 93 // Tokenizes the input string using the selected tokenization method. 94 std::vector<Token> Tokenize(absl::string_view text) const; 95 96 // Same as above but takes UnicodeText. 97 std::vector<Token> Tokenize(const UnicodeText& text_unicode) const; 98 99 protected: 100 // Finds the tokenization codepoint range config for given codepoint. 101 // Internally uses binary search so should be O(log(# of codepoint_ranges)). 102 const TokenizationCodepointRangeT* FindTokenizationRange(int codepoint) const; 103 104 // Finds the role and script for given codepoint. If not found, DEFAULT_ROLE 105 // and kUnknownScript are assigned. 106 void GetScriptAndRole(char32 codepoint, 107 TokenizationCodepointRange_::Role* role, 108 int* script) const; 109 110 // Tokenizes a substring of the unicode string, appending the resulting tokens 111 // to the output vector. The resulting tokens have bounds relative to the full 112 // string. Does nothing if the start of the span is negative. 113 void TokenizeSubstring(const UnicodeText& unicode_text, CodepointSpan span, 114 std::vector<Token>* result) const; 115 116 std::vector<Token> InternalTokenize(const UnicodeText& text_unicode) const; 117 118 // Takes the result of ICU tokenization and retokenizes stretches of tokens 119 // made of a specific subset of characters using the internal tokenizer. 120 void InternalRetokenize(const UnicodeText& unicode_text, 121 std::vector<Token>* tokens) const; 122 123 // Tokenizes the input text using ICU tokenizer. 124 bool ICUTokenize(const UnicodeText& context_unicode, 125 std::vector<Token>* result) const; 126 127 // Tokenizes the input in number, word and separator tokens. 128 bool NumberTokenize(const UnicodeText& text_unicode, 129 std::vector<Token>* result) const; 130 131 private: 132 const TokenizationType type_; 133 134 const UniLib* unilib_; 135 136 // Codepoint ranges that determine how different codepoints are tokenized. 137 // The ranges must not overlap. 138 std::vector<std::unique_ptr<const TokenizationCodepointRangeT>> 139 codepoint_ranges_; 140 141 // Codepoint ranges that define which tokens (consisting of which codepoints) 142 // should be re-tokenized with the internal tokenizer in the mixed 143 // tokenization mode. 144 // NOTE: Must be sorted. 145 std::vector<CodepointRangeStruct> internal_tokenizer_codepoint_ranges_; 146 147 // If true, tokens will be additionally split when the codepoint's script_id 148 // changes. 149 const bool split_on_script_change_; 150 151 const bool icu_preserve_whitespace_tokens_; 152 const bool preserve_floating_numbers_; 153 }; 154 155 } // namespace libtextclassifier3 156 157 #endif // LIBTEXTCLASSIFIER_UTILS_TOKENIZER_H_ 158