xref: /aosp_15_r20/external/libtextclassifier/native/utils/tokenizer.h (revision 993b0882672172b81d12fad7a7ac0c3e5c824a12)
1 /*
2  * Copyright (C) 2018 The Android Open Source Project
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *      http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #ifndef LIBTEXTCLASSIFIER_UTILS_TOKENIZER_H_
18 #define LIBTEXTCLASSIFIER_UTILS_TOKENIZER_H_
19 
#include <memory>
#include <string>
#include <vector>

#include "annotator/types.h"
#include "utils/base/integral_types.h"
#include "utils/codepoint-range.h"
#include "utils/tokenizer_generated.h"
#include "utils/utf8/unicodetext.h"
#include "utils/utf8/unilib.h"
#include "absl/strings/string_view.h"
30 
31 namespace libtextclassifier3 {
32 
// Sentinel script ids. GetScriptAndRole() assigns kUnknownScript when a
// codepoint is not covered by the tokenization config; kInvalidScript
// presumably marks a range whose script value is missing/invalid — TODO
// confirm against the .cc.
// `inline constexpr` (C++17) yields a single definition shared across
// translation units instead of one internal-linkage copy per TU.
inline constexpr int kInvalidScript = -1;
inline constexpr int kUnknownScript = -2;
35 
36 // Tokenizer splits the input string into a sequence of tokens, according to
37 // the configuration.
38 class Tokenizer {
39  public:
40   // `codepoint_ranges`: Codepoint ranges that determine how different
41   //      codepoints are tokenized. The ranges must not overlap.
42   // `internal_tokenizer_codepoint_ranges`: Codepoint ranges that define which
43   //      tokens should be re-tokenized with the internal tokenizer in the mixed
44   //      tokenization mode.
45   // `split_on_script_change`: Whether to consider a change of codepoint script
46   //      in a sequence of characters as a token boundary. If True, will treat
47   //      script change as a token boundary.
48   // `icu_preserve_whitespace_tokens`: If true, will include empty tokens in the
49   // output (in the ICU tokenization mode).
50   // `preserve_floating_numbers`: If true (default), will keep dots between
51   // digits together, not making separate tokens (in the LETTER_DIGIT
52   // tokenization mode).
53   Tokenizer(
54       const TokenizationType type, const UniLib* unilib,
55       const std::vector<const TokenizationCodepointRange*>& codepoint_ranges,
56       const std::vector<const CodepointRange*>&
57           internal_tokenizer_codepoint_ranges,
58       const bool split_on_script_change,
59       const bool icu_preserve_whitespace_tokens,
60       const bool preserve_floating_numbers);
61 
Tokenizer(const TokenizationType type,const UniLib * unilib,const std::vector<const TokenizationCodepointRange * > & codepoint_ranges,const std::vector<const CodepointRange * > & internal_tokenizer_codepoint_ranges,const bool split_on_script_change,const bool icu_preserve_whitespace_tokens)62   Tokenizer(
63       const TokenizationType type, const UniLib* unilib,
64       const std::vector<const TokenizationCodepointRange*>& codepoint_ranges,
65       const std::vector<const CodepointRange*>&
66           internal_tokenizer_codepoint_ranges,
67       const bool split_on_script_change,
68       const bool icu_preserve_whitespace_tokens)
69       : Tokenizer(type, unilib, codepoint_ranges,
70                   internal_tokenizer_codepoint_ranges, split_on_script_change,
71                   icu_preserve_whitespace_tokens,
72                   /*preserve_floating_numbers=*/true) {}
73 
Tokenizer(const std::vector<const TokenizationCodepointRange * > & codepoint_ranges,const bool split_on_script_change)74   Tokenizer(
75       const std::vector<const TokenizationCodepointRange*>& codepoint_ranges,
76       const bool split_on_script_change)
77       : Tokenizer(TokenizationType_INTERNAL_TOKENIZER, /*unilib=*/nullptr,
78                   codepoint_ranges, /*internal_tokenizer_codepoint_ranges=*/{},
79                   split_on_script_change,
80                   /*icu_preserve_whitespace_tokens=*/false,
81                   /*preserve_floating_numbers=*/true) {}
82 
83   // Describes the type of tokens used in the NumberTokenizer.
84   enum NumberTokenType {
85     INVALID_TOKEN_TYPE,
86     NUMERICAL,
87     TERM,
88     WHITESPACE,
89     SEPARATOR,
90     NOT_SET
91   };
92 
93   // Tokenizes the input string using the selected tokenization method.
94   std::vector<Token> Tokenize(absl::string_view text) const;
95 
96   // Same as above but takes UnicodeText.
97   std::vector<Token> Tokenize(const UnicodeText& text_unicode) const;
98 
99  protected:
100   // Finds the tokenization codepoint range config for given codepoint.
101   // Internally uses binary search so should be O(log(# of codepoint_ranges)).
102   const TokenizationCodepointRangeT* FindTokenizationRange(int codepoint) const;
103 
104   // Finds the role and script for given codepoint. If not found, DEFAULT_ROLE
105   // and kUnknownScript are assigned.
106   void GetScriptAndRole(char32 codepoint,
107                         TokenizationCodepointRange_::Role* role,
108                         int* script) const;
109 
110   // Tokenizes a substring of the unicode string, appending the resulting tokens
111   // to the output vector. The resulting tokens have bounds relative to the full
112   // string. Does nothing if the start of the span is negative.
113   void TokenizeSubstring(const UnicodeText& unicode_text, CodepointSpan span,
114                          std::vector<Token>* result) const;
115 
116   std::vector<Token> InternalTokenize(const UnicodeText& text_unicode) const;
117 
118   // Takes the result of ICU tokenization and retokenizes stretches of tokens
119   // made of a specific subset of characters using the internal tokenizer.
120   void InternalRetokenize(const UnicodeText& unicode_text,
121                           std::vector<Token>* tokens) const;
122 
123   // Tokenizes the input text using ICU tokenizer.
124   bool ICUTokenize(const UnicodeText& context_unicode,
125                    std::vector<Token>* result) const;
126 
127   // Tokenizes the input in number, word and separator tokens.
128   bool NumberTokenize(const UnicodeText& text_unicode,
129                       std::vector<Token>* result) const;
130 
131  private:
132   const TokenizationType type_;
133 
134   const UniLib* unilib_;
135 
136   // Codepoint ranges that determine how different codepoints are tokenized.
137   // The ranges must not overlap.
138   std::vector<std::unique_ptr<const TokenizationCodepointRangeT>>
139       codepoint_ranges_;
140 
141   // Codepoint ranges that define which tokens (consisting of which codepoints)
142   // should be re-tokenized with the internal tokenizer in the mixed
143   // tokenization mode.
144   // NOTE: Must be sorted.
145   std::vector<CodepointRangeStruct> internal_tokenizer_codepoint_ranges_;
146 
147   // If true, tokens will be additionally split when the codepoint's script_id
148   // changes.
149   const bool split_on_script_change_;
150 
151   const bool icu_preserve_whitespace_tokens_;
152   const bool preserve_floating_numbers_;
153 };
154 
155 }  // namespace libtextclassifier3
156 
157 #endif  // LIBTEXTCLASSIFIER_UTILS_TOKENIZER_H_
158