xref: /aosp_15_r20/external/libtextclassifier/native/utils/tokenizer.h (revision 993b0882672172b81d12fad7a7ac0c3e5c824a12)
/*
 * Copyright (C) 2018 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
16*993b0882SAndroid Build Coastguard Worker 
17*993b0882SAndroid Build Coastguard Worker #ifndef LIBTEXTCLASSIFIER_UTILS_TOKENIZER_H_
18*993b0882SAndroid Build Coastguard Worker #define LIBTEXTCLASSIFIER_UTILS_TOKENIZER_H_
19*993b0882SAndroid Build Coastguard Worker 
#include <memory>
#include <string>
#include <vector>

#include "annotator/types.h"
#include "utils/base/integral_types.h"
#include "utils/codepoint-range.h"
#include "utils/tokenizer_generated.h"
#include "utils/utf8/unicodetext.h"
#include "utils/utf8/unilib.h"
#include "absl/strings/string_view.h"
30*993b0882SAndroid Build Coastguard Worker 
31*993b0882SAndroid Build Coastguard Worker namespace libtextclassifier3 {
32*993b0882SAndroid Build Coastguard Worker 
// Sentinel script ids. Both are negative and distinct from each other.
// `kUnknownScript` is the value GetScriptAndRole() is documented to assign
// when no configured range matches the codepoint.
// NOTE(review): the exact use of `kInvalidScript` is not visible in this
// header — confirm against the implementation before relying on it.
constexpr int kInvalidScript = -1;
constexpr int kUnknownScript = -2;
35*993b0882SAndroid Build Coastguard Worker 
36*993b0882SAndroid Build Coastguard Worker // Tokenizer splits the input string into a sequence of tokens, according to
37*993b0882SAndroid Build Coastguard Worker // the configuration.
38*993b0882SAndroid Build Coastguard Worker class Tokenizer {
39*993b0882SAndroid Build Coastguard Worker  public:
40*993b0882SAndroid Build Coastguard Worker   // `codepoint_ranges`: Codepoint ranges that determine how different
41*993b0882SAndroid Build Coastguard Worker   //      codepoints are tokenized. The ranges must not overlap.
42*993b0882SAndroid Build Coastguard Worker   // `internal_tokenizer_codepoint_ranges`: Codepoint ranges that define which
43*993b0882SAndroid Build Coastguard Worker   //      tokens should be re-tokenized with the internal tokenizer in the mixed
44*993b0882SAndroid Build Coastguard Worker   //      tokenization mode.
45*993b0882SAndroid Build Coastguard Worker   // `split_on_script_change`: Whether to consider a change of codepoint script
46*993b0882SAndroid Build Coastguard Worker   //      in a sequence of characters as a token boundary. If True, will treat
47*993b0882SAndroid Build Coastguard Worker   //      script change as a token boundary.
48*993b0882SAndroid Build Coastguard Worker   // `icu_preserve_whitespace_tokens`: If true, will include empty tokens in the
49*993b0882SAndroid Build Coastguard Worker   // output (in the ICU tokenization mode).
50*993b0882SAndroid Build Coastguard Worker   // `preserve_floating_numbers`: If true (default), will keep dots between
51*993b0882SAndroid Build Coastguard Worker   // digits together, not making separate tokens (in the LETTER_DIGIT
52*993b0882SAndroid Build Coastguard Worker   // tokenization mode).
53*993b0882SAndroid Build Coastguard Worker   Tokenizer(
54*993b0882SAndroid Build Coastguard Worker       const TokenizationType type, const UniLib* unilib,
55*993b0882SAndroid Build Coastguard Worker       const std::vector<const TokenizationCodepointRange*>& codepoint_ranges,
56*993b0882SAndroid Build Coastguard Worker       const std::vector<const CodepointRange*>&
57*993b0882SAndroid Build Coastguard Worker           internal_tokenizer_codepoint_ranges,
58*993b0882SAndroid Build Coastguard Worker       const bool split_on_script_change,
59*993b0882SAndroid Build Coastguard Worker       const bool icu_preserve_whitespace_tokens,
60*993b0882SAndroid Build Coastguard Worker       const bool preserve_floating_numbers);
61*993b0882SAndroid Build Coastguard Worker 
Tokenizer(const TokenizationType type,const UniLib * unilib,const std::vector<const TokenizationCodepointRange * > & codepoint_ranges,const std::vector<const CodepointRange * > & internal_tokenizer_codepoint_ranges,const bool split_on_script_change,const bool icu_preserve_whitespace_tokens)62*993b0882SAndroid Build Coastguard Worker   Tokenizer(
63*993b0882SAndroid Build Coastguard Worker       const TokenizationType type, const UniLib* unilib,
64*993b0882SAndroid Build Coastguard Worker       const std::vector<const TokenizationCodepointRange*>& codepoint_ranges,
65*993b0882SAndroid Build Coastguard Worker       const std::vector<const CodepointRange*>&
66*993b0882SAndroid Build Coastguard Worker           internal_tokenizer_codepoint_ranges,
67*993b0882SAndroid Build Coastguard Worker       const bool split_on_script_change,
68*993b0882SAndroid Build Coastguard Worker       const bool icu_preserve_whitespace_tokens)
69*993b0882SAndroid Build Coastguard Worker       : Tokenizer(type, unilib, codepoint_ranges,
70*993b0882SAndroid Build Coastguard Worker                   internal_tokenizer_codepoint_ranges, split_on_script_change,
71*993b0882SAndroid Build Coastguard Worker                   icu_preserve_whitespace_tokens,
72*993b0882SAndroid Build Coastguard Worker                   /*preserve_floating_numbers=*/true) {}
73*993b0882SAndroid Build Coastguard Worker 
Tokenizer(const std::vector<const TokenizationCodepointRange * > & codepoint_ranges,const bool split_on_script_change)74*993b0882SAndroid Build Coastguard Worker   Tokenizer(
75*993b0882SAndroid Build Coastguard Worker       const std::vector<const TokenizationCodepointRange*>& codepoint_ranges,
76*993b0882SAndroid Build Coastguard Worker       const bool split_on_script_change)
77*993b0882SAndroid Build Coastguard Worker       : Tokenizer(TokenizationType_INTERNAL_TOKENIZER, /*unilib=*/nullptr,
78*993b0882SAndroid Build Coastguard Worker                   codepoint_ranges, /*internal_tokenizer_codepoint_ranges=*/{},
79*993b0882SAndroid Build Coastguard Worker                   split_on_script_change,
80*993b0882SAndroid Build Coastguard Worker                   /*icu_preserve_whitespace_tokens=*/false,
81*993b0882SAndroid Build Coastguard Worker                   /*preserve_floating_numbers=*/true) {}
82*993b0882SAndroid Build Coastguard Worker 
83*993b0882SAndroid Build Coastguard Worker   // Describes the type of tokens used in the NumberTokenizer.
84*993b0882SAndroid Build Coastguard Worker   enum NumberTokenType {
85*993b0882SAndroid Build Coastguard Worker     INVALID_TOKEN_TYPE,
86*993b0882SAndroid Build Coastguard Worker     NUMERICAL,
87*993b0882SAndroid Build Coastguard Worker     TERM,
88*993b0882SAndroid Build Coastguard Worker     WHITESPACE,
89*993b0882SAndroid Build Coastguard Worker     SEPARATOR,
90*993b0882SAndroid Build Coastguard Worker     NOT_SET
91*993b0882SAndroid Build Coastguard Worker   };
92*993b0882SAndroid Build Coastguard Worker 
93*993b0882SAndroid Build Coastguard Worker   // Tokenizes the input string using the selected tokenization method.
94*993b0882SAndroid Build Coastguard Worker   std::vector<Token> Tokenize(absl::string_view text) const;
95*993b0882SAndroid Build Coastguard Worker 
96*993b0882SAndroid Build Coastguard Worker   // Same as above but takes UnicodeText.
97*993b0882SAndroid Build Coastguard Worker   std::vector<Token> Tokenize(const UnicodeText& text_unicode) const;
98*993b0882SAndroid Build Coastguard Worker 
99*993b0882SAndroid Build Coastguard Worker  protected:
100*993b0882SAndroid Build Coastguard Worker   // Finds the tokenization codepoint range config for given codepoint.
101*993b0882SAndroid Build Coastguard Worker   // Internally uses binary search so should be O(log(# of codepoint_ranges)).
102*993b0882SAndroid Build Coastguard Worker   const TokenizationCodepointRangeT* FindTokenizationRange(int codepoint) const;
103*993b0882SAndroid Build Coastguard Worker 
104*993b0882SAndroid Build Coastguard Worker   // Finds the role and script for given codepoint. If not found, DEFAULT_ROLE
105*993b0882SAndroid Build Coastguard Worker   // and kUnknownScript are assigned.
106*993b0882SAndroid Build Coastguard Worker   void GetScriptAndRole(char32 codepoint,
107*993b0882SAndroid Build Coastguard Worker                         TokenizationCodepointRange_::Role* role,
108*993b0882SAndroid Build Coastguard Worker                         int* script) const;
109*993b0882SAndroid Build Coastguard Worker 
110*993b0882SAndroid Build Coastguard Worker   // Tokenizes a substring of the unicode string, appending the resulting tokens
111*993b0882SAndroid Build Coastguard Worker   // to the output vector. The resulting tokens have bounds relative to the full
112*993b0882SAndroid Build Coastguard Worker   // string. Does nothing if the start of the span is negative.
113*993b0882SAndroid Build Coastguard Worker   void TokenizeSubstring(const UnicodeText& unicode_text, CodepointSpan span,
114*993b0882SAndroid Build Coastguard Worker                          std::vector<Token>* result) const;
115*993b0882SAndroid Build Coastguard Worker 
116*993b0882SAndroid Build Coastguard Worker   std::vector<Token> InternalTokenize(const UnicodeText& text_unicode) const;
117*993b0882SAndroid Build Coastguard Worker 
118*993b0882SAndroid Build Coastguard Worker   // Takes the result of ICU tokenization and retokenizes stretches of tokens
119*993b0882SAndroid Build Coastguard Worker   // made of a specific subset of characters using the internal tokenizer.
120*993b0882SAndroid Build Coastguard Worker   void InternalRetokenize(const UnicodeText& unicode_text,
121*993b0882SAndroid Build Coastguard Worker                           std::vector<Token>* tokens) const;
122*993b0882SAndroid Build Coastguard Worker 
123*993b0882SAndroid Build Coastguard Worker   // Tokenizes the input text using ICU tokenizer.
124*993b0882SAndroid Build Coastguard Worker   bool ICUTokenize(const UnicodeText& context_unicode,
125*993b0882SAndroid Build Coastguard Worker                    std::vector<Token>* result) const;
126*993b0882SAndroid Build Coastguard Worker 
127*993b0882SAndroid Build Coastguard Worker   // Tokenizes the input in number, word and separator tokens.
128*993b0882SAndroid Build Coastguard Worker   bool NumberTokenize(const UnicodeText& text_unicode,
129*993b0882SAndroid Build Coastguard Worker                       std::vector<Token>* result) const;
130*993b0882SAndroid Build Coastguard Worker 
131*993b0882SAndroid Build Coastguard Worker  private:
132*993b0882SAndroid Build Coastguard Worker   const TokenizationType type_;
133*993b0882SAndroid Build Coastguard Worker 
134*993b0882SAndroid Build Coastguard Worker   const UniLib* unilib_;
135*993b0882SAndroid Build Coastguard Worker 
136*993b0882SAndroid Build Coastguard Worker   // Codepoint ranges that determine how different codepoints are tokenized.
137*993b0882SAndroid Build Coastguard Worker   // The ranges must not overlap.
138*993b0882SAndroid Build Coastguard Worker   std::vector<std::unique_ptr<const TokenizationCodepointRangeT>>
139*993b0882SAndroid Build Coastguard Worker       codepoint_ranges_;
140*993b0882SAndroid Build Coastguard Worker 
141*993b0882SAndroid Build Coastguard Worker   // Codepoint ranges that define which tokens (consisting of which codepoints)
142*993b0882SAndroid Build Coastguard Worker   // should be re-tokenized with the internal tokenizer in the mixed
143*993b0882SAndroid Build Coastguard Worker   // tokenization mode.
144*993b0882SAndroid Build Coastguard Worker   // NOTE: Must be sorted.
145*993b0882SAndroid Build Coastguard Worker   std::vector<CodepointRangeStruct> internal_tokenizer_codepoint_ranges_;
146*993b0882SAndroid Build Coastguard Worker 
147*993b0882SAndroid Build Coastguard Worker   // If true, tokens will be additionally split when the codepoint's script_id
148*993b0882SAndroid Build Coastguard Worker   // changes.
149*993b0882SAndroid Build Coastguard Worker   const bool split_on_script_change_;
150*993b0882SAndroid Build Coastguard Worker 
151*993b0882SAndroid Build Coastguard Worker   const bool icu_preserve_whitespace_tokens_;
152*993b0882SAndroid Build Coastguard Worker   const bool preserve_floating_numbers_;
153*993b0882SAndroid Build Coastguard Worker };
154*993b0882SAndroid Build Coastguard Worker 
155*993b0882SAndroid Build Coastguard Worker }  // namespace libtextclassifier3
156*993b0882SAndroid Build Coastguard Worker 
157*993b0882SAndroid Build Coastguard Worker #endif  // LIBTEXTCLASSIFIER_UTILS_TOKENIZER_H_
158