xref: /aosp_15_r20/external/libtextclassifier/native/lang_id/custom-tokenizer.h (revision 993b0882672172b81d12fad7a7ac0c3e5c824a12)
1*993b0882SAndroid Build Coastguard Worker /*
2*993b0882SAndroid Build Coastguard Worker  * Copyright (C) 2018 The Android Open Source Project
3*993b0882SAndroid Build Coastguard Worker  *
4*993b0882SAndroid Build Coastguard Worker  * Licensed under the Apache License, Version 2.0 (the "License");
5*993b0882SAndroid Build Coastguard Worker  * you may not use this file except in compliance with the License.
6*993b0882SAndroid Build Coastguard Worker  * You may obtain a copy of the License at
7*993b0882SAndroid Build Coastguard Worker  *
8*993b0882SAndroid Build Coastguard Worker  *      http://www.apache.org/licenses/LICENSE-2.0
9*993b0882SAndroid Build Coastguard Worker  *
10*993b0882SAndroid Build Coastguard Worker  * Unless required by applicable law or agreed to in writing, software
11*993b0882SAndroid Build Coastguard Worker  * distributed under the License is distributed on an "AS IS" BASIS,
12*993b0882SAndroid Build Coastguard Worker  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13*993b0882SAndroid Build Coastguard Worker  * See the License for the specific language governing permissions and
14*993b0882SAndroid Build Coastguard Worker  * limitations under the License.
15*993b0882SAndroid Build Coastguard Worker  */
16*993b0882SAndroid Build Coastguard Worker 
17*993b0882SAndroid Build Coastguard Worker #ifndef NLP_SAFT_COMPONENTS_LANG_ID_MOBILE_CUSTOM_TOKENIZER_H_
18*993b0882SAndroid Build Coastguard Worker #define NLP_SAFT_COMPONENTS_LANG_ID_MOBILE_CUSTOM_TOKENIZER_H_
19*993b0882SAndroid Build Coastguard Worker 
20*993b0882SAndroid Build Coastguard Worker #include <string>
21*993b0882SAndroid Build Coastguard Worker 
22*993b0882SAndroid Build Coastguard Worker #include "lang_id/common/fel/task-context.h"
23*993b0882SAndroid Build Coastguard Worker #include "lang_id/common/lite_strings/stringpiece.h"
24*993b0882SAndroid Build Coastguard Worker #include "lang_id/light-sentence.h"
25*993b0882SAndroid Build Coastguard Worker 
26*993b0882SAndroid Build Coastguard Worker namespace libtextclassifier3 {
27*993b0882SAndroid Build Coastguard Worker namespace mobile {
28*993b0882SAndroid Build Coastguard Worker namespace lang_id {
29*993b0882SAndroid Build Coastguard Worker 
30*993b0882SAndroid Build Coastguard Worker // Custom tokenizer for the LangId model: splits text into "^token$" pieces.
31*993b0882SAndroid Build Coastguard Worker class TokenizerForLangId {
32*993b0882SAndroid Build Coastguard Worker  public:
33*993b0882SAndroid Build Coastguard Worker   void Setup(TaskContext *context);  // NOTE(review): presumably configures lowercase_input_ from |context| -- confirm in the .cc
34*993b0882SAndroid Build Coastguard Worker 
35*993b0882SAndroid Build Coastguard Worker   // Splits |text| into tokens and stores them in |sentence|.  Customized for
36*993b0882SAndroid Build Coastguard Worker   // LangId: as of Sep 15, 2016, the separators are space, newline, tab, and
37*993b0882SAndroid Build Coastguard Worker   // any other 1-byte UTF-8 character that is not a letter.  Empty tokens are
38*993b0882SAndroid Build Coastguard Worker   // dropped; every remaining token gets "^" prepended (special token begin
39*993b0882SAndroid Build Coastguard Worker   // marker) and "$" appended (special token end marker).
40*993b0882SAndroid Build Coastguard Worker   // If lowercase_input_ is set, characters are lowercased during tokenization.
41*993b0882SAndroid Build Coastguard Worker   // Tokens are stored into the "repeated Token token;" field of *sentence.
42*993b0882SAndroid Build Coastguard Worker   void Tokenize(StringPiece text, LightSentence *sentence) const;
43*993b0882SAndroid Build Coastguard Worker 
44*993b0882SAndroid Build Coastguard Worker  private:
45*993b0882SAndroid Build Coastguard Worker   // When true, tokenization uses the lowercase version of each Unicode
46*993b0882SAndroid Build Coastguard Worker   // character of the input text.  E.g., with this set, the text "Foo bar" is
47*993b0882SAndroid Build Coastguard Worker   // tokenized as ["foo", "bar"]; otherwise, we get ["Foo", "bar"].
48*993b0882SAndroid Build Coastguard Worker   bool lowercase_input_ = false;  // Lowercasing is off by default.
49*993b0882SAndroid Build Coastguard Worker };
50*993b0882SAndroid Build Coastguard Worker 
51*993b0882SAndroid Build Coastguard Worker }  // namespace lang_id
52*993b0882SAndroid Build Coastguard Worker }  // namespace mobile
53*993b0882SAndroid Build Coastguard Worker }  // namespace libtextclassifier3
54*993b0882SAndroid Build Coastguard Worker 
55*993b0882SAndroid Build Coastguard Worker #endif  // NLP_SAFT_COMPONENTS_LANG_ID_MOBILE_CUSTOM_TOKENIZER_H_
56