1*993b0882SAndroid Build Coastguard Worker /* 2*993b0882SAndroid Build Coastguard Worker * Copyright (C) 2018 The Android Open Source Project 3*993b0882SAndroid Build Coastguard Worker * 4*993b0882SAndroid Build Coastguard Worker * Licensed under the Apache License, Version 2.0 (the "License"); 5*993b0882SAndroid Build Coastguard Worker * you may not use this file except in compliance with the License. 6*993b0882SAndroid Build Coastguard Worker * You may obtain a copy of the License at 7*993b0882SAndroid Build Coastguard Worker * 8*993b0882SAndroid Build Coastguard Worker * http://www.apache.org/licenses/LICENSE-2.0 9*993b0882SAndroid Build Coastguard Worker * 10*993b0882SAndroid Build Coastguard Worker * Unless required by applicable law or agreed to in writing, software 11*993b0882SAndroid Build Coastguard Worker * distributed under the License is distributed on an "AS IS" BASIS, 12*993b0882SAndroid Build Coastguard Worker * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13*993b0882SAndroid Build Coastguard Worker * See the License for the specific language governing permissions and 14*993b0882SAndroid Build Coastguard Worker * limitations under the License. 15*993b0882SAndroid Build Coastguard Worker */ 16*993b0882SAndroid Build Coastguard Worker 17*993b0882SAndroid Build Coastguard Worker #ifndef NLP_SAFT_COMPONENTS_LANG_ID_MOBILE_CUSTOM_TOKENIZER_H_ 18*993b0882SAndroid Build Coastguard Worker #define NLP_SAFT_COMPONENTS_LANG_ID_MOBILE_CUSTOM_TOKENIZER_H_ 19*993b0882SAndroid Build Coastguard Worker 20*993b0882SAndroid Build Coastguard Worker #include <string> 21*993b0882SAndroid Build Coastguard Worker 22*993b0882SAndroid Build Coastguard Worker #include "lang_id/common/fel/task-context.h" 23*993b0882SAndroid Build Coastguard Worker #include "lang_id/common/lite_strings/stringpiece.h" 24*993b0882SAndroid Build Coastguard Worker #include "lang_id/light-sentence.h" 25*993b0882SAndroid Build Coastguard Worker 26*993b0882SAndroid Build Coastguard Worker namespace libtextclassifier3 { 27*993b0882SAndroid Build Coastguard Worker namespace mobile { 28*993b0882SAndroid Build Coastguard Worker namespace lang_id { 29*993b0882SAndroid Build Coastguard Worker 30*993b0882SAndroid Build Coastguard Worker // Custom tokenizer for the LangId model. 31*993b0882SAndroid Build Coastguard Worker class TokenizerForLangId { 32*993b0882SAndroid Build Coastguard Worker public: 33*993b0882SAndroid Build Coastguard Worker void Setup(TaskContext *context); 34*993b0882SAndroid Build Coastguard Worker 35*993b0882SAndroid Build Coastguard Worker // Tokenizes |text|, placing the tokens into |sentence|. Customized for 36*993b0882SAndroid Build Coastguard Worker // LangId. Currently (Sep 15, 2016) we tokenize on space, newline, tab, and 37*993b0882SAndroid Build Coastguard Worker // any other 1-byte UTF8 character which is not a letter, ignore all empty 38*993b0882SAndroid Build Coastguard Worker // tokens, and (for each of the remaining tokens) prepend "^" (special token 39*993b0882SAndroid Build Coastguard Worker // begin marker) and append "$" (special token end marker). 40*993b0882SAndroid Build Coastguard Worker // 41*993b0882SAndroid Build Coastguard Worker // Tokens are stored into the "repeated Token token;" field of *sentence. 42*993b0882SAndroid Build Coastguard Worker void Tokenize(StringPiece text, LightSentence *sentence) const; 43*993b0882SAndroid Build Coastguard Worker 44*993b0882SAndroid Build Coastguard Worker private: 45*993b0882SAndroid Build Coastguard Worker // If true, during tokenization, we use the lowercase version of each Unicode 46*993b0882SAndroid Build Coastguard Worker // character from the text to tokenize. E.g., if this is true, the text "Foo 47*993b0882SAndroid Build Coastguard Worker // bar" is tokenized as ["foo", "bar"]; otherwise, we get ["Foo", "bar"]. 48*993b0882SAndroid Build Coastguard Worker bool lowercase_input_ = false; 49*993b0882SAndroid Build Coastguard Worker }; 50*993b0882SAndroid Build Coastguard Worker 51*993b0882SAndroid Build Coastguard Worker } // namespace lang_id 52*993b0882SAndroid Build Coastguard Worker } // namespace mobile 53*993b0882SAndroid Build Coastguard Worker } // namespace nlp_saft 54*993b0882SAndroid Build Coastguard Worker 55*993b0882SAndroid Build Coastguard Worker #endif // NLP_SAFT_COMPONENTS_LANG_ID_MOBILE_CUSTOM_TOKENIZER_H_ 56