1*993b0882SAndroid Build Coastguard Worker /* 2*993b0882SAndroid Build Coastguard Worker * Copyright (C) 2018 The Android Open Source Project 3*993b0882SAndroid Build Coastguard Worker * 4*993b0882SAndroid Build Coastguard Worker * Licensed under the Apache License, Version 2.0 (the "License"); 5*993b0882SAndroid Build Coastguard Worker * you may not use this file except in compliance with the License. 6*993b0882SAndroid Build Coastguard Worker * You may obtain a copy of the License at 7*993b0882SAndroid Build Coastguard Worker * 8*993b0882SAndroid Build Coastguard Worker * http://www.apache.org/licenses/LICENSE-2.0 9*993b0882SAndroid Build Coastguard Worker * 10*993b0882SAndroid Build Coastguard Worker * Unless required by applicable law or agreed to in writing, software 11*993b0882SAndroid Build Coastguard Worker * distributed under the License is distributed on an "AS IS" BASIS, 12*993b0882SAndroid Build Coastguard Worker * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13*993b0882SAndroid Build Coastguard Worker * See the License for the specific language governing permissions and 14*993b0882SAndroid Build Coastguard Worker * limitations under the License. 15*993b0882SAndroid Build Coastguard Worker */ 16*993b0882SAndroid Build Coastguard Worker 17*993b0882SAndroid Build Coastguard Worker #ifndef LIBTEXTCLASSIFIER_ANNOTATOR_TRANSLATE_TRANSLATE_H_ 18*993b0882SAndroid Build Coastguard Worker #define LIBTEXTCLASSIFIER_ANNOTATOR_TRANSLATE_TRANSLATE_H_ 19*993b0882SAndroid Build Coastguard Worker 20*993b0882SAndroid Build Coastguard Worker #include "annotator/model_generated.h" 21*993b0882SAndroid Build Coastguard Worker #include "annotator/types.h" 22*993b0882SAndroid Build Coastguard Worker #include "utils/utf8/unicodetext.h" 23*993b0882SAndroid Build Coastguard Worker #include "utils/utf8/unilib.h" 24*993b0882SAndroid Build Coastguard Worker #include "lang_id/lang-id.h" 25*993b0882SAndroid Build Coastguard Worker 26*993b0882SAndroid Build Coastguard Worker namespace libtextclassifier3 { 27*993b0882SAndroid Build Coastguard Worker 28*993b0882SAndroid Build Coastguard Worker // Returns classification with "translate" when the input text is in a language 29*993b0882SAndroid Build Coastguard Worker // not understood by the user. 30*993b0882SAndroid Build Coastguard Worker class TranslateAnnotator { 31*993b0882SAndroid Build Coastguard Worker public: TranslateAnnotator(const TranslateAnnotatorOptions * options,const libtextclassifier3::mobile::lang_id::LangId * langid_model,const UniLib * unilib)32*993b0882SAndroid Build Coastguard Worker TranslateAnnotator(const TranslateAnnotatorOptions* options, 33*993b0882SAndroid Build Coastguard Worker const libtextclassifier3::mobile::lang_id::LangId* langid_model, 34*993b0882SAndroid Build Coastguard Worker const UniLib* unilib) 35*993b0882SAndroid Build Coastguard Worker : options_(options), langid_model_(langid_model), unilib_(unilib) {} 36*993b0882SAndroid Build Coastguard Worker 37*993b0882SAndroid Build Coastguard Worker // Returns true if a classification_result was filled with "translate" 38*993b0882SAndroid Build Coastguard Worker // classification. 39*993b0882SAndroid Build Coastguard Worker bool ClassifyText(const UnicodeText& context, CodepointSpan selection_indices, 40*993b0882SAndroid Build Coastguard Worker const std::string& user_familiar_language_tags, 41*993b0882SAndroid Build Coastguard Worker ClassificationResult* classification_result) const; 42*993b0882SAndroid Build Coastguard Worker 43*993b0882SAndroid Build Coastguard Worker protected: 44*993b0882SAndroid Build Coastguard Worker struct LanguageConfidence { 45*993b0882SAndroid Build Coastguard Worker std::string language; 46*993b0882SAndroid Build Coastguard Worker float confidence = -1.0; 47*993b0882SAndroid Build Coastguard Worker }; 48*993b0882SAndroid Build Coastguard Worker 49*993b0882SAndroid Build Coastguard Worker // Detects language of the selection in given context using the "Backoff 50*993b0882SAndroid Build Coastguard Worker // algorithm", sorted by the score descendingly. It is based on several 51*993b0882SAndroid Build Coastguard Worker // heuristics, see the code. This is the same algorithm that TextClassifier 52*993b0882SAndroid Build Coastguard Worker // uses in Android Q. 53*993b0882SAndroid Build Coastguard Worker std::vector<LanguageConfidence> BackoffDetectLanguages( 54*993b0882SAndroid Build Coastguard Worker const UnicodeText& context, CodepointSpan selection_indices) const; 55*993b0882SAndroid Build Coastguard Worker 56*993b0882SAndroid Build Coastguard Worker // Returns the iterator of the next whitespace/punctuation character in given 57*993b0882SAndroid Build Coastguard Worker // text, starting from given position and going forward (iff direction == 1), 58*993b0882SAndroid Build Coastguard Worker // and backward (iff direction == -1). 59*993b0882SAndroid Build Coastguard Worker UnicodeText::const_iterator FindIndexOfNextWhitespaceOrPunctuation( 60*993b0882SAndroid Build Coastguard Worker const UnicodeText& text, int start_index, int direction) const; 61*993b0882SAndroid Build Coastguard Worker 62*993b0882SAndroid Build Coastguard Worker // Returns substring from given text, centered around the specified indices, 63*993b0882SAndroid Build Coastguard Worker // of certain minimum length. The substring is token aligned, so it is 64*993b0882SAndroid Build Coastguard Worker // guaranteed that the words won't be broken down. 65*993b0882SAndroid Build Coastguard Worker UnicodeText TokenAlignedSubstringAroundSpan(const UnicodeText& text, 66*993b0882SAndroid Build Coastguard Worker CodepointSpan indices, 67*993b0882SAndroid Build Coastguard Worker int minimum_length) const; 68*993b0882SAndroid Build Coastguard Worker 69*993b0882SAndroid Build Coastguard Worker private: 70*993b0882SAndroid Build Coastguard Worker std::string CreateSerializedEntityData( 71*993b0882SAndroid Build Coastguard Worker const std::vector<TranslateAnnotator::LanguageConfidence>& confidences) 72*993b0882SAndroid Build Coastguard Worker const; 73*993b0882SAndroid Build Coastguard Worker 74*993b0882SAndroid Build Coastguard Worker const TranslateAnnotatorOptions* options_; 75*993b0882SAndroid Build Coastguard Worker const libtextclassifier3::mobile::lang_id::LangId* langid_model_; 76*993b0882SAndroid Build Coastguard Worker const UniLib* unilib_; 77*993b0882SAndroid Build Coastguard Worker }; 78*993b0882SAndroid Build Coastguard Worker 79*993b0882SAndroid Build Coastguard Worker } // namespace libtextclassifier3 80*993b0882SAndroid Build Coastguard Worker 81*993b0882SAndroid Build Coastguard Worker #endif // LIBTEXTCLASSIFIER_ANNOTATOR_TRANSLATE_TRANSLATE_H_ 82