1*993b0882SAndroid Build Coastguard Worker /* 2*993b0882SAndroid Build Coastguard Worker * Copyright (C) 2018 The Android Open Source Project 3*993b0882SAndroid Build Coastguard Worker * 4*993b0882SAndroid Build Coastguard Worker * Licensed under the Apache License, Version 2.0 (the "License"); 5*993b0882SAndroid Build Coastguard Worker * you may not use this file except in compliance with the License. 6*993b0882SAndroid Build Coastguard Worker * You may obtain a copy of the License at 7*993b0882SAndroid Build Coastguard Worker * 8*993b0882SAndroid Build Coastguard Worker * http://www.apache.org/licenses/LICENSE-2.0 9*993b0882SAndroid Build Coastguard Worker * 10*993b0882SAndroid Build Coastguard Worker * Unless required by applicable law or agreed to in writing, software 11*993b0882SAndroid Build Coastguard Worker * distributed under the License is distributed on an "AS IS" BASIS, 12*993b0882SAndroid Build Coastguard Worker * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13*993b0882SAndroid Build Coastguard Worker * See the License for the specific language governing permissions and 14*993b0882SAndroid Build Coastguard Worker * limitations under the License. 15*993b0882SAndroid Build Coastguard Worker */ 16*993b0882SAndroid Build Coastguard Worker 17*993b0882SAndroid Build Coastguard Worker #ifndef NLP_SAFT_COMPONENTS_LANG_ID_MOBILE_LANG_ID_H_ 18*993b0882SAndroid Build Coastguard Worker #define NLP_SAFT_COMPONENTS_LANG_ID_MOBILE_LANG_ID_H_ 19*993b0882SAndroid Build Coastguard Worker 20*993b0882SAndroid Build Coastguard Worker 21*993b0882SAndroid Build Coastguard Worker #include <stddef.h> 22*993b0882SAndroid Build Coastguard Worker 23*993b0882SAndroid Build Coastguard Worker #include <memory> 24*993b0882SAndroid Build Coastguard Worker #include <string> 25*993b0882SAndroid Build Coastguard Worker #include <utility> 26*993b0882SAndroid Build Coastguard Worker #include <vector> 27*993b0882SAndroid Build Coastguard Worker 28*993b0882SAndroid Build Coastguard Worker #include "lang_id/common/lite_base/macros.h" 29*993b0882SAndroid Build Coastguard Worker #include "lang_id/model-provider.h" 30*993b0882SAndroid Build Coastguard Worker 31*993b0882SAndroid Build Coastguard Worker namespace libtextclassifier3 { 32*993b0882SAndroid Build Coastguard Worker namespace mobile { 33*993b0882SAndroid Build Coastguard Worker namespace lang_id { 34*993b0882SAndroid Build Coastguard Worker 35*993b0882SAndroid Build Coastguard Worker // Forward-declaration of the class that performs all underlying work. 36*993b0882SAndroid Build Coastguard Worker class LangIdImpl; 37*993b0882SAndroid Build Coastguard Worker 38*993b0882SAndroid Build Coastguard Worker struct LangIdResult { 39*993b0882SAndroid Build Coastguard Worker // An n-best list of possible language codes for a given input sorted in 40*993b0882SAndroid Build Coastguard Worker // descending order according to each code's respective probability. 41*993b0882SAndroid Build Coastguard Worker // 42*993b0882SAndroid Build Coastguard Worker // This list is guaranteed to be non-empty after calling 43*993b0882SAndroid Build Coastguard Worker // LangId::FindLanguages. The most likely language code is always the first 44*993b0882SAndroid Build Coastguard Worker // item in this array. 45*993b0882SAndroid Build Coastguard Worker // 46*993b0882SAndroid Build Coastguard Worker // If the model cannot make a prediction, this array contains a single result: 47*993b0882SAndroid Build Coastguard Worker // a language code LangId::kUnknownLanguageCode with probability 1. 48*993b0882SAndroid Build Coastguard Worker std::vector<std::pair<std::string, float>> predictions; 49*993b0882SAndroid Build Coastguard Worker }; 50*993b0882SAndroid Build Coastguard Worker 51*993b0882SAndroid Build Coastguard Worker // Class for detecting the language of a document. 52*993b0882SAndroid Build Coastguard Worker // 53*993b0882SAndroid Build Coastguard Worker // Note: this class does not handle the details of loading the actual model. 54*993b0882SAndroid Build Coastguard Worker // Those details have been "outsourced" to the ModelProvider class. 55*993b0882SAndroid Build Coastguard Worker // 56*993b0882SAndroid Build Coastguard Worker // This class is thread safe. 57*993b0882SAndroid Build Coastguard Worker class LangId { 58*993b0882SAndroid Build Coastguard Worker public: 59*993b0882SAndroid Build Coastguard Worker // Standard BCP-47 language code for Unknown/Undetermined language. 60*993b0882SAndroid Build Coastguard Worker static const char kUnknownLanguageCode[]; 61*993b0882SAndroid Build Coastguard Worker 62*993b0882SAndroid Build Coastguard Worker // Constructs a LangId object, based on |model_provider|. 63*993b0882SAndroid Build Coastguard Worker // 64*993b0882SAndroid Build Coastguard Worker // Note: we don't crash if we detect a problem at construction time (e.g., the 65*993b0882SAndroid Build Coastguard Worker // model provider can't read an underlying file). Instead, we mark the 66*993b0882SAndroid Build Coastguard Worker // newly-constructed object as invalid; clients can invoke FindLanguage() on 67*993b0882SAndroid Build Coastguard Worker // an invalid object: nothing crashes, but accuracy will be bad. 68*993b0882SAndroid Build Coastguard Worker explicit LangId(std::unique_ptr<ModelProvider> model_provider); 69*993b0882SAndroid Build Coastguard Worker 70*993b0882SAndroid Build Coastguard Worker virtual ~LangId(); 71*993b0882SAndroid Build Coastguard Worker 72*993b0882SAndroid Build Coastguard Worker // Computes the n-best list of language codes and probabilities corresponding 73*993b0882SAndroid Build Coastguard Worker // to the most likely languages the given input text is written in. That list 74*993b0882SAndroid Build Coastguard Worker // includes the most likely |max_results| languages and is sorted in 75*993b0882SAndroid Build Coastguard Worker // descending order by language probability. 76*993b0882SAndroid Build Coastguard Worker // 77*993b0882SAndroid Build Coastguard Worker // The input text consists of the |num_bytes| bytes that starts at |data|. 78*993b0882SAndroid Build Coastguard Worker // 79*993b0882SAndroid Build Coastguard Worker // If max_results <= 0, we report probabilities for all languages known by 80*993b0882SAndroid Build Coastguard Worker // this LangId object (as always, in decreasing order of their probabilities). 81*993b0882SAndroid Build Coastguard Worker // 82*993b0882SAndroid Build Coastguard Worker // Note: If this LangId object is not valid (see is_valid()) or if this LangId 83*993b0882SAndroid Build Coastguard Worker // object can't make a prediction, this method sets the LangIdResult to 84*993b0882SAndroid Build Coastguard Worker // contain a single entry with kUnknownLanguageCode with probability 1. 85*993b0882SAndroid Build Coastguard Worker // 86*993b0882SAndroid Build Coastguard Worker void FindLanguages(const char *data, size_t num_bytes, LangIdResult *result, 87*993b0882SAndroid Build Coastguard Worker int max_results = 0) const; 88*993b0882SAndroid Build Coastguard Worker 89*993b0882SAndroid Build Coastguard Worker // Convenience version of FindLanguages(const char *, size_t, LangIdResult *). 90*993b0882SAndroid Build Coastguard Worker void FindLanguages(const std::string &text, LangIdResult *result, 91*993b0882SAndroid Build Coastguard Worker int max_results = 0) const { 92*993b0882SAndroid Build Coastguard Worker FindLanguages(text.data(), text.size(), result, max_results); 93*993b0882SAndroid Build Coastguard Worker } 94*993b0882SAndroid Build Coastguard Worker 95*993b0882SAndroid Build Coastguard Worker // Returns language code for the most likely language for a piece of text. 96*993b0882SAndroid Build Coastguard Worker // 97*993b0882SAndroid Build Coastguard Worker // The input text consists of the |num_bytes| bytes that start at |data|. 98*993b0882SAndroid Build Coastguard Worker // 99*993b0882SAndroid Build Coastguard Worker // Note: this method reports the most likely (1-best) language only if its 100*993b0882SAndroid Build Coastguard Worker // probability is high enough; otherwise, it returns 101*993b0882SAndroid Build Coastguard Worker // LangId::kUnknownLanguageCode. The specific probability threshold is tuned 102*993b0882SAndroid Build Coastguard Worker // to the needs of an early client. If you need a different threshold, you 103*993b0882SAndroid Build Coastguard Worker // can use FindLanguages (plural) to get the full LangIdResult, and apply your 104*993b0882SAndroid Build Coastguard Worker // own threshold. 105*993b0882SAndroid Build Coastguard Worker // 106*993b0882SAndroid Build Coastguard Worker // Note: if this LangId object is not valid (see is_valid()) or if this LangId 107*993b0882SAndroid Build Coastguard Worker // object can't make a prediction, then this method returns 108*993b0882SAndroid Build Coastguard Worker // LangId::kUnknownLanguageCode. 109*993b0882SAndroid Build Coastguard Worker // 110*993b0882SAndroid Build Coastguard Worker std::string FindLanguage(const char *data, size_t num_bytes) const; 111*993b0882SAndroid Build Coastguard Worker 112*993b0882SAndroid Build Coastguard Worker // Convenience version of FindLanguage(const char *, size_t). FindLanguage(const std::string & text)113*993b0882SAndroid Build Coastguard Worker std::string FindLanguage(const std::string &text) const { 114*993b0882SAndroid Build Coastguard Worker return FindLanguage(text.data(), text.size()); 115*993b0882SAndroid Build Coastguard Worker } 116*993b0882SAndroid Build Coastguard Worker 117*993b0882SAndroid Build Coastguard Worker // Returns true if this object has been correctly initialized and is ready to 118*993b0882SAndroid Build Coastguard Worker // perform predictions. For more info, see doc for LangId 119*993b0882SAndroid Build Coastguard Worker // constructor above. 120*993b0882SAndroid Build Coastguard Worker bool is_valid() const; 121*993b0882SAndroid Build Coastguard Worker 122*993b0882SAndroid Build Coastguard Worker // Returns the version of the model used by this LangId object. On success, 123*993b0882SAndroid Build Coastguard Worker // the returned version number is a strictly positive integer. Returns 0 if 124*993b0882SAndroid Build Coastguard Worker // the model version can not be determined (e.g., for old models that do not 125*993b0882SAndroid Build Coastguard Worker // specify a version number). 126*993b0882SAndroid Build Coastguard Worker int GetModelVersion() const; 127*993b0882SAndroid Build Coastguard Worker 128*993b0882SAndroid Build Coastguard Worker // Returns a typed property stored in the model file. 129*993b0882SAndroid Build Coastguard Worker float GetFloatProperty(const std::string &property, 130*993b0882SAndroid Build Coastguard Worker float default_value) const; 131*993b0882SAndroid Build Coastguard Worker 132*993b0882SAndroid Build Coastguard Worker private: 133*993b0882SAndroid Build Coastguard Worker // Pimpl ("pointer to implementation") pattern, to hide all internals from our 134*993b0882SAndroid Build Coastguard Worker // clients. 135*993b0882SAndroid Build Coastguard Worker std::unique_ptr<LangIdImpl> pimpl_; 136*993b0882SAndroid Build Coastguard Worker 137*993b0882SAndroid Build Coastguard Worker SAFTM_DISALLOW_COPY_AND_ASSIGN(LangId); 138*993b0882SAndroid Build Coastguard Worker }; 139*993b0882SAndroid Build Coastguard Worker 140*993b0882SAndroid Build Coastguard Worker } // namespace lang_id 141*993b0882SAndroid Build Coastguard Worker } // namespace mobile 142*993b0882SAndroid Build Coastguard Worker } // namespace nlp_saft 143*993b0882SAndroid Build Coastguard Worker 144*993b0882SAndroid Build Coastguard Worker #endif // NLP_SAFT_COMPONENTS_LANG_ID_MOBILE_LANG_ID_H_ 145