xref: /aosp_15_r20/external/libtextclassifier/native/lang_id/lang-id.h (revision 993b0882672172b81d12fad7a7ac0c3e5c824a12)
1*993b0882SAndroid Build Coastguard Worker /*
2*993b0882SAndroid Build Coastguard Worker  * Copyright (C) 2018 The Android Open Source Project
3*993b0882SAndroid Build Coastguard Worker  *
4*993b0882SAndroid Build Coastguard Worker  * Licensed under the Apache License, Version 2.0 (the "License");
5*993b0882SAndroid Build Coastguard Worker  * you may not use this file except in compliance with the License.
6*993b0882SAndroid Build Coastguard Worker  * You may obtain a copy of the License at
7*993b0882SAndroid Build Coastguard Worker  *
8*993b0882SAndroid Build Coastguard Worker  *      http://www.apache.org/licenses/LICENSE-2.0
9*993b0882SAndroid Build Coastguard Worker  *
10*993b0882SAndroid Build Coastguard Worker  * Unless required by applicable law or agreed to in writing, software
11*993b0882SAndroid Build Coastguard Worker  * distributed under the License is distributed on an "AS IS" BASIS,
12*993b0882SAndroid Build Coastguard Worker  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13*993b0882SAndroid Build Coastguard Worker  * See the License for the specific language governing permissions and
14*993b0882SAndroid Build Coastguard Worker  * limitations under the License.
15*993b0882SAndroid Build Coastguard Worker  */
16*993b0882SAndroid Build Coastguard Worker 
17*993b0882SAndroid Build Coastguard Worker #ifndef NLP_SAFT_COMPONENTS_LANG_ID_MOBILE_LANG_ID_H_
18*993b0882SAndroid Build Coastguard Worker #define NLP_SAFT_COMPONENTS_LANG_ID_MOBILE_LANG_ID_H_
19*993b0882SAndroid Build Coastguard Worker 
20*993b0882SAndroid Build Coastguard Worker 
21*993b0882SAndroid Build Coastguard Worker #include <stddef.h>
22*993b0882SAndroid Build Coastguard Worker 
23*993b0882SAndroid Build Coastguard Worker #include <memory>
24*993b0882SAndroid Build Coastguard Worker #include <string>
25*993b0882SAndroid Build Coastguard Worker #include <utility>
26*993b0882SAndroid Build Coastguard Worker #include <vector>
27*993b0882SAndroid Build Coastguard Worker 
28*993b0882SAndroid Build Coastguard Worker #include "lang_id/common/lite_base/macros.h"
29*993b0882SAndroid Build Coastguard Worker #include "lang_id/model-provider.h"
30*993b0882SAndroid Build Coastguard Worker 
31*993b0882SAndroid Build Coastguard Worker namespace libtextclassifier3 {
32*993b0882SAndroid Build Coastguard Worker namespace mobile {
33*993b0882SAndroid Build Coastguard Worker namespace lang_id {
34*993b0882SAndroid Build Coastguard Worker 
// Forward declaration of the class that performs all the underlying work.
// Only a pointer to it is needed in this header, so the full definition is
// not required here.
class LangIdImpl;
37*993b0882SAndroid Build Coastguard Worker 
// Outcome of a language-identification request for one input text.
struct LangIdResult {
  // An n-best list of (language code, probability) pairs for a given input,
  // sorted in descending order of probability.
  //
  // This list is guaranteed to be non-empty after calling
  // LangId::FindLanguages.  The most likely language code is always the first
  // item in this list.
  //
  // If the model cannot make a prediction, this list contains a single result:
  // the language code LangId::kUnknownLanguageCode with probability 1.
  std::vector<std::pair<std::string, float>> predictions;
};
50*993b0882SAndroid Build Coastguard Worker 
51*993b0882SAndroid Build Coastguard Worker // Class for detecting the language of a document.
52*993b0882SAndroid Build Coastguard Worker //
53*993b0882SAndroid Build Coastguard Worker // Note: this class does not handle the details of loading the actual model.
54*993b0882SAndroid Build Coastguard Worker // Those details have been "outsourced" to the ModelProvider class.
55*993b0882SAndroid Build Coastguard Worker //
56*993b0882SAndroid Build Coastguard Worker // This class is thread safe.
57*993b0882SAndroid Build Coastguard Worker class LangId {
58*993b0882SAndroid Build Coastguard Worker  public:
59*993b0882SAndroid Build Coastguard Worker   // Standard BCP-47 language code for Unknown/Undetermined language.
60*993b0882SAndroid Build Coastguard Worker   static const char kUnknownLanguageCode[];
61*993b0882SAndroid Build Coastguard Worker 
62*993b0882SAndroid Build Coastguard Worker   // Constructs a LangId object, based on |model_provider|.
63*993b0882SAndroid Build Coastguard Worker   //
64*993b0882SAndroid Build Coastguard Worker   // Note: we don't crash if we detect a problem at construction time (e.g., the
65*993b0882SAndroid Build Coastguard Worker   // model provider can't read an underlying file).  Instead, we mark the
66*993b0882SAndroid Build Coastguard Worker   // newly-constructed object as invalid; clients can invoke FindLanguage() on
67*993b0882SAndroid Build Coastguard Worker   // an invalid object: nothing crashes, but accuracy will be bad.
68*993b0882SAndroid Build Coastguard Worker   explicit LangId(std::unique_ptr<ModelProvider> model_provider);
69*993b0882SAndroid Build Coastguard Worker 
70*993b0882SAndroid Build Coastguard Worker   virtual ~LangId();
71*993b0882SAndroid Build Coastguard Worker 
72*993b0882SAndroid Build Coastguard Worker   // Computes the n-best list of language codes and probabilities corresponding
73*993b0882SAndroid Build Coastguard Worker   // to the most likely languages the given input text is written in.  That list
74*993b0882SAndroid Build Coastguard Worker   // includes the most likely |max_results| languages and is sorted in
75*993b0882SAndroid Build Coastguard Worker   // descending order by language probability.
76*993b0882SAndroid Build Coastguard Worker   //
77*993b0882SAndroid Build Coastguard Worker   // The input text consists of the |num_bytes| bytes that starts at |data|.
78*993b0882SAndroid Build Coastguard Worker   //
79*993b0882SAndroid Build Coastguard Worker   // If max_results <= 0, we report probabilities for all languages known by
80*993b0882SAndroid Build Coastguard Worker   // this LangId object (as always, in decreasing order of their probabilities).
81*993b0882SAndroid Build Coastguard Worker   //
82*993b0882SAndroid Build Coastguard Worker   // Note: If this LangId object is not valid (see is_valid()) or if this LangId
83*993b0882SAndroid Build Coastguard Worker   // object can't make a prediction, this method sets the LangIdResult to
84*993b0882SAndroid Build Coastguard Worker   // contain a single entry with kUnknownLanguageCode with probability 1.
85*993b0882SAndroid Build Coastguard Worker   //
86*993b0882SAndroid Build Coastguard Worker   void FindLanguages(const char *data, size_t num_bytes, LangIdResult *result,
87*993b0882SAndroid Build Coastguard Worker                      int max_results = 0) const;
88*993b0882SAndroid Build Coastguard Worker 
89*993b0882SAndroid Build Coastguard Worker   // Convenience version of FindLanguages(const char *, size_t, LangIdResult *).
90*993b0882SAndroid Build Coastguard Worker   void FindLanguages(const std::string &text, LangIdResult *result,
91*993b0882SAndroid Build Coastguard Worker                      int max_results = 0) const {
92*993b0882SAndroid Build Coastguard Worker     FindLanguages(text.data(), text.size(), result, max_results);
93*993b0882SAndroid Build Coastguard Worker   }
94*993b0882SAndroid Build Coastguard Worker 
95*993b0882SAndroid Build Coastguard Worker   // Returns language code for the most likely language for a piece of text.
96*993b0882SAndroid Build Coastguard Worker   //
97*993b0882SAndroid Build Coastguard Worker   // The input text consists of the |num_bytes| bytes that start at |data|.
98*993b0882SAndroid Build Coastguard Worker   //
99*993b0882SAndroid Build Coastguard Worker   // Note: this method reports the most likely (1-best) language only if its
100*993b0882SAndroid Build Coastguard Worker   // probability is high enough; otherwise, it returns
101*993b0882SAndroid Build Coastguard Worker   // LangId::kUnknownLanguageCode.  The specific probability threshold is tuned
102*993b0882SAndroid Build Coastguard Worker   // to the needs of an early client.  If you need a different threshold, you
103*993b0882SAndroid Build Coastguard Worker   // can use FindLanguages (plural) to get the full LangIdResult, and apply your
104*993b0882SAndroid Build Coastguard Worker   // own threshold.
105*993b0882SAndroid Build Coastguard Worker   //
106*993b0882SAndroid Build Coastguard Worker   // Note: if this LangId object is not valid (see is_valid()) or if this LangId
107*993b0882SAndroid Build Coastguard Worker   // object can't make a prediction, then this method returns
108*993b0882SAndroid Build Coastguard Worker   // LangId::kUnknownLanguageCode.
109*993b0882SAndroid Build Coastguard Worker   //
110*993b0882SAndroid Build Coastguard Worker   std::string FindLanguage(const char *data, size_t num_bytes) const;
111*993b0882SAndroid Build Coastguard Worker 
112*993b0882SAndroid Build Coastguard Worker   // Convenience version of FindLanguage(const char *, size_t).
FindLanguage(const std::string & text)113*993b0882SAndroid Build Coastguard Worker   std::string FindLanguage(const std::string &text) const {
114*993b0882SAndroid Build Coastguard Worker     return FindLanguage(text.data(), text.size());
115*993b0882SAndroid Build Coastguard Worker   }
116*993b0882SAndroid Build Coastguard Worker 
117*993b0882SAndroid Build Coastguard Worker   // Returns true if this object has been correctly initialized and is ready to
118*993b0882SAndroid Build Coastguard Worker   // perform predictions.  For more info, see doc for LangId
119*993b0882SAndroid Build Coastguard Worker   // constructor above.
120*993b0882SAndroid Build Coastguard Worker   bool is_valid() const;
121*993b0882SAndroid Build Coastguard Worker 
122*993b0882SAndroid Build Coastguard Worker   // Returns the version of the model used by this LangId object.  On success,
123*993b0882SAndroid Build Coastguard Worker   // the returned version number is a strictly positive integer.  Returns 0 if
124*993b0882SAndroid Build Coastguard Worker   // the model version can not be determined (e.g., for old models that do not
125*993b0882SAndroid Build Coastguard Worker   // specify a version number).
126*993b0882SAndroid Build Coastguard Worker   int GetModelVersion() const;
127*993b0882SAndroid Build Coastguard Worker 
128*993b0882SAndroid Build Coastguard Worker   // Returns a typed property stored in the model file.
129*993b0882SAndroid Build Coastguard Worker   float GetFloatProperty(const std::string &property,
130*993b0882SAndroid Build Coastguard Worker                          float default_value) const;
131*993b0882SAndroid Build Coastguard Worker 
132*993b0882SAndroid Build Coastguard Worker  private:
133*993b0882SAndroid Build Coastguard Worker   // Pimpl ("pointer to implementation") pattern, to hide all internals from our
134*993b0882SAndroid Build Coastguard Worker   // clients.
135*993b0882SAndroid Build Coastguard Worker   std::unique_ptr<LangIdImpl> pimpl_;
136*993b0882SAndroid Build Coastguard Worker 
137*993b0882SAndroid Build Coastguard Worker   SAFTM_DISALLOW_COPY_AND_ASSIGN(LangId);
138*993b0882SAndroid Build Coastguard Worker };
139*993b0882SAndroid Build Coastguard Worker 
140*993b0882SAndroid Build Coastguard Worker }  // namespace lang_id
141*993b0882SAndroid Build Coastguard Worker }  // namespace mobile
}  // namespace libtextclassifier3
143*993b0882SAndroid Build Coastguard Worker 
144*993b0882SAndroid Build Coastguard Worker #endif  // NLP_SAFT_COMPONENTS_LANG_ID_MOBILE_LANG_ID_H_
145