xref: /aosp_15_r20/external/libtextclassifier/native/lang_id/lang-id-wrapper.cc (revision 993b0882672172b81d12fad7a7ac0c3e5c824a12)
1*993b0882SAndroid Build Coastguard Worker /*
2*993b0882SAndroid Build Coastguard Worker  * Copyright (C) 2018 The Android Open Source Project
3*993b0882SAndroid Build Coastguard Worker  *
4*993b0882SAndroid Build Coastguard Worker  * Licensed under the Apache License, Version 2.0 (the "License");
5*993b0882SAndroid Build Coastguard Worker  * you may not use this file except in compliance with the License.
6*993b0882SAndroid Build Coastguard Worker  * You may obtain a copy of the License at
7*993b0882SAndroid Build Coastguard Worker  *
8*993b0882SAndroid Build Coastguard Worker  *      http://www.apache.org/licenses/LICENSE-2.0
9*993b0882SAndroid Build Coastguard Worker  *
10*993b0882SAndroid Build Coastguard Worker  * Unless required by applicable law or agreed to in writing, software
11*993b0882SAndroid Build Coastguard Worker  * distributed under the License is distributed on an "AS IS" BASIS,
12*993b0882SAndroid Build Coastguard Worker  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13*993b0882SAndroid Build Coastguard Worker  * See the License for the specific language governing permissions and
14*993b0882SAndroid Build Coastguard Worker  * limitations under the License.
15*993b0882SAndroid Build Coastguard Worker  */
16*993b0882SAndroid Build Coastguard Worker 
17*993b0882SAndroid Build Coastguard Worker #include "lang_id/lang-id-wrapper.h"
18*993b0882SAndroid Build Coastguard Worker 
19*993b0882SAndroid Build Coastguard Worker #include <fcntl.h>
20*993b0882SAndroid Build Coastguard Worker 
21*993b0882SAndroid Build Coastguard Worker #include "lang_id/fb_model/lang-id-from-fb.h"
22*993b0882SAndroid Build Coastguard Worker #include "lang_id/lang-id.h"
23*993b0882SAndroid Build Coastguard Worker 
24*993b0882SAndroid Build Coastguard Worker namespace libtextclassifier3 {
25*993b0882SAndroid Build Coastguard Worker 
26*993b0882SAndroid Build Coastguard Worker namespace langid {
27*993b0882SAndroid Build Coastguard Worker 
LoadFromPath(const std::string & langid_model_path)28*993b0882SAndroid Build Coastguard Worker std::unique_ptr<libtextclassifier3::mobile::lang_id::LangId> LoadFromPath(
29*993b0882SAndroid Build Coastguard Worker     const std::string& langid_model_path) {
30*993b0882SAndroid Build Coastguard Worker   std::unique_ptr<libtextclassifier3::mobile::lang_id::LangId> langid_model =
31*993b0882SAndroid Build Coastguard Worker       libtextclassifier3::mobile::lang_id::GetLangIdFromFlatbufferFile(langid_model_path);
32*993b0882SAndroid Build Coastguard Worker   return langid_model;
33*993b0882SAndroid Build Coastguard Worker }
34*993b0882SAndroid Build Coastguard Worker 
LoadFromDescriptor(const int langid_fd)35*993b0882SAndroid Build Coastguard Worker std::unique_ptr<libtextclassifier3::mobile::lang_id::LangId> LoadFromDescriptor(
36*993b0882SAndroid Build Coastguard Worker     const int langid_fd) {
37*993b0882SAndroid Build Coastguard Worker   std::unique_ptr<libtextclassifier3::mobile::lang_id::LangId> langid_model =
38*993b0882SAndroid Build Coastguard Worker       libtextclassifier3::mobile::lang_id::GetLangIdFromFlatbufferFileDescriptor(
39*993b0882SAndroid Build Coastguard Worker           langid_fd);
40*993b0882SAndroid Build Coastguard Worker   return langid_model;
41*993b0882SAndroid Build Coastguard Worker }
42*993b0882SAndroid Build Coastguard Worker 
LoadFromUnownedBuffer(const char * buffer,int size)43*993b0882SAndroid Build Coastguard Worker std::unique_ptr<libtextclassifier3::mobile::lang_id::LangId> LoadFromUnownedBuffer(
44*993b0882SAndroid Build Coastguard Worker     const char* buffer, int size) {
45*993b0882SAndroid Build Coastguard Worker   std::unique_ptr<libtextclassifier3::mobile::lang_id::LangId> langid_model =
46*993b0882SAndroid Build Coastguard Worker       libtextclassifier3::mobile::lang_id::GetLangIdFromFlatbufferBytes(buffer, size);
47*993b0882SAndroid Build Coastguard Worker   return langid_model;
48*993b0882SAndroid Build Coastguard Worker }
49*993b0882SAndroid Build Coastguard Worker 
GetPredictions(const libtextclassifier3::mobile::lang_id::LangId * model,const std::string & text)50*993b0882SAndroid Build Coastguard Worker std::vector<std::pair<std::string, float>> GetPredictions(
51*993b0882SAndroid Build Coastguard Worker     const libtextclassifier3::mobile::lang_id::LangId* model, const std::string& text) {
52*993b0882SAndroid Build Coastguard Worker   return GetPredictions(model, text.data(), text.size());
53*993b0882SAndroid Build Coastguard Worker }
54*993b0882SAndroid Build Coastguard Worker 
GetPredictions(const libtextclassifier3::mobile::lang_id::LangId * model,const char * text,int text_size)55*993b0882SAndroid Build Coastguard Worker std::vector<std::pair<std::string, float>> GetPredictions(
56*993b0882SAndroid Build Coastguard Worker     const libtextclassifier3::mobile::lang_id::LangId* model, const char* text,
57*993b0882SAndroid Build Coastguard Worker     int text_size) {
58*993b0882SAndroid Build Coastguard Worker   std::vector<std::pair<std::string, float>> prediction_results;
59*993b0882SAndroid Build Coastguard Worker   if (model == nullptr) {
60*993b0882SAndroid Build Coastguard Worker     return prediction_results;
61*993b0882SAndroid Build Coastguard Worker   }
62*993b0882SAndroid Build Coastguard Worker 
63*993b0882SAndroid Build Coastguard Worker   const float noise_threshold =
64*993b0882SAndroid Build Coastguard Worker       model->GetFloatProperty("text_classifier_langid_noise_threshold", -1.0f);
65*993b0882SAndroid Build Coastguard Worker 
66*993b0882SAndroid Build Coastguard Worker   // Speed up the things by specifying the max results we want. For example, if
67*993b0882SAndroid Build Coastguard Worker   // the noise threshold is 0.1, we don't need more than 10 results.
68*993b0882SAndroid Build Coastguard Worker   const int max_results =
69*993b0882SAndroid Build Coastguard Worker       noise_threshold < 0.01
70*993b0882SAndroid Build Coastguard Worker           ? -1  // -1 means FindLanguages returns all predictions
71*993b0882SAndroid Build Coastguard Worker           : static_cast<int>(1 / noise_threshold) + 1;
72*993b0882SAndroid Build Coastguard Worker 
73*993b0882SAndroid Build Coastguard Worker   libtextclassifier3::mobile::lang_id::LangIdResult langid_result;
74*993b0882SAndroid Build Coastguard Worker   model->FindLanguages(text, text_size, &langid_result, max_results);
75*993b0882SAndroid Build Coastguard Worker   for (int i = 0; i < langid_result.predictions.size(); i++) {
76*993b0882SAndroid Build Coastguard Worker     const auto& prediction = langid_result.predictions[i];
77*993b0882SAndroid Build Coastguard Worker     if (prediction.second >= noise_threshold && prediction.first != "und") {
78*993b0882SAndroid Build Coastguard Worker       prediction_results.push_back({prediction.first, prediction.second});
79*993b0882SAndroid Build Coastguard Worker     }
80*993b0882SAndroid Build Coastguard Worker   }
81*993b0882SAndroid Build Coastguard Worker   return prediction_results;
82*993b0882SAndroid Build Coastguard Worker }
83*993b0882SAndroid Build Coastguard Worker 
GetLanguageTags(const libtextclassifier3::mobile::lang_id::LangId * model,const std::string & text)84*993b0882SAndroid Build Coastguard Worker std::string GetLanguageTags(const libtextclassifier3::mobile::lang_id::LangId* model,
85*993b0882SAndroid Build Coastguard Worker                             const std::string& text) {
86*993b0882SAndroid Build Coastguard Worker   const std::vector<std::pair<std::string, float>>& predictions =
87*993b0882SAndroid Build Coastguard Worker       GetPredictions(model, text);
88*993b0882SAndroid Build Coastguard Worker   const float threshold =
89*993b0882SAndroid Build Coastguard Worker       model->GetFloatProperty("text_classifier_langid_threshold", -1.0f);
90*993b0882SAndroid Build Coastguard Worker   std::string detected_language_tags = "";
91*993b0882SAndroid Build Coastguard Worker   bool first_accepted_language = true;
92*993b0882SAndroid Build Coastguard Worker   for (int i = 0; i < predictions.size(); i++) {
93*993b0882SAndroid Build Coastguard Worker     const auto& prediction = predictions[i];
94*993b0882SAndroid Build Coastguard Worker     if (threshold >= 0.f && prediction.second < threshold) {
95*993b0882SAndroid Build Coastguard Worker       continue;
96*993b0882SAndroid Build Coastguard Worker     }
97*993b0882SAndroid Build Coastguard Worker     if (first_accepted_language) {
98*993b0882SAndroid Build Coastguard Worker       first_accepted_language = false;
99*993b0882SAndroid Build Coastguard Worker     } else {
100*993b0882SAndroid Build Coastguard Worker       detected_language_tags += ",";
101*993b0882SAndroid Build Coastguard Worker     }
102*993b0882SAndroid Build Coastguard Worker     detected_language_tags += prediction.first;
103*993b0882SAndroid Build Coastguard Worker   }
104*993b0882SAndroid Build Coastguard Worker   return detected_language_tags;
105*993b0882SAndroid Build Coastguard Worker }
106*993b0882SAndroid Build Coastguard Worker 
107*993b0882SAndroid Build Coastguard Worker }  // namespace langid
108*993b0882SAndroid Build Coastguard Worker 
109*993b0882SAndroid Build Coastguard Worker }  // namespace libtextclassifier3
110