1*993b0882SAndroid Build Coastguard Worker /* 2*993b0882SAndroid Build Coastguard Worker * Copyright (C) 2018 The Android Open Source Project 3*993b0882SAndroid Build Coastguard Worker * 4*993b0882SAndroid Build Coastguard Worker * Licensed under the Apache License, Version 2.0 (the "License"); 5*993b0882SAndroid Build Coastguard Worker * you may not use this file except in compliance with the License. 6*993b0882SAndroid Build Coastguard Worker * You may obtain a copy of the License at 7*993b0882SAndroid Build Coastguard Worker * 8*993b0882SAndroid Build Coastguard Worker * http://www.apache.org/licenses/LICENSE-2.0 9*993b0882SAndroid Build Coastguard Worker * 10*993b0882SAndroid Build Coastguard Worker * Unless required by applicable law or agreed to in writing, software 11*993b0882SAndroid Build Coastguard Worker * distributed under the License is distributed on an "AS IS" BASIS, 12*993b0882SAndroid Build Coastguard Worker * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13*993b0882SAndroid Build Coastguard Worker * See the License for the specific language governing permissions and 14*993b0882SAndroid Build Coastguard Worker * limitations under the License. 15*993b0882SAndroid Build Coastguard Worker */ 16*993b0882SAndroid Build Coastguard Worker 17*993b0882SAndroid Build Coastguard Worker #ifndef NLP_SAFT_COMPONENTS_LANG_ID_MOBILE_FEATURES_CHAR_NGRAM_FEATURE_H_ 18*993b0882SAndroid Build Coastguard Worker #define NLP_SAFT_COMPONENTS_LANG_ID_MOBILE_FEATURES_CHAR_NGRAM_FEATURE_H_ 19*993b0882SAndroid Build Coastguard Worker 20*993b0882SAndroid Build Coastguard Worker #include <mutex> // NOLINT: see comments for state_mutex_ 21*993b0882SAndroid Build Coastguard Worker #include <string> 22*993b0882SAndroid Build Coastguard Worker #include <vector> 23*993b0882SAndroid Build Coastguard Worker 24*993b0882SAndroid Build Coastguard Worker #include "lang_id/common/fel/feature-extractor.h" 25*993b0882SAndroid Build Coastguard Worker #include "lang_id/common/fel/task-context.h" 26*993b0882SAndroid Build Coastguard Worker #include "lang_id/common/fel/workspace.h" 27*993b0882SAndroid Build Coastguard Worker #include "lang_id/features/light-sentence-features.h" 28*993b0882SAndroid Build Coastguard Worker #include "lang_id/light-sentence.h" 29*993b0882SAndroid Build Coastguard Worker 30*993b0882SAndroid Build Coastguard Worker // TODO(abakalov): Add a test. 31*993b0882SAndroid Build Coastguard Worker namespace libtextclassifier3 { 32*993b0882SAndroid Build Coastguard Worker namespace mobile { 33*993b0882SAndroid Build Coastguard Worker namespace lang_id { 34*993b0882SAndroid Build Coastguard Worker 35*993b0882SAndroid Build Coastguard Worker // Class for computing continuous char ngram features. 36*993b0882SAndroid Build Coastguard Worker // 37*993b0882SAndroid Build Coastguard Worker // Feature function descriptor parameters: 38*993b0882SAndroid Build Coastguard Worker // include_terminators(bool, false): 39*993b0882SAndroid Build Coastguard Worker // If 'true', then splits the text based on spaces to get tokens, adds "^" 40*993b0882SAndroid Build Coastguard Worker // to the beginning of each token, and adds "$" to the end of each token. 41*993b0882SAndroid Build Coastguard Worker // NOTE: currently, we support only include_terminators=true. 42*993b0882SAndroid Build Coastguard Worker // include_spaces(bool, false): 43*993b0882SAndroid Build Coastguard Worker // If 'true', then includes char ngrams containing spaces. 44*993b0882SAndroid Build Coastguard Worker // NOTE: currently, we support only include_spaces=false. 45*993b0882SAndroid Build Coastguard Worker // use_equal_weight(bool, false): 46*993b0882SAndroid Build Coastguard Worker // If 'true', then weighs each unique ngram by 1.0 / (number of unique 47*993b0882SAndroid Build Coastguard Worker // ngrams in the input). Otherwise, weighs each unique ngram by (ngram 48*993b0882SAndroid Build Coastguard Worker // count) / (total number of ngrams). 49*993b0882SAndroid Build Coastguard Worker // NOTE: currently, we support only use_equal_weight=false. 50*993b0882SAndroid Build Coastguard Worker // id_dim(int, 10000): 51*993b0882SAndroid Build Coastguard Worker // The integer id of each char ngram is computed as follows: 52*993b0882SAndroid Build Coastguard Worker // Hash32WithDefault(char ngram) % id_dim. 53*993b0882SAndroid Build Coastguard Worker // size(int, 3): 54*993b0882SAndroid Build Coastguard Worker // Only ngrams of this size will be extracted. 55*993b0882SAndroid Build Coastguard Worker // 56*993b0882SAndroid Build Coastguard Worker // NOTE: this class is not thread-safe. TODO(salcianu): make it thread-safe. 57*993b0882SAndroid Build Coastguard Worker class ContinuousBagOfNgramsFunction : public LightSentenceFeature { 58*993b0882SAndroid Build Coastguard Worker public: 59*993b0882SAndroid Build Coastguard Worker bool Setup(TaskContext *context) override; 60*993b0882SAndroid Build Coastguard Worker bool Init(TaskContext *context) override; 61*993b0882SAndroid Build Coastguard Worker 62*993b0882SAndroid Build Coastguard Worker // Appends the features computed from the sentence to the feature vector. 63*993b0882SAndroid Build Coastguard Worker void Evaluate(const WorkspaceSet &workspaces, const LightSentence &sentence, 64*993b0882SAndroid Build Coastguard Worker FeatureVector *result) const override; 65*993b0882SAndroid Build Coastguard Worker 66*993b0882SAndroid Build Coastguard Worker SAFTM_DEFINE_REGISTRATION_METHOD("continuous-bag-of-ngrams", 67*993b0882SAndroid Build Coastguard Worker ContinuousBagOfNgramsFunction); 68*993b0882SAndroid Build Coastguard Worker 69*993b0882SAndroid Build Coastguard Worker private: 70*993b0882SAndroid Build Coastguard Worker // Auxiliary for Evaluate(). Fills counts_ and non_zero_count_indices_ (see 71*993b0882SAndroid Build Coastguard Worker // below), and returns the total ngram count. 72*993b0882SAndroid Build Coastguard Worker int ComputeNgramCounts(const LightSentence &sentence) const; 73*993b0882SAndroid Build Coastguard Worker 74*993b0882SAndroid Build Coastguard Worker // Guards counts_ and non_zero_count_indices_. NOTE: we use std::* constructs 75*993b0882SAndroid Build Coastguard Worker // (instead of absl::Mutex & co) to simplify porting to Android and to avoid 76*993b0882SAndroid Build Coastguard Worker // pulling in absl (which increases our code size). 77*993b0882SAndroid Build Coastguard Worker mutable std::mutex state_mutex_; 78*993b0882SAndroid Build Coastguard Worker 79*993b0882SAndroid Build Coastguard Worker // counts_[i] is the count of all ngrams with id i. Work data for Evaluate(). 80*993b0882SAndroid Build Coastguard Worker // NOTE: we declare this vector as a field, such that its underlying capacity 81*993b0882SAndroid Build Coastguard Worker // stays allocated in between calls to Evaluate(). 82*993b0882SAndroid Build Coastguard Worker mutable std::vector<int> counts_; 83*993b0882SAndroid Build Coastguard Worker 84*993b0882SAndroid Build Coastguard Worker // Indices of non-zero elements of counts_. See comments for counts_. 85*993b0882SAndroid Build Coastguard Worker mutable std::vector<int> non_zero_count_indices_; 86*993b0882SAndroid Build Coastguard Worker 87*993b0882SAndroid Build Coastguard Worker // The integer id of each char ngram is computed as follows: 88*993b0882SAndroid Build Coastguard Worker // Hash32WithDefaultSeed(char_ngram) % ngram_id_dimension_. 89*993b0882SAndroid Build Coastguard Worker int ngram_id_dimension_; 90*993b0882SAndroid Build Coastguard Worker 91*993b0882SAndroid Build Coastguard Worker // Only ngrams of size ngram_size_ will be extracted. 92*993b0882SAndroid Build Coastguard Worker int ngram_size_; 93*993b0882SAndroid Build Coastguard Worker }; 94*993b0882SAndroid Build Coastguard Worker 95*993b0882SAndroid Build Coastguard Worker } // namespace lang_id 96*993b0882SAndroid Build Coastguard Worker } // namespace mobile 97*993b0882SAndroid Build Coastguard Worker } // namespace nlp_saft 98*993b0882SAndroid Build Coastguard Worker 99*993b0882SAndroid Build Coastguard Worker #endif // NLP_SAFT_COMPONENTS_LANG_ID_MOBILE_FEATURES_CHAR_NGRAM_FEATURE_H_ 100