xref: /aosp_15_r20/external/libtextclassifier/native/lang_id/features/char-ngram-feature.h (revision 993b0882672172b81d12fad7a7ac0c3e5c824a12)
1*993b0882SAndroid Build Coastguard Worker /*
2*993b0882SAndroid Build Coastguard Worker  * Copyright (C) 2018 The Android Open Source Project
3*993b0882SAndroid Build Coastguard Worker  *
4*993b0882SAndroid Build Coastguard Worker  * Licensed under the Apache License, Version 2.0 (the "License");
5*993b0882SAndroid Build Coastguard Worker  * you may not use this file except in compliance with the License.
6*993b0882SAndroid Build Coastguard Worker  * You may obtain a copy of the License at
7*993b0882SAndroid Build Coastguard Worker  *
8*993b0882SAndroid Build Coastguard Worker  *      http://www.apache.org/licenses/LICENSE-2.0
9*993b0882SAndroid Build Coastguard Worker  *
10*993b0882SAndroid Build Coastguard Worker  * Unless required by applicable law or agreed to in writing, software
11*993b0882SAndroid Build Coastguard Worker  * distributed under the License is distributed on an "AS IS" BASIS,
12*993b0882SAndroid Build Coastguard Worker  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13*993b0882SAndroid Build Coastguard Worker  * See the License for the specific language governing permissions and
14*993b0882SAndroid Build Coastguard Worker  * limitations under the License.
15*993b0882SAndroid Build Coastguard Worker  */
16*993b0882SAndroid Build Coastguard Worker 
17*993b0882SAndroid Build Coastguard Worker #ifndef NLP_SAFT_COMPONENTS_LANG_ID_MOBILE_FEATURES_CHAR_NGRAM_FEATURE_H_
18*993b0882SAndroid Build Coastguard Worker #define NLP_SAFT_COMPONENTS_LANG_ID_MOBILE_FEATURES_CHAR_NGRAM_FEATURE_H_
19*993b0882SAndroid Build Coastguard Worker 
20*993b0882SAndroid Build Coastguard Worker #include <mutex>  // NOLINT: see comments for state_mutex_
21*993b0882SAndroid Build Coastguard Worker #include <string>
22*993b0882SAndroid Build Coastguard Worker #include <vector>
23*993b0882SAndroid Build Coastguard Worker 
24*993b0882SAndroid Build Coastguard Worker #include "lang_id/common/fel/feature-extractor.h"
25*993b0882SAndroid Build Coastguard Worker #include "lang_id/common/fel/task-context.h"
26*993b0882SAndroid Build Coastguard Worker #include "lang_id/common/fel/workspace.h"
27*993b0882SAndroid Build Coastguard Worker #include "lang_id/features/light-sentence-features.h"
28*993b0882SAndroid Build Coastguard Worker #include "lang_id/light-sentence.h"
29*993b0882SAndroid Build Coastguard Worker 
30*993b0882SAndroid Build Coastguard Worker // TODO(abakalov): Add a test.
31*993b0882SAndroid Build Coastguard Worker namespace libtextclassifier3 {
32*993b0882SAndroid Build Coastguard Worker namespace mobile {
33*993b0882SAndroid Build Coastguard Worker namespace lang_id {
34*993b0882SAndroid Build Coastguard Worker 
35*993b0882SAndroid Build Coastguard Worker // Class for computing continuous char ngram features.
36*993b0882SAndroid Build Coastguard Worker //
37*993b0882SAndroid Build Coastguard Worker // Feature function descriptor parameters:
38*993b0882SAndroid Build Coastguard Worker //   include_terminators(bool, false):
39*993b0882SAndroid Build Coastguard Worker //     If 'true', then splits the text based on spaces to get tokens, adds "^"
40*993b0882SAndroid Build Coastguard Worker //     to the beginning of each token, and adds "$" to the end of each token.
41*993b0882SAndroid Build Coastguard Worker //     NOTE: currently, we support only include_terminators=true.
42*993b0882SAndroid Build Coastguard Worker //   include_spaces(bool, false):
43*993b0882SAndroid Build Coastguard Worker //     If 'true', then includes char ngrams containing spaces.
44*993b0882SAndroid Build Coastguard Worker //     NOTE: currently, we support only include_spaces=false.
45*993b0882SAndroid Build Coastguard Worker //   use_equal_weight(bool, false):
46*993b0882SAndroid Build Coastguard Worker //     If 'true', then weighs each unique ngram by 1.0 / (number of unique
47*993b0882SAndroid Build Coastguard Worker //     ngrams in the input). Otherwise, weighs each unique ngram by (ngram
48*993b0882SAndroid Build Coastguard Worker //     count) / (total number of ngrams).
49*993b0882SAndroid Build Coastguard Worker //     NOTE: currently, we support only use_equal_weight=false.
50*993b0882SAndroid Build Coastguard Worker //   id_dim(int, 10000):
51*993b0882SAndroid Build Coastguard Worker //     The integer id of each char ngram is computed as follows:
//     Hash32WithDefaultSeed(char ngram) % id_dim.
53*993b0882SAndroid Build Coastguard Worker //   size(int, 3):
54*993b0882SAndroid Build Coastguard Worker //     Only ngrams of this size will be extracted.
55*993b0882SAndroid Build Coastguard Worker //
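// NOTE: this class is documented as not thread-safe, even though it declares
// state_mutex_ to guard the work data used by Evaluate().
// TODO(salcianu): make it thread-safe, or remove this note if the mutex
// already provides that guarantee.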
57*993b0882SAndroid Build Coastguard Worker class ContinuousBagOfNgramsFunction : public LightSentenceFeature {
58*993b0882SAndroid Build Coastguard Worker  public:
59*993b0882SAndroid Build Coastguard Worker   bool Setup(TaskContext *context) override;
60*993b0882SAndroid Build Coastguard Worker   bool Init(TaskContext *context) override;
61*993b0882SAndroid Build Coastguard Worker 
62*993b0882SAndroid Build Coastguard Worker   // Appends the features computed from the sentence to the feature vector.
63*993b0882SAndroid Build Coastguard Worker   void Evaluate(const WorkspaceSet &workspaces, const LightSentence &sentence,
64*993b0882SAndroid Build Coastguard Worker                 FeatureVector *result) const override;
65*993b0882SAndroid Build Coastguard Worker 
66*993b0882SAndroid Build Coastguard Worker   SAFTM_DEFINE_REGISTRATION_METHOD("continuous-bag-of-ngrams",
67*993b0882SAndroid Build Coastguard Worker                                    ContinuousBagOfNgramsFunction);
68*993b0882SAndroid Build Coastguard Worker 
69*993b0882SAndroid Build Coastguard Worker  private:
70*993b0882SAndroid Build Coastguard Worker   // Auxiliary for Evaluate().  Fills counts_ and non_zero_count_indices_ (see
71*993b0882SAndroid Build Coastguard Worker   // below), and returns the total ngram count.
72*993b0882SAndroid Build Coastguard Worker   int ComputeNgramCounts(const LightSentence &sentence) const;
73*993b0882SAndroid Build Coastguard Worker 
74*993b0882SAndroid Build Coastguard Worker   // Guards counts_ and non_zero_count_indices_.  NOTE: we use std::* constructs
75*993b0882SAndroid Build Coastguard Worker   // (instead of absl::Mutex & co) to simplify porting to Android and to avoid
76*993b0882SAndroid Build Coastguard Worker   // pulling in absl (which increases our code size).
77*993b0882SAndroid Build Coastguard Worker   mutable std::mutex state_mutex_;
78*993b0882SAndroid Build Coastguard Worker 
79*993b0882SAndroid Build Coastguard Worker   // counts_[i] is the count of all ngrams with id i.  Work data for Evaluate().
80*993b0882SAndroid Build Coastguard Worker   // NOTE: we declare this vector as a field, such that its underlying capacity
81*993b0882SAndroid Build Coastguard Worker   // stays allocated in between calls to Evaluate().
82*993b0882SAndroid Build Coastguard Worker   mutable std::vector<int> counts_;
83*993b0882SAndroid Build Coastguard Worker 
84*993b0882SAndroid Build Coastguard Worker   // Indices of non-zero elements of counts_.  See comments for counts_.
85*993b0882SAndroid Build Coastguard Worker   mutable std::vector<int> non_zero_count_indices_;
86*993b0882SAndroid Build Coastguard Worker 
87*993b0882SAndroid Build Coastguard Worker   // The integer id of each char ngram is computed as follows:
88*993b0882SAndroid Build Coastguard Worker   // Hash32WithDefaultSeed(char_ngram) % ngram_id_dimension_.
89*993b0882SAndroid Build Coastguard Worker   int ngram_id_dimension_;
90*993b0882SAndroid Build Coastguard Worker 
91*993b0882SAndroid Build Coastguard Worker   // Only ngrams of size ngram_size_ will be extracted.
92*993b0882SAndroid Build Coastguard Worker   int ngram_size_;
93*993b0882SAndroid Build Coastguard Worker };
94*993b0882SAndroid Build Coastguard Worker 
95*993b0882SAndroid Build Coastguard Worker }  // namespace lang_id
96*993b0882SAndroid Build Coastguard Worker }  // namespace mobile
}  // namespace libtextclassifier3
98*993b0882SAndroid Build Coastguard Worker 
99*993b0882SAndroid Build Coastguard Worker #endif  // NLP_SAFT_COMPONENTS_LANG_ID_MOBILE_FEATURES_CHAR_NGRAM_FEATURE_H_
100