/*
 * Copyright (C) 2018 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include "utils/tflite/string_projection_base.h"

#include <cmath>
#include <cstdint>
#include <cstring>
#include <memory>
#include <string>
#include <unordered_map>
#include <vector>

#include "utils/hash/cityhash.h"
#include "utils/hash/farmhash.h"
#include "flatbuffers/flexbuffers.h"
#include "tensorflow/lite/context.h"
#include "tensorflow_models/seq_flow_lite/tflite_ops/quantization_util.h"

namespace tflite {
namespace ops {
namespace custom {
namespace libtextclassifier3 {
namespace string_projection {

namespace {
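// Input/output limits, supported hash method names, and the byte sizes that
// make up the (seed, feature id) key hashed per feature.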
const int32_t kMaxInputChars = 300;

const int kOutputLabel = 0;
const char kFastHash[] = "[DEV] FastHash";
const char kAXB[] = "[DEV] AXB";

const int kSeedSize = sizeof(float);
const int kInputItemBytes = sizeof(int32_t);
const int kKeyBytes = sizeof(float) + sizeof(int32_t);

}  // namespace

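// Reads the projection hyperparameters (hash seeds, number of hashes and bits,
// hash method, binary vs. tanh projection) and the skip-gram extraction
// options from the flexbuffer map attached to the custom op.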
StringProjectionOpBase::StringProjectionOpBase(
    const flexbuffers::Map& custom_options)
    : hash_function_(custom_options["hash_function"].AsTypedVector()),
      num_hash_(custom_options["num_hash"].AsInt32()),
      num_bits_(custom_options["num_bits"].AsInt32()),
      binary_projection_(custom_options["binary_projection"].AsBool()),
      hash_method_(custom_options["hash_method"].ToString()),
      axb_scale_(custom_options["axb_scale"].AsFloat()) {
  skip_gram_params_ = {
      .ngram_size = custom_options["ngram_size"].AsInt32(),
      .max_skip_size = custom_options["max_skip_size"].AsInt32(),
      .include_all_ngrams = custom_options["include_all_ngrams"].AsBool(),
      .preprocess = custom_options["preprocess"].AsBool(),
      .char_level = custom_options["char_level"].AsBool(),
      .remove_punctuation = custom_options["remove_punctuation"].AsBool(),
      .max_input_chars = kMaxInputChars,
  };
}

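// Converts a map of skip-gram feature counts into parallel id/weight vectors
// for one batch element. Feature ids come from CityHash64 (truncated to 32
// bits) for the FastHash/AXB methods and from Fingerprint64 otherwise; the
// weight of each feature is its count.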
void StringProjectionOpBase::GetFeatureWeights(
    const std::unordered_map<std::string, int>& feature_counts,
    std::vector<std::vector<int64_t>>* batch_ids,
    std::vector<std::vector<float>>* batch_weights) {
  std::vector<int64_t> ids;
  std::vector<float> weights;
  for (const auto& iter : feature_counts) {
    if (hash_method_ == kFastHash || hash_method_ == kAXB) {
      int32_t feature_id =
          tc3farmhash::CityHash64(iter.first.c_str(), iter.first.size());
      ids.push_back(feature_id);
      weights.push_back(iter.second);
    } else {
      int64_t feature_id =
          tc3farmhash::Fingerprint64(iter.first.c_str(), iter.first.size());
      ids.push_back(feature_id);
      weights.push_back(iter.second);
    }
  }

  batch_ids->push_back(ids);
  batch_weights->push_back(weights);
}

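// Computes the dense LSH projection: for each batch element and each of the
// num_hash_ * num_bits_ seeds, evaluates running_sign_bit() and writes the
// result into the output tensor, either as float32 or quantized to uint8.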
void StringProjectionOpBase::DenseLshProjection(
    const int batch_size, const std::vector<std::vector<int64_t>>& batch_ids,
    const std::vector<std::vector<float>>& batch_weights,
    TfLiteTensor* output) {
  auto key = std::unique_ptr<char[]>(
      new char[kKeyBytes]);  // NOLINT: modernize-make-unique

  if (output->type == kTfLiteFloat32) {
    for (int batch = 0; batch < batch_size; ++batch) {
      const std::vector<int64_t>& input = batch_ids[batch];
      const std::vector<float>& weight = batch_weights[batch];

      for (int i = 0; i < num_hash_; i++) {
        for (int j = 0; j < num_bits_; j++) {
          int hash_bit = i * num_bits_ + j;
          float seed = hash_function_[hash_bit].AsFloat();
          float bit = running_sign_bit(input, weight, seed, key.get());
          output->data.f[batch * num_hash_ * num_bits_ + hash_bit] = bit;
        }
      }
    }
  } else if (output->type == kTfLiteUInt8) {
    const float inverse_scale = 1.0 / output->params.scale;
    for (int batch = 0; batch < batch_size; ++batch) {
      const std::vector<int64_t>& input = batch_ids[batch];
      const std::vector<float>& weight = batch_weights[batch];

      for (int i = 0; i < num_hash_; i++) {
        for (int j = 0; j < num_bits_; j++) {
          int hash_bit = i * num_bits_ + j;
          float seed = hash_function_[hash_bit].AsFloat();
          float bit = running_sign_bit(input, weight, seed, key.get());
          output->data.uint8[batch * num_hash_ * num_bits_ + hash_bit] =
              seq_flow_lite::PodQuantize(bit, output->params.zero_point,
                                         inverse_scale);
        }
      }
    }
  }
}

namespace {

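// Mixes a 32-bit value with the given seed using shift/xor/multiply rounds and
// returns the result as a signed 32-bit hash.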
int32_t hash32(int32_t value, uint32_t seed) {
  uint32_t hash = value;
  hash = (hash ^ 61) ^ (hash >> 16);
  hash = hash + (hash << 3);
  hash = hash ^ (hash >> 4);
  hash = hash * seed;
  hash = hash ^ (hash >> 15);
  return static_cast<int32_t>(hash);
}

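// Hashes `value` by multiplying it with the integerized seed and scale and
// folding the product into the signed 32-bit range.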
double axb(int32_t value, float seed, float scale) {
  // Convert seed to a larger scale of range, multiplier is 1e5 to avoid
  // precision difference on different hardware.
  int64_t hash_signature =
      static_cast<int64_t>(scale) * static_cast<int64_t>(seed * 1e5) * value;
  hash_signature %= 0x100000000;
  hash_signature = fabs(hash_signature);
  if (hash_signature >= 0x80000000) {
    hash_signature -= 0x100000000;
  }
  return hash_signature;
}

}  // namespace

// Computes the sign bit of the dot product of hash(seed, input) and weight;
// when binary projection is disabled, returns tanh of the normalized score
// instead of the raw sign bit.
float StringProjectionOpBase::running_sign_bit(
    const std::vector<int64_t>& input, const std::vector<float>& weight,
    float seed, char* key) {
  double score = 0.0;
  memcpy(key, &seed, kSeedSize);
  int cnt = 0;
  for (int i = 0; i < input.size(); ++i) {
    if (weight[i] == 0.0) continue;
    cnt++;
    const int32_t curr_input = input[i];
    memcpy(key + kSeedSize, &curr_input, kInputItemBytes);

    // Create running hash id and value for current dimension.
    if (hash_method_ == kFastHash) {
      int32_t hash_signature =
          hash32(input[i], *reinterpret_cast<uint32_t*>(&seed));
      score += static_cast<double>(weight[i]) * hash_signature;
    } else if (hash_method_ == kAXB) {
      score += weight[i] * axb(input[i], seed, axb_scale_);
    } else {
      int64_t hash_signature = tc3farmhash::Fingerprint64(key, kKeyBytes);
      double running_value = static_cast<double>(hash_signature);
      score += weight[i] * running_value;
    }
  }

  const double inverse_normalizer = 0.00000000046566129;
  if (!binary_projection_) {
    if (hash_method_ == kAXB) {
      return tanh(score / cnt * inverse_normalizer);
    } else {
      return tanh(score * inverse_normalizer);
    }
  }

  return (score > 0) ? 1 : 0;
}

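// Releases the op instance that was allocated when the op was initialized.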
void Free(TfLiteContext* context, void* buffer) {
  delete reinterpret_cast<StringProjectionOpBase*>(buffer);
}

TfLiteStatus Resize(TfLiteContext* context, TfLiteNode* node) {
  auto* op = reinterpret_cast<StringProjectionOpBase*>(node->user_data);

  // The shape of the output should be the shape of the input + a new inner
  // dimension equal to the number of features.
  TfLiteIntArray* input_shape = op->GetInputShape(context, node);
  TfLiteIntArray* output_shape = TfLiteIntArrayCreate(input_shape->size + 1);
  for (int i = 0; i < input_shape->size; ++i) {
    output_shape->data[i] = input_shape->data[i];
  }
  output_shape->data[input_shape->size] = op->num_hash() * op->num_bits();
  context->ResizeTensor(context,
                        &context->tensors[node->outputs->data[kOutputLabel]],
                        output_shape);
  return kTfLiteOk;
}

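// Extracts skip-grams for every element of the (flattened) input, hashes them
// into feature ids and weights, and fills the output tensor with the dense
// LSH projection.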
TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
  auto* op = reinterpret_cast<StringProjectionOpBase*>(node->user_data);

  TfLiteTensor* label = &context->tensors[node->outputs->data[kOutputLabel]];

  TfLiteIntArray* input_shape = op->GetInputShape(context, node);
  int input_size = 1;
  for (int i = 0; i < input_shape->size; ++i) {
    input_size *= input_shape->data[i];
  }

  TF_LITE_ENSURE_STATUS(op->InitializeInput(context, node));

  std::vector<std::vector<int64_t>> batch_ids;
  std::vector<std::vector<float>> batch_weights;
  for (int i = 0; i < input_size; ++i) {
    std::unordered_map<std::string, int> feature_counts =
        op->ExtractSkipGrams(i);
    op->GetFeatureWeights(feature_counts, &batch_ids, &batch_weights);
  }

  op->DenseLshProjection(input_size, batch_ids, batch_weights, label);

  op->FinalizeInput();

  return kTfLiteOk;
}

}  // namespace string_projection
}  // namespace libtextclassifier3
}  // namespace custom
}  // namespace ops
}  // namespace tflite