xref: /aosp_15_r20/external/tensorflow/tensorflow/lite/tools/optimize/quantize_model.cc (revision b6fb3261f9314811a0f4371741dbb8839866f948)
1 /* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
2 
3 Licensed under the Apache License, Version 2.0 (the "License");
4 you may not use this file except in compliance with the License.
5 You may obtain a copy of the License at
6 
7     http://www.apache.org/licenses/LICENSE-2.0
8 
9 Unless required by applicable law or agreed to in writing, software
10 distributed under the License is distributed on an "AS IS" BASIS,
11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 See the License for the specific language governing permissions and
13 limitations under the License.
14 ==============================================================================*/
15 #include "tensorflow/lite/tools/optimize/quantize_model.h"
16 
17 #include <algorithm>
18 #include <cstdint>
19 #include <limits>
20 #include <memory>
21 #include <string>
22 #include <unordered_map>
23 #include <unordered_set>
24 #include <utility>
25 #include <vector>
26 
27 #include "flatbuffers/flexbuffers.h"
28 #include "absl/strings/str_cat.h"
29 #include "tensorflow/lite/context.h"
30 #include "tensorflow/lite/core/api/error_reporter.h"
31 #include "tensorflow/lite/kernels/internal/cppmath.h"
32 #include "tensorflow/lite/model.h"
33 #include "tensorflow/lite/schema/schema_generated.h"
34 #include "tensorflow/lite/schema/schema_utils.h"
35 #include "tensorflow/lite/tools/optimize/model_utils.h"
36 #include "tensorflow/lite/tools/optimize/operator_property.h"
37 #include "tensorflow/lite/tools/optimize/quantization_utils.h"
38 
39 namespace tflite {
40 namespace optimize {
41 
42 namespace {
43 
44 // A bias tensor must be duplicated if it is also used as a non-bias input in
45 // another op (quantized to 8 bit), so the copy can be quantized to 32 bit.
46 TfLiteStatus DuplicateBiasesWithMultipleUses(ModelT* model,
47                                              ErrorReporter* error_reporter) {
48   std::set<int> input_uses;
49   // Get all input uses for constant tensors.
50   for (size_t subgraph_idx = 0; subgraph_idx < model->subgraphs.size();
51        subgraph_idx++) {
52     SubGraphT* subgraph = model->subgraphs.at(subgraph_idx).get();
53     for (size_t op_idx = 0; op_idx < subgraph->operators.size(); op_idx++) {
54       operator_property::OperatorProperty property =
55           operator_property::GetOperatorProperty(model, subgraph_idx, op_idx);
56       auto* op = subgraph->operators[op_idx].get();
57       for (const auto& idx_pair : property.inputs) {
58         const int idx = idx_pair.first;
59         if (idx >= op->inputs.size() || op->inputs[idx] < 0) {
60           continue;
61         }
62         const TensorT* input_tensor = subgraph->tensors[op->inputs[idx]].get();
63         if (!input_tensor || (input_tensor->buffer < 0) ||
64             (input_tensor->buffer >= model->buffers.size())) {
65           continue;
66         }
67         const BufferT* buffer = model->buffers[input_tensor->buffer].get();
68         if (buffer && !buffer->data.empty()) {
69           input_uses.insert({op->inputs[idx]});
70         }
71       }
72     }
73   }
74 
75   std::map<int, int> bias_uses;
76   for (size_t subgraph_idx = 0; subgraph_idx < model->subgraphs.size();
77        subgraph_idx++) {
78     SubGraphT* subgraph = model->subgraphs.at(subgraph_idx).get();
79     for (size_t op_idx = 0; op_idx < subgraph->operators.size(); op_idx++) {
80       operator_property::OperatorProperty property =
81           operator_property::GetOperatorProperty(model, subgraph_idx, op_idx);
82       OperatorT* op = subgraph->operators[op_idx].get();
83       for (const int bias_idx : property.biases) {
84         if (bias_idx >= op->inputs.size() || op->inputs[bias_idx] < 0) {
85           continue;
86         }
87         const TensorT* bias_tensor =
88             subgraph->tensors[op->inputs[bias_idx]].get();
89         if (!bias_tensor || (bias_tensor->buffer < 0) ||
90             (bias_tensor->buffer >= model->buffers.size())) {
91           continue;
92         }
93         const BufferT* bias_buffer = model->buffers[bias_tensor->buffer].get();
94         if (!bias_buffer || bias_buffer->data.empty()) {
95           continue;
96         }
97         if (input_uses.find(op->inputs[bias_idx]) != input_uses.end()) {
98           // If used as input, duplicate the tensor and insert into bias uses.
99           int bias_use_count = 1;
100           auto inserted =
101               bias_uses.insert({op->inputs[bias_idx], bias_use_count});
102           if (!inserted.second) {
103             bias_use_count = ++inserted.first->second;
104           }
105           std::unique_ptr<TensorT> new_tensor(new TensorT);
106           new_tensor->name =
107               absl::StrCat(bias_tensor->name, "_duplicate_", bias_use_count);
108           new_tensor->shape = bias_tensor->shape;
109           new_tensor->type = bias_tensor->type;
110           if (bias_tensor->quantization) {
111             new_tensor->quantization =
112                 std::make_unique<QuantizationParametersT>();
113             new_tensor->quantization->scale.assign(
114                 bias_tensor->quantization->scale.begin(),
115                 bias_tensor->quantization->scale.end());
116             new_tensor->quantization->zero_point.assign(
117                 bias_tensor->quantization->zero_point.begin(),
118                 bias_tensor->quantization->zero_point.end());
119           }
120           std::unique_ptr<BufferT> new_buffer(new BufferT);
121           new_buffer->data.assign(bias_buffer->data.begin(),
122                                   bias_buffer->data.end());
123           model->buffers.push_back(std::move(new_buffer));
124           new_tensor->buffer = model->buffers.size() - 1;
125           subgraph->tensors.push_back(std::move(new_tensor));
126           op->inputs[bias_idx] = subgraph->tensors.size() - 1;
127         }
128       }
129     }
130   }
131   return kTfLiteOk;
132 }
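
// Illustrative sketch (names are hypothetical, not part of this file): if a
// constant tensor "bias" feeds both the bias input of a CONV_2D (to be
// quantized to 32 bit) and a regular input of an ADD (to be quantized to
// 8 bit), the pass above rewrites the graph roughly as:
//
//   before:  conv.inputs = {act, weights, bias}
//            add.inputs  = {x, bias}
//   after:   conv.inputs = {act, weights, bias_duplicate_1}  // fresh copy
//            add.inputs  = {x, bias}                         // original
//
// The duplicate gets its own BufferT, so the two uses can later be quantized
// to different bit widths independently.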
133 
134 bool IsFloatTensor(const SubGraphT* subgraph, int32_t tensor_idx) {
135   TensorT* tensor = subgraph->tensors[tensor_idx].get();
136   if (tensor->type != TensorType_FLOAT32) {
137     // Skip non-real-valued tensor.
138     return false;
139   }
140   return true;
141 }
142 
143 // Gets the operator property from the operator_property list and additionally
144 // modifies the quantizable parameter based on the user's specified
145 // operator_names.
146 operator_property::OperatorProperty GetOperatorProperty(
147     const std::unordered_set<string>& operator_names, const ModelT* model,
148     int subgraph_index, int op_idx, const string& operator_name,
149     const TensorType& activations_type, bool disable_per_channel = false) {
150   operator_property::OperatorProperty property =
151       operator_property::GetOperatorProperty(model, subgraph_index, op_idx);
152   const SubGraphT* subgraph = model->subgraphs[subgraph_index].get();
153   const OperatorT* op = subgraph->operators[op_idx].get();
154   const BuiltinOperator op_code =
155       GetBuiltinCode(model->operator_codes[op->opcode_index].get());
156   if (activations_type == TensorType_INT16 && !property.quantizable_int16) {
157     property.quantizable = false;
158   }
159   // The algorithm adds Dequantize and Quantize, so we don't require them to be
160   // in the operator_names.
161   if (op_code != BuiltinOperator_DEQUANTIZE &&
162       op_code != BuiltinOperator_QUANTIZE) {
163     property.quantizable =
164         property.quantizable &&
165         (operator_names.find(operator_name) != operator_names.end());
166   }
167   if (disable_per_channel) {
168     for (auto& input : property.inputs) {
169       if (input.second.per_axis) {
170         input.second.per_axis = false;
171       }
172     }
173   }
174   return property;
175 }
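
// Usage sketch (illustrative; `model`, `op_names` and "conv1" are assumed
// placeholders, not defined in this file):
//
//   operator_property::OperatorProperty p = GetOperatorProperty(
//       op_names, &model, /*subgraph_index=*/0, /*op_idx=*/3,
//       /*operator_name=*/"conv1", TensorType_INT8,
//       /*disable_per_channel=*/true);
//
// Here p.quantizable stays true only if "conv1" is in op_names (Quantize and
// Dequantize ops are exempt from that check), and every input's per_axis flag
// has been cleared because per-channel quantization was disabled.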
176 
177 bool IsRealValueOp(const std::unordered_set<string>& real_value_op_set,
178                    const string& operator_name) {
179   return real_value_op_set.find(operator_name) != real_value_op_set.end();
180 }
181 
182 // Determines whether a tensor is constant and has only one use.
183 bool IsConstantWithOneUse(const ModelT* model, const SubGraphT* subgraph,
184                           const int tensor_id) {
185   if (!subgraph || (tensor_id >= subgraph->tensors.size())) {
186     return false;
187   }
188   const auto& tensor = subgraph->tensors[tensor_id];
189   if (!tensor || !model || (tensor->buffer < 0) ||
190       (tensor->buffer >= model->buffers.size()) ||
191       (!model->buffers[tensor->buffer]) ||
192       (model->buffers[tensor->buffer]->data.empty())) {
193     return false;
194   }
195   int uses = 0;
196   for (size_t op_idx = 0; op_idx < subgraph->operators.size(); op_idx++) {
197     const auto& op = subgraph->operators[op_idx];
198     if (!op) {
199       continue;
200     }
201     const std::vector<int32_t>& inputs = op->inputs;
202     if ((std::find(inputs.begin(), inputs.end(), tensor_id) != inputs.end()) &&
203         (++uses > 1)) {
204       return false;
205     }
206   }
207   return true;
208 }
209 
210 // Creates a set of the ops that take or produce a float (real-valued) tensor
211 // in the source graph, as well as ops that are not quantizable.
212 std::unordered_set<string> PopulateRealValueOpSet(
213     ModelT* model, const std::unordered_set<string>& operator_names,
214     const TensorType& activations_type) {
215   std::unordered_set<string> real_value_op_set;
216   for (size_t subgraph_idx = 0; subgraph_idx < model->subgraphs.size();
217        subgraph_idx++) {
218     SubGraphT* subgraph = model->subgraphs.at(subgraph_idx).get();
219     for (size_t op_idx = 0; op_idx < subgraph->operators.size(); op_idx++) {
220       OperatorT* op = subgraph->operators[op_idx].get();
221       const BuiltinOperator op_code =
222           GetBuiltinCode(model->operator_codes[op->opcode_index].get());
223       if (op->outputs.empty() && op_code != BuiltinOperator_ASSIGN_VARIABLE) {
224         continue;
225       }
226       const string operator_name = op_code != BuiltinOperator_ASSIGN_VARIABLE
227                                        ? subgraph->tensors[op->outputs[0]]->name
228                                        : subgraph->tensors[op->inputs[0]]->name;
229       operator_property::OperatorProperty property =
230           GetOperatorProperty(operator_names, model, subgraph_idx, op_idx,
231                               operator_name, activations_type);
232 
233       if (!property.quantizable) {
234         real_value_op_set.insert(operator_name);
235         continue;
236       }
237 
238       for (const std::pair<int, operator_property::TensorProperty>& input :
239            property.inputs) {
240         const int32_t input_idx = input.first;
241         const int32_t tensor_idx = op->inputs[input_idx];
242         if (IsFloatTensor(subgraph, tensor_idx)) {
243           real_value_op_set.insert(operator_name);
244           break;
245         }
246       }
247       for (const std::pair<int, operator_property::TensorProperty>& output :
248            property.outputs) {
249         const int32_t output_idx = output.first;
250         const int32_t tensor_idx = op->outputs[output_idx];
251         if (IsFloatTensor(subgraph, tensor_idx)) {
252           real_value_op_set.insert(operator_name);
253           break;
254         }
255       }
256 
257       if (property.arbitrary_inputs) {
258         const int32_t tensor_idx = op->inputs[0];
259         if (IsFloatTensor(subgraph, tensor_idx)) {
260           real_value_op_set.insert(operator_name);
261         }
262       }
263 
264       if (property.arbitrary_outputs) {
265         const int32_t tensor_idx = op->outputs[0];
266         if (IsFloatTensor(subgraph, tensor_idx)) {
267           real_value_op_set.insert(operator_name);
268         }
269       }
270     }
271   }
272   return real_value_op_set;
273 }
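
// For example (illustrative): a quantizable CONV_2D whose output tensor is
// still FLOAT32 is inserted under the name of its first output tensor, while
// a quantizable op whose inputs and outputs are all integer-typed (say, a
// GATHER over INT32 data) is left out, so later passes treat it as
// non-real-valued and skip it.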
274 
275 TfLiteStatus QuantizeBias(ModelT* model, const TensorT* input_tensor,
276                           const TensorT* weight_tensor, TensorT* bias_tensor,
277                           bool is_per_channel, int channel_dim_index,
278                           const TensorType& bias_type,
279                           ErrorReporter* error_reporter) {
280   if (bias_tensor->shape.size() != 1) {
281     TF_LITE_REPORT_ERROR(error_reporter, "Expected bias tensor shape to be 1.");
282     return kTfLiteError;
283   }
284 
285   if (input_tensor->type == tflite::TensorType_INT8 &&
286       bias_type != tflite::TensorType_INT32) {
287     TF_LITE_REPORT_ERROR(
288         error_reporter,
289         "Expected bias type to be TensorType_INT32 for Int8Quant.");
290     return kTfLiteError;
291   }
292 
293   if (input_tensor->type == tflite::TensorType_INT16 &&
294       bias_type != tflite::TensorType_INT32 &&
295       bias_type != tflite::TensorType_INT64) {
296     TF_LITE_REPORT_ERROR(error_reporter,
297                          "Expected bias type to be TensorType_INT32 or "
298                          "TensorType_INT64 for Int16Quant.");
299     return kTfLiteError;
300   }
301 
302   int32_t channel_dim_size = bias_tensor->shape[0];
303   TF_LITE_ENSURE(error_reporter, weight_tensor->quantization);
304   std::vector<float> weight_scales = weight_tensor->quantization->scale;
305 
306   if (is_per_channel) {
307     if (bias_tensor->shape[0] != weight_tensor->shape[channel_dim_index]) {
308       TF_LITE_REPORT_ERROR(
309           error_reporter,
310           "Channel mismatch between bias and weight tensors %d vs %d",
311           bias_tensor->shape[0], weight_tensor->shape[channel_dim_index]);
312       return kTfLiteError;
313     }
314     if (!input_tensor->quantization ||
315         input_tensor->quantization->scale.size() != 1) {
316       TF_LITE_REPORT_ERROR(error_reporter,
317                            "Input tensor missing quantization information");
318       return kTfLiteError;
319     }
320 
321     if (weight_scales.size() != channel_dim_size) {
322       TF_LITE_REPORT_ERROR(error_reporter,
323                            "Mismatch weight scale dimension: %d",
324                            weight_scales.size());
325       return kTfLiteError;
326     }
327     if (bias_type == tflite::TensorType_INT64) {
328       return utils::SymmetricPerChannelBiasQuantize<std::int64_t>(
329           model, bias_tensor, input_tensor->quantization->scale[0],
330           weight_scales.data(), channel_dim_size, error_reporter);
331     } else {
332       return utils::SymmetricPerChannelBiasQuantize<std::int32_t>(
333           model, bias_tensor, input_tensor->quantization->scale[0],
334           weight_scales.data(), channel_dim_size, error_reporter);
335     }
336   } else {
337     if (weight_scales.size() != 1) {
338       TF_LITE_REPORT_ERROR(
339           error_reporter,
340           "Expected per-layer weight scale dimension size 1, got %d",
341           weight_scales.size());
342       return kTfLiteError;
343     }
344     if (bias_type == tflite::TensorType_INT64) {
345       return utils::SymmetricPerLayerBiasQuantize<std::int64_t>(
346           model, bias_tensor,
347           input_tensor->quantization->scale[0] * weight_scales[0],
348           error_reporter);
349     } else {
350       return utils::SymmetricPerLayerBiasQuantize<std::int32_t>(
351           model, bias_tensor,
352           input_tensor->quantization->scale[0] * weight_scales[0],
353           error_reporter);
354     }
355   }
356   return kTfLiteError;
357 }
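
// Worked example for the per-channel path above (illustrative numbers): with
// an int8 input scale of 0.5 and per-channel weight scales {0.1, 0.2}, the
// effective int32 bias scales are {0.5 * 0.1, 0.5 * 0.2} = {0.05, 0.1}, so a
// float bias of {1.0, 1.0} is stored as {round(1.0 / 0.05), round(1.0 / 0.1)}
// = {20, 10} with zero point 0 (bias quantization is symmetric).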
358 
359 // True if the tensor type has to be modified.
360 bool TensorTypeChangeRequired(const TensorT* tensor, const TensorType& type) {
361   // The quantized model is type INT8/INT16, so if the user provided type is
362   // INT8/INT16, we do not have to do any custom logic. Additionally, if the
363   // current tensor isn't INT8/INT16 quantized, the custom type doesn't apply.
364   bool int8check = type != TensorType_INT8 && tensor->type == TensorType_INT8 &&
365                    !tensor->quantization->scale.empty();
366   bool int16check = type != TensorType_INT16 &&
367                     tensor->type == TensorType_INT16 &&
368                     !tensor->quantization->scale.empty();
369   return (int8check || int16check);
370 }
371 
372 // Checks if the input is consumed only by a Quantize op with matching type,
373 // scale and zero point; if so, no extra (re)quantization is needed.
374 bool InputQuantizeRequired(const ModelT* model, const SubGraphT* subgraph,
375                            int32_t input_idx) {
376   std::vector<OperatorT*> quantize_ops;
377   for (size_t op_idx = 0; op_idx < subgraph->operators.size(); op_idx++) {
378     OperatorT* op = subgraph->operators[op_idx].get();
379     if (std::find(op->inputs.begin(), op->inputs.end(), input_idx) !=
380         op->inputs.end()) {
381       const BuiltinOperator op_code =
382           GetBuiltinCode(model->operator_codes[op->opcode_index].get());
383       if (op_code != BuiltinOperator_QUANTIZE) {
384         return true;
385       }
386       quantize_ops.push_back(op);
387     }
388   }
389   if (quantize_ops.size() == 1) {
390     const auto* tensor = subgraph->tensors[input_idx].get();
391     const auto* op = quantize_ops[0];
392     const int32_t output_idx = op->outputs[0];
393     const auto output_type = subgraph->tensors[output_idx]->type;
394     const float output_scale =
395         subgraph->tensors[output_idx]->quantization->scale[0];
396     const int64_t output_zero_point =
397         subgraph->tensors[output_idx]->quantization->zero_point[0];
398     if (output_type == tensor->type &&
399         output_scale == tensor->quantization->scale[0] &&
400         output_zero_point == tensor->quantization->zero_point[0]) {
401       return false;
402     }
403   }
404   return true;
405 }
406 
407 // Sets the input type, adding a Leading Op node at the start of the model if
408 // necessary.
409 // Returns the new input tensor index.
410 int32_t SetInputType(ModelT* model, SubGraphT* subgraph,
411                      const int32_t tensor_idx, const TensorType& input_type,
412                      const TensorType& activations_type) {
413   TensorT* tensor = subgraph->tensors[tensor_idx].get();
414   if (!TensorTypeChangeRequired(tensor, input_type)) {
415     return -1;
416   }
417   if (input_type == TensorType_FLOAT32 || input_type == TensorType_UINT8) {
418     std::string type_string =
419         activations_type == TensorType_INT16 ? "int16" : "int8";
420     // Create a new tensor to be the input of the leading Op.
421     std::unique_ptr<TensorT> leading_op_input;
422     if (input_type == TensorType_FLOAT32) {
423       // Add tensor for quantize operator. Scales and zero points are not
424       // needed.
425       const string leading_op_name = tensor->name;
426       const string new_name_original_input = tensor->name + "_" + type_string;
427       tensor->name = new_name_original_input;
428       utils::MakeTensor(leading_op_name, tensor->shape, tensor->shape_signature,
429                         input_type, &leading_op_input);
430     } else {
431       // Get scale and zero point from the first tensor.
432       const float scale = subgraph->tensors[tensor_idx]->quantization->scale[0];
433       const int64_t zero_point =
434           subgraph->tensors[tensor_idx]->quantization->zero_point[0];
435 
436       //  Add tensor for requantize operator. Scale is the existing scale and
437       //  zero point is shifted by +128.
438       TFLITE_DCHECK_GE(zero_point, -128);
439       TFLITE_DCHECK_LE(zero_point, 127);
440       const string leading_op_name = tensor->name;
441       const string new_name_original_input = tensor->name + "_" + type_string;
442       tensor->name = new_name_original_input;
443       utils::MakeTensorWithQuantParam(
444           leading_op_name, tensor->shape, tensor->shape_signature, input_type,
445           scale, zero_point + 128, &leading_op_input);
446     }
447 
448     // Check if quantize op already exists.
449     if (!InputQuantizeRequired(model, subgraph, tensor_idx)) {
450       subgraph->tensors[tensor_idx] = std::move(leading_op_input);
451       return tensor_idx;
452     }
453 
454     const int32_t leading_op_input_idx = subgraph->tensors.size();
455     subgraph->tensors.push_back(std::move(leading_op_input));
456 
457     // Create the leading op: a Quantize op that quantizes or requantizes
458     // the input.
459     std::unique_ptr<OperatorT> leading_op;
460     utils::MakeQuantizeOperator(model, &leading_op, leading_op_input_idx,
461                                 tensor_idx);
462 
463     // Insert the new op at the start of the model.
464     subgraph->operators.insert(subgraph->operators.begin(),
465                                std::move(leading_op));
466     return leading_op_input_idx;
467   }
468   return -1;
469 }
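
// Requant sketch for the uint8 path above (illustrative numbers): an internal
// int8 input with scale 0.5 and zero point -3 is exposed to the caller as a
// uint8 tensor with the same scale and zero point -3 + 128 = 125; the inserted
// leading Quantize op then maps each uint8 value u back to the int8 value
// u - 128 at runtime. SetOutputType below applies the same +128 shift in the
// other direction.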
470 
471 // Sets the output type, adding a Tailing Op node at the end of the model if
472 // necessary.
473 // Returns the new output tensor index.
474 int32_t SetOutputType(ModelT* model, SubGraphT* subgraph,
475                       const int32_t tensor_idx, const TensorType& output_type,
476                       const TensorType& activations_type) {
477   TensorT* tensor = subgraph->tensors[tensor_idx].get();
478   if (!TensorTypeChangeRequired(tensor, output_type)) {
479     return -1;
480   }
481   if (output_type == TensorType_FLOAT32 || output_type == TensorType_UINT8) {
482     std::string type_string =
483         activations_type == TensorType_INT16 ? "int16" : "int8";
484     // Create a new tensor to be the output of the tailing op.
485     std::unique_ptr<TensorT> tailing_op_output;
486     if (output_type == TensorType_FLOAT32) {
487       const string tailing_op_name = tensor->name;
488       const string new_name_original_output = tensor->name + "_" + type_string;
489       tensor->name = new_name_original_output;
490       utils::MakeTensor(tailing_op_name, tensor->shape, tensor->shape_signature,
491                         output_type, &tailing_op_output);
492     } else {
493       // Get scale and zero point from the last tensor.
494       const float scale = subgraph->tensors[tensor_idx]->quantization->scale[0];
495       const int64_t zero_point =
496           subgraph->tensors[tensor_idx]->quantization->zero_point[0];
497 
498       //  Add tensor for requantize operator. Scale is the existing scale and
499       //  zero point is shifted by +128.
500       TFLITE_DCHECK_GE(zero_point, -128);
501       TFLITE_DCHECK_LE(zero_point, 127);
502       const string tailing_op_name = tensor->name;
503       const string new_name_original_output = tensor->name + "_" + type_string;
504       tensor->name = new_name_original_output;
505       utils::MakeTensorWithQuantParam(
506           tailing_op_name, tensor->shape, tensor->shape_signature, output_type,
507           scale, zero_point + 128, &tailing_op_output);
508     }
509     const int32_t tailing_op_output_idx = subgraph->tensors.size();
510     subgraph->tensors.push_back(std::move(tailing_op_output));
511 
512     // Create the tailing operation.
513     std::unique_ptr<OperatorT> tailing_op;
514     if (output_type == TensorType_FLOAT32) {
515       // Tailing Op is Dequantize Op.
516       utils::MakeDequantizeOperator(model, &tailing_op, tensor_idx,
517                                     tailing_op_output_idx);
518     } else {
519       // Tailing Op is Quantize Op that does requantization.
520       utils::MakeQuantizeOperator(model, &tailing_op, tensor_idx,
521                                   tailing_op_output_idx);
522     }
523     // Add the operator at the end of the model.
524     subgraph->operators.push_back(std::move(tailing_op));
525     return tailing_op_output_idx;
526   }
527   return -1;
528 }
529 
530 // Sets the input and output types to the provided types. Leading and
531 // tailing operations will be added if needed.
532 // For float input and output, the leading op is Quantize and the tailing op
533 // is Dequantize.
534 // For uint8 input and output, the leading op is Quantize (uint8 to int8,
535 // which can be thought of as a "requant") and the tailing op is also Quantize
536 // (int8 to uint8, again a "requant").
537 TfLiteStatus SetInputAndOutputTypes(ModelT* model, const TensorType& input_type,
538                                     const TensorType& output_type,
539                                     const TensorType& activations_type,
540                                     ErrorReporter* error_reporter) {
541   for (int subgraph_idx = 0; subgraph_idx < model->subgraphs.size();
542        subgraph_idx++) {
543     SubGraphT* subgraph = model->subgraphs.at(subgraph_idx).get();
544     SignatureDefT* signature_def = nullptr;
545     for (const auto& sig_def : model->signature_defs) {
546       if (sig_def->subgraph_index == subgraph_idx) {
547         signature_def = sig_def.get();
548         break;
549       }
550     }
551     for (int i = 0; i < subgraph->inputs.size(); ++i) {
552       TensorT* tensor = subgraph->tensors[subgraph->inputs[i]].get();
553       // TODO(suharshs): Add support for this case if it ever comes up.
554       if (tensor->type == TensorType_FLOAT32 && input_type != tensor->type) {
555         TF_LITE_REPORT_ERROR(
556             error_reporter,
557             "Unsupported input type %s for input tensor %d of type %s.",
558             EnumNameTensorType(input_type), subgraph->inputs[i],
559             EnumNameTensorType(tensor->type));
560         return kTfLiteError;
561       }
562       const int32_t input_idx = SetInputType(
563           model, subgraph, subgraph->inputs[i], input_type, activations_type);
564       if (input_idx < 0) {
565         continue;
566       }
567       if (signature_def != nullptr) {
568         for (const auto& input : signature_def->inputs) {
569           if (input->tensor_index == subgraph->inputs[i]) {
570             input->tensor_index = input_idx;
571             break;
572           }
573         }
574       }
575       subgraph->inputs[i] = input_idx;
576     }
577     for (int i = 0; i < subgraph->outputs.size(); ++i) {
578       TensorT* tensor = subgraph->tensors[subgraph->outputs[i]].get();
579       // TODO(suharshs): Add support for this case if it ever comes up.
580       if (tensor->type == TensorType_FLOAT32 && output_type != tensor->type) {
581         TF_LITE_REPORT_ERROR(
582             error_reporter,
583             "Unsupported output type %s for output tensor '%s' of type %s.",
584             EnumNameTensorType(output_type), tensor->name.c_str(),
585             EnumNameTensorType(tensor->type));
586         return kTfLiteError;
587       }
588       const int32_t output_idx = SetOutputType(
589           model, subgraph, subgraph->outputs[i], output_type, activations_type);
590       if (output_idx < 0) {
591         continue;
592       }
593       if (signature_def != nullptr) {
594         for (const auto& output : signature_def->outputs) {
595           if (output->tensor_index == subgraph->outputs[i]) {
596             output->tensor_index = output_idx;
597             break;
598           }
599         }
600       }
601       subgraph->outputs[i] = output_idx;
602     }
603   }
604   return kTfLiteOk;
605 }
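
// Usage sketch (illustrative; `model` and `error_reporter` are assumed to be
// provided by the caller):
//
//   // Keep float32 at the model boundary: a Quantize op is prepended to each
//   // subgraph input and a Dequantize op is appended to each output, and any
//   // matching SignatureDef tensor indices are remapped accordingly.
//   TfLiteStatus status = SetInputAndOutputTypes(
//       model, /*input_type=*/TensorType_FLOAT32,
//       /*output_type=*/TensorType_FLOAT32,
//       /*activations_type=*/TensorType_INT8, error_reporter);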
606 
607 // Requantize a constant quantized tensor.
608 template <typename TensorDataType>
609 TfLiteStatus RequantizeConstant(
610     const std::vector<uint8_t>& buffer_data, const TensorT* tensor,
611     const std::unique_ptr<QuantizationParametersT>& new_quantization,
612     std::vector<uint8_t>& new_buffer_data) {
613   if (new_buffer_data.size() != buffer_data.size()) {
614     new_buffer_data.resize(buffer_data.size());
615   }
616   const auto& quantization = tensor->quantization;
617   const std::vector<float>& scales = quantization->scale;
618   if (scales.empty()) {
619     // No existing quantization, assumes that new quantization parameters
620     // are correct.
621     new_buffer_data.assign(buffer_data.begin(), buffer_data.end());
622     return kTfLiteOk;
623   }
624   const std::vector<int64_t>& zero_points = quantization->zero_point;
625   const int num_elements = buffer_data.size() / sizeof(TensorDataType);
626   std::vector<float> float_values(num_elements);
627   const TensorDataType* buffer_values =
628       reinterpret_cast<const TensorDataType*>(buffer_data.data());
629   // This logic is for per-channel quantization, but works for per-tensor.
630   const int kPerChannelMaxDim = 4;
631   const std::vector<int32_t>& tensor_shape = tensor->shape;
632   RuntimeShape unextended_tensor_dims(tensor_shape.size(), tensor_shape.data());
633   RuntimeShape tensor_dims =
634       RuntimeShape::ExtendedShape(kPerChannelMaxDim, unextended_tensor_dims);
635   const int channel_dim_index = quantization->quantized_dimension +
636                                 kPerChannelMaxDim -
637                                 unextended_tensor_dims.DimensionsCount();
638   int indices[kPerChannelMaxDim];
639   for (indices[0] = 0; indices[0] < tensor_dims.Dims(0); indices[0]++) {
640     for (indices[1] = 0; indices[1] < tensor_dims.Dims(1); indices[1]++) {
641       for (indices[2] = 0; indices[2] < tensor_dims.Dims(2); indices[2]++) {
642         for (indices[3] = 0; indices[3] < tensor_dims.Dims(3); indices[3]++) {
643           const float scale = scales.size() > 1
644                                   ? scales[indices[channel_dim_index]]
645                                   : scales[0];
646           const int64_t zp = zero_points.size() > 1
647                                  ? zero_points[indices[channel_dim_index]]
648                                  : zero_points[0];
649           const int index = Offset(tensor_dims, indices);
650           float_values[index] = scale * (buffer_values[index] - zp);
651         }
652       }
653     }
654   }
655 
656   // Only have to deal with per-tensor for new parameters.
657   if (tensor->type == TensorType_INT16) {
658     std::vector<int16_t> requant_int16 = utils::SymmetricQuantizeFloatsToInt16(
659         float_values.data(), float_values.size(), new_quantization->scale[0]);
660     uint8_t* uint8_buffer = reinterpret_cast<uint8_t*>(requant_int16.data());
661     new_buffer_data.assign(uint8_buffer, uint8_buffer + buffer_data.size());
662     return kTfLiteOk;
663   } else if (tensor->type == TensorType_INT8) {
664     const int32_t q_min = std::numeric_limits<int8_t>::min();
665     const int32_t q_max = std::numeric_limits<int8_t>::max();
666     const float scaling_factor = new_quantization->scale[0];
667     const int32_t zp = new_quantization->zero_point[0];
668     const auto& rescale = [&scaling_factor, &zp, &q_min,
669                            &q_max](float f) -> uint8_t {
670       const float scaling_factor_inv =
671           (scaling_factor == 0) ? 0 : 1.0 / scaling_factor;
672       int32_t q_i32 = TfLiteRound(f * scaling_factor_inv) + zp;
673       int8_t q = std::min(std::max(q_i32, q_min), q_max);
674       return *(reinterpret_cast<uint8_t*>(&q));
675     };
676     std::transform(float_values.begin(), float_values.end(),
677                    new_buffer_data.begin(), rescale);
678     return kTfLiteOk;
679   }
680   return kTfLiteError;
681 }
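
// Requantization math used above (per-tensor int8 case, illustrative numbers):
// each stored value q is first dequantized with the old parameters,
//   f = old_scale * (q - old_zero_point),
// and then re-quantized with the new ones,
//   q_new = clamp(round(f / new_scale) + new_zero_point, -128, 127).
// For instance, q = 10 with old_scale = 0.5, old_zero_point = 0,
// new_scale = 0.25 and new_zero_point = 2 gives f = 5.0 and q_new = 22.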
682 
683 // Apply constraints to ops if they have any.
684 // We have made the restriction that for int8 quantized concat, minimum, and
685 // maximum, the inputs and outputs must have the same scale and zero point.
686 // The other ones with constraints are handled in QuantizeWeightsAndInput.
687 TfLiteStatus ApplyConstraints(
688     ModelT* model, const std::unordered_set<string>& operator_names,
689     const std::unordered_set<string>& real_value_op_set,
690     TensorType activations_type, ErrorReporter* error_reporter) {
691   for (int subgraph_idx = 0; subgraph_idx < model->subgraphs.size();
692        subgraph_idx++) {
693     SubGraphT* subgraph = model->subgraphs.at(subgraph_idx).get();
694     // Iterate backward to avoid messing with index.
695     for (int op_idx = subgraph->operators.size() - 1; op_idx >= 0; op_idx--) {
696       OperatorT* op = subgraph->operators[op_idx].get();
697       if (op->outputs.empty()) {
698         continue;
699       }
700       const string operator_name = subgraph->tensors[op->outputs[0]]->name;
701       operator_property::OperatorProperty property =
702           GetOperatorProperty(operator_names, model, subgraph_idx, op_idx,
703                               operator_name, activations_type);
704       if (!property.quantizable ||
705           !IsRealValueOp(real_value_op_set, operator_name)) {
706         continue;
707       }
708       TensorT* output_tensor = subgraph->tensors[op->outputs[0]].get();
709       if (!property.arbitrary_inputs ||
710           !property.restrict_same_input_output_scale(output_tensor->type)) {
711         continue;
712       }
713       // If ApplyConstraints and requant is needed, use the min of min and max
714       // If a requant is needed to satisfy the constraint, take the min of mins
715       // and the max of maxes, i.e. reuse the output's scale and zero point.
716         TF_LITE_REPORT_ERROR(
717             error_reporter,
718             "Unable to get scale or zero point from the tensor at %d.",
719             op->outputs[0]);
720         return kTfLiteError;
721       }
722       const float output_scale = output_tensor->quantization->scale[0];
723       const float output_zp = output_tensor->quantization->zero_point[0];
724       for (size_t input_idx = 0; input_idx < op->inputs.size(); ++input_idx) {
725         TensorT* input_tensor = subgraph->tensors[op->inputs[input_idx]].get();
726         if (!utils::QuantizationParametersExist(input_tensor)) {
727           TF_LITE_REPORT_ERROR(
728               error_reporter,
729               "Unable to get scale or zero point from tensor at %d.",
730               op->inputs[input_idx]);
731           return kTfLiteError;
732         }
733         if (input_tensor->quantization->scale[0] == output_scale &&
734             input_tensor->quantization->zero_point[0] == output_zp) {
735           // This input does not need to be requantized.
736           continue;
737         }
738 
739         if (IsConstantWithOneUse(model, subgraph, op->inputs[input_idx])) {
740           auto quantization = std::make_unique<QuantizationParametersT>();
741           quantization->scale.push_back(output_scale);
742           quantization->zero_point.push_back(output_zp);
743           const std::vector<uint8_t>& buffer_data =
744               model->buffers[input_tensor->buffer]->data;
745           std::vector<uint8_t> new_buffer_data;
746           TfLiteStatus requant_status = kTfLiteError;
747           if (input_tensor->type == TensorType_INT8) {
748             requant_status = RequantizeConstant<int8_t>(
749                 buffer_data, input_tensor, quantization, new_buffer_data);
750           } else if (input_tensor->type == TensorType_INT16) {
751             requant_status = RequantizeConstant<int16_t>(
752                 buffer_data, input_tensor, quantization, new_buffer_data);
753           }
754           if (requant_status == kTfLiteOk) {
755             model->buffers[input_tensor->buffer]->data = new_buffer_data;
756             input_tensor->quantization = std::move(quantization);
757             continue;
758           } else {
759             quantization.reset();
760           }
761         }
762 
763         std::unique_ptr<TensorT> additional_tensor;
764         const string requant_tensor_name = input_tensor->name + "_requantized";
765         utils::MakeTensorWithQuantParam(
766             requant_tensor_name, input_tensor->shape,
767             input_tensor->shape_signature, activations_type, output_scale,
768             output_zp, &additional_tensor);
769         const int32_t additional_tensor_idx = subgraph->tensors.size();
770         subgraph->tensors.push_back(std::move(additional_tensor));
771 
772         // Add requant op before this input.
773         // There are better ways to handle this, which is to try to push the
774       // A better way to handle this would be to push the rescale upwards
775       // recursively, hope all upstream ops can absorb it, and only add a
776       // requant when there is no other way.
777         utils::MakeQuantizeOperator(model, &requant_op, op->inputs[input_idx],
778                                     additional_tensor_idx);
779         op->inputs[input_idx] = additional_tensor_idx;
780 
781         subgraph->operators.insert(subgraph->operators.begin() + op_idx,
782                                    std::move(requant_op));
783       }
784     }
785   }
786   return kTfLiteOk;
787 }
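
// Illustrative effect on an int8 CONCATENATION whose output has scale 0.7 and
// zero point 3: a non-matching activation input (say scale 0.5, zero point 0)
// gets a requantizing Quantize op inserted in front of it,
//
//   input --Quantize--> input_requantized (scale 0.7, zp 3) --> CONCATENATION
//
// while a single-use constant input is rewritten in place via
// RequantizeConstant, with no extra op added.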
788 
789 // In the case of int16 activations, there are two kernel implementations for
790 // the ADD/SUB operators. We set the builtin option pot_scale_int16 to false
791 // during quantization so that from then on only the general-case
792 // implementation is used.
793 void SetOperatorPropertyADDSUBOperator(ModelT* model,
794                                        const TensorType& activations_type) {
795   if (activations_type != TensorType_INT16) {
796     // This is needed only in case of int16 activations.
797     return;
798   }
799 
800   for (int subgraph_idx = 0, end = model->subgraphs.size(); subgraph_idx < end;
801        subgraph_idx++) {
802     SubGraphT* subgraph = model->subgraphs.at(subgraph_idx).get();
803     // Iterate backward to avoid messing with index.
804     for (int op_idx = subgraph->operators.size() - 1; op_idx >= 0; op_idx--) {
805       OperatorT* op = subgraph->operators[op_idx].get();
806       OperatorCodeT* op_code = model->operator_codes[op->opcode_index].get();
807       if (op_code && op_code->builtin_code == BuiltinOperator_ADD) {
808         {
809           auto* options = op->builtin_options.AsAddOptions();
810           if (options) {
811             options->pot_scale_int16 = false;
812           }
813         }
814       }
815       if (op_code && op_code->builtin_code == BuiltinOperator_SUB) {
816         {
817           auto* options = op->builtin_options.AsSubOptions();
818           if (options) {
819             options->pot_scale_int16 = false;
820           }
821         }
822       }
823     }
824   }
825 }
826 
827 std::vector<std::pair<int, operator_property::TensorProperty>> GetInputs(
828     const OperatorT* op, operator_property::OperatorProperty property) {
829   std::vector<std::pair<int, operator_property::TensorProperty>> inputs;
830   if (property.arbitrary_inputs || !property.quantizable) {
831     for (int i = 0; i < op->inputs.size(); ++i) {
832       inputs.push_back({i, {}});
833     }
834   } else {
835     inputs = property.inputs;
836   }
837   return inputs;
838 }
839 
840 std::vector<std::pair<int, operator_property::TensorProperty>> GetOutputs(
841     const OperatorT* op, operator_property::OperatorProperty property) {
842   std::vector<std::pair<int, operator_property::TensorProperty>> outputs;
843   if (property.arbitrary_outputs) {
844     for (int i = 0; i < op->outputs.size(); ++i) {
845       outputs.push_back({i, {}});
846     }
847   } else {
848     outputs = property.outputs;
849   }
850   return outputs;
851 }
852 
853 bool ShouldRestrictSameInputOutputScale(
854     operator_property::OperatorProperty property, TensorType tensor_type) {
855   // Ops with multiple inputs (e.g. concat, maximum, minimum) get restricted
856   // in ApplyConstraints.
857   return (!property.arbitrary_inputs &&
858           property.restrict_same_input_output_scale(tensor_type));
859 }
860 
861 bool IsSubgraphInput(SubGraphT* subgraph, int32_t index) {
862   for (const int32_t input_idx : subgraph->inputs) {
863     if (index == input_idx) {
864       return true;
865     }
866   }
867   return false;
868 }
869 
870 // Quantize the op input. Will increment op_idx if ops are added.
871 TfLiteStatus QuantizeOpInput(
872     ModelT* model, int32_t subgraph_idx, size_t* op_idx,
873     operator_property::OperatorProperty property,
874     const std::pair<int32_t, operator_property::TensorProperty>& input,
875     const TensorType& activations_type, ErrorReporter* error_reporter) {
876   int32_t input_idx = input.first;
877   operator_property::TensorProperty tensor_property = input.second;
878   SubGraphT* subgraph = model->subgraphs.at(subgraph_idx).get();
879   OperatorT* op = subgraph->operators[*op_idx].get();
880   const BuiltinOperator op_code =
881       GetBuiltinCode(model->operator_codes[op->opcode_index].get());
882   if (input_idx >= op->inputs.size()) {
883     TF_LITE_REPORT_ERROR(
884         error_reporter,
885         "Required input index %d is larger than the input length %d of op "
886         "%s at index %d in subgraph %d",
887         input_idx, op->inputs.size(), EnumNameBuiltinOperator(op_code), *op_idx,
888         subgraph_idx);
889     return kTfLiteError;
890   }
891   const int32_t tensor_idx = op->inputs[input_idx];
892   if (tensor_idx == -1) {
893     // Skip optional tensor.
894     return kTfLiteOk;
895   }
896   TensorT* tensor = subgraph->tensors[tensor_idx].get();
897   // Assumes that an already-quantized tensor is a weight quantized to 8 bit.
898   const bool is_input_quantized = utils::QuantizationParametersExist(tensor);
899   if (property.quantizable && !is_input_quantized) {
900     // The operation is quantizable, but the input isn't yet quantized.
901     if (utils::HasBuffer(model, subgraph, tensor_idx)) {
902       // TODO(suharshs): Look at consumers, throw error if one consumer is
903       // per-channel and one per-layer.
904       bool quantize_const_input = false;
905       if (activations_type == TensorType_INT16 &&
906           (property.restrict_same_input_output_scale(tensor->type) ||
907            property.quantize_input_as_activations)) {
908         quantize_const_input = true;
909       }
910       if (tensor_property.number_of_bits == 8 && !quantize_const_input) {
911         if (tensor_property.use_derived_scale) {
912           // Currently 8bit tensors in input do not accept derived scale.
913           return kTfLiteError;
914         }
915         if (utils::QuantizeWeight(model, tensor, tensor_property.per_axis,
916                                   tensor_property.per_axis_index,
917                                   error_reporter) != kTfLiteOk) {
918           TF_LITE_REPORT_ERROR(
919               error_reporter,
920               "Unable to quantize buffer or min/max value for input %d "
921               "in op %s in subgraph %d, node: %d",
922               input_idx, EnumNameBuiltinOperator(op_code), subgraph_idx,
923               *op_idx);
924           return kTfLiteError;
925         }
926       } else if (tensor_property.number_of_bits == 16 || quantize_const_input) {
927         if (tensor_property.use_derived_scale) {
928           // Currently 16bit tensors in input do not accept derived scale.
929           return kTfLiteError;
930         }
931         TensorT* tensor = subgraph->tensors[tensor_idx].get();
932         int total_size = 1;
933         for (int i = 0; i < tensor->shape.size(); ++i) {
934           total_size *= tensor->shape[i];
935         }
936         BufferT* buffer = model->buffers[tensor->buffer].get();
937         float* float_data = reinterpret_cast<float*>(buffer->data.data());
938         auto minmax = std::minmax_element(float_data, float_data + total_size);
939         const float min = *minmax.first;
940         const float max = *minmax.second;
941         const float range = std::max(std::abs(min), std::abs(max));
942         // The narrow range quantized value for int16.
943         const float quantize_range = 32767.0;
944         const float scale = range / quantize_range;
945         return utils::SymmetricQuantizeFloatsToInt16(model, tensor, scale,
946                                                      error_reporter);
947       } else if (tensor_property.number_of_bits == 32) {
948         if (!tensor_property.use_derived_scale) {
949           // Currently 32 bit tensors in input only accept derived scale.
950           return kTfLiteError;
951         }
952         TensorT* tensor = subgraph->tensors[tensor_idx].get();
953         const float scale = utils::GetEffectiveScale(
954             model, subgraph, *op_idx,
955             tensor_property.derived_scale.input_tensors,
956             tensor_property.derived_scale.intermediate_tensors,
957             tensor_property.derived_scale.factors);
958         return utils::SymmetricPerLayerBiasQuantize<std::int32_t>(
959             model, tensor, scale, error_reporter);
960 
961       } else if (tensor_property.number_of_bits == 10) {
962         // When the number of bits is 10 (instead of 16), quantize the tensor to
963         // [-512, 512], instead of [-32767, 32767].
964         TensorT* tensor = subgraph->tensors[tensor_idx].get();
965         int total_size = 1;
966         for (int i = 0; i < tensor->shape.size(); ++i) {
967           total_size *= tensor->shape[i];
968         }
969         BufferT* buffer = model->buffers[tensor->buffer].get();
970         float* buffer_data = reinterpret_cast<float*>(buffer->data.data());
971         auto minmax =
972             std::minmax_element(buffer_data, buffer_data + total_size);
973         const float range =
974             std::max(std::abs(*minmax.first), std::abs(*minmax.second));
975         const float quantized_range = 512.0;
976         const float scale = range / quantized_range;
977         return utils::SymmetricQuantizeFloatsToInt16(model, tensor, scale,
978                                                      error_reporter);
979       } else {
980         // Currently supports only 8, 16, 32, 10 bits.
981         TF_LITE_REPORT_ERROR(
982             error_reporter,
983             "Unable to quantize buffer or min/max value for input %d "
984             "in op %s in subgraph %d, node: %d",
985             input_idx, EnumNameBuiltinOperator(op_code), subgraph_idx, *op_idx);
986         return kTfLiteError;
987       }
988     } else if (utils::HasMinMax(tensor)) {
989       if (IsSubgraphInput(subgraph, tensor_idx) ||
990           tensor_property.state_tensor) {
991         if (tensor_property.number_of_bits == 8) {
992           if (tensor_property.use_derived_scale) {
993             // Currently 8bit tensors in input do not accept derived scale.
994             return kTfLiteError;
995           }
996           TF_LITE_ENSURE_STATUS(utils::QuantizeActivation(
997               tensor, activations_type, error_reporter));
998         } else if (tensor_property.number_of_bits == 16) {
999           TensorT* tensor = subgraph->tensors[tensor_idx].get();
1000           float quantized_range = 32767.0;
1001           float range = std::max(std::abs(tensor->quantization->min[0]),
1002                                  std::abs(tensor->quantization->max[0]));
1003           if (tensor_property.extend_to_power_of_two) {
1004             const int power_of_two_scale = utils::GetPowerOfTwoScale(
1005                 tensor->quantization->min[0], tensor->quantization->max[0]);
1006             range = std::pow(2, power_of_two_scale);  // NOLINT
1007             quantized_range = 32768.0;
1008           }
1009           const float scale = range / quantized_range;
1010           utils::QuantizeActivationToInt16(tensor, scale);
1011         }
1012       } else {
1013         // If the tensor is not a model input, we need to add a Quantize
1014         // operation since the preceding op may require a float output.
1015         std::string type_string =
1016             activations_type == TensorType_INT16 ? "int16" : "int8";
1017         std::unique_ptr<TensorT> op_output;
1018         utils::MakeTensor(tensor->name + "_" + type_string, tensor->shape,
1019                           tensor->shape_signature, activations_type,
1020                           &op_output);
1021         op_output->quantization = std::make_unique<QuantizationParametersT>();
1022         op_output->quantization->min.push_back(tensor->quantization->min[0]);
1023         op_output->quantization->max.push_back(tensor->quantization->max[0]);
1024         TF_LITE_ENSURE_STATUS(utils::QuantizeActivation(
1025             op_output.get(), activations_type, error_reporter));
1026         const int32_t quant_op_output_idx = subgraph->tensors.size();
1027         subgraph->tensors.push_back(std::move(op_output));
1028         std::unique_ptr<OperatorT> quant_op;
1029         utils::MakeQuantizeOperator(model, &quant_op, tensor_idx,
1030                                     quant_op_output_idx);
1031         subgraph->operators.insert(subgraph->operators.begin() + *op_idx,
1032                                    std::move(quant_op));
1033         op->inputs[input_idx] = quant_op_output_idx;
1034         *op_idx += 1;
1035       }
1036     } else {
1037       TF_LITE_REPORT_ERROR(error_reporter,
1038                            "Unable to find buffer or min/max value for input "
1039                            "%d in %s in subgraph %d, node: %d",
1040                            input_idx, EnumNameBuiltinOperator(op_code),
1041                            subgraph_idx, *op_idx);
1042       return kTfLiteError;
1043     }
1044   } else if (!property.quantizable && is_input_quantized) {
1045     // If the tensor is quantized, we have to add a Dequantize op after
1046     // since this op is not quantizable.
1047     std::unique_ptr<TensorT> op_output;
1048     utils::MakeTensor(tensor->name + "_float", tensor->shape,
1049                       tensor->shape_signature, TensorType_FLOAT32, &op_output);
1050     const int32_t dequant_op_output_idx = subgraph->tensors.size();
1051     subgraph->tensors.push_back(std::move(op_output));
1052     std::unique_ptr<OperatorT> dequant_op;
1053     utils::MakeDequantizeOperator(model, &dequant_op, tensor_idx,
1054                                   dequant_op_output_idx);
1055     subgraph->operators.insert(subgraph->operators.begin() + *op_idx,
1056                                std::move(dequant_op));
1057     op->inputs[input_idx] = dequant_op_output_idx;
1058     *op_idx += 1;
1059   }
1060   return kTfLiteOk;
1061 }
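
// Scale selection used above for 16-bit and "10-bit" constant inputs, with
// illustrative numbers: for a buffer whose values span [-6.0, 3.0],
// range = max(|min|, |max|) = 6.0, so
//   16 bits: scale = 6.0 / 32767.0  (values mapped into [-32767, 32767])
//   10 bits: scale = 6.0 / 512.0    (values mapped into [-512, 512])
// Both paths store the result as int16 via SymmetricQuantizeFloatsToInt16.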
1062 
1063 // Quantize the op output.
1064 TfLiteStatus QuantizeOpOutput(
1065     ModelT* model, int32_t subgraph_idx, int32_t op_idx,
1066     operator_property::OperatorProperty property,
1067     const std::pair<int32_t, operator_property::TensorProperty>& output,
1068     TensorType activations_type, ErrorReporter* error_reporter) {
1069   int32_t output_idx = output.first;
1070   operator_property::TensorProperty tensor_property = output.second;
1071   // If the operator is not quantizable, we don't need to do anything for the
1072   // output.
1073   if (!property.quantizable) {
1074     return kTfLiteOk;
1075   }
1076   SubGraphT* subgraph = model->subgraphs.at(subgraph_idx).get();
1077   OperatorT* op = subgraph->operators[op_idx].get();
1078   const BuiltinOperator op_code =
1079       GetBuiltinCode(model->operator_codes[op->opcode_index].get());
1080   if (output_idx >= op->outputs.size()) {
1081     TF_LITE_REPORT_ERROR(
1082         error_reporter,
1083         "Required output index %d is larger than the output length %d of "
1084         "op %s at index %d in subgraph %d",
1085         output_idx, op->outputs.size(), EnumNameBuiltinOperator(op_code),
1086         op_idx, subgraph_idx);
1087     return kTfLiteError;
1088   }
1089 
1090   TensorT* output_tensor = subgraph->tensors[op->outputs[output_idx]].get();
1091   if (utils::QuantizationParametersExist(output_tensor)) {
1092     // Skip output if it has been quantized.
1093     return kTfLiteOk;
1094   }
1095   if (ShouldRestrictSameInputOutputScale(property, output_tensor->type)) {
1096     // Copy quantization parameters. For average pool, max pool, etc., the
1097     // input and output min/max can differ, but we want them to be the same.
1098     // Get scale and zero point of input.
1099     if (property.inputs[0].first >= op->inputs.size()) {
1100       TF_LITE_REPORT_ERROR(
1101           error_reporter,
1102           "Required input index %d is larger than the input length %d of "
1103           "op %s at index %d in subgraph %d",
1104           property.inputs[0].first, op->inputs.size(),
1105           EnumNameBuiltinOperator(op_code), op_idx, subgraph_idx);
1106       return kTfLiteError;
1107     }
1108     const int input_tensor_idx = op->inputs[property.inputs[0].first];
1109     TensorT* input_tensor = subgraph->tensors[input_tensor_idx].get();
1110     if (input_tensor->quantization->scale.size() != 1 ||
1111         input_tensor->quantization->zero_point.size() != 1) {
1112       TF_LITE_REPORT_ERROR(error_reporter,
1113                            "Invalid quantization params for op %s at index %d "
1114                            "in subgraph %d",
1115                            EnumNameBuiltinOperator(op_code), op_idx,
1116                            subgraph_idx);
1117       return kTfLiteError;
1118     }
1119 
1120     const float input_scale = input_tensor->quantization->scale[0];
1121     const int32_t input_zero_point = input_tensor->quantization->zero_point[0];
1122 
1123     // Apply to output.
1124     output_tensor->quantization = std::make_unique<QuantizationParametersT>();
1125     output_tensor->quantization->scale.push_back(input_scale);
1126     output_tensor->quantization->zero_point.push_back(input_zero_point);
1127     if (!input_tensor->quantization->min.empty()) {
1128       const float min = input_tensor->quantization->min[0];
1129       output_tensor->quantization->min = {min};
1130     }
1131     if (!input_tensor->quantization->max.empty()) {
1132       const float max = input_tensor->quantization->max[0];
1133       output_tensor->quantization->max = {max};
1134     }
1135     output_tensor->type = activations_type;
1136   } else if (tensor_property.restriction) {
1137     const auto scale_and_zp = activations_type == TensorType_INT16
1138                                   ? tensor_property.restricted_value_int16
1139                                   : tensor_property.restricted_value_int8;
1140 
1141     // Apply to output.
1142     output_tensor->quantization = std::make_unique<QuantizationParametersT>();
1143     output_tensor->quantization->scale.push_back(scale_and_zp.first);
1144     output_tensor->quantization->zero_point.push_back(scale_and_zp.second);
1145     output_tensor->type = activations_type;
1146   } else {
1147     // Process regular output that doesn't have any restrictions.
1148     if (utils::HasMinMax(output_tensor)) {
1149       utils::QuantizeActivation(output_tensor, activations_type,
1150                                 error_reporter);
1151     } else {
1152       TF_LITE_REPORT_ERROR(
1153           error_reporter,
1154           "Unable to find min/max value for output %d in %s in "
1155           "subgraph %d, node: %d",
1156           output_idx, EnumNameBuiltinOperator(op_code), subgraph_idx, op_idx);
1157       return kTfLiteError;
1158     }
1159   }
1160   return kTfLiteOk;
1161 }
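
// A minimal sketch (not part of the production quantization path) of the
// arithmetic behind deriving an asymmetric int8 (scale, zero_point) pair from
// a recorded (min, max) range, which is roughly what utils::QuantizeActivation
// and utils::GetAsymmetricQuantizationParams compute for the unrestricted
// output case above. The struct and helper names are placeholders, and the
// degenerate-range fallback is an assumption for illustration only.
struct ExampleQuantParams {
  float scale;
  int32_t zero_point;
};

ExampleQuantParams ExampleAsymmetricInt8Params(float min, float max) {
  // The representable range must contain zero so that zero quantizes exactly.
  min = std::min(min, 0.0f);
  max = std::max(max, 0.0f);
  if (max == min) {
    // Degenerate range; fall back to a unit scale (assumption for the sketch).
    return {1.0f, 0};
  }
  const float qmin = -128.0f;
  const float qmax = 127.0f;
  const float scale = (max - min) / (qmax - qmin);
  // Nudge the real-valued zero point to the nearest representable integer.
  int32_t zero_point = static_cast<int32_t>(TfLiteRound(qmin - min / scale));
  zero_point = std::min<int32_t>(127, std::max<int32_t>(-128, zero_point));
  return {scale, zero_point};
}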
1162 
1163 TfLiteStatus QuantizeIntermediateTensors(ModelT* model,
1164                                          TensorType activations_type,
1165                                          ErrorReporter* error_reporter) {
1166   for (size_t subgraph_idx = 0; subgraph_idx < model->subgraphs.size();
1167        subgraph_idx++) {
1168     SubGraphT* subgraph = model->subgraphs.at(subgraph_idx).get();
1169     for (size_t op_idx = 0; op_idx < subgraph->operators.size(); op_idx++) {
1170       operator_property::OperatorProperty property =
1171           operator_property::GetOperatorProperty(model, subgraph_idx, op_idx);
1172       if (!property.intermediates.empty()) {
1173         OperatorT* op = subgraph->operators[op_idx].get();
1174         const BuiltinOperator op_code =
1175             GetBuiltinCode(model->operator_codes[op->opcode_index].get());
1176         for (const std::pair<int, operator_property::TensorProperty>& input :
1177              property.intermediates) {
1178           const int index_local = input.first;
1179           const int index_global = op->intermediates[index_local];
1180           if (index_global == -1) {
1181             // Skip optional tensor.
1182             continue;
1183           }
1184           if (input.second.number_of_bits == 8 &&
1185               input.second.symmetric == false) {
1186             TensorT* tensor = subgraph->tensors[index_global].get();
1187             if (tensor->quantization == nullptr) {
1188               continue;
1189             }
1190             if (utils::HasMinMax(tensor)) {
1191               utils::QuantizeActivation(tensor, activations_type,
1192                                         error_reporter);
1193             } else {
1194               TF_LITE_REPORT_ERROR(error_reporter,
1195                                    "Unable to find min/max value for "
1196                                    "intermediate tensor %d in %s in "
1197                                    "subgraph %d, node: %d",
1198                                    index_local,
1199                                    EnumNameBuiltinOperator(op_code),
1200                                    subgraph_idx, op_idx);
1201               return kTfLiteError;
1202             }
1203           } else if (input.second.number_of_bits == 16 &&
1204                      input.second.symmetric == true) {
1205             TensorT* tensor = subgraph->tensors[index_global].get();
1206             if (tensor->quantization == nullptr) {
1207               continue;
1208             }
1209             const float min = tensor->quantization->min[0];
1210             const float max = tensor->quantization->max[0];
1211             const float range = std::max(std::abs(min), std::abs(max));
1212             if (range < 1e-8) {
1213               return kTfLiteError;
1214             }
1215 
1216             // Get scale and zero point.
1217             const float quantized_range = 32767.0;
1218             const float scale = range / quantized_range;
1219             utils::QuantizeActivationToInt16(tensor, scale);
1220           } else {
1221             return kTfLiteError;
1222           }
1223         }
1224       }
1225     }
1226   }
1227   return kTfLiteOk;
1228 }
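
// A minimal sketch (not part of the production quantization path) isolating
// the symmetric 16-bit rule used for intermediate tensors above: the zero
// point is fixed at 0 and the scale maps the larger of |min| and |max| onto
// 32767. The helper name is a placeholder.
TfLiteStatus ExampleSymmetricInt16Scale(float min, float max, float* scale) {
  const float range = std::max(std::abs(min), std::abs(max));
  if (range < 1e-8) {
    // Mirrors the guard above: a (near-)zero range cannot be quantized.
    return kTfLiteError;
  }
  *scale = range / 32767.0f;
  return kTfLiteOk;
}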
1229 
1230 // Quantize tensors that have shared range. For example, in LSTM, the output
1231 // tensor and input state tensor should share the same range because they are
1232 // using the same scale and zero point.
1233 // We have to model this explicitly because the output is modeled as an extra
1234 // tensor in LSTM. In the calibrator, state tensors are logged both before and
1235 // after the inference, so the range is fully captured. The output, although it
1236 // is identical to the activation, is not a state tensor, so the input value
1237 // (range) of the very first inference is not captured.
1238 TfLiteStatus QuantizeSharedRange(ModelT* model, ErrorReporter* error_reporter) {
1239   for (size_t subgraph_idx = 0; subgraph_idx < model->subgraphs.size();
1240        subgraph_idx++) {
1241     SubGraphT* subgraph = model->subgraphs.at(subgraph_idx).get();
1242     for (size_t op_idx = 0; op_idx < subgraph->operators.size(); op_idx++) {
1243       operator_property::OperatorProperty property =
1244           operator_property::GetOperatorProperty(model, subgraph_idx, op_idx);
1245       if (!property.intermediates.empty()) {
1246         OperatorT* op = subgraph->operators[op_idx].get();
1247         for (const std::vector<int>& input : property.restrict_scale) {
1248           if (input.empty()) {
1249             continue;
1250           }
1251           // Currently only support two values. The first one for input and
1252           // the second one for output.
1253           if (input.size() != 2) {
1254             return kTfLiteError;
1255           }
1256           const int index_1 = input[0];
1257           const int index_2 = input[1];
1258           TensorT* tensor_1 = subgraph->tensors[op->inputs[index_1]].get();
1259           TensorT* tensor_2 = subgraph->tensors[op->outputs[index_2]].get();
1260           const float min_of_min = std::min(tensor_1->quantization->min[0],
1261                                             tensor_2->quantization->min[0]);
1262           const float max_of_max = std::max(tensor_1->quantization->max[0],
1263                                             tensor_2->quantization->max[0]);
1264           if (min_of_min == 0.0 && max_of_max == 0.0) {
1265             return kTfLiteError;
1266           }
1267 
1268           // Asymmetric quantization to 8 bit.
1269           auto quantization_params =
1270               std::make_unique<QuantizationParametersT>();
1271           utils::GetAsymmetricQuantizationParams(
1272               min_of_min, max_of_max, -128, 127, quantization_params.get());
1273 
1274           // Populate both tensors with the same parameters.
1275           const float scale = quantization_params->scale[0];
1276           const int32 zero_point = quantization_params->zero_point[0];
1277           for (TensorT* tensor : {tensor_1, tensor_2}) {
1278             tensor->quantization = std::make_unique<QuantizationParametersT>();
1279             tensor->quantization->scale.push_back(scale);
1280             tensor->quantization->zero_point.push_back(zero_point);
1281             tensor->type = TensorType_INT8;
1282           }
1283         }
1284       }
1285     }
1286   }
1287   return kTfLiteOk;
1288 }
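
// A minimal sketch (not part of the production quantization path) of the
// shared-range idea in isolation: the union of two recorded ranges is
// quantized once, and both tensors then carry identical int8 parameters, as
// QuantizeSharedRange does for the LSTM output and input-state tensors. The
// helper name is a placeholder; it assumes both tensors have recorded min/max.
void ExampleApplySharedRange(TensorT* tensor_1, TensorT* tensor_2) {
  const float min_of_min = std::min(tensor_1->quantization->min[0],
                                    tensor_2->quantization->min[0]);
  const float max_of_max = std::max(tensor_1->quantization->max[0],
                                    tensor_2->quantization->max[0]);
  auto params = std::make_unique<QuantizationParametersT>();
  utils::GetAsymmetricQuantizationParams(min_of_min, max_of_max, -128, 127,
                                         params.get());
  for (TensorT* tensor : {tensor_1, tensor_2}) {
    tensor->quantization = std::make_unique<QuantizationParametersT>();
    tensor->quantization->scale.push_back(params->scale[0]);
    tensor->quantization->zero_point.push_back(params->zero_point[0]);
    tensor->type = TensorType_INT8;
  }
}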
1289 
1290 // Quantize a constant based on min/max quantization parameters for
1291 // resource assignments during initialization. Constant buffers should
1292 // have the same quantization parameters as assignments.
1293 TfLiteStatus QuantizeConstantVariable(ModelT* model,
1294                                       const TensorType& activations_type,
1295                                       TensorT* var_tensor,
1296                                       ErrorReporter* error_reporter) {
1297   if (activations_type == TensorType_INT16) {
1298     const float min = var_tensor->quantization->min[0];
1299     const float max = var_tensor->quantization->max[0];
1300     const float range = std::max(std::abs(min), std::abs(max));
1301     const float quantize_range = 32767.0;
1302     const float scale = range / quantize_range;
1303     return utils::SymmetricQuantizeFloatsToInt16(model, var_tensor, scale,
1304                                                  error_reporter);
1305   } else if (activations_type == TensorType_INT8) {
1306     TF_LITE_ENSURE_STATUS(utils::QuantizeActivation(
1307         var_tensor, activations_type, error_reporter));
1308     QuantizationParametersT* quantization_params =
1309         var_tensor->quantization.get();
1310     const float scaling_factor = quantization_params->scale[0];
1311     const int zero_point = quantization_params->zero_point[0];
1312     const BufferT* buffer = model->buffers[var_tensor->buffer].get();
1313     const float* float_data =
1314         reinterpret_cast<const float*>(buffer->data.data());
1315     uint64_t num_elements;
1316     TF_LITE_ENSURE_STATUS(utils::NumElements(*var_tensor, &num_elements));
1317     const float scaling_factor_inv =
1318         (scaling_factor == 0) ? 0 : 1.0 / scaling_factor;
1319     std::vector<int8_t> quantized(num_elements);
1320     const int32_t kMinScale = std::numeric_limits<int8_t>::min();
1321     const int32_t kMaxScale = std::numeric_limits<int8_t>::max();
1322     for (size_t i = 0; i < num_elements; i++) {
1323       const int32_t quantized_value = static_cast<int32_t>(
1324           TfLiteRound(float_data[i] * scaling_factor_inv) + zero_point);
1325       quantized[i] = std::min(kMaxScale, std::max(kMinScale, quantized_value));
1326     }
1327     uint8_t* uint8_buffer = reinterpret_cast<uint8_t*>(quantized.data());
1328     const size_t buffer_size = num_elements * sizeof(int8_t);
1329     model->buffers[var_tensor->buffer]->data.assign(uint8_buffer,
1330                                                     uint8_buffer + buffer_size);
1331     return kTfLiteOk;
1332   }
1333   return kTfLiteError;
1334 }
1335 
1336 using TensorResourceMap = std::map<std::pair<int, int>, std::string>;
1337 using ResourceMinMaxMap = std::map<std::string, std::pair<float, float>>;
1338 // Find min of mins, max of maxes for each variable read or assignment.
1339 void PopulateResourceMinMaxMap(ModelT* model,
1340                                TensorResourceMap& tensor_resource_map,
1341                                ResourceMinMaxMap& resource_min_max_map) {
1342   for (size_t subgraph_idx = 0; subgraph_idx < model->subgraphs.size();
1343        subgraph_idx++) {
1344     SubGraphT* subgraph = model->subgraphs.at(subgraph_idx).get();
1345     for (size_t op_idx = 0; op_idx < subgraph->operators.size(); op_idx++) {
1346       OperatorT* op = subgraph->operators[op_idx].get();
1347       const BuiltinOperator op_code =
1348           GetBuiltinCode(model->operator_codes[op->opcode_index].get());
1349       if (op_code == BuiltinOperator_VAR_HANDLE) {
1350         const std::string& name =
1351             op->builtin_options.AsVarHandleOptions()->shared_name;
1352         resource_min_max_map.insert({name, {0.0, 0.0}});
1353         tensor_resource_map.insert({{subgraph_idx, op->outputs[0]}, name});
1354       }
1355       if ((op_code == BuiltinOperator_ASSIGN_VARIABLE) ||
1356           (op_code == BuiltinOperator_READ_VARIABLE)) {
1357         if (tensor_resource_map.find({subgraph_idx, op->inputs[0]}) ==
1358             tensor_resource_map.end()) {
1359           continue;
1360         }
1361         const std::string& name =
1362             tensor_resource_map[{subgraph_idx, op->inputs[0]}];
1363         TensorT* var_tensor;
1364         if (op_code == BuiltinOperator_ASSIGN_VARIABLE) {
1365           var_tensor = subgraph->tensors[op->inputs[1]].get();
1366         } else if (op_code == BuiltinOperator_READ_VARIABLE) {
1367           var_tensor = subgraph->tensors[op->outputs[0]].get();
1368         } else {
1369           continue;
1370         }
1371         if (!var_tensor->quantization ||
1372             var_tensor->quantization->min.empty() ||
1373             var_tensor->quantization->max.empty()) {
1374           continue;
1375         }
1376         // Resources are quantized per tensor.
1377         const float current_min = var_tensor->quantization->min[0];
1378         const float current_max = var_tensor->quantization->max[0];
1379         auto inserted =
1380             resource_min_max_map.insert({name, {current_min, current_max}});
1381         if (!inserted.second) {
1382           resource_min_max_map[name] = {
1383               std::min(inserted.first->second.first, current_min),
1384               std::max(inserted.first->second.second, current_max)};
1385         }
1386       }
1387     }
1388   }
1389 }
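
// A minimal sketch (not part of the production quantization path) of the
// insert-or-merge idiom used above: std::map::insert only succeeds for a new
// resource name, so when it fails the existing entry is widened to cover the
// newly observed (min, max). The helper name is a placeholder.
void ExampleMergeResourceRange(ResourceMinMaxMap& resource_min_max_map,
                               const std::string& name, float min, float max) {
  auto inserted = resource_min_max_map.insert({name, {min, max}});
  if (!inserted.second) {
    inserted.first->second.first = std::min(inserted.first->second.first, min);
    inserted.first->second.second =
        std::max(inserted.first->second.second, max);
  }
}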
1390 
1391 // Quantize resource variables. Each resource read and assign should have
1392 // identical quantization parameters.
1393 TfLiteStatus QuantizeResources(ModelT* model,
1394                                const TensorType& activations_type,
1395                                ErrorReporter* error_reporter) {
1396   // The shared name is only stored in the VAR_HANDLE operator, so use the
1397   // resource name map to map tensors to resource names.
1398   TensorResourceMap tensor_resource_map;
1399   ResourceMinMaxMap resource_min_max_map;
1400   PopulateResourceMinMaxMap(model, tensor_resource_map, resource_min_max_map);
1401   if (resource_min_max_map.empty()) {
1402     // No resources found, so this is OK.
1403     return kTfLiteOk;
1404   }
1405   // Update quantization parameters.
1406   for (size_t subgraph_idx = 0; subgraph_idx < model->subgraphs.size();
1407        subgraph_idx++) {
1408     SubGraphT* subgraph = model->subgraphs.at(subgraph_idx).get();
1409     for (size_t op_idx = 0; op_idx < subgraph->operators.size(); op_idx++) {
1410       OperatorT* op = subgraph->operators[op_idx].get();
1411       const BuiltinOperator op_code =
1412           GetBuiltinCode(model->operator_codes[op->opcode_index].get());
1413       if (op_code == BuiltinOperator_ASSIGN_VARIABLE ||
1414           op_code == BuiltinOperator_READ_VARIABLE) {
1415         if (tensor_resource_map.find({subgraph_idx, op->inputs[0]}) ==
1416             tensor_resource_map.end()) {
1417           continue;
1418         }
1419         const std::string& name =
1420             tensor_resource_map[{subgraph_idx, op->inputs[0]}];
1421         TensorT* var_tensor = nullptr;
1422         bool is_constant_assign = false;
1423         if (op_code == BuiltinOperator_ASSIGN_VARIABLE) {
1424           var_tensor = subgraph->tensors[op->inputs[1]].get();
1425           is_constant_assign = utils::HasBuffer(model, subgraph, op->inputs[1]);
1426         } else if (op_code == BuiltinOperator_READ_VARIABLE) {
1427           var_tensor = subgraph->tensors[op->outputs[0]].get();
1428         } else {
1429           continue;
1430         }
1431         if (resource_min_max_map.find(name) == resource_min_max_map.end()) {
1432           continue;
1433         }
1434         if (!var_tensor->quantization) {
1435           var_tensor->quantization =
1436               std::make_unique<QuantizationParametersT>();
1437           var_tensor->quantization->min.push_back(
1438               resource_min_max_map[name].first);
1439           var_tensor->quantization->max.push_back(
1440               resource_min_max_map[name].second);
1441         } else {
1442           var_tensor->quantization->min[0] = resource_min_max_map[name].first;
1443           var_tensor->quantization->max[0] = resource_min_max_map[name].second;
1444         }
1445         if (!is_constant_assign) {
1446           continue;
1447         }
1448         if (QuantizeConstantVariable(model, activations_type, var_tensor,
1449                                      error_reporter) != kTfLiteOk) {
1450           TF_LITE_REPORT_ERROR(
1451               error_reporter,
1452               "Unable to quantize buffer or min/max value for assignment "
1453               "in op %s in subgraph %d, node: %d",
1454               EnumNameBuiltinOperator(op_code), subgraph_idx, op_idx);
1455           return kTfLiteError;
1456         }
1457       }
1458     }
1459   }
1460   return kTfLiteOk;
1461 }
1462 
1463 // Quantize inputs and weights.
1464 // Because of ops such as LSTM, this still needs to be done per op rather than per weight.
1465 TfLiteStatus QuantizeWeightsInputOutput(
1466     ModelT* model, bool allow_float,
1467     const std::unordered_set<string>& operator_names,
1468     const std::unordered_set<string>& real_value_op_set,
1469     const TensorType& activations_type, bool disable_per_channel,
1470     ErrorReporter* error_reporter) {
1471   // Flag to track unsupported ops.
1472   bool quantization_not_supported = false;
1473 
1474   // Loop over the graph and quantize ops.
1475   for (size_t subgraph_idx = 0; subgraph_idx < model->subgraphs.size();
1476        subgraph_idx++) {
1477     SubGraphT* subgraph = model->subgraphs.at(subgraph_idx).get();
1478     for (size_t op_idx = 0; op_idx < subgraph->operators.size(); op_idx++) {
1479       OperatorT* op = subgraph->operators[op_idx].get();
1480       const BuiltinOperator op_code =
1481           GetBuiltinCode(model->operator_codes[op->opcode_index].get());
1482       if (op->outputs.empty() && op_code != BuiltinOperator_ASSIGN_VARIABLE) {
1483         continue;
1484       }
1485       const string operator_name = op_code != BuiltinOperator_ASSIGN_VARIABLE
1486                                        ? subgraph->tensors[op->outputs[0]]->name
1487                                        : subgraph->tensors[op->inputs[0]]->name;
1488       operator_property::OperatorProperty property = GetOperatorProperty(
1489           operator_names, model, subgraph_idx, op_idx, operator_name,
1490           activations_type, disable_per_channel);
1491       if (!IsRealValueOp(real_value_op_set, operator_name)) {
1492         continue;
1493       }
1494 
1495       if (activations_type == TensorType_INT16 && !property.quantizable &&
1496           !allow_float) {
1497         TF_LITE_REPORT_ERROR(
1498             error_reporter,
1499             "Quantization to 16x8-bit not yet supported for op: '%s'.\n",
1500             EnumNameBuiltinOperator(op_code));
1501         quantization_not_supported = true;
1502       } else if (!property.quantizable && !allow_float) {
1503         if (op_code == BuiltinOperator_DEQUANTIZE &&
1504             std::find(subgraph->outputs.begin(), subgraph->outputs.end(),
1505                       op->outputs[0]) != subgraph->outputs.end()) {
1506           continue;
1507         }
1508         TF_LITE_REPORT_ERROR(error_reporter,
1509                              "Quantization not yet supported for op: '%s'.\n",
1510                              EnumNameBuiltinOperator(op_code));
1511         quantization_not_supported = true;
1512       }
1513 
1514       // Quantize operator inputs/weights.
1515       for (const std::pair<int, operator_property::TensorProperty>& input :
1516            GetInputs(op, property)) {
1517         TF_LITE_ENSURE_STATUS(QuantizeOpInput(model, subgraph_idx, &op_idx,
1518                                               property, input, activations_type,
1519                                               error_reporter));
1520       }
1521 
1522       // Quantize operator outputs.
1523       for (const std::pair<int, operator_property::TensorProperty>& output :
1524            GetOutputs(op, property)) {
1525         TF_LITE_ENSURE_STATUS(
1526             QuantizeOpOutput(model, subgraph_idx, op_idx, property, output,
1527                              activations_type, error_reporter));
1528       }
1529     }
1530   }
1531 
1532   // Return an error if any unsupported op was encountered.
1533   if (quantization_not_supported) {
1534     return kTfLiteError;
1535   }
1536   return kTfLiteOk;
1537 }
1538 
1539 // Quantize bias.
1540 TfLiteStatus QuantizeBiases(ModelT* model,
1541                             const std::unordered_set<string>& operator_names,
1542                             const std::unordered_set<string>& real_value_op_set,
1543                             const TensorType& activations_type,
1544                             const TensorType& bias_type,
1545                             bool disable_per_channel,
1546                             ErrorReporter* error_reporter) {
1547   for (size_t subgraph_idx = 0; subgraph_idx < model->subgraphs.size();
1548        subgraph_idx++) {
1549     SubGraphT* subgraph = model->subgraphs.at(subgraph_idx).get();
1550     for (size_t op_idx = 0; op_idx < subgraph->operators.size(); op_idx++) {
1551       OperatorT* op = subgraph->operators[op_idx].get();
1552       const BuiltinOperator op_code =
1553           GetBuiltinCode(model->operator_codes[op->opcode_index].get());
1554       if (op->outputs.empty()) {
1555         continue;
1556       }
1557       const string operator_name = subgraph->tensors[op->outputs[0]]->name;
1558       operator_property::OperatorProperty property = GetOperatorProperty(
1559           operator_names, model, subgraph_idx, op_idx, operator_name,
1560           activations_type, disable_per_channel);
1561       if (!property.quantizable ||
1562           !IsRealValueOp(real_value_op_set, operator_name)) {
1563         continue;
1564       }
1565       for (const int bias_idx : property.biases) {
1566         if (bias_idx >= op->inputs.size() ||
1567             op->inputs[bias_idx] == kTfLiteOptionalTensor) {
1568           continue;
1569         }
1570         // Quantize the bias if it has not already been quantized as the
1571         // output of another op or as an input to another op.
1572         TensorT* bias_tensor = subgraph->tensors[op->inputs[bias_idx]].get();
1573         if (!utils::QuantizationParametersExist(bias_tensor)) {
1574           if (utils::HasBuffer(model, subgraph, op->inputs[bias_idx])) {
1575             if (property.inputs.size() != 2) {
1576               TF_LITE_REPORT_ERROR(error_reporter,
1577                                    "Expected 2 inputs when quantizing bias %d, "
1578                                    "but found %d inputs for op %s at index %d "
1579                                    "in subgraph %d",
1580                                    bias_idx, static_cast<int>(op->inputs.size()),
1581                                    EnumNameBuiltinOperator(op_code), op_idx, subgraph_idx);
1582               return kTfLiteError;
1583             }
1584             TensorT* input_tensor =
1585                 subgraph->tensors[op->inputs[property.inputs[0].first]].get();
1586             TensorT* weight_tensor =
1587                 subgraph->tensors[op->inputs[property.inputs[1].first]].get();
1588             operator_property::TensorProperty weight_property =
1589                 property.inputs[1].second;
1590             TF_LITE_ENSURE_STATUS(QuantizeBias(
1591                 model, input_tensor, weight_tensor, bias_tensor,
1592                 weight_property.per_axis, weight_property.per_axis_index,
1593                 bias_type, error_reporter));
1594           }
1595         } else {
1596           // If bias is already quantized, make sure it is quantized to 32 bit.
1597           if (bias_tensor->type != TensorType_INT32) {
1598             TF_LITE_REPORT_ERROR(
1599                 error_reporter,
1600                 "Bias (\"%s\" at global index %d) of op \"%s\" at op_index %d "
1601                 "in subgraph %d is expected to be quantized to INT32 but it is "
1602                 "already quantized to %s.\n",
1603                 bias_tensor->name.c_str(), op->inputs[bias_idx],
1604                 operator_name.c_str(), op_idx, subgraph_idx,
1605                 EnumNameTensorType(bias_tensor->type));
1606           }
1607         }
1608       }
1609     }
1610   }
1611   return kTfLiteOk;
1612 }
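
// A minimal sketch (not part of the production quantization path) of the rule
// the QuantizeBias call above relies on: the bias scale is the product of the
// input scale and the weight scale, so the resulting int32 bias can be added
// directly to the int32 accumulator of an int8 matmul/convolution. This is a
// per-tensor approximation of that behaviour; names are placeholders and it
// assumes a non-zero input_scale * weight_scale.
std::vector<int32_t> ExampleQuantizeBiasInt32(const std::vector<float>& bias,
                                              float input_scale,
                                              float weight_scale) {
  const float bias_scale = input_scale * weight_scale;
  std::vector<int32_t> quantized(bias.size());
  for (size_t i = 0; i < bias.size(); ++i) {
    const float scaled = bias[i] / bias_scale;
    const float clamped = std::min(
        std::max(scaled,
                 static_cast<float>(std::numeric_limits<int32_t>::min())),
        static_cast<float>(std::numeric_limits<int32_t>::max()));
    quantized[i] = static_cast<int32_t>(TfLiteRound(clamped));
  }
  return quantized;
}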
1613 
1614 std::unordered_set<string> GetAllOperatorOutputs(ModelT* model) {
1615   std::unordered_set<string> operator_names;
1616   for (int32_t subgraph_idx = 0; subgraph_idx < model->subgraphs.size();
1617        subgraph_idx++) {
1618     SubGraphT* subgraph = model->subgraphs.at(subgraph_idx).get();
1619     for (int32_t tensor_idx = 0; tensor_idx < subgraph->tensors.size();
1620          tensor_idx++) {
1621       operator_names.insert(subgraph->tensors[tensor_idx]->name);
1622     }
1623   }
1624   return operator_names;
1625 }
1626 // Populate the quantization parameters max and min for input tensors.
1627 // Assumes that dynamic tensors already have stored min, max values and throws
1628 // an error if a tensor does not have min, max quantization parameters or a
1629 // buffer.
1630 // If any static tensors are not inputs to an operation, their max, min values
1631 // will not be filled by this function.
1632 TfLiteStatus FillQuantizationParams(
1633     ModelT* model, const std::unordered_set<string>& operator_names,
1634     const std::unordered_set<string>& real_value_op_set,
1635     const TensorType& activations_type, bool disable_per_channel,
1636     ErrorReporter* error_reporter) {
1637   for (size_t subgraph_idx = 0; subgraph_idx < model->subgraphs.size();
1638        subgraph_idx++) {
1639     SubGraphT* subgraph = model->subgraphs.at(subgraph_idx).get();
1640     for (size_t op_idx = 0; op_idx < subgraph->operators.size(); op_idx++) {
1641       OperatorT* op = subgraph->operators[op_idx].get();
1642       operator_property::OperatorProperty property =
1643           operator_property::GetOperatorProperty(model, subgraph_idx, op_idx);
1644       if (!property.quantizable) {
1645         continue;
1646       }
1647       if (!op->outputs.empty()) {
1648         const string operator_name = subgraph->tensors[op->outputs[0]]->name;
1649         property = GetOperatorProperty(operator_names, model, subgraph_idx,
1650                                        op_idx, operator_name, activations_type,
1651                                        disable_per_channel);
1652         if (!IsRealValueOp(real_value_op_set, operator_name)) {
1653           continue;
1654         }
1655       }
1656 
1657       // Populate max, min for each input tensor.
1658       for (const std::pair<int, operator_property::TensorProperty>& input :
1659            property.inputs) {
1660         // Get tensor.
1661         const int32_t input_idx = input.first;
1662         const int32_t tensor_idx = op->inputs[input_idx];
1663         if (tensor_idx == -1) {
1664           // Skip optional tensor.
1665           continue;
1666         }
1667         TensorT* tensor = subgraph->tensors[tensor_idx].get();
1668 
1669         // Static tensor.
1670         if (!utils::HasMinMax(tensor) &&
1671             utils::HasBuffer(model, subgraph, tensor_idx)) {
1672           // Get input float data and tensor dimensions.
1673           const BufferT* buffer = model->buffers[tensor->buffer].get();
1674           const float* float_input_data =
1675               reinterpret_cast<const float*>(buffer->data.data());
1676 
1677           if (tensor->quantization == nullptr) {
1678             tensor->quantization = std::make_unique<QuantizationParametersT>();
1679           }
1680 
1681           // Fill per channel max and min with respect to channel_dim_index.
1682           if (input.second.per_axis) {
1683             if (tensor->shape.size() == 4) {
1684               int32_t channel_dim_index = input.second.per_axis_index;
1685               TF_LITE_ENSURE_STATUS(utils::FillPerChannelMinMax(
1686                   float_input_data, tensor->shape, channel_dim_index,
1687                   tensor->quantization.get(), error_reporter));
1688             } else {
1689               TF_LITE_REPORT_ERROR(
1690                   error_reporter,
1691                   "Could not fill max min for tensor as the dimension is %d "
1692                   "and not 4 as expected.",
1693                   tensor->shape.size());
1694               return kTfLiteError;
1695             }
1696 
1697             // Fill per layer max and min.
1698           } else if (!utils::HasMinMax(tensor) && !input.second.per_axis &&
1699                      utils::HasBuffer(model, subgraph, tensor_idx)) {
1700             uint64_t input_size;
1701             TF_LITE_ENSURE_STATUS(utils::NumElements(*tensor, &input_size));
1702             utils::FillSingleMinMax(float_input_data, input_size,
1703                                     tensor->quantization.get());
1704           }
1705           if (tensor->quantization->quantized_dimension !=
1706               input.second.per_axis_index) {
1707             TF_LITE_REPORT_ERROR(
1708                 error_reporter,
1709                 "Quantized dimension for tensor property and quantization "
1710                 "parameters do not match. Got %d and %d respectively.",
1711                 input.second.per_axis_index,
1712                 tensor->quantization->quantized_dimension);
1713             return kTfLiteError;
1714           }
1715 
1716           // Dynamic tensor.
1717         } else if (!utils::HasMinMax(tensor) &&
1718                    !utils::HasBuffer(model, subgraph, tensor_idx)) {
1719           TF_LITE_REPORT_ERROR(
1720               error_reporter,
1721               "Max and min for dynamic tensors should be"
1722               " recorded during calibration: Failed for tensor %s\n",
1723               tensor->name.c_str());
1724           if (tensor->quantization == nullptr) {
1725             TF_LITE_REPORT_ERROR(error_reporter,
1726                                  "No quantization params for tensor %s",
1727                                  tensor->name.c_str());
1728           } else if (tensor->quantization->min.empty() ||
1729                      tensor->quantization->max.empty()) {
1730             TF_LITE_REPORT_ERROR(error_reporter, "Empty min/max for tensor %s",
1731                                  tensor->name.c_str());
1732           }
1733           return kTfLiteError;
1734         }
1735 
1736         if (utils::QuantizationParametersExist(tensor)) {
1737           TF_LITE_REPORT_ERROR(
1738               error_reporter,
1739               "Scale and zero points should not be recorded before "
1740               "quantization.");
1741           return kTfLiteError;
1742         }
1743       }  // loop over op inputs
1744     }    // loop over ops
1745   }      // loop over subgraphs
1746   return kTfLiteOk;
1747 }
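
// A minimal sketch (not part of the production quantization path) that
// approximates what utils::FillPerChannelMinMax does for the per-axis weight
// case above: walk the flattened buffer once, keeping a running (min, max)
// for each channel along channel_dim_index. It assumes a row-major layout and
// a non-empty shape; the helper name is a placeholder.
void ExampleFillPerChannelMinMax(const float* data,
                                 const std::vector<int32_t>& shape,
                                 int32_t channel_dim_index,
                                 QuantizationParametersT* params) {
  const int32_t channels = shape[channel_dim_index];
  params->min.assign(channels, std::numeric_limits<float>::max());
  params->max.assign(channels, std::numeric_limits<float>::lowest());
  params->quantized_dimension = channel_dim_index;

  int64_t num_elements = 1;
  for (const int32_t dim : shape) num_elements *= dim;
  // Row-major stride of the channel dimension.
  int64_t stride = 1;
  for (size_t d = channel_dim_index + 1; d < shape.size(); ++d) {
    stride *= shape[d];
  }
  for (int64_t i = 0; i < num_elements; ++i) {
    const int32_t channel = (i / stride) % channels;
    params->min[channel] = std::min(params->min[channel], data[i]);
    params->max[channel] = std::max(params->max[channel], data[i]);
  }
}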
1748 
1749 // Check compatibility of activation, weight and bias scales. Adjust if needed.
1750 TfLiteStatus EnsureBiasScaleCompatibility(
1751     ModelT* model, const std::unordered_set<string>& operator_names,
1752     const std::unordered_set<string>& real_value_op_set,
1753     const TensorType& activations_type, bool disable_per_channel,
1754     ErrorReporter* error_reporter) {
1755   for (size_t subgraph_idx = 0; subgraph_idx < model->subgraphs.size();
1756        subgraph_idx++) {
1757     SubGraphT* subgraph = model->subgraphs.at(subgraph_idx).get();
1758     for (size_t op_idx = 0; op_idx < subgraph->operators.size(); op_idx++) {
1759       OperatorT* op = subgraph->operators[op_idx].get();
1760       if (op->outputs.empty()) {
1761         continue;
1762       }
1763       const string operator_name = subgraph->tensors[op->outputs[0]]->name;
1764       operator_property::OperatorProperty property = GetOperatorProperty(
1765           operator_names, model, subgraph_idx, op_idx, operator_name,
1766           activations_type, disable_per_channel);
1767       if (!IsRealValueOp(real_value_op_set, operator_name)) {
1768         continue;
1769       }
1770 
1771       // Loop over all bias tensors.
1772       for (const int bias_idx : property.biases) {
1773         if (bias_idx >= op->inputs.size() ||
1774             op->inputs[bias_idx] == kTfLiteOptionalTensor) {
1775           continue;
1776         }
1777         TensorT* bias_tensor = subgraph->tensors[op->inputs[bias_idx]].get();
1778         if (bias_tensor->shape.size() != 1) {
1779           TF_LITE_REPORT_ERROR(error_reporter,
1780                                "Expected bias tensor to be a vector.");
1781           return kTfLiteError;
1782         }
1783         const int32_t channel_dim_size = bias_tensor->shape[0];
1784 
1785         if (property.inputs.size() != 2) {  // Only works for two input tensors.
1786           TF_LITE_REPORT_ERROR(
1787               error_reporter,
1788               "Expected 2 inputs but found %d for the op at index %d in subgraph %d",
1789               static_cast<int>(property.inputs.size()), op_idx, subgraph_idx);
1790           return kTfLiteError;
1791         }
1792 
1793         if (!property.arbitrary_inputs && property.quantizable) {
1794           // Get input and weight tensors.
1795           TensorT* input_tensor =
1796               subgraph->tensors[op->inputs[property.inputs[0].first]].get();
1797           TensorT* weight_tensor =
1798               subgraph->tensors[op->inputs[property.inputs[1].first]].get();
1799           operator_property::TensorProperty weight_property =
1800               property.inputs[1].second;
1801           TF_LITE_ENSURE(error_reporter, input_tensor->quantization);
1802 
1803           // Check quantization parameters exist for input.
1804           if (!utils::HasMinMax(input_tensor)) {
1805             TF_LITE_REPORT_ERROR(
1806                 error_reporter,
1807                 "Input tensor missing quantization information. Should be "
1808                 "populated during calibration.");
1809             return kTfLiteError;
1810           }
1811 
1812           // Get input scale for asymmetric quantization.
1813           QuantizationParametersT temp_quant_params = QuantizationParametersT();
1814           TF_LITE_ENSURE_STATUS(
1815               utils::GetQuantizationParams(input_tensor, activations_type,
1816                                            &temp_quant_params, error_reporter));
1817           if (temp_quant_params.scale.size() != 1) {
1818             TF_LITE_REPORT_ERROR(error_reporter,
1819                                  "Unexpected input quantization scale size.");
1820             return kTfLiteError;
1821           }
1822           float input_scale = temp_quant_params.scale[0];
1823 
1824           // Check that max/min values have been filled for weights.
1825           if (!utils::HasMinMax(weight_tensor)) {
1826             TF_LITE_REPORT_ERROR(
1827                 error_reporter,
1828                 "Min and/or max values have not been recorded for weight "
1829                 "tensor. This should have happened in FillQuantizationParams.");
1830             return kTfLiteError;
1831           }
1832 
1833           // Ensure the tensor dimensions are compatible.
1834           if (weight_property.per_axis) {
1835             if (bias_tensor->shape[0] !=
1836                 weight_tensor->shape[weight_property.per_axis_index]) {
1837               TF_LITE_REPORT_ERROR(
1838                   error_reporter,
1839                   "Channel mismatch between bias and weight tensors %d vs %d",
1840                   bias_tensor->shape[0],
1841                   weight_tensor->shape[weight_property.per_axis_index]);
1842               return kTfLiteError;
1843             }
1844             // Ensure that the number of max/mins matches the channel_dim_size.
1845             if (weight_tensor->quantization->max.size() != channel_dim_size) {
1846               TF_LITE_REPORT_ERROR(
1847                   error_reporter,
1848                   "Mismatch between number of weight maxs and channels: %d vs "
1849                   "%d",
1850                   weight_tensor->quantization->max.size(), channel_dim_size);
1851               return kTfLiteError;
1852             }
1853             if (weight_tensor->quantization->min.size() != channel_dim_size) {
1854               TF_LITE_REPORT_ERROR(
1855                   error_reporter,
1856                   "Mismatch between number of weight mins and channels: %d vs %d",
1857                   weight_tensor->quantization->min.size(), channel_dim_size);
1858               return kTfLiteError;
1859             }
1860           }
1861 
1862           // Get data and size of bias tensor.
1863           const BufferT* buffer = model->buffers[bias_tensor->buffer].get();
1864           const float* bias_data =
1865               reinterpret_cast<const float*>(buffer->data.data());
1866           uint64_t bias_size;
1867           TF_LITE_ENSURE_STATUS(utils::NumElements(*bias_tensor, &bias_size));
1868 
1869           // Adjust weight scales if needed.
1870           TF_LITE_ENSURE_STATUS(utils::AdjustWeightsForBiasScale(
1871               weight_tensor->quantization.get(), bias_data, bias_size,
1872               input_scale, error_reporter));
1873 
1874           if (utils::QuantizationParametersExist(weight_tensor)) {
1875             TF_LITE_REPORT_ERROR(
1876                 error_reporter,
1877                 "Scale and zero points should not be recorded for the weight "
1878                 "tensor before quantization.");
1879             return kTfLiteError;
1880           }
1881           if (utils::QuantizationParametersExist(input_tensor)) {
1882             TF_LITE_REPORT_ERROR(
1883                 error_reporter,
1884                 "Scale and zero points should not be recorded for the input "
1885                 "tensor before quantization.");
1886             return kTfLiteError;
1887           }
1888         }
1889       }
1890     }
1891   }
1892   return kTfLiteOk;
1893 }
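
// A minimal sketch (not part of the production quantization path) of the
// compatibility issue handled above: because the bias is quantized with
// scale_bias = scale_input * scale_weight, a very small weight scale can push
// a large float bias outside the int32 range. The per-tensor helper below is a
// rough approximation of the intent of utils::AdjustWeightsForBiasScale, not
// its actual algorithm; the name and the symmetric-range widening are
// assumptions.
void ExampleWidenWeightRangeForBias(float* weight_min, float* weight_max,
                                    const float* bias_data, uint64_t bias_size,
                                    float input_scale) {
  if (input_scale <= 0.0f) return;
  const float int32_max =
      static_cast<float>(std::numeric_limits<int32_t>::max());
  float weight_range = std::max(std::abs(*weight_min), std::abs(*weight_max));
  float weight_scale = weight_range / 127.0f;
  for (uint64_t i = 0; i < bias_size; ++i) {
    // Smallest bias scale that keeps this value representable in int32.
    const float required_scale = std::abs(bias_data[i]) / int32_max;
    if (input_scale * weight_scale < required_scale) {
      // Widen the symmetric weight range until scale_input * scale_weight is
      // large enough for this bias value.
      weight_scale = required_scale / input_scale;
      weight_range = weight_scale * 127.0f;
      *weight_min = -weight_range;
      *weight_max = weight_range;
    }
  }
}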
1894 
1895 }  // namespace
1896 
1897 // Assumes that the operators in the model have been topologically sorted.
1898 TfLiteStatus QuantizeModel(flatbuffers::FlatBufferBuilder* builder,
1899                            ModelT* model, const TensorType& input_type,
1900                            const TensorType& output_type, bool allow_float,
1901                            const std::unordered_set<string>& operator_names,
1902                            const TensorType& activations_type,
1903                            const TensorType& bias_type,
1904                            bool disable_per_channel,
1905                            ErrorReporter* error_reporter) {
1906   auto real_value_op_set =
1907       PopulateRealValueOpSet(model, operator_names, activations_type);
1908   TF_LITE_ENSURE_STATUS(DuplicateBiasesWithMultipleUses(model, error_reporter));
1909   TF_LITE_ENSURE_STATUS(FillQuantizationParams(
1910       model, operator_names, real_value_op_set, activations_type,
1911       disable_per_channel, error_reporter));
1912   TF_LITE_ENSURE_STATUS(EnsureBiasScaleCompatibility(
1913       model, operator_names, real_value_op_set, activations_type,
1914       disable_per_channel, error_reporter));
1915   TF_LITE_ENSURE_STATUS(
1916       QuantizeIntermediateTensors(model, activations_type, error_reporter));
1917   TF_LITE_ENSURE_STATUS(QuantizeSharedRange(model, error_reporter));
1918   TF_LITE_ENSURE_STATUS(
1919       QuantizeResources(model, activations_type, error_reporter));
1920   TF_LITE_ENSURE_STATUS(QuantizeWeightsInputOutput(
1921       model, allow_float, operator_names, real_value_op_set, activations_type,
1922       disable_per_channel, error_reporter));
1923   TF_LITE_ENSURE_STATUS(ApplyConstraints(model, operator_names,
1924                                          real_value_op_set, activations_type,
1925                                          error_reporter));
1926   TF_LITE_ENSURE_STATUS(QuantizeBiases(model, operator_names, real_value_op_set,
1927                                        activations_type, bias_type,
1928                                        disable_per_channel, error_reporter));
1929   utils::SetOperatorCodeVersion(model);
1930   TF_LITE_ENSURE_STATUS(SetInputAndOutputTypes(
1931       model, input_type, output_type, activations_type, error_reporter));
1932   SetOperatorPropertyADDSUBOperator(model, activations_type);
1933   flatbuffers::Offset<Model> output_model_location =
1934       Model::Pack(*builder, model);
1935   FinishModelBuffer(*builder, output_model_location);
1936 
1937   return kTfLiteOk;
1938 }
1939 
1940 // Assumes that the operators in the model have been topologically sorted.
1941 TfLiteStatus QuantizeModel(flatbuffers::FlatBufferBuilder* builder,
1942                            ModelT* model, const TensorType& input_type,
1943                            const TensorType& output_type, bool allow_float,
1944                            const std::unordered_set<string>& operator_names,
1945                            const TensorType& activations_type,
1946                            const TensorType& bias_type,
1947                            ErrorReporter* error_reporter) {
1948   return QuantizeModel(builder, model, input_type, output_type, allow_float,
1949                        operator_names, activations_type,
1950                        /*bias_type=*/bias_type,
1951                        /*disable_per_channel=*/false, error_reporter);
1952 }
1953 
1954 TfLiteStatus QuantizeModelAllOperators(
1955     flatbuffers::FlatBufferBuilder* builder, ModelT* model,
1956     const TensorType& input_type, const TensorType& output_type,
1957     bool allow_float, const TensorType& activations_type,
1958     const TensorType& bias_type, ErrorReporter* error_reporter) {
1959   return QuantizeModel(builder, model, input_type, output_type, allow_float,
1960                        GetAllOperatorOutputs(model), activations_type,
1961                        bias_type,
1962                        /*disable_per_channel=*/false, error_reporter);
1963 }
1964 
1965 TfLiteStatus QuantizeModelAllOperators(
1966     flatbuffers::FlatBufferBuilder* builder, ModelT* model,
1967     const TensorType& input_type, const TensorType& output_type,
1968     bool allow_float, const TensorType& activations_type,
1969     const TensorType& bias_type, bool disable_per_channel,
1970     ErrorReporter* error_reporter) {
1971   return QuantizeModel(builder, model, input_type, output_type, allow_float,
1972                        GetAllOperatorOutputs(model), activations_type,
1973                        bias_type, disable_per_channel, error_reporter);
1974 }
1975 
1976 TfLiteStatus QuantizeModel(flatbuffers::FlatBufferBuilder* builder,
1977                            ModelT* model, const TensorType& input_type,
1978                            const TensorType& output_type, bool allow_float,
1979                            ErrorReporter* error_reporter) {
1980   return QuantizeModel(builder, model, input_type, output_type, allow_float,
1981                        GetAllOperatorOutputs(model),
1982                        /*activations_type=*/TensorType_INT8,
1983                        /*bias_type=*/TensorType_INT32, error_reporter);
1984 }
1985 
1986 TfLiteStatus QuantizeModel(flatbuffers::FlatBufferBuilder* builder,
1987                            ModelT* model, const TensorType& input_type,
1988                            const TensorType& output_type,
1989                            ErrorReporter* error_reporter) {
1990   return QuantizeModel(builder, model, input_type, output_type,
1991                        /*allow_float=*/false, error_reporter);
1992 }
1993 
1994 TfLiteStatus QuantizeModel(flatbuffers::FlatBufferBuilder* builder,
1995                            ModelT* model, ErrorReporter* error_reporter) {
1996   return QuantizeModel(builder, model, TensorType_FLOAT32, TensorType_FLOAT32,
1997                        /*allow_float=*/false, error_reporter);
1998 }
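
// A usage sketch showing how a caller might quantize a float model on disk
// with the simplest overload above. The function name and file-path parameter
// are illustrative only; float input/output types with allow_float=false is
// one possible configuration, not the only one.
TfLiteStatus ExampleQuantizeFloatModel(const char* input_path,
                                       ErrorReporter* error_reporter) {
  std::unique_ptr<FlatBufferModel> fb_model =
      FlatBufferModel::BuildFromFile(input_path, error_reporter);
  if (!fb_model) {
    return kTfLiteError;
  }
  std::unique_ptr<ModelT> mutable_model(fb_model->GetModel()->UnPack());
  flatbuffers::FlatBufferBuilder builder;
  TF_LITE_ENSURE_STATUS(QuantizeModel(&builder, mutable_model.get(),
                                      TensorType_FLOAT32, TensorType_FLOAT32,
                                      /*allow_float=*/false, error_reporter));
  // builder.GetBufferPointer() / builder.GetSize() now hold the serialized
  // quantized model, ready to be written out by the caller.
  return kTfLiteOk;
}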
1999 
2000 }  // namespace optimize
2001 }  // namespace tflite
2002