1 /* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
2
3 Licensed under the Apache License, Version 2.0 (the "License");
4 you may not use this file except in compliance with the License.
5 You may obtain a copy of the License at
6
7 http://www.apache.org/licenses/LICENSE-2.0
8
9 Unless required by applicable law or agreed to in writing, software
10 distributed under the License is distributed on an "AS IS" BASIS,
11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 See the License for the specific language governing permissions and
13 limitations under the License.
14 ==============================================================================*/
15 #include "tensorflow/lite/tools/optimize/quantize_model.h"
16
#include <algorithm>
#include <cmath>
#include <cstdint>
#include <limits>
#include <map>
#include <memory>
#include <set>
#include <string>
22 #include <unordered_map>
23 #include <unordered_set>
24 #include <utility>
25 #include <vector>
26
27 #include "flatbuffers/flexbuffers.h"
28 #include "absl/strings/str_cat.h"
29 #include "tensorflow/lite/context.h"
30 #include "tensorflow/lite/core/api/error_reporter.h"
31 #include "tensorflow/lite/kernels/internal/cppmath.h"
32 #include "tensorflow/lite/model.h"
33 #include "tensorflow/lite/schema/schema_generated.h"
34 #include "tensorflow/lite/schema/schema_utils.h"
35 #include "tensorflow/lite/tools/optimize/model_utils.h"
36 #include "tensorflow/lite/tools/optimize/operator_property.h"
37 #include "tensorflow/lite/tools/optimize/quantization_utils.h"
38
39 namespace tflite {
40 namespace optimize {
41
42 namespace {
43
// A bias tensor must be duplicated if it is also used as a non-bias input in
// another op (where it would be quantized to 8 bit), so that the bias use can
// be quantized to 32 bit.
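// For example, a constant feeding both the bias input of a FULLY_CONNECTED op
// and a regular input of an ADD op would otherwise need to be both int32 and
// int8; the duplicate lets each use keep its own type.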
TfLiteStatus DuplicateBiasesWithMultipleUses(ModelT* model,
47 ErrorReporter* error_reporter) {
48 std::set<int> input_uses;
49 // Get all input uses for constant tensors.
50 for (size_t subgraph_idx = 0; subgraph_idx < model->subgraphs.size();
51 subgraph_idx++) {
52 SubGraphT* subgraph = model->subgraphs.at(subgraph_idx).get();
53 for (size_t op_idx = 0; op_idx < subgraph->operators.size(); op_idx++) {
54 operator_property::OperatorProperty property =
55 operator_property::GetOperatorProperty(model, subgraph_idx, op_idx);
56 auto* op = subgraph->operators[op_idx].get();
57 for (const auto& idx_pair : property.inputs) {
58 const int idx = idx_pair.first;
        if (idx >= op->inputs.size() || op->inputs[idx] < 0) {
60 continue;
61 }
62 const TensorT* input_tensor = subgraph->tensors[op->inputs[idx]].get();
63 if (!input_tensor || (input_tensor->buffer < 0) ||
64 (input_tensor->buffer >= model->buffers.size())) {
65 continue;
66 }
67 const BufferT* buffer = model->buffers[input_tensor->buffer].get();
68 if (buffer && !buffer->data.empty()) {
69 input_uses.insert({op->inputs[idx]});
70 }
71 }
72 }
73 }
74
75 std::map<int, int> bias_uses;
76 for (size_t subgraph_idx = 0; subgraph_idx < model->subgraphs.size();
77 subgraph_idx++) {
78 SubGraphT* subgraph = model->subgraphs.at(subgraph_idx).get();
79 for (size_t op_idx = 0; op_idx < subgraph->operators.size(); op_idx++) {
80 operator_property::OperatorProperty property =
81 operator_property::GetOperatorProperty(model, subgraph_idx, op_idx);
82 OperatorT* op = subgraph->operators[op_idx].get();
83 for (const int bias_idx : property.biases) {
84 if (bias_idx >= op->inputs.size() || op->inputs[bias_idx] < 0) {
85 continue;
86 }
87 const TensorT* bias_tensor =
88 subgraph->tensors[op->inputs[bias_idx]].get();
89 if (!bias_tensor || (bias_tensor->buffer < 0) ||
90 (bias_tensor->buffer >= model->buffers.size())) {
91 continue;
92 }
93 const BufferT* bias_buffer = model->buffers[bias_tensor->buffer].get();
94 if (!bias_buffer || bias_buffer->data.empty()) {
95 continue;
96 }
97 if (input_uses.find(op->inputs[bias_idx]) != input_uses.end()) {
98 // If used as input, duplicate the tensor and insert into bias uses.
99 int bias_use_count = 1;
100 auto inserted =
101 bias_uses.insert({op->inputs[bias_idx], bias_use_count});
102 if (!inserted.second) {
103 bias_use_count = ++inserted.first->second;
104 }
105 std::unique_ptr<TensorT> new_tensor(new TensorT);
106 new_tensor->name =
107 absl::StrCat(bias_tensor->name, "_duplicate_", bias_use_count);
108 new_tensor->shape = bias_tensor->shape;
109 new_tensor->type = bias_tensor->type;
110 if (bias_tensor->quantization) {
111 new_tensor->quantization =
112 std::make_unique<QuantizationParametersT>();
113 new_tensor->quantization->scale.assign(
114 bias_tensor->quantization->scale.begin(),
115 bias_tensor->quantization->scale.end());
116 new_tensor->quantization->zero_point.assign(
117 bias_tensor->quantization->zero_point.begin(),
118 bias_tensor->quantization->zero_point.end());
119 }
120 std::unique_ptr<BufferT> new_buffer(new BufferT);
121 new_buffer->data.assign(bias_buffer->data.begin(),
122 bias_buffer->data.end());
123 model->buffers.push_back(std::move(new_buffer));
124 new_tensor->buffer = model->buffers.size() - 1;
125 subgraph->tensors.push_back(std::move(new_tensor));
126 op->inputs[bias_idx] = subgraph->tensors.size() - 1;
127 }
128 }
129 }
130 }
131 return kTfLiteOk;
132 }
133
bool IsFloatTensor(const SubGraphT* subgraph, int32_t tensor_idx) {
135 TensorT* tensor = subgraph->tensors[tensor_idx].get();
136 if (tensor->type != TensorType_FLOAT32) {
137 // Skip non-real-valued tensor.
138 return false;
139 }
140 return true;
141 }
142
143 // Gets the operator property from the operator_property list and additionally
144 // modifies the quantizable parameter based on the user's specified
145 // operator_names.
operator_property::OperatorProperty GetOperatorProperty(
147 const std::unordered_set<string>& operator_names, const ModelT* model,
148 int subgraph_index, int op_idx, const string& operator_name,
149 const TensorType& activations_type, bool disable_per_channel = false) {
150 operator_property::OperatorProperty property =
151 operator_property::GetOperatorProperty(model, subgraph_index, op_idx);
152 const SubGraphT* subgraph = model->subgraphs[subgraph_index].get();
153 const OperatorT* op = subgraph->operators[op_idx].get();
154 const BuiltinOperator op_code =
155 GetBuiltinCode(model->operator_codes[op->opcode_index].get());
156 if (activations_type == TensorType_INT16 && !property.quantizable_int16) {
157 property.quantizable = false;
158 }
159 // The algorithm adds Dequantize and Quantize, so we don't require them to be
160 // in the operator_names.
161 if (op_code != BuiltinOperator_DEQUANTIZE &&
162 op_code != BuiltinOperator_QUANTIZE) {
163 property.quantizable =
164 property.quantizable &&
165 (operator_names.find(operator_name) != operator_names.end());
166 }
167 if (disable_per_channel) {
168 for (auto& input : property.inputs) {
169 if (input.second.per_axis) {
170 input.second.per_axis = false;
171 }
172 }
173 }
174 return property;
175 }
176
bool IsRealValueOp(const std::unordered_set<string>& real_value_op_set,
178 const string& operator_name) {
179 return real_value_op_set.find(operator_name) != real_value_op_set.end();
180 }
181
182 // Utility function to determine if tensor is constant and only has one use.
bool IsConstantWithOneUse(const ModelT* model, const SubGraphT* subgraph,
184 const int tensor_id) {
185 if (!subgraph || (tensor_id >= subgraph->tensors.size())) {
186 return false;
187 }
188 const auto& tensor = subgraph->tensors[tensor_id];
189 if (!tensor || !model || (tensor->buffer < 0) ||
190 (tensor->buffer >= model->buffers.size()) ||
191 (!model->buffers[tensor->buffer]) ||
192 (model->buffers[tensor->buffer]->data.empty())) {
193 return false;
194 }
195 int uses = 0;
196 for (size_t op_idx = 0; op_idx < subgraph->operators.size(); op_idx++) {
197 const auto& op = subgraph->operators[op_idx];
198 if (!op) {
199 continue;
200 }
201 const std::vector<int32_t>& inputs = op->inputs;
202 if ((std::find(inputs.begin(), inputs.end(), tensor_id) != inputs.end()) &&
203 (++uses > 1)) {
204 return false;
205 }
206 }
207 return true;
208 }
209
// Creates a set of the ops that operate on real (float) values in the source
// graph: non-quantizable ops, plus quantizable ops with at least one float
// input or output.
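// Ops that end up outside the set (none of their tensors are float) are
// skipped by later passes via IsRealValueOp, e.g. in ApplyConstraints.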
std::unordered_set<string> PopulateRealValueOpSet(
213 ModelT* model, const std::unordered_set<string>& operator_names,
214 const TensorType& activations_type) {
215 std::unordered_set<string> real_value_op_set;
216 for (size_t subgraph_idx = 0; subgraph_idx < model->subgraphs.size();
217 subgraph_idx++) {
218 SubGraphT* subgraph = model->subgraphs.at(subgraph_idx).get();
219 for (size_t op_idx = 0; op_idx < subgraph->operators.size(); op_idx++) {
220 OperatorT* op = subgraph->operators[op_idx].get();
221 const BuiltinOperator op_code =
222 GetBuiltinCode(model->operator_codes[op->opcode_index].get());
223 if (op->outputs.empty() && op_code != BuiltinOperator_ASSIGN_VARIABLE) {
224 continue;
225 }
226 const string operator_name = op_code != BuiltinOperator_ASSIGN_VARIABLE
227 ? subgraph->tensors[op->outputs[0]]->name
228 : subgraph->tensors[op->inputs[0]]->name;
229 operator_property::OperatorProperty property =
230 GetOperatorProperty(operator_names, model, subgraph_idx, op_idx,
231 operator_name, activations_type);
232
233 if (!property.quantizable) {
234 real_value_op_set.insert(operator_name);
235 continue;
236 }
237
238 for (const std::pair<int, operator_property::TensorProperty>& input :
239 property.inputs) {
240 const int32_t input_idx = input.first;
241 const int32_t tensor_idx = op->inputs[input_idx];
242 if (IsFloatTensor(subgraph, tensor_idx)) {
243 real_value_op_set.insert(operator_name);
244 break;
245 }
246 }
247 for (const std::pair<int, operator_property::TensorProperty>& output :
248 property.outputs) {
249 const int32_t output_idx = output.first;
250 const int32_t tensor_idx = op->outputs[output_idx];
251 if (IsFloatTensor(subgraph, tensor_idx)) {
252 real_value_op_set.insert(operator_name);
253 break;
254 }
255 }
256
257 if (property.arbitrary_inputs) {
258 const int32_t tensor_idx = op->inputs[0];
259 if (IsFloatTensor(subgraph, tensor_idx)) {
260 real_value_op_set.insert(operator_name);
261 }
262 }
263
264 if (property.arbitrary_outputs) {
265 const int32_t tensor_idx = op->outputs[0];
266 if (IsFloatTensor(subgraph, tensor_idx)) {
267 real_value_op_set.insert(operator_name);
268 }
269 }
270 }
271 }
272 return real_value_op_set;
273 }
274
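// Quantizes a bias tensor symmetrically (zero point 0). The bias scale is
// derived from the tensors the bias is accumulated against: per-layer it is
// input_scale * weight_scale, and per-channel it is
// input_scale * weight_scale[i] for each output channel i.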
TfLiteStatus QuantizeBias(ModelT* model, const TensorT* input_tensor,
276 const TensorT* weight_tensor, TensorT* bias_tensor,
277 bool is_per_channel, int channel_dim_index,
278 const TensorType& bias_type,
279 ErrorReporter* error_reporter) {
280 if (bias_tensor->shape.size() != 1) {
    TF_LITE_REPORT_ERROR(error_reporter, "Expected bias tensor to have rank 1.");
282 return kTfLiteError;
283 }
284
285 if (input_tensor->type == tflite::TensorType_INT8 &&
286 bias_type != tflite::TensorType_INT32) {
287 TF_LITE_REPORT_ERROR(
288 error_reporter,
289 "Expected bias type to be TensorType_INT32 for Int8Quant.");
290 return kTfLiteError;
291 }
292
293 if (input_tensor->type == tflite::TensorType_INT16 &&
294 bias_type != tflite::TensorType_INT32 &&
295 bias_type != tflite::TensorType_INT64) {
296 TF_LITE_REPORT_ERROR(error_reporter,
297 "Expected bias type to be TensorType_INT32 or "
298 "TensorType_INT64 for Int16Quant.");
299 return kTfLiteError;
300 }
301
302 int32_t channel_dim_size = bias_tensor->shape[0];
303 TF_LITE_ENSURE(error_reporter, weight_tensor->quantization);
304 std::vector<float> weight_scales = weight_tensor->quantization->scale;
305
306 if (is_per_channel) {
307 if (bias_tensor->shape[0] != weight_tensor->shape[channel_dim_index]) {
308 TF_LITE_REPORT_ERROR(
309 error_reporter,
310 "Channel mismatch between bias and weight tensors %d vs %d",
311 bias_tensor->shape[0], weight_tensor->shape[channel_dim_index]);
312 return kTfLiteError;
313 }
314 if (!input_tensor->quantization ||
315 input_tensor->quantization->scale.size() != 1) {
316 TF_LITE_REPORT_ERROR(error_reporter,
317 "Input tensor missing quantization information");
318 return kTfLiteError;
319 }
320
321 if (weight_scales.size() != channel_dim_size) {
322 TF_LITE_REPORT_ERROR(error_reporter,
323 "Mismatch weight scale dimension: %d",
324 weight_scales.size());
325 return kTfLiteError;
326 }
327 if (bias_type == tflite::TensorType_INT64) {
328 return utils::SymmetricPerChannelBiasQuantize<std::int64_t>(
329 model, bias_tensor, input_tensor->quantization->scale[0],
330 weight_scales.data(), channel_dim_size, error_reporter);
331 } else {
332 return utils::SymmetricPerChannelBiasQuantize<std::int32_t>(
333 model, bias_tensor, input_tensor->quantization->scale[0],
334 weight_scales.data(), channel_dim_size, error_reporter);
335 }
336 } else {
337 if (weight_scales.size() != 1) {
338 TF_LITE_REPORT_ERROR(
339 error_reporter,
340 "Expected per-layer weight scale dimension size 1, got %d",
341 weight_scales.size());
342 return kTfLiteError;
343 }
344 if (bias_type == tflite::TensorType_INT64) {
345 return utils::SymmetricPerLayerBiasQuantize<std::int64_t>(
346 model, bias_tensor,
347 input_tensor->quantization->scale[0] * weight_scales[0],
348 error_reporter);
349 } else {
350 return utils::SymmetricPerLayerBiasQuantize<std::int32_t>(
351 model, bias_tensor,
352 input_tensor->quantization->scale[0] * weight_scales[0],
353 error_reporter);
354 }
355 }
356 return kTfLiteError;
357 }
358
359 // True if the tensor type has to be modified.
bool TensorTypeChangeRequired(const TensorT* tensor, const TensorType& type) {
361 // The quantized model is type INT8/INT16, so if the user provided type is
362 // INT8/INT16, we do not have to do any custom logic. Additionally, if the
363 // current tensor isn't INT8/INT16 quantized, the custom type doesn't apply.
364 bool int8check = type != TensorType_INT8 && tensor->type == TensorType_INT8 &&
365 !tensor->quantization->scale.empty();
366 bool int16check = type != TensorType_INT16 &&
367 tensor->type == TensorType_INT16 &&
368 !tensor->quantization->scale.empty();
369 return (int8check || int16check);
370 }
371
// Checks whether a model input still needs a leading Quantize op. If the
// input's only consumer is already a Quantize op whose output has the same
// type, scale, and zero point, no additional op is required.
bool InputQuantizeRequired(const ModelT* model, const SubGraphT* subgraph,
375 int32_t input_idx) {
376 std::vector<OperatorT*> quantize_ops;
377 for (size_t op_idx = 0; op_idx < subgraph->operators.size(); op_idx++) {
378 OperatorT* op = subgraph->operators[op_idx].get();
379 if (std::find(op->inputs.begin(), op->inputs.end(), input_idx) !=
380 op->inputs.end()) {
381 const BuiltinOperator op_code =
382 GetBuiltinCode(model->operator_codes[op->opcode_index].get());
383 if (op_code != BuiltinOperator_QUANTIZE) {
384 return true;
385 }
386 quantize_ops.push_back(op);
387 }
388 }
389 if (quantize_ops.size() == 1) {
390 const auto* tensor = subgraph->tensors[input_idx].get();
391 const auto* op = quantize_ops[0];
392 const int32_t output_idx = op->outputs[0];
393 const auto output_type = subgraph->tensors[output_idx]->type;
394 const float output_scale =
395 subgraph->tensors[output_idx]->quantization->scale[0];
396 const int64_t output_zero_point =
397 subgraph->tensors[output_idx]->quantization->zero_point[0];
398 if (output_type == tensor->type &&
399 output_scale == tensor->quantization->scale[0] &&
400 output_zero_point == tensor->quantization->zero_point[0]) {
401 return false;
402 }
403 }
404 return true;
405 }
406
407 // Sets the input type, adding a Leading Op node at the start of the model if
408 // necessary.
409 // Returns the new input tensor index.
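// Two cases are handled:
//   * FLOAT32 input: a Quantize op is inserted, so at runtime the graph sees
//     float input -> Quantize -> original int8/int16 tensor.
//   * UINT8 input: the new uint8 input reuses the existing scale with its zero
//     point shifted by +128 (uint8 [0, 255] lines up with int8 [-128, 127]),
//     and a Quantize op requantizes it back to int8.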
int32_t SetInputType(ModelT* model, SubGraphT* subgraph,
411 const int32_t tensor_idx, const TensorType& input_type,
412 const TensorType& activations_type) {
413 TensorT* tensor = subgraph->tensors[tensor_idx].get();
414 if (!TensorTypeChangeRequired(tensor, input_type)) {
415 return -1;
416 }
417 if (input_type == TensorType_FLOAT32 || input_type == TensorType_UINT8) {
418 std::string type_string =
419 activations_type == TensorType_INT16 ? "int16" : "int8";
420 // Create a new tensor to be the input of the leading Op.
421 std::unique_ptr<TensorT> leading_op_input;
422 if (input_type == TensorType_FLOAT32) {
423 // Add tensor for quantize operator. Scales and zero points are not
424 // needed.
425 const string leading_op_name = tensor->name;
426 const string new_name_original_input = tensor->name + "_" + type_string;
427 tensor->name = new_name_original_input;
428 utils::MakeTensor(leading_op_name, tensor->shape, tensor->shape_signature,
429 input_type, &leading_op_input);
430 } else {
431 // Get scale and zero point from the first tensor.
432 const float scale = subgraph->tensors[tensor_idx]->quantization->scale[0];
433 const int64_t zero_point =
434 subgraph->tensors[tensor_idx]->quantization->zero_point[0];
435
436 // Add tensor for requantize operator. Scale is the existing scale and
437 // zero point is shifted by +128.
438 TFLITE_DCHECK_GE(zero_point, -128);
439 TFLITE_DCHECK_LE(zero_point, 127);
440 const string leading_op_name = tensor->name;
441 const string new_name_original_input = tensor->name + "_" + type_string;
442 tensor->name = new_name_original_input;
443 utils::MakeTensorWithQuantParam(
444 leading_op_name, tensor->shape, tensor->shape_signature, input_type,
445 scale, zero_point + 128, &leading_op_input);
446 }
447
448 // Check if quantize op already exists.
449 if (!InputQuantizeRequired(model, subgraph, tensor_idx)) {
450 subgraph->tensors[tensor_idx] = std::move(leading_op_input);
451 return tensor_idx;
452 }
453
454 const int32_t leading_op_input_idx = subgraph->tensors.size();
455 subgraph->tensors.push_back(std::move(leading_op_input));
456
    // Create the leading op, a Quantize op that quantizes or requantizes the
    // input.
459 std::unique_ptr<OperatorT> leading_op;
460 utils::MakeQuantizeOperator(model, &leading_op, leading_op_input_idx,
461 tensor_idx);
462
463 // Insert the new op at the start of the model.
464 subgraph->operators.insert(subgraph->operators.begin(),
465 std::move(leading_op));
466 return leading_op_input_idx;
467 }
468 return -1;
469 }
470
471 // Sets the output type, adding a Tailing Op node at the end of the model if
472 // necessary.
473 // Returns the new output tensor index.
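// Mirrors SetInputType: for a FLOAT32 output a Dequantize op is appended, and
// for a UINT8 output a Quantize op requantizes int8 to uint8, reusing the
// existing scale with the zero point shifted by +128.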
int32_t SetOutputType(ModelT* model, SubGraphT* subgraph,
475 const int32_t tensor_idx, const TensorType& output_type,
476 const TensorType& activations_type) {
477 TensorT* tensor = subgraph->tensors[tensor_idx].get();
478 if (!TensorTypeChangeRequired(tensor, output_type)) {
479 return -1;
480 }
481 if (output_type == TensorType_FLOAT32 || output_type == TensorType_UINT8) {
482 std::string type_string =
483 activations_type == TensorType_INT16 ? "int16" : "int8";
484 // Create a new tensor to be the output of the tailing op.
485 std::unique_ptr<TensorT> tailing_op_output;
486 if (output_type == TensorType_FLOAT32) {
487 const string tailing_op_name = tensor->name;
488 const string new_name_original_output = tensor->name + "_" + type_string;
489 tensor->name = new_name_original_output;
490 utils::MakeTensor(tailing_op_name, tensor->shape, tensor->shape_signature,
491 output_type, &tailing_op_output);
492 } else {
493 // Get scale and zero point from the last tensor.
494 const float scale = subgraph->tensors[tensor_idx]->quantization->scale[0];
495 const int64_t zero_point =
496 subgraph->tensors[tensor_idx]->quantization->zero_point[0];
497
498 // Add tensor for requantize operator. Scale is the existing scale and
499 // zero point is shifted by +128.
500 TFLITE_DCHECK_GE(zero_point, -128);
501 TFLITE_DCHECK_LE(zero_point, 127);
502 const string tailing_op_name = tensor->name;
503 const string new_name_original_output = tensor->name + "_" + type_string;
504 tensor->name = new_name_original_output;
505 utils::MakeTensorWithQuantParam(
506 tailing_op_name, tensor->shape, tensor->shape_signature, output_type,
507 scale, zero_point + 128, &tailing_op_output);
508 }
509 const int32_t tailing_op_output_idx = subgraph->tensors.size();
510 subgraph->tensors.push_back(std::move(tailing_op_output));
511
512 // Create the tailing operation.
513 std::unique_ptr<OperatorT> tailing_op;
514 if (output_type == TensorType_FLOAT32) {
515 // Tailing Op is Dequantize Op.
516 utils::MakeDequantizeOperator(model, &tailing_op, tensor_idx,
517 tailing_op_output_idx);
518 } else {
519 // Tailing Op is Quantize Op that does requantization.
520 utils::MakeQuantizeOperator(model, &tailing_op, tensor_idx,
521 tailing_op_output_idx);
522 }
523 // Add the operator at the end of the model.
524 subgraph->operators.push_back(std::move(tailing_op));
525 return tailing_op_output_idx;
526 }
527 return -1;
528 }
529
530 // Sets the input and output types to the provided types. Leading and
531 // tailing operations will be added if needed.
532 // For Float input and output, leading op is Quantize and tailing op is
533 // Dequantize.
// For Uint8 input and output, the leading op is Quantize (uint8 to int8,
// which can be thought of as a "requant") and the tailing op is also Quantize
// (int8 to uint8, again a "requant").
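// SignatureDef input/output tensor indices are remapped to the newly added
// tensors so that signatures stay consistent with the rewritten graph.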
TfLiteStatus SetInputAndOutputTypes(ModelT* model, const TensorType& input_type,
538 const TensorType& output_type,
539 const TensorType& activations_type,
540 ErrorReporter* error_reporter) {
541 for (int subgraph_idx = 0; subgraph_idx < model->subgraphs.size();
542 subgraph_idx++) {
543 SubGraphT* subgraph = model->subgraphs.at(subgraph_idx).get();
544 SignatureDefT* signature_def = nullptr;
545 for (const auto& sig_def : model->signature_defs) {
546 if (sig_def->subgraph_index == subgraph_idx) {
547 signature_def = sig_def.get();
548 break;
549 }
550 }
551 for (int i = 0; i < subgraph->inputs.size(); ++i) {
552 TensorT* tensor = subgraph->tensors[subgraph->inputs[i]].get();
553 // TODO(suharshs): Add support for this case if it ever comes up.
554 if (tensor->type == TensorType_FLOAT32 && input_type != tensor->type) {
555 TF_LITE_REPORT_ERROR(
556 error_reporter,
557 "Unsupported input type %s for input tensor %d of type %s.",
558 EnumNameTensorType(input_type), subgraph->inputs[i],
559 EnumNameTensorType(tensor->type));
560 return kTfLiteError;
561 }
562 const int32_t input_idx = SetInputType(
563 model, subgraph, subgraph->inputs[i], input_type, activations_type);
564 if (input_idx < 0) {
565 continue;
566 }
567 if (signature_def != nullptr) {
568 for (const auto& input : signature_def->inputs) {
569 if (input->tensor_index == subgraph->inputs[i]) {
570 input->tensor_index = input_idx;
571 break;
572 }
573 }
574 }
575 subgraph->inputs[i] = input_idx;
576 }
577 for (int i = 0; i < subgraph->outputs.size(); ++i) {
578 TensorT* tensor = subgraph->tensors[subgraph->outputs[i]].get();
579 // TODO(suharshs): Add support for this case if it ever comes up.
580 if (tensor->type == TensorType_FLOAT32 && output_type != tensor->type) {
581 TF_LITE_REPORT_ERROR(
582 error_reporter,
583 "Unsupported output type %s for output tensor '%s' of type %s.",
584 EnumNameTensorType(output_type), tensor->name.c_str(),
585 EnumNameTensorType(tensor->type));
586 return kTfLiteError;
587 }
588 const int32_t output_idx = SetOutputType(
589 model, subgraph, subgraph->outputs[i], output_type, activations_type);
590 if (output_idx < 0) {
591 continue;
592 }
593 if (signature_def != nullptr) {
594 for (const auto& output : signature_def->outputs) {
595 if (output->tensor_index == subgraph->outputs[i]) {
596 output->tensor_index = output_idx;
597 break;
598 }
599 }
600 }
601 subgraph->outputs[i] = output_idx;
602 }
603 }
604 return kTfLiteOk;
605 }
606
607 // Requantize a constant quantized tensor.
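// Conceptually this dequantizes with the old parameters and quantizes with
// the new ones:
//   f  = old_scale * (q - old_zero_point)
//   q' = round(f / new_scale) + new_zero_point   (clamped to the type range)
// The old parameters may be per-channel; the new ones are per-tensor.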
608 template <typename TensorDataType>
TfLiteStatus RequantizeConstant(
610 const std::vector<uint8_t>& buffer_data, const TensorT* tensor,
611 const std::unique_ptr<QuantizationParametersT>& new_quantization,
612 std::vector<uint8_t>& new_buffer_data) {
613 if (new_buffer_data.size() != buffer_data.size()) {
614 new_buffer_data.resize(buffer_data.size());
615 }
616 const auto& quantization = tensor->quantization;
617 const std::vector<float>& scales = quantization->scale;
618 if (scales.empty()) {
619 // No existing quantization, assumes that new quantization parameters
620 // are correct.
621 new_buffer_data.assign(buffer_data.begin(), buffer_data.end());
622 return kTfLiteOk;
623 }
624 const std::vector<int64_t>& zero_points = quantization->zero_point;
625 const int num_elements = buffer_data.size() / sizeof(TensorDataType);
626 std::vector<float> float_values(num_elements);
627 const TensorDataType* buffer_values =
628 reinterpret_cast<const TensorDataType*>(buffer_data.data());
629 // This logic is for per-channel quantization, but works for per-tensor.
630 const int kPerChannelMaxDim = 4;
631 const std::vector<int32_t>& tensor_shape = tensor->shape;
632 RuntimeShape unextended_tensor_dims(tensor_shape.size(), tensor_shape.data());
633 RuntimeShape tensor_dims =
634 RuntimeShape::ExtendedShape(kPerChannelMaxDim, unextended_tensor_dims);
635 const int channel_dim_index = quantization->quantized_dimension +
636 kPerChannelMaxDim -
637 unextended_tensor_dims.DimensionsCount();
638 int indices[kPerChannelMaxDim];
639 for (indices[0] = 0; indices[0] < tensor_dims.Dims(0); indices[0]++) {
640 for (indices[1] = 0; indices[1] < tensor_dims.Dims(1); indices[1]++) {
641 for (indices[2] = 0; indices[2] < tensor_dims.Dims(2); indices[2]++) {
642 for (indices[3] = 0; indices[3] < tensor_dims.Dims(3); indices[3]++) {
643 const float scale = scales.size() > 1
644 ? scales[indices[channel_dim_index]]
645 : scales[0];
646 const int64_t zp = zero_points.size() > 1
647 ? zero_points[indices[channel_dim_index]]
648 : zero_points[0];
649 const int index = Offset(tensor_dims, indices);
650 float_values[index] = scale * (buffer_values[index] - zp);
651 }
652 }
653 }
654 }
655
656 // Only have to deal with per-tensor for new parameters.
657 if (tensor->type == TensorType_INT16) {
658 std::vector<int16_t> requant_int16 = utils::SymmetricQuantizeFloatsToInt16(
659 float_values.data(), float_values.size(), new_quantization->scale[0]);
660 uint8_t* uint8_buffer = reinterpret_cast<uint8_t*>(requant_int16.data());
661 new_buffer_data.assign(uint8_buffer, uint8_buffer + buffer_data.size());
662 return kTfLiteOk;
663 } else if (tensor->type == TensorType_INT8) {
664 const int32_t q_min = std::numeric_limits<int8_t>::min();
665 const int32_t q_max = std::numeric_limits<int8_t>::max();
666 const float scaling_factor = new_quantization->scale[0];
667 const int32_t zp = new_quantization->zero_point[0];
668 const auto& rescale = [&scaling_factor, &zp, &q_min,
669 &q_max](float f) -> uint8_t {
670 const float scaling_factor_inv =
671 (scaling_factor == 0) ? 0 : 1.0 / scaling_factor;
672 int32_t q_i32 = TfLiteRound(f * scaling_factor_inv) + zp;
673 int8_t q = std::min(std::max(q_i32, q_min), q_max);
674 return *(reinterpret_cast<uint8_t*>(&q));
675 };
676 std::transform(float_values.begin(), float_values.end(),
677 new_buffer_data.begin(), rescale);
678 return kTfLiteOk;
679 }
680 return kTfLiteError;
681 }
682
683 // Apply constraints to ops if they have any.
684 // We have made the restriction that for int8 quantized concat, minimum, and
685 // maximum, the inputs and outputs must have the same scale and zero point.
686 // The other ones with constraints are handled in QuantizeWeightsAndInput.
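// A constant input with a single use is requantized in place to the output's
// scale and zero point; any other mismatching input gets a Quantize (requant)
// op inserted in front of it.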
TfLiteStatus ApplyConstraints(
688 ModelT* model, const std::unordered_set<string>& operator_names,
689 const std::unordered_set<string>& real_value_op_set,
690 TensorType activations_type, ErrorReporter* error_reporter) {
691 for (int subgraph_idx = 0; subgraph_idx < model->subgraphs.size();
692 subgraph_idx++) {
693 SubGraphT* subgraph = model->subgraphs.at(subgraph_idx).get();
694 // Iterate backward to avoid messing with index.
695 for (int op_idx = subgraph->operators.size() - 1; op_idx >= 0; op_idx--) {
696 OperatorT* op = subgraph->operators[op_idx].get();
697 if (op->outputs.empty()) {
698 continue;
699 }
700 const string operator_name = subgraph->tensors[op->outputs[0]]->name;
701 operator_property::OperatorProperty property =
702 GetOperatorProperty(operator_names, model, subgraph_idx, op_idx,
703 operator_name, activations_type);
704 if (!property.quantizable ||
705 !IsRealValueOp(real_value_op_set, operator_name)) {
706 continue;
707 }
708 TensorT* output_tensor = subgraph->tensors[op->outputs[0]].get();
709 if (!property.arbitrary_inputs ||
710 !property.restrict_same_input_output_scale(output_tensor->type)) {
711 continue;
712 }
      // If requantization is needed to satisfy the constraint, use the min of
      // the mins and the max of the maxes, which here means reusing the scale
      // and zero point of the output.
715 if (!utils::QuantizationParametersExist(output_tensor)) {
716 TF_LITE_REPORT_ERROR(
717 error_reporter,
718 "Unable to get scale or zero point from the tensor at %d.",
719 op->outputs[0]);
720 return kTfLiteError;
721 }
722 const float output_scale = output_tensor->quantization->scale[0];
723 const float output_zp = output_tensor->quantization->zero_point[0];
724 for (size_t input_idx = 0; input_idx < op->inputs.size(); ++input_idx) {
725 TensorT* input_tensor = subgraph->tensors[op->inputs[input_idx]].get();
726 if (!utils::QuantizationParametersExist(input_tensor)) {
727 TF_LITE_REPORT_ERROR(
728 error_reporter,
729 "Unable to get scale or zero point from tensor at %d.",
730 op->inputs[input_idx]);
731 return kTfLiteError;
732 }
733 if (input_tensor->quantization->scale[0] == output_scale &&
734 input_tensor->quantization->zero_point[0] == output_zp) {
735 // This input does not need to be requantized.
736 continue;
737 }
738
739 if (IsConstantWithOneUse(model, subgraph, op->inputs[input_idx])) {
740 auto quantization = std::make_unique<QuantizationParametersT>();
741 quantization->scale.push_back(output_scale);
742 quantization->zero_point.push_back(output_zp);
743 const std::vector<uint8_t>& buffer_data =
744 model->buffers[input_tensor->buffer]->data;
745 std::vector<uint8_t> new_buffer_data;
746 TfLiteStatus requant_status = kTfLiteError;
747 if (input_tensor->type == TensorType_INT8) {
748 requant_status = RequantizeConstant<int8_t>(
749 buffer_data, input_tensor, quantization, new_buffer_data);
750 } else if (input_tensor->type == TensorType_INT16) {
751 requant_status = RequantizeConstant<int16_t>(
752 buffer_data, input_tensor, quantization, new_buffer_data);
753 }
754 if (requant_status == kTfLiteOk) {
755 model->buffers[input_tensor->buffer]->data = new_buffer_data;
756 input_tensor->quantization = std::move(quantization);
757 continue;
          } else {
            quantization.reset();
          }
761 }
762
763 std::unique_ptr<TensorT> additional_tensor;
764 const string requant_tensor_name = input_tensor->name + "_requantized";
765 utils::MakeTensorWithQuantParam(
766 requant_tensor_name, input_tensor->shape,
767 input_tensor->shape_signature, activations_type, output_scale,
768 output_zp, &additional_tensor);
769 const int32_t additional_tensor_idx = subgraph->tensors.size();
770 subgraph->tensors.push_back(std::move(additional_tensor));
771
772 // Add requant op before this input.
        // A better approach would be to push the rescale upwards recursively,
        // in the hope that all upstream ops can absorb it, and only add a
        // requant op when there is no other way.
776 std::unique_ptr<OperatorT> requant_op;
777 utils::MakeQuantizeOperator(model, &requant_op, op->inputs[input_idx],
778 additional_tensor_idx);
779 op->inputs[input_idx] = additional_tensor_idx;
780
781 subgraph->operators.insert(subgraph->operators.begin() + op_idx,
782 std::move(requant_op));
783 }
784 }
785 }
786 return kTfLiteOk;
787 }
788
789 // In case of int16 activations, there are two implementations of kernels for
790 // ADD/SUB operators. We set the builtin option pot_scale_int16
// during quantization so that from now on only the general case
// implementation is used.
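// ("pot" refers to the power-of-two-scale variant of the int16 kernels;
// clearing the flag selects the general-scale implementation.)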
void SetOperatorPropertyADDSUBOperator(ModelT* model,
794 const TensorType& activations_type) {
795 if (activations_type != TensorType_INT16) {
796 // This is needed only in case of int16 activations.
797 return;
798 }
799
800 for (int subgraph_idx = 0, end = model->subgraphs.size(); subgraph_idx < end;
801 subgraph_idx++) {
802 SubGraphT* subgraph = model->subgraphs.at(subgraph_idx).get();
803 // Iterate backward to avoid messing with index.
804 for (int op_idx = subgraph->operators.size() - 1; op_idx >= 0; op_idx--) {
805 OperatorT* op = subgraph->operators[op_idx].get();
806 OperatorCodeT* op_code = model->operator_codes[op->opcode_index].get();
807 if (op_code && op_code->builtin_code == BuiltinOperator_ADD) {
808 {
809 auto* options = op->builtin_options.AsAddOptions();
810 if (options) {
811 options->pot_scale_int16 = false;
812 }
813 }
814 }
815 if (op_code && op_code->builtin_code == BuiltinOperator_SUB) {
816 {
817 auto* options = op->builtin_options.AsSubOptions();
818 if (options) {
819 options->pot_scale_int16 = false;
820 }
821 }
822 }
823 }
824 }
825 }
826
std::vector<std::pair<int, operator_property::TensorProperty>> GetInputs(
828 const OperatorT* op, operator_property::OperatorProperty property) {
829 std::vector<std::pair<int, operator_property::TensorProperty>> inputs;
830 if (property.arbitrary_inputs || !property.quantizable) {
831 for (int i = 0; i < op->inputs.size(); ++i) {
832 inputs.push_back({i, {}});
833 }
834 } else {
835 inputs = property.inputs;
836 }
837 return inputs;
838 }
839
std::vector<std::pair<int, operator_property::TensorProperty>> GetOutputs(
841 const OperatorT* op, operator_property::OperatorProperty property) {
842 std::vector<std::pair<int, operator_property::TensorProperty>> outputs;
843 if (property.arbitrary_outputs) {
844 for (int i = 0; i < op->outputs.size(); ++i) {
845 outputs.push_back({i, {}});
846 }
847 } else {
848 outputs = property.outputs;
849 }
850 return outputs;
851 }
852
bool ShouldRestrictSameInputOutputScale(
854 operator_property::OperatorProperty property, TensorType tensor_type) {
  // Ops with multiple inputs (e.g. concat, max, and min) get restricted in
  // ApplyConstraints.
857 return (!property.arbitrary_inputs &&
858 property.restrict_same_input_output_scale(tensor_type));
859 }
860
bool IsSubgraphInput(SubGraphT* subgraph, int32_t index) {
862 for (const int32_t input_idx : subgraph->inputs) {
863 if (index == input_idx) {
864 return true;
865 }
866 }
867 return false;
868 }
869
870 // Quantize the op input. Will increment op_idx if ops are added.
TfLiteStatus QuantizeOpInput(
872 ModelT* model, int32_t subgraph_idx, size_t* op_idx,
873 operator_property::OperatorProperty property,
874 const std::pair<int32_t, operator_property::TensorProperty>& input,
875 const TensorType& activations_type, ErrorReporter* error_reporter) {
876 int32_t input_idx = input.first;
877 operator_property::TensorProperty tensor_property = input.second;
878 SubGraphT* subgraph = model->subgraphs.at(subgraph_idx).get();
879 OperatorT* op = subgraph->operators[*op_idx].get();
880 const BuiltinOperator op_code =
881 GetBuiltinCode(model->operator_codes[op->opcode_index].get());
882 if (input_idx >= op->inputs.size()) {
883 TF_LITE_REPORT_ERROR(
884 error_reporter,
885 "Required input index %d is larger than the input length of op "
886 "%s at index %d in subgraph %d",
887 input_idx, op->inputs.size(), EnumNameBuiltinOperator(op_code), *op_idx,
888 subgraph_idx);
889 return kTfLiteError;
890 }
891 const int32_t tensor_idx = op->inputs[input_idx];
892 if (tensor_idx == -1) {
893 // Skip optional tensor.
894 return kTfLiteOk;
895 }
896 TensorT* tensor = subgraph->tensors[tensor_idx].get();
  // Assumes that if the tensor is already quantized, it is a weight that was
  // quantized to 8 bit.
898 const bool is_input_quantized = utils::QuantizationParametersExist(tensor);
899 if (property.quantizable && !is_input_quantized) {
900 // The operation is quantizable, but the input isn't yet quantized.
901 if (utils::HasBuffer(model, subgraph, tensor_idx)) {
902 // TODO(suharshs): Look at consumers, throw error if one consumer is
903 // per-channel and one per-layer.
904 bool quantize_const_input = false;
905 if (activations_type == TensorType_INT16 &&
906 (property.restrict_same_input_output_scale(tensor->type) ||
907 property.quantize_input_as_activations)) {
908 quantize_const_input = true;
909 }
910 if (tensor_property.number_of_bits == 8 && !quantize_const_input) {
911 if (tensor_property.use_derived_scale) {
912 // Currently 8bit tensors in input do not accept derived scale.
913 return kTfLiteError;
914 }
915 if (utils::QuantizeWeight(model, tensor, tensor_property.per_axis,
916 tensor_property.per_axis_index,
917 error_reporter) != kTfLiteOk) {
918 TF_LITE_REPORT_ERROR(
919 error_reporter,
920 "Unable to quantize buffer or min/max value for input %d "
921 "in op %s in subgraph %d, node: %d",
922 input_idx, EnumNameBuiltinOperator(op_code), subgraph_idx,
923 *op_idx);
924 return kTfLiteError;
925 }
926 } else if (tensor_property.number_of_bits == 16 || quantize_const_input) {
927 if (tensor_property.use_derived_scale) {
928 // Currently 16bit tensors in input do not accept derived scale.
929 return kTfLiteError;
930 }
931 TensorT* tensor = subgraph->tensors[tensor_idx].get();
932 int total_size = 1;
933 for (int i = 0; i < tensor->shape.size(); ++i) {
934 total_size *= tensor->shape[i];
935 }
936 BufferT* buffer = model->buffers[tensor->buffer].get();
937 float* float_data = reinterpret_cast<float*>(buffer->data.data());
938 auto minmax = std::minmax_element(float_data, float_data + total_size);
939 const float min = *minmax.first;
940 const float max = *minmax.second;
941 const float range = std::max(std::abs(min), std::abs(max));
942 // The narrow range quantized value for int16.
943 const float quantize_range = 32767.0;
944 const float scale = range / quantize_range;
945 return utils::SymmetricQuantizeFloatsToInt16(model, tensor, scale,
946 error_reporter);
947 } else if (tensor_property.number_of_bits == 32) {
948 if (!tensor_property.use_derived_scale) {
949 // Currently 32 bit tensors in input only accept derived scale.
950 return kTfLiteError;
951 }
952 TensorT* tensor = subgraph->tensors[tensor_idx].get();
953 const float scale = utils::GetEffectiveScale(
954 model, subgraph, *op_idx,
955 tensor_property.derived_scale.input_tensors,
956 tensor_property.derived_scale.intermediate_tensors,
957 tensor_property.derived_scale.factors);
958 return utils::SymmetricPerLayerBiasQuantize<std::int32_t>(
959 model, tensor, scale, error_reporter);
960
961 } else if (tensor_property.number_of_bits == 10) {
962 // When the number of bits is 10 (instead of 16), quantize the tensor to
963 // [-512, 512], instead of [-32767, 32767].
964 TensorT* tensor = subgraph->tensors[tensor_idx].get();
965 int total_size = 1;
966 for (int i = 0; i < tensor->shape.size(); ++i) {
967 total_size *= tensor->shape[i];
968 }
969 BufferT* buffer = model->buffers[tensor->buffer].get();
970 float* buffer_data = reinterpret_cast<float*>(buffer->data.data());
971 auto minmax =
972 std::minmax_element(buffer_data, buffer_data + total_size);
973 const float range =
974 std::max(std::abs(*minmax.first), std::abs(*minmax.second));
975 const float quantized_range = 512.0;
976 const float scale = range / quantized_range;
977 return utils::SymmetricQuantizeFloatsToInt16(model, tensor, scale,
978 error_reporter);
979 } else {
980 // Currently supports only 8, 16, 32, 10 bits.
981 TF_LITE_REPORT_ERROR(
982 error_reporter,
983 "Unable to quantize buffer or min/max value for input %d "
984 "in op %s in subgraph %d, node: %d",
985 input_idx, EnumNameBuiltinOperator(op_code), subgraph_idx, *op_idx);
986 return kTfLiteError;
987 }
988 } else if (utils::HasMinMax(tensor)) {
989 if (IsSubgraphInput(subgraph, tensor_idx) ||
990 tensor_property.state_tensor) {
991 if (tensor_property.number_of_bits == 8) {
992 if (tensor_property.use_derived_scale) {
993 // Currently 8bit tensors in input do not accept derived scale.
994 return kTfLiteError;
995 }
996 TF_LITE_ENSURE_STATUS(utils::QuantizeActivation(
997 tensor, activations_type, error_reporter));
998 } else if (tensor_property.number_of_bits == 16) {
999 TensorT* tensor = subgraph->tensors[tensor_idx].get();
1000 float quantized_range = 32767.0;
1001 float range = std::max(std::abs(tensor->quantization->min[0]),
1002 std::abs(tensor->quantization->max[0]));
1003 if (tensor_property.extend_to_power_of_two) {
1004 const int power_of_two_scale = utils::GetPowerOfTwoScale(
1005 tensor->quantization->min[0], tensor->quantization->max[0]);
1006 range = std::pow(2, power_of_two_scale); // NOLINT
1007 quantized_range = 32768.0;
1008 }
1009 const float scale = range / quantized_range;
1010 utils::QuantizeActivationToInt16(tensor, scale);
1011 }
1012 } else {
1013 // If the tensor is not a model input, we need to add a Quantize
1014 // operation since the preceding op may require a float output.
1015 std::string type_string =
1016 activations_type == TensorType_INT16 ? "int16" : "int8";
1017 std::unique_ptr<TensorT> op_output;
1018 utils::MakeTensor(tensor->name + "_" + type_string, tensor->shape,
1019 tensor->shape_signature, activations_type,
1020 &op_output);
1021 op_output->quantization = std::make_unique<QuantizationParametersT>();
1022 op_output->quantization->min.push_back(tensor->quantization->min[0]);
1023 op_output->quantization->max.push_back(tensor->quantization->max[0]);
1024 TF_LITE_ENSURE_STATUS(utils::QuantizeActivation(
1025 op_output.get(), activations_type, error_reporter));
1026 const int32_t quant_op_output_idx = subgraph->tensors.size();
1027 subgraph->tensors.push_back(std::move(op_output));
1028 std::unique_ptr<OperatorT> quant_op;
1029 utils::MakeQuantizeOperator(model, &quant_op, tensor_idx,
1030 quant_op_output_idx);
1031 subgraph->operators.insert(subgraph->operators.begin() + *op_idx,
1032 std::move(quant_op));
1033 op->inputs[input_idx] = quant_op_output_idx;
1034 *op_idx += 1;
1035 }
1036 } else {
1037 TF_LITE_REPORT_ERROR(error_reporter,
1038 "Unable to find buffer or min/max value for input "
1039 "%d in %s in subgraph %d, node: %d",
1040 input_idx, EnumNameBuiltinOperator(op_code),
1041 subgraph_idx, *op_idx);
1042 return kTfLiteError;
1043 }
1044 } else if (!property.quantizable && is_input_quantized) {
1045 // If the tensor is quantized, we have to add a Dequantize op after
1046 // since this op is not quantizable.
1047 std::unique_ptr<TensorT> op_output;
1048 utils::MakeTensor(tensor->name + "_float", tensor->shape,
1049 tensor->shape_signature, TensorType_FLOAT32, &op_output);
1050 const int32_t dequant_op_output_idx = subgraph->tensors.size();
1051 subgraph->tensors.push_back(std::move(op_output));
1052 std::unique_ptr<OperatorT> dequant_op;
1053 utils::MakeDequantizeOperator(model, &dequant_op, tensor_idx,
1054 dequant_op_output_idx);
1055 subgraph->operators.insert(subgraph->operators.begin() + *op_idx,
1056 std::move(dequant_op));
1057 op->inputs[input_idx] = dequant_op_output_idx;
1058 *op_idx += 1;
1059 }
1060 return kTfLiteOk;
1061 }
1062
1063 // Quantize the op output.
TfLiteStatus QuantizeOpOutput(
1065 ModelT* model, int32_t subgraph_idx, int32_t op_idx,
1066 operator_property::OperatorProperty property,
1067 const std::pair<int32_t, operator_property::TensorProperty>& output,
1068 TensorType activations_type, ErrorReporter* error_reporter) {
1069 int32_t output_idx = output.first;
1070 operator_property::TensorProperty tensor_property = output.second;
1071 // If the operator is not quantizable, we don't need to do anything for the
1072 // output.
1073 if (!property.quantizable) {
1074 return kTfLiteOk;
1075 }
1076 SubGraphT* subgraph = model->subgraphs.at(subgraph_idx).get();
1077 OperatorT* op = subgraph->operators[op_idx].get();
1078 const BuiltinOperator op_code =
1079 GetBuiltinCode(model->operator_codes[op->opcode_index].get());
1080 if (output_idx >= op->outputs.size()) {
1081 TF_LITE_REPORT_ERROR(
1082 error_reporter,
1083 "Required output index %d is larger than the output length of "
1084 "op %s at index %d in subgraph %d",
1085 output_idx, op->outputs.size(), EnumNameBuiltinOperator(op_code),
1086 op_idx, subgraph_idx);
1087 return kTfLiteError;
1088 }
1089
1090 TensorT* output_tensor = subgraph->tensors[op->outputs[output_idx]].get();
1091 if (utils::QuantizationParametersExist(output_tensor)) {
1092 // Skip output if it has been quantized.
1093 return kTfLiteOk;
1094 }
1095 if (ShouldRestrictSameInputOutputScale(property, output_tensor->type)) {
    // Copy the quantization parameters. For average pool, max pool, etc.,
    // min/max can be different, but we want them to be the same.
1098 // Get scale and zero point of input.
1099 if (property.inputs[0].first >= op->inputs.size()) {
1100 TF_LITE_REPORT_ERROR(
1101 error_reporter,
1102 "Required input index %d is larger than the input length of "
1103 "op %s at index %d in subgraph %d",
1104 property.inputs[0].first, op->inputs.size(),
1105 EnumNameBuiltinOperator(op_code), op_idx, subgraph_idx);
1106 return kTfLiteError;
1107 }
1108 const int input_tensor_idx = op->inputs[property.inputs[0].first];
1109 TensorT* input_tensor = subgraph->tensors[input_tensor_idx].get();
1110 if (input_tensor->quantization->scale.size() != 1 ||
1111 input_tensor->quantization->zero_point.size() != 1) {
1112 TF_LITE_REPORT_ERROR(error_reporter,
1113 "Invalid quantization params for op %s at index %d "
1114 "in subgraph %d",
1115 EnumNameBuiltinOperator(op_code), op_idx,
1116 subgraph_idx);
1117 return kTfLiteError;
1118 }
1119
1120 const float input_scale = input_tensor->quantization->scale[0];
1121 const int32_t input_zero_point = input_tensor->quantization->zero_point[0];
1122
1123 // Apply to output.
1124 output_tensor->quantization = std::make_unique<QuantizationParametersT>();
1125 output_tensor->quantization->scale.push_back(input_scale);
1126 output_tensor->quantization->zero_point.push_back(input_zero_point);
1127 if (!input_tensor->quantization->min.empty()) {
1128 const float min = input_tensor->quantization->min[0];
1129 output_tensor->quantization->min = {min};
1130 }
1131 if (!input_tensor->quantization->max.empty()) {
1132 const float max = input_tensor->quantization->max[0];
1133 output_tensor->quantization->max = {max};
1134 }
1135 output_tensor->type = activations_type;
1136 } else if (tensor_property.restriction) {
1137 const auto scale_and_zp = activations_type == TensorType_INT16
1138 ? tensor_property.restricted_value_int16
1139 : tensor_property.restricted_value_int8;
1140
1141 // Apply to output.
1142 output_tensor->quantization = std::make_unique<QuantizationParametersT>();
1143 output_tensor->quantization->scale.push_back(scale_and_zp.first);
1144 output_tensor->quantization->zero_point.push_back(scale_and_zp.second);
1145 output_tensor->type = activations_type;
1146 } else {
1147 // Process regular output that doesn't have any restrictions.
1148 if (utils::HasMinMax(output_tensor)) {
1149 utils::QuantizeActivation(output_tensor, activations_type,
1150 error_reporter);
1151 } else {
1152 TF_LITE_REPORT_ERROR(
1153 error_reporter,
1154 "Unable to find min/max value for output %d in %s in "
1155 "subgraph %d, node: %d",
1156 output_idx, EnumNameBuiltinOperator(op_code), subgraph_idx, op_idx);
1157 return kTfLiteError;
1158 }
1159 }
1160 return kTfLiteOk;
1161 }
1162
TfLiteStatus QuantizeIntermediateTensors(ModelT* model,
1164 TensorType activations_type,
1165 ErrorReporter* error_reporter) {
1166 for (size_t subgraph_idx = 0; subgraph_idx < model->subgraphs.size();
1167 subgraph_idx++) {
1168 SubGraphT* subgraph = model->subgraphs.at(subgraph_idx).get();
1169 for (size_t op_idx = 0; op_idx < subgraph->operators.size(); op_idx++) {
1170 operator_property::OperatorProperty property =
1171 operator_property::GetOperatorProperty(model, subgraph_idx, op_idx);
1172 if (!property.intermediates.empty()) {
1173 OperatorT* op = subgraph->operators[op_idx].get();
1174 const BuiltinOperator op_code =
1175 GetBuiltinCode(model->operator_codes[op->opcode_index].get());
1176 for (const std::pair<int, operator_property::TensorProperty>& input :
1177 property.intermediates) {
1178 const int index_local = input.first;
1179 const int index_global = op->intermediates[index_local];
1180 if (index_global == -1) {
1181 // Skip optional tensor.
1182 continue;
1183 }
1184 if (input.second.number_of_bits == 8 &&
1185 input.second.symmetric == false) {
1186 TensorT* tensor = subgraph->tensors[index_global].get();
1187 if (tensor->quantization == nullptr) {
1188 continue;
1189 }
1190 if (utils::HasMinMax(tensor)) {
1191 utils::QuantizeActivation(tensor, activations_type,
1192 error_reporter);
1193 } else {
1194 TF_LITE_REPORT_ERROR(error_reporter,
1195 "Unable to find min/max value for "
1196 "intermediate tensor %d in %s in "
1197 "subgraph %d, node: %d",
1198 index_local,
1199 EnumNameBuiltinOperator(op_code),
1200 subgraph_idx, op_idx);
1201 return kTfLiteError;
1202 }
1203 } else if (input.second.number_of_bits == 16 &&
1204 input.second.symmetric == true) {
1205 TensorT* tensor = subgraph->tensors[index_global].get();
1206 if (tensor->quantization == nullptr) {
1207 continue;
1208 }
1209 const float min = tensor->quantization->min[0];
1210 const float max = tensor->quantization->max[0];
1211 const float range = std::max(std::abs(min), std::abs(max));
1212 if (range < 1e-8) {
1213 return kTfLiteError;
1214 }
1215
1216 // Get scale and zero point.
1217 const float quantized_range = 32767.0;
1218 const float scale = range / quantized_range;
1219 utils::QuantizeActivationToInt16(tensor, scale);
1220 } else {
1221 return kTfLiteError;
1222 }
1223 }
1224 }
1225 }
1226 }
1227 return kTfLiteOk;
1228 }
1229
1230 // Quantize tensors that have shared range. For example, in LSTM, the output
1231 // tensor and input state tensor should share the same range because they are
1232 // using the same scale and zero point.
1233 // We have to model this explicitly because the output is modeled as an extra
// tensor in LSTM. In the calibrator, state tensors are logged both before and
// after the inference, so the range is fully captured. But the output,
// although it is identical to the activation, is not a state tensor, so the
// input value (range) of the very first inference is not captured.
TfLiteStatus QuantizeSharedRange(ModelT* model, ErrorReporter* error_reporter) {
1239 for (size_t subgraph_idx = 0; subgraph_idx < model->subgraphs.size();
1240 subgraph_idx++) {
1241 SubGraphT* subgraph = model->subgraphs.at(subgraph_idx).get();
1242 for (size_t op_idx = 0; op_idx < subgraph->operators.size(); op_idx++) {
1243 operator_property::OperatorProperty property =
1244 operator_property::GetOperatorProperty(model, subgraph_idx, op_idx);
1245 if (!property.intermediates.empty()) {
1246 OperatorT* op = subgraph->operators[op_idx].get();
1247 for (const std::vector<int>& input : property.restrict_scale) {
1248 if (input.empty()) {
1249 continue;
1250 }
          // Currently only two values are supported: the first one for the
          // input and the second one for the output.
1253 if (input.size() != 2) {
1254 return kTfLiteError;
1255 }
1256 const int index_1 = input[0];
1257 const int index_2 = input[1];
1258 TensorT* tensor_1 = subgraph->tensors[op->inputs[index_1]].get();
1259 TensorT* tensor_2 = subgraph->tensors[op->outputs[index_2]].get();
1260 const float min_of_min = std::min(tensor_1->quantization->min[0],
1261 tensor_2->quantization->min[0]);
1262 const float max_of_max = std::max(tensor_1->quantization->max[0],
1263 tensor_2->quantization->max[0]);
1264 if (min_of_min == 0.0 && max_of_max == 0.0) {
1265 return kTfLiteError;
1266 }
1267
          // Asymmetric quantization to 8 bit.
1269 auto quantization_params =
1270 std::make_unique<QuantizationParametersT>();
1271 utils::GetAsymmetricQuantizationParams(
1272 min_of_min, max_of_max, -128, 127, quantization_params.get());
1273
1274 // Populate both tensors with the same parameters.
1275 const float scale = quantization_params->scale[0];
1276 const int32 zero_point = quantization_params->zero_point[0];
1277 for (TensorT* tensor : {tensor_1, tensor_2}) {
1278 tensor->quantization = std::make_unique<QuantizationParametersT>();
1279 tensor->quantization->scale.push_back(scale);
1280 tensor->quantization->zero_point.push_back(zero_point);
1281 tensor->type = TensorType_INT8;
1282 }
1283 }
1284 }
1285 }
1286 }
1287 return kTfLiteOk;
1288 }
1289
1290 // Quantize a constant based on min/max quantization parameters for
1291 // resource assignments during initialization. Constant buffers should
1292 // have the same quantization parameters as assignments.
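// For int16 the constant is quantized symmetrically with
// scale = max(|min|, |max|) / 32767; for int8 it is quantized asymmetrically
// with q = round(f / scale) + zero_point, clamped to [-128, 127].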
TfLiteStatus QuantizeConstantVariable(ModelT* model,
1294 const TensorType& activations_type,
1295 TensorT* var_tensor,
1296 ErrorReporter* error_reporter) {
1297 if (activations_type == TensorType_INT16) {
1298 const float min = var_tensor->quantization->min[0];
1299 const float max = var_tensor->quantization->max[0];
1300 const float range = std::max(std::abs(min), std::abs(max));
1301 const float quantize_range = 32767.0;
1302 const float scale = range / quantize_range;
1303 return utils::SymmetricQuantizeFloatsToInt16(model, var_tensor, scale,
1304 error_reporter);
1305 } else if (activations_type == TensorType_INT8) {
1306 TF_LITE_ENSURE_STATUS(utils::QuantizeActivation(
1307 var_tensor, activations_type, error_reporter));
1308 QuantizationParametersT* quantization_params =
1309 var_tensor->quantization.get();
1310 const float scaling_factor = quantization_params->scale[0];
1311 const int zero_point = quantization_params->zero_point[0];
1312 const BufferT* buffer = model->buffers[var_tensor->buffer].get();
1313 const float* float_data =
1314 reinterpret_cast<const float*>(buffer->data.data());
1315 uint64_t num_elements;
1316 TF_LITE_ENSURE_STATUS(utils::NumElements(*var_tensor, &num_elements));
1317 const float scaling_factor_inv =
1318 (scaling_factor == 0) ? 0 : 1.0 / scaling_factor;
1319 std::vector<int8_t> quantized(num_elements);
1320 const int32_t kMinScale = std::numeric_limits<int8_t>::min();
1321 const int32_t kMaxScale = std::numeric_limits<int8_t>::max();
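    // Standard affine quantization, applied element-wise to the float buffer:
    // q = clamp(round(x / scale) + zero_point, -128, 127).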
1322 for (size_t i = 0; i < num_elements; i++) {
1323 const int32_t quantized_value = static_cast<int32_t>(
1324 TfLiteRound(float_data[i] * scaling_factor_inv) + zero_point);
1325 quantized[i] = std::min(kMaxScale, std::max(kMinScale, quantized_value));
1326 }
1327 uint8_t* uint8_buffer = reinterpret_cast<uint8_t*>(quantized.data());
1328 const size_t buffer_size = num_elements * sizeof(int8_t);
1329 model->buffers[var_tensor->buffer]->data.assign(uint8_buffer,
1330 uint8_buffer + buffer_size);
1331 return kTfLiteOk;
1332 }
1333 return kTfLiteError;
1334 }
1335
1336 using TensorResourceMap = std::map<std::pair<int, int>, std::string>;
1337 using ResourceMinMaxMap = std::map<std::string, std::pair<float, float>>;
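// TensorResourceMap keys are (subgraph index, tensor index) pairs mapping to a
// VAR_HANDLE shared name; ResourceMinMaxMap maps that shared name to the
// (min, max) range seen across all uses of the resource.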
1338 // Find min of mins, max of maxes for each variable read or assignment.
1339 void PopulateResourceMinMaxMap(ModelT* model,
1340 TensorResourceMap& tensor_resource_map,
1341 ResourceMinMaxMap& resource_min_max_map) {
1342 for (size_t subgraph_idx = 0; subgraph_idx < model->subgraphs.size();
1343 subgraph_idx++) {
1344 SubGraphT* subgraph = model->subgraphs.at(subgraph_idx).get();
1345 for (size_t op_idx = 0; op_idx < subgraph->operators.size(); op_idx++) {
1346 OperatorT* op = subgraph->operators[op_idx].get();
1347 const BuiltinOperator op_code =
1348 GetBuiltinCode(model->operator_codes[op->opcode_index].get());
1349 if (op_code == BuiltinOperator_VAR_HANDLE) {
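        // VAR_HANDLE produces the resource tensor; remember which shared name
        // its output refers to so later READ/ASSIGN ops can be matched to it.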
1350 const std::string& name =
1351 op->builtin_options.AsVarHandleOptions()->shared_name;
1352 resource_min_max_map.insert({name, {0.0, 0.0}});
1353 tensor_resource_map.insert({{subgraph_idx, op->outputs[0]}, name});
1354 }
1355 if ((op_code == BuiltinOperator_ASSIGN_VARIABLE) ||
1356 (op_code == BuiltinOperator_READ_VARIABLE)) {
1357 if (tensor_resource_map.find({subgraph_idx, op->inputs[0]}) ==
1358 tensor_resource_map.end()) {
1359 continue;
1360 }
1361 const std::string& name =
1362 tensor_resource_map[{subgraph_idx, op->inputs[0]}];
1363 TensorT* var_tensor;
1364 if (op_code == BuiltinOperator_ASSIGN_VARIABLE) {
1365 var_tensor = subgraph->tensors[op->inputs[1]].get();
1366 } else if (op_code == BuiltinOperator_READ_VARIABLE) {
1367 var_tensor = subgraph->tensors[op->outputs[0]].get();
1368 } else {
1369 continue;
1370 }
1371 if (!var_tensor->quantization ||
1372 var_tensor->quantization->min.empty() ||
1373 var_tensor->quantization->max.empty()) {
1374 continue;
1375 }
1376           // Resources are quantized per-tensor; only min[0]/max[0] are used.
1377 const float current_min = var_tensor->quantization->min[0];
1378 const float current_max = var_tensor->quantization->max[0];
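          // Widen any previously recorded range so that it covers every read
          // and assignment of this resource.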
1379 auto inserted =
1380 resource_min_max_map.insert({name, {current_min, current_max}});
1381 if (!inserted.second) {
1382 resource_min_max_map[name] = {
1383 std::min(inserted.first->second.first, current_min),
1384 std::max(inserted.first->second.second, current_max)};
1385 }
1386 }
1387 }
1388 }
1389 }
1390
1391 // Quantize resource variables. Each resource read and assign should have
1392 // identical quantization parameters.
1393 TfLiteStatus QuantizeResources(ModelT* model,
1394 const TensorType& activations_type,
1395 ErrorReporter* error_reporter) {
1396   // The shared name is only stored in the VAR_HANDLE operator, so use the
1397   // resource name map to map tensors to resource names.
1398 TensorResourceMap tensor_resource_map;
1399 ResourceMinMaxMap resource_min_max_map;
1400 PopulateResourceMinMaxMap(model, tensor_resource_map, resource_min_max_map);
1401 if (resource_min_max_map.empty()) {
1402 // No resources found, so this is OK.
1403 return kTfLiteOk;
1404 }
1405 // Update quantization parameters.
1406 for (size_t subgraph_idx = 0; subgraph_idx < model->subgraphs.size();
1407 subgraph_idx++) {
1408 SubGraphT* subgraph = model->subgraphs.at(subgraph_idx).get();
1409 for (size_t op_idx = 0; op_idx < subgraph->operators.size(); op_idx++) {
1410 OperatorT* op = subgraph->operators[op_idx].get();
1411 const BuiltinOperator op_code =
1412 GetBuiltinCode(model->operator_codes[op->opcode_index].get());
1413 if (op_code == BuiltinOperator_ASSIGN_VARIABLE ||
1414 op_code == BuiltinOperator_READ_VARIABLE) {
1415 if (tensor_resource_map.find({subgraph_idx, op->inputs[0]}) ==
1416 tensor_resource_map.end()) {
1417 continue;
1418 }
1419 const std::string& name =
1420 tensor_resource_map[{subgraph_idx, op->inputs[0]}];
1421 TensorT* var_tensor = nullptr;
1422 bool is_constant_assign = false;
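        // Only an assignment whose value comes from a constant buffer needs
        // its data quantized here; reads and non-constant assignments only
        // receive quantization parameters.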
1423 if (op_code == BuiltinOperator_ASSIGN_VARIABLE) {
1424 var_tensor = subgraph->tensors[op->inputs[1]].get();
1425 is_constant_assign = utils::HasBuffer(model, subgraph, op->inputs[1]);
1426 } else if (op_code == BuiltinOperator_READ_VARIABLE) {
1427 var_tensor = subgraph->tensors[op->outputs[0]].get();
1428 } else {
1429 continue;
1430 }
1431 if (resource_min_max_map.find(name) == resource_min_max_map.end()) {
1432 continue;
1433 }
1434 if (!var_tensor->quantization) {
1435 var_tensor->quantization =
1436 std::make_unique<QuantizationParametersT>();
1437 var_tensor->quantization->min.push_back(
1438 resource_min_max_map[name].first);
1439 var_tensor->quantization->max.push_back(
1440 resource_min_max_map[name].second);
1441 } else {
1442 var_tensor->quantization->min[0] = resource_min_max_map[name].first;
1443 var_tensor->quantization->max[0] = resource_min_max_map[name].second;
1444 }
1445 if (!is_constant_assign) {
1446 continue;
1447 }
1448 if (QuantizeConstantVariable(model, activations_type, var_tensor,
1449 error_reporter) != kTfLiteOk) {
1450 TF_LITE_REPORT_ERROR(
1451 error_reporter,
1452 "Unable to quantize buffer or min/max value for assignment "
1453 "in op %s in subgraph %d, node: %d",
1454 EnumNameBuiltinOperator(op_code), subgraph_idx, op_idx);
1455 return kTfLiteError;
1456 }
1457 }
1458 }
1459 }
1460 return kTfLiteOk;
1461 }
1462
1463 // Quantize inputs, outputs, and weights.
1464 // Because of ops such as LSTM, this is still done per op rather than per tensor.
1465 TfLiteStatus QuantizeWeightsInputOutput(
1466 ModelT* model, bool allow_float,
1467 const std::unordered_set<string>& operator_names,
1468 const std::unordered_set<string>& real_value_op_set,
1469 const TensorType& activations_type, bool disable_per_channel,
1470 ErrorReporter* error_reporter) {
1471 // Flag to track unsupported ops.
1472 bool quantization_not_supported = false;
1473
1474 // Loop over the graph and quantize ops.
1475 for (size_t subgraph_idx = 0; subgraph_idx < model->subgraphs.size();
1476 subgraph_idx++) {
1477 SubGraphT* subgraph = model->subgraphs.at(subgraph_idx).get();
1478 for (size_t op_idx = 0; op_idx < subgraph->operators.size(); op_idx++) {
1479 OperatorT* op = subgraph->operators[op_idx].get();
1480 const BuiltinOperator op_code =
1481 GetBuiltinCode(model->operator_codes[op->opcode_index].get());
1482 if (op->outputs.empty() && op_code != BuiltinOperator_ASSIGN_VARIABLE) {
1483 continue;
1484 }
1485 const string operator_name = op_code != BuiltinOperator_ASSIGN_VARIABLE
1486 ? subgraph->tensors[op->outputs[0]]->name
1487 : subgraph->tensors[op->inputs[0]]->name;
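      // ASSIGN_VARIABLE has no output tensor, so its property lookup is keyed
      // off its first input (the resource tensor) instead.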
1488 operator_property::OperatorProperty property = GetOperatorProperty(
1489 operator_names, model, subgraph_idx, op_idx, operator_name,
1490 activations_type, disable_per_channel);
1491 if (!IsRealValueOp(real_value_op_set, operator_name)) {
1492 continue;
1493 }
1494
1495 if (activations_type == TensorType_INT16 && !property.quantizable &&
1496 !allow_float) {
1497 TF_LITE_REPORT_ERROR(
1498 error_reporter,
1499 "Quantization to 16x8-bit not yet supported for op: '%s'.\n",
1500 EnumNameBuiltinOperator(op_code));
1501 quantization_not_supported = true;
1502 } else if (!property.quantizable && !allow_float) {
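        // A DEQUANTIZE that feeds a graph output is tolerated even when float
        // ops are otherwise disallowed: it only converts the quantized result
        // back to float for the caller.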
1503 if (op_code == BuiltinOperator_DEQUANTIZE &&
1504 std::find(subgraph->outputs.begin(), subgraph->outputs.end(),
1505 op->outputs[0]) != subgraph->outputs.end()) {
1506 continue;
1507 }
1508 TF_LITE_REPORT_ERROR(error_reporter,
1509 "Quantization not yet supported for op: '%s'.\n",
1510 EnumNameBuiltinOperator(op_code));
1511 quantization_not_supported = true;
1512 }
1513
1514 // Quantize operator inputs/weights.
1515 for (const std::pair<int, operator_property::TensorProperty>& input :
1516 GetInputs(op, property)) {
1517 TF_LITE_ENSURE_STATUS(QuantizeOpInput(model, subgraph_idx, &op_idx,
1518 property, input, activations_type,
1519 error_reporter));
1520 }
1521
1522 // Quantize operator outputs.
1523 for (const std::pair<int, operator_property::TensorProperty>& output :
1524 GetOutputs(op, property)) {
1525 TF_LITE_ENSURE_STATUS(
1526 QuantizeOpOutput(model, subgraph_idx, op_idx, property, output,
1527 activations_type, error_reporter));
1528 }
1529 }
1530 }
1531
1532   // Return an error if any unsupported op was encountered above.
1533 if (quantization_not_supported) {
1534 return kTfLiteError;
1535 }
1536 return kTfLiteOk;
1537 }
1538
1539 // Quantize bias.
1540 TfLiteStatus QuantizeBiases(ModelT* model,
1541 const std::unordered_set<string>& operator_names,
1542 const std::unordered_set<string>& real_value_op_set,
1543 const TensorType& activations_type,
1544 const TensorType& bias_type,
1545 bool disable_per_channel,
1546 ErrorReporter* error_reporter) {
1547 for (size_t subgraph_idx = 0; subgraph_idx < model->subgraphs.size();
1548 subgraph_idx++) {
1549 SubGraphT* subgraph = model->subgraphs.at(subgraph_idx).get();
1550 for (size_t op_idx = 0; op_idx < subgraph->operators.size(); op_idx++) {
1551 OperatorT* op = subgraph->operators[op_idx].get();
1552 const BuiltinOperator op_code =
1553 GetBuiltinCode(model->operator_codes[op->opcode_index].get());
1554 if (op->outputs.empty()) {
1555 continue;
1556 }
1557 const string operator_name = subgraph->tensors[op->outputs[0]]->name;
1558 operator_property::OperatorProperty property = GetOperatorProperty(
1559 operator_names, model, subgraph_idx, op_idx, operator_name,
1560 activations_type, disable_per_channel);
1561 if (!property.quantizable ||
1562 !IsRealValueOp(real_value_op_set, operator_name)) {
1563 continue;
1564 }
1565 for (const int bias_idx : property.biases) {
1566 if (bias_idx >= op->inputs.size() ||
1567 op->inputs[bias_idx] == kTfLiteOptionalTensor) {
1568 continue;
1569 }
1570           // Quantize the bias only if it has not already been quantized as
1571           // the output or input of another op.
1572 TensorT* bias_tensor = subgraph->tensors[op->inputs[bias_idx]].get();
1573 if (!utils::QuantizationParametersExist(bias_tensor)) {
1574 if (utils::HasBuffer(model, subgraph, op->inputs[bias_idx])) {
1575 if (property.inputs.size() != 2) {
1576               TF_LITE_REPORT_ERROR(
1577                   error_reporter,
1578                   "Expected the input length of "
1579                   "op %s at index %d in subgraph %d to be 2",
1580                   EnumNameBuiltinOperator(op_code), op_idx,
1581                   subgraph_idx);
1582 return kTfLiteError;
1583 }
1584 TensorT* input_tensor =
1585 subgraph->tensors[op->inputs[property.inputs[0].first]].get();
1586 TensorT* weight_tensor =
1587 subgraph->tensors[op->inputs[property.inputs[1].first]].get();
1588 operator_property::TensorProperty weight_property =
1589 property.inputs[1].second;
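            // The bias scale is derived from the input and weight scales
            // (input_scale * weight_scale, per channel for per-axis weights),
            // which is why both tensors are passed to QuantizeBias.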
1590 TF_LITE_ENSURE_STATUS(QuantizeBias(
1591 model, input_tensor, weight_tensor, bias_tensor,
1592 weight_property.per_axis, weight_property.per_axis_index,
1593 bias_type, error_reporter));
1594 }
1595 } else {
1596 // If bias is already quantized, make sure it is quantized to 32 bit.
1597 if (bias_tensor->type != TensorType_INT32) {
1598 TF_LITE_REPORT_ERROR(
1599 error_reporter,
1600 "Bias (\"%s\" at global index %d) of op \"%s\" at op_index %d "
1601 "in subgraph %d is expected to be quantized to INT32 but it is "
1602 "already quantized to %s.\n",
1603 bias_tensor->name.c_str(), op->inputs[bias_idx],
1604 operator_name.c_str(), op_idx, subgraph_idx,
1605 EnumNameTensorType(bias_tensor->type));
1606 }
1607 }
1608 }
1609 }
1610 }
1611 return kTfLiteOk;
1612 }
1613
1614 std::unordered_set<string> GetAllOperatorOutputs(ModelT* model) {
1615 std::unordered_set<string> operator_names;
1616 for (int32_t subgraph_idx = 0; subgraph_idx < model->subgraphs.size();
1617 subgraph_idx++) {
1618 SubGraphT* subgraph = model->subgraphs.at(subgraph_idx).get();
1619 for (int32_t tensor_idx = 0; tensor_idx < subgraph->tensors.size();
1620 tensor_idx++) {
1621 operator_names.insert(subgraph->tensors[tensor_idx]->name);
1622 }
1623 }
1624 return operator_names;
1625 }
1626 // Populate the min and max quantization parameters for input tensors.
1627 // Assumes that dynamic tensors already have recorded min/max values and
1628 // returns an error if a tensor has neither min/max quantization parameters
1629 // nor a buffer.
1630 // If a static tensor is not an input to any operation, its min/max values
1631 // will not be filled by this function.
1632 TfLiteStatus FillQuantizationParams(
1633 ModelT* model, const std::unordered_set<string>& operator_names,
1634 const std::unordered_set<string>& real_value_op_set,
1635 const TensorType& activations_type, bool disable_per_channel,
1636 ErrorReporter* error_reporter) {
1637 for (size_t subgraph_idx = 0; subgraph_idx < model->subgraphs.size();
1638 subgraph_idx++) {
1639 SubGraphT* subgraph = model->subgraphs.at(subgraph_idx).get();
1640 for (size_t op_idx = 0; op_idx < subgraph->operators.size(); op_idx++) {
1641 OperatorT* op = subgraph->operators[op_idx].get();
1642 operator_property::OperatorProperty property =
1643 operator_property::GetOperatorProperty(model, subgraph_idx, op_idx);
1644 if (!property.quantizable) {
1645 continue;
1646 }
1647 if (!op->outputs.empty()) {
1648 const string operator_name = subgraph->tensors[op->outputs[0]]->name;
1649 property = GetOperatorProperty(operator_names, model, subgraph_idx,
1650 op_idx, operator_name, activations_type,
1651 disable_per_channel);
1652 if (!IsRealValueOp(real_value_op_set, operator_name)) {
1653 continue;
1654 }
1655 }
1656
1657 // Populate max, min for each input tensor.
1658 for (const std::pair<int, operator_property::TensorProperty>& input :
1659 property.inputs) {
1660 // Get tensor.
1661 const int32_t input_idx = input.first;
1662 const int32_t tensor_idx = op->inputs[input_idx];
1663 if (tensor_idx == -1) {
1664 // Skip optional tensor.
1665 continue;
1666 }
1667 TensorT* tensor = subgraph->tensors[tensor_idx].get();
1668
1669 // Static tensor.
1670 if (!utils::HasMinMax(tensor) &&
1671 utils::HasBuffer(model, subgraph, tensor_idx)) {
1672 // Get input float data and tensor dimensions.
1673 const BufferT* buffer = model->buffers[tensor->buffer].get();
1674 const float* float_input_data =
1675 reinterpret_cast<const float*>(buffer->data.data());
1676
1677 if (tensor->quantization == nullptr) {
1678 tensor->quantization = std::make_unique<QuantizationParametersT>();
1679 }
1680
1681 // Fill per channel max and min with respect to channel_dim_index.
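          // Per-axis weights are expected to be 4-D here (e.g. convolution
          // filters); min/max are computed independently along the quantized
          // dimension.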
1682 if (input.second.per_axis) {
1683 if (tensor->shape.size() == 4) {
1684 int32_t channel_dim_index = input.second.per_axis_index;
1685 TF_LITE_ENSURE_STATUS(utils::FillPerChannelMinMax(
1686 float_input_data, tensor->shape, channel_dim_index,
1687 tensor->quantization.get(), error_reporter));
1688 } else {
1689 TF_LITE_REPORT_ERROR(
1690 error_reporter,
1691                     "Could not fill min/max for tensor: expected 4 "
1692                     "dimensions but got %d.",
1693 tensor->shape.size());
1694 return kTfLiteError;
1695 }
1696
1697 // Fill per layer max and min.
1698 } else if (!utils::HasMinMax(tensor) && !input.second.per_axis &&
1699 utils::HasBuffer(model, subgraph, tensor_idx)) {
1700 uint64_t input_size;
1701 TF_LITE_ENSURE_STATUS(utils::NumElements(*tensor, &input_size));
1702 utils::FillSingleMinMax(float_input_data, input_size,
1703 tensor->quantization.get());
1704 }
1705 if (tensor->quantization->quantized_dimension !=
1706 input.second.per_axis_index) {
1707 TF_LITE_REPORT_ERROR(
1708 error_reporter,
1709 "Quantized dimension for tensor property and quantization "
1710 "parameters do not match. Got %d and %d respectively.",
1711 input.second.per_axis_index,
1712 tensor->quantization->quantized_dimension);
1713 return kTfLiteError;
1714 }
1715
1716 // Dynamic tensor.
1717 } else if (!utils::HasMinMax(tensor) &&
1718 !utils::HasBuffer(model, subgraph, tensor_idx)) {
1719 TF_LITE_REPORT_ERROR(
1720 error_reporter,
1721 "Max and min for dynamic tensors should be"
1722 " recorded during calibration: Failed for tensor %s\n",
1723 tensor->name.c_str());
1724 if (tensor->quantization == nullptr) {
1725 TF_LITE_REPORT_ERROR(error_reporter,
1726 "No quantization params for tensor %s",
1727 tensor->name.c_str());
1728 } else if (tensor->quantization->min.empty() ||
1729 tensor->quantization->max.empty()) {
1730 TF_LITE_REPORT_ERROR(error_reporter, "Empty min/max for tensor %s",
1731 tensor->name.c_str());
1732 }
1733 return kTfLiteError;
1734 }
1735
1736 if (utils::QuantizationParametersExist(tensor)) {
1737 TF_LITE_REPORT_ERROR(
1738 error_reporter,
1739 "Scale and zero points should not be recorded before "
1740 "quantization.");
1741 return kTfLiteError;
1742 }
1743 } // loop over op inputs
1744 } // loop over ops
1745 } // loop over subgraphs
1746 return kTfLiteOk;
1747 }
1748
1749 // Check compatibility of activation, weight and bias scales. Adjust if needed.
1750 TfLiteStatus EnsureBiasScaleCompatibility(
1751 ModelT* model, const std::unordered_set<string>& operator_names,
1752 const std::unordered_set<string>& real_value_op_set,
1753 const TensorType& activations_type, bool disable_per_channel,
1754 ErrorReporter* error_reporter) {
1755 for (size_t subgraph_idx = 0; subgraph_idx < model->subgraphs.size();
1756 subgraph_idx++) {
1757 SubGraphT* subgraph = model->subgraphs.at(subgraph_idx).get();
1758 for (size_t op_idx = 0; op_idx < subgraph->operators.size(); op_idx++) {
1759 OperatorT* op = subgraph->operators[op_idx].get();
1760 if (op->outputs.empty()) {
1761 continue;
1762 }
1763 const string operator_name = subgraph->tensors[op->outputs[0]]->name;
1764 operator_property::OperatorProperty property = GetOperatorProperty(
1765 operator_names, model, subgraph_idx, op_idx, operator_name,
1766 activations_type, disable_per_channel);
1767 if (!IsRealValueOp(real_value_op_set, operator_name)) {
1768 continue;
1769 }
1770
1771 // Loop over all bias tensors.
1772 for (const int bias_idx : property.biases) {
1773 if (bias_idx >= op->inputs.size() ||
1774 op->inputs[bias_idx] == kTfLiteOptionalTensor) {
1775 continue;
1776 }
1777 TensorT* bias_tensor = subgraph->tensors[op->inputs[bias_idx]].get();
1778           if (bias_tensor->shape.size() != 1) {
1779             TF_LITE_REPORT_ERROR(error_reporter,
1780                                  "Expected bias tensor to be a vector.");
1781             return kTfLiteError;
1782           }
1783           int32_t channel_dim_size = bias_tensor->shape[0];
1784
1785 if (property.inputs.size() != 2) { // Only works for two input tensors.
1786             TF_LITE_REPORT_ERROR(
1787                 error_reporter,
1788                 "Expected op %s at index %d in subgraph %d to have 2 inputs, got %d",
1789                 operator_name.c_str(), op_idx, subgraph_idx, property.inputs.size());
1790 return kTfLiteError;
1791 }
1792
1793 if (!property.arbitrary_inputs && property.quantizable) {
1794 // Get input and weight tensors.
1795 TensorT* input_tensor =
1796 subgraph->tensors[op->inputs[property.inputs[0].first]].get();
1797 TensorT* weight_tensor =
1798 subgraph->tensors[op->inputs[property.inputs[1].first]].get();
1799 operator_property::TensorProperty weight_property =
1800 property.inputs[1].second;
1801 TF_LITE_ENSURE(error_reporter, input_tensor->quantization);
1802
1803 // Check quantization parameters exist for input.
1804 if (!utils::HasMinMax(input_tensor)) {
1805 TF_LITE_REPORT_ERROR(
1806 error_reporter,
1807 "Input tensor missing quantization information. Should be "
1808 "populated during calibration.");
1809 return kTfLiteError;
1810 }
1811
1812 // Get input scale for asymmetric quantization.
1813 QuantizationParametersT temp_quant_params = QuantizationParametersT();
1814 TF_LITE_ENSURE_STATUS(
1815 utils::GetQuantizationParams(input_tensor, activations_type,
1816 &temp_quant_params, error_reporter));
1817 if (temp_quant_params.scale.size() != 1) {
1818 TF_LITE_REPORT_ERROR(error_reporter,
1819 "Unexpected input quantization scale size.");
1820 return kTfLiteError;
1821 }
1822 float input_scale = temp_quant_params.scale[0];
1823
1824 // Check that max/min values have been filled for weights.
1825 if (!utils::HasMinMax(weight_tensor)) {
1826 TF_LITE_REPORT_ERROR(
1827 error_reporter,
1828 "Min and/or max values have not been recorded for weight "
1829 "tensor. This should have happened in FillQuantizationParams.");
1830 return kTfLiteError;
1831 }
1832
1833 // Ensure the tensor dimensions are compatible.
1834 if (weight_property.per_axis) {
1835 if (bias_tensor->shape[0] !=
1836 weight_tensor->shape[weight_property.per_axis_index]) {
1837 TF_LITE_REPORT_ERROR(
1838 error_reporter,
1839 "Channel mismatch between bias and weight tensors %d vs %d",
1840 bias_tensor->shape[0],
1841 weight_tensor->shape[weight_property.per_axis_index]);
1842 return kTfLiteError;
1843 }
1844 // Ensure that the number of max/mins matches the channel_dim_size.
1845 if (weight_tensor->quantization->max.size() != channel_dim_size) {
1846 TF_LITE_REPORT_ERROR(
1847 error_reporter,
1848                     "Mismatch between number of weight maxes and channels: %d vs "
1849 "%d",
1850 weight_tensor->quantization->max.size(), channel_dim_size);
1851 return kTfLiteError;
1852 }
1853 if (weight_tensor->quantization->min.size() != channel_dim_size) {
1854 TF_LITE_REPORT_ERROR(
1855 error_reporter,
1856                     "Mismatch between number of weight mins and channels: %d vs %d",
1857                     weight_tensor->quantization->min.size(), channel_dim_size);
1858 return kTfLiteError;
1859 }
1860 }
1861
1862 // Get data and size of bias tensor.
1863 const BufferT* buffer = model->buffers[bias_tensor->buffer].get();
1864 const float* bias_data =
1865 reinterpret_cast<const float*>(buffer->data.data());
1866 uint64_t bias_size;
1867 TF_LITE_ENSURE_STATUS(utils::NumElements(*bias_tensor, &bias_size));
1868
1869 // Adjust weight scales if needed.
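          // Roughly: if quantizing any bias value with
          // input_scale * weight_scale would not fit comfortably in int32, the
          // weight min/max for the affected channel are widened so that the
          // resulting bias scale becomes large enough.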
1870 TF_LITE_ENSURE_STATUS(utils::AdjustWeightsForBiasScale(
1871 weight_tensor->quantization.get(), bias_data, bias_size,
1872 input_scale, error_reporter));
1873
1874 if (utils::QuantizationParametersExist(weight_tensor)) {
1875 TF_LITE_REPORT_ERROR(
1876 error_reporter,
1877 "Scale and zero points should not be recorded for the weight "
1878 "tensor before quantization.");
1879 return kTfLiteError;
1880 }
1881 if (utils::QuantizationParametersExist(input_tensor)) {
1882 TF_LITE_REPORT_ERROR(
1883 error_reporter,
1884 "Scale and zero points should not be recorded for the input "
1885 "tensor before quantization.");
1886 return kTfLiteError;
1887 }
1888 }
1889 }
1890 }
1891 }
1892 return kTfLiteOk;
1893 }
1894
1895 } // namespace
1896
1897 // Assumes that the operators in the model have been topologically sorted.
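// The passes below run in a fixed order: fill min/max from constants and
// calibration data, make weight/bias scales compatible, quantize intermediate
// tensors, shared ranges and resources, then weights/inputs/outputs, apply
// op-specific constraints, quantize biases, and finally set the requested
// input/output types before re-serializing the model.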
1898 TfLiteStatus QuantizeModel(flatbuffers::FlatBufferBuilder* builder,
1899 ModelT* model, const TensorType& input_type,
1900 const TensorType& output_type, bool allow_float,
1901 const std::unordered_set<string>& operator_names,
1902 const TensorType& activations_type,
1903 const TensorType& bias_type,
1904 bool disable_per_channel,
1905 ErrorReporter* error_reporter) {
1906 auto real_value_op_set =
1907 PopulateRealValueOpSet(model, operator_names, activations_type);
1908 TF_LITE_ENSURE_STATUS(DuplicateBiasesWithMultipleUses(model, error_reporter));
1909 TF_LITE_ENSURE_STATUS(FillQuantizationParams(
1910 model, operator_names, real_value_op_set, activations_type,
1911 disable_per_channel, error_reporter));
1912 TF_LITE_ENSURE_STATUS(EnsureBiasScaleCompatibility(
1913 model, operator_names, real_value_op_set, activations_type,
1914 disable_per_channel, error_reporter));
1915 TF_LITE_ENSURE_STATUS(
1916 QuantizeIntermediateTensors(model, activations_type, error_reporter));
1917 TF_LITE_ENSURE_STATUS(QuantizeSharedRange(model, error_reporter));
1918 TF_LITE_ENSURE_STATUS(
1919 QuantizeResources(model, activations_type, error_reporter));
1920 TF_LITE_ENSURE_STATUS(QuantizeWeightsInputOutput(
1921 model, allow_float, operator_names, real_value_op_set, activations_type,
1922 disable_per_channel, error_reporter));
1923 TF_LITE_ENSURE_STATUS(ApplyConstraints(model, operator_names,
1924 real_value_op_set, activations_type,
1925 error_reporter));
1926 TF_LITE_ENSURE_STATUS(QuantizeBiases(model, operator_names, real_value_op_set,
1927 activations_type, bias_type,
1928 disable_per_channel, error_reporter));
1929 utils::SetOperatorCodeVersion(model);
1930 TF_LITE_ENSURE_STATUS(SetInputAndOutputTypes(
1931 model, input_type, output_type, activations_type, error_reporter));
1932 SetOperatorPropertyADDSUBOperator(model, activations_type);
1933 flatbuffers::Offset<Model> output_model_location =
1934 Model::Pack(*builder, model);
1935 FinishModelBuffer(*builder, output_model_location);
1936
1937 return kTfLiteOk;
1938 }
1939
1940 // Assumes that the operators in the model have been topologically sorted.
1941 TfLiteStatus QuantizeModel(flatbuffers::FlatBufferBuilder* builder,
1942 ModelT* model, const TensorType& input_type,
1943 const TensorType& output_type, bool allow_float,
1944 const std::unordered_set<string>& operator_names,
1945 const TensorType& activations_type,
1946 const TensorType& bias_type,
1947 ErrorReporter* error_reporter) {
1948 return QuantizeModel(builder, model, input_type, output_type, allow_float,
1949 operator_names, activations_type,
1950 /*bias_type=*/bias_type,
1951 /*disable_per_channel=*/false, error_reporter);
1952 }
1953
1954 TfLiteStatus QuantizeModelAllOperators(
1955 flatbuffers::FlatBufferBuilder* builder, ModelT* model,
1956 const TensorType& input_type, const TensorType& output_type,
1957 bool allow_float, const TensorType& activations_type,
1958 const TensorType& bias_type, ErrorReporter* error_reporter) {
1959 return QuantizeModel(builder, model, input_type, output_type, allow_float,
1960 GetAllOperatorOutputs(model), activations_type,
1961 bias_type,
1962 /*disable_per_channel=*/false, error_reporter);
1963 }
1964
1965 TfLiteStatus QuantizeModelAllOperators(
1966 flatbuffers::FlatBufferBuilder* builder, ModelT* model,
1967 const TensorType& input_type, const TensorType& output_type,
1968 bool allow_float, const TensorType& activations_type,
1969 const TensorType& bias_type, bool disable_per_channel,
1970 ErrorReporter* error_reporter) {
1971 return QuantizeModel(builder, model, input_type, output_type, allow_float,
1972 GetAllOperatorOutputs(model), activations_type,
1973 bias_type, disable_per_channel, error_reporter);
1974 }
1975
1976 TfLiteStatus QuantizeModel(flatbuffers::FlatBufferBuilder* builder,
1977 ModelT* model, const TensorType& input_type,
1978 const TensorType& output_type, bool allow_float,
1979 ErrorReporter* error_reporter) {
1980 return QuantizeModel(builder, model, input_type, output_type, allow_float,
1981 GetAllOperatorOutputs(model),
1982 /*activations_type=*/TensorType_INT8,
1983 /*bias_type=*/TensorType_INT32, error_reporter);
1984 }
1985
1986 TfLiteStatus QuantizeModel(flatbuffers::FlatBufferBuilder* builder,
1987 ModelT* model, const TensorType& input_type,
1988 const TensorType& output_type,
1989 ErrorReporter* error_reporter) {
1990 return QuantizeModel(builder, model, input_type, output_type,
1991 /*allow_float=*/false, error_reporter);
1992 }
1993
1994 TfLiteStatus QuantizeModel(flatbuffers::FlatBufferBuilder* builder,
1995 ModelT* model, ErrorReporter* error_reporter) {
1996 return QuantizeModel(builder, model, TensorType_FLOAT32, TensorType_FLOAT32,
1997 /*allow_float=*/false, error_reporter);
1998 }
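// A minimal usage sketch (hypothetical caller code, not part of this file):
// unpack a .tflite flatbuffer into a ModelT, quantize it, and read the result
// back out of the FlatBufferBuilder.
//
//   auto fb_model = tflite::FlatBufferModel::BuildFromFile("model.tflite");
//   std::unique_ptr<tflite::ModelT> model(fb_model->GetModel()->UnPack());
//   flatbuffers::FlatBufferBuilder builder;
//   tflite::StderrReporter reporter;
//   if (tflite::optimize::QuantizeModel(&builder, model.get(), &reporter) ==
//       kTfLiteOk) {
//     // builder.GetBufferPointer() / builder.GetSize() hold the quantized
//     // model.
//   }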
1999
2000 } // namespace optimize
2001 } // namespace tflite
2002