/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#include "tensorflow/lite/tools/optimize/quantization_utils.h"

#include <algorithm>
#include <cmath>
#include <cstdint>
#include <iostream>
#include <limits>
#include <memory>
#include <string>
#include <type_traits>
#include <vector>

#include "absl/memory/memory.h"
#include "third_party/eigen3/Eigen/Core"
#include "tensorflow/lite/c/common.h"
#include "tensorflow/lite/core/api/error_reporter.h"
#include "tensorflow/lite/kernels/internal/cppmath.h"
#include "tensorflow/lite/kernels/internal/quantization_util.h"
#include "tensorflow/lite/kernels/internal/tensor_utils.h"
#include "tensorflow/lite/kernels/internal/types.h"
#include "tensorflow/lite/minimal_logging.h"
#include "tensorflow/lite/schema/schema_generated.h"
#include "tensorflow/lite/tools/optimize/model_utils.h"
namespace tflite {
namespace optimize {
namespace utils {

namespace {
const int8_t kMinQuantizedValue = -127;
const int8_t kMaxQuantizedValue = 127;

// The maximum number of dimensions supported in per-channel quantization.
constexpr int kPerChannelMaxDim = 4;
}  // namespace

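// Returns the number of elements implied by the tensor's shape, failing if
// any dimension is non-positive or if the product would overflow uint64_t.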
TfLiteStatus NumElements(const TensorT& tensor, uint64_t* num_elements) {
  *num_elements = 1;
  for (const int64_t dim : tensor.shape) {
    if (dim <= 0 || *num_elements > UINT64_MAX / static_cast<uint64_t>(dim)) {
      return kTfLiteError;
    }
    *num_elements *= dim;
  }
  return kTfLiteOk;
}

// Nudge min and max so that floating point 0 falls exactly on a quantized
// value, returning the nudged scale and zero_point.
//
// Although this code originates from FakeQuantization in quantized training,
// we may deviate from that implementation as we please since we do not fine
// tune the weights with quantized training.
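//
// For example, with min = -1.0f, max = 2.0f and the int8 range [-128, 127],
// scale = 3.0f / 255 (about 0.0118) and
// zero_point = round(-128 - (-1.0f) / scale) = -43, so the float value 0.0
// corresponds exactly to the quantized value -43.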
void GetAsymmetricQuantizationParams(
    float min, float max, const int quant_min, const int quant_max,
    QuantizationParametersT* quantization_params) {
  const float quant_min_float = static_cast<float>(quant_min);
  const float quant_max_float = static_cast<float>(quant_max);
  // Adjust the boundaries to guarantee 0 is included.
  min = std::min(static_cast<float>(min), 0.0f);
  max = std::max(static_cast<float>(max), 0.0f);
  const float scale = (max - min) / (quant_max_float - quant_min_float);
  // Scale can be zero if min and max are exactly 0.0f.
  float zero_point_from_min = quant_min_float;
  if (scale != 0) {
    zero_point_from_min = quant_min_float - min / scale;
  }
  int64_t zero_point;
  if (zero_point_from_min < quant_min_float) {
    zero_point = static_cast<int64_t>(quant_min);
  } else if (zero_point_from_min > quant_max_float) {
    zero_point = static_cast<int64_t>(quant_max);
  } else {
    zero_point = static_cast<int64_t>(std::round(zero_point_from_min));
  }
  quantization_params->min = std::vector<float>(1, min);
  quantization_params->max = std::vector<float>(1, max);
  quantization_params->scale = std::vector<float>(1, scale);
  quantization_params->zero_point = std::vector<int64_t>(1, zero_point);
}

void GetSymmetricQuantizationParams(
    float min, float max, const int half_quant_range,
    QuantizationParametersT* quantization_params) {
  // Adjust the boundaries to guarantee 0 is included.
  min = std::min(min, 0.0f);
  max = std::max(max, 0.0f);
  const float scale = std::max(std::abs(max), std::abs(min)) / half_quant_range;
  quantization_params->min = std::vector<float>(1, min);
  quantization_params->max = std::vector<float>(1, max);
  quantization_params->scale = std::vector<float>(1, scale);
  quantization_params->zero_point = std::vector<int64_t>(1, 0);
}

TfLiteStatus GetQuantizationParams(TensorT* tensor, TensorType activations_type,
                                   QuantizationParametersT* quantization_params,
                                   ErrorReporter* error_reporter) {
  if (activations_type == TensorType_INT8) {
    GetAsymmetricQuantizationParams(
        tensor->quantization->min[0], tensor->quantization->max[0],
        std::numeric_limits<int8_t>::min(), std::numeric_limits<int8_t>::max(),
        quantization_params);
  } else if (activations_type == TensorType_INT16) {
    const int half_quantized_range = 32767;
    GetSymmetricQuantizationParams(tensor->quantization->min[0],
                                   tensor->quantization->max[0],
                                   half_quantized_range, quantization_params);
  } else {
    TF_LITE_REPORT_ERROR(
        error_reporter,
        "Unsupported activation type for quantize-activation: %d",
        activations_type);
    return kTfLiteError;
  }
  return kTfLiteOk;
}

// Set the max and min quantization parameter for a single tensor given its
// values.
void FillSingleMinMax(const float* const input, const uint64_t input_size,
                      QuantizationParametersT* quantization_params) {
  const auto minmax = std::minmax_element(input, input + input_size);
  quantization_params->min.assign(1, *minmax.first);
  quantization_params->max.assign(1, *minmax.second);
}

TfLiteStatus FillPerChannelMinMax(const float* const input,
                                  const std::vector<int32_t>& dimension,
                                  int32_t channel_dim_index,
                                  QuantizationParametersT* quantization_params,
                                  ErrorReporter* error_reporter) {
  if (!quantization_params->min.empty() || !quantization_params->max.empty()) {
    TF_LITE_REPORT_ERROR(
        error_reporter,
        "Min or max already present in tensor quantization params.");
    return kTfLiteError;
  }

  if (dimension.size() > kPerChannelMaxDim) {
    TF_LITE_REPORT_ERROR(
        error_reporter,
        "Expected tensor with less than %d dimensions, but got %d.",
        kPerChannelMaxDim + 1, dimension.size());
    return kTfLiteError;
  }
  if (channel_dim_index >= dimension.size()) {
    TF_LITE_REPORT_ERROR(
        error_reporter,
        "Expected channel_dim_index to be less than %d, but got %d.",
        dimension.size(), channel_dim_index);
    return kTfLiteError;
  }

  const int32_t channel_dim_size = dimension[channel_dim_index];
  quantization_params->quantized_dimension = channel_dim_index;
  quantization_params->min = std::vector<float>(channel_dim_size);
  quantization_params->max = std::vector<float>(channel_dim_size);
  std::vector<bool> has_min_max_value(channel_dim_size, false);
  int indices[kPerChannelMaxDim];
  RuntimeShape unextended_tensor_dims(dimension.size(), dimension.data());
  RuntimeShape tensor_dims =
      RuntimeShape::ExtendedShape(kPerChannelMaxDim, unextended_tensor_dims);
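  // ExtendedShape left-pads the shape with 1s up to kPerChannelMaxDim
  // dimensions, so shift the channel index by the number of padded dimensions.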
174 channel_dim_index +=
175 kPerChannelMaxDim - unextended_tensor_dims.DimensionsCount();
176
177 // Compute min max ranges per channel
178 for (indices[0] = 0; indices[0] < tensor_dims.Dims(0); indices[0]++) {
179 for (indices[1] = 0; indices[1] < tensor_dims.Dims(1); indices[1]++) {
180 for (indices[2] = 0; indices[2] < tensor_dims.Dims(2); indices[2]++) {
181 for (indices[3] = 0; indices[3] < tensor_dims.Dims(3); indices[3]++) {
182 int channel_idx = indices[channel_dim_index];
183 const float val = input[Offset(tensor_dims, indices)];
184 if (has_min_max_value[channel_idx]) {
185 if (quantization_params->min[channel_idx] > val) {
186 quantization_params->min[channel_idx] = val;
187 } else if (quantization_params->max[channel_idx] < val) {
188 quantization_params->max[channel_idx] = val;
189 }
190 } else {
191 quantization_params->min[channel_idx] = val;
192 quantization_params->max[channel_idx] = val;
193 has_min_max_value[channel_idx] = true;
194 }
195 }
196 }
197 }
198 }
199 return kTfLiteOk;
200 }

// Populates the scales vector based on max and min values of quant_params
TfLiteStatus GetSymmetricScalesFromMaxMin(QuantizationParametersT* quant_params,
                                          std::vector<float>* scales,
                                          ErrorReporter* error_reporter) {
  // Check that max and min values are present and their sizes match.
  if (quant_params->min.empty() || quant_params->max.empty()) {
    TF_LITE_REPORT_ERROR(error_reporter,
                         "Max and min values are not populated.");
    return kTfLiteError;
  }
  if (quant_params->min.size() != quant_params->max.size()) {
    TF_LITE_REPORT_ERROR(error_reporter,
                         "Dimensions of max and min values do not match.");
    return kTfLiteError;
  }
  if (scales->size() != quant_params->min.size()) {
    TF_LITE_REPORT_ERROR(error_reporter,
                         "Provided scale vector has incorrect size.");
    return kTfLiteError;
  }

  // num_channels is calculated from min.size() to infer whether quantization
  // is per axis.
  int num_channels = quant_params->min.size();
  // Calculate scales per channel.
  for (int channel_idx = 0; channel_idx < num_channels; ++channel_idx) {
    const float half_range = std::max(std::abs(quant_params->min[channel_idx]),
                                      std::abs(quant_params->max[channel_idx]));
    scales->at(channel_idx) = half_range / kMaxQuantizedValue;
  }
  return kTfLiteOk;
}

// Checks that the bias is quantized to within the middle half of the
// allowable bit range determined by the scales of the input and weight
// tensors. If this condition is not satisfied, the scale of the weights is
// increased in order to prevent overflow. The scale of the bias is not set
// here, only the min/max.
// The quant_params are the quantization parameters that correspond to the
// weight tensor.
TfLiteStatus AdjustWeightsForBiasScale(QuantizationParametersT* quant_params,
                                       const float* bias_data,
                                       const size_t bias_size,
                                       const float input_scale,
                                       ErrorReporter* error_reporter) {
  // TODO(dmolitor) Allow adjusting activation scale.
  // TODO(dmolitor) Tighten scale adjustment.
  // TODO(dmolitor) Test using a separate strategy for scales of 0.
  const int32_t kScale = std::numeric_limits<int32_t>::max();
  if (quant_params == nullptr) {
    TF_LITE_REPORT_ERROR(error_reporter,
                         "Missing max and min values for weight tensor.");
    return kTfLiteError;
  }
  // channel_dim_size is calculated from min.size() to infer whether
  // quantization is per axis
  int channel_dim_size = quant_params->min.size();
  if (channel_dim_size == 0) {
    TF_LITE_REPORT_ERROR(
        error_reporter,
        "Missing weight scales. Unable to check compatibility with bias "
        "scale.");
    return kTfLiteError;
  }

  std::vector<float> weight_scales(channel_dim_size);
  TF_LITE_ENSURE_STATUS(GetSymmetricScalesFromMaxMin(
      quant_params, &weight_scales, error_reporter));

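  // The bias is later quantized with scale input_scale * weight_scale into a
  // 32-bit integer, so we require |bias| < 0.5 * input_scale * weight_scale *
  // kScale. When the check fails, the new weight scale is
  // 2 * |bias| / (kScale * input_scale), and the adjusted max is that scale
  // multiplied by kMaxQuantizedValue.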
  // Per channel quantization
  if (channel_dim_size > 1) {
    for (int i = 0; i < channel_dim_size; ++i) {
      // Current scale is not compatible with bias. Adjust max/min values.
      if (std::abs(bias_data[i]) >=
          0.5 * input_scale * weight_scales[i] * kScale) {
        quant_params->max[i] = 2.0 * std::abs(bias_data[i]) / kScale *
                               (kMaxQuantizedValue / input_scale);
        quant_params->min[i] = -quant_params->max[i];
      }
    }
    // Per layer quantization
  } else if (channel_dim_size == 1) {
    const auto minmax = std::minmax_element(bias_data, bias_data + bias_size);
    const float bias_half_range =
        std::max(std::abs(*minmax.first), std::abs(*minmax.second));

    // Need to adjust weight min/max; not compatible with bias.
    if (bias_half_range / kScale >= 0.5 * input_scale * weight_scales[0]) {
      quant_params->min[0] =
          2.0 * bias_half_range / kScale * (kMinQuantizedValue / input_scale);
      quant_params->max[0] =
          2.0 * bias_half_range / kScale * (kMaxQuantizedValue / input_scale);
    }
  }
  return kTfLiteOk;
}

// Per-channel quantizes the tensor at the given channel dimension index and
// fills both the scales and the quantized values.
TfLiteStatus SymmetricPerChannelQuantization(TensorT* tensor,
                                             const float* const input,
                                             int32_t channel_dim_index,
                                             std::vector<float>* output_scales,
                                             std::vector<int8_t>* output_value,
                                             ErrorReporter* error_reporter) {
  if (tensor == nullptr) {
    TF_LITE_REPORT_ERROR(error_reporter, "Cannot quantize. Tensor is null.");
    return kTfLiteError;
  }
  const int32_t channel_dim_size = tensor->shape[channel_dim_index];
  // Fill per channel max and min values if needed
  if (tensor->quantization == nullptr) {
    tensor->quantization = std::make_unique<QuantizationParametersT>();
  }
  if (!HasMinMax(tensor)) {
    TF_LITE_ENSURE_STATUS(
        FillPerChannelMinMax(input, tensor->shape, channel_dim_index,
                             tensor->quantization.get(), error_reporter));
  }

  // Calculate scales per channel using max and min values from tensor.
  std::vector<float> scale_invs(channel_dim_size);
  const float half_scale = kMaxQuantizedValue;
  for (int channel_idx = 0; channel_idx < channel_dim_size; channel_idx++) {
    const float half_range =
        std::max(std::abs(tensor->quantization->min[channel_idx]),
                 std::abs(tensor->quantization->max[channel_idx]));
    output_scales->at(channel_idx) = half_range / half_scale;
    if (half_range == 0) {
      scale_invs[channel_idx] = 0;
    } else {
      scale_invs[channel_idx] = half_scale / half_range;
    }
  }

  // Quantize the input values.
  SymmetricPerChannelQuantizeValues(input, scale_invs, tensor->shape,
                                    channel_dim_index, output_value);
  return kTfLiteOk;
}

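// Symmetrically quantizes float values to int16 with the given scaling
// factor, clamping the results to [-32767, 32767].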
std::vector<int16_t> SymmetricQuantizeFloatsToInt16(const float* data,
                                                    uint64_t num_elements,
                                                    float scaling_factor) {
  // Compute the inverse of scale.
  const float scaling_factor_inv =
      (scaling_factor == 0) ? 0 : 1.0 / scaling_factor;
  std::vector<int16_t> buffer(num_elements);
  const int32_t kScale = std::numeric_limits<int16_t>::max();

  for (size_t i = 0; i < num_elements; i++) {
    const int32_t quantized_value =
        static_cast<int32_t>(TfLiteRound(data[i] * scaling_factor_inv));
    buffer[i] = std::min(kScale, std::max(-kScale, quantized_value));
  }
  return buffer;
}

TfLiteStatus SymmetricQuantizeFloatsToInt16(ModelT* model, TensorT* tensor,
                                            float scaling_factor,
                                            ErrorReporter* error_reporter) {
  const BufferT* buffer = model->buffers[tensor->buffer].get();
  const float* float_data = reinterpret_cast<const float*>(buffer->data.data());
  uint64_t num_elements;
  TF_LITE_ENSURE_STATUS(NumElements(*tensor, &num_elements));

  auto final_buffer =
      SymmetricQuantizeFloatsToInt16(float_data, num_elements, scaling_factor);
  // Set the buffers and output type.
  uint8_t* uint8_buffer = reinterpret_cast<uint8_t*>(final_buffer.data());
  size_t buffer_size = num_elements * sizeof(int16_t);
  std::vector<float> scales(1, scaling_factor);
  std::vector<int64_t> zero_points(1, 0);
  return AddQuantizationParams(scales, zero_points, 0, uint8_buffer,
                               buffer_size, TensorType_INT16, model, tensor,
                               error_reporter);
}

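// Quantizes each value with the inverse scale of its channel. The shape is
// extended to kPerChannelMaxDim dimensions so a fixed four-level loop can
// visit every element.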
void SymmetricPerChannelQuantizeValues(const float* const input,
                                       const std::vector<float>& scales_inv,
                                       const std::vector<int32_t>& dimension,
                                       int32_t channel_dim_index,
                                       std::vector<int8_t>* output_value) {
  // Quantize the values.
  int indices[kPerChannelMaxDim];
  RuntimeShape unextended_tensor_dims(dimension.size(), dimension.data());
  RuntimeShape tensor_dims =
      RuntimeShape::ExtendedShape(kPerChannelMaxDim, unextended_tensor_dims);
  channel_dim_index +=
      kPerChannelMaxDim - unextended_tensor_dims.DimensionsCount();
  for (indices[0] = 0; indices[0] < tensor_dims.Dims(0); indices[0]++) {
    for (indices[1] = 0; indices[1] < tensor_dims.Dims(1); indices[1]++) {
      for (indices[2] = 0; indices[2] < tensor_dims.Dims(2); indices[2]++) {
        for (indices[3] = 0; indices[3] < tensor_dims.Dims(3); indices[3]++) {
          int channel_idx = indices[channel_dim_index];
          int index = Offset(tensor_dims, indices);
          const float val = input[index];
          const int32_t quantized_value =
              static_cast<int32_t>(TfLiteRound(val * scales_inv[channel_idx]));
          output_value->at(index) = std::min<int8_t>(
              kMaxQuantizedValue,
              std::max<int8_t>(kMinQuantizedValue, quantized_value));
        }
      }
    }
  }
}

// Quantize the tensor using the max and min values recorded in its
// quantization parameters. Applies per-layer quantization.
TfLiteStatus SymmetricQuantizeTensorFromMinMax(ModelT* model, TensorT* tensor,
                                               ErrorReporter* error_reporter) {
  if (model == nullptr || tensor == nullptr) {
    TF_LITE_REPORT_ERROR(error_reporter, "No tensor to quantize.");
    return kTfLiteError;
  }

  BufferT* buffer = model->buffers[tensor->buffer].get();
  if (buffer == nullptr) {
    TF_LITE_REPORT_ERROR(error_reporter, "Missing buffer.");
    return kTfLiteError;
  }

  if (!HasMinMax(tensor)) {
    TF_LITE_REPORT_ERROR(error_reporter,
                         "Missing min or max values for quantization.");
    return kTfLiteError;
  }
  if (tensor->quantization->min.size() != 1 ||
      tensor->quantization->max.size() != 1) {
    TF_LITE_REPORT_ERROR(error_reporter,
                         "Expected single entry in max and min.");
    return kTfLiteError;
  }

  const float* float_data = reinterpret_cast<const float*>(buffer->data.data());
  uint64_t num_elements;
  TF_LITE_ENSURE_STATUS(NumElements(*tensor, &num_elements));

  std::vector<int8_t> quantized_buffer;
  quantized_buffer.resize(num_elements);

  // Quantize tensor using recorded min and max values
  float scaling_factor;
  tensor_utils::SymmetricQuantizeFloats(
      float_data, num_elements, quantized_buffer.data(),
      tensor->quantization->min[0], tensor->quantization->max[0],
      &scaling_factor);
  tensor->quantization->scale = std::vector<float>(1, scaling_factor);
  tensor->quantization->zero_point = std::vector<int64_t>(1, 0);

  uint8_t* uint8_buffer = reinterpret_cast<uint8_t*>(quantized_buffer.data());
  model->buffers[tensor->buffer]->data.assign(uint8_buffer,
                                              uint8_buffer + num_elements);
  // Update the tensor type.
  tensor->type = TensorType_INT8;

  return kTfLiteOk;
}

TfLiteStatus SymmetricQuantizeTensor(ModelT* model, TensorT* tensor) {
  if (model == nullptr || tensor == nullptr) {
    TFLITE_LOG(TFLITE_LOG_ERROR, "No tensor to quantize.");
    return kTfLiteError;
  }

  BufferT* buffer = model->buffers[tensor->buffer].get();
  if (buffer == nullptr) {
    TFLITE_LOG(TFLITE_LOG_ERROR, "Missing buffer.");
    return kTfLiteError;
  }
  const float* float_data = reinterpret_cast<const float*>(buffer->data.data());
  uint64_t num_elements;
  TF_LITE_ENSURE_STATUS(NumElements(*tensor, &num_elements));

  std::vector<int8_t> quantized_buffer;
  quantized_buffer.resize(num_elements);

  float min_value, max_value, scaling_factor;
  tensor_utils::SymmetricQuantizeFloats(float_data, num_elements,
                                        quantized_buffer.data(), &min_value,
                                        &max_value, &scaling_factor);

  if (tensor->quantization == nullptr) {
    tensor->quantization = std::make_unique<QuantizationParametersT>();
  }
  tensor->quantization->scale = std::vector<float>(1, scaling_factor);
  tensor->quantization->zero_point = std::vector<int64_t>(1, 0);

  uint8_t* uint8_buffer = reinterpret_cast<uint8_t*>(quantized_buffer.data());
  model->buffers[tensor->buffer]->data.assign(uint8_buffer,
                                              uint8_buffer + num_elements);

  // Update the tensor type.
  tensor->type = TensorType_INT8;

  return kTfLiteOk;
}

TfLiteStatus QuantizeTensorFloat16(ModelT* model, TensorT* tensor) {
  if (model == nullptr || tensor == nullptr) {
    TFLITE_LOG(TFLITE_LOG_ERROR, "No tensor to quantize.");
    return kTfLiteError;
  }

  BufferT* buffer = model->buffers[tensor->buffer].get();
  if (buffer == nullptr) {
    TFLITE_LOG(TFLITE_LOG_ERROR, "Missing buffer.");
    return kTfLiteError;
  }

  uint64_t num_elements;
  TF_LITE_ENSURE_STATUS(NumElements(*tensor, &num_elements));

  // Copy single byte buffer data to float vector to guard against
  // misalignment.
  std::vector<float> float_vector(num_elements);
  uint8_t* first = buffer->data.data();
  std::copy(first, first + buffer->data.size(),
            reinterpret_cast<uint8_t*>(float_vector.data()));

  // Transform float data to float16.
  std::vector<Eigen::half> quantized_buffer;
  quantized_buffer.resize(num_elements);
  constexpr float kMaxFloat16Value = 65504.f;
  constexpr float kMinFloat16Value = -65504.f;
  std::transform(float_vector.begin(), float_vector.end(),
                 quantized_buffer.begin(), [=](float a) {
                   float clamped = std::min(std::max(a, kMinFloat16Value),
                                            kMaxFloat16Value);
                   return static_cast<Eigen::half>(clamped);
                 });

  char* half_buffer = reinterpret_cast<char*>(quantized_buffer.data());
  model->buffers[tensor->buffer]->data.assign(
      half_buffer, half_buffer + sizeof(Eigen::half) * num_elements);

  // Update the tensor type.
  tensor->type = TensorType_FLOAT16;

  return kTfLiteOk;
}

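// Sets the tensor's quantization parameters (scales, zero points and
// quantized dimension), replaces its buffer contents with the quantized data,
// and updates the tensor type.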
TfLiteStatus AddQuantizationParams(const std::vector<float>& scales,
                                   const std::vector<int64_t>& zero_point,
                                   int quantized_dimension,
                                   const uint8_t* buffer_data,
                                   size_t buffer_size, TensorType output_type,
                                   ModelT* model, TensorT* tensor,
                                   ErrorReporter* error_reporter) {
  if (tensor->quantization == nullptr) {
    tensor->quantization = std::make_unique<QuantizationParametersT>();
  }
  tensor->quantization->scale.assign(scales.begin(), scales.end());
  if (zero_point.size() != scales.size()) {
    TF_LITE_REPORT_ERROR(
        error_reporter,
        "Received zero_point of size %d and scales of size %d. "
        "These sizes should match.",
        zero_point.size(), scales.size());
    return kTfLiteError;
  }
  tensor->quantization->zero_point.assign(zero_point.begin(), zero_point.end());
  tensor->quantization->quantized_dimension = quantized_dimension;
  model->buffers[tensor->buffer]->data.assign(buffer_data,
                                              buffer_data + buffer_size);
  // Update the tensor type.
  tensor->type = output_type;
  return kTfLiteOk;
}

TfLiteStatus SymmetricQuantizeTensorPerChannel(ModelT* model, TensorT* tensor,
                                               int32_t channel_dim_index,
                                               ErrorReporter* error_reporter) {
  if (tensor->shape.size() > kPerChannelMaxDim) {
    TF_LITE_REPORT_ERROR(
        error_reporter,
        "SymmetricQuantizeTensorPerChannel requires tensor with less than %d "
        "dimensions, but got %d dimension(s).",
        kPerChannelMaxDim + 1, tensor->shape.size());
    return kTfLiteError;
  }

  // Get dimensions.
  uint64_t num_elements;
  TF_LITE_ENSURE_STATUS(NumElements(*tensor, &num_elements));
  const int32_t channel_dim_size = tensor->shape[channel_dim_index];

  // Get input float data.
  const BufferT* buffer = model->buffers[tensor->buffer].get();
  const float* float_input_data =
      reinterpret_cast<const float*>(buffer->data.data());

  // Create container for output scale and output data.
  std::vector<float> scales(channel_dim_size);
  std::vector<int8_t> final_buffer(num_elements);

  // Quantize the input data with respect to channel_dim_index.
  TF_LITE_ENSURE_STATUS(SymmetricPerChannelQuantization(
      tensor, float_input_data, channel_dim_index, &scales, &final_buffer,
      error_reporter));

  // Set the buffers and output type.
  uint8_t* uint8_buffer = reinterpret_cast<uint8_t*>(final_buffer.data());
  const size_t buffer_size = num_elements * sizeof(int8_t);
  std::vector<int64_t> zero_point(scales.size(), 0);
  return AddQuantizationParams(scales, zero_point, channel_dim_index,
                               uint8_buffer, buffer_size, TensorType_INT8,
                               model, tensor, error_reporter);
}

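// Symmetrically quantizes bias values into BiasType (int32 or int64). If
// `scales` holds a single value it is applied to every element (per-layer);
// otherwise element idx is quantized with scales[idx], i.e. one scale per
// bias element for per-channel quantization.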
template <class BiasType>
std::vector<BiasType> SymmetricBiasQuantize(const float* data,
                                            uint64_t num_elements,
                                            const std::vector<float>& scales) {
  std::vector<BiasType> buffer(num_elements);
  const BiasType kScale = std::numeric_limits<BiasType>::max();
  float scaling_factor_inv_per_layer = (scales[0] == 0) ? 0 : 1.0 / scales[0];

  for (int32_t idx = 0; idx < num_elements; idx++) {
    float scaling_factor_inv =
        scales.size() == 1 ? scaling_factor_inv_per_layer
                           : ((scales[idx] == 0) ? 0 : 1.0 / scales[idx]);
    const BiasType quantized_value =
        tflite::SafeCast<BiasType>(TfLiteRound(data[idx] * scaling_factor_inv));
    buffer[idx] = std::min(kScale, std::max(-kScale, quantized_value));
  }
  return buffer;
}

template std::vector<std::int32_t> SymmetricBiasQuantize<std::int32_t>(
    const float* data, uint64_t num_elements, const std::vector<float>& scales);

template <class BiasType>
TfLiteStatus SymmetricPerLayerBiasQuantize(ModelT* model, TensorT* tensor,
                                           float scaling_factor,
                                           ErrorReporter* error_reporter) {
  const BufferT* buffer = model->buffers[tensor->buffer].get();
  const float* float_data = reinterpret_cast<const float*>(buffer->data.data());
  uint64_t num_elements;
  TF_LITE_ENSURE_STATUS(NumElements(*tensor, &num_elements));

  auto final_buffer = SymmetricBiasQuantize<BiasType>(float_data, num_elements,
                                                      {scaling_factor});

  // Set the buffers and output type.
  uint8_t* uint8_buffer = reinterpret_cast<uint8_t*>(final_buffer.data());
  size_t buffer_size = num_elements * sizeof(BiasType);
  std::vector<float> scales(1, scaling_factor);
  std::vector<int64_t> zero_points(1, 0);

  auto output_type = std::is_same<BiasType, std::int32_t>::value
                         ? TensorType_INT32
                         : TensorType_INT64;
  return AddQuantizationParams(scales, zero_points, 0, uint8_buffer,
                               buffer_size, output_type, model, tensor,
                               error_reporter);
}

template TfLiteStatus SymmetricPerLayerBiasQuantize<std::int32_t>(
    ModelT* model, TensorT* tensor, float scaling_factor,
    ErrorReporter* error_reporter);

template TfLiteStatus SymmetricPerLayerBiasQuantize<std::int64_t>(
    ModelT* model, TensorT* tensor, float scaling_factor,
    ErrorReporter* error_reporter);

template <class BiasType>
TfLiteStatus SymmetricPerChannelBiasQuantize(ModelT* model, TensorT* tensor,
                                             float input_scale,
                                             const float* weight_scales,
                                             int number_of_dimension,
                                             ErrorReporter* error_reporter) {
  // Compute scales.
  std::vector<float> scales(number_of_dimension);
  for (int i = 0; i < number_of_dimension; i++) {
    scales[i] = input_scale * weight_scales[i];
  }

  const BufferT* buffer = model->buffers[tensor->buffer].get();
  const float* float_data = reinterpret_cast<const float*>(buffer->data.data());
  uint64_t num_elements;
  TF_LITE_ENSURE_STATUS(NumElements(*tensor, &num_elements));

  auto final_buffer =
      SymmetricBiasQuantize<BiasType>(float_data, num_elements, scales);

  // Set the buffers and output type.
  uint8_t* uint8_buffer = reinterpret_cast<uint8_t*>(final_buffer.data());
  size_t buffer_size = num_elements * sizeof(BiasType);
  std::vector<int64_t> zero_point(scales.size(), 0);

  auto output_type = std::is_same<BiasType, std::int32_t>::value
                         ? TensorType_INT32
                         : TensorType_INT64;
  return AddQuantizationParams(scales, zero_point, 0, uint8_buffer, buffer_size,
                               output_type, model, tensor, error_reporter);
}

template TfLiteStatus SymmetricPerChannelBiasQuantize<std::int64_t>(
    ModelT* model, TensorT* tensor, float input_scale,
    const float* weight_scales, int number_of_dimension,
    ErrorReporter* error_reporter);

template TfLiteStatus SymmetricPerChannelBiasQuantize<std::int32_t>(
    ModelT* model, TensorT* tensor, float input_scale,
    const float* weight_scales, int number_of_dimension,
    ErrorReporter* error_reporter);

TfLiteStatus QuantizeWeight(ModelT* model, TensorT* tensor, bool per_channel,
                            int per_axis_index, ErrorReporter* error_reporter) {
  // TODO(suharshs): Currently we conflate quantizing weights and constants.
  // It's possible that the right thing to do is to asymmetrically quantize the
  // weight. Add support for this.
  if (per_channel) {
    return SymmetricQuantizeTensorPerChannel(model, tensor, per_axis_index,
                                             error_reporter);
  } else if (HasMinMax(tensor) && (tensor->quantization->min.size() == 1) &&
             (tensor->quantization->max.size() == 1)) {
    // Quantize using recorded min/max values if per-tensor.
    return SymmetricQuantizeTensorFromMinMax(model, tensor, error_reporter);
  } else {
    // Quantize using min/max from buffer.
    return SymmetricQuantizeTensor(model, tensor);
  }
}

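// Returns the product of the scales of the selected input and intermediate
// tensors of the operator at op_idx, multiplied by the given constant factors.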
float GetEffectiveScale(ModelT* model, SubGraphT* subgraph, int op_idx,
                        std::vector<int> input_index,
                        std::vector<int> intermediate_index,
                        std::vector<float> factors) {
  float scale = 1.0f;
  OperatorT* op = subgraph->operators[op_idx].get();
  for (int i = 0, end = input_index.size(); i < end; ++i) {
    const int index_local = input_index[i];
    const int index_global = op->inputs[index_local];
    const TensorT* tensor = subgraph->tensors[index_global].get();
    scale *= tensor->quantization->scale[0];
  }
  for (int i = 0, end = intermediate_index.size(); i < end; ++i) {
    const int index_local = intermediate_index[i];
    const int index_global = op->intermediates[index_local];
    const TensorT* tensor = subgraph->tensors[index_global].get();
    scale *= tensor->quantization->scale[0];
  }
  for (int i = 0, end = factors.size(); i < end; ++i) {
    scale *= factors[i];
  }
  return scale;
}

TfLiteStatus QuantizeActivation(TensorT* tensor, TensorType activations_type,
                                ErrorReporter* error_reporter) {
  TF_LITE_ENSURE_STATUS(GetQuantizationParams(
      tensor, activations_type, tensor->quantization.get(), error_reporter));
  tensor->type = activations_type;
  return kTfLiteOk;
}

TfLiteStatus QuantizeActivationToInt16(TensorT* tensor, float scale) {
  const int32_t zero_point = 0;
  tensor->quantization = std::make_unique<QuantizationParametersT>();
  tensor->quantization->scale.push_back(scale);
  tensor->quantization->zero_point.push_back(zero_point);
  tensor->type = TensorType_INT16;
  return kTfLiteOk;
}

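// Returns the smallest exponent `pot` such that 2^pot covers
// max(|min|, |max|), capped at 10 by the fixed iteration count below.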
int GetPowerOfTwoScale(float min, float max) {
  const float range = std::max(std::abs(min), std::abs(max));
  int pot = 0;
  for (int i = 0; i < 10; i++) {
    // NOTE: use std::pow() for bitwise accuracy.
    if (std::pow(2, pot) < range) {  // NOLINT
      pot++;
    }
  }
  return pot;
}

}  // namespace utils
}  // namespace optimize
}  // namespace tflite