xref: /aosp_15_r20/external/tensorflow/tensorflow/compiler/tf2tensorrt/convert/ops/quantization_ops.cc (revision b6fb3261f9314811a0f4371741dbb8839866f948)
/* Copyright 2021 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#if GOOGLE_CUDA && GOOGLE_TENSORRT
#include "tensorflow/compiler/tf2tensorrt/convert/ops/quantization_ops.h"

#include "absl/strings/str_format.h"
#include "tensorflow/cc/ops/array_ops.h"
#include "tensorflow/compiler/tf2tensorrt/common/utils.h"
#include "tensorflow/compiler/tf2tensorrt/convert/op_converter.h"
#include "tensorflow/compiler/tf2tensorrt/convert/op_converter_registry.h"
#include "tensorflow/compiler/tf2tensorrt/convert/ops/layer_utils.h"
#include "tensorflow/compiler/tf2tensorrt/convert/weights.h"
#include "tensorflow/core/framework/node_def_util.h"
#include "third_party/tensorrt/NvInfer.h"

namespace tensorflow {
namespace tensorrt {
namespace convert {

bool IsQuantizeAndDequantizeOp(const Node* node) {
  return absl::c_find(kQuantizationOpNames, node->def().op()) !=
         kQuantizationOpNames.end();
}

namespace {

// Provides quantizing and dequantizing tensor scales for a given dynamic
// range. Borrowed from TF quantization kernel logic.
template <typename T>
QuantizationScales<T, 1> ComputeQuantizationRange(bool signed_input,
                                                  int num_bits,
                                                  bool narrow_range,
                                                  T* min_range, T* max_range) {
  // Calculate the range for the simulated integer quantization:
  // e.g. [-127, 127] for signed = true, narrow_range = true, num_bits = 8,
  // or [-128, 127] for signed = true, narrow_range = false, num_bits = 8,
  // or [0, 255] for signed = false, num_bits = 8.
  const int64_t min_quantized =
      signed_input ? narrow_range ? -(1ULL << (num_bits - 1)) + 1
                                  : -(1ULL << (num_bits - 1))
                   : 0;
  const int64_t max_quantized =
      signed_input ? (1ULL << (num_bits - 1)) - 1 : (1ULL << num_bits) - 1;
  // Determine the maximum scaling factor that would scale
  // [min_range, max_range] to not exceed [min_quantized, max_quantized],
  // while keeping 0 unchanged.
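  //
  // Worked example (illustrative): with *min_range = -1.0f, *max_range = 1.0f,
  // signed_input = true, narrow_range = false, num_bits = 8:
  //   min_quantized = -128, max_quantized = 127
  //   scale_from_min_side = -128 / -1.0 = 128
  //   scale_from_max_side =  127 /  1.0 = 127
  // The max side wins (127 < 128), so quantize_scale = 127,
  // dequantize_scale = 1/127, and *min_range is widened to
  // -128 * (1/127) ~= -1.008 so both endpoints remain exactly representable.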
  const T scale_from_min_side = (min_quantized * *min_range > 0)
                                    ? min_quantized / *min_range
                                    : std::numeric_limits<T>::max();
  const T scale_from_max_side = (max_quantized * *max_range > 0)
                                    ? max_quantized / *max_range
                                    : std::numeric_limits<T>::max();

  QuantizationScales<T, 1> scales;
  // Note: Avoids changing the side of the range that determines scale.
  if (scale_from_min_side < scale_from_max_side) {
    scales.quantize_scale[0] = scale_from_min_side;
    scales.dequantize_scale[0] = *min_range / min_quantized;
    *max_range = max_quantized * scales.dequantize_scale[0];
  } else {
    scales.quantize_scale[0] = scale_from_max_side;
    scales.dequantize_scale[0] = *max_range / max_quantized;
    *min_range = min_quantized * scales.dequantize_scale[0];
  }
  return scales;
}

// Prepares the input for a QDQ node in explicit precision mode, returning an
// ITensor pointer. If the input is weights, we convert it to an ITensor by
// adding a constant layer.
StatusOr<nvinfer1::ITensor*> ExplicitQDQInputToTensor(
    TRTNetworkBuilder* builder, OpConverterParams* params,
    const TRT_TensorOrWeights& input) {
  if (input.is_tensor()) {
    return input.tensor()->trt_tensor();
  }
  if (!IS_TRT_VERSION_GE(8, 0, 0, 0) && input.weights().count() > 1) {
    LOG(WARNING) << absl::StrCat(
        "QDQ per-channel for weights not "
        "implemented, assuming uniform scaling");
  }
  TRT_ShapedWeights trt_weights = input.weights();
  StatusOr<nvinfer1::IConstantLayer*> weights_const =
      builder->WeightsToConstant(trt_weights.GetTrtWeights(),
                                 trt_weights.Shape());
  TRT_ENSURE_PTR_OK(weights_const);
  params->converter->SetLayerName(*weights_const, params->node_def, "const");
  nvinfer1::ITensor* qdq_input = (*weights_const)->getOutput(0);
  std::string name = absl::StrCat((*weights_const)->getName(), "_output");
  qdq_input->setName(name.c_str());
  return qdq_input;
}

}  // namespace

// Carries traits for each specific quantization op type for conversion.
// Specialization for template parameter T should be given for each TF C++
// quantization op.
template <typename T>
struct QDQOpSpec {};
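// For reference, each specialization below provides this interface (derived
// from how ConvertQDQ uses it):
//   static constexpr std::array<InputArgSpec, N> InputSpec();
//   struct Attrs;  // parsed attributes plus any computed scales
//   static Status ValidateQDQForExplicitPrecision(
//       const std::vector<TRT_TensorOrWeights>&, const NodeDef&, Attrs*);
//   static Status ConvertExplicit(OpConverterParams*, const Attrs&);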

template <>
struct QDQOpSpec<ops::QuantizeAndDequantizeV2> {
  static constexpr std::array<InputArgSpec, 3> InputSpec() {
    return {
        InputArgSpec::Create("input", TrtInputArg::kBoth),
        InputArgSpec::Create("input_min", TrtInputArg::kWeight),
        InputArgSpec::Create("input_max", TrtInputArg::kWeight),
    };
  }

  struct Attrs {
    float min_range;
    float max_range;
    bool narrow_range;
    std::string round_mode;
    UniformQuantizationScales scales;
  };

  static Status ValidateQDQForExplicitPrecision(
      const std::vector<TRT_TensorOrWeights>& inputs, const NodeDef& node_def,
      Attrs* args) {
    AttrSlice attrs(node_def);
    TF_RETURN_IF_ERROR(GetNodeAttr(attrs, "round_mode", &args->round_mode));
    if (args->round_mode != "HALF_TO_EVEN") {
      LOG(WARNING) << node_def.op() << ": " << node_def.name()
                   << " has round_mode=" << args->round_mode
                   << ", but for TensorRT conversion, "
                      "round_mode=HALF_TO_EVEN is recommended.";
    }
    TF_RETURN_IF_ERROR(GetNodeAttr(attrs, "narrow_range", &args->narrow_range));
    if (args->narrow_range) {
      LOG(WARNING) << node_def.op() << ": " << node_def.name()
                   << " has narrow_range=true, but for TensorRT conversion, "
                      "narrow_range=false is recommended.";
    }
    args->min_range = inputs.at(1).weights().template GetPointer<float>()[0];
    args->max_range = inputs.at(2).weights().template GetPointer<float>()[0];
    const int num_bits = 8;
    args->scales = ComputeQuantizationRange<float>(
        /*signed_input=*/true, num_bits, args->narrow_range, &args->min_range,
        &args->max_range);
    TRT_ENSURE(args->scales.dequantize_scale[0] != 0);
    TRT_ENSURE(args->scales.quantize_scale[0] != 0);
    return Status::OK();
  }

  // Converts in explicit precision mode. In this mode, QDQ operations are
  // directly converted into TensorRT quantizing and dequantizing scale
  // operations.
  static Status ConvertExplicit(OpConverterParams* params, const Attrs& args) {
    const auto& node_def = params->node_def;

    StatusOr<TRTNetworkBuilder> builder = TRTNetworkBuilder::Create(
        params->converter->network(), params->weight_store);
    TRT_ENSURE_OK(builder);

    StatusOr<nvinfer1::ITensor*> qdq_input =
        ExplicitQDQInputToTensor(&*builder, params, params->inputs.at(0));
    TRT_ENSURE_PTR_OK(qdq_input);

    // TODO(cbate): check whether this condition still applies for TRT 8.
    // Outline this block into a "reshape policy".
    const int required_dims = params->use_implicit_batch ? 3 : 4;
    const nvinfer1::Dims idims = (*qdq_input)->getDimensions();
    nvinfer1::Dims intermediate_dims = idims;
    TRT_ENSURE(idims.nbDims > 0);
    if (idims.nbDims < required_dims) {
      const int nb_extra_dims = required_dims - idims.nbDims;
      intermediate_dims.nbDims = required_dims;
      std::vector<int> ones(nb_extra_dims, 1);
      TRT_ENSURE(ones.size() == nb_extra_dims && nb_extra_dims > 0);

      if (!params->use_implicit_batch) {
        intermediate_dims.d[0] = idims.d[0];
        std::copy(ones.begin(), ones.end(), intermediate_dims.d + 1);
        std::copy_n(idims.d + 1, idims.nbDims - 1,
                    intermediate_dims.d + ones.size() + 1);
      } else {
        std::copy(ones.begin(), ones.end(), intermediate_dims.d);
        std::copy_n(idims.d, idims.nbDims, intermediate_dims.d + ones.size());
      }
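
      // Illustrative examples of the padding above (derived from this code):
      //   explicit batch:  idims = [8, 32], required_dims = 4
      //                    -> intermediate_dims = [8, 1, 1, 32]
      //   implicit batch:  idims = [32, 32], required_dims = 3
      //                    -> intermediate_dims = [1, 32, 32]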

      LOG(WARNING) << absl::StrCat(
          node_def.name(), ":", node_def.op(), ": tensor ",
          (*qdq_input)->getName(), " has shape ", DebugString(idims),
          " but TRT scale layer requires at least 3 dims excluding batch dim; "
          "trying to recover by inserting 1's to create shape ",
          DebugString(intermediate_dims));
      StatusOr<nvinfer1::IShuffleLayer*> reshape =
          builder->Reshape(*qdq_input, intermediate_dims);
      TRT_ENSURE_PTR_OK(reshape);
      *qdq_input = (*reshape)->getOutput(0);
    }

    VLOG(1) << "[ExplicitPrecision] " << node_def.op() << ": "
            << node_def.name() << " computed scales: " << args.scales
            << " from min/max ranges " << args.min_range << "/"
            << args.max_range;

    StatusOr<nvinfer1::ILayer*> qdq =
        builder->UniformQuantizeDequantizeExplicit(
            *qdq_input, args.scales.quantize_scale[0],
            args.scales.dequantize_scale[0], node_def.name());
    TRT_ENSURE_PTR_OK(qdq);
    ITensorProxyPtr final_output = (*qdq)->getOutput(0);
    if (idims.nbDims != intermediate_dims.nbDims) {
      // Reshape the QDQ output (not the QDQ input) back to the original
      // shape; reshaping the input here would bypass the QDQ scales.
      StatusOr<nvinfer1::IShuffleLayer*> undo_reshape =
          builder->Reshape((*qdq)->getOutput(0), idims);
      TRT_ENSURE_PTR_OK(undo_reshape);
      final_output = (*undo_reshape)->getOutput(0);
    }
    params->outputs->push_back(final_output);
    return Status::OK();
  }
};

template <>
struct QDQOpSpec<ops::QuantizeAndDequantizeV3> {
  static constexpr std::array<InputArgSpec, 4> InputSpec() {
    return {
        InputArgSpec::Create("input", TrtInputArg::kBoth),
        InputArgSpec::Create("min", TrtInputArg::kWeight),
        InputArgSpec::Create("max", TrtInputArg::kWeight),
        InputArgSpec::Create("num_bits", TrtInputArg::kWeight),
    };
  }
  // Use same attributes and conversion functions as QDQV2.
  using Attrs = QDQOpSpec<ops::QuantizeAndDequantizeV2>::Attrs;

  static Status ValidateQDQForExplicitPrecision(
      const std::vector<TRT_TensorOrWeights>& inputs, const NodeDef& node_def,
      Attrs* args) {
    return QDQOpSpec<
        ops::QuantizeAndDequantizeV2>::ValidateQDQForExplicitPrecision(inputs,
                                                                       node_def,
                                                                       args);
  }

  static Status ConvertExplicit(OpConverterParams* params, const Attrs& args) {
    return QDQOpSpec<ops::QuantizeAndDequantizeV2>::ConvertExplicit(params,
                                                                    args);
  }
};

template <>
struct QDQOpSpec<ops::FakeQuantWithMinMaxVars> {
  static constexpr std::array<InputArgSpec, 3> InputSpec() {
    return {
        InputArgSpec::Create("input", TrtInputArg::kBoth),
        InputArgSpec::Create("min", TrtInputArg::kWeight),
        InputArgSpec::Create("max", TrtInputArg::kWeight),
    };
  }
  struct Attrs {
    int num_bits;
    bool narrow_range;
  };

  static Status ValidateQDQForExplicitPrecision(
      const std::vector<TRT_TensorOrWeights>& inputs, const NodeDef& node_def,
      Attrs* args) {
    return errors::Unimplemented(
        "FakeQuantWithMinMaxVars is not supported in explicit precision mode");
  }

  static Status ConvertExplicit(OpConverterParams* params, const Attrs& args) {
    return errors::Unimplemented(
        "FakeQuantWithMinMaxVars is not supported in explicit precision mode");
  }
};

template <>
struct QDQOpSpec<ops::FakeQuantWithMinMaxArgs> {
  static constexpr std::array<InputArgSpec, 1> InputSpec() {
    return {
        InputArgSpec::Create("input", TrtInputArg::kBoth),
    };
  }

  struct Attrs {
    float min;
    float max;
    int num_bits;
    bool narrow_range;
  };

  static Status ValidateQDQForExplicitPrecision(
      const std::vector<TRT_TensorOrWeights>& inputs, const NodeDef& node_def,
      Attrs* args) {
    return errors::Unimplemented(
        "FakeQuantWithMinMaxArgs is not supported in explicit precision mode");
  }

  static Status ConvertExplicit(OpConverterParams* params, const Attrs& args) {
    return errors::Unimplemented(
        "FakeQuantWithMinMaxArgs is not supported in explicit precision mode");
  }
};

// Converts QDQ operations in non-explicit precision mode. This is the
// original "ConvertQuantize" function. In this mode, Q/DQ operations are
// no-ops and are instead used to set the dynamic range of the input tensor.
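// For example, for a QuantizeAndDequantizeV2 node with input_min = -6.0f and
// input_max = 6.0f, this function records the range [-6.0, 6.0] for the
// input tensor (typically surfaced to TensorRT via ITensor::setDynamicRange)
// and forwards the input through unchanged.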
Status ConvertDynamicRangeMode(OpConverterParams* params) {
  const auto& inputs = params->inputs;
  const auto& node_def = params->node_def;

  float min_range = 0.0f;
  float max_range = 0.0f;
  AttrSlice attrs(params->node_def);

  if (node_def.op() == "FakeQuantWithMinMaxArgs") {
    // Get ranges via node attributes.
    TF_RETURN_IF_ERROR(GetNodeAttr(attrs, "min", &min_range));
    TF_RETURN_IF_ERROR(GetNodeAttr(attrs, "max", &max_range));
  } else if (node_def.op() == "FakeQuantWithMinMaxVars" ||
             node_def.op() == "QuantizeAndDequantizeV2" ||
             node_def.op() == "QuantizeAndDequantizeV3") {
    // Get ranges via inputs.
    auto get_weights_value = [&inputs](int index) {
      const auto* raw_weights = inputs.at(index).weights().GetPointer<float>();
      return raw_weights[0];
    };
    min_range = get_weights_value(1);
    max_range = get_weights_value(2);
  } else {
    return errors::InvalidArgument("Unknown quantization op ", node_def.op(),
                                   ", at ", node_def.name());
  }
  if (params->validation_only) {
    return Status::OK();
  }

  // Store the ranges for the tensor.
  ITensorProxyPtr input0 = inputs.at(0).tensor();
  params->converter->ProvideQuantizationRange(&input0, min_range, max_range);
  // Sometimes TRT may not quantize a tensor, either because it chooses to
  // execute a higher-precision kernel or because of op fusion. In these
  // cases, accuracy will suffer if the model was trained to expect
  // quantization at that tensor. We should consider adding a clip(tensor,
  // min_range, max_range) operation here to ensure that any arbitrarily
  // placed quantize node will execute as expected. However, this would
  // negatively affect performance. If users train their models in a way that
  // mirrors inference as closely as possible (i.e. not quantizing where
  // fusion will occur), then there is no problem with the current
  // implementation.
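  //
  // A minimal sketch of such a clip, assuming direct access to the underlying
  // TRT INetworkDefinition (hypothetical here; conversions in this file go
  // through TRTNetworkBuilder instead):
  //
  //   nvinfer1::IActivationLayer* clip = network->addActivation(
  //       *input0->trt_tensor(), nvinfer1::ActivationType::kCLIP);
  //   clip->setAlpha(min_range);  // lower clip bound
  //   clip->setBeta(max_range);   // upper clip bound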
  params->outputs->push_back(inputs.at(0));
  return Status::OK();
}

template <typename TFOpType>
class ConvertQDQ : public OpConverterBase<ConvertQDQ<TFOpType>> {
 public:
  explicit ConvertQDQ(OpConverterParams* params)
      : OpConverterBase<ConvertQDQ<TFOpType>>(params) {}

  static constexpr auto InputSpec() { return QDQOpSpec<TFOpType>::InputSpec(); }

  // Disable the non-applicable data type check by providing an empty string.
  static constexpr const char* NodeDefDataTypeAttributeName() { return ""; }

  Status ValidateDynamicRangeINT8Mode() {
    // The condition ensures we only call the conversion once. We should break
    // this function up into validation and conversion.
    if (this->params_->validation_only) {
      return ConvertDynamicRangeMode(this->params_);
    }
    return Status::OK();
  }

  Status Validate() {
    if (!this->params_->use_explicit_precision) {
      return ValidateDynamicRangeINT8Mode();
    }
    return OpSpec::ValidateQDQForExplicitPrecision(
        this->params_->inputs, this->params_->node_def, &attrs_);
  }

  Status Convert() {
    if (!this->params_->use_explicit_precision) {
      return ConvertDynamicRangeMode(this->params_);
    }
    return OpSpec::ConvertExplicit(this->params_, attrs_);
  }

  using OpSpec = QDQOpSpec<TFOpType>;
  using OpSpecAttrs = typename QDQOpSpec<TFOpType>::Attrs;
  OpSpecAttrs attrs_;
};

REGISTER_DEFAULT_TRT_OP_CONVERTER(
    MakeConverterFunction<ConvertQDQ<ops::QuantizeAndDequantizeV2>>(),
    "QuantizeAndDequantizeV2");
REGISTER_DEFAULT_TRT_OP_CONVERTER(
    MakeConverterFunction<ConvertQDQ<ops::QuantizeAndDequantizeV3>>(),
    "QuantizeAndDequantizeV3");
REGISTER_DEFAULT_TRT_OP_CONVERTER(
    MakeConverterFunction<ConvertQDQ<ops::FakeQuantWithMinMaxVars>>(),
    "FakeQuantWithMinMaxVars");
REGISTER_DEFAULT_TRT_OP_CONVERTER(
    MakeConverterFunction<ConvertQDQ<ops::FakeQuantWithMinMaxArgs>>(),
    "FakeQuantWithMinMaxArgs");

}  // namespace convert
}  // namespace tensorrt
}  // namespace tensorflow

#endif  // GOOGLE_CUDA && GOOGLE_TENSORRT