/* Copyright 2021 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#if GOOGLE_CUDA && GOOGLE_TENSORRT
#include "tensorflow/compiler/tf2tensorrt/convert/ops/quantization_ops.h"

#include "absl/strings/str_format.h"
#include "tensorflow/cc/ops/array_ops.h"
#include "tensorflow/compiler/tf2tensorrt/common/utils.h"
#include "tensorflow/compiler/tf2tensorrt/convert/op_converter.h"
#include "tensorflow/compiler/tf2tensorrt/convert/op_converter_registry.h"
#include "tensorflow/compiler/tf2tensorrt/convert/ops/layer_utils.h"
#include "tensorflow/compiler/tf2tensorrt/convert/weights.h"
#include "tensorflow/core/framework/node_def_util.h"
#include "third_party/tensorrt/NvInfer.h"

namespace tensorflow {
namespace tensorrt {
namespace convert {
bool IsQuantizeAndDequantizeOp(const Node* node) {
  return absl::c_find(kQuantizationOpNames, node->def().op()) !=
         kQuantizationOpNames.end();
}

namespace {

// Provides quantizing and dequantizing tensor scales for a given dynamic
// range. Borrowed from TF quantization kernel logic.
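//
// Worked example (illustrative numbers, not from the original source): with
// signed_input=true, num_bits=8, narrow_range=false, *min_range=-3.0f and
// *max_range=5.0f, the simulated integer range is [-128, 127]. The candidate
// scales are -128 / -3.0 ~= 42.67 and 127 / 5.0 = 25.4; the smaller one wins
// so that neither endpoint overflows, giving quantize_scale = 25.4 and
// dequantize_scale = 5.0 / 127 ~= 0.03937. *min_range is then widened to
// -128 * 0.03937 ~= -5.04 to stay consistent with the chosen scale.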
template <typename T>
QuantizationScales<T, 1> ComputeQuantizationRange(bool signed_input,
                                                  int num_bits,
                                                  bool narrow_range,
                                                  T* min_range, T* max_range) {
  // Calculate the range for the simulated integer quantization:
  // e.g. [-127, 127] for signed = true, narrow_range = true, num_bits = 8,
  // or [-128, 127] for signed = true, narrow_range = false, num_bits = 8,
  // or [0, 255] for signed = false, num_bits = 8.
  const int64_t min_quantized =
      signed_input ? narrow_range ? -(1ULL << (num_bits - 1)) + 1
                                  : -(1ULL << (num_bits - 1))
                   : 0;
  const int64_t max_quantized =
      signed_input ? (1ULL << (num_bits - 1)) - 1 : (1ULL << num_bits) - 1;
  // Determine the maximum scaling factor that would scale
  // [min_range, max_range] to not exceed [min_quantized, max_quantized],
  // while keeping 0 unchanged.
  const T scale_from_min_side = (min_quantized * *min_range > 0)
                                    ? min_quantized / *min_range
                                    : std::numeric_limits<T>::max();
  const T scale_from_max_side = (max_quantized * *max_range > 0)
                                    ? max_quantized / *max_range
                                    : std::numeric_limits<T>::max();

  QuantizationScales<T, 1> scales;
  // Note: Avoids changing the side of the range that determines scale.
  if (scale_from_min_side < scale_from_max_side) {
    scales.quantize_scale[0] = scale_from_min_side;
    scales.dequantize_scale[0] = *min_range / min_quantized;
    *max_range = max_quantized * scales.dequantize_scale[0];
  } else {
    scales.quantize_scale[0] = scale_from_max_side;
    scales.dequantize_scale[0] = *max_range / max_quantized;
    *min_range = min_quantized * scales.dequantize_scale[0];
  }
  return scales;
}

// Prepares the input for a QDQ node in explicit precision mode, returning an
// ITensor pointer. If the input is weights, we convert it to an ITensor by
// adding a constant layer.
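// (In explicit precision mode, the TensorRT quantize/dequantize scale layers
// consume ITensor operands like any other layer, which is why constant
// weights must first be materialized through an IConstantLayer here.)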
StatusOr<nvinfer1::ITensor*> ExplicitQDQInputToTensor(
    TRTNetworkBuilder* builder, OpConverterParams* params,
    const TRT_TensorOrWeights& input) {
  if (input.is_tensor()) {
    return input.tensor()->trt_tensor();
  }
  if (!IS_TRT_VERSION_GE(8, 0, 0, 0) && input.weights().count() > 1) {
    LOG(WARNING) << absl::StrCat(
        "QDQ per-channel for weights not "
        "implemented, assuming uniform scaling");
  }
  TRT_ShapedWeights trt_weights = input.weights();
  StatusOr<nvinfer1::IConstantLayer*> weights_const =
      builder->WeightsToConstant(trt_weights.GetTrtWeights(),
                                 trt_weights.Shape());
  TRT_ENSURE_PTR_OK(weights_const);
  params->converter->SetLayerName(*weights_const, params->node_def, "const");
  nvinfer1::ITensor* qdq_input = (*weights_const)->getOutput(0);
  std::string name = absl::StrCat((*weights_const)->getName(), "_output");
  qdq_input->setName(name.c_str());
  return qdq_input;
}

}  // namespace

// Carries traits for each specific quantization op type for conversion.
// Specialization for template parameter T should be given for each TF C++
// quantization op.
template <typename T>
struct QDQOpSpec {};
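//
// Each specialization below supplies the three pieces consumed by
// ConvertQDQ<T>: an InputSpec() describing the expected inputs, an Attrs
// struct holding the parsed node attributes, and the
// ValidateQDQForExplicitPrecision/ConvertExplicit pair used in explicit
// precision mode.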

template <>
struct QDQOpSpec<ops::QuantizeAndDequantizeV2> {
  static constexpr std::array<InputArgSpec, 3> InputSpec() {
    return {
        InputArgSpec::Create("input", TrtInputArg::kBoth),
        InputArgSpec::Create("input_min", TrtInputArg::kWeight),
        InputArgSpec::Create("input_max", TrtInputArg::kWeight),
    };
  }

  struct Attrs {
    float min_range;
    float max_range;
    bool narrow_range;
    std::string round_mode;
    UniformQuantizationScales scales;
  };

  static Status ValidateQDQForExplicitPrecision(
      const std::vector<TRT_TensorOrWeights>& inputs, const NodeDef& node_def,
      Attrs* args) {
    AttrSlice attrs(node_def);
    TF_RETURN_IF_ERROR(GetNodeAttr(attrs, "round_mode", &args->round_mode));
    if (args->round_mode != "HALF_TO_EVEN") {
      LOG(WARNING) << node_def.op() << ": " << node_def.name()
                   << " has round_mode=" << args->round_mode
                   << ", but for TensorRT conversion, "
                      "round_mode=HALF_TO_EVEN is recommended.";
    }
    TF_RETURN_IF_ERROR(
        GetNodeAttr(attrs, "narrow_range", &args->narrow_range));
    if (args->narrow_range) {
      LOG(WARNING) << node_def.op() << ": " << node_def.name()
                   << " has narrow_range=true, but for TensorRT conversion, "
                      "narrow_range=false is recommended.";
    }
    args->min_range = inputs.at(1).weights().template GetPointer<float>()[0];
    args->max_range = inputs.at(2).weights().template GetPointer<float>()[0];
    const int num_bits = 8;
    args->scales = ComputeQuantizationRange<float>(
        /*signed_input=*/true, num_bits, args->narrow_range, &args->min_range,
        &args->max_range);
    TRT_ENSURE(args->scales.dequantize_scale[0] != 0);
    TRT_ENSURE(args->scales.quantize_scale[0] != 0);
    return Status::OK();
  }

  // Converts in explicit precision mode. In this mode, QDQ operations are
  // directly converted into TensorRT quantizing and dequantizing scale
  // operations.
  static Status ConvertExplicit(OpConverterParams* params, const Attrs& args) {
    const auto& node_def = params->node_def;

    StatusOr<TRTNetworkBuilder> builder = TRTNetworkBuilder::Create(
        params->converter->network(), params->weight_store);
    TRT_ENSURE_OK(builder);

    StatusOr<nvinfer1::ITensor*> qdq_input =
        ExplicitQDQInputToTensor(&*builder, params, params->inputs.at(0));
    TRT_ENSURE_PTR_OK(qdq_input);

    // TODO(cbate): check whether this condition still exists for TRT8.
    // Outline this block to a "reshape policy".
    const int required_dims = params->use_implicit_batch ? 3 : 4;
    const nvinfer1::Dims idims = (*qdq_input)->getDimensions();
    nvinfer1::Dims intermediate_dims = idims;
    TRT_ENSURE(idims.nbDims > 0);
    if (idims.nbDims < required_dims) {
      const int nb_extra_dims = required_dims - idims.nbDims;
      intermediate_dims.nbDims = required_dims;
      std::vector<int> ones(nb_extra_dims, 1);
      TRT_ENSURE(ones.size() == nb_extra_dims && nb_extra_dims > 0);

      if (!params->use_implicit_batch) {
        intermediate_dims.d[0] = idims.d[0];
        std::copy(ones.begin(), ones.end(), intermediate_dims.d + 1);
        std::copy_n(idims.d + 1, idims.nbDims - 1,
                    intermediate_dims.d + ones.size() + 1);
      } else {
        std::copy(ones.begin(), ones.end(), intermediate_dims.d);
        std::copy_n(idims.d, idims.nbDims, intermediate_dims.d + ones.size());
      }

      LOG(WARNING) << absl::StrCat(
          node_def.name(), ":", node_def.op(), ": tensor ",
          (*qdq_input)->getName(), " has shape ", DebugString(idims),
          " but the TRT scale layer requires at least 3 dims excluding the "
          "batch dim; trying to recover by inserting 1's to create shape ",
          DebugString(intermediate_dims));
      StatusOr<nvinfer1::IShuffleLayer*> reshape =
          builder->Reshape(*qdq_input, intermediate_dims);
      TRT_ENSURE_PTR_OK(reshape);
      *qdq_input = (*reshape)->getOutput(0);
    }

    VLOG(1) << "[ExplicitPrecision] " << node_def.op() << ": "
            << node_def.name() << " computed scales: " << args.scales
            << " from min/max ranges " << args.min_range << "/"
            << args.max_range;

    StatusOr<nvinfer1::ILayer*> qdq =
        builder->UniformQuantizeDequantizeExplicit(
            *qdq_input, args.scales.quantize_scale[0],
            args.scales.dequantize_scale[0], node_def.name());
    TRT_ENSURE_PTR_OK(qdq);
    ITensorProxyPtr final_output = (*qdq)->getOutput(0);
    if (idims.nbDims != intermediate_dims.nbDims) {
      // Undo the intermediate reshape on the QDQ output; reshaping the QDQ
      // input here instead would drop the QDQ layer from the network.
      StatusOr<nvinfer1::IShuffleLayer*> undo_reshape =
          builder->Reshape((*qdq)->getOutput(0), idims);
      TRT_ENSURE_PTR_OK(undo_reshape);
      final_output = (*undo_reshape)->getOutput(0);
    }
    params->outputs->push_back(final_output);
    return Status::OK();
  }
};

template <>
struct QDQOpSpec<ops::QuantizeAndDequantizeV3> {
  static constexpr std::array<InputArgSpec, 4> InputSpec() {
    return {
        InputArgSpec::Create("input", TrtInputArg::kBoth),
        InputArgSpec::Create("min", TrtInputArg::kWeight),
        InputArgSpec::Create("max", TrtInputArg::kWeight),
        InputArgSpec::Create("num_bits", TrtInputArg::kWeight),
    };
  }

  // Use same attributes and conversion functions as QDQV2.
  using Attrs = QDQOpSpec<ops::QuantizeAndDequantizeV2>::Attrs;

  static Status ValidateQDQForExplicitPrecision(
      const std::vector<TRT_TensorOrWeights>& inputs, const NodeDef& node_def,
      Attrs* args) {
    return QDQOpSpec<
        ops::QuantizeAndDequantizeV2>::ValidateQDQForExplicitPrecision(
        inputs, node_def, args);
  }

  static Status ConvertExplicit(OpConverterParams* params, const Attrs& args) {
    return QDQOpSpec<ops::QuantizeAndDequantizeV2>::ConvertExplicit(params,
                                                                    args);
  }
};

template <>
struct QDQOpSpec<ops::FakeQuantWithMinMaxVars> {
  static constexpr std::array<InputArgSpec, 3> InputSpec() {
    return {
        InputArgSpec::Create("input", TrtInputArg::kBoth),
        InputArgSpec::Create("min", TrtInputArg::kWeight),
        InputArgSpec::Create("max", TrtInputArg::kWeight),
    };
  }

  struct Attrs {
    int num_bits;
    bool narrow_range;
  };

  static Status ValidateQDQForExplicitPrecision(
      const std::vector<TRT_TensorOrWeights>& inputs, const NodeDef& node_def,
      Attrs* args) {
    return errors::Unimplemented("");
  }

  static Status ConvertExplicit(OpConverterParams* params, const Attrs& args) {
    return errors::Unimplemented("");
  }
};

template <>
struct QDQOpSpec<ops::FakeQuantWithMinMaxArgs> {
  static constexpr std::array<InputArgSpec, 1> InputSpec() {
    return {
        InputArgSpec::Create("input", TrtInputArg::kBoth),
    };
  }

  struct Attrs {
    float min;
    float max;
    int num_bits;
    bool narrow_range;
  };

  static Status ValidateQDQForExplicitPrecision(
      const std::vector<TRT_TensorOrWeights>& inputs, const NodeDef& node_def,
      Attrs* args) {
    return errors::Unimplemented("");
  }

  static Status ConvertExplicit(OpConverterParams* params, const Attrs& args) {
    return errors::Unimplemented("");
  }
};

// Converts QDQ operations in non-explicit precision mode. This is the
// original "ConvertQuantize" function. In this mode, Q/DQ operations are
// no-ops and are instead used to set the dynamic range of the input tensor.
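// (In TensorRT terms this roughly corresponds to ITensor::setDynamicRange,
// reached here through Converter::ProvideQuantizationRange below.)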
Status ConvertDynamicRangeMode(OpConverterParams* params) {
  const auto& inputs = params->inputs;
  const auto& node_def = params->node_def;

  float min_range = 0.0f;
  float max_range = 0.0f;
  AttrSlice attrs(params->node_def);

  if (node_def.op() == "FakeQuantWithMinMaxArgs") {
    // Get ranges via node attributes.
    TF_RETURN_IF_ERROR(GetNodeAttr(attrs, "min", &min_range));
    TF_RETURN_IF_ERROR(GetNodeAttr(attrs, "max", &max_range));
  } else if (node_def.op() == "FakeQuantWithMinMaxVars" ||
             node_def.op() == "QuantizeAndDequantizeV2" ||
             node_def.op() == "QuantizeAndDequantizeV3") {
    // Get ranges via inputs.
    auto get_weights_value = [&inputs](int index) {
      const auto* raw_weights = inputs.at(index).weights().GetPointer<float>();
      return raw_weights[0];
    };
    min_range = get_weights_value(1);
    max_range = get_weights_value(2);
  } else {
    return errors::InvalidArgument("Unknown quantization op ", node_def.op(),
                                   ", at ", node_def.name());
  }
  if (params->validation_only) {
    return Status::OK();
  }

  // Store the range for the tensor.
  ITensorProxyPtr input0 = inputs.at(0).tensor();
  params->converter->ProvideQuantizationRange(&input0, min_range, max_range);
  // Sometimes TRT may not quantize a tensor, either because it chooses to
  // execute a higher-precision kernel or because of op fusion. In these
  // cases, accuracy will suffer if the model was trained to expect
  // quantization at that tensor. We should consider adding a clip(tensor,
  // min_range, max_range) operation here to ensure that any arbitrarily
  // placed quantize node will execute as expected. However, this would
  // negatively affect performance. If users train their models in a way that
  // models inference as closely as possible (i.e., not quantizing in places
  // where fusion will occur), then there is no problem with the current
  // implementation.
  params->outputs->push_back(inputs.at(0));
  return Status::OK();
}

template <typename TFOpType>
class ConvertQDQ : public OpConverterBase<ConvertQDQ<TFOpType>> {
 public:
  explicit ConvertQDQ(OpConverterParams* params)
      : OpConverterBase<ConvertQDQ<TFOpType>>(params) {}

  static constexpr auto InputSpec() {
    return QDQOpSpec<TFOpType>::InputSpec();
  }

  // Disable the non-applicable data type check by providing an empty string.
  static constexpr const char* NodeDefDataTypeAttributeName() { return ""; }

  Status ValidateDynamicRangeINT8Mode() {
    // The condition ensures we only call the conversion once. We should break
    // this function up into validation and conversion.
    if (this->params_->validation_only) {
      return ConvertDynamicRangeMode(this->params_);
    }
    return Status::OK();
  }

  Status Validate() {
    if (!this->params_->use_explicit_precision) {
      return ValidateDynamicRangeINT8Mode();
    }
    return OpSpec::ValidateQDQForExplicitPrecision(
        this->params_->inputs, this->params_->node_def, &attrs_);
  }

  Status Convert() {
    if (!this->params_->use_explicit_precision) {
      return ConvertDynamicRangeMode(this->params_);
    }
    return OpSpec::ConvertExplicit(this->params_, attrs_);
  }

  using OpSpec = QDQOpSpec<TFOpType>;
  using OpSpecAttrs = typename QDQOpSpec<TFOpType>::Attrs;
  OpSpecAttrs attrs_;
};

REGISTER_DEFAULT_TRT_OP_CONVERTER(
    MakeConverterFunction<ConvertQDQ<ops::QuantizeAndDequantizeV2>>(),
    "QuantizeAndDequantizeV2");
REGISTER_DEFAULT_TRT_OP_CONVERTER(
    MakeConverterFunction<ConvertQDQ<ops::QuantizeAndDequantizeV3>>(),
    "QuantizeAndDequantizeV3");
REGISTER_DEFAULT_TRT_OP_CONVERTER(
    MakeConverterFunction<ConvertQDQ<ops::FakeQuantWithMinMaxVars>>(),
    "FakeQuantWithMinMaxVars");
REGISTER_DEFAULT_TRT_OP_CONVERTER(
    MakeConverterFunction<ConvertQDQ<ops::FakeQuantWithMinMaxArgs>>(),
    "FakeQuantWithMinMaxArgs");

}  // namespace convert
}  // namespace tensorrt
}  // namespace tensorflow

#endif  // GOOGLE_CUDA && GOOGLE_TENSORRT