xref: /aosp_15_r20/external/tensorflow/tensorflow/compiler/tf2tensorrt/trt_convert_api.h (revision b6fb3261f9314811a0f4371741dbb8839866f948)
1 /* Copyright 2021 The TensorFlow Authors. All Rights Reserved.
2 
3 Licensed under the Apache License, Version 2.0 (the "License");
4 you may not use this file except in compliance with the License.
5 You may obtain a copy of the License at
6 
7     http://www.apache.org/licenses/LICENSE-2.0
8 
9 Unless required by applicable law or agreed to in writing, software
10 distributed under the License is distributed on an "AS IS" BASIS,
11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 See the License for the specific language governing permissions and
13 limitations under the License.
14 ==============================================================================*/
15 
16 #ifndef TENSORFLOW_COMPILER_TF2TENSORRT_TRT_CONVERT_H_
17 #define TENSORFLOW_COMPILER_TF2TENSORRT_TRT_CONVERT_H_
18 
19 #include <climits>
20 #include <string>
21 #include <vector>
22 
23 #if GOOGLE_CUDA && GOOGLE_TENSORRT
24 
25 #include "tensorflow/compiler/tf2tensorrt/common/utils.h"
26 #include "tensorflow/compiler/tf2tensorrt/convert/trt_parameters.h"
27 #include "tensorflow/core/framework/tensor.h"
28 #include "tensorflow/core/platform/statusor.h"
29 #include "tensorflow/core/protobuf/meta_graph.pb.h"
30 
31 namespace tensorflow {
32 
33 struct SavedModelBundle;
34 
35 namespace tensorrt {
36 
// Parameters controlling the TF-TRT (TensorFlow-to-TensorRT) conversion.
// All fields have defaults, so callers can override only what they need.
struct TfTrtConversionParams {
  // Corresponds to the 'workspaceSize' parameter of
  // nvinfer1::IBuilderConfig::setMaxWorkspaceSize.
#if IS_TRT_VERSION_GE(8, 4, 0, 0)
  // Must use `LLONG_MAX - 512` to avoid overflow during casting.
  size_t max_workspace_size_bytes = LLONG_MAX - 512;
#else
  size_t max_workspace_size_bytes = 1 << 30;  // 1,073,741,824 (1 GiB)
#endif

  // Minimum precision used by the TRT Engine.
  TrtPrecisionMode precision_mode = TrtPrecisionMode::FP32;

  // The minimum number of nodes required for a subgraph to be replaced by
  // TRTEngineOp. Note that many small TRT subgraphs could be detrimental for
  // performance, increasing the minimum segment size can help avoid the
  // problem.
  int minimum_segment_size = 3;

  // Max number of cached TRT engines for dynamic TRT ops (by default we have
  // dynamic TRT ops).
  int max_cached_engines = 1;

  // Note that calibration is currently not implemented with the C++ converter.
  // This argument is ignored if precision_mode is not INT8. If set to True, the
  // implementation will use the user provided inputs to generate calibration
  // data. If set to False, quantization nodes will be expected for every tensor
  // in the graph (excluding those which will be fused). If a range is missing,
  // an error will occur. Please note that accuracy may be negatively affected
  // if there is a mismatch between which tensors TRT quantizes and which
  // tensors were trained with fake quantization.
  bool use_calibration = true;

  // Whether to enable dynamic shape mode for the TRT engines. It is
  // recommended to use_dynamic_shape mode to handle dynamic input shape.
  // Enabling dynamic shape mode can also improve the conversion rate of graphs
  // with static input shape.
  bool use_dynamic_shape = true;

  // In dynamic shape mode we create an engine that can handle various input
  // shape ranges. We derive the shape optimization profiles for the TRT engines
  // in the graph based on user provided input data and profile_strategy.
  ProfileStrategy profile_strategy = ProfileStrategy::kRange;

  // Whether to allow building TRT engines at runtime. If no TensorRT engine
  // can be found in cache that can handle the given inputs during runtime,
  // then a new TensorRT engine is built at runtime if
  // allow_build_at_runtime=True, otherwise native TF is used. We recommend to
  // set this value false and build the engine in advance, to avoid runtime
  // overhead.
  bool allow_build_at_runtime = true;

  // Record the TRT engine as an attribute of the TRTEngineOp. This is only
  // valid when max_cached_engines == 1. Note: the frozen graph together with
  // the serialized engines have to be below 2GiB (protobuf size limit). If
  // convert_to_static_engine = false, then the converted graph_def only
  // contains placeholder TRTEngineOp nodes.
  bool convert_to_static_engine = true;
};
95 
/**
 * Converts the graph with TF-TRT.
 *
 * Performs TF-TRT conversion and returns the converted GraphDef. If `inputs`
 * is not empty and `convert_to_static_engine` is requested, we also build the
 * engines and convert the engines to static engines.
 *
 * Arguments:
 * - frozen_graph_def: input graph, it is assumed to be frozen
 * - input_names: names of the input tensors
 * - output_names: names of the output tensors
 * - inputs: tensors that we will use as input while building the TRT engines;
 *   each inner vector is one set of input tensors (one per input_name)
 * - conv_params: parameters for the TF-TRT conversion
 *
 * Returns the converted graph_def, or an error status if conversion fails.
 */
StatusOr<GraphDef> ConvertAndBuild(
    const GraphDef& frozen_graph_def, const std::vector<string>& input_names,
    const std::vector<string>& output_names,
    const std::vector<std::vector<tensorflow::Tensor>>& inputs,
    const TfTrtConversionParams& conv_params);
117 
/**
 * Converts a SavedModel signature with TF-TRT.
 *
 * Convenience overload: derives the input/output tensor names from the
 * signature `signature_key` of the loaded `bundle`, then performs the same
 * conversion as the GraphDef-based ConvertAndBuild above.
 *
 * Arguments:
 * - bundle: loaded SavedModel; must not be null (non-owning pointer)
 * - signature_key: name of the signature to convert
 * - inputs: optional sets of input tensors used to build the TRT engines
 * - conversion_params: parameters for the TF-TRT conversion
 *
 * Returns the converted graph_def, or an error status if conversion fails.
 */
StatusOr<GraphDef> ConvertAndBuild(
    SavedModelBundle* bundle,
    const std::string& signature_key = "serving_default",
    const std::vector<std::vector<tensorflow::Tensor>>& inputs = {},
    const TfTrtConversionParams& conversion_params = TfTrtConversionParams());
123 
124 }  // namespace tensorrt
125 }  // namespace tensorflow
126 
127 #endif  // GOOGLE_CUDA && GOOGLE_TENSORRT
128 
129 #endif  // TENSORFLOW_COMPILER_TF2TENSORRT_TRT_CONVERT_H_
130