xref: /aosp_15_r20/external/tensorflow/tensorflow/lite/delegates/gpu/cl/inference_context.h (revision b6fb3261f9314811a0f4371741dbb8839866f948)
1 /* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
2 
3 Licensed under the Apache License, Version 2.0 (the "License");
4 you may not use this file except in compliance with the License.
5 You may obtain a copy of the License at
6 
7     http://www.apache.org/licenses/LICENSE-2.0
8 
9 Unless required by applicable law or agreed to in writing, software
10 distributed under the License is distributed on an "AS IS" BASIS,
11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 See the License for the specific language governing permissions and
13 limitations under the License.
14 ==============================================================================*/
15 
16 #ifndef TENSORFLOW_LITE_DELEGATES_GPU_CL_INFERENCE_CONTEXT_H_
17 #define TENSORFLOW_LITE_DELEGATES_GPU_CL_INFERENCE_CONTEXT_H_
18 
19 #include <cstdint>
20 #include <functional>
21 #include <map>
22 #include <memory>
23 #include <string>
24 #include <vector>
25 
26 #include "absl/container/flat_hash_map.h"
27 #include "tensorflow/lite/delegates/gpu/cl/buffer.h"
28 #include "tensorflow/lite/delegates/gpu/cl/cl_command_queue.h"
29 #include "tensorflow/lite/delegates/gpu/cl/cl_operation.h"
30 #include "tensorflow/lite/delegates/gpu/cl/environment.h"
31 #include "tensorflow/lite/delegates/gpu/cl/gpu_object.h"
32 #include "tensorflow/lite/delegates/gpu/cl/opencl_wrapper.h"
33 #include "tensorflow/lite/delegates/gpu/cl/recordable_queue_builder.h"
34 #include "tensorflow/lite/delegates/gpu/cl/serialization_generated.h"
35 #include "tensorflow/lite/delegates/gpu/cl/tensor.h"
36 #include "tensorflow/lite/delegates/gpu/common/gpu_model.h"
37 #include "tensorflow/lite/delegates/gpu/common/model.h"
38 #include "tensorflow/lite/delegates/gpu/common/model_hints.h"
39 #include "tensorflow/lite/delegates/gpu/common/precision.h"
40 #include "tensorflow/lite/delegates/gpu/common/status.h"
41 #include "tensorflow/lite/delegates/gpu/common/task/tensor_desc.h"
42 #include "tensorflow/lite/delegates/gpu/common/tensor.h"
43 
44 namespace tflite {
45 namespace gpu {
46 namespace cl {
47 
// One schedulable unit of the compiled graph: a single GPU operation plus the
// ids of the tensors it reads and writes.
struct CLNode {
  ClOperation cl_operation;
  // Ids of the tensors this operation consumes.
  std::vector<ValueId> inputs;
  // Ids of the tensors this operation produces.
  std::vector<ValueId> outputs;

  // Mostly for debug purposes.
  std::string name;

  CLNode() = default;

  // Move-only — presumably because ClOperation holds non-copyable GPU
  // resources (TODO confirm against cl_operation.h).
  CLNode(CLNode&& node) = default;
  CLNode& operator=(CLNode&& node) = default;
  CLNode(const CLNode&) = delete;
  CLNode& operator=(const CLNode&) = delete;
};
63 
64 enum class TensorType { kVariable, kConst, kExternal, kRuntime };
65 
66 class InferenceContext {
67  public:
68   absl::Status InitFromGraph(const CreateGpuModelInfo& create_info,
69                              const GraphFloat32& graph, Environment* env,
70                              std::vector<uint8_t>* serialized_model = nullptr);
71 
72   absl::Status InitFromGpuModel(
73       const CreateGpuModelInfo& create_info, GpuModel* gpu_model,
74       Environment* env, std::vector<uint8_t>* serialized_model = nullptr,
75       Buffer* shared_buffer = nullptr);
76 
77   absl::Status AddToCommanBuffer(cl_command_buffer_khr cb);
78 
79   // Applies OpenCL-specific transformations to the graph before the
80   // initialization. These transformations are either impossible or useless in
81   // other backends.
82   absl::Status InitFromGraphWithTransforms(
83       const CreateGpuModelInfo& create_info, GraphFloat32* graph,
84       Environment* env, std::vector<uint8_t>* serialized_model = nullptr);
85 
86   absl::Status AddToQueue(CLCommandQueue* queue);
87   absl::Status Profile(ProfilingCommandQueue* queue, ProfilingInfo* result);
88   // for profiling and memory statistics
89   uint64_t GetSizeOfMemoryAllocatedForIntermediateTensors() const;
90   uint64_t GetConstantTensorsSize() const;
91 
92   absl::Status SetInputTensor(ValueId id, const TensorFloat32& tensor,
93                               CLCommandQueue* queue);
94 
95   // It will work only with input/output tensor ids. For all other ids we don't
96   // have any guarantees.
97   Tensor* GetTensor(ValueId id);
98 
99   absl::Status GetOutputTensor(ValueId id, CLCommandQueue* queue,
100                                TensorFloat32* result);
101 
GetInputIds()102   const std::vector<ValueId>& GetInputIds() const { return input_ids_; }
GetOutputIds()103   const std::vector<ValueId>& GetOutputIds() const { return output_ids_; }
104 
105   absl::Status RestoreDeserialized(
106       const absl::Span<const uint8_t> serialized_model, Environment* env,
107       CreateGpuModelInfo* create_info = nullptr);
108 
109   // Can be used only with ids from external_mutable_tensors in create_info
110   // Must be called after initialization and before execution
111   absl::Status SetTensor(const ValueId& tensor_id, Tensor* tensor_ptr);
112 
113  private:
114   flatbuffers::Offset<data::InferenceContext> Encode(
115       const CLDevice& device, const ProgramCache& program_cache,
116       flatbuffers::Offset<tflite::gpu::data::GpuModel> gpu_model_fb,
117       flatbuffers::FlatBufferBuilder* builder);
118 
119   void InitFromGpuModel(GpuModel* gpu_model);
120 
121   absl::Status AllocateMemory(const GpuModel& gpu_model,
122                               const GpuInfo& gpu_info,
123                               const CreateGpuModelInfo* create_info,
124                               CLContext* context);
125 
126   absl::Status AllocateConstTensors(const GpuModel& gpu_model,
127                                     CLContext* context);
128 
129   absl::Status AllocateVariableTensors(const GpuModel& gpu_model,
130                                        CLContext* context);
131 
132   absl::Status AllocateBufferBasedTensors(const GpuModel& gpu_model,
133                                           const GpuInfo& gpu_info,
134                                           const CreateGpuModelInfo* create_info,
135                                           CLContext* context);
136 
137   absl::Status AllocateStrongShapesTensors(
138       const GpuModel& gpu_model, const GpuInfo& gpu_info,
139       const CreateGpuModelInfo* create_info, CLContext* context);
140 
141   void BindMemoryToOperations();
142   absl::Status Compile(const CreationContext& creation_context);
143   absl::Status Tune(TuningType tuning_type, const GpuInfo& gpu_info,
144                     ProfilingCommandQueue* profiling_queue);
145   absl::Status UpdateParams();
146   void PrepareExternal();
147 
148   void InitRecordableQueue(Environment* env);
149 
150   absl::Status ProfileTime(ProfilingCommandQueue* queue, ProfilingInfo* result);
151 
152   struct ExecutionHints {
153     bool need_flush = false;
154 
155     bool flush_periodically = false;
156     int flush_period = 1;
157 
158     // In order to reduce memory leak on Mali a pipeline needs to be
159     // synchronized with CPU to prevent growing internal global OpenCL kernel
160     // pool. One trick is to enqueue an event from a previous run. Most of the
161     // time is should already be executed on GPU and should not stall the
162     // pipeline.
163     bool need_manual_release = false;
164     CLEvent prev_enqueue_start_point;
165 
166     void Init(const GpuInfo& gpu_info);
167   };
168   ExecutionHints execution_hints_;
169 
170   // Directly mapped nodes from graph, but some of them "inactive" due
171   //  to fusion (inactive = fused).
172   // Memory is allocated only once, in ConvertOperations, and is not modified
173   //  anywhere.
174   std::vector<CLNode> nodes_;
175 
176   absl::flat_hash_map<ValueId, Tensor*> external_immutable_tensors_;
177   absl::flat_hash_map<ValueId, Tensor*> external_mutable_tensors_;
178   absl::flat_hash_map<ValueId, std::vector<int>> external_tensor_to_nodes_;
179 
180   std::map<ValueId, Tensor> const_tensors_;
181 
182   std::map<ValueId, ValueId> variable_ids_and_refs_;
183   std::map<ValueId, Tensor> variable_tensors_;
184 
185   std::unique_ptr<Buffer> shared_buffers_parent_;
186   Buffer* shared_buffers_parent_ptr_ = nullptr;
187   std::vector<Buffer> shared_buffers_;
188   std::vector<Tensor>
189       shared_buffer_tensors_;  // use references to memory from shared_buffers_
190   std::map<ValueId, int> graph_ids_to_shared_buffer_tensors_;
191 
192   std::map<ValueId, Tensor> strong_shape_tensors_;
193   std::map<ValueId, ValueId> graph_ids_to_strong_shape_tensors_;
194 
195   std::vector<ValueId> input_ids_;
196   std::vector<ValueId> output_ids_;
197 
198   std::unique_ptr<RecordableQueue> recordable_queue_ = nullptr;
199 
200   GpuInfo gpu_info_;
201 };
202 
// Extracts the input and output tensor reference ids from a serialized model
// into `in_refs`/`out_refs` — presumably without rebuilding a full
// InferenceContext (declaration only here; confirm in the .cc).
absl::Status GetInOutRefs(const absl::Span<const uint8_t> serialized_model,
                          std::vector<int64_t>* in_refs,
                          std::vector<int64_t>* out_refs);
206 
// Computes into `result` the total buffer size, in bytes, needed for the
// tensors of `gpu_model` under the given creation options and device info.
absl::Status GetTotalBufferSizeForTensors(const GpuModel& gpu_model,
                                          const CreateGpuModelInfo& create_info,
                                          const GpuInfo& gpu_info,
                                          uint64_t* result);
211 
212 }  // namespace cl
213 }  // namespace gpu
214 }  // namespace tflite
215 
216 #endif  // TENSORFLOW_LITE_DELEGATES_GPU_CL_INFERENCE_CONTEXT_H_
217