/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#ifndef TENSORFLOW_LITE_DELEGATES_GPU_CL_INFERENCE_CONTEXT_H_
#define TENSORFLOW_LITE_DELEGATES_GPU_CL_INFERENCE_CONTEXT_H_

#include <cstdint>
#include <functional>
#include <map>
#include <memory>
#include <string>
#include <vector>

#include "absl/container/flat_hash_map.h"
#include "tensorflow/lite/delegates/gpu/cl/buffer.h"
#include "tensorflow/lite/delegates/gpu/cl/cl_command_queue.h"
#include "tensorflow/lite/delegates/gpu/cl/cl_operation.h"
#include "tensorflow/lite/delegates/gpu/cl/environment.h"
#include "tensorflow/lite/delegates/gpu/cl/gpu_object.h"
#include "tensorflow/lite/delegates/gpu/cl/opencl_wrapper.h"
#include "tensorflow/lite/delegates/gpu/cl/recordable_queue_builder.h"
#include "tensorflow/lite/delegates/gpu/cl/serialization_generated.h"
#include "tensorflow/lite/delegates/gpu/cl/tensor.h"
#include "tensorflow/lite/delegates/gpu/common/gpu_model.h"
#include "tensorflow/lite/delegates/gpu/common/model.h"
#include "tensorflow/lite/delegates/gpu/common/model_hints.h"
#include "tensorflow/lite/delegates/gpu/common/precision.h"
#include "tensorflow/lite/delegates/gpu/common/status.h"
#include "tensorflow/lite/delegates/gpu/common/task/tensor_desc.h"
#include "tensorflow/lite/delegates/gpu/common/tensor.h"

namespace tflite {
namespace gpu {
namespace cl {

struct CLNode {
  ClOperation cl_operation;
  std::vector<ValueId> inputs;
  std::vector<ValueId> outputs;

  // Mostly for debug purposes.
  std::string name;

  CLNode() = default;

  CLNode(CLNode&& node) = default;
  CLNode& operator=(CLNode&& node) = default;
  CLNode(const CLNode&) = delete;
  CLNode& operator=(const CLNode&) = delete;
};

enum class TensorType { kVariable, kConst, kExternal, kRuntime };

class InferenceContext {
 public:
  absl::Status InitFromGraph(const CreateGpuModelInfo& create_info,
                             const GraphFloat32& graph, Environment* env,
                             std::vector<uint8_t>* serialized_model = nullptr);

  absl::Status InitFromGpuModel(
      const CreateGpuModelInfo& create_info, GpuModel* gpu_model,
      Environment* env, std::vector<uint8_t>* serialized_model = nullptr,
      Buffer* shared_buffer = nullptr);

  absl::Status AddToCommanBuffer(cl_command_buffer_khr cb);
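  // A minimal sketch of the intended call sequence, not a definitive recipe.
  // Assumptions: `create_info`, `graph`, and `env` (an Environment) are set up
  // by the caller, `queue` is `env.queue()`, `input_cpu` is a host
  // TensorFloat32 already filled with data, and RETURN_IF_ERROR is used for
  // brevity.
  //
  //   InferenceContext context;
  //   RETURN_IF_ERROR(context.InitFromGraph(create_info, graph, &env));
  //   RETURN_IF_ERROR(
  //       context.SetInputTensor(context.GetInputIds()[0], input_cpu, queue));
  //   RETURN_IF_ERROR(context.AddToQueue(queue));
  //   RETURN_IF_ERROR(queue->WaitForCompletion());
  //   TensorFloat32 output_cpu;
  //   RETURN_IF_ERROR(context.GetOutputTensor(context.GetOutputIds()[0], queue,
  //                                           &output_cpu));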
  // Applies OpenCL-specific transformations to the graph before
  // initialization. These transformations are either impossible or useless in
  // other backends.
  absl::Status InitFromGraphWithTransforms(
      const CreateGpuModelInfo& create_info, GraphFloat32* graph,
      Environment* env, std::vector<uint8_t>* serialized_model = nullptr);

  absl::Status AddToQueue(CLCommandQueue* queue);
  absl::Status Profile(ProfilingCommandQueue* queue, ProfilingInfo* result);
  // For profiling and memory statistics.
  uint64_t GetSizeOfMemoryAllocatedForIntermediateTensors() const;
  uint64_t GetConstantTensorsSize() const;

  absl::Status SetInputTensor(ValueId id, const TensorFloat32& tensor,
                              CLCommandQueue* queue);

  // Works only with input/output tensor ids; there are no guarantees for any
  // other ids.
  Tensor* GetTensor(ValueId id);

  absl::Status GetOutputTensor(ValueId id, CLCommandQueue* queue,
                               TensorFloat32* result);

  const std::vector<ValueId>& GetInputIds() const { return input_ids_; }
  const std::vector<ValueId>& GetOutputIds() const { return output_ids_; }

  absl::Status RestoreDeserialized(
      const absl::Span<const uint8_t> serialized_model, Environment* env,
      CreateGpuModelInfo* create_info = nullptr);

  // Can be used only with ids from external_mutable_tensors in create_info.
  // Must be called after initialization and before execution.
  absl::Status SetTensor(const ValueId& tensor_id, Tensor* tensor_ptr);
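  // A hedged sketch of the serialization round trip; `create_info`, `graph`,
  // and `env` are assumed to come from the caller, as above.
  //
  //   std::vector<uint8_t> serialized_model;
  //   InferenceContext context;
  //   RETURN_IF_ERROR(
  //       context.InitFromGraph(create_info, graph, &env, &serialized_model));
  //   // ... persist `serialized_model`, then later ...
  //   InferenceContext restored;
  //   RETURN_IF_ERROR(restored.RestoreDeserialized(
  //       absl::MakeConstSpan(serialized_model), &env));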
 private:
  flatbuffers::Offset<data::InferenceContext> Encode(
      const CLDevice& device, const ProgramCache& program_cache,
      flatbuffers::Offset<tflite::gpu::data::GpuModel> gpu_model_fb,
      flatbuffers::FlatBufferBuilder* builder);

  void InitFromGpuModel(GpuModel* gpu_model);

  absl::Status AllocateMemory(const GpuModel& gpu_model,
                              const GpuInfo& gpu_info,
                              const CreateGpuModelInfo* create_info,
                              CLContext* context);

  absl::Status AllocateConstTensors(const GpuModel& gpu_model,
                                    CLContext* context);

  absl::Status AllocateVariableTensors(const GpuModel& gpu_model,
                                       CLContext* context);

  absl::Status AllocateBufferBasedTensors(const GpuModel& gpu_model,
                                          const GpuInfo& gpu_info,
                                          const CreateGpuModelInfo* create_info,
                                          CLContext* context);

  absl::Status AllocateStrongShapesTensors(
      const GpuModel& gpu_model, const GpuInfo& gpu_info,
      const CreateGpuModelInfo* create_info, CLContext* context);

  void BindMemoryToOperations();
  absl::Status Compile(const CreationContext& creation_context);
  absl::Status Tune(TuningType tuning_type, const GpuInfo& gpu_info,
                    ProfilingCommandQueue* profiling_queue);
  absl::Status UpdateParams();
  void PrepareExternal();

  void InitRecordableQueue(Environment* env);

  absl::Status ProfileTime(ProfilingCommandQueue* queue, ProfilingInfo* result);

  struct ExecutionHints {
    bool need_flush = false;

    bool flush_periodically = false;
    int flush_period = 1;

    // To reduce memory leaks on Mali, the pipeline needs to be synchronized
    // with the CPU to keep the internal global OpenCL kernel pool from
    // growing. One trick is to enqueue an event from the previous run; most of
    // the time it has already executed on the GPU and does not stall the
    // pipeline.
    bool need_manual_release = false;
    CLEvent prev_enqueue_start_point;

    void Init(const GpuInfo& gpu_info);
  };
  ExecutionHints execution_hints_;

  // Nodes mapped directly from the graph, though some of them are "inactive"
  // due to fusion (inactive = fused).
  // Memory is allocated only once, in ConvertOperations, and is not modified
  // anywhere.
  std::vector<CLNode> nodes_;

  absl::flat_hash_map<ValueId, Tensor*> external_immutable_tensors_;
  absl::flat_hash_map<ValueId, Tensor*> external_mutable_tensors_;
  absl::flat_hash_map<ValueId, std::vector<int>> external_tensor_to_nodes_;

  std::map<ValueId, Tensor> const_tensors_;

  std::map<ValueId, ValueId> variable_ids_and_refs_;
  std::map<ValueId, Tensor> variable_tensors_;

  std::unique_ptr<Buffer> shared_buffers_parent_;
  Buffer* shared_buffers_parent_ptr_ = nullptr;
  std::vector<Buffer> shared_buffers_;
  // These tensors use references to memory from shared_buffers_.
  std::vector<Tensor> shared_buffer_tensors_;
  std::map<ValueId, int> graph_ids_to_shared_buffer_tensors_;

  std::map<ValueId, Tensor> strong_shape_tensors_;
  std::map<ValueId, ValueId> graph_ids_to_strong_shape_tensors_;

  std::vector<ValueId> input_ids_;
  std::vector<ValueId> output_ids_;

  std::unique_ptr<RecordableQueue> recordable_queue_ = nullptr;

  GpuInfo gpu_info_;
};

absl::Status GetInOutRefs(const absl::Span<const uint8_t> serialized_model,
                          std::vector<int64_t>* in_refs,
                          std::vector<int64_t>* out_refs);

absl::Status GetTotalBufferSizeForTensors(const GpuModel& gpu_model,
                                          const CreateGpuModelInfo& create_info,
                                          const GpuInfo& gpu_info,
                                          uint64_t* result);

}  // namespace cl
}  // namespace gpu
}  // namespace tflite

#endif  // TENSORFLOW_LITE_DELEGATES_GPU_CL_INFERENCE_CONTEXT_H_