/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#include "tensorflow/lite/delegates/gpu/cl/inference_context.h"

#include <algorithm>
#include <cmath>
#include <cstdint>
#include <cstring>
#include <functional>
#include <limits>
#include <map>
#include <memory>
#include <numeric>
#include <set>
#include <string>
#include <utility>
#include <vector>

#include "absl/container/flat_hash_map.h"
#include "absl/container/flat_hash_set.h"
#include "tensorflow/lite/delegates/gpu/cl/buffer.h"
#include "tensorflow/lite/delegates/gpu/cl/cl_device.h"
#include "tensorflow/lite/delegates/gpu/cl/serialization_generated.h"
#include "tensorflow/lite/delegates/gpu/common/data_type.h"
#include "tensorflow/lite/delegates/gpu/common/gpu_model.h"
#include "tensorflow/lite/delegates/gpu/common/gpu_model_generated.h"
#include "tensorflow/lite/delegates/gpu/common/memory_management.h"
#include "tensorflow/lite/delegates/gpu/common/model.h"
#include "tensorflow/lite/delegates/gpu/common/shape.h"
#include "tensorflow/lite/delegates/gpu/common/task/gpu_operation.h"
#include "tensorflow/lite/delegates/gpu/common/task/serialization_base.h"
#include "tensorflow/lite/delegates/gpu/common/task/tensor_desc.h"
#include "tensorflow/lite/delegates/gpu/common/types.h"
#include "tensorflow/lite/delegates/gpu/common/util.h"

namespace tflite {
namespace gpu {
namespace cl {

namespace {
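// Records the usage interval of a tensor: `x` holds the index of the first
// task that touches the tensor and `y` the index of the last one.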
void AddUsage(ValueId id, int task_index,
              std::map<ValueId, int2>* usage_records) {
  auto it = usage_records->find(id);
  if (it == usage_records->end()) {
    (*usage_records)[id].x = task_index;
    (*usage_records)[id].y = task_index;
  } else {
    (*usage_records)[id].y = task_index;
  }
}

// Returns true if the actual memory for this storage type will be allocated
// with clCreateBuffer.
bool IsBufferBased(const GpuInfo& gpu_info, const TensorStorageType& type) {
  const bool image2d_based_buffer =
      (type == TensorStorageType::TEXTURE_2D ||
       type == TensorStorageType::SINGLE_TEXTURE_2D) &&
      gpu_info.opencl_info.IsImage2dFromBufferSupported();
  return type == TensorStorageType::BUFFER ||
         type == TensorStorageType::IMAGE_BUFFER || image2d_based_buffer;
}

// Calculates the total size of the assignment.
size_t TotalSize(const ObjectsAssignment<size_t>& assignment,
                 size_t alignment = 1) {
  size_t total_size = 0;
  for (auto object_size : assignment.object_sizes) {
    total_size += AlignByN(object_size, alignment);
  }
  return total_size;
}

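// Classifies a tensor id in priority order: variable tensors first, then
// tensors marked external (immutable or mutable) in `create_info`, then
// constant tensors baked into the model, and everything else as runtime.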
TensorType GetTensorType(const GpuModel& gpu_model,
                         const CreateGpuModelInfo* create_info,
                         const GpuInfo& gpu_info, ValueId id) {
  bool is_variable = false;
  for (int i = 0; i < gpu_model.variable_ids_and_refs.size(); ++i) {
    if (gpu_model.variable_ids_and_refs[i].first == id) {
      is_variable = true;
      break;
    }
  }
  if (is_variable) {
    return TensorType::kVariable;
  } else if (create_info &&
             (create_info->external_immutable_tensors.find(id) !=
                  create_info->external_immutable_tensors.end() ||
              create_info->external_mutable_tensors.find(id) !=
                  create_info->external_mutable_tensors.end())) {
    return TensorType::kExternal;
  } else if (gpu_model.const_tensors.find(id) !=
             gpu_model.const_tensors.end()) {
    return TensorType::kConst;
  } else {
    return TensorType::kRuntime;
  }
}

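// Collects usage intervals for every tensor accepted by `functor`. Graph
// inputs are counted as used at task 0 and graph outputs at task
// model.nodes.size(), so they stay alive across the whole execution.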
void GetUsages(const GpuModel& model,
               const std::function<bool(ValueId)>& functor,
               std::map<ValueId, int2>* usages) {
  for (const auto& in_id : model.input_ids_and_refs) {
    if (functor(in_id.first)) {
      AddUsage(in_id.first, 0, usages);
    }
  }
  for (int op_index = 0; op_index < model.nodes.size(); ++op_index) {
    for (auto input_id : model.nodes[op_index].inputs) {
      if (functor(input_id)) {
        AddUsage(input_id, op_index, usages);
      }
    }
    for (auto output_id : model.nodes[op_index].outputs) {
      if (functor(output_id)) {
        AddUsage(output_id, op_index, usages);
      }
    }
  }
  for (const auto& out_id : model.output_ids_and_refs) {
    if (functor(out_id.first)) {
      AddUsage(out_id.first, model.nodes.size(), usages);
    }
  }
}

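// Builds usage records for all runtime tensors whose storage is backed by a
// buffer, sizes them in bytes, and computes a shared-object assignment
// (GREEDY_BEST). When sub-buffers are usable it additionally computes an
// offset-based assignment (GREEDY_BY_SIZE) and signals via
// `use_offset_assignment` whether that layout is no larger than the object
// assignment and fits within the device's maximum buffer size.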
absl::Status GetBufferAssignment(
    const GpuModel& gpu_model, const CreateGpuModelInfo* create_info,
    const GpuInfo& gpu_info,
    std::vector<TensorUsageRecord<size_t>>* buffer_usage_records,
    std::map<ValueId, int>* graph_ids_to_shared_buffer_tensors,
    ObjectsAssignment<size_t>* buffer_assignment,
    OffsetsAssignment* offset_assignment, bool* use_offset_assignment,
    bool* is_sub_buffers_supported) {
  std::map<ValueId, int2> buffer_usages;
  GetUsages(
      gpu_model,
      [&gpu_model, &gpu_info, &create_info](ValueId id) {
        return GetTensorType(gpu_model, create_info, gpu_info, id) ==
                   TensorType::kRuntime &&
               IsBufferBased(gpu_info,
                             gpu_model.tensors.at(id).GetStorageType());
      },
      &buffer_usages);

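  // Compute the byte size each tensor needs inside a plain cl_mem buffer.
  // Tensors that will be viewed as 2D images on top of a buffer must respect
  // the device's image pitch alignment, so their rows are padded accordingly
  // (on Adreno the pitch alignment is additionally divided by the bytes per
  // pixel when it divides evenly).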
  bool has_buffer_based_images = false;
  for (auto& usage : buffer_usages) {
    const auto& t = gpu_model.tensors.at(usage.first);
    const auto& shape = t.GetBHWDCShape();
    const auto& descriptor = t;
    const size_t element_size = SizeOf(descriptor.GetDataType());
    size_t buffer_size;
    if (descriptor.GetStorageType() == TensorStorageType::TEXTURE_2D ||
        descriptor.GetStorageType() == TensorStorageType::SINGLE_TEXTURE_2D) {
      has_buffer_based_images = true;
      const size_t bytes_per_pixel =
          element_size *
          (descriptor.GetStorageType() == TensorStorageType::TEXTURE_2D
               ? 4
               : shape.c);
      const size_t width = shape.b * shape.w;
      const size_t height = shape.h * DivideRoundUp(shape.c, 4);
      size_t width_pixel_alignment = gpu_info.opencl_info.image_pitch_alignment;
      if (gpu_info.IsAdreno() && width_pixel_alignment % bytes_per_pixel == 0) {
        width_pixel_alignment /= bytes_per_pixel;
      }
      const size_t width_aligned = AlignByN(width, width_pixel_alignment);
      buffer_size = width_aligned * bytes_per_pixel * height;
    } else {
      if (descriptor.GetStorageType() == TensorStorageType::IMAGE_BUFFER) {
        has_buffer_based_images = true;
      }
      buffer_size =
          shape.b * shape.w * shape.h * AlignByN(shape.c, 4) * element_size;
    }
    if (graph_ids_to_shared_buffer_tensors) {
      (*graph_ids_to_shared_buffer_tensors)[usage.first] =
          buffer_usage_records->size();
    }
    buffer_usage_records->push_back({buffer_size,
                                     static_cast<TaskId>(usage.second.x),
                                     static_cast<TaskId>(usage.second.y)});
  }

  RETURN_IF_ERROR(AssignObjectsToTensors(
      *buffer_usage_records, MemoryStrategy::GREEDY_BEST, buffer_assignment));

  *is_sub_buffers_supported =
      (!has_buffer_based_images && gpu_info.IsCL11OrHigher()) ||
      CanUseSubBufferForImage2d(gpu_info);
  const size_t base_align_bytes =
      std::max<size_t>(gpu_info.opencl_info.base_addr_align_in_bits >> 3, 1);

  *use_offset_assignment = false;
  if (*is_sub_buffers_supported) {
    RETURN_IF_ERROR(AssignOffsetsToTensors(
        *buffer_usage_records, MemoryStrategy::GREEDY_BY_SIZE,
        offset_assignment, base_align_bytes));
    if (offset_assignment->total_size <= TotalSize(*buffer_assignment) &&
        offset_assignment->total_size <= gpu_info.GetMaxBufferSize()) {
      *use_offset_assignment = true;
    }
  }
  return absl::OkStatus();
}

}  // namespace

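// Vendor-specific scheduling hints: Mali devices need explicit flushes and,
// before Valhall, manual waiting on the previous enqueue event, plus a flush
// every 24 enqueued ops; PowerVR devices get a flush every 16 ops.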
void InferenceContext::ExecutionHints::Init(const GpuInfo& gpu_info) {
  if (gpu_info.IsMali()) {
    need_flush = true;
    need_manual_release = !gpu_info.mali_info.IsValhall();

    flush_periodically = true;
    flush_period = 24;
  }
  if (gpu_info.IsPowerVR()) {
    need_flush = true;
    flush_periodically = true;
    flush_period = 16;
  }
}

absl::Status InferenceContext::InitFromGraph(
    const CreateGpuModelInfo& create_info, const GraphFloat32& graph,
    Environment* env, std::vector<uint8_t>* serialized_model) {
  GpuModel gpu_model;
  RETURN_IF_ERROR(GraphToGpuModel(graph, create_info,
                                  env->GetDevicePtr()->GetInfo(), &gpu_model));
  return InitFromGpuModel(create_info, &gpu_model, env, serialized_model);
}

absl::Status InferenceContext::InitFromGpuModel(
    const CreateGpuModelInfo& create_info, GpuModel* gpu_model,
    Environment* env, std::vector<uint8_t>* serialized_model,
    Buffer* shared_buffer) {
  flatbuffers::FlatBufferBuilder builder;
  flatbuffers::Offset<tflite::gpu::data::GpuModel> gpu_model_fb;
  if (serialized_model) {
    gpu_model_fb = tflite::gpu::Encode(*gpu_model, &builder);
  }
  shared_buffers_parent_ptr_ = shared_buffer;
  RETURN_IF_ERROR(AllocateMemory(*gpu_model, env->GetDevicePtr()->GetInfo(),
                                 &create_info, &env->context()));
  InitFromGpuModel(gpu_model);

  CreationContext creation_context;
  creation_context.device = env->GetDevicePtr();
  creation_context.context = &env->context();
  creation_context.queue = env->queue();
  creation_context.cache = env->program_cache();
  for (const auto& external_tensor : create_info.external_immutable_tensors) {
    auto* cl_spatial_tensor = dynamic_cast<Tensor*>(external_tensor.second);
    if (!cl_spatial_tensor) {
      return absl::InvalidArgumentError("Expected CLSpatialTensor.");
    }
    external_immutable_tensors_[external_tensor.first] = cl_spatial_tensor;
  }
  std::map<ValueId, Tensor> temp_external_tensors;
  for (const auto& external_tensor : create_info.external_mutable_tensors) {
    RETURN_IF_ERROR(
        CreateTensor(env->context(),
                     gpu_model->tensors[external_tensor.first],
                     &temp_external_tensors[external_tensor.first]));
    external_mutable_tensors_[external_tensor.first] =
        &temp_external_tensors[external_tensor.first];
  }
  PrepareExternal();
  execution_hints_.Init(env->device().GetInfo());
  BindMemoryToOperations();
  RETURN_IF_ERROR(Compile(creation_context));
  RETURN_IF_ERROR(UpdateParams());

  TuningType tuning_type = TuningType::kExhaustive;
  if (create_info.hints.Check(ModelHints::kFastTuning)) {
    tuning_type = TuningType::kFast;
  }
  if (env->device().GetInfo().IsMali()) {
    const MaliInfo& info = env->device().GetInfo().mali_info;
    if (info.IsMaliT6xx()) {
      // Mali T628 hangs forever in clFinish when the profiling queue is
      // used; TuningType::kFast does not use the profiling queue.
      tuning_type = TuningType::kFast;
    }
  }
  RETURN_IF_ERROR(
      Tune(tuning_type, env->device().GetInfo(), env->profiling_queue()));
  if (external_mutable_tensors_.empty()) {
    // Use a recordable queue only if there are no mutable external tensors.
    InitRecordableQueue(env);
  }

  for (auto& external_tensor : external_mutable_tensors_) {
    external_tensor.second = nullptr;
  }

  gpu_info_ = env->device().GetInfo();

  if (serialized_model) {
    auto encoded_fb = Encode(*env->GetDevicePtr(), *env->program_cache(),
                             gpu_model_fb, &builder);
    data::FinishInferenceContextBuffer(builder, encoded_fb);
    serialized_model->resize(builder.GetSize());
    std::memcpy(serialized_model->data(), builder.GetBufferPointer(),
                builder.GetSize());
  }
  return absl::OkStatus();
}

absl::Status InferenceContext::AddToCommanBuffer(cl_command_buffer_khr cb) {
  for (auto& node : nodes_) {
    RETURN_IF_ERROR(node.cl_operation.AddToCommanBuffer(cb));
  }
  return absl::OkStatus();
}

absl::Status InferenceContext::RestoreDeserialized(
    const absl::Span<const uint8_t> serialized_model, Environment* env,
    CreateGpuModelInfo* create_info) {
  flatbuffers::Verifier verifier(serialized_model.data(),
                                 serialized_model.size());
  if (!data::VerifyInferenceContextBuffer(verifier)) {
    return absl::DataLossError("Deserialization failed.");
  }
  auto decoded_fb = data::GetInferenceContext(serialized_model.data());
  std::string platform_version(decoded_fb->driver_version()->c_str(),
                               decoded_fb->driver_version()->size());
  if (env->GetDevicePtr()->GetPlatformVersion() != platform_version) {
    return absl::InvalidArgumentError(
        "OpenCL driver changed, model representation invalid, must be "
        "regenerated.");
  }
  GpuModel gpu_model;
  RETURN_IF_ERROR(tflite::gpu::Decode(decoded_fb->gpu_model(), &gpu_model));
  RETURN_IF_ERROR(AllocateMemory(gpu_model, env->GetDevicePtr()->GetInfo(),
                                 create_info, &env->context()));
  InitFromGpuModel(&gpu_model);

  // Deserialize compiled kernels into the program cache.
  for (auto binary_program_fb : *decoded_fb->binary_programs()) {
    RETURN_IF_ERROR(env->program_cache()->AddProgramBinary(
        env->context(), *env->GetDevicePtr(), binary_program_fb->fingerprint(),
        absl::MakeSpan(binary_program_fb->binary()->data(),
                       binary_program_fb->binary()->size())));
  }

  std::map<ValueId, Tensor> temp_external_tensors;
  if (create_info) {
    for (const auto& external_tensor :
         create_info->external_immutable_tensors) {
      auto* cl_spatial_tensor = dynamic_cast<Tensor*>(external_tensor.second);
      if (!cl_spatial_tensor) {
        return absl::InvalidArgumentError("Expected CLSpatialTensor.");
      }
      external_immutable_tensors_[external_tensor.first] = cl_spatial_tensor;
    }
    for (const auto& external_tensor : create_info->external_mutable_tensors) {
      RETURN_IF_ERROR(
          CreateTensor(env->context(),
                       gpu_model.tensors[external_tensor.first],
                       &temp_external_tensors[external_tensor.first]));
      external_mutable_tensors_[external_tensor.first] =
          &temp_external_tensors[external_tensor.first];
    }
  }
  PrepareExternal();

  execution_hints_.Init(env->device().GetInfo());

  BindMemoryToOperations();
  for (int i = 0; i < nodes_.size(); ++i) {
    uint64_t fingerprint = (*decoded_fb->fingerprints_per_node())[i];
    int3 wg_size;
    wg_size.x = (*decoded_fb->tuned_work_group_sizes_per_node())[i]->x();
    wg_size.y = (*decoded_fb->tuned_work_group_sizes_per_node())[i]->y();
    wg_size.z = (*decoded_fb->tuned_work_group_sizes_per_node())[i]->z();
    RETURN_IF_ERROR(nodes_[i].cl_operation.RestoreDeserialized(
        *env->program_cache(), fingerprint, env->GetDevicePtr()->GetInfo(),
        wg_size, &env->context()));
  }
  RETURN_IF_ERROR(UpdateParams());
  if (external_mutable_tensors_.empty()) {
    // Use a recordable queue only if there are no mutable external tensors.
    InitRecordableQueue(env);
  }
  for (auto& external_tensor : external_mutable_tensors_) {
    external_tensor.second = nullptr;
  }
  return absl::OkStatus();
}

void InferenceContext::InitFromGpuModel(GpuModel* gpu_model) {
  for (const auto& input : gpu_model->input_ids_and_refs) {
    input_ids_.push_back(input.first);
  }
  for (const auto& output : gpu_model->output_ids_and_refs) {
    output_ids_.push_back(output.first);
  }
  nodes_.resize(gpu_model->nodes.size());
  for (int i = 0; i < gpu_model->nodes.size(); ++i) {
    nodes_[i].cl_operation.Init(std::move(gpu_model->nodes[i].gpu_operation));
    nodes_[i].inputs = gpu_model->nodes[i].inputs;
    nodes_[i].outputs = gpu_model->nodes[i].outputs;
    nodes_[i].name = gpu_model->nodes[i].name;
  }
}

void InferenceContext::InitRecordableQueue(Environment* env) {
  std::vector<ClOperation*> ops(nodes_.size());
  for (int i = 0; i < nodes_.size(); ++i) {
    ops[i] = &nodes_[i].cl_operation;
  }
  recordable_queue_ = CreateRecordableQueue(ops, env->device(), env->context());
}

absl::Status InferenceContext::InitFromGraphWithTransforms(
    const CreateGpuModelInfo& create_info, GraphFloat32* graph,
    Environment* env, std::vector<uint8_t>* serialized_model) {
  RETURN_IF_ERROR(RunGraphTransformsForGpuModel(graph));
  RETURN_IF_ERROR(InitFromGraph(create_info, *graph, env, serialized_model));
  return absl::OkStatus();
}

absl::Status InferenceContext::AllocateMemory(
    const GpuModel& gpu_model, const GpuInfo& gpu_info,
    const CreateGpuModelInfo* create_info, CLContext* context) {
  RETURN_IF_ERROR(AllocateConstTensors(gpu_model, context));
  RETURN_IF_ERROR(AllocateVariableTensors(gpu_model, context));
  RETURN_IF_ERROR(
      AllocateBufferBasedTensors(gpu_model, gpu_info, create_info, context));
  RETURN_IF_ERROR(
      AllocateStrongShapesTensors(gpu_model, gpu_info, create_info, context));
  return absl::OkStatus();
}

absl::Status InferenceContext::AllocateConstTensors(const GpuModel& gpu_model,
                                                    CLContext* context) {
  for (auto& description : gpu_model.const_tensors) {
    RETURN_IF_ERROR(const_tensors_[description.first].CreateFromDescriptor(
        description.second, context));
  }
  return absl::OkStatus();
}

absl::Status InferenceContext::AllocateVariableTensors(
    const GpuModel& gpu_model, CLContext* context) {
  for (const auto& variable_input : gpu_model.variable_ids_and_refs) {
    variable_ids_and_refs_[variable_input.first] = variable_input.second;
  }

  std::map<ValueId, int> ref_value_to_tensor_index;

  for (auto value_and_ref_value : variable_ids_and_refs_) {
    if (ref_value_to_tensor_index.find(value_and_ref_value.second) ==
        ref_value_to_tensor_index.end()) {
      auto it = gpu_model.tensors.find(value_and_ref_value.first);
      if (it == gpu_model.tensors.end()) {
        return absl::InternalError("No variable tensor with this id.");
      }
      RETURN_IF_ERROR(
          CreateTensor(*context, it->second,
                       &variable_tensors_[value_and_ref_value.second]));
    }
  }
  return absl::OkStatus();
}

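// Allocates all runtime tensors whose storage is buffer-backed. Depending on
// what GetBufferAssignment produced and what the device supports, this uses
// one of three layouts: sub-buffers at explicit offsets inside one parent
// buffer, aligned sub-buffers carved out of one parent buffer (one per shared
// object), or a separate cl_mem buffer per shared object.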
absl::Status InferenceContext::AllocateBufferBasedTensors(
    const GpuModel& gpu_model, const GpuInfo& gpu_info,
    const CreateGpuModelInfo* create_info, CLContext* context) {
  std::vector<TensorUsageRecord<size_t>> buffer_usage_records;
  ObjectsAssignment<size_t> buffer_assignment;
  OffsetsAssignment offset_assignment;
  bool use_offset_assignment;
  bool is_sub_buffers_supported;
  RETURN_IF_ERROR(GetBufferAssignment(
      gpu_model, create_info, gpu_info, &buffer_usage_records,
      &graph_ids_to_shared_buffer_tensors_, &buffer_assignment,
      &offset_assignment, &use_offset_assignment, &is_sub_buffers_supported));
  const size_t base_align_bytes =
      std::max<size_t>(gpu_info.opencl_info.base_addr_align_in_bits >> 3, 1);

  if (buffer_usage_records.empty()) {
    return absl::OkStatus();
  }

  if (use_offset_assignment) {
    if (!shared_buffers_parent_ptr_) {
      Buffer shared_buffer;
      RETURN_IF_ERROR(CreateReadWriteBuffer(offset_assignment.total_size,
                                            context, &shared_buffer));
      shared_buffers_parent_ =
          std::make_unique<Buffer>(std::move(shared_buffer));
      shared_buffers_parent_ptr_ = shared_buffers_parent_.get();
    } else if (shared_buffers_parent_ptr_->GetMemorySizeInBytes() <
               offset_assignment.total_size) {
      return absl::FailedPreconditionError(
          "Externally provided buffer not big enough.");
    }
    shared_buffers_.resize(offset_assignment.offsets.size());
    for (int i = 0; i < offset_assignment.offsets.size(); ++i) {
      RETURN_IF_ERROR(CreateReadWriteSubBuffer(
          *shared_buffers_parent_ptr_, offset_assignment.offsets[i],
          buffer_usage_records[i].tensor_size, context, &shared_buffers_[i]));
    }
  } else {
    const size_t total_size = TotalSize(buffer_assignment, base_align_bytes);
    if (is_sub_buffers_supported && total_size <= gpu_info.GetMaxBufferSize()) {
      // Use a single parent buffer split into aligned sub-buffers.
      if (!shared_buffers_parent_ptr_) {
        Buffer shared_buffer;
        RETURN_IF_ERROR(
            CreateReadWriteBuffer(total_size, context, &shared_buffer));
        shared_buffers_parent_ =
            std::make_unique<Buffer>(std::move(shared_buffer));
        shared_buffers_parent_ptr_ = shared_buffers_parent_.get();
      } else if (shared_buffers_parent_ptr_->GetMemorySizeInBytes() <
                 total_size) {
        return absl::FailedPreconditionError(
            "Externally provided buffer not big enough.");
      }

      shared_buffers_.resize(buffer_assignment.object_sizes.size());
      size_t offset = 0;
      for (int i = 0; i < buffer_assignment.object_sizes.size(); ++i) {
        const size_t aligned_size =
            AlignByN(buffer_assignment.object_sizes[i], base_align_bytes);
        RETURN_IF_ERROR(CreateReadWriteSubBuffer(*shared_buffers_parent_ptr_,
                                                 offset, aligned_size, context,
                                                 &shared_buffers_[i]));
        offset += aligned_size;
      }
    } else {
      shared_buffers_.resize(buffer_assignment.object_sizes.size());
      for (int i = 0; i < buffer_assignment.object_sizes.size(); ++i) {
        RETURN_IF_ERROR(CreateReadWriteBuffer(buffer_assignment.object_sizes[i],
                                              context, &shared_buffers_[i]));
      }
    }
  }

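  // Wrap each runtime buffer-based tensor in a Tensor object that views its
  // assigned buffer or sub-buffer; texture-backed tensors are created as 2D
  // images on top of the buffer with the proper row pitch.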
  std::vector<bool> created_tensors(buffer_usage_records.size(), false);
  shared_buffer_tensors_.resize(buffer_usage_records.size());
  for (auto& node : gpu_model.nodes) {
    std::vector<ValueId> node_tensor_ids = node.inputs;
    node_tensor_ids.insert(node_tensor_ids.end(), node.outputs.begin(),
                           node.outputs.end());
    for (auto& tensor_id : node_tensor_ids) {
      if (GetTensorType(gpu_model, create_info, gpu_info, tensor_id) !=
          TensorType::kRuntime) {
        continue;
      }
      const auto& tensor_desc = gpu_model.tensors.at(tensor_id);
      if (!IsBufferBased(gpu_info, tensor_desc.GetStorageType())) {
        continue;
      }
      const int tensor_index = graph_ids_to_shared_buffer_tensors_[tensor_id];
      if (created_tensors[tensor_index]) continue;
      const int buffer_index = use_offset_assignment
                                   ? tensor_index
                                   : buffer_assignment.object_ids[tensor_index];
      if (tensor_desc.GetStorageType() == TensorStorageType::TEXTURE_2D ||
          tensor_desc.GetStorageType() ==
              TensorStorageType::SINGLE_TEXTURE_2D) {
        const size_t bytes_per_pixel =
            SizeOf(tensor_desc.GetDataType()) *
            (tensor_desc.GetStorageType() == TensorStorageType::TEXTURE_2D
                 ? 4
                 : tensor_desc.GetBHWCShape().c);
        size_t width_pixel_alignment =
            gpu_info.opencl_info.image_pitch_alignment;
        if (gpu_info.IsAdreno() &&
            width_pixel_alignment % bytes_per_pixel == 0) {
          width_pixel_alignment /= bytes_per_pixel;
        }
        RETURN_IF_ERROR(CreateTensorSharedImage2DBuffer(
            *context, shared_buffers_[buffer_index].GetMemoryPtr(), tensor_desc,
            width_pixel_alignment, &shared_buffer_tensors_[tensor_index]));
      } else {
        RETURN_IF_ERROR(CreateTensorShared(
            *context, shared_buffers_[buffer_index].GetMemoryPtr(), tensor_desc,
            &shared_buffer_tensors_[tensor_index]));
      }
      created_tensors[tensor_index] = true;
    }
  }
  return absl::OkStatus();
}

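// Allocates the remaining runtime tensors (image-based storage). Objects are
// shared with MemoryStrategy::EQUALITY, so tensors with identical descriptors
// and shapes reuse the same GPU tensor.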
absl::Status InferenceContext::AllocateStrongShapesTensors(
    const GpuModel& gpu_model, const GpuInfo& gpu_info,
    const CreateGpuModelInfo* create_info, CLContext* context) {
  std::map<ValueId, int2> usages;
  GetUsages(
      gpu_model,
      [&gpu_model, &gpu_info, &create_info](ValueId id) {
        return GetTensorType(gpu_model, create_info, gpu_info, id) ==
                   TensorType::kRuntime &&
               !IsBufferBased(gpu_info,
                              gpu_model.tensors.at(id).GetStorageType());
      },
      &usages);

  struct TensorDescComparator {
    TensorDescriptor tensor_desc;

    bool operator==(const TensorDescComparator& t) const {
      return tensor_desc == t.tensor_desc &&
             tensor_desc.GetBHWDCShape() == t.tensor_desc.GetBHWDCShape();
    }
  };

  std::vector<TensorUsageRecord<TensorDescComparator>> usage_records;
  std::map<ValueId, ValueId> remap_from_graph_ids;
  for (auto& usage : usages) {
    remap_from_graph_ids[usage.first] = usage_records.size();
    usage_records.push_back({{gpu_model.tensors.at(usage.first)},
                             static_cast<TaskId>(usage.second.x),
                             static_cast<TaskId>(usage.second.y)});
  }

  ObjectsAssignment<TensorDescComparator> assignment;
  RETURN_IF_ERROR(AssignObjectsToTensors(
      usage_records, MemoryStrategy::EQUALITY, &assignment));

  for (auto& node : gpu_model.nodes) {
    std::vector<ValueId> node_tensor_ids = node.inputs;
    node_tensor_ids.insert(node_tensor_ids.end(), node.outputs.begin(),
                           node.outputs.end());
    for (auto& tensor_id : node_tensor_ids) {
      if (GetTensorType(gpu_model, create_info, gpu_info, tensor_id) !=
          TensorType::kRuntime) {
        continue;
      }
      const auto& tensor_desc = gpu_model.tensors.at(tensor_id);
      if (IsBufferBased(gpu_info, tensor_desc.GetStorageType())) {
        continue;
      }
      const auto id = assignment.object_ids[remap_from_graph_ids[tensor_id]];
      graph_ids_to_strong_shape_tensors_[tensor_id] = id;
      const auto& it = strong_shape_tensors_.find(id);
      if (it == strong_shape_tensors_.end()) {
        RETURN_IF_ERROR(
            CreateTensor(*context, tensor_desc, &strong_shape_tensors_[id]));
      }
    }
  }
  return absl::OkStatus();
}

void InferenceContext::BindMemoryToOperations() {
  for (auto& node : nodes_) {
    for (int i = 0; i < node.inputs.size(); ++i) {
      node.cl_operation.GetGpuOperation().SetSrc(GetTensor(node.inputs[i]), i);
    }
    for (int i = 0; i < node.outputs.size(); ++i) {
      node.cl_operation.GetGpuOperation().SetDst(GetTensor(node.outputs[i]), i);
    }
  }
}

absl::Status InferenceContext::Compile(
    const CreationContext& creation_context) {
  for (auto& node : nodes_) {
    RETURN_IF_ERROR(node.cl_operation.Compile(creation_context));
  }
  return absl::OkStatus();
}

absl::Status InferenceContext::Tune(TuningType tuning_type,
                                    const GpuInfo& gpu_info,
                                    ProfilingCommandQueue* profiling_queue) {
  for (auto& node : nodes_) {
    RETURN_IF_ERROR(
        node.cl_operation.Tune(tuning_type, gpu_info, profiling_queue));
  }
  return absl::OkStatus();
}

absl::Status InferenceContext::UpdateParams() {
  for (auto& node : nodes_) {
    RETURN_IF_ERROR(node.cl_operation.UpdateParams());
  }
  return absl::OkStatus();
}

absl::Status InferenceContext::SetTensor(const ValueId& tensor_id,
                                         Tensor* tensor_ptr) {
  auto it = external_mutable_tensors_.find(tensor_id);
  if (it == external_mutable_tensors_.end()) {
    return absl::InvalidArgumentError("No external tensor with this id.");
  }
  external_mutable_tensors_[tensor_id] = tensor_ptr;
  for (int node_index : external_tensor_to_nodes_[tensor_id]) {
    auto& node = nodes_[node_index];
    for (int i = 0; i < node.inputs.size(); ++i) {
      if (node.inputs[i] == tensor_id) {
        RETURN_IF_ERROR(node.cl_operation.SetSrcTensor(i, tensor_ptr));
      }
    }
    for (int i = 0; i < node.outputs.size(); ++i) {
      if (node.outputs[i] == tensor_id) {
        RETURN_IF_ERROR(node.cl_operation.SetDstTensor(i, tensor_ptr));
      }
    }
  }
  return absl::OkStatus();
}

void InferenceContext::PrepareExternal() {
  for (auto& external : external_mutable_tensors_) {
    for (int i = 0; i < nodes_.size(); ++i) {
      bool has_tensor = false;
      const auto& src_ids = nodes_[i].inputs;
      for (int j = 0; j < src_ids.size(); ++j) {
        if (src_ids[j] == external.first) {
          has_tensor = true;
        }
      }
      const auto& dst_ids = nodes_[i].outputs;
      for (int j = 0; j < dst_ids.size(); ++j) {
        if (dst_ids[j] == external.first) {
          has_tensor = true;
        }
      }
      if (has_tensor) {
        external_tensor_to_nodes_[external.first].push_back(i);
      }
    }
  }
}

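// Enqueues all operations. Prefers the recordable queue when one is available
// and supported; otherwise applies the execution hints: waiting on and
// re-recording a marker event when manual release is required, and flushing
// the queue periodically or at the end when the vendor needs it.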
absl::Status InferenceContext::AddToQueue(CLCommandQueue* queue) {
  if (recordable_queue_ && recordable_queue_->IsSupported()) {
    return recordable_queue_->Execute(queue);
  }
  if (execution_hints_.need_manual_release) {
    if (execution_hints_.prev_enqueue_start_point.is_valid()) {
      execution_hints_.prev_enqueue_start_point.Wait();
    }
    RETURN_IF_ERROR(
        queue->EnqueueEvent(&execution_hints_.prev_enqueue_start_point));
  }
  int counter = 0;
  for (auto& node : nodes_) {
    RETURN_IF_ERROR(node.cl_operation.AddToQueue(queue));
    counter++;
    if (execution_hints_.flush_periodically &&
        counter % execution_hints_.flush_period == 0) {
      clFlush(queue->queue());
    }
  }
  if (execution_hints_.need_flush) {
    clFlush(queue->queue());
  }
  return absl::OkStatus();
}

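// Measures per-node execution time. Each node is first profiled once; on Mali
// and PowerVR the nodes are then re-profiled with multiple dispatches per
// node (the repeat count is derived from the first measurement and clamped)
// so that each measurement spans a longer interval.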
absl::Status InferenceContext::ProfileTime(ProfilingCommandQueue* queue,
                                           ProfilingInfo* result) {
  queue->ResetMeasurements();
  for (auto& node : nodes_) {
    queue->SetEventsLabel(node.name);
    RETURN_IF_ERROR(node.cl_operation.AddToQueue(queue));
  }
  RETURN_IF_ERROR(queue->WaitForCompletion());
  *result = queue->GetProfilingInfo();

  if (!(gpu_info_.IsMali() || gpu_info_.IsPowerVR())) {
    return absl::OkStatus();
  }

  if (gpu_info_.IsMali()) {
    queue->ResetMeasurements();
    for (int i = 0; i < nodes_.size(); ++i) {
      queue->SetEventsLabel(nodes_[i].name);
      const double times =
          16.0 / absl::ToDoubleMilliseconds(result->dispatches[i].duration);
      const int n = std::min(256.0, std::max(2.0, times));
      RETURN_IF_ERROR(nodes_[i].cl_operation.AddToQueueNTimes(queue, n));
    }
    RETURN_IF_ERROR(queue->WaitForCompletion());
    *result = queue->GetProfilingInfo();
    return absl::OkStatus();
  }

  if (gpu_info_.IsPowerVR()) {
    queue->ResetMeasurements();
    for (int i = 0; i < nodes_.size(); ++i) {
      queue->SetEventsLabel(nodes_[i].name);
      const double times =
          32.0 / absl::ToDoubleMilliseconds(result->dispatches[i].duration);
      const int n = std::min(64.0, std::max(4.0, times));
      RETURN_IF_ERROR(nodes_[i].cl_operation.AddToQueueNTimes(queue, n));
    }
    RETURN_IF_ERROR(queue->WaitForCompletion());
    *result = queue->GetProfilingInfo();

    queue->ResetMeasurements();
    for (int i = 0; i < nodes_.size(); ++i) {
      queue->SetEventsLabel(nodes_[i].name);
      const double times =
          128.0 / absl::ToDoubleMilliseconds(result->dispatches[i].duration);
      const int n = std::min(1024.0, std::max(4.0, times));
      RETURN_IF_ERROR(nodes_[i].cl_operation.AddToQueueNTimes(queue, n));
    }
    RETURN_IF_ERROR(queue->WaitForCompletion());
    *result = queue->GetProfilingInfo();
    return absl::OkStatus();
  }

  return absl::OkStatus();
}

absl::Status InferenceContext::Profile(ProfilingCommandQueue* queue,
                                       ProfilingInfo* result) {
  RETURN_IF_ERROR(ProfileTime(queue, result));
  for (int i = 0; i < nodes_.size(); ++i) {
    uint64_t read_size = 0;
    for (auto& src_id : nodes_[i].inputs) {
      read_size += GetTensor(src_id)->GetMemorySizeInBytes();
    }
    const auto& gpu_op = nodes_[i].cl_operation.GetGpuOperation();
    read_size += gpu_op.const_args_size_;
    uint64_t write_size = 0;
    for (auto& dst_id : nodes_[i].outputs) {
      write_size += GetTensor(dst_id)->GetMemorySizeInBytes();
    }
    result->dispatches[i].flops = gpu_op.flops_;
    result->dispatches[i].read_mem_size = read_size;
    result->dispatches[i].write_mem_size = write_size;
  }

  return absl::OkStatus();
}

uint64_t InferenceContext::GetSizeOfMemoryAllocatedForIntermediateTensors()
    const {
  uint64_t total_memory = 0;
  for (const auto& t : strong_shape_tensors_) {
    total_memory += t.second.GetMemorySizeInBytes();
  }
  for (const auto& b : shared_buffers_) {
    // Sub-buffers do not allocate memory. Count the size of the parent buffer
    // object instead.
    if (!b.IsSubBuffer()) {
      total_memory += b.GetMemorySizeInBytes();
    }
  }
  for (const auto& t : variable_tensors_) {
    total_memory += t.second.GetMemorySizeInBytes();
  }
  if (shared_buffers_parent_) {
    total_memory += shared_buffers_parent_->GetMemorySizeInBytes();
  }

  return total_memory;
}

uint64_t InferenceContext::GetConstantTensorsSize() const {
  uint64_t total_size = 0;
  for (const auto& node : nodes_) {
    total_size += node.cl_operation.GetGpuOperation().const_args_size_;
  }
  for (const auto& t : const_tensors_) {
    total_size += t.second.GetMemorySizeInBytes();
  }
  return total_size;
}

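// Resolves a value id to its backing tensor, checking in order: external
// immutable, external mutable, constant, variable (via its reference id),
// shared-buffer-backed, and finally strong-shape tensors.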
Tensor* InferenceContext::GetTensor(ValueId id) {
  if (external_immutable_tensors_.find(id) !=
      external_immutable_tensors_.end()) {
    return external_immutable_tensors_[id];
  } else if (external_mutable_tensors_.find(id) !=
             external_mutable_tensors_.end()) {
    return external_mutable_tensors_[id];
  } else if (const_tensors_.find(id) != const_tensors_.end()) {
    return &const_tensors_[id];
  } else if (variable_ids_and_refs_.find(id) != variable_ids_and_refs_.end()) {
    return &variable_tensors_[variable_ids_and_refs_[id]];
  } else if (graph_ids_to_shared_buffer_tensors_.find(id) !=
             graph_ids_to_shared_buffer_tensors_.end()) {
    return &shared_buffer_tensors_[graph_ids_to_shared_buffer_tensors_[id]];
  } else {
    return &strong_shape_tensors_[graph_ids_to_strong_shape_tensors_[id]];
  }
}

absl::Status InferenceContext::SetInputTensor(ValueId id,
                                              const TensorFloat32& tensor,
                                              CLCommandQueue* queue) {
  Tensor* gpu_tensor = GetTensor(id);
  TensorDescriptor descriptor_with_data = gpu_tensor->GetDescriptor();
  descriptor_with_data.UploadData(tensor);
  return gpu_tensor->UploadDescriptorData(descriptor_with_data, queue);
}

absl::Status InferenceContext::GetOutputTensor(ValueId id,
                                               CLCommandQueue* queue,
                                               TensorFloat32* result) {
  const Tensor* gpu_tensor = GetTensor(id);
  const auto dst_shape = BHWC(gpu_tensor->Batch(), gpu_tensor->Height(),
                              gpu_tensor->Width(), gpu_tensor->Channels());
  result->id = id;
  result->shape = dst_shape;
  result->data.resize(dst_shape.DimensionsProduct());

  TensorDescriptor desc;
  RETURN_IF_ERROR(gpu_tensor->ToDescriptor(&desc, queue));
  desc.DownloadData(result);
  return absl::OkStatus();
}

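// Serializes everything needed to restore this context later: the GPU model,
// the OpenCL platform/driver version, the tuned work group size and kernel
// fingerprint for every node, and the deduplicated compiled program binaries.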
flatbuffers::Offset<data::InferenceContext> InferenceContext::Encode(
    const CLDevice& device, const ProgramCache& program_cache,
    flatbuffers::Offset<tflite::gpu::data::GpuModel> gpu_model_fb,
    flatbuffers::FlatBufferBuilder* builder) {
  std::vector<flatbuffers::Offset<tflite::gpu::data::Int3>> work_groups_fb;
  for (int i = 0; i < nodes_.size(); ++i) {
    auto work_group_fb =
        tflite::gpu::Encode(nodes_[i].cl_operation.GetWorkGroupSize(), builder);
    work_groups_fb.push_back(work_group_fb);
  }
  auto work_groups_fb_vec = builder->CreateVector(work_groups_fb);
  std::vector<uint64_t> node_fingerprints(nodes_.size());
  for (int i = 0; i < nodes_.size(); ++i) {
    node_fingerprints[i] = nodes_[i].cl_operation.GetKernelFingerprint();
  }
  auto node_fingerprints_fb = builder->CreateVector(node_fingerprints);

  std::set<uint64_t> fingerprints;
  for (const auto& node : nodes_) {
    fingerprints.insert(node.cl_operation.GetKernelFingerprint());
  }
  std::vector<flatbuffers::Offset<data::BinaryProgram>> binary_programs_fb;
  for (auto fingerprint : fingerprints) {
    std::vector<uint8_t> program_binary;
    program_cache.GetProgramBinary(fingerprint, &program_binary).IgnoreError();
    auto binary_fb = builder->CreateVector(program_binary);
    data::BinaryProgramBuilder program_builder(*builder);
    program_builder.add_fingerprint(fingerprint);
    program_builder.add_binary(binary_fb);
    binary_programs_fb.push_back(program_builder.Finish());
  }
  auto binary_programs_fb_vec = builder->CreateVector(binary_programs_fb);
  auto driver_version = builder->CreateString(device.GetPlatformVersion());

  data::InferenceContextBuilder inf_builder(*builder);
  inf_builder.add_gpu_model(gpu_model_fb);
  inf_builder.add_driver_version(driver_version);
  inf_builder.add_binary_programs(binary_programs_fb_vec);
  inf_builder.add_tuned_work_group_sizes_per_node(work_groups_fb_vec);
  inf_builder.add_fingerprints_per_node(node_fingerprints_fb);
  return inf_builder.Finish();
}

absl::Status GetInOutRefs(const absl::Span<const uint8_t> serialized_model,
                          std::vector<int64_t>* in_refs,
                          std::vector<int64_t>* out_refs) {
  flatbuffers::Verifier verifier(serialized_model.data(),
                                 serialized_model.size());
  if (!data::VerifyInferenceContextBuffer(verifier)) {
    return absl::DataLossError("Deserialization failed.");
  }
  auto fb_inference = data::GetInferenceContext(serialized_model.data());
  if (in_refs) {
    in_refs->clear();
    for (auto in_fb : *fb_inference->gpu_model()->input_refs()) {
      in_refs->push_back(in_fb);
    }
  }
  if (out_refs) {
    out_refs->clear();
    for (auto out_fb : *fb_inference->gpu_model()->output_refs()) {
      out_refs->push_back(out_fb);
    }
  }
  return absl::OkStatus();
}

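// Reports how many bytes the buffer-backed runtime tensors would need,
// without allocating anything: the offset-assignment total when that layout
// would be chosen, otherwise the aligned sum of the shared-object sizes.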
absl::Status GetTotalBufferSizeForTensors(const GpuModel& gpu_model,
                                          const CreateGpuModelInfo& create_info,
                                          const GpuInfo& gpu_info,
                                          uint64_t* result) {
  std::vector<TensorUsageRecord<size_t>> buffer_usage_records;
  ObjectsAssignment<size_t> buffer_assignment;
  OffsetsAssignment offset_assignment;
  bool use_offset_assignment;
  bool is_sub_buffers_supported;
  RETURN_IF_ERROR(GetBufferAssignment(
      gpu_model, &create_info, gpu_info, &buffer_usage_records, nullptr,
      &buffer_assignment, &offset_assignment, &use_offset_assignment,
      &is_sub_buffers_supported));
  if (use_offset_assignment) {
    *result = offset_assignment.total_size;
    return absl::OkStatus();
  }

  const size_t base_align_bytes =
      std::max<size_t>(gpu_info.opencl_info.base_addr_align_in_bits >> 3, 1);
  *result = TotalSize(buffer_assignment, base_align_bytes);
  return absl::OkStatus();
}

}  // namespace cl
}  // namespace gpu
}  // namespace tflite