xref: /aosp_15_r20/external/tensorflow/tensorflow/lite/delegates/gpu/cl/inference_context.cc (revision b6fb3261f9314811a0f4371741dbb8839866f948)
1 /* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
2 
3 Licensed under the Apache License, Version 2.0 (the "License");
4 you may not use this file except in compliance with the License.
5 You may obtain a copy of the License at
6 
7     http://www.apache.org/licenses/LICENSE-2.0
8 
9 Unless required by applicable law or agreed to in writing, software
10 distributed under the License is distributed on an "AS IS" BASIS,
11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 See the License for the specific language governing permissions and
13 limitations under the License.
14 ==============================================================================*/
15 
16 #include "tensorflow/lite/delegates/gpu/cl/inference_context.h"
17 
18 #include <algorithm>
19 #include <cmath>
20 #include <cstdint>
21 #include <cstring>
22 #include <functional>
23 #include <limits>
24 #include <map>
25 #include <memory>
26 #include <numeric>
27 #include <set>
28 #include <string>
29 #include <utility>
30 #include <vector>
31 
32 #include "absl/container/flat_hash_map.h"
33 #include "absl/container/flat_hash_set.h"
34 #include "tensorflow/lite/delegates/gpu/cl/buffer.h"
35 #include "tensorflow/lite/delegates/gpu/cl/cl_device.h"
36 #include "tensorflow/lite/delegates/gpu/cl/serialization_generated.h"
37 #include "tensorflow/lite/delegates/gpu/common/data_type.h"
38 #include "tensorflow/lite/delegates/gpu/common/gpu_model.h"
39 #include "tensorflow/lite/delegates/gpu/common/gpu_model_generated.h"
40 #include "tensorflow/lite/delegates/gpu/common/memory_management.h"
41 #include "tensorflow/lite/delegates/gpu/common/model.h"
42 #include "tensorflow/lite/delegates/gpu/common/shape.h"
43 #include "tensorflow/lite/delegates/gpu/common/task/gpu_operation.h"
44 #include "tensorflow/lite/delegates/gpu/common/task/serialization_base.h"
45 #include "tensorflow/lite/delegates/gpu/common/task/tensor_desc.h"
46 #include "tensorflow/lite/delegates/gpu/common/types.h"
47 #include "tensorflow/lite/delegates/gpu/common/util.h"
48 
49 namespace tflite {
50 namespace gpu {
51 namespace cl {
52 
53 namespace {
54 void AddUsage(ValueId id, int task_index,
55               std::map<ValueId, int2>* usage_records) {
56   auto it = usage_records->find(id);
57   if (it == usage_records->end()) {
58     (*usage_records)[id].x = task_index;
59     (*usage_records)[id].y = task_index;
60   } else {
61     (*usage_records)[id].y = task_index;
62   }
63 }
64 
65 // Returns true if the actual memory for this storage type will be allocated
66 // with clCreateBuffer.
67 bool IsBufferBased(const GpuInfo& gpu_info, const TensorStorageType& type) {
68   const bool image2d_based_buffer =
69       (type == TensorStorageType::TEXTURE_2D ||
70        type == TensorStorageType::SINGLE_TEXTURE_2D) &&
71       gpu_info.opencl_info.IsImage2dFromBufferSupported();
72   return type == TensorStorageType::BUFFER ||
73          type == TensorStorageType::IMAGE_BUFFER || image2d_based_buffer;
74 }
75 
76 // Calculates the total size of the assignment.
77 size_t TotalSize(const ObjectsAssignment<size_t>& assignment,
78                  size_t alignment = 1) {
79   size_t total_size = 0;
80   for (auto object_size : assignment.object_sizes) {
81     total_size += AlignByN(object_size, alignment);
82   }
83   return total_size;
84 }
85 
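// Classifies a tensor id as variable, external (user-provided immutable or
// mutable), constant, or runtime, based on the GpuModel and CreateGpuModelInfo.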
86 TensorType GetTensorType(const GpuModel& gpu_model,
87                          const CreateGpuModelInfo* create_info,
88                          const GpuInfo& gpu_info, ValueId id) {
89   bool is_variable = false;
90   for (int i = 0; i < gpu_model.variable_ids_and_refs.size(); ++i) {
91     if (gpu_model.variable_ids_and_refs[i].first == id) {
92       is_variable = true;
93       break;
94     }
95   }
96   if (is_variable) {
97     return TensorType::kVariable;
98   } else if (create_info &&
99              (create_info->external_immutable_tensors.find(id) !=
100                   create_info->external_immutable_tensors.end() ||
101               create_info->external_mutable_tensors.find(id) !=
102                   create_info->external_mutable_tensors.end())) {
103     return TensorType::kExternal;
104   } else if (gpu_model.const_tensors.find(id) !=
105              gpu_model.const_tensors.end()) {
106     return TensorType::kConst;
107   } else {
108     return TensorType::kRuntime;
109   }
110 }
111 
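// Records the first and last task index at which each tensor selected by
// `functor` is used; graph inputs count as task 0 and graph outputs as task
// nodes.size().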
112 void GetUsages(const GpuModel& model,
113                const std::function<bool(ValueId)>& functor,
114                std::map<ValueId, int2>* usages) {
115   for (const auto& in_id : model.input_ids_and_refs) {
116     if (functor(in_id.first)) {
117       AddUsage(in_id.first, 0, usages);
118     }
119   }
120   for (int op_index = 0; op_index < model.nodes.size(); ++op_index) {
121     for (auto input_id : model.nodes[op_index].inputs) {
122       if (functor(input_id)) {
123         AddUsage(input_id, op_index, usages);
124       }
125     }
126     for (auto output_id : model.nodes[op_index].outputs) {
127       if (functor(output_id)) {
128         AddUsage(output_id, op_index, usages);
129       }
130     }
131   }
132   for (const auto& out_id : model.output_ids_and_refs) {
133     if (functor(out_id.first)) {
134       AddUsage(out_id.first, model.nodes.size(), usages);
135     }
136   }
137 }
138 
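// Builds usage records for all buffer-based runtime tensors, computes a greedy
// object assignment and, when sub-buffers are usable, an offset assignment,
// and reports which of the two should be used.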
139 absl::Status GetBufferAsignment(
140     const GpuModel& gpu_model, const CreateGpuModelInfo* create_info,
141     const GpuInfo& gpu_info,
142     std::vector<TensorUsageRecord<size_t>>* buffer_usage_records,
143     std::map<ValueId, int>* graph_ids_to_shared_buffer_tensors,
144     ObjectsAssignment<size_t>* buffer_assignment,
145     OffsetsAssignment* offset_assignment, bool* use_offset_assignment,
146     bool* is_sub_buffers_supported) {
147   std::map<ValueId, int2> buffer_usages;
148   GetUsages(
149       gpu_model,
150       [&gpu_model, &gpu_info, &create_info](ValueId id) {
151         return GetTensorType(gpu_model, create_info, gpu_info, id) ==
152                    TensorType::kRuntime &&
153                IsBufferBased(gpu_info,
154                              gpu_model.tensors.at(id).GetStorageType());
155       },
156       &buffer_usages);
157 
158   bool has_buffer_based_images = false;
159   for (auto& usage : buffer_usages) {
160     const auto& t = gpu_model.tensors.at(usage.first);
161     const auto& shape = t.GetBHWDCShape();
162     const auto& descriptor = t;
163     const size_t element_size = SizeOf(descriptor.GetDataType());
164     size_t buffer_size;
165     if (descriptor.GetStorageType() == TensorStorageType::TEXTURE_2D ||
166         descriptor.GetStorageType() == TensorStorageType::SINGLE_TEXTURE_2D) {
167       has_buffer_based_images = true;
168       const size_t bytes_per_pixel =
169           element_size *
170           (descriptor.GetStorageType() == TensorStorageType::TEXTURE_2D
171                ? 4
172                : shape.c);
173       const size_t width = shape.b * shape.w;
174       const size_t height = shape.h * DivideRoundUp(shape.c, 4);
175       size_t width_pixel_alignment = gpu_info.opencl_info.image_pitch_alignment;
176       if (gpu_info.IsAdreno() && width_pixel_alignment % bytes_per_pixel == 0) {
177         width_pixel_alignment /= bytes_per_pixel;
178       }
179       const size_t width_aligned = AlignByN(width, width_pixel_alignment);
180       buffer_size = width_aligned * bytes_per_pixel * height;
181     } else {
182       if (descriptor.GetStorageType() == TensorStorageType::IMAGE_BUFFER) {
183         has_buffer_based_images = true;
184       }
185       buffer_size =
186           shape.b * shape.w * shape.h * AlignByN(shape.c, 4) * element_size;
187     }
188     if (graph_ids_to_shared_buffer_tensors) {
189       (*graph_ids_to_shared_buffer_tensors)[usage.first] =
190           buffer_usage_records->size();
191     }
192     buffer_usage_records->push_back({buffer_size,
193                                      static_cast<TaskId>(usage.second.x),
194                                      static_cast<TaskId>(usage.second.y)});
195   }
196 
197   RETURN_IF_ERROR(AssignObjectsToTensors(
198       *buffer_usage_records, MemoryStrategy::GREEDY_BEST, buffer_assignment));
199 
200   *is_sub_buffers_supported =
201       (!has_buffer_based_images && gpu_info.IsCL11OrHigher()) ||
202       CanUseSubBufferForImage2d(gpu_info);
203   const size_t base_align_bytes =
204       std::max<size_t>(gpu_info.opencl_info.base_addr_align_in_bits >> 3, 1);
205 
206   *use_offset_assignment = false;
207   if (*is_sub_buffers_supported) {
208     RETURN_IF_ERROR(AssignOffsetsToTensors(
209         *buffer_usage_records, MemoryStrategy::GREEDY_BY_SIZE,
210         offset_assignment, base_align_bytes));
211     if (offset_assignment->total_size <= TotalSize(*buffer_assignment) &&
212         offset_assignment->total_size <= gpu_info.GetMaxBufferSize()) {
213       *use_offset_assignment = true;
214     }
215   }
216   return absl::OkStatus();
217 }
218 
219 }  // namespace
220 
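// Vendor-specific execution hints: Mali and PowerVR need flushes (periodic and
// at the end of the queue); pre-Valhall Mali also needs manual release of the
// previous enqueue event.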
221 void InferenceContext::ExecutionHints::Init(const GpuInfo& gpu_info) {
222   if (gpu_info.IsMali()) {
223     need_flush = true;
224     need_manual_release = gpu_info.mali_info.IsValhall() ? false : true;
225 
226     flush_periodically = true;
227     flush_period = 24;
228   }
229   if (gpu_info.IsPowerVR()) {
230     need_flush = true;
231     flush_periodically = true;
232     flush_period = 16;
233   }
234 }
235 
236 absl::Status InferenceContext::InitFromGraph(
237     const CreateGpuModelInfo& create_info, const GraphFloat32& graph,
238     Environment* env, std::vector<uint8_t>* serialized_model) {
239   GpuModel gpu_model;
240   RETURN_IF_ERROR(GraphToGpuModel(graph, create_info,
241                                   env->GetDevicePtr()->GetInfo(), &gpu_model));
242   return InitFromGpuModel(create_info, &gpu_model, env, serialized_model);
243 }
244 
245 absl::Status InferenceContext::InitFromGpuModel(
246     const CreateGpuModelInfo& create_info, GpuModel* gpu_model,
247     Environment* env, std::vector<uint8_t>* serialized_model,
248     Buffer* shared_buffer) {
249   flatbuffers::FlatBufferBuilder builder;
250   flatbuffers::Offset<tflite::gpu::data::GpuModel> gpu_model_fb;
251   if (serialized_model) {
252     gpu_model_fb = tflite::gpu::Encode(*gpu_model, &builder);
253   }
254   shared_buffers_parent_ptr_ = shared_buffer;
255   RETURN_IF_ERROR(AllocateMemory(*gpu_model, env->GetDevicePtr()->GetInfo(),
256                                  &create_info, &env->context()));
257   InitFromGpuModel(gpu_model);
258 
259   CreationContext creation_context;
260   creation_context.device = env->GetDevicePtr();
261   creation_context.context = &env->context();
262   creation_context.queue = env->queue();
263   creation_context.cache = env->program_cache();
264   for (const auto& external_tensor : create_info.external_immutable_tensors) {
265     auto* cl_spatial_tensor = dynamic_cast<Tensor*>(external_tensor.second);
266     if (!cl_spatial_tensor) {
267       return absl::InvalidArgumentError("Expected CLSpatialTensor.");
268     }
269     external_immutable_tensors_[external_tensor.first] = cl_spatial_tensor;
270   }
271   std::map<ValueId, Tensor> temp_external_tensors;
272   for (const auto& external_tensor : create_info.external_mutable_tensors) {
273     RETURN_IF_ERROR(
274         CreateTensor(env->context(),
275                      gpu_model->tensors[external_tensor.first],
276                      &temp_external_tensors[external_tensor.first]));
277     external_mutable_tensors_[external_tensor.first] =
278         &temp_external_tensors[external_tensor.first];
279   }
280   PrepareExternal();
281   execution_hints_.Init(env->device().GetInfo());
282   BindMemoryToOperations();
283   RETURN_IF_ERROR(Compile(creation_context));
284   RETURN_IF_ERROR(UpdateParams());
285 
286   TuningType tuning_type = TuningType::kExhaustive;
287   if (create_info.hints.Check(ModelHints::kFastTuning)) {
288     tuning_type = TuningType::kFast;
289   }
290   if (env->device().GetInfo().IsMali()) {
291     const MaliInfo& info = env->device().GetInfo().mali_info;
292     if (info.IsMaliT6xx()) {
293       // Mali T628 hangs forever in clFinish when the profiling queue is
294       // used; TuningType::kFast does not use the profiling queue.
295       tuning_type = TuningType::kFast;
296     }
297   }
298   RETURN_IF_ERROR(
299       Tune(tuning_type, env->device().GetInfo(), env->profiling_queue()));
300   if (external_mutable_tensors_.empty()) {
301     // Use the recordable queue only when there are no mutable external tensors.
302     InitRecordableQueue(env);
303   }
304 
305   for (auto& external_tensor : external_mutable_tensors_) {
306     external_tensor.second = nullptr;
307   }
308 
309   gpu_info_ = env->device().GetInfo();
310 
311   if (serialized_model) {
312     auto encoded_fb = Encode(*env->GetDevicePtr(), *env->program_cache(),
313                              gpu_model_fb, &builder);
314     data::FinishInferenceContextBuffer(builder, encoded_fb);
315     serialized_model->resize(builder.GetSize());
316     std::memcpy(serialized_model->data(), builder.GetBufferPointer(),
317                 builder.GetSize());
318   }
319   return absl::OkStatus();
320 }
321 
322 absl::Status InferenceContext::AddToCommanBuffer(cl_command_buffer_khr cb) {
323   for (auto& node : nodes_) {
324     RETURN_IF_ERROR(node.cl_operation.AddToCommanBuffer(cb));
325   }
326   return absl::OkStatus();
327 }
328 
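// Restores an inference context from a serialized model: verifies the
// flatbuffer, checks that the OpenCL platform/driver version still matches,
// reloads program binaries and tuned work group sizes, and re-binds memory.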
329 absl::Status InferenceContext::RestoreDeserialized(
330     const absl::Span<const uint8_t> serialized_model, Environment* env,
331     CreateGpuModelInfo* create_info) {
332   flatbuffers::Verifier verifier(serialized_model.data(),
333                                  serialized_model.size());
334   if (!data::VerifyInferenceContextBuffer(verifier)) {
335     return absl::DataLossError("Deserialization failed.");
336   }
337   auto decoded_fb = data::GetInferenceContext(serialized_model.data());
338   std::string platform_version(decoded_fb->driver_version()->c_str(),
339                                decoded_fb->driver_version()->size());
340   if (env->GetDevicePtr()->GetPlatformVersion() != platform_version) {
341     return absl::InvalidArgumentError(
342         "OpenCL driver changed, model representation invalid, must be "
343         "regenerated.");
344   }
345   GpuModel gpu_model;
346   RETURN_IF_ERROR(tflite::gpu::Decode(decoded_fb->gpu_model(), &gpu_model));
347   RETURN_IF_ERROR(AllocateMemory(gpu_model, env->GetDevicePtr()->GetInfo(),
348                                  create_info, &env->context()));
349   InitFromGpuModel(&gpu_model);
350 
351   // Deserialize the compiled kernels into program_cache.
352   for (auto binary_program_fb : *decoded_fb->binary_programs()) {
353     RETURN_IF_ERROR(env->program_cache()->AddProgramBinary(
354         env->context(), *env->GetDevicePtr(), binary_program_fb->fingerprint(),
355         absl::MakeSpan(binary_program_fb->binary()->data(),
356                        binary_program_fb->binary()->size())));
357   }
358 
359   std::map<ValueId, Tensor> temp_external_tensors;
360   if (create_info) {
361     for (const auto& external_tensor :
362          create_info->external_immutable_tensors) {
363       auto* cl_spatial_tensor = dynamic_cast<Tensor*>(external_tensor.second);
364       if (!cl_spatial_tensor) {
365         return absl::InvalidArgumentError("Expected CLSpatialTensor.");
366       }
367       external_immutable_tensors_[external_tensor.first] = cl_spatial_tensor;
368     }
369     for (const auto& external_tensor : create_info->external_mutable_tensors) {
370       RETURN_IF_ERROR(
371           CreateTensor(env->context(),
372                        gpu_model.tensors[external_tensor.first],
373                        &temp_external_tensors[external_tensor.first]));
374       external_mutable_tensors_[external_tensor.first] =
375           &temp_external_tensors[external_tensor.first];
376     }
377   }
378   PrepareExternal();
379 
380   execution_hints_.Init(env->device().GetInfo());
381 
382   BindMemoryToOperations();
383   for (int i = 0; i < nodes_.size(); ++i) {
384     uint64_t fingerprint = (*decoded_fb->fingerprints_per_node())[i];
385     int3 wg_size;
386     wg_size.x = (*decoded_fb->tuned_work_group_sizes_per_node())[i]->x();
387     wg_size.y = (*decoded_fb->tuned_work_group_sizes_per_node())[i]->y();
388     wg_size.z = (*decoded_fb->tuned_work_group_sizes_per_node())[i]->z();
389     RETURN_IF_ERROR(nodes_[i].cl_operation.RestoreDeserialized(
390         *env->program_cache(), fingerprint, env->GetDevicePtr()->GetInfo(),
391         wg_size, &env->context()));
392   }
393   RETURN_IF_ERROR(UpdateParams());
394   if (external_mutable_tensors_.empty()) {
395     // Use the recordable queue only when there are no mutable external tensors.
396     InitRecordableQueue(env);
397   }
398   for (auto& external_tensor : external_mutable_tensors_) {
399     external_tensor.second = nullptr;
400   }
401   return absl::OkStatus();
402 }
403 
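// Copies the input/output ids and moves each node's GPU operation out of the
// GpuModel into this context's execution nodes.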
404 void InferenceContext::InitFromGpuModel(GpuModel* gpu_model) {
405   for (const auto& input : gpu_model->input_ids_and_refs) {
406     input_ids_.push_back(input.first);
407   }
408   for (const auto& output : gpu_model->output_ids_and_refs) {
409     output_ids_.push_back(output.first);
410   }
411   nodes_.resize(gpu_model->nodes.size());
412   for (int i = 0; i < gpu_model->nodes.size(); ++i) {
413     nodes_[i].cl_operation.Init(std::move(gpu_model->nodes[i].gpu_operation));
414     nodes_[i].inputs = gpu_model->nodes[i].inputs;
415     nodes_[i].outputs = gpu_model->nodes[i].outputs;
416     nodes_[i].name = gpu_model->nodes[i].name;
417   }
418 }
419 
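// Collects all CL operations into a recordable queue; used only when there are
// no mutable external tensors.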
420 void InferenceContext::InitRecordableQueue(Environment* env) {
421   std::vector<ClOperation*> ops(nodes_.size());
422   for (int i = 0; i < nodes_.size(); ++i) {
423     ops[i] = &nodes_[i].cl_operation;
424   }
425   recordable_queue_ = CreateRecordableQueue(ops, env->device(), env->context());
426 }
427 
428 absl::Status InferenceContext::InitFromGraphWithTransforms(
429     const CreateGpuModelInfo& create_info, GraphFloat32* graph,
430     Environment* env, std::vector<uint8_t>* serialized_model) {
431   RETURN_IF_ERROR(RunGraphTransformsForGpuModel(graph));
432   RETURN_IF_ERROR(InitFromGraph(create_info, *graph, env, serialized_model));
433   return absl::OkStatus();
434 }
435 
436 absl::Status InferenceContext::AllocateMemory(
437     const GpuModel& gpu_model, const GpuInfo& gpu_info,
438     const CreateGpuModelInfo* create_info, CLContext* context) {
439   RETURN_IF_ERROR(AllocateConstTensors(gpu_model, context));
440   RETURN_IF_ERROR(AllocateVariableTensors(gpu_model, context));
441   RETURN_IF_ERROR(
442       AllocateBufferBasedTensors(gpu_model, gpu_info, create_info, context));
443   RETURN_IF_ERROR(
444       AllocateStrongShapesTensors(gpu_model, gpu_info, create_info, context));
445   return absl::OkStatus();
446 }
447 
448 absl::Status InferenceContext::AllocateConstTensors(const GpuModel& gpu_model,
449                                                     CLContext* context) {
450   for (auto& description : gpu_model.const_tensors) {
451     RETURN_IF_ERROR(const_tensors_[description.first].CreateFromDescriptor(
452         description.second, context));
453   }
454   return absl::OkStatus();
455 }
456 
457 absl::Status InferenceContext::AllocateVariableTensors(
458     const GpuModel& gpu_model, CLContext* context) {
459   for (const auto& variable_input : gpu_model.variable_ids_and_refs) {
460     variable_ids_and_refs_[variable_input.first] = variable_input.second;
461   }
462 
463   std::map<ValueId, int> ref_value_to_tensor_index;
464 
465   for (auto value_and_ref_value : variable_ids_and_refs_) {
466     if (ref_value_to_tensor_index.find(value_and_ref_value.second) ==
467         ref_value_to_tensor_index.end()) {
468       auto it = gpu_model.tensors.find(value_and_ref_value.first);
469       if (it == gpu_model.tensors.end()) {
470         return absl::InternalError("No variable tensor with this id.");
471       }
472       RETURN_IF_ERROR(
473           CreateTensor(*context, it->second,
474                        &variable_tensors_[value_and_ref_value.second]));
475     }
476   }
477   return absl::OkStatus();
478 }
479 
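// Allocates memory for buffer-based runtime tensors: either sub-buffers carved
// out of a single parent buffer (offset assignment or aligned object
// assignment) or independent buffers, then creates the tensor objects on top
// of that memory.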
480 absl::Status InferenceContext::AllocateBufferBasedTensors(
481     const GpuModel& gpu_model, const GpuInfo& gpu_info,
482     const CreateGpuModelInfo* create_info, CLContext* context) {
483   std::vector<TensorUsageRecord<size_t>> buffer_usage_records;
484   ObjectsAssignment<size_t> buffer_assignment;
485   OffsetsAssignment offset_assignment;
486   bool use_offset_assignment;
487   bool is_sub_buffers_supported;
488   RETURN_IF_ERROR(GetBufferAsignment(
489       gpu_model, create_info, gpu_info, &buffer_usage_records,
490       &graph_ids_to_shared_buffer_tensors_, &buffer_assignment,
491       &offset_assignment, &use_offset_assignment, &is_sub_buffers_supported));
492   const size_t base_align_bytes =
493       std::max<size_t>(gpu_info.opencl_info.base_addr_align_in_bits >> 3, 1);
494 
495   if (buffer_usage_records.empty()) {
496     return absl::OkStatus();
497   }
498 
499   if (use_offset_assignment) {
500     if (!shared_buffers_parent_ptr_) {
501       Buffer shared_buffer;
502       RETURN_IF_ERROR(CreateReadWriteBuffer(offset_assignment.total_size,
503                                             context, &shared_buffer));
504       shared_buffers_parent_ =
505           std::make_unique<Buffer>(std::move(shared_buffer));
506       shared_buffers_parent_ptr_ = shared_buffers_parent_.get();
507     } else if (shared_buffers_parent_ptr_->GetMemorySizeInBytes() <
508                offset_assignment.total_size) {
509       return absl::FailedPreconditionError(
510           "Externally provided buffer not big enough.");
511     }
512     shared_buffers_.resize(offset_assignment.offsets.size());
513     for (int i = 0; i < offset_assignment.offsets.size(); ++i) {
514       RETURN_IF_ERROR(CreateReadWriteSubBuffer(
515           *shared_buffers_parent_ptr_, offset_assignment.offsets[i],
516           buffer_usage_records[i].tensor_size, context, &shared_buffers_[i]));
517     }
518   } else {
519     const size_t total_size = TotalSize(buffer_assignment, base_align_bytes);
520     if (is_sub_buffers_supported && total_size <= gpu_info.GetMaxBufferSize()) {
521       // Use a single parent buffer:
522       if (!shared_buffers_parent_ptr_) {
523         Buffer shared_buffer;
524         RETURN_IF_ERROR(
525             CreateReadWriteBuffer(total_size, context, &shared_buffer));
526         shared_buffers_parent_ =
527             std::make_unique<Buffer>(std::move(shared_buffer));
528         shared_buffers_parent_ptr_ = shared_buffers_parent_.get();
529       } else if (shared_buffers_parent_ptr_->GetMemorySizeInBytes() <
530                  total_size) {
531         return absl::FailedPreconditionError(
532             "Externally provided buffer not big enough.");
533       }
534 
535       shared_buffers_.resize(buffer_assignment.object_sizes.size());
536       size_t offset = 0;
537       for (int i = 0; i < buffer_assignment.object_sizes.size(); ++i) {
538         const size_t aligned_size =
539             AlignByN(buffer_assignment.object_sizes[i], base_align_bytes);
540         RETURN_IF_ERROR(CreateReadWriteSubBuffer(*shared_buffers_parent_ptr_,
541                                                  offset, aligned_size, context,
542                                                  &shared_buffers_[i]));
543         offset += aligned_size;
544       }
545     } else {
546       shared_buffers_.resize(buffer_assignment.object_sizes.size());
547       for (int i = 0; i < buffer_assignment.object_sizes.size(); ++i) {
548         RETURN_IF_ERROR(CreateReadWriteBuffer(buffer_assignment.object_sizes[i],
549                                               context, &shared_buffers_[i]));
550       }
551     }
552   }
553 
554   std::vector<bool> created_tensors(buffer_usage_records.size(), false);
555   shared_buffer_tensors_.resize(buffer_usage_records.size());
556   for (auto& node : gpu_model.nodes) {
557     std::vector<ValueId> node_tensor_ids = node.inputs;
558     node_tensor_ids.insert(node_tensor_ids.end(), node.outputs.begin(),
559                            node.outputs.end());
560     for (auto& tensor_id : node_tensor_ids) {
561       if (GetTensorType(gpu_model, create_info, gpu_info, tensor_id) !=
562           TensorType::kRuntime) {
563         continue;
564       }
565       const auto& tensor_desc = gpu_model.tensors.at(tensor_id);
566       if (!IsBufferBased(gpu_info, tensor_desc.GetStorageType())) {
567         continue;
568       }
569       const int tensor_index = graph_ids_to_shared_buffer_tensors_[tensor_id];
570       if (created_tensors[tensor_index]) continue;
571       const int buffer_index = use_offset_assignment
572                                    ? tensor_index
573                                    : buffer_assignment.object_ids[tensor_index];
574       if (tensor_desc.GetStorageType() == TensorStorageType::TEXTURE_2D ||
575           tensor_desc.GetStorageType() ==
576               TensorStorageType::SINGLE_TEXTURE_2D) {
577         const size_t bytes_per_pixel =
578             SizeOf(tensor_desc.GetDataType()) *
579             (tensor_desc.GetStorageType() == TensorStorageType::TEXTURE_2D
580                  ? 4
581                  : tensor_desc.GetBHWCShape().c);
582         size_t width_pixel_alignment =
583             gpu_info.opencl_info.image_pitch_alignment;
584         if (gpu_info.IsAdreno() &&
585             width_pixel_alignment % bytes_per_pixel == 0) {
586           width_pixel_alignment /= bytes_per_pixel;
587         }
588         RETURN_IF_ERROR(CreateTensorSharedImage2DBuffer(
589             *context, shared_buffers_[buffer_index].GetMemoryPtr(), tensor_desc,
590             width_pixel_alignment, &shared_buffer_tensors_[tensor_index]));
591       } else {
592         RETURN_IF_ERROR(CreateTensorShared(
593             *context, shared_buffers_[buffer_index].GetMemoryPtr(), tensor_desc,
594             &shared_buffer_tensors_[tensor_index]));
595       }
596       created_tensors[tensor_index] = true;
597     }
598   }
599   return absl::OkStatus();
600 }
601 
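// Allocates the image-based (non buffer-backed) runtime tensors, sharing one
// tensor object between values whose descriptors and shapes match exactly.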
602 absl::Status InferenceContext::AllocateStrongShapesTensors(
603     const GpuModel& gpu_model, const GpuInfo& gpu_info,
604     const CreateGpuModelInfo* create_info, CLContext* context) {
605   std::map<ValueId, int2> usages;
606   GetUsages(
607       gpu_model,
608       [&gpu_model, &gpu_info, &create_info](ValueId id) {
609         return GetTensorType(gpu_model, create_info, gpu_info, id) ==
610                    TensorType::kRuntime &&
611                !IsBufferBased(gpu_info,
612                               gpu_model.tensors.at(id).GetStorageType());
613       },
614       &usages);
615 
616   struct TensorDescComparator {
617     TensorDescriptor tensor_desc;
618 
619     bool operator==(const TensorDescComparator& t) const {
620       return tensor_desc == t.tensor_desc &&
621              tensor_desc.GetBHWDCShape() == t.tensor_desc.GetBHWDCShape();
622     }
623   };
624 
625   std::vector<TensorUsageRecord<TensorDescComparator>> usage_records;
626   std::map<ValueId, ValueId> remap_from_graph_ids;
627   for (auto& usage : usages) {
628     remap_from_graph_ids[usage.first] = usage_records.size();
629     usage_records.push_back({{gpu_model.tensors.at(usage.first)},
630                              static_cast<TaskId>(usage.second.x),
631                              static_cast<TaskId>(usage.second.y)});
632   }
633 
634   ObjectsAssignment<TensorDescComparator> assignment;
635   RETURN_IF_ERROR(AssignObjectsToTensors(
636       usage_records, MemoryStrategy::EQUALITY, &assignment));
637 
638   for (auto& node : gpu_model.nodes) {
639     std::vector<ValueId> node_tensor_ids = node.inputs;
640     node_tensor_ids.insert(node_tensor_ids.end(), node.outputs.begin(),
641                            node.outputs.end());
642     for (auto& tensor_id : node_tensor_ids) {
643       if (GetTensorType(gpu_model, create_info, gpu_info, tensor_id) !=
644           TensorType::kRuntime) {
645         continue;
646       }
647       const auto& tensor_desc = gpu_model.tensors.at(tensor_id);
648       if (IsBufferBased(gpu_info, tensor_desc.GetStorageType())) {
649         continue;
650       }
651       const auto id = assignment.object_ids[remap_from_graph_ids[tensor_id]];
652       graph_ids_to_strong_shape_tensors_[tensor_id] = id;
653       const auto& it = strong_shape_tensors_.find(id);
654       if (it == strong_shape_tensors_.end()) {
655         RETURN_IF_ERROR(
656             CreateTensor(*context, tensor_desc, &strong_shape_tensors_[id]));
657       }
658     }
659   }
660   return absl::OkStatus();
661 }
662 
663 void InferenceContext::BindMemoryToOperations() {
664   for (auto& node : nodes_) {
665     for (int i = 0; i < node.inputs.size(); ++i) {
666       node.cl_operation.GetGpuOperation().SetSrc(GetTensor(node.inputs[i]), i);
667     }
668     for (int i = 0; i < node.outputs.size(); ++i) {
669       node.cl_operation.GetGpuOperation().SetDst(GetTensor(node.outputs[i]), i);
670     }
671   }
672 }
673 
674 absl::Status InferenceContext::Compile(
675     const CreationContext& creation_context) {
676   for (auto& node : nodes_) {
677     RETURN_IF_ERROR(node.cl_operation.Compile(creation_context));
678   }
679   return absl::OkStatus();
680 }
681 
682 absl::Status InferenceContext::Tune(TuningType tuning_type,
683                                     const GpuInfo& gpu_info,
684                                     ProfilingCommandQueue* profiling_queue) {
685   for (auto& node : nodes_) {
686     RETURN_IF_ERROR(
687         node.cl_operation.Tune(tuning_type, gpu_info, profiling_queue));
688   }
689   return absl::OkStatus();
690 }
691 
692 absl::Status InferenceContext::UpdateParams() {
693   for (auto& node : nodes_) {
694     RETURN_IF_ERROR(node.cl_operation.UpdateParams());
695   }
696   return absl::OkStatus();
697 }
698 
699 absl::Status InferenceContext::SetTensor(const ValueId& tensor_id,
700                                          Tensor* tensor_ptr) {
701   auto it = external_mutable_tensors_.find(tensor_id);
702   if (it == external_mutable_tensors_.end()) {
703     return absl::InvalidArgumentError("No external tensor with this id.");
704   }
705   external_mutable_tensors_[tensor_id] = tensor_ptr;
706   for (int node_index : external_tensor_to_nodes_[tensor_id]) {
707     auto& node = nodes_[node_index];
708     for (int i = 0; i < node.inputs.size(); ++i) {
709       if (node.inputs[i] == tensor_id) {
710         RETURN_IF_ERROR(node.cl_operation.SetSrcTensor(i, tensor_ptr));
711       }
712     }
713     for (int i = 0; i < node.outputs.size(); ++i) {
714       if (node.outputs[i] == tensor_id) {
715         RETURN_IF_ERROR(node.cl_operation.SetDstTensor(i, tensor_ptr));
716       }
717     }
718   }
719   return absl::OkStatus();
720 }
721 
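// For every mutable external tensor, records the indices of the nodes that
// read or write it, so SetTensor can rebind only the affected operations.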
722 void InferenceContext::PrepareExternal() {
723   for (auto& external : external_mutable_tensors_) {
724     for (int i = 0; i < nodes_.size(); ++i) {
725       bool has_tensor = false;
726       const auto& src_ids = nodes_[i].inputs;
727       for (int j = 0; j < src_ids.size(); ++j) {
728         if (src_ids[j] == external.first) {
729           has_tensor = true;
730         }
731       }
732       const auto& dst_ids = nodes_[i].outputs;
733       for (int j = 0; j < dst_ids.size(); ++j) {
734         if (dst_ids[j] == external.first) {
735           has_tensor = true;
736         }
737       }
738       if (has_tensor) {
739         external_tensor_to_nodes_[external.first].push_back(i);
740       }
741     }
742   }
743 }
744 
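// Enqueues all operations, using the recordable queue when available, and
// applies the vendor-specific flush / manual-release execution hints.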
745 absl::Status InferenceContext::AddToQueue(CLCommandQueue* queue) {
746   if (recordable_queue_ && recordable_queue_->IsSupported()) {
747     return recordable_queue_->Execute(queue);
748   }
749   if (execution_hints_.need_manual_release) {
750     if (execution_hints_.prev_enqueue_start_point.is_valid()) {
751       execution_hints_.prev_enqueue_start_point.Wait();
752     }
753     RETURN_IF_ERROR(
754         queue->EnqueueEvent(&execution_hints_.prev_enqueue_start_point));
755   }
756   int counter = 0;
757   for (auto& node : nodes_) {
758     RETURN_IF_ERROR(node.cl_operation.AddToQueue(queue));
759     counter++;
760     if (execution_hints_.flush_periodically &&
761         counter % execution_hints_.flush_period == 0) {
762       clFlush(queue->queue());
763     }
764   }
765   if (execution_hints_.need_flush) {
766     clFlush(queue->queue());
767   }
768   return absl::OkStatus();
769 }
770 
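// Measures per-node execution time. On Mali and PowerVR each node is
// re-enqueued several times (scaled by its initial duration) to obtain more
// stable timings.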
771 absl::Status InferenceContext::ProfileTime(ProfilingCommandQueue* queue,
772                                            ProfilingInfo* result) {
773   queue->ResetMeasurements();
774   for (auto& node : nodes_) {
775     queue->SetEventsLabel(node.name);
776     RETURN_IF_ERROR(node.cl_operation.AddToQueue(queue));
777   }
778   RETURN_IF_ERROR(queue->WaitForCompletion());
779   *result = queue->GetProfilingInfo();
780 
781   if (!(gpu_info_.IsMali() || gpu_info_.IsPowerVR())) {
782     return absl::OkStatus();
783   }
784 
785   if (gpu_info_.IsMali()) {
786     queue->ResetMeasurements();
787     for (int i = 0; i < nodes_.size(); ++i) {
788       queue->SetEventsLabel(nodes_[i].name);
789       const double times =
790           16.0 / absl::ToDoubleMilliseconds(result->dispatches[i].duration);
791       const int n = std::min(256.0, std::max(2.0, times));
792       RETURN_IF_ERROR(nodes_[i].cl_operation.AddToQueueNTimes(queue, n));
793     }
794     RETURN_IF_ERROR(queue->WaitForCompletion());
795     *result = queue->GetProfilingInfo();
796     return absl::OkStatus();
797   }
798 
799   if (gpu_info_.IsPowerVR()) {
800     queue->ResetMeasurements();
801     for (int i = 0; i < nodes_.size(); ++i) {
802       queue->SetEventsLabel(nodes_[i].name);
803       const double times =
804           32.0 / absl::ToDoubleMilliseconds(result->dispatches[i].duration);
805       const int n = std::min(64.0, std::max(4.0, times));
806       RETURN_IF_ERROR(nodes_[i].cl_operation.AddToQueueNTimes(queue, n));
807     }
808     RETURN_IF_ERROR(queue->WaitForCompletion());
809     *result = queue->GetProfilingInfo();
810 
811     queue->ResetMeasurements();
812     for (int i = 0; i < nodes_.size(); ++i) {
813       queue->SetEventsLabel(nodes_[i].name);
814       const double times =
815           128.0 / absl::ToDoubleMilliseconds(result->dispatches[i].duration);
816       const int n = std::min(1024.0, std::max(4.0, times));
817       RETURN_IF_ERROR(nodes_[i].cl_operation.AddToQueueNTimes(queue, n));
818     }
819     RETURN_IF_ERROR(queue->WaitForCompletion());
820     *result = queue->GetProfilingInfo();
821     return absl::OkStatus();
822   }
823 
824   return absl::OkStatus();
825 }
826 
827 absl::Status InferenceContext::Profile(ProfilingCommandQueue* queue,
828                                        ProfilingInfo* result) {
829   RETURN_IF_ERROR(ProfileTime(queue, result));
830   for (int i = 0; i < nodes_.size(); ++i) {
831     uint64_t read_size = 0;
832     for (auto& src_id : nodes_[i].inputs) {
833       read_size += GetTensor(src_id)->GetMemorySizeInBytes();
834     }
835     const auto& gpu_op = nodes_[i].cl_operation.GetGpuOperation();
836     read_size += gpu_op.const_args_size_;
837     uint64_t write_size = 0;
838     for (auto& dst_id : nodes_[i].outputs) {
839       write_size += GetTensor(dst_id)->GetMemorySizeInBytes();
840     }
841     result->dispatches[i].flops = gpu_op.flops_;
842     result->dispatches[i].read_mem_size = read_size;
843     result->dispatches[i].write_mem_size = write_size;
844   }
845 
846   return absl::OkStatus();
847 }
848 
849 uint64_t InferenceContext::GetSizeOfMemoryAllocatedForIntermediateTensors()
850     const {
851   uint64_t total_memory = 0;
852   for (const auto& t : strong_shape_tensors_) {
853     total_memory += t.second.GetMemorySizeInBytes();
854   }
855   for (const auto& b : shared_buffers_) {
856     // Sub-buffers do not allocate memory. Count the size of the parent buffer
857     // object instead.
858     if (!b.IsSubBuffer()) {
859       total_memory += b.GetMemorySizeInBytes();
860     }
861   }
862   for (const auto& t : variable_tensors_) {
863     total_memory += t.second.GetMemorySizeInBytes();
864   }
865   if (shared_buffers_parent_) {
866     total_memory += shared_buffers_parent_->GetMemorySizeInBytes();
867   }
868 
869   return total_memory;
870 }
871 
872 uint64_t InferenceContext::GetConstantTensorsSize() const {
873   uint64_t total_size = 0;
874   for (const auto& node : nodes_) {
875     total_size += node.cl_operation.GetGpuOperation().const_args_size_;
876   }
877   for (const auto& t : const_tensors_) {
878     total_size += t.second.GetMemorySizeInBytes();
879   }
880   return total_size;
881 }
882 
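// Resolves a ValueId to its backing tensor: external (immutable, then
// mutable), const, variable, shared-buffer, and finally strong-shape storage.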
883 Tensor* InferenceContext::GetTensor(ValueId id) {
884   if (external_immutable_tensors_.find(id) !=
885       external_immutable_tensors_.end()) {
886     return external_immutable_tensors_[id];
887   } else if (external_mutable_tensors_.find(id) !=
888              external_mutable_tensors_.end()) {
889     return external_mutable_tensors_[id];
890   } else if (const_tensors_.find(id) != const_tensors_.end()) {
891     return &const_tensors_[id];
892   } else if (variable_ids_and_refs_.find(id) != variable_ids_and_refs_.end()) {
893     return &variable_tensors_[variable_ids_and_refs_[id]];
894   } else if (graph_ids_to_shared_buffer_tensors_.find(id) !=
895              graph_ids_to_shared_buffer_tensors_.end()) {
896     return &shared_buffer_tensors_[graph_ids_to_shared_buffer_tensors_[id]];
897   } else {
898     return &strong_shape_tensors_[graph_ids_to_strong_shape_tensors_[id]];
899   }
900 }
901 
902 absl::Status InferenceContext::SetInputTensor(ValueId id,
903                                               const TensorFloat32& tensor,
904                                               CLCommandQueue* queue) {
905   Tensor* gpu_tensor = GetTensor(id);
906   TensorDescriptor descriptor_with_data = gpu_tensor->GetDescriptor();
907   descriptor_with_data.UploadData(tensor);
908   return gpu_tensor->UploadDescriptorData(descriptor_with_data, queue);
909 }
910 
911 absl::Status InferenceContext::GetOutputTensor(ValueId id,
912                                                CLCommandQueue* queue,
913                                                TensorFloat32* result) {
914   const Tensor* gpu_tensor = GetTensor(id);
915   const auto dst_shape = BHWC(gpu_tensor->Batch(), gpu_tensor->Height(),
916                               gpu_tensor->Width(), gpu_tensor->Channels());
917   result->id = id;
918   result->shape = dst_shape;
919   result->data.resize(dst_shape.DimensionsProduct());
920 
921   TensorDescriptor desc;
922   RETURN_IF_ERROR(gpu_tensor->ToDescriptor(&desc, queue));
923   desc.DownloadData(result);
924   return absl::OkStatus();
925 }
926 
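// Serializes the tuned work group sizes, kernel fingerprints, program
// binaries, and driver version alongside the GPU model flatbuffer.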
927 flatbuffers::Offset<data::InferenceContext> InferenceContext::Encode(
928     const CLDevice& device, const ProgramCache& program_cache,
929     flatbuffers::Offset<tflite::gpu::data::GpuModel> gpu_model_fb,
930     flatbuffers::FlatBufferBuilder* builder) {
931   std::vector<flatbuffers::Offset<tflite::gpu::data::Int3>> work_groups_fb;
932   for (int i = 0; i < nodes_.size(); ++i) {
933     auto work_group_fb =
934         tflite::gpu::Encode(nodes_[i].cl_operation.GetWorkGroupSize(), builder);
935     work_groups_fb.push_back(work_group_fb);
936   }
937   auto work_groups_fb_vec = builder->CreateVector(work_groups_fb);
938   std::vector<uint64_t> node_fingerprints(nodes_.size());
939   for (int i = 0; i < nodes_.size(); ++i) {
940     node_fingerprints[i] = nodes_[i].cl_operation.GetKernelFingerprint();
941   }
942   auto node_fingerprints_fb = builder->CreateVector(node_fingerprints);
943 
944   std::set<uint64_t> fingerprints;
945   for (const auto& node : nodes_) {
946     fingerprints.insert(node.cl_operation.GetKernelFingerprint());
947   }
948   std::vector<flatbuffers::Offset<data::BinaryProgram>> binary_programs_fb;
949   for (auto fingerprint : fingerprints) {
950     std::vector<uint8_t> program_binary;
951     program_cache.GetProgramBinary(fingerprint, &program_binary).IgnoreError();
952     auto binary_fb = builder->CreateVector(program_binary);
953     data::BinaryProgramBuilder program_builder(*builder);
954     program_builder.add_fingerprint(fingerprint);
955     program_builder.add_binary(binary_fb);
956     binary_programs_fb.push_back(program_builder.Finish());
957   }
958   auto binary_programs_fb_vec = builder->CreateVector(binary_programs_fb);
959   auto driver_version = builder->CreateString(device.GetPlatformVersion());
960 
961   data::InferenceContextBuilder inf_builder(*builder);
962   inf_builder.add_gpu_model(gpu_model_fb);
963   inf_builder.add_driver_version(driver_version);
964   inf_builder.add_binary_programs(binary_programs_fb_vec);
965   inf_builder.add_tuned_work_group_sizes_per_node(work_groups_fb_vec);
966   inf_builder.add_fingerprints_per_node(node_fingerprints_fb);
967   return inf_builder.Finish();
968 }
969 
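// Reads the input/output tensor reference ids from a serialized model without
// restoring the full inference context.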
970 absl::Status GetInOutRefs(const absl::Span<const uint8_t> serialized_model,
971                           std::vector<int64_t>* in_refs,
972                           std::vector<int64_t>* out_refs) {
973   flatbuffers::Verifier verifier(serialized_model.data(),
974                                  serialized_model.size());
975   if (!data::VerifyInferenceContextBuffer(verifier)) {
976     return absl::DataLossError("Deserialization failed.");
977   }
978   auto fb_inference = data::GetInferenceContext(serialized_model.data());
979   if (in_refs) {
980     in_refs->clear();
981     for (auto in_fb : *fb_inference->gpu_model()->input_refs()) {
982       in_refs->push_back(in_fb);
983     }
984   }
985   if (out_refs) {
986     out_refs->clear();
987     for (auto out_fb : *fb_inference->gpu_model()->output_refs()) {
988       out_refs->push_back(out_fb);
989     }
990   }
991   return absl::OkStatus();
992 }
993 
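// Computes the total buffer memory the model's buffer-based tensors would
// require, without allocating anything.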
994 absl::Status GetTotalBufferSizeForTensors(const GpuModel& gpu_model,
995                                           const CreateGpuModelInfo& create_info,
996                                           const GpuInfo& gpu_info,
997                                           uint64_t* result) {
998   std::vector<TensorUsageRecord<size_t>> buffer_usage_records;
999   ObjectsAssignment<size_t> buffer_assignment;
1000   OffsetsAssignment offset_assignment;
1001   bool use_offset_assignment;
1002   bool is_sub_buffers_supported;
1003   RETURN_IF_ERROR(GetBufferAsignment(
1004       gpu_model, &create_info, gpu_info, &buffer_usage_records, nullptr,
1005       &buffer_assignment, &offset_assignment, &use_offset_assignment,
1006       &is_sub_buffers_supported));
1007   if (use_offset_assignment) {
1008     *result = offset_assignment.total_size;
1009     return absl::OkStatus();
1010   }
1011 
1012   const size_t base_align_bytes =
1013       std::max<size_t>(gpu_info.opencl_info.base_addr_align_in_bits >> 3, 1);
1014   *result = TotalSize(buffer_assignment, base_align_bytes);
1015   return absl::OkStatus();
1016 }
1017 
1018 }  // namespace cl
1019 }  // namespace gpu
1020 }  // namespace tflite
1021