xref: /aosp_15_r20/external/tensorflow/tensorflow/lite/delegates/gpu/cl/kernels/converter.cc (revision b6fb3261f9314811a0f4371741dbb8839866f948)
1 /* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
2 
3 Licensed under the Apache License, Version 2.0 (the "License");
4 you may not use this file except in compliance with the License.
5 You may obtain a copy of the License at
6 
7     http://www.apache.org/licenses/LICENSE-2.0
8 
9 Unless required by applicable law or agreed to in writing, software
10 distributed under the License is distributed on an "AS IS" BASIS,
11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 See the License for the specific language governing permissions and
13 limitations under the License.
14 ==============================================================================*/
15 
16 #include "tensorflow/lite/delegates/gpu/cl/kernels/converter.h"
17 
18 #include <algorithm>
19 #include <array>
20 #include <memory>
21 #include <string>
22 #include <utility>
23 #include <variant>
24 
25 #include "tensorflow/lite/delegates/gpu/cl/cl_arguments.h"
26 #include "tensorflow/lite/delegates/gpu/cl/cl_command_queue.h"
27 #include "tensorflow/lite/delegates/gpu/cl/cl_errors.h"
28 #include "tensorflow/lite/delegates/gpu/cl/tensor.h"
29 #include "tensorflow/lite/delegates/gpu/cl/tensor_type_util.h"
30 #include "tensorflow/lite/delegates/gpu/common/data_type.h"
31 #include "tensorflow/lite/delegates/gpu/common/precision.h"
32 #include "tensorflow/lite/delegates/gpu/common/task/arguments.h"
33 #include "tensorflow/lite/delegates/gpu/common/task/tensor_desc.h"
34 #include "tensorflow/lite/delegates/gpu/common/task/util.h"
35 #include "tensorflow/lite/delegates/gpu/common/task/work_group_picking.h"
36 #include "tensorflow/lite/delegates/gpu/common/util.h"
37 
38 namespace tflite {
39 namespace gpu {
40 namespace cl {
41 namespace {
42 
43 class OpenClConverterImpl : public TensorObjectConverter {
44  public:
45   virtual absl::Status Init(const TensorObjectDef& input_def,
46                             const TensorObjectDef& output_def,
47                             Environment* environment) = 0;
48 
SetGpuInfo(const GpuInfo & info)49   void SetGpuInfo(const GpuInfo& info) { gpu_info_ = info; }
50 
51  protected:
DispatchKernel(cl_mem buffer_mem,Tensor * tensor)52   absl::Status DispatchKernel(cl_mem buffer_mem, Tensor* tensor) {
53     kernel_.ResetBindingCounter();
54     RETURN_IF_ERROR(kernel_.SetMemoryAuto(buffer_mem));
55     RETURN_IF_ERROR(cl_args_.SetObjectRef("tensor", tensor));
56     RETURN_IF_ERROR(
57         cl_args_.Bind(kernel_.kernel(), kernel_.GetBindingCounter()));
58     const int3 grid = int3(tensor->Width() * tensor->Batch(), tensor->Height(),
59                            tensor->Slices());
60     std::vector<int3> work_groups;
61     GetPossibleWorkGroupsConv(TuningType::kFast, gpu_info_, kernel_.info_, grid,
62                               &work_groups);
63     const int3 work_group_size = work_groups[0];
64     const int3 work_groups_count = GetWorkGroupsCount(grid, work_group_size);
65     return queue_->Dispatch(kernel_, work_groups_count, work_group_size);
66   }
67 
68   CLArguments cl_args_;
69   BHWC shape_;
70   CLKernel kernel_;
71   TensorDescriptor tensor_descriptor_;
72   GpuInfo gpu_info_;
73   CLCommandQueue* queue_ = nullptr;
74   const CLContext* context_ = nullptr;
75 };
76 
IsSupportedDataType(DataType type)77 bool IsSupportedDataType(DataType type) {
78   return type == DataType::FLOAT16 || type == DataType::FLOAT32 ||
79          type == DataType::BOOL;
80 }
81 
IsBHWCOpenCLBuffer(const ObjectDef & def)82 bool IsBHWCOpenCLBuffer(const ObjectDef& def) {
83   return IsSupportedDataType(def.data_type) &&
84          def.object_type == ObjectType::OPENCL_BUFFER &&
85          def.data_layout == DataLayout::BHWC;
86 }
87 
IsOpenCLTensor(const ObjectDef & def)88 bool IsOpenCLTensor(const ObjectDef& def) {
89   const bool is_buffer_tensor = def.object_type == ObjectType::OPENCL_BUFFER &&
90                                 def.data_layout == DataLayout::DHWC4;
91   const bool is_image2d_tensor =
92       def.object_type == ObjectType::OPENCL_TEXTURE &&
93       def.data_layout == DataLayout::HDWC4;
94   const bool is_image2d_array_tensor =
95       def.object_type == ObjectType::OPENCL_TEXTURE &&
96       def.data_layout == DataLayout::DHWC4;
97   const bool is_single_image_tensor =
98       def.object_type == ObjectType::OPENCL_TEXTURE &&
99       def.data_layout == DataLayout::BHWC;
100   return IsSupportedDataType(def.data_type) &&
101          (is_buffer_tensor || is_image2d_tensor || is_image2d_array_tensor ||
102           is_single_image_tensor);
103 }
104 
GetOpenCLMemory(const TensorObject & obj,cl_mem * memory)105 absl::Status GetOpenCLMemory(const TensorObject& obj, cl_mem* memory) {
106   auto texture = std::get_if<OpenClTexture>(&obj);
107   auto buffer = std::get_if<OpenClBuffer>(&obj);
108   if (texture && texture->memobj) {
109     *memory = texture->memobj;
110   } else if (buffer && buffer->memobj) {
111     *memory = buffer->memobj;
112   } else {
113     return absl::InvalidArgumentError("Missing OpenCL object.");
114   }
115   return absl::OkStatus();
116 }
117 
118 // Implements conversion from OpenCL tensor to another OpenCL tensor.
119 class TensorToTensorConverter : public OpenClConverterImpl {
120  public:
IsSupported(const ObjectDef & input,const ObjectDef & output)121   static bool IsSupported(const ObjectDef& input, const ObjectDef& output) {
122     return IsOpenCLTensor(input) && IsOpenCLTensor(output);
123   }
124 
Init(const TensorObjectDef & input_def,const TensorObjectDef & output_def,Environment * environment)125   absl::Status Init(const TensorObjectDef& input_def,
126                     const TensorObjectDef& output_def,
127                     Environment* environment) final {
128     src_tensor_descriptor_ =
129         TensorDescriptor(input_def.object_def.data_type,
130                          ToTensorStorageType(input_def.object_def.object_type,
131                                              input_def.object_def.data_layout),
132                          Layout::BHWC);
133     Arguments args;
134     args.AddObjectRef(
135         "src_tensor", AccessType::READ,
136         std::make_unique<TensorDescriptor>(src_tensor_descriptor_));
137 
138     dst_tensor_descriptor_ =
139         TensorDescriptor(output_def.object_def.data_type,
140                          ToTensorStorageType(output_def.object_def.object_type,
141                                              output_def.object_def.data_layout),
142                          Layout::BHWC);
143     args.AddObjectRef(
144         "dst_tensor", AccessType::WRITE,
145         std::make_unique<TensorDescriptor>(dst_tensor_descriptor_));
146 
147     const bool need_fp16_support =
148         input_def.object_def.data_type == DataType::FLOAT16 ||
149         output_def.object_def.data_type == DataType::FLOAT16;
150     const std::string out_data_type =
151         ToCLDataType(output_def.object_def.data_type);
152     std::string shader_src;
153     if (need_fp16_support) {
154       shader_src += "#pragma OPENCL EXTENSION cl_khr_fp16 : enable\n";
155     }
156     shader_src +=
157         R"(__kernel void tensor_to_tensor($0) {
158   int linear_id = get_global_id(0);
159   int x = linear_id / args.dst_tensor.Batch();
160   int b = linear_id % args.dst_tensor.Batch();
161   int y = get_global_id(1);
162   int d = get_global_id(2);
163   if (x >= args.dst_tensor.Width() || y >= args.dst_tensor.Height() || d >= args.dst_tensor.Slices()) return;
164 )";
165     shader_src += "  " + out_data_type + "4 input = args.src_tensor.Read<" +
166                   out_data_type + ">(x, y, d, b);\n";
167     shader_src += "  args.dst_tensor.Write(input, x, y, d, b);\n}";
168     queue_ = environment->queue();
169     context_ = &environment->context();
170     shape_ = BHWC(input_def.dimensions.b, input_def.dimensions.h,
171                   input_def.dimensions.w, input_def.dimensions.c);
172     RETURN_IF_ERROR(
173         args.Compile(environment->device().GetInfo(), {}, &shader_src));
174     RETURN_IF_ERROR(cl_args_.Init(environment->device().GetInfo(), nullptr,
175                                   &args, &shader_src));
176     return environment->program_cache()->GetOrCreateCLKernel(
177         shader_src, "tensor_to_tensor", environment->context(),
178         environment->device(), &kernel_);
179   }
180 
Convert(const TensorObject & input_obj,const TensorObject & output_obj)181   absl::Status Convert(const TensorObject& input_obj,
182                        const TensorObject& output_obj) override {
183     cl_mem in_memory;
184     RETURN_IF_ERROR(GetOpenCLMemory(input_obj, &in_memory));
185     cl_mem out_memory;
186     RETURN_IF_ERROR(GetOpenCLMemory(output_obj, &out_memory));
187 
188     Tensor src_tensor;
189     TensorDescriptor descriptor_with_shape = src_tensor_descriptor_;
190     descriptor_with_shape.SetBHWCShape(shape_);
191     RETURN_IF_ERROR(CreateTensorShared(*context_, in_memory,
192                                        descriptor_with_shape, &src_tensor));
193     Tensor dst_tensor;
194     descriptor_with_shape = dst_tensor_descriptor_;
195     descriptor_with_shape.SetBHWCShape(shape_);
196     RETURN_IF_ERROR(CreateTensorShared(*context_, out_memory,
197                                        descriptor_with_shape, &dst_tensor));
198     RETURN_IF_ERROR(cl_args_.SetObjectRef("src_tensor", &src_tensor));
199     RETURN_IF_ERROR(cl_args_.SetObjectRef("dst_tensor", &dst_tensor));
200     RETURN_IF_ERROR(cl_args_.Bind(kernel_.kernel()));
201     const int3 grid = int3(dst_tensor.Width() * dst_tensor.Batch(),
202                            dst_tensor.Height(), dst_tensor.Slices());
203     const int3 work_group_size = {16, 8, 1};
204     const int3 work_groups_count = GetWorkGroupsCount(grid, work_group_size);
205     return queue_->Dispatch(kernel_, work_groups_count, work_group_size);
206   }
207 
208  private:
209   TensorDescriptor src_tensor_descriptor_;
210   TensorDescriptor dst_tensor_descriptor_;
211 };
212 
213 // Implements conversion from OpenCL-specific tensor layout to BHWC OpenCL
214 // buffer.
215 class TensorToBHWCBufferConverter : public OpenClConverterImpl {
216  public:
IsSupported(const ObjectDef & input,const ObjectDef & output)217   static bool IsSupported(const ObjectDef& input, const ObjectDef& output) {
218     return IsOpenCLTensor(input) && IsBHWCOpenCLBuffer(output);
219   }
220 
Init(const TensorObjectDef & input_def,const TensorObjectDef & output_def,Environment * environment)221   absl::Status Init(const TensorObjectDef& input_def,
222                     const TensorObjectDef& output_def,
223                     Environment* environment) final {
224     TensorStorageType src_tensor_type = ToTensorStorageType(
225         input_def.object_def.object_type, input_def.object_def.data_layout);
226     tensor_descriptor_ = TensorDescriptor(input_def.object_def.data_type,
227                                           src_tensor_type, Layout::BHWC);
228     Arguments args;
229     args.AddObjectRef("tensor", AccessType::READ,
230                       std::make_unique<TensorDescriptor>(tensor_descriptor_));
231 
232     const bool need_fp16_support =
233         input_def.object_def.data_type == DataType::FLOAT16 ||
234         output_def.object_def.data_type == DataType::FLOAT16;
235     std::string shader_src;
236     if (need_fp16_support) {
237       shader_src += "#pragma OPENCL EXTENSION cl_khr_fp16 : enable\n";
238     }
239     if (output_def.object_def.data_type == DataType::BOOL ||
240         input_def.object_def.data_type == DataType::BOOL) {
241       shader_src +=
242           "#define convert_bool4(value) (convert_uchar4((value) != 0) & "
243           "(uchar4) 1)\n";
244       shader_src += "#define bool4 uchar4\n";
245     }
246     const std::string out_data_type =
247         ToCLDataType(output_def.object_def.data_type);
248     shader_src += "__kernel void tensor_to_bhwc(";
249     shader_src += "__global " + out_data_type + "* dst, $0) {\n";
250     shader_src += R"(  int linear_id = get_global_id(0);
251   int x = linear_id / args.tensor.Batch();
252   int b = linear_id % args.tensor.Batch();
253   int y = get_global_id(1);
254   int d = get_global_id(2);
255   if (x >= args.tensor.Width() || y >= args.tensor.Height() || d >= args.tensor.Slices()) return;
256 )";
257     shader_src += "  " + out_data_type + "4 input = args.tensor.Read<" +
258                   out_data_type + ">(x, y, d, b);\n";
259     shader_src += R"(  int c = d * 4;
260   int index = ((b * args.tensor.Height() + y) * args.tensor.Width() + x) * args.tensor.Channels() + c;
261 
262   dst[index] = input.x;
263   if (c + 1 < args.tensor.Channels()) {
264     dst[index + 1] = input.y;
265   }
266   if (c + 2 < args.tensor.Channels()) {
267     dst[index + 2] = input.z;
268   }
269   if (c + 3 < args.tensor.Channels()) {
270     dst[index + 3] = input.w;
271   }
272 })";
273     queue_ = environment->queue();
274     context_ = &environment->context();
275     shape_ = BHWC(input_def.dimensions.b, input_def.dimensions.h,
276                   input_def.dimensions.w, input_def.dimensions.c);
277     RETURN_IF_ERROR(
278         args.Compile(environment->device().GetInfo(), {}, &shader_src));
279     RETURN_IF_ERROR(cl_args_.Init(environment->device().GetInfo(), nullptr,
280                                   &args, &shader_src));
281     return environment->program_cache()->GetOrCreateCLKernel(
282         shader_src, "tensor_to_bhwc", environment->context(),
283         environment->device(), &kernel_);
284   }
285 
Convert(const TensorObject & input_obj,const TensorObject & output_obj)286   absl::Status Convert(const TensorObject& input_obj,
287                        const TensorObject& output_obj) override {
288     auto output = std::get_if<OpenClBuffer>(&output_obj);
289     if (!output || !output->memobj) {
290       return absl::InvalidArgumentError(
291           "Missing output in tensor_to_bhwc converter");
292     }
293 
294     cl_mem in_memory;
295     RETURN_IF_ERROR(GetOpenCLMemory(input_obj, &in_memory));
296     Tensor tensor;
297     TensorDescriptor descriptor_with_shape = tensor_descriptor_;
298     descriptor_with_shape.SetBHWCShape(shape_);
299     RETURN_IF_ERROR(CreateTensorShared(*context_, in_memory,
300                                        descriptor_with_shape, &tensor));
301     return DispatchKernel(output->memobj, &tensor);
302   }
303 };
304 
305 // Implements conversion from BHWC OpenCL buffer to OpenCL-specific tensor
306 // layout.
307 class BHWCBufferToTensorConverter : public OpenClConverterImpl {
308  public:
IsSupported(const ObjectDef & input,const ObjectDef & output)309   static bool IsSupported(const ObjectDef& input, const ObjectDef& output) {
310     return IsBHWCOpenCLBuffer(input) && IsOpenCLTensor(output);
311   }
312 
GetFromBhwcKernel(const TensorObjectDef & input_def,const TensorObjectDef & output_def) const313   std::pair<std::string, std::string> GetFromBhwcKernel(
314       const TensorObjectDef& input_def,
315       const TensorObjectDef& output_def) const {
316     return std::make_pair(
317         "__global " + ToCLDataType(input_def.object_def.data_type) + "* src",
318         R"(int c = d * 4;
319   int index = ((b * args.tensor.Height() + y) * args.tensor.Width() + x) * args.tensor.Channels() + c;
320   result.x = src[index];
321   result.y = c + 1 < args.tensor.Channels() ? src[index + 1] : 1;
322   result.z = c + 2 < args.tensor.Channels() ? src[index + 2] : 2;
323   result.w = c + 3 < args.tensor.Channels() ? src[index + 3] : 3;
324 )");
325   }
326 
Init(const TensorObjectDef & input_def,const TensorObjectDef & output_def,Environment * environment)327   absl::Status Init(const TensorObjectDef& input_def,
328                     const TensorObjectDef& output_def,
329                     Environment* environment) final {
330     auto params_kernel = GetFromBhwcKernel(input_def, output_def);
331 
332     TensorStorageType dst_tensor_type = ToTensorStorageType(
333         output_def.object_def.object_type, output_def.object_def.data_layout);
334     tensor_descriptor_ = TensorDescriptor(output_def.object_def.data_type,
335                                           dst_tensor_type, Layout::BHWC);
336     Arguments args;
337     args.AddObjectRef("tensor", AccessType::WRITE,
338                       std::make_unique<TensorDescriptor>(tensor_descriptor_));
339 
340     const bool need_fp16_support =
341         input_def.object_def.data_type == DataType::FLOAT16 ||
342         output_def.object_def.data_type == DataType::FLOAT16;
343     std::string shader_src;
344     if (need_fp16_support) {
345       shader_src += "#pragma OPENCL EXTENSION cl_khr_fp16 : enable\n";
346     }
347     if (output_def.object_def.data_type == DataType::BOOL ||
348         input_def.object_def.data_type == DataType::BOOL) {
349       shader_src +=
350           "#define convert_bool4(value) (convert_uchar4((value) != 0) & "
351           "(uchar4) 1)\n";
352       shader_src += "#define bool4 uchar4\n";
353     }
354     const std::string in_data_type =
355         ToCLDataType(input_def.object_def.data_type);
356     const std::string out_data_type =
357         ToCLDataType(output_def.object_def.data_type);
358     shader_src += "__kernel void bhwc_to_tensor(";
359     shader_src += "__global " + in_data_type + "* src, $0) {\n";
360 
361     shader_src += R"(  int linear_id = get_global_id(0);
362   int x = linear_id / args.tensor.Batch();
363   int b = linear_id % args.tensor.Batch();
364   int y = get_global_id(1);
365   int d = get_global_id(2);
366 
367   if (x >= args.tensor.Width() || y >= args.tensor.Height() || d >= args.tensor.Slices()) return;
368 )";
369     shader_src += "  " + out_data_type + "4 result;\n";
370     shader_src += R"(  int c = d * 4;
371   int index = ((b * args.tensor.Height() + y) * args.tensor.Width() + x) * args.tensor.Channels() + c;
372   result.x = src[index];
373   result.y = c + 1 < args.tensor.Channels() ? src[index + 1] : 1;
374   result.z = c + 2 < args.tensor.Channels() ? src[index + 2] : 2;
375   result.w = c + 3 < args.tensor.Channels() ? src[index + 3] : 3;
376 )";
377     shader_src += "  args.tensor.Write(result, x, y, d, b);\n}";
378     queue_ = environment->queue();
379     context_ = &environment->context();
380     shape_ = BHWC(output_def.dimensions.b, output_def.dimensions.h,
381                   output_def.dimensions.w, output_def.dimensions.c);
382     RETURN_IF_ERROR(
383         args.Compile(environment->device().GetInfo(), {}, &shader_src));
384     RETURN_IF_ERROR(cl_args_.Init(environment->device().GetInfo(), nullptr,
385                                   &args, &shader_src));
386     return environment->program_cache()->GetOrCreateCLKernel(
387         shader_src, "bhwc_to_tensor", environment->context(),
388         environment->device(), &kernel_);
389   }
390 
Convert(const TensorObject & input_obj,const TensorObject & output_obj)391   absl::Status Convert(const TensorObject& input_obj,
392                        const TensorObject& output_obj) override {
393     auto input = std::get_if<OpenClBuffer>(&input_obj);
394     if (!input || !input->memobj) {
395       return absl::InvalidArgumentError(
396           "Missing input in bhwc_to_tensor converter");
397     }
398     cl_mem out_memory;
399     RETURN_IF_ERROR(GetOpenCLMemory(output_obj, &out_memory));
400     Tensor tensor;
401     TensorDescriptor descriptor_with_shape = tensor_descriptor_;
402     descriptor_with_shape.SetBHWCShape(shape_);
403     RETURN_IF_ERROR(CreateTensorShared(*context_, out_memory,
404                                        descriptor_with_shape, &tensor));
405     return DispatchKernel(input->memobj, &tensor);
406   }
407 };
408 
CalculateTextureRegion(const TensorObjectDef & def)409 std::array<size_t, 3> CalculateTextureRegion(const TensorObjectDef& def) {
410   const auto& dims = def.dimensions;
411   std::array<size_t, 3> region = {0, 0, 1};
412   switch (ToTensorStorageType(def.object_def.object_type,
413                               def.object_def.data_layout)) {
414     case TensorStorageType::SINGLE_TEXTURE_2D:
415       region[0] = static_cast<size_t>(dims.w * dims.b);
416       region[1] = static_cast<size_t>(dims.h);
417       break;
418     case TensorStorageType::TEXTURE_2D:
419       region[0] = static_cast<size_t>(dims.w * dims.b);
420       region[1] = static_cast<size_t>(dims.h * dims.d());
421       break;
422     case TensorStorageType::TEXTURE_ARRAY:
423       region[0] = static_cast<size_t>(dims.w * dims.b);
424       region[1] = static_cast<size_t>(dims.h);
425       region[2] = static_cast<size_t>(dims.d());
426       break;
427     default:
428       break;
429   }
430   return region;
431 }
432 
IsOpenClTextureOrBuffer(ObjectType type)433 bool IsOpenClTextureOrBuffer(ObjectType type) {
434   return type == ObjectType::OPENCL_BUFFER ||
435          type == ObjectType::OPENCL_TEXTURE;
436 }
437 
438 // Copies data from one object of the same type and layout to another object.
439 class TrivialCopier : public OpenClConverterImpl {
440  public:
IsSupported(const ObjectDef & input,const ObjectDef & output)441   static bool IsSupported(const ObjectDef& input, const ObjectDef& output) {
442     return IsOpenClTextureOrBuffer(input.object_type) &&
443            input.data_type == output.data_type &&
444            input.object_type == output.object_type &&
445            input.data_layout == output.data_layout;
446   }
447 
Init(const TensorObjectDef & input_def,const TensorObjectDef & output_def,Environment * environment)448   absl::Status Init(const TensorObjectDef& input_def,
449                     const TensorObjectDef& output_def,
450                     Environment* environment) final {
451     shape_ = BHWC(input_def.dimensions.b, input_def.dimensions.h,
452                   input_def.dimensions.w, input_def.dimensions.c);
453     data_type_ = input_def.object_def.data_type;
454     queue_ = environment->queue();
455     region_ = CalculateTextureRegion(output_def);
456     return absl::OkStatus();
457   }
458 
Convert(const TensorObject & input_obj,const TensorObject & output_obj)459   absl::Status Convert(const TensorObject& input_obj,
460                        const TensorObject& output_obj) override {
461     auto texture_input = std::get_if<OpenClTexture>(&input_obj);
462     auto texture_output = std::get_if<OpenClTexture>(&output_obj);
463     if (texture_input && texture_output) {
464       return Copy(*texture_input, *texture_output);
465     }
466     auto buffer_input = std::get_if<OpenClBuffer>(&input_obj);
467     auto buffer_output = std::get_if<OpenClBuffer>(&output_obj);
468     if (buffer_input && buffer_output) {
469       return Copy(*buffer_input, *buffer_output);
470     }
471     return absl::InternalError("Unexpected object");
472   }
473 
Copy(const OpenClBuffer & input,const OpenClBuffer & output)474   absl::Status Copy(const OpenClBuffer& input, const OpenClBuffer& output) {
475     if (input.memobj == output.memobj) {
476       return absl::OkStatus();
477     }
478     return GetOpenCLError(
479         clEnqueueCopyBuffer(queue_->queue(), input.memobj, output.memobj, 0, 0,
480                             SizeOf(data_type_) * shape_.w * shape_.h *
481                                 AlignByN(shape_.c, 4) * shape_.b,
482                             0, nullptr, nullptr));
483   }
484 
Copy(const OpenClTexture & input,const OpenClTexture & output)485   absl::Status Copy(const OpenClTexture& input, const OpenClTexture& output) {
486     if (input.memobj == output.memobj) {
487       return absl::OkStatus();
488     }
489     size_t origin[3] = {0, 0, 0};
490     return GetOpenCLError(
491         clEnqueueCopyImage(queue_->queue(), input.memobj, output.memobj, origin,
492                            origin, region_.data(), 0, nullptr, nullptr));
493   }
494 
495  private:
496   DataType data_type_ = DataType::UNKNOWN;
497   std::array<size_t, 3> region_;
498 };
499 
500 // Copies data from/to CPU into a tensor.
501 class CpuCopier : public OpenClConverterImpl {
502  public:
CpuCopier(bool asynchronous=false)503   explicit CpuCopier(bool asynchronous = false) : async_(asynchronous) {}
IsSupported(const ObjectDef & input,const ObjectDef & output)504   static bool IsSupported(const ObjectDef& input, const ObjectDef& output) {
505     return input.data_type == output.data_type &&
506            input.data_layout == output.data_layout &&
507            ((input.object_type == ObjectType::CPU_MEMORY &&
508              IsOpenClTextureOrBuffer(output.object_type)) ||
509             (output.object_type == ObjectType::CPU_MEMORY &&
510              IsOpenClTextureOrBuffer(input.object_type)));
511   }
512 
Init(const TensorObjectDef & input_def,const TensorObjectDef & output_def,Environment * environment)513   absl::Status Init(const TensorObjectDef& input_def,
514                     const TensorObjectDef& output_def,
515                     Environment* environment) final {
516     region_ = CalculateTextureRegion(
517         input_def.object_def.object_type == ObjectType::CPU_MEMORY ? output_def
518                                                                    : input_def);
519     input_data_type_ = input_def.object_def.data_type;
520     output_data_type_ = output_def.object_def.data_type;
521     queue_ = environment->queue();
522     return absl::OkStatus();
523   }
524 
Convert(const TensorObject & input_obj,const TensorObject & output_obj)525   absl::Status Convert(const TensorObject& input_obj,
526                        const TensorObject& output_obj) override {
527     auto cpu_input = std::get_if<CpuMemory>(&input_obj);
528     auto cpu_output = std::get_if<CpuMemory>(&output_obj);
529     if (cpu_input) {
530       if (output_data_type_ == DataType::BOOL) {
531         return CopyFromBoolCpu(cpu_input, output_obj);
532       }
533       auto texture_output = std::get_if<OpenClTexture>(&output_obj);
534       if (texture_output) {
535         return queue_->EnqueueWriteImage(
536             texture_output->memobj, int3(region_[0], region_[1], region_[2]),
537             cpu_input->data, async_);
538       }
539       auto buffer_output = std::get_if<OpenClBuffer>(&output_obj);
540       if (buffer_output) {
541         return queue_->EnqueueWriteBuffer(buffer_output->memobj,
542                                           cpu_input->size_bytes,
543                                           cpu_input->data, async_);
544       }
545     } else if (cpu_output) {
546       if (input_data_type_ == DataType::BOOL) {
547         return CopyToBoolCpu(input_obj, cpu_output);
548       }
549       auto texture_input = std::get_if<OpenClTexture>(&input_obj);
550       if (texture_input) {
551         return queue_->EnqueueReadImage(
552             texture_input->memobj, int3(region_[0], region_[1], region_[2]),
553             cpu_output->data, async_);
554       }
555       auto buffer_input = std::get_if<OpenClBuffer>(&input_obj);
556       if (buffer_input) {
557         return queue_->EnqueueReadBuffer(buffer_input->memobj,
558                                          cpu_output->size_bytes,
559                                          cpu_output->data, async_);
560       }
561     }
562     return absl::InternalError("Unexpected object");
563   }
564 
565  private:
CopyToBoolCpu(const TensorObject & tensor_obj,const CpuMemory * cpu_memory)566   absl::Status CopyToBoolCpu(const TensorObject& tensor_obj,
567                              const CpuMemory* cpu_memory) {
568     const size_t num_elements = cpu_memory->size_bytes;
569     std::vector<uint8_t> tmp_data(num_elements);
570     auto texture_input = std::get_if<OpenClTexture>(&tensor_obj);
571     if (texture_input) {
572       RETURN_IF_ERROR(queue_->EnqueueReadImage(
573           texture_input->memobj, int3(region_[0], region_[1], region_[2]),
574           tmp_data.data(), false));
575     } else {
576       auto buffer_input = std::get_if<OpenClBuffer>(&tensor_obj);
577       if (!buffer_input) {
578         return absl::InternalError("Unexpected object");
579       }
580       RETURN_IF_ERROR(queue_->EnqueueReadBuffer(
581           buffer_input->memobj, tmp_data.size(), tmp_data.data(), false));
582     }
583     bool* output_data = reinterpret_cast<bool*>(cpu_memory->data);
584     for (int i = 0; i < num_elements; ++i) {
585       output_data[i] = tmp_data[i];
586     }
587     return absl::OkStatus();
588   }
589 
CopyFromBoolCpu(const CpuMemory * cpu_memory,const TensorObject & tensor_obj)590   absl::Status CopyFromBoolCpu(const CpuMemory* cpu_memory,
591                                const TensorObject& tensor_obj) {
592     const size_t num_elements = cpu_memory->size_bytes;
593     const bool* bool_data = reinterpret_cast<bool*>(cpu_memory->data);
594     std::vector<uint8_t> tmp_data(num_elements);
595     for (int i = 0; i < num_elements; ++i) {
596       tmp_data[i] = bool_data[i];
597     }
598     auto texture_output = std::get_if<OpenClTexture>(&tensor_obj);
599     if (texture_output) {
600       return queue_->EnqueueWriteImage(texture_output->memobj,
601                                        int3(region_[0], region_[1], region_[2]),
602                                        tmp_data.data(), async_);
603     }
604     auto buffer_output = std::get_if<OpenClBuffer>(&tensor_obj);
605     if (buffer_output) {
606       return queue_->EnqueueWriteBuffer(buffer_output->memobj, tmp_data.size(),
607                                         tmp_data.data(), async_);
608     }
609     return absl::InternalError("Unexpected object");
610   }
611 
612   std::array<size_t, 3> region_;
613   bool async_;
614   DataType input_data_type_;
615   DataType output_data_type_;
616 };
617 
618 class OpenClTensorConverterBuilder : public TensorObjectConverterBuilder {
619  public:
OpenClTensorConverterBuilder(Environment * environment)620   explicit OpenClTensorConverterBuilder(Environment* environment)
621       : environment_(environment) {}
622 
IsSupported(const TensorObjectDef & input,const TensorObjectDef & output) const623   bool IsSupported(const TensorObjectDef& input,
624                    const TensorObjectDef& output) const final {
625     const auto& input_def = input.object_def;
626     const auto& output_def = output.object_def;
627     return input.dimensions == output.dimensions &&
628            (TrivialCopier::IsSupported(input_def, output_def) ||
629             TensorToTensorConverter::IsSupported(input_def, output_def) ||
630             CpuCopier::IsSupported(input_def, output_def) ||
631             TensorToBHWCBufferConverter::IsSupported(input_def, output_def) ||
632             BHWCBufferToTensorConverter::IsSupported(input_def, output_def));
633   }
634 
MakeConverter(const TensorObjectDef & input,const TensorObjectDef & output,std::unique_ptr<TensorObjectConverter> * converter)635   absl::Status MakeConverter(
636       const TensorObjectDef& input, const TensorObjectDef& output,
637       std::unique_ptr<TensorObjectConverter>* converter) final {
638     std::unique_ptr<OpenClConverterImpl> impl;
639     const auto& input_def = input.object_def;
640     const auto& output_def = output.object_def;
641     if (TrivialCopier::IsSupported(input_def, output_def)) {
642       impl = std::make_unique<TrivialCopier>();
643     } else if (TensorToTensorConverter::IsSupported(input_def, output_def)) {
644       impl = std::make_unique<TensorToTensorConverter>();
645     } else if (CpuCopier::IsSupported(input_def, output_def)) {
646       impl = std::make_unique<CpuCopier>(/*asynchronous*/ true);
647     } else if (TensorToBHWCBufferConverter::IsSupported(input_def,
648                                                         output_def)) {
649       impl = std::make_unique<TensorToBHWCBufferConverter>();
650     } else if (BHWCBufferToTensorConverter::IsSupported(input_def,
651                                                         output_def)) {
652       impl = std::make_unique<BHWCBufferToTensorConverter>();
653     } else {
654       return absl::UnimplementedError("Unsupported conversion");
655     }
656     RETURN_IF_ERROR(impl->Init(input, output, environment_));
657     impl->SetGpuInfo(environment_->GetDevicePtr()->GetInfo());
658     *converter = std::move(impl);
659     return absl::OkStatus();
660   }
661 
662   Environment* environment_;
663 };
664 
665 }  // namespace
666 
NewConverterBuilder(Environment * environment)667 std::unique_ptr<TensorObjectConverterBuilder> NewConverterBuilder(
668     Environment* environment) {
669   return std::make_unique<OpenClTensorConverterBuilder>(environment);
670 }
671 
672 }  // namespace cl
673 }  // namespace gpu
674 }  // namespace tflite
675