/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
15
16 #include "tensorflow/lite/delegates/gpu/cl/kernels/converter.h"
17
18 #include <algorithm>
19 #include <array>
20 #include <memory>
21 #include <string>
22 #include <utility>
23 #include <variant>
24
25 #include "tensorflow/lite/delegates/gpu/cl/cl_arguments.h"
26 #include "tensorflow/lite/delegates/gpu/cl/cl_command_queue.h"
27 #include "tensorflow/lite/delegates/gpu/cl/cl_errors.h"
28 #include "tensorflow/lite/delegates/gpu/cl/tensor.h"
29 #include "tensorflow/lite/delegates/gpu/cl/tensor_type_util.h"
30 #include "tensorflow/lite/delegates/gpu/common/data_type.h"
31 #include "tensorflow/lite/delegates/gpu/common/precision.h"
32 #include "tensorflow/lite/delegates/gpu/common/task/arguments.h"
33 #include "tensorflow/lite/delegates/gpu/common/task/tensor_desc.h"
34 #include "tensorflow/lite/delegates/gpu/common/task/util.h"
35 #include "tensorflow/lite/delegates/gpu/common/task/work_group_picking.h"
36 #include "tensorflow/lite/delegates/gpu/common/util.h"
37
38 namespace tflite {
39 namespace gpu {
40 namespace cl {
41 namespace {
42
43 class OpenClConverterImpl : public TensorObjectConverter {
44 public:
45 virtual absl::Status Init(const TensorObjectDef& input_def,
46 const TensorObjectDef& output_def,
47 Environment* environment) = 0;
48
SetGpuInfo(const GpuInfo & info)49 void SetGpuInfo(const GpuInfo& info) { gpu_info_ = info; }
50
51 protected:
DispatchKernel(cl_mem buffer_mem,Tensor * tensor)52 absl::Status DispatchKernel(cl_mem buffer_mem, Tensor* tensor) {
53 kernel_.ResetBindingCounter();
54 RETURN_IF_ERROR(kernel_.SetMemoryAuto(buffer_mem));
55 RETURN_IF_ERROR(cl_args_.SetObjectRef("tensor", tensor));
56 RETURN_IF_ERROR(
57 cl_args_.Bind(kernel_.kernel(), kernel_.GetBindingCounter()));
58 const int3 grid = int3(tensor->Width() * tensor->Batch(), tensor->Height(),
59 tensor->Slices());
60 std::vector<int3> work_groups;
61 GetPossibleWorkGroupsConv(TuningType::kFast, gpu_info_, kernel_.info_, grid,
62 &work_groups);
63 const int3 work_group_size = work_groups[0];
64 const int3 work_groups_count = GetWorkGroupsCount(grid, work_group_size);
65 return queue_->Dispatch(kernel_, work_groups_count, work_group_size);
66 }
67
68 CLArguments cl_args_;
69 BHWC shape_;
70 CLKernel kernel_;
71 TensorDescriptor tensor_descriptor_;
72 GpuInfo gpu_info_;
73 CLCommandQueue* queue_ = nullptr;
74 const CLContext* context_ = nullptr;
75 };
76
IsSupportedDataType(DataType type)77 bool IsSupportedDataType(DataType type) {
78 return type == DataType::FLOAT16 || type == DataType::FLOAT32 ||
79 type == DataType::BOOL;
80 }
81
IsBHWCOpenCLBuffer(const ObjectDef & def)82 bool IsBHWCOpenCLBuffer(const ObjectDef& def) {
83 return IsSupportedDataType(def.data_type) &&
84 def.object_type == ObjectType::OPENCL_BUFFER &&
85 def.data_layout == DataLayout::BHWC;
86 }
87
IsOpenCLTensor(const ObjectDef & def)88 bool IsOpenCLTensor(const ObjectDef& def) {
89 const bool is_buffer_tensor = def.object_type == ObjectType::OPENCL_BUFFER &&
90 def.data_layout == DataLayout::DHWC4;
91 const bool is_image2d_tensor =
92 def.object_type == ObjectType::OPENCL_TEXTURE &&
93 def.data_layout == DataLayout::HDWC4;
94 const bool is_image2d_array_tensor =
95 def.object_type == ObjectType::OPENCL_TEXTURE &&
96 def.data_layout == DataLayout::DHWC4;
97 const bool is_single_image_tensor =
98 def.object_type == ObjectType::OPENCL_TEXTURE &&
99 def.data_layout == DataLayout::BHWC;
100 return IsSupportedDataType(def.data_type) &&
101 (is_buffer_tensor || is_image2d_tensor || is_image2d_array_tensor ||
102 is_single_image_tensor);
103 }
104
GetOpenCLMemory(const TensorObject & obj,cl_mem * memory)105 absl::Status GetOpenCLMemory(const TensorObject& obj, cl_mem* memory) {
106 auto texture = std::get_if<OpenClTexture>(&obj);
107 auto buffer = std::get_if<OpenClBuffer>(&obj);
108 if (texture && texture->memobj) {
109 *memory = texture->memobj;
110 } else if (buffer && buffer->memobj) {
111 *memory = buffer->memobj;
112 } else {
113 return absl::InvalidArgumentError("Missing OpenCL object.");
114 }
115 return absl::OkStatus();
116 }
117
118 // Implements conversion from OpenCL tensor to another OpenCL tensor.
119 class TensorToTensorConverter : public OpenClConverterImpl {
120 public:
IsSupported(const ObjectDef & input,const ObjectDef & output)121 static bool IsSupported(const ObjectDef& input, const ObjectDef& output) {
122 return IsOpenCLTensor(input) && IsOpenCLTensor(output);
123 }
124
Init(const TensorObjectDef & input_def,const TensorObjectDef & output_def,Environment * environment)125 absl::Status Init(const TensorObjectDef& input_def,
126 const TensorObjectDef& output_def,
127 Environment* environment) final {
128 src_tensor_descriptor_ =
129 TensorDescriptor(input_def.object_def.data_type,
130 ToTensorStorageType(input_def.object_def.object_type,
131 input_def.object_def.data_layout),
132 Layout::BHWC);
133 Arguments args;
134 args.AddObjectRef(
135 "src_tensor", AccessType::READ,
136 std::make_unique<TensorDescriptor>(src_tensor_descriptor_));
137
138 dst_tensor_descriptor_ =
139 TensorDescriptor(output_def.object_def.data_type,
140 ToTensorStorageType(output_def.object_def.object_type,
141 output_def.object_def.data_layout),
142 Layout::BHWC);
143 args.AddObjectRef(
144 "dst_tensor", AccessType::WRITE,
145 std::make_unique<TensorDescriptor>(dst_tensor_descriptor_));
146
147 const bool need_fp16_support =
148 input_def.object_def.data_type == DataType::FLOAT16 ||
149 output_def.object_def.data_type == DataType::FLOAT16;
150 const std::string out_data_type =
151 ToCLDataType(output_def.object_def.data_type);
152 std::string shader_src;
153 if (need_fp16_support) {
154 shader_src += "#pragma OPENCL EXTENSION cl_khr_fp16 : enable\n";
155 }
156 shader_src +=
157 R"(__kernel void tensor_to_tensor($0) {
158 int linear_id = get_global_id(0);
159 int x = linear_id / args.dst_tensor.Batch();
160 int b = linear_id % args.dst_tensor.Batch();
161 int y = get_global_id(1);
162 int d = get_global_id(2);
163 if (x >= args.dst_tensor.Width() || y >= args.dst_tensor.Height() || d >= args.dst_tensor.Slices()) return;
164 )";
165 shader_src += " " + out_data_type + "4 input = args.src_tensor.Read<" +
166 out_data_type + ">(x, y, d, b);\n";
167 shader_src += " args.dst_tensor.Write(input, x, y, d, b);\n}";
168 queue_ = environment->queue();
169 context_ = &environment->context();
170 shape_ = BHWC(input_def.dimensions.b, input_def.dimensions.h,
171 input_def.dimensions.w, input_def.dimensions.c);
172 RETURN_IF_ERROR(
173 args.Compile(environment->device().GetInfo(), {}, &shader_src));
174 RETURN_IF_ERROR(cl_args_.Init(environment->device().GetInfo(), nullptr,
175 &args, &shader_src));
176 return environment->program_cache()->GetOrCreateCLKernel(
177 shader_src, "tensor_to_tensor", environment->context(),
178 environment->device(), &kernel_);
179 }
180
Convert(const TensorObject & input_obj,const TensorObject & output_obj)181 absl::Status Convert(const TensorObject& input_obj,
182 const TensorObject& output_obj) override {
183 cl_mem in_memory;
184 RETURN_IF_ERROR(GetOpenCLMemory(input_obj, &in_memory));
185 cl_mem out_memory;
186 RETURN_IF_ERROR(GetOpenCLMemory(output_obj, &out_memory));
187
188 Tensor src_tensor;
189 TensorDescriptor descriptor_with_shape = src_tensor_descriptor_;
190 descriptor_with_shape.SetBHWCShape(shape_);
191 RETURN_IF_ERROR(CreateTensorShared(*context_, in_memory,
192 descriptor_with_shape, &src_tensor));
193 Tensor dst_tensor;
194 descriptor_with_shape = dst_tensor_descriptor_;
195 descriptor_with_shape.SetBHWCShape(shape_);
196 RETURN_IF_ERROR(CreateTensorShared(*context_, out_memory,
197 descriptor_with_shape, &dst_tensor));
198 RETURN_IF_ERROR(cl_args_.SetObjectRef("src_tensor", &src_tensor));
199 RETURN_IF_ERROR(cl_args_.SetObjectRef("dst_tensor", &dst_tensor));
200 RETURN_IF_ERROR(cl_args_.Bind(kernel_.kernel()));
201 const int3 grid = int3(dst_tensor.Width() * dst_tensor.Batch(),
202 dst_tensor.Height(), dst_tensor.Slices());
203 const int3 work_group_size = {16, 8, 1};
204 const int3 work_groups_count = GetWorkGroupsCount(grid, work_group_size);
205 return queue_->Dispatch(kernel_, work_groups_count, work_group_size);
206 }
207
208 private:
209 TensorDescriptor src_tensor_descriptor_;
210 TensorDescriptor dst_tensor_descriptor_;
211 };
212
213 // Implements conversion from OpenCL-specific tensor layout to BHWC OpenCL
214 // buffer.
215 class TensorToBHWCBufferConverter : public OpenClConverterImpl {
216 public:
IsSupported(const ObjectDef & input,const ObjectDef & output)217 static bool IsSupported(const ObjectDef& input, const ObjectDef& output) {
218 return IsOpenCLTensor(input) && IsBHWCOpenCLBuffer(output);
219 }
220
Init(const TensorObjectDef & input_def,const TensorObjectDef & output_def,Environment * environment)221 absl::Status Init(const TensorObjectDef& input_def,
222 const TensorObjectDef& output_def,
223 Environment* environment) final {
224 TensorStorageType src_tensor_type = ToTensorStorageType(
225 input_def.object_def.object_type, input_def.object_def.data_layout);
226 tensor_descriptor_ = TensorDescriptor(input_def.object_def.data_type,
227 src_tensor_type, Layout::BHWC);
228 Arguments args;
229 args.AddObjectRef("tensor", AccessType::READ,
230 std::make_unique<TensorDescriptor>(tensor_descriptor_));
231
232 const bool need_fp16_support =
233 input_def.object_def.data_type == DataType::FLOAT16 ||
234 output_def.object_def.data_type == DataType::FLOAT16;
235 std::string shader_src;
236 if (need_fp16_support) {
237 shader_src += "#pragma OPENCL EXTENSION cl_khr_fp16 : enable\n";
238 }
239 if (output_def.object_def.data_type == DataType::BOOL ||
240 input_def.object_def.data_type == DataType::BOOL) {
241 shader_src +=
242 "#define convert_bool4(value) (convert_uchar4((value) != 0) & "
243 "(uchar4) 1)\n";
244 shader_src += "#define bool4 uchar4\n";
245 }
246 const std::string out_data_type =
247 ToCLDataType(output_def.object_def.data_type);
248 shader_src += "__kernel void tensor_to_bhwc(";
249 shader_src += "__global " + out_data_type + "* dst, $0) {\n";
250 shader_src += R"( int linear_id = get_global_id(0);
251 int x = linear_id / args.tensor.Batch();
252 int b = linear_id % args.tensor.Batch();
253 int y = get_global_id(1);
254 int d = get_global_id(2);
255 if (x >= args.tensor.Width() || y >= args.tensor.Height() || d >= args.tensor.Slices()) return;
256 )";
257 shader_src += " " + out_data_type + "4 input = args.tensor.Read<" +
258 out_data_type + ">(x, y, d, b);\n";
259 shader_src += R"( int c = d * 4;
260 int index = ((b * args.tensor.Height() + y) * args.tensor.Width() + x) * args.tensor.Channels() + c;
261
262 dst[index] = input.x;
263 if (c + 1 < args.tensor.Channels()) {
264 dst[index + 1] = input.y;
265 }
266 if (c + 2 < args.tensor.Channels()) {
267 dst[index + 2] = input.z;
268 }
269 if (c + 3 < args.tensor.Channels()) {
270 dst[index + 3] = input.w;
271 }
272 })";
273 queue_ = environment->queue();
274 context_ = &environment->context();
275 shape_ = BHWC(input_def.dimensions.b, input_def.dimensions.h,
276 input_def.dimensions.w, input_def.dimensions.c);
277 RETURN_IF_ERROR(
278 args.Compile(environment->device().GetInfo(), {}, &shader_src));
279 RETURN_IF_ERROR(cl_args_.Init(environment->device().GetInfo(), nullptr,
280 &args, &shader_src));
281 return environment->program_cache()->GetOrCreateCLKernel(
282 shader_src, "tensor_to_bhwc", environment->context(),
283 environment->device(), &kernel_);
284 }
285
Convert(const TensorObject & input_obj,const TensorObject & output_obj)286 absl::Status Convert(const TensorObject& input_obj,
287 const TensorObject& output_obj) override {
288 auto output = std::get_if<OpenClBuffer>(&output_obj);
289 if (!output || !output->memobj) {
290 return absl::InvalidArgumentError(
291 "Missing output in tensor_to_bhwc converter");
292 }
293
294 cl_mem in_memory;
295 RETURN_IF_ERROR(GetOpenCLMemory(input_obj, &in_memory));
296 Tensor tensor;
297 TensorDescriptor descriptor_with_shape = tensor_descriptor_;
298 descriptor_with_shape.SetBHWCShape(shape_);
299 RETURN_IF_ERROR(CreateTensorShared(*context_, in_memory,
300 descriptor_with_shape, &tensor));
301 return DispatchKernel(output->memobj, &tensor);
302 }
303 };
304
305 // Implements conversion from BHWC OpenCL buffer to OpenCL-specific tensor
306 // layout.
307 class BHWCBufferToTensorConverter : public OpenClConverterImpl {
308 public:
IsSupported(const ObjectDef & input,const ObjectDef & output)309 static bool IsSupported(const ObjectDef& input, const ObjectDef& output) {
310 return IsBHWCOpenCLBuffer(input) && IsOpenCLTensor(output);
311 }
312
GetFromBhwcKernel(const TensorObjectDef & input_def,const TensorObjectDef & output_def) const313 std::pair<std::string, std::string> GetFromBhwcKernel(
314 const TensorObjectDef& input_def,
315 const TensorObjectDef& output_def) const {
316 return std::make_pair(
317 "__global " + ToCLDataType(input_def.object_def.data_type) + "* src",
318 R"(int c = d * 4;
319 int index = ((b * args.tensor.Height() + y) * args.tensor.Width() + x) * args.tensor.Channels() + c;
320 result.x = src[index];
321 result.y = c + 1 < args.tensor.Channels() ? src[index + 1] : 1;
322 result.z = c + 2 < args.tensor.Channels() ? src[index + 2] : 2;
323 result.w = c + 3 < args.tensor.Channels() ? src[index + 3] : 3;
324 )");
325 }
326
Init(const TensorObjectDef & input_def,const TensorObjectDef & output_def,Environment * environment)327 absl::Status Init(const TensorObjectDef& input_def,
328 const TensorObjectDef& output_def,
329 Environment* environment) final {
330 auto params_kernel = GetFromBhwcKernel(input_def, output_def);
331
332 TensorStorageType dst_tensor_type = ToTensorStorageType(
333 output_def.object_def.object_type, output_def.object_def.data_layout);
334 tensor_descriptor_ = TensorDescriptor(output_def.object_def.data_type,
335 dst_tensor_type, Layout::BHWC);
336 Arguments args;
337 args.AddObjectRef("tensor", AccessType::WRITE,
338 std::make_unique<TensorDescriptor>(tensor_descriptor_));
339
340 const bool need_fp16_support =
341 input_def.object_def.data_type == DataType::FLOAT16 ||
342 output_def.object_def.data_type == DataType::FLOAT16;
343 std::string shader_src;
344 if (need_fp16_support) {
345 shader_src += "#pragma OPENCL EXTENSION cl_khr_fp16 : enable\n";
346 }
347 if (output_def.object_def.data_type == DataType::BOOL ||
348 input_def.object_def.data_type == DataType::BOOL) {
349 shader_src +=
350 "#define convert_bool4(value) (convert_uchar4((value) != 0) & "
351 "(uchar4) 1)\n";
352 shader_src += "#define bool4 uchar4\n";
353 }
354 const std::string in_data_type =
355 ToCLDataType(input_def.object_def.data_type);
356 const std::string out_data_type =
357 ToCLDataType(output_def.object_def.data_type);
358 shader_src += "__kernel void bhwc_to_tensor(";
359 shader_src += "__global " + in_data_type + "* src, $0) {\n";
360
361 shader_src += R"( int linear_id = get_global_id(0);
362 int x = linear_id / args.tensor.Batch();
363 int b = linear_id % args.tensor.Batch();
364 int y = get_global_id(1);
365 int d = get_global_id(2);
366
367 if (x >= args.tensor.Width() || y >= args.tensor.Height() || d >= args.tensor.Slices()) return;
368 )";
369 shader_src += " " + out_data_type + "4 result;\n";
370 shader_src += R"( int c = d * 4;
371 int index = ((b * args.tensor.Height() + y) * args.tensor.Width() + x) * args.tensor.Channels() + c;
372 result.x = src[index];
373 result.y = c + 1 < args.tensor.Channels() ? src[index + 1] : 1;
374 result.z = c + 2 < args.tensor.Channels() ? src[index + 2] : 2;
375 result.w = c + 3 < args.tensor.Channels() ? src[index + 3] : 3;
376 )";
377 shader_src += " args.tensor.Write(result, x, y, d, b);\n}";
378 queue_ = environment->queue();
379 context_ = &environment->context();
380 shape_ = BHWC(output_def.dimensions.b, output_def.dimensions.h,
381 output_def.dimensions.w, output_def.dimensions.c);
382 RETURN_IF_ERROR(
383 args.Compile(environment->device().GetInfo(), {}, &shader_src));
384 RETURN_IF_ERROR(cl_args_.Init(environment->device().GetInfo(), nullptr,
385 &args, &shader_src));
386 return environment->program_cache()->GetOrCreateCLKernel(
387 shader_src, "bhwc_to_tensor", environment->context(),
388 environment->device(), &kernel_);
389 }
390
Convert(const TensorObject & input_obj,const TensorObject & output_obj)391 absl::Status Convert(const TensorObject& input_obj,
392 const TensorObject& output_obj) override {
393 auto input = std::get_if<OpenClBuffer>(&input_obj);
394 if (!input || !input->memobj) {
395 return absl::InvalidArgumentError(
396 "Missing input in bhwc_to_tensor converter");
397 }
398 cl_mem out_memory;
399 RETURN_IF_ERROR(GetOpenCLMemory(output_obj, &out_memory));
400 Tensor tensor;
401 TensorDescriptor descriptor_with_shape = tensor_descriptor_;
402 descriptor_with_shape.SetBHWCShape(shape_);
403 RETURN_IF_ERROR(CreateTensorShared(*context_, out_memory,
404 descriptor_with_shape, &tensor));
405 return DispatchKernel(input->memobj, &tensor);
406 }
407 };
408
CalculateTextureRegion(const TensorObjectDef & def)409 std::array<size_t, 3> CalculateTextureRegion(const TensorObjectDef& def) {
410 const auto& dims = def.dimensions;
411 std::array<size_t, 3> region = {0, 0, 1};
412 switch (ToTensorStorageType(def.object_def.object_type,
413 def.object_def.data_layout)) {
414 case TensorStorageType::SINGLE_TEXTURE_2D:
415 region[0] = static_cast<size_t>(dims.w * dims.b);
416 region[1] = static_cast<size_t>(dims.h);
417 break;
418 case TensorStorageType::TEXTURE_2D:
419 region[0] = static_cast<size_t>(dims.w * dims.b);
420 region[1] = static_cast<size_t>(dims.h * dims.d());
421 break;
422 case TensorStorageType::TEXTURE_ARRAY:
423 region[0] = static_cast<size_t>(dims.w * dims.b);
424 region[1] = static_cast<size_t>(dims.h);
425 region[2] = static_cast<size_t>(dims.d());
426 break;
427 default:
428 break;
429 }
430 return region;
431 }
432
IsOpenClTextureOrBuffer(ObjectType type)433 bool IsOpenClTextureOrBuffer(ObjectType type) {
434 return type == ObjectType::OPENCL_BUFFER ||
435 type == ObjectType::OPENCL_TEXTURE;
436 }
437
438 // Copies data from one object of the same type and layout to another object.
439 class TrivialCopier : public OpenClConverterImpl {
440 public:
IsSupported(const ObjectDef & input,const ObjectDef & output)441 static bool IsSupported(const ObjectDef& input, const ObjectDef& output) {
442 return IsOpenClTextureOrBuffer(input.object_type) &&
443 input.data_type == output.data_type &&
444 input.object_type == output.object_type &&
445 input.data_layout == output.data_layout;
446 }
447
Init(const TensorObjectDef & input_def,const TensorObjectDef & output_def,Environment * environment)448 absl::Status Init(const TensorObjectDef& input_def,
449 const TensorObjectDef& output_def,
450 Environment* environment) final {
451 shape_ = BHWC(input_def.dimensions.b, input_def.dimensions.h,
452 input_def.dimensions.w, input_def.dimensions.c);
453 data_type_ = input_def.object_def.data_type;
454 queue_ = environment->queue();
455 region_ = CalculateTextureRegion(output_def);
456 return absl::OkStatus();
457 }
458
Convert(const TensorObject & input_obj,const TensorObject & output_obj)459 absl::Status Convert(const TensorObject& input_obj,
460 const TensorObject& output_obj) override {
461 auto texture_input = std::get_if<OpenClTexture>(&input_obj);
462 auto texture_output = std::get_if<OpenClTexture>(&output_obj);
463 if (texture_input && texture_output) {
464 return Copy(*texture_input, *texture_output);
465 }
466 auto buffer_input = std::get_if<OpenClBuffer>(&input_obj);
467 auto buffer_output = std::get_if<OpenClBuffer>(&output_obj);
468 if (buffer_input && buffer_output) {
469 return Copy(*buffer_input, *buffer_output);
470 }
471 return absl::InternalError("Unexpected object");
472 }
473
Copy(const OpenClBuffer & input,const OpenClBuffer & output)474 absl::Status Copy(const OpenClBuffer& input, const OpenClBuffer& output) {
475 if (input.memobj == output.memobj) {
476 return absl::OkStatus();
477 }
478 return GetOpenCLError(
479 clEnqueueCopyBuffer(queue_->queue(), input.memobj, output.memobj, 0, 0,
480 SizeOf(data_type_) * shape_.w * shape_.h *
481 AlignByN(shape_.c, 4) * shape_.b,
482 0, nullptr, nullptr));
483 }
484
Copy(const OpenClTexture & input,const OpenClTexture & output)485 absl::Status Copy(const OpenClTexture& input, const OpenClTexture& output) {
486 if (input.memobj == output.memobj) {
487 return absl::OkStatus();
488 }
489 size_t origin[3] = {0, 0, 0};
490 return GetOpenCLError(
491 clEnqueueCopyImage(queue_->queue(), input.memobj, output.memobj, origin,
492 origin, region_.data(), 0, nullptr, nullptr));
493 }
494
495 private:
496 DataType data_type_ = DataType::UNKNOWN;
497 std::array<size_t, 3> region_;
498 };
499
500 // Copies data from/to CPU into a tensor.
501 class CpuCopier : public OpenClConverterImpl {
502 public:
CpuCopier(bool asynchronous=false)503 explicit CpuCopier(bool asynchronous = false) : async_(asynchronous) {}
IsSupported(const ObjectDef & input,const ObjectDef & output)504 static bool IsSupported(const ObjectDef& input, const ObjectDef& output) {
505 return input.data_type == output.data_type &&
506 input.data_layout == output.data_layout &&
507 ((input.object_type == ObjectType::CPU_MEMORY &&
508 IsOpenClTextureOrBuffer(output.object_type)) ||
509 (output.object_type == ObjectType::CPU_MEMORY &&
510 IsOpenClTextureOrBuffer(input.object_type)));
511 }
512
Init(const TensorObjectDef & input_def,const TensorObjectDef & output_def,Environment * environment)513 absl::Status Init(const TensorObjectDef& input_def,
514 const TensorObjectDef& output_def,
515 Environment* environment) final {
516 region_ = CalculateTextureRegion(
517 input_def.object_def.object_type == ObjectType::CPU_MEMORY ? output_def
518 : input_def);
519 input_data_type_ = input_def.object_def.data_type;
520 output_data_type_ = output_def.object_def.data_type;
521 queue_ = environment->queue();
522 return absl::OkStatus();
523 }
524
Convert(const TensorObject & input_obj,const TensorObject & output_obj)525 absl::Status Convert(const TensorObject& input_obj,
526 const TensorObject& output_obj) override {
527 auto cpu_input = std::get_if<CpuMemory>(&input_obj);
528 auto cpu_output = std::get_if<CpuMemory>(&output_obj);
529 if (cpu_input) {
530 if (output_data_type_ == DataType::BOOL) {
531 return CopyFromBoolCpu(cpu_input, output_obj);
532 }
533 auto texture_output = std::get_if<OpenClTexture>(&output_obj);
534 if (texture_output) {
535 return queue_->EnqueueWriteImage(
536 texture_output->memobj, int3(region_[0], region_[1], region_[2]),
537 cpu_input->data, async_);
538 }
539 auto buffer_output = std::get_if<OpenClBuffer>(&output_obj);
540 if (buffer_output) {
541 return queue_->EnqueueWriteBuffer(buffer_output->memobj,
542 cpu_input->size_bytes,
543 cpu_input->data, async_);
544 }
545 } else if (cpu_output) {
546 if (input_data_type_ == DataType::BOOL) {
547 return CopyToBoolCpu(input_obj, cpu_output);
548 }
549 auto texture_input = std::get_if<OpenClTexture>(&input_obj);
550 if (texture_input) {
551 return queue_->EnqueueReadImage(
552 texture_input->memobj, int3(region_[0], region_[1], region_[2]),
553 cpu_output->data, async_);
554 }
555 auto buffer_input = std::get_if<OpenClBuffer>(&input_obj);
556 if (buffer_input) {
557 return queue_->EnqueueReadBuffer(buffer_input->memobj,
558 cpu_output->size_bytes,
559 cpu_output->data, async_);
560 }
561 }
562 return absl::InternalError("Unexpected object");
563 }
564
565 private:
CopyToBoolCpu(const TensorObject & tensor_obj,const CpuMemory * cpu_memory)566 absl::Status CopyToBoolCpu(const TensorObject& tensor_obj,
567 const CpuMemory* cpu_memory) {
568 const size_t num_elements = cpu_memory->size_bytes;
569 std::vector<uint8_t> tmp_data(num_elements);
570 auto texture_input = std::get_if<OpenClTexture>(&tensor_obj);
571 if (texture_input) {
572 RETURN_IF_ERROR(queue_->EnqueueReadImage(
573 texture_input->memobj, int3(region_[0], region_[1], region_[2]),
574 tmp_data.data(), false));
575 } else {
576 auto buffer_input = std::get_if<OpenClBuffer>(&tensor_obj);
577 if (!buffer_input) {
578 return absl::InternalError("Unexpected object");
579 }
580 RETURN_IF_ERROR(queue_->EnqueueReadBuffer(
581 buffer_input->memobj, tmp_data.size(), tmp_data.data(), false));
582 }
583 bool* output_data = reinterpret_cast<bool*>(cpu_memory->data);
584 for (int i = 0; i < num_elements; ++i) {
585 output_data[i] = tmp_data[i];
586 }
587 return absl::OkStatus();
588 }
589
CopyFromBoolCpu(const CpuMemory * cpu_memory,const TensorObject & tensor_obj)590 absl::Status CopyFromBoolCpu(const CpuMemory* cpu_memory,
591 const TensorObject& tensor_obj) {
592 const size_t num_elements = cpu_memory->size_bytes;
593 const bool* bool_data = reinterpret_cast<bool*>(cpu_memory->data);
594 std::vector<uint8_t> tmp_data(num_elements);
595 for (int i = 0; i < num_elements; ++i) {
596 tmp_data[i] = bool_data[i];
597 }
598 auto texture_output = std::get_if<OpenClTexture>(&tensor_obj);
599 if (texture_output) {
600 return queue_->EnqueueWriteImage(texture_output->memobj,
601 int3(region_[0], region_[1], region_[2]),
602 tmp_data.data(), async_);
603 }
604 auto buffer_output = std::get_if<OpenClBuffer>(&tensor_obj);
605 if (buffer_output) {
606 return queue_->EnqueueWriteBuffer(buffer_output->memobj, tmp_data.size(),
607 tmp_data.data(), async_);
608 }
609 return absl::InternalError("Unexpected object");
610 }
611
612 std::array<size_t, 3> region_;
613 bool async_;
614 DataType input_data_type_;
615 DataType output_data_type_;
616 };
617
618 class OpenClTensorConverterBuilder : public TensorObjectConverterBuilder {
619 public:
OpenClTensorConverterBuilder(Environment * environment)620 explicit OpenClTensorConverterBuilder(Environment* environment)
621 : environment_(environment) {}
622
IsSupported(const TensorObjectDef & input,const TensorObjectDef & output) const623 bool IsSupported(const TensorObjectDef& input,
624 const TensorObjectDef& output) const final {
625 const auto& input_def = input.object_def;
626 const auto& output_def = output.object_def;
627 return input.dimensions == output.dimensions &&
628 (TrivialCopier::IsSupported(input_def, output_def) ||
629 TensorToTensorConverter::IsSupported(input_def, output_def) ||
630 CpuCopier::IsSupported(input_def, output_def) ||
631 TensorToBHWCBufferConverter::IsSupported(input_def, output_def) ||
632 BHWCBufferToTensorConverter::IsSupported(input_def, output_def));
633 }
634
MakeConverter(const TensorObjectDef & input,const TensorObjectDef & output,std::unique_ptr<TensorObjectConverter> * converter)635 absl::Status MakeConverter(
636 const TensorObjectDef& input, const TensorObjectDef& output,
637 std::unique_ptr<TensorObjectConverter>* converter) final {
638 std::unique_ptr<OpenClConverterImpl> impl;
639 const auto& input_def = input.object_def;
640 const auto& output_def = output.object_def;
641 if (TrivialCopier::IsSupported(input_def, output_def)) {
642 impl = std::make_unique<TrivialCopier>();
643 } else if (TensorToTensorConverter::IsSupported(input_def, output_def)) {
644 impl = std::make_unique<TensorToTensorConverter>();
645 } else if (CpuCopier::IsSupported(input_def, output_def)) {
646 impl = std::make_unique<CpuCopier>(/*asynchronous*/ true);
647 } else if (TensorToBHWCBufferConverter::IsSupported(input_def,
648 output_def)) {
649 impl = std::make_unique<TensorToBHWCBufferConverter>();
650 } else if (BHWCBufferToTensorConverter::IsSupported(input_def,
651 output_def)) {
652 impl = std::make_unique<BHWCBufferToTensorConverter>();
653 } else {
654 return absl::UnimplementedError("Unsupported conversion");
655 }
656 RETURN_IF_ERROR(impl->Init(input, output, environment_));
657 impl->SetGpuInfo(environment_->GetDevicePtr()->GetInfo());
658 *converter = std::move(impl);
659 return absl::OkStatus();
660 }
661
662 Environment* environment_;
663 };
664
665 } // namespace
666
NewConverterBuilder(Environment * environment)667 std::unique_ptr<TensorObjectConverterBuilder> NewConverterBuilder(
668 Environment* environment) {
669 return std::make_unique<OpenClTensorConverterBuilder>(environment);
670 }
671
672 } // namespace cl
673 } // namespace gpu
674 } // namespace tflite
675