/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#include "tensorflow/lite/delegates/gpu/common/tasks/convolution_transposed.h"

#include <memory>
#include <string>
#include <utility>
#include <vector>

#include "absl/strings/substitute.h"
#include "tensorflow/lite/delegates/gpu/common/shape.h"
#include "tensorflow/lite/delegates/gpu/common/status.h"
#include "tensorflow/lite/delegates/gpu/common/task/weights_layout.h"
#include "tensorflow/lite/delegates/gpu/common/task/work_group_picking.h"

namespace tflite {
namespace gpu {
namespace {
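// Heuristic: Mali, Apple and AMD GPUs read the transposed-convolution weights
// from a linear buffer; other GPUs read them from 2D textures.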
bool UseBufferForWeights(const GpuInfo& gpu_info) {
  return gpu_info.IsMali() || gpu_info.IsApple() || gpu_info.IsAMD();
}
}  // namespace

ConvolutionTransposed::ConvolutionTransposed(
    const OperationDef& definition, const ConvolutionTransposedAttributes& attr,
    const GpuInfo& gpu_info)
    : GPUOperation(definition),
      stride_(attr.stride.w, attr.stride.h, 1, 1),
      block_size_(2, 2, 1, 2) {
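  // Pick the weights layout matching the storage chosen by
  // UseBufferForWeights(): a single linear buffer or four 2D textures. The
  // inner 4x4 block ordering differs between Apple and other GPUs and must
  // agree with the CONV macros emitted in GenerateConvolutionTransposedCode().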
  if (UseBufferForWeights(gpu_info)) {
    if (gpu_info.IsApple()) {
      weights_layout_ = WeightsLayout::kOSpatialIOGroupO4I4;
    } else {
      weights_layout_ = WeightsLayout::kOSpatialIOGroupI4O4;
    }
  } else {
    if (gpu_info.IsApple()) {
      weights_layout_ = WeightsLayout::k2DX4O4YIsSpatialIAndXIsOOGroupI4;
    } else {
      weights_layout_ = WeightsLayout::k2DX4I4YIsSpatialIAndXIsOOGroupO4;
    }
  }
  const bool is_f16 = definition.precision == CalculationsPrecision::F16;
  if (gpu_info.IsMali()) {
    if (gpu_info.mali_info.IsMidgard()) {
      block_size_ = is_f16 ? int4(2, 1, 1, 2) : int4(2, 1, 1, 1);
    } else {
      block_size_ = is_f16 ? int4(2, 2, 1, 2) : int4(2, 2, 1, 1);
    }
    compiler_options_.push_back(CompilerOptions::kClFastRelaxedMath);
  }
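  // With only 1 or 3 output slices a multi-slice block is wasteful: fold the
  // slice block into Y (on Mali the extra slice work is simply dropped).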
  const int dst_depth = DivideRoundUp(attr.weights.shape.o, 4);
  if (dst_depth == 1 || dst_depth == 3) {
    if (!gpu_info.IsMali()) {
      block_size_.y *= block_size_.w;
    }
    block_size_.w = 1;
  }

  args_.AddInt("stride_x", stride_.x);
  args_.AddInt("stride_y", stride_.y);
  args_.AddInt("padding_x", attr.padding.prepended.w);
  args_.AddInt("padding_y", attr.padding.prepended.h);
  args_.AddInt("kernel_size_x", attr.weights.shape.w);
  args_.AddInt("kernel_size_y", attr.weights.shape.h);
  code_ = GenerateConvolutionTransposedCode(definition_, gpu_info, block_size_);
}

ConvolutionTransposed::ConvolutionTransposed(
    const OperationDef& definition,
    const ConvolutionTransposed3DAttributes& attr, const GpuInfo& gpu_info)
    : GPUOperation(definition),
      stride_(attr.stride.w, attr.stride.h, attr.stride.d, 1),
      block_size_(2, 2, 1, 2) {
  if (UseBufferForWeights(gpu_info)) {
    if (gpu_info.IsApple()) {
      weights_layout_ = WeightsLayout::kOSpatialIOGroupO4I4;
    } else {
      weights_layout_ = WeightsLayout::kOSpatialIOGroupI4O4;
    }
  } else {
    if (gpu_info.IsApple()) {
      weights_layout_ = WeightsLayout::k2DX4O4YIsSpatialIAndXIsOOGroupI4;
    } else {
      weights_layout_ = WeightsLayout::k2DX4I4YIsSpatialIAndXIsOOGroupO4;
    }
  }
  const bool is_f16 = definition.precision == CalculationsPrecision::F16;
  if (gpu_info.IsMali()) {
    if (gpu_info.mali_info.IsMidgard()) {
      block_size_ = is_f16 ? int4(2, 1, 1, 2) : int4(2, 1, 1, 1);
    } else {
      block_size_ = is_f16 ? int4(2, 2, 1, 2) : int4(2, 2, 1, 1);
    }
    compiler_options_.push_back(CompilerOptions::kClFastRelaxedMath);
  }
  const int dst_depth = DivideRoundUp(attr.weights.shape.o, 4);
  if (dst_depth == 1 || dst_depth == 3) {
    if (!gpu_info.IsMali()) {
      block_size_.y *= block_size_.w;
    }
    block_size_.w = 1;
  }

  args_.AddInt("stride_x", stride_.x);
  args_.AddInt("stride_y", stride_.y);
  args_.AddInt("stride_z", stride_.z);
  args_.AddInt("padding_x", attr.padding.prepended.w);
  args_.AddInt("padding_y", attr.padding.prepended.h);
  args_.AddInt("padding_z", attr.padding.prepended.d);
  args_.AddInt("kernel_size_x", attr.weights.shape.w);
  args_.AddInt("kernel_size_y", attr.weights.shape.h);
  args_.AddInt("kernel_size_z", attr.weights.shape.d);
  args_.AddInt("grid_size_y");
  code_ = GenerateConvolutionTransposedCode(definition_, gpu_info, block_size_);
}

std::string ConvolutionTransposed::GenerateConvolutionTransposedCode(
    const OperationDef& op_def, const GpuInfo& gpu_info,
    const int4& block_size) {
  AddSrcTensor("src_tensor", op_def.src_tensors[0]);
  AddDstTensor("dst_tensor", op_def.dst_tensors[0]);

  if (op_def.src_tensors.size() != 1) {
    // dynamic weights
    if (weights_layout_ == WeightsLayout::kOSpatialIOGroupI4O4 ||
        weights_layout_ == WeightsLayout::kOSpatialIOGroupO4I4) {
      BufferDescriptor desc;
      desc.element_type = op_def.src_tensors[1].GetDataType();
      desc.element_size = 16;
      desc.memory_type = MemoryType::GLOBAL;
      AddSrcBuffer("weights", desc);
    } else {
      for (int i = 0; i < 4; ++i) {
        const std::string name = "weights" + std::to_string(i);
        AddSrcTensor(name, definition_.src_tensors[1 + i]);
      }
    }
  }

  const auto& src_def = op_def.src_tensors[0];

  std::string c;

  const bool weights_are_buffer = UseBufferForWeights(gpu_info);
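  // Emit one CONV<s> macro per output-slice element of the block. Each macro
  // multiplies a source FLT4 by four filter FLT4s (f0..f3, or slices of a
  // FLT16 from the weights cache) and accumulates into R.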
  for (int s = 0; s < block_size.w; ++s) {
    std::string f0, f1, f2, f3;
    if (weights_are_buffer) {
      if (gpu_info.SupportsPointersInKernels()) {
        f0 = "FLT16_0123(weights_cache[" + std::to_string(s) + "])";
        f1 = "FLT16_4567(weights_cache[" + std::to_string(s) + "])";
        f2 = "FLT16_89ab(weights_cache[" + std::to_string(s) + "])";
        f3 = "FLT16_cdef(weights_cache[" + std::to_string(s) + "])";
      } else {
        f0 = "FLT16_0123(flt16val)";
        f1 = "FLT16_4567(flt16val)";
        f2 = "FLT16_89ab(flt16val)";
        f3 = "FLT16_cdef(flt16val)";
      }
    } else {
      f0 = "f" + std::to_string(s * 4 + 0);
      f1 = "f" + std::to_string(s * 4 + 1);
      f2 = "f" + std::to_string(s * 4 + 2);
      f3 = "f" + std::to_string(s * 4 + 3);
    }
    bool use_fma = gpu_info.IsAMD() && gpu_info.IsApiOpenCl();
    if (GetWeightsDescription().IsI4O4()) {
      switch (op_def.precision) {
        case CalculationsPrecision::F32:
        case CalculationsPrecision::F16:
          if (use_fma) {
            c += "#define CONV" + std::to_string(s) + "(R, S) \\\n";
            c += "R = fma(" + f0 + ", S.x, R); \\\n";
            c += "R = fma(" + f1 + ", S.y, R); \\\n";
            c += "R = fma(" + f2 + ", S.z, R); \\\n";
            c += "R = fma(" + f3 + ", S.w, R); \n";
          } else {
            c += "#define CONV" + std::to_string(s) + "(R, S) \\\n";
            c += "R += S.x * " + f0 + "; \\\n";
            c += "R += S.y * " + f1 + "; \\\n";
            c += "R += S.z * " + f2 + "; \\\n";
            c += "R += S.w * " + f3 + "; \n";
          }
          break;
        case CalculationsPrecision::F32_F16:
          c += "#define CONV" + std::to_string(s) + "(R, S) \\\n";
          c += "R += TO_ACCUM_TYPE(S.x * " + f0 + " + S.y * " + f1 +
               " + S.z * " + f2 + " + S.w * " + f3 + ");\n";
          break;
      }
    } else {
      // O4I4
      c += "#define CONV" + std::to_string(s) + "(R, S) \\\n";
      c += "R.x += dot(S, " + f0 + "); \\\n";
      c += "R.y += dot(S, " + f1 + "); \\\n";
      c += "R.z += dot(S, " + f2 + "); \\\n";
      c += "R.w += dot(S, " + f3 + "); \n";
    }
  }

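  // Code-generation helpers: generate_id/_full build the coordinate suffix
  // used in generated variable names (e.g. "_w1_h0_s2"); generate_check builds
  // the in-bounds condition for axes that cannot rely on zero-clamped reads.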
  auto generate_id = [&](const std::string& x, const std::string& y,
                         const std::string& z) {
    std::string id;
    if (src_def.HasAxis(Axis::WIDTH)) {
      id += "_w" + x;
    }
    if (src_def.HasAxis(Axis::HEIGHT)) {
      id += "_h" + y;
    }
    if (src_def.HasAxis(Axis::DEPTH)) {
      id += "_d" + z;
    }
    return id;
  };

  auto generate_id_full = [&](const std::string& x, const std::string& y,
                              const std::string& z, const std::string& s) {
    return generate_id(x, y, z) + "_s" + s;
  };

  auto generate_check = [&](const std::string& x, const std::string& y,
                            const std::string& z) {
    std::string check;
    const std::vector<Axis> axes{Axis::WIDTH, Axis::HEIGHT, Axis::DEPTH};
    const std::vector<std::string> names{"in_x", "in_y", "in_z"};
    const std::vector<std::string> coords{x, y, z};
    for (int i = 0; i < axes.size(); ++i) {
      const auto& axis = axes[i];
      if (src_def.HasAxis(axis) && !src_def.SupportsZeroClamp(axis, gpu_info) &&
          block_size[i] != 1) {
        if (!check.empty()) {
          check += " && ";
        }
        check += names[i] + coords[i];
      }
    }
    return check;
  };

  switch (op_def.precision) {
    case CalculationsPrecision::F32:
      c += "#define FLT16 float16\n";
      break;
    case CalculationsPrecision::F32_F16:
    case CalculationsPrecision::F16:
      c += "#define FLT16 half16\n";
      break;
  }

  c += "MAIN_FUNCTION($0) {\n";
  if (op_def.IsBatchSupported()) {
    c += " int linear_id = GLOBAL_ID_0;\n";
    c += " int dst_x = (linear_id / args.dst_tensor.Batch());\n";
    c += " int B = linear_id % args.dst_tensor.Batch();\n";
    c += " args.dst_tensor.SetBatchRef(B);\n";
    c += " args.src_tensor.SetBatchRef(B);\n";
  } else {
    c += " int dst_x = GLOBAL_ID_0;\n";
  }
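  // Split dst_x into its phase within the stride (rem_x) and the
  // stride-aligned position (ceil_x): each work item covers block_size.x
  // columns that are stride_x apart and share the same phase.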
  c += " int rem_x = dst_x % args.stride_x;\n";
  c += " int ceil_x = dst_x / args.stride_x;\n";
  c += " dst_x = ceil_x * args.stride_x * " + std::to_string(block_size.x) +
       " + rem_x;\n";
  if (src_def.HasAxis(Axis::DEPTH)) {
    c += " int linear_id_y = GLOBAL_ID_1;\n";
    c += " int dst_y = linear_id_y % args.grid_size_y;\n";
    c += " int dst_z = linear_id_y / args.grid_size_y;\n";
    c += " int rem_z = dst_z % args.stride_z;\n";
    c += " int ceil_z = dst_z / args.stride_z;\n";
    c += " dst_z = ceil_z * args.stride_z * " + std::to_string(block_size.z) +
         " + rem_z;\n";
    c += " if (dst_z >= args.dst_tensor.Depth()) return;\n";
  } else {
    c += " int dst_y = GLOBAL_ID_1;\n";
  }
  c += " int rem_y = dst_y % args.stride_y;\n";
  c += " int ceil_y = dst_y / args.stride_y;\n";
  c += " dst_y = ceil_y * args.stride_y * " + std::to_string(block_size.y) +
       " + rem_y;\n";
  c += " int dst_s = GLOBAL_ID_2 * " + std::to_string(block_size.w) + ";\n";
  c += " if (dst_x >= args.dst_tensor.Width() || dst_y >= "
       "args.dst_tensor.Height() || dst_s >= "
       "args.dst_tensor.Slices()) return;\n";
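  // For buffer-backed weights, f_base is the offset (in FLT16 elements) of the
  // filters for this output-slice group; kernel position and input slice are
  // added to it inside the loops below.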
  if (weights_are_buffer) {
    c += " int f_base = dst_s * args.src_tensor.Slices() * args.kernel_size_x "
         "* args.kernel_size_y";
    if (src_def.HasAxis(Axis::DEPTH)) {
      c += " * args.kernel_size_z";
    }
    c += ";\n";
  }
  for (int s = 0; s < block_size.w; ++s) {
    const std::string sind = std::to_string(s);
    for (int z = 0; z < block_size.z; ++z) {
      const std::string zind = std::to_string(z);
      for (int y = 0; y < block_size.y; ++y) {
        const std::string yind = std::to_string(y);
        for (int x = 0; x < block_size.x; ++x) {
          const std::string xind = std::to_string(x);
          c += " ACCUM_FLT4 r" + generate_id_full(xind, yind, zind, sind) +
               " = INIT_ACCUM_FLT4(0.0f);\n";
        }
      }
    }
  }
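  // Iterate over the source positions whose stride-upsampled coordinates fall
  // inside the kernel window of the destination block, stepping src_* down by
  // one per iteration and accumulating with the CONV macros.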
  c += " int kernel_first_dst_x = dst_x + args.padding_x;\n";
  c += " int kernel_first_dst_y = dst_y + args.padding_y;\n";
  c += " int kernel_last_dst_x = kernel_first_dst_x - args.kernel_size_x;\n";
  c += " int kernel_last_dst_y = kernel_first_dst_y - args.kernel_size_y;\n";
  c += " int offset_x = abs(args.padding_x);\n";
  c += " int offset_x_strided = offset_x * args.stride_x;\n";
  c +=
      " int src_x = (kernel_first_dst_x + offset_x_strided) / args.stride_x - "
      "offset_x;\n";
  c += " int offset_y = abs(args.padding_y);\n";
  c += " int offset_y_strided = offset_y * args.stride_y;\n";
  c +=
      " int src_y = (kernel_first_dst_y + offset_y_strided) / args.stride_y - "
      "offset_y;\n";
  if (src_def.HasAxis(Axis::DEPTH)) {
    c += " int kernel_first_dst_z = dst_z + args.padding_z;\n";
    c += " int kernel_last_dst_z = kernel_first_dst_z - args.kernel_size_z;\n";
    c += " int offset_z = abs(args.padding_z);\n";
    c += " int offset_z_strided = offset_z * args.stride_z;\n";
    c += " int src_z = (kernel_first_dst_z + offset_z_strided) / "
         "args.stride_z - offset_z;\n";
    c += " int src_as_dst_z = src_z * args.stride_z;\n";
    c +=
        " for (;src_as_dst_z > kernel_last_dst_z; src_z -= 1, src_as_dst_z -= "
        "args.stride_z) {\n";
    for (int z = 0; z < block_size.z; ++z) {
      const std::string zindex = std::to_string(z);
      c += " int sz" + zindex + " = src_z + " + zindex + ";\n";
      if (!src_def.SupportsZeroClamp(Axis::DEPTH, gpu_info)) {
        c += " bool in_z" + zindex + " = sz" + zindex + " >= 0 && sz" +
             zindex + " < args.src_tensor.Depth();\n";
        if (!src_def.CanReadOutOfBorder(Axis::DEPTH)) {
          c += " sz" + zindex + " = clamp(sz" + zindex +
               ", 0, args.src_tensor.Depth() - 1);\n";
        }
      }
    }
    if (block_size.z == 1 &&
        !src_def.SupportsZeroClamp(Axis::DEPTH, gpu_info)) {
      c += " if (!in_z0) continue;\n";
    }
    c += " int kernel_z = kernel_first_dst_z - src_as_dst_z;\n";
    c += " int src_as_dst_y = src_y * args.stride_y;\n";
    c += " int src_y_copy = src_y;\n";
    c += " for (;src_as_dst_y > kernel_last_dst_y; src_y_copy -= 1, "
         "src_as_dst_y -= args.stride_y) {\n";
  } else {
    c += " int src_as_dst_y = src_y * args.stride_y;\n";
    c += " for (;src_as_dst_y > kernel_last_dst_y; src_y -= 1, src_as_dst_y "
         "-= args.stride_y) {\n";
  }
  for (int y = 0; y < block_size.y; ++y) {
    const std::string yindex = std::to_string(y);
    const std::string src_y =
        src_def.HasAxis(Axis::DEPTH) ? "src_y_copy" : "src_y";
    c += " int sy" + yindex + " = " + src_y + " + " + yindex + ";\n";
    if (!src_def.SupportsZeroClamp(Axis::HEIGHT, gpu_info)) {
      c += " bool in_y" + yindex + " = sy" + yindex + " >= 0 && sy" +
           yindex + " < args.src_tensor.Height();\n";
      if (!src_def.CanReadOutOfBorder(Axis::HEIGHT)) {
        c += " sy" + yindex + " = clamp(sy" + yindex +
             ", 0, args.src_tensor.Height() - 1);\n";
      }
    }
  }
  if (block_size.y == 1 && !src_def.SupportsZeroClamp(Axis::HEIGHT, gpu_info)) {
    c += " if (!in_y0) continue;\n";
  }
  c += " int kernel_y = kernel_first_dst_y - src_as_dst_y;\n";
  c += " int src_as_dst_x = src_x * args.stride_x;\n";
  c += " int src_x_copy = src_x;\n";
  c += " for (;src_as_dst_x > kernel_last_dst_x; src_x_copy -= 1, "
       "src_as_dst_x "
       "-= args.stride_x) {\n";
  for (int x = 0; x < block_size.x; ++x) {
    const std::string xindex = std::to_string(x);
    c += " int sx" + xindex + " = src_x_copy + " + xindex + ";\n";
    if (!src_def.SupportsZeroClamp(Axis::WIDTH, gpu_info)) {
      c += " bool in_x" + xindex + " = sx" + xindex + " >= 0 && sx" +
           xindex + " < args.src_tensor.Width();\n";
      if (!src_def.CanReadOutOfBorder(Axis::WIDTH)) {
        c += " sx" + xindex + " = clamp(sx" + xindex +
             ", 0, args.src_tensor.Width() - 1);\n";
      }
    }
  }
  if (block_size.x == 1 && !src_def.SupportsZeroClamp(Axis::WIDTH, gpu_info)) {
    c += " if (!in_x0) continue;\n";
  }
  for (int z = 0; z < block_size.z; ++z) {
    const std::string zind = std::to_string(z);
    for (int y = 0; y < block_size.y; ++y) {
      const std::string yind = std::to_string(y);
      for (int x = 0; x < block_size.x; ++x) {
        const std::string xind = std::to_string(x);
        const std::string id = generate_id(xind, yind, zind);
        const std::string check = generate_check(xind, yind, zind);
        std::string coords = "sx" + xind + ", sy" + yind;
        if (src_def.HasAxis(Axis::DEPTH)) {
          coords += ", sz" + zind;
        }
        if (src_def.IsLinear()) {
          c += " int addr" + id + " = args.src_tensor.GetAddress(" +
               coords + ", 0);\n";
          if (src_def.ReturnsZeroForNegOneRead(gpu_info)) {
            c += " addr" + id + " = select(-1, addr" + id + ", (" + check +
                 "));\n";
            c += " int ds" + id +
                 " = select(0, args.src_tensor.SliceStride(), (" + check +
                 "));\n";
          }
        }
      }
    }
  }
  if (src_def.IsLinear() && !src_def.ReturnsZeroForNegOneRead(gpu_info)) {
    c += " int ds = args.src_tensor.SliceStride();\n";
  }
  c += " int kernel_x = kernel_first_dst_x - src_as_dst_x;\n";
  if (src_def.HasAxis(Axis::DEPTH)) {
    c += " int kernel_index = (kernel_z * args.kernel_size_y + kernel_y) "
         "* args.kernel_size_x + kernel_x;\n";
  } else {
    c += " int kernel_index = kernel_y * args.kernel_size_x + kernel_x;\n";
  }
  if (weights_are_buffer) {
    c += " int f_offset = f_base + kernel_index * "
         "args.src_tensor.Slices() * " +
         std::to_string(block_size.w) + ";\n";
  } else {
    c += " int x_c = kernel_index * args.src_tensor.Slices();\n";
  }
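  // Innermost loop: for every input slice, read one FLT4 per block element
  // (with boundary handling chosen per storage type), fetch the weights for
  // this kernel tap, and accumulate via the CONV macros.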
  c += " for (int s = 0; s < args.src_tensor.Slices(); ++s) {\n";
  const bool conditional_read = gpu_info.IsMali();
  for (int z = 0; z < block_size.z; ++z) {
    const std::string zind = std::to_string(z);
    for (int y = 0; y < block_size.y; ++y) {
      const std::string yind = std::to_string(y);
      for (int x = 0; x < block_size.x; ++x) {
        const std::string xind = std::to_string(x);
        const std::string id = generate_id(xind, yind, zind);
        std::string address;
        if (src_def.IsLinear()) {
          address = "addr" + id;
        } else {
          address = "sx" + xind + ", sy" + yind;
          if (src_def.HasAxis(Axis::DEPTH)) {
            address += ", sz" + zind;
          }
          address += ", s";
        }
        if (src_def.ReturnsZeroForNegOneRead(gpu_info)) {
          c += " FLT4 src" + id + " = args.src_tensor.Read(" + address +
               "); " + address + " += ds" + id + ";\n";
        } else {
          const std::string check = generate_check(xind, yind, zind);
          if (!check.empty()) {
            if (conditional_read) {
              c += " FLT4 src" + id + " = " + check +
                   " ? args.src_tensor.Read(" + address +
                   ") : INIT_FLT4(0.0f);\n";
            } else {
              c += " FLT4 src" + id + " = args.src_tensor.Read(" +
                   address + ") * INIT_FLT(" + check + ");\n";
            }
          } else {
            c += " FLT4 src" + id + " = args.src_tensor.Read(" +
                 address + ");\n";
          }
          if (src_def.IsLinear()) {
            c += " addr" + id + " += ds;\n";
          }
        }
      }
    }
  }
  if (weights_are_buffer) {
    if (gpu_info.SupportsPointersInKernels()) {
      c += " __global FLT16* weights_cache = "
           "args.weights.GetPtr(f_offset);\n";
    }
  } else {
    for (int s = 0; s < block_size.w; ++s) {
      c += absl::Substitute(
          R"( FLT4 f$1 = args.weights0.Read(dst_s + $0, x_c);
 FLT4 f$2 = args.weights1.Read(dst_s + $0, x_c);
 FLT4 f$3 = args.weights2.Read(dst_s + $0, x_c);
 FLT4 f$4 = args.weights3.Read(dst_s + $0, x_c);
)",
          s, s * 4 + 0, s * 4 + 1, s * 4 + 2, s * 4 + 3);
    }
    c += " x_c++;\n";
  }
  if (weights_are_buffer && !gpu_info.SupportsPointersInKernels()) {
    c += " FLT16 flt16val;\n";
  }
  for (int s = 0; s < block_size.w; ++s) {
    if (weights_are_buffer && !gpu_info.SupportsPointersInKernels()) {
      c += " flt16val = args.weights.Read(f_offset + " +
           std::to_string(s) + ");\n";
    }
    const std::string sind = std::to_string(s);
    for (int z = 0; z < block_size.z; ++z) {
      const std::string zind = std::to_string(z);
      for (int y = 0; y < block_size.y; ++y) {
        const std::string yind = std::to_string(y);
        for (int x = 0; x < block_size.x; ++x) {
          const std::string xind = std::to_string(x);
          const std::string id = generate_id(xind, yind, zind);
          const std::string full_id = generate_id_full(xind, yind, zind, sind);
          c += " CONV" + sind + "(r" + full_id + ", src" + id + ");\n";
        }
      }
    }
  }
  if (weights_are_buffer) {
    c += " f_offset += " + std::to_string(block_size.w) + ";\n";
  }
  c += " }\n";
  c += " }\n";
  c += " }\n";
  if (src_def.HasAxis(Axis::DEPTH)) {
    c += " }\n";
  }
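  // Epilogue: add the per-slice bias and write each block element back,
  // guarding against the padded coordinates introduced by block alignment.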
  for (int s = 0; s < block_size.w; ++s) {
    const std::string sind = std::to_string(s);
    c += " if (dst_s < args.dst_tensor.Slices()) {\n";
    c += " FLT4 bias_val = args.biases.Read(dst_s);\n";
    for (int z = 0; z < block_size.z; ++z) {
      const std::string zind = std::to_string(z);
      for (int y = 0; y < block_size.y; ++y) {
        const std::string yind = std::to_string(y);
        for (int x = 0; x < block_size.x; ++x) {
          const std::string xind = std::to_string(x);
          const std::string id = generate_id_full(xind, yind, zind, sind);
          std::string checks =
              "xc < args.dst_tensor.Width() && yc < args.dst_tensor.Height()";
          std::string coords = "xc, yc";
          c += " {\n";
          c += " int xc = dst_x + args.stride_x * " + xind + ";\n";
          c += " int yc = dst_y + args.stride_y * " + yind + ";\n";
          if (src_def.HasAxis(Axis::DEPTH)) {
            c += " int zc = dst_z + args.stride_z * " + zind + ";\n";
            checks += " && zc < args.dst_tensor.Depth()";
            coords += ", zc";
          }
          c += " if (" + checks + ") {\n";
          c += " FLT4 res = TO_FLT4(r" + id + ") + bias_val;\n";
          c += " args.dst_tensor.Write(res, " + coords + ", dst_s);\n";
          c += " }\n";
          c += " }\n";
        }
      }
    }
    c += " }\n";
    c += " dst_s++;\n";
  }
  c += "}\n";
  return c;
}

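// For the 3D case GLOBAL_ID_1 packs both Y and Z; grid_size_y is the Y extent
// of that packed grid and is derived from the current destination shape at
// bind time.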
absl::Status ConvolutionTransposed::BindArguments(ArgumentsBinder* args) {
  if (definition_.src_tensors[0].HasAxis(Axis::DEPTH)) {
    const int aligned_h =
        AlignByN(dst_[0]->Height(), stride_.y * block_size_.y);
    RETURN_IF_ERROR(
        args->SetInt("grid_size_y", DivideRoundUp(aligned_h, block_size_.y)));
  }
  return absl::OkStatus();
}

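// One work item per block: X covers width (times batch), Y covers height and
// depth (packed), Z covers output slices; extents are first aligned up to
// whole stride*block units.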
int3 ConvolutionTransposed::GetGridSize() const {
  const int aligned_w = AlignByN(dst_[0]->Width(), stride_.x * block_size_.x);
  const int aligned_h = AlignByN(dst_[0]->Height(), stride_.y * block_size_.y);
  const int aligned_d = AlignByN(dst_[0]->Depth(), stride_.z * block_size_.z);
  const int grid_x = DivideRoundUp(aligned_w, block_size_.x) * dst_[0]->Batch();
  const int grid_y = DivideRoundUp(aligned_h, block_size_.y) *
                     DivideRoundUp(aligned_d, block_size_.z);
  const int grid_z = DivideRoundUp(dst_[0]->Slices(), block_size_.w);
  return int3(grid_x, grid_y, grid_z);
}

void ConvolutionTransposed::GetPossibleKernelWorkGroups(
    TuningType tuning_type, const GpuInfo& gpu_info,
    const KernelInfo& kernel_info, std::vector<int3>* work_groups) const {
  GetPossibleWorkGroupsConv(tuning_type, gpu_info, kernel_info, grid_size_,
                            work_groups);
}

ConvolutionTransposed CreateConvolutionTransposed(
    const GpuInfo& gpu_info, const OperationDef& definition,
    const ConvolutionTransposedAttributes& attr) {
  ConvolutionTransposed result(definition, attr, gpu_info);
  result.UploadWeights(attr.weights, UseBufferForWeights(gpu_info));

  TensorDescriptor bias_tensor_desc = CreateConstantLinearTensorDescriptor(
      gpu_info, definition.src_tensors[0].GetDataType(), attr.bias);
  result.args_.AddObject("biases", std::make_unique<TensorDescriptor>(
                                       std::move(bias_tensor_desc)));
  return result;
}

ConvolutionTransposed CreateConvolutionTransposed3D(
    const GpuInfo& gpu_info, const OperationDef& definition,
    const ConvolutionTransposed3DAttributes& attr) {
  ConvolutionTransposed result(definition, attr, gpu_info);
  result.UploadWeights(attr.weights, UseBufferForWeights(gpu_info));

  TensorDescriptor bias_tensor_desc = CreateConstantLinearTensorDescriptor(
      gpu_info, definition.src_tensors[0].GetDataType(), attr.bias);
  result.args_.AddObject("biases", std::make_unique<TensorDescriptor>(
                                       std::move(bias_tensor_desc)));
  return result;
}

ConvolutionTransposed CreateConvolutionTransposedDynamicWeights(
    const GpuInfo& gpu_info, const OperationDef& definition,
    const ConvolutionTransposedAttributes& attr) {
  OperationDef new_def = definition;
  new_def.src_tensors = {
      definition.src_tensors[0]};  // Keep only the src_tensor def; the weights
                                   // defs are added below.
  const DataType weights_type = definition.GetDataType();
  if (UseBufferForWeights(gpu_info)) {
    // Add 1 src_tensor (buffer) for the weights.
    new_def.src_tensors.push_back(
        {weights_type, TensorStorageType::BUFFER, Layout::HWC});
  } else {
    // Add 4 src_tensors (2D textures) for the weights.
    new_def.src_tensors.push_back(
        {weights_type, TensorStorageType::TEXTURE_2D, Layout::HW});
    new_def.src_tensors.push_back(
        {weights_type, TensorStorageType::TEXTURE_2D, Layout::HW});
    new_def.src_tensors.push_back(
        {weights_type, TensorStorageType::TEXTURE_2D, Layout::HW});
    new_def.src_tensors.push_back(
        {weights_type, TensorStorageType::TEXTURE_2D, Layout::HW});
  }
  ConvolutionTransposed result(new_def, attr, gpu_info);

  TensorDescriptor bias_tensor_desc = CreateConstantLinearTensorDescriptor(
      gpu_info, definition.src_tensors[0].GetDataType(), attr.bias);
  result.args_.AddObject("biases", std::make_unique<TensorDescriptor>(
                                       std::move(bias_tensor_desc)));
  return result;
}

}  // namespace gpu
}  // namespace tflite