/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#include "tensorflow/lite/delegates/gpu/common/tasks/convolution_transposed.h"

#include <memory>
#include <string>
#include <utility>
#include <vector>

#include "absl/strings/substitute.h"
#include "tensorflow/lite/delegates/gpu/common/shape.h"
#include "tensorflow/lite/delegates/gpu/common/status.h"
#include "tensorflow/lite/delegates/gpu/common/task/weights_layout.h"
#include "tensorflow/lite/delegates/gpu/common/task/work_group_picking.h"

namespace tflite {
namespace gpu {
namespace {
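// On Mali, Apple, and AMD GPUs the transposed-convolution weights are
// consumed from a linear buffer; other GPUs use a 2D-texture weights layout.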
bool UseBufferForWeights(const GpuInfo& gpu_info) {
  return gpu_info.IsMali() || gpu_info.IsApple() || gpu_info.IsAMD();
}
}  // namespace

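// Selects a weights layout and a per-thread block size (width, height, depth,
// output slices) based on the target GPU and calculation precision, then
// registers the stride/padding/kernel-size arguments used by the generated
// kernel.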
ConvolutionTransposed::ConvolutionTransposed(
    const OperationDef& definition, const ConvolutionTransposedAttributes& attr,
    const GpuInfo& gpu_info)
    : GPUOperation(definition),
      stride_(attr.stride.w, attr.stride.h, 1, 1),
      block_size_(2, 2, 1, 2) {
  if (UseBufferForWeights(gpu_info)) {
    if (gpu_info.IsApple()) {
      weights_layout_ = WeightsLayout::kOSpatialIOGroupO4I4;
    } else {
      weights_layout_ = WeightsLayout::kOSpatialIOGroupI4O4;
    }
  } else {
    if (gpu_info.IsApple()) {
      weights_layout_ = WeightsLayout::k2DX4O4YIsSpatialIAndXIsOOGroupI4;
    } else {
      weights_layout_ = WeightsLayout::k2DX4I4YIsSpatialIAndXIsOOGroupO4;
    }
  }
  const bool is_f16 = definition.precision == CalculationsPrecision::F16;
  if (gpu_info.IsMali()) {
    if (gpu_info.mali_info.IsMidgard()) {
      block_size_ = is_f16 ? int4(2, 1, 1, 2) : int4(2, 1, 1, 1);
    } else {
      block_size_ = is_f16 ? int4(2, 2, 1, 2) : int4(2, 2, 1, 1);
    }
    compiler_options_.push_back(CompilerOptions::kClFastRelaxedMath);
  }
  const int dst_depth = DivideRoundUp(attr.weights.shape.o, 4);
  if (dst_depth == 1 || dst_depth == 3) {
    if (!gpu_info.IsMali()) {
      block_size_.y *= block_size_.w;
    }
    block_size_.w = 1;
  }

  args_.AddInt("stride_x", stride_.x);
  args_.AddInt("stride_y", stride_.y);
  args_.AddInt("padding_x", attr.padding.prepended.w);
  args_.AddInt("padding_y", attr.padding.prepended.h);
  args_.AddInt("kernel_size_x", attr.weights.shape.w);
  args_.AddInt("kernel_size_y", attr.weights.shape.h);
  code_ = GenerateConvolutionTransposedCode(definition_, gpu_info, block_size_);
}

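// 3D variant: same layout and block-size tuning as above, plus depth
// stride/padding/kernel size and a grid_size_y argument whose value is set
// later in BindArguments().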
ConvolutionTransposed::ConvolutionTransposed(
    const OperationDef& definition,
    const ConvolutionTransposed3DAttributes& attr, const GpuInfo& gpu_info)
    : GPUOperation(definition),
      stride_(attr.stride.w, attr.stride.h, attr.stride.d, 1),
      block_size_(2, 2, 1, 2) {
  if (UseBufferForWeights(gpu_info)) {
    if (gpu_info.IsApple()) {
      weights_layout_ = WeightsLayout::kOSpatialIOGroupO4I4;
    } else {
      weights_layout_ = WeightsLayout::kOSpatialIOGroupI4O4;
    }
  } else {
    if (gpu_info.IsApple()) {
      weights_layout_ = WeightsLayout::k2DX4O4YIsSpatialIAndXIsOOGroupI4;
    } else {
      weights_layout_ = WeightsLayout::k2DX4I4YIsSpatialIAndXIsOOGroupO4;
    }
  }
  const bool is_f16 = definition.precision == CalculationsPrecision::F16;
  if (gpu_info.IsMali()) {
    if (gpu_info.mali_info.IsMidgard()) {
      block_size_ = is_f16 ? int4(2, 1, 1, 2) : int4(2, 1, 1, 1);
    } else {
      block_size_ = is_f16 ? int4(2, 2, 1, 2) : int4(2, 2, 1, 1);
    }
    compiler_options_.push_back(CompilerOptions::kClFastRelaxedMath);
  }
  const int dst_depth = DivideRoundUp(attr.weights.shape.o, 4);
  if (dst_depth == 1 || dst_depth == 3) {
    if (!gpu_info.IsMali()) {
      block_size_.y *= block_size_.w;
    }
    block_size_.w = 1;
  }

  args_.AddInt("stride_x", stride_.x);
  args_.AddInt("stride_y", stride_.y);
  args_.AddInt("stride_z", stride_.z);
  args_.AddInt("padding_x", attr.padding.prepended.w);
  args_.AddInt("padding_y", attr.padding.prepended.h);
  args_.AddInt("padding_z", attr.padding.prepended.d);
  args_.AddInt("kernel_size_x", attr.weights.shape.w);
  args_.AddInt("kernel_size_y", attr.weights.shape.h);
  args_.AddInt("kernel_size_z", attr.weights.shape.d);
  args_.AddInt("grid_size_y");
  code_ = GenerateConvolutionTransposedCode(definition_, gpu_info, block_size_);
}

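// Emits the kernel source. Each work item computes a block of
// block_size.x * block_size.y * block_size.z destination pixels for
// block_size.w consecutive destination slices; the MAIN_FUNCTION, GLOBAL_ID,
// and FLT macros are resolved per backend.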
std::string ConvolutionTransposed::GenerateConvolutionTransposedCode(
    const OperationDef& op_def, const GpuInfo& gpu_info,
    const int4& block_size) {
  AddSrcTensor("src_tensor", op_def.src_tensors[0]);
  AddDstTensor("dst_tensor", op_def.dst_tensors[0]);

  if (op_def.src_tensors.size() != 1) {
    // dynamic weights
    if (weights_layout_ == WeightsLayout::kOSpatialIOGroupI4O4 ||
        weights_layout_ == WeightsLayout::kOSpatialIOGroupO4I4) {
      BufferDescriptor desc;
      desc.element_type = op_def.src_tensors[1].GetDataType();
      desc.element_size = 16;
      desc.memory_type = MemoryType::GLOBAL;
      AddSrcBuffer("weights", desc);
    } else {
      for (int i = 0; i < 4; ++i) {
        const std::string name = "weights" + std::to_string(i);
        AddSrcTensor(name, definition_.src_tensors[1 + i]);
      }
    }
  }

  const auto& src_def = op_def.src_tensors[0];

  std::string c;

  const bool weights_are_buffer = UseBufferForWeights(gpu_info);
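  // Define one CONV<s> macro per output slice in the block. Each macro
  // accumulates one source FLT4 against four weight vectors: multiply-adds
  // per input channel for I4O4 layouts, dot products per output channel for
  // O4I4.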
  for (int s = 0; s < block_size.w; ++s) {
    std::string f0, f1, f2, f3;
    if (weights_are_buffer) {
      if (gpu_info.SupportsPointersInKernels()) {
        f0 = "FLT16_0123(weights_cache[" + std::to_string(s) + "])";
        f1 = "FLT16_4567(weights_cache[" + std::to_string(s) + "])";
        f2 = "FLT16_89ab(weights_cache[" + std::to_string(s) + "])";
        f3 = "FLT16_cdef(weights_cache[" + std::to_string(s) + "])";
      } else {
        f0 = "FLT16_0123(flt16val)";
        f1 = "FLT16_4567(flt16val)";
        f2 = "FLT16_89ab(flt16val)";
        f3 = "FLT16_cdef(flt16val)";
      }
    } else {
      f0 = "f" + std::to_string(s * 4 + 0);
      f1 = "f" + std::to_string(s * 4 + 1);
      f2 = "f" + std::to_string(s * 4 + 2);
      f3 = "f" + std::to_string(s * 4 + 3);
    }
    bool use_fma = gpu_info.IsAMD() && gpu_info.IsApiOpenCl();
    if (GetWeightsDescription().IsI4O4()) {
      switch (op_def.precision) {
        case CalculationsPrecision::F32:
        case CalculationsPrecision::F16:
          if (use_fma) {
            c += "#define CONV" + std::to_string(s) + "(R, S)    \\\n";
            c += "R = fma(" + f0 + ", S.x, R); \\\n";
            c += "R = fma(" + f1 + ", S.y, R); \\\n";
            c += "R = fma(" + f2 + ", S.z, R); \\\n";
            c += "R = fma(" + f3 + ", S.w, R);   \n";
          } else {
            c += "#define CONV" + std::to_string(s) + "(R, S)    \\\n";
            c += "R += S.x * " + f0 + "; \\\n";
            c += "R += S.y * " + f1 + "; \\\n";
            c += "R += S.z * " + f2 + "; \\\n";
            c += "R += S.w * " + f3 + ";   \n";
          }
          break;
        case CalculationsPrecision::F32_F16:
          c += "#define CONV" + std::to_string(s) + "(R, S) \\\n";
          c += "R += TO_ACCUM_TYPE(S.x * " + f0 + " + S.y * " + f1 +
               " + S.z * " + f2 + " + S.w * " + f3 + ");\n";
          break;
      }
    } else {
      // O4I4
      c += "#define CONV" + std::to_string(s) + "(R, S)    \\\n";
      c += "R.x += dot(S, " + f0 + "); \\\n";
      c += "R.y += dot(S, " + f1 + "); \\\n";
      c += "R.z += dot(S, " + f2 + "); \\\n";
      c += "R.w += dot(S, " + f3 + ");   \n";
    }
  }

  auto generate_id = [&](const std::string& x, const std::string& y,
                         const std::string& z) {
    std::string id;
    if (src_def.HasAxis(Axis::WIDTH)) {
      id += "_w" + x;
    }
    if (src_def.HasAxis(Axis::HEIGHT)) {
      id += "_h" + y;
    }
    if (src_def.HasAxis(Axis::DEPTH)) {
      id += "_d" + z;
    }
    return id;
  };

  auto generate_id_full = [&](const std::string& x, const std::string& y,
                              const std::string& z, const std::string& s) {
    return generate_id(x, y, z) + "_s" + s;
  };

  auto generate_check = [&](const std::string& x, const std::string& y,
                            const std::string& z) {
    std::string check;
    const std::vector<Axis> axes{Axis::WIDTH, Axis::HEIGHT, Axis::DEPTH};
    const std::vector<std::string> names{"in_x", "in_y", "in_z"};
    const std::vector<std::string> coords{x, y, z};
    for (int i = 0; i < axes.size(); ++i) {
      const auto& axis = axes[i];
      if (src_def.HasAxis(axis) && !src_def.SupportsZeroClamp(axis, gpu_info) &&
          block_size[i] != 1) {
        if (!check.empty()) {
          check += " && ";
        }
        check += names[i] + coords[i];
      }
    }
    return check;
  };

  switch (op_def.precision) {
    case CalculationsPrecision::F32:
      c += "#define FLT16 float16\n";
      break;
    case CalculationsPrecision::F32_F16:
    case CalculationsPrecision::F16:
      c += "#define FLT16 half16\n";
      break;
  }

  c += "MAIN_FUNCTION($0) {\n";
  if (op_def.IsBatchSupported()) {
    c += "  int linear_id = GLOBAL_ID_0;\n";
    c += "  int dst_x = (linear_id / args.dst_tensor.Batch());\n";
    c += "  int B = linear_id % args.dst_tensor.Batch();\n";
    c += "  args.dst_tensor.SetBatchRef(B);\n";
    c += "  args.src_tensor.SetBatchRef(B);\n";
  } else {
    c += "  int dst_x = GLOBAL_ID_0;\n";
  }
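  // Remap dst_x so that each work item handles block_size.x output columns
  // sharing the same phase within the stride (dst_x % stride_x); the same
  // remapping is applied to y and, when a depth axis exists, to z.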
  c += "  int rem_x = dst_x % args.stride_x;\n";
  c += "  int ceil_x = dst_x / args.stride_x;\n";
  c += "  dst_x = ceil_x * args.stride_x * " + std::to_string(block_size.x) +
       " + rem_x;\n";
  if (src_def.HasAxis(Axis::DEPTH)) {
    c += "  int linear_id_y = GLOBAL_ID_1;\n";
    c += "  int dst_y = linear_id_y % args.grid_size_y;\n";
    c += "  int dst_z = linear_id_y / args.grid_size_y;\n";
    c += "  int rem_z = dst_z % args.stride_z;\n";
    c += "  int ceil_z = dst_z / args.stride_z;\n";
    c += "  dst_z = ceil_z * args.stride_z * " + std::to_string(block_size.z) +
         " + rem_z;\n";
    c += "  if (dst_z >= args.dst_tensor.Depth()) return;\n";
  } else {
    c += "  int dst_y = GLOBAL_ID_1;\n";
  }
  c += "  int rem_y = dst_y % args.stride_y;\n";
  c += "  int ceil_y = dst_y / args.stride_y;\n";
  c += "  dst_y = ceil_y * args.stride_y * " + std::to_string(block_size.y) +
       " + rem_y;\n";
  c += "  int dst_s = GLOBAL_ID_2 * " + std::to_string(block_size.w) + ";\n";
  c += "  if (dst_x >= args.dst_tensor.Width() || dst_y >= "
       "args.dst_tensor.Height() || dst_s >= "
       "args.dst_tensor.Slices()) return;\n";
  if (weights_are_buffer) {
    c += "  int f_base = dst_s * args.src_tensor.Slices() * args.kernel_size_x "
         "* args.kernel_size_y";
    if (src_def.HasAxis(Axis::DEPTH)) {
      c += " * args.kernel_size_z";
    }
    c += ";\n";
  }
  for (int s = 0; s < block_size.w; ++s) {
    const std::string sind = std::to_string(s);
    for (int z = 0; z < block_size.z; ++z) {
      const std::string zind = std::to_string(z);
      for (int y = 0; y < block_size.y; ++y) {
        const std::string yind = std::to_string(y);
        for (int x = 0; x < block_size.x; ++x) {
          const std::string xind = std::to_string(x);
          c += "  ACCUM_FLT4 r" + generate_id_full(xind, yind, zind, sind) +
               " = INIT_ACCUM_FLT4(0.0f);\n";
        }
      }
    }
  }
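  // Determine the range of source pixels whose kernel footprint overlaps this
  // destination block; the loops below walk those source positions from the
  // last contributing one backwards, per axis.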
  c += "  int kernel_first_dst_x = dst_x + args.padding_x;\n";
  c += "  int kernel_first_dst_y = dst_y + args.padding_y;\n";
  c += "  int kernel_last_dst_x = kernel_first_dst_x - args.kernel_size_x;\n";
  c += "  int kernel_last_dst_y = kernel_first_dst_y - args.kernel_size_y;\n";
  c += "  int offset_x = abs(args.padding_x);\n";
  c += "  int offset_x_strided = offset_x * args.stride_x;\n";
  c +=
      "  int src_x = (kernel_first_dst_x + offset_x_strided) / args.stride_x - "
      "offset_x;\n";
  c += "  int offset_y = abs(args.padding_y);\n";
  c += "  int offset_y_strided = offset_y * args.stride_y;\n";
  c +=
      "  int src_y = (kernel_first_dst_y + offset_y_strided) / args.stride_y - "
      "offset_y;\n";
  if (src_def.HasAxis(Axis::DEPTH)) {
    c += "  int kernel_first_dst_z = dst_z + args.padding_z;\n";
    c += "  int kernel_last_dst_z = kernel_first_dst_z - args.kernel_size_z;\n";
    c += "  int offset_z = abs(args.padding_z);\n";
    c += "  int offset_z_strided = offset_z * args.stride_z;\n";
    c += "  int src_z = (kernel_first_dst_z + offset_z_strided) / "
         "args.stride_z - offset_z;\n";
    c += "  int src_as_dst_z = src_z * args.stride_z;\n";
    c +=
        "  for (;src_as_dst_z > kernel_last_dst_z; src_z -= 1, src_as_dst_z -= "
        "args.stride_z) {\n";
    for (int z = 0; z < block_size.z; ++z) {
      const std::string zindex = std::to_string(z);
      c += "    int sz" + zindex + " = src_z + " + zindex + ";\n";
      if (!src_def.SupportsZeroClamp(Axis::DEPTH, gpu_info)) {
        c += "    bool in_z" + zindex + " = sz" + zindex + " >= 0 && sz" +
             zindex + " < args.src_tensor.Depth();\n";
        if (!src_def.CanReadOutOfBorder(Axis::DEPTH)) {
          c += "    sz" + zindex + " = clamp(sz" + zindex +
               ", 0, args.src_tensor.Depth() - 1);\n";
        }
      }
    }
    if (block_size.z == 1 &&
        !src_def.SupportsZeroClamp(Axis::DEPTH, gpu_info)) {
      c += "    if (!in_z0) continue;\n";
    }
    c += "    int kernel_z = kernel_first_dst_z - src_as_dst_z;\n";
    c += "    int src_as_dst_y = src_y * args.stride_y;\n";
    c += "    int src_y_copy = src_y;\n";
    c += "    for (;src_as_dst_y > kernel_last_dst_y; src_y_copy -= 1, "
         "src_as_dst_y -= args.stride_y) {\n";
  } else {
    c += "  int src_as_dst_y = src_y * args.stride_y;\n";
    c += "  for (;src_as_dst_y > kernel_last_dst_y; src_y -= 1, src_as_dst_y "
         "-= args.stride_y) {\n";
  }
  for (int y = 0; y < block_size.y; ++y) {
    const std::string yindex = std::to_string(y);
    const std::string src_y =
        src_def.HasAxis(Axis::DEPTH) ? "src_y_copy" : "src_y";
    c += "    int sy" + yindex + " = " + src_y + " + " + yindex + ";\n";
    if (!src_def.SupportsZeroClamp(Axis::HEIGHT, gpu_info)) {
      c += "    bool in_y" + yindex + " = sy" + yindex + " >= 0 && sy" +
           yindex + " < args.src_tensor.Height();\n";
      if (!src_def.CanReadOutOfBorder(Axis::HEIGHT)) {
        c += "    sy" + yindex + " = clamp(sy" + yindex +
             ", 0, args.src_tensor.Height() - 1);\n";
      }
    }
  }
  if (block_size.y == 1 && !src_def.SupportsZeroClamp(Axis::HEIGHT, gpu_info)) {
    c += "      if (!in_y0) continue;\n";
  }
  c += "    int kernel_y = kernel_first_dst_y - src_as_dst_y;\n";
  c += "    int src_as_dst_x = src_x * args.stride_x;\n";
  c += "    int src_x_copy = src_x;\n";
  c += "    for (;src_as_dst_x > kernel_last_dst_x; src_x_copy -= 1, "
       "src_as_dst_x "
       "-= args.stride_x) {\n";
  for (int x = 0; x < block_size.x; ++x) {
    const std::string xindex = std::to_string(x);
    c += "      int sx" + xindex + " = src_x_copy + " + xindex + ";\n";
    if (!src_def.SupportsZeroClamp(Axis::WIDTH, gpu_info)) {
      c += "      bool in_x" + xindex + " = sx" + xindex + " >= 0 && sx" +
           xindex + " < args.src_tensor.Width();\n";
      if (!src_def.CanReadOutOfBorder(Axis::WIDTH)) {
        c += "      sx" + xindex + " = clamp(sx" + xindex +
             ", 0, args.src_tensor.Width() - 1);\n";
      }
    }
  }
  if (block_size.x == 1 && !src_def.SupportsZeroClamp(Axis::WIDTH, gpu_info)) {
    c += "      if (!in_x0) continue;\n";
  }
  for (int z = 0; z < block_size.z; ++z) {
    const std::string zind = std::to_string(z);
    for (int y = 0; y < block_size.y; ++y) {
      const std::string yind = std::to_string(y);
      for (int x = 0; x < block_size.x; ++x) {
        const std::string xind = std::to_string(x);
        const std::string id = generate_id(xind, yind, zind);
        const std::string check = generate_check(xind, yind, zind);
        std::string coords = "sx" + xind + ", sy" + yind;
        if (src_def.HasAxis(Axis::DEPTH)) {
          coords += ", sz" + zind;
        }
        if (src_def.IsLinear()) {
          c += "      int addr" + id + " = args.src_tensor.GetAddress(" +
               coords + ", 0);\n";
          if (src_def.ReturnsZeroForNegOneRead(gpu_info)) {
            c += "      addr" + id + " = select(-1, addr" + id + ", (" + check +
                 "));\n";
            c += "      int ds" + id +
                 " = select(0, args.src_tensor.SliceStride(), (" + check +
                 "));\n";
          }
        }
      }
    }
  }
  if (src_def.IsLinear() && !src_def.ReturnsZeroForNegOneRead(gpu_info)) {
    c += "      int ds = args.src_tensor.SliceStride();\n";
  }
  c += "      int kernel_x = kernel_first_dst_x - src_as_dst_x;\n";
  if (src_def.HasAxis(Axis::DEPTH)) {
    c += "      int kernel_index = (kernel_z * args.kernel_size_y + kernel_y) "
         "*  args.kernel_size_x + kernel_x;\n";
  } else {
    c += "      int kernel_index = kernel_y * args.kernel_size_x + kernel_x;\n";
  }
  if (weights_are_buffer) {
    c += "      int f_offset = f_base + kernel_index * "
         "args.src_tensor.Slices() * " +
         std::to_string(block_size.w) + ";\n";
  } else {
    c += "      int x_c = kernel_index * args.src_tensor.Slices();\n";
  }
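  // Innermost loop over source slices: read the source block (with optional
  // boundary handling), fetch the matching weights from the buffer or the
  // four weight textures, and apply the CONV macros to the accumulators.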
  c += "      for (int s = 0; s < args.src_tensor.Slices(); ++s) {\n";
  const bool conditional_read = gpu_info.IsMali();
  for (int z = 0; z < block_size.z; ++z) {
    const std::string zind = std::to_string(z);
    for (int y = 0; y < block_size.y; ++y) {
      const std::string yind = std::to_string(y);
      for (int x = 0; x < block_size.x; ++x) {
        const std::string xind = std::to_string(x);
        const std::string id = generate_id(xind, yind, zind);
        std::string address;
        if (src_def.IsLinear()) {
          address = "addr" + id;
        } else {
          address = "sx" + xind + ", sy" + yind;
          if (src_def.HasAxis(Axis::DEPTH)) {
            address += ", sz" + zind;
          }
          address += ", s";
        }
        if (src_def.ReturnsZeroForNegOneRead(gpu_info)) {
          c += "        FLT4 src" + id + " = args.src_tensor.Read(" + address +
               "); " + address + " += ds" + id + ";\n";
        } else {
          const std::string check = generate_check(xind, yind, zind);
          if (!check.empty()) {
            if (conditional_read) {
              c += "        FLT4 src" + id + " = " + check +
                   " ? args.src_tensor.Read(" + address +
                   ") : INIT_FLT4(0.0f);\n";
            } else {
              c += "        FLT4 src" + id + " = args.src_tensor.Read(" +
                   address + ") * INIT_FLT(" + check + ");\n";
            }
          } else {
            c += "        FLT4 src" + id + " = args.src_tensor.Read(" +
                 address + ");\n";
          }
          if (src_def.IsLinear()) {
            c += "        addr" + id + " += ds;\n";
          }
        }
      }
    }
  }
  if (weights_are_buffer) {
    if (gpu_info.SupportsPointersInKernels()) {
      c += "        __global FLT16* weights_cache = "
           "args.weights.GetPtr(f_offset);\n";
    }
  } else {
    for (int s = 0; s < block_size.w; ++s) {
      c += absl::Substitute(
          R"(        FLT4 f$1 = args.weights0.Read(dst_s + $0, x_c);
        FLT4 f$2 = args.weights1.Read(dst_s + $0, x_c);
        FLT4 f$3 = args.weights2.Read(dst_s + $0, x_c);
        FLT4 f$4 = args.weights3.Read(dst_s + $0, x_c);
)",
          s, s * 4 + 0, s * 4 + 1, s * 4 + 2, s * 4 + 3);
    }
    c += "        x_c++;\n";
  }
  if (weights_are_buffer && !gpu_info.SupportsPointersInKernels()) {
    c += "      FLT16 flt16val;\n";
  }
  for (int s = 0; s < block_size.w; ++s) {
    if (weights_are_buffer && !gpu_info.SupportsPointersInKernels()) {
      c += "        flt16val = args.weights.Read(f_offset + " +
           std::to_string(s) + ");\n";
    }
    const std::string sind = std::to_string(s);
    for (int z = 0; z < block_size.z; ++z) {
      const std::string zind = std::to_string(z);
      for (int y = 0; y < block_size.y; ++y) {
        const std::string yind = std::to_string(y);
        for (int x = 0; x < block_size.x; ++x) {
          const std::string xind = std::to_string(x);
          const std::string id = generate_id(xind, yind, zind);
          const std::string full_id = generate_id_full(xind, yind, zind, sind);
          c += "        CONV" + sind + "(r" + full_id + ", src" + id + ");\n";
        }
      }
    }
  }
  if (weights_are_buffer) {
    c += "        f_offset += " + std::to_string(block_size.w) + ";\n";
  }
  c += "      }\n";
  c += "    }\n";
  c += "  }\n";
  if (src_def.HasAxis(Axis::DEPTH)) {
    c += "  }\n";
  }
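  // Epilogue: add the bias and write each accumulator back, skipping block
  // elements that fall outside the destination tensor.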
  for (int s = 0; s < block_size.w; ++s) {
    const std::string sind = std::to_string(s);
    c += "  if (dst_s < args.dst_tensor.Slices()) {\n";
    c += "    FLT4 bias_val = args.biases.Read(dst_s);\n";
    for (int z = 0; z < block_size.z; ++z) {
      const std::string zind = std::to_string(z);
      for (int y = 0; y < block_size.y; ++y) {
        const std::string yind = std::to_string(y);
        for (int x = 0; x < block_size.x; ++x) {
          const std::string xind = std::to_string(x);
          const std::string id = generate_id_full(xind, yind, zind, sind);
          std::string checks =
              "xc < args.dst_tensor.Width() && yc < args.dst_tensor.Height()";
          std::string coords = "xc, yc";
          c += "    {\n";
          c += "      int xc = dst_x + args.stride_x * " + xind + ";\n";
          c += "      int yc = dst_y + args.stride_y * " + yind + ";\n";
          if (src_def.HasAxis(Axis::DEPTH)) {
            c += "      int zc = dst_z + args.stride_z * " + zind + ";\n";
            checks += " && zc < args.dst_tensor.Depth()";
            coords += ", zc";
          }
          c += "      if (" + checks + ") {\n";
          c += "        FLT4 res = TO_FLT4(r" + id + ") + bias_val;\n";
          c += "        args.dst_tensor.Write(res, " + coords + ", dst_s);\n";
          c += "      }\n";
          c += "    }\n";
        }
      }
    }
    c += "  }\n";
    c += "  dst_s++;\n";
  }
  c += "}\n";
  return c;
}

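// For the depth case, GLOBAL_ID_1 is split into y and z inside the kernel, so
// grid_size_y (the number of aligned y blocks) must be supplied at
// argument-binding time.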
absl::Status ConvolutionTransposed::BindArguments(ArgumentsBinder* args) {
  if (definition_.src_tensors[0].HasAxis(Axis::DEPTH)) {
    const int aligned_h =
        AlignByN(dst_[0]->Height(), stride_.y * block_size_.y);
    RETURN_IF_ERROR(
        args->SetInt("grid_size_y", DivideRoundUp(aligned_h, block_size_.y)));
  }
  return absl::OkStatus();
}

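// Grid: X covers width blocks times batch, Y covers height blocks times depth
// blocks, Z covers slice blocks; each spatial extent is first aligned to
// stride * block_size.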
int3 ConvolutionTransposed::GetGridSize() const {
  const int aligned_w = AlignByN(dst_[0]->Width(), stride_.x * block_size_.x);
  const int aligned_h = AlignByN(dst_[0]->Height(), stride_.y * block_size_.y);
  const int aligned_d = AlignByN(dst_[0]->Depth(), stride_.z * block_size_.z);
  const int grid_x = DivideRoundUp(aligned_w, block_size_.x) * dst_[0]->Batch();
  const int grid_y = DivideRoundUp(aligned_h, block_size_.y) *
                     DivideRoundUp(aligned_d, block_size_.z);
  const int grid_z = DivideRoundUp(dst_[0]->Slices(), block_size_.w);
  return int3(grid_x, grid_y, grid_z);
}

void ConvolutionTransposed::GetPossibleKernelWorkGroups(
    TuningType tuning_type, const GpuInfo& gpu_info,
    const KernelInfo& kernel_info, std::vector<int3>* work_groups) const {
  GetPossibleWorkGroupsConv(tuning_type, gpu_info, kernel_info, grid_size_,
                            work_groups);
}

ConvolutionTransposed CreateConvolutionTransposed(
    const GpuInfo& gpu_info, const OperationDef& definition,
    const ConvolutionTransposedAttributes& attr) {
  ConvolutionTransposed result(definition, attr, gpu_info);
  result.UploadWeights(attr.weights, UseBufferForWeights(gpu_info));

  TensorDescriptor bias_tensor_desc = CreateConstantLinearTensorDescriptor(
      gpu_info, definition.src_tensors[0].GetDataType(), attr.bias);
  result.args_.AddObject("biases", std::make_unique<TensorDescriptor>(
                                       std::move(bias_tensor_desc)));
  return result;
}

ConvolutionTransposed CreateConvolutionTransposed3D(
    const GpuInfo& gpu_info, const OperationDef& definition,
    const ConvolutionTransposed3DAttributes& attr) {
  ConvolutionTransposed result(definition, attr, gpu_info);
  result.UploadWeights(attr.weights, UseBufferForWeights(gpu_info));

  TensorDescriptor bias_tensor_desc = CreateConstantLinearTensorDescriptor(
      gpu_info, definition.src_tensors[0].GetDataType(), attr.bias);
  result.args_.AddObject("biases", std::make_unique<TensorDescriptor>(
                                       std::move(bias_tensor_desc)));
  return result;
}

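// Runtime (non-constant) weights arrive as extra source tensors: a single
// linear buffer on GPUs that prefer buffers, otherwise four 2D textures.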
ConvolutionTransposed CreateConvolutionTransposedDynamicWeights(
    const GpuInfo& gpu_info, const OperationDef& definition,
    const ConvolutionTransposedAttributes& attr) {
  OperationDef new_def = definition;
  new_def.src_tensors = {
      definition.src_tensors[0]};  // Keep only the src_tensor def; the weights
                                   // defs are added below.
  const DataType weights_type = definition.GetDataType();
  if (UseBufferForWeights(gpu_info)) {
    // Add one buffer src_tensor for the weights.
    new_def.src_tensors.push_back(
        {weights_type, TensorStorageType::BUFFER, Layout::HWC});
  } else {
    // Add four 2D-texture src_tensors for the weights.
    new_def.src_tensors.push_back(
        {weights_type, TensorStorageType::TEXTURE_2D, Layout::HW});
    new_def.src_tensors.push_back(
        {weights_type, TensorStorageType::TEXTURE_2D, Layout::HW});
    new_def.src_tensors.push_back(
        {weights_type, TensorStorageType::TEXTURE_2D, Layout::HW});
    new_def.src_tensors.push_back(
        {weights_type, TensorStorageType::TEXTURE_2D, Layout::HW});
  }
  ConvolutionTransposed result(new_def, attr, gpu_info);

  TensorDescriptor bias_tensor_desc = CreateConstantLinearTensorDescriptor(
      gpu_info, definition.src_tensors[0].GetDataType(), attr.bias);
  result.args_.AddObject("biases", std::make_unique<TensorDescriptor>(
                                       std::move(bias_tensor_desc)));
  return result;
}

}  // namespace gpu
}  // namespace tflite