/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#include "tensorflow/lite/delegates/gpu/common/tasks/convolution_transposed_4x4.h"

#include <memory>
#include <string>
#include <utility>
#include <vector>

#include "tensorflow/lite/delegates/gpu/common/task/work_group_picking.h"

namespace tflite {
namespace gpu {

namespace {
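// Picks the weights upload strategy that tends to work best for the given
// GPU family: async local-memory copies on PowerVR, cooperative loads by the
// work group on Nvidia, Intel, and pre-Bionic Apple GPUs, constant memory on
// AMD, and plain global memory everywhere else.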
ConvolutionTransposed4x4::WeightsUploadType GetBestWeightsUploadType(
    const GpuInfo& gpu_info) {
  ConvolutionTransposed4x4::WeightsUploadType weights_upload_type =
      ConvolutionTransposed4x4::WeightsUploadType::GLOBAL_MEM;
  if (gpu_info.IsApple()) {
    if (gpu_info.apple_info.IsBionic()) {
      weights_upload_type =
          ConvolutionTransposed4x4::WeightsUploadType::GLOBAL_MEM;
    } else {
      weights_upload_type =
          ConvolutionTransposed4x4::WeightsUploadType::LOCAL_MEM_BY_THREADS;
    }
  } else if (gpu_info.IsPowerVR()) {
    weights_upload_type =
        ConvolutionTransposed4x4::WeightsUploadType::LOCAL_MEM_ASYNC;
  } else if (gpu_info.IsNvidia() || gpu_info.IsIntel()) {
    weights_upload_type =
        ConvolutionTransposed4x4::WeightsUploadType::LOCAL_MEM_BY_THREADS;
  } else if (gpu_info.IsAMD()) {
    weights_upload_type =
        ConvolutionTransposed4x4::WeightsUploadType::CONSTANT_MEM;
  } else {
    weights_upload_type =
        ConvolutionTransposed4x4::WeightsUploadType::GLOBAL_MEM;
  }
  return weights_upload_type;
}
}  // namespace

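// Configures the operation: an 8x4x1 work group (launched in a remapped order
// on Apple GPUs), the weights layout the generated kernel expects, and the
// kernel source itself. F16 on PowerVR additionally opts into fast relaxed
// math.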
ConvolutionTransposed4x4::ConvolutionTransposed4x4(
    const OperationDef& definition, const GpuInfo& gpu_info)
    : GPUOperation(definition) {
  work_group_size_ = int3(8, 4, 1);
  if (gpu_info.IsApple()) {
    work_group_launch_order_ = int3(2, 0, 1);
  }

  if (gpu_info.IsApple()) {
    weights_layout_ = WeightsLayout::kOICustomSpatialO4I4;
  } else {
    weights_layout_ = WeightsLayout::kOICustomSpatialI4O4;
  }

  code_ = GenerateConvolutionTransposedCode(gpu_info, definition_,
                                            GetBestWeightsUploadType(gpu_info));
  if (definition_.precision == CalculationsPrecision::F16 &&
      gpu_info.IsPowerVR()) {
    compiler_options_.push_back(CompilerOptions::kClFastRelaxedMath);
  }
}

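// Emits the kernel source. Every work item accumulates a 2x2 patch of output
// pixels for one destination slice, consuming a 2x2 window of source pixels
// and 64 FLT4 weight values per source slice.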
std::string ConvolutionTransposed4x4::GenerateConvolutionTransposedCode(
    const GpuInfo& gpu_info, const OperationDef& op_def,
    WeightsUploadType weights_upload_type) {
  auto src_desc = op_def.src_tensors[0];
  AddSrcTensor("src_tensor", src_desc);
  AddDstTensor("dst_tensor", op_def.dst_tensors[0]);

  if (op_def.src_tensors.size() == 2) {
    // dynamic weights
    BufferDescriptor desc;
    desc.element_type = op_def.src_tensors[1].GetDataType();
    desc.element_size = 4;
    desc.memory_type =
        weights_upload_type ==
                ConvolutionTransposed4x4::WeightsUploadType::CONSTANT_MEM
            ? MemoryType::CONSTANT
            : MemoryType::GLOBAL;
    AddSrcBuffer("weights", desc);
  }

  args_.AddInt("filter_offset");

  const bool need_local_mem =
      weights_upload_type ==
          ConvolutionTransposed4x4::WeightsUploadType::LOCAL_MEM_BY_THREADS ||
      weights_upload_type ==
          ConvolutionTransposed4x4::WeightsUploadType::LOCAL_MEM_ASYNC;

  const int wg_total_size =
      work_group_size_.x * work_group_size_.y * work_group_size_.z;
  const std::string barrier =
      wg_total_size == 32 && gpu_info.IsWaveSizeEqualTo32()
          ? "SIMD_LOCAL_MEM_BARRIER"
          : "LOCAL_MEM_BARRIER";

  std::string c;
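  // CONV(R, SRC, F) accumulates one FLT4 result from a FLT4 source value and
  // four consecutive cached weight vectors. With the I4O4 layout each weight
  // vector holds four output channels of one input channel, so the source
  // channels scale the weight vectors; with O4I4 each weight vector holds
  // four input channels of one output channel, so dot products are used. In
  // F32_F16 mode the per-tap sum is formed in FLT precision and only then
  // widened to the accumulator type.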
  if (GetWeightsDescription().IsI4O4()) {
    switch (op_def.precision) {
      case CalculationsPrecision::F32:
      case CalculationsPrecision::F16:
        c += "#define CONV(R, SRC, F) \\\n";
        c += "  R += SRC.x * weights_cache[F]; \\\n";
        c += "  R += SRC.y * weights_cache[F + 1]; \\\n";
        c += "  R += SRC.z * weights_cache[F + 2]; \\\n";
        c += "  R += SRC.w * weights_cache[F + 3];   \n";
        break;
      case CalculationsPrecision::F32_F16:
        c += "#define CONV(R, SRC, F) \\\n";
        c += "  R += TO_ACCUM_TYPE(SRC.x * weights_cache[F] + SRC.y * "
             "weights_cache[F + 1] + SRC.z * weights_cache[F + 2] + SRC.w * "
             "weights_cache[F + 3]);\n";
        break;
    }
  } else {
    // O4I4
    c += "#define CONV(R, SRC, F) \\\n";
    c += "  R.x += dot(SRC, weights_cache[F]); \\\n";
    c += "  R.y += dot(SRC, weights_cache[F + 1]); \\\n";
    c += "  R.z += dot(SRC, weights_cache[F + 2]); \\\n";
    c += "  R.w += dot(SRC, weights_cache[F + 3]);   \n";
  }

  const std::string weights_space =
      weights_upload_type ==
              ConvolutionTransposed4x4::WeightsUploadType::CONSTANT_MEM
          ? "__constant"
          : "__global";

  if (gpu_info.IsApiOpenCl()) {
    c += "__attribute__((reqd_work_group_size(8, 4, 1)))\n";
  }
  c += "MAIN_FUNCTION($0) {\n";
  std::string grid_coords[3];
  int3 launch_remap;
  launch_remap[work_group_launch_order_.x] = 0;
  launch_remap[work_group_launch_order_.y] = 1;
  launch_remap[work_group_launch_order_.z] = 2;
  if (work_group_launch_order_[0] == 0) {
    grid_coords[0] = "GLOBAL_ID_0";
  } else {
    grid_coords[0] = "(GROUP_ID_" + std::to_string(launch_remap[0]) +
                     " * GROUP_SIZE_0 + LOCAL_ID_0);\n";
  }
  if (work_group_launch_order_[1] == 1) {
    grid_coords[1] = "GLOBAL_ID_1";
  } else {
    grid_coords[1] = "(GROUP_ID_" + std::to_string(launch_remap[1]) +
                     " * GROUP_SIZE_1 + LOCAL_ID_1);\n";
  }
  if (work_group_launch_order_[2] == 2) {
    grid_coords[2] = "GLOBAL_ID_2";
  } else {
    grid_coords[2] = "(GROUP_ID_" + std::to_string(launch_remap[2]) +
                     " * GROUP_SIZE_2 + LOCAL_ID_2);\n";
  }
  if (op_def.dst_tensors[0].HasAxis(Axis::BATCH)) {
    c += "  int linear_id = " + grid_coords[0] + ";\n";
    c += "  int X = linear_id / args.dst_tensor.Batch();\n";
    c += "  int B = linear_id % args.dst_tensor.Batch();\n";
    c += "  args.src_tensor.SetBatchRef(B);\n";
    c += "  args.dst_tensor.SetBatchRef(B);\n";
  } else {
    c += "  int X = " + grid_coords[0] + ";\n";
  }
  c += "  int Y = " + grid_coords[1] + ";\n";
  c += "  int Z = " + grid_coords[2] + ";\n";
  if (!need_local_mem) {
    c += "  if (X * 2 > args.dst_tensor.Width() || Y * 2 > "
         "args.dst_tensor.Height() || Z >= args.dst_tensor.Slices()) "
         "return;\n";
  }
  c += "  ACCUM_FLT4 r0 = INIT_ACCUM_FLT4(0.0f);\n";
  c += "  ACCUM_FLT4 r1 = INIT_ACCUM_FLT4(0.0f);\n";
  c += "  ACCUM_FLT4 r2 = INIT_ACCUM_FLT4(0.0f);\n";
  c += "  ACCUM_FLT4 r3 = INIT_ACCUM_FLT4(0.0f);\n";
  c += "  int f_offset = Z * args.filter_offset;\n";
  if (need_local_mem) {
    c += "  __local FLT4 weights_cache[64];\n";
  }
  if (weights_upload_type ==
      ConvolutionTransposed4x4::WeightsUploadType::LOCAL_MEM_BY_THREADS) {
    c += "  int local_id = LOCAL_ID_1 * 8 + LOCAL_ID_0;\n";
  }
  if (!src_desc.SupportsZeroClamp(Axis::WIDTH, gpu_info)) {
    c += "  bool in_x0 = X - 1 >= 0 && X - 1 < args.src_tensor.Width();\n";
    c += "  bool in_x1 = X >= 0 && X < args.src_tensor.Width();\n";
  }
  if (!src_desc.SupportsZeroClamp(Axis::HEIGHT, gpu_info)) {
    c += "  bool in_y0 = Y - 1 >= 0 && Y - 1 < args.src_tensor.Height();\n";
    c += "  bool in_y1 = Y >= 0 && Y < args.src_tensor.Height();\n";
  }
  auto generate_check = [&](int x, int y) {
    std::string check;
    const std::vector<Axis> axes{Axis::WIDTH, Axis::HEIGHT};
    const std::vector<std::string> names{"in_x" + std::to_string(x),
                                         "in_y" + std::to_string(y)};
    for (int i = 0; i < axes.size(); ++i) {
      const auto& axis = axes[i];
      if (src_desc.HasAxis(axis) &&
          !src_desc.SupportsZeroClamp(axis, gpu_info)) {
        if (!check.empty()) {
          check += " && ";
        }
        check += names[i];
      }
    }
    return check;
  };
  if (src_desc.IsLinear()) {
    if (src_desc.ReturnsZeroForNegOneRead(gpu_info)) {
      c += "  int addr_0 = args.src_tensor.GetAddress(X - 1, Y - 1, 0);\n";
      c += "  int addr_1 = args.src_tensor.GetAddress(X, Y - 1, 0);\n";
      c += "  int addr_2 = args.src_tensor.GetAddress(X - 1, Y, 0);\n";
      c += "  int addr_3 = args.src_tensor.GetAddress(X, Y, 0);\n";
      c += "  addr_0 = select(-1, addr_0, (in_x0 && in_y0));\n";
      c += "  addr_1 = select(-1, addr_1, (in_x1 && in_y0));\n";
      c += "  addr_2 = select(-1, addr_2, (in_x0 && in_y1));\n";
      c += "  addr_3 = select(-1, addr_3, (in_x1 && in_y1));\n";
      c += "  int dz_0 = select(0, args.src_tensor.SliceStride(), (in_x0 && "
           "in_y0));\n";
      c += "  int dz_1 = select(0, args.src_tensor.SliceStride(), (in_x1 && "
           "in_y0));\n";
      c += "  int dz_2 = select(0, args.src_tensor.SliceStride(), (in_x0 && "
           "in_y1));\n";
      c += "  int dz_3 = select(0, args.src_tensor.SliceStride(), (in_x1 && "
           "in_y1));\n";
    } else {
      c += "  int xc0 = clamp(X - 1, 0, args.src_tensor.Width() - 1);\n";
      c += "  int xc1 = clamp(X, 0, args.src_tensor.Width() - 1);\n";
      c += "  int yc0 = clamp(Y - 1, 0, args.src_tensor.Height() - 1);\n";
      c += "  int yc1 = clamp(Y, 0, args.src_tensor.Height() - 1);\n";
      c += "  int addr_0 = args.src_tensor.GetAddress(xc0, yc0, 0);\n";
      c += "  int addr_1 = args.src_tensor.GetAddress(xc1, yc0, 0);\n";
      c += "  int addr_2 = args.src_tensor.GetAddress(xc0, yc1, 0);\n";
      c += "  int addr_3 = args.src_tensor.GetAddress(xc1, yc1, 0);\n";
      c += "  int dz = args.src_tensor.SliceStride();\n";
    }
  }
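  // Emits the read of one source tap. On linear storages the precomputed
  // address is advanced by the slice stride each iteration; out-of-range taps
  // are either parked at address -1 (when the backend returns zero for it) or
  // masked to zero by multiplying with the boundary predicate.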
  auto read_src = [&](int x, int y) {
    if (src_desc.IsLinear()) {
      const std::string id = std::to_string(y * 2 + x);
      const std::string addr = "addr_" + std::to_string(y * 2 + x);
      if (src_desc.ReturnsZeroForNegOneRead(gpu_info)) {
        return "args.src_tensor.Read(" + addr + "); " + addr + " += dz_" + id +
               ";";
      } else {
        return "args.src_tensor.Read(" + addr + ") * INIT_FLT(in_x" +
               std::to_string(x) + " && in_y" + std::to_string(y) + "); " +
               addr + " += dz;";
      }
    } else {
      std::string check = generate_check(x, y);
      if (!check.empty()) {
        check = " * INIT_FLT(" + check + ")";
      }
      return "args.src_tensor.Read(X + " + std::to_string(x - 1) + ", Y + " +
             std::to_string(y - 1) + ", s)" + check + ";";
    }
  };
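  // Per-slice loop: refill the 64-entry weights cache (async copy,
  // cooperative load by the 32 work items, or a raw pointer into
  // global/constant memory), read the 2x2 source window, then apply the 16
  // CONV steps that feed the four accumulators.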
279   c += "  for (int s = 0; s < args.src_tensor.Slices(); ++s) {\n";
280   if (need_local_mem) {
281     c += "    " + barrier + ";\n";
282   }
283   if (weights_upload_type ==
284       ConvolutionTransposed4x4::WeightsUploadType::LOCAL_MEM_ASYNC) {
285     c += "    async_work_group_copy(weights_cache, "
286          "args.weights.GetPtr(f_offset), 64, "
287          "0);\n";
288   } else if (weights_upload_type ==
289              ConvolutionTransposed4x4::WeightsUploadType::
290                  LOCAL_MEM_BY_THREADS) {
291     c += "    weights_cache[local_id] = args.weights.Read(f_offset + "
292          "local_id);\n";
293     c += "    weights_cache[local_id + 32] = args.weights.Read(f_offset + "
294          "local_id + "
295          "32);\n";
296   } else {  // GLOBAL_MEM
297     c += "    " + weights_space +
298          " FLT4* weights_cache = args.weights.GetPtr(f_offset);\n";
299   }
300   c += "    FLT4 src0 = " + read_src(0, 0) + ";\n";
301   c += "    FLT4 src1 = " + read_src(1, 0) + ";\n";
302   c += "    FLT4 src2 = " + read_src(0, 1) + ";\n";
303   c += "    FLT4 src3 = " + read_src(1, 1) + ";\n";
304   c += "    f_offset += 64;\n";
305   if (need_local_mem) {
306     c += "    " + barrier + ";\n";
307   }
308   c += "    CONV(r0, src0, 0);\n";
309   c += "    CONV(r1, src0, 4);\n";
310   c += "    CONV(r2, src0, 8);\n";
311   c += "    CONV(r3, src0, 12);\n";
312   c += "    CONV(r0, src1, 16);\n";
313   c += "    CONV(r1, src1, 20);\n";
314   c += "    CONV(r2, src1, 24);\n";
315   c += "    CONV(r3, src1, 28);\n";
316   c += "    CONV(r0, src2, 32);\n";
317   c += "    CONV(r1, src2, 36);\n";
318   c += "    CONV(r2, src2, 40);\n";
319   c += "    CONV(r3, src2, 44);\n";
320   c += "    CONV(r0, src3, 48);\n";
321   c += "    CONV(r1, src3, 52);\n";
322   c += "    CONV(r2, src3, 56);\n";
323   c += "    CONV(r3, src3, 60);\n";
324   c += "  }\n";
325   c += "\n";
326   if (need_local_mem) {
327     c += "  if (X * 2 > args.dst_tensor.Width() || Y * 2 > "
328          "args.dst_tensor.Height() || Z >= args.dst_tensor.Slices()) "
329          "return;\n";
330   }
331   c += "  X = X * 2 - 1;\n";
332   c += "  Y = Y * 2 - 1;\n";
333   c += "\n";
334   c += "  FLT4 bias_val = args.biases.Read(Z);\n";
335   c += "  if (X >= 0 && Y >= 0) {\n";
336   c += "    FLT4 result = TO_FLT4(r0) + bias_val;\n";
337   c += "    args.dst_tensor.Write(result, X, Y, Z);\n";
338   c += "  }\n";
339   c += "  if (X + 1 < args.dst_tensor.Width() && Y >= 0) {\n";
340   c += "    FLT4 result = TO_FLT4(r1) + bias_val;\n";
341   c += "    args.dst_tensor.Write(result, X + 1, Y, Z);\n";
342   c += "  }\n";
343   c += "  if (X >= 0 && Y + 1 < args.dst_tensor.Height()) {\n";
344   c += "    FLT4 result = TO_FLT4(r2) + bias_val;\n";
345   c += "    args.dst_tensor.Write(result, X, Y + 1, Z);\n";
346   c += "  }\n";
347   c += "  if (X + 1 < args.dst_tensor.Width() && Y + 1 < "
348        "args.dst_tensor.Height()) {\n";
349   c += "    FLT4 result = TO_FLT4(r3) + bias_val;\n";
350   c += "    args.dst_tensor.Write(result, X + 1, Y + 1, Z);\n";
351   c += "  }\n";
352   c += "}\n";
353   return c;
354 }
355 
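// filter_offset is the per-destination-slice stride into the weights buffer:
// 16 spatial taps * 4 FLT4 weight vectors per tap, times the source slice
// count.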
absl::Status ConvolutionTransposed4x4::BindArguments(ArgumentsBinder* args) {
  return args->SetInt("filter_offset", 4 * 16 * src_[0]->Slices());
}

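// Each work item covers a 2x2 output patch, so the X/Y grid is the padded
// output extent at half resolution (X also carries the batch dimension).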
int3 ConvolutionTransposed4x4::GetGridSize() const {
  const int grid_x = DivideRoundUp(dst_[0]->Width() + 2, 2) * dst_[0]->Batch();
  const int grid_y = DivideRoundUp(dst_[0]->Height() + 2, 2);
  const int grid_z = dst_[0]->Slices();
  return int3(grid_x, grid_y, grid_z);
}

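// Order in which the 16 spatial taps of the 4x4 kernel are laid out in the
// uploaded weights buffer, matching the order the generated code reads them.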
std::vector<int> ConvolutionTransposed4x4::GetSpatialWeightsRemap() const {
  return std::vector<int>{10, 11, 14, 15, 8, 9, 12, 13, 2, 3, 6, 7, 0, 1, 4, 5};
}

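// Rearranges the OHWI weights into the custom spatial layout and stores them
// in a buffer object placed in constant memory for CONSTANT_MEM uploads and
// in global memory otherwise.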
void ConvolutionTransposed4x4::UploadWeights(
    const tflite::gpu::Tensor<OHWI, DataType::FLOAT32>& weights,
    WeightsUploadType weights_upload_type) {
  const auto weights_desc = GetWeightsDescription();
  const int flt_count =
      GetTotalElementsCountForLayout(weights_desc, weights.shape);

  BufferDescriptor desc;
  desc.element_type = weights_desc.type;
  desc.element_size = 4;
  desc.memory_type =
      weights_upload_type ==
              ConvolutionTransposed4x4::WeightsUploadType::CONSTANT_MEM
          ? MemoryType::CONSTANT
          : MemoryType::GLOBAL;
  desc.size = flt_count * SizeOf(desc.element_type);
  desc.data.resize(desc.size);

  RearrangeWeights(weights, weights_desc, absl::MakeSpan(desc.data));
  args_.AddObject("weights",
                  std::make_unique<BufferDescriptor>(std::move(desc)));
}

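// This specialization only handles 4x4 kernels with stride 2 and prepended
// padding 1 in both spatial dimensions.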
bool IsConvolutionTransposed4x4Supported(
    const OperationDef& definition,
    const ConvolutionTransposedAttributes& attr) {
  return attr.weights.shape.w == 4 && attr.weights.shape.h == 4 &&
         attr.stride.w == 2 && attr.stride.h == 2 &&
         attr.padding.prepended.w == 1 && attr.padding.prepended.h == 1;
}

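// Builds the operation with weights uploaded at creation time plus a constant
// linear tensor for the bias. A minimal usage sketch, assuming a populated
// `definition`, `gpu_info`, and `attr`:
//   if (IsConvolutionTransposed4x4Supported(definition, attr)) {
//     ConvolutionTransposed4x4 op =
//         CreateConvolutionTransposed4x4(gpu_info, definition, attr);
//   }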
ConvolutionTransposed4x4 CreateConvolutionTransposed4x4(
    const GpuInfo& gpu_info, const OperationDef& definition,
    const ConvolutionTransposedAttributes& attr) {
  ConvolutionTransposed4x4 result(definition, gpu_info);
  result.UploadWeights(attr.weights, GetBestWeightsUploadType(gpu_info));

  TensorDescriptor bias_tensor_desc = CreateConstantLinearTensorDescriptor(
      gpu_info, definition.src_tensors[0].GetDataType(), attr.bias);
  result.args_.AddObject("biases", std::make_unique<TensorDescriptor>(
                                       std::move(bias_tensor_desc)));
  return result;
}

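// Variant for runtime-supplied weights: a second BUFFER src tensor is added
// to the definition and the weights are bound at inference time instead of
// being uploaded here.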
ConvolutionTransposed4x4 CreateConvolutionTransposed4x4DynamicWeights(
    const GpuInfo& gpu_info, const OperationDef& definition,
    const ConvolutionTransposedAttributes& attr) {
  OperationDef new_def = definition;
  new_def.src_tensors = {
      definition.src_tensors[0]};  // leaving only the src_tensor def; the
                                   // weights def is added below
  const DataType weights_type = definition.GetDataType();
  // add one src_tensor (buffer) for the weights
  new_def.src_tensors.push_back(
      {weights_type, TensorStorageType::BUFFER, Layout::HWC});

  ConvolutionTransposed4x4 result(new_def, gpu_info);

  TensorDescriptor bias_tensor_desc = CreateConstantLinearTensorDescriptor(
      gpu_info, definition.src_tensors[0].GetDataType(), attr.bias);
  result.args_.AddObject("biases", std::make_unique<TensorDescriptor>(
                                       std::move(bias_tensor_desc)));
  return result;
}

}  // namespace gpu
}  // namespace tflite