xref: /aosp_15_r20/external/tensorflow/tensorflow/lite/delegates/gpu/common/tasks/conv_constants.cc (revision b6fb3261f9314811a0f4371741dbb8839866f948)
1 /* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
2 
3 Licensed under the Apache License, Version 2.0 (the "License");
4 you may not use this file except in compliance with the License.
5 You may obtain a copy of the License at
6 
7     http://www.apache.org/licenses/LICENSE-2.0
8 
9 Unless required by applicable law or agreed to in writing, software
10 distributed under the License is distributed on an "AS IS" BASIS,
11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 See the License for the specific language governing permissions and
13 limitations under the License.
14 ==============================================================================*/
15 
16 #include "tensorflow/lite/delegates/gpu/common/tasks/conv_constants.h"
17 
18 #include <algorithm>
19 #include <memory>
20 #include <string>
21 #include <utility>
22 
23 #include "absl/strings/match.h"
24 #include "absl/strings/str_cat.h"
25 
26 namespace tflite {
27 namespace gpu {
28 
29 namespace {
30 // Adreno can provide up to ~3-4KB of constant memory, but in some cases even
31 // 3KB can have very bad performance.
GetAdrenoOptimalMaxConstantSize(const AdrenoInfo & adreno_info)32 int GetAdrenoOptimalMaxConstantSize(const AdrenoInfo& adreno_info) {
33   if (adreno_info.IsAdreno3xx() || adreno_info.IsAdreno4xx() ||
34       adreno_info.IsAdreno5xx()) {
35     return 256 * 10;  // 2.5KB
36   } else {
37     return 256 * 14;  // 3.5KB
38   }
39 }
40 
GetOptimalMaxConstantSize(const GpuInfo & gpu_info)41 int GetOptimalMaxConstantSize(const GpuInfo& gpu_info) {
42   if (gpu_info.IsAdreno()) {
43     return GetAdrenoOptimalMaxConstantSize(gpu_info.adreno_info);
44   } else if (gpu_info.IsAMD()) {
45     return 4096;
46   } else {
47     return 1024;  // 1KB
48   }
49 }
50 
// Appends `value` to `*result`, inserting `delimiter` first if `*result`
// already holds text. Used to build " && "-joined boolean conditions.
// (Also fixes the "delimeter" misspelling; C++ call sites are positional,
// so the rename is caller-compatible.)
void AppendConditionally(const std::string& value, const std::string& delimiter,
                         std::string* result) {
  if (!result->empty()) {
    *result += delimiter;
  }
  *result += value;
}
58 
59 // src_size and dst_size must be <= 4;
// Emits the multiply-accumulate source for one (src slice, dst slice) pair.
// src_size and dst_size must be <= 4.
std::string GenerateConv(int src_size, int dst_size, bool use_dot_conv,
                         int const_mem_offset, CalculationsPrecision precision,
                         const std::string& dst, const std::string& src) {
  const std::string channels[] = {".x", ".y", ".z", ".w"};
  const std::string group_postfixes[] = {".x", ".xy", ".xyz", ""};
  std::string code;
  if (use_dot_conv) {
    // One dot() per destination channel against a full weight vector.
    const std::string& src_postfix = group_postfixes[src_size - 1];
    for (int d = 0; d < dst_size; ++d) {
      const std::string weight =
          "args.weights.Read(" + std::to_string(const_mem_offset + d) + ")";
      code += "      " + dst + channels[d] + " += dot(" + src + ", " + weight +
              src_postfix + ");\n";
    }
  } else {
    const std::string& dst_postfix = group_postfixes[dst_size - 1];
    if (precision == CalculationsPrecision::F32_F16) {
      // Build the whole sum of products first, then convert to the
      // accumulator type in a single statement.
      std::string sum;
      for (int s = 0; s < src_size; ++s) {
        std::string src_channel = src;
        if (src_size != 1) {
          src_channel += channels[s];
        }
        if (s != 0) {
          sum += " + ";
        }
        sum += src_channel + " * args.weights.Read(" +
               std::to_string(const_mem_offset + s) + ")" + dst_postfix;
      }
      const std::string width = dst_size == 1 ? "" : std::to_string(dst_size);
      code = "      " + dst + dst_postfix + " += TO_ACCUM_FLT" + width + "(" +
             sum + ");\n";
    } else {
      // One multiply-accumulate statement per source channel.
      for (int s = 0; s < src_size; ++s) {
        std::string src_channel = src;
        if (src_size != 1) {
          src_channel += channels[s];
        }
        code += "      " + dst + dst_postfix + " += " + src_channel +
                " * args.weights.Read(" +
                std::to_string(const_mem_offset + s) + ")" + dst_postfix +
                ";\n";
      }
    }
  }
  return code;
}
106 
GenerateConvolutionConstantCode(const GpuInfo & gpu_info,const OperationDef & op_def,const OHWI & weights_shape,bool x_oob_reads,bool y_oob_reads,bool use_dot_conv,GPUOperation * op)107 std::string GenerateConvolutionConstantCode(const GpuInfo& gpu_info,
108                                             const OperationDef& op_def,
109                                             const OHWI& weights_shape,
110                                             bool x_oob_reads, bool y_oob_reads,
111                                             bool use_dot_conv,
112                                             GPUOperation* op) {
113   auto src_desc = op_def.src_tensors[0];
114   op->AddSrcTensor("src_tensor", src_desc);
115   op->AddDstTensor("dst_tensor", op_def.dst_tensors[0]);
116 
117   const int out_z = DivideRoundUp(weights_shape.o, 4);
118   const std::string kOutZ = std::to_string(out_z);
119   const int src_depth = DivideRoundUp(weights_shape.i, 4);
120 
121   const std::string postfixes[] = {".x", ".xy", ".xyz", ""};
122 
123   std::string c;
124   c += "MAIN_FUNCTION($0) {\n";
125   if (src_desc.HasAxis(Axis::BATCH)) {
126     c += "  int linear_id = GLOBAL_ID_0;\n";
127     c += "  int X = linear_id / args.dst_tensor.Batch();\n";
128     c += "  int B = linear_id % args.dst_tensor.Batch();\n";
129     c += "  args.src_tensor.SetBatchRef(B);\n";
130     c += "  args.dst_tensor.SetBatchRef(B);\n";
131   } else {
132     c += "  int X = GLOBAL_ID_0;\n";
133   }
134   c += "  int Y = GLOBAL_ID_1;\n";
135   c += "  if (X >= args.dst_tensor.Width() || Y >= args.dst_tensor.Height()) "
136        "return;\n";
137   c += "  int start_x = X * args.stride_x + args.padding_x;\n";
138   c += "  int start_y = Y * args.stride_y + args.padding_y;\n";
139   for (int i = 0; i < out_z; ++i) {
140     c += "  ACCUM_FLT4 r" + std::to_string(i) + " = INIT_ACCUM_FLT4(0.0f);\n";
141   }
142   std::string check;
143   if (y_oob_reads && !src_desc.SupportsZeroClamp(Axis::HEIGHT, gpu_info)) {
144     AppendConditionally("inside_y", " && ", &check);
145   }
146   if (x_oob_reads && !src_desc.SupportsZeroClamp(Axis::WIDTH, gpu_info)) {
147     AppendConditionally("inside_x", " && ", &check);
148   }
149   int filters_counter = 0;
150   for (int s = 0; s < src_depth; ++s) {
151     const int src_ch_count = std::min(4, weights_shape.i - s * 4);
152     const std::string s_count =
153         src_ch_count == 1 ? "" : std::to_string(src_ch_count);
154     const std::string s_type = absl::StrCat("FLT", s_count);
155     const std::string s_postfix = postfixes[src_ch_count - 1];
156     for (int ky = 0; ky < weights_shape.h; ++ky) {
157       std::string s_y = absl::StrCat("(start_y + ", ky, " * args.dilation_y)");
158       c += "  {\n";
159       c += "    int y_c = start_y + " + std::to_string(ky) +
160            " * args.dilation_y;\n";
161       if (y_oob_reads && !src_desc.SupportsZeroClamp(Axis::HEIGHT, gpu_info)) {
162         c +=
163             "    bool inside_y = y_c >= 0 && y_c < args.src_tensor.Height();\n";
164         c += "    y_c = clamp(y_c, 0, args.src_tensor.Height() - 1);\n";
165       }
166       for (int kx = 0; kx < weights_shape.w; ++kx) {
167         c += "    {\n";
168         c += "      int x_c = start_x + " + std::to_string(kx) +
169              " * args.dilation_x;\n";
170         if (x_oob_reads && !src_desc.SupportsZeroClamp(Axis::WIDTH, gpu_info)) {
171           c += "      bool inside_x = x_c >= 0 && x_c < "
172                "args.src_tensor.Width();\n";
173           c += "      x_c = clamp(x_c, 0, args.src_tensor.Width() - 1);\n";
174         }
175         c += "      " + s_type + " src = args.src_tensor.Read(x_c, y_c, " +
176              std::to_string(s) + ")" + s_postfix + ";\n";
177         if (!check.empty()) {
178           c += "      src *= INIT_FLT(" + check + ");\n";
179         }
180         for (int d = 0; d < out_z; ++d) {
181           const int dst_ch_count = std::min(4, weights_shape.o - d * 4);
182           c += GenerateConv(src_ch_count, dst_ch_count, use_dot_conv,
183                             filters_counter, op_def.precision,
184                             "r" + std::to_string(d), "src");
185           filters_counter += use_dot_conv ? dst_ch_count : src_ch_count;
186         }
187         c += "    }\n";
188       }
189       c += "  }\n";
190     }
191   }
192   for (int i = 0; i < out_z; ++i) {
193     std::string s_i = std::to_string(i);
194     c += "  {\n";
195     c += "    FLT4 res = TO_FLT4(r" + s_i + ") + args.biases.Read(" + s_i +
196          ");\n";
197     c += "    args.dst_tensor.Write(res, X, Y, " + s_i + ");\n";
198     c += "  }\n";
199   }
200   c += "}\n";
201   return c;
202 }
203 
// Decides whether the dot-product weight layout stores fewer padded weights
// than the regular per-channel layout for the given channel counts.
bool IsDotConvBetter(int src_channels, int dst_channels) {
  // dst aligned to 4: the regular layout wastes nothing.
  if (dst_channels % 4 == 0) {
    return false;
  }
  // dst unaligned but src aligned: the dot layout wastes nothing.
  if (src_channels % 4 == 0) {
    return true;
  }
  // Both unaligned: the dot layout stores dst_channels * src_groups * 4
  // weights, the regular layout src_channels * dst_groups * 4. Pick the
  // smaller. (Groups are ceil-div by 4, inlined from DivideRoundUp.)
  const int src_groups = (src_channels + 3) / 4;
  const int dst_groups = (dst_channels + 3) / 4;
  return dst_channels * src_groups < src_channels * dst_groups;
}
219 
220 }  // namespace
221 
IsConvConstantsSupported(const GpuInfo & gpu_info,const OperationDef & definition,const Convolution2DAttributes & attr)222 bool IsConvConstantsSupported(const GpuInfo& gpu_info,
223                               const OperationDef& definition,
224                               const Convolution2DAttributes& attr) {
225   if (gpu_info.IsApiOpenCl() && gpu_info.IsAdreno()) {
226     const std::string kBadDriver =
227         "OpenCL 2.0 QUALCOMM build: commit #7ff4f54 changeid #I4460aa6217 "
228         "Date: 12/30/18";
229     if (absl::StrContains(gpu_info.opencl_info.platform_version, kBadDriver)) {
230       return false;
231     }
232   }
233 
234   if (attr.groups != 1) {
235     return false;
236   }
237 
238   const bool use_dot_conv =
239       IsDotConvBetter(attr.weights.shape.i, attr.weights.shape.o);
240   const auto& w_shape = attr.weights.shape;
241   const int src_depth = DivideRoundUp(w_shape.i, 4);
242   const int dst_depth = DivideRoundUp(w_shape.o, 4);
243   const int aligned_ch_count =
244       use_dot_conv ? w_shape.o * src_depth * 4 : w_shape.i * dst_depth * 4;
245   const int filters_count = aligned_ch_count * w_shape.h * w_shape.w;
246   const int float_size = definition.precision == CalculationsPrecision::F32
247                              ? sizeof(float)
248                              : sizeof(half);
249   const int filters_buffer_size = filters_count * float_size;
250   const int kConstantMaxSize = GetOptimalMaxConstantSize(gpu_info);
251   const int flt4_registers = DivideRoundUp(w_shape.o, 4);
252   return filters_buffer_size <= kConstantMaxSize && flt4_registers <= 8;
253 }
254 
// Builds the constant-memory convolution GPUOperation: uploads weights,
// registers scalar args, generates the shader code and attaches the bias
// tensor. Callers should gate on IsConvConstantsSupported() first.
GPUOperation CreateConvConstants(const GpuInfo& gpu_info,
                                 const OperationDef& definition,
                                 const Convolution2DAttributes& attr) {
  // Choose the weight layout that pads the fewest values (see
  // IsDotConvBetter); the same flag must be used for upload and codegen.
  const bool use_dot_conv =
      IsDotConvBetter(attr.weights.shape.i, attr.weights.shape.o);
  GPUOperation op(definition);
  UploadWeightsForConvConstants(attr.weights, gpu_info, definition.precision,
                                use_dot_conv, &op);
  op.args_.AddInt("stride_x", attr.strides.w);
  op.args_.AddInt("stride_y", attr.strides.h);
  // Prepended padding is negated: the generated shader computes
  // start = X * stride + padding, so padding must shift coordinates left/up.
  op.args_.AddInt("padding_x", -attr.padding.prepended.w);
  op.args_.AddInt("padding_y", -attr.padding.prepended.h);
  op.args_.AddInt("dilation_x", attr.dilations.w);
  op.args_.AddInt("dilation_y", attr.dilations.h);
  op.tensor_to_grid_ = TensorToGrid::kWBToX_HDToY_ZIs1;

  // Any nonzero padding on an axis can push reads outside the source
  // tensor; the codegen then emits clamp/mask logic for that axis.
  bool x_oob_reads =
      attr.padding.appended.w != 0 || attr.padding.prepended.w != 0;
  bool y_oob_reads =
      attr.padding.appended.h != 0 || attr.padding.prepended.h != 0;
  op.code_ = GenerateConvolutionConstantCode(gpu_info, definition,
                                             attr.weights.shape, x_oob_reads,
                                             y_oob_reads, use_dot_conv, &op);
  // Adreno3xx with F16 needs full-SIMD compilation for this kernel.
  if (definition.precision == CalculationsPrecision::F16 &&
      gpu_info.IsAdreno() && gpu_info.adreno_info.IsAdreno3xx()) {
    op.compiler_options_.push_back(CompilerOptions::kAdrenoFullSimd);
  }
  if (definition.precision != CalculationsPrecision::F32 &&
      gpu_info.IsPowerVR()) {
    // BUG, some PowerVRs (GE8320) produce incorrect result without it
    op.compiler_options_.push_back(CompilerOptions::kClDisableOptimizations);
  }

  // Bias is stored as a constant linear tensor in the source data type and
  // read by the generated code as args.biases.
  TensorDescriptor bias_tensor_desc = CreateConstantLinearTensorDescriptor(
      gpu_info, definition.src_tensors[0].GetDataType(), attr.bias);
  op.args_.AddObject("biases", std::make_unique<TensorDescriptor>(
                                   std::move(bias_tensor_desc)));
  return op;
}
294 
295 }  // namespace gpu
296 }  // namespace tflite
297