/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#include "tensorflow/lite/delegates/gpu/common/tasks/conv_constants.h"

#include <algorithm>
#include <memory>
#include <string>
#include <utility>

#include "absl/strings/match.h"
#include "absl/strings/str_cat.h"

namespace tflite {
namespace gpu {

namespace {
// Adreno can provide up to ~3-4KB of constant memory, but in some cases even
// 3KB can have very bad performance.
int GetAdrenoOptimalMaxConstantSize(const AdrenoInfo& adreno_info) {
  if (adreno_info.IsAdreno3xx() || adreno_info.IsAdreno4xx() ||
      adreno_info.IsAdreno5xx()) {
    return 256 * 10;  // 2.5KB
  } else {
    return 256 * 14;  // 3.5KB
  }
}

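// Returns the maximum size in bytes of the constant weights buffer that is
// expected to be fast on the given GPU.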
int GetOptimalMaxConstantSize(const GpuInfo& gpu_info) {
  if (gpu_info.IsAdreno()) {
    return GetAdrenoOptimalMaxConstantSize(gpu_info.adreno_info);
  } else if (gpu_info.IsAMD()) {
    return 4096;
  } else {
    return 1024;  // 1KB
  }
}

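// Appends `value` to `result`, prefixed by `delimiter` when `result` is not
// empty.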
void AppendConditionally(const std::string& value, const std::string& delimiter,
                         std::string* result) {
  if (!result->empty()) {
    *result += delimiter;
  }
  *result += value;
}

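// Emits the multiply-accumulate code for one (source slice, destination slice)
// pair. With use_dot_conv, one weight vector is read per destination channel
// and combined with dot(); otherwise one weight vector is read per source
// channel and accumulated with per-channel multiplies.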
// src_size and dst_size must be <= 4.
std::string GenerateConv(int src_size, int dst_size, bool use_dot_conv,
                         int const_mem_offset, CalculationsPrecision precision,
                         const std::string& dst, const std::string& src) {
  std::string result;
  const std::string postfixes[] = {".x", ".y", ".z", ".w"};
  if (use_dot_conv) {
    const std::string src_postfixes[] = {".x", ".xy", ".xyz", ""};
    const std::string src_postfix = src_postfixes[src_size - 1];
    for (int i = 0; i < dst_size; ++i) {
      result += " " + dst + postfixes[i] + " += dot(" + src +
                ", args.weights.Read(" + std::to_string(const_mem_offset + i) +
                ")" + src_postfix + ");\n";
    }
  } else {
    const std::string dst_postfixes[] = {".x", ".xy", ".xyz", ""};
    const std::string dst_postfix = dst_postfixes[dst_size - 1];
    if (precision == CalculationsPrecision::F32_F16) {
      for (int i = 0; i < src_size; ++i) {
        if (i != 0) {
          result += " + ";
        }
        std::string src_name = src;
        if (src_size != 1) {
          src_name += postfixes[i];
        }
        result += src_name + " * args.weights.Read(" +
                  std::to_string(const_mem_offset + i) + ")" + dst_postfix;
      }
      std::string size = dst_size == 1 ? "" : std::to_string(dst_size);
      result = " " + dst + dst_postfix + " += TO_ACCUM_FLT" + size + "(" +
               result + ");\n";
    } else {
      for (int i = 0; i < src_size; ++i) {
        std::string src_name = src;
        if (src_size != 1) {
          src_name += postfixes[i];
        }
        result += " " + dst + dst_postfix + " += " + src_name +
                  " * args.weights.Read(" +
                  std::to_string(const_mem_offset + i) + ")" + dst_postfix +
                  ";\n";
      }
    }
  }
  return result;
}

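// Generates the kernel source for a convolution whose weights are stored in
// constant memory. x_oob_reads / y_oob_reads indicate that padding can make
// the kernel read outside the source tensor; when the tensor storage cannot
// return zeros for such reads, coordinates are clamped and the loaded value
// is masked to zero.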
std::string GenerateConvolutionConstantCode(const GpuInfo& gpu_info,
                                            const OperationDef& op_def,
                                            const OHWI& weights_shape,
                                            bool x_oob_reads, bool y_oob_reads,
                                            bool use_dot_conv,
                                            GPUOperation* op) {
  auto src_desc = op_def.src_tensors[0];
  op->AddSrcTensor("src_tensor", src_desc);
  op->AddDstTensor("dst_tensor", op_def.dst_tensors[0]);

  const int out_z = DivideRoundUp(weights_shape.o, 4);
  const std::string kOutZ = std::to_string(out_z);
  const int src_depth = DivideRoundUp(weights_shape.i, 4);

  const std::string postfixes[] = {".x", ".xy", ".xyz", ""};

  std::string c;
  c += "MAIN_FUNCTION($0) {\n";
  if (src_desc.HasAxis(Axis::BATCH)) {
    c += " int linear_id = GLOBAL_ID_0;\n";
    c += " int X = linear_id / args.dst_tensor.Batch();\n";
    c += " int B = linear_id % args.dst_tensor.Batch();\n";
    c += " args.src_tensor.SetBatchRef(B);\n";
    c += " args.dst_tensor.SetBatchRef(B);\n";
  } else {
    c += " int X = GLOBAL_ID_0;\n";
  }
  c += " int Y = GLOBAL_ID_1;\n";
  c += " if (X >= args.dst_tensor.Width() || Y >= args.dst_tensor.Height()) "
       "return;\n";
  c += " int start_x = X * args.stride_x + args.padding_x;\n";
  c += " int start_y = Y * args.stride_y + args.padding_y;\n";
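  // One ACCUM_FLT4 accumulator per slice of 4 output channels.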
  for (int i = 0; i < out_z; ++i) {
    c += " ACCUM_FLT4 r" + std::to_string(i) + " = INIT_ACCUM_FLT4(0.0f);\n";
  }
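  // When out-of-bounds reads are possible and the tensor storage cannot
  // return zeros for clamped coordinates, build a mask that zeroes such reads.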
  std::string check;
  if (y_oob_reads && !src_desc.SupportsZeroClamp(Axis::HEIGHT, gpu_info)) {
    AppendConditionally("inside_y", " && ", &check);
  }
  if (x_oob_reads && !src_desc.SupportsZeroClamp(Axis::WIDTH, gpu_info)) {
    AppendConditionally("inside_x", " && ", &check);
  }
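  // filters_counter is the running offset into the constant weights buffer.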
  int filters_counter = 0;
  for (int s = 0; s < src_depth; ++s) {
    const int src_ch_count = std::min(4, weights_shape.i - s * 4);
    const std::string s_count =
        src_ch_count == 1 ? "" : std::to_string(src_ch_count);
    const std::string s_type = absl::StrCat("FLT", s_count);
    const std::string s_postfix = postfixes[src_ch_count - 1];
    for (int ky = 0; ky < weights_shape.h; ++ky) {
      std::string s_y = absl::StrCat("(start_y + ", ky, " * args.dilation_y)");
      c += " {\n";
      c += " int y_c = start_y + " + std::to_string(ky) +
           " * args.dilation_y;\n";
      if (y_oob_reads && !src_desc.SupportsZeroClamp(Axis::HEIGHT, gpu_info)) {
        c +=
            " bool inside_y = y_c >= 0 && y_c < args.src_tensor.Height();\n";
        c += " y_c = clamp(y_c, 0, args.src_tensor.Height() - 1);\n";
      }
      for (int kx = 0; kx < weights_shape.w; ++kx) {
        c += " {\n";
        c += " int x_c = start_x + " + std::to_string(kx) +
             " * args.dilation_x;\n";
        if (x_oob_reads && !src_desc.SupportsZeroClamp(Axis::WIDTH, gpu_info)) {
          c += " bool inside_x = x_c >= 0 && x_c < "
               "args.src_tensor.Width();\n";
          c += " x_c = clamp(x_c, 0, args.src_tensor.Width() - 1);\n";
        }
        c += " " + s_type + " src = args.src_tensor.Read(x_c, y_c, " +
             std::to_string(s) + ")" + s_postfix + ";\n";
        if (!check.empty()) {
          c += " src *= INIT_FLT(" + check + ");\n";
        }
        for (int d = 0; d < out_z; ++d) {
          const int dst_ch_count = std::min(4, weights_shape.o - d * 4);
          c += GenerateConv(src_ch_count, dst_ch_count, use_dot_conv,
                            filters_counter, op_def.precision,
                            "r" + std::to_string(d), "src");
          filters_counter += use_dot_conv ? dst_ch_count : src_ch_count;
        }
        c += " }\n";
      }
      c += " }\n";
    }
  }
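  // Add the bias and write out each output slice.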
  for (int i = 0; i < out_z; ++i) {
    std::string s_i = std::to_string(i);
    c += " {\n";
    c += " FLT4 res = TO_FLT4(r" + s_i + ") + args.biases.Read(" + s_i +
         ");\n";
    c += " args.dst_tensor.Write(res, X, Y, " + s_i + ");\n";
    c += " }\n";
  }
  c += "}\n";
  return c;
}

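// Picks the weight layout that needs fewer elements after padding channels to
// groups of 4: the dot-product layout pads the source channels
// (dst_channels * src_depth * 4 elements), while the multiply-add layout pads
// the destination channels (src_channels * dst_depth * 4 elements).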
bool IsDotConvBetter(int src_channels, int dst_channels) {
  if (dst_channels % 4 == 0) {
    return false;
  }

  // dst_channels % 4 != 0
  if (src_channels % 4 == 0) {
    return true;
  }

  // dst_channels % 4 != 0 && src_channels % 4 != 0
  const int src_depth = DivideRoundUp(src_channels, 4);
  const int dst_depth = DivideRoundUp(dst_channels, 4);
  return dst_channels * src_depth < src_channels * dst_depth;
}

}  // namespace

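// Convolution with weights in constant memory is used only when the whole
// (aligned) weights buffer fits into the GPU's fast constant memory and the
// accumulators fit into a small number of registers.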
bool IsConvConstantsSupported(const GpuInfo& gpu_info,
                              const OperationDef& definition,
                              const Convolution2DAttributes& attr) {
  if (gpu_info.IsApiOpenCl() && gpu_info.IsAdreno()) {
    const std::string kBadDriver =
        "OpenCL 2.0 QUALCOMM build: commit #7ff4f54 changeid #I4460aa6217 "
        "Date: 12/30/18";
    if (absl::StrContains(gpu_info.opencl_info.platform_version, kBadDriver)) {
      return false;
    }
  }

  if (attr.groups != 1) {
    return false;
  }

  const bool use_dot_conv =
      IsDotConvBetter(attr.weights.shape.i, attr.weights.shape.o);
  const auto& w_shape = attr.weights.shape;
  const int src_depth = DivideRoundUp(w_shape.i, 4);
  const int dst_depth = DivideRoundUp(w_shape.o, 4);
  const int aligned_ch_count =
      use_dot_conv ? w_shape.o * src_depth * 4 : w_shape.i * dst_depth * 4;
  const int filters_count = aligned_ch_count * w_shape.h * w_shape.w;
  const int float_size = definition.precision == CalculationsPrecision::F32
                             ? sizeof(float)
                             : sizeof(half);
  const int filters_buffer_size = filters_count * float_size;
  const int kConstantMaxSize = GetOptimalMaxConstantSize(gpu_info);
  const int flt4_registers = DivideRoundUp(w_shape.o, 4);
  return filters_buffer_size <= kConstantMaxSize && flt4_registers <= 8;
}

GPUOperation CreateConvConstants(const GpuInfo& gpu_info,
                                 const OperationDef& definition,
                                 const Convolution2DAttributes& attr) {
  const bool use_dot_conv =
      IsDotConvBetter(attr.weights.shape.i, attr.weights.shape.o);
  GPUOperation op(definition);
  UploadWeightsForConvConstants(attr.weights, gpu_info, definition.precision,
                                use_dot_conv, &op);
  op.args_.AddInt("stride_x", attr.strides.w);
  op.args_.AddInt("stride_y", attr.strides.h);
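  // Padding is negated because the generated code computes
  // start = X * stride + padding, i.e. the prepended padding is subtracted.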
265 op.args_.AddInt("padding_x", -attr.padding.prepended.w);
266 op.args_.AddInt("padding_y", -attr.padding.prepended.h);
267 op.args_.AddInt("dilation_x", attr.dilations.w);
268 op.args_.AddInt("dilation_y", attr.dilations.h);
269 op.tensor_to_grid_ = TensorToGrid::kWBToX_HDToY_ZIs1;
270
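  // Any nonzero padding can make the generated kernel read outside the source
  // tensor.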
  bool x_oob_reads =
      attr.padding.appended.w != 0 || attr.padding.prepended.w != 0;
  bool y_oob_reads =
      attr.padding.appended.h != 0 || attr.padding.prepended.h != 0;
  op.code_ = GenerateConvolutionConstantCode(gpu_info, definition,
                                             attr.weights.shape, x_oob_reads,
                                             y_oob_reads, use_dot_conv, &op);
  if (definition.precision == CalculationsPrecision::F16 &&
      gpu_info.IsAdreno() && gpu_info.adreno_info.IsAdreno3xx()) {
    op.compiler_options_.push_back(CompilerOptions::kAdrenoFullSimd);
  }
  if (definition.precision != CalculationsPrecision::F32 &&
      gpu_info.IsPowerVR()) {
    // Bug workaround: some PowerVR GPUs (e.g. GE8320) produce incorrect
    // results without this option.
    op.compiler_options_.push_back(CompilerOptions::kClDisableOptimizations);
  }

  TensorDescriptor bias_tensor_desc = CreateConstantLinearTensorDescriptor(
      gpu_info, definition.src_tensors[0].GetDataType(), attr.bias);
  op.args_.AddObject("biases", std::make_unique<TensorDescriptor>(
                                   std::move(bias_tensor_desc)));
  return op;
}

}  // namespace gpu
}  // namespace tflite