/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#include "tensorflow/lite/delegates/gpu/common/tasks/convolution_transposed_3x3.h"

#include <memory>
#include <string>
#include <utility>
#include <vector>

#include "tensorflow/lite/delegates/gpu/common/task/work_group_picking.h"

namespace tflite {
namespace gpu {

ConvolutionTransposed3x3::ConvolutionTransposed3x3(
    const OperationDef& definition, const GpuInfo& gpu_info, int2 padding)
    : GPUOperation(definition), padding_(padding) {
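  // Fixed 8x4x1 work group; the generated kernel reconstructs its global ids
  // from this launch order (see GetGlobalID in the code generator below).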
  work_group_size_ = int3(8, 4, 1);
  work_group_launch_order_ = int3(2, 0, 1);
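  // Pick how the filter weights reach the kernel, per vendor: constant memory
  // on AMD, local memory filled asynchronously on PowerVR or cooperatively by
  // the work group on Nvidia, Intel and pre-Bionic Apple GPUs, and plain
  // global memory otherwise.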
  if (gpu_info.IsApple()) {
    if (gpu_info.apple_info.IsBionic()) {
      weights_upload_type_ = WeightsUploadType::GLOBAL_MEM;
    } else {
      weights_upload_type_ = WeightsUploadType::LOCAL_MEM_BY_THREADS;
    }
  } else if (gpu_info.IsPowerVR()) {
    weights_upload_type_ = WeightsUploadType::LOCAL_MEM_ASYNC;
  } else if (gpu_info.IsNvidia() || gpu_info.IsIntel()) {
    weights_upload_type_ = WeightsUploadType::LOCAL_MEM_BY_THREADS;
  } else if (gpu_info.IsAMD()) {
    weights_upload_type_ = WeightsUploadType::CONSTANT_MEM;
  } else {
    weights_upload_type_ = WeightsUploadType::GLOBAL_MEM;
  }
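  // Apple GPUs take the O4I4 weights layout (dot-product form of the CONV
  // macro below); all other GPUs take I4O4 (multiply-add form).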
  if (gpu_info.IsApple()) {
    weights_layout_ = WeightsLayout::kOICustomSpatialO4I4;
  } else {
    weights_layout_ = WeightsLayout::kOICustomSpatialI4O4;
  }
  code_ = GenerateConvolutionTransposedCode(gpu_info, definition_,
                                            weights_upload_type_, padding_,
                                            work_group_launch_order_);
  if (definition_.precision == CalculationsPrecision::F16 &&
      gpu_info.IsPowerVR()) {
    compiler_options_.push_back(CompilerOptions::kClFastRelaxedMath);
  }
}

std::string ConvolutionTransposed3x3::GenerateConvolutionTransposedCode(
    const GpuInfo& gpu_info, const OperationDef& op_def,
    ConvolutionTransposed3x3::WeightsUploadType weights_upload_type,
    int2 padding, int3 work_group_launch_order) {
  auto src_desc = op_def.src_tensors[0];
  AddSrcTensor("src_tensor", src_desc);
  AddDstTensor("dst_tensor", op_def.dst_tensors[0]);

  if (op_def.src_tensors.size() == 2) {
    // dynamic weights
    BufferDescriptor desc;
    desc.element_type = op_def.src_tensors[1].GetDataType();
    desc.element_size = 4;
    desc.memory_type =
        weights_upload_type ==
                ConvolutionTransposed3x3::WeightsUploadType::CONSTANT_MEM
            ? MemoryType::CONSTANT
            : MemoryType::GLOBAL;
    AddSrcBuffer("weights", desc);
  }

  args_.AddInt("filter_offset");
  args_.AddInt("padding_x");
  args_.AddInt("padding_y");

  const bool need_local_mem =
      weights_upload_type ==
          ConvolutionTransposed3x3::WeightsUploadType::LOCAL_MEM_BY_THREADS ||
      weights_upload_type ==
          ConvolutionTransposed3x3::WeightsUploadType::LOCAL_MEM_ASYNC;

  std::string c;
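  // CONV adds one 3x3 tap's contribution to an accumulator: with I4O4 weights
  // each source channel scales a whole FLT4 row of weights (FMA-friendly);
  // with O4I4 each output channel is a dot product of the source with one row.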
  if (GetWeightsDescription().IsI4O4()) {
    switch (op_def.precision) {
      case CalculationsPrecision::F32:
      case CalculationsPrecision::F16:
        c += "#define CONV(R, SRC, F) \\\n";
        c += "  R += SRC.x * weights_cache[F]; \\\n";
        c += "  R += SRC.y * weights_cache[F + 1]; \\\n";
        c += "  R += SRC.z * weights_cache[F + 2]; \\\n";
        c += "  R += SRC.w * weights_cache[F + 3]; \n";
        break;
      case CalculationsPrecision::F32_F16:
        c += "#define CONV(R, SRC, F) \\\n";
        c += "  R += TO_ACCUM_TYPE(SRC.x * weights_cache[F] + SRC.y * "
             "weights_cache[F + 1] + SRC.z * weights_cache[F + 2] + SRC.w * "
             "weights_cache[F + 3]);\n";
        break;
    }
  } else {
    // O4I4
    c += "#define CONV(R, SRC, F) \\\n";
    c += "  R.x += dot(SRC, weights_cache[F]); \\\n";
    c += "  R.y += dot(SRC, weights_cache[F + 1]); \\\n";
    c += "  R.z += dot(SRC, weights_cache[F + 2]); \\\n";
    c += "  R.w += dot(SRC, weights_cache[F + 3]); \n";
  }

  const int wg_total_size =
      work_group_size_.x * work_group_size_.y * work_group_size_.z;
  const std::string barrier =
      wg_total_size == 32 && gpu_info.IsWaveSizeEqualTo32()
          ? "SIMD_LOCAL_MEM_BARRIER"
          : "LOCAL_MEM_BARRIER";
  const std::string weights_space =
      weights_upload_type ==
              ConvolutionTransposed3x3::WeightsUploadType::CONSTANT_MEM
          ? "__constant"
          : "__global";

  if (gpu_info.IsApiOpenCl()) {
    c += "__attribute__((reqd_work_group_size(8, 4, 1)))\n";
  }
  c += "MAIN_FUNCTION($0) {\n";
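  // launch_remap is the inverse of work_group_launch_order: it tells which
  // hardware group-id dimension carries each logical grid axis.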
  int3 launch_remap;
  launch_remap[work_group_launch_order.x] = 0;
  launch_remap[work_group_launch_order.y] = 1;
  launch_remap[work_group_launch_order.z] = 2;
  auto GetGlobalID = [&](int id) {
    const std::string sid = std::to_string(id);
    if (work_group_launch_order[id] == id) {
      return "GLOBAL_ID_" + sid;
    } else {
      return "GROUP_ID_" + std::to_string(launch_remap[id]) + " * GROUP_SIZE_" +
             sid + " + LOCAL_ID_" + sid;
    }
  };
  if (op_def.dst_tensors[0].HasAxis(Axis::BATCH)) {
    c += "  int linear_id = " + GetGlobalID(0) + ";\n";
    c += "  int X = linear_id / args.dst_tensor.Batch();\n";
    c += "  int B = linear_id % args.dst_tensor.Batch();\n";
    c += "  args.src_tensor.SetBatchRef(B);\n";
    c += "  args.dst_tensor.SetBatchRef(B);\n";
  } else {
    c += "  int X = " + GetGlobalID(0) + ";\n";
  }
  c += "  int DST_X = X * 2;\n";
  c += "  int SRC_X = X + args.padding_x;\n";
  c += "  int Y = " + GetGlobalID(1) + ";\n";
  c += "  int DST_Y = Y * 2;\n";
  c += "  int SRC_Y = Y + args.padding_y;\n";
  c += "  int Z = " + GetGlobalID(2) + ";\n";
  if (!need_local_mem) {
    c += "  if (DST_X >= args.dst_tensor.Width() || DST_Y >= "
         "args.dst_tensor.Height() || Z >= args.dst_tensor.Slices()) return;\n";
  }
  c += "  ACCUM_FLT4 r0 = INIT_ACCUM_FLT4(0.0f);\n";
  c += "  ACCUM_FLT4 r1 = INIT_ACCUM_FLT4(0.0f);\n";
  c += "  ACCUM_FLT4 r2 = INIT_ACCUM_FLT4(0.0f);\n";
  c += "  ACCUM_FLT4 r3 = INIT_ACCUM_FLT4(0.0f);\n";
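  // filter_offset == 36 * src_slices (set in BindArguments), so f_offset
  // points at the weights for output slice Z; it advances by 36 FLT4 values
  // after every source slice.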
  c += "  int f_offset = Z * args.filter_offset;\n";
  if (need_local_mem) {
    c += "  __local FLT4 weights_cache[36];\n";
  }
  if (weights_upload_type ==
      ConvolutionTransposed3x3::WeightsUploadType::LOCAL_MEM_BY_THREADS) {
    c += "  int local_id = LOCAL_ID_1 * 8 + LOCAL_ID_0;\n";
  }
  if (!src_desc.SupportsZeroClamp(Axis::WIDTH, gpu_info)) {
    c += "  bool in_x0 = SRC_X >= 0 && SRC_X < args.src_tensor.Width();\n";
    c += "  bool in_x1 = SRC_X + 1 >= 0 && SRC_X + 1 < "
         "args.src_tensor.Width();\n";
  }
  if (!src_desc.SupportsZeroClamp(Axis::HEIGHT, gpu_info)) {
    c += "  bool in_y0 = SRC_Y >= 0 && SRC_Y < args.src_tensor.Height();\n";
    c += "  bool in_y1 = SRC_Y + 1 >= 0 && SRC_Y + 1 < "
         "args.src_tensor.Height();\n";
  }
  auto generate_check = [&](int x, int y) {
    std::string check;
    const std::vector<Axis> axes{Axis::WIDTH, Axis::HEIGHT};
    const std::vector<std::string> names{"in_x" + std::to_string(x),
                                         "in_y" + std::to_string(y)};
    for (int i = 0; i < axes.size(); ++i) {
      const auto& axis = axes[i];
      if (src_desc.HasAxis(axis) &&
          !src_desc.SupportsZeroClamp(axis, gpu_info)) {
        if (!check.empty()) {
          check += " && ";
        }
        check += names[i];
      }
    }
    return check;
  };
  if (src_desc.IsLinear()) {
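    // On GPUs where reading address -1 returns zero, out-of-bounds taps are
    // disabled by selecting address -1 and a zero slice stride.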
    if (src_desc.ReturnsZeroForNegOneRead(gpu_info)) {
      c += "  int addr_0 = args.src_tensor.GetAddress(SRC_X, SRC_Y, 0);\n";
      c += "  int addr_1 = args.src_tensor.GetAddress(SRC_X + 1, SRC_Y, 0);\n";
      c += "  int addr_2 = args.src_tensor.GetAddress(SRC_X, SRC_Y + 1, 0);\n";
      c += "  int addr_3 = args.src_tensor.GetAddress(SRC_X+1, SRC_Y+1, 0);\n";
      c += "  addr_0 = select(-1, addr_0, (in_x0 && in_y0));\n";
      c += "  addr_1 = select(-1, addr_1, (in_x1 && in_y0));\n";
      c += "  addr_2 = select(-1, addr_2, (in_x0 && in_y1));\n";
      c += "  addr_3 = select(-1, addr_3, (in_x1 && in_y1));\n";
      c += "  int dz_0 = select(0, args.src_tensor.SliceStride(), (in_x0 && "
           "in_y0));\n";
      c += "  int dz_1 = select(0, args.src_tensor.SliceStride(), (in_x1 && "
           "in_y0));\n";
      c += "  int dz_2 = select(0, args.src_tensor.SliceStride(), (in_x0 && "
           "in_y1));\n";
      c += "  int dz_3 = select(0, args.src_tensor.SliceStride(), (in_x1 && "
           "in_y1));\n";
    } else {
      c += "  int xc0 = clamp(SRC_X, 0, args.src_tensor.Width() - 1);\n";
      c += "  int xc1 = clamp(SRC_X + 1, 0, args.src_tensor.Width() - 1);\n";
      c += "  int yc0 = clamp(SRC_Y, 0, args.src_tensor.Height() - 1);\n";
      c += "  int yc1 = clamp(SRC_Y + 1, 0, args.src_tensor.Height() - 1);\n";
      c += "  int addr_0 = args.src_tensor.GetAddress(xc0, yc0, 0);\n";
      c += "  int addr_1 = args.src_tensor.GetAddress(xc1, yc0, 0);\n";
      c += "  int addr_2 = args.src_tensor.GetAddress(xc0, yc1, 0);\n";
      c += "  int addr_3 = args.src_tensor.GetAddress(xc1, yc1, 0);\n";
      c += "  int dz = args.src_tensor.SliceStride();\n";
    }
  }
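  // Emits the code for one source read: linear tensors use the precomputed
  // addresses above; otherwise reads are coordinate-based with an optional
  // boundary multiplier produced by generate_check.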
  auto read_src = [&](int x, int y) {
    if (src_desc.IsLinear()) {
      const std::string id = std::to_string(y * 2 + x);
      const std::string addr = "addr_" + std::to_string(y * 2 + x);
      if (src_desc.ReturnsZeroForNegOneRead(gpu_info)) {
        return "args.src_tensor.Read(" + addr + "); " + addr + " += dz_" + id +
               ";\n";
      } else {
        return "args.src_tensor.Read(" + addr + ") * INIT_FLT(in_x" +
               std::to_string(x) + " && in_y" + std::to_string(y) + "); " +
               addr + " += dz;\n";
      }
    } else {
      std::string check = generate_check(x, y);
      if (!check.empty()) {
        check = " * INIT_FLT(" + check + ")";
      }
      return "args.src_tensor.Read(SRC_X + " + std::to_string(x) +
             ", SRC_Y + " + std::to_string(y) + ", s)" + check + ";\n";
    }
  };
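  // Each pair maps one of the 9 filter taps to (accumulator r0..r3, source
  // src0..src3); which mapping applies depends on the padding parity in x/y.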
  const int padding_x_rem = abs(padding.x) % 2;
  const int padding_y_rem = abs(padding.y) % 2;
  std::vector<std::pair<int, int>> permutation;
  if (padding_x_rem == 1 && padding_y_rem == 1) {
    permutation = {{0, 0}, {1, 0}, {1, 1}, {2, 0}, {2, 2},
                   {3, 0}, {3, 1}, {3, 2}, {3, 3}};
  } else if (padding_x_rem == 0 && padding_y_rem == 1) {
    permutation = {{0, 0}, {0, 1}, {1, 1}, {2, 0}, {2, 1},
                   {2, 2}, {2, 3}, {3, 1}, {3, 3}};
  } else if (padding_x_rem == 1 && padding_y_rem == 0) {
    permutation = {{0, 0}, {0, 2}, {1, 0}, {1, 1}, {1, 2},
                   {1, 3}, {2, 2}, {3, 2}, {3, 3}};
  } else {  // padding_x_rem == 0 && padding_y_rem == 0
    permutation = {{0, 0}, {0, 1}, {0, 2}, {0, 3}, {1, 1},
                   {1, 3}, {2, 2}, {2, 3}, {3, 3}};
  }
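  // Main loop over source slices: refill the 36-entry weights cache
  // (9 taps x 4 FLT4 rows), read the 2x2 source neighborhood, accumulate.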
  c += "  for (int s = 0; s < args.src_tensor.Slices(); ++s) {\n";
  if (need_local_mem) {
    c += "    " + barrier + ";\n";
  }
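  // With LOCAL_MEM_BY_THREADS the 8x4 = 32 work items cooperatively load the
  // 36 weight vectors: one each, plus a 4-element tail from the first four.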
  if (weights_upload_type ==
      ConvolutionTransposed3x3::WeightsUploadType::LOCAL_MEM_ASYNC) {
    c += "    async_work_group_copy(weights_cache, "
         "args.weights.GetPtr(f_offset), 36, "
         "0);\n";
  } else if (weights_upload_type ==
             ConvolutionTransposed3x3::WeightsUploadType::
                 LOCAL_MEM_BY_THREADS) {
    c += "    weights_cache[local_id] = args.weights.Read(f_offset + "
         "local_id);\n";
    c += "    if (local_id < 4) {\n";
    c += "      weights_cache[local_id + 32] = args.weights.Read(f_offset + "
         "local_id + "
         "32);\n";
    c += "    };\n";
  } else {  // GLOBAL_MEM/CONSTANT_MEM
    c += "    " + weights_space +
         " FLT4* weights_cache = args.weights.GetPtr(f_offset);\n";
  }
  c += "    FLT4 src0 = " + read_src(0, 0);
  c += "    FLT4 src1 = " + read_src(1, 0);
  c += "    FLT4 src2 = " + read_src(0, 1);
  c += "    FLT4 src3 = " + read_src(1, 1);
  c += "    f_offset += 36;\n";
  if (need_local_mem) {
    c += "    " + barrier + ";\n";
  }
  for (int i = 0; i < 9; ++i) {
    const std::string r_name = "r" + std::to_string(permutation[i].first);
    const std::string s_name = "src" + std::to_string(permutation[i].second);
    const std::string w_name = std::to_string(i * 4);
    c += "    CONV(" + r_name + ", " + s_name + ", " + w_name + ");\n";
  }
  c += "  }\n";
  if (need_local_mem) {
    c += "  if (DST_X >= args.dst_tensor.Width() || DST_Y >= "
         "args.dst_tensor.Height() || Z >= args.dst_tensor.Slices()) return;\n";
  }
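  // Write the 2x2 output patch with bias added; each pixel is bounds-checked
  // individually since the output extent may be odd.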
  c += "  FLT4 bias_val = args.biases.Read(Z);\n";
  for (int y = 0; y < 2; ++y) {
    for (int x = 0; x < 2; ++x) {
      const std::string s_x = std::to_string(x);
      const std::string s_y = std::to_string(y);
      const std::string id = std::to_string(y * 2 + x);
      const std::string x_c = "DST_X + " + s_x;
      const std::string y_c = "DST_Y + " + s_y;
      c += "  if (" + x_c + " < args.dst_tensor.Width() && " + y_c +
           " < args.dst_tensor.Height()) {\n";
      c += "    FLT4 res0 = TO_FLT4(r" + id + ") + bias_val;\n";
      c += "    args.dst_tensor.Write(res0, " + x_c + ", " + y_c + ", Z);\n";
      c += "  }\n";
    }
  }
  c += "}\n";
  return c;
}

absl::Status ConvolutionTransposed3x3::BindArguments(ArgumentsBinder* args) {
  RETURN_IF_ERROR(args->SetInt("filter_offset", 4 * 9 * src_[0]->Slices()));
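  // Convert the prepended transposed-conv padding into the source-space shift
  // applied to SRC_X/SRC_Y: floor((padding - 1) / 2) with flooring (rather
  // than truncating) integer division.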
  const int padding_x =
      padding_.x >= 1 ? (padding_.x - 1) / 2 : (padding_.x - 2) / 2;
  const int padding_y =
      padding_.y >= 1 ? (padding_.y - 1) / 2 : (padding_.y - 2) / 2;
  RETURN_IF_ERROR(args->SetInt("padding_x", padding_x));
  return args->SetInt("padding_y", padding_y);
}

void ConvolutionTransposed3x3::GetPossibleKernelWorkGroups(
    TuningType tuning_type, const GpuInfo& gpu_info,
    const KernelInfo& kernel_info, std::vector<int3>* work_groups) const {
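  // The local-memory upload paths assume the fixed 8x4x1 work group, so
  // work-group tuning is disabled for them.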
  if (weights_upload_type_ == WeightsUploadType::LOCAL_MEM_ASYNC ||
      weights_upload_type_ == WeightsUploadType::LOCAL_MEM_BY_THREADS) {
    work_groups->push_back(work_group_size_);
    return;
  }
  GetPossibleWorkGroupsConv(tuning_type, gpu_info, kernel_info, grid_size_,
                            work_groups);
}

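// Each work item produces a 2x2 output patch, so the grid is half the output
// width (rounded up, times batch) and half the output height, by slices.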
int3 ConvolutionTransposed3x3::GetGridSize() const {
  const int grid_x = DivideRoundUp(dst_[0]->Width(), 2) * dst_[0]->Batch();
  const int grid_y = DivideRoundUp(dst_[0]->Height(), 2);
  const int grid_z = dst_[0]->Slices();
  return int3(grid_x, grid_y, grid_z);
}

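// Spatial order in which the 9 filter taps are stored in the weights buffer;
// it mirrors the permutation tables in the generated code so that tap i reads
// the filter position matching the current padding parity.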
std::vector<int> ConvolutionTransposed3x3::GetSpatialWeightsRemap() const {
  const int padding_x_rem = abs(padding_.x) % 2;
  const int padding_y_rem = abs(padding_.y) % 2;

  if (padding_x_rem == 1 && padding_y_rem == 1) {
    return std::vector<int>{4, 5, 3, 7, 1, 8, 6, 2, 0};
  } else if (padding_x_rem == 0 && padding_y_rem == 1) {
    return std::vector<int>{5, 3, 4, 8, 6, 2, 0, 7, 1};
  } else if (padding_x_rem == 1 && padding_y_rem == 0) {
    return std::vector<int>{7, 1, 8, 6, 2, 0, 4, 5, 3};
  } else {  // padding_x_rem == 0 && padding_y_rem == 0
    return std::vector<int>{8, 6, 2, 0, 7, 1, 5, 3, 4};
  }
}

void ConvolutionTransposed3x3::UploadWeights(
    const tflite::gpu::Tensor<OHWI, DataType::FLOAT32>& weights) {
  const auto weights_desc = GetWeightsDescription();
  const int flt_count =
      GetTotalElementsCountForLayout(weights_desc, weights.shape);

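  // The buffer stores FLT4 vectors (element_size == 4); it lands in constant
  // memory only when the CONSTANT_MEM upload path was selected.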
  BufferDescriptor desc;
  desc.element_type = weights_desc.type;
  desc.element_size = 4;
  desc.memory_type =
      weights_upload_type_ ==
              ConvolutionTransposed3x3::WeightsUploadType::CONSTANT_MEM
          ? MemoryType::CONSTANT
          : MemoryType::GLOBAL;
  desc.size = flt_count * SizeOf(desc.element_type);
  desc.data.resize(desc.size);

  RearrangeWeights(weights, weights_desc, absl::MakeSpan(desc.data));

  args_.AddObject("weights",
                  std::make_unique<BufferDescriptor>(std::move(desc)));
}

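// This specialization handles only 3x3 filters with stride 2 in both spatial
// dimensions.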
bool IsConvolutionTransposed3x3Supported(
    const OperationDef& definition,
    const ConvolutionTransposedAttributes& attr) {
  return attr.weights.shape.w == 3 && attr.weights.shape.h == 3 &&
         attr.stride.w == 2 && attr.stride.h == 2;
}

ConvolutionTransposed3x3 CreateConvolutionTransposed3x3(
    const GpuInfo& gpu_info, const OperationDef& definition,
    const ConvolutionTransposedAttributes& attr) {
  const int2 padding = int2(attr.padding.prepended.w, attr.padding.prepended.h);
  ConvolutionTransposed3x3 result(definition, gpu_info, padding);
  result.UploadWeights(attr.weights);

  TensorDescriptor bias_tensor_desc = CreateConstantLinearTensorDescriptor(
      gpu_info, definition.src_tensors[0].GetDataType(), attr.bias);
  result.args_.AddObject("biases", std::make_unique<TensorDescriptor>(
                                       std::move(bias_tensor_desc)));
  return result;
}

ConvolutionTransposed3x3 CreateConvolutionTransposed3x3DynamicWeights(
    const GpuInfo& gpu_info, const OperationDef& definition,
    const ConvolutionTransposedAttributes& attr) {
  OperationDef new_def = definition;
  // Keep only the source tensor definition; the weights definition is
  // appended below.
  new_def.src_tensors = {definition.src_tensors[0]};
  const DataType weights_type = definition.GetDataType();
  // Add one src tensor (buffer) for the dynamic weights.
  new_def.src_tensors.push_back(
      {weights_type, TensorStorageType::BUFFER, Layout::HWC});

  const int2 padding = int2(attr.padding.prepended.w, attr.padding.prepended.h);
  ConvolutionTransposed3x3 result(new_def, gpu_info, padding);

  TensorDescriptor bias_tensor_desc = CreateConstantLinearTensorDescriptor(
      gpu_info, definition.src_tensors[0].GetDataType(), attr.bias);
  result.args_.AddObject("biases", std::make_unique<TensorDescriptor>(
                                       std::move(bias_tensor_desc)));
  return result;
}

}  // namespace gpu
}  // namespace tflite