/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#include "tensorflow/lite/delegates/gpu/common/tasks/convolution_transposed_4x4.h"

#include <memory>
#include <string>
#include <utility>
#include <vector>

#include "tensorflow/lite/delegates/gpu/common/task/work_group_picking.h"

namespace tflite {
namespace gpu {

namespace {
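// Picks the weights upload strategy for the generated kernel by GPU vendor:
// an async local-memory copy on PowerVR, cooperative loads into local memory
// on Nvidia/Intel and pre-Bionic Apple GPUs, constant memory on AMD, and
// plain global memory everywhere else.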
ConvolutionTransposed4x4::WeightsUploadType GetBestWeightsUploadType(
    const GpuInfo& gpu_info) {
  ConvolutionTransposed4x4::WeightsUploadType weights_upload_type =
      ConvolutionTransposed4x4::WeightsUploadType::GLOBAL_MEM;
  if (gpu_info.IsApple()) {
    if (gpu_info.apple_info.IsBionic()) {
      weights_upload_type =
          ConvolutionTransposed4x4::WeightsUploadType::GLOBAL_MEM;
    } else {
      weights_upload_type =
          ConvolutionTransposed4x4::WeightsUploadType::LOCAL_MEM_BY_THREADS;
    }
  } else if (gpu_info.IsPowerVR()) {
    weights_upload_type =
        ConvolutionTransposed4x4::WeightsUploadType::LOCAL_MEM_ASYNC;
  } else if (gpu_info.IsNvidia() || gpu_info.IsIntel()) {
    weights_upload_type =
        ConvolutionTransposed4x4::WeightsUploadType::LOCAL_MEM_BY_THREADS;
  } else if (gpu_info.IsAMD()) {
    weights_upload_type =
        ConvolutionTransposed4x4::WeightsUploadType::CONSTANT_MEM;
  } else {
    weights_upload_type =
        ConvolutionTransposed4x4::WeightsUploadType::GLOBAL_MEM;
  }
  return weights_upload_type;
}
}  // namespace

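// The kernel is specialized for a fixed 8x4x1 work group; each invocation
// computes a 2x2 block of output pixels in one destination slice. Apple GPUs
// use a permuted work group launch order and the O4I4 weights layout; all
// other vendors use I4O4.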
ConvolutionTransposed4x4::ConvolutionTransposed4x4(
    const OperationDef& definition, const GpuInfo& gpu_info)
    : GPUOperation(definition) {
  work_group_size_ = int3(8, 4, 1);
  if (gpu_info.IsApple()) {
    work_group_launch_order_ = int3(2, 0, 1);
  }

  if (gpu_info.IsApple()) {
    weights_layout_ = WeightsLayout::kOICustomSpatialO4I4;
  } else {
    weights_layout_ = WeightsLayout::kOICustomSpatialI4O4;
  }

  code_ = GenerateConvolutionTransposedCode(gpu_info, definition_,
                                            GetBestWeightsUploadType(gpu_info));
  if (definition_.precision == CalculationsPrecision::F16 &&
      gpu_info.IsPowerVR()) {
    compiler_options_.push_back(CompilerOptions::kClFastRelaxedMath);
  }
}

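// Emits the device kernel source. Per source slice, the generated code reads
// a 2x2 neighborhood of input texels, multiplies them against 64 cached FLT4
// weight values (16 groups of 4), and accumulates four output pixels r0..r3
// that are written out with bias at the end.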
std::string ConvolutionTransposed4x4::GenerateConvolutionTransposedCode(
    const GpuInfo& gpu_info, const OperationDef& op_def,
    WeightsUploadType weights_upload_type) {
  auto src_desc = op_def.src_tensors[0];
  AddSrcTensor("src_tensor", src_desc);
  AddDstTensor("dst_tensor", op_def.dst_tensors[0]);

  if (op_def.src_tensors.size() == 2) {
    // dynamic weights
    BufferDescriptor desc;
    desc.element_type = op_def.src_tensors[1].GetDataType();
    desc.element_size = 4;
    desc.memory_type =
        weights_upload_type ==
                ConvolutionTransposed4x4::WeightsUploadType::CONSTANT_MEM
            ? MemoryType::CONSTANT
            : MemoryType::GLOBAL;
    AddSrcBuffer("weights", desc);
  }

  args_.AddInt("filter_offset");

  const bool need_local_mem =
      weights_upload_type ==
          ConvolutionTransposed4x4::WeightsUploadType::LOCAL_MEM_BY_THREADS ||
      weights_upload_type ==
          ConvolutionTransposed4x4::WeightsUploadType::LOCAL_MEM_ASYNC;

  const int wg_total_size =
      work_group_size_.x * work_group_size_.y * work_group_size_.z;
  const std::string barrier =
      wg_total_size == 32 && gpu_info.IsWaveSizeEqualTo32()
          ? "SIMD_LOCAL_MEM_BARRIER"
          : "LOCAL_MEM_BARRIER";

  std::string c;
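  // Define the CONV macro used by the inner loop. With the I4O4 layout each
  // source channel is multiplied by a full FLT4 weight vector; the F32_F16
  // variant folds the four multiply-adds into a single expression evaluated
  // in FLT precision before widening to the accumulator type. The F32/F16
  // branch, for example, expands to:
  //   #define CONV(R, SRC, F) \
  //     R += SRC.x * weights_cache[F]; \
  //     R += SRC.y * weights_cache[F + 1]; \
  //     R += SRC.z * weights_cache[F + 2]; \
  //     R += SRC.w * weights_cache[F + 3];
  // The O4I4 layout instead computes one dot product per output channel.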
  if (GetWeightsDescription().IsI4O4()) {
    switch (op_def.precision) {
      case CalculationsPrecision::F32:
      case CalculationsPrecision::F16:
        c += "#define CONV(R, SRC, F) \\\n";
        c += "  R += SRC.x * weights_cache[F]; \\\n";
        c += "  R += SRC.y * weights_cache[F + 1]; \\\n";
        c += "  R += SRC.z * weights_cache[F + 2]; \\\n";
        c += "  R += SRC.w * weights_cache[F + 3];\n";
        break;
      case CalculationsPrecision::F32_F16:
        c += "#define CONV(R, SRC, F) \\\n";
        c += "  R += TO_ACCUM_TYPE(SRC.x * weights_cache[F] + SRC.y * "
             "weights_cache[F + 1] + SRC.z * weights_cache[F + 2] + SRC.w * "
             "weights_cache[F + 3]);\n";
        break;
    }
  } else {
    // O4I4
    c += "#define CONV(R, SRC, F) \\\n";
    c += "  R.x += dot(SRC, weights_cache[F]); \\\n";
    c += "  R.y += dot(SRC, weights_cache[F + 1]); \\\n";
    c += "  R.z += dot(SRC, weights_cache[F + 2]); \\\n";
    c += "  R.w += dot(SRC, weights_cache[F + 3]);\n";
  }

  const std::string weights_space =
      weights_upload_type ==
              ConvolutionTransposed4x4::WeightsUploadType::CONSTANT_MEM
          ? "__constant"
          : "__global";

  if (gpu_info.IsApiOpenCl()) {
    c += "__attribute__((reqd_work_group_size(8, 4, 1)))\n";
  }
  c += "MAIN_FUNCTION($0) {\n";
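  // When the work group launch order is permuted (the Apple path set up in
  // the constructor), GLOBAL_ID_i cannot be used directly; the global
  // coordinate is rebuilt from the remapped group id and the local id.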
  std::string grid_coords[3];
  int3 launch_remap;
  launch_remap[work_group_launch_order_.x] = 0;
  launch_remap[work_group_launch_order_.y] = 1;
  launch_remap[work_group_launch_order_.z] = 2;
  if (work_group_launch_order_[0] == 0) {
    grid_coords[0] = "GLOBAL_ID_0";
  } else {
    grid_coords[0] = "(GROUP_ID_" + std::to_string(launch_remap[0]) +
                     " * GROUP_SIZE_0 + LOCAL_ID_0)";
  }
  if (work_group_launch_order_[1] == 1) {
    grid_coords[1] = "GLOBAL_ID_1";
  } else {
    grid_coords[1] = "(GROUP_ID_" + std::to_string(launch_remap[1]) +
                     " * GROUP_SIZE_1 + LOCAL_ID_1)";
  }
  if (work_group_launch_order_[2] == 2) {
    grid_coords[2] = "GLOBAL_ID_2";
  } else {
    grid_coords[2] = "(GROUP_ID_" + std::to_string(launch_remap[2]) +
                     " * GROUP_SIZE_2 + LOCAL_ID_2)";
  }
  if (op_def.dst_tensors[0].HasAxis(Axis::BATCH)) {
    c += "  int linear_id = " + grid_coords[0] + ";\n";
    c += "  int X = linear_id / args.dst_tensor.Batch();\n";
    c += "  int B = linear_id % args.dst_tensor.Batch();\n";
    c += "  args.src_tensor.SetBatchRef(B);\n";
    c += "  args.dst_tensor.SetBatchRef(B);\n";
  } else {
    c += "  int X = " + grid_coords[0] + ";\n";
  }
  c += "  int Y = " + grid_coords[1] + ";\n";
  c += "  int Z = " + grid_coords[2] + ";\n";
  if (!need_local_mem) {
    c += "  if (X * 2 > args.dst_tensor.Width() || Y * 2 > "
         "args.dst_tensor.Height() || Z >= args.dst_tensor.Slices()) "
         "return;\n";
  }
  c += "  ACCUM_FLT4 r0 = INIT_ACCUM_FLT4(0.0f);\n";
  c += "  ACCUM_FLT4 r1 = INIT_ACCUM_FLT4(0.0f);\n";
  c += "  ACCUM_FLT4 r2 = INIT_ACCUM_FLT4(0.0f);\n";
  c += "  ACCUM_FLT4 r3 = INIT_ACCUM_FLT4(0.0f);\n";
  c += "  int f_offset = Z * args.filter_offset;\n";
  if (need_local_mem) {
    c += "  __local FLT4 weights_cache[64];\n";
  }
  if (weights_upload_type ==
      ConvolutionTransposed4x4::WeightsUploadType::LOCAL_MEM_BY_THREADS) {
    c += "  int local_id = LOCAL_ID_1 * 8 + LOCAL_ID_0;\n";
  }
  if (!src_desc.SupportsZeroClamp(Axis::WIDTH, gpu_info)) {
    c += "  bool in_x0 = X - 1 >= 0 && X - 1 < args.src_tensor.Width();\n";
    c += "  bool in_x1 = X >= 0 && X < args.src_tensor.Width();\n";
  }
  if (!src_desc.SupportsZeroClamp(Axis::HEIGHT, gpu_info)) {
    c += "  bool in_y0 = Y - 1 >= 0 && Y - 1 < args.src_tensor.Height();\n";
    c += "  bool in_y1 = Y >= 0 && Y < args.src_tensor.Height();\n";
  }
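  // generate_check() builds the boundary predicate for the tap at offset
  // (x - 1, y - 1); only axes without free zero-clamped reads need an
  // explicit in-bounds flag.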
  auto generate_check = [&](int x, int y) {
    std::string check;
    const std::vector<Axis> axes{Axis::WIDTH, Axis::HEIGHT};
    const std::vector<std::string> names{"in_x" + std::to_string(x),
                                         "in_y" + std::to_string(y)};
    for (int i = 0; i < axes.size(); ++i) {
      const auto& axis = axes[i];
      if (src_desc.HasAxis(axis) &&
          !src_desc.SupportsZeroClamp(axis, gpu_info)) {
        if (!check.empty()) {
          check += " && ";
        }
        check += names[i];
      }
    }
    return check;
  };
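  // For linearly addressable tensors the four addresses of the 2x2 input
  // neighborhood are precomputed once. Where reading address -1 returns zero,
  // out-of-bounds taps are masked by select()ing a -1 address and a zero
  // slice stride; otherwise coordinates are clamped and the loaded value is
  // zeroed by multiplying with the boundary flag.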
  if (src_desc.IsLinear()) {
    if (src_desc.ReturnsZeroForNegOneRead(gpu_info)) {
      c += "  int addr_0 = args.src_tensor.GetAddress(X - 1, Y - 1, 0);\n";
      c += "  int addr_1 = args.src_tensor.GetAddress(X, Y - 1, 0);\n";
      c += "  int addr_2 = args.src_tensor.GetAddress(X - 1, Y, 0);\n";
      c += "  int addr_3 = args.src_tensor.GetAddress(X, Y, 0);\n";
      c += "  addr_0 = select(-1, addr_0, (in_x0 && in_y0));\n";
      c += "  addr_1 = select(-1, addr_1, (in_x1 && in_y0));\n";
      c += "  addr_2 = select(-1, addr_2, (in_x0 && in_y1));\n";
      c += "  addr_3 = select(-1, addr_3, (in_x1 && in_y1));\n";
      c += "  int dz_0 = select(0, args.src_tensor.SliceStride(), (in_x0 && "
           "in_y0));\n";
      c += "  int dz_1 = select(0, args.src_tensor.SliceStride(), (in_x1 && "
           "in_y0));\n";
      c += "  int dz_2 = select(0, args.src_tensor.SliceStride(), (in_x0 && "
           "in_y1));\n";
      c += "  int dz_3 = select(0, args.src_tensor.SliceStride(), (in_x1 && "
           "in_y1));\n";
    } else {
      c += "  int xc0 = clamp(X - 1, 0, args.src_tensor.Width() - 1);\n";
      c += "  int xc1 = clamp(X, 0, args.src_tensor.Width() - 1);\n";
      c += "  int yc0 = clamp(Y - 1, 0, args.src_tensor.Height() - 1);\n";
      c += "  int yc1 = clamp(Y, 0, args.src_tensor.Height() - 1);\n";
      c += "  int addr_0 = args.src_tensor.GetAddress(xc0, yc0, 0);\n";
      c += "  int addr_1 = args.src_tensor.GetAddress(xc1, yc0, 0);\n";
      c += "  int addr_2 = args.src_tensor.GetAddress(xc0, yc1, 0);\n";
      c += "  int addr_3 = args.src_tensor.GetAddress(xc1, yc1, 0);\n";
      c += "  int dz = args.src_tensor.SliceStride();\n";
    }
  }
  auto read_src = [&](int x, int y) {
    if (src_desc.IsLinear()) {
      const std::string id = std::to_string(y * 2 + x);
      const std::string addr = "addr_" + std::to_string(y * 2 + x);
      if (src_desc.ReturnsZeroForNegOneRead(gpu_info)) {
        return "args.src_tensor.Read(" + addr + "); " + addr + " += dz_" + id +
               ";";
      } else {
        return "args.src_tensor.Read(" + addr + ") * INIT_FLT(in_x" +
               std::to_string(x) + " && in_y" + std::to_string(y) + "); " +
               addr + " += dz;";
      }
    } else {
      std::string check = generate_check(x, y);
      if (!check.empty()) {
        check = " * INIT_FLT(" + check + ")";
      }
      return "args.src_tensor.Read(X + " + std::to_string(x - 1) + ", Y + " +
             std::to_string(y - 1) + ", s)" + check + ";";
    }
  };
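  // Main loop over source slices. Each iteration first makes this slice's 64
  // weight values visible through weights_cache: via an async work group
  // copy, a cooperative load where each of the 32 invocations reads two
  // values, or simply a pointer into global/constant memory.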
  c += "  for (int s = 0; s < args.src_tensor.Slices(); ++s) {\n";
  if (need_local_mem) {
    c += "    " + barrier + ";\n";
  }
  if (weights_upload_type ==
      ConvolutionTransposed4x4::WeightsUploadType::LOCAL_MEM_ASYNC) {
    c += "    async_work_group_copy(weights_cache, "
         "args.weights.GetPtr(f_offset), 64, 0);\n";
  } else if (weights_upload_type ==
             ConvolutionTransposed4x4::WeightsUploadType::
                 LOCAL_MEM_BY_THREADS) {
    c += "    weights_cache[local_id] = args.weights.Read(f_offset + "
         "local_id);\n";
    c += "    weights_cache[local_id + 32] = args.weights.Read(f_offset + "
         "local_id + 32);\n";
  } else {  // GLOBAL_MEM or CONSTANT_MEM
    c += "    " + weights_space +
         " FLT4* weights_cache = args.weights.GetPtr(f_offset);\n";
  }
  // read_src() already emits the trailing semicolons, so only a newline is
  // appended here.
  c += "    FLT4 src0 = " + read_src(0, 0) + "\n";
  c += "    FLT4 src1 = " + read_src(1, 0) + "\n";
  c += "    FLT4 src2 = " + read_src(0, 1) + "\n";
  c += "    FLT4 src3 = " + read_src(1, 1) + "\n";
  c += "    f_offset += 64;\n";
  if (need_local_mem) {
    c += "    " + barrier + ";\n";
  }
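  // 16 multiply-accumulate steps: each of the four input texels contributes
  // to all four output pixels, consuming consecutive groups of four FLT4
  // weights (4 texels * 4 outputs * 4 weights = 64 values per slice).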
  c += "    CONV(r0, src0, 0);\n";
  c += "    CONV(r1, src0, 4);\n";
  c += "    CONV(r2, src0, 8);\n";
  c += "    CONV(r3, src0, 12);\n";
  c += "    CONV(r0, src1, 16);\n";
  c += "    CONV(r1, src1, 20);\n";
  c += "    CONV(r2, src1, 24);\n";
  c += "    CONV(r3, src1, 28);\n";
  c += "    CONV(r0, src2, 32);\n";
  c += "    CONV(r1, src2, 36);\n";
  c += "    CONV(r2, src2, 40);\n";
  c += "    CONV(r3, src2, 44);\n";
  c += "    CONV(r0, src3, 48);\n";
  c += "    CONV(r1, src3, 52);\n";
  c += "    CONV(r2, src3, 56);\n";
  c += "    CONV(r3, src3, 60);\n";
  c += "  }\n";
  c += "\n";
  if (need_local_mem) {
    c += "  if (X * 2 > args.dst_tensor.Width() || Y * 2 > "
         "args.dst_tensor.Height() || Z >= args.dst_tensor.Slices()) "
         "return;\n";
  }
  c += "  X = X * 2 - 1;\n";
  c += "  Y = Y * 2 - 1;\n";
  c += "\n";
  c += "  FLT4 bias_val = args.biases.Read(Z);\n";
  c += "  if (X >= 0 && Y >= 0) {\n";
  c += "    FLT4 result = TO_FLT4(r0) + bias_val;\n";
  c += "    args.dst_tensor.Write(result, X, Y, Z);\n";
  c += "  }\n";
  c += "  if (X + 1 < args.dst_tensor.Width() && Y >= 0) {\n";
  c += "    FLT4 result = TO_FLT4(r1) + bias_val;\n";
  c += "    args.dst_tensor.Write(result, X + 1, Y, Z);\n";
  c += "  }\n";
  c += "  if (X >= 0 && Y + 1 < args.dst_tensor.Height()) {\n";
  c += "    FLT4 result = TO_FLT4(r2) + bias_val;\n";
  c += "    args.dst_tensor.Write(result, X, Y + 1, Z);\n";
  c += "  }\n";
  c += "  if (X + 1 < args.dst_tensor.Width() && Y + 1 < "
       "args.dst_tensor.Height()) {\n";
  c += "    FLT4 result = TO_FLT4(r3) + bias_val;\n";
  c += "    args.dst_tensor.Write(result, X + 1, Y + 1, Z);\n";
  c += "  }\n";
  c += "}\n";
  return c;
}

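// Each destination slice consumes 64 FLT4 weight values (16 groups of 4) per
// source slice, so consecutive Z slices are 4 * 16 * Slices() elements apart.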
absl::Status ConvolutionTransposed4x4::BindArguments(ArgumentsBinder* args) {
  return args->SetInt("filter_offset", 4 * 16 * src_[0]->Slices());
}

int3 ConvolutionTransposed4x4::GetGridSize() const {
  // Each invocation writes a 2x2 block anchored at (2X - 1, 2Y - 1), so the
  // grid needs one extra column/row: ceil((size + 2) / 2).
  const int grid_x = DivideRoundUp(dst_[0]->Width() + 2, 2) * dst_[0]->Batch();
  const int grid_y = DivideRoundUp(dst_[0]->Height() + 2, 2);
  const int grid_z = dst_[0]->Slices();
  return int3(grid_x, grid_y, grid_z);
}

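// Reorders the 16 spatial taps of the 4x4 kernel into the order consumed by
// the generated CONV sequence; the grouping reflects the four stride-2
// output phases that each tap feeds.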
std::vector<int> ConvolutionTransposed4x4::GetSpatialWeightsRemap() const {
  return std::vector<int>{10, 11, 14, 15, 8, 9, 12, 13, 2, 3, 6, 7, 0, 1, 4, 5};
}

void ConvolutionTransposed4x4::UploadWeights(
    const tflite::gpu::Tensor<OHWI, DataType::FLOAT32>& weights,
    WeightsUploadType weights_upload_type) {
  const auto weights_desc = GetWeightsDescription();
  const int flt_count =
      GetTotalElementsCountForLayout(weights_desc, weights.shape);

  BufferDescriptor desc;
  desc.element_type = weights_desc.type;
  desc.element_size = 4;
  desc.memory_type =
      weights_upload_type ==
              ConvolutionTransposed4x4::WeightsUploadType::CONSTANT_MEM
          ? MemoryType::CONSTANT
          : MemoryType::GLOBAL;
  desc.size = flt_count * SizeOf(desc.element_type);
  desc.data.resize(desc.size);

  RearrangeWeights(weights, weights_desc, absl::MakeSpan(desc.data));
  args_.AddObject("weights",
                  std::make_unique<BufferDescriptor>(std::move(desc)));
}

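// This specialization only covers the exact 4x4 kernel, stride 2, prepended
// padding 1 configuration; other shapes are handled by the more general
// transposed convolution kernels.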
bool IsConvolutionTransposed4x4Supported(
    const OperationDef& definition,
    const ConvolutionTransposedAttributes& attr) {
  return attr.weights.shape.w == 4 && attr.weights.shape.h == 4 &&
         attr.stride.w == 2 && attr.stride.h == 2 &&
         attr.padding.prepended.w == 1 && attr.padding.prepended.h == 1;
}

ConvolutionTransposed4x4 CreateConvolutionTransposed4x4(
    const GpuInfo& gpu_info, const OperationDef& definition,
    const ConvolutionTransposedAttributes& attr) {
  ConvolutionTransposed4x4 result(definition, gpu_info);
  result.UploadWeights(attr.weights, GetBestWeightsUploadType(gpu_info));

  TensorDescriptor bias_tensor_desc = CreateConstantLinearTensorDescriptor(
      gpu_info, definition.src_tensors[0].GetDataType(), attr.bias);
  result.args_.AddObject("biases", std::make_unique<TensorDescriptor>(
                                       std::move(bias_tensor_desc)));
  return result;
}

ConvolutionTransposed4x4 CreateConvolutionTransposed4x4DynamicWeights(
    const GpuInfo& gpu_info, const OperationDef& definition,
    const ConvolutionTransposedAttributes& attr) {
  OperationDef new_def = definition;
  // Keep only the data tensor in src_tensors; the weights definition is
  // appended below.
  new_def.src_tensors = {definition.src_tensors[0]};
  const DataType weights_type = definition.GetDataType();
  // Add one source tensor (a buffer) for the dynamic weights.
  new_def.src_tensors.push_back(
      {weights_type, TensorStorageType::BUFFER, Layout::HWC});

  ConvolutionTransposed4x4 result(new_def, gpu_info);

  TensorDescriptor bias_tensor_desc = CreateConstantLinearTensorDescriptor(
      gpu_info, definition.src_tensors[0].GetDataType(), attr.bias);
  result.args_.AddObject("biases", std::make_unique<TensorDescriptor>(
                                       std::move(bias_tensor_desc)));
  return result;
}

}  // namespace gpu
}  // namespace tflite