/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#include "tensorflow/lite/delegates/gpu/common/tasks/special/depthwise_conv_plus_1x1_conv.h"

#include <algorithm>
#include <cstring>
#include <map>
#include <memory>
#include <set>
#include <string>
#include <utility>
#include <vector>

#include "absl/strings/str_replace.h"
#include "tensorflow/lite/delegates/gpu/common/flops_util.h"
#include "tensorflow/lite/delegates/gpu/common/task/util.h"
#include "tensorflow/lite/delegates/gpu/common/tasks/prelu.h"
#include "tensorflow/lite/delegates/gpu/common/tasks/relu.h"
#include "tensorflow/lite/delegates/gpu/common/util.h"

namespace tflite {
namespace gpu {
namespace {
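// Returns a multiply-accumulate statement for the generated shader code:
// "accum = fma(a, b, accum)" on AMD with the OpenCL API, a plain
// "accum += a * b" otherwise.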
std::string MultiplyAccumulate(const GpuInfo& gpu_info,
                               const std::string& accum, const std::string& a,
                               const std::string& b) {
  const bool use_fma = gpu_info.IsAMD() && gpu_info.IsApiOpenCl();
  if (use_fma) {
    return accum + " = fma(" + a + ", " + b + ", " + accum + ")";
  } else {
    return accum + " += " + a + " * " + b;
  }
}
}  // namespace

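// Fuses a linear chain of nodes (a depthwise convolution, optionally followed
// by ReLU/PReLU activations and 1x1 convolutions) into a single generated GPU
// kernel. Intermediate results stay in local FLT4 variables and all weights
// and biases are packed into one constant buffer.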
class ThinPointwiseFuser {
 public:
  void Init(CalculationsPrecision precision, const TensorDescriptor& src_desc,
            int output_batch, int output_width, int output_height);
  bool Finalize(const GpuInfo& gpu_info, const GraphFloat32& graph,
                const std::map<ValueId, TensorDescriptor>& tensor_descriptors,
                GPUOperationsSubgraph* gpu_subgraph);

  bool ReserveNode(const GpuInfo& gpu_info, Node* node);

  const std::set<NodeId>& GetFusedNodes() const { return fused_nodes_; }

 private:
  bool IsNodeSupported(const GpuInfo& gpu_info, Node* node) const;
  bool IsElementwiseNode(Node* node) const;
  bool IsConvNode(Node* node) const;
  bool IsDwConvNode(Node* node) const;
  void AddNode(const GpuInfo& gpu_info, Node* node);
  void AddElementwiseNode(ElementwiseDescriptor&& op_desc);
  void AddConvNode(const GpuInfo& gpu_info,
                   const Convolution2DAttributes& attr);
  void AddReluNode(const ReLUAttributes& attr);
  void AddPreluNode(const PReLUAttributes& attr);
  void AddDepthwiseConvNode(const GpuInfo& gpu_info,
                            const DepthwiseConvolution2DAttributes& attr);
  void AddConvData(const Convolution2DAttributes& conv_attr);
  void AddDepthwiseConvData(const DepthwiseConvolution2DAttributes& dw_attr);
  void CreateConstantsGpuBuffer(const GpuInfo& gpu_info);
  std::vector<Node*> nodes_;
  OperationDef op_def_;
  Arguments args_;
  std::string code_;
  std::vector<std::string> outputs_;
  std::vector<float> gpu_data_;
  int weights_counter_ = 0;
  int buffer_size_ = 0;
  std::string op_name_;
  int link_counter_ = 0;
  uint64_t flops_ = 0;
  bool last_op_ = false;
  int convs_count_ = 0;
  std::set<NodeId> fused_nodes_;
  BHWC output_shape_;
};

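// Appends the depthwise-convolution constants to gpu_data_: first the bias,
// padded to a multiple of 4 channels, then the filter weights grouped by
// 4-channel slice in (slice, y, x, channel-within-slice) order.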
void ThinPointwiseFuser::AddDepthwiseConvData(
    const DepthwiseConvolution2DAttributes& dw_attr) {
  int dw_dst_ch_aligned = AlignByN(dw_attr.weights.shape.i, 4);
  int dw_weights_count = dw_dst_ch_aligned + dw_dst_ch_aligned *
                                                 dw_attr.weights.shape.h *
                                                 dw_attr.weights.shape.w;
  gpu_data_.reserve(gpu_data_.size() + dw_weights_count);
  // dw bias loading
  for (int i = 0; i < dw_dst_ch_aligned; ++i) {
    if (i < dw_attr.bias.shape.v) {
      gpu_data_.push_back(dw_attr.bias.data[i]);
    } else {
      gpu_data_.push_back(0.0f);
    }
  }
  // dw weights loading
  for (int d = 0; d < dw_dst_ch_aligned / 4; ++d) {
    for (int y = 0; y < dw_attr.weights.shape.h; ++y) {
      for (int x = 0; x < dw_attr.weights.shape.w; ++x) {
        for (int i = 0; i < 4; ++i) {
          const int d_ch = d * 4 + i;
          if (d_ch < dw_attr.weights.shape.i) {
            const int f_index =
                dw_attr.weights.shape.LinearIndex({0, y, x, d_ch});
            gpu_data_.push_back(dw_attr.weights.data[f_index]);
          } else {
            gpu_data_.push_back(0.0f);
          }
        }
      }
    }
  }
}

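// Appends the 1x1-convolution constants to gpu_data_: the bias padded to a
// multiple of 4 output channels, followed by the weights stored as one 4x4
// block per (destination slice, source slice) pair, with zero padding for
// channels beyond the real shape.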
void ThinPointwiseFuser::AddConvData(const Convolution2DAttributes& conv_attr) {
  int conv_src_ch_aligned = AlignByN(conv_attr.weights.shape.i, 4);
  int conv_dst_ch_aligned = AlignByN(conv_attr.weights.shape.o, 4);
  int conv_weights_count =
      conv_dst_ch_aligned + conv_src_ch_aligned * conv_dst_ch_aligned;
  gpu_data_.reserve(gpu_data_.size() + conv_weights_count);
  // conv bias loading
  for (int i = 0; i < conv_dst_ch_aligned; ++i) {
    if (i < conv_attr.bias.shape.v) {
      gpu_data_.push_back(conv_attr.bias.data[i]);
    } else {
      gpu_data_.push_back(0.0f);
    }
  }
  // conv weights loading
  for (int d = 0; d < conv_dst_ch_aligned / 4; ++d) {
    for (int s = 0; s < conv_src_ch_aligned / 4; ++s) {
      for (int j = 0; j < 4; ++j) {
        for (int i = 0; i < 4; ++i) {
          const int s_ch = s * 4 + j;
          const int d_ch = d * 4 + i;
          if (s_ch < conv_attr.weights.shape.i &&
              d_ch < conv_attr.weights.shape.o) {
            const int f_index =
                conv_attr.weights.shape.LinearIndex({d_ch, 0, 0, s_ch});
            gpu_data_.push_back(conv_attr.weights.data[f_index]);
          } else {
            gpu_data_.push_back(0.0f);
          }
        }
      }
    }
  }
}

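// Uploads the packed weights/biases as a single buffer object, stored as
// FLOAT32 or FLOAT16 depending on the precision. Mali and AMD use GLOBAL
// memory for it; other vendors use CONSTANT memory.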
void ThinPointwiseFuser::CreateConstantsGpuBuffer(const GpuInfo& gpu_info) {
  const bool fp32_weights = op_def_.precision == CalculationsPrecision::F32;
  const int float_size = fp32_weights ? 4 : 2;
  BufferDescriptor desc;
  desc.element_type = fp32_weights ? DataType::FLOAT32 : DataType::FLOAT16;
  desc.element_size = 4;
  desc.memory_type = gpu_info.IsMali() || gpu_info.IsAMD()
                         ? MemoryType::GLOBAL
                         : MemoryType::CONSTANT;
  desc.size = float_size * gpu_data_.size();
  desc.data.resize(desc.size);

  if (fp32_weights) {
    memcpy(desc.data.data(), gpu_data_.data(), desc.size);
  } else {
    half* gpu_data_half = reinterpret_cast<half*>(desc.data.data());
    for (int i = 0; i < gpu_data_.size(); ++i) {
      gpu_data_half[i] = gpu_data_[i];
    }
  }
  args_.AddObject("constants",
                  std::make_unique<BufferDescriptor>(std::move(desc)));
}

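// Starts the generated kernel: records precision, source tensor and output
// extents, computes the X/Y (and batch) coordinates of the current thread,
// and emits the early-exit bounds check.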
void ThinPointwiseFuser::Init(CalculationsPrecision precision,
                              const TensorDescriptor& src_desc,
                              int output_batch, int output_width,
                              int output_height) {
  op_def_.precision = precision;
  op_def_.src_tensors.push_back(src_desc);
  weights_counter_ = 0;
  output_shape_.b = output_batch;
  output_shape_.w = output_width;
  output_shape_.h = output_height;

  code_ += "MAIN_FUNCTION($0) {\n";
  if (src_desc.HasAxis(Axis::BATCH)) {
    code_ += " int linear_id = GLOBAL_ID_0;\n";
    code_ += " int X = linear_id / args.dst_tensor.Batch();\n";
    code_ += " int B = linear_id % args.dst_tensor.Batch();\n";
    code_ += " args.dst_tensor.SetBatchRef(B);\n";
    code_ += " args.src_tensor.SetBatchRef(B);\n";
  } else {
    code_ += " int X = GLOBAL_ID_0;\n";
  }
  code_ += " int Y = GLOBAL_ID_1;\n";
  code_ +=
      " if (X >= args.dst_tensor.Width() || Y >= args.dst_tensor.Height()) { "
      "\n";
  code_ += " return; \n";
  code_ += " } \n";
}

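// Decides whether a node may join the fused chain. ReLU/PReLU are accepted
// only after another node has been reserved, a depthwise convolution may only
// start the chain, and a 1x1 convolution is accepted when its geometry is
// trivial (1x1 kernel, unit strides/dilations, no padding) and the per-vendor
// channel and weight-size limits below are met.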
bool ThinPointwiseFuser::IsNodeSupported(const GpuInfo& gpu_info,
                                         Node* node) const {
  if (!node) {
    return false;
  }
  auto op_type = OperationTypeFromString(node->operation.type);
  if (op_type == OperationType::RELU || op_type == OperationType::PRELU) {
    return !nodes_.empty();
  } else if (op_type == OperationType::DEPTHWISE_CONVOLUTION) {
    if (!nodes_.empty()) {
      return false;
    }
    DepthwiseConvolution2DAttributes* dw_attr =
        absl::any_cast<DepthwiseConvolution2DAttributes>(
            &node->operation.attributes);
    const auto dw_shape = dw_attr->weights.shape;
    bool good_dw = dw_shape.o == 1;
    if (!good_dw) {
      return false;
    }
    if (gpu_info.IsApple()) {
      return dw_shape.i <= 16 &&
             dw_shape.i * dw_shape.h * dw_shape.w <= 3 * 3 * 16;
    } else if (gpu_info.IsMali()) {
      if (op_def_.precision == CalculationsPrecision::F16 &&
          op_def_.src_tensors[0].SupportsZeroClamp(Axis::WIDTH, gpu_info) &&
          op_def_.src_tensors[0].SupportsZeroClamp(Axis::HEIGHT, gpu_info)) {
        return dw_shape.i <= 16 &&
               dw_shape.i * dw_shape.h * dw_shape.w <= 3 * 3 * 16;
      } else {
        return false;
      }
    } else {
      if (op_def_.precision == CalculationsPrecision::F16) {
        return dw_shape.i <= 32 &&
               dw_shape.i * dw_shape.h * dw_shape.w <= 3 * 3 * 32;
      } else {
        return dw_shape.i <= 16 &&
               dw_shape.i * dw_shape.h * dw_shape.w <= 3 * 3 * 16;
      }
    }
  } else if (op_type == OperationType::CONVOLUTION_2D) {
    if (nodes_.empty()) {
      return false;
    }
    Convolution2DAttributes* conv_attr =
        absl::any_cast<Convolution2DAttributes>(&node->operation.attributes);
    const auto conv_shape = conv_attr->weights.shape;
    bool good_conv =
        conv_shape.w == 1 && conv_shape.h == 1 && conv_attr->dilations.w == 1 &&
        conv_attr->dilations.h == 1 && conv_attr->strides.w == 1 &&
        conv_attr->strides.h == 1 && conv_attr->padding.prepended.w == 0 &&
        conv_attr->padding.prepended.h == 0 &&
        conv_attr->padding.appended.w == 0 &&
        conv_attr->padding.appended.h == 0;
    if (!good_conv) {
      return false;
    }
    if (gpu_info.IsAdreno() && gpu_info.IsApiOpenCl()) {
      int conv_src_ch_aligned = AlignByN(conv_attr->weights.shape.i, 4);
      int conv_dst_ch_aligned = AlignByN(conv_attr->weights.shape.o, 4);
      int conv_weights_count =
          conv_dst_ch_aligned + conv_src_ch_aligned * conv_dst_ch_aligned;

      DataType data_type = op_def_.precision == CalculationsPrecision::F32
                               ? DataType::FLOAT32
                               : DataType::FLOAT16;
      int weights_size = conv_weights_count * SizeOf(data_type);
      if (convs_count_ >= 3 || buffer_size_ + weights_size > 1024 * 3) {
        return false;
      }
    } else {
      if (convs_count_ >= 1) {
        return false;
      }
    }
    if (gpu_info.IsApple()) {
      if (op_def_.precision == CalculationsPrecision::F16) {
        return conv_shape.o <= 16 && conv_shape.i * conv_shape.o <= 16 * 16;
      } else {
        return conv_shape.o <= 8 && conv_shape.i * conv_shape.o <= 8 * 16;
      }
    } else if (gpu_info.IsMali()) {
      if (op_def_.precision == CalculationsPrecision::F16) {
        return conv_shape.o <= 16 && conv_shape.i * conv_shape.o <= 16 * 16;
      } else {
        return false;
      }
    } else {
      if (op_def_.precision == CalculationsPrecision::F16) {
        return conv_shape.o <= 32 && conv_shape.i * conv_shape.o <= 32 * 32;
      } else {
        return conv_shape.o <= 32 && conv_shape.i * conv_shape.o <= 16 * 32;
      }
    }
  } else {
    return false;
  }
}

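// Appends the node to the pending chain if IsNodeSupported() accepts it, and
// tracks the number of constant-buffer bytes its weights and biases will
// occupy.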
bool ThinPointwiseFuser::ReserveNode(const GpuInfo& gpu_info, Node* node) {
  if (!IsNodeSupported(gpu_info, node)) {
    return false;
  }
  nodes_.push_back(node);
  if (IsConvNode(node)) {
    convs_count_++;
    Convolution2DAttributes* conv_attr =
        absl::any_cast<Convolution2DAttributes>(&node->operation.attributes);

    int conv_src_ch_aligned = AlignByN(conv_attr->weights.shape.i, 4);
    int conv_dst_ch_aligned = AlignByN(conv_attr->weights.shape.o, 4);
    int conv_weights_count =
        conv_dst_ch_aligned + conv_src_ch_aligned * conv_dst_ch_aligned;

    DataType data_type = op_def_.precision == CalculationsPrecision::F32
                             ? DataType::FLOAT32
                             : DataType::FLOAT16;
    buffer_size_ += conv_weights_count * SizeOf(data_type);
  }
  if (IsDwConvNode(node)) {
    DepthwiseConvolution2DAttributes* dw_attr =
        absl::any_cast<DepthwiseConvolution2DAttributes>(
            &node->operation.attributes);

    int dw_dst_ch_aligned = AlignByN(dw_attr->weights.shape.i, 4);
    int dw_weights_count = dw_dst_ch_aligned + dw_dst_ch_aligned *
                                                   dw_attr->weights.shape.h *
                                                   dw_attr->weights.shape.w;
    DataType data_type = op_def_.precision == CalculationsPrecision::F32
                             ? DataType::FLOAT32
                             : DataType::FLOAT16;
    buffer_size_ += dw_weights_count * SizeOf(data_type);
  }
  return true;
}

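// Dispatches code generation for a reserved node by operation type.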
void ThinPointwiseFuser::AddNode(const GpuInfo& gpu_info, Node* node) {
  auto op_type = OperationTypeFromString(node->operation.type);
  if (op_type == OperationType::RELU) {
    ReLUAttributes* attr =
        absl::any_cast<ReLUAttributes>(&node->operation.attributes);
    AddReluNode(*attr);
  } else if (op_type == OperationType::PRELU) {
    PReLUAttributes* attr =
        absl::any_cast<PReLUAttributes>(&node->operation.attributes);
    AddPreluNode(*attr);
  } else if (op_type == OperationType::DEPTHWISE_CONVOLUTION) {
    DepthwiseConvolution2DAttributes* attr =
        absl::any_cast<DepthwiseConvolution2DAttributes>(
            &node->operation.attributes);
    AddDepthwiseConvNode(gpu_info, *attr);
  } else if (op_type == OperationType::CONVOLUTION_2D) {
    Convolution2DAttributes* attr =
        absl::any_cast<Convolution2DAttributes>(&node->operation.attributes);
    AddConvNode(gpu_info, *attr);
  }
}

bool ThinPointwiseFuser::IsElementwiseNode(Node* node) const {
  auto op_type = OperationTypeFromString(node->operation.type);
  return op_type == OperationType::RELU || op_type == OperationType::PRELU;
}

bool ThinPointwiseFuser::IsConvNode(Node* node) const {
  auto op_type = OperationTypeFromString(node->operation.type);
  return op_type == OperationType::CONVOLUTION_2D;
}

bool ThinPointwiseFuser::IsDwConvNode(Node* node) const {
  auto op_type = OperationTypeFromString(node->operation.type);
  return op_type == OperationType::DEPTHWISE_CONVOLUTION;
}

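// Emits the fully unrolled depthwise convolution: for each 4-channel slice
// the accumulator is initialized with the bias from the constant buffer, then
// every filter tap is multiply-accumulated. When the source tensor cannot
// zero-clamp on its own, coordinates are clamped and out-of-bounds reads are
// zeroed via the generated boolean check.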
void ThinPointwiseFuser::AddDepthwiseConvNode(
    const GpuInfo& gpu_info, const DepthwiseConvolution2DAttributes& attr) {
  AddDepthwiseConvData(attr);
  op_name_ += "dw_conv";
  output_shape_.c = attr.weights.shape.i;
  flops_ += GetDepthwiseConvolutionFlops(output_shape_, attr.weights.shape);
  args_.AddInt("stride_x", attr.strides.w);
  args_.AddInt("padding_x", -attr.padding.prepended.w);
  args_.AddInt("dilation_x", attr.dilations.w);
  args_.AddInt("stride_y", attr.strides.h);
  args_.AddInt("padding_y", -attr.padding.prepended.h);
  args_.AddInt("dilation_y", attr.dilations.h);

  const auto& src_desc = op_def_.src_tensors[0];
  int intermediate_depth = DivideRoundUp(attr.weights.shape.i, 4);
  for (int d = 0; d < intermediate_depth; ++d) {
    code_ += " FLT4 dw_res_" + std::to_string(d) + " = args.constants.Read(" +
             std::to_string(weights_counter_++) + ");\n";
  }
  code_ += " int x_offseted = X * args.stride_x + args.padding_x;\n";
  code_ += " int y_offseted = Y * args.stride_y + args.padding_y;\n";
  code_ += " int x_c, y_c;\n";

  auto generate_check = [&]() {
    std::string check;
    const std::vector<Axis> axes{Axis::WIDTH, Axis::HEIGHT, Axis::DEPTH};
    const std::vector<std::string> names{"x_in", "y_in", "z_in"};
    for (int i = 0; i < axes.size(); ++i) {
      const auto& axis = axes[i];
      if (src_desc.HasAxis(axis) &&
          !src_desc.SupportsZeroClamp(axis, gpu_info)) {
        if (!check.empty()) {
          check += " && ";
        }
        check += names[i];
      }
    }
    return check;
  };
  const std::string check = generate_check();
  if (!src_desc.SupportsZeroClamp(Axis::HEIGHT, gpu_info)) {
    code_ += " bool y_in;\n";
  }
  if (!src_desc.SupportsZeroClamp(Axis::WIDTH, gpu_info)) {
    code_ += " bool x_in;\n";
  }

  const std::string postfixes[] = {".x", ".xy", ".xyz", ""};
  code_ += " FLT4 src;\n";
  for (int d = 0; d < intermediate_depth; ++d) {
    outputs_.push_back("dw_res_" + std::to_string(d));
    const int src_ch_count = std::min(4, attr.weights.shape.i - d * 4);
    const std::string s_postfix = postfixes[src_ch_count - 1];
    for (int ky = 0; ky < attr.weights.shape.h; ++ky) {
      code_ += " y_c = y_offseted + " + std::to_string(ky) +
               " * args.dilation_y;\n";
      if (!src_desc.SupportsZeroClamp(Axis::HEIGHT, gpu_info)) {
        code_ += " y_in = y_c >= 0 && y_c < args.src_tensor.Height();\n";
        code_ += " y_c = clamp(y_c, 0, args.src_tensor.Height() - 1);\n";
      }
      for (int kx = 0; kx < attr.weights.shape.w; ++kx) {
        code_ += " x_c = x_offseted + " + std::to_string(kx) +
                 " * args.dilation_x;\n";
        if (!src_desc.SupportsZeroClamp(Axis::WIDTH, gpu_info)) {
          code_ += " x_in = x_c >= 0 && x_c < args.src_tensor.Width();\n";
          code_ += " x_c = clamp(x_c, 0, args.src_tensor.Width() - 1);\n";
        }
        std::string multiplier =
            check.empty() ? "" : " * INIT_FLT(" + check + ")";
        code_ += " src" + s_postfix + " = args.src_tensor.Read(x_c, y_c, " +
                 std::to_string(d) + ")" + s_postfix + multiplier + ";\n";
        code_ += " " +
                 MultiplyAccumulate(
                     gpu_info, "dw_res_" + std::to_string(d) + s_postfix,
                     "src" + s_postfix,
                     "args.constants.Read(" +
                         std::to_string(weights_counter_++) + ")" + s_postfix) +
                 ";\n";
      }
    }
  }
}

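// Inlines an elementwise op: its arguments are renamed with a unique postfix,
// merged into args_, and its code is applied in place to every live output
// register.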
void ThinPointwiseFuser::AddElementwiseNode(ElementwiseDescriptor&& op_desc) {
  std::string unique_postfix = absl::StrCat("_link_internal", link_counter_);
  link_counter_++;
  op_desc.args.RenameArgs(unique_postfix, &op_desc.code);
  auto status = args_.Merge(std::move(op_desc.args), unique_postfix);
  for (int i = 0; i < outputs_.size(); ++i) {
    const std::string elementwise_new_code =
        absl::StrReplaceAll(op_desc.code, {{"in_value", outputs_[i]},
                                           {"out_value", outputs_[i]},
                                           {"X_COORD", "X"},
                                           {"Y_COORD", "Y"},
                                           {"S_COORD", std::to_string(i)},
                                           {"B_COORD", "B"}});
    code_ += " { " + elementwise_new_code + " }\n";
  }
}

void ThinPointwiseFuser::AddReluNode(const ReLUAttributes& attr) {
  ElementwiseDescriptor op_desc = CreateReLU(attr, op_def_.precision);
  AddElementwiseNode(std::move(op_desc));
}

void ThinPointwiseFuser::AddPreluNode(const PReLUAttributes& attr) {
  ElementwiseDescriptor op_desc = CreatePReLU(attr, op_def_.dst_tensors[0]);
  AddElementwiseNode(std::move(op_desc));
}

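// Emits the 1x1 convolution: every destination slice starts from its bias,
// accumulates dot products with all source slices via multiply-accumulates
// against the constant buffer, and, if this is the last fused op, is written
// to the destination tensor.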
void ThinPointwiseFuser::AddConvNode(const GpuInfo& gpu_info,
                                     const Convolution2DAttributes& attr) {
  AddConvData(attr);
  op_name_ += "->conv1x1";
  output_shape_.c = attr.weights.shape.o;
  flops_ += GetConvolutionFlops(output_shape_, attr.weights.shape);
  const int src_slices = DivideRoundUp(attr.weights.shape.i, 4);
  const int dst_slices = DivideRoundUp(attr.weights.shape.o, 4);
  std::vector<std::string> inputs = outputs_;
  outputs_.resize(dst_slices);
  std::string link = "_link_" + std::to_string(link_counter_);
  link_counter_++;
  for (int d = 0; d < dst_slices; ++d) {
    std::string dst = "conv_res_" + std::to_string(d) + link;
    outputs_[d] = dst;
    code_ += " FLT4 " + outputs_[d] + " = args.constants.Read(" +
             std::to_string(weights_counter_++) + ");\n";
  }
  for (int d = 0; d < dst_slices; ++d) {
    std::string dst = outputs_[d];
    for (int s = 0; s < src_slices; ++s) {
      std::string src = inputs[s];
      const std::string c0 =
          "args.constants.Read(" + std::to_string(weights_counter_++) + ")";
      const std::string c1 =
          "args.constants.Read(" + std::to_string(weights_counter_++) + ")";
      const std::string c2 =
          "args.constants.Read(" + std::to_string(weights_counter_++) + ")";
      const std::string c3 =
          "args.constants.Read(" + std::to_string(weights_counter_++) + ")";
      code_ += " " + MultiplyAccumulate(gpu_info, dst, c0, src + ".x") + ";\n";
      code_ += " " + MultiplyAccumulate(gpu_info, dst, c1, src + ".y") + ";\n";
      code_ += " " + MultiplyAccumulate(gpu_info, dst, c2, src + ".z") + ";\n";
      code_ += " " + MultiplyAccumulate(gpu_info, dst, c3, src + ".w") + ";\n";
    }
    if (last_op_) {
      code_ += " if(" + std::to_string(d) + " < args.dst_tensor.Slices()) {\n";
      code_ += " args.dst_tensor.Write(" + dst + ", X, Y, " +
               std::to_string(d) + ");\n";
      code_ += " }\n";
    }
  }
}

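// Builds the fused GPUOperation. Trailing elementwise nodes are dropped, code
// is generated for the remaining chain, a Mali-specific block-size heuristic
// may still reject the fusion, and finally the packed constants and the
// finished kernel are placed into the subgraph as a single operation.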
bool ThinPointwiseFuser::Finalize(
    const GpuInfo& gpu_info, const GraphFloat32& graph,
    const std::map<ValueId, TensorDescriptor>& tensor_descriptors,
    GPUOperationsSubgraph* gpu_subgraph) {
  while (!nodes_.empty() && IsElementwiseNode(nodes_.back())) {
    nodes_.pop_back();
  }
  if (nodes_.empty() || convs_count_ == 0) {
    return false;
  }
  auto first_node_inputs = graph.FindInputs(nodes_.front()->id);
  auto last_node_outputs = graph.FindOutputs(nodes_.back()->id);
  const TensorDescriptor& dst_desc =
      tensor_descriptors.find(last_node_outputs[0]->id)->second;
  op_def_.dst_tensors.push_back(dst_desc);
  for (int i = 0; i < nodes_.size(); ++i) {
    if (i == nodes_.size() - 1) {
      last_op_ = true;
    }
    AddNode(gpu_info, nodes_[i]);
    fused_nodes_.insert(nodes_[i]->id);
  }
  code_ += "}\n";

  if (gpu_info.IsMali()) {
    const BHWC dst_shape = output_shape_;
    const int dst_slices = DivideRoundUp(dst_shape.c, 4);
    int task_size = dst_shape.b * dst_shape.h * dst_shape.w * dst_slices;
    int block_size =
        GetRecommendedBlockSizeForConv(gpu_info, op_def_.precision, task_size);
    if (block_size < 4 && dst_slices >= 2) {
      return false;
    }
    if (block_size < 2 && dst_slices >= 4) {
      return false;
    }
  }

  CreateConstantsGpuBuffer(gpu_info);
  std::unique_ptr<GPUOperation>* gpu_op =
      InitSingleOpSubgraph(first_node_inputs, last_node_outputs, gpu_subgraph);
  GPUOperation operation(op_def_);
  operation.args_ = std::move(args_);
  operation.AddSrcTensor("src_tensor", op_def_.src_tensors[0]);
  operation.AddDstTensor("dst_tensor", op_def_.dst_tensors[0]);
  operation.code_ = code_;
  operation.flops_ = flops_;
  operation.tensor_to_grid_ = TensorToGrid::kWBToX_HDToY_ZIs1;
  if (gpu_info.IsMali()) {
    operation.compiler_options_.push_back(CompilerOptions::kClFastRelaxedMath);
  }
  *gpu_op = std::make_unique<GPUOperation>(std::move(operation));
  gpu_subgraph->operations[0].name = op_name_;
  return true;
}

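// Returns the single consumer of the node's single output, or nullptr if the
// chain branches (multiple inputs, outputs or consumers).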
Node* GetNextLinearNode(const GraphFloat32& graph, NodeId current_node) {
  auto inputs = graph.FindInputs(current_node);
  if (inputs.size() != 1) {
    return nullptr;
  }
  auto outputs = graph.FindOutputs(current_node);
  if (outputs.size() != 1) {
    return nullptr;
  }
  auto consumers = graph.FindConsumers(outputs[0]->id);
  if (consumers.size() != 1) {
    return nullptr;
  }
  return consumers[0];
}

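// Entry point: starting from first_node_id, walks the linear chain of nodes,
// reserves as many as the fuser supports, and either registers the fused
// operation in gpu_subgraph or returns NotFoundError when the pattern does
// not apply on this GPU.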
absl::Status TryDepthwiseConvPlus1x1Conv(
    const GpuInfo& gpu_info, CalculationsPrecision precision,
    const GraphFloat32& graph, NodeId first_node_id,
    const std::map<ValueId, TensorDescriptor>& tensor_descriptors,
    std::set<NodeId>* consumed_nodes, GPUOperationsSubgraph* gpu_subgraph) {
  if (!(gpu_info.IsAdreno() || gpu_info.IsNvidia() || gpu_info.IsMali() ||
        gpu_info.IsApple() || gpu_info.IsAMD())) {
    return absl::NotFoundError("DepthwiseConvPlus1x1Conv not suitable.");
  }
  if (gpu_info.IsMali() && gpu_info.mali_info.IsMidgard()) {
    return absl::NotFoundError("DepthwiseConvPlus1x1Conv not suitable.");
  }
  auto* node = graph.GetNode(first_node_id);
  if (node == nullptr ||
      consumed_nodes->find(node->id) != consumed_nodes->end()) {
    return absl::NotFoundError("DepthwiseConvPlus1x1Conv not suitable.");
  }
  auto dw_inputs = graph.FindInputs(node->id);
  auto dw_outputs = graph.FindOutputs(node->id);

  const TensorDescriptor& src_desc =
      tensor_descriptors.find(dw_inputs[0]->id)->second;
  ThinPointwiseFuser fuser;
  auto dw_shape = dw_outputs[0]->tensor.shape;
  fuser.Init(precision, src_desc, dw_shape.b, dw_shape.w, dw_shape.h);
  while (fuser.ReserveNode(gpu_info, node)) {
    node = GetNextLinearNode(graph, node->id);
    if (node == nullptr ||
        consumed_nodes->find(node->id) != consumed_nodes->end()) {
      break;
    }
  }

  if (!fuser.Finalize(gpu_info, graph, tensor_descriptors, gpu_subgraph)) {
    return absl::NotFoundError("DepthwiseConvPlus1x1Conv not suitable.");
  }
  consumed_nodes->insert(fuser.GetFusedNodes().begin(),
                         fuser.GetFusedNodes().end());
  return absl::OkStatus();
}

}  // namespace gpu
}  // namespace tflite