/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#include "tensorflow/lite/delegates/gpu/common/tasks/special/depthwise_conv_plus_1x1_conv.h"

#include <algorithm>
#include <cstdint>
#include <cstring>
#include <map>
#include <memory>
#include <set>
#include <string>
#include <utility>
#include <vector>

#include "absl/strings/str_cat.h"
#include "absl/strings/str_replace.h"
#include "tensorflow/lite/delegates/gpu/common/flops_util.h"
#include "tensorflow/lite/delegates/gpu/common/task/util.h"
#include "tensorflow/lite/delegates/gpu/common/tasks/prelu.h"
#include "tensorflow/lite/delegates/gpu/common/tasks/relu.h"
#include "tensorflow/lite/delegates/gpu/common/util.h"

namespace tflite {
namespace gpu {
namespace {
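// Returns shader code for "accum += a * b", emitted as an explicit fma() call
// on AMD OpenCL and as a plain multiply-add everywhere else.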
std::string MultiplyAccumulate(const GpuInfo& gpu_info,
                               const std::string& accum, const std::string& a,
                               const std::string& b) {
  const bool use_fma = gpu_info.IsAMD() && gpu_info.IsApiOpenCl();
  if (use_fma) {
    return accum + " = fma(" + a + ", " + b + ", " + accum + ")";
  } else {
    return accum + " += " + a + " * " + b;
  }
}
}  // namespace

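// Fuses a linear chain of the form DepthwiseConv -> (ReLU/PReLU) -> 1x1 Conv
// -> (ReLU/PReLU) -> ... into a single GPUOperation. All weights and biases
// are packed into one constant buffer and intermediate results are kept in
// local FLT4 variables, so no intermediate tensors are written to memory.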
class ThinPointwiseFuser {
 public:
  void Init(CalculationsPrecision precision, const TensorDescriptor& src_desc,
            int output_batch, int output_width, int output_height);
  bool Finalize(const GpuInfo& gpu_info, const GraphFloat32& graph,
                const std::map<ValueId, TensorDescriptor>& tensor_descriptors,
                GPUOperationsSubgraph* gpu_subgraph);

  bool ReserveNode(const GpuInfo& gpu_info, Node* node);

  const std::set<NodeId>& GetFusedNodes() const { return fused_nodes_; }

 private:
  bool IsNodeSupported(const GpuInfo& gpu_info, Node* node) const;
  bool IsElementwiseNode(Node* node) const;
  bool IsConvNode(Node* node) const;
  bool IsDwConvNode(Node* node) const;
  void AddNode(const GpuInfo& gpu_info, Node* node);
  void AddElementwiseNode(ElementwiseDescriptor&& op_desc);
  void AddConvNode(const GpuInfo& gpu_info,
                   const Convolution2DAttributes& attr);
  void AddReluNode(const ReLUAttributes& attr);
  void AddPreluNode(const PReLUAttributes& attr);
  void AddDepthwiseConvNode(const GpuInfo& gpu_info,
                            const DepthwiseConvolution2DAttributes& attr);
  void AddConvData(const Convolution2DAttributes& conv_attr);
  void AddDepthwiseConvData(const DepthwiseConvolution2DAttributes& dw_attr);
  void CreateConstantsGpuBuffer(const GpuInfo& gpu_info);
  std::vector<Node*> nodes_;
  OperationDef op_def_;
  Arguments args_;
  std::string code_;
  std::vector<std::string> outputs_;
  std::vector<float> gpu_data_;
  int weights_counter_ = 0;
  int buffer_size_ = 0;
  std::string op_name_;
  int link_counter_ = 0;
  uint64_t flops_ = 0;
  bool last_op_ = false;
  int convs_count_ = 0;
  std::set<NodeId> fused_nodes_;
  BHWC output_shape_;
};

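// Appends the depthwise biases followed by the depthwise weights to gpu_data_.
// Channels are padded to a multiple of 4, and weights are stored in
// (slice, y, x, channel-within-slice) order to match the generated reads.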
void ThinPointwiseFuser::AddDepthwiseConvData(
    const DepthwiseConvolution2DAttributes& dw_attr) {
  int dw_dst_ch_aligned = AlignByN(dw_attr.weights.shape.i, 4);
  int dw_weights_count = dw_dst_ch_aligned + dw_dst_ch_aligned *
                                                 dw_attr.weights.shape.h *
                                                 dw_attr.weights.shape.w;
  gpu_data_.reserve(gpu_data_.size() + dw_weights_count);
  // dw bias loading
  for (int i = 0; i < dw_dst_ch_aligned; ++i) {
    if (i < dw_attr.bias.shape.v) {
      gpu_data_.push_back(dw_attr.bias.data[i]);
    } else {
      gpu_data_.push_back(0.0f);
    }
  }
  // dw weights loading
  for (int d = 0; d < dw_dst_ch_aligned / 4; ++d) {
    for (int y = 0; y < dw_attr.weights.shape.h; ++y) {
      for (int x = 0; x < dw_attr.weights.shape.w; ++x) {
        for (int i = 0; i < 4; ++i) {
          const int d_ch = d * 4 + i;
          if (d_ch < dw_attr.weights.shape.i) {
            const int f_index =
                dw_attr.weights.shape.LinearIndex({0, y, x, d_ch});
            gpu_data_.push_back(dw_attr.weights.data[f_index]);
          } else {
            gpu_data_.push_back(0.0f);
          }
        }
      }
    }
  }
}

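// Appends the 1x1 convolution biases and weights to gpu_data_. Source and
// destination channels are padded to a multiple of 4, and weights are laid
// out as 4x4 blocks so the kernel reads one FLT4 per input channel.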
void ThinPointwiseFuser::AddConvData(const Convolution2DAttributes& conv_attr) {
  int conv_src_ch_aligned = AlignByN(conv_attr.weights.shape.i, 4);
  int conv_dst_ch_aligned = AlignByN(conv_attr.weights.shape.o, 4);
  int conv_weights_count =
      conv_dst_ch_aligned + conv_src_ch_aligned * conv_dst_ch_aligned;
  gpu_data_.reserve(gpu_data_.size() + conv_weights_count);
  // conv bias loading
  for (int i = 0; i < conv_dst_ch_aligned; ++i) {
    if (i < conv_attr.bias.shape.v) {
      gpu_data_.push_back(conv_attr.bias.data[i]);
    } else {
      gpu_data_.push_back(0.0f);
    }
  }
  // conv weights loading
  for (int d = 0; d < conv_dst_ch_aligned / 4; ++d) {
    for (int s = 0; s < conv_src_ch_aligned / 4; ++s) {
      for (int j = 0; j < 4; ++j) {
        for (int i = 0; i < 4; ++i) {
          const int s_ch = s * 4 + j;
          const int d_ch = d * 4 + i;
          if (s_ch < conv_attr.weights.shape.i &&
              d_ch < conv_attr.weights.shape.o) {
            const int f_index =
                conv_attr.weights.shape.LinearIndex({d_ch, 0, 0, s_ch});
            gpu_data_.push_back(conv_attr.weights.data[f_index]);
          } else {
            gpu_data_.push_back(0.0f);
          }
        }
      }
    }
  }
}

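// Uploads the accumulated weights and biases as a single buffer object named
// "constants". FP16 storage is used unless the precision is F32. Mali and AMD
// read it from global memory, other vendors from constant memory.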
void ThinPointwiseFuser::CreateConstantsGpuBuffer(const GpuInfo& gpu_info) {
  const bool fp32_weights = op_def_.precision == CalculationsPrecision::F32;
  const int float_size = fp32_weights ? 4 : 2;
  BufferDescriptor desc;
  desc.element_type = fp32_weights ? DataType::FLOAT32 : DataType::FLOAT16;
  desc.element_size = 4;
  desc.memory_type = gpu_info.IsMali() || gpu_info.IsAMD()
                         ? MemoryType::GLOBAL
                         : MemoryType::CONSTANT;
  desc.size = float_size * gpu_data_.size();
  desc.data.resize(desc.size);

  if (fp32_weights) {
    memcpy(desc.data.data(), gpu_data_.data(), desc.size);
  } else {
    half* gpu_data_half = reinterpret_cast<half*>(desc.data.data());
    for (int i = 0; i < gpu_data_.size(); ++i) {
      gpu_data_half[i] = gpu_data_[i];
    }
  }
  args_.AddObject("constants",
                  std::make_unique<BufferDescriptor>(std::move(desc)));
}

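// Starts the kernel body: derives the X/Y (and B, when the source tensor has
// a batch axis) coordinates from the global ids and emits the early-exit
// bounds check against dst_tensor.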
void ThinPointwiseFuser::Init(CalculationsPrecision precision,
                              const TensorDescriptor& src_desc,
                              int output_batch, int output_width,
                              int output_height) {
  op_def_.precision = precision;
  op_def_.src_tensors.push_back(src_desc);
  weights_counter_ = 0;
  output_shape_.b = output_batch;
  output_shape_.w = output_width;
  output_shape_.h = output_height;

  code_ += "MAIN_FUNCTION($0) {\n";
  if (src_desc.HasAxis(Axis::BATCH)) {
    code_ += "  int linear_id = GLOBAL_ID_0;\n";
    code_ += "  int X = linear_id / args.dst_tensor.Batch();\n";
    code_ += "  int B = linear_id % args.dst_tensor.Batch();\n";
    code_ += "  args.dst_tensor.SetBatchRef(B);\n";
    code_ += "  args.src_tensor.SetBatchRef(B);\n";
  } else {
    code_ += "  int X = GLOBAL_ID_0;\n";
  }
  code_ += "  int Y = GLOBAL_ID_1;\n";
  code_ +=
      "  if (X >= args.dst_tensor.Width() || Y >= args.dst_tensor.Height()) { "
      "\n";
  code_ += "    return; \n";
  code_ += "  } \n";
}

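// Decides whether `node` can join the fusion given what has been reserved so
// far: the chain must start with a 1-multiplier depthwise conv, and later
// nodes may be ReLU/PReLU or unit-stride, unpadded 1x1 convs. The channel and
// weight-size limits are per-vendor, per-precision heuristics.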
bool ThinPointwiseFuser::IsNodeSupported(const GpuInfo& gpu_info,
                                         Node* node) const {
  if (!node) {
    return false;
  }
  auto op_type = OperationTypeFromString(node->operation.type);
  if (op_type == OperationType::RELU || op_type == OperationType::PRELU) {
    return !nodes_.empty();
  } else if (op_type == OperationType::DEPTHWISE_CONVOLUTION) {
    if (!nodes_.empty()) {
      return false;
    }
    DepthwiseConvolution2DAttributes* dw_attr =
        absl::any_cast<DepthwiseConvolution2DAttributes>(
            &node->operation.attributes);
    const auto dw_shape = dw_attr->weights.shape;
    bool good_dw = dw_shape.o == 1;
    if (!good_dw) {
      return false;
    }
    if (gpu_info.IsApple()) {
      return dw_shape.i <= 16 &&
             dw_shape.i * dw_shape.h * dw_shape.w <= 3 * 3 * 16;
    } else if (gpu_info.IsMali()) {
      if (op_def_.precision == CalculationsPrecision::F16 &&
          op_def_.src_tensors[0].SupportsZeroClamp(Axis::WIDTH, gpu_info) &&
          op_def_.src_tensors[0].SupportsZeroClamp(Axis::HEIGHT, gpu_info)) {
        return dw_shape.i <= 16 &&
               dw_shape.i * dw_shape.h * dw_shape.w <= 3 * 3 * 16;
      } else {
        return false;
      }
    } else {
      if (op_def_.precision == CalculationsPrecision::F16) {
        return dw_shape.i <= 32 &&
               dw_shape.i * dw_shape.h * dw_shape.w <= 3 * 3 * 32;
      } else {
        return dw_shape.i <= 16 &&
               dw_shape.i * dw_shape.h * dw_shape.w <= 3 * 3 * 16;
      }
    }
  } else if (op_type == OperationType::CONVOLUTION_2D) {
    if (nodes_.empty()) {
      return false;
    }
    Convolution2DAttributes* conv_attr =
        absl::any_cast<Convolution2DAttributes>(&node->operation.attributes);
    const auto conv_shape = conv_attr->weights.shape;
    bool good_conv =
        conv_shape.w == 1 && conv_shape.h == 1 && conv_attr->dilations.w == 1 &&
        conv_attr->dilations.h == 1 && conv_attr->strides.w == 1 &&
        conv_attr->strides.h == 1 && conv_attr->padding.prepended.w == 0 &&
        conv_attr->padding.prepended.h == 0 &&
        conv_attr->padding.appended.w == 0 &&
        conv_attr->padding.appended.h == 0;
    if (!good_conv) {
      return false;
    }
    if (gpu_info.IsAdreno() && gpu_info.IsApiOpenCl()) {
      int conv_src_ch_aligned = AlignByN(conv_attr->weights.shape.i, 4);
      int conv_dst_ch_aligned = AlignByN(conv_attr->weights.shape.o, 4);
      int conv_weights_count =
          conv_dst_ch_aligned + conv_src_ch_aligned * conv_dst_ch_aligned;

      DataType data_type = op_def_.precision == CalculationsPrecision::F32
                               ? DataType::FLOAT32
                               : DataType::FLOAT16;
      int weights_size = conv_weights_count * SizeOf(data_type);
      if (convs_count_ >= 3 || buffer_size_ + weights_size > 1024 * 3) {
        return false;
      }
    } else {
      if (convs_count_ >= 1) {
        return false;
      }
    }
    if (gpu_info.IsApple()) {
      if (op_def_.precision == CalculationsPrecision::F16) {
        return conv_shape.o <= 16 && conv_shape.i * conv_shape.o <= 16 * 16;
      } else {
        return conv_shape.o <= 8 && conv_shape.i * conv_shape.o <= 8 * 16;
      }
    } else if (gpu_info.IsMali()) {
      if (op_def_.precision == CalculationsPrecision::F16) {
        return conv_shape.o <= 16 && conv_shape.i * conv_shape.o <= 16 * 16;
      } else {
        return false;
      }
    } else {
      if (op_def_.precision == CalculationsPrecision::F16) {
        return conv_shape.o <= 32 && conv_shape.i * conv_shape.o <= 32 * 32;
      } else {
        return conv_shape.o <= 32 && conv_shape.i * conv_shape.o <= 16 * 32;
      }
    }
  } else {
    return false;
  }
}

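// Accepts `node` into the pending fusion if IsNodeSupported() allows it, and
// tracks the extra constant-buffer bytes its weights and biases will need.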
bool ThinPointwiseFuser::ReserveNode(const GpuInfo& gpu_info, Node* node) {
  if (!IsNodeSupported(gpu_info, node)) {
    return false;
  }
  nodes_.push_back(node);
  if (IsConvNode(node)) {
    convs_count_++;
    Convolution2DAttributes* conv_attr =
        absl::any_cast<Convolution2DAttributes>(&node->operation.attributes);

    int conv_src_ch_aligned = AlignByN(conv_attr->weights.shape.i, 4);
    int conv_dst_ch_aligned = AlignByN(conv_attr->weights.shape.o, 4);
    int conv_weights_count =
        conv_dst_ch_aligned + conv_src_ch_aligned * conv_dst_ch_aligned;

    DataType data_type = op_def_.precision == CalculationsPrecision::F32
                             ? DataType::FLOAT32
                             : DataType::FLOAT16;
    buffer_size_ += conv_weights_count * SizeOf(data_type);
  }
  if (IsDwConvNode(node)) {
    DepthwiseConvolution2DAttributes* dw_attr =
        absl::any_cast<DepthwiseConvolution2DAttributes>(
            &node->operation.attributes);

    int dw_dst_ch_aligned = AlignByN(dw_attr->weights.shape.i, 4);
    int dw_weights_count = dw_dst_ch_aligned + dw_dst_ch_aligned *
                                                   dw_attr->weights.shape.h *
                                                   dw_attr->weights.shape.w;
    DataType data_type = op_def_.precision == CalculationsPrecision::F32
                             ? DataType::FLOAT32
                             : DataType::FLOAT16;
    buffer_size_ += dw_weights_count * SizeOf(data_type);
  }
  return true;
}

void ThinPointwiseFuser::AddNode(const GpuInfo& gpu_info, Node* node) {
  auto op_type = OperationTypeFromString(node->operation.type);
  if (op_type == OperationType::RELU) {
    ReLUAttributes* attr =
        absl::any_cast<ReLUAttributes>(&node->operation.attributes);
    AddReluNode(*attr);
  } else if (op_type == OperationType::PRELU) {
    PReLUAttributes* attr =
        absl::any_cast<PReLUAttributes>(&node->operation.attributes);
    AddPreluNode(*attr);
  } else if (op_type == OperationType::DEPTHWISE_CONVOLUTION) {
    DepthwiseConvolution2DAttributes* attr =
        absl::any_cast<DepthwiseConvolution2DAttributes>(
            &node->operation.attributes);
    AddDepthwiseConvNode(gpu_info, *attr);
  } else if (op_type == OperationType::CONVOLUTION_2D) {
    Convolution2DAttributes* attr =
        absl::any_cast<Convolution2DAttributes>(&node->operation.attributes);
    AddConvNode(gpu_info, *attr);
  }
}

bool ThinPointwiseFuser::IsElementwiseNode(Node* node) const {
  auto op_type = OperationTypeFromString(node->operation.type);
  return op_type == OperationType::RELU || op_type == OperationType::PRELU;
}

bool ThinPointwiseFuser::IsConvNode(Node* node) const {
  auto op_type = OperationTypeFromString(node->operation.type);
  return op_type == OperationType::CONVOLUTION_2D;
}

bool ThinPointwiseFuser::IsDwConvNode(Node* node) const {
  auto op_type = OperationTypeFromString(node->operation.type);
  return op_type == OperationType::DEPTHWISE_CONVOLUTION;
}

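// Emits the depthwise convolution part of the kernel: per-slice accumulators
// dw_res_<d> are initialized from the biases in the constants buffer, then
// every filter tap multiply-accumulates a read from src_tensor, clamping the
// coordinates and zeroing out-of-bounds taps when the tensor type requires it.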
void ThinPointwiseFuser::AddDepthwiseConvNode(
    const GpuInfo& gpu_info, const DepthwiseConvolution2DAttributes& attr) {
  AddDepthwiseConvData(attr);
  op_name_ += "dw_conv";
  output_shape_.c = attr.weights.shape.i;
  flops_ += GetDepthwiseConvolutionFlops(output_shape_, attr.weights.shape);
  args_.AddInt("stride_x", attr.strides.w);
  args_.AddInt("padding_x", -attr.padding.prepended.w);
  args_.AddInt("dilation_x", attr.dilations.w);
  args_.AddInt("stride_y", attr.strides.h);
  args_.AddInt("padding_y", -attr.padding.prepended.h);
  args_.AddInt("dilation_y", attr.dilations.h);

  const auto& src_desc = op_def_.src_tensors[0];
  int intermediate_depth = DivideRoundUp(attr.weights.shape.i, 4);
  for (int d = 0; d < intermediate_depth; ++d) {
    code_ += "  FLT4 dw_res_" + std::to_string(d) + " = args.constants.Read(" +
             std::to_string(weights_counter_++) + ");\n";
  }
  code_ += "  int x_offseted = X * args.stride_x + args.padding_x;\n";
  code_ += "  int y_offseted = Y * args.stride_y + args.padding_y;\n";
  code_ += "  int x_c, y_c;\n";

  auto generate_check = [&]() {
    std::string check;
    const std::vector<Axis> axes{Axis::WIDTH, Axis::HEIGHT, Axis::DEPTH};
    const std::vector<std::string> names{"x_in", "y_in", "z_in"};
    for (int i = 0; i < axes.size(); ++i) {
      const auto& axis = axes[i];
      if (src_desc.HasAxis(axis) &&
          !src_desc.SupportsZeroClamp(axis, gpu_info)) {
        if (!check.empty()) {
          check += " && ";
        }
        check += names[i];
      }
    }
    return check;
  };
  const std::string check = generate_check();
  if (!src_desc.SupportsZeroClamp(Axis::HEIGHT, gpu_info)) {
    code_ += "  bool y_in;\n";
  }
  if (!src_desc.SupportsZeroClamp(Axis::WIDTH, gpu_info)) {
    code_ += "  bool x_in;\n";
  }

  const std::string postfixes[] = {".x", ".xy", ".xyz", ""};
  code_ += "  FLT4 src;\n";
  for (int d = 0; d < intermediate_depth; ++d) {
    outputs_.push_back("dw_res_" + std::to_string(d));
    const int src_ch_count = std::min(4, attr.weights.shape.i - d * 4);
    const std::string s_postfix = postfixes[src_ch_count - 1];
    for (int ky = 0; ky < attr.weights.shape.h; ++ky) {
      code_ += "  y_c = y_offseted + " + std::to_string(ky) +
               " * args.dilation_y;\n";
      if (!src_desc.SupportsZeroClamp(Axis::HEIGHT, gpu_info)) {
        code_ += "  y_in = y_c >= 0 && y_c < args.src_tensor.Height();\n";
        code_ += "  y_c = clamp(y_c, 0, args.src_tensor.Height() - 1);\n";
      }
      for (int kx = 0; kx < attr.weights.shape.w; ++kx) {
        code_ += "  x_c = x_offseted + " + std::to_string(kx) +
                 " * args.dilation_x;\n";
        if (!src_desc.SupportsZeroClamp(Axis::WIDTH, gpu_info)) {
          code_ += "  x_in = x_c >= 0 && x_c < args.src_tensor.Width();\n";
          code_ += "  x_c = clamp(x_c, 0, args.src_tensor.Width() - 1);\n";
        }
        std::string multiplier =
            check.empty() ? "" : " * INIT_FLT(" + check + ")";
        code_ += "  src" + s_postfix + " = args.src_tensor.Read(x_c, y_c, " +
                 std::to_string(d) + ")" + s_postfix + multiplier + ";\n";
        code_ += "  " +
                 MultiplyAccumulate(
                     gpu_info, "dw_res_" + std::to_string(d) + s_postfix,
                     "src" + s_postfix,
                     "args.constants.Read(" +
                         std::to_string(weights_counter_++) + ")" + s_postfix) +
                 ";\n";
      }
    }
  }
}

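// Applies an elementwise op (ReLU/PReLU) in place to every live intermediate
// value in outputs_, after merging the op's arguments under a unique postfix.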
void ThinPointwiseFuser::AddElementwiseNode(ElementwiseDescriptor&& op_desc) {
  std::string unique_postfix = absl::StrCat("_link_internal", link_counter_);
  link_counter_++;
  op_desc.args.RenameArgs(unique_postfix, &op_desc.code);
  auto status = args_.Merge(std::move(op_desc.args), unique_postfix);
  for (int i = 0; i < outputs_.size(); ++i) {
    const std::string elementwise_new_code =
        absl::StrReplaceAll(op_desc.code, {{"in_value", outputs_[i]},
                                           {"out_value", outputs_[i]},
                                           {"X_COORD", "X"},
                                           {"Y_COORD", "Y"},
                                           {"S_COORD", std::to_string(i)},
                                           {"B_COORD", "B"}});
    code_ += "  {  " + elementwise_new_code + "  }\n";
  }
}

void ThinPointwiseFuser::AddReluNode(const ReLUAttributes& attr) {
  ElementwiseDescriptor op_desc = CreateReLU(attr, op_def_.precision);
  AddElementwiseNode(std::move(op_desc));
}

void ThinPointwiseFuser::AddPreluNode(const PReLUAttributes& attr) {
  ElementwiseDescriptor op_desc = CreatePReLU(attr, op_def_.dst_tensors[0]);
  AddElementwiseNode(std::move(op_desc));
}

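// Emits a fused 1x1 convolution over the current intermediate values. Each
// destination slice starts from its bias and accumulates 4x4 weight blocks;
// the last operation in the chain also writes its results to dst_tensor.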
void ThinPointwiseFuser::AddConvNode(const GpuInfo& gpu_info,
                                     const Convolution2DAttributes& attr) {
  AddConvData(attr);
  op_name_ += "->conv1x1";
  output_shape_.c = attr.weights.shape.o;
  flops_ += GetConvolutionFlops(output_shape_, attr.weights.shape);
  const int src_slices = DivideRoundUp(attr.weights.shape.i, 4);
  const int dst_slices = DivideRoundUp(attr.weights.shape.o, 4);
  std::vector<std::string> inputs = outputs_;
  outputs_.resize(dst_slices);
  std::string link = "_link_" + std::to_string(link_counter_);
  link_counter_++;
  for (int d = 0; d < dst_slices; ++d) {
    std::string dst = "conv_res_" + std::to_string(d) + link;
    outputs_[d] = dst;
    code_ += "  FLT4 " + outputs_[d] + " = args.constants.Read(" +
             std::to_string(weights_counter_++) + ");\n";
  }
  for (int d = 0; d < dst_slices; ++d) {
    std::string dst = outputs_[d];
    for (int s = 0; s < src_slices; ++s) {
      std::string src = inputs[s];
      const std::string c0 =
          "args.constants.Read(" + std::to_string(weights_counter_++) + ")";
      const std::string c1 =
          "args.constants.Read(" + std::to_string(weights_counter_++) + ")";
      const std::string c2 =
          "args.constants.Read(" + std::to_string(weights_counter_++) + ")";
      const std::string c3 =
          "args.constants.Read(" + std::to_string(weights_counter_++) + ")";
      code_ += "  " + MultiplyAccumulate(gpu_info, dst, c0, src + ".x") + ";\n";
      code_ += "  " + MultiplyAccumulate(gpu_info, dst, c1, src + ".y") + ";\n";
      code_ += "  " + MultiplyAccumulate(gpu_info, dst, c2, src + ".z") + ";\n";
      code_ += "  " + MultiplyAccumulate(gpu_info, dst, c3, src + ".w") + ";\n";
    }
    if (last_op_) {
      code_ += "  if(" + std::to_string(d) + " < args.dst_tensor.Slices()) {\n";
      code_ += "    args.dst_tensor.Write(" + dst + ", X, Y, " +
               std::to_string(d) + ");\n";
      code_ += "  }\n";
    }
  }
}

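// Drops trailing elementwise nodes, generates the fused kernel code for the
// remaining chain, uploads the constants buffer, and wraps everything into a
// single GPUOperation inside gpu_subgraph. Returns false if nothing useful
// remains to fuse or the Mali block-size heuristic rejects the shape.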
bool ThinPointwiseFuser::Finalize(
    const GpuInfo& gpu_info, const GraphFloat32& graph,
    const std::map<ValueId, TensorDescriptor>& tensor_descriptors,
    GPUOperationsSubgraph* gpu_subgraph) {
  while (!nodes_.empty() && IsElementwiseNode(nodes_.back())) {
    nodes_.pop_back();
  }
  if (nodes_.empty() || convs_count_ == 0) {
    return false;
  }
  auto first_node_inputs = graph.FindInputs(nodes_.front()->id);
  auto last_node_outputs = graph.FindOutputs(nodes_.back()->id);
  const TensorDescriptor& dst_desc =
      tensor_descriptors.find(last_node_outputs[0]->id)->second;
  op_def_.dst_tensors.push_back(dst_desc);
  for (int i = 0; i < nodes_.size(); ++i) {
    if (i == nodes_.size() - 1) {
      last_op_ = true;
    }
    AddNode(gpu_info, nodes_[i]);
    fused_nodes_.insert(nodes_[i]->id);
  }
  code_ += "}\n";

  if (gpu_info.IsMali()) {
    const BHWC dst_shape = output_shape_;
    const int dst_slices = DivideRoundUp(dst_shape.c, 4);
    int task_size = dst_shape.b * dst_shape.h * dst_shape.w * dst_slices;
    int block_size =
        GetRecommendedBlockSizeForConv(gpu_info, op_def_.precision, task_size);
    if (block_size < 4 && dst_slices >= 2) {
      return false;
    }
    if (block_size < 2 && dst_slices >= 4) {
      return false;
    }
  }

  CreateConstantsGpuBuffer(gpu_info);
  std::unique_ptr<GPUOperation>* gpu_op =
      InitSingleOpSubgraph(first_node_inputs, last_node_outputs, gpu_subgraph);
  GPUOperation operation(op_def_);
  operation.args_ = std::move(args_);
  operation.AddSrcTensor("src_tensor", op_def_.src_tensors[0]);
  operation.AddDstTensor("dst_tensor", op_def_.dst_tensors[0]);
  operation.code_ = code_;
  operation.flops_ = flops_;
  operation.tensor_to_grid_ = TensorToGrid::kWBToX_HDToY_ZIs1;
  if (gpu_info.IsMali()) {
    operation.compiler_options_.push_back(CompilerOptions::kClFastRelaxedMath);
  }
  *gpu_op = std::make_unique<GPUOperation>(std::move(operation));
  gpu_subgraph->operations[0].name = op_name_;
  return true;
}

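// Returns the single consumer of `current_node`'s single output, or nullptr
// if the node is not part of a simple linear chain.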
Node* GetNextLinearNode(const GraphFloat32& graph, NodeId current_node) {
  auto inputs = graph.FindInputs(current_node);
  if (inputs.size() != 1) {
    return nullptr;
  }
  auto outputs = graph.FindOutputs(current_node);
  if (outputs.size() != 1) {
    return nullptr;
  }
  auto consumers = graph.FindConsumers(outputs[0]->id);
  if (consumers.size() != 1) {
    return nullptr;
  }
  return consumers[0];
}

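// Tries to fuse the linear chain starting at `first_node_id` (a depthwise
// conv followed by 1x1 convs and ReLU/PReLU) into one GPU operation. On
// success the fused nodes are added to `consumed_nodes`; otherwise a
// NotFoundError is returned and the graph is left untouched.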
absl::Status TryDepthwiseConvPlus1x1Conv(
    const GpuInfo& gpu_info, CalculationsPrecision precision,
    const GraphFloat32& graph, NodeId first_node_id,
    const std::map<ValueId, TensorDescriptor>& tensor_descriptors,
    std::set<NodeId>* consumed_nodes, GPUOperationsSubgraph* gpu_subgraph) {
  if (!(gpu_info.IsAdreno() || gpu_info.IsNvidia() || gpu_info.IsMali() ||
        gpu_info.IsApple() || gpu_info.IsAMD())) {
    return absl::NotFoundError("DepthwiseConvPlus1x1Conv not suitable.");
  }
  if (gpu_info.IsMali() && gpu_info.mali_info.IsMidgard()) {
    return absl::NotFoundError("DepthwiseConvPlus1x1Conv not suitable.");
  }
  auto* node = graph.GetNode(first_node_id);
  if (node == nullptr ||
      consumed_nodes->find(node->id) != consumed_nodes->end()) {
    return absl::NotFoundError("DepthwiseConvPlus1x1Conv not suitable.");
  }
  auto dw_inputs = graph.FindInputs(node->id);
  auto dw_outputs = graph.FindOutputs(node->id);

  const TensorDescriptor& src_desc =
      tensor_descriptors.find(dw_inputs[0]->id)->second;
  ThinPointwiseFuser fuser;
  auto dw_shape = dw_outputs[0]->tensor.shape;
  fuser.Init(precision, src_desc, dw_shape.b, dw_shape.w, dw_shape.h);
  while (fuser.ReserveNode(gpu_info, node)) {
    node = GetNextLinearNode(graph, node->id);
    if (node == nullptr ||
        consumed_nodes->find(node->id) != consumed_nodes->end()) {
      break;
    }
  }

  if (!fuser.Finalize(gpu_info, graph, tensor_descriptors, gpu_subgraph)) {
    return absl::NotFoundError("DepthwiseConvPlus1x1Conv not suitable.");
  }
  consumed_nodes->insert(fuser.GetFusedNodes().begin(),
                         fuser.GetFusedNodes().end());
  return absl::OkStatus();
}

}  // namespace gpu
}  // namespace tflite