/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

16 #include "tensorflow/lite/delegates/gpu/common/selectors/operation_selector.h"
17 
18 #include <algorithm>
19 #include <memory>
20 #include <string>
21 #include <utility>
22 #include <vector>
23 
24 #include "absl/strings/str_cat.h"
25 #include "absl/types/any.h"
26 #include "tensorflow/lite/delegates/gpu/common/data_type.h"
27 #include "tensorflow/lite/delegates/gpu/common/flops_util.h"
28 #include "tensorflow/lite/delegates/gpu/common/gpu_info.h"
29 #include "tensorflow/lite/delegates/gpu/common/operations.h"
30 #include "tensorflow/lite/delegates/gpu/common/selectors/convolution_selector.h"
31 #include "tensorflow/lite/delegates/gpu/common/selectors/convolution_transposed_selector.h"
32 #include "tensorflow/lite/delegates/gpu/common/selectors/default_selector.h"
33 #include "tensorflow/lite/delegates/gpu/common/selectors/dw_convolution_selector.h"
34 #include "tensorflow/lite/delegates/gpu/common/selectors/fully_connected_selector.h"
35 #include "tensorflow/lite/delegates/gpu/common/selectors/simple_selectors.h"
36 #include "tensorflow/lite/delegates/gpu/common/shape.h"
37 #include "tensorflow/lite/delegates/gpu/common/status.h"
38 #include "tensorflow/lite/delegates/gpu/common/task/tensor_desc.h"
39 #include "tensorflow/lite/delegates/gpu/common/task/weights_conversion.h"
40 #include "tensorflow/lite/delegates/gpu/common/tasks/elementwise.h"
41 #include "tensorflow/lite/delegates/gpu/common/tasks/mean_stddev_normalization.h"
42 #include "tensorflow/lite/delegates/gpu/common/tasks/transpose.h"
43 #include "tensorflow/lite/delegates/gpu/common/tensor.h"
44 #include "tensorflow/lite/delegates/gpu/common/winograd_util.h"
45 
namespace tflite {
namespace gpu {
namespace {
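// Heuristic: the Winograd 4x4->6x6 path is recommended only when the
// convolution has enough input/output channel slices and produces enough 4x4
// output tiles; the minimums below are vendor-specific (Adreno and AMD use
// different thresholds than the default).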
bool IsRecommendedForWinograd4x4To6x6(const Convolution2DAttributes& attr,
                                      const GpuInfo& gpu_info,
                                      const BHWC& dst_shape) {
  const int tiles_x = DivideRoundUp(dst_shape.w, 4);
  const int tiles_y = DivideRoundUp(dst_shape.h, 4);
  const int total_tiles = tiles_x * tiles_y;
  const int src_depth = DivideRoundUp(attr.weights.shape.i, 4);
  const int dst_depth = DivideRoundUp(attr.weights.shape.o, 4);
  int min_src_depth = 16;
  int min_dst_depth = 16;
  if (gpu_info.IsAdreno()) {
    min_src_depth = 32;
    min_dst_depth = 32;
  } else if (gpu_info.IsAMD()) {
    min_dst_depth = 8;
  }
  int min_tiles = 32;
  if (gpu_info.IsAdreno()) {
    if (gpu_info.adreno_info.IsAdreno6xx()) {
      min_tiles = 128;
    } else {
      min_tiles = 64;
    }
  }
  const bool recommended_channels =
      src_depth >= min_src_depth && dst_depth >= min_dst_depth;
  const bool recommended_hw = total_tiles >= min_tiles;
  return recommended_channels && recommended_hw;
}

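// Expands an eligible 3x3 convolution into a three-operation subgraph:
// winograd_4x4_to_36 (input transform) -> convolution in the transformed
// 36-element domain -> winograd_36_to_4x4 (output transform, which also adds
// the bias). Returns UnimplementedError when the transform is not suitable or
// not recommended for this device/shape.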
WinogradFromNode(const GpuInfo & gpu_info,const std::vector<Value * > & inputs,const std::vector<Value * > & outputs,const OperationDef & op_def,ModelHints hints,const BHWC & input_shape,const BHWC & output_shape,const Convolution2DAttributes & attr,GPUOperationsSubgraph * gpu_subgraph)79 absl::Status WinogradFromNode(const GpuInfo& gpu_info,
80                               const std::vector<Value*>& inputs,
81                               const std::vector<Value*>& outputs,
82                               const OperationDef& op_def, ModelHints hints,
83                               const BHWC& input_shape, const BHWC& output_shape,
84                               const Convolution2DAttributes& attr,
85                               GPUOperationsSubgraph* gpu_subgraph) {
86   if (!IsSuitableForWinograd4x4To6x6(attr)) {
87     return absl::UnimplementedError("No implementation for this case.");
88   }
89   if (!IsRecommendedForWinograd4x4To6x6(attr, gpu_info, output_shape)) {
90     return absl::UnimplementedError("Not recommended for this case.");
91   }
92 
93   const int tiles_x = DivideRoundUp(output_shape.w, 4);
94   const int tiles_y = DivideRoundUp(output_shape.h, 4);
95   const BHWC src_transformed_shape{input_shape.b, 36, tiles_x * tiles_y,
96                                    input_shape.c};
97   const BHWC dst_transformed_shape{input_shape.b, 36, tiles_x * tiles_y,
98                                    output_shape.c};
99   TensorDescriptor src_transformed_desc = op_def.src_tensors[0];
100   RETURN_IF_ERROR(src_transformed_desc.UpdateToSupportedStorageType(
101       gpu_info, src_transformed_shape));
102   TensorDescriptor dst_transformed_desc = op_def.src_tensors[0];
103   RETURN_IF_ERROR(dst_transformed_desc.UpdateToSupportedStorageType(
104       gpu_info, dst_transformed_shape));
105   const int src_transformed_id =
106       gpu_subgraph->AddTensor(src_transformed_shape, src_transformed_desc);
107   const int dst_transformed_id =
108       gpu_subgraph->AddTensor(dst_transformed_shape, dst_transformed_desc);
109   gpu_subgraph->operations.clear();
110   gpu_subgraph->operations.resize(3);
111 
112   OperationDef winograd_up_def;
113   winograd_up_def.precision = op_def.precision;
114   winograd_up_def.src_tensors.push_back(op_def.src_tensors[0]);
115   winograd_up_def.dst_tensors.push_back(src_transformed_desc);
116   auto& winograd_up = gpu_subgraph->operations[0];
117   winograd_up.operation =
118       SelectWinograd4x4To36(gpu_info, attr.padding, winograd_up_def);
119   winograd_up.input_ids = {static_cast<int>(inputs[0]->id)};
120   winograd_up.output_ids = {src_transformed_id};
121   winograd_up.name = "winograd_4x4_to_36";
122 
123   OperationDef conv_def;
124   conv_def.precision = op_def.precision;
125   conv_def.src_tensors.push_back(src_transformed_desc);
126   conv_def.dst_tensors.push_back(dst_transformed_desc);
127   auto& conv = gpu_subgraph->operations[1];
128   conv.input_ids = {src_transformed_id};
129   conv.output_ids = {dst_transformed_id};
130   conv.operation = SelectConvolutionForWinograd(attr, input_shape, gpu_info,
131                                                 conv_def, hints);
132   conv.name = "convolution_winograd_4x4_6x6";
133   conv.operation->flops_ =
134       GetConvolutionWinograd4x4To6x6Flops(output_shape, attr.weights.shape);
135 
136   OperationDef winograd_down_def;
137   winograd_down_def.precision = op_def.precision;
138   winograd_down_def.src_tensors.push_back(dst_transformed_desc);
139   winograd_down_def.dst_tensors.push_back(op_def.dst_tensors[0]);
140   auto& winograd_down = gpu_subgraph->operations[2];
141   winograd_down.input_ids = {dst_transformed_id};
142   winograd_down.output_ids = {static_cast<int>(outputs[0]->id)};
143   auto bias_copy = attr.bias;
144   if (bias_copy.shape.v < attr.weights.shape.o) {
145     bias_copy.shape = Linear(attr.weights.shape.o);
146     bias_copy.data.resize(attr.weights.shape.o);
147   }
148   winograd_down.operation =
149       SelectWinograd36To4x4(gpu_info, winograd_down_def, bias_copy);
150   winograd_down.name = "winograd_36_to_4x4";
151   return absl::OkStatus();
152 }
153 
// Supported operation types:
// 1) BATCHED_MATMUL
// 2) CONVOLUTION_2D
// 3) CONVOLUTION_TRANSPOSED
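//
// Appends two operations to |gpu_subgraph|: a converter that rearranges the
// runtime weights tensor into the layout required by the selected kernel
// (either four 2D textures or a single buffer), followed by the convolution
// (or matmul-expressed-as-convolution) that consumes the converted weights.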
absl::Status AddDynamicConv(ModelHints hints, const GpuInfo& gpu_info,
                            const OperationDef& op_def, OperationType op_type,
                            const BHWC& src_shape, const OHWI& weights_shape,
                            const BHWC& dst_shape, int src_id, int weights_id,
                            int dst_id, GPUOperationsSubgraph* gpu_subgraph,
                            void* attr = nullptr) {
  gpu_subgraph->operations.reserve(gpu_subgraph->operations.size() + 2);
  gpu_subgraph->operations.push_back({});
  auto& converter_op = gpu_subgraph->operations.back();
  gpu_subgraph->operations.push_back({});
  auto& conv_op = gpu_subgraph->operations.back();
  OperationDef conv_temp_def = op_def;
  conv_temp_def.src_tensors[1] = {op_def.src_tensors[1].GetDataType(),
                                  TensorStorageType::BUFFER, Layout::HWC};
  WeightsDescription weights_desc;
  const BHWC weights_shape_bhwc(weights_shape.o, weights_shape.h,
                                weights_shape.w, weights_shape.i);
  conv_op.output_ids = {dst_id};
  if (op_type == OperationType::CONVOLUTION_2D) {
    Convolution2DAttributes* conv_attr =
        reinterpret_cast<Convolution2DAttributes*>(attr);
    conv_op.operation = SelectConvolutionWithDynamicWeights(
        *conv_attr, weights_shape_bhwc, dst_shape, gpu_info, conv_temp_def,
        hints, &weights_desc);
    conv_op.name = "convolution_dynamic";
    conv_op.operation->flops_ = GetConvolutionFlops(dst_shape, weights_shape);
  } else if (op_type == OperationType::CONVOLUTION_TRANSPOSED) {
    ConvolutionTransposedAttributes* conv_attr =
        reinterpret_cast<ConvolutionTransposedAttributes*>(attr);
    conv_op.operation = SelectConvolutionTransposedWithDynamicWeights(
        *conv_attr, gpu_info, conv_temp_def, &weights_desc);
    conv_op.name = "conv_transposed_dynamic";
    conv_op.operation->flops_ =
        GetConvolutionTransposedFlops(src_shape, weights_shape);
  } else if (op_type == OperationType::BATCHED_MATMUL) {
    conv_op.operation =
        SelectConvolutionBatchedMatMul(weights_shape, dst_shape, gpu_info,
                                       conv_temp_def, hints, &weights_desc);
    conv_op.name = "mat_mul_as_convolution";
    conv_op.operation->flops_ =
        dst_shape.b * dst_shape.h * dst_shape.w * dst_shape.c * weights_shape.i;
  } else {
    return absl::InternalError("No support for this operation type.");
  }
  conv_op.input_ids = {src_id};
  if (weights_desc.layout == WeightsLayout::k2DX4I4YIsSpatialIAndXIsOOGroupO4 ||
      weights_desc.layout == WeightsLayout::k2DX4O4YIsSpatialIAndXIsOOGroupI4) {
    // Weights are stored as four 2D textures.
    uint2 tex_size = Get2dResourceSize(weights_desc, weights_shape);
    for (int i = 0; i < 4; ++i) {
      int tensor_id = gpu_subgraph->AddTensor(
          BHWC(1, tex_size.y, tex_size.x, 4),
          TensorDescriptor(weights_desc.type, TensorStorageType::TEXTURE_2D,
                           Layout::HWC));
      conv_op.input_ids.push_back(tensor_id);
      converter_op.output_ids.push_back(tensor_id);
    }
  } else {
    // Weights are stored as a single buffer.
    int tensor_id = gpu_subgraph->AddTensor(
        BHWC(1, 1, 1,
             GetTotalElementsCountForLayout(weights_desc, weights_shape)),
        TensorDescriptor(weights_desc.type, TensorStorageType::BUFFER,
                         Layout::HWC));
    conv_op.input_ids.push_back(tensor_id);
    converter_op.output_ids.push_back(tensor_id);
  }
  OperationDef conv_def = conv_op.operation->GetDefinition();
  OperationDef converter_def;
  converter_def.precision = op_def.precision;
  converter_def.src_tensors.push_back(op_def.src_tensors[1]);
  for (int i = 1; i < conv_def.src_tensors.size(); ++i) {
    converter_def.dst_tensors.push_back(conv_def.src_tensors[i]);
  }

  converter_op.input_ids = {weights_id};
  Layout input_layout = Layout::OHWI;
  if (op_type == OperationType::BATCHED_MATMUL) {
    input_layout = Layout::HWIO;
  }
  converter_op.operation = SelectConverterToConvWeights(
      weights_desc, converter_def, hints, input_layout);
  converter_op.name = "bhwc_tensor_to_conv_weights";
  return absl::OkStatus();
}

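// Attaches constant convolution weights to the first operation of
// |gpu_subgraph|. If the same weights (matching both the weights id and the
// required layout) were already uploaded for a previous node, the existing
// GPU tensors are reused instead of duplicating the data.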
void AddConvSharedWeights(
    const Convolution2DAttributes& attr, const WeightsDescription& weights_desc,
    std::vector<SharedWeightsConvDesc>* shared_conv_weights,
    GPUOperationsSubgraph* gpu_subgraph) {
  SharedWeightsConvDesc shared_weights_desc;
  shared_weights_desc.weights_id = attr.weights.id;
  shared_weights_desc.desc = weights_desc;
  int index = -1;
  for (int i = 0; i < shared_conv_weights->size(); ++i) {
    if ((*shared_conv_weights)[i] == shared_weights_desc) {
      index = i;
      break;
    }
  }
  if (index != -1) {
    const auto& new_ids = (*shared_conv_weights)[index].global_const_ids;
    for (int i = 0; i < new_ids.size(); ++i) {
      gpu_subgraph->operations[0].input_ids.push_back(new_ids[i]);
    }
  } else {
    shared_conv_weights->push_back(shared_weights_desc);
    if (weights_desc.layout ==
            WeightsLayout::k2DX4I4YIsSpatialIAndXIsOOGroupO4 ||
        weights_desc.layout ==
            WeightsLayout::k2DX4O4YIsSpatialIAndXIsOOGroupI4) {
      // Weights are stored as four 2D textures.
      uint2 tex_size = Get2dResourceSize(weights_desc, attr.weights.shape);
      const int flt_count =
          GetTotalElementsCountForLayout(weights_desc, attr.weights.shape);

      std::vector<uint8_t> weights_data(flt_count * SizeOf(weights_desc.type));
      RearrangeWeights(attr.weights, weights_desc,
                       absl::MakeSpan(weights_data));
      int sub_size = SizeOf(weights_desc.type) * 4 * tex_size.x * tex_size.y;
      for (int i = 0; i < 4; ++i) {
        TensorDescriptor weights_tensor = TensorDescriptor(
            weights_desc.type, TensorStorageType::TEXTURE_2D, Layout::HWC);
        weights_tensor.SetBHWCShape(BHWC(1, tex_size.y, tex_size.x, 4));
        weights_tensor.SetData(std::vector<uint8_t>(
            weights_data.data() + sub_size * i,
            weights_data.data() + sub_size * i + sub_size));
        int tensor_id = gpu_subgraph->AddTensor(std::move(weights_tensor));
        gpu_subgraph->operations[0].input_ids.push_back(tensor_id);
        shared_conv_weights->back().global_const_ids.push_back(tensor_id);
      }
    } else {
      // Weights are stored as a single buffer.
      TensorDescriptor weights_tensor = TensorDescriptor(
          weights_desc.type, TensorStorageType::BUFFER, Layout::HWC);
      const int flt_count =
          GetTotalElementsCountForLayout(weights_desc, attr.weights.shape);
      weights_tensor.SetBHWCShape(BHWC(1, 1, 1, flt_count));
      std::vector<uint8_t> weights_data =
          std::vector<uint8_t>(flt_count * SizeOf(weights_desc.type));
      RearrangeWeights(attr.weights, weights_desc,
                       absl::MakeSpan(weights_data));
      weights_tensor.SetData(std::move(weights_data));
      int tensor_id = gpu_subgraph->AddTensor(std::move(weights_tensor));
      gpu_subgraph->operations[0].input_ids.push_back(tensor_id);
      shared_conv_weights->back().global_const_ids.push_back(tensor_id);
    }
  }
}

}  // namespace

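// Translates one model node into GPU operation(s) stored in |gpu_subgraph|.
// Most operation types select a single kernel; a few (BATCHED_MATMUL, CONCAT
// with many inputs, SPLIT on Mali, Winograd-eligible CONVOLUTION_2D, and
// convolutions with runtime weights) are expanded into small multi-operation
// subgraphs with intermediate tensors.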
absl::Status GPUOperationFromNodePart0(
    const GpuInfo& gpu_info, const OperationDef& op_def, ModelHints hints,
    const std::vector<Value*>& inputs, const std::vector<Value*>& outputs,
    const Node& node, std::vector<SharedWeightsConvDesc>* shared_conv_weights,
    GPUOperationsSubgraph* gpu_subgraph) {
  std::unique_ptr<GPUOperation>* gpu_op =
      InitSingleOpSubgraph(inputs, outputs, gpu_subgraph);
  auto op_type = OperationTypeFromString(node.operation.type);
  switch (op_type) {
    case OperationType::BATCHED_MATMUL: {
      // MatMul is replaced with this sequence:
      //   1) Transpose the second tensor (weights): (D0xD1xHxW) -> (WxD0xD1xH).
      //   2) Run a convolution with runtime weights.
      // If batch != 1, the input is reshaped to HWC and the output is reshaped
      // back from HWC.
      auto first_shape = inputs[0]->tensor.shape;
      auto second_shape = inputs[1]->tensor.shape;
      auto dst_shape = outputs[0]->tensor.shape;
      gpu_subgraph->operations.clear();
      int src_id = static_cast<int>(inputs[0]->id);
      int dst_id = static_cast<int>(outputs[0]->id);
      const OHWI weights_shape(second_shape.c, second_shape.b, second_shape.h,
                               second_shape.w);
      const BHWC weights_shape_bhwc(weights_shape.o, weights_shape.h,
                                    weights_shape.w, weights_shape.i);
      if (dst_shape.b != 1) {
        const BHWC hwc_input_shape(1, first_shape.b * first_shape.h,
                                   first_shape.w, first_shape.c);
        const BHWC hwc_output_shape(1, dst_shape.b * dst_shape.h, dst_shape.w,
                                    dst_shape.c);
        TensorDescriptor hwc_input_desc = {
            op_def.src_tensors[0].GetDataType(),
            op_def.src_tensors[0].GetStorageType(), Layout::BHWC};
        TensorDescriptor hwc_output_desc = {
            op_def.dst_tensors[0].GetDataType(),
            op_def.dst_tensors[0].GetStorageType(), Layout::BHWC};
        src_id = gpu_subgraph->AddTensor(hwc_input_shape, hwc_input_desc);
        dst_id = gpu_subgraph->AddTensor(hwc_output_shape, hwc_output_desc);

        OperationDef reshape_input_def;
        reshape_input_def.precision = op_def.precision;
        reshape_input_def.src_tensors.push_back(op_def.src_tensors[0]);
        reshape_input_def.dst_tensors.push_back(hwc_input_desc);
        gpu_subgraph->operations.push_back({});
        auto& reshape_input_op = gpu_subgraph->operations.back();
        SelectReshape(first_shape.c, first_shape.c, reshape_input_def,
                      &reshape_input_op.operation);
        reshape_input_op.input_ids = {static_cast<int>(inputs[0]->id)};
        reshape_input_op.output_ids = {src_id};
        reshape_input_op.name = "mat_mul_reshape_input";
      }
      OperationDef conv_def = op_def;
      RETURN_IF_ERROR(AddDynamicConv(
          hints, gpu_info, conv_def, op_type, first_shape, weights_shape,
          dst_shape, src_id, inputs[1]->id, dst_id, gpu_subgraph));
      if (dst_shape.b != 1) {
        TensorDescriptor hwc_output_desc = {
            op_def.dst_tensors[0].GetDataType(),
            op_def.dst_tensors[0].GetStorageType(), Layout::BHWC};

        OperationDef reshape_output_def;
        reshape_output_def.precision = op_def.precision;
        reshape_output_def.src_tensors.push_back(hwc_output_desc);
        reshape_output_def.dst_tensors.push_back(op_def.dst_tensors[0]);
        gpu_subgraph->operations.push_back({});
        auto& reshape_output_op = gpu_subgraph->operations.back();
        SelectReshape(dst_shape.c, dst_shape.c, reshape_output_def,
                      &reshape_output_op.operation);
        reshape_output_op.input_ids = {dst_id};
        reshape_output_op.output_ids = {static_cast<int>(outputs[0]->id)};
        reshape_output_op.name = "mat_mul_reshape_output";
      }
      return absl::OkStatus();
    }
    case OperationType::CAST:
      SelectCast(op_def, gpu_info, gpu_op);
      return absl::OkStatus();
    case OperationType::CONCAT: {
      auto attr = absl::any_cast<ConcatAttributes>(node.operation.attributes);
      int max_inputs = gpu_info.GetMaxImageArguments() - 8;
      if (gpu_info.IsMali()) {
        // Mali can fail clEnqueueNDRangeKernel with "Out of resources" when it
        // receives a kernel that is too big.
        max_inputs = std::min(8, max_inputs);
      }
      if (inputs.size() >= max_inputs) {
        int groups = DivideRoundUp(inputs.size(), max_inputs);
        gpu_subgraph->operations.clear();
        gpu_subgraph->operations.resize(groups);
        BHWC concatenated_shape = inputs[0]->tensor.shape;
        concatenated_shape.set(attr.axis, 0);
        for (int g = 0; g < groups; ++g) {
          std::vector<int> channels;
          auto& concat_op = gpu_subgraph->operations[g];
          OperationDef new_def;
          new_def.precision = op_def.precision;
          if (g != 0) {
            // Concatenated tensor from previous concats.
            new_def.src_tensors.push_back(op_def.dst_tensors[0]);
            concat_op.input_ids = {-g};
            channels.push_back(concatenated_shape.c);
          }
          for (int i = 0; i < max_inputs; ++i) {
            int src_index = g * max_inputs + i;
            if (src_index >= op_def.src_tensors.size()) {
              break;
            }
            new_def.src_tensors.push_back(op_def.src_tensors[src_index]);
            concat_op.input_ids.push_back(inputs[src_index]->id);
            channels.push_back(inputs[src_index]->tensor.shape.c);
            int current_size = concatenated_shape.get(attr.axis);
            concatenated_shape.set(
                attr.axis,
                current_size + inputs[src_index]->tensor.shape.get(attr.axis));
          }
          new_def.dst_tensors.push_back(op_def.dst_tensors[0]);
          if (g == groups - 1) {
            // Last concat writes to the node output.
            concat_op.output_ids = {static_cast<int>(outputs[0]->id)};
          } else {
            // Intermediate concat, create a new tensor for it.
            int tensor_id = gpu_subgraph->AddTensor(concatenated_shape,
                                                    op_def.dst_tensors[0]);
            concat_op.output_ids = {tensor_id};
          }
          RETURN_IF_ERROR(SelectConcat(attr, channels, new_def, gpu_info,
                                       &concat_op.operation));
        }
        return absl::OkStatus();
      } else {
        std::vector<int> channels(inputs.size());
        for (int i = 0; i < inputs.size(); ++i) {
          channels[i] = inputs[i]->tensor.shape.c;
        }
        return SelectConcat(attr, channels, op_def, gpu_info, gpu_op);
      }
    }
    case OperationType::CONVOLUTION_2D: {
      auto attr =
          absl::any_cast<Convolution2DAttributes>(node.operation.attributes);
      auto input_shape = inputs[0]->tensor.shape;
      auto output_shape = outputs[0]->tensor.shape;
      if (inputs.size() == 1) {
        if (!hints.Check(ModelHints::kNoWinogradOptimizations) &&
            WinogradFromNode(gpu_info, inputs, outputs, op_def, hints,
                             input_shape, output_shape, attr, gpu_subgraph)
                .ok()) {
          return absl::OkStatus();
        } else {
          gpu_op = InitSingleOpSubgraph(inputs, outputs, gpu_subgraph);
          if (attr.groups != 1) {
            gpu_subgraph->operations[0].name = "convolution_2d_grouped";
          }
          if (!shared_conv_weights || attr.weights.id == -1) {
            *gpu_op =
                SelectConvolution(attr, output_shape, gpu_info, op_def, hints);
          } else {
            // Use a convolution with shared constant weights.
            WeightsDescription weights_desc;
            const BHWC weights_shape_bhwc(
                attr.weights.shape.o, attr.weights.shape.h,
                attr.weights.shape.w, attr.weights.shape.i);
            OperationDef conv_temp_def = op_def;
            conv_temp_def.src_tensors.push_back(
                {op_def.src_tensors[0].GetDataType(), TensorStorageType::BUFFER,
                 Layout::HWC});
            *gpu_op = SelectConvolutionWithDynamicWeights(
                attr, weights_shape_bhwc, output_shape, gpu_info, conv_temp_def,
                hints, &weights_desc);
            AddConvSharedWeights(attr, weights_desc, shared_conv_weights,
                                 gpu_subgraph);
          }
          (*gpu_op)->flops_ =
              GetConvolutionFlops(output_shape, attr.weights.shape);
          return absl::OkStatus();
        }
      } else {
        // CONVOLUTION_2D with runtime weights.
        const OHWI weights_shape =
            OHWI(inputs[1]->tensor.shape.b, inputs[1]->tensor.shape.h,
                 inputs[1]->tensor.shape.w, inputs[1]->tensor.shape.c);
        if (weights_shape.i != inputs[0]->tensor.shape.c) {
          return absl::UnimplementedError(
              "No support for grouped convolution with runtime weights");
        }
        if (attr.bias.data.empty()) {
          attr.bias.shape = Linear(weights_shape.o);
          attr.bias.data.resize(weights_shape.o, 0.0f);
        }
        gpu_subgraph->operations.clear();
        return AddDynamicConv(hints, gpu_info, op_def, op_type, input_shape,
                              weights_shape, output_shape, inputs[0]->id,
                              inputs[1]->id, outputs[0]->id, gpu_subgraph,
                              &attr);
      }
    }
    case OperationType::CONVOLUTION_TRANSPOSED: {
      auto attr = absl::any_cast<ConvolutionTransposedAttributes>(
          node.operation.attributes);
      if (inputs.size() == 1) {
        *gpu_op = SelectConvolutionTransposed(attr, gpu_info, op_def);
        (*gpu_op)->flops_ = GetConvolutionTransposedFlops(
            inputs[0]->tensor.shape, attr.weights.shape);
        return absl::OkStatus();
      } else {
        // CONVOLUTION_TRANSPOSED with runtime weights.
        const OHWI weights_shape =
            OHWI(inputs[1]->tensor.shape.b, inputs[1]->tensor.shape.h,
                 inputs[1]->tensor.shape.w, inputs[1]->tensor.shape.c);
        if (attr.bias.data.empty()) {
          attr.bias.shape = Linear(weights_shape.o);
          attr.bias.data.resize(weights_shape.o, 0.0f);
        }
        gpu_subgraph->operations.clear();
        return AddDynamicConv(
            hints, gpu_info, op_def, op_type, inputs[0]->tensor.shape,
            weights_shape, outputs[0]->tensor.shape, inputs[0]->id,
            inputs[1]->id, outputs[0]->id, gpu_subgraph, &attr);
      }
    }
    case OperationType::DEPTHWISE_CONVOLUTION: {
      auto attr = absl::any_cast<DepthwiseConvolution2DAttributes>(
          node.operation.attributes);
      if (inputs.size() == 1) {
        *gpu_op = SelectDWConvolution(attr, gpu_info, op_def);
        (*gpu_op)->flops_ = GetDepthwiseConvolutionFlops(
            outputs[0]->tensor.shape, attr.weights.shape);
      } else {
        if (inputs[1]->tensor.shape.b != 1) {
          return absl::UnimplementedError(
              "No support for depthwise runtime weights with channel "
              "multiplier != 1");
        }
        *gpu_op = SelectDWConvolutionDynamicWeights(attr, gpu_info, op_def);
        (*gpu_op)->flops_ = GetDepthwiseConvolutionFlops(
            outputs[0]->tensor.shape,
            OHWI(inputs[1]->tensor.shape.b, inputs[1]->tensor.shape.h,
                 inputs[1]->tensor.shape.w, inputs[1]->tensor.shape.c));
      }
      return absl::OkStatus();
    }
    case OperationType::CUMSUM: {
      auto attr = absl::any_cast<CumsumAttributes>(node.operation.attributes);
      SelectCumsum(op_def, attr, gpu_op);
      return absl::OkStatus();
    }
    case OperationType::DEPTH_TO_SPACE: {
      auto attr =
          absl::any_cast<SpaceToDepthAttributes>(node.operation.attributes);
      SelectDepthToSpace(attr, op_def, gpu_op);
      return absl::OkStatus();
    }
    case OperationType::FULLY_CONNECTED: {
      auto attr =
          absl::any_cast<FullyConnectedAttributes>(node.operation.attributes);
      *gpu_op = SelectFullyConnected(attr, gpu_info, op_def,
                                     inputs[0]->tensor.shape.b);
      (*gpu_op)->flops_ =
          GetFullyConnectedFlops(outputs[0]->tensor.shape, attr.weights.shape);
      return absl::OkStatus();
    }
    case OperationType::FULLY_CONNECTED_INT8: {
      auto attr = absl::any_cast<FullyConnectedInt8Attributes>(
          node.operation.attributes);
      *gpu_op = SelectFullyConnected(attr, gpu_info, op_def);
      return absl::OkStatus();
    }
    case OperationType::GATHER: {
      auto attr = absl::any_cast<GatherAttributes>(node.operation.attributes);
      RETURN_IF_ERROR(SelectGather(attr, op_def, gpu_op));
      return absl::OkStatus();
    }
    case OperationType::LSTM: {
      *gpu_op = SelectLSTM(op_def, gpu_info);
      return absl::OkStatus();
    }
    case OperationType::MAX_UNPOOLING_2D: {
      auto attr =
          absl::any_cast<MaxUnpooling2DAttributes>(node.operation.attributes);
      *gpu_op = SelectMaxUnpooling(attr, gpu_info, op_def);
      return absl::OkStatus();
    }
    case OperationType::MEAN: {
      auto attr = absl::any_cast<MeanAttributes>(node.operation.attributes);
      *gpu_op = SelectReduce(attr.dims, inputs[0]->tensor.shape, op_type,
                             op_def, gpu_info);
      return absl::OkStatus();
    }
    case OperationType::MEAN_STDDEV_NORMALIZATION: {
      MeanStdDevNormalization operation = CreateMeanStdDevNormalization(
          op_def, gpu_info, inputs[0]->tensor.shape);
      *gpu_op = std::make_unique<MeanStdDevNormalization>(std::move(operation));
      return absl::OkStatus();
    }
    case OperationType::ONE_HOT: {
      auto attr = absl::any_cast<OneHotAttributes>(node.operation.attributes);
      SelectOneHot(op_def, attr, gpu_op);
      return absl::OkStatus();
    }
    case OperationType::PAD: {
      auto attr = absl::any_cast<PadAttributes>(node.operation.attributes);
      SelectPadding(attr, op_def, gpu_op);
      return absl::OkStatus();
    }
    case OperationType::POOLING_2D: {
      auto attr =
          absl::any_cast<Pooling2DAttributes>(node.operation.attributes);
      *gpu_op = SelectPooling(attr, gpu_info, op_def);
      return absl::OkStatus();
    }
    case OperationType::PRELU: {
      auto attr = absl::any_cast<PReLUAttributes>(node.operation.attributes);
      *gpu_op = SelectPReLU(attr, gpu_info, op_def);
      return absl::OkStatus();
    }
    case OperationType::QUANTIZE_AND_DEQUANTIZE: {
      auto attr = absl::any_cast<QuantizeAndDequantizeAttributes>(
          node.operation.attributes);
      *gpu_op = SelectQuantizeAndDequantize(attr, op_def);
      return absl::OkStatus();
    }
    case OperationType::RELU: {
      auto attr = absl::any_cast<ReLUAttributes>(node.operation.attributes);
      *gpu_op = SelectReLU(attr, op_def);
      return absl::OkStatus();
    }
    case OperationType::RESAMPLER: {
      *gpu_op = SelectResampler(op_def, gpu_info);
      return absl::OkStatus();
    }
    case OperationType::RESHAPE: {
      const int src_channels = inputs[0]->tensor.shape.c;
      auto attr = absl::any_cast<ReshapeAttributes>(node.operation.attributes);
      SelectReshape(src_channels, attr.new_shape.c, op_def, gpu_op);
      return absl::OkStatus();
    }
    case OperationType::RESIZE: {
      auto attr = absl::any_cast<Resize2DAttributes>(node.operation.attributes);
      return SelectResize(attr, op_def, gpu_op);
    }
    case OperationType::SLICE: {
      auto attr = absl::any_cast<SliceAttributes>(node.operation.attributes);
      SelectStridedSlice(attr, op_def, gpu_op);
      return absl::OkStatus();
    }
    case OperationType::SOFTMAX: {
      SelectSoftmax(inputs[0]->tensor.shape, op_def, gpu_op);
      return absl::OkStatus();
    }
    case OperationType::SPACE_TO_DEPTH: {
      auto attr =
          absl::any_cast<SpaceToDepthAttributes>(node.operation.attributes);
      SelectSpaceToDepth(attr, op_def, gpu_op);
      return absl::OkStatus();
    }
    case OperationType::SPLIT: {
      std::vector<int> channels;
      channels.reserve(outputs.size());
      for (const auto& output : outputs) {
        channels.push_back(output->tensor.shape.c);
      }
      auto attr = absl::any_cast<SplitAttributes>(node.operation.attributes);
      if (gpu_info.IsMali()) {
        // Mali can fail clEnqueueNDRangeKernel with "Out of resources" when it
        // receives a kernel that is too big. Replace the single complex split
        // into N outputs with N simple slice kernels.
        gpu_subgraph->operations.clear();
        gpu_subgraph->operations.resize(outputs.size());
        int split_offset = 0;
        for (int i = 0; i < outputs.size(); ++i) {
          auto& op = gpu_subgraph->operations[i];
          op.input_ids = {static_cast<int>(inputs[0]->id)};
          op.output_ids = {static_cast<int>(outputs[i]->id)};
          OperationDef new_def;
          new_def.precision = op_def.precision;
          new_def.src_tensors.push_back(op_def.src_tensors[0]);
          new_def.dst_tensors.push_back(op_def.dst_tensors[i]);
          SliceAttributes new_attr;
          new_attr.starts = BHWC(0, 0, 0, 0);
          new_attr.ends = inputs[0]->tensor.shape;
          new_attr.strides = BHWC(1, 1, 1, 1);
          new_attr.starts.set(attr.axis, split_offset);
          new_attr.ends.set(
              attr.axis,
              split_offset + outputs[i]->tensor.shape.get(attr.axis));
          split_offset += outputs[i]->tensor.shape.get(attr.axis);
          SelectStridedSlice(new_attr, new_def, &op.operation);
        }
        return absl::OkStatus();
      }
      SelectSplit(attr, gpu_info, channels, op_def, gpu_op);
      return absl::OkStatus();
    }
    case OperationType::TILE: {
      *gpu_op = SelectTile(op_def, inputs[0]->tensor.shape);
      return absl::OkStatus();
    }
    case OperationType::TRANSPOSE: {
      auto attr =
          absl::any_cast<TransposeAttributes>(node.operation.attributes);
      SelectTranspose(attr, op_def, gpu_op);
      return absl::OkStatus();
    }
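    // Unary elementwise operations: all share a single one-input elementwise
    // kernel, with a broadcasting variant when the input and output shapes
    // differ.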
    case OperationType::ABS:
    case OperationType::COPY:
    case OperationType::COS:
    case OperationType::ELU:
    case OperationType::EXP:
    case OperationType::HARD_SWISH:
    case OperationType::LOG:
    case OperationType::NEG:
    case OperationType::RSQRT:
    case OperationType::SIGMOID:
    case OperationType::SIN:
    case OperationType::SQRT:
    case OperationType::SQUARE:
    case OperationType::TANH: {
      GPUOperation operation;
      if (inputs[0]->tensor.shape != outputs[0]->tensor.shape) {
        operation = CreateElementwiseOneInputWithBroadcast(
            gpu_info, op_def, op_type, inputs[0]->tensor.shape,
            outputs[0]->tensor.shape);
      } else {
        operation = CreateElementwiseOneInput(gpu_info, op_def, op_type);
      }
      *gpu_op = std::make_unique<GPUOperation>(std::move(operation));
      return absl::OkStatus();
    }
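    // Binary and scalar elementwise operations. ADD with three or more inputs,
    // or a two-input ADD with zero-padded channels, is routed to a dedicated
    // multi-input add kernel; otherwise the generic two-input or
    // attribute-based elementwise kernels are used.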
    case OperationType::ADD:
    case OperationType::DIV:
    case OperationType::EQUAL:
    case OperationType::GREATER:
    case OperationType::GREATER_EQUAL:
    case OperationType::LESS:
    case OperationType::LESS_EQUAL:
    case OperationType::MAXIMUM:
    case OperationType::MINIMUM:
    case OperationType::MUL:
    case OperationType::NOT_EQUAL:
    case OperationType::POW:
    case OperationType::SQUARED_DIFF:
    case OperationType::SUB: {
      if (op_type == OperationType::ADD && inputs.size() >= 2) {
        const bool two_input_add_with_zero_padded_channels =
            inputs[0]->tensor.shape.c % 4 == 0 &&
            inputs[1]->tensor.shape.c % 4 == 0 &&
            outputs[0]->tensor.shape.c % 4 == 0 &&
            (inputs[0]->tensor.shape.c != outputs[0]->tensor.shape.c ||
             inputs[1]->tensor.shape.c != outputs[0]->tensor.shape.c);
        if (inputs.size() >= 3 || two_input_add_with_zero_padded_channels) {
          auto output = outputs[0];
          std::vector<int> channels(inputs.size());
          for (int i = 0; i < inputs.size(); ++i) {
            channels[i] = inputs[i]->tensor.shape.c;
          }
          SelectAdd(op_def, channels, output->tensor.shape.c, gpu_op);
          return absl::OkStatus();
        }
      }

      if (inputs.size() == 2) {
        GPUOperation operation;
        if (inputs[0]->tensor.shape != outputs[0]->tensor.shape) {
          operation = CreateElementwiseTwoInputWithBroadcast(
              op_def, op_type, inputs[0]->tensor.shape, inputs[1]->tensor.shape,
              outputs[0]->tensor.shape);
        } else {
          operation = CreateElementwiseTwoInput(op_def, op_type,
                                                inputs[1]->tensor.shape);
        }
        *gpu_op = std::make_unique<GPUOperation>(std::move(operation));
        return absl::OkStatus();
      } else if (inputs.size() == 1 && node.operation.attributes.has_value()) {
        auto attr =
            absl::any_cast<ElementwiseAttributes>(node.operation.attributes);
        GPUOperation operation;
        if (inputs[0]->tensor.shape != outputs[0]->tensor.shape) {
          operation = CreateElementwiseWithBroadcast(
              gpu_info, op_def, op_type, attr, inputs[0]->tensor.shape,
              outputs[0]->tensor.shape);
        } else {
          operation = CreateElementwise(gpu_info, op_def, op_type, attr);
        }
        *gpu_op = std::make_unique<GPUOperation>(std::move(operation));
        return absl::OkStatus();
      }
      return absl::UnimplementedError(absl::StrCat(
          "No support for ", node.operation.type, " with these parameters"));
    }
    case OperationType::REDUCE_MAXIMUM:
    case OperationType::REDUCE_MINIMUM:
    case OperationType::REDUCE_PRODUCT:
    case OperationType::REDUCE_SUM: {
      auto attr = absl::any_cast<ReduceAttributes>(node.operation.attributes);
      *gpu_op = SelectReduce(attr.dims, inputs[0]->tensor.shape, op_type,
                             op_def, gpu_info);
      return absl::OkStatus();
    }
    case OperationType::SELECT_V2: {
      auto attr = absl::any_cast<SelectV2Attributes>(node.operation.attributes);
      SelectSelectV2(op_def, attr, gpu_op);
      return absl::OkStatus();
    }
    default:
      return SelectDefault(gpu_info, op_def, hints, inputs, outputs, node,
                           gpu_subgraph);
  }
}

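// Public entry point. Builds the subgraph via GPUOperationFromNodePart0 and
// then appends the originating node's id (and its type, when the operation
// has no name yet) to every operation name, so profiling and debug output can
// be traced back to the model graph.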
absl::Status GPUOperationFromNode(
    const GpuInfo& gpu_info, const OperationDef& op_def, ModelHints hints,
    const std::vector<Value*>& inputs, const std::vector<Value*>& outputs,
    const Node& node, std::vector<SharedWeightsConvDesc>* shared_conv_weights,
    GPUOperationsSubgraph* gpu_subgraph) {
  RETURN_IF_ERROR(GPUOperationFromNodePart0(gpu_info, op_def, hints, inputs,
                                            outputs, node, shared_conv_weights,
                                            gpu_subgraph));
  for (auto& gpu_op : gpu_subgraph->operations) {
    if (gpu_op.name.empty()) {
      gpu_op.name = node.operation.type + " " + std::to_string(node.id);
    } else {
      gpu_op.name += " " + std::to_string(node.id);
    }
  }
  return absl::OkStatus();
}

}  // namespace gpu
}  // namespace tflite