/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#include "tensorflow/lite/delegates/gpu/common/selectors/operation_selector.h"

#include <algorithm>
#include <memory>
#include <string>
#include <utility>
#include <vector>

#include "absl/strings/str_cat.h"
#include "absl/types/any.h"
#include "tensorflow/lite/delegates/gpu/common/data_type.h"
#include "tensorflow/lite/delegates/gpu/common/flops_util.h"
#include "tensorflow/lite/delegates/gpu/common/gpu_info.h"
#include "tensorflow/lite/delegates/gpu/common/operations.h"
#include "tensorflow/lite/delegates/gpu/common/selectors/convolution_selector.h"
#include "tensorflow/lite/delegates/gpu/common/selectors/convolution_transposed_selector.h"
#include "tensorflow/lite/delegates/gpu/common/selectors/default_selector.h"
#include "tensorflow/lite/delegates/gpu/common/selectors/dw_convolution_selector.h"
#include "tensorflow/lite/delegates/gpu/common/selectors/fully_connected_selector.h"
#include "tensorflow/lite/delegates/gpu/common/selectors/simple_selectors.h"
#include "tensorflow/lite/delegates/gpu/common/shape.h"
#include "tensorflow/lite/delegates/gpu/common/status.h"
#include "tensorflow/lite/delegates/gpu/common/task/tensor_desc.h"
#include "tensorflow/lite/delegates/gpu/common/task/weights_conversion.h"
#include "tensorflow/lite/delegates/gpu/common/tasks/elementwise.h"
#include "tensorflow/lite/delegates/gpu/common/tasks/mean_stddev_normalization.h"
#include "tensorflow/lite/delegates/gpu/common/tasks/transpose.h"
#include "tensorflow/lite/delegates/gpu/common/tensor.h"
#include "tensorflow/lite/delegates/gpu/common/winograd_util.h"

namespace tflite {
namespace gpu {
namespace {
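// Heuristic gate for the Winograd(4x4, 3x3) path below: the transform only
// pays off when there is enough work per dispatch, so we require a minimum
// number of 4x4 output tiles and minimum input/output slice counts (channels
// divided by 4), with per-vendor thresholds for Adreno and AMD. For example,
// a 64x64 output gives 16x16 = 256 tiles, which passes the tile check on
// every GPU listed here.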
bool IsRecommendedForWinograd4x4To6x6(const Convolution2DAttributes& attr,
                                      const GpuInfo& gpu_info,
                                      const BHWC& dst_shape) {
  const int tiles_x = DivideRoundUp(dst_shape.w, 4);
  const int tiles_y = DivideRoundUp(dst_shape.h, 4);
  const int total_tiles = tiles_x * tiles_y;
  const int src_depth = DivideRoundUp(attr.weights.shape.i, 4);
  const int dst_depth = DivideRoundUp(attr.weights.shape.o, 4);
  int min_src_depth = 16;
  int min_dst_depth = 16;
  if (gpu_info.IsAdreno()) {
    min_src_depth = 32;
    min_dst_depth = 32;
  } else if (gpu_info.IsAMD()) {
    min_dst_depth = 8;
  }
  int min_tiles = 32;
  if (gpu_info.IsAdreno()) {
    if (gpu_info.adreno_info.IsAdreno6xx()) {
      min_tiles = 128;
    } else {
      min_tiles = 64;
    }
  }
  const bool recommended_channels =
      src_depth >= min_src_depth && dst_depth >= min_dst_depth;
  const bool recommended_hw = total_tiles >= min_tiles;
  return recommended_channels && recommended_hw;
}

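// Tries to lower a 3x3 convolution to a Winograd F(4x4, 3x3) subgraph of
// three operations: winograd_4x4_to_36 (input transform), a convolution in
// the transformed domain, and winograd_36_to_4x4 (output transform plus
// bias). The two intermediate tensors have shape
// (batch, 36, tiles_x * tiles_y, channels). Returns UnimplementedError when
// the attributes are unsuitable or the heuristic above does not recommend the
// transform, so the caller can fall back to a regular convolution.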
absl::Status WinogradFromNode(const GpuInfo& gpu_info,
                              const std::vector<Value*>& inputs,
                              const std::vector<Value*>& outputs,
                              const OperationDef& op_def, ModelHints hints,
                              const BHWC& input_shape, const BHWC& output_shape,
                              const Convolution2DAttributes& attr,
                              GPUOperationsSubgraph* gpu_subgraph) {
  if (!IsSuitableForWinograd4x4To6x6(attr)) {
    return absl::UnimplementedError("No implementation for this case.");
  }
  if (!IsRecommendedForWinograd4x4To6x6(attr, gpu_info, output_shape)) {
    return absl::UnimplementedError("Not recommended for this case.");
  }

  const int tiles_x = DivideRoundUp(output_shape.w, 4);
  const int tiles_y = DivideRoundUp(output_shape.h, 4);
  const BHWC src_transformed_shape{input_shape.b, 36, tiles_x * tiles_y,
                                   input_shape.c};
  const BHWC dst_transformed_shape{input_shape.b, 36, tiles_x * tiles_y,
                                   output_shape.c};
  TensorDescriptor src_transformed_desc = op_def.src_tensors[0];
  RETURN_IF_ERROR(src_transformed_desc.UpdateToSupportedStorageType(
      gpu_info, src_transformed_shape));
  TensorDescriptor dst_transformed_desc = op_def.src_tensors[0];
  RETURN_IF_ERROR(dst_transformed_desc.UpdateToSupportedStorageType(
      gpu_info, dst_transformed_shape));
  const int src_transformed_id =
      gpu_subgraph->AddTensor(src_transformed_shape, src_transformed_desc);
  const int dst_transformed_id =
      gpu_subgraph->AddTensor(dst_transformed_shape, dst_transformed_desc);
  gpu_subgraph->operations.clear();
  gpu_subgraph->operations.resize(3);

  OperationDef winograd_up_def;
  winograd_up_def.precision = op_def.precision;
  winograd_up_def.src_tensors.push_back(op_def.src_tensors[0]);
  winograd_up_def.dst_tensors.push_back(src_transformed_desc);
  auto& winograd_up = gpu_subgraph->operations[0];
  winograd_up.operation =
      SelectWinograd4x4To36(gpu_info, attr.padding, winograd_up_def);
  winograd_up.input_ids = {static_cast<int>(inputs[0]->id)};
  winograd_up.output_ids = {src_transformed_id};
  winograd_up.name = "winograd_4x4_to_36";

  OperationDef conv_def;
  conv_def.precision = op_def.precision;
  conv_def.src_tensors.push_back(src_transformed_desc);
  conv_def.dst_tensors.push_back(dst_transformed_desc);
  auto& conv = gpu_subgraph->operations[1];
  conv.input_ids = {src_transformed_id};
  conv.output_ids = {dst_transformed_id};
  conv.operation = SelectConvolutionForWinograd(attr, input_shape, gpu_info,
                                                conv_def, hints);
  conv.name = "convolution_winograd_4x4_6x6";
  conv.operation->flops_ =
      GetConvolutionWinograd4x4To6x6Flops(output_shape, attr.weights.shape);

  OperationDef winograd_down_def;
  winograd_down_def.precision = op_def.precision;
  winograd_down_def.src_tensors.push_back(dst_transformed_desc);
  winograd_down_def.dst_tensors.push_back(op_def.dst_tensors[0]);
  auto& winograd_down = gpu_subgraph->operations[2];
  winograd_down.input_ids = {dst_transformed_id};
  winograd_down.output_ids = {static_cast<int>(outputs[0]->id)};
  auto bias_copy = attr.bias;
  if (bias_copy.shape.v < attr.weights.shape.o) {
    bias_copy.shape = Linear(attr.weights.shape.o);
    bias_copy.data.resize(attr.weights.shape.o);
  }
  winograd_down.operation =
      SelectWinograd36To4x4(gpu_info, winograd_down_def, bias_copy);
  winograd_down.name = "winograd_36_to_4x4";
  return absl::OkStatus();
}

// Supported operation types:
// 1) BATCHED_MATMUL
// 2) CONVOLUTION_2D
// 3) CONVOLUTION_TRANSPOSED
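// Builds a two-operation subgraph for convolutions whose weights arrive as a
// runtime tensor: a converter operation that repacks the second input (the
// weights) into the layout requested by the selected kernel, followed by the
// convolution itself, which reads the repacked weights through one or more
// intermediate tensors (four 2D textures or a single buffer, depending on
// WeightsDescription::layout). `attr` is only dereferenced for the operation
// types that need it.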
absl::Status AddDynamicConv(ModelHints hints, const GpuInfo& gpu_info,
                            const OperationDef& op_def, OperationType op_type,
                            const BHWC& src_shape, const OHWI& weights_shape,
                            const BHWC& dst_shape, int src_id, int weights_id,
                            int dst_id, GPUOperationsSubgraph* gpu_subgraph,
                            void* attr = nullptr) {
  gpu_subgraph->operations.reserve(gpu_subgraph->operations.size() + 2);
  gpu_subgraph->operations.push_back({});
  auto& converter_op = gpu_subgraph->operations.back();
  gpu_subgraph->operations.push_back({});
  auto& conv_op = gpu_subgraph->operations.back();
  OperationDef conv_temp_def = op_def;
  conv_temp_def.src_tensors[1] = {op_def.src_tensors[1].GetDataType(),
                                  TensorStorageType::BUFFER, Layout::HWC};
  WeightsDescription weights_desc;
  const BHWC weights_shape_bhwc(weights_shape.o, weights_shape.h,
                                weights_shape.w, weights_shape.i);
  conv_op.output_ids = {dst_id};
  if (op_type == OperationType::CONVOLUTION_2D) {
    Convolution2DAttributes* conv_attr =
        reinterpret_cast<Convolution2DAttributes*>(attr);
    conv_op.operation = SelectConvolutionWithDynamicWeights(
        *conv_attr, weights_shape_bhwc, dst_shape, gpu_info, conv_temp_def,
        hints, &weights_desc);
    conv_op.name = "convolution_dynamic";
    conv_op.operation->flops_ = GetConvolutionFlops(dst_shape, weights_shape);
  } else if (op_type == OperationType::CONVOLUTION_TRANSPOSED) {
    ConvolutionTransposedAttributes* conv_attr =
        reinterpret_cast<ConvolutionTransposedAttributes*>(attr);
    conv_op.operation = SelectConvolutionTransposedWithDynamicWeights(
        *conv_attr, gpu_info, conv_temp_def, &weights_desc);
    conv_op.name = "conv_transposed_dynamic";
    conv_op.operation->flops_ =
        GetConvolutionTransposedFlops(src_shape, weights_shape);
  } else if (op_type == OperationType::BATCHED_MATMUL) {
    conv_op.operation =
        SelectConvolutionBatchedMatMul(weights_shape, dst_shape, gpu_info,
                                       conv_temp_def, hints, &weights_desc);
    conv_op.name = "mat_mul_as_convolution";
    conv_op.operation->flops_ =
        dst_shape.b * dst_shape.h * dst_shape.w * dst_shape.c * weights_shape.i;
  } else {
    return absl::InternalError("No support of this operation type.");
  }
  conv_op.input_ids = {src_id};
  if (weights_desc.layout == WeightsLayout::k2DX4I4YIsSpatialIAndXIsOOGroupO4 ||
      weights_desc.layout == WeightsLayout::k2DX4O4YIsSpatialIAndXIsOOGroupI4) {
    // weights are four 2D textures
    uint2 tex_size = Get2dResourceSize(weights_desc, weights_shape);
    for (int i = 0; i < 4; ++i) {
      int tensor_id = gpu_subgraph->AddTensor(
          BHWC(1, tex_size.y, tex_size.x, 4),
          TensorDescriptor(weights_desc.type, TensorStorageType::TEXTURE_2D,
                           Layout::HWC));
      conv_op.input_ids.push_back(tensor_id);
      converter_op.output_ids.push_back(tensor_id);
    }
  } else {
    // weights are a single buffer
    int tensor_id = gpu_subgraph->AddTensor(
        BHWC(1, 1, 1,
             GetTotalElementsCountForLayout(weights_desc, weights_shape)),
        TensorDescriptor(weights_desc.type, TensorStorageType::BUFFER,
                         Layout::HWC));
    conv_op.input_ids.push_back(tensor_id);
    converter_op.output_ids.push_back(tensor_id);
  }
  OperationDef conv_def = conv_op.operation->GetDefinition();
  OperationDef converter_def;
  converter_def.precision = op_def.precision;
  converter_def.src_tensors.push_back(op_def.src_tensors[1]);
  for (int i = 1; i < conv_def.src_tensors.size(); ++i) {
    converter_def.dst_tensors.push_back(conv_def.src_tensors[i]);
  }

  converter_op.input_ids = {weights_id};
  Layout input_layout = Layout::OHWI;
  if (op_type == OperationType::BATCHED_MATMUL) {
    input_layout = Layout::HWIO;
  }
  converter_op.operation = SelectConverterToConvWeights(
      weights_desc, converter_def, hints, input_layout);
  converter_op.name = "bhwc_tensor_to_conv_weights";
  return absl::OkStatus();
}

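// Uploads constant convolution weights as global tensors and reuses them
// across operations: if a (weights id, WeightsDescription) pair was already
// registered in shared_conv_weights, the existing tensor ids are appended to
// the operation's inputs; otherwise the weights are rearranged into the
// requested layout, stored either as four 2D textures or as a single buffer,
// and remembered for later convolutions that share the same weights.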
void AddConvSharedWeights(
    const Convolution2DAttributes& attr, const WeightsDescription& weights_desc,
    std::vector<SharedWeightsConvDesc>* shared_conv_weights,
    GPUOperationsSubgraph* gpu_subgraph) {
  SharedWeightsConvDesc shared_weights_desc;
  shared_weights_desc.weights_id = attr.weights.id;
  shared_weights_desc.desc = weights_desc;
  int index = -1;
  for (int i = 0; i < shared_conv_weights->size(); ++i) {
    if ((*shared_conv_weights)[i] == shared_weights_desc) {
      index = i;
      break;
    }
  }
  if (index != -1) {
    const auto& new_ids = (*shared_conv_weights)[index].global_const_ids;
    for (int i = 0; i < new_ids.size(); ++i) {
      gpu_subgraph->operations[0].input_ids.push_back(new_ids[i]);
    }
  } else {
    shared_conv_weights->push_back(shared_weights_desc);
    if (weights_desc.layout ==
            WeightsLayout::k2DX4I4YIsSpatialIAndXIsOOGroupO4 ||
        weights_desc.layout ==
            WeightsLayout::k2DX4O4YIsSpatialIAndXIsOOGroupI4) {
      // weights are four 2D textures
      uint2 tex_size = Get2dResourceSize(weights_desc, attr.weights.shape);
      const int flt_count =
          GetTotalElementsCountForLayout(weights_desc, attr.weights.shape);

      std::vector<uint8_t> weights_data(flt_count * SizeOf(weights_desc.type));
      RearrangeWeights(attr.weights, weights_desc,
                       absl::MakeSpan(weights_data));
      int sub_size = SizeOf(weights_desc.type) * 4 * tex_size.x * tex_size.y;
      for (int i = 0; i < 4; ++i) {
        TensorDescriptor weights_tensor = TensorDescriptor(
            weights_desc.type, TensorStorageType::TEXTURE_2D, Layout::HWC);
        weights_tensor.SetBHWCShape(BHWC(1, tex_size.y, tex_size.x, 4));
        weights_tensor.SetData(std::vector<uint8_t>(
            weights_data.data() + sub_size * i,
            weights_data.data() + sub_size * i + sub_size));
        int tensor_id = gpu_subgraph->AddTensor(std::move(weights_tensor));
        gpu_subgraph->operations[0].input_ids.push_back(tensor_id);
        shared_conv_weights->back().global_const_ids.push_back(tensor_id);
      }
    } else {
      // weights are a single buffer
      TensorDescriptor weights_tensor = TensorDescriptor(
          weights_desc.type, TensorStorageType::BUFFER, Layout::HWC);
      const int flt_count =
          GetTotalElementsCountForLayout(weights_desc, attr.weights.shape);
      weights_tensor.SetBHWCShape(BHWC(1, 1, 1, flt_count));
      std::vector<uint8_t> weights_data =
          std::vector<uint8_t>(flt_count * SizeOf(weights_desc.type));
      RearrangeWeights(attr.weights, weights_desc,
                       absl::MakeSpan(weights_data));
      weights_tensor.SetData(std::move(weights_data));
      int tensor_id = gpu_subgraph->AddTensor(std::move(weights_tensor));
      gpu_subgraph->operations[0].input_ids.push_back(tensor_id);
      shared_conv_weights->back().global_const_ids.push_back(tensor_id);
    }
  }
}

}  // namespace

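// Maps a single graph node to one or more GPUOperations. Most operation types
// become a single operation chosen by one of the Select* helpers included
// above; convolutions, batched matmuls, wide concats and Mali splits are
// expanded into small subgraphs with intermediate tensors.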
absl::Status GPUOperationFromNodePart0(
    const GpuInfo& gpu_info, const OperationDef& op_def, ModelHints hints,
    const std::vector<Value*>& inputs, const std::vector<Value*>& outputs,
    const Node& node, std::vector<SharedWeightsConvDesc>* shared_conv_weights,
    GPUOperationsSubgraph* gpu_subgraph) {
  std::unique_ptr<GPUOperation>* gpu_op =
      InitSingleOpSubgraph(inputs, outputs, gpu_subgraph);
  auto op_type = OperationTypeFromString(node.operation.type);
  switch (op_type) {
    case OperationType::BATCHED_MATMUL: {
      // MatMul is replaced with this sequence:
      // 1) Transpose the second tensor (the weights): (D0xD1xHxW)->(WxD0xD1xH).
      // 2) Run a convolution with runtime weights.
      // If batch != 1, the input is reshaped to HWC and the output is reshaped
      // back from HWC.
      auto first_shape = inputs[0]->tensor.shape;
      auto second_shape = inputs[1]->tensor.shape;
      auto dst_shape = outputs[0]->tensor.shape;
      gpu_subgraph->operations.clear();
      int src_id = static_cast<int>(inputs[0]->id);
      int dst_id = static_cast<int>(outputs[0]->id);
      const OHWI weights_shape(second_shape.c, second_shape.b, second_shape.h,
                               second_shape.w);
      const BHWC weights_shape_bhwc(weights_shape.o, weights_shape.h,
                                    weights_shape.w, weights_shape.i);
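      // The second input is reinterpreted as OHWI convolution weights
      // (o = c, h = b, w = h, i = w of its BHWC shape). When the destination
      // batch is not 1, the input and output activations are folded into
      // height, (B, H, W, C) -> (1, B * H, W, C), so the batched matmul can
      // run as a single convolution over runtime weights.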
      if (dst_shape.b != 1) {
        const BHWC hwc_input_shape(1, first_shape.b * first_shape.h,
                                   first_shape.w, first_shape.c);
        const BHWC hwc_output_shape(1, dst_shape.b * dst_shape.h, dst_shape.w,
                                    dst_shape.c);
        TensorDescriptor hwc_input_desc = {
            op_def.src_tensors[0].GetDataType(),
            op_def.src_tensors[0].GetStorageType(), Layout::BHWC};
        TensorDescriptor hwc_output_desc = {
            op_def.dst_tensors[0].GetDataType(),
            op_def.dst_tensors[0].GetStorageType(), Layout::BHWC};
        src_id = gpu_subgraph->AddTensor(hwc_input_shape, hwc_input_desc);
        dst_id = gpu_subgraph->AddTensor(hwc_output_shape, hwc_output_desc);

        OperationDef reshape_input_def;
        reshape_input_def.precision = op_def.precision;
        reshape_input_def.src_tensors.push_back(op_def.src_tensors[0]);
        reshape_input_def.dst_tensors.push_back(hwc_input_desc);
        gpu_subgraph->operations.push_back({});
        auto& reshape_input_op = gpu_subgraph->operations.back();
        SelectReshape(first_shape.c, first_shape.c, reshape_input_def,
                      &reshape_input_op.operation);
        reshape_input_op.input_ids = {static_cast<int>(inputs[0]->id)};
        reshape_input_op.output_ids = {src_id};
        reshape_input_op.name = "mat_mul_reshape_input";
      }
      OperationDef conv_def = op_def;
      RETURN_IF_ERROR(AddDynamicConv(
          hints, gpu_info, conv_def, op_type, first_shape, weights_shape,
          dst_shape, src_id, inputs[1]->id, dst_id, gpu_subgraph));
      if (dst_shape.b != 1) {
        TensorDescriptor hwc_output_desc = {
            op_def.dst_tensors[0].GetDataType(),
            op_def.dst_tensors[0].GetStorageType(), Layout::BHWC};

        OperationDef reshape_output_def;
        reshape_output_def.precision = op_def.precision;
        reshape_output_def.src_tensors.push_back(hwc_output_desc);
        reshape_output_def.dst_tensors.push_back(op_def.dst_tensors[0]);
        gpu_subgraph->operations.push_back({});
        auto& reshape_output_op = gpu_subgraph->operations.back();
        SelectReshape(dst_shape.c, dst_shape.c, reshape_output_def,
                      &reshape_output_op.operation);
        reshape_output_op.input_ids = {dst_id};
        reshape_output_op.output_ids = {static_cast<int>(outputs[0]->id)};
        reshape_output_op.name = "mat_mul_reshape_output";
      }
      return absl::OkStatus();
    }
    case OperationType::CAST:
      SelectCast(op_def, gpu_info, gpu_op);
      return absl::OkStatus();
    case OperationType::CONCAT: {
      auto attr = absl::any_cast<ConcatAttributes>(node.operation.attributes);
      int max_inputs = gpu_info.GetMaxImageArguments() - 8;
      if (gpu_info.IsMali()) {
        // Mali can fail clEnqueueNDRangeKernel with "Out of resources" when it
        // receives too big a kernel.
        max_inputs = std::min(8, max_inputs);
      }
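      // With too many inputs the generated kernel exceeds the image-argument
      // limit, so the concat is split into groups of at most max_inputs
      // tensors that are concatenated in a chain; each intermediate result is
      // stored in a new subgraph tensor (referenced by the negative ids that
      // AddTensor returns, hence input_ids = {-g} below picks up the previous
      // group's output).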
      if (inputs.size() >= max_inputs) {
        int groups = DivideRoundUp(inputs.size(), max_inputs);
        gpu_subgraph->operations.clear();
        gpu_subgraph->operations.resize(groups);
        BHWC concatenated_shape = inputs[0]->tensor.shape;
        concatenated_shape.set(attr.axis, 0);
        for (int g = 0; g < groups; ++g) {
          std::vector<int> channels;
          auto& concat_op = gpu_subgraph->operations[g];
          OperationDef new_def;
          new_def.precision = op_def.precision;
          if (g != 0) {
            // concatenated tensor from previous concats
            new_def.src_tensors.push_back(op_def.dst_tensors[0]);
            concat_op.input_ids = {-g};
            channels.push_back(concatenated_shape.c);
          }
          for (int i = 0; i < max_inputs; ++i) {
            int src_index = g * max_inputs + i;
            if (src_index >= op_def.src_tensors.size()) {
              break;
            }
            new_def.src_tensors.push_back(op_def.src_tensors[src_index]);
            concat_op.input_ids.push_back(inputs[src_index]->id);
            channels.push_back(inputs[src_index]->tensor.shape.c);
            int current_size = concatenated_shape.get(attr.axis);
            concatenated_shape.set(
                attr.axis,
                current_size + inputs[src_index]->tensor.shape.get(attr.axis));
          }
          new_def.dst_tensors.push_back(op_def.dst_tensors[0]);
          if (g == groups - 1) {
            // last concat
            concat_op.output_ids = {static_cast<int>(outputs[0]->id)};
          } else {
            // intermediate concat, create new tensor for it
            int tensor_id = gpu_subgraph->AddTensor(concatenated_shape,
                                                    op_def.dst_tensors[0]);
            concat_op.output_ids = {tensor_id};
          }
          RETURN_IF_ERROR(SelectConcat(attr, channels, new_def, gpu_info,
                                       &concat_op.operation));
        }
        return absl::OkStatus();
      } else {
        std::vector<int> channels(inputs.size());
        for (int i = 0; i < inputs.size(); ++i) {
          channels[i] = inputs[i]->tensor.shape.c;
        }
        return SelectConcat(attr, channels, op_def, gpu_info, gpu_op);
      }
    }
    case OperationType::CONVOLUTION_2D: {
      auto attr =
          absl::any_cast<Convolution2DAttributes>(node.operation.attributes);
      auto input_shape = inputs[0]->tensor.shape;
      auto output_shape = outputs[0]->tensor.shape;
      if (inputs.size() == 1) {
        if (!hints.Check(ModelHints::kNoWinogradOptimizations) &&
            WinogradFromNode(gpu_info, inputs, outputs, op_def, hints,
                             input_shape, output_shape, attr, gpu_subgraph)
                .ok()) {
          return absl::OkStatus();
        } else {
          gpu_op = InitSingleOpSubgraph(inputs, outputs, gpu_subgraph);
          if (attr.groups != 1) {
            gpu_subgraph->operations[0].name = "convolution_2d_grouped";
          }
          if (!shared_conv_weights || attr.weights.id == -1) {
            *gpu_op =
                SelectConvolution(attr, output_shape, gpu_info, op_def, hints);
          } else {
            // Using convolutions with shared weights
            WeightsDescription weights_desc;
            const BHWC weights_shape_bhwc(
                attr.weights.shape.o, attr.weights.shape.h,
                attr.weights.shape.w, attr.weights.shape.i);
            OperationDef conv_temp_def = op_def;
            conv_temp_def.src_tensors.push_back(
                {op_def.src_tensors[0].GetDataType(), TensorStorageType::BUFFER,
                 Layout::HWC});
            *gpu_op = SelectConvolutionWithDynamicWeights(
                attr, weights_shape_bhwc, output_shape, gpu_info, conv_temp_def,
                hints, &weights_desc);
            AddConvSharedWeights(attr, weights_desc, shared_conv_weights,
                                 gpu_subgraph);
          }
          (*gpu_op)->flops_ =
              GetConvolutionFlops(output_shape, attr.weights.shape);
          return absl::OkStatus();
        }
      } else {
        // CONVOLUTION_2D with runtime weights
        const OHWI weights_shape =
            OHWI(inputs[1]->tensor.shape.b, inputs[1]->tensor.shape.h,
                 inputs[1]->tensor.shape.w, inputs[1]->tensor.shape.c);
        if (weights_shape.i != inputs[0]->tensor.shape.c) {
          return absl::UnimplementedError(
              "No support of grouped convolution with runtime weights");
        }
        if (attr.bias.data.empty()) {
          attr.bias.shape = Linear(weights_shape.o);
          attr.bias.data.resize(weights_shape.o, 0.0f);
        }
        gpu_subgraph->operations.clear();
        return AddDynamicConv(hints, gpu_info, op_def, op_type, input_shape,
                              weights_shape, output_shape, inputs[0]->id,
                              inputs[1]->id, outputs[0]->id, gpu_subgraph,
                              &attr);
      }
    }
    case OperationType::CONVOLUTION_TRANSPOSED: {
      auto attr = absl::any_cast<ConvolutionTransposedAttributes>(
          node.operation.attributes);
      if (inputs.size() == 1) {
        *gpu_op = SelectConvolutionTransposed(attr, gpu_info, op_def);
        (*gpu_op)->flops_ = GetConvolutionTransposedFlops(
            inputs[0]->tensor.shape, attr.weights.shape);
        return absl::OkStatus();
      } else {
        // CONVOLUTION_TRANSPOSED with runtime weights
        const OHWI weights_shape =
            OHWI(inputs[1]->tensor.shape.b, inputs[1]->tensor.shape.h,
                 inputs[1]->tensor.shape.w, inputs[1]->tensor.shape.c);
        if (attr.bias.data.empty()) {
          attr.bias.shape = Linear(weights_shape.o);
          attr.bias.data.resize(weights_shape.o, 0.0f);
        }
        gpu_subgraph->operations.clear();
        return AddDynamicConv(
            hints, gpu_info, op_def, op_type, inputs[0]->tensor.shape,
            weights_shape, outputs[0]->tensor.shape, inputs[0]->id,
            inputs[1]->id, outputs[0]->id, gpu_subgraph, &attr);
      }
    }
    case OperationType::DEPTHWISE_CONVOLUTION: {
      auto attr = absl::any_cast<DepthwiseConvolution2DAttributes>(
          node.operation.attributes);
      if (inputs.size() == 1) {
        *gpu_op = SelectDWConvolution(attr, gpu_info, op_def);
        (*gpu_op)->flops_ = GetDepthwiseConvolutionFlops(
            outputs[0]->tensor.shape, attr.weights.shape);
      } else {
        if (inputs[1]->tensor.shape.b != 1) {
          return absl::UnimplementedError(
              "No support of depthwise runtime weights with channel multiplier "
              "!= 1");
        }
        *gpu_op = SelectDWConvolutionDynamicWeights(attr, gpu_info, op_def);
        (*gpu_op)->flops_ = GetDepthwiseConvolutionFlops(
            outputs[0]->tensor.shape,
            OHWI(inputs[1]->tensor.shape.b, inputs[1]->tensor.shape.h,
                 inputs[1]->tensor.shape.w, inputs[1]->tensor.shape.c));
      }
      return absl::OkStatus();
    }
    case OperationType::CUMSUM: {
      auto attr = absl::any_cast<CumsumAttributes>(node.operation.attributes);
      SelectCumsum(op_def, attr, gpu_op);
      return absl::OkStatus();
    }
    case OperationType::DEPTH_TO_SPACE: {
      auto attr =
          absl::any_cast<SpaceToDepthAttributes>(node.operation.attributes);
      SelectDepthToSpace(attr, op_def, gpu_op);
      return absl::OkStatus();
    }
    case OperationType::FULLY_CONNECTED: {
      auto attr =
          absl::any_cast<FullyConnectedAttributes>(node.operation.attributes);
      *gpu_op = SelectFullyConnected(attr, gpu_info, op_def,
                                     inputs[0]->tensor.shape.b);
      (*gpu_op)->flops_ =
          GetFullyConnectedFlops(outputs[0]->tensor.shape, attr.weights.shape);
      return absl::OkStatus();
    }
    case OperationType::FULLY_CONNECTED_INT8: {
      auto attr = absl::any_cast<FullyConnectedInt8Attributes>(
          node.operation.attributes);
      *gpu_op = SelectFullyConnected(attr, gpu_info, op_def);
      return absl::OkStatus();
    }
    case OperationType::GATHER: {
      auto attr = absl::any_cast<GatherAttributes>(node.operation.attributes);
      RETURN_IF_ERROR(SelectGather(attr, op_def, gpu_op));
      return absl::OkStatus();
    }
    case OperationType::LSTM: {
      *gpu_op = SelectLSTM(op_def, gpu_info);
      return absl::OkStatus();
    }
    case OperationType::MAX_UNPOOLING_2D: {
      auto attr =
          absl::any_cast<MaxUnpooling2DAttributes>(node.operation.attributes);
      *gpu_op = SelectMaxUnpooling(attr, gpu_info, op_def);
      return absl::OkStatus();
    }
    case OperationType::MEAN: {
      auto attr = absl::any_cast<MeanAttributes>(node.operation.attributes);
      *gpu_op = SelectReduce(attr.dims, inputs[0]->tensor.shape, op_type,
                             op_def, gpu_info);
      return absl::OkStatus();
    }
    case OperationType::MEAN_STDDEV_NORMALIZATION: {
      MeanStdDevNormalization operation = CreateMeanStdDevNormalization(
          op_def, gpu_info, inputs[0]->tensor.shape);
      *gpu_op = std::make_unique<MeanStdDevNormalization>(std::move(operation));
      return absl::OkStatus();
    }
    case OperationType::ONE_HOT: {
      auto attr = absl::any_cast<OneHotAttributes>(node.operation.attributes);
      SelectOneHot(op_def, attr, gpu_op);
      return absl::OkStatus();
    }
    case OperationType::PAD: {
      auto attr = absl::any_cast<PadAttributes>(node.operation.attributes);
      SelectPadding(attr, op_def, gpu_op);
      return absl::OkStatus();
    }
    case OperationType::POOLING_2D: {
      auto attr =
          absl::any_cast<Pooling2DAttributes>(node.operation.attributes);
      *gpu_op = SelectPooling(attr, gpu_info, op_def);
      return absl::OkStatus();
    }
    case OperationType::PRELU: {
      auto attr = absl::any_cast<PReLUAttributes>(node.operation.attributes);
      *gpu_op = SelectPReLU(attr, gpu_info, op_def);
      return absl::OkStatus();
    }
    case OperationType::QUANTIZE_AND_DEQUANTIZE: {
      auto attr = absl::any_cast<QuantizeAndDequantizeAttributes>(
          node.operation.attributes);
      *gpu_op = SelectQuantizeAndDequantize(attr, op_def);
      return absl::OkStatus();
    }
    case OperationType::RELU: {
      auto attr = absl::any_cast<ReLUAttributes>(node.operation.attributes);
      *gpu_op = SelectReLU(attr, op_def);
      return absl::OkStatus();
    }
    case OperationType::RESAMPLER: {
      *gpu_op = SelectResampler(op_def, gpu_info);
      return absl::OkStatus();
    }
    case OperationType::RESHAPE: {
      const int src_channels = inputs[0]->tensor.shape.c;
      auto attr = absl::any_cast<ReshapeAttributes>(node.operation.attributes);
      SelectReshape(src_channels, attr.new_shape.c, op_def, gpu_op);
      return absl::OkStatus();
    }
    case OperationType::RESIZE: {
      auto attr = absl::any_cast<Resize2DAttributes>(node.operation.attributes);
      return SelectResize(attr, op_def, gpu_op);
    }
    case OperationType::SLICE: {
      auto attr = absl::any_cast<SliceAttributes>(node.operation.attributes);
      SelectStridedSlice(attr, op_def, gpu_op);
      return absl::OkStatus();
    }
    case OperationType::SOFTMAX: {
      SelectSoftmax(inputs[0]->tensor.shape, op_def, gpu_op);
      return absl::OkStatus();
    }
    case OperationType::SPACE_TO_DEPTH: {
      auto attr =
          absl::any_cast<SpaceToDepthAttributes>(node.operation.attributes);
      SelectSpaceToDepth(attr, op_def, gpu_op);
      return absl::OkStatus();
    }
    case OperationType::SPLIT: {
      std::vector<int> channels;
      channels.reserve(outputs.size());
      for (const auto& output : outputs) {
        channels.push_back(output->tensor.shape.c);
      }
      auto attr = absl::any_cast<SplitAttributes>(node.operation.attributes);
      if (gpu_info.IsMali()) {
        // Mali can fail clEnqueueNDRangeKernel with "Out of resources" when it
        // receives too big a kernel.
        // Replace the single complex split kernel with N simple slice kernels,
        // one per output.
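        // Each output i becomes a strided slice of the input covering
        // [split_offset, split_offset + size_i) along attr.axis with stride 1;
        // split_offset accumulates the sizes of the previous outputs.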
        gpu_subgraph->operations.clear();
        gpu_subgraph->operations.resize(outputs.size());
        int split_offset = 0;
        for (int i = 0; i < outputs.size(); ++i) {
          auto& op = gpu_subgraph->operations[i];
          op.input_ids = {static_cast<int>(inputs[0]->id)};
          op.output_ids = {static_cast<int>(outputs[i]->id)};
          OperationDef new_def;
          new_def.precision = op_def.precision;
          new_def.src_tensors.push_back(op_def.src_tensors[0]);
          new_def.dst_tensors.push_back(op_def.dst_tensors[i]);
          SliceAttributes new_attr;
          new_attr.starts = BHWC(0, 0, 0, 0);
          new_attr.ends = inputs[0]->tensor.shape;
          new_attr.strides = BHWC(1, 1, 1, 1);
          new_attr.starts.set(attr.axis, split_offset);
          new_attr.ends.set(
              attr.axis,
              split_offset + outputs[i]->tensor.shape.get(attr.axis));
          split_offset += outputs[i]->tensor.shape.get(attr.axis);
          SelectStridedSlice(new_attr, new_def, &op.operation);
        }
        return absl::OkStatus();
      }
      SelectSplit(attr, gpu_info, channels, op_def, gpu_op);
      return absl::OkStatus();
    }
    case OperationType::TILE: {
      *gpu_op = SelectTile(op_def, inputs[0]->tensor.shape);
      return absl::OkStatus();
    }
    case OperationType::TRANSPOSE: {
      auto attr =
          absl::any_cast<TransposeAttributes>(node.operation.attributes);
      SelectTranspose(attr, op_def, gpu_op);
      return absl::OkStatus();
    }
    case OperationType::ABS:
    case OperationType::COPY:
    case OperationType::COS:
    case OperationType::ELU:
    case OperationType::EXP:
    case OperationType::HARD_SWISH:
    case OperationType::LOG:
    case OperationType::NEG:
    case OperationType::RSQRT:
    case OperationType::SIGMOID:
    case OperationType::SIN:
    case OperationType::SQRT:
    case OperationType::SQUARE:
    case OperationType::TANH: {
      GPUOperation operation;
      if (inputs[0]->tensor.shape != outputs[0]->tensor.shape) {
        operation = CreateElementwiseOneInputWithBroadcast(
            gpu_info, op_def, op_type, inputs[0]->tensor.shape,
            outputs[0]->tensor.shape);
      } else {
        operation = CreateElementwiseOneInput(gpu_info, op_def, op_type);
      }
      *gpu_op = std::make_unique<GPUOperation>(std::move(operation));
      return absl::OkStatus();
    }
    case OperationType::ADD:
    case OperationType::DIV:
    case OperationType::EQUAL:
    case OperationType::GREATER:
    case OperationType::GREATER_EQUAL:
    case OperationType::LESS:
    case OperationType::LESS_EQUAL:
    case OperationType::MAXIMUM:
    case OperationType::MINIMUM:
    case OperationType::MUL:
    case OperationType::NOT_EQUAL:
    case OperationType::POW:
    case OperationType::SQUARED_DIFF:
    case OperationType::SUB: {
      if (op_type == OperationType::ADD && inputs.size() >= 2) {
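        // A multi-input ADD, or a two-input ADD whose channel counts are all
        // multiples of 4 but do not all match the output, is routed to the
        // dedicated Add kernel, which is given each input's channel count.
        // The mismatched-channels case appears to come from zero-padded
        // channel layouts that the generic two-input elementwise kernel below
        // does not handle.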
        const bool two_input_add_with_zero_padded_channels =
            inputs[0]->tensor.shape.c % 4 == 0 &&
            inputs[1]->tensor.shape.c % 4 == 0 &&
            outputs[0]->tensor.shape.c % 4 == 0 &&
            (inputs[0]->tensor.shape.c != outputs[0]->tensor.shape.c ||
             inputs[1]->tensor.shape.c != outputs[0]->tensor.shape.c);
        if (inputs.size() >= 3 || two_input_add_with_zero_padded_channels) {
          auto output = outputs[0];
          std::vector<int> channels(inputs.size());
          for (int i = 0; i < inputs.size(); ++i) {
            channels[i] = inputs[i]->tensor.shape.c;
          }
          SelectAdd(op_def, channels, output->tensor.shape.c, gpu_op);
          return absl::OkStatus();
        }
      }

      if (inputs.size() == 2) {
        GPUOperation operation;
        if (inputs[0]->tensor.shape != outputs[0]->tensor.shape) {
          operation = CreateElementwiseTwoInputWithBroadcast(
              op_def, op_type, inputs[0]->tensor.shape, inputs[1]->tensor.shape,
              outputs[0]->tensor.shape);
        } else {
          operation = CreateElementwiseTwoInput(op_def, op_type,
                                                inputs[1]->tensor.shape);
        }
        *gpu_op = std::make_unique<GPUOperation>(std::move(operation));
        return absl::OkStatus();
      } else if (inputs.size() == 1 && node.operation.attributes.has_value()) {
        auto attr =
            absl::any_cast<ElementwiseAttributes>(node.operation.attributes);
        GPUOperation operation;
        if (inputs[0]->tensor.shape != outputs[0]->tensor.shape) {
          operation = CreateElementwiseWithBroadcast(
              gpu_info, op_def, op_type, attr, inputs[0]->tensor.shape,
              outputs[0]->tensor.shape);
        } else {
          operation = CreateElementwise(gpu_info, op_def, op_type, attr);
        }
        *gpu_op = std::make_unique<GPUOperation>(std::move(operation));
        return absl::OkStatus();
      }
      return absl::UnimplementedError(absl::StrCat(
          "No support of ", node.operation.type, " with these parameters"));
    }
    case OperationType::REDUCE_MAXIMUM:
    case OperationType::REDUCE_MINIMUM:
    case OperationType::REDUCE_PRODUCT:
    case OperationType::REDUCE_SUM: {
      auto attr = absl::any_cast<ReduceAttributes>(node.operation.attributes);
      *gpu_op = SelectReduce(attr.dims, inputs[0]->tensor.shape, op_type,
                             op_def, gpu_info);
      return absl::OkStatus();
    }
    case OperationType::SELECT_V2: {
      auto attr = absl::any_cast<SelectV2Attributes>(node.operation.attributes);
      SelectSelectV2(op_def, attr, gpu_op);
      return absl::OkStatus();
    }
    default:
      return SelectDefault(gpu_info, op_def, hints, inputs, outputs, node,
                           gpu_subgraph);
  }
}

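// Public entry point: delegates to GPUOperationFromNodePart0 and then labels
// every produced operation with the originating node id (prefixed with the
// node's type when the selector did not set a more specific name).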
absl::Status GPUOperationFromNode(
    const GpuInfo& gpu_info, const OperationDef& op_def, ModelHints hints,
    const std::vector<Value*>& inputs, const std::vector<Value*>& outputs,
    const Node& node, std::vector<SharedWeightsConvDesc>* shared_conv_weights,
    GPUOperationsSubgraph* gpu_subgraph) {
  RETURN_IF_ERROR(GPUOperationFromNodePart0(gpu_info, op_def, hints, inputs,
                                            outputs, node, shared_conv_weights,
                                            gpu_subgraph));
  for (auto& gpu_op : gpu_subgraph->operations) {
    if (gpu_op.name.empty()) {
      gpu_op.name = node.operation.type + " " + std::to_string(node.id);
    } else {
      gpu_op.name += " " + std::to_string(node.id);
    }
  }
  return absl::OkStatus();
}

}  // namespace gpu
}  // namespace tflite