xref: /aosp_15_r20/external/tensorflow/tensorflow/lite/delegates/gpu/common/tasks/conv_generic.h (revision b6fb3261f9314811a0f4371741dbb8839866f948)
1 /* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
2 
3 Licensed under the Apache License, Version 2.0 (the "License");
4 you may not use this file except in compliance with the License.
5 You may obtain a copy of the License at
6 
7     http://www.apache.org/licenses/LICENSE-2.0
8 
9 Unless required by applicable law or agreed to in writing, software
10 distributed under the License is distributed on an "AS IS" BASIS,
11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 See the License for the specific language governing permissions and
13 limitations under the License.
14 ==============================================================================*/
15 
16 #ifndef TENSORFLOW_LITE_DELEGATES_GPU_COMMON_TASKS_CONV_GENERIC_H_
17 #define TENSORFLOW_LITE_DELEGATES_GPU_COMMON_TASKS_CONV_GENERIC_H_
18 
19 #include <cstring>
20 #include <memory>
21 #include <string>
22 #include <utility>
23 #include <vector>
24 
25 #include "tensorflow/lite/delegates/gpu/common/data_type.h"
26 #include "tensorflow/lite/delegates/gpu/common/operations.h"
27 #include "tensorflow/lite/delegates/gpu/common/shape.h"
28 #include "tensorflow/lite/delegates/gpu/common/status.h"
29 #include "tensorflow/lite/delegates/gpu/common/task/buffer_desc.h"
30 #include "tensorflow/lite/delegates/gpu/common/task/gpu_operation.h"
31 #include "tensorflow/lite/delegates/gpu/common/task/tensor_desc.h"
32 #include "tensorflow/lite/delegates/gpu/common/task/texture2d_desc.h"
33 #include "tensorflow/lite/delegates/gpu/common/task/weights_conversion.h"
34 #include "tensorflow/lite/delegates/gpu/common/task/weights_layout.h"
35 #include "tensorflow/lite/delegates/gpu/common/tensor.h"
36 #include "tensorflow/lite/delegates/gpu/common/types.h"
37 #include "tensorflow/lite/delegates/gpu/common/winograd_util.h"
38 
39 namespace tflite {
40 namespace gpu {
41 
42 class ConvGeneric : public GPUOperation {
43  public:
44   enum class WeightsUploadType {
45     LOCAL_MEM_ASYNC_SUBGROUP,  // we use it for PowerVR with workgroup size = 32
46     LOCAL_MEM_BY_THREADS,
47     GLOBAL_MEM,
48     CONSTANT_MEM,
49     PRIVATE_MEM_SIMD_BROADCAST,
50     TEXTURES_MEM_X4,  // 4 textures for weights
51   };
52   struct ConvParams {
53     DataType weights_data_type;  // used for weights and biases
54     int4 block_size;             // WHDS
55     bool fixed_work_group_size;
56     int3 work_group_size;
57     int3 work_group_launch_order;
58     bool linear_spatial;  // spatial dimensions are Width/Height/Depth
59     bool linear_all;  // linear_spatial & linear_all can not be used together,
60                       // linear_all can not be used with WeightsUploadTypes
61                       // that use workgroups(subgroups) for
62                       // uploading(LOCAL_MEM_BY_THREADS for example).
63     bool different_weights_for_height;
64     bool groups_support = false;  // convolution groups
65     int src_depth_loop_size;
66     bool need_src_loop = true;
67     bool need_dst_loop = true;
68     WeightsUploadType weights_upload_type;
69     bool x_kernel_is_1 = false;
70     bool y_kernel_is_1 = false;
71     bool z_kernel_is_1 = false;
72     WeightsLayout weights_layout;
73 
74     // used only with PRIVATE_MEM_SIMD_BROADCAST
75     int simd_size = 1;
76 
AreWeightsBufferConvParams77     bool AreWeightsBuffer() const {
78       return weights_upload_type != WeightsUploadType::TEXTURES_MEM_X4;
79     }
80 
IsPrivateMemBroadcastConvParams81     bool IsPrivateMemBroadcast() const {
82       return weights_upload_type ==
83              WeightsUploadType::PRIVATE_MEM_SIMD_BROADCAST;
84     }
85   };
86   ConvGeneric() = default;
87   void GetPossibleKernelWorkGroups(
88       TuningType tuning_type, const GpuInfo& gpu_info,
89       const KernelInfo& kernel_info,
90       std::vector<int3>* work_groups) const override;
91   absl::Status BindArguments(ArgumentsBinder* args) override;
92   int3 GetGridSize() const override;
93 
GetWeightsDescription()94   WeightsDescription GetWeightsDescription() const {
95     WeightsDescription desc;
96     desc.type = conv_params_.weights_data_type;
97     desc.layout = conv_params_.weights_layout;
98     desc.output_group_size = conv_params_.block_size.w;
99     return desc;
100   }
101 
102   // Move only
103   ConvGeneric(ConvGeneric&& operation);
104   ConvGeneric& operator=(ConvGeneric&& operation);
105   ConvGeneric(const ConvGeneric&) = delete;
106   ConvGeneric& operator=(const ConvGeneric&) = delete;
107 
108  private:
109   ConvGeneric(const OperationDef& definition,
110               const Convolution2DAttributes& attr, const GpuInfo& gpu_info,
111               const BHWC* dst_shape = nullptr);
112   ConvGeneric(const OperationDef& definition,
113               const Convolution2DAttributes& attr, const BHWC& weights_shape,
114               const GpuInfo& gpu_info, const BHWC* dst_shape = nullptr);
115   ConvGeneric(const OperationDef& definition,
116               const FullyConnectedAttributes& attr, const GpuInfo& gpu_info,
117               const BHWC* dst_shape = nullptr);
118   explicit ConvGeneric(const OperationDef& definition);
119   ConvGeneric(const OperationDef& definition,
120               const Convolution3DAttributes& attr, const GpuInfo& gpu_info,
121               const BHWDC* dst_shape = nullptr);
122 
123   void GenerateCode(const GpuInfo& gpu_info);
124 
125   template <DataType T>
126   void UploadData(const tflite::gpu::Tensor<OHWI, T>& weights,
127                   const tflite::gpu::Tensor<Linear, T>& biases);
128   template <DataType T>
129   void UploadDataForWinograd4x4To6x6(
130       const tflite::gpu::Tensor<OHWI, T>& weights);
131 
132   template <DataType T>
133   void UploadWeights(const tflite::gpu::Tensor<OHWI, T>& weights);
134 
135   template <DataType T>
136   void UploadWeights(const tflite::gpu::Tensor<OHWDI, T>& weights);
137 
138   template <DataType T>
139   void UploadBias(const tflite::gpu::Tensor<Linear, T>& bias);
140 
141   friend ConvGeneric CreateConvGeneric(const GpuInfo& gpu_info,
142                                        const OperationDef& definition,
143                                        const Convolution2DAttributes& attr,
144                                        const BHWC* dst_shape);
145 
146   friend ConvGeneric CreateConvGeneric(const GpuInfo& gpu_info,
147                                        const OperationDef& definition,
148                                        const FullyConnectedAttributes& attr,
149                                        const BHWC* dst_shape);
150 
151   friend ConvGeneric CreateConvGenericBatchedMatMul(
152       const GpuInfo& gpu_info, const OperationDef& definition,
153       const OHWI& weights_shape, const BHWC* dst_shape);
154 
155   friend ConvGeneric CreateConvGenericDynamicWeights(
156       const GpuInfo& gpu_info, const OperationDef& definition,
157       const Convolution2DAttributes& attr, const BHWC& weights_shape,
158       const BHWC* dst_shape);
159 
160   friend ConvGeneric CreateConvGenericWino4x4To6x6(
161       const GpuInfo& gpu_info, const OperationDef& definition,
162       const Convolution2DAttributes& attr, const BHWC* dst_shape);
163 
164   friend ConvGeneric CreateConvGeneric3D(const GpuInfo& gpu_info,
165                                          const OperationDef& definition,
166                                          const Convolution3DAttributes& attr,
167                                          const BHWDC* dst_shape);
168 
169   ConvParams GuessBestParams(const GpuInfo& gpu_info,
170                              const OperationDef& definition,
171                              const Convolution2DAttributes& attr,
172                              const BHWC* dst_shape = nullptr);
173   ConvParams GuessBestParams(const GpuInfo& gpu_info,
174                              const OperationDef& definition,
175                              const Convolution2DAttributes& attr,
176                              const BHWC& weights_shape,
177                              const BHWC* dst_shape = nullptr);
178   ConvParams GuessBestParams(const GpuInfo& gpu_info,
179                              const OperationDef& definition,
180                              const FullyConnectedAttributes& attr,
181                              const BHWC* dst_shape = nullptr);
182   ConvParams GuessBestParamsPointwise(const GpuInfo& gpu_info,
183                                       const OperationDef& definition,
184                                       const OHWI& weights_shape,
185                                       const BHWC* dst_shape = nullptr);
186   ConvParams GuessBestParams(const GpuInfo& gpu_info,
187                              const OperationDef& definition,
188                              const Convolution3DAttributes& attr,
189                              const BHWDC* dst_shape = nullptr);
190   ConvParams GuessBestParams(const GpuInfo& gpu_info,
191                              const OperationDef& definition, int src_depth,
192                              int dst_depth, bool x_kernel_is_1,
193                              bool y_kernel_is_1,
194                              bool different_weights_for_height,
195                              const BHWC* dst_shape = nullptr);
196   ConvParams GuessBestParamsApple(const GpuInfo& gpu_info,
197                                   const OperationDef& definition, int src_depth,
198                                   int dst_depth, bool x_kernel_is_1,
199                                   bool y_kernel_is_1,
200                                   bool different_weights_for_height,
201                                   const BHWC& dst_shape);
202 
203   std::string GenerateConv(const GpuInfo& gpu_info, const OperationDef& op_def,
204                            const ConvParams& conv_params);
205 
206   int4 stride_;
207   int4 padding_;
208   int4 kernel_size_;
209   int4 dilation_;
210   ConvParams conv_params_;
211 };
212 
213 template <DataType T>
UploadData(const tflite::gpu::Tensor<OHWI,T> & weights,const tflite::gpu::Tensor<Linear,T> & biases)214 void ConvGeneric::UploadData(const tflite::gpu::Tensor<OHWI, T>& weights,
215                              const tflite::gpu::Tensor<Linear, T>& biases) {
216   UploadWeights(weights);
217   UploadBias(biases);
218 }
219 
220 template <DataType T>
UploadDataForWinograd4x4To6x6(const tflite::gpu::Tensor<OHWI,T> & weights)221 void ConvGeneric::UploadDataForWinograd4x4To6x6(
222     const tflite::gpu::Tensor<OHWI, T>& weights) {
223   tflite::gpu::Tensor<OHWI, T> wino_weights;
224   RearrangeWeightsToWinograd4x4To6x6Weights(weights, &wino_weights);
225   UploadWeights(wino_weights);
226   tflite::gpu::Tensor<Linear, DataType::FLOAT32> biases;
227   biases.shape = Linear(weights.shape.o);
228   biases.data.resize(weights.shape.o, 0.0f);
229   UploadBias(biases);
230 }
231 
232 template <DataType T>
UploadBias(const tflite::gpu::Tensor<Linear,T> & bias)233 void ConvGeneric::UploadBias(const tflite::gpu::Tensor<Linear, T>& bias) {
234   BufferDescriptor desc;
235   desc.element_type = conv_params_.weights_data_type;
236   desc.element_size = 4;
237   desc.memory_type = conv_params_.weights_upload_type ==
238                              ConvGeneric::WeightsUploadType::CONSTANT_MEM
239                          ? MemoryType::CONSTANT
240                          : MemoryType::GLOBAL;
241   const int float_size = conv_params_.weights_data_type == DataType::FLOAT32
242                              ? sizeof(float)
243                              : sizeof(half);
244   int aligned_channels = AlignByN(bias.shape.v, 4 * conv_params_.block_size.w);
245   desc.size = float_size * aligned_channels;
246   desc.data.resize(desc.size);
247   if (conv_params_.weights_data_type == DataType::FLOAT32) {
248     float* gpu_data = reinterpret_cast<float*>(desc.data.data());
249     for (int i = 0; i < aligned_channels; ++i) {
250       gpu_data[i] = i < bias.shape.v ? bias.data[i] : 0.0f;
251     }
252   } else {
253     half* gpu_data = reinterpret_cast<half*>(desc.data.data());
254     for (int i = 0; i < aligned_channels; ++i) {
255       gpu_data[i] = i < bias.shape.v ? bias.data[i] : 0.0f;
256     }
257   }
258   args_.AddObject("biases",
259                   std::make_unique<BufferDescriptor>(std::move(desc)));
260 }
261 
262 template <DataType T>
UploadWeights(const tflite::gpu::Tensor<OHWI,T> & weights)263 void ConvGeneric::UploadWeights(const tflite::gpu::Tensor<OHWI, T>& weights) {
264   const auto weights_desc = GetWeightsDescription();
265   const int flt_count =
266       GetTotalElementsCountForLayout(weights_desc, weights.shape);
267 
268   std::vector<uint8_t> weights_data(flt_count * SizeOf(weights_desc.type));
269   RearrangeWeights(weights, weights_desc, absl::MakeSpan(weights_data));
270 
271   if (conv_params_.AreWeightsBuffer()) {
272     BufferDescriptor desc;
273     desc.element_type = weights_desc.type;
274     desc.element_size = 4;
275     desc.memory_type = conv_params_.weights_upload_type ==
276                                ConvGeneric::WeightsUploadType::CONSTANT_MEM
277                            ? MemoryType::CONSTANT
278                            : MemoryType::GLOBAL;
279     desc.size = weights_data.size();
280     desc.data = std::move(weights_data);
281     args_.AddObject("weights",
282                     std::make_unique<BufferDescriptor>(std::move(desc)));
283   } else {
284     uint2 tex_size = Get2dResourceSize(weights_desc, weights.shape);
285     int sub_size = SizeOf(weights_desc.type) * 4 * tex_size.x * tex_size.y;
286     for (int i = 0; i < 4; ++i) {
287       TensorDescriptor desc = CreateConstantHWVec4TensorDescriptor(
288           weights_desc.type, TensorStorageType::TEXTURE_2D, tex_size.x,
289           tex_size.y, weights_data.data() + sub_size * i);
290       args_.AddObject("weights" + std::to_string(i),
291                       std::make_unique<TensorDescriptor>(std::move(desc)));
292     }
293   }
294 }
295 
296 template <DataType T>
UploadWeights(const tflite::gpu::Tensor<OHWDI,T> & weights)297 void ConvGeneric::UploadWeights(const tflite::gpu::Tensor<OHWDI, T>& weights) {
298   const auto weights_desc = GetWeightsDescription();
299   const int flt_count =
300       GetTotalElementsCountForLayout(weights_desc, weights.shape);
301 
302   std::vector<uint8_t> weights_data(flt_count * SizeOf(weights_desc.type));
303   RearrangeWeights(weights, weights_desc, absl::MakeSpan(weights_data));
304 
305   if (conv_params_.AreWeightsBuffer()) {
306     BufferDescriptor desc;
307     desc.element_type = weights_desc.type;
308     desc.element_size = 4;
309     desc.size = weights_data.size();
310     desc.data = std::move(weights_data);
311     args_.AddObject("weights",
312                     std::make_unique<BufferDescriptor>(std::move(desc)));
313   } else {
314     uint2 tex_size = Get2dResourceSize(weights_desc, weights.shape);
315     int sub_size = SizeOf(weights_desc.type) * 4 * tex_size.x * tex_size.y;
316     for (int i = 0; i < 4; ++i) {
317       TensorDescriptor desc = CreateConstantHWVec4TensorDescriptor(
318           weights_desc.type, TensorStorageType::TEXTURE_2D, tex_size.x,
319           tex_size.y, weights_data.data() + sub_size * i);
320       args_.AddObject("weights" + std::to_string(i),
321                       std::make_unique<TensorDescriptor>(std::move(desc)));
322     }
323   }
324 }
325 
326 ConvGeneric CreateConvGeneric(const GpuInfo& gpu_info,
327                               const OperationDef& definition,
328                               const Convolution2DAttributes& attr,
329                               const BHWC* dst_shape = nullptr);
330 
331 ConvGeneric CreateConvGeneric(const GpuInfo& gpu_info,
332                               const OperationDef& definition,
333                               const FullyConnectedAttributes& attr,
334                               const BHWC* dst_shape = nullptr);
335 
336 ConvGeneric CreateConvGenericDynamicWeights(const GpuInfo& gpu_info,
337                                             const OperationDef& definition,
338                                             const Convolution2DAttributes& attr,
339                                             const BHWC& weights_shape,
340                                             const BHWC* dst_shape = nullptr);
341 
342 ConvGeneric CreateConvGenericBatchedMatMul(const GpuInfo& gpu_info,
343                                            const OperationDef& definition,
344                                            const OHWI& weights_shape,
345                                            const BHWC* dst_shape = nullptr);
346 
347 ConvGeneric CreateConvGenericWino4x4To6x6(const GpuInfo& gpu_info,
348                                           const OperationDef& definition,
349                                           const Convolution2DAttributes& attr,
350                                           const BHWC* dst_shape = nullptr);
351 
352 ConvGeneric CreateConvGeneric3D(const GpuInfo& gpu_info,
353                                 const OperationDef& definition,
354                                 const Convolution3DAttributes& attr,
355                                 const BHWDC* dst_shape = nullptr);
356 
357 }  // namespace gpu
358 }  // namespace tflite
359 
360 #endif  // TENSORFLOW_LITE_DELEGATES_GPU_COMMON_TASKS_CONV_GENERIC_H_
361