1 /* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
2
3 Licensed under the Apache License, Version 2.0 (the "License");
4 you may not use this file except in compliance with the License.
5 You may obtain a copy of the License at
6
7 http://www.apache.org/licenses/LICENSE-2.0
8
9 Unless required by applicable law or agreed to in writing, software
10 distributed under the License is distributed on an "AS IS" BASIS,
11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 See the License for the specific language governing permissions and
13 limitations under the License.
14 ==============================================================================*/
15
16 #ifndef TENSORFLOW_LITE_DELEGATES_GPU_COMMON_TASKS_CONV_GENERIC_H_
17 #define TENSORFLOW_LITE_DELEGATES_GPU_COMMON_TASKS_CONV_GENERIC_H_
18
19 #include <cstring>
20 #include <memory>
21 #include <string>
22 #include <utility>
23 #include <vector>
24
25 #include "tensorflow/lite/delegates/gpu/common/data_type.h"
26 #include "tensorflow/lite/delegates/gpu/common/operations.h"
27 #include "tensorflow/lite/delegates/gpu/common/shape.h"
28 #include "tensorflow/lite/delegates/gpu/common/status.h"
29 #include "tensorflow/lite/delegates/gpu/common/task/buffer_desc.h"
30 #include "tensorflow/lite/delegates/gpu/common/task/gpu_operation.h"
31 #include "tensorflow/lite/delegates/gpu/common/task/tensor_desc.h"
32 #include "tensorflow/lite/delegates/gpu/common/task/texture2d_desc.h"
33 #include "tensorflow/lite/delegates/gpu/common/task/weights_conversion.h"
34 #include "tensorflow/lite/delegates/gpu/common/task/weights_layout.h"
35 #include "tensorflow/lite/delegates/gpu/common/tensor.h"
36 #include "tensorflow/lite/delegates/gpu/common/types.h"
37 #include "tensorflow/lite/delegates/gpu/common/winograd_util.h"
38
39 namespace tflite {
40 namespace gpu {
41
42 class ConvGeneric : public GPUOperation {
43 public:
44 enum class WeightsUploadType {
45 LOCAL_MEM_ASYNC_SUBGROUP, // we use it for PowerVR with workgroup size = 32
46 LOCAL_MEM_BY_THREADS,
47 GLOBAL_MEM,
48 CONSTANT_MEM,
49 PRIVATE_MEM_SIMD_BROADCAST,
50 TEXTURES_MEM_X4, // 4 textures for weights
51 };
52 struct ConvParams {
53 DataType weights_data_type; // used for weights and biases
54 int4 block_size; // WHDS
55 bool fixed_work_group_size;
56 int3 work_group_size;
57 int3 work_group_launch_order;
58 bool linear_spatial; // spatial dimensions are Width/Height/Depth
59 bool linear_all; // linear_spatial & linear_all can not be used together,
60 // linear_all can not be used with WeightsUploadTypes
61 // that use workgroups(subgroups) for
62 // uploading(LOCAL_MEM_BY_THREADS for example).
63 bool different_weights_for_height;
64 bool groups_support = false; // convolution groups
65 int src_depth_loop_size;
66 bool need_src_loop = true;
67 bool need_dst_loop = true;
68 WeightsUploadType weights_upload_type;
69 bool x_kernel_is_1 = false;
70 bool y_kernel_is_1 = false;
71 bool z_kernel_is_1 = false;
72 WeightsLayout weights_layout;
73
74 // used only with PRIVATE_MEM_SIMD_BROADCAST
75 int simd_size = 1;
76
AreWeightsBufferConvParams77 bool AreWeightsBuffer() const {
78 return weights_upload_type != WeightsUploadType::TEXTURES_MEM_X4;
79 }
80
IsPrivateMemBroadcastConvParams81 bool IsPrivateMemBroadcast() const {
82 return weights_upload_type ==
83 WeightsUploadType::PRIVATE_MEM_SIMD_BROADCAST;
84 }
85 };
86 ConvGeneric() = default;
87 void GetPossibleKernelWorkGroups(
88 TuningType tuning_type, const GpuInfo& gpu_info,
89 const KernelInfo& kernel_info,
90 std::vector<int3>* work_groups) const override;
91 absl::Status BindArguments(ArgumentsBinder* args) override;
92 int3 GetGridSize() const override;
93
GetWeightsDescription()94 WeightsDescription GetWeightsDescription() const {
95 WeightsDescription desc;
96 desc.type = conv_params_.weights_data_type;
97 desc.layout = conv_params_.weights_layout;
98 desc.output_group_size = conv_params_.block_size.w;
99 return desc;
100 }
101
102 // Move only
103 ConvGeneric(ConvGeneric&& operation);
104 ConvGeneric& operator=(ConvGeneric&& operation);
105 ConvGeneric(const ConvGeneric&) = delete;
106 ConvGeneric& operator=(const ConvGeneric&) = delete;
107
108 private:
109 ConvGeneric(const OperationDef& definition,
110 const Convolution2DAttributes& attr, const GpuInfo& gpu_info,
111 const BHWC* dst_shape = nullptr);
112 ConvGeneric(const OperationDef& definition,
113 const Convolution2DAttributes& attr, const BHWC& weights_shape,
114 const GpuInfo& gpu_info, const BHWC* dst_shape = nullptr);
115 ConvGeneric(const OperationDef& definition,
116 const FullyConnectedAttributes& attr, const GpuInfo& gpu_info,
117 const BHWC* dst_shape = nullptr);
118 explicit ConvGeneric(const OperationDef& definition);
119 ConvGeneric(const OperationDef& definition,
120 const Convolution3DAttributes& attr, const GpuInfo& gpu_info,
121 const BHWDC* dst_shape = nullptr);
122
123 void GenerateCode(const GpuInfo& gpu_info);
124
125 template <DataType T>
126 void UploadData(const tflite::gpu::Tensor<OHWI, T>& weights,
127 const tflite::gpu::Tensor<Linear, T>& biases);
128 template <DataType T>
129 void UploadDataForWinograd4x4To6x6(
130 const tflite::gpu::Tensor<OHWI, T>& weights);
131
132 template <DataType T>
133 void UploadWeights(const tflite::gpu::Tensor<OHWI, T>& weights);
134
135 template <DataType T>
136 void UploadWeights(const tflite::gpu::Tensor<OHWDI, T>& weights);
137
138 template <DataType T>
139 void UploadBias(const tflite::gpu::Tensor<Linear, T>& bias);
140
141 friend ConvGeneric CreateConvGeneric(const GpuInfo& gpu_info,
142 const OperationDef& definition,
143 const Convolution2DAttributes& attr,
144 const BHWC* dst_shape);
145
146 friend ConvGeneric CreateConvGeneric(const GpuInfo& gpu_info,
147 const OperationDef& definition,
148 const FullyConnectedAttributes& attr,
149 const BHWC* dst_shape);
150
151 friend ConvGeneric CreateConvGenericBatchedMatMul(
152 const GpuInfo& gpu_info, const OperationDef& definition,
153 const OHWI& weights_shape, const BHWC* dst_shape);
154
155 friend ConvGeneric CreateConvGenericDynamicWeights(
156 const GpuInfo& gpu_info, const OperationDef& definition,
157 const Convolution2DAttributes& attr, const BHWC& weights_shape,
158 const BHWC* dst_shape);
159
160 friend ConvGeneric CreateConvGenericWino4x4To6x6(
161 const GpuInfo& gpu_info, const OperationDef& definition,
162 const Convolution2DAttributes& attr, const BHWC* dst_shape);
163
164 friend ConvGeneric CreateConvGeneric3D(const GpuInfo& gpu_info,
165 const OperationDef& definition,
166 const Convolution3DAttributes& attr,
167 const BHWDC* dst_shape);
168
169 ConvParams GuessBestParams(const GpuInfo& gpu_info,
170 const OperationDef& definition,
171 const Convolution2DAttributes& attr,
172 const BHWC* dst_shape = nullptr);
173 ConvParams GuessBestParams(const GpuInfo& gpu_info,
174 const OperationDef& definition,
175 const Convolution2DAttributes& attr,
176 const BHWC& weights_shape,
177 const BHWC* dst_shape = nullptr);
178 ConvParams GuessBestParams(const GpuInfo& gpu_info,
179 const OperationDef& definition,
180 const FullyConnectedAttributes& attr,
181 const BHWC* dst_shape = nullptr);
182 ConvParams GuessBestParamsPointwise(const GpuInfo& gpu_info,
183 const OperationDef& definition,
184 const OHWI& weights_shape,
185 const BHWC* dst_shape = nullptr);
186 ConvParams GuessBestParams(const GpuInfo& gpu_info,
187 const OperationDef& definition,
188 const Convolution3DAttributes& attr,
189 const BHWDC* dst_shape = nullptr);
190 ConvParams GuessBestParams(const GpuInfo& gpu_info,
191 const OperationDef& definition, int src_depth,
192 int dst_depth, bool x_kernel_is_1,
193 bool y_kernel_is_1,
194 bool different_weights_for_height,
195 const BHWC* dst_shape = nullptr);
196 ConvParams GuessBestParamsApple(const GpuInfo& gpu_info,
197 const OperationDef& definition, int src_depth,
198 int dst_depth, bool x_kernel_is_1,
199 bool y_kernel_is_1,
200 bool different_weights_for_height,
201 const BHWC& dst_shape);
202
203 std::string GenerateConv(const GpuInfo& gpu_info, const OperationDef& op_def,
204 const ConvParams& conv_params);
205
206 int4 stride_;
207 int4 padding_;
208 int4 kernel_size_;
209 int4 dilation_;
210 ConvParams conv_params_;
211 };
212
213 template <DataType T>
UploadData(const tflite::gpu::Tensor<OHWI,T> & weights,const tflite::gpu::Tensor<Linear,T> & biases)214 void ConvGeneric::UploadData(const tflite::gpu::Tensor<OHWI, T>& weights,
215 const tflite::gpu::Tensor<Linear, T>& biases) {
216 UploadWeights(weights);
217 UploadBias(biases);
218 }
219
220 template <DataType T>
UploadDataForWinograd4x4To6x6(const tflite::gpu::Tensor<OHWI,T> & weights)221 void ConvGeneric::UploadDataForWinograd4x4To6x6(
222 const tflite::gpu::Tensor<OHWI, T>& weights) {
223 tflite::gpu::Tensor<OHWI, T> wino_weights;
224 RearrangeWeightsToWinograd4x4To6x6Weights(weights, &wino_weights);
225 UploadWeights(wino_weights);
226 tflite::gpu::Tensor<Linear, DataType::FLOAT32> biases;
227 biases.shape = Linear(weights.shape.o);
228 biases.data.resize(weights.shape.o, 0.0f);
229 UploadBias(biases);
230 }
231
232 template <DataType T>
UploadBias(const tflite::gpu::Tensor<Linear,T> & bias)233 void ConvGeneric::UploadBias(const tflite::gpu::Tensor<Linear, T>& bias) {
234 BufferDescriptor desc;
235 desc.element_type = conv_params_.weights_data_type;
236 desc.element_size = 4;
237 desc.memory_type = conv_params_.weights_upload_type ==
238 ConvGeneric::WeightsUploadType::CONSTANT_MEM
239 ? MemoryType::CONSTANT
240 : MemoryType::GLOBAL;
241 const int float_size = conv_params_.weights_data_type == DataType::FLOAT32
242 ? sizeof(float)
243 : sizeof(half);
244 int aligned_channels = AlignByN(bias.shape.v, 4 * conv_params_.block_size.w);
245 desc.size = float_size * aligned_channels;
246 desc.data.resize(desc.size);
247 if (conv_params_.weights_data_type == DataType::FLOAT32) {
248 float* gpu_data = reinterpret_cast<float*>(desc.data.data());
249 for (int i = 0; i < aligned_channels; ++i) {
250 gpu_data[i] = i < bias.shape.v ? bias.data[i] : 0.0f;
251 }
252 } else {
253 half* gpu_data = reinterpret_cast<half*>(desc.data.data());
254 for (int i = 0; i < aligned_channels; ++i) {
255 gpu_data[i] = i < bias.shape.v ? bias.data[i] : 0.0f;
256 }
257 }
258 args_.AddObject("biases",
259 std::make_unique<BufferDescriptor>(std::move(desc)));
260 }
261
262 template <DataType T>
UploadWeights(const tflite::gpu::Tensor<OHWI,T> & weights)263 void ConvGeneric::UploadWeights(const tflite::gpu::Tensor<OHWI, T>& weights) {
264 const auto weights_desc = GetWeightsDescription();
265 const int flt_count =
266 GetTotalElementsCountForLayout(weights_desc, weights.shape);
267
268 std::vector<uint8_t> weights_data(flt_count * SizeOf(weights_desc.type));
269 RearrangeWeights(weights, weights_desc, absl::MakeSpan(weights_data));
270
271 if (conv_params_.AreWeightsBuffer()) {
272 BufferDescriptor desc;
273 desc.element_type = weights_desc.type;
274 desc.element_size = 4;
275 desc.memory_type = conv_params_.weights_upload_type ==
276 ConvGeneric::WeightsUploadType::CONSTANT_MEM
277 ? MemoryType::CONSTANT
278 : MemoryType::GLOBAL;
279 desc.size = weights_data.size();
280 desc.data = std::move(weights_data);
281 args_.AddObject("weights",
282 std::make_unique<BufferDescriptor>(std::move(desc)));
283 } else {
284 uint2 tex_size = Get2dResourceSize(weights_desc, weights.shape);
285 int sub_size = SizeOf(weights_desc.type) * 4 * tex_size.x * tex_size.y;
286 for (int i = 0; i < 4; ++i) {
287 TensorDescriptor desc = CreateConstantHWVec4TensorDescriptor(
288 weights_desc.type, TensorStorageType::TEXTURE_2D, tex_size.x,
289 tex_size.y, weights_data.data() + sub_size * i);
290 args_.AddObject("weights" + std::to_string(i),
291 std::make_unique<TensorDescriptor>(std::move(desc)));
292 }
293 }
294 }
295
296 template <DataType T>
UploadWeights(const tflite::gpu::Tensor<OHWDI,T> & weights)297 void ConvGeneric::UploadWeights(const tflite::gpu::Tensor<OHWDI, T>& weights) {
298 const auto weights_desc = GetWeightsDescription();
299 const int flt_count =
300 GetTotalElementsCountForLayout(weights_desc, weights.shape);
301
302 std::vector<uint8_t> weights_data(flt_count * SizeOf(weights_desc.type));
303 RearrangeWeights(weights, weights_desc, absl::MakeSpan(weights_data));
304
305 if (conv_params_.AreWeightsBuffer()) {
306 BufferDescriptor desc;
307 desc.element_type = weights_desc.type;
308 desc.element_size = 4;
309 desc.size = weights_data.size();
310 desc.data = std::move(weights_data);
311 args_.AddObject("weights",
312 std::make_unique<BufferDescriptor>(std::move(desc)));
313 } else {
314 uint2 tex_size = Get2dResourceSize(weights_desc, weights.shape);
315 int sub_size = SizeOf(weights_desc.type) * 4 * tex_size.x * tex_size.y;
316 for (int i = 0; i < 4; ++i) {
317 TensorDescriptor desc = CreateConstantHWVec4TensorDescriptor(
318 weights_desc.type, TensorStorageType::TEXTURE_2D, tex_size.x,
319 tex_size.y, weights_data.data() + sub_size * i);
320 args_.AddObject("weights" + std::to_string(i),
321 std::make_unique<TensorDescriptor>(std::move(desc)));
322 }
323 }
324 }
325
// Factory functions for ConvGeneric. Each one is declared a friend of
// ConvGeneric, which gives it access to the class's private constructors.
// `dst_shape` is optional in all of them and defaults to nullptr.
// NOTE(review): the definitions are not in this header; the per-function
// notes below are based on the signatures only.

// Builds a ConvGeneric from 2D convolution attributes.
ConvGeneric CreateConvGeneric(const GpuInfo& gpu_info,
                              const OperationDef& definition,
                              const Convolution2DAttributes& attr,
                              const BHWC* dst_shape = nullptr);

// Builds a ConvGeneric from fully connected attributes.
ConvGeneric CreateConvGeneric(const GpuInfo& gpu_info,
                              const OperationDef& definition,
                              const FullyConnectedAttributes& attr,
                              const BHWC* dst_shape = nullptr);

// Builds a ConvGeneric from 2D convolution attributes plus an explicit
// `weights_shape`; presumably the weights are supplied at runtime rather than
// uploaded as constants — confirm against the definition.
ConvGeneric CreateConvGenericDynamicWeights(const GpuInfo& gpu_info,
                                            const OperationDef& definition,
                                            const Convolution2DAttributes& attr,
                                            const BHWC& weights_shape,
                                            const BHWC* dst_shape = nullptr);

// Builds a ConvGeneric for batched matrix multiplication from the OHWI
// `weights_shape` alone (no attributes are passed).
ConvGeneric CreateConvGenericBatchedMatMul(const GpuInfo& gpu_info,
                                           const OperationDef& definition,
                                           const OHWI& weights_shape,
                                           const BHWC* dst_shape = nullptr);

// Builds a ConvGeneric for the Winograd 4x4-to-6x6 variant from 2D
// convolution attributes.
ConvGeneric CreateConvGenericWino4x4To6x6(const GpuInfo& gpu_info,
                                          const OperationDef& definition,
                                          const Convolution2DAttributes& attr,
                                          const BHWC* dst_shape = nullptr);

// Builds a ConvGeneric from 3D convolution attributes; the optional
// destination shape is BHWDC here instead of BHWC.
ConvGeneric CreateConvGeneric3D(const GpuInfo& gpu_info,
                                const OperationDef& definition,
                                const Convolution3DAttributes& attr,
                                const BHWDC* dst_shape = nullptr);
356
357 } // namespace gpu
358 } // namespace tflite
359
360 #endif // TENSORFLOW_LITE_DELEGATES_GPU_COMMON_TASKS_CONV_GENERIC_H_
361