/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#include "tensorflow/lite/delegates/gpu/common/tasks/special/fc_fc_add.h"

#include <map>
#include <memory>
#include <set>
#include <string>
#include <utility>
#include <vector>

#include "tensorflow/lite/delegates/gpu/common/operations.h"
#include "tensorflow/lite/delegates/gpu/common/task/gpu_operation.h"
#include "tensorflow/lite/delegates/gpu/common/types.h"

namespace tflite {
namespace gpu {
namespace {
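// Adreno, AMD and Mali take the BUFFER path for the weights (the kernel reads
// one FLT16 per visited weight block); all other GPUs use a TEXTURE_2D layout
// and read four FLT4 values instead. See GetFCFCAddKernelCode.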
bool UseBufferForWeights(const GpuInfo& gpu_info) {
  return gpu_info.IsAdreno() || gpu_info.IsAMD() || gpu_info.IsMali();
}

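// Repacks OHWI int8 weights into an O/4, I/4, O4, I4 block layout and stores
// them as uint8 with a +127 offset (clamped to 0 from below); padding elements
// are written as 127, i.e. a quantized zero. The shader undoes the offset via
// the q*_m / q*_a parameters added in UploadQuantizedWeights.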
void RearrangeFCWeightsToOIO4I4(
    const tflite::gpu::Tensor<OHWI, DataType::INT8>& weights, uint8_t* dst) {
  const int src_depth = DivideRoundUp(weights.shape.i, 4);
  const int dst_depth = DivideRoundUp(weights.shape.o, 4);

  int counter = 0;
  for (int d = 0; d < dst_depth; ++d) {
    for (int s = 0; s < src_depth; ++s) {
      for (int i = 0; i < 4; ++i) {
        const int src_ch = s * 4 + i;
        for (int j = 0; j < 4; ++j) {
          const int dst_ch = d * 4 + j;
          if (src_ch < weights.shape.i && dst_ch < weights.shape.o) {
            int t =
                127 +
                weights.data[weights.shape.LinearIndex({dst_ch, 0, 0, src_ch})];
            if (t < 0) {
              t = 0;
            }
            dst[counter++] = t;
          } else {
            dst[counter++] = 127;
          }
        }
      }
    }
  }
}
}  // namespace

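// Per-vendor work group sizes: X indexes output slices, Y is the number of
// partial sums each output slice is split into (reduced in local memory in
// the generated kernel).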
FCFCAdd::FCFCAdd(const OperationDef& definition, const GpuInfo& gpu_info)
    : GPUOperation(definition) {
  if (gpu_info.IsAdreno()) {
    if (gpu_info.adreno_info.IsAdreno3xx()) {
      work_group_size_ = int3(16, 4, 1);
    } else if (gpu_info.adreno_info.IsAdreno4xx()) {
      work_group_size_ = int3(32, 4, 1);
    } else {
      work_group_size_ = int3(32, 4, 1);
    }
  } else if (gpu_info.IsIntel()) {
    work_group_size_ = int3(8, 4, 1);
  } else if (gpu_info.IsNvidia()) {
    work_group_size_ = int3(8, 4, 1);
  } else if (gpu_info.IsPowerVR()) {
    work_group_size_ = int3(8, 4, 1);
  } else {
    work_group_size_ = int3(16, 4, 1);
  }
}

FCFCAdd::FCFCAdd(FCFCAdd&& kernel) : GPUOperation(std::move(kernel)) {}

FCFCAdd& FCFCAdd::operator=(FCFCAdd&& kernel) {
  if (this != &kernel) {
    GPUOperation::operator=(std::move(kernel));
  }
  return *this;
}

// We split the vec-vec dot product (in a basic vec-mat multiplication every
// thread computes one vec-vec dot product) into 4 parts (work_group_size_.y)
// to create more threads: the thread with local id tid.y processes every 4-th
// element of the dot product, and the partial sums are then reduced through
// local memory. This gives good results for ~1024 x 1024 sizes; for other
// sizes more specialized shaders could be written.

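// Rough sketch of the generated kernel (illustrative only; the exact emitted
// code depends on precision, weights layout and quantization):
//
//   MAIN_FUNCTION($0) {
//     int gid = get_global_id(0);  // output slice
//     ACCUM_FLT4 s = 0;
//     if (gid < dst.Slices()) {
//       for (int c = tid.y; c < src0.Slices(); c += WG_Y) s += src0[c] * W0[c];
//       for (int c = tid.y; c < src1.Slices(); c += WG_Y) s += src1[c] * W1[c];
//     }
//     temp[tid.x][tid.y] = s;  // reduce the WG_Y partial sums
//     LOCAL_MEM_BARRIER;
//     if (tid.y == 0) {
//       s += temp[tid.x][1..WG_Y-1];
//       dst[gid] = s + bias0[gid] + bias1[gid];
//     }
//   }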
std::string FCFCAdd::GetFCFCAddKernelCode(const OperationDef& op_def,
                                          const GpuInfo& gpu_info,
                                          bool weights_are_buffer,
                                          bool quantized_0, bool quantized_1) {
  AddSrcTensor("src_tensor_0", op_def.src_tensors[0]);
  AddSrcTensor("src_tensor_1", op_def.src_tensors[1]);
  AddDstTensor("dst_tensor", op_def.dst_tensors[0]);

  std::string c;
  switch (op_def.precision) {
    case CalculationsPrecision::F32:
      c += "#define FLT16 float16\n";
      break;
    case CalculationsPrecision::F32_F16:
    case CalculationsPrecision::F16:
      c += "#define FLT16 half16\n";
      break;
  }

  c += "#define WG_X " + std::to_string(work_group_size_.x) + "\n";
  c += "#define WG_Y " + std::to_string(work_group_size_.y) + "\n";

  c += R"(MAIN_FUNCTION($0) {
  int gid = get_global_id(0);
  int2 tid;
  tid.x = LOCAL_ID_0;
  tid.y = LOCAL_ID_1;
  ACCUM_FLT4 s = INIT_ACCUM_FLT4(0.0f);
  if (gid < args.dst_tensor.Slices()) {
    for (int c = tid.y; c < args.src_tensor_0.Slices(); c += WG_Y) {
      FLT4 v = args.src_tensor_0.Read(0, 0, c);
)";
  if (weights_are_buffer) {
    c += R"(FLT16 w = args.weights0.Read(c * args.dst_tensor.Slices() + gid);
      FLT4 partial = v.x * FLT16_0123(w);
      partial += v.y * FLT16_4567(w);
      partial += v.z * FLT16_89ab(w);
      partial += v.w * FLT16_cdef(w);
      s += TO_ACCUM_TYPE(partial);
)";
  } else {
    const std::string read_as_type =
        op_def.precision == CalculationsPrecision::F32 ? "float" : "half";
    c += "      FLT4 w0 = args.weights0.Read<" + read_as_type +
         ">(c * 4 + 0, gid);\n";
    c += "      FLT4 w1 = args.weights0.Read<" + read_as_type +
         ">(c * 4 + 1, gid);\n";
    c += "      FLT4 w2 = args.weights0.Read<" + read_as_type +
         ">(c * 4 + 2, gid);\n";
    c += "      FLT4 w3 = args.weights0.Read<" + read_as_type +
         ">(c * 4 + 3, gid);\n";
    if (quantized_0) {
      c += R"(w0 = w0 * args.q0_m + args.q0_a;
      w1 = w1 * args.q0_m + args.q0_a;
      w2 = w2 * args.q0_m + args.q0_a;
      w3 = w3 * args.q0_m + args.q0_a;
)";
    }
    c += R"(FLT4 partial = v.x * w0;
      partial += v.y * w1;
      partial += v.z * w2;
      partial += v.w * w3;
      s += TO_ACCUM_TYPE(partial);
)";
  }
  c += R"(    }
    for (int c = tid.y; c < args.src_tensor_1.Slices(); c += WG_Y) {
      FLT4 v = args.src_tensor_1.Read(0, 0, c);
)";
  if (weights_are_buffer) {
    c += R"(FLT16 w = args.weights1.Read(c * args.dst_tensor.Slices() + gid);
      FLT4 partial = v.x * FLT16_0123(w);
      partial += v.y * FLT16_4567(w);
      partial += v.z * FLT16_89ab(w);
      partial += v.w * FLT16_cdef(w);
      s += TO_ACCUM_TYPE(partial);
)";
  } else {
    const std::string read_as_type =
        op_def.precision == CalculationsPrecision::F32 ? "float" : "half";
    c += "      FLT4 w0 = args.weights1.Read<" + read_as_type +
         ">(c * 4 + 0, gid);\n";
    c += "      FLT4 w1 = args.weights1.Read<" + read_as_type +
         ">(c * 4 + 1, gid);\n";
    c += "      FLT4 w2 = args.weights1.Read<" + read_as_type +
         ">(c * 4 + 2, gid);\n";
    c += "      FLT4 w3 = args.weights1.Read<" + read_as_type +
         ">(c * 4 + 3, gid);\n";
    if (quantized_1) {
      c += R"(w0 = w0 * args.q1_m + args.q1_a;
      w1 = w1 * args.q1_m + args.q1_a;
      w2 = w2 * args.q1_m + args.q1_a;
      w3 = w3 * args.q1_m + args.q1_a;
)";
    }
    c += R"(FLT4 partial = v.x * w0;
      partial += v.y * w1;
      partial += v.z * w2;
      partial += v.w * w3;
      s += TO_ACCUM_TYPE(partial);
)";
  }
  c += R"(    }
  }
  __local ACCUM_FLT4 temp[WG_X][WG_Y];
  temp[tid.x][tid.y] = s;
  LOCAL_MEM_BARRIER;
  if (gid >= args.dst_tensor.Slices()) {
    return;
  }
  if (tid.y == 0) {
)";
  for (int i = 1; i < work_group_size_.y; ++i) {
    c += "    s += temp[tid.x][" + std::to_string(i) + "];\n";
  }
  c +=
      R"(    FLT4 r0 = TO_FLT4(s) + args.biases0.Read(gid) + args.biases1.Read(gid);
    args.dst_tensor.Write(r0, 0, 0, gid);
  }
})";

  return c;
}

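// One work item per output slice along X; the split over tid.y happens inside
// a single work group, so the grid's Y and Z extents stay 1.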
int3 FCFCAdd::GetGridSize() const { return int3(dst_[0]->Slices(), 1, 1); }

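// Weights are stored with a +127 offset (see RearrangeFCWeightsToOIO4I4), so
// the shader dequantizes them as
//   w = stored * q_m + q_a
//     = (w_int8 + 127) * scale - scale * (127 + zero_point)
//     = scale * (w_int8 - zero_point).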
void FCFCAdd::UploadQuantizedWeights(
    const tflite::gpu::Tensor<OHWI, DataType::INT8>& weights, float scale,
    float zero_point, int index) {
  const int src_depth = DivideRoundUp(weights.shape.i, 4);
  const int dst_depth = DivideRoundUp(weights.shape.o, 4);

  std::vector<uint8_t> data(src_depth * 4 * dst_depth * 4);
  RearrangeFCWeightsToOIO4I4(weights, data.data());
  TensorDescriptor desc = CreateConstantHWVec4TensorDescriptor(
      DataType::UINT8, TensorStorageType::TEXTURE_2D, src_depth * 4, dst_depth,
      data.data());

  std::string q_name = "q" + std::to_string(index) + "_";
  if (definition_.precision == CalculationsPrecision::F32) {
    args_.AddFloat(q_name + "m", scale);
    args_.AddFloat(q_name + "a", -scale * (127.0 + zero_point));
  } else {
    args_.AddHalf(q_name + "m", half(scale));
    args_.AddHalf(q_name + "a", half(-scale * (127.0 + zero_point)));
  }
  args_.AddObject("weights" + std::to_string(index),
                  std::make_unique<TensorDescriptor>(std::move(desc)));
}

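// Float path: weights are uploaded in the layout chosen by UseBufferForWeights
// and the biases of both fully connected ops become constant linear tensors
// that are added in the final write.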
FCFCAdd CreateFCFCAdd(const GpuInfo& gpu_info, const OperationDef& definition,
                      const FullyConnectedAttributes& attr0,
                      const FullyConnectedAttributes& attr1) {
  FCFCAdd result(definition, gpu_info);
  bool weights_are_buffer = UseBufferForWeights(gpu_info);
  result.UploadWeights(attr0.weights, "weights0", weights_are_buffer);
  result.UploadWeights(attr1.weights, "weights1", weights_are_buffer);
  result.code_ = result.GetFCFCAddKernelCode(definition, gpu_info,
                                             weights_are_buffer, false, false);

  TensorDescriptor bias0_tensor_desc = CreateConstantLinearTensorDescriptor(
      gpu_info, definition.src_tensors[0].GetDataType(), attr0.bias);
  result.args_.AddObject("biases0", std::make_unique<TensorDescriptor>(
                                        std::move(bias0_tensor_desc)));

  TensorDescriptor bias1_tensor_desc = CreateConstantLinearTensorDescriptor(
      gpu_info, definition.src_tensors[0].GetDataType(), attr1.bias);
  result.args_.AddObject("biases1", std::make_unique<TensorDescriptor>(
                                        std::move(bias1_tensor_desc)));

  return result;
}

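// Int8 path: weights stay quantized on the GPU (uint8 TEXTURE_2D) and are
// dequantized in the shader, so weights_are_buffer is false and both
// quantized flags are true.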
FCFCAdd CreateFCFCAdd(const GpuInfo& gpu_info, const OperationDef& definition,
                      const FullyConnectedInt8Attributes& attr0,
                      const FullyConnectedInt8Attributes& attr1) {
  FCFCAdd result(definition, gpu_info);
  result.UploadQuantizedWeights(attr0.weights, attr0.scale, attr0.zero_point,
                                0);
  result.UploadQuantizedWeights(attr1.weights, attr1.scale, attr1.zero_point,
                                1);
  result.code_ =
      result.GetFCFCAddKernelCode(definition, gpu_info, false, true, true);

  TensorDescriptor bias0_tensor_desc = CreateConstantLinearTensorDescriptor(
      gpu_info, definition.src_tensors[0].GetDataType(), attr0.bias);
  result.args_.AddObject("biases0", std::make_unique<TensorDescriptor>(
                                        std::move(bias0_tensor_desc)));

  TensorDescriptor bias1_tensor_desc = CreateConstantLinearTensorDescriptor(
      gpu_info, definition.src_tensors[0].GetDataType(), attr1.bias);
  result.args_.AddObject("biases1", std::make_unique<TensorDescriptor>(
                                        std::move(bias1_tensor_desc)));

  return result;
}

// fully connected + fully connected + add
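// Tries to fuse the subgraph
//
//   input0 -> FULLY_CONNECTED --\
//                                ADD -> output
//   input1 -> FULLY_CONNECTED --/
//
// into a single FCFCAdd operation. Both fully connected nodes must be either
// float or int8 (no mixing) and must produce the same number of output
// channels; the ADD must be the sole consumer of the first one's output.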
absl::Status TryFCFCAdd(
    const GpuInfo& gpu_info, CalculationsPrecision precision,
    const GraphFloat32& graph, NodeId first_node_id,
    const std::map<ValueId, TensorDescriptor>& tensor_descriptors,
    std::set<NodeId>* consumed_nodes, GPUOperationsSubgraph* gpu_subgraph) {
  if (!(gpu_info.IsIntel() || gpu_info.IsNvidia() || gpu_info.IsAMD())) {
    return absl::NotFoundError("FCFCAdd not suitable.");
  }
  auto* fc0_node = graph.GetNode(first_node_id);
  if (fc0_node == nullptr) {
    return absl::NotFoundError("FCFCAdd not suitable.");
  }
  auto first_op_type = OperationTypeFromString(fc0_node->operation.type);
  if (first_op_type != OperationType::FULLY_CONNECTED &&
      first_op_type != OperationType::FULLY_CONNECTED_INT8) {
    return absl::NotFoundError("FCFCAdd not suitable.");
  }
  const bool first_quantized =
      first_op_type == OperationType::FULLY_CONNECTED_INT8;
  auto fc0_inputs = graph.FindInputs(fc0_node->id);
  if (fc0_inputs.size() != 1) {
    return absl::NotFoundError("FCFCAdd not suitable.");
  }
  auto fc0_output_id = graph.FindOutputs(fc0_node->id)[0]->id;
  auto consumers = graph.FindConsumers(fc0_output_id);
  if (consumers.size() != 1) {
    return absl::NotFoundError("FCFCAdd not suitable.");
  }
  auto* add_node = consumers[0];
  if (add_node == nullptr) {
    return absl::NotFoundError("FCFCAdd not suitable.");
  }
  if (consumed_nodes->find(add_node->id) != consumed_nodes->end()) {
    return absl::NotFoundError("FCFCAdd not suitable.");
  }
  if (OperationTypeFromString(add_node->operation.type) != OperationType::ADD) {
    return absl::NotFoundError("FCFCAdd not suitable.");
  }
  auto add_inputs = graph.FindInputs(add_node->id);
  if (add_inputs.size() != 2) {
    return absl::NotFoundError("FCFCAdd not suitable.");
  }
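  // The other ADD input (the one that is not fc0's output) is fc1's output.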
  auto fc1_output_id = add_inputs[0]->id + add_inputs[1]->id - fc0_output_id;
  auto* fc1_node = graph.FindProducer(fc1_output_id);
  if (fc1_node == nullptr) {
    return absl::NotFoundError("FCFCAdd not suitable.");
  }
  auto second_op_type = OperationTypeFromString(fc1_node->operation.type);
  if (second_op_type != OperationType::FULLY_CONNECTED &&
      second_op_type != OperationType::FULLY_CONNECTED_INT8) {
    return absl::NotFoundError("FCFCAdd not suitable.");
  }
  const bool second_quantized =
      second_op_type == OperationType::FULLY_CONNECTED_INT8;
  const bool both_quantized = first_quantized && second_quantized;
  const bool both_not_quantized = !first_quantized && !second_quantized;
  if (!(both_quantized || both_not_quantized)) {
    return absl::NotFoundError("FCFCAdd not suitable.");
  }
  if (consumed_nodes->find(fc1_node->id) != consumed_nodes->end()) {
    return absl::NotFoundError("FCFCAdd not suitable.");
  }
  auto fc1_inputs = graph.FindInputs(fc1_node->id);
  if (fc1_inputs.size() != 1) {
    return absl::NotFoundError("FCFCAdd not suitable.");
  }
  auto add_outputs = graph.FindOutputs(add_node->id);

  OperationDef op_def;
  op_def.precision = precision;
  auto it = tensor_descriptors.find(fc0_inputs[0]->id);
  if (it != tensor_descriptors.end()) {
    op_def.src_tensors.push_back(it->second);
  }
  it = tensor_descriptors.find(fc1_inputs[0]->id);
  if (it != tensor_descriptors.end()) {
    op_def.src_tensors.push_back(it->second);
  }
  it = tensor_descriptors.find(add_outputs[0]->id);
  if (it != tensor_descriptors.end()) {
    op_def.dst_tensors.push_back(it->second);
  }

  for (int i = 0; i < fc1_inputs.size(); ++i) {
    fc0_inputs.push_back(fc1_inputs[i]);
  }
  std::unique_ptr<GPUOperation>* gpu_op =
      InitSingleOpSubgraph(fc0_inputs, add_outputs, gpu_subgraph);
  FCFCAdd fc;
  if (both_not_quantized) {
    auto fc0_attr = absl::any_cast<FullyConnectedAttributes>(
        fc0_node->operation.attributes);
    auto fc1_attr = absl::any_cast<FullyConnectedAttributes>(
        fc1_node->operation.attributes);
    if (fc0_attr.weights.shape.o != fc1_attr.weights.shape.o) {
      return absl::NotFoundError("FCFCAdd not suitable.");
    }
    fc = CreateFCFCAdd(gpu_info, op_def, fc0_attr, fc1_attr);
  } else {
    // both_quantized
    auto fc0_attr = absl::any_cast<FullyConnectedInt8Attributes>(
        fc0_node->operation.attributes);
    auto fc1_attr = absl::any_cast<FullyConnectedInt8Attributes>(
        fc1_node->operation.attributes);
    if (fc0_attr.weights.shape.o != fc1_attr.weights.shape.o) {
      return absl::NotFoundError("FCFCAdd not suitable.");
    }
    fc = CreateFCFCAdd(gpu_info, op_def, fc0_attr, fc1_attr);
  }
  *gpu_op = std::make_unique<FCFCAdd>(std::move(fc));
  const std::string fused_nodes = std::to_string(fc0_node->id) + " " +
                                  std::to_string(fc1_node->id) + " " +
                                  std::to_string(add_node->id);
  gpu_subgraph->operations[0].name =
      "fully_connected_x2_and_add " + fused_nodes;
  consumed_nodes->insert(fc0_node->id);
  consumed_nodes->insert(fc1_node->id);
  consumed_nodes->insert(add_node->id);
  return absl::OkStatus();
}

}  // namespace gpu
}  // namespace tflite