/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#include "tensorflow/lite/delegates/gpu/common/tasks/special/fc_fc_add.h"

#include <map>
#include <memory>
#include <set>
#include <string>
#include <utility>
#include <vector>

#include "tensorflow/lite/delegates/gpu/common/operations.h"
#include "tensorflow/lite/delegates/gpu/common/task/gpu_operation.h"
#include "tensorflow/lite/delegates/gpu/common/types.h"

namespace tflite {
namespace gpu {
namespace {
bool UseBufferForWeights(const GpuInfo& gpu_info) {
  return gpu_info.IsAdreno() || gpu_info.IsAMD() || gpu_info.IsMali();
}

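// Repacks int8 OHWI weights into the OIO4I4 layout expected by the kernel:
// groups of 4 output channels x 4 input channels, stored as uint8 with a
// +127 offset (an int8 value w is stored as 127 + w, clamped at 0).
// Padding positions beyond the real channel counts are filled with 127,
// i.e. an original int8 value of 0.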
void RearrangeFCWeightsToOIO4I4(
    const tflite::gpu::Tensor<OHWI, DataType::INT8>& weights, uint8_t* dst) {
  const int src_depth = DivideRoundUp(weights.shape.i, 4);
  const int dst_depth = DivideRoundUp(weights.shape.o, 4);

  int counter = 0;
  for (int d = 0; d < dst_depth; ++d) {
    for (int s = 0; s < src_depth; ++s) {
      for (int i = 0; i < 4; ++i) {
        const int src_ch = s * 4 + i;
        for (int j = 0; j < 4; ++j) {
          const int dst_ch = d * 4 + j;
          if (src_ch < weights.shape.i && dst_ch < weights.shape.o) {
            int t =
                127 +
                weights.data[weights.shape.LinearIndex({dst_ch, 0, 0, src_ch})];
            if (t < 0) {
              t = 0;
            }
            dst[counter++] = t;
          } else {
            dst[counter++] = 127;
          }
        }
      }
    }
  }
}
}  // namespace

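// Per-vendor work group sizes. The WG_X threads each compute one output
// slice, while the WG_Y threads sharing a tid.x cooperate on the dot product
// and are reduced through local memory (see GetFCFCAddKernelCode).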
FCFCAdd::FCFCAdd(const OperationDef& definition, const GpuInfo& gpu_info)
    : GPUOperation(definition) {
  if (gpu_info.IsAdreno()) {
    if (gpu_info.adreno_info.IsAdreno3xx()) {
      work_group_size_ = int3(16, 4, 1);
    } else if (gpu_info.adreno_info.IsAdreno4xx()) {
      work_group_size_ = int3(32, 4, 1);
    } else {
      work_group_size_ = int3(32, 4, 1);
    }
  } else if (gpu_info.IsIntel()) {
    work_group_size_ = int3(8, 4, 1);
  } else if (gpu_info.IsNvidia()) {
    work_group_size_ = int3(8, 4, 1);
  } else if (gpu_info.IsPowerVR()) {
    work_group_size_ = int3(8, 4, 1);
  } else {
    work_group_size_ = int3(16, 4, 1);
  }
}

FCFCAdd::FCFCAdd(FCFCAdd&& kernel) : GPUOperation(std::move(kernel)) {}

FCFCAdd& FCFCAdd::operator=(FCFCAdd&& kernel) {
  if (this != &kernel) {
    GPUOperation::operator=(std::move(kernel));
  }
  return *this;
}

// We split the vec-vec dot product (in the basic vec-mat multiply every
// thread computes one vec-vec dot product) into 4 parts to create more
// threads: the thread with local id tid.y processes every 4-th element of
// the dot product. This gives good results for ~1024 x 1024 sizes; for
// other sizes more optimized shaders can be written.

std::string FCFCAdd::GetFCFCAddKernelCode(const OperationDef& op_def,
                                          const GpuInfo& gpu_info,
                                          bool weights_are_buffer,
                                          bool quantized_0, bool quantized_1) {
  AddSrcTensor("src_tensor_0", op_def.src_tensors[0]);
  AddSrcTensor("src_tensor_1", op_def.src_tensors[1]);
  AddDstTensor("dst_tensor", op_def.dst_tensors[0]);

  std::string c;
  switch (op_def.precision) {
    case CalculationsPrecision::F32:
      c += "#define FLT16 float16\n";
      break;
    case CalculationsPrecision::F32_F16:
    case CalculationsPrecision::F16:
      c += "#define FLT16 half16\n";
      break;
  }

  c += "#define WG_X " + std::to_string(work_group_size_.x) + "\n";
  c += "#define WG_Y " + std::to_string(work_group_size_.y) + "\n";

  c += R"(MAIN_FUNCTION($0) {
  int gid = get_global_id(0);
  int2 tid;
  tid.x = LOCAL_ID_0;
  tid.y = LOCAL_ID_1;
  ACCUM_FLT4 s = INIT_ACCUM_FLT4(0.0f);
  if (gid < args.dst_tensor.Slices()) {
    for (int c = tid.y; c < args.src_tensor_0.Slices(); c += WG_Y) {
      FLT4 v = args.src_tensor_0.Read(0, 0, c);
)";
  if (weights_are_buffer) {
    c += R"(FLT16 w = args.weights0.Read(c * args.dst_tensor.Slices() + gid);
      FLT4 partial = v.x * FLT16_0123(w);
      partial += v.y * FLT16_4567(w);
      partial += v.z * FLT16_89ab(w);
      partial += v.w * FLT16_cdef(w);
      s += TO_ACCUM_TYPE(partial);
)";
  } else {
    const std::string read_as_type =
        op_def.precision == CalculationsPrecision::F32 ? "float" : "half";
    c += "      FLT4 w0 = args.weights0.Read<" + read_as_type +
         ">(c * 4 + 0, gid);\n";
    c += "      FLT4 w1 = args.weights0.Read<" + read_as_type +
         ">(c * 4 + 1, gid);\n";
    c += "      FLT4 w2 = args.weights0.Read<" + read_as_type +
         ">(c * 4 + 2, gid);\n";
    c += "      FLT4 w3 = args.weights0.Read<" + read_as_type +
         ">(c * 4 + 3, gid);\n";
    if (quantized_0) {
      c += R"(w0 = w0 * args.q0_m + args.q0_a;
      w1 = w1 * args.q0_m + args.q0_a;
      w2 = w2 * args.q0_m + args.q0_a;
      w3 = w3 * args.q0_m + args.q0_a;
)";
    }
    c += R"(FLT4 partial = v.x * w0;
      partial += v.y * w1;
      partial += v.z * w2;
      partial += v.w * w3;
      s += TO_ACCUM_TYPE(partial);
)";
  }
  c += R"(    }
    for (int c = tid.y; c < args.src_tensor_1.Slices(); c += WG_Y) {
      FLT4 v = args.src_tensor_1.Read(0, 0, c);
      )";
  if (weights_are_buffer) {
    c += R"(FLT16 w = args.weights1.Read(c * args.dst_tensor.Slices() + gid);
      FLT4 partial = v.x * FLT16_0123(w);
      partial += v.y * FLT16_4567(w);
      partial += v.z * FLT16_89ab(w);
      partial += v.w * FLT16_cdef(w);
      s += TO_ACCUM_TYPE(partial);
)";
  } else {
    const std::string read_as_type =
        op_def.precision == CalculationsPrecision::F32 ? "float" : "half";
    c += "      FLT4 w0 = args.weights1.Read<" + read_as_type +
         ">(c * 4 + 0, gid);\n";
    c += "      FLT4 w1 = args.weights1.Read<" + read_as_type +
         ">(c * 4 + 1, gid);\n";
    c += "      FLT4 w2 = args.weights1.Read<" + read_as_type +
         ">(c * 4 + 2, gid);\n";
    c += "      FLT4 w3 = args.weights1.Read<" + read_as_type +
         ">(c * 4 + 3, gid);\n";
    if (quantized_1) {
      c += R"(w0 = w0 * args.q1_m + args.q1_a;
      w1 = w1 * args.q1_m + args.q1_a;
      w2 = w2 * args.q1_m + args.q1_a;
      w3 = w3 * args.q1_m + args.q1_a;
)";
    }
    c += R"(FLT4 partial = v.x * w0;
      partial += v.y * w1;
      partial += v.z * w2;
      partial += v.w * w3;
      s += TO_ACCUM_TYPE(partial);
)";
  }
  c += R"(    }
  }
  __local ACCUM_FLT4 temp[WG_X][WG_Y];
  temp[tid.x][tid.y] = s;
  LOCAL_MEM_BARRIER;
  if (gid >= args.dst_tensor.Slices()) {
    return;
  }
  if (tid.y == 0) {
)";
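  // Reduce the partial sums accumulated by the WG_Y threads that share the
  // same tid.x, then add both biases and write the result.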
  for (int i = 1; i < work_group_size_.y; ++i) {
    c += "    s += temp[tid.x][" + std::to_string(i) + "];\n";
  }
  c +=
      R"(    FLT4 r0 = TO_FLT4(s) + args.biases0.Read(gid) + args.biases1.Read(gid);
    args.dst_tensor.Write(r0, 0, 0, gid);
  }
})";

  return c;
}

int3 FCFCAdd::GetGridSize() const { return int3(dst_[0]->Slices(), 1, 1); }

void FCFCAdd::UploadQuantizedWeights(
    const tflite::gpu::Tensor<OHWI, DataType::INT8>& weights, float scale,
    float zero_point, int index) {
  const int src_depth = DivideRoundUp(weights.shape.i, 4);
  const int dst_depth = DivideRoundUp(weights.shape.o, 4);

  std::vector<uint8_t> data(src_depth * 4 * dst_depth * 4);
  RearrangeFCWeightsToOIO4I4(weights, data.data());
  TensorDescriptor desc = CreateConstantHWVec4TensorDescriptor(
      DataType::UINT8, TensorStorageType::TEXTURE_2D, src_depth * 4, dst_depth,
      data.data());

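  // The kernel dequantizes a stored weight q as q * q_m + q_a =
  // scale * (q - 127 - zero_point), which undoes the +127 offset applied in
  // RearrangeFCWeightsToOIO4I4.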
  std::string q_name = "q" + std::to_string(index) + "_";
  if (definition_.precision == CalculationsPrecision::F32) {
    args_.AddFloat(q_name + "m", scale);
    args_.AddFloat(q_name + "a", -scale * (127.0 + zero_point));
  } else {
    args_.AddHalf(q_name + "m", half(scale));
    args_.AddHalf(q_name + "a", half(-scale * (127.0 + zero_point)));
  }
  args_.AddObject("weights" + std::to_string(index),
                  std::make_unique<TensorDescriptor>(std::move(desc)));
}

FCFCAdd CreateFCFCAdd(const GpuInfo& gpu_info, const OperationDef& definition,
                      const FullyConnectedAttributes& attr0,
                      const FullyConnectedAttributes& attr1) {
  FCFCAdd result(definition, gpu_info);
  bool weights_are_buffer = UseBufferForWeights(gpu_info);
  result.UploadWeights(attr0.weights, "weights0", weights_are_buffer);
  result.UploadWeights(attr1.weights, "weights1", weights_are_buffer);
  result.code_ = result.GetFCFCAddKernelCode(definition, gpu_info,
                                             weights_are_buffer, false, false);

  TensorDescriptor bias0_tensor_desc = CreateConstantLinearTensorDescriptor(
      gpu_info, definition.src_tensors[0].GetDataType(), attr0.bias);
  result.args_.AddObject("biases0", std::make_unique<TensorDescriptor>(
                                        std::move(bias0_tensor_desc)));

  TensorDescriptor bias1_tensor_desc = CreateConstantLinearTensorDescriptor(
      gpu_info, definition.src_tensors[0].GetDataType(), attr1.bias);
  result.args_.AddObject("biases1", std::make_unique<TensorDescriptor>(
                                        std::move(bias1_tensor_desc)));

  return result;
}

FCFCAdd CreateFCFCAdd(const GpuInfo& gpu_info, const OperationDef& definition,
                      const FullyConnectedInt8Attributes& attr0,
                      const FullyConnectedInt8Attributes& attr1) {
  FCFCAdd result(definition, gpu_info);
  result.UploadQuantizedWeights(attr0.weights, attr0.scale, attr0.zero_point,
                                0);
  result.UploadQuantizedWeights(attr1.weights, attr1.scale, attr1.zero_point,
                                1);
  result.code_ =
      result.GetFCFCAddKernelCode(definition, gpu_info, false, true, true);

  TensorDescriptor bias0_tensor_desc = CreateConstantLinearTensorDescriptor(
      gpu_info, definition.src_tensors[0].GetDataType(), attr0.bias);
  result.args_.AddObject("biases0", std::make_unique<TensorDescriptor>(
                                        std::move(bias0_tensor_desc)));

  TensorDescriptor bias1_tensor_desc = CreateConstantLinearTensorDescriptor(
      gpu_info, definition.src_tensors[0].GetDataType(), attr1.bias);
  result.args_.AddObject("biases1", std::make_unique<TensorDescriptor>(
                                        std::move(bias1_tensor_desc)));

  return result;
}

// fully connected + fully connected + add
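// Tries to recognize the pattern: two FULLY_CONNECTED (or two
// FULLY_CONNECTED_INT8) nodes whose outputs feed a single two-input ADD.
// On a match, the three nodes are fused into one FCFCAdd operation;
// otherwise NotFoundError is returned and the graph is left untouched.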
absl::Status TryFCFCAdd(
    const GpuInfo& gpu_info, CalculationsPrecision precision,
    const GraphFloat32& graph, NodeId first_node_id,
    const std::map<ValueId, TensorDescriptor>& tensor_descriptors,
    std::set<NodeId>* consumed_nodes, GPUOperationsSubgraph* gpu_subgraph) {
  if (!(gpu_info.IsIntel() || gpu_info.IsNvidia() || gpu_info.IsAMD())) {
    return absl::NotFoundError("FCFCAdd not suitable.");
  }
  auto* fc0_node = graph.GetNode(first_node_id);
  if (fc0_node == nullptr) {
    return absl::NotFoundError("FCFCAdd not suitable.");
  }
  auto first_op_type = OperationTypeFromString(fc0_node->operation.type);
  if (first_op_type != OperationType::FULLY_CONNECTED &&
      first_op_type != OperationType::FULLY_CONNECTED_INT8) {
    return absl::NotFoundError("FCFCAdd not suitable.");
  }
  const bool first_quantized =
      first_op_type == OperationType::FULLY_CONNECTED_INT8;
  auto fc0_inputs = graph.FindInputs(fc0_node->id);
  if (fc0_inputs.size() != 1) {
    return absl::NotFoundError("FCFCAdd not suitable.");
  }
  auto fc0_output_id = graph.FindOutputs(fc0_node->id)[0]->id;
  auto consumers = graph.FindConsumers(fc0_output_id);
  if (consumers.size() != 1) {
    return absl::NotFoundError("FCFCAdd not suitable.");
  }
  auto* add_node = consumers[0];
  if (add_node == nullptr) {
    return absl::NotFoundError("FCFCAdd not suitable.");
  }
  if (consumed_nodes->find(add_node->id) != consumed_nodes->end()) {
    return absl::NotFoundError("FCFCAdd not suitable.");
  }
  if (OperationTypeFromString(add_node->operation.type) != OperationType::ADD) {
    return absl::NotFoundError("FCFCAdd not suitable.");
  }
  auto add_inputs = graph.FindInputs(add_node->id);
  if (add_inputs.size() != 2) {
    return absl::NotFoundError("FCFCAdd not suitable.");
  }
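  // The ADD has exactly two inputs; this selects the one that is not fc0's
  // output, i.e. the output of the second fully connected node.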
  auto fc1_output_id = add_inputs[0]->id + add_inputs[1]->id - fc0_output_id;
  auto* fc1_node = graph.FindProducer(fc1_output_id);
  if (fc1_node == nullptr) {
    return absl::NotFoundError("FCFCAdd not suitable.");
  }
  auto second_op_type = OperationTypeFromString(fc1_node->operation.type);
  if (second_op_type != OperationType::FULLY_CONNECTED &&
      second_op_type != OperationType::FULLY_CONNECTED_INT8) {
    return absl::NotFoundError("FCFCAdd not suitable.");
  }
  const bool second_quantized =
      second_op_type == OperationType::FULLY_CONNECTED_INT8;
  const bool both_quantized = first_quantized && second_quantized;
  const bool both_not_quantized = !first_quantized && !second_quantized;
  if (!(both_quantized || both_not_quantized)) {
    return absl::NotFoundError("FCFCAdd not suitable.");
  }
  if (consumed_nodes->find(fc1_node->id) != consumed_nodes->end()) {
    return absl::NotFoundError("FCFCAdd not suitable.");
  }
  auto fc1_inputs = graph.FindInputs(fc1_node->id);
  if (fc1_inputs.size() != 1) {
    return absl::NotFoundError("FCFCAdd not suitable.");
  }
  auto add_outputs = graph.FindOutputs(add_node->id);

  OperationDef op_def;
  op_def.precision = precision;
  auto it = tensor_descriptors.find(fc0_inputs[0]->id);
  if (it != tensor_descriptors.end()) {
    op_def.src_tensors.push_back(it->second);
  }
  it = tensor_descriptors.find(fc1_inputs[0]->id);
  if (it != tensor_descriptors.end()) {
    op_def.src_tensors.push_back(it->second);
  }
  it = tensor_descriptors.find(add_outputs[0]->id);
  if (it != tensor_descriptors.end()) {
    op_def.dst_tensors.push_back(it->second);
  }

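  // The fused operation consumes both FC inputs and produces the ADD output.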
  for (int i = 0; i < fc1_inputs.size(); ++i) {
    fc0_inputs.push_back(fc1_inputs[i]);
  }
  std::unique_ptr<GPUOperation>* gpu_op =
      InitSingleOpSubgraph(fc0_inputs, add_outputs, gpu_subgraph);
  FCFCAdd fc;
  if (both_not_quantized) {
    auto fc0_attr = absl::any_cast<FullyConnectedAttributes>(
        fc0_node->operation.attributes);
    auto fc1_attr = absl::any_cast<FullyConnectedAttributes>(
        fc1_node->operation.attributes);
    if (fc0_attr.weights.shape.o != fc1_attr.weights.shape.o) {
      return absl::NotFoundError("FCFCAdd not suitable.");
    }
    fc = CreateFCFCAdd(gpu_info, op_def, fc0_attr, fc1_attr);
  } else {
    // both_quantized
    auto fc0_attr = absl::any_cast<FullyConnectedInt8Attributes>(
        fc0_node->operation.attributes);
    auto fc1_attr = absl::any_cast<FullyConnectedInt8Attributes>(
        fc1_node->operation.attributes);
    if (fc0_attr.weights.shape.o != fc1_attr.weights.shape.o) {
      return absl::NotFoundError("FCFCAdd not suitable.");
    }
    fc = CreateFCFCAdd(gpu_info, op_def, fc0_attr, fc1_attr);
  }
  *gpu_op = std::make_unique<FCFCAdd>(std::move(fc));
  const std::string fused_nodes = std::to_string(fc0_node->id) + " " +
                                  std::to_string(fc1_node->id) + " " +
                                  std::to_string(add_node->id);
  gpu_subgraph->operations[0].name =
      "fully_connected_x2_and_add " + fused_nodes;
  consumed_nodes->insert(fc0_node->id);
  consumed_nodes->insert(fc1_node->id);
  consumed_nodes->insert(add_node->id);
  return absl::OkStatus();
}

}  // namespace gpu
}  // namespace tflite