xref: /aosp_15_r20/external/tensorflow/tensorflow/lite/delegates/gpu/common/tasks/special/fc_fc_add.h (revision b6fb3261f9314811a0f4371741dbb8839866f948)
1 /* Copyright 2020 The TensorFlow Authors. All Rights Reserved.
2 
3 Licensed under the Apache License, Version 2.0 (the "License");
4 you may not use this file except in compliance with the License.
5 You may obtain a copy of the License at
6 
7     http://www.apache.org/licenses/LICENSE-2.0
8 
9 Unless required by applicable law or agreed to in writing, software
10 distributed under the License is distributed on an "AS IS" BASIS,
11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 See the License for the specific language governing permissions and
13 limitations under the License.
14 ==============================================================================*/
15 
16 #ifndef TENSORFLOW_LITE_DELEGATES_GPU_COMMON_TASKS_SPECIAL_FC_FC_ADD_H_
17 #define TENSORFLOW_LITE_DELEGATES_GPU_COMMON_TASKS_SPECIAL_FC_FC_ADD_H_
18 
19 #include <stdint.h>
20 
21 #include <map>
22 #include <memory>
23 #include <set>
24 #include <string>
25 #include <utility>
26 #include <vector>
27 
28 #include "tensorflow/lite/delegates/gpu/common/data_type.h"
29 #include "tensorflow/lite/delegates/gpu/common/model.h"
30 #include "tensorflow/lite/delegates/gpu/common/operations.h"
31 #include "tensorflow/lite/delegates/gpu/common/selectors/subgraph.h"
32 #include "tensorflow/lite/delegates/gpu/common/shape.h"
33 #include "tensorflow/lite/delegates/gpu/common/task/buffer_desc.h"
34 #include "tensorflow/lite/delegates/gpu/common/task/gpu_operation.h"
35 #include "tensorflow/lite/delegates/gpu/common/task/texture2d_desc.h"
36 #include "tensorflow/lite/delegates/gpu/common/tensor.h"
37 #include "tensorflow/lite/delegates/gpu/common/types.h"
38 #include "tensorflow/lite/delegates/gpu/common/util.h"
39 
40 namespace tflite {
41 namespace gpu {
42 
43 template <DataType T, typename S>
RearrangeFCWeightsToIOO4I4(const tflite::gpu::Tensor<OHWI,T> & weights,S * dst)44 void RearrangeFCWeightsToIOO4I4(const tflite::gpu::Tensor<OHWI, T>& weights,
45                                 S* dst) {
46   const int src_channels = weights.shape.i;
47   const int padded_src_channels = AlignByN(src_channels, 4);
48   const int dst_channels = weights.shape.o;
49   const int padded_dst_channels = AlignByN(dst_channels, 4);
50 
51   for (int block_y = 0; 4 * block_y < padded_dst_channels; block_y++) {
52     for (int y_in_block = 0; y_in_block < 4; y_in_block++) {
53       for (int block_x = 0; 4 * block_x < padded_src_channels; block_x++) {
54         for (int x_in_block = 0; x_in_block < 4; x_in_block++) {
55           int y = 4 * block_y + y_in_block;
56           int x = 4 * block_x + x_in_block;
57           int dst_index = block_x * padded_dst_channels * 4 + block_y * 16 +
58                           x_in_block * 4 + y_in_block;
59           if (x < src_channels && y < dst_channels) {
60             dst[dst_index] = weights.data[src_channels * y + x];
61           } else {
62             dst[dst_index] = 0.0f;
63           }
64         }
65       }
66     }
67   }
68 }
69 
70 template <DataType T, typename S>
RearrangeFCWeightsToOIO4I4(const tflite::gpu::Tensor<OHWI,T> & weights,S * dst)71 void RearrangeFCWeightsToOIO4I4(const tflite::gpu::Tensor<OHWI, T>& weights,
72                                 S* dst) {
73   const int src_channels = weights.shape.i;
74   const int src_depth = DivideRoundUp(src_channels, 4);
75   const int dst_channels = weights.shape.o;
76   const int dst_depth = DivideRoundUp(dst_channels, 4);
77 
78   int counter = 0;
79   for (int d = 0; d < dst_depth; ++d) {
80     for (int s = 0; s < src_depth; ++s) {
81       for (int i = 0; i < 4; ++i) {
82         const int src_ch = s * 4 + i;
83         for (int j = 0; j < 4; ++j) {
84           const int dst_ch = d * 4 + j;
85           if (src_ch < src_channels && dst_ch < dst_channels) {
86             dst[counter++] = weights.data[dst_ch * src_channels + src_ch];
87           } else {
88             dst[counter++] = 0.0f;
89           }
90         }
91       }
92     }
93   }
94 }
95 
96 class FCFCAdd : public GPUOperation {
97  public:
98   FCFCAdd() = default;
GetPossibleKernelWorkGroups(TuningType tuning_type,const GpuInfo & gpu_info,const KernelInfo & kernel_info,std::vector<int3> * work_groups)99   void GetPossibleKernelWorkGroups(
100       TuningType tuning_type, const GpuInfo& gpu_info,
101       const KernelInfo& kernel_info,
102       std::vector<int3>* work_groups) const override {
103     work_groups->push_back(work_group_size_);
104   }
105   int3 GetGridSize() const override;
106 
107   // Move only
108   FCFCAdd(FCFCAdd&& kernel);
109   FCFCAdd& operator=(FCFCAdd&& kernel);
110   FCFCAdd(const FCFCAdd&) = delete;
111   FCFCAdd& operator=(const FCFCAdd&) = delete;
112 
113  private:
114   FCFCAdd(const OperationDef& definition, const GpuInfo& gpu_info);
115   friend FCFCAdd CreateFCFCAdd(const GpuInfo& gpu_info,
116                                const OperationDef& definition,
117                                const FullyConnectedAttributes& attr0,
118                                const FullyConnectedAttributes& attr1);
119   friend FCFCAdd CreateFCFCAdd(const GpuInfo& gpu_info,
120                                const OperationDef& definition,
121                                const FullyConnectedInt8Attributes& attr0,
122                                const FullyConnectedInt8Attributes& attr1);
123 
124   void UploadQuantizedWeights(
125       const tflite::gpu::Tensor<OHWI, DataType::INT8>& weights, float scale,
126       float zero_point, int index);
127 
128   template <DataType T>
129   void UploadWeights(const tflite::gpu::Tensor<OHWI, T>& weights,
130                      const std::string& name, bool weights_are_buffer);
131 
132   std::string GetFCFCAddKernelCode(const OperationDef& op_def,
133                                    const GpuInfo& gpu_info,
134                                    bool weights_are_buffer, bool quantized_0,
135                                    bool quantized_1);
136 };
137 
138 template <DataType T>
UploadWeights(const tflite::gpu::Tensor<OHWI,T> & weights,const std::string & name,bool weights_are_buffer)139 void FCFCAdd::UploadWeights(const tflite::gpu::Tensor<OHWI, T>& weights,
140                             const std::string& name, bool weights_are_buffer) {
141   const int src_depth = DivideRoundUp(weights.shape.i, 4);
142   const int dst_depth = DivideRoundUp(weights.shape.o, 4);
143 
144   const int elements_count = src_depth * dst_depth * 4;
145   const bool f32_weights = definition_.precision == CalculationsPrecision::F32;
146 
147   const int float4_size = f32_weights ? 16 : 8;
148 
149   if (weights_are_buffer) {
150     BufferDescriptor desc;
151     desc.element_type = f32_weights ? DataType::FLOAT32 : DataType::FLOAT16;
152     desc.element_size = 16;
153     desc.size = float4_size * elements_count;
154     desc.data.resize(desc.size);
155 
156     if (f32_weights) {
157       float* ptr = reinterpret_cast<float*>(desc.data.data());
158       RearrangeFCWeightsToIOO4I4(weights, ptr);
159     } else {
160       half* ptr = reinterpret_cast<half*>(desc.data.data());
161       RearrangeFCWeightsToIOO4I4(weights, ptr);
162     }
163 
164     args_.AddObject(name, std::make_unique<BufferDescriptor>(std::move(desc)));
165   } else {
166     std::vector<uint8_t> data(float4_size * elements_count);
167     if (f32_weights) {
168       float* ptr = reinterpret_cast<float*>(data.data());
169       RearrangeFCWeightsToOIO4I4(weights, ptr);
170     } else {
171       half* ptr = reinterpret_cast<half*>(data.data());
172       RearrangeFCWeightsToOIO4I4(weights, ptr);
173     }
174 
175     TensorDescriptor desc = CreateConstantHWVec4TensorDescriptor(
176         f32_weights ? DataType::FLOAT32 : DataType::FLOAT16,
177         TensorStorageType::TEXTURE_2D, src_depth * 4, dst_depth, data.data());
178 
179     args_.AddObject(name, std::make_unique<TensorDescriptor>(std::move(desc)));
180   }
181 }
182 
// Factory for FCFCAdd taking two sets of float fully-connected attributes.
// Declared here and defined elsewhere; it is a friend of FCFCAdd, which gives
// it access to that class's private constructor and upload helpers.
// NOTE(review): attr0 / attr1 presumably describe the first and second
// fully-connected layer respectively -- confirm against the definition.
183 FCFCAdd CreateFCFCAdd(const GpuInfo& gpu_info, const OperationDef& definition,
184                       const FullyConnectedAttributes& attr0,
185                       const FullyConnectedAttributes& attr1);
186 
// Factory for FCFCAdd taking two sets of int8 fully-connected attributes.
// Declared here and defined elsewhere; it is a friend of FCFCAdd, which gives
// it access to that class's private constructor and upload helpers.
// NOTE(review): presumably the quantized counterpart of the float overload,
// going through FCFCAdd::UploadQuantizedWeights -- confirm against the
// definition.
187 FCFCAdd CreateFCFCAdd(const GpuInfo& gpu_info, const OperationDef& definition,
188                       const FullyConnectedInt8Attributes& attr0,
189                       const FullyConnectedInt8Attributes& attr1);
190 
// Declared here and defined elsewhere; returns an absl::Status.
// NOTE(review): the behavior below is inferred from the signature only and
// should be confirmed against the definition. Presumably this inspects
// `graph` starting at `first_node_id` for a pattern that FCFCAdd can
// implement and, when one is found, records the matched nodes in
// `consumed_nodes` and describes the fused operation in `gpu_subgraph`.
191 absl::Status TryFCFCAdd(
192     const GpuInfo& gpu_info, CalculationsPrecision precision,
193     const GraphFloat32& graph, NodeId first_node_id,
194     const std::map<ValueId, TensorDescriptor>& tensor_descriptors,
195     std::set<NodeId>* consumed_nodes, GPUOperationsSubgraph* gpu_subgraph);
196 
197 }  // namespace gpu
198 }  // namespace tflite
199 
200 #endif  // TENSORFLOW_LITE_DELEGATES_GPU_COMMON_TASKS_SPECIAL_FC_FC_ADD_H_
201