/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#ifndef TENSORFLOW_LITE_DELEGATES_GPU_COMMON_TASKS_SPECIAL_FC_FC_ADD_H_
#define TENSORFLOW_LITE_DELEGATES_GPU_COMMON_TASKS_SPECIAL_FC_FC_ADD_H_

#include <stdint.h>

#include <map>
#include <memory>
#include <set>
#include <string>
#include <utility>
#include <vector>

#include "tensorflow/lite/delegates/gpu/common/data_type.h"
#include "tensorflow/lite/delegates/gpu/common/model.h"
#include "tensorflow/lite/delegates/gpu/common/operations.h"
#include "tensorflow/lite/delegates/gpu/common/selectors/subgraph.h"
#include "tensorflow/lite/delegates/gpu/common/shape.h"
#include "tensorflow/lite/delegates/gpu/common/task/buffer_desc.h"
#include "tensorflow/lite/delegates/gpu/common/task/gpu_operation.h"
#include "tensorflow/lite/delegates/gpu/common/task/texture2d_desc.h"
#include "tensorflow/lite/delegates/gpu/common/tensor.h"
#include "tensorflow/lite/delegates/gpu/common/types.h"
#include "tensorflow/lite/delegates/gpu/common/util.h"

namespace tflite {
namespace gpu {

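// Repacks fully connected weights (OHWI layout with H = W = 1) into the
// IOO4I4 layout used by the buffer-based kernel: input (I) and output (O)
// channels are padded to multiples of 4 and written as 4x4 blocks, with
// input-channel blocks forming the outermost dimension. Positions outside
// the real weight matrix are zero-filled.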
template <DataType T, typename S>
void RearrangeFCWeightsToIOO4I4(const tflite::gpu::Tensor<OHWI, T>& weights,
                                S* dst) {
  const int src_channels = weights.shape.i;
  const int padded_src_channels = AlignByN(src_channels, 4);
  const int dst_channels = weights.shape.o;
  const int padded_dst_channels = AlignByN(dst_channels, 4);

  for (int block_y = 0; 4 * block_y < padded_dst_channels; block_y++) {
    for (int y_in_block = 0; y_in_block < 4; y_in_block++) {
      for (int block_x = 0; 4 * block_x < padded_src_channels; block_x++) {
        for (int x_in_block = 0; x_in_block < 4; x_in_block++) {
          int y = 4 * block_y + y_in_block;
          int x = 4 * block_x + x_in_block;
          int dst_index = block_x * padded_dst_channels * 4 + block_y * 16 +
                          x_in_block * 4 + y_in_block;
          if (x < src_channels && y < dst_channels) {
            dst[dst_index] = weights.data[src_channels * y + x];
          } else {
            dst[dst_index] = 0.0f;
          }
        }
      }
    }
  }
}

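// Repacks fully connected weights (OHWI layout with H = W = 1) into the
// OIO4I4 layout used by the texture-based kernel: output-channel blocks of 4
// form the outer dimension, input-channel blocks of 4 the next one, and each
// 4x4 block is written input-channel-major. Out-of-range positions are
// zero-filled.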
template <DataType T, typename S>
void RearrangeFCWeightsToOIO4I4(const tflite::gpu::Tensor<OHWI, T>& weights,
                                S* dst) {
  const int src_channels = weights.shape.i;
  const int src_depth = DivideRoundUp(src_channels, 4);
  const int dst_channels = weights.shape.o;
  const int dst_depth = DivideRoundUp(dst_channels, 4);

  int counter = 0;
  for (int d = 0; d < dst_depth; ++d) {
    for (int s = 0; s < src_depth; ++s) {
      for (int i = 0; i < 4; ++i) {
        const int src_ch = s * 4 + i;
        for (int j = 0; j < 4; ++j) {
          const int dst_ch = d * 4 + j;
          if (src_ch < src_channels && dst_ch < dst_channels) {
            dst[counter++] = weights.data[dst_ch * src_channels + src_ch];
          } else {
            dst[counter++] = 0.0f;
          }
        }
      }
    }
  }
}

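// GPU operation that fuses two fully connected layers whose outputs feed a
// common Add: both matrix products and the final addition run in one kernel,
// avoiding an intermediate tensor. Instances are created through the
// CreateFCFCAdd() factories declared below.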
class FCFCAdd : public GPUOperation {
 public:
  FCFCAdd() = default;
  void GetPossibleKernelWorkGroups(
      TuningType tuning_type, const GpuInfo& gpu_info,
      const KernelInfo& kernel_info,
      std::vector<int3>* work_groups) const override {
    work_groups->push_back(work_group_size_);
  }
  int3 GetGridSize() const override;

  // Move only
  FCFCAdd(FCFCAdd&& kernel);
  FCFCAdd& operator=(FCFCAdd&& kernel);
  FCFCAdd(const FCFCAdd&) = delete;
  FCFCAdd& operator=(const FCFCAdd&) = delete;

 private:
  FCFCAdd(const OperationDef& definition, const GpuInfo& gpu_info);
  friend FCFCAdd CreateFCFCAdd(const GpuInfo& gpu_info,
                               const OperationDef& definition,
                               const FullyConnectedAttributes& attr0,
                               const FullyConnectedAttributes& attr1);
  friend FCFCAdd CreateFCFCAdd(const GpuInfo& gpu_info,
                               const OperationDef& definition,
                               const FullyConnectedInt8Attributes& attr0,
                               const FullyConnectedInt8Attributes& attr1);

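  // Uploads INT8 weights for the fully connected pass selected by `index`,
  // together with its quantization scale and zero point.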
  void UploadQuantizedWeights(
      const tflite::gpu::Tensor<OHWI, DataType::INT8>& weights, float scale,
      float zero_point, int index);

  template <DataType T>
  void UploadWeights(const tflite::gpu::Tensor<OHWI, T>& weights,
                     const std::string& name, bool weights_are_buffer);

  std::string GetFCFCAddKernelCode(const OperationDef& op_def,
                                   const GpuInfo& gpu_info,
                                   bool weights_are_buffer, bool quantized_0,
                                   bool quantized_1);
};

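// Rearranges and uploads float weights for one fully connected pass. Buffer
// storage uses the IOO4I4 layout; texture storage uses the OIO4I4 layout in a
// constant HW-vec4 2D texture. Data is stored as FP32 or FP16 depending on
// the calculation precision of the operation.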
template <DataType T>
void FCFCAdd::UploadWeights(const tflite::gpu::Tensor<OHWI, T>& weights,
                            const std::string& name, bool weights_are_buffer) {
  const int src_depth = DivideRoundUp(weights.shape.i, 4);
  const int dst_depth = DivideRoundUp(weights.shape.o, 4);

  const int elements_count = src_depth * dst_depth * 4;
  const bool f32_weights = definition_.precision == CalculationsPrecision::F32;

  const int float4_size = f32_weights ? 16 : 8;

  if (weights_are_buffer) {
    BufferDescriptor desc;
    desc.element_type = f32_weights ? DataType::FLOAT32 : DataType::FLOAT16;
    desc.element_size = 16;
    desc.size = float4_size * elements_count;
    desc.data.resize(desc.size);

    if (f32_weights) {
      float* ptr = reinterpret_cast<float*>(desc.data.data());
      RearrangeFCWeightsToIOO4I4(weights, ptr);
    } else {
      half* ptr = reinterpret_cast<half*>(desc.data.data());
      RearrangeFCWeightsToIOO4I4(weights, ptr);
    }

    args_.AddObject(name, std::make_unique<BufferDescriptor>(std::move(desc)));
  } else {
    std::vector<uint8_t> data(float4_size * elements_count);
    if (f32_weights) {
      float* ptr = reinterpret_cast<float*>(data.data());
      RearrangeFCWeightsToOIO4I4(weights, ptr);
    } else {
      half* ptr = reinterpret_cast<half*>(data.data());
      RearrangeFCWeightsToOIO4I4(weights, ptr);
    }

    TensorDescriptor desc = CreateConstantHWVec4TensorDescriptor(
        f32_weights ? DataType::FLOAT32 : DataType::FLOAT16,
        TensorStorageType::TEXTURE_2D, src_depth * 4, dst_depth, data.data());

    args_.AddObject(name, std::make_unique<TensorDescriptor>(std::move(desc)));
  }
}

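// Factory functions for the float and INT8-quantized variants; attr0 and
// attr1 carry the weights and biases of the two fused fully connected layers.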
FCFCAdd CreateFCFCAdd(const GpuInfo& gpu_info, const OperationDef& definition,
                      const FullyConnectedAttributes& attr0,
                      const FullyConnectedAttributes& attr1);

FCFCAdd CreateFCFCAdd(const GpuInfo& gpu_info, const OperationDef& definition,
                      const FullyConnectedInt8Attributes& attr0,
                      const FullyConnectedInt8Attributes& attr1);

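// Tries to recognize the FC + FC + Add pattern starting at `first_node_id`.
// On success, a fused FCFCAdd is added to `gpu_subgraph` and the matched
// nodes are recorded in `consumed_nodes`; otherwise a non-OK status is
// returned and nothing is consumed.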
absl::Status TryFCFCAdd(
    const GpuInfo& gpu_info, CalculationsPrecision precision,
    const GraphFloat32& graph, NodeId first_node_id,
    const std::map<ValueId, TensorDescriptor>& tensor_descriptors,
    std::set<NodeId>* consumed_nodes, GPUOperationsSubgraph* gpu_subgraph);

}  // namespace gpu
}  // namespace tflite

#endif  // TENSORFLOW_LITE_DELEGATES_GPU_COMMON_TASKS_SPECIAL_FC_FC_ADD_H_