/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#include "tensorflow/lite/delegates/gpu/common/tasks/convolution_transposed_3x3_thin.h"

#include <memory>
#include <string>
#include <utility>
#include <vector>

#include "absl/strings/substitute.h"
#include "tensorflow/lite/delegates/gpu/common/precision.h"
#include "tensorflow/lite/delegates/gpu/common/task/buffer_desc.h"
#include "tensorflow/lite/delegates/gpu/common/task/weights_conversion.h"

namespace tflite {
namespace gpu {
namespace {
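// Emits the kernel source that accumulates one FLT4 source value against four
// consecutive weight vectors starting at `weights_offset`. The emitted form
// depends on the weights layout (I4O4 vs. O4I4) and the calculation precision.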
std::string ConvInstr(CalculationsPrecision precision, bool is_i4_o4,
                      const std::string& dst_name, const std::string& src_name,
                      int weights_offset) {
  std::string c;
  if (is_i4_o4) {
    switch (precision) {
      case CalculationsPrecision::F32:
      case CalculationsPrecision::F16:
        c += "    $0 += $1.x * args.weights.Read($2); \n";
        c += "    $0 += $1.y * args.weights.Read($3); \n";
        c += "    $0 += $1.z * args.weights.Read($4); \n";
        c += "    $0 += $1.w * args.weights.Read($5); \n";
        break;
      case CalculationsPrecision::F32_F16:
        c += "    $0 += TO_ACCUM_TYPE($1.x * args.weights.Read($2) + $1.y * "
             "args.weights.Read($3) + $1.z * args.weights.Read($4) + $1.w * "
             "args.weights.Read($5)); \n";
        break;
    }
  } else {
    // O4I4
    c += "    $0.x += dot($1, args.weights.Read($2)); \n";
    c += "    $0.y += dot($1, args.weights.Read($3)); \n";
    c += "    $0.z += dot($1, args.weights.Read($4)); \n";
    c += "    $0.w += dot($1, args.weights.Read($5)); \n";
  }
  return absl::Substitute(c, dst_name, src_name, weights_offset,
                          weights_offset + 1, weights_offset + 2,
                          weights_offset + 3);
}
}  // namespace

ConvolutionTransposed3x3Thin::ConvolutionTransposed3x3Thin(
    const GpuInfo& gpu_info, const OperationDef& definition,
    const ConvolutionTransposedAttributes& attr)
    : GPUOperation(definition) {
  if (gpu_info.IsApple()) {
    weights_layout_ = WeightsLayout::kOICustomSpatialO4I4;
  } else {
    weights_layout_ = WeightsLayout::kOICustomSpatialI4O4;
  }
  code_ = GenerateConvolutionTransposedCode(
      definition_, gpu_info, DivideRoundUp(attr.weights.shape.i, 4),
      DivideRoundUp(attr.weights.shape.o, 4));
}

std::string ConvolutionTransposed3x3Thin::GenerateConvolutionTransposedCode(
    const OperationDef& op_def, const GpuInfo& gpu_info, int src_depth,
    int dst_depth) {
  AddSrcTensor("src_tensor", op_def.src_tensors[0]);
  AddDstTensor("dst_tensor", op_def.dst_tensors[0]);

  if (op_def.src_tensors.size() == 2) {
    // dynamic weights
    BufferDescriptor desc;
    desc.element_type = op_def.src_tensors[1].GetDataType();
    desc.element_size = 4;
    desc.memory_type = MemoryType::CONSTANT;
    AddSrcBuffer("weights", desc);
  }

  std::string c;

  c += "MAIN_FUNCTION($0) {\n";
  if (op_def.IsBatchSupported()) {
    c += "  int linear_id = GLOBAL_ID_0;\n";
    c += "  int X = linear_id / args.dst_tensor.Batch();\n";
    c += "  int B = linear_id % args.dst_tensor.Batch();\n";
    c += "  args.dst_tensor.SetBatchRef(B);\n";
    c += "  args.src_tensor.SetBatchRef(B);\n";
  } else {
    c += "  int X = GLOBAL_ID_0;\n";
  }
  c += "  int Y = GLOBAL_ID_1;\n";
  c += "  if (X >= args.src_tensor.Width() || Y >= args.src_tensor.Height()) "
       "return;\n";
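  // Each work item produces a 2x2 block of output pixels for every
  // destination slice; r<d>[y][x] accumulates the result for that block.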
  for (int d = 0; d < dst_depth; ++d) {
    const std::string layer = std::to_string(d);
    c += "  ACCUM_FLT4 r" + layer + "[2][2];\n";
    c += "  r" + layer + "[0][0] = INIT_ACCUM_FLT4(0.0f);\n";
    c += "  r" + layer + "[0][1] = INIT_ACCUM_FLT4(0.0f);\n";
    c += "  r" + layer + "[1][0] = INIT_ACCUM_FLT4(0.0f);\n";
    c += "  r" + layer + "[1][1] = INIT_ACCUM_FLT4(0.0f);\n";
  }
  for (int s = 0; s < src_depth; ++s) {
    const std::string z = std::to_string(s);
    c += "  {\n";
    if (op_def.src_tensors[0].SupportsZeroClamp(Axis::WIDTH, gpu_info) &&
        op_def.src_tensors[0].SupportsZeroClamp(Axis::HEIGHT, gpu_info)) {
      c += "  FLT4 src0 = args.src_tensor.Read(X, Y, " + z + ");\n";
      c += "  FLT4 src1 = args.src_tensor.Read(X + 1, Y, " + z + ");\n";
      c += "  FLT4 src2 = args.src_tensor.Read(X, Y + 1, " + z + ");\n";
      c += "  FLT4 src3 = args.src_tensor.Read(X + 1, Y + 1, " + z + ");\n";
    } else if (op_def.src_tensors[0].IsLinear() &&
               op_def.src_tensors[0].ReturnsZeroForNegOneRead(gpu_info)) {
      c += "  int c0 = args.src_tensor.GetAddress(X, Y, " + z + ");\n";
      c += "  int c1 = args.src_tensor.GetAddress(X + 1, Y, " + z + ");\n";
      c += "  int c2 = args.src_tensor.GetAddress(X, Y + 1, " + z + ");\n";
      c += "  int c3 = args.src_tensor.GetAddress(X + 1, Y + 1, " + z + ");\n";
      c += "  bool x_in = X + 1 < args.src_tensor.Width();\n";
      c += "  bool y_in = Y + 1 < args.src_tensor.Height();\n";
      c += "  c1 = select(-1, c1, x_in);\n";
      c += "  c2 = select(-1, c2, y_in);\n";
      c += "  c3 = select(-1, c3, x_in && y_in);\n";
      c += "  FLT4 src0 = args.src_tensor.Read(c0);\n";
      c += "  FLT4 src1 = args.src_tensor.Read(c1);\n";
      c += "  FLT4 src2 = args.src_tensor.Read(c2);\n";
      c += "  FLT4 src3 = args.src_tensor.Read(c3);\n";
    } else {
      // Manual zero clamp
      c += "  bool x_in = X + 1 < args.src_tensor.Width();\n";
      c += "  bool y_in = Y + 1 < args.src_tensor.Height();\n";
      c += "  FLT4 src0 = args.src_tensor.Read(X, Y, " + z + ");\n";
      c += "  FLT4 src1 = INIT_FLT4(0.0);\n";
      c += "  FLT4 src2 = INIT_FLT4(0.0);\n";
      c += "  FLT4 src3 = INIT_FLT4(0.0);\n";
      c += "  if (x_in) {\n";
      c += "    src1 = args.src_tensor.Read(X + 1, Y, " + z + ");\n";
      c += "  }\n";
      c += "  if (y_in) {\n";
      c += "    src2 = args.src_tensor.Read(X, Y + 1, " + z + ");\n";
      c += "  }\n";
      c += "  if (x_in && y_in) {\n";
      c += "    src3 = args.src_tensor.Read(X + 1, Y + 1, " + z + ");\n";
      c += "  }\n";
    }
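    // Nine spatial taps per (src slice, dst slice) pair, each ConvInstr
    // consuming four consecutive weight reads, hence the stride of 36.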
    for (int d = 0; d < dst_depth; ++d) {
      const std::string layer = std::to_string(d);
      const int filters_index = (s * dst_depth + d) * 36;
      const bool is_i4_o4 = GetWeightsDescription().IsI4O4();
      c += ConvInstr(op_def.precision, is_i4_o4, "r" + layer + "[0][0]", "src0",
                     filters_index);
      c += ConvInstr(op_def.precision, is_i4_o4, "r" + layer + "[0][1]", "src0",
                     filters_index + 4);
      c += ConvInstr(op_def.precision, is_i4_o4, "r" + layer + "[0][1]", "src1",
                     filters_index + 8);
      c += ConvInstr(op_def.precision, is_i4_o4, "r" + layer + "[1][0]", "src0",
                     filters_index + 12);
      c += ConvInstr(op_def.precision, is_i4_o4, "r" + layer + "[1][0]", "src2",
                     filters_index + 16);
      c += ConvInstr(op_def.precision, is_i4_o4, "r" + layer + "[1][1]", "src0",
                     filters_index + 20);
      c += ConvInstr(op_def.precision, is_i4_o4, "r" + layer + "[1][1]", "src1",
                     filters_index + 24);
      c += ConvInstr(op_def.precision, is_i4_o4, "r" + layer + "[1][1]", "src2",
                     filters_index + 28);
      c += ConvInstr(op_def.precision, is_i4_o4, "r" + layer + "[1][1]", "src3",
                     filters_index + 32);
    }
    c += "  }\n";
  }
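  // The source coordinate maps to the top-left corner of a 2x2 block in the
  // destination (stride 2); add the bias and write the four results.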
  c += "  X *= 2;\n";
  c += "  Y *= 2;\n";
  for (int d = 0; d < dst_depth; ++d) {
    const std::string layer = std::to_string(d);
    c += "  {\n";
    c += "  FLT4 bias_val = args.biases.Read(" + layer + ");\n";
    for (int y = 0; y < 2; ++y) {
      for (int x = 0; x < 2; ++x) {
        const std::string x_coord = "X + " + std::to_string(x);
        const std::string y_coord = "Y + " + std::to_string(y);
        c += "  {\n";
        c += "    FLT4 result = TO_FLT4(r" + layer + "[" + std::to_string(y) +
             "][" + std::to_string(x) + "]) + bias_val;\n";
        c += "    args.dst_tensor.Write(result, " + x_coord + ", " + y_coord +
             ", " + layer + ");\n";
        c += "  }\n";
      }
    }
    c += "  }\n";
  }
  c += "}\n";

  return c;
}

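// The grid is sized to the source resolution since every work item expands
// into a 2x2 block of destination pixels.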
int3 ConvolutionTransposed3x3Thin::GetGridSize() const {
  const int grid_x = src_[0]->Width() * dst_[0]->Batch();
  const int grid_y = src_[0]->Height();
  const int grid_z = 1;
  return int3(grid_x, grid_y, grid_z);
}

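// Order in which the nine 3x3 spatial taps are stored in the custom weights
// layout consumed by the generated code.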
std::vector<int> ConvolutionTransposed3x3Thin::GetSpatialWeightsRemap() const {
  return std::vector<int>{4, 5, 3, 7, 1, 8, 6, 2, 0};
}

void ConvolutionTransposed3x3Thin::UploadWeights(
    const tflite::gpu::Tensor<OHWI, DataType::FLOAT32>& weights) {
  const auto weights_desc = GetWeightsDescription();
  const int flt_count =
      GetTotalElementsCountForLayout(weights_desc, weights.shape);

  BufferDescriptor desc;
  desc.element_type = weights_desc.type;
  desc.element_size = 4;
  desc.memory_type = MemoryType::CONSTANT;
  desc.size = flt_count * SizeOf(desc.element_type);
  desc.data.resize(desc.size);

  RearrangeWeights(weights, weights_desc, absl::MakeSpan(desc.data));

  args_.AddObject("weights",
                  std::make_unique<BufferDescriptor>(std::move(desc)));
}

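// The "thin" specialization handles 3x3 kernels with stride 2 and padding 1
// that produce at most 8 output channels (two destination slices).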
bool IsConvolutionTransposed3x3ThinSupported(
    const ConvolutionTransposedAttributes& attr) {
  return attr.weights.shape.o <= 8 && attr.weights.shape.w == 3 &&
         attr.weights.shape.h == 3 && attr.stride.w == 2 &&
         attr.stride.h == 2 && attr.padding.prepended.w == 1 &&
         attr.padding.prepended.h == 1 && attr.padding.appended.w == 1 &&
         attr.padding.appended.h == 1;
}

ConvolutionTransposed3x3Thin CreateConvolutionTransposed3x3Thin(
    const GpuInfo& gpu_info, const OperationDef& definition,
    const ConvolutionTransposedAttributes& attr) {
  ConvolutionTransposed3x3Thin result(gpu_info, definition, attr);
  result.UploadWeights(attr.weights);

  TensorDescriptor bias_tensor_desc = CreateConstantLinearTensorDescriptor(
      gpu_info, definition.src_tensors[0].GetDataType(), attr.bias);
  result.args_.AddObject("biases", std::make_unique<TensorDescriptor>(
                                       std::move(bias_tensor_desc)));
  return result;
}

ConvolutionTransposed3x3Thin CreateConvolutionTransposed3x3ThinDynamicWeights(
    const GpuInfo& gpu_info, const OperationDef& definition,
    const ConvolutionTransposedAttributes& attr) {
  OperationDef new_def = definition;
  // Keep only the src_tensor definition; the weights definition is added
  // below.
  new_def.src_tensors = {definition.src_tensors[0]};
  const DataType weights_type = definition.GetDataType();
  // Add one src_tensor (buffer) for the dynamic weights.
  new_def.src_tensors.push_back(
      {weights_type, TensorStorageType::BUFFER, Layout::HWC});
  ConvolutionTransposed3x3Thin result(gpu_info, new_def, attr);

  TensorDescriptor bias_tensor_desc = CreateConstantLinearTensorDescriptor(
      gpu_info, definition.src_tensors[0].GetDataType(), attr.bias);
  result.args_.AddObject("biases", std::make_unique<TensorDescriptor>(
                                       std::move(bias_tensor_desc)));
  return result;
}

}  // namespace gpu
}  // namespace tflite