/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#include "tensorflow/lite/delegates/gpu/common/tasks/convolution_transposed_3x3_thin.h"

#include <memory>
#include <string>
#include <utility>
#include <vector>

#include "absl/strings/substitute.h"
#include "tensorflow/lite/delegates/gpu/common/precision.h"
#include "tensorflow/lite/delegates/gpu/common/task/buffer_desc.h"
#include "tensorflow/lite/delegates/gpu/common/task/weights_conversion.h"

namespace tflite {
namespace gpu {
namespace {
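// Emits the accumulation code for one 4-channel source value against four
// consecutive weight vectors. For the I4O4 layout every source channel scales
// one weight vector; for O4I4 every destination channel is a dot product with
// one weight vector. In F32_F16 mode the products are computed in FLT
// precision and then converted to the accumulator type.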
std::string ConvInstr(CalculationsPrecision precision, bool is_i4_o4,
                      const std::string& dst_name, const std::string& src_name,
                      int weights_offset) {
  std::string c;
  if (is_i4_o4) {
    switch (precision) {
      case CalculationsPrecision::F32:
      case CalculationsPrecision::F16:
        c += "  $0 += $1.x * args.weights.Read($2); \n";
        c += "  $0 += $1.y * args.weights.Read($3); \n";
        c += "  $0 += $1.z * args.weights.Read($4); \n";
        c += "  $0 += $1.w * args.weights.Read($5); \n";
        break;
      case CalculationsPrecision::F32_F16:
        c += "  $0 += TO_ACCUM_TYPE($1.x * args.weights.Read($2) + $1.y * "
             "args.weights.Read($3) + $1.z * args.weights.Read($4) + $1.w * "
             "args.weights.Read($5)); \n";
        break;
    }
  } else {
    // O4I4
    c += "  $0.x += dot($1, args.weights.Read($2)); \n";
    c += "  $0.y += dot($1, args.weights.Read($3)); \n";
    c += "  $0.z += dot($1, args.weights.Read($4)); \n";
    c += "  $0.w += dot($1, args.weights.Read($5)); \n";
  }
  return absl::Substitute(c, dst_name, src_name, weights_offset,
                          weights_offset + 1, weights_offset + 2,
                          weights_offset + 3);
}
}  // namespace

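// The weights layout depends on the vendor: Apple GPUs consume the custom
// spatial O4I4 layout, all other GPUs the custom spatial I4O4 layout.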
ConvolutionTransposed3x3Thin::ConvolutionTransposed3x3Thin(
    const GpuInfo& gpu_info, const OperationDef& definition,
    const ConvolutionTransposedAttributes& attr)
    : GPUOperation(definition) {
  if (gpu_info.IsApple()) {
    weights_layout_ = WeightsLayout::kOICustomSpatialO4I4;
  } else {
    weights_layout_ = WeightsLayout::kOICustomSpatialI4O4;
  }
  code_ = GenerateConvolutionTransposedCode(
      definition_, gpu_info, DivideRoundUp(attr.weights.shape.i, 4),
      DivideRoundUp(attr.weights.shape.o, 4));
}

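// Generates the kernel source. Every work item reads one source pixel (plus
// its right/bottom neighbors) and produces the corresponding 2x2 block of the
// stride-2 upsampled output for all destination slices.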
std::string ConvolutionTransposed3x3Thin::GenerateConvolutionTransposedCode(
    const OperationDef& op_def, const GpuInfo& gpu_info, int src_depth,
    int dst_depth) {
  AddSrcTensor("src_tensor", op_def.src_tensors[0]);
  AddDstTensor("dst_tensor", op_def.dst_tensors[0]);

  if (op_def.src_tensors.size() == 2) {
    // dynamic weights
    BufferDescriptor desc;
    desc.element_type = op_def.src_tensors[1].GetDataType();
    desc.element_size = 4;
    desc.memory_type = MemoryType::CONSTANT;
    AddSrcBuffer("weights", desc);
  }

  std::string c;

  c += "MAIN_FUNCTION($0) {\n";
  if (op_def.IsBatchSupported()) {
    c += "  int linear_id = GLOBAL_ID_0;\n";
    c += "  int X = linear_id / args.dst_tensor.Batch();\n";
    c += "  int B = linear_id % args.dst_tensor.Batch();\n";
    c += "  args.dst_tensor.SetBatchRef(B);\n";
    c += "  args.src_tensor.SetBatchRef(B);\n";
  } else {
    c += "  int X = GLOBAL_ID_0;\n";
  }
  c += "  int Y = GLOBAL_ID_1;\n";
  c += "  if (X >= args.src_tensor.Width() || Y >= args.src_tensor.Height()) "
       "return;\n";
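  // One 2x2 accumulator block per destination slice.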
  for (int d = 0; d < dst_depth; ++d) {
    const std::string layer = std::to_string(d);
    c += "  ACCUM_FLT4 r" + layer + "[2][2];\n";
    c += "  r" + layer + "[0][0] = INIT_ACCUM_FLT4(0.0f);\n";
    c += "  r" + layer + "[0][1] = INIT_ACCUM_FLT4(0.0f);\n";
    c += "  r" + layer + "[1][0] = INIT_ACCUM_FLT4(0.0f);\n";
    c += "  r" + layer + "[1][1] = INIT_ACCUM_FLT4(0.0f);\n";
  }
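  // Read the 2x2 source neighborhood. Out-of-bounds reads are handled either
  // by hardware zero clamping, by the "-1 address reads zero" path, or by
  // explicit bounds checks, depending on what the source tensor supports.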
  for (int s = 0; s < src_depth; ++s) {
    const std::string z = std::to_string(s);
    c += "  {\n";
    if (op_def.src_tensors[0].SupportsZeroClamp(Axis::WIDTH, gpu_info) &&
        op_def.src_tensors[0].SupportsZeroClamp(Axis::HEIGHT, gpu_info)) {
      c += "  FLT4 src0 = args.src_tensor.Read(X, Y, " + z + ");\n";
      c += "  FLT4 src1 = args.src_tensor.Read(X + 1, Y, " + z + ");\n";
      c += "  FLT4 src2 = args.src_tensor.Read(X, Y + 1, " + z + ");\n";
      c += "  FLT4 src3 = args.src_tensor.Read(X + 1, Y + 1, " + z + ");\n";
    } else if (op_def.src_tensors[0].IsLinear() &&
               op_def.src_tensors[0].ReturnsZeroForNegOneRead(gpu_info)) {
      c += "  int c0 = args.src_tensor.GetAddress(X, Y, " + z + ");\n";
      c += "  int c1 = args.src_tensor.GetAddress(X + 1, Y, " + z + ");\n";
      c += "  int c2 = args.src_tensor.GetAddress(X, Y + 1, " + z + ");\n";
      c += "  int c3 = args.src_tensor.GetAddress(X + 1, Y + 1, " + z + ");\n";
      c += "  bool x_in = X + 1 < args.src_tensor.Width();\n";
      c += "  bool y_in = Y + 1 < args.src_tensor.Height();\n";
      c += "  c1 = select(-1, c1, x_in);\n";
      c += "  c2 = select(-1, c2, y_in);\n";
      c += "  c3 = select(-1, c3, x_in && y_in);\n";
      c += "  FLT4 src0 = args.src_tensor.Read(c0);\n";
      c += "  FLT4 src1 = args.src_tensor.Read(c1);\n";
      c += "  FLT4 src2 = args.src_tensor.Read(c2);\n";
      c += "  FLT4 src3 = args.src_tensor.Read(c3);\n";
    } else {
      // Manual zero clamp
      c += "  bool x_in = X + 1 < args.src_tensor.Width();\n";
      c += "  bool y_in = Y + 1 < args.src_tensor.Height();\n";
      c += "  FLT4 src0 = args.src_tensor.Read(X, Y, " + z + ");\n";
      c += "  FLT4 src1 = INIT_FLT4(0.0);\n";
      c += "  FLT4 src2 = INIT_FLT4(0.0);\n";
      c += "  FLT4 src3 = INIT_FLT4(0.0);\n";
      c += "  if (x_in) {\n";
      c += "    src1 = args.src_tensor.Read(X + 1, Y, " + z + ");\n";
      c += "  }\n";
      c += "  if (y_in) {\n";
      c += "    src2 = args.src_tensor.Read(X, Y + 1, " + z + ");\n";
      c += "  }\n";
      c += "  if (x_in && y_in) {\n";
      c += "    src3 = args.src_tensor.Read(X + 1, Y + 1, " + z + ");\n";
      c += "  }\n";
    }
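    // Accumulate the nine taps of the 3x3 kernel; every (src slice, dst slice)
    // pair uses 36 consecutive weight vectors (9 taps x 4 vectors).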
    for (int d = 0; d < dst_depth; ++d) {
      const std::string layer = std::to_string(d);
      const int filters_index = (s * dst_depth + d) * 36;
      const bool is_i4_o4 = GetWeightsDescription().IsI4O4();
      c += ConvInstr(op_def.precision, is_i4_o4, "r" + layer + "[0][0]", "src0",
                     filters_index);
      c += ConvInstr(op_def.precision, is_i4_o4, "r" + layer + "[0][1]", "src0",
                     filters_index + 4);
      c += ConvInstr(op_def.precision, is_i4_o4, "r" + layer + "[0][1]", "src1",
                     filters_index + 8);
      c += ConvInstr(op_def.precision, is_i4_o4, "r" + layer + "[1][0]", "src0",
                     filters_index + 12);
      c += ConvInstr(op_def.precision, is_i4_o4, "r" + layer + "[1][0]", "src2",
                     filters_index + 16);
      c += ConvInstr(op_def.precision, is_i4_o4, "r" + layer + "[1][1]", "src0",
                     filters_index + 20);
      c += ConvInstr(op_def.precision, is_i4_o4, "r" + layer + "[1][1]", "src1",
                     filters_index + 24);
      c += ConvInstr(op_def.precision, is_i4_o4, "r" + layer + "[1][1]", "src2",
                     filters_index + 28);
      c += ConvInstr(op_def.precision, is_i4_o4, "r" + layer + "[1][1]", "src3",
                     filters_index + 32);
    }
    c += "  }\n";
  }
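  // Write the 2x2 output block at (2 * X, 2 * Y), adding the per-slice bias.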
  c += "  X *= 2;\n";
  c += "  Y *= 2;\n";
  for (int d = 0; d < dst_depth; ++d) {
    const std::string layer = std::to_string(d);
    c += "  {\n";
    c += "  FLT4 bias_val = args.biases.Read(" + layer + ");\n";
    for (int y = 0; y < 2; ++y) {
      for (int x = 0; x < 2; ++x) {
        const std::string x_coord = "X + " + std::to_string(x);
        const std::string y_coord = "Y + " + std::to_string(y);
        c += "  {\n";
        c += "    FLT4 result = TO_FLT4(r" + layer + "[" + std::to_string(y) +
             "][" + std::to_string(x) + "]) + bias_val;\n";
        c += "    args.dst_tensor.Write(result, " + x_coord + ", " + y_coord +
             ", " + layer + ");\n";
        c += "  }\n";
      }
    }
    c += "  }\n";
  }
  c += "}\n";

  return c;
}

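// One work item per source pixel (times batch); each item expands into a 2x2
// output block.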
int3 ConvolutionTransposed3x3Thin::GetGridSize() const {
  const int grid_x = src_[0]->Width() * dst_[0]->Batch();
  const int grid_y = src_[0]->Height();
  const int grid_z = 1;
  return int3(grid_x, grid_y, grid_z);
}

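// Order in which the nine spatial positions of the 3x3 kernel are stored for
// the custom spatial weights layouts, matching the weight offsets emitted in
// GenerateConvolutionTransposedCode.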
std::vector<int> ConvolutionTransposed3x3Thin::GetSpatialWeightsRemap() const {
  return std::vector<int>{4, 5, 3, 7, 1, 8, 6, 2, 0};
}

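// Packs the constant weights into a CONSTANT buffer in the layout selected in
// the constructor and registers it as the "weights" argument.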
void ConvolutionTransposed3x3Thin::UploadWeights(
    const tflite::gpu::Tensor<OHWI, DataType::FLOAT32>& weights) {
  const auto weights_desc = GetWeightsDescription();
  const int flt_count =
      GetTotalElementsCountForLayout(weights_desc, weights.shape);

  BufferDescriptor desc;
  desc.element_type = weights_desc.type;
  desc.element_size = 4;
  desc.memory_type = MemoryType::CONSTANT;
  desc.size = flt_count * SizeOf(desc.element_type);
  desc.data.resize(desc.size);

  RearrangeWeights(weights, weights_desc, absl::MakeSpan(desc.data));

  args_.AddObject("weights",
                  std::make_unique<BufferDescriptor>(std::move(desc)));
}

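// "Thin" covers 3x3 transposed convolutions with stride 2, padding 1, and at
// most 8 output channels (2 destination slices).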
bool IsConvolutionTransposed3x3ThinSupported(
    const ConvolutionTransposedAttributes& attr) {
  return attr.weights.shape.o <= 8 && attr.weights.shape.w == 3 &&
         attr.weights.shape.h == 3 && attr.stride.w == 2 &&
         attr.stride.h == 2 && attr.padding.prepended.w == 1 &&
         attr.padding.prepended.h == 1 && attr.padding.appended.w == 1 &&
         attr.padding.appended.h == 1;
}

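// Creates the operation with constant weights uploaded at construction time.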
ConvolutionTransposed3x3Thin CreateConvolutionTransposed3x3Thin(
    const GpuInfo& gpu_info, const OperationDef& definition,
    const ConvolutionTransposedAttributes& attr) {
  ConvolutionTransposed3x3Thin result(gpu_info, definition, attr);
  result.UploadWeights(attr.weights);

  TensorDescriptor bias_tensor_desc = CreateConstantLinearTensorDescriptor(
      gpu_info, definition.src_tensors[0].GetDataType(), attr.bias);
  result.args_.AddObject("biases", std::make_unique<TensorDescriptor>(
                                       std::move(bias_tensor_desc)));
  return result;
}

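// Creates the operation with runtime (dynamic) weights: the weights arrive as
// a second source buffer instead of being uploaded as a constant object.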
ConvolutionTransposed3x3Thin CreateConvolutionTransposed3x3ThinDynamicWeights(
    const GpuInfo& gpu_info, const OperationDef& definition,
    const ConvolutionTransposedAttributes& attr) {
  OperationDef new_def = definition;
  new_def.src_tensors = {
      definition.src_tensors[0]};  // Keep only the src_tensor def; the weights
                                   // def is added below.
  const DataType weights_type = definition.GetDataType();
  // Add one src_tensor (buffer) for the runtime weights.
  new_def.src_tensors.push_back(
      {weights_type, TensorStorageType::BUFFER, Layout::HWC});
  ConvolutionTransposed3x3Thin result(gpu_info, new_def, attr);

  TensorDescriptor bias_tensor_desc = CreateConstantLinearTensorDescriptor(
      gpu_info, definition.src_tensors[0].GetDataType(), attr.bias);
  result.args_.AddObject("biases", std::make_unique<TensorDescriptor>(
                                       std::move(bias_tensor_desc)));
  return result;
}

}  // namespace gpu
}  // namespace tflite