1 /* Copyright 2021 The TensorFlow Authors. All Rights Reserved.
2 
3 Licensed under the Apache License, Version 2.0 (the "License");
4 you may not use this file except in compliance with the License.
5 You may obtain a copy of the License at
6 
7     http://www.apache.org/licenses/LICENSE-2.0
8 
9 Unless required by applicable law or agreed to in writing, software
10 distributed under the License is distributed on an "AS IS" BASIS,
11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 See the License for the specific language governing permissions and
13 limitations under the License.
14 ==============================================================================*/
15 
16 #ifndef TENSORFLOW_LITE_DELEGATES_GPU_COMMON_TASKS_DEPTHWISE_CONV_3X3_STRIDE_H2_H_
17 #define TENSORFLOW_LITE_DELEGATES_GPU_COMMON_TASKS_DEPTHWISE_CONV_3X3_STRIDE_H2_H_
18 
19 #include <memory>
20 #include <utility>
21 #include <vector>
22 
23 #include "tensorflow/lite/delegates/gpu/common/operations.h"
24 #include "tensorflow/lite/delegates/gpu/common/task/buffer_desc.h"
25 #include "tensorflow/lite/delegates/gpu/common/task/gpu_operation.h"
26 #include "tensorflow/lite/delegates/gpu/common/task/texture2d_desc.h"
27 
28 namespace tflite {
29 namespace gpu {
30 
31 // Depth Wise Convolution for kernel 3x3
32 // require:
33 //   channels_multiplier = 1;
34 //   kernel_size = 3x3;
35 //   dilation.y = 1;
36 //   stride.y = 2;
37 class DepthWiseConv3x3StrideH2 : public GPUOperation {
38  public:
39   DepthWiseConv3x3StrideH2() = default;
40   void GetPossibleKernelWorkGroups(
41       TuningType tuning_type, const GpuInfo& gpu_info,
42       const KernelInfo& kernel_info,
43       std::vector<int3>* work_groups) const override;
44   int3 GetGridSize() const override;
45 
46   // Move only
47   DepthWiseConv3x3StrideH2(DepthWiseConv3x3StrideH2&& kernel) = default;
48   DepthWiseConv3x3StrideH2& operator=(DepthWiseConv3x3StrideH2&& kernel) =
49       default;
50   DepthWiseConv3x3StrideH2(const DepthWiseConv3x3StrideH2&) = delete;
51   DepthWiseConv3x3StrideH2& operator=(const DepthWiseConv3x3StrideH2&) = delete;
52 
53  private:
DepthWiseConv3x3StrideH2(const OperationDef & definition)54   explicit DepthWiseConv3x3StrideH2(const OperationDef& definition)
55       : GPUOperation(definition) {}
56   friend DepthWiseConv3x3StrideH2 CreateDepthWiseConv3x3StrideH2(
57       const OperationDef& definition,
58       const DepthwiseConvolution2DAttributes& attr, const GpuInfo& gpu_info);
59 
60   template <DataType T>
61   void UploadWeightsAndBiases(const tflite::gpu::Tensor<OHWI, T>& weights,
62                               const tflite::gpu::Tensor<Linear, T>& biases,
63                               bool weights_are_buffer);
64   template <DataType S, typename T>
65   void RearrangeWeightsAndBiasesData(
66       const tflite::gpu::Tensor<OHWI, S>& weights,
67       const tflite::gpu::Tensor<Linear, S>& biases, absl::Span<T> dst);
68 
69   bool local_mem_uploads_;
70 };
71 
72 template <DataType T>
UploadWeightsAndBiases(const tflite::gpu::Tensor<OHWI,T> & weights,const tflite::gpu::Tensor<Linear,T> & biases,bool weights_are_buffer)73 void DepthWiseConv3x3StrideH2::UploadWeightsAndBiases(
74     const tflite::gpu::Tensor<OHWI, T>& weights,
75     const tflite::gpu::Tensor<Linear, T>& biases, bool weights_are_buffer) {
76   const int src_depth = DivideRoundUp(weights.shape.i, 4);
77   int texture_width = 10;  // 3x3 kernel + 1 bias
78   int texture_height = src_depth;
79   const int elements_count = texture_width * texture_height;
80   const bool fp32_weights = definition_.precision == CalculationsPrecision::F32;
81   const int float4_size = fp32_weights ? 16 : 8;
82 
83   std::vector<uint8_t> data(float4_size * elements_count);
84   if (fp32_weights) {
85     float4* ptr = reinterpret_cast<float4*>(data.data());
86     RearrangeWeightsAndBiasesData(weights, biases,
87                                   absl::MakeSpan(ptr, elements_count));
88   } else {
89     half4* ptr = reinterpret_cast<half4*>(data.data());
90     RearrangeWeightsAndBiasesData(weights, biases,
91                                   absl::MakeSpan(ptr, elements_count));
92   }
93 
94   if (weights_are_buffer) {
95     BufferDescriptor desc;
96     desc.element_type = fp32_weights ? DataType::FLOAT32 : DataType::FLOAT16;
97     desc.element_size = 4;
98     desc.size = float4_size * elements_count;
99     desc.data = std::move(data);
100     args_.AddObject("weights",
101                     std::make_unique<BufferDescriptor>(std::move(desc)));
102   } else {
103     TensorDescriptor desc = CreateConstantHWVec4TensorDescriptor(
104         fp32_weights ? DataType::FLOAT32 : DataType::FLOAT16,
105         TensorStorageType::TEXTURE_2D, texture_width, texture_height,
106         data.data());
107     args_.AddObject("weights", std::make_unique<TensorDescriptor>(desc));
108   }
109 }
110 
111 template <DataType S, typename T>
RearrangeWeightsAndBiasesData(const tflite::gpu::Tensor<OHWI,S> & weights,const tflite::gpu::Tensor<Linear,S> & biases,absl::Span<T> dst)112 void DepthWiseConv3x3StrideH2::RearrangeWeightsAndBiasesData(
113     const tflite::gpu::Tensor<OHWI, S>& weights,
114     const tflite::gpu::Tensor<Linear, S>& biases, absl::Span<T> dst) {
115   const int src_depth = DivideRoundUp(weights.shape.i, 4);
116 
117   int counter = 0;
118   for (int s = 0; s < src_depth; ++s) {
119     for (int y = 0; y < 3; ++y) {
120       for (int x = 0; x < 3; ++x) {
121         T filter_val;
122         for (int i = 0; i < 4; ++i) {
123           const int s_ch = s * 4 + i;
124           if (s_ch < weights.shape.i) {
125             const int f_index = weights.shape.LinearIndex({0, y, x, s_ch});
126             filter_val[i] = weights.data[f_index];
127           } else {
128             filter_val[i] = 0.0f;
129           }
130         }
131         dst[counter++] = filter_val;
132       }
133     }
134 
135     T bias_val;
136     for (int i = 0; i < 4; ++i) {
137       const int dst_ch = s * 4 + i;
138       bias_val[i] = dst_ch >= biases.shape.v ? 0.0f : biases.data[dst_ch];
139     }
140     dst[counter++] = bias_val;
141   }
142 }
143 
144 DepthWiseConv3x3StrideH2 CreateDepthWiseConv3x3StrideH2(
145     const OperationDef& definition,
146     const DepthwiseConvolution2DAttributes& attr, const GpuInfo& gpu_info);
147 
148 bool IsDepthWiseConv3x3StrideH2Supported(
149     const DepthwiseConvolution2DAttributes& attr);
150 
151 }  // namespace gpu
152 }  // namespace tflite
153 
154 #endif  // TENSORFLOW_LITE_DELEGATES_GPU_COMMON_TASKS_DEPTHWISE_CONV_3X3_STRIDE_H2_H_
155