/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#ifndef TENSORFLOW_LITE_DELEGATES_GPU_COMMON_TASKS_CONV_CONSTANTS_H_
#define TENSORFLOW_LITE_DELEGATES_GPU_COMMON_TASKS_CONV_CONSTANTS_H_

#include <algorithm>
#include <memory>
#include <utility>

#include "absl/types/span.h"
#include "tensorflow/lite/delegates/gpu/common/data_type.h"
#include "tensorflow/lite/delegates/gpu/common/operations.h"
#include "tensorflow/lite/delegates/gpu/common/shape.h"
#include "tensorflow/lite/delegates/gpu/common/status.h"
#include "tensorflow/lite/delegates/gpu/common/task/buffer_desc.h"
#include "tensorflow/lite/delegates/gpu/common/task/gpu_operation.h"
#include "tensorflow/lite/delegates/gpu/common/tensor.h"
#include "tensorflow/lite/delegates/gpu/common/types.h"

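// Helpers for ConvConstants: a 2D convolution that keeps its weights in
// constant (or, where unavailable, global) GPU memory.
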
namespace tflite {
namespace gpu {

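// Packs OHWI weights for the multiply-add variant of ConvConstants. Each
// vector written to dst holds one input channel's weight for a group of 4
// consecutive output channels; out-of-range channels are zero-padded.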
template <DataType S, typename T>
void RearrangeWeightsForConvConstants(
    const tflite::gpu::Tensor<OHWI, S>& weights, absl::Span<T> dst) {
  const int dst_depth = DivideRoundUp(weights.shape.o, 4);
  const int src_depth = DivideRoundUp(weights.shape.i, 4);
  const int kernel_x = weights.shape.w;
  const int kernel_y = weights.shape.h;

  int counter = 0;
  for (int s = 0; s < src_depth; ++s) {
    for (int y = 0; y < kernel_y; ++y) {
      for (int x = 0; x < kernel_x; ++x) {
        for (int d = 0; d < dst_depth; ++d) {
          // Valid input channels remaining in this source slice.
          const int channels_count = std::min(4, weights.shape.i - s * 4);
          T filters[4];
          for (int i = 0; i < 4; ++i) {
            for (int j = 0; j < channels_count; ++j) {
              const int s_ch = s * 4 + j;
              const int d_ch = d * 4 + i;
              if (s_ch < weights.shape.i && d_ch < weights.shape.o) {
                const int f_index =
                    weights.shape.LinearIndex({d_ch, y, x, s_ch});
                filters[j][i] = weights.data[f_index];
              } else {
                filters[j][i] = 0.0f;
              }
            }
          }
          for (int i = 0; i < channels_count; ++i) {
            dst[counter++] = filters[i];
          }
        }
      }
    }
  }
}

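// Packs OHWI weights for the dot-product variant of ConvConstants. Each
// vector written to dst holds the 4 input-channel weights for one output
// channel, which suits a dot-product accumulation in the shader.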
template <DataType S, typename T>
void RearrangeWeightsForConvConstantsDot(
    const tflite::gpu::Tensor<OHWI, S>& weights, absl::Span<T> dst) {
  const int dst_depth = DivideRoundUp(weights.shape.o, 4);
  const int src_depth = DivideRoundUp(weights.shape.i, 4);
  const int kernel_x = weights.shape.w;
  const int kernel_y = weights.shape.h;

  int counter = 0;
  for (int s = 0; s < src_depth; ++s) {
    for (int y = 0; y < kernel_y; ++y) {
      for (int x = 0; x < kernel_x; ++x) {
        for (int d = 0; d < dst_depth; ++d) {
          // Valid output channels remaining in this destination slice.
          const int channels_count = std::min(4, weights.shape.o - d * 4);
          T filters[4];
          for (int j = 0; j < channels_count; ++j) {
            for (int i = 0; i < 4; ++i) {
              const int s_ch = s * 4 + i;
              const int d_ch = d * 4 + j;
              if (s_ch < weights.shape.i && d_ch < weights.shape.o) {
                const int f_index =
                    weights.shape.LinearIndex({d_ch, y, x, s_ch});
                filters[j][i] = weights.data[f_index];
              } else {
                filters[j][i] = 0.0f;
              }
            }
          }
          for (int i = 0; i < channels_count; ++i) {
            dst[counter++] = filters[i];
          }
        }
      }
    }
  }
}

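// Rearranges weights with one of the functions above and attaches them to
// the operation as a "weights" buffer object. Constant memory is used where
// the API supports it (OpenCL, Metal); otherwise the buffer is global.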
template <DataType T>
void UploadWeightsForConvConstants(const tflite::gpu::Tensor<OHWI, T>& weights,
                                   const GpuInfo& gpu_info,
                                   CalculationsPrecision precision,
                                   bool use_dot_conv, GPUOperation* op) {
  const int src_depth = DivideRoundUp(weights.shape.i, 4);
  const int dst_depth = DivideRoundUp(weights.shape.o, 4);
  const int kernel_x = weights.shape.w;
  const int kernel_y = weights.shape.h;

  const bool f32_weights = precision == CalculationsPrecision::F32;
  const int float_size = f32_weights ? 4 : 2;
  // Channel count after padding the vectorized dimension to a multiple of 4.
  const int aligned_ch_count = use_dot_conv ? weights.shape.o * src_depth * 4
                                            : weights.shape.i * dst_depth * 4;
  const int float_count = aligned_ch_count * kernel_x * kernel_y;

  BufferDescriptor desc;
  desc.element_type = f32_weights ? DataType::FLOAT32 : DataType::FLOAT16;
  desc.element_size = 4;
  if (gpu_info.IsApiOpenCl() || gpu_info.IsApiMetal()) {
    desc.memory_type = MemoryType::CONSTANT;
  } else {
    desc.memory_type = MemoryType::GLOBAL;
  }
  desc.size = float_size * float_count;
  desc.data.resize(desc.size);

  if (f32_weights) {
    float4* ptr = reinterpret_cast<float4*>(desc.data.data());
    if (use_dot_conv) {
      RearrangeWeightsForConvConstantsDot(weights,
                                          absl::MakeSpan(ptr, float_count / 4));
    } else {
      RearrangeWeightsForConvConstants(weights,
                                       absl::MakeSpan(ptr, float_count / 4));
    }
  } else {
    half4* ptr = reinterpret_cast<half4*>(desc.data.data());
    if (use_dot_conv) {
      RearrangeWeightsForConvConstantsDot(weights,
                                          absl::MakeSpan(ptr, float_count / 4));
    } else {
      RearrangeWeightsForConvConstants(weights,
                                       absl::MakeSpan(ptr, float_count / 4));
    }
  }

  op->args_.AddObject("weights",
                      std::make_unique<BufferDescriptor>(std::move(desc)));
}

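// Returns true if ConvConstants can be used for the given convolution on
// this device.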
bool IsConvConstantsSupported(const GpuInfo& gpu_info,
                              const OperationDef& definition,
                              const Convolution2DAttributes& attr);

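// Creates a ConvConstants operation. Callers are expected to verify support
// with IsConvConstantsSupported first; a minimal usage sketch:
//   if (IsConvConstantsSupported(gpu_info, definition, attr)) {
//     GPUOperation op = CreateConvConstants(gpu_info, definition, attr);
//   }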
GPUOperation CreateConvConstants(const GpuInfo& gpu_info,
                                 const OperationDef& definition,
                                 const Convolution2DAttributes& attr);

}  // namespace gpu
}  // namespace tflite

#endif  // TENSORFLOW_LITE_DELEGATES_GPU_COMMON_TASKS_CONV_CONSTANTS_H_