1 /* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
2
3 Licensed under the Apache License, Version 2.0 (the "License");
4 you may not use this file except in compliance with the License.
5 You may obtain a copy of the License at
6
7 http://www.apache.org/licenses/LICENSE-2.0
8
9 Unless required by applicable law or agreed to in writing, software
10 distributed under the License is distributed on an "AS IS" BASIS,
11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 See the License for the specific language governing permissions and
13 limitations under the License.
14 ==============================================================================*/
15 #ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_DEPTHWISECONV_UINT8_H_
16 #define TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_DEPTHWISECONV_UINT8_H_
17
18 #include <algorithm>
19
20 #include "fixedpoint/fixedpoint.h"
21 #include "tensorflow/lite/kernels/internal/common.h"
22 #include "tensorflow/lite/kernels/internal/compatibility.h"
23 #include "tensorflow/lite/kernels/internal/types.h"
24
25 namespace tflite {
26
27 // Used in tests and template parameters to control which version of depthwise
28 // convolution is called. Primarily for reference code, and specializations
29 // forced in tests.
// Selects which depthwise-convolution kernel implementation runs. Used in
// tests and as a template parameter to force a particular specialization.
enum class DepthwiseConvImplementation {
  // The "default" option: use the normal DepthwiseConv kernel (entry)
  // function. All tests also run against kNone even when another kernel is
  // under test, so that the main DepthwiseConv() dispatch in optimized_ops.h
  // is verified to reach a correctly-executing kernel.
  kNone = 0,
  kUseGenericKernel,            // Forced use of generic kernel.
  kUseNeon3x3,                  // 3x3 kernel that uses NEON when available.
  kUseNeon3x3DotProduct,        // 3x3 kernel that uses dot-product enabled
                                // NEON when available.
  kUseCModel3x3DotProduct,      // 3x3 kernel, reference C model intended to
                                // match the overall design of the NEON code.
  kUseUnwound3x3DotProduct,     // 3x3 kernel, reference C model with unwound
                                // loops and some arrays.
  kUseIntrinsics3x3DotProduct,  // 3x3 kernel using NEON intrinsics.
};
46
47 // Category of depthwise convolution output rounding.
// Category of depthwise convolution output rounding.
enum class DepthwiseConvOutputRounding {
  kNone = 0,      // Invalid: a concrete rounding method must be selected.
  kAwayFromZero,  // Original method: exact halves rounded away from zero.
  kUpward,        // Halves toward +infinity: adds 0.5 before truncating.
  // A future kNearestEven option would be placed here.
};
54
55 // Category of depthwise convolution depth multiplication.
// Category of depthwise convolution depth multiplication.
enum class DepthwiseConvDepthMultiplication {
  kNoMultiplication = 0,  // Depth multiplier = 1.
  kUnitInputDepth,        // Input depth = 1, output depth = depth multiplier.
};
60
61 namespace reference_ops {
62 namespace depthwise_conv {
63
64 template <DepthwiseConvOutputRounding output_rounding>
DepthwiseConvRound(int32_t x,int32_t quantized_multiplier,int shift)65 inline int32_t DepthwiseConvRound(int32_t x, int32_t quantized_multiplier,
66 int shift) {
67 TFLITE_DCHECK_NE(output_rounding, DepthwiseConvOutputRounding::kNone);
68 return MultiplyByQuantizedMultiplier(x, quantized_multiplier, shift);
69 }
70
71 // Single-rounding MultiplyByQuantizedMultiplier
72 #if TFLITE_SINGLE_ROUNDING
73 template <>
74 inline int32_t DepthwiseConvRound<DepthwiseConvOutputRounding::kAwayFromZero>(
75 int32_t x, int32_t quantized_multiplier, int shift) {
76 using gemmlowp::RoundingDivideByPOT;
77 using gemmlowp::SaturatingRoundingDoublingHighMul;
78 int left_shift = shift > 0 ? shift : 0;
79 int right_shift = shift > 0 ? 0 : -shift;
80 return RoundingDivideByPOT(SaturatingRoundingDoublingHighMul(
81 x * (1 << left_shift), quantized_multiplier),
82 right_shift);
83 }
84
85 template <>
86 inline int32_t DepthwiseConvRound<DepthwiseConvOutputRounding::kUpward>(
87 int32_t x, int32_t quantized_multiplier, int shift) {
88 return MultiplyByQuantizedMultiplier(x, quantized_multiplier, shift);
89 }
90 // Double-rounding MultiplyByQuantizedMultiplier
91 #else
92 template <>
93 inline int32_t DepthwiseConvRound<DepthwiseConvOutputRounding::kAwayFromZero>(
94 int32_t x, int32_t quantized_multiplier, int shift) {
95 return MultiplyByQuantizedMultiplier(x, quantized_multiplier, shift);
96 }
97
98 template <>
99 inline int32_t DepthwiseConvRound<DepthwiseConvOutputRounding::kUpward>(
100 int32_t x, int32_t quantized_multiplier, int shift) {
101 using gemmlowp::SaturatingRoundingDoublingHighMul;
102 const int left_shift = shift > 0 ? shift : 0;
103 const int right_shift = shift > 0 ? 0 : -shift;
104 const int rounding_offset = right_shift > 0 ? 1 << (right_shift - 1) : 0;
105 return (SaturatingRoundingDoublingHighMul(x * (1 << left_shift),
106 quantized_multiplier) +
107 rounding_offset) >>
108 right_shift;
109 }
110 #endif // TFLITE_SINGLE_ROUNDING
111
112 template <DepthwiseConvOutputRounding output_rounding>
113 struct DepthwiseConvBasicKernel {
RunDepthwiseConvBasicKernel114 static inline void Run(
115 const DepthwiseParams& params, const RuntimeShape& input_shape,
116 const uint8_t* input_data, const RuntimeShape& filter_shape,
117 const uint8_t* filter_data, const RuntimeShape& bias_shape,
118 const int32_t* bias_data, const RuntimeShape& output_shape,
119 uint8_t* output_data) {
120 const int stride_width = params.stride_width;
121 const int stride_height = params.stride_height;
122 const int dilation_width_factor = params.dilation_width_factor;
123 const int dilation_height_factor = params.dilation_height_factor;
124 const int pad_width = params.padding_values.width;
125 const int pad_height = params.padding_values.height;
126 const int depth_multiplier = params.depth_multiplier;
127 const int32_t output_activation_min = params.quantized_activation_min;
128 const int32_t output_activation_max = params.quantized_activation_max;
129 const int32_t input_offset = params.input_offset;
130 const int32_t filter_offset = params.weights_offset;
131 const int32_t output_offset = params.output_offset;
132 const int32_t output_multiplier = params.output_multiplier;
133 const int output_shift = params.output_shift;
134 TFLITE_DCHECK_EQ(input_shape.DimensionsCount(), 4);
135 TFLITE_DCHECK_EQ(filter_shape.DimensionsCount(), 4);
136 TFLITE_DCHECK_EQ(output_shape.DimensionsCount(), 4);
137
138 TFLITE_DCHECK_LE(output_activation_min, output_activation_max);
139 const int batches = MatchingDim(input_shape, 0, output_shape, 0);
140 const int output_depth = MatchingDim(filter_shape, 3, output_shape, 3);
141 const int input_height = input_shape.Dims(1);
142 const int input_width = input_shape.Dims(2);
143 const int input_depth = input_shape.Dims(3);
144 const int filter_height = filter_shape.Dims(1);
145 const int filter_width = filter_shape.Dims(2);
146 const int output_height = output_shape.Dims(1);
147 const int output_width = output_shape.Dims(2);
148 TFLITE_DCHECK_EQ(output_depth, input_depth * depth_multiplier);
149 TFLITE_DCHECK_EQ(bias_shape.FlatSize(), output_depth);
150
151 for (int b = 0; b < batches; ++b) {
152 for (int out_y = 0; out_y < output_height; ++out_y) {
153 for (int out_x = 0; out_x < output_width; ++out_x) {
154 for (int ic = 0; ic < input_depth; ++ic) {
155 for (int m = 0; m < depth_multiplier; m++) {
156 const int oc = m + ic * depth_multiplier;
157 const int in_x_origin = (out_x * stride_width) - pad_width;
158 const int in_y_origin = (out_y * stride_height) - pad_height;
159 int32_t acc = 0;
160 for (int filter_y = 0; filter_y < filter_height; ++filter_y) {
161 for (int filter_x = 0; filter_x < filter_width; ++filter_x) {
162 const int in_x =
163 in_x_origin + dilation_width_factor * filter_x;
164 const int in_y =
165 in_y_origin + dilation_height_factor * filter_y;
166 // If the location is outside the bounds of the input image,
167 // use zero as a default value.
168 if ((in_x >= 0) && (in_x < input_width) && (in_y >= 0) &&
169 (in_y < input_height)) {
170 int32_t input_val =
171 input_data[Offset(input_shape, b, in_y, in_x, ic)];
172 int32_t filter_val = filter_data[Offset(
173 filter_shape, 0, filter_y, filter_x, oc)];
174 acc += (filter_val + filter_offset) *
175 (input_val + input_offset);
176 }
177 }
178 }
179 if (bias_data) {
180 acc += bias_data[oc];
181 }
182 acc = DepthwiseConvRound<output_rounding>(acc, output_multiplier,
183 output_shift);
184 acc += output_offset;
185 acc = std::max(acc, output_activation_min);
186 acc = std::min(acc, output_activation_max);
187 output_data[Offset(output_shape, b, out_y, out_x, oc)] =
188 static_cast<uint8_t>(acc);
189 }
190 }
191 }
192 }
193 }
194 }
195
196 // TODO(b/148596273): Reconcile reference versions, perhaps with common
197 // MultiplyByQuantizedMultiplier or DepthwiseConvRound function.
RunPerChannelDepthwiseConvBasicKernel198 static inline void RunPerChannel(
199 const DepthwiseParams& params, const RuntimeShape& input_shape,
200 const int8_t* input_data, const RuntimeShape& filter_shape,
201 const int8_t* filter_data, const RuntimeShape& bias_shape,
202 const int32_t* bias_data, const RuntimeShape& output_shape,
203 int8_t* output_data) {
204 // Get parameters.
205 // TODO(b/141565753): Re-introduce ScopedProfilingLabel on Micro.
206 const int stride_width = params.stride_width;
207 const int stride_height = params.stride_height;
208 const int dilation_width_factor = params.dilation_width_factor;
209 const int dilation_height_factor = params.dilation_height_factor;
210 const int pad_width = params.padding_values.width;
211 const int pad_height = params.padding_values.height;
212 const int depth_multiplier = params.depth_multiplier;
213 const int32_t input_offset = params.input_offset;
214 const int32_t output_offset = params.output_offset;
215 const int32_t output_activation_min = params.quantized_activation_min;
216 const int32_t output_activation_max = params.quantized_activation_max;
217 const int32_t* output_multiplier = params.output_multiplier_per_channel;
218 const int32_t* output_shift = params.output_shift_per_channel;
219
220 // Check dimensions of the tensors.
221 TFLITE_DCHECK_EQ(input_shape.DimensionsCount(), 4);
222 TFLITE_DCHECK_EQ(filter_shape.DimensionsCount(), 4);
223 TFLITE_DCHECK_EQ(output_shape.DimensionsCount(), 4);
224
225 TFLITE_DCHECK_LE(output_activation_min, output_activation_max);
226 const int batches = MatchingDim(input_shape, 0, output_shape, 0);
227 const int output_depth = MatchingDim(filter_shape, 3, output_shape, 3);
228 const int input_height = input_shape.Dims(1);
229 const int input_width = input_shape.Dims(2);
230 const int input_depth = input_shape.Dims(3);
231 const int filter_height = filter_shape.Dims(1);
232 const int filter_width = filter_shape.Dims(2);
233 const int output_height = output_shape.Dims(1);
234 const int output_width = output_shape.Dims(2);
235 TFLITE_DCHECK_EQ(output_depth, input_depth * depth_multiplier);
236 TFLITE_DCHECK_EQ(bias_shape.FlatSize(), output_depth);
237
238 for (int batch = 0; batch < batches; ++batch) {
239 for (int out_y = 0; out_y < output_height; ++out_y) {
240 for (int out_x = 0; out_x < output_width; ++out_x) {
241 for (int in_channel = 0; in_channel < input_depth; ++in_channel) {
242 for (int m = 0; m < depth_multiplier; ++m) {
243 const int output_channel = m + in_channel * depth_multiplier;
244 const int in_x_origin = (out_x * stride_width) - pad_width;
245 const int in_y_origin = (out_y * stride_height) - pad_height;
246 int32_t acc = 0;
247 for (int filter_y = 0; filter_y < filter_height; ++filter_y) {
248 for (int filter_x = 0; filter_x < filter_width; ++filter_x) {
249 const int in_x =
250 in_x_origin + dilation_width_factor * filter_x;
251 const int in_y =
252 in_y_origin + dilation_height_factor * filter_y;
253 // Zero padding by omitting the areas outside the image.
254 const bool is_point_inside_image =
255 (in_x >= 0) && (in_x < input_width) && (in_y >= 0) &&
256 (in_y < input_height);
257 if (is_point_inside_image) {
258 int32_t input_val = input_data[Offset(
259 input_shape, batch, in_y, in_x, in_channel)];
260 int32_t filter_val = filter_data[Offset(
261 filter_shape, 0, filter_y, filter_x, output_channel)];
262 // Accumulate with 32 bits accumulator.
263 // In the nudging process during model quantization, we
264 // force real value of 0.0 be represented by a quantized
265 // value. This guarantees that the input_offset is a int8_t,
266 // even though it is represented using int32_t. int32_t +=
267 // int8_t
268 // * (int8_t - int8_t) so the highest value we can get from
269 // each accumulation is [-127, 127] * ([-128, 127] -
270 // [-128, 127]), which is [-32512, 32512]. log2(32512)
271 // = 14.98, which means we can accumulate at least 2^16
272 // multiplications without overflow. The accumulator is
273 // applied to a filter so the accumulation logic will hold
274 // as long as the filter size (filter_y * filter_x *
275 // in_channel) does not exceed 2^16, which is the case in
276 // all the models we have seen so far.
277 acc += filter_val * (input_val + input_offset);
278 }
279 }
280 }
281 if (bias_data) {
282 acc += bias_data[output_channel];
283 }
284 acc = DepthwiseConvRound<output_rounding>(
285 acc, output_multiplier[output_channel],
286 output_shift[output_channel]);
287 acc += output_offset;
288 acc = std::max(acc, output_activation_min);
289 acc = std::min(acc, output_activation_max);
290 output_data[Offset(output_shape, batch, out_y, out_x,
291 output_channel)] = static_cast<int8_t>(acc);
292 }
293 }
294 }
295 }
296 }
297 }
298 };
299
300 } // namespace depthwise_conv
301
DepthwiseConv(const DepthwiseParams & params,const RuntimeShape & input_shape,const uint8_t * input_data,const RuntimeShape & filter_shape,const uint8_t * filter_data,const RuntimeShape & bias_shape,const int32_t * bias_data,const RuntimeShape & output_shape,uint8_t * output_data)302 inline void DepthwiseConv(
303 const DepthwiseParams& params, const RuntimeShape& input_shape,
304 const uint8_t* input_data, const RuntimeShape& filter_shape,
305 const uint8_t* filter_data, const RuntimeShape& bias_shape,
306 const int32_t* bias_data, const RuntimeShape& output_shape,
307 uint8_t* output_data) {
308 return depthwise_conv::DepthwiseConvBasicKernel<
309 DepthwiseConvOutputRounding::kAwayFromZero>::Run(params, input_shape,
310 input_data, filter_shape,
311 filter_data, bias_shape,
312 bias_data, output_shape,
313 output_data);
314 }
315
316 } // namespace reference_ops
317 } // end namespace tflite
318
319 #endif // TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_DEPTHWISECONV_UINT8_H_
320