xref: /aosp_15_r20/external/tensorflow/tensorflow/lite/kernels/internal/reference/depthwiseconv_uint8.h (revision b6fb3261f9314811a0f4371741dbb8839866f948)
1 /* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
2 
3 Licensed under the Apache License, Version 2.0 (the "License");
4 you may not use this file except in compliance with the License.
5 You may obtain a copy of the License at
6 
7     http://www.apache.org/licenses/LICENSE-2.0
8 
9 Unless required by applicable law or agreed to in writing, software
10 distributed under the License is distributed on an "AS IS" BASIS,
11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 See the License for the specific language governing permissions and
13 limitations under the License.
14 ==============================================================================*/
15 #ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_DEPTHWISECONV_UINT8_H_
16 #define TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_DEPTHWISECONV_UINT8_H_
17 
18 #include <algorithm>
19 
20 #include "fixedpoint/fixedpoint.h"
21 #include "tensorflow/lite/kernels/internal/common.h"
22 #include "tensorflow/lite/kernels/internal/compatibility.h"
23 #include "tensorflow/lite/kernels/internal/types.h"
24 
25 namespace tflite {
26 
27 // Used in tests and template parameters to control which version of depthwise
28 // convolution is called. Primarily for reference code, and specializations
29 // forced in tests.
30 enum class DepthwiseConvImplementation {
31   // Run all tests against kUseStandardEntry even if also testing another
32   // kernel, since we need to be sure that the main DepthwiseConv() function in
33   // optimized_ops.h dispatches to a correctly-executing kernel.
34   kNone = 0,                 // The "default" option: use the normal
35                              // DepthwiseConv kernel (entry) function.
36   kUseGenericKernel,         // Forced use of generic kernel.
37   kUseNeon3x3,               // 3x3 kernel that uses NEON when available.
38   kUseNeon3x3DotProduct,     // 3x3 kernel that uses dot-product enabled NEON
39                              // when available.
40   kUseCModel3x3DotProduct,   // 3x3 kernel, reference C model that is intended
41                              // to match overall design NEON code.
42   kUseUnwound3x3DotProduct,  // 3x3 kernel, reference C model with unwound loops
43                              // and some arrays.
44   kUseIntrinsics3x3DotProduct,  // 3x3 kernel using NEON intrinsics.
45 };
46 
47 // Category of depthwise convolution output rounding.
48 enum class DepthwiseConvOutputRounding {
49   kNone = 0,      // Invalid: specific method must be specified.
50   kAwayFromZero,  // Original method: exact halves rounded away from zero.
51   kUpward,        // Halves towards +infinity: adds 0.5 before truncate.
52   // This is where a future kNearestEven would be placed.
53 };
54 
55 // Category of depthwise convolution depth multiplication.
56 enum class DepthwiseConvDepthMultiplication {
57   kNoMultiplication = 0,  // Depth multiplier = 1.
58   kUnitInputDepth,        // Input depth = 1, output depth = depth multiplier.
59 };
60 
61 namespace reference_ops {
62 namespace depthwise_conv {
63 
64 template <DepthwiseConvOutputRounding output_rounding>
DepthwiseConvRound(int32_t x,int32_t quantized_multiplier,int shift)65 inline int32_t DepthwiseConvRound(int32_t x, int32_t quantized_multiplier,
66                                   int shift) {
67   TFLITE_DCHECK_NE(output_rounding, DepthwiseConvOutputRounding::kNone);
68   return MultiplyByQuantizedMultiplier(x, quantized_multiplier, shift);
69 }
70 
// Single-rounding MultiplyByQuantizedMultiplier
#if TFLITE_SINGLE_ROUNDING
// kAwayFromZero: exact halves are rounded away from zero (see the enum
// comment above). In single-rounding builds MultiplyByQuantizedMultiplier
// does not provide this behavior, so it is implemented explicitly with
// gemmlowp fixed-point primitives.
template <>
inline int32_t DepthwiseConvRound<DepthwiseConvOutputRounding::kAwayFromZero>(
    int32_t x, int32_t quantized_multiplier, int shift) {
  using gemmlowp::RoundingDivideByPOT;
  using gemmlowp::SaturatingRoundingDoublingHighMul;
  // A positive shift is applied as a pre-multiplication; a negative shift
  // becomes a rounding divide-by-power-of-two after the high-mul.
  int left_shift = shift > 0 ? shift : 0;
  int right_shift = shift > 0 ? 0 : -shift;
  return RoundingDivideByPOT(SaturatingRoundingDoublingHighMul(
                                 x * (1 << left_shift), quantized_multiplier),
                             right_shift);
}

// kUpward: in single-rounding builds MultiplyByQuantizedMultiplier itself
// supplies the desired (single-rounding) behavior, so just forward to it.
template <>
inline int32_t DepthwiseConvRound<DepthwiseConvOutputRounding::kUpward>(
    int32_t x, int32_t quantized_multiplier, int shift) {
  return MultiplyByQuantizedMultiplier(x, quantized_multiplier, shift);
}
// Double-rounding MultiplyByQuantizedMultiplier
#else
// kAwayFromZero: in double-rounding builds this is exactly what
// MultiplyByQuantizedMultiplier computes, so forward to it.
template <>
inline int32_t DepthwiseConvRound<DepthwiseConvOutputRounding::kAwayFromZero>(
    int32_t x, int32_t quantized_multiplier, int shift) {
  return MultiplyByQuantizedMultiplier(x, quantized_multiplier, shift);
}

// kUpward: halves are rounded toward +infinity by adding half of the
// divisor (rounding_offset) before the truncating arithmetic right shift.
template <>
inline int32_t DepthwiseConvRound<DepthwiseConvOutputRounding::kUpward>(
    int32_t x, int32_t quantized_multiplier, int shift) {
  using gemmlowp::SaturatingRoundingDoublingHighMul;
  const int left_shift = shift > 0 ? shift : 0;
  const int right_shift = shift > 0 ? 0 : -shift;
  // Half of 2^right_shift, or 0 when there is no right shift at all.
  const int rounding_offset = right_shift > 0 ? 1 << (right_shift - 1) : 0;
  return (SaturatingRoundingDoublingHighMul(x * (1 << left_shift),
                                            quantized_multiplier) +
          rounding_offset) >>
         right_shift;
}
#endif  // TFLITE_SINGLE_ROUNDING
111 
// Reference depthwise convolution kernels. The output_rounding template
// parameter selects how the int32 accumulator is rounded when it is scaled
// back down to the 8-bit output range (see DepthwiseConvRound above).
template <DepthwiseConvOutputRounding output_rounding>
struct DepthwiseConvBasicKernel {
  // Depthwise convolution on uint8 (asymmetric-quantized) tensors, using a
  // single output multiplier/shift shared by all output channels.
  //
  // Tensor layouts are NHWC. The filter shape is
  // [1, filter_height, filter_width, output_depth], with
  // output_depth == input_depth * depth_multiplier (DCHECKed below).
  // bias_data may be null, in which case no bias is added.
  static inline void Run(
      const DepthwiseParams& params, const RuntimeShape& input_shape,
      const uint8_t* input_data, const RuntimeShape& filter_shape,
      const uint8_t* filter_data, const RuntimeShape& bias_shape,
      const int32_t* bias_data, const RuntimeShape& output_shape,
      uint8_t* output_data) {
    const int stride_width = params.stride_width;
    const int stride_height = params.stride_height;
    const int dilation_width_factor = params.dilation_width_factor;
    const int dilation_height_factor = params.dilation_height_factor;
    const int pad_width = params.padding_values.width;
    const int pad_height = params.padding_values.height;
    const int depth_multiplier = params.depth_multiplier;
    const int32_t output_activation_min = params.quantized_activation_min;
    const int32_t output_activation_max = params.quantized_activation_max;
    // Zero-point offsets: added to the raw uint8 values to recover the
    // signed quantized values before multiplication/accumulation.
    const int32_t input_offset = params.input_offset;
    const int32_t filter_offset = params.weights_offset;
    const int32_t output_offset = params.output_offset;
    const int32_t output_multiplier = params.output_multiplier;
    const int output_shift = params.output_shift;
    TFLITE_DCHECK_EQ(input_shape.DimensionsCount(), 4);
    TFLITE_DCHECK_EQ(filter_shape.DimensionsCount(), 4);
    TFLITE_DCHECK_EQ(output_shape.DimensionsCount(), 4);

    TFLITE_DCHECK_LE(output_activation_min, output_activation_max);
    const int batches = MatchingDim(input_shape, 0, output_shape, 0);
    const int output_depth = MatchingDim(filter_shape, 3, output_shape, 3);
    const int input_height = input_shape.Dims(1);
    const int input_width = input_shape.Dims(2);
    const int input_depth = input_shape.Dims(3);
    const int filter_height = filter_shape.Dims(1);
    const int filter_width = filter_shape.Dims(2);
    const int output_height = output_shape.Dims(1);
    const int output_width = output_shape.Dims(2);
    TFLITE_DCHECK_EQ(output_depth, input_depth * depth_multiplier);
    TFLITE_DCHECK_EQ(bias_shape.FlatSize(), output_depth);

    for (int b = 0; b < batches; ++b) {
      for (int out_y = 0; out_y < output_height; ++out_y) {
        for (int out_x = 0; out_x < output_width; ++out_x) {
          for (int ic = 0; ic < input_depth; ++ic) {
            for (int m = 0; m < depth_multiplier; m++) {
              // Each input channel ic produces depth_multiplier consecutive
              // output channels.
              const int oc = m + ic * depth_multiplier;
              const int in_x_origin = (out_x * stride_width) - pad_width;
              const int in_y_origin = (out_y * stride_height) - pad_height;
              int32_t acc = 0;
              for (int filter_y = 0; filter_y < filter_height; ++filter_y) {
                for (int filter_x = 0; filter_x < filter_width; ++filter_x) {
                  const int in_x =
                      in_x_origin + dilation_width_factor * filter_x;
                  const int in_y =
                      in_y_origin + dilation_height_factor * filter_y;
                  // If the location is outside the bounds of the input image,
                  // use zero as a default value.
                  if ((in_x >= 0) && (in_x < input_width) && (in_y >= 0) &&
                      (in_y < input_height)) {
                    int32_t input_val =
                        input_data[Offset(input_shape, b, in_y, in_x, ic)];
                    int32_t filter_val = filter_data[Offset(
                        filter_shape, 0, filter_y, filter_x, oc)];
                    // Both offsets are applied so the product is taken
                    // between the real (zero-point-corrected) values.
                    acc += (filter_val + filter_offset) *
                           (input_val + input_offset);
                  }
                }
              }
              if (bias_data) {
                acc += bias_data[oc];
              }
              // Requantize: scale the accumulator down to the output scale,
              // add the output zero point, then clamp to the fused
              // activation range before narrowing to uint8.
              acc = DepthwiseConvRound<output_rounding>(acc, output_multiplier,
                                                        output_shift);
              acc += output_offset;
              acc = std::max(acc, output_activation_min);
              acc = std::min(acc, output_activation_max);
              output_data[Offset(output_shape, b, out_y, out_x, oc)] =
                  static_cast<uint8_t>(acc);
            }
          }
        }
      }
    }
  }

  // TODO(b/148596273): Reconcile reference versions, perhaps with common
  // MultiplyByQuantizedMultiplier or DepthwiseConvRound function.
  //
  // Depthwise convolution on int8 tensors with per-channel requantization:
  // each output channel has its own multiplier and shift, read from
  // params.output_multiplier_per_channel / params.output_shift_per_channel.
  // Note that, unlike Run() above, no weights_offset is added to filter
  // values here — the filter data is used as-is (per-channel int8 filters
  // carry no zero-point correction in this path).
  static inline void RunPerChannel(
      const DepthwiseParams& params, const RuntimeShape& input_shape,
      const int8_t* input_data, const RuntimeShape& filter_shape,
      const int8_t* filter_data, const RuntimeShape& bias_shape,
      const int32_t* bias_data, const RuntimeShape& output_shape,
      int8_t* output_data) {
    // Get parameters.
    // TODO(b/141565753): Re-introduce ScopedProfilingLabel on Micro.
    const int stride_width = params.stride_width;
    const int stride_height = params.stride_height;
    const int dilation_width_factor = params.dilation_width_factor;
    const int dilation_height_factor = params.dilation_height_factor;
    const int pad_width = params.padding_values.width;
    const int pad_height = params.padding_values.height;
    const int depth_multiplier = params.depth_multiplier;
    const int32_t input_offset = params.input_offset;
    const int32_t output_offset = params.output_offset;
    const int32_t output_activation_min = params.quantized_activation_min;
    const int32_t output_activation_max = params.quantized_activation_max;
    // Per-output-channel requantization parameters.
    const int32_t* output_multiplier = params.output_multiplier_per_channel;
    const int32_t* output_shift = params.output_shift_per_channel;

    // Check dimensions of the tensors.
    TFLITE_DCHECK_EQ(input_shape.DimensionsCount(), 4);
    TFLITE_DCHECK_EQ(filter_shape.DimensionsCount(), 4);
    TFLITE_DCHECK_EQ(output_shape.DimensionsCount(), 4);

    TFLITE_DCHECK_LE(output_activation_min, output_activation_max);
    const int batches = MatchingDim(input_shape, 0, output_shape, 0);
    const int output_depth = MatchingDim(filter_shape, 3, output_shape, 3);
    const int input_height = input_shape.Dims(1);
    const int input_width = input_shape.Dims(2);
    const int input_depth = input_shape.Dims(3);
    const int filter_height = filter_shape.Dims(1);
    const int filter_width = filter_shape.Dims(2);
    const int output_height = output_shape.Dims(1);
    const int output_width = output_shape.Dims(2);
    TFLITE_DCHECK_EQ(output_depth, input_depth * depth_multiplier);
    TFLITE_DCHECK_EQ(bias_shape.FlatSize(), output_depth);

    for (int batch = 0; batch < batches; ++batch) {
      for (int out_y = 0; out_y < output_height; ++out_y) {
        for (int out_x = 0; out_x < output_width; ++out_x) {
          for (int in_channel = 0; in_channel < input_depth; ++in_channel) {
            for (int m = 0; m < depth_multiplier; ++m) {
              // Each input channel feeds depth_multiplier consecutive
              // output channels.
              const int output_channel = m + in_channel * depth_multiplier;
              const int in_x_origin = (out_x * stride_width) - pad_width;
              const int in_y_origin = (out_y * stride_height) - pad_height;
              int32_t acc = 0;
              for (int filter_y = 0; filter_y < filter_height; ++filter_y) {
                for (int filter_x = 0; filter_x < filter_width; ++filter_x) {
                  const int in_x =
                      in_x_origin + dilation_width_factor * filter_x;
                  const int in_y =
                      in_y_origin + dilation_height_factor * filter_y;
                  // Zero padding by omitting the areas outside the image.
                  const bool is_point_inside_image =
                      (in_x >= 0) && (in_x < input_width) && (in_y >= 0) &&
                      (in_y < input_height);
                  if (is_point_inside_image) {
                    int32_t input_val = input_data[Offset(
                        input_shape, batch, in_y, in_x, in_channel)];
                    int32_t filter_val = filter_data[Offset(
                        filter_shape, 0, filter_y, filter_x, output_channel)];
                    // Accumulate with 32 bits accumulator.
                    // In the nudging process during model quantization, we
                    // force real value of 0.0 be represented by a quantized
                    // value. This guarantees that the input_offset is a int8_t,
                    // even though it is represented using int32_t. int32_t +=
                    // int8_t
                    // * (int8_t - int8_t) so the highest value we can get from
                    // each accumulation is [-127, 127] * ([-128, 127] -
                    // [-128, 127]), which is [-32512, 32512]. log2(32512)
                    // = 14.98, which means we can accumulate at least 2^16
                    // multiplications without overflow. The accumulator is
                    // applied to a filter so the accumulation logic will hold
                    // as long as the filter size (filter_y * filter_x *
                    // in_channel) does not exceed 2^16, which is the case in
                    // all the models we have seen so far.
                    acc += filter_val * (input_val + input_offset);
                  }
                }
              }
              if (bias_data) {
                acc += bias_data[output_channel];
              }
              // Requantize with this channel's multiplier/shift, add the
              // output zero point, and clamp to the activation range before
              // narrowing to int8.
              acc = DepthwiseConvRound<output_rounding>(
                  acc, output_multiplier[output_channel],
                  output_shift[output_channel]);
              acc += output_offset;
              acc = std::max(acc, output_activation_min);
              acc = std::min(acc, output_activation_max);
              output_data[Offset(output_shape, batch, out_y, out_x,
                                 output_channel)] = static_cast<int8_t>(acc);
            }
          }
        }
      }
    }
  }
};
299 
300 }  // namespace depthwise_conv
301 
DepthwiseConv(const DepthwiseParams & params,const RuntimeShape & input_shape,const uint8_t * input_data,const RuntimeShape & filter_shape,const uint8_t * filter_data,const RuntimeShape & bias_shape,const int32_t * bias_data,const RuntimeShape & output_shape,uint8_t * output_data)302 inline void DepthwiseConv(
303     const DepthwiseParams& params, const RuntimeShape& input_shape,
304     const uint8_t* input_data, const RuntimeShape& filter_shape,
305     const uint8_t* filter_data, const RuntimeShape& bias_shape,
306     const int32_t* bias_data, const RuntimeShape& output_shape,
307     uint8_t* output_data) {
308   return depthwise_conv::DepthwiseConvBasicKernel<
309       DepthwiseConvOutputRounding::kAwayFromZero>::Run(params, input_shape,
310                                                        input_data, filter_shape,
311                                                        filter_data, bias_shape,
312                                                        bias_data, output_shape,
313                                                        output_data);
314 }
315 
316 }  // namespace reference_ops
317 }  // end namespace tflite
318 
319 #endif  // TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_DEPTHWISECONV_UINT8_H_
320