xref: /aosp_15_r20/external/tensorflow/tensorflow/lite/kernels/internal/reference/div.h (revision b6fb3261f9314811a0f4371741dbb8839866f948)
1 /* Copyright 2020 The TensorFlow Authors. All Rights Reserved.
2 
3 Licensed under the Apache License, Version 2.0 (the "License");
4 you may not use this file except in compliance with the License.
5 You may obtain a copy of the License at
6 
7     http://www.apache.org/licenses/LICENSE-2.0
8 
9 Unless required by applicable law or agreed to in writing, software
10 distributed under the License is distributed on an "AS IS" BASIS,
11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 See the License for the specific language governing permissions and
13 limitations under the License.
14 ==============================================================================*/
15 #ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_DIV_H_
16 #define TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_DIV_H_
17 
#include <algorithm>
#include <cstdint>
#include <limits>

#include "tensorflow/lite/kernels/internal/common.h"
21 
22 namespace tflite {
23 
24 namespace reference_ops {
25 
26 template <typename T>
DivCheckArithmeticParams(const ArithmeticParams & params)27 inline void DivCheckArithmeticParams(const ArithmeticParams& params) {
28   TFLITE_DCHECK_LE(params.quantized_activation_min,
29                    params.quantized_activation_max);
30   // Input offset is negative input zero point. Activation tensors are
31   // asymmetric quantized so they span the full int8 range.
32   constexpr int32_t max_value =
33       static_cast<int32_t>(std::numeric_limits<T>::max());
34   TFLITE_DCHECK_GE(params.input1_offset, -max_value);
35   TFLITE_DCHECK_LE(params.input1_offset, max_value);
36   TFLITE_DCHECK_GE(params.input2_offset, -max_value);
37   TFLITE_DCHECK_LE(params.input2_offset, max_value);
38   TFLITE_DCHECK_GE(params.output_offset, -max_value);
39   TFLITE_DCHECK_LE(params.output_offset, max_value);
40 }
41 
42 // Element-wise div that can often be used for inner loop of broadcast Div as
43 // well as the non-broadcast Div.
44 template <typename T>
DivElementwise(int size,const ArithmeticParams & params,const T * input1_data,const T * input2_data,T * output_data)45 inline void DivElementwise(int size, const ArithmeticParams& params,
46                            const T* input1_data, const T* input2_data,
47                            T* output_data) {
48   DivCheckArithmeticParams<T>(params);
49 
50   for (int i = 0; i < size; ++i) {
51     int32_t input1_val = params.input1_offset + input1_data[i];
52     int32_t input2_val = params.input2_offset + input2_data[i];
53     TFLITE_DCHECK_NE(input2_val, 0);
54     if (input2_val < 0) {
55       // Invert signs to avoid a negative input2_val as input2_inv needs to be
56       // positive to be used as multiplier of MultiplyByQuantizedMultiplier.
57       input1_val = -input1_val;
58       input2_val = -input2_val;
59     }
60     int recip_shift;
61     const int32_t input2_inv = GetReciprocal(input2_val, 31, &recip_shift);
62     const int headroom = CountLeadingSignBits(input1_val);
63     const int32_t unscaled_quotient =
64         MultiplyByQuantizedMultiplierGreaterThanOne(input1_val, input2_inv,
65                                                     headroom);
66     const int total_shift = params.output_shift - recip_shift - headroom;
67     const int32_t unclamped_result =
68         params.output_offset +
69         MultiplyByQuantizedMultiplierSmallerThanOneExp(
70             unscaled_quotient, params.output_multiplier, total_shift);
71     const int32_t clamped_output =
72         std::min(params.quantized_activation_max,
73                  std::max(params.quantized_activation_min, unclamped_result));
74     output_data[i] = static_cast<T>(clamped_output);
75   }
76 }
77 
Div(const ArithmeticParams & params,const RuntimeShape & input1_shape,const uint8_t * input1_data,const RuntimeShape & input2_shape,const uint8_t * input2_data,const RuntimeShape & output_shape,uint8_t * output_data)78 inline void Div(const ArithmeticParams& params,
79                 const RuntimeShape& input1_shape, const uint8_t* input1_data,
80                 const RuntimeShape& input2_shape, const uint8_t* input2_data,
81                 const RuntimeShape& output_shape, uint8_t* output_data) {
82   TFLITE_DCHECK_LE(params.quantized_activation_min,
83                    params.quantized_activation_max);
84   const int flat_size =
85       MatchingElementsSize(input1_shape, input2_shape, output_shape);
86 
87   DivElementwise(flat_size, params, input1_data, input2_data, output_data);
88 }
89 
Div(const ArithmeticParams & params,const RuntimeShape & input1_shape,const int8_t * input1_data,const RuntimeShape & input2_shape,const int8_t * input2_data,const RuntimeShape & output_shape,int8_t * output_data)90 inline void Div(const ArithmeticParams& params,
91                 const RuntimeShape& input1_shape, const int8_t* input1_data,
92                 const RuntimeShape& input2_shape, const int8_t* input2_data,
93                 const RuntimeShape& output_shape, int8_t* output_data) {
94   TFLITE_DCHECK_LE(params.quantized_activation_min,
95                    params.quantized_activation_max);
96   const int flat_size =
97       MatchingElementsSize(input1_shape, input2_shape, output_shape);
98 
99   DivElementwise(flat_size, params, input1_data, input2_data, output_data);
100 }
101 
102 template <typename T, int N = 5>
BroadcastDivSlowQuantized(const ArithmeticParams & params,const RuntimeShape & unextended_input1_shape,const T * input1_data,const RuntimeShape & unextended_input2_shape,const T * input2_data,const RuntimeShape & unextended_output_shape,T * output_data)103 inline void BroadcastDivSlowQuantized(
104     const ArithmeticParams& params, const RuntimeShape& unextended_input1_shape,
105     const T* input1_data, const RuntimeShape& unextended_input2_shape,
106     const T* input2_data, const RuntimeShape& unextended_output_shape,
107     T* output_data) {
108   TFLITE_DCHECK_LE(unextended_input1_shape.DimensionsCount(), N);
109   TFLITE_DCHECK_LE(unextended_input2_shape.DimensionsCount(), N);
110   TFLITE_DCHECK_LE(unextended_output_shape.DimensionsCount(), N);
111 
112   NdArrayDesc<N> desc1;
113   NdArrayDesc<N> desc2;
114   NdArrayDesc<N> output_desc;
115   NdArrayDescsForElementwiseBroadcast(unextended_input1_shape,
116                                       unextended_input2_shape, &desc1, &desc2);
117   CopyDimsToDesc(RuntimeShape::ExtendedShape(N, unextended_output_shape),
118                  &output_desc);
119 
120   DivCheckArithmeticParams<T>(params);
121 
122   auto div_func = [&](int indexes[N]) {
123     int32_t input1_val =
124         params.input1_offset + input1_data[SubscriptToIndex(desc1, indexes)];
125     int32_t input2_val =
126         params.input2_offset + input2_data[SubscriptToIndex(desc2, indexes)];
127     TFLITE_DCHECK_NE(input2_val, 0);
128     if (input2_val < 0) {
129       // Invert signs to avoid a negative input2_val as input2_inv needs to be
130       // positive to be used as multiplier of MultiplyByQuantizedMultiplier.
131       input1_val = -input1_val;
132       input2_val = -input2_val;
133     }
134     int recip_shift;
135     const int32_t input2_inv = GetReciprocal(input2_val, 31, &recip_shift);
136     const int headroom = CountLeadingSignBits(input1_val);
137     const int32_t unscaled_quotient =
138         MultiplyByQuantizedMultiplierGreaterThanOne(input1_val, input2_inv,
139                                                     headroom);
140     const int total_shift = params.output_shift - recip_shift - headroom;
141     const int32_t unclamped_result =
142         params.output_offset +
143         MultiplyByQuantizedMultiplierSmallerThanOneExp(
144             unscaled_quotient, params.output_multiplier, total_shift);
145     const int32_t clamped_output =
146         std::min(params.quantized_activation_max,
147                  std::max(params.quantized_activation_min, unclamped_result));
148     output_data[SubscriptToIndex(output_desc, indexes)] =
149         static_cast<T>(clamped_output);
150   };
151   NDOpsHelper<N>(output_desc, div_func);
152 }
153 
154 template <int N = 5>
BroadcastDivSlow(const ArithmeticParams & params,const RuntimeShape & unextended_input1_shape,const uint8_t * input1_data,const RuntimeShape & unextended_input2_shape,const uint8_t * input2_data,const RuntimeShape & unextended_output_shape,uint8_t * output_data)155 inline void BroadcastDivSlow(const ArithmeticParams& params,
156                              const RuntimeShape& unextended_input1_shape,
157                              const uint8_t* input1_data,
158                              const RuntimeShape& unextended_input2_shape,
159                              const uint8_t* input2_data,
160                              const RuntimeShape& unextended_output_shape,
161                              uint8_t* output_data) {
162   BroadcastDivSlowQuantized<uint8_t, N>(
163       params, unextended_input1_shape, input1_data, unextended_input2_shape,
164       input2_data, unextended_output_shape, output_data);
165 }
166 
167 template <int N = 5>
BroadcastDivSlow(const ArithmeticParams & params,const RuntimeShape & unextended_input1_shape,const int8_t * input1_data,const RuntimeShape & unextended_input2_shape,const int8_t * input2_data,const RuntimeShape & unextended_output_shape,int8_t * output_data)168 inline void BroadcastDivSlow(const ArithmeticParams& params,
169                              const RuntimeShape& unextended_input1_shape,
170                              const int8_t* input1_data,
171                              const RuntimeShape& unextended_input2_shape,
172                              const int8_t* input2_data,
173                              const RuntimeShape& unextended_output_shape,
174                              int8_t* output_data) {
175   BroadcastDivSlowQuantized<int8_t, N>(
176       params, unextended_input1_shape, input1_data, unextended_input2_shape,
177       input2_data, unextended_output_shape, output_data);
178 }
179 
180 // TODO(jiawen): We can implement BroadcastDiv on buffers of arbitrary
181 // dimensionality if the runtime code does a single loop over one dimension
182 // that handles broadcasting as the base case. The code generator would then
183 // generate max(D1, D2) nested for loops.
184 template <typename T, int N = 5>
BroadcastDivSlow(const ArithmeticParams & params,const RuntimeShape & unextended_input1_shape,const T * input1_data,const RuntimeShape & unextended_input2_shape,const T * input2_data,const RuntimeShape & unextended_output_shape,T * output_data)185 void BroadcastDivSlow(const ArithmeticParams& params,
186                       const RuntimeShape& unextended_input1_shape,
187                       const T* input1_data,
188                       const RuntimeShape& unextended_input2_shape,
189                       const T* input2_data,
190                       const RuntimeShape& unextended_output_shape,
191                       T* output_data) {
192   T output_activation_min;
193   T output_activation_max;
194   GetActivationParams(params, &output_activation_min, &output_activation_max);
195 
196   TFLITE_DCHECK_LE(unextended_input1_shape.DimensionsCount(), N);
197   TFLITE_DCHECK_LE(unextended_input2_shape.DimensionsCount(), N);
198   TFLITE_DCHECK_LE(unextended_output_shape.DimensionsCount(), N);
199 
200   NdArrayDesc<N> desc1;
201   NdArrayDesc<N> desc2;
202   NdArrayDesc<N> output_desc;
203   NdArrayDescsForElementwiseBroadcast(unextended_input1_shape,
204                                       unextended_input2_shape, &desc1, &desc2);
205   CopyDimsToDesc(RuntimeShape::ExtendedShape(N, unextended_output_shape),
206                  &output_desc);
207 
208   // In Tensorflow, the dimensions are canonically named (batch_number, row,
209   // col, channel), with extents (batches, height, width, depth), with the
210   // trailing dimension changing most rapidly (channels has the smallest
211   // stride, typically 1 element).
212   //
213   // In generated C code, we store arrays with the dimensions reversed. The
214   // first dimension has smallest stride.
215 
216   auto div_func = [&](int indexes[N]) {
217     output_data[SubscriptToIndex(output_desc, indexes)] =
218         ActivationFunctionWithMinMax(
219             input1_data[SubscriptToIndex(desc1, indexes)] /
220                 input2_data[SubscriptToIndex(desc2, indexes)],
221             output_activation_min, output_activation_max);
222   };
223   NDOpsHelper<N>(output_desc, div_func);
224 }
225 
226 template <typename T>
Div(const ArithmeticParams & params,const RuntimeShape & input1_shape,const T * input1_data,const RuntimeShape & input2_shape,const T * input2_data,const RuntimeShape & output_shape,T * output_data)227 inline void Div(const ArithmeticParams& params,
228                 const RuntimeShape& input1_shape, const T* input1_data,
229                 const RuntimeShape& input2_shape, const T* input2_data,
230                 const RuntimeShape& output_shape, T* output_data) {
231   T output_activation_min;
232   T output_activation_max;
233   GetActivationParams(params, &output_activation_min, &output_activation_max);
234 
235   const int flat_size =
236       MatchingElementsSize(input1_shape, input2_shape, output_shape);
237   for (int i = 0; i < flat_size; ++i) {
238     output_data[i] = ActivationFunctionWithMinMax(
239         input1_data[i] / input2_data[i], output_activation_min,
240         output_activation_max);
241   }
242 }
243 
244 }  // namespace reference_ops
245 }  // namespace tflite
246 
247 #endif  // TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_DIV_H_
248