1 /* Copyright 2020 The TensorFlow Authors. All Rights Reserved.
2
3 Licensed under the Apache License, Version 2.0 (the "License");
4 you may not use this file except in compliance with the License.
5 You may obtain a copy of the License at
6
7 http://www.apache.org/licenses/LICENSE-2.0
8
9 Unless required by applicable law or agreed to in writing, software
10 distributed under the License is distributed on an "AS IS" BASIS,
11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 See the License for the specific language governing permissions and
13 limitations under the License.
14 ==============================================================================*/
15 #ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_DIV_H_
16 #define TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_DIV_H_
17
#include <algorithm>
#include <cstdint>
#include <limits>

#include "tensorflow/lite/kernels/internal/common.h"
21
22 namespace tflite {
23
24 namespace reference_ops {
25
26 template <typename T>
DivCheckArithmeticParams(const ArithmeticParams & params)27 inline void DivCheckArithmeticParams(const ArithmeticParams& params) {
28 TFLITE_DCHECK_LE(params.quantized_activation_min,
29 params.quantized_activation_max);
30 // Input offset is negative input zero point. Activation tensors are
31 // asymmetric quantized so they span the full int8 range.
32 constexpr int32_t max_value =
33 static_cast<int32_t>(std::numeric_limits<T>::max());
34 TFLITE_DCHECK_GE(params.input1_offset, -max_value);
35 TFLITE_DCHECK_LE(params.input1_offset, max_value);
36 TFLITE_DCHECK_GE(params.input2_offset, -max_value);
37 TFLITE_DCHECK_LE(params.input2_offset, max_value);
38 TFLITE_DCHECK_GE(params.output_offset, -max_value);
39 TFLITE_DCHECK_LE(params.output_offset, max_value);
40 }
41
42 // Element-wise div that can often be used for inner loop of broadcast Div as
43 // well as the non-broadcast Div.
44 template <typename T>
DivElementwise(int size,const ArithmeticParams & params,const T * input1_data,const T * input2_data,T * output_data)45 inline void DivElementwise(int size, const ArithmeticParams& params,
46 const T* input1_data, const T* input2_data,
47 T* output_data) {
48 DivCheckArithmeticParams<T>(params);
49
50 for (int i = 0; i < size; ++i) {
51 int32_t input1_val = params.input1_offset + input1_data[i];
52 int32_t input2_val = params.input2_offset + input2_data[i];
53 TFLITE_DCHECK_NE(input2_val, 0);
54 if (input2_val < 0) {
55 // Invert signs to avoid a negative input2_val as input2_inv needs to be
56 // positive to be used as multiplier of MultiplyByQuantizedMultiplier.
57 input1_val = -input1_val;
58 input2_val = -input2_val;
59 }
60 int recip_shift;
61 const int32_t input2_inv = GetReciprocal(input2_val, 31, &recip_shift);
62 const int headroom = CountLeadingSignBits(input1_val);
63 const int32_t unscaled_quotient =
64 MultiplyByQuantizedMultiplierGreaterThanOne(input1_val, input2_inv,
65 headroom);
66 const int total_shift = params.output_shift - recip_shift - headroom;
67 const int32_t unclamped_result =
68 params.output_offset +
69 MultiplyByQuantizedMultiplierSmallerThanOneExp(
70 unscaled_quotient, params.output_multiplier, total_shift);
71 const int32_t clamped_output =
72 std::min(params.quantized_activation_max,
73 std::max(params.quantized_activation_min, unclamped_result));
74 output_data[i] = static_cast<T>(clamped_output);
75 }
76 }
77
Div(const ArithmeticParams & params,const RuntimeShape & input1_shape,const uint8_t * input1_data,const RuntimeShape & input2_shape,const uint8_t * input2_data,const RuntimeShape & output_shape,uint8_t * output_data)78 inline void Div(const ArithmeticParams& params,
79 const RuntimeShape& input1_shape, const uint8_t* input1_data,
80 const RuntimeShape& input2_shape, const uint8_t* input2_data,
81 const RuntimeShape& output_shape, uint8_t* output_data) {
82 TFLITE_DCHECK_LE(params.quantized_activation_min,
83 params.quantized_activation_max);
84 const int flat_size =
85 MatchingElementsSize(input1_shape, input2_shape, output_shape);
86
87 DivElementwise(flat_size, params, input1_data, input2_data, output_data);
88 }
89
Div(const ArithmeticParams & params,const RuntimeShape & input1_shape,const int8_t * input1_data,const RuntimeShape & input2_shape,const int8_t * input2_data,const RuntimeShape & output_shape,int8_t * output_data)90 inline void Div(const ArithmeticParams& params,
91 const RuntimeShape& input1_shape, const int8_t* input1_data,
92 const RuntimeShape& input2_shape, const int8_t* input2_data,
93 const RuntimeShape& output_shape, int8_t* output_data) {
94 TFLITE_DCHECK_LE(params.quantized_activation_min,
95 params.quantized_activation_max);
96 const int flat_size =
97 MatchingElementsSize(input1_shape, input2_shape, output_shape);
98
99 DivElementwise(flat_size, params, input1_data, input2_data, output_data);
100 }
101
102 template <typename T, int N = 5>
BroadcastDivSlowQuantized(const ArithmeticParams & params,const RuntimeShape & unextended_input1_shape,const T * input1_data,const RuntimeShape & unextended_input2_shape,const T * input2_data,const RuntimeShape & unextended_output_shape,T * output_data)103 inline void BroadcastDivSlowQuantized(
104 const ArithmeticParams& params, const RuntimeShape& unextended_input1_shape,
105 const T* input1_data, const RuntimeShape& unextended_input2_shape,
106 const T* input2_data, const RuntimeShape& unextended_output_shape,
107 T* output_data) {
108 TFLITE_DCHECK_LE(unextended_input1_shape.DimensionsCount(), N);
109 TFLITE_DCHECK_LE(unextended_input2_shape.DimensionsCount(), N);
110 TFLITE_DCHECK_LE(unextended_output_shape.DimensionsCount(), N);
111
112 NdArrayDesc<N> desc1;
113 NdArrayDesc<N> desc2;
114 NdArrayDesc<N> output_desc;
115 NdArrayDescsForElementwiseBroadcast(unextended_input1_shape,
116 unextended_input2_shape, &desc1, &desc2);
117 CopyDimsToDesc(RuntimeShape::ExtendedShape(N, unextended_output_shape),
118 &output_desc);
119
120 DivCheckArithmeticParams<T>(params);
121
122 auto div_func = [&](int indexes[N]) {
123 int32_t input1_val =
124 params.input1_offset + input1_data[SubscriptToIndex(desc1, indexes)];
125 int32_t input2_val =
126 params.input2_offset + input2_data[SubscriptToIndex(desc2, indexes)];
127 TFLITE_DCHECK_NE(input2_val, 0);
128 if (input2_val < 0) {
129 // Invert signs to avoid a negative input2_val as input2_inv needs to be
130 // positive to be used as multiplier of MultiplyByQuantizedMultiplier.
131 input1_val = -input1_val;
132 input2_val = -input2_val;
133 }
134 int recip_shift;
135 const int32_t input2_inv = GetReciprocal(input2_val, 31, &recip_shift);
136 const int headroom = CountLeadingSignBits(input1_val);
137 const int32_t unscaled_quotient =
138 MultiplyByQuantizedMultiplierGreaterThanOne(input1_val, input2_inv,
139 headroom);
140 const int total_shift = params.output_shift - recip_shift - headroom;
141 const int32_t unclamped_result =
142 params.output_offset +
143 MultiplyByQuantizedMultiplierSmallerThanOneExp(
144 unscaled_quotient, params.output_multiplier, total_shift);
145 const int32_t clamped_output =
146 std::min(params.quantized_activation_max,
147 std::max(params.quantized_activation_min, unclamped_result));
148 output_data[SubscriptToIndex(output_desc, indexes)] =
149 static_cast<T>(clamped_output);
150 };
151 NDOpsHelper<N>(output_desc, div_func);
152 }
153
154 template <int N = 5>
BroadcastDivSlow(const ArithmeticParams & params,const RuntimeShape & unextended_input1_shape,const uint8_t * input1_data,const RuntimeShape & unextended_input2_shape,const uint8_t * input2_data,const RuntimeShape & unextended_output_shape,uint8_t * output_data)155 inline void BroadcastDivSlow(const ArithmeticParams& params,
156 const RuntimeShape& unextended_input1_shape,
157 const uint8_t* input1_data,
158 const RuntimeShape& unextended_input2_shape,
159 const uint8_t* input2_data,
160 const RuntimeShape& unextended_output_shape,
161 uint8_t* output_data) {
162 BroadcastDivSlowQuantized<uint8_t, N>(
163 params, unextended_input1_shape, input1_data, unextended_input2_shape,
164 input2_data, unextended_output_shape, output_data);
165 }
166
167 template <int N = 5>
BroadcastDivSlow(const ArithmeticParams & params,const RuntimeShape & unextended_input1_shape,const int8_t * input1_data,const RuntimeShape & unextended_input2_shape,const int8_t * input2_data,const RuntimeShape & unextended_output_shape,int8_t * output_data)168 inline void BroadcastDivSlow(const ArithmeticParams& params,
169 const RuntimeShape& unextended_input1_shape,
170 const int8_t* input1_data,
171 const RuntimeShape& unextended_input2_shape,
172 const int8_t* input2_data,
173 const RuntimeShape& unextended_output_shape,
174 int8_t* output_data) {
175 BroadcastDivSlowQuantized<int8_t, N>(
176 params, unextended_input1_shape, input1_data, unextended_input2_shape,
177 input2_data, unextended_output_shape, output_data);
178 }
179
180 // TODO(jiawen): We can implement BroadcastDiv on buffers of arbitrary
181 // dimensionality if the runtime code does a single loop over one dimension
182 // that handles broadcasting as the base case. The code generator would then
183 // generate max(D1, D2) nested for loops.
184 template <typename T, int N = 5>
BroadcastDivSlow(const ArithmeticParams & params,const RuntimeShape & unextended_input1_shape,const T * input1_data,const RuntimeShape & unextended_input2_shape,const T * input2_data,const RuntimeShape & unextended_output_shape,T * output_data)185 void BroadcastDivSlow(const ArithmeticParams& params,
186 const RuntimeShape& unextended_input1_shape,
187 const T* input1_data,
188 const RuntimeShape& unextended_input2_shape,
189 const T* input2_data,
190 const RuntimeShape& unextended_output_shape,
191 T* output_data) {
192 T output_activation_min;
193 T output_activation_max;
194 GetActivationParams(params, &output_activation_min, &output_activation_max);
195
196 TFLITE_DCHECK_LE(unextended_input1_shape.DimensionsCount(), N);
197 TFLITE_DCHECK_LE(unextended_input2_shape.DimensionsCount(), N);
198 TFLITE_DCHECK_LE(unextended_output_shape.DimensionsCount(), N);
199
200 NdArrayDesc<N> desc1;
201 NdArrayDesc<N> desc2;
202 NdArrayDesc<N> output_desc;
203 NdArrayDescsForElementwiseBroadcast(unextended_input1_shape,
204 unextended_input2_shape, &desc1, &desc2);
205 CopyDimsToDesc(RuntimeShape::ExtendedShape(N, unextended_output_shape),
206 &output_desc);
207
208 // In Tensorflow, the dimensions are canonically named (batch_number, row,
209 // col, channel), with extents (batches, height, width, depth), with the
210 // trailing dimension changing most rapidly (channels has the smallest
211 // stride, typically 1 element).
212 //
213 // In generated C code, we store arrays with the dimensions reversed. The
214 // first dimension has smallest stride.
215
216 auto div_func = [&](int indexes[N]) {
217 output_data[SubscriptToIndex(output_desc, indexes)] =
218 ActivationFunctionWithMinMax(
219 input1_data[SubscriptToIndex(desc1, indexes)] /
220 input2_data[SubscriptToIndex(desc2, indexes)],
221 output_activation_min, output_activation_max);
222 };
223 NDOpsHelper<N>(output_desc, div_func);
224 }
225
226 template <typename T>
Div(const ArithmeticParams & params,const RuntimeShape & input1_shape,const T * input1_data,const RuntimeShape & input2_shape,const T * input2_data,const RuntimeShape & output_shape,T * output_data)227 inline void Div(const ArithmeticParams& params,
228 const RuntimeShape& input1_shape, const T* input1_data,
229 const RuntimeShape& input2_shape, const T* input2_data,
230 const RuntimeShape& output_shape, T* output_data) {
231 T output_activation_min;
232 T output_activation_max;
233 GetActivationParams(params, &output_activation_min, &output_activation_max);
234
235 const int flat_size =
236 MatchingElementsSize(input1_shape, input2_shape, output_shape);
237 for (int i = 0; i < flat_size; ++i) {
238 output_data[i] = ActivationFunctionWithMinMax(
239 input1_data[i] / input2_data[i], output_activation_min,
240 output_activation_max);
241 }
242 }
243
244 } // namespace reference_ops
245 } // namespace tflite
246
247 #endif // TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_DIV_H_
248