/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_SUB_H_
#define TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_SUB_H_

#include <stdint.h>

#include <algorithm>
#include <limits>

#include "ruy/profiler/instrumentation.h"  // from @ruy
#include "tensorflow/lite/kernels/internal/common.h"
#include "tensorflow/lite/kernels/internal/compatibility.h"
#include "tensorflow/lite/kernels/internal/types.h"

namespace tflite {

namespace reference_ops {

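// Element-wise subtraction for equal-shaped tensors: computes
// input1 - input2 and clamps the result to the activation range carried in
// `params` (float or quantized bounds, depending on the overload).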
inline void SubNonBroadcast(const ArithmeticParams& params,
                            const RuntimeShape& input1_shape,
                            const float* input1_data,
                            const RuntimeShape& input2_shape,
                            const float* input2_data,
                            const RuntimeShape& output_shape,
                            float* output_data) {
  const int flat_size =
      MatchingElementsSize(input1_shape, input2_shape, output_shape);
  for (int i = 0; i < flat_size; ++i) {
    output_data[i] = ActivationFunctionWithMinMax(
        input1_data[i] - input2_data[i], params.float_activation_min,
        params.float_activation_max);
  }
}

inline void SubNonBroadcast(const ArithmeticParams& params,
                            const RuntimeShape& input1_shape,
                            const int32_t* input1_data,
                            const RuntimeShape& input2_shape,
                            const int32_t* input2_data,
                            const RuntimeShape& output_shape,
                            int32_t* output_data) {
  const int flat_size =
      MatchingElementsSize(input1_shape, input2_shape, output_shape);
  for (int i = 0; i < flat_size; ++i) {
    output_data[i] = ActivationFunctionWithMinMax(
        input1_data[i] - input2_data[i], params.quantized_activation_min,
        params.quantized_activation_max);
  }
}

// TODO(b/151345304): We can implement BroadcastSub on buffers of arbitrary
// dimensionality if the runtime code does a single loop over one dimension
// that handles broadcasting as the base case. The code generator would then
// generate max(D1, D2) nested for loops.
template <int N = 5>
inline void BroadcastSubSlow(const ArithmeticParams& params,
                             const RuntimeShape& input1_shape,
                             const float* input1_data,
                             const RuntimeShape& input2_shape,
                             const float* input2_data,
                             const RuntimeShape& output_shape,
                             float* output_data) {
  ruy::profiler::ScopeLabel label("BroadcastSubSlow/float");
  TFLITE_DCHECK_LE(input1_shape.DimensionsCount(), N);
  TFLITE_DCHECK_LE(input2_shape.DimensionsCount(), N);
  TFLITE_DCHECK_LE(output_shape.DimensionsCount(), N);
  NdArrayDesc<N> desc1;
  NdArrayDesc<N> desc2;
  NdArrayDesc<N> output_desc;
  NdArrayDescsForElementwiseBroadcast(input1_shape, input2_shape, &desc1,
                                      &desc2);
  CopyDimsToDesc(RuntimeShape::ExtendedShape(N, output_shape), &output_desc);

  // In Tensorflow, the dimensions are canonically named (batch_number, row,
  // col, channel), with extents (batches, height, width, depth), with the
  // trailing dimension changing most rapidly (channels has the smallest stride,
  // typically 1 element).
  //
  // In generated C code, we store arrays with the dimensions reversed. The
  // first dimension has smallest stride.
  //
  // We name our variables by their Tensorflow convention, but generate C code
  // nesting loops such that the innermost loop has the smallest stride for the
  // best cache behavior.
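  //
  // Illustrative example (not exercised by this code): broadcasting input1 of
  // shape [2, 1, 3] against input2 of shape [1, 4, 1] produces an output of
  // shape [2, 4, 3]; the broadcast descriptors give the size-1 dimensions a
  // stride of 0, so SubscriptToIndex reuses the same element along those axes.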
  auto sub_func = [&](int indexes[N]) {
    output_data[SubscriptToIndex(output_desc, indexes)] =
        ActivationFunctionWithMinMax(
            input1_data[SubscriptToIndex(desc1, indexes)] -
                input2_data[SubscriptToIndex(desc2, indexes)],
            params.float_activation_min, params.float_activation_max);
  };
  NDOpsHelper<N>(output_desc, sub_func);
}

template <int N = 5>
inline void BroadcastSubSlow(const ArithmeticParams& params,
                             const RuntimeShape& input1_shape,
                             const int32_t* input1_data,
                             const RuntimeShape& input2_shape,
                             const int32_t* input2_data,
                             const RuntimeShape& output_shape,
                             int32_t* output_data) {
  ruy::profiler::ScopeLabel label("BroadcastSubSlow/int32_t");
  TFLITE_DCHECK_LE(input1_shape.DimensionsCount(), N);
  TFLITE_DCHECK_LE(input2_shape.DimensionsCount(), N);
  TFLITE_DCHECK_LE(output_shape.DimensionsCount(), N);
  NdArrayDesc<N> desc1;
  NdArrayDesc<N> desc2;
  NdArrayDesc<N> output_desc;
  NdArrayDescsForElementwiseBroadcast(input1_shape, input2_shape, &desc1,
                                      &desc2);
  CopyDimsToDesc(RuntimeShape::ExtendedShape(N, output_shape), &output_desc);

  // In Tensorflow, the dimensions are canonically named (batch_number, row,
  // col, channel), with extents (batches, height, width, depth), with the
  // trailing dimension changing most rapidly (channels has the smallest stride,
  // typically 1 element).
  //
  // In generated C code, we store arrays with the dimensions reversed. The
  // first dimension has smallest stride.
  //
  // We name our variables by their Tensorflow convention, but generate C code
  // nesting loops such that the innermost loop has the smallest stride for the
  // best cache behavior.
  auto sub_func = [&](int indexes[N]) {
    output_data[SubscriptToIndex(output_desc, indexes)] =
        ActivationFunctionWithMinMax(
            input1_data[SubscriptToIndex(desc1, indexes)] -
                input2_data[SubscriptToIndex(desc2, indexes)],
            params.quantized_activation_min, params.quantized_activation_max);
  };
  NDOpsHelper<N>(output_desc, sub_func);
}

template <int N = 5>
void BroadcastSubSlow(const ArithmeticParams& params,
                      const RuntimeShape& input1_shape,
                      const int64_t* input1_data,
                      const RuntimeShape& input2_shape,
                      const int64_t* input2_data,
                      const RuntimeShape& output_shape, int64_t* output_data) {
  ruy::profiler::ScopeLabel label("BroadcastSubSlow/int64_t");
  TFLITE_DCHECK_LE(input1_shape.DimensionsCount(), N);
  TFLITE_DCHECK_LE(input2_shape.DimensionsCount(), N);
  TFLITE_DCHECK_LE(output_shape.DimensionsCount(), N);
  NdArrayDesc<N> desc1;
  NdArrayDesc<N> desc2;
  NdArrayDesc<N> output_desc;
  NdArrayDescsForElementwiseBroadcast(input1_shape, input2_shape, &desc1,
                                      &desc2);
  CopyDimsToDesc(RuntimeShape::ExtendedShape(N, output_shape), &output_desc);

  // In Tensorflow, the dimensions are canonically named (batch_number, row,
  // col, channel), with extents (batches, height, width, depth), with the
  // trailing dimension changing most rapidly (channels has the smallest stride,
  // typically 1 element).
  //
  // In generated C code, we store arrays with the dimensions reversed. The
  // first dimension has smallest stride.
  //
  // We name our variables by their Tensorflow convention, but generate C code
  // nesting loops such that the innermost loop has the smallest stride for the
  // best cache behavior.
  auto sub_func = [&](int indexes[N]) {
    output_data[SubscriptToIndex(output_desc, indexes)] =
        ActivationFunctionWithMinMax(
            input1_data[SubscriptToIndex(desc1, indexes)] -
                input2_data[SubscriptToIndex(desc2, indexes)],
            params.int64_activation_min, params.int64_activation_max);
  };
  NDOpsHelper<N>(output_desc, sub_func);
}

template <typename T, int N = 5>
void BroadcastSubSlow(const ArithmeticParams& params,
                      const RuntimeShape& input1_shape, const T* input1_data,
                      const RuntimeShape& input2_shape, const T* input2_data,
                      const RuntimeShape& output_shape, T* output_data) {
  ruy::profiler::ScopeLabel label("BroadcastSubSlow/templated");
  TFLITE_DCHECK_LE(input1_shape.DimensionsCount(), N);
  TFLITE_DCHECK_LE(input2_shape.DimensionsCount(), N);
  TFLITE_DCHECK_LE(output_shape.DimensionsCount(), N);
  NdArrayDesc<N> desc1;
  NdArrayDesc<N> desc2;
  NdArrayDesc<N> output_desc;
  NdArrayDescsForElementwiseBroadcast(input1_shape, input2_shape, &desc1,
                                      &desc2);
  CopyDimsToDesc(RuntimeShape::ExtendedShape(N, output_shape), &output_desc);

  // In Tensorflow, the dimensions are canonically named (batch_number, row,
  // col, channel), with extents (batches, height, width, depth), with the
  // trailing dimension changing most rapidly (channels has the smallest stride,
  // typically 1 element).
  //
  // In generated C code, we store arrays with the dimensions reversed. The
  // first dimension has smallest stride.
  //
  // We name our variables by their Tensorflow convention, but generate C code
  // nesting loops such that the innermost loop has the smallest stride for the
  // best cache behavior.
  auto sub_func = [&](int indexes[N]) {
    output_data[SubscriptToIndex(output_desc, indexes)] =
        ActivationFunctionWithMinMax(
            input1_data[SubscriptToIndex(desc1, indexes)] -
                input2_data[SubscriptToIndex(desc2, indexes)],
            params.quantized_activation_min, params.quantized_activation_max);
  };
  NDOpsHelper<N>(output_desc, sub_func);
}

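// Broadcast subtraction for 16-bit data whose input scales are related by
// powers of two (POT): each input is rescaled with a rounding right shift
// (params.input1_shift / params.input2_shift are non-positive exponents),
// subtracted, and clamped to the quantized activation range. No fixed-point
// multipliers or zero-point offsets are applied on this path.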
template <int N = 5>
inline void BroadcastSub16POTSlow(const ArithmeticParams& params,
                                  const RuntimeShape& input1_shape,
                                  const int16_t* input1_data,
                                  const RuntimeShape& input2_shape,
                                  const int16_t* input2_data,
                                  const RuntimeShape& output_shape,
                                  int16_t* output_data) {
  ruy::profiler::ScopeLabel label("BroadcastSub16POTSlow/int16_t");
  NdArrayDesc<N> desc1;
  NdArrayDesc<N> desc2;
  NdArrayDesc<N> output_desc;
  NdArrayDescsForElementwiseBroadcast(input1_shape, input2_shape, &desc1,
                                      &desc2);
  CopyDimsToDesc(RuntimeShape::ExtendedShape(N, output_shape), &output_desc);

  // In Tensorflow, the dimensions are canonically named (batch_number, row,
  // col, channel), with extents (batches, height, width, depth), with the
  // trailing dimension changing most rapidly (channels has the smallest stride,
  // typically 1 element).
  //
  // In generated C code, we store arrays with the dimensions reversed. The
  // first dimension has smallest stride.
  //
  // We name our variables by their Tensorflow convention, but generate C code
  // nesting loops such that the innermost loop has the smallest stride for the
  // best cache behavior.
  auto sub_func = [&](int indexes[N]) {
    const int32_t input1_val = input1_data[SubscriptToIndex(desc1, indexes)];
    const int32_t input2_val = input2_data[SubscriptToIndex(desc2, indexes)];
    const int32_t scaled_input1_val =
        gemmlowp::RoundingDivideByPOT(input1_val, -params.input1_shift);
    const int32_t scaled_input2_val =
        gemmlowp::RoundingDivideByPOT(input2_val, -params.input2_shift);
    const int32_t raw_output = scaled_input1_val - scaled_input2_val;
    const int32_t clamped_output =
        std::min(params.quantized_activation_max,
                 std::max(params.quantized_activation_min, raw_output));
    output_data[SubscriptToIndex(output_desc, indexes)] =
        static_cast<int16_t>(clamped_output);
  };
  NDOpsHelper<N>(output_desc, sub_func);
}

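// General quantized broadcast subtraction. Each quantized value q represents
// the real value scale * (q + offset), where the offset is the negated zero
// point. Both inputs are shifted left by params.left_shift for headroom,
// rescaled to a common scale with their fixed-point multipliers and shifts,
// subtracted, rescaled to the output scale, offset by params.output_offset,
// and clamped to the quantized activation range.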
template <typename T, int N = 5>
void BroadcastQuantSubSlow(const ArithmeticParams& params,
                           const RuntimeShape& input1_shape,
                           const T* input1_data,
                           const RuntimeShape& input2_shape,
                           const T* input2_data,
                           const RuntimeShape& output_shape, T* output_data) {
  ruy::profiler::ScopeLabel label("BroadcastQuantSubSlow/T");
  TFLITE_DCHECK_LE(input1_shape.DimensionsCount(), N);
  TFLITE_DCHECK_LE(input2_shape.DimensionsCount(), N);
  TFLITE_DCHECK_LE(output_shape.DimensionsCount(), N);
  NdArrayDesc<N> desc1;
  NdArrayDesc<N> desc2;
  NdArrayDesc<N> output_desc;
  NdArrayDescsForElementwiseBroadcast(input1_shape, input2_shape, &desc1,
                                      &desc2);
  CopyDimsToDesc(RuntimeShape::ExtendedShape(N, output_shape), &output_desc);

  // In Tensorflow, the dimensions are canonically named (batch_number, row,
  // col, channel), with extents (batches, height, width, depth), with the
  // trailing dimension changing most rapidly (channels has the smallest stride,
  // typically 1 element).
  //
  // In generated C code, we store arrays with the dimensions reversed. The
  // first dimension has smallest stride.
  //
  // We name our variables by their Tensorflow convention, but generate C code
  // nesting loops such that the innermost loop has the smallest stride for the
  // best cache behavior.
  auto sub_func = [&](int indexes[N]) {
    const int32_t input1_val =
        params.input1_offset + input1_data[SubscriptToIndex(desc1, indexes)];
    const int32_t input2_val =
        params.input2_offset + input2_data[SubscriptToIndex(desc2, indexes)];
    const int32_t shifted_input1_val = input1_val * (1 << params.left_shift);
    const int32_t shifted_input2_val = input2_val * (1 << params.left_shift);
    const int32_t scaled_input1_val =
        MultiplyByQuantizedMultiplierSmallerThanOneExp(
            shifted_input1_val, params.input1_multiplier, params.input1_shift);
    const int32_t scaled_input2_val =
        MultiplyByQuantizedMultiplierSmallerThanOneExp(
            shifted_input2_val, params.input2_multiplier, params.input2_shift);
    const int32_t raw_sub = scaled_input1_val - scaled_input2_val;
    const int32_t raw_output =
        MultiplyByQuantizedMultiplierSmallerThanOneExp(
            raw_sub, params.output_multiplier, params.output_shift) +
        params.output_offset;
    const int32_t clamped_output =
        std::min(params.quantized_activation_max,
                 std::max(params.quantized_activation_min, raw_output));
    output_data[SubscriptToIndex(output_desc, indexes)] =
        static_cast<T>(clamped_output);
  };
  NDOpsHelper<N>(output_desc, sub_func);
}

// Element-wise subtraction that can often be used for the inner loop of
// broadcast sub as well as the non-broadcast sub.
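// It applies the same fixed-point arithmetic as BroadcastQuantSubSlow above,
// but over flat, equal-sized buffers; the quantized Sub overloads below
// delegate to it.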
template <typename T>
inline void SubElementwise(int size, const ArithmeticParams& params,
                           const T* input1_data, const T* input2_data,
                           T* output_data) {
  for (int i = 0; i < size; ++i) {
    const int32_t input1_val = params.input1_offset + input1_data[i];
    const int32_t input2_val = params.input2_offset + input2_data[i];
    const int32_t shifted_input1_val = input1_val * (1 << params.left_shift);
    const int32_t shifted_input2_val = input2_val * (1 << params.left_shift);
    const int32_t scaled_input1_val =
        MultiplyByQuantizedMultiplierSmallerThanOneExp(
            shifted_input1_val, params.input1_multiplier, params.input1_shift);
    const int32_t scaled_input2_val =
        MultiplyByQuantizedMultiplierSmallerThanOneExp(
            shifted_input2_val, params.input2_multiplier, params.input2_shift);
    const int32_t raw_sub = scaled_input1_val - scaled_input2_val;
    const int32_t raw_output =
        MultiplyByQuantizedMultiplierSmallerThanOneExp(
            raw_sub, params.output_multiplier, params.output_shift) +
        params.output_offset;
    const int32_t clamped_output =
        std::min(params.quantized_activation_max,
                 std::max(params.quantized_activation_min, raw_output));
    output_data[i] = static_cast<T>(clamped_output);
  }
}

inline void Sub(const ArithmeticParams& params,
                const RuntimeShape& input1_shape, const uint8_t* input1_data,
                const RuntimeShape& input2_shape, const uint8_t* input2_data,
                const RuntimeShape& output_shape, uint8_t* output_data) {
  TFLITE_DCHECK_LE(params.quantized_activation_min,
                   params.quantized_activation_max);
  const int flat_size =
      MatchingElementsSize(input1_shape, input2_shape, output_shape);

  TFLITE_DCHECK_GT(params.input1_offset, -256);
  TFLITE_DCHECK_GT(params.input2_offset, -256);
  TFLITE_DCHECK_LT(params.input1_offset, 256);
  TFLITE_DCHECK_LT(params.input2_offset, 256);
  SubElementwise(flat_size, params, input1_data, input2_data, output_data);
}

inline void Sub(const ArithmeticParams& params,
                const RuntimeShape& input1_shape, const int8_t* input1_data,
                const RuntimeShape& input2_shape, const int8_t* input2_data,
                const RuntimeShape& output_shape, int8_t* output_data) {
  TFLITE_DCHECK_LE(params.quantized_activation_min,
                   params.quantized_activation_max);

  const int flat_size =
      MatchingElementsSize(input1_shape, input2_shape, output_shape);

  TFLITE_DCHECK_GE(params.input1_offset, -128);
  TFLITE_DCHECK_GE(params.input2_offset, -128);
  // offset = -quantization_params.zero_point in PrepareGeneralSubOp(), so its
  // maximum can be 128, not 127.
  TFLITE_DCHECK_LE(params.input1_offset, 128);
  TFLITE_DCHECK_LE(params.input2_offset, 128);
  SubElementwise(flat_size, params, input1_data, input2_data, output_data);
}

inline void Sub(const ArithmeticParams& params,
                const RuntimeShape& input1_shape, const int16_t* input1_data,
                const RuntimeShape& input2_shape, const int16_t* input2_data,
                const RuntimeShape& output_shape, int16_t* output_data) {
  TFLITE_DCHECK_LE(params.quantized_activation_min,
                   params.quantized_activation_max);

  const int flat_size =
      MatchingElementsSize(input1_shape, input2_shape, output_shape);

  TFLITE_DCHECK_EQ(params.input1_offset, 0);
  TFLITE_DCHECK_EQ(params.input2_offset, 0);
  SubElementwise(flat_size, params, input1_data, input2_data, output_data);
}

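// Generic broadcast subtraction over (up to) 4-D shapes. Unlike the overloads
// above, it applies no activation range and no quantization parameters; it
// simply writes input1 - input2 for each broadcast output position.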
template <typename T>
void Sub(const ArithmeticParams& params, const RuntimeShape& input1_shape,
         const T* input1_data, const RuntimeShape& input2_shape,
         const T* input2_data, const RuntimeShape& output_shape,
         T* output_data) {
  NdArrayDesc<4> desc1;
  NdArrayDesc<4> desc2;
  NdArrayDescsForElementwiseBroadcast(input1_shape, input2_shape, &desc1,
                                      &desc2);
  const RuntimeShape extended_output_shape =
      RuntimeShape::ExtendedShape(4, output_shape);

  // In Tensorflow, the dimensions are canonically named (batch_number, row,
  // col, channel), with extents (batches, height, width, depth), with the
  // trailing dimension changing most rapidly (channels has the smallest stride,
  // typically 1 element).
  //
  // In generated C code, we store arrays with the dimensions reversed. The
  // first dimension has smallest stride.
  //
  // We name our variables by their Tensorflow convention, but generate C code
  // nesting loops such that the innermost loop has the smallest stride for the
  // best cache behavior.
  for (int b = 0; b < extended_output_shape.Dims(0); ++b) {
    for (int y = 0; y < extended_output_shape.Dims(1); ++y) {
      for (int x = 0; x < extended_output_shape.Dims(2); ++x) {
        for (int c = 0; c < extended_output_shape.Dims(3); ++c) {
          output_data[Offset(extended_output_shape, b, y, x, c)] =
              input1_data[SubscriptToIndex(desc1, b, y, x, c)] -
              input2_data[SubscriptToIndex(desc2, b, y, x, c)];
        }
      }
    }
  }
}

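// Overloads that select the activation bounds matching the element type
// (quantized int32_t, float, or int64_t) so that SubWithActivation below can
// be written once for all of them.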
inline void SetActivationMinMax(const ArithmeticParams& params,
                                int32_t* activation_min,
                                int32_t* activation_max) {
  *activation_min = params.quantized_activation_min;
  *activation_max = params.quantized_activation_max;
}

inline void SetActivationMinMax(const ArithmeticParams& params,
                                float* activation_min, float* activation_max) {
  *activation_min = params.float_activation_min;
  *activation_max = params.float_activation_max;
}

inline void SetActivationMinMax(const ArithmeticParams& params,
                                int64_t* activation_min,
                                int64_t* activation_max) {
  *activation_min = params.int64_activation_min;
  *activation_max = params.int64_activation_max;
}

template <typename T>
inline void SubWithActivation(
    const ArithmeticParams& params, const RuntimeShape& input1_shape,
    const T* input1_data, const RuntimeShape& input2_shape,
    const T* input2_data, const RuntimeShape& output_shape, T* output_data) {
  ruy::profiler::ScopeLabel label("SubWithActivation");
  const int flat_size =
      MatchingElementsSize(input1_shape, input2_shape, output_shape);
  T activation_min, activation_max;
  SetActivationMinMax(params, &activation_min, &activation_max);

  for (int i = 0; i < flat_size; ++i) {
    output_data[i] = ActivationFunctionWithMinMax(
        input1_data[i] - input2_data[i], activation_min, activation_max);
  }
}
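
// Illustrative usage sketch (not part of the library): subtracting two
// equal-shaped float tensors with an effectively unbounded activation range.
//
//   ArithmeticParams params;
//   params.float_activation_min = std::numeric_limits<float>::lowest();
//   params.float_activation_max = std::numeric_limits<float>::max();
//   RuntimeShape shape({1, 2, 2, 1});
//   float a[4] = {1, 2, 3, 4};
//   float b[4] = {4, 3, 2, 1};
//   float out[4];
//   SubWithActivation(params, shape, a, shape, b, shape, out);
//   // out == {-3, -1, 1, 3}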

}  // namespace reference_ops
}  // namespace tflite

#endif  // TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_SUB_H_