/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_SUB_H_
#define TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_SUB_H_

#include <stdint.h>

#include <algorithm>
#include <limits>

#include "ruy/profiler/instrumentation.h"  // from @ruy
#include "tensorflow/lite/kernels/internal/common.h"
#include "tensorflow/lite/kernels/internal/compatibility.h"
#include "tensorflow/lite/kernels/internal/types.h"

namespace tflite {

namespace reference_ops {

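// Element-wise subtraction without broadcasting: all three shapes must hold
// the same number of elements. The float overload clamps with the float
// activation range; the int32_t overload below uses the quantized range.
//
// A minimal usage sketch (assumes an unbounded activation range and that the
// buffers `in1`, `in2`, `out` all match `shape`):
//   ArithmeticParams params;
//   SetActivationParams(std::numeric_limits<float>::lowest(),
//                       std::numeric_limits<float>::max(), &params);
//   SubNonBroadcast(params, shape, in1, shape, in2, shape, out);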
inline void SubNonBroadcast(const ArithmeticParams& params,
                            const RuntimeShape& input1_shape,
                            const float* input1_data,
                            const RuntimeShape& input2_shape,
                            const float* input2_data,
                            const RuntimeShape& output_shape,
                            float* output_data) {
  const int flat_size =
      MatchingElementsSize(input1_shape, input2_shape, output_shape);
  for (int i = 0; i < flat_size; ++i) {
    output_data[i] = ActivationFunctionWithMinMax(
        input1_data[i] - input2_data[i], params.float_activation_min,
        params.float_activation_max);
  }
}

inline void SubNonBroadcast(const ArithmeticParams& params,
                            const RuntimeShape& input1_shape,
                            const int32_t* input1_data,
                            const RuntimeShape& input2_shape,
                            const int32_t* input2_data,
                            const RuntimeShape& output_shape,
                            int32_t* output_data) {
  const int flat_size =
      MatchingElementsSize(input1_shape, input2_shape, output_shape);
  for (int i = 0; i < flat_size; ++i) {
    output_data[i] = ActivationFunctionWithMinMax(
        input1_data[i] - input2_data[i], params.quantized_activation_min,
        params.quantized_activation_max);
  }
}

// TODO(b/151345304): We can implement BroadcastSub on buffers of arbitrary
// dimensionality if the runtime code does a single loop over one dimension
// that handles broadcasting as the base case. The code generator would then
// generate max(D1, D2) nested for loops.
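// The BroadcastSubSlow overloads compute output = input1 - input2 over the
// extended N-D output shape: NdArrayDescsForElementwiseBroadcast builds
// per-input descriptors (with stride 0 along broadcast dimensions), and
// NDOpsHelper walks every output index, mapping it back into each input via
// SubscriptToIndex before clamping with the appropriate activation range.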
template <int N = 5>
inline void BroadcastSubSlow(const ArithmeticParams& params,
                             const RuntimeShape& input1_shape,
                             const float* input1_data,
                             const RuntimeShape& input2_shape,
                             const float* input2_data,
                             const RuntimeShape& output_shape,
                             float* output_data) {
  ruy::profiler::ScopeLabel label("BroadcastSubSlow/float");
  TFLITE_DCHECK_LE(input1_shape.DimensionsCount(), N);
  TFLITE_DCHECK_LE(input2_shape.DimensionsCount(), N);
  TFLITE_DCHECK_LE(output_shape.DimensionsCount(), N);
  NdArrayDesc<N> desc1;
  NdArrayDesc<N> desc2;
  NdArrayDesc<N> output_desc;
  NdArrayDescsForElementwiseBroadcast(input1_shape, input2_shape, &desc1,
                                      &desc2);
  CopyDimsToDesc(RuntimeShape::ExtendedShape(N, output_shape), &output_desc);

  // In Tensorflow, the dimensions are canonically named (batch_number, row,
  // col, channel), with extents (batches, height, width, depth), with the
  // trailing dimension changing most rapidly (channels has the smallest stride,
  // typically 1 element).
  //
  // In generated C code, we store arrays with the dimensions reversed. The
  // first dimension has smallest stride.
  //
  // We name our variables by their Tensorflow convention, but generate C code
  // nesting loops such that the innermost loop has the smallest stride for the
  // best cache behavior.
  auto sub_func = [&](int indexes[N]) {
    output_data[SubscriptToIndex(output_desc, indexes)] =
        ActivationFunctionWithMinMax(
            input1_data[SubscriptToIndex(desc1, indexes)] -
                input2_data[SubscriptToIndex(desc2, indexes)],
            params.float_activation_min, params.float_activation_max);
  };
  NDOpsHelper<N>(output_desc, sub_func);
}

template <int N = 5>
inline void BroadcastSubSlow(const ArithmeticParams& params,
                             const RuntimeShape& input1_shape,
                             const int32_t* input1_data,
                             const RuntimeShape& input2_shape,
                             const int32_t* input2_data,
                             const RuntimeShape& output_shape,
                             int32_t* output_data) {
  ruy::profiler::ScopeLabel label("BroadcastSubSlow/int32_t");
  TFLITE_DCHECK_LE(input1_shape.DimensionsCount(), N);
  TFLITE_DCHECK_LE(input2_shape.DimensionsCount(), N);
  TFLITE_DCHECK_LE(output_shape.DimensionsCount(), N);
  NdArrayDesc<N> desc1;
  NdArrayDesc<N> desc2;
  NdArrayDesc<N> output_desc;
  NdArrayDescsForElementwiseBroadcast(input1_shape, input2_shape, &desc1,
                                      &desc2);
  CopyDimsToDesc(RuntimeShape::ExtendedShape(N, output_shape), &output_desc);

  // In Tensorflow, the dimensions are canonically named (batch_number, row,
  // col, channel), with extents (batches, height, width, depth), with the
  // trailing dimension changing most rapidly (channels has the smallest stride,
  // typically 1 element).
  //
  // In generated C code, we store arrays with the dimensions reversed. The
  // first dimension has smallest stride.
  //
  // We name our variables by their Tensorflow convention, but generate C code
  // nesting loops such that the innermost loop has the smallest stride for the
  // best cache behavior.
  auto sub_func = [&](int indexes[N]) {
    output_data[SubscriptToIndex(output_desc, indexes)] =
        ActivationFunctionWithMinMax(
            input1_data[SubscriptToIndex(desc1, indexes)] -
                input2_data[SubscriptToIndex(desc2, indexes)],
            params.quantized_activation_min, params.quantized_activation_max);
  };
  NDOpsHelper<N>(output_desc, sub_func);
}

template <int N = 5>
void BroadcastSubSlow(const ArithmeticParams& params,
                      const RuntimeShape& input1_shape,
                      const int64_t* input1_data,
                      const RuntimeShape& input2_shape,
                      const int64_t* input2_data,
                      const RuntimeShape& output_shape, int64_t* output_data) {
  ruy::profiler::ScopeLabel label("BroadcastSubSlow/int64_t");
  TFLITE_DCHECK_LE(input1_shape.DimensionsCount(), N);
  TFLITE_DCHECK_LE(input2_shape.DimensionsCount(), N);
  TFLITE_DCHECK_LE(output_shape.DimensionsCount(), N);
  NdArrayDesc<N> desc1;
  NdArrayDesc<N> desc2;
  NdArrayDesc<N> output_desc;
  NdArrayDescsForElementwiseBroadcast(input1_shape, input2_shape, &desc1,
                                      &desc2);
  CopyDimsToDesc(RuntimeShape::ExtendedShape(N, output_shape), &output_desc);

  // In Tensorflow, the dimensions are canonically named (batch_number, row,
  // col, channel), with extents (batches, height, width, depth), with the
  // trailing dimension changing most rapidly (channels has the smallest stride,
  // typically 1 element).
  //
  // In generated C code, we store arrays with the dimensions reversed. The
  // first dimension has smallest stride.
  //
  // We name our variables by their Tensorflow convention, but generate C code
  // nesting loops such that the innermost loop has the smallest stride for the
  // best cache behavior.
  auto sub_func = [&](int indexes[N]) {
    output_data[SubscriptToIndex(output_desc, indexes)] =
        ActivationFunctionWithMinMax(
            input1_data[SubscriptToIndex(desc1, indexes)] -
                input2_data[SubscriptToIndex(desc2, indexes)],
            params.int64_activation_min, params.int64_activation_max);
  };
  NDOpsHelper<N>(output_desc, sub_func);
}

template <typename T, int N = 5>
void BroadcastSubSlow(const ArithmeticParams& params,
                      const RuntimeShape& input1_shape, const T* input1_data,
                      const RuntimeShape& input2_shape, const T* input2_data,
                      const RuntimeShape& output_shape, T* output_data) {
  ruy::profiler::ScopeLabel label("BroadcastSubSlow/templated");
  TFLITE_DCHECK_LE(input1_shape.DimensionsCount(), N);
  TFLITE_DCHECK_LE(input2_shape.DimensionsCount(), N);
  TFLITE_DCHECK_LE(output_shape.DimensionsCount(), N);
  NdArrayDesc<N> desc1;
  NdArrayDesc<N> desc2;
  NdArrayDesc<N> output_desc;
  NdArrayDescsForElementwiseBroadcast(input1_shape, input2_shape, &desc1,
                                      &desc2);
  CopyDimsToDesc(RuntimeShape::ExtendedShape(N, output_shape), &output_desc);

  // In Tensorflow, the dimensions are canonically named (batch_number, row,
  // col, channel), with extents (batches, height, width, depth), with the
  // trailing dimension changing most rapidly (channels has the smallest stride,
  // typically 1 element).
  //
  // In generated C code, we store arrays with the dimensions reversed. The
  // first dimension has smallest stride.
  //
  // We name our variables by their Tensorflow convention, but generate C code
  // nesting loops such that the innermost loop has the smallest stride for the
  // best cache behavior.
  auto sub_func = [&](int indexes[N]) {
    output_data[SubscriptToIndex(output_desc, indexes)] =
        ActivationFunctionWithMinMax(
            input1_data[SubscriptToIndex(desc1, indexes)] -
                input2_data[SubscriptToIndex(desc2, indexes)],
            params.quantized_activation_min, params.quantized_activation_max);
  };
  NDOpsHelper<N>(output_desc, sub_func);
}

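// 16-bit broadcast subtraction for inputs whose scales are related by powers
// of two (POT): each input is rescaled with gemmlowp::RoundingDivideByPOT
// using the negated input shifts, then the difference is clamped to the
// quantized activation range.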
template <int N = 5>
inline void BroadcastSub16POTSlow(const ArithmeticParams& params,
                                  const RuntimeShape& input1_shape,
                                  const int16_t* input1_data,
                                  const RuntimeShape& input2_shape,
                                  const int16_t* input2_data,
                                  const RuntimeShape& output_shape,
                                  int16_t* output_data) {
  ruy::profiler::ScopeLabel label("BroadcastSub16POTSlow/int16_t");
  NdArrayDesc<N> desc1;
  NdArrayDesc<N> desc2;
  NdArrayDesc<N> output_desc;
  NdArrayDescsForElementwiseBroadcast(input1_shape, input2_shape, &desc1,
                                      &desc2);
  CopyDimsToDesc(RuntimeShape::ExtendedShape(N, output_shape), &output_desc);

  // In Tensorflow, the dimensions are canonically named (batch_number, row,
  // col, channel), with extents (batches, height, width, depth), with the
  // trailing dimension changing most rapidly (channels has the smallest stride,
  // typically 1 element).
  //
  // In generated C code, we store arrays with the dimensions reversed. The
  // first dimension has smallest stride.
  //
  // We name our variables by their Tensorflow convention, but generate C code
  // nesting loops such that the innermost loop has the smallest stride for the
  // best cache behavior.
  auto sub_func = [&](int indexes[N]) {
    const int32_t input1_val = input1_data[SubscriptToIndex(desc1, indexes)];
    const int32_t input2_val = input2_data[SubscriptToIndex(desc2, indexes)];
    const int32_t scaled_input1_val =
        gemmlowp::RoundingDivideByPOT(input1_val, -params.input1_shift);
    const int32_t scaled_input2_val =
        gemmlowp::RoundingDivideByPOT(input2_val, -params.input2_shift);
    const int32_t raw_output = scaled_input1_val - scaled_input2_val;
    const int32_t clamped_output =
        std::min(params.quantized_activation_max,
                 std::max(params.quantized_activation_min, raw_output));
    output_data[SubscriptToIndex(output_desc, indexes)] =
        static_cast<int16_t>(clamped_output);
  };
  NDOpsHelper<N>(output_desc, sub_func);
}

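// Broadcast subtraction on general (non power-of-two scaled) quantized
// inputs: the zero-point offsets are added, both operands are left-shifted
// and rescaled with their input multipliers, subtracted, rescaled with the
// output multiplier, re-offset, and clamped to the quantized activation range.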
template <typename T, int N = 5>
void BroadcastQuantSubSlow(const ArithmeticParams& params,
                           const RuntimeShape& input1_shape,
                           const T* input1_data,
                           const RuntimeShape& input2_shape,
                           const T* input2_data,
                           const RuntimeShape& output_shape, T* output_data) {
  ruy::profiler::ScopeLabel label("BroadcastQuantSubSlow/T");
  TFLITE_DCHECK_LE(input1_shape.DimensionsCount(), N);
  TFLITE_DCHECK_LE(input2_shape.DimensionsCount(), N);
  TFLITE_DCHECK_LE(output_shape.DimensionsCount(), N);
  NdArrayDesc<N> desc1;
  NdArrayDesc<N> desc2;
  NdArrayDesc<N> output_desc;
  NdArrayDescsForElementwiseBroadcast(input1_shape, input2_shape, &desc1,
                                      &desc2);
  CopyDimsToDesc(RuntimeShape::ExtendedShape(N, output_shape), &output_desc);

  // In Tensorflow, the dimensions are canonically named (batch_number, row,
  // col, channel), with extents (batches, height, width, depth), with the
  // trailing dimension changing most rapidly (channels has the smallest stride,
  // typically 1 element).
  //
  // In generated C code, we store arrays with the dimensions reversed. The
  // first dimension has smallest stride.
  //
  // We name our variables by their Tensorflow convention, but generate C code
  // nesting loops such that the innermost loop has the smallest stride for the
  // best cache behavior.
  auto sub_func = [&](int indexes[N]) {
    const int32_t input1_val =
        params.input1_offset + input1_data[SubscriptToIndex(desc1, indexes)];
    const int32_t input2_val =
        params.input2_offset + input2_data[SubscriptToIndex(desc2, indexes)];
    const int32_t shifted_input1_val = input1_val * (1 << params.left_shift);
    const int32_t shifted_input2_val = input2_val * (1 << params.left_shift);
    const int32_t scaled_input1_val =
        MultiplyByQuantizedMultiplierSmallerThanOneExp(
            shifted_input1_val, params.input1_multiplier, params.input1_shift);
    const int32_t scaled_input2_val =
        MultiplyByQuantizedMultiplierSmallerThanOneExp(
            shifted_input2_val, params.input2_multiplier, params.input2_shift);
    const int32_t raw_sub = scaled_input1_val - scaled_input2_val;
    const int32_t raw_output =
        MultiplyByQuantizedMultiplierSmallerThanOneExp(
            raw_sub, params.output_multiplier, params.output_shift) +
        params.output_offset;
    const int32_t clamped_output =
        std::min(params.quantized_activation_max,
                 std::max(params.quantized_activation_min, raw_output));
    output_data[SubscriptToIndex(output_desc, indexes)] =
        static_cast<T>(clamped_output);
  };
  NDOpsHelper<N>(output_desc, sub_func);
}

// Element-wise sub that can often be used for the inner loop of broadcast sub
// as well as the non-broadcast sub.
template <typename T>
inline void SubElementwise(int size, const ArithmeticParams& params,
                           const T* input1_data, const T* input2_data,
                           T* output_data) {
  for (int i = 0; i < size; ++i) {
    const int32_t input1_val = params.input1_offset + input1_data[i];
    const int32_t input2_val = params.input2_offset + input2_data[i];
    const int32_t shifted_input1_val = input1_val * (1 << params.left_shift);
    const int32_t shifted_input2_val = input2_val * (1 << params.left_shift);
    const int32_t scaled_input1_val =
        MultiplyByQuantizedMultiplierSmallerThanOneExp(
            shifted_input1_val, params.input1_multiplier, params.input1_shift);
    const int32_t scaled_input2_val =
        MultiplyByQuantizedMultiplierSmallerThanOneExp(
            shifted_input2_val, params.input2_multiplier, params.input2_shift);
    const int32_t raw_sub = scaled_input1_val - scaled_input2_val;
    const int32_t raw_output =
        MultiplyByQuantizedMultiplierSmallerThanOneExp(
            raw_sub, params.output_multiplier, params.output_shift) +
        params.output_offset;
    const int32_t clamped_output =
        std::min(params.quantized_activation_max,
                 std::max(params.quantized_activation_min, raw_output));
    output_data[i] = static_cast<T>(clamped_output);
  }
}

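// Quantized subtraction for uint8_t tensors. The input offsets are negated
// zero points, so they must lie strictly between -256 and 256.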
inline void Sub(const ArithmeticParams& params,
                const RuntimeShape& input1_shape, const uint8_t* input1_data,
                const RuntimeShape& input2_shape, const uint8_t* input2_data,
                const RuntimeShape& output_shape, uint8_t* output_data) {
  TFLITE_DCHECK_LE(params.quantized_activation_min,
                   params.quantized_activation_max);
  const int flat_size =
      MatchingElementsSize(input1_shape, input2_shape, output_shape);

  TFLITE_DCHECK_GT(params.input1_offset, -256);
  TFLITE_DCHECK_GT(params.input2_offset, -256);
  TFLITE_DCHECK_LT(params.input1_offset, 256);
  TFLITE_DCHECK_LT(params.input2_offset, 256);
  SubElementwise(flat_size, params, input1_data, input2_data, output_data);
}

inline void Sub(const ArithmeticParams& params,
                const RuntimeShape& input1_shape, const int8_t* input1_data,
                const RuntimeShape& input2_shape, const int8_t* input2_data,
                const RuntimeShape& output_shape, int8_t* output_data) {
  TFLITE_DCHECK_LE(params.quantized_activation_min,
                   params.quantized_activation_max);

  const int flat_size =
      MatchingElementsSize(input1_shape, input2_shape, output_shape);

  TFLITE_DCHECK_GE(params.input1_offset, -128);
  TFLITE_DCHECK_GE(params.input2_offset, -128);
  // offset = -quantization_params.zero_point in PrepareGeneralSubOp(), so its
  // maximum can be 128, not 127.
  TFLITE_DCHECK_LE(params.input1_offset, 128);
  TFLITE_DCHECK_LE(params.input2_offset, 128);
  SubElementwise(flat_size, params, input1_data, input2_data, output_data);
}

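// Quantized subtraction for int16_t tensors, which use symmetric
// quantization: both zero-point offsets must be zero.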
inline void Sub(const ArithmeticParams& params,
                const RuntimeShape& input1_shape, const int16_t* input1_data,
                const RuntimeShape& input2_shape, const int16_t* input2_data,
                const RuntimeShape& output_shape, int16_t* output_data) {
  TFLITE_DCHECK_LE(params.quantized_activation_min,
                   params.quantized_activation_max);

  const int flat_size =
      MatchingElementsSize(input1_shape, input2_shape, output_shape);

  TFLITE_DCHECK_EQ(params.input1_offset, 0);
  TFLITE_DCHECK_EQ(params.input2_offset, 0);
  SubElementwise(flat_size, params, input1_data, input2_data, output_data);
}

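// Generic broadcast subtraction over (up to) 4-D shapes. Unlike the overloads
// above, this does not apply an activation range; it writes the raw
// difference input1 - input2.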
template <typename T>
void Sub(const ArithmeticParams& params, const RuntimeShape& input1_shape,
         const T* input1_data, const RuntimeShape& input2_shape,
         const T* input2_data, const RuntimeShape& output_shape,
         T* output_data) {
  NdArrayDesc<4> desc1;
  NdArrayDesc<4> desc2;
  NdArrayDescsForElementwiseBroadcast(input1_shape, input2_shape, &desc1,
                                      &desc2);
  const RuntimeShape extended_output_shape =
      RuntimeShape::ExtendedShape(4, output_shape);

  // In Tensorflow, the dimensions are canonically named (batch_number, row,
  // col, channel), with extents (batches, height, width, depth), with the
  // trailing dimension changing most rapidly (channels has the smallest stride,
  // typically 1 element).
  //
  // In generated C code, we store arrays with the dimensions reversed. The
  // first dimension has smallest stride.
  //
  // We name our variables by their Tensorflow convention, but generate C code
  // nesting loops such that the innermost loop has the smallest stride for the
  // best cache behavior.
  for (int b = 0; b < extended_output_shape.Dims(0); ++b) {
    for (int y = 0; y < extended_output_shape.Dims(1); ++y) {
      for (int x = 0; x < extended_output_shape.Dims(2); ++x) {
        for (int c = 0; c < extended_output_shape.Dims(3); ++c) {
          output_data[Offset(extended_output_shape, b, y, x, c)] =
              input1_data[SubscriptToIndex(desc1, b, y, x, c)] -
              input2_data[SubscriptToIndex(desc2, b, y, x, c)];
        }
      }
    }
  }
}

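// Overloads that pick the activation range matching the element type:
// quantized (int32_t), float, or int64_t.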
inline void SetActivationMinMax(const ArithmeticParams& params,
                                int32_t* activation_min,
                                int32_t* activation_max) {
  *activation_min = params.quantized_activation_min;
  *activation_max = params.quantized_activation_max;
}

inline void SetActivationMinMax(const ArithmeticParams& params,
                                float* activation_min, float* activation_max) {
  *activation_min = params.float_activation_min;
  *activation_max = params.float_activation_max;
}

inline void SetActivationMinMax(const ArithmeticParams& params,
                                int64_t* activation_min,
                                int64_t* activation_max) {
  *activation_min = params.int64_activation_min;
  *activation_max = params.int64_activation_max;
}

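// Element-wise subtraction with the activation range selected by the element
// type T via the SetActivationMinMax overloads above. Shapes must match
// element-for-element (no broadcasting).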
template <typename T>
inline void SubWithActivation(
    const ArithmeticParams& params, const RuntimeShape& input1_shape,
    const T* input1_data, const RuntimeShape& input2_shape,
    const T* input2_data, const RuntimeShape& output_shape, T* output_data) {
  ruy::profiler::ScopeLabel label("SubWithActivation");
  const int flat_size =
      MatchingElementsSize(input1_shape, input2_shape, output_shape);
  T activation_min, activation_max;
  SetActivationMinMax(params, &activation_min, &activation_max);

  for (int i = 0; i < flat_size; ++i) {
    output_data[i] = ActivationFunctionWithMinMax(
        input1_data[i] - input2_data[i], activation_min, activation_max);
  }
}

}  // namespace reference_ops
}  // namespace tflite

#endif  // TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_SUB_H_