/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_PORTABLE_TENSOR_UTILS_H_
#define TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_PORTABLE_TENSOR_UTILS_H_

#include "tensorflow/lite/kernels/internal/reference/portable_tensor_utils_impl.h"

#if defined(_MSC_VER)
#define __restrict__ __restrict
#endif

namespace tflite {
namespace tensor_utils {

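// Note: this header wires the generic tensor_utils entry points to the
// portable (plain C++) reference kernels; every function below simply
// forwards its arguments to the corresponding Portable* routine declared in
// portable_tensor_utils_impl.h.
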
// Check if all entries of a vector are zero for float.
bool IsZeroVector(const float* vector, int v_size) {
  return PortableIsZeroVector(vector, v_size);
}

// Check if all entries of a vector are zero for int8_t.
bool IsZeroVector(const int8_t* vector, int v_size) {
  return PortableIsZeroVector(vector, v_size);
}

void SymmetricQuantizeFloats(const float* values, const int size,
                             int8_t* quantized_values, float* min, float* max,
                             float* scaling_factor) {
  PortableSymmetricQuantizeFloats(values, size, quantized_values, min, max,
                                  scaling_factor);
}

void SymmetricQuantizeFloats(const float* values, const int size,
                             int8_t* quantized_values, float min_value,
                             float max_value, float* scaling_factor) {
  PortableSymmetricQuantizeFloats(values, size, quantized_values, min_value,
                                  max_value, scaling_factor);
}

void AsymmetricQuantizeFloats(const float* values, const int size,
                              int8_t* quantized_values, float* scaling_factor,
                              int32_t* offset) {
  PortableAsymmetricQuantizeFloats(values, size, quantized_values,
                                   scaling_factor, offset);
}

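// Illustrative usage of the symmetric quantizer (a sketch with made-up
// values, not code from this library):
//
//   float values[4] = {0.5f, -1.0f, 0.25f, 0.0f};
//   int8_t quantized[4];
//   float min, max, scaling_factor;
//   SymmetricQuantizeFloats(values, /*size=*/4, quantized, &min, &max,
//                           &scaling_factor);
//   // Afterwards quantized[i] is approximately values[i] / scaling_factor.
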
void MatrixBatchVectorMultiplyAccumulate(const float* matrix, int m_rows,
                                         int m_cols, const float* vector,
                                         int n_batch, float* result) {
  PortableMatrixBatchVectorMultiplyAccumulate(matrix, m_rows, m_cols, vector,
                                              n_batch, result);
}

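// Sketch of the expected semantics of the float overload above, written out
// here only for illustration (the reference kernel accumulates a row-major
// matrix-vector product per batch):
//
//   for (int b = 0; b < n_batch; ++b)
//     for (int r = 0; r < m_rows; ++r)
//       for (int c = 0; c < m_cols; ++c)
//         result[b * m_rows + r] +=
//             matrix[r * m_cols + c] * vector[b * m_cols + c];
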
void MatrixBatchVectorMultiplyAccumulate(const int8_t* __restrict__ matrix,
                                         const int m_rows, const int m_cols,
                                         const int8_t* __restrict__ vector,
                                         const float* scaling_factors,
                                         int n_batch,
                                         float* __restrict__ result) {
  PortableMatrixBatchVectorMultiplyAccumulate(matrix, m_rows, m_cols, vector,
                                              scaling_factors, n_batch, result);
}

void MatrixBatchVectorMultiplyAccumulate(
    const int8_t* __restrict__ matrix, const int m_rows, const int m_cols,
    const int8_t* __restrict__ vectors, const float* scaling_factors,
    int n_batch, float* __restrict__ result, const float* per_channel_scale,
    const int32_t* input_offset, int32_t* scratch, int32_t* row_sums,
    bool* compute_row_sums, CpuBackendContext* context) {
  PortableMatrixBatchVectorMultiplyAccumulate(
      matrix, m_rows, m_cols, vectors, scaling_factors, n_batch, result,
      per_channel_scale, input_offset, scratch, row_sums, compute_row_sums,
      context);
}

void MatrixBatchVectorMultiplyAccumulate(const int8_t* __restrict__ matrix,
                                         const int m_rows, const int m_cols,
                                         const int8_t* __restrict__ vector,
                                         const float* scaling_factors,
                                         int n_batch, int32_t* scratch,
                                         float* __restrict__ result,
                                         CpuBackendContext* context) {
  PortableMatrixBatchVectorMultiplyAccumulate(matrix, m_rows, m_cols, vector,
                                              scaling_factors, n_batch, result);
}

void SparseMatrixBatchVectorMultiplyAccumulate1x4(
    const float* __restrict__ matrix, const int32_t* __restrict__ segments,
    const int32_t* __restrict__ indices, int m_rows, int m_cols,
    const float* __restrict__ vector, int n_batch, float* __restrict__ result) {
  PortableSparseMatrixBatchVectorMultiplyAccumulate1x4(
      matrix, segments, indices, m_rows, m_cols, vector, n_batch, result);
}

void SparseMatrixBatchVectorMultiplyAccumulate(
    const float* __restrict__ matrix, const uint8_t* __restrict__ ledger,
    int m_rows, int m_cols, const float* __restrict__ vector, int n_batch,
    float* __restrict__ result) {
  PortableSparseMatrixBatchVectorMultiplyAccumulate(
      matrix, ledger, m_rows, m_cols, vector, n_batch, result);
}

void SparseMatrixBatchVectorMultiplyAccumulate1x16(
    const int8_t* __restrict__ matrix, const int32_t* __restrict__ segments,
    const int32_t* __restrict__ indices, int m_rows, int m_cols,
    const int8_t* __restrict__ vector, const int32_t* __restrict__ bias_vector,
    int n_batch, const int32_t input_offset, const int32_t output_multiplier,
    const int32_t output_shift, const int32_t output_offset,
    const int32_t output_activation_min, const int32_t output_activation_max,
    int8_t* __restrict__ result) {
  PortableSparseMatrixBatchVectorMultiplyAccumulate1x16(
      matrix, segments, indices, m_rows, m_cols, vector, bias_vector, n_batch,
      input_offset, output_multiplier, output_shift, output_offset,
      output_activation_min, output_activation_max, result);
}

void SparseMatrixBatchVectorMultiplyAccumulate(
    const int8_t* __restrict__ matrix, const uint8_t* ledger, const int m_rows,
    const int m_cols, const int8_t* __restrict__ vectors,
    const float* scaling_factors, int n_batch, float* __restrict__ result) {
  PortableSparseMatrixBatchVectorMultiplyAccumulate(
      matrix, ledger, m_rows, m_cols, vectors, scaling_factors, n_batch,
      result);
}

void MatrixBatchVectorMultiplyAccumulate(
    const int8_t* input, const int32_t* bias,
    const int8_t* input_to_gate_weights, int32_t multiplier, int32_t shift,
    int32_t n_batch, int32_t n_input, int32_t n_output, int32_t output_zp,
    int32_t* scratch, int16_t* output, CpuBackendContext* context) {
  PortableMatrixBatchVectorMultiplyAccumulate(
      input, bias, input_to_gate_weights, multiplier, shift, n_batch, n_input,
      n_output, output_zp, scratch, output, context);
}

void MatrixBatchVectorMultiplyAccumulate(
    const int8_t* input, const int32_t* bias,
    const int8_t* input_to_gate_weights, int32_t multiplier, int32_t shift,
    int32_t n_batch, int32_t n_input, int32_t n_output, int32_t output_zp,
    int32_t* scratch, int8_t* output, CpuBackendContext* context) {
  PortableMatrixBatchVectorMultiplyAccumulate(
      input, bias, input_to_gate_weights, multiplier, shift, n_batch, n_input,
      n_output, output_zp, scratch, output, context);
}

void MatrixScalarMultiplyAccumulate(const int8_t* matrix, int32_t scalar,
                                    int32_t n_row, int32_t n_col,
                                    int32_t* output) {
  PortableMatrixScalarMultiplyAccumulate(matrix, scalar, n_row, n_col, output);
}

void MatrixBatchVectorMultiply(const int8_t* input, int32_t input_zeropoint,
                               const int8_t* input_to_gate_weights,
                               int32_t input_to_gate_effective_scale_a,
                               int32_t input_to_gate_effective_scale_b,
                               int32_t n_batch, int32_t n_input, int32_t n_cell,
                               int8_t* gate_output, int8_t gate_output_zp) {
  PortableMatrixBatchVectorMultiply(
      input, input_zeropoint, input_to_gate_weights,
      input_to_gate_effective_scale_a, input_to_gate_effective_scale_b, n_batch,
      n_input, n_cell, gate_output, gate_output_zp);
}

void MatrixBatchVectorMultiply(const int16_t* hidden,
                               const int8_t* hidden_to_output_weights,
                               int32_t proj_effective_scale_a,
                               int32_t proj_effective_scale_b,
                               const int32_t* gate_bias, int32_t n_batch,
                               int32_t n_hidden, int32_t n_output,
                               int32_t output_zp, int8_t* proj_output) {
  PortableMatrixBatchVectorMultiply(hidden, hidden_to_output_weights,
                                    proj_effective_scale_a,
                                    proj_effective_scale_b, gate_bias, n_batch,
                                    n_hidden, n_output, output_zp, proj_output);
}

void ApplyLayerNorm(const int16_t* input, const int16_t* layer_norm_weights,
                    const int32_t* bias, int32_t layer_norm_scale_a,
                    int32_t layer_norm_scale_b, int32_t variance_limit,
                    int n_batch, int n_input, int16_t* output) {
  PortableApplyLayerNorm(input, layer_norm_weights, bias, layer_norm_scale_a,
                         layer_norm_scale_b, variance_limit, n_batch, n_input,
                         output);
}

void ApplyLayerNormFloat(const int16_t* input,
                         const int16_t* layer_norm_weights,
                         int32_t layer_norm_scale_a, int32_t layer_norm_scale_b,
                         const int32_t* bias, int n_batch, int n_input,
                         int16_t* output) {
  PortableApplyLayerNormFloat(input, layer_norm_weights, layer_norm_scale_a,
                              layer_norm_scale_b, bias, n_batch, n_input,
                              output);
}

void ApplySigmoid(const int16_t* input, int32_t n_batch, int32_t n_input,
                  int16_t* output) {
  PortableApplySigmoid(input, n_batch, n_input, output);
}

void ApplySigmoidFloat(const int16_t* input, int32_t n_batch, int32_t n_input,
                       int16_t* output) {
  PortableApplySigmoidFloat(input, n_batch, n_input, output);
}

void ApplyTanh(int32_t integer_bits, const int16_t* input, int32_t n_batch,
               int32_t n_input, int16_t* output) {
  PortableApplyTanh(integer_bits, input, n_batch, n_input, output);
}

void ApplyTanhFloat(const int16_t* input, int32_t n_batch, int32_t n_input,
                    int32_t integer_bits, int16_t* output) {
  PortableApplyTanhFloat(input, n_batch, n_input, integer_bits, output);
}

void CwiseMul(const int16_t* input_1, const int16_t* input_2, int n_batch,
              int n_input, int shift, int16_t* output) {
  PortableCwiseMul(input_1, input_2, n_batch, n_input, shift, output);
}

void CwiseMul(const int16_t* input_1, const int16_t* input_2,
              int32_t multiplier, int32_t shift, int32_t n_batch,
              int32_t n_input, int32_t output_zp, int8_t* output) {
  PortableCwiseMul(input_1, input_2, multiplier, shift, n_batch, n_input,
                   output_zp, output);
}

void CwiseAdd(const int16_t* input_1, const int16_t* input_2, int n_batch,
              int n_input, int16_t* output) {
  PortableCwiseAdd(input_1, input_2, n_batch, n_input, output);
}

void CwiseClipping(float* vector, const int v_size,
                   const float clipping_value) {
  PortableCwiseClipping(vector, v_size, clipping_value);
}

void CwiseClipping(int16_t* vector, const int v_size,
                   const int16_t clipping_value) {
  PortableCwiseClipping(vector, v_size, clipping_value);
}

void CwiseClipping(int8_t* vector, const int v_size,
                   const int8_t clipping_value) {
  PortableCwiseClipping(vector, v_size, clipping_value);
}

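// The three CwiseClipping overloads above clamp each element of `vector` in
// place to the symmetric range [-clipping_value, clipping_value]. For
// example, clipping {5, -7, 2} with clipping_value 3 yields {3, -3, 2}.
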
void VectorBatchVectorCwiseProductAccumulate(const int16_t* vector, int v_size,
                                             const int16_t* batch_vector,
                                             int n_batch, int32_t multiplier,
                                             int shift, int16_t* result) {
  PortableVectorBatchVectorCwiseProductAccumulate(
      vector, v_size, batch_vector, n_batch, multiplier, shift, result);
}

float VectorVectorDotProduct(const float* vector1, const float* vector2,
                             int v_size) {
  return PortableVectorVectorDotProduct(vector1, vector2, v_size);
}

void BatchVectorBatchVectorDotProduct(const int16_t* vector1,
                                      const int16_t* vector2, int v_size,
                                      int n_batch, int32_t* result) {
  PortableBatchVectorBatchVectorDotProduct(vector1, vector2, v_size, n_batch,
                                           result);
}

void Sub1Vector(const float* vector, int v_size, float* result) {
  PortableSub1Vector(vector, v_size, result);
}

void Sub1Vector(const int16_t* vector, int v_size, int16_t* result) {
  PortableSub1Vector(vector, v_size, result);
}

// Multiply all elements of the vector by a scalar.
void VectorScalarMultiply(const int8_t* vector, int v_size, float scale,
                          float* result) {
  PortableVectorScalarMultiply(vector, v_size, scale, result);
}

void ReductionSumVector(const float* input_vector, float* output_vector,
                        int output_size, int reduction_size) {
  PortableReductionSumVector(input_vector, output_vector, output_size,
                             reduction_size);
}

void ReductionSumVector(const int32_t* input_vector, int32_t* output_vector,
                        int output_size, int reduction_size) {
  PortableReductionSumVector(input_vector, output_vector, output_size,
                             reduction_size);
}

void ReductionSumVector(const int8_t* input_vector, int32_t* output_vector,
                        int output_size, int reduction_size) {
  PortableReductionSumVector(input_vector, output_vector, output_size,
                             reduction_size);
}

void MeanStddevNormalization(const float* input_vector, float* output_vector,
                             int v_size, int n_batch) {
  PortableMeanStddevNormalization(input_vector, output_vector, v_size, n_batch);
}

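// MeanStddevNormalization above normalizes each batch of v_size values to
// zero mean and unit variance, roughly output = (input - mean) / stddev,
// computed independently per batch.
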
void TwoGateSaturatingAdd(const int8_t* input, int8_t input_zp,
                          const int8_t* recurrent, int8_t recurrent_zp,
                          int32_t input_effective_scale_a,
                          int32_t input_effective_scale_b,
                          int32_t recurrent_effective_scale_a,
                          int32_t recurrent_effective_scale_b, int32_t n_batch,
                          int32_t n_cell, int16_t* output) {
  PortableTwoGateSaturatingAdd(
      input, input_zp, recurrent, recurrent_zp, input_effective_scale_a,
      input_effective_scale_b, recurrent_effective_scale_a,
      recurrent_effective_scale_b, n_batch, n_cell, output);
}

}  // namespace tensor_utils
}  // namespace tflite

#endif  // TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_PORTABLE_TENSOR_UTILS_H_