/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#include <algorithm>
#include <cmath>
#include <cstdint>
#include <cstring>
#include <limits>
#include <utility>

#include "fixedpoint/fixedpoint.h"
#include "tensorflow/lite/kernels/internal/common.h"
#include "tensorflow/lite/kernels/internal/compatibility.h"
#include "tensorflow/lite/kernels/internal/cppmath.h"
#include "tensorflow/lite/kernels/internal/reference/portable_tensor_utils_impl.h"

#if defined(_MSC_VER)
#define __restrict__ __restrict
#endif

namespace tflite {
namespace tensor_utils {

namespace {
const int32_t kInt16Max = std::numeric_limits<int16_t>::max();
const int32_t kInt16Min = std::numeric_limits<int16_t>::min();
}  // namespace

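// Symmetric int8 quantization helpers. The first overload computes the
// min/max of `values`; both then map the symmetric range [-range, range],
// with range = max(|min_value|, |max_value|), onto [-127, 127] and write the
// resulting scale to `scaling_factor`.
//
// Illustrative usage (hypothetical buffers, not declared in this file):
//   float values[4] = {-1.0f, 0.25f, 0.5f, 1.0f};
//   int8_t quantized[4];
//   float min, max, scale;
//   PortableSymmetricQuantizeFloats(values, 4, quantized, &min, &max, &scale);
//   // values[i] is approximately quantized[i] * scale.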
void PortableSymmetricQuantizeFloats(const float* values, const int size,
                                     int8_t* quantized_values, float* min_value,
                                     float* max_value, float* scaling_factor) {
  auto minmax = std::minmax_element(values, values + size);
  *min_value = *minmax.first;
  *max_value = *minmax.second;

  PortableSymmetricQuantizeFloats(values, size, quantized_values, *min_value,
                                  *max_value, scaling_factor);
}

void PortableSymmetricQuantizeFloats(const float* values, const int size,
                                     int8_t* quantized_values, float min_value,
                                     float max_value, float* scaling_factor) {
  const int32_t kScale = 127;
  const float range = std::max(std::abs(min_value), std::abs(max_value));
  if (range == 0) {
    memset(quantized_values, 0, size * sizeof(int8_t));
    *scaling_factor = 1;
    return;
  }
  *scaling_factor = range / kScale;
  const float scaling_factor_inv = kScale / range;
  for (int i = 0; i < size; ++i) {
    const int32_t quantized_value =
        static_cast<int32_t>(TfLiteRound(values[i] * scaling_factor_inv));
    // Clamp: just in case some odd numeric offset.
    quantized_values[i] = static_cast<int8_t>(
        std::min(kScale, std::max(-kScale, quantized_value)));
  }
}

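// Asymmetric int8 quantization: chooses a scale and a nudged zero point so
// that the zero-extended real range [min(0, rmin), max(0, rmax)] maps onto
// [-128, 127], then quantizes each value with round-and-clamp.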
void PortableAsymmetricQuantizeFloats(const float* values, const int size,
                                      int8_t* quantized_values,
                                      float* scaling_factor, int32_t* offset) {
  const int32_t kMinScale = -128;
  const int32_t kMaxScale = 127;
  const double qmin_double = kMinScale;
  const double qmax_double = kMaxScale;
  const auto minmax = std::minmax_element(values, values + size);
  const double rmin = static_cast<double>(std::min(0.0f, *minmax.first));
  const double rmax = static_cast<double>(std::max(0.0f, *minmax.second));
  if (rmin == rmax) {
    memset(quantized_values, 0, size * sizeof(int8_t));
    *scaling_factor = 1;
    *offset = 0;
    return;
  } else {
    double scale = (rmax - rmin) / (qmax_double - qmin_double);
    const double zero_point_from_min = qmin_double - rmin / scale;
    const double zero_point_from_max = qmax_double - rmax / scale;
    const double zero_point_from_min_error =
        std::abs(qmin_double) + std::abs(rmin / scale);
    const double zero_point_from_max_error =
        std::abs(qmax_double) + std::abs(rmax / scale);
    const double zero_point_double =
        zero_point_from_min_error < zero_point_from_max_error
            ? zero_point_from_min
            : zero_point_from_max;
    int8_t nudged_zero_point = 0;
    if (zero_point_double <= qmin_double) {
      nudged_zero_point = kMinScale;
    } else if (zero_point_double >= qmax_double) {
      nudged_zero_point = kMaxScale;
    } else {
      nudged_zero_point = static_cast<int8_t>(round(zero_point_double));
    }
    *scaling_factor = scale;
    *offset = nudged_zero_point;
  }
  const float scaling_factor_inv = 1.0f / *scaling_factor;
  for (int i = 0; i < size; ++i) {
    const int32_t quantized_value = static_cast<int32_t>(
        TfLiteRound(*offset + values[i] * scaling_factor_inv));
    quantized_values[i] =
        std::min(kMaxScale, std::max(kMinScale, quantized_value));
  }
}

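// Computes result += matrix * vector for each batch. `matrix` is row-major
// with shape m_rows x m_cols; `vector` holds n_batch concatenated vectors of
// m_cols elements, and `result` holds n_batch blocks of m_rows accumulators.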
void PortableMatrixBatchVectorMultiplyAccumulate(const float* matrix,
                                                 int m_rows, int m_cols,
                                                 const float* vector,
                                                 int n_batch, float* result) {
  float* result_in_batch = result;
  for (int b = 0; b < n_batch; b++) {
    const float* matrix_ptr = matrix;
    for (int r = 0; r < m_rows; r++) {
      float dot_prod = 0.0f;
      const float* vector_in_batch = vector + b * m_cols;
      for (int c = 0; c < m_cols; c++) {
        dot_prod += *matrix_ptr++ * *vector_in_batch++;
      }
      *result_in_batch += dot_prod;
      ++result_in_batch;
    }
  }
}

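// Quantized variant: each row's int8 dot product is accumulated in int32 and
// scaled by the per-batch scaling factor before being added to the float
// `result` buffer.
//
// Illustrative usage (hypothetical buffers, not declared in this file):
//   // A 2x3 int8 matrix times one 3-element int8 vector.
//   PortableMatrixBatchVectorMultiplyAccumulate(matrix, /*m_rows=*/2,
//                                               /*m_cols=*/3, vector, &scale,
//                                               /*n_batch=*/1, result);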
void PortableMatrixBatchVectorMultiplyAccumulate(
    const int8_t* __restrict__ matrix, const int m_rows, const int m_cols,
    const int8_t* __restrict__ vectors, const float* scaling_factors,
    int n_batch, float* __restrict__ result) {
  for (int batch = 0; batch < n_batch; ++batch, vectors += m_cols) {
    const float batch_scaling_factor = scaling_factors[batch];
    // Get the address of the first row.
    const int8_t* row_ptr = matrix;
    for (int row = 0; row < m_rows; ++row) {
      // Initialize the dot product sum for the row to 0.
      int32_t dotprod = 0;
#if defined(__GNUC__)
      // Prefetch the row to cache.
      __builtin_prefetch(row_ptr, 0 /* prefetch for read */,
                         3 /* temporal locality */);
#endif
      for (int col = 0; col < m_cols; ++col, ++row_ptr) {
        dotprod += (*row_ptr) * (vectors[col]);
      }  // for col
      *result += dotprod * batch_scaling_factor;
      ++result;
    }  // for row
  }    // for batch
}

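// Variant for asymmetrically quantized inputs: subtracts
// row_sums[row] * input_offset[batch] from each dot product and optionally
// applies a per-channel scale. Row sums are cached in `row_sums` and only
// recomputed when `compute_row_sums` is null or points to true.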
void PortableMatrixBatchVectorMultiplyAccumulate(
    const int8_t* __restrict__ matrix, const int m_rows, const int m_cols,
    const int8_t* __restrict__ vectors, const float* scaling_factors,
    int n_batch, float* __restrict__ result, const float* per_channel_scale,
    const int32_t* input_offset, int32_t* scratch, int32_t* row_sums,
    bool* compute_row_sums, CpuBackendContext* context) {
  if (input_offset == nullptr) {
    PortableMatrixBatchVectorMultiplyAccumulate(
        matrix, m_rows, m_cols, vectors, scaling_factors, n_batch, result);
    return;
  }
  if (!compute_row_sums || *compute_row_sums) {
    PortableReductionSumVector(matrix, row_sums, m_rows, m_cols);
    if (compute_row_sums) {
      *compute_row_sums = false;
    }
  }

  for (int batch = 0; batch < n_batch; ++batch, vectors += m_cols) {
    const float batch_scaling_factor = scaling_factors[batch];
    const int32_t batch_offset = input_offset[batch];
    const int8_t* row_ptr = matrix;
    for (int row = 0; row < m_rows; ++row) {
      int32_t dotprod = 0;
      float scale = batch_scaling_factor;
      if (per_channel_scale) {
        scale *= per_channel_scale[row];
      }
#if defined(__GNUC__)
      // Prefetch the row to cache.
      __builtin_prefetch(row_ptr, 0 /* prefetch for read */,
                         3 /* temporal locality */);
#endif
      for (int col = 0; col < m_cols; ++col, ++row_ptr) {
        dotprod += (*row_ptr) * vectors[col];
      }  // for col
      dotprod -= row_sums[row] * batch_offset;
      *result += dotprod * scale;
      ++result;
    }  // for row
  }    // for batch
}

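// Block-sparse float matmul over 4-wide blocks. `segments` and `indices`
// describe the non-zero blocks of each row in a CSR-like layout: row `r`
// owns the blocks indices[segments[r]] .. indices[segments[r + 1] - 1].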
void PortableSparseMatrixBatchVectorMultiplyAccumulate1x4(
    const float* __restrict__ matrix, const int32_t* __restrict__ segments,
    const int32_t* __restrict__ indices, int m_rows, int m_cols,
    const float* __restrict__ vector, int n_batch, float* __restrict__ result) {
  const int kBlockSize = 4;
  TFLITE_DCHECK_EQ(m_cols % kBlockSize, 0);
  for (int batch = 0; batch < n_batch; batch++) {
    const float* matrix_ptr = matrix;
    for (int row = 0; row < m_rows; row++) {
      float dot_prod = 0.0f;
      const float* vector_in_batch = vector + batch * m_cols;
      for (int i = segments[row]; i < segments[row + 1]; i++) {
        const int block_start_index = indices[i] * kBlockSize;
        const float* vector_block_in_batch_ptr =
            vector_in_batch + block_start_index;
        for (int c = 0; c < kBlockSize; c++) {
          dot_prod += *matrix_ptr++ * *vector_block_in_batch_ptr++;
        }
      }
      result[batch * m_rows + row] += dot_prod;
    }
  }
}

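// Block-sparse int8 matmul over 16-wide blocks with full requantization: the
// input offset is folded in per weight, the optional bias is added, and the
// accumulator is rescaled by output_multiplier/output_shift, offset by
// output_offset and clamped to the activation range.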
void PortableSparseMatrixBatchVectorMultiplyAccumulate1x16(
    const int8_t* __restrict__ matrix, const int32_t* __restrict__ segments,
    const int32_t* __restrict__ indices, int m_rows, int m_cols,
    const int8_t* __restrict__ vector, const int32_t* __restrict__ bias_vector,
    int n_batch, const int32_t input_offset, const int32_t output_multiplier,
    const int32_t output_shift, const int32_t output_offset,
    const int32_t output_activation_min, const int32_t output_activation_max,
    int8_t* __restrict__ result) {
  const int kBlockSize = 16;
  TFLITE_DCHECK_EQ(m_cols % kBlockSize, 0);
  for (int batch = 0; batch < n_batch; ++batch) {
    const int8_t* matrix_ptr = matrix;
    for (int row = 0; row < m_rows; ++row) {
      int32_t dot_prod = 0;
      const int8_t* vector_in_batch = vector + batch * m_cols;
      for (int i = segments[row]; i < segments[row + 1]; ++i) {
        const int block_start_index = indices[i] * kBlockSize;
        const int8_t* vector_block_in_batch_ptr =
            vector_in_batch + block_start_index;
        for (int c = 0; c < kBlockSize; c++) {
          dot_prod += *matrix_ptr * *vector_block_in_batch_ptr++;
          dot_prod += *matrix_ptr++ * input_offset;
        }
      }
      const int32_t bias_value = bias_vector != nullptr ? bias_vector[row] : 0;
      dot_prod = MultiplyByQuantizedMultiplier(dot_prod + bias_value,
                                               output_multiplier, output_shift);
      dot_prod += output_offset;
      result[batch * m_rows + row] =
          static_cast<int8_t>(ActivationFunctionWithMinMax(
              dot_prod, output_activation_min, output_activation_max));
    }
  }
}

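// The two "ledger" variants below encode sparsity as a byte stream: for each
// row, one byte gives the number of non-zero 16-wide blocks, followed by one
// byte per block with that block's index within the row.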
void PortableSparseMatrixBatchVectorMultiplyAccumulate(
    const float* __restrict__ matrix, const uint8_t* __restrict__ ledger,
    int m_rows, int m_cols, const float* __restrict__ vector, int n_batch,
    float* __restrict__ result) {
  const int kBlockSize = 16;
  TFLITE_DCHECK_EQ(  // NOLINT
      m_cols % kBlockSize, 0);
  for (int batch = 0; batch < n_batch; batch++) {
    const float* matrix_ptr = matrix;
    const uint8_t* ledger_ptr = ledger;
    for (int row = 0; row < m_rows; row++) {
      float dot_prod = 0.0f;
      int num_nonzero_blocks = *ledger_ptr++;
      if (num_nonzero_blocks > 0) {
        const float* vector_in_batch = vector + batch * m_cols;
        for (int i = 0; i < num_nonzero_blocks; i++) {
          const int block_start_index = *ledger_ptr++ * kBlockSize;
          const float* vector_block_in_batch_ptr =
              vector_in_batch + block_start_index;
          for (int c = 0; c < kBlockSize; c++) {
            dot_prod += *matrix_ptr++ * *vector_block_in_batch_ptr++;
          }
        }
      }
      result[batch * m_rows + row] += dot_prod;
    }
  }
}

void PortableSparseMatrixBatchVectorMultiplyAccumulate(
    const int8_t* __restrict__ matrix, const uint8_t* ledger, const int m_rows,
    const int m_cols, const int8_t* __restrict__ vectors,
    const float* scaling_factors, int n_batch, float* __restrict__ result) {
  static const int kBlockSize = 16;
  TFLITE_DCHECK_EQ(  // NOLINT
      m_cols % kBlockSize, 0);
  for (int batch = 0; batch < n_batch; ++batch, vectors += m_cols) {
    const float batch_scaling_factor = scaling_factors[batch];
    const uint8_t* ledger_ptr = ledger;
    // Get the address of the first row.
    const int8_t* row_ptr = matrix;
    for (int row = 0; row < m_rows; ++row) {
      // Initialize the dot product sum for the row to 0.
      int32_t dotprod = 0;
#if defined(__GNUC__)
      // Prefetch the row to cache.
      __builtin_prefetch(row_ptr, 0 /* prefetch for read */,
                         3 /* temporal locality */);
#endif
      int num_nonzero_blocks = *ledger_ptr++;
      for (int i = 0; i < num_nonzero_blocks; i++) {
        const int block_start_index = *ledger_ptr++ * kBlockSize;
        const int8_t* vector_block_ptr = vectors + block_start_index;
        for (int c = 0; c < kBlockSize; c++) {
          dotprod += (*row_ptr++) * (*vector_block_ptr++);
        }  // for block
      }    // for num_nonzero_blocks
      result[batch * m_rows + row] += dotprod * batch_scaling_factor;
    }  // for row
  }    // for batch
}

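// Shared implementation for the two quantized matmul-accumulate overloads
// below: int8 x int8 products are accumulated in int32, requantized with
// multiplier/shift, offset by output_zp, added to the existing output and
// saturated to the output type T (int8_t or int16_t). The scratch buffer and
// CpuBackendContext are unused in this portable path.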
template <typename T>
void PortableMatrixBatchVectorMultiplyAccumulateImpl(
    const int8_t* input, const int32_t* bias,
    const int8_t* input_to_gate_weights, int32_t multiplier, int32_t shift,
    int32_t n_batch, int32_t n_input, int32_t n_output, int32_t output_zp,
    T* output) {
  const int16_t output_max = std::numeric_limits<T>::max();
  const int16_t output_min = std::numeric_limits<T>::min();
  for (int batch = 0; batch < n_batch; ++batch) {
    for (int row = 0; row < n_output; ++row) {
      int32_t acc = bias[row];
      for (int col = 0; col < n_input; ++col) {
        int8_t input_val = input[batch * n_input + col];
        int8_t weights_val = input_to_gate_weights[row * n_input + col];
        acc += input_val * weights_val;
      }
      acc = MultiplyByQuantizedMultiplier(acc, multiplier, shift);
      acc += output_zp;
      acc += output[batch * n_output + row];
      if (acc > output_max) {
        acc = output_max;
      }
      if (acc < output_min) {
        acc = output_min;
      }
      output[batch * n_output + row] = static_cast<T>(acc);
    }
  }
}

void PortableMatrixBatchVectorMultiplyAccumulate(
    const int8_t* input, const int32_t* bias,
    const int8_t* input_to_gate_weights, int32_t multiplier, int32_t shift,
    int32_t n_batch, int32_t n_input, int32_t n_output, int32_t output_zp,
    int32_t* scratch, int16_t* output, CpuBackendContext* context) {
  PortableMatrixBatchVectorMultiplyAccumulateImpl(
      input, bias, input_to_gate_weights, multiplier, shift, n_batch, n_input,
      n_output, output_zp, output);
}

void PortableMatrixBatchVectorMultiplyAccumulate(
    const int8_t* input, const int32_t* bias,
    const int8_t* input_to_gate_weights, int32_t multiplier, int32_t shift,
    int32_t n_batch, int32_t n_input, int32_t n_output, int32_t output_zp,
    int32_t* scratch, int8_t* output, CpuBackendContext* context) {
  PortableMatrixBatchVectorMultiplyAccumulateImpl(
      input, bias, input_to_gate_weights, multiplier, shift, n_batch, n_input,
      n_output, output_zp, output);
}

void PortableMatrixBatchVectorMultiply(const int8_t* input,
                                       int32_t input_zeropoint,
                                       const int8_t* input_to_gate_weights,
                                       int32_t input_to_gate_effective_scale_a,
                                       int32_t input_to_gate_effective_scale_b,
                                       int32_t n_batch, int32_t n_input,
                                       int32_t n_cell, int8_t* gate_output,
                                       int8_t gate_output_zp) {
  const int32_t int8_max = std::numeric_limits<int8_t>::max();
  const int32_t int8_min = std::numeric_limits<int8_t>::min();
  for (int batch = 0; batch < n_batch; ++batch) {
    for (int row = 0; row < n_cell; ++row) {
      int32_t acc = 0;
      for (int col = 0; col < n_input; ++col) {
        int32_t input_val = input[batch * n_input + col];
        int8_t weights_val = input_to_gate_weights[row * n_input + col];
        acc += (input_val - input_zeropoint) * weights_val;
      }
      acc = MultiplyByQuantizedMultiplier(acc, input_to_gate_effective_scale_a,
                                          input_to_gate_effective_scale_b);
      acc += gate_output_zp;
      if (acc > int8_max) {
        acc = int8_max;
      }
      if (acc < int8_min) {
        acc = int8_min;
      }
      gate_output[batch * n_cell + row] = static_cast<int8_t>(acc);
    }
  }
}

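// Projection matmul on an int16 hidden state: products are accumulated in
// int64 with the running sum manually saturated to the int32 range, then
// requantized, offset by output_zp and clamped to int8.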
void PortableMatrixBatchVectorMultiply(
    const int16_t* hidden, const int8_t* hidden_to_output_weights,
    int32_t proj_effective_scale_a, int32_t proj_effective_scale_b,
    const int32_t* gate_bias, int32_t n_batch, int32_t n_hidden,
    int32_t n_output, int32_t output_zp, int8_t* proj_output) {
  const int16_t int8_max = std::numeric_limits<int8_t>::max();
  const int16_t int8_min = std::numeric_limits<int8_t>::min();
  for (int batch = 0; batch < n_batch; ++batch) {
    for (int row = 0; row < n_output; ++row) {
      int64_t acc = gate_bias[row];
      for (int col = 0; col < n_hidden; ++col) {
        int16_t input_val = hidden[batch * n_hidden + col];
        int8_t weights_val = hidden_to_output_weights[row * n_hidden + col];
        int64_t curr = acc;
        acc += input_val * weights_val;
        if (input_val * weights_val > 0 && acc < curr) {
          acc = std::numeric_limits<int32_t>::max();
        }
        if (input_val * weights_val < 0 && acc > curr) {
          acc = std::numeric_limits<int32_t>::min();
        }
      }
      acc = MultiplyByQuantizedMultiplier(acc, proj_effective_scale_a,
                                          proj_effective_scale_b);
      acc += output_zp;
      if (acc > int8_max) {
        acc = int8_max;
      }
      if (acc < int8_min) {
        acc = int8_min;
      }
      proj_output[batch * n_output + row] = acc;
    }
  }
}

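// Integer layer normalization. Statistics are kept in scaled integer
// arithmetic (mean scaled by 2^10, variance by 2^20); each value is
// normalized with an inverse-sqrt multiplier from
// GetInvSqrtQuantizedMultiplierExp, weighted, biased and requantized to
// int16.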
void PortableApplyLayerNorm(const int16_t* input,
                            const int16_t* layer_norm_weights,
                            const int32_t* bias, int32_t layer_norm_scale_a,
                            int32_t layer_norm_scale_b, int32_t variance_limit,
                            int n_batch, int n_input, int16_t* output) {
  // The square of std::pow(2, 10), which is the extra factor that makes sure
  // normalized values have enough resolution.
  static const int kTwoToPower20 = 1 << 20;
  for (int i = 0; i < n_batch; ++i) {
    int64_t sum = 0;
    int64_t sum_sq = 0;
    for (int j = 0; j < n_input; ++j) {
      const int32_t index = i * n_input + j;
      int32_t val = static_cast<int32_t>(input[index]);
      sum += val;
      sum_sq += val * val;
    }
    int32_t mean =
        static_cast<int32_t>(static_cast<int64_t>(sum) * 1024 / n_input);
    // TODO(b/173994730): Avoids overflow but only works for POT n_input.
    int32_t temp = kTwoToPower20 / n_input;
    int64_t variance =
        sum_sq * temp - static_cast<int64_t>(mean) * static_cast<int64_t>(mean);
    int32_t variance2 = static_cast<int32_t>(variance / kTwoToPower20);
    if (variance2 < 1) {
      variance2 = variance_limit;
    }
    int32_t stddev_inverse_a;
    int stddev_inverse_b;
    GetInvSqrtQuantizedMultiplierExp(variance2, /*reverse_shift*/ -1,
                                     &stddev_inverse_a, &stddev_inverse_b);

    for (int j = 0; j < n_input; ++j) {
      const int32_t index = i * n_input + j;
      int32_t val = static_cast<int32_t>(input[index]);
      int32_t shifted = 1024 * val - mean;
      int32_t rescaled = MultiplyByQuantizedMultiplier(
          shifted, stddev_inverse_a, stddev_inverse_b);
      // TODO(jianlijianli): Saturate this.
      int64_t val3 = rescaled * layer_norm_weights[j] + bias[j];
      int32_t val4 =
          static_cast<int32_t>((val3 > 0 ? val3 + 512 : val3 - 512) / 1024);
      int32_t val5 = MultiplyByQuantizedMultiplier(val4, layer_norm_scale_a,
                                                   layer_norm_scale_b + 12);
      val5 = std::min(std::max(kInt16Min, val5), kInt16Max);
      output[index] = static_cast<int16_t>(val5);
    }
  }
}

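// Float reference for the integer layer norm above: mean and variance are
// computed in float, the quantized weights and bias are applied through
// layer_norm_scale and bias_scale, and the normalized result is requantized
// to int16 with a 2^12 scale.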
void PortableApplyLayerNormFloat(const int16_t* input,
                                 const int16_t* layer_norm_weights,
                                 int32_t layer_norm_scale_a,
                                 int32_t layer_norm_scale_b,
                                 const int32_t* bias, int n_batch, int n_input,
                                 int16_t* output) {
  const int32_t int16_max = std::numeric_limits<int16_t>::max();
  const int32_t int16_min = std::numeric_limits<int16_t>::min();
  const float layer_norm_scale =
      layer_norm_scale_a *
      std::pow(2.0, static_cast<double>(layer_norm_scale_b - 31));
  const float bias_scale =
      static_cast<float>(std::pow(2.0, -10)) * layer_norm_scale;

  for (int batch = 0; batch < n_batch; ++batch) {
    float sum = 0.0f;
    float sum_sq = 0.0f;
    for (int i = 0; i < n_input; ++i) {
      const int index = batch * n_input + i;
      const float value = static_cast<float>(input[index]);
      sum += value;
      sum_sq += value * value;
    }
    const float mean = sum / n_input;
    float stddev_inv = 0.0f;
    const float variance = sum_sq / n_input - mean * mean;
    if (variance == 0) {
      stddev_inv = 1.0f / std::sqrt(1e-8f);
    } else {
      stddev_inv = 1.0f / std::sqrt(variance);
    }
    for (int i = 0; i < n_input; ++i) {
      const int index = batch * n_input + i;
      const float normalized_value =
          (static_cast<float>(input[index]) - mean) * stddev_inv;
      const float weighted_normalized_value =
          normalized_value * layer_norm_weights[i] * layer_norm_scale +
          bias[i] * bias_scale;
      const int32_t quant_output = static_cast<int32_t>(round(
          weighted_normalized_value * static_cast<float>(std::pow(2, 12))));
      output[index] = std::min(int16_max, std::max(int16_min, quant_output));
    }
  }
}

void PortableMatrixScalarMultiplyAccumulate(const int8_t* matrix,
                                            int32_t scalar, int32_t n_row,
                                            int32_t n_col, int32_t* output) {
  for (int i = 0; i < n_row; ++i) {
    int32_t row_sum = 0;
    for (int j = 0; j < n_col; ++j) {
      row_sum += *matrix++;
    }
    output[i] += row_sum * scalar;
  }
}

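// Fixed-point sigmoid: the input is interpreted as Q3.12 and the output is
// written as Q0.15, using gemmlowp's 16-bit fixed-point logistic. The float
// variant below mirrors it in floating point.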
void PortableApplySigmoid(const int16_t* input, int32_t n_batch,
                          int32_t n_input, int16_t* output) {
  for (int batch = 0; batch < n_batch; ++batch) {
    for (int c = 0; c < n_input; c++) {
      using F3 = gemmlowp::FixedPoint<std::int16_t, 3>;
      using F0 = gemmlowp::FixedPoint<std::int16_t, 0>;
      const int index = batch * n_input + c;
      F3 sigmoid_input = F3::FromRaw(input[index]);
      F0 sigmoid_output = gemmlowp::logistic(sigmoid_input);
      output[index] = sigmoid_output.raw();
    }
  }
}

void PortableApplySigmoidFloat(const int16_t* input, int32_t n_batch,
                               int32_t n_input, int16_t* output) {
  const int32_t int16_max = std::numeric_limits<int16_t>::max();
  const int32_t int16_min = std::numeric_limits<int16_t>::min();
  for (int batch = 0; batch < n_batch; ++batch) {
    for (int i = 0; i < n_input; ++i) {
      const int index = batch * n_input + i;
      const float float_input =
          input[index] * static_cast<float>(std::pow(2, -12));
      const float float_output = 1.0f / (1.0f + std::exp(-float_input));
      const int32_t quant_output = static_cast<int32_t>(
          float_output * static_cast<float>(std::pow(2, 15)));
      const int32_t quant_output_clamped =
          std::min(int16_max, std::max(int16_min, quant_output));
      output[index] = static_cast<int16_t>(quant_output_clamped);
    }
  }
}

template <int IntegerBits>
void PortableApplyTanhImpl(const int16_t* input, int32_t n_batch,
                           int32_t n_input, int16_t* output) {
  using FX = gemmlowp::FixedPoint<std::int16_t, IntegerBits>;
  using F0 = gemmlowp::FixedPoint<std::int16_t, 0>;
  for (int batch = 0; batch < n_batch; ++batch) {
    for (int i = 0; i < n_input; ++i) {
      const int index = batch * n_input + i;
      FX tanh_input = FX::FromRaw(input[index]);
      F0 tanh_output = gemmlowp::tanh(tanh_input);
      output[index] = tanh_output.raw();
    }
  }
}

void PortableApplyTanh(int32_t integer_bits, const int16_t* input,
                       int32_t n_batch, int32_t n_input, int16_t* output) {
  assert(integer_bits <= 6);
#define DISPATCH_TANH(i)                                       \
  case i:                                                      \
    PortableApplyTanhImpl<i>(input, n_batch, n_input, output); \
    break;
  switch (integer_bits) {
    DISPATCH_TANH(0);
    DISPATCH_TANH(1);
    DISPATCH_TANH(2);
    DISPATCH_TANH(3);
    DISPATCH_TANH(4);
    DISPATCH_TANH(5);
    DISPATCH_TANH(6);
    default:
      return;
  }
#undef DISPATCH_TANH
}

void PortableApplyTanhFloat(const int16_t* input, int32_t n_batch,
                            int32_t n_input, int32_t integer_bits,
                            int16_t* output) {
  const int32_t int16_max = std::numeric_limits<int16_t>::max();
  const int32_t int16_min = std::numeric_limits<int16_t>::min();
  const double two = 2.0;
  for (int batch = 0; batch < n_batch; ++batch) {
    for (int i = 0; i < n_input; ++i) {
      const int index = batch * n_input + i;
      const float float_input =
          input[index] * std::pow(two, static_cast<double>(integer_bits));
      const float float_output = std::tanh(float_input);
      const int32_t quant_output = static_cast<int32_t>(
          float_output * static_cast<float>(std::pow(2, 15)));
      const int32_t quant_output_clamped =
          std::min(int16_max, std::max(int16_min, quant_output));
      output[index] = static_cast<int16_t>(quant_output_clamped);
    }
  }
}

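// Element-wise multiplies of two int16 tensors. The first overload keeps an
// int16 output via a rounding right shift; the second requantizes to int8
// with a multiplier/shift and an output zero-point adjustment.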
void PortableCwiseMul(const int16_t* input_1, const int16_t* input_2,
                      int n_batch, int n_input, int shift, int16_t* output) {
  for (int batch = 0; batch < n_batch; ++batch) {
    for (int i = 0; i < n_input; ++i) {
      const int index = batch * n_input + i;
      const int16_t a = input_1[index];
      const int16_t b = input_2[index];
      const int32_t value = static_cast<int32_t>(a) * static_cast<int32_t>(b);
      output[index] =
          static_cast<int16_t>(gemmlowp::RoundingDivideByPOT(value, shift));
    }
  }
}

void PortableCwiseMul(const int16_t* input_1, const int16_t* input_2,
                      int32_t multiplier, int32_t shift, int32_t n_batch,
                      int32_t n_input, int32_t output_zp, int8_t* output) {
  for (int batch = 0; batch < n_batch; ++batch) {
    for (int i = 0; i < n_input; ++i) {
      const int index = batch * n_input + i;
      const int16_t a = input_1[index];
      const int16_t b = input_2[index];
      int32_t value = static_cast<int32_t>(a) * static_cast<int32_t>(b);
      value = MultiplyByQuantizedMultiplier(value, multiplier, shift);
      value -= output_zp;
      value = std::min(std::max(static_cast<int32_t>(-128), value),
                       static_cast<int32_t>(127));

      output[index] = static_cast<int8_t>(value);
    }
  }
}

void PortableCwiseAdd(const int16_t* input_1, const int16_t* input_2,
                      int n_batch, int n_input, int16_t* output) {
  for (int batch = 0; batch < n_batch; ++batch) {
    for (int i = 0; i < n_input; ++i) {
      const int index = batch * n_input + i;
      int32_t sum = input_1[index] + input_2[index];
      const int32_t sum_clamped = std::min(kInt16Max, std::max(kInt16Min, sum));
      output[index] = static_cast<int16_t>(sum_clamped);
    }
  }
}

float PortableVectorVectorDotProduct(const float* vector1, const float* vector2,
                                     int v_size) {
  float result = 0.0;
  for (int v = 0; v < v_size; v++) {
    result += *vector1++ * *vector2++;
  }
  return result;
}

namespace {
inline int32_t VectorVectorDotProduct(const int16_t* vector1,
                                      const int16_t* vector2, int v_size) {
  int32_t result = 0;
  for (int v = 0; v < v_size; v++) {
    result += *vector1++ * *vector2++;
  }
  return result;
}
}  // namespace

void PortableBatchVectorBatchVectorDotProduct(const int16_t* vector1,
                                              const int16_t* vector2,
                                              int v_size, int n_batch,
                                              int32_t* result) {
  for (int b = 0; b < n_batch; b++) {
    result[b] = VectorVectorDotProduct(vector1, vector2, v_size);
    vector1 += v_size;
    vector2 += v_size;
  }
}

void PortableVectorBatchVectorCwiseProductAccumulate(
    const int16_t* vector, int v_size, const int16_t* batch_vector, int n_batch,
    int32_t multiplier, int shift, int16_t* result) {
  for (int b = 0; b < n_batch; b++) {
    for (int v = 0; v < v_size; v++) {
      int32_t prod = vector[v] * *batch_vector++;
      prod = MultiplyByQuantizedMultiplier(prod, multiplier, shift);
      int32_t output = prod + *result;
      output = std::max(std::min(static_cast<int32_t>(32767), output),
                        static_cast<int32_t>(-32768));
      *result++ = output;
    }
  }
}

void PortableSub1Vector(const float* vector, int v_size, float* result) {
  for (int v = 0; v < v_size; v++) {
    *result++ = 1.0f - *vector++;
  }
}

void PortableSub1Vector(const int16_t* vector, int v_size, int16_t* result) {
  static const int16_t kOne = 32767;
  for (int v = 0; v < v_size; v++) {
    *result++ = kOne - *vector++;
  }
}

void PortableVectorScalarMultiply(const int8_t* vector, const int v_size,
                                  const float scale, float* result) {
  for (int v = 0; v < v_size; ++v) {
    *result++ = scale * *vector++;
  }
}

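// Per-batch mean/standard-deviation normalization of a float vector, with a
// small constant (1e-8) added to the variance for numerical stability.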
void PortableMeanStddevNormalization(const float* __restrict__ input_vector,
                                     float* __restrict__ output_vector,
                                     int v_size, int n_batch) {
  for (int batch = 0; batch < n_batch; ++batch) {
    float sum = 0.0f;
    for (int i = 0; i < v_size; ++i) {
      sum += input_vector[i];
    }
    const float mean = sum / v_size;
    float sum_diff_sq = 0.0f;
    for (int i = 0; i < v_size; ++i) {
      const float diff = input_vector[i] - mean;
      sum_diff_sq += diff * diff;
    }
    const float variance = sum_diff_sq / v_size;
    constexpr float kNormalizationConstant = 1e-8f;
    const float stddev_inv =
        1.0f / std::sqrt(variance + kNormalizationConstant);
    for (int i = 0; i < v_size; ++i) {
      output_vector[i] = (input_vector[i] - mean) * stddev_inv;
    }
    input_vector += v_size;
    output_vector += v_size;
  }
}

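// Rescales the int8 input and recurrent gate contributions to a common int16
// scale (after removing their zero points) and adds them with saturation.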
void PortableTwoGateSaturatingAdd(const int8_t* input, int8_t input_zp,
                                  const int8_t* recurrent, int8_t recurrent_zp,
                                  int32_t input_effective_scale_a,
                                  int32_t input_effective_scale_b,
                                  int32_t recurrent_effective_scale_a,
                                  int32_t recurrent_effective_scale_b,
                                  int32_t n_batch, int32_t n_cell,
                                  int16_t* output) {
  const int32_t int16_max = std::numeric_limits<int16_t>::max();
  const int32_t int16_min = std::numeric_limits<int16_t>::min();
  for (int i = 0; i < n_batch * n_cell; ++i) {
    int32_t x = static_cast<int32_t>(input[i]) - static_cast<int32_t>(input_zp);
    int32_t h =
        static_cast<int32_t>(recurrent[i]) - static_cast<int32_t>(recurrent_zp);
    int32_t x_scaled = MultiplyByQuantizedMultiplier(x, input_effective_scale_a,
                                                     input_effective_scale_b);
    int32_t h_scaled = MultiplyByQuantizedMultiplier(
        h, recurrent_effective_scale_a, recurrent_effective_scale_b);
    int32_t y = h_scaled + x_scaled;
    if (y > int16_max) {
      y = int16_max;
    }
    if (y < int16_min) {
      y = int16_min;
    }
    output[i] = static_cast<int16_t>(y);
  }
}

}  // namespace tensor_utils
}  // namespace tflite