/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#include <algorithm>
#include <cassert>
#include <cmath>
#include <cstdint>
#include <cstring>
#include <limits>
#include <utility>

#include "fixedpoint/fixedpoint.h"
#include "tensorflow/lite/kernels/internal/common.h"
#include "tensorflow/lite/kernels/internal/compatibility.h"
#include "tensorflow/lite/kernels/internal/cppmath.h"
#include "tensorflow/lite/kernels/internal/reference/portable_tensor_utils_impl.h"

#if defined(_MSC_VER)
#define __restrict__ __restrict
#endif

namespace tflite {
namespace tensor_utils {

namespace {
const int32_t kInt16Max = std::numeric_limits<int16_t>::max();
const int32_t kInt16Min = std::numeric_limits<int16_t>::min();
}  // namespace

void PortableSymmetricQuantizeFloats(const float* values, const int size,
                                     int8_t* quantized_values, float* min_value,
                                     float* max_value, float* scaling_factor) {
  auto minmax = std::minmax_element(values, values + size);
  *min_value = *minmax.first;
  *max_value = *minmax.second;

  PortableSymmetricQuantizeFloats(values, size, quantized_values, *min_value,
                                  *max_value, scaling_factor);
}

void PortableSymmetricQuantizeFloats(const float* values, const int size,
                                     int8_t* quantized_values, float min_value,
                                     float max_value, float* scaling_factor) {
  const int32_t kScale = 127;
  const float range = std::max(std::abs(min_value), std::abs(max_value));
  if (range == 0) {
    memset(quantized_values, 0, size * sizeof(int8_t));
    *scaling_factor = 1;
    return;
  }
  *scaling_factor = range / kScale;
  const float scaling_factor_inv = kScale / range;
  for (int i = 0; i < size; ++i) {
    const int32_t quantized_value =
        static_cast<int32_t>(TfLiteRound(values[i] * scaling_factor_inv));
    // Clamp: just in case of some odd numeric offset.
    quantized_values[i] = static_cast<int8_t>(
        std::min(kScale, std::max(-kScale, quantized_value)));
  }
}
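
// Worked example for the symmetric path above (values chosen purely for
// illustration): for inputs {-2.0f, 0.5f, 1.0f} the range is 2.0, so
// *scaling_factor = 2.0 / 127 and scaling_factor_inv = 63.5; rounding and
// clamping to [-127, 127] yields the quantized values {-127, 32, 64}.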

void PortableAsymmetricQuantizeFloats(const float* values, const int size,
                                      int8_t* quantized_values,
                                      float* scaling_factor, int32_t* offset) {
  const int32_t kMinScale = -128;
  const int32_t kMaxScale = 127;
  const double qmin_double = kMinScale;
  const double qmax_double = kMaxScale;
  const auto minmax = std::minmax_element(values, values + size);
  const double rmin = static_cast<double>(std::min(0.0f, *minmax.first));
  const double rmax = static_cast<double>(std::max(0.0f, *minmax.second));
  if (rmin == rmax) {
    memset(quantized_values, 0, size * sizeof(int8_t));
    *scaling_factor = 1;
    *offset = 0;
    return;
  } else {
    double scale = (rmax - rmin) / (qmax_double - qmin_double);
    const double zero_point_from_min = qmin_double - rmin / scale;
    const double zero_point_from_max = qmax_double - rmax / scale;
    const double zero_point_from_min_error =
        std::abs(qmin_double) + std::abs(rmin / scale);
    const double zero_point_from_max_error =
        std::abs(qmax_double) + std::abs(rmax / scale);
    const double zero_point_double =
        zero_point_from_min_error < zero_point_from_max_error
            ? zero_point_from_min
            : zero_point_from_max;
    int8_t nudged_zero_point = 0;
    if (zero_point_double <= qmin_double) {
      nudged_zero_point = kMinScale;
    } else if (zero_point_double >= qmax_double) {
      nudged_zero_point = kMaxScale;
    } else {
      nudged_zero_point = static_cast<int8_t>(round(zero_point_double));
    }
    *scaling_factor = scale;
    *offset = nudged_zero_point;
  }
  const float scaling_factor_inv = 1.0f / *scaling_factor;
  for (int i = 0; i < size; ++i) {
    const int32_t quantized_value = static_cast<int32_t>(
        TfLiteRound(*offset + values[i] * scaling_factor_inv));
    quantized_values[i] =
        std::min(kMaxScale, std::max(kMinScale, quantized_value));
  }
}
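
// Worked example for the asymmetric path above (values chosen purely for
// illustration): with rmin = -1.0 and rmax = 3.0, scale = 4.0 / 255 and both
// zero-point candidates evaluate to -64.25, so the nudged *offset is -64;
// 0.0f then quantizes to -64, while -1.0f and 3.0f map to -128 and 127.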

void PortableMatrixBatchVectorMultiplyAccumulate(const float* matrix,
                                                 int m_rows, int m_cols,
                                                 const float* vector,
                                                 int n_batch, float* result) {
  float* result_in_batch = result;
  for (int b = 0; b < n_batch; b++) {
    const float* matrix_ptr = matrix;
    for (int r = 0; r < m_rows; r++) {
      float dot_prod = 0.0f;
      const float* vector_in_batch = vector + b * m_cols;
      for (int c = 0; c < m_cols; c++) {
        dot_prod += *matrix_ptr++ * *vector_in_batch++;
      }
      *result_in_batch += dot_prod;
      ++result_in_batch;
    }
  }
}
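
// Layout note for the dense float kernel above: `matrix` is row-major with
// shape [m_rows, m_cols], `vector` packs n_batch input vectors of length
// m_cols back to back, and `result` holds n_batch blocks of m_rows values
// that are accumulated into rather than overwritten.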

void PortableMatrixBatchVectorMultiplyAccumulate(
    const int8_t* __restrict__ matrix, const int m_rows, const int m_cols,
    const int8_t* __restrict__ vectors, const float* scaling_factors,
    int n_batch, float* __restrict__ result) {
  for (int batch = 0; batch < n_batch; ++batch, vectors += m_cols) {
    const float batch_scaling_factor = scaling_factors[batch];
    // Get the address of the first row.
    const int8_t* row_ptr = matrix;
    for (int row = 0; row < m_rows; ++row) {
      // Initialize the dot product sum for the row to 0.
      int32_t dotprod = 0;
#if defined(__GNUC__)
      // Prefetch the row to cache.
      __builtin_prefetch(row_ptr, 0 /* prefetch for read */,
                         3 /* temporal locality */);
#endif
      for (int col = 0; col < m_cols; ++col, ++row_ptr) {
        dotprod += (*row_ptr) * (vectors[col]);
      }  // for col
      *result += dotprod * batch_scaling_factor;
      ++result;
    }  // for row
  }  // for batch
}
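
// The hybrid kernel above accumulates int8 products in int32 and converts each
// per-row sum back to float with the per-batch scaling factor; by convention
// (set by the caller, not here) that factor typically combines the input and
// weight quantization scales.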

void PortableMatrixBatchVectorMultiplyAccumulate(
    const int8_t* __restrict__ matrix, const int m_rows, const int m_cols,
    const int8_t* __restrict__ vectors, const float* scaling_factors,
    int n_batch, float* __restrict__ result, const float* per_channel_scale,
    const int32_t* input_offset, int32_t* scratch, int32_t* row_sums,
    bool* compute_row_sums, CpuBackendContext* context) {
  if (input_offset == nullptr) {
    PortableMatrixBatchVectorMultiplyAccumulate(
        matrix, m_rows, m_cols, vectors, scaling_factors, n_batch, result);
    return;
  }
  if (!compute_row_sums || *compute_row_sums) {
    PortableReductionSumVector(matrix, row_sums, m_rows, m_cols);
    if (compute_row_sums) {
      *compute_row_sums = false;
    }
  }

  for (int batch = 0; batch < n_batch; ++batch, vectors += m_cols) {
    const float batch_scaling_factor = scaling_factors[batch];
    const int32_t batch_offset = input_offset[batch];
    const int8_t* row_ptr = matrix;
    for (int row = 0; row < m_rows; ++row) {
      int32_t dotprod = 0;
      float scale = batch_scaling_factor;
      if (per_channel_scale) {
        scale *= per_channel_scale[row];
      }
#if defined(__GNUC__)
      // Prefetch the row to cache.
      __builtin_prefetch(row_ptr, 0 /* prefetch for read */,
                         3 /* temporal locality */);
#endif
      for (int col = 0; col < m_cols; ++col, ++row_ptr) {
        dotprod += (*row_ptr) * vectors[col];
      }  // for col
      dotprod -= row_sums[row] * batch_offset;
      *result += dotprod * scale;
      ++result;
    }  // for row
  }  // for batch
}
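
// Why the row-sum correction above works: for a per-batch input offset o,
//   sum_c w[r][c] * (v[c] - o) = sum_c w[r][c] * v[c] - o * sum_c w[r][c],
// so subtracting row_sums[r] * batch_offset from the raw dot product removes
// the offset contribution without touching the inner loop; row_sums is
// computed once and reused via the compute_row_sums flag.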

void PortableSparseMatrixBatchVectorMultiplyAccumulate1x4(
    const float* __restrict__ matrix, const int32_t* __restrict__ segments,
    const int32_t* __restrict__ indices, int m_rows, int m_cols,
    const float* __restrict__ vector, int n_batch, float* __restrict__ result) {
  const int kBlockSize = 4;
  TFLITE_DCHECK_EQ(m_cols % kBlockSize, 0);
  for (int batch = 0; batch < n_batch; batch++) {
    const float* matrix_ptr = matrix;
    for (int row = 0; row < m_rows; row++) {
      float dot_prod = 0.0f;
      const float* vector_in_batch = vector + batch * m_cols;
      for (int i = segments[row]; i < segments[row + 1]; i++) {
        const int block_start_index = indices[i] * kBlockSize;
        const float* vector_block_in_batch_ptr =
            vector_in_batch + block_start_index;
        for (int c = 0; c < kBlockSize; c++) {
          dot_prod += *matrix_ptr++ * *vector_block_in_batch_ptr++;
        }
      }
      result[batch * m_rows + row] += dot_prod;
    }
  }
}
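
// Sparse format used above (as read off the loops): the matrix stores only its
// non-zero 1x4 blocks, packed contiguously; `segments` is a CSR-style row
// pointer of length m_rows + 1 into `indices`, and indices[i] * kBlockSize is
// the starting column of the i-th non-zero block of the row.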

void PortableSparseMatrixBatchVectorMultiplyAccumulate1x16(
    const int8_t* __restrict__ matrix, const int32_t* __restrict__ segments,
    const int32_t* __restrict__ indices, int m_rows, int m_cols,
    const int8_t* __restrict__ vector, const int32_t* __restrict__ bias_vector,
    int n_batch, const int32_t input_offset, const int32_t output_multiplier,
    const int32_t output_shift, const int32_t output_offset,
    const int32_t output_activation_min, const int32_t output_activation_max,
    int8_t* __restrict__ result) {
  const int kBlockSize = 16;
  TFLITE_DCHECK_EQ(m_cols % kBlockSize, 0);
  for (int batch = 0; batch < n_batch; ++batch) {
    const int8_t* matrix_ptr = matrix;
    for (int row = 0; row < m_rows; ++row) {
      int32_t dot_prod = 0;
      const int8_t* vector_in_batch = vector + batch * m_cols;
      for (int i = segments[row]; i < segments[row + 1]; ++i) {
        const int block_start_index = indices[i] * kBlockSize;
        const int8_t* vector_block_in_batch_ptr =
            vector_in_batch + block_start_index;
        for (int c = 0; c < kBlockSize; c++) {
          dot_prod += *matrix_ptr * *vector_block_in_batch_ptr++;
          dot_prod += *matrix_ptr++ * input_offset;
        }
      }
      const int32_t bias_value = bias_vector != nullptr ? bias_vector[row] : 0;
      dot_prod = MultiplyByQuantizedMultiplier(dot_prod + bias_value,
                                               output_multiplier, output_shift);
      dot_prod += output_offset;
      result[batch * m_rows + row] =
          static_cast<int8_t>(ActivationFunctionWithMinMax(
              dot_prod, output_activation_min, output_activation_max));
    }
  }
}
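
// In the 1x16 kernel above the two accumulations in the inner loop together
// compute w * (x + input_offset) for each element, so the whole row reduces to
// dot(w, x + input_offset); bias, requantization (output_multiplier,
// output_shift), output_offset and the activation clamp are then applied to
// the int32 sum.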

void PortableSparseMatrixBatchVectorMultiplyAccumulate(
    const float* __restrict__ matrix, const uint8_t* __restrict__ ledger,
    int m_rows, int m_cols, const float* __restrict__ vector, int n_batch,
    float* __restrict__ result) {
  const int kBlockSize = 16;
  TFLITE_DCHECK_EQ(  // NOLINT
      m_cols % kBlockSize, 0);
  for (int batch = 0; batch < n_batch; batch++) {
    const float* matrix_ptr = matrix;
    const uint8_t* ledger_ptr = ledger;
    for (int row = 0; row < m_rows; row++) {
      float dot_prod = 0.0f;
      int num_nonzero_blocks = *ledger_ptr++;
      if (num_nonzero_blocks > 0) {
        const float* vector_in_batch = vector + batch * m_cols;
        for (int i = 0; i < num_nonzero_blocks; i++) {
          const int block_start_index = *ledger_ptr++ * kBlockSize;
          const float* vector_block_in_batch_ptr =
              vector_in_batch + block_start_index;
          for (int c = 0; c < kBlockSize; c++) {
            dot_prod += *matrix_ptr++ * *vector_block_in_batch_ptr++;
          }
        }
      }
      result[batch * m_rows + row] += dot_prod;
    }
  }
}
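
// Ledger format used above (as read off the loops): for each row the ledger
// holds the count of non-zero 1x16 blocks followed by that many block indices,
// while `matrix` stores only those non-zero blocks, packed contiguously in the
// same order.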

void PortableSparseMatrixBatchVectorMultiplyAccumulate(
    const int8_t* __restrict__ matrix, const uint8_t* ledger, const int m_rows,
    const int m_cols, const int8_t* __restrict__ vectors,
    const float* scaling_factors, int n_batch, float* __restrict__ result) {
  static const int kBlockSize = 16;
  TFLITE_DCHECK_EQ(  // NOLINT
      m_cols % kBlockSize, 0);
  for (int batch = 0; batch < n_batch; ++batch, vectors += m_cols) {
    const float batch_scaling_factor = scaling_factors[batch];
    const uint8_t* ledger_ptr = ledger;
    // Get the address of the first row.
    const int8_t* row_ptr = matrix;
    for (int row = 0; row < m_rows; ++row) {
      // Initialize the dot product sum for the row to 0.
      int32_t dotprod = 0;
#if defined(__GNUC__)
      // Prefetch the row to cache.
      __builtin_prefetch(row_ptr, 0 /* prefetch for read */,
                         3 /* temporal locality */);
#endif
      int num_nonzero_blocks = *ledger_ptr++;
      for (int i = 0; i < num_nonzero_blocks; i++) {
        const int block_start_index = *ledger_ptr++ * kBlockSize;
        const int8_t* vector_block_ptr = vectors + block_start_index;
        for (int c = 0; c < kBlockSize; c++) {
          dotprod += (*row_ptr++) * (*vector_block_ptr++);
        }  // for block
      }  // for num_nonzero_blocks
      result[batch * m_rows + row] += dotprod * batch_scaling_factor;
    }  // for row
  }  // for batch
}

template <typename T>
void PortableMatrixBatchVectorMultiplyAccumulateImpl(
    const int8_t* input, const int32_t* bias,
    const int8_t* input_to_gate_weights, int32_t multiplier, int32_t shift,
    int32_t n_batch, int32_t n_input, int32_t n_output, int32_t output_zp,
    T* output) {
  const int16_t output_max = std::numeric_limits<T>::max();
  const int16_t output_min = std::numeric_limits<T>::min();
  for (int batch = 0; batch < n_batch; ++batch) {
    for (int row = 0; row < n_output; ++row) {
      int32_t acc = bias[row];
      for (int col = 0; col < n_input; ++col) {
        int8_t input_val = input[batch * n_input + col];
        int8_t weights_val = input_to_gate_weights[row * n_input + col];
        acc += input_val * weights_val;
      }
      acc = MultiplyByQuantizedMultiplier(acc, multiplier, shift);
      acc += output_zp;
      acc += output[batch * n_output + row];
      if (acc > output_max) {
        acc = output_max;
      }
      if (acc < output_min) {
        acc = output_min;
      }
      output[batch * n_output + row] = static_cast<T>(acc);
    }
  }
}

void PortableMatrixBatchVectorMultiplyAccumulate(
    const int8_t* input, const int32_t* bias,
    const int8_t* input_to_gate_weights, int32_t multiplier, int32_t shift,
    int32_t n_batch, int32_t n_input, int32_t n_output, int32_t output_zp,
    int32_t* scratch, int16_t* output, CpuBackendContext* context) {
  PortableMatrixBatchVectorMultiplyAccumulateImpl(
      input, bias, input_to_gate_weights, multiplier, shift, n_batch, n_input,
      n_output, output_zp, output);
}

void PortableMatrixBatchVectorMultiplyAccumulate(
    const int8_t* input, const int32_t* bias,
    const int8_t* input_to_gate_weights, int32_t multiplier, int32_t shift,
    int32_t n_batch, int32_t n_input, int32_t n_output, int32_t output_zp,
    int32_t* scratch, int8_t* output, CpuBackendContext* context) {
  PortableMatrixBatchVectorMultiplyAccumulateImpl(
      input, bias, input_to_gate_weights, multiplier, shift, n_batch, n_input,
      n_output, output_zp, output);
}

void PortableMatrixBatchVectorMultiply(const int8_t* input,
                                       int32_t input_zeropoint,
                                       const int8_t* input_to_gate_weights,
                                       int32_t input_to_gate_effective_scale_a,
                                       int32_t input_to_gate_effective_scale_b,
                                       int32_t n_batch, int32_t n_input,
                                       int32_t n_cell, int8_t* gate_output,
                                       int8_t gate_output_zp) {
  const int32_t int8_max = std::numeric_limits<int8_t>::max();
  const int32_t int8_min = std::numeric_limits<int8_t>::min();
  for (int batch = 0; batch < n_batch; ++batch) {
    for (int row = 0; row < n_cell; ++row) {
      int32_t acc = 0;
      for (int col = 0; col < n_input; ++col) {
        int32_t input_val = input[batch * n_input + col];
        int8_t weights_val = input_to_gate_weights[row * n_input + col];
        acc += (input_val - input_zeropoint) * weights_val;
      }
      acc = MultiplyByQuantizedMultiplier(acc, input_to_gate_effective_scale_a,
                                          input_to_gate_effective_scale_b);
      acc += gate_output_zp;
      if (acc > int8_max) {
        acc = int8_max;
      }
      if (acc < int8_min) {
        acc = int8_min;
      }
      gate_output[batch * n_cell + row] = static_cast<int8_t>(acc);
    }
  }
}

void PortableMatrixBatchVectorMultiply(
    const int16_t* hidden, const int8_t* hidden_to_output_weights,
    int32_t proj_effective_scale_a, int32_t proj_effective_scale_b,
    const int32_t* gate_bias, int32_t n_batch, int32_t n_hidden,
    int32_t n_output, int32_t output_zp, int8_t* proj_output) {
  const int16_t int8_max = std::numeric_limits<int8_t>::max();
  const int16_t int8_min = std::numeric_limits<int8_t>::min();
  for (int batch = 0; batch < n_batch; ++batch) {
    for (int row = 0; row < n_output; ++row) {
      int64_t acc = gate_bias[row];
      for (int col = 0; col < n_hidden; ++col) {
        int16_t input_val = hidden[batch * n_hidden + col];
        int8_t weights_val = hidden_to_output_weights[row * n_hidden + col];
        int64_t curr = acc;
        acc += input_val * weights_val;
        if (input_val * weights_val > 0 && acc < curr) {
          acc = std::numeric_limits<int32_t>::max();
        }
        if (input_val * weights_val < 0 && acc > curr) {
          acc = std::numeric_limits<int32_t>::min();
        }
      }
      acc = MultiplyByQuantizedMultiplier(acc, proj_effective_scale_a,
                                          proj_effective_scale_b);
      acc += output_zp;
      if (acc > int8_max) {
        acc = int8_max;
      }
      if (acc < int8_min) {
        acc = int8_min;
      }
      proj_output[batch * n_output + row] = acc;
    }
  }
}
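
// The projection kernel above accumulates int16 x int8 products in an int64
// accumulator with a sign-based saturation check that clamps the running sum
// to int32 limits, then requantizes with (proj_effective_scale_a,
// proj_effective_scale_b), adds output_zp, and saturates the result to int8.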

void PortableApplyLayerNorm(const int16_t* input,
                            const int16_t* layer_norm_weights,
                            const int32_t* bias, int32_t layer_norm_scale_a,
                            int32_t layer_norm_scale_b, int32_t variance_limit,
                            int n_batch, int n_input, int16_t* output) {
  // The square of std::pow(2, 10), which is the extra factor that makes sure
  // normalized values have enough resolution.
  static const int kTwoToPower20 = 1 << 20;
  for (int i = 0; i < n_batch; ++i) {
    int64_t sum = 0;
    int64_t sum_sq = 0;
    for (int j = 0; j < n_input; ++j) {
      const int32_t index = i * n_input + j;
      int32_t val = static_cast<int32_t>(input[index]);
      sum += val;
      sum_sq += val * val;
    }
    int32_t mean =
        static_cast<int32_t>(static_cast<int64_t>(sum) * 1024 / n_input);
    // TODO(b/173994730): Avoids overflow but only works for POT n_input.
    int32_t temp = kTwoToPower20 / n_input;
    int64_t variance =
        sum_sq * temp - static_cast<int64_t>(mean) * static_cast<int64_t>(mean);
    int32_t variance2 = static_cast<int32_t>(variance / kTwoToPower20);
    if (variance2 < 1) {
      variance2 = variance_limit;
    }
    int32_t stddev_inverse_a;
    int stddev_inverse_b;
    GetInvSqrtQuantizedMultiplierExp(variance2, /*reverse_shift*/ -1,
                                     &stddev_inverse_a, &stddev_inverse_b);

    for (int j = 0; j < n_input; ++j) {
      const int32_t index = i * n_input + j;
      int32_t val = static_cast<int32_t>(input[index]);
      int32_t shifted = 1024 * val - mean;
      int32_t rescaled = MultiplyByQuantizedMultiplier(
          shifted, stddev_inverse_a, stddev_inverse_b);
      // TODO(jianlijianli): Saturate this.
      int64_t val3 = rescaled * layer_norm_weights[j] + bias[j];
      int32_t val4 =
          static_cast<int32_t>((val3 > 0 ? val3 + 512 : val3 - 512) / 1024);
      int32_t val5 = MultiplyByQuantizedMultiplier(val4, layer_norm_scale_a,
                                                   layer_norm_scale_b + 12);
      val5 = std::min(std::max(kInt16Min, val5), kInt16Max);
      output[index] = static_cast<int16_t>(val5);
    }
  }
}
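
// Fixed-point bookkeeping in the kernel above: the mean carries an extra 2^10
// gain (the * 1024), the variance uses 2^20 / n_input to defer the division
// (exact only for power-of-two n_input, per the TODO), the inverse standard
// deviation comes from GetInvSqrtQuantizedMultiplierExp, and the final rescale
// adds 12 to the shift (layer_norm_scale_b + 12), matching the 2^12 output
// scaling used by the float reference below, before clamping to int16.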

void PortableApplyLayerNormFloat(const int16_t* input,
                                 const int16_t* layer_norm_weights,
                                 int32_t layer_norm_scale_a,
                                 int32_t layer_norm_scale_b,
                                 const int32_t* bias, int n_batch, int n_input,
                                 int16_t* output) {
  const int32_t int16_max = std::numeric_limits<int16_t>::max();
  const int32_t int16_min = std::numeric_limits<int16_t>::min();
  const float layer_norm_scale =
      layer_norm_scale_a *
      std::pow(2.0, static_cast<double>(layer_norm_scale_b - 31));
  const float bias_scale =
      static_cast<float>(std::pow(2.0, -10)) * layer_norm_scale;

  for (int batch = 0; batch < n_batch; ++batch) {
    float sum = 0.0f;
    float sum_sq = 0.0f;
    for (int i = 0; i < n_input; ++i) {
      const int index = batch * n_input + i;
      const float value = static_cast<float>(input[index]);
      sum += value;
      sum_sq += value * value;
    }
    const float mean = sum / n_input;
    float stddev_inv = 0.0f;
    const float variance = sum_sq / n_input - mean * mean;
    if (variance == 0) {
      stddev_inv = 1.0f / std::sqrt(1e-8f);
    } else {
      stddev_inv = 1.0f / std::sqrt(variance);
    }
    for (int i = 0; i < n_input; ++i) {
      const int index = batch * n_input + i;
      const float normalized_value =
          (static_cast<float>(input[index]) - mean) * stddev_inv;
      const float weighted_normalized_value =
          normalized_value * layer_norm_weights[i] * layer_norm_scale +
          bias[i] * bias_scale;
      const int32_t quant_output = static_cast<int32_t>(round(
          weighted_normalized_value * static_cast<float>(std::pow(2, 12))));
      output[index] = std::min(int16_max, std::max(int16_min, quant_output));
    }
  }
}

void PortableMatrixScalarMultiplyAccumulate(const int8_t* matrix,
                                            int32_t scalar, int32_t n_row,
                                            int32_t n_col, int32_t* output) {
  for (int i = 0; i < n_row; ++i) {
    int32_t row_sum = 0;
    for (int j = 0; j < n_col; ++j) {
      row_sum += *matrix++;
    }
    output[i] += row_sum * scalar;
  }
}

void PortableApplySigmoid(const int16_t* input, int32_t n_batch,
                          int32_t n_input, int16_t* output) {
  for (int batch = 0; batch < n_batch; ++batch) {
    for (int c = 0; c < n_input; c++) {
      using F3 = gemmlowp::FixedPoint<std::int16_t, 3>;
      using F0 = gemmlowp::FixedPoint<std::int16_t, 0>;
      const int index = batch * n_input + c;
      F3 sigmoid_input = F3::FromRaw(input[index]);
      F0 sigmoid_output = gemmlowp::logistic(sigmoid_input);
      output[index] = sigmoid_output.raw();
    }
  }
}
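
// Fixed-point formats for the sigmoid above: the input is interpreted as Q3.12
// (gemmlowp::FixedPoint<int16_t, 3>) and the logistic output is written back
// as Q0.15, which matches the 2^-12 input / 2^15 output scaling of the float
// reference that follows.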

void PortableApplySigmoidFloat(const int16_t* input, int32_t n_batch,
                               int32_t n_input, int16_t* output) {
  const int32_t int16_max = std::numeric_limits<int16_t>::max();
  const int32_t int16_min = std::numeric_limits<int16_t>::min();
  for (int batch = 0; batch < n_batch; ++batch) {
    for (int i = 0; i < n_input; ++i) {
      const int index = batch * n_input + i;
      const float float_input =
          input[index] * static_cast<float>(std::pow(2, -12));
      const float float_output = 1.0f / (1.0f + std::exp(-float_input));
      const int32_t quant_output = static_cast<int32_t>(
          float_output * static_cast<float>(std::pow(2, 15)));
      const int32_t quant_output_clamped =
          std::min(int16_max, std::max(int16_min, quant_output));
      output[index] = static_cast<int16_t>(quant_output_clamped);
    }
  }
}

template <int IntegerBits>
void PortableApplyTanhImpl(const int16_t* input, int32_t n_batch,
                           int32_t n_input, int16_t* output) {
  using FX = gemmlowp::FixedPoint<std::int16_t, IntegerBits>;
  using F0 = gemmlowp::FixedPoint<std::int16_t, 0>;
  for (int batch = 0; batch < n_batch; ++batch) {
    for (int i = 0; i < n_input; ++i) {
      const int index = batch * n_input + i;
      FX tanh_input = FX::FromRaw(input[index]);
      F0 tanh_output = gemmlowp::tanh(tanh_input);
      output[index] = tanh_output.raw();
    }
  }
}

void PortableApplyTanh(int32_t integer_bits, const int16_t* input,
                       int32_t n_batch, int32_t n_input, int16_t* output) {
  assert(integer_bits <= 6);
#define DISPATCH_TANH(i)                                       \
  case i:                                                      \
    PortableApplyTanhImpl<i>(input, n_batch, n_input, output); \
    break;
  switch (integer_bits) {
    DISPATCH_TANH(0);
    DISPATCH_TANH(1);
    DISPATCH_TANH(2);
    DISPATCH_TANH(3);
    DISPATCH_TANH(4);
    DISPATCH_TANH(5);
    DISPATCH_TANH(6);
    default:
      return;
  }
#undef DISPATCH_TANH
}
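
// The dispatch above selects the fixed-point format of the tanh input:
// integer_bits = i treats the int16 input as Qi.(15 - i), while the output is
// always Q0.15. Only 0 through 6 integer bits are handled; any other value
// falls through to the default case and returns without writing the output.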

void PortableApplyTanhFloat(const int16_t* input, int32_t n_batch,
                            int32_t n_input, int32_t integer_bits,
                            int16_t* output) {
  const int32_t int16_max = std::numeric_limits<int16_t>::max();
  const int32_t int16_min = std::numeric_limits<int16_t>::min();
  const double two = 2.0;
  for (int batch = 0; batch < n_batch; ++batch) {
    for (int i = 0; i < n_input; ++i) {
      const int index = batch * n_input + i;
      const float float_input =
          input[index] * std::pow(two, static_cast<double>(integer_bits));
      const float float_output = std::tanh(float_input);
      const int32_t quant_output = static_cast<int32_t>(
          float_output * static_cast<float>(std::pow(2, 15)));
      const int32_t quant_output_clamped =
          std::min(int16_max, std::max(int16_min, quant_output));
      output[index] = static_cast<int16_t>(quant_output_clamped);
    }
  }
}

void PortableCwiseMul(const int16_t* input_1, const int16_t* input_2,
                      int n_batch, int n_input, int shift, int16_t* output) {
  for (int batch = 0; batch < n_batch; ++batch) {
    for (int i = 0; i < n_input; ++i) {
      const int index = batch * n_input + i;
      const int16_t a = input_1[index];
      const int16_t b = input_2[index];
      const int32_t value = static_cast<int32_t>(a) * static_cast<int32_t>(b);
      output[index] =
          static_cast<int16_t>(gemmlowp::RoundingDivideByPOT(value, shift));
    }
  }
}
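
// The element-wise multiply above computes
//   output = RoundingDivideByPOT(a * b, shift),
// i.e. the 32-bit product shifted right by `shift` bits with rounding; for
// example, two Q0.15 operands with shift = 15 give a Q0.15 result.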

void PortableCwiseMul(const int16_t* input_1, const int16_t* input_2,
                      int32_t multiplier, int32_t shift, int32_t n_batch,
                      int32_t n_input, int32_t output_zp, int8_t* output) {
  for (int batch = 0; batch < n_batch; ++batch) {
    for (int i = 0; i < n_input; ++i) {
      const int index = batch * n_input + i;
      const int16_t a = input_1[index];
      const int16_t b = input_2[index];
      int32_t value = static_cast<int32_t>(a) * static_cast<int32_t>(b);
      value = MultiplyByQuantizedMultiplier(value, multiplier, shift);
      value -= output_zp;
      value = std::min(std::max(static_cast<int32_t>(-128), value),
                       static_cast<int32_t>(127));

      output[index] = static_cast<int8_t>(value);
    }
  }
}

void PortableCwiseAdd(const int16_t* input_1, const int16_t* input_2,
                      int n_batch, int n_input, int16_t* output) {
  for (int batch = 0; batch < n_batch; ++batch) {
    for (int i = 0; i < n_input; ++i) {
      const int index = batch * n_input + i;
      int32_t sum = input_1[index] + input_2[index];
      const int32_t sum_clamped = std::min(kInt16Max, std::max(kInt16Min, sum));
      output[index] = static_cast<int16_t>(sum_clamped);
    }
  }
}

float PortableVectorVectorDotProduct(const float* vector1, const float* vector2,
                                     int v_size) {
  float result = 0.0;
  for (int v = 0; v < v_size; v++) {
    result += *vector1++ * *vector2++;
  }
  return result;
}

namespace {
inline int32_t VectorVectorDotProduct(const int16_t* vector1,
                                      const int16_t* vector2, int v_size) {
  int32_t result = 0;
  for (int v = 0; v < v_size; v++) {
    result += *vector1++ * *vector2++;
  }
  return result;
}
}  // namespace

void PortableBatchVectorBatchVectorDotProduct(const int16_t* vector1,
                                              const int16_t* vector2,
                                              int v_size, int n_batch,
                                              int32_t* result) {
  for (int b = 0; b < n_batch; b++) {
    result[b] = VectorVectorDotProduct(vector1, vector2, v_size);
    vector1 += v_size;
    vector2 += v_size;
  }
}

void PortableVectorBatchVectorCwiseProductAccumulate(
    const int16_t* vector, int v_size, const int16_t* batch_vector, int n_batch,
    int32_t multiplier, int shift, int16_t* result) {
  for (int b = 0; b < n_batch; b++) {
    for (int v = 0; v < v_size; v++) {
      int32_t prod = vector[v] * *batch_vector++;
      prod = MultiplyByQuantizedMultiplier(prod, multiplier, shift);
      int32_t output = prod + *result;
      output = std::max(std::min(static_cast<int32_t>(32767), output),
                        static_cast<int32_t>(-32768));
      *result++ = output;
    }
  }
}

void PortableSub1Vector(const float* vector, int v_size, float* result) {
  for (int v = 0; v < v_size; v++) {
    *result++ = 1.0f - *vector++;
  }
}

void PortableSub1Vector(const int16_t* vector, int v_size, int16_t* result) {
  static const int16_t kOne = 32767;
  for (int v = 0; v < v_size; v++) {
    *result++ = kOne - *vector++;
  }
}

void PortableVectorScalarMultiply(const int8_t* vector, const int v_size,
                                  const float scale, float* result) {
  for (int v = 0; v < v_size; ++v) {
    *result++ = scale * *vector++;
  }
}

void PortableMeanStddevNormalization(const float* __restrict__ input_vector,
                                     float* __restrict__ output_vector,
                                     int v_size, int n_batch) {
  for (int batch = 0; batch < n_batch; ++batch) {
    float sum = 0.0f;
    for (int i = 0; i < v_size; ++i) {
      sum += input_vector[i];
    }
    const float mean = sum / v_size;
    float sum_diff_sq = 0.0f;
    for (int i = 0; i < v_size; ++i) {
      const float diff = input_vector[i] - mean;
      sum_diff_sq += diff * diff;
    }
    const float variance = sum_diff_sq / v_size;
    constexpr float kNormalizationConstant = 1e-8f;
    const float stddev_inv =
        1.0f / std::sqrt(variance + kNormalizationConstant);
    for (int i = 0; i < v_size; ++i) {
      output_vector[i] = (input_vector[i] - mean) * stddev_inv;
    }
    input_vector += v_size;
    output_vector += v_size;
  }
}

void PortableTwoGateSaturatingAdd(const int8_t* input, int8_t input_zp,
                                  const int8_t* recurrent, int8_t recurrent_zp,
                                  int32_t input_effective_scale_a,
                                  int32_t input_effective_scale_b,
                                  int32_t recurrent_effective_scale_a,
                                  int32_t recurrent_effective_scale_b,
                                  int32_t n_batch, int32_t n_cell,
                                  int16_t* output) {
  const int32_t int16_max = std::numeric_limits<int16_t>::max();
  const int32_t int16_min = std::numeric_limits<int16_t>::min();
  for (int i = 0; i < n_batch * n_cell; ++i) {
    int32_t x = static_cast<int32_t>(input[i]) - static_cast<int32_t>(input_zp);
    int32_t h =
        static_cast<int32_t>(recurrent[i]) - static_cast<int32_t>(recurrent_zp);
    int32_t x_scaled = MultiplyByQuantizedMultiplier(x, input_effective_scale_a,
                                                     input_effective_scale_b);
    int32_t h_scaled = MultiplyByQuantizedMultiplier(
        h, recurrent_effective_scale_a, recurrent_effective_scale_b);
    int32_t y = h_scaled + x_scaled;
    if (y > int16_max) {
      y = int16_max;
    }
    if (y < int16_min) {
      y = int16_min;
    }
    output[i] = static_cast<int16_t>(y);
  }
}

}  // namespace tensor_utils
}  // namespace tflite