/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_OPTIMIZED_SSE_TENSOR_UTILS_H_
#define TENSORFLOW_LITE_KERNELS_INTERNAL_OPTIMIZED_SSE_TENSOR_UTILS_H_

// Note: This file is a near-verbatim copy of neon_tensor_utils.h; the only
// differences are in MatrixBatchVectorMultiplyAccumulate and
// SparseMatrixBatchVectorMultiplyAccumulate (the other functions do not have
// SSE implementations yet).

// Note: Most of the functions below use NEON_OR_PORTABLE, which relies on the
// Intel NEON_2_SSE translator library. When a native SSE version of a function
// is implemented, replace the corresponding NEON_OR_PORTABLE call with
// SSE_OR_PORTABLE.
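//
// For illustration only, a rough sketch of the dispatch idea (a hypothetical
// simplification; the real macros are defined in sse_check.h and
// neon_check.h):
//
//   SSE_OR_PORTABLE(Foo, args...)  ->  SseFoo(args...)       when SSE is
//                                                            available,
//   SSE_OR_PORTABLE(Foo, args...)  ->  PortableFoo(args...)  otherwise.
//
// Each wrapper below therefore only forwards its arguments; the actual math
// lives in the *_impl.h headers included below.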

#include "tensorflow/lite/kernels/cpu_backend_context.h"
#include "tensorflow/lite/kernels/internal/optimized/neon_check.h"
#include "tensorflow/lite/kernels/internal/optimized/neon_tensor_utils_impl.h"
#include "tensorflow/lite/kernels/internal/optimized/sse_check.h"
#include "tensorflow/lite/kernels/internal/optimized/sse_tensor_utils_impl.h"
#include "tensorflow/lite/kernels/internal/reference/portable_tensor_utils_impl.h"

namespace tflite {
namespace tensor_utils {

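// Float matrix * batched-vector multiply-accumulate. When the build targets
// AVX2, the dedicated AVX2 kernel is used; otherwise dispatch goes through
// NEON_OR_PORTABLE (NEON_2_SSE or the portable reference code).
//
// A minimal usage sketch (illustrative only, assuming the usual tensor_utils
// convention that `result` holds n_batch x m_rows accumulators that the
// caller has already initialized):
//
//   float matrix[3 * 4] = {/* row-major weights */};
//   float vectors[2 * 4] = {/* two input vectors of length 4 */};
//   float result[2 * 3] = {};  // accumulated into, so start from zero
//   tflite::tensor_utils::MatrixBatchVectorMultiplyAccumulate(
//       matrix, /*m_rows=*/3, /*m_cols=*/4, vectors, /*n_batch=*/2, result);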
void MatrixBatchVectorMultiplyAccumulate(const float* matrix, int m_rows,
                                         int m_cols, const float* vector,
                                         int n_batch, float* result) {
#if defined(__AVX2__)
  Avx2MatrixBatchVectorMultiplyAccumulateImpl(matrix, m_rows, m_cols, vector,
                                              n_batch, result);
#else
  NEON_OR_PORTABLE(MatrixBatchVectorMultiplyAccumulate, matrix, m_rows, m_cols,
                   vector, n_batch, result);
#endif
}

void MatrixBatchVectorMultiplyAccumulate(
    const int8_t* __restrict__ matrix, const int m_rows, const int m_cols,
    const int8_t* __restrict__ vectors,
    const float* __restrict__ scaling_factors, int n_batch,
    float* __restrict__ result) {
  SSE_OR_PORTABLE(MatrixBatchVectorMultiplyAccumulate, matrix, m_rows, m_cols,
                  vectors, scaling_factors, n_batch, result);
}

void MatrixBatchVectorMultiplyAccumulate(
    const int8_t* __restrict__ matrix, const int m_rows, const int m_cols,
    const int8_t* __restrict__ vectors, const float* scaling_factors,
    int n_batch, float* __restrict__ result, const float* per_channel_scale,
    const int32_t* input_offset, int32_t* scratch, int32_t* row_sums,
    bool* compute_row_sums, CpuBackendContext* context) {
  SSE_OR_PORTABLE(MatrixBatchVectorMultiplyAccumulate, matrix, m_rows, m_cols,
                  vectors, scaling_factors, n_batch, result, per_channel_scale,
                  input_offset, scratch, row_sums, compute_row_sums, context);
}

void MatrixBatchVectorMultiplyAccumulate(
    const int8_t* __restrict__ matrix, const int m_rows, const int m_cols,
    const int8_t* __restrict__ vectors,
    const float* __restrict__ scaling_factors, int n_batch,
    int32_t* __restrict__ scratch, float* __restrict__ result,
    CpuBackendContext* __restrict__ context) {
  SSE_OR_PORTABLE(MatrixBatchVectorMultiplyAccumulate, matrix, m_rows, m_cols,
                  vectors, scaling_factors, n_batch, scratch, result, context);
}

void SparseMatrixBatchVectorMultiplyAccumulate1x4(
    const float* __restrict__ matrix, const int32_t* __restrict__ segments,
    const int32_t* __restrict__ indices, int m_rows, int m_cols,
    const float* __restrict__ vector, int n_batch, float* __restrict__ result) {
  NEON_OR_PORTABLE(SparseMatrixBatchVectorMultiplyAccumulate1x4, matrix,
                   segments, indices, m_rows, m_cols, vector, n_batch, result);
}

void SparseMatrixBatchVectorMultiplyAccumulate1x16(
    const int8_t* __restrict__ matrix, const int32_t* __restrict__ segments,
    const int32_t* __restrict__ indices, int m_rows, int m_cols,
    const int8_t* __restrict__ vector, const int32_t* __restrict__ bias_vector,
    int n_batch, const int32_t input_offset, const int32_t output_multiplier,
    const int32_t output_shift, const int32_t output_offset,
    const int32_t output_activation_min, const int32_t output_activation_max,
    int8_t* __restrict__ result) {
  NEON_OR_PORTABLE(SparseMatrixBatchVectorMultiplyAccumulate1x16, matrix,
                   segments, indices, m_rows, m_cols, vector, bias_vector,
                   n_batch, input_offset, output_multiplier, output_shift,
                   output_offset, output_activation_min, output_activation_max,
                   result);
}

void SparseMatrixBatchVectorMultiplyAccumulate(
    const float* __restrict__ matrix, const uint8_t* __restrict__ ledger,
    int m_rows, int m_cols, const float* __restrict__ vector, int n_batch,
    float* __restrict__ result) {
  NEON_OR_PORTABLE(SparseMatrixBatchVectorMultiplyAccumulate, matrix, ledger,
                   m_rows, m_cols, vector, n_batch, result);
}

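// Hybrid sparse multiply-accumulate (int8 weights with per-batch float
// scaling factors). Per the note at the top of this file, this is one of the
// overloads with a native SSE implementation, so it dispatches through
// SSE_OR_PORTABLE rather than the NEON_2_SSE path.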
void SparseMatrixBatchVectorMultiplyAccumulate(
    const int8_t* __restrict__ matrix, const uint8_t* __restrict__ ledger,
    const int m_rows, const int m_cols, const int8_t* __restrict__ vectors,
    const float* __restrict__ scaling_factors, int n_batch,
    float* __restrict__ result) {
  SSE_OR_PORTABLE(SparseMatrixBatchVectorMultiplyAccumulate, matrix, ledger,
                  m_rows, m_cols, vectors, scaling_factors, n_batch, result);
}

void MatrixBatchVectorMultiplyAccumulate(
    const int8_t* input, const int32_t* input_zeropoint_times_weights,
    const int8_t* input_to_gate_weights, int32_t multiplier, int32_t shift,
    int32_t n_batch, int32_t n_input, int32_t n_output, int32_t output_zp,
    int32_t* scratch, int16_t* output, CpuBackendContext* context) {
  PortableMatrixBatchVectorMultiplyAccumulate(
      input, input_zeropoint_times_weights, input_to_gate_weights, multiplier,
      shift, n_batch, n_input, n_output, output_zp, scratch, output, context);
}

void MatrixBatchVectorMultiplyAccumulate(
    const int8_t* input, const int32_t* input_zeropoint_times_weights,
    const int8_t* input_to_gate_weights, int32_t multiplier, int32_t shift,
    int32_t n_batch, int32_t n_input, int32_t n_output, int32_t output_zp,
    int32_t* scratch, int8_t* output, CpuBackendContext* context) {
  PortableMatrixBatchVectorMultiplyAccumulate(
      input, input_zeropoint_times_weights, input_to_gate_weights, multiplier,
      shift, n_batch, n_input, n_output, output_zp, scratch, output, context);
}

void MatrixBatchVectorMultiply(const int8_t* input, int32_t input_zeropoint,
                               const int8_t* input_to_gate_weights,
                               int32_t input_to_gate_effective_scale_a,
                               int32_t input_to_gate_effective_scale_b,
                               int32_t n_batch, int32_t n_input, int32_t n_cell,
                               int8_t* gate_output, int8_t gate_output_zp) {
  PortableMatrixBatchVectorMultiply(
      input, input_zeropoint, input_to_gate_weights,
      input_to_gate_effective_scale_a, input_to_gate_effective_scale_b, n_batch,
      n_input, n_cell, gate_output, gate_output_zp);
}

void MatrixBatchVectorMultiply(const int16_t* hidden,
                               const int8_t* hidden_to_output_weights,
                               int32_t proj_effective_scale_a,
                               int32_t proj_effective_scale_b,
                               const int32_t* gate_bias, int32_t n_batch,
                               int32_t n_hidden, int32_t n_output,
                               int32_t output_zp, int8_t* proj_output) {
  PortableMatrixBatchVectorMultiply(hidden, hidden_to_output_weights,
                                    proj_effective_scale_a,
                                    proj_effective_scale_b, gate_bias, n_batch,
                                    n_hidden, n_output, output_zp, proj_output);
}

void MatrixScalarMultiplyAccumulate(const int8_t* matrix, int32_t scalar,
                                    int32_t n_row, int32_t n_col,
                                    int32_t* output) {
  PortableMatrixScalarMultiplyAccumulate(matrix, scalar, n_row, n_col, output);
}

void ApplyLayerNorm(const int16_t* input, const int16_t* layer_norm_weights,
                    const int32_t* bias, int32_t layer_norm_scale_a,
                    int32_t layer_norm_scale_b, int32_t variance_limit,
                    int n_batch, int n_input, int16_t* output) {
  PortableApplyLayerNorm(input, layer_norm_weights, bias, layer_norm_scale_a,
                         layer_norm_scale_b, variance_limit, n_batch, n_input,
                         output);
}

void ApplyLayerNormFloat(const int16_t* input,
                         const int16_t* layer_norm_weights,
                         int32_t layer_norm_scale_a, int32_t layer_norm_scale_b,
                         const int32_t* bias, int n_batch, int n_input,
                         int16_t* output) {
  PortableApplyLayerNormFloat(input, layer_norm_weights, layer_norm_scale_a,
                              layer_norm_scale_b, bias, n_batch, n_input,
                              output);
}

void ApplySigmoid(const int16_t* input, int32_t n_batch, int32_t n_input,
                  int16_t* output) {
  PortableApplySigmoid(input, n_batch, n_input, output);
}

void ApplySigmoidFloat(const int16_t* input, int32_t n_batch, int32_t n_input,
                       int16_t* output) {
  PortableApplySigmoidFloat(input, n_batch, n_input, output);
}

void ApplyTanh(int32_t integer_bits, const int16_t* input, int32_t n_batch,
               int32_t n_input, int16_t* output) {
  PortableApplyTanh(integer_bits, input, n_batch, n_input, output);
}

void ApplyTanhFloat(const int16_t* input, int32_t n_batch, int32_t n_input,
                    int32_t integer_bits, int16_t* output) {
  PortableApplyTanhFloat(input, n_batch, n_input, integer_bits, output);
}

void CwiseMul(const int16_t* input_1, const int16_t* input_2, int n_batch,
              int n_input, int shift, int16_t* output) {
  PortableCwiseMul(input_1, input_2, n_batch, n_input, shift, output);
}

void CwiseMul(const int16_t* input_1, const int16_t* input_2,
              int32_t multiplier, int32_t shift, int32_t n_batch,
              int32_t n_input, int32_t output_zp, int8_t* output) {
  PortableCwiseMul(input_1, input_2, multiplier, shift, n_batch, n_input,
                   output_zp, output);
}

void CwiseAdd(const int16_t* input_1, const int16_t* input_2, int n_batch,
              int n_input, int16_t* output) {
  PortableCwiseAdd(input_1, input_2, n_batch, n_input, output);
}

void CwiseClipping(float* vector, const int v_size,
                   const float clipping_value) {
  PortableCwiseClipping(vector, v_size, clipping_value);
}

void CwiseClipping(int16_t* vector, const int v_size,
                   const int16_t clipping_value) {
  PortableCwiseClipping(vector, v_size, clipping_value);
}

void CwiseClipping(int8_t* vector, const int v_size,
                   const int8_t clipping_value) {
  PortableCwiseClipping(vector, v_size, clipping_value);
}

void BatchVectorBatchVectorDotProduct(const int16_t* vector1,
                                      const int16_t* vector2, int v_size,
                                      int n_batch, int32_t* result) {
  PortableBatchVectorBatchVectorDotProduct(vector1, vector2, v_size, n_batch,
                                           result);
}

void VectorBatchVectorCwiseProductAccumulate(const int16_t* vector, int v_size,
                                             const int16_t* batch_vector,
                                             int n_batch, int32_t multiplier,
                                             int shift, int16_t* result) {
  NEON_OR_PORTABLE(VectorBatchVectorCwiseProductAccumulate, vector, v_size,
                   batch_vector, n_batch, multiplier, shift, result);
}

float VectorVectorDotProduct(const float* vector1, const float* vector2,
                             int v_size) {
  return NEON_OR_PORTABLE(VectorVectorDotProduct, vector1, vector2, v_size);
}

void Sub1Vector(const float* vector, int v_size, float* result) {
  NEON_OR_PORTABLE(Sub1Vector, vector, v_size, result);
}

void Sub1Vector(const int16_t* vector, int v_size, int16_t* result) {
  PortableSub1Vector(vector, v_size, result);
}

// Check if all entries of a vector are zero for float.
bool IsZeroVector(const float* vector, int v_size) {
  return NEON_OR_PORTABLE(IsZeroVector, vector, v_size);
}

// Check if all entries of a vector are zero for int8.
bool IsZeroVector(const int8_t* vector, int v_size) {
  return PortableIsZeroVector(vector, v_size);
}

void VectorScalarMultiply(const int8_t* vector, int v_size, float scale,
                          float* result) {
  NEON_OR_PORTABLE(VectorScalarMultiply, vector, v_size, scale, result);
}

void SymmetricQuantizeFloats(const float* values, const int size,
                             int8_t* quantized_values, float* min_value,
                             float* max_value, float* scaling_factor) {
  NEON_OR_PORTABLE(SymmetricQuantizeFloats, values, size, quantized_values,
                   min_value, max_value, scaling_factor);
}

void SymmetricQuantizeFloats(const float* values, const int size,
                             int8_t* quantized_values, float min_value,
                             float max_value, float* scaling_factor) {
  NEON_OR_PORTABLE(SymmetricQuantizeFloats, values, size, quantized_values,
                   min_value, max_value, scaling_factor);
}

void AsymmetricQuantizeFloats(const float* values, const int size,
                              int8_t* quantized_values, float* scaling_factor,
                              int32_t* offset) {
  NEON_OR_PORTABLE(AsymmetricQuantizeFloats, values, size, quantized_values,
                   scaling_factor, offset);
}

void ReductionSumVector(const float* input_vector, float* output_vector,
                        int output_size, int reduction_size) {
  NEON_OR_PORTABLE(ReductionSumVector, input_vector, output_vector, output_size,
                   reduction_size);
}

void ReductionSumVector(const int32_t* input_vector, int32_t* output_vector,
                        int output_size, int reduction_size) {
  PortableReductionSumVector(input_vector, output_vector, output_size,
                             reduction_size);
}

void ReductionSumVector(const int8_t* input_vector, int32_t* output_vector,
                        int output_size, int reduction_size) {
  SSE_OR_PORTABLE(ReductionSumVector, input_vector, output_vector, output_size,
                  reduction_size);
}

void MeanStddevNormalization(const float* __restrict__ input_vector,
                             float* __restrict__ output_vector, int v_size,
                             int n_batch) {
  PortableMeanStddevNormalization(input_vector, output_vector, v_size, n_batch);
}

void TwoGateSaturatingAdd(const int8_t* input, int8_t input_zp,
                          const int8_t* recurrent, int8_t recurrent_zp,
                          int32_t input_effective_scale_a,
                          int32_t input_effective_scale_b,
                          int32_t recurrent_effective_scale_a,
                          int32_t recurrent_effective_scale_b, int32_t n_batch,
                          int32_t n_cell, int16_t* output) {
  PortableTwoGateSaturatingAdd(
      input, input_zp, recurrent, recurrent_zp, input_effective_scale_a,
      input_effective_scale_b, recurrent_effective_scale_a,
      recurrent_effective_scale_b, n_batch, n_cell, output);
}

}  // namespace tensor_utils
}  // namespace tflite

#endif  // TENSORFLOW_LITE_KERNELS_INTERNAL_OPTIMIZED_SSE_TENSOR_UTILS_H_