/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_OPTIMIZED_SSE_TENSOR_UTILS_H_
#define TENSORFLOW_LITE_KERNELS_INTERNAL_OPTIMIZED_SSE_TENSOR_UTILS_H_

// Note: This file is a copy-paste version of neon_tensor_utils.h; the only
// differences are in MatrixBatchVectorMultiplyAccumulate and
// SparseMatrixBatchVectorMultiplyAccumulate (the other functions do not have
// SSE implementations yet).

// Note: Most of the functions below use NEON_OR_PORTABLE, through the Intel
// NEON_2_SSE translator library. If a native SSE version of a function is
// implemented, replace the corresponding NEON_OR_PORTABLE call with
// SSE_OR_PORTABLE.
#include "tensorflow/lite/kernels/cpu_backend_context.h"
#include "tensorflow/lite/kernels/internal/optimized/neon_check.h"
#include "tensorflow/lite/kernels/internal/optimized/neon_tensor_utils_impl.h"
#include "tensorflow/lite/kernels/internal/optimized/sse_check.h"
#include "tensorflow/lite/kernels/internal/optimized/sse_tensor_utils_impl.h"
#include "tensorflow/lite/kernels/internal/reference/portable_tensor_utils_impl.h"

namespace tflite {
namespace tensor_utils {

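// Float matrix * batch-vector multiply-accumulate. When compiled with AVX2
// support this calls a native AVX2 kernel; otherwise it dispatches through
// NEON_OR_PORTABLE (the NEON_2_SSE translation or the portable reference).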
void MatrixBatchVectorMultiplyAccumulate(const float* matrix, int m_rows,
                                         int m_cols, const float* vector,
                                         int n_batch, float* result) {
#if defined(__AVX2__)
  Avx2MatrixBatchVectorMultiplyAccumulateImpl(matrix, m_rows, m_cols, vector,
                                              n_batch, result);
#else
  NEON_OR_PORTABLE(MatrixBatchVectorMultiplyAccumulate, matrix, m_rows, m_cols,
                   vector, n_batch, result);
#endif
}

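// Hybrid matrix * batch-vector multiply-accumulate overloads: int8 weights and
// inputs with float scaling factors. These have native SSE kernels and
// dispatch through SSE_OR_PORTABLE.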
void MatrixBatchVectorMultiplyAccumulate(
    const int8_t* __restrict__ matrix, const int m_rows, const int m_cols,
    const int8_t* __restrict__ vectors,
    const float* __restrict__ scaling_factors, int n_batch,
    float* __restrict__ result) {
  SSE_OR_PORTABLE(MatrixBatchVectorMultiplyAccumulate, matrix, m_rows, m_cols,
                  vectors, scaling_factors, n_batch, result);
}

void MatrixBatchVectorMultiplyAccumulate(
    const int8_t* __restrict__ matrix, const int m_rows, const int m_cols,
    const int8_t* __restrict__ vectors, const float* scaling_factors,
    int n_batch, float* __restrict__ result, const float* per_channel_scale,
    const int32_t* input_offset, int32_t* scratch, int32_t* row_sums,
    bool* compute_row_sums, CpuBackendContext* context) {
  SSE_OR_PORTABLE(MatrixBatchVectorMultiplyAccumulate, matrix, m_rows, m_cols,
                  vectors, scaling_factors, n_batch, result, per_channel_scale,
                  input_offset, scratch, row_sums, compute_row_sums, context);
}

void MatrixBatchVectorMultiplyAccumulate(
    const int8_t* __restrict__ matrix, const int m_rows, const int m_cols,
    const int8_t* __restrict__ vectors,
    const float* __restrict__ scaling_factors, int n_batch,
    int32_t* __restrict__ scratch, float* __restrict__ result,
    CpuBackendContext* __restrict__ context) {
  SSE_OR_PORTABLE(MatrixBatchVectorMultiplyAccumulate, matrix, m_rows, m_cols,
                  vectors, scaling_factors, n_batch, scratch, result, context);
}

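// Sparse matrix * batch-vector multiply-accumulate. Only the ledger-based
// hybrid overload (int8 weights with float scaling factors) has a native SSE
// implementation; the other sparse kernels dispatch through NEON_OR_PORTABLE.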
void SparseMatrixBatchVectorMultiplyAccumulate1x4(
    const float* __restrict__ matrix, const int32_t* __restrict__ segments,
    const int32_t* __restrict__ indices, int m_rows, int m_cols,
    const float* __restrict__ vector, int n_batch, float* __restrict__ result) {
  NEON_OR_PORTABLE(SparseMatrixBatchVectorMultiplyAccumulate1x4, matrix,
                   segments, indices, m_rows, m_cols, vector, n_batch, result);
}

void SparseMatrixBatchVectorMultiplyAccumulate1x16(
    const int8_t* __restrict__ matrix, const int32_t* __restrict__ segments,
    const int32_t* __restrict__ indices, int m_rows, int m_cols,
    const int8_t* __restrict__ vector, const int32_t* __restrict__ bias_vector,
    int n_batch, const int32_t input_offset, const int32_t output_multiplier,
    const int32_t output_shift, const int32_t output_offset,
    const int32_t output_activation_min, const int32_t output_activation_max,
    int8_t* __restrict__ result) {
  NEON_OR_PORTABLE(SparseMatrixBatchVectorMultiplyAccumulate1x16, matrix,
                   segments, indices, m_rows, m_cols, vector, bias_vector,
                   n_batch, input_offset, output_multiplier, output_shift,
                   output_offset, output_activation_min, output_activation_max,
                   result);
}

void SparseMatrixBatchVectorMultiplyAccumulate(
    const float* __restrict__ matrix, const uint8_t* __restrict__ ledger,
    int m_rows, int m_cols, const float* __restrict__ vector, int n_batch,
    float* __restrict__ result) {
  NEON_OR_PORTABLE(SparseMatrixBatchVectorMultiplyAccumulate, matrix, ledger,
                   m_rows, m_cols, vector, n_batch, result);
}

void SparseMatrixBatchVectorMultiplyAccumulate(
    const int8_t* __restrict__ matrix, const uint8_t* __restrict__ ledger,
    const int m_rows, const int m_cols, const int8_t* __restrict__ vectors,
    const float* __restrict__ scaling_factors, int n_batch,
    float* __restrict__ result) {
  SSE_OR_PORTABLE(SparseMatrixBatchVectorMultiplyAccumulate, matrix, ledger,
                  m_rows, m_cols, vectors, scaling_factors, n_batch, result);
}

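// The fully quantized matrix * batch-vector products (int16/int8 outputs) and
// the integer matrix multiplications below have no SSE-specific path yet and
// call the portable reference implementations directly.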
void MatrixBatchVectorMultiplyAccumulate(
    const int8_t* input, const int32_t* input_zeropoint_times_weights,
    const int8_t* input_to_gate_weights, int32_t multiplier, int32_t shift,
    int32_t n_batch, int32_t n_input, int32_t n_output, int32_t output_zp,
    int32_t* scratch, int16_t* output, CpuBackendContext* context) {
  PortableMatrixBatchVectorMultiplyAccumulate(
      input, input_zeropoint_times_weights, input_to_gate_weights, multiplier,
      shift, n_batch, n_input, n_output, output_zp, scratch, output, context);
}

void MatrixBatchVectorMultiplyAccumulate(
    const int8_t* input, const int32_t* input_zeropoint_times_weights,
    const int8_t* input_to_gate_weights, int32_t multiplier, int32_t shift,
    int32_t n_batch, int32_t n_input, int32_t n_output, int32_t output_zp,
    int32_t* scratch, int8_t* output, CpuBackendContext* context) {
  PortableMatrixBatchVectorMultiplyAccumulate(
      input, input_zeropoint_times_weights, input_to_gate_weights, multiplier,
      shift, n_batch, n_input, n_output, output_zp, scratch, output, context);
}

void MatrixBatchVectorMultiply(const int8_t* input, int32_t input_zeropoint,
                               const int8_t* input_to_gate_weights,
                               int32_t input_to_gate_effective_scale_a,
                               int32_t input_to_gate_effective_scale_b,
                               int32_t n_batch, int32_t n_input, int32_t n_cell,
                               int8_t* gate_output, int8_t gate_output_zp) {
  PortableMatrixBatchVectorMultiply(
      input, input_zeropoint, input_to_gate_weights,
      input_to_gate_effective_scale_a, input_to_gate_effective_scale_b, n_batch,
      n_input, n_cell, gate_output, gate_output_zp);
}

void MatrixBatchVectorMultiply(const int16_t* hidden,
                               const int8_t* hidden_to_output_weights,
                               int32_t proj_effective_scale_a,
                               int32_t proj_effective_scale_b,
                               const int32_t* gate_bias, int32_t n_batch,
                               int32_t n_hidden, int32_t n_output,
                               int32_t output_zp, int8_t* proj_output) {
  PortableMatrixBatchVectorMultiply(hidden, hidden_to_output_weights,
                                    proj_effective_scale_a,
                                    proj_effective_scale_b, gate_bias, n_batch,
                                    n_hidden, n_output, output_zp, proj_output);
}

void MatrixScalarMultiplyAccumulate(const int8_t* matrix, int32_t scalar,
                                    int32_t n_row, int32_t n_col,
                                    int32_t* output) {
  PortableMatrixScalarMultiplyAccumulate(matrix, scalar, n_row, n_col, output);
}

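// Activation and normalization helpers (layer norm, sigmoid, tanh) are
// portable-only in this file.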
void ApplyLayerNorm(const int16_t* input, const int16_t* layer_norm_weights,
                    const int32_t* bias, int32_t layer_norm_scale_a,
                    int32_t layer_norm_scale_b, int32_t variance_limit,
                    int n_batch, int n_input, int16_t* output) {
  PortableApplyLayerNorm(input, layer_norm_weights, bias, layer_norm_scale_a,
                         layer_norm_scale_b, variance_limit, n_batch, n_input,
                         output);
}

void ApplyLayerNormFloat(const int16_t* input,
                         const int16_t* layer_norm_weights,
                         int32_t layer_norm_scale_a, int32_t layer_norm_scale_b,
                         const int32_t* bias, int n_batch, int n_input,
                         int16_t* output) {
  PortableApplyLayerNormFloat(input, layer_norm_weights, layer_norm_scale_a,
                              layer_norm_scale_b, bias, n_batch, n_input,
                              output);
}

void ApplySigmoid(const int16_t* input, int32_t n_batch, int32_t n_input,
                  int16_t* output) {
  PortableApplySigmoid(input, n_batch, n_input, output);
}

void ApplySigmoidFloat(const int16_t* input, int32_t n_batch, int32_t n_input,
                       int16_t* output) {
  PortableApplySigmoidFloat(input, n_batch, n_input, output);
}

void ApplyTanh(int32_t integer_bits, const int16_t* input, int32_t n_batch,
               int32_t n_input, int16_t* output) {
  PortableApplyTanh(integer_bits, input, n_batch, n_input, output);
}

void ApplyTanhFloat(const int16_t* input, int32_t n_batch, int32_t n_input,
                    int32_t integer_bits, int16_t* output) {
  PortableApplyTanhFloat(input, n_batch, n_input, integer_bits, output);
}

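// Element-wise helpers (CwiseMul, CwiseAdd, CwiseClipping) are also
// portable-only here; the vector helpers further below mix NEON_2_SSE and
// portable dispatch.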
void CwiseMul(const int16_t* input_1, const int16_t* input_2, int n_batch,
              int n_input, int shift, int16_t* output) {
  PortableCwiseMul(input_1, input_2, n_batch, n_input, shift, output);
}

void CwiseMul(const int16_t* input_1, const int16_t* input_2,
              int32_t multiplier, int32_t shift, int32_t n_batch,
              int32_t n_input, int32_t output_zp, int8_t* output) {
  PortableCwiseMul(input_1, input_2, multiplier, shift, n_batch, n_input,
                   output_zp, output);
}

void CwiseAdd(const int16_t* input_1, const int16_t* input_2, int n_batch,
              int n_input, int16_t* output) {
  PortableCwiseAdd(input_1, input_2, n_batch, n_input, output);
}

void CwiseClipping(float* vector, const int v_size,
                   const float clipping_value) {
  PortableCwiseClipping(vector, v_size, clipping_value);
}

void CwiseClipping(int16_t* vector, const int v_size,
                   const int16_t clipping_value) {
  PortableCwiseClipping(vector, v_size, clipping_value);
}

void CwiseClipping(int8_t* vector, const int v_size,
                   const int8_t clipping_value) {
  PortableCwiseClipping(vector, v_size, clipping_value);
}

void BatchVectorBatchVectorDotProduct(const int16_t* vector1,
                                      const int16_t* vector2, int v_size,
                                      int n_batch, int32_t* result) {
  PortableBatchVectorBatchVectorDotProduct(vector1, vector2, v_size, n_batch,
                                           result);
}

void VectorBatchVectorCwiseProductAccumulate(const int16_t* vector, int v_size,
                                             const int16_t* batch_vector,
                                             int n_batch, int32_t multiplier,
                                             int shift, int16_t* result) {
  NEON_OR_PORTABLE(VectorBatchVectorCwiseProductAccumulate, vector, v_size,
                   batch_vector, n_batch, multiplier, shift, result);
}

float VectorVectorDotProduct(const float* vector1, const float* vector2,
                             int v_size) {
  return NEON_OR_PORTABLE(VectorVectorDotProduct, vector1, vector2, v_size);
}

void Sub1Vector(const float* vector, int v_size, float* result) {
  NEON_OR_PORTABLE(Sub1Vector, vector, v_size, result);
}

void Sub1Vector(const int16_t* vector, int v_size, int16_t* result) {
  PortableSub1Vector(vector, v_size, result);
}

// Check if all entries of a vector are zero for float.
bool IsZeroVector(const float* vector, int v_size) {
  return NEON_OR_PORTABLE(IsZeroVector, vector, v_size);
}

// Check if all entries of a vector are zero for int8.
bool IsZeroVector(const int8_t* vector, int v_size) {
  return PortableIsZeroVector(vector, v_size);
}

void VectorScalarMultiply(const int8_t* vector, int v_size, float scale,
                          float* result) {
  NEON_OR_PORTABLE(VectorScalarMultiply, vector, v_size, scale, result);
}

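// Quantization, reduction, and normalization helpers. The float quantization
// and reduction routines go through NEON_OR_PORTABLE; the int8
// ReductionSumVector has a native SSE kernel, and the rest are portable.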
void SymmetricQuantizeFloats(const float* values, const int size,
                             int8_t* quantized_values, float* min_value,
                             float* max_value, float* scaling_factor) {
  NEON_OR_PORTABLE(SymmetricQuantizeFloats, values, size, quantized_values,
                   min_value, max_value, scaling_factor);
}

void SymmetricQuantizeFloats(const float* values, const int size,
                             int8_t* quantized_values, float min_value,
                             float max_value, float* scaling_factor) {
  NEON_OR_PORTABLE(SymmetricQuantizeFloats, values, size, quantized_values,
                   min_value, max_value, scaling_factor);
}

void AsymmetricQuantizeFloats(const float* values, const int size,
                              int8_t* quantized_values, float* scaling_factor,
                              int32_t* offset) {
  NEON_OR_PORTABLE(AsymmetricQuantizeFloats, values, size, quantized_values,
                   scaling_factor, offset);
}

void ReductionSumVector(const float* input_vector, float* output_vector,
                        int output_size, int reduction_size) {
  NEON_OR_PORTABLE(ReductionSumVector, input_vector, output_vector, output_size,
                   reduction_size);
}

void ReductionSumVector(const int32_t* input_vector, int32_t* output_vector,
                        int output_size, int reduction_size) {
  PortableReductionSumVector(input_vector, output_vector, output_size,
                             reduction_size);
}

void ReductionSumVector(const int8_t* input_vector, int32_t* output_vector,
                        int output_size, int reduction_size) {
  SSE_OR_PORTABLE(ReductionSumVector, input_vector, output_vector, output_size,
                  reduction_size);
}

void MeanStddevNormalization(const float* __restrict__ input_vector,
                             float* __restrict__ output_vector, int v_size,
                             int n_batch) {
  PortableMeanStddevNormalization(input_vector, output_vector, v_size, n_batch);
}

void TwoGateSaturatingAdd(const int8_t* input, int8_t input_zp,
                          const int8_t* recurrent, int8_t recurrent_zp,
                          int32_t input_effective_scale_a,
                          int32_t input_effective_scale_b,
                          int32_t recurrent_effective_scale_a,
                          int32_t recurrent_effective_scale_b, int32_t n_batch,
                          int32_t n_cell, int16_t* output) {
  PortableTwoGateSaturatingAdd(
      input, input_zp, recurrent, recurrent_zp, input_effective_scale_a,
      input_effective_scale_b, recurrent_effective_scale_a,
      recurrent_effective_scale_b, n_batch, n_cell, output);
}

}  // namespace tensor_utils
}  // namespace tflite

#endif  // TENSORFLOW_LITE_KERNELS_INTERNAL_OPTIMIZED_SSE_TENSOR_UTILS_H_