/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_PORTABLE_TENSOR_UTILS_IMPL_H_
#define TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_PORTABLE_TENSOR_UTILS_IMPL_H_

#include <algorithm>
#include <cstdint>

#if defined(_MSC_VER)
#define __restrict__ __restrict
#endif

namespace tflite {

// Not all backends support CpuBackendContext usage, so forward declare to
// avoid pulling in its implementation.
class CpuBackendContext;

namespace tensor_utils {

// Returns true if every element of |vector| is zero.
template <typename T>
bool PortableIsZeroVector(const T* vector, int v_size) {
  for (int i = 0; i < v_size; ++i) {
    if (vector[i] != 0) {
      return false;
    }
  }
  return true;
}
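
// Illustrative sketch (not part of the original header): a typical caller
// uses PortableIsZeroVector to skip work on all-zero inputs, e.g. an unset
// bias tensor. The names |bias| and |n_output| below are hypothetical.
//
//   if (!PortableIsZeroVector(bias, n_output)) {
//     // ... add the bias to the accumulator ...
//   }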

void PortableSymmetricQuantizeFloats(const float* values, const int size,
                                     int8_t* quantized_values, float* min_value,
                                     float* max_value, float* scaling_factor);

void PortableSymmetricQuantizeFloats(const float* values, const int size,
                                     int8_t* quantized_values, float min_value,
                                     float max_value, float* scaling_factor);

void PortableAsymmetricQuantizeFloats(const float* values, const int size,
                                      int8_t* quantized_values,
                                      float* scaling_factor, int32_t* offset);
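
// Rough sketch of the symmetric quantization scheme these helpers implement
// (hedged: the exact rounding and clamping details live in the .cc file).
// The scale maps the largest absolute input value onto the int8 range, and
// each value is rounded to the nearest step of that scale:
//
//   scaling_factor = max_i |values[i]| / 127.0f;
//   quantized_values[i] = round(values[i] / scaling_factor),
//                         clamped to [-127, 127];
//
// The asymmetric variant additionally returns an int32 zero-point offset so
// that the full [min, max] range of the inputs can be represented.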

// Multiply a matrix by a batch vector and accumulate the results into a
// batch-size output vector.
void PortableMatrixBatchVectorMultiplyAccumulate(const float* matrix,
                                                 int m_rows, int m_cols,
                                                 const float* vector,
                                                 int n_batch, float* result);
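
// Shape convention (a hedged reading of the reference implementation):
// |matrix| is row-major [m_rows x m_cols], |vector| holds n_batch vectors of
// m_cols elements each, and |result| holds n_batch vectors of m_rows elements
// each. The product is accumulated into |result|, i.e. roughly:
//
//   for (int b = 0; b < n_batch; ++b)
//     for (int r = 0; r < m_rows; ++r)
//       for (int c = 0; c < m_cols; ++c)
//         result[b * m_rows + r] +=
//             matrix[r * m_cols + c] * vector[b * m_cols + c];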

void PortableMatrixBatchVectorMultiplyAccumulate(
    const int8_t* __restrict__ matrix, const int m_rows, const int m_cols,
    const int8_t* __restrict__ vectors, const float* scaling_factors,
    int n_batch, float* __restrict__ result);

void PortableMatrixBatchVectorMultiplyAccumulate(
    const int8_t* __restrict__ matrix, const int m_rows, const int m_cols,
    const int8_t* __restrict__ vectors, const float* scaling_factors,
    int n_batch, float* __restrict__ result, const float* per_channel_scale,
    const int32_t* input_offset, int32_t* scratch, int32_t* row_sums,
    bool* compute_row_sums, CpuBackendContext* context);

void PortableMatrixBatchVectorMultiplyAccumulate(
    const int8_t* __restrict__ matrix, const int m_rows, const int m_cols,
    const int8_t* __restrict__ vector, const float* scaling_factors,
    int n_batch, int32_t* scratch, float* __restrict__ result,
    CpuBackendContext* context);

void PortableSparseMatrixBatchVectorMultiplyAccumulate1x4(
    const float* __restrict__ matrix, const int32_t* __restrict__ segments,
    const int32_t* __restrict__ indices, int m_rows, int m_cols,
    const float* __restrict__ vector, int n_batch, float* __restrict__ result);

void PortableSparseMatrixBatchVectorMultiplyAccumulate(
    const float* __restrict__ matrix, const uint8_t* __restrict__ ledger,
    int m_rows, int m_cols, const float* __restrict__ vector, int n_batch,
    float* __restrict__ result);

void PortableSparseMatrixBatchVectorMultiplyAccumulate1x16(
    const int8_t* __restrict__ matrix, const int32_t* __restrict__ segments,
    const int32_t* __restrict__ indices, int m_rows, int m_cols,
    const int8_t* __restrict__ vector, const int32_t* __restrict__ bias_vector,
    int n_batch, const int32_t input_offset, const int32_t output_multiplier,
    const int32_t output_shift, const int32_t output_offset,
    const int32_t output_activation_min, const int32_t output_activation_max,
    int8_t* __restrict__ result);

void PortableSparseMatrixBatchVectorMultiplyAccumulate(
    const int8_t* __restrict__ matrix, const uint8_t* ledger, const int m_rows,
    const int m_cols, const int8_t* __restrict__ vectors,
    const float* scaling_factors, int n_batch, float* __restrict__ result);

// Dot product of two vectors.
float PortableVectorVectorDotProduct(const float* vector1, const float* vector2,
                                     int v_size);

void PortableBatchVectorBatchVectorDotProduct(const int16_t* vector1,
                                              const int16_t* vector2,
                                              int v_size, int n_batch,
                                              int32_t* result);

void PortableVectorBatchVectorCwiseProductAccumulate(
    const int16_t* vector, int v_size, const int16_t* batch_vector, int n_batch,
    int32_t multiplier, int shift, int16_t* result);

void PortableMatrixBatchVectorMultiplyAccumulate(
    const int8_t* input, const int32_t* bias,
    const int8_t* input_to_gate_weights, int32_t multiplier, int32_t shift,
    int32_t n_batch, int32_t n_input, int32_t n_output, int32_t output_zp,
    int32_t* scratch, int16_t* output, CpuBackendContext* context);

void PortableMatrixBatchVectorMultiplyAccumulate(
    const int8_t* input, const int32_t* bias,
    const int8_t* input_to_gate_weights, int32_t multiplier, int32_t shift,
    int32_t n_batch, int32_t n_input, int32_t n_output, int32_t output_zp,
    int32_t* scratch, int8_t* output, CpuBackendContext* context);

void PortableMatrixBatchVectorMultiply(const int8_t* input,
                                       int32_t input_zeropoint,
                                       const int8_t* input_to_gate_weights,
                                       int32_t input_to_gate_effective_scale_a,
                                       int32_t input_to_gate_effective_scale_b,
                                       int32_t n_batch, int32_t n_input,
                                       int32_t n_cell, int8_t* gate_output,
                                       int8_t gate_output_zp);

void PortableMatrixBatchVectorMultiply(
    const int16_t* hidden, const int8_t* hidden_to_output_weights,
    int32_t proj_effective_scale_a, int32_t proj_effective_scale_b,
    const int32_t* gate_bias, int32_t n_batch, int32_t n_hidden,
    int32_t n_output, int32_t output_zp, int8_t* proj_output);

void PortableMatrixScalarMultiplyAccumulate(const int8_t* matrix,
                                            int32_t scalar, int32_t n_row,
                                            int32_t n_col, int32_t* output);

void PortableApplyLayerNorm(const int16_t* input,
                            const int16_t* layer_norm_weights,
                            const int32_t* bias, int32_t layer_norm_scale_a,
                            int32_t layer_norm_scale_b, int32_t variance_limit,
                            int n_batch, int n_input, int16_t* output);

void PortableApplyLayerNormFloat(const int16_t* input,
                                 const int16_t* layer_norm_weights,
                                 int32_t layer_norm_scale_a,
                                 int32_t layer_norm_scale_b,
                                 const int32_t* bias, int n_batch, int n_input,
                                 int16_t* output);

void PortableApplySigmoid(const int16_t* input, int32_t n_batch,
                          int32_t n_input, int16_t* output);

void PortableApplySigmoidFloat(const int16_t* input, int32_t n_batch,
                               int32_t n_input, int16_t* output);

void PortableApplyTanh(int32_t integer_bits, const int16_t* input,
                       int32_t n_batch, int32_t n_input, int16_t* output);

void PortableApplyTanhFloat(const int16_t* input, int32_t n_batch,
                            int32_t n_input, int32_t integer_bits,
                            int16_t* output);

void PortableCwiseMul(const int16_t* input_1, const int16_t* input_2,
                      int n_batch, int n_input, int shift, int16_t* output);

void PortableCwiseMul(const int16_t* input_1, const int16_t* input_2,
                      int32_t multiplier, int32_t shift, int32_t n_batch,
                      int32_t n_input, int32_t output_zp, int8_t* output);

void PortableCwiseAdd(const int16_t* input_1, const int16_t* input_2,
                      int n_batch, int n_input, int16_t* output);

// Clips every element of |vector| to the range
// [-clipping_value, clipping_value] in place.
template <typename T>
void PortableCwiseClipping(T* vector, const int v_size,
                           const T& clipping_value) {
  for (int i = 0; i < v_size; i++) {
    vector[i] = std::max(std::min(clipping_value, vector[i]),
                         static_cast<T>(-clipping_value));
  }
}
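
// Illustrative usage (hypothetical values), following directly from the
// template above:
//
//   int16_t state[4] = {-30000, -5, 7, 30000};
//   PortableCwiseClipping(state, 4, static_cast<int16_t>(100));
//   // state is now {-100, -5, 7, 100}.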

// Batch vector initialization with another vector.
void PortableVectorBatchVectorAssign(const float* vector, int v_size,
                                     int n_batch, float* batch_vector);
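
// Hedged sketch of the semantics: |vector| (v_size elements) is copied into
// each of the n_batch rows of |batch_vector| (n_batch * v_size elements),
// roughly:
//
//   for (int b = 0; b < n_batch; ++b)
//     std::copy_n(vector, v_size, batch_vector + b * v_size);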

// Compute "1.0f - elements of vector" (used in CIFG).
void PortableSub1Vector(const float* vector, int v_size, float* result);

void PortableSub1Vector(const int16_t* vector, int v_size, int16_t* result);

// Multiply all elements of vector with a scalar.
void PortableVectorScalarMultiply(const int8_t* vector, int v_size, float scale,
                                  float* result);

// Reduce-sum on a vector:
// input_vector: pointer to the input vector.
// output_vector: pointer to the output vector.
// output_size: number of elements in the output vector.
// reduction_size: number of consecutive elements from the input vector that
// are added together to produce one element of the output.
template <typename INPUT, typename OUTPUT>
void PortableReductionSumVector(const INPUT* input_vector,
                                OUTPUT* output_vector, int output_size,
                                int reduction_size) {
  for (int o = 0; o < output_size; o++) {
    OUTPUT result = 0;
    for (int r = 0; r < reduction_size; r++) {
      result += input_vector[r];
    }
    output_vector[o] = result;
    input_vector += reduction_size;
  }
}
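
// Illustrative usage (hypothetical values), following directly from the
// template above: reduces 6 inputs to 2 outputs by summing groups of 3
// consecutive elements.
//
//   const int8_t in[6] = {1, 2, 3, 4, 5, 6};
//   int32_t out[2];
//   PortableReductionSumVector(in, out, /*output_size=*/2,
//                              /*reduction_size=*/3);
//   // out is now {6, 15}.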

// Layer norm for each batch.
void PortableMeanStddevNormalization(const float* __restrict__ input_vector,
                                     float* __restrict__ output_vector,
                                     int v_size, int n_batch);
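
// Hedged sketch of the per-batch normalization: for each batch row of v_size
// elements, subtract the row mean and divide by the row standard deviation
// (the actual implementation guards against zero variance), roughly:
//
//   output[b][i] = (input[b][i] - mean_b) / stddev_b;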

// Saturating add.
void PortableTwoGateSaturatingAdd(const int8_t* input, int8_t input_zp,
                                  const int8_t* recurrent, int8_t recurrent_zp,
                                  int32_t input_effective_scale_a,
                                  int32_t input_effective_scale_b,
                                  int32_t recurrent_effective_scale_a,
                                  int32_t recurrent_effective_scale_b,
                                  int32_t n_batch, int32_t n_cell,
                                  int16_t* output);
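
// Hedged sketch of the intended semantics (details live in the .cc file): the
// two int8 gate contributions are shifted by their zero points, rescaled by
// their effective scales, and summed with saturation into the int16 output,
// roughly:
//
//   output[i] = saturate_to_int16(
//       rescale(input[i] - input_zp, input_effective_scale_a/b) +
//       rescale(recurrent[i] - recurrent_zp, recurrent_effective_scale_a/b));
//
// where rescale() stands for the usual TFLite fixed-point
// multiply-by-quantized-multiplier operation.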

}  // namespace tensor_utils
}  // namespace tflite

#endif  // TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_PORTABLE_TENSOR_UTILS_IMPL_H_