/*
 * Copyright (c) 2017-2020 Arm Limited.
 *
 * SPDX-License-Identifier: MIT
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
#include "GEMMLowp.h"

#include "arm_compute/core/Types.h"
#include "tests/validation/reference/UtilsQuantizedAsymm.h"

#include "support/ToolchainSupport.h"

#include <limits>

namespace arm_compute
{
namespace test
{
namespace validation
{
namespace reference
{
namespace
{
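// Maps the output element type of the requantization helpers below to the matching
// quantized DataType: int8_t -> QASYMM8_SIGNED, uint8_t -> QASYMM8, int16_t -> QSYMM16.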
template <typename T>
struct DataTypeExtractor
{
    static DataType data_type()
    {
        DataType data_type = DataType::UNKNOWN;
        if(std::is_same<T, int8_t>::value)
        {
            data_type = DataType::QASYMM8_SIGNED;
        }
        else if(std::is_same<T, uint8_t>::value)
        {
            data_type = DataType::QASYMM8;
        }
        else if(std::is_same<T, int16_t>::value)
        {
            data_type = DataType::QSYMM16;
        }
        return data_type;
    }
};

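// Integer down-scale: each int32_t accumulator is offset, optionally biased, multiplied by an
// integer multiplier and arithmetic-shifted right, i.e.
//   out = ((in + result_offset [+ bias]) * mult) >> shift
// followed by an optional bounded ReLU (applied only when min != max) and saturation to the
// output type's range. Per-channel parameters are selected with i % cols_in.
// Worked example (per-tensor, uint8_t output): in = 50, result_offset = 2, mult = 4, shift = 3
// gives ((50 + 2) * 4) >> 3 = 26.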
template <typename TIn, typename TOut>
void quantize_down_scale(const SimpleTensor<TIn> *in, const SimpleTensor<TIn> *bias, SimpleTensor<TOut> *dst, int32_t result_offset, std::vector<int32_t> result_mult_int,
                         std::vector<int32_t> result_shift, int32_t min, int32_t max)
{
    const int  cols_in        = in->shape().x();
    const bool is_per_channel = result_mult_int.size() > 1;

#if defined(_OPENMP)
    #pragma omp parallel for
#endif /* _OPENMP */
    for(int i = 0; i < in->num_elements(); ++i)
    {
        int32_t result = ((*in)[i] + result_offset);

        if(bias != nullptr)
        {
            result += (*bias)[i % cols_in];
        }

        result *= (is_per_channel) ? result_mult_int[i % cols_in] : result_mult_int[0];

        result >>= (is_per_channel) ? result_shift[i % cols_in] : result_shift[0];

        // Bounded ReLu
        if(min != max)
        {
            result = std::max(min, std::min(max, result));
        }

        (*dst)[i] = static_cast<TOut>(std::max<TIn>(std::numeric_limits<TOut>::lowest(),
                                                    std::min<TIn>(std::numeric_limits<TOut>::max(), result)));
    }
}

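// Fixed-point down-scale: the accumulator (plus optional bias) is scaled with an integer
// fixed-point multiplier via asymm_int_mult and a rounding right shift
// (asymm_rounding_divide_by_pow2); a negative shift is applied as a left shift before the
// multiplication. The output offset is added after the shift, then the optional bounded ReLU
// and output-type saturation are applied. The exact rounding behaviour comes from the helpers
// in UtilsQuantizedAsymm.h.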
template <typename TIn, typename TOut>
void quantize_down_scale_by_fixedpoint(const SimpleTensor<TIn> *in, const SimpleTensor<TIn> *bias, SimpleTensor<TOut> *dst, std::vector<int32_t> result_fixedpoint_multiplier,
                                       std::vector<int32_t> result_shift, int32_t result_offset_after_shift, int32_t min, int32_t max)
{
    const int  cols_in        = in->shape().x();
    const bool is_per_channel = result_fixedpoint_multiplier.size() > 1;

#if defined(_OPENMP)
    #pragma omp parallel for
#endif /* _OPENMP */
    for(int i = 0; i < in->num_elements(); ++i)
    {
        TIn result = (*in)[i];

        if(bias != nullptr)
        {
            result += (*bias)[i % cols_in];
        }

        // Fixed point multiplication
        const int32_t multiplier = (is_per_channel) ? result_fixedpoint_multiplier[i % cols_in] : result_fixedpoint_multiplier[0];
        const int32_t shift      = (is_per_channel) ? result_shift[i % cols_in] : result_shift[0];

        if(shift < 0)
        {
            result = asymm_int_mult(result * (1 << (-shift)), multiplier);
        }
        else
        {
            result = asymm_rounding_divide_by_pow2(asymm_int_mult(result, multiplier), shift);
        }
        result += result_offset_after_shift;

        // Bounded ReLu
        if(min != max)
        {
            result = std::max(min, std::min(max, result));
        }

        (*dst)[i] = static_cast<TOut>(std::max<TIn>(std::numeric_limits<TOut>::lowest(),
                                                    std::min<TIn>(std::numeric_limits<TOut>::max(), result)));
    }
}

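// Float down-scale: the accumulator (plus optional bias) is multiplied by a real-valued
// per-tensor or per-channel multiplier, the output offset is added, and the result is rounded
// to nearest, optionally clamped by the bounded ReLU, and saturated to the output type.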
template <typename TIn, typename TOut>
void quantize_down_scale_by_float(const SimpleTensor<TIn> *in, const SimpleTensor<TIn> *bias, SimpleTensor<TOut> *dst, std::vector<float_t> result_real_multiplier,
                                  int32_t result_offset, int32_t min, int32_t max)
{
    const int  cols_in        = in->shape().x();
    const bool is_per_channel = result_real_multiplier.size() > 1;

#if defined(_OPENMP)
    #pragma omp parallel for
#endif /* _OPENMP */
    for(int i = 0; i < in->num_elements(); ++i)
    {
        TIn result = (*in)[i];

        if(bias != nullptr)
        {
            result += (*bias)[i % cols_in];
        }

        // Float multiplication
        const float_t multiplier = (is_per_channel) ? result_real_multiplier[i % cols_in] : result_real_multiplier[0];

        float_t result_f = static_cast<float_t>(result) * multiplier + static_cast<float_t>(result_offset);
        result           = static_cast<TIn>(support::cpp11::round(result_f));

        // Bounded ReLu
        if(min != max)
        {
            result = std::max(min, std::min(max, result));
        }

        (*dst)[i] = static_cast<TOut>(std::max<TIn>(std::numeric_limits<TOut>::lowest(),
                                                    std::min<TIn>(std::numeric_limits<TOut>::max(), result)));
    }
}
} // namespace

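// Reference GEMMLowp core: C = (A + a_offset) * (B + b_offset), accumulated in T_out (int32_t).
// A is MxK, B is KxN and C is MxN per batch; B is broadcast across the batch dimension when it
// has fewer than 3 dimensions (b_stride_z == 0).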
template <typename T_out, typename T_in, typename T_in_1>
SimpleTensor<T_out> gemmlowp_matrix_multiply_core(const SimpleTensor<T_in> &a, const SimpleTensor<T_in_1> &b, TensorShape shape_c, int32_t a_offset, int32_t b_offset)
{
    static_assert(std::is_same<typename std::decay<T_out>::type, int32_t>::value, "Only int32_t is allowed for the output");

    DataType            dt = std::is_same<T_out, int32_t>::value ? DataType::S32 : DataType::U32;
    SimpleTensor<T_out> c(shape_c, dt);

    const int K = a.shape().x();
    const int M = a.shape().y();
    const int N = b.shape().x();
    const int D = a.shape().z(); // Number of matrices in a batch

    const int a_stride_z = K * M;
    // Do not slide the matrix B along the 3rd dimension in case matrix B has less than 3 dimensions
    const int b_stride_z = b.shape().num_dimensions() > 2 ? N * K : 0;
    const int c_stride_z = N * M;

    std::vector<T_out> acc;
    acc.resize(N);

    for(int depth = 0; depth < D; ++depth)
    {
        const int base_addr_a = depth * a_stride_z;
        const int base_addr_b = depth * b_stride_z;
        const int base_addr_c = depth * c_stride_z;

        for(int i = 0; i < M; ++i)
        {
            for(int j = 0; j < N; ++j)
            {
                acc[j] = 0;
            }
            for(int k = 0; k < K; ++k)
            {
                const T_out tmp_a = a_offset + static_cast<T_out>(a[base_addr_a + k + i * K]);
                for(int j = 0; j < N; ++j)
                {
                    const T_out tmp_b       = b_offset + static_cast<T_out>(b[base_addr_b + j + k * N]);
                    const T_out mult_as_int = tmp_a * tmp_b;
                    acc[j] += mult_as_int;
                }
            }
            for(int j = 0; j < N; ++j)
            {
                c[base_addr_c + j + i * N] = acc[j];
            }
        }
    }

    return c;
}

// Used to validate assembly kernels which don't know anything about offsets
template <typename T1, typename T2, typename T3>
SimpleTensor<T1> gemmlowp(const SimpleTensor<T2> &a, const SimpleTensor<T3> &b, TensorShape shape_c)
{
    return gemmlowp_matrix_multiply_core<T1, T2, T3>(a, b, shape_c, 0, 0);
}

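// Public entry points: thin wrappers that allocate the destination tensor with the DataType
// matching TOut (via DataTypeExtractor) and forward to the helpers above, with or without an
// accompanying bias tensor.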
template <typename TIn, typename TOut>
SimpleTensor<TOut> gemmlowp_quantize_down_scale(const SimpleTensor<TIn> &in, int32_t result_offset, std::vector<int32_t> result_mult_int, std::vector<int32_t> result_shift,
                                                int32_t min, int32_t max)
{
    SimpleTensor<TOut> dst(in.shape(), DataTypeExtractor<TOut>::data_type());

    quantize_down_scale<TIn, TOut>(&in, nullptr, &dst, result_offset, result_mult_int, result_shift, min, max);

    return dst;
}

template <typename TIn, typename TOut>
SimpleTensor<TOut> gemmlowp_quantize_down_scale(const SimpleTensor<TIn> &in, const SimpleTensor<TIn> &bias, int32_t result_offset, std::vector<int32_t> result_mult_int,
                                                std::vector<int32_t> result_shift, int32_t min, int32_t max)
{
    SimpleTensor<TOut> dst(in.shape(), DataTypeExtractor<TOut>::data_type());

    quantize_down_scale<TIn, TOut>(&in, &bias, &dst, result_offset, result_mult_int, result_shift, min, max);

    return dst;
}

template <typename TIn, typename TOut>
SimpleTensor<TOut> gemmlowp_quantize_down_scale_by_fixedpoint(const SimpleTensor<TIn> &in, std::vector<int32_t> result_fixedpoint_multiplier, std::vector<int32_t> result_shift,
                                                              int32_t result_offset_after_shift, int32_t min, int32_t max)
{
    SimpleTensor<TOut> dst(in.shape(), DataTypeExtractor<TOut>::data_type());

    quantize_down_scale_by_fixedpoint<TIn, TOut>(&in, nullptr, &dst, result_fixedpoint_multiplier, result_shift, result_offset_after_shift, min, max);

    return dst;
}

template <typename TIn, typename TOut>
SimpleTensor<TOut> gemmlowp_quantize_down_scale_by_fixedpoint(const SimpleTensor<TIn> &in, const SimpleTensor<TIn> &bias, std::vector<int32_t> result_fixedpoint_multiplier,
                                                              std::vector<int32_t> result_shift, int32_t result_offset_after_shift, int32_t min, int32_t max)
{
    SimpleTensor<TOut> dst(in.shape(), DataTypeExtractor<TOut>::data_type());

    quantize_down_scale_by_fixedpoint<TIn, TOut>(&in, &bias, &dst, result_fixedpoint_multiplier, result_shift, result_offset_after_shift, min, max);

    return dst;
}

template <typename TIn, typename TOut>
SimpleTensor<TOut> gemmlowp_quantize_down_scale_by_float(const SimpleTensor<TIn> &in, const SimpleTensor<TIn> &bias,
                                                         std::vector<float_t> result_real_multiplier, int32_t result_offset, int32_t min, int32_t max)
{
    SimpleTensor<TOut> dst(in.shape(), DataTypeExtractor<TOut>::data_type());

    quantize_down_scale_by_float<TIn, TOut>(&in, &bias, &dst, result_real_multiplier, result_offset, min, max);

    return dst;
}

template <typename TIn, typename TOut>
SimpleTensor<TOut> gemmlowp_quantize_down_scale_by_float(const SimpleTensor<TIn> &in,
                                                         std::vector<float_t> result_real_multiplier, int32_t result_offset, int32_t min, int32_t max)
{
    SimpleTensor<TOut> dst(in.shape(), DataTypeExtractor<TOut>::data_type());

    quantize_down_scale_by_float<TIn, TOut>(&in, nullptr, &dst, result_real_multiplier, result_offset, min, max);

    return dst;
}

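// Explicit instantiations for the type combinations exercised by the validation suite.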
template SimpleTensor<uint8_t> gemmlowp_quantize_down_scale_by_float(const SimpleTensor<int32_t> &a, const SimpleTensor<int32_t> &b,
                                                                     std::vector<float_t> result_real_multiplier, int32_t result_offset, int32_t min, int32_t max);
template SimpleTensor<uint8_t> gemmlowp_quantize_down_scale_by_float(const SimpleTensor<int32_t> &a,
                                                                     std::vector<float_t> result_real_multiplier, int32_t result_offset, int32_t min, int32_t max);
template SimpleTensor<int8_t> gemmlowp_quantize_down_scale_by_float(const SimpleTensor<int32_t> &a, const SimpleTensor<int32_t> &b,
                                                                    std::vector<float_t> result_real_multiplier, int32_t result_offset, int32_t min, int32_t max);
template SimpleTensor<int8_t> gemmlowp_quantize_down_scale_by_float(const SimpleTensor<int32_t> &a,
                                                                    std::vector<float_t> result_real_multiplier, int32_t result_offset, int32_t min, int32_t max);
template SimpleTensor<uint8_t> gemmlowp_quantize_down_scale_by_fixedpoint(const SimpleTensor<int32_t> &a, std::vector<int32_t> result_fixedpoint_multiplier,
                                                                          std::vector<int32_t> result_shift, int32_t result_offset_after_shift, int32_t min, int32_t max);
template SimpleTensor<uint8_t> gemmlowp_quantize_down_scale_by_fixedpoint(const SimpleTensor<int32_t> &a, const SimpleTensor<int32_t> &b,
                                                                          std::vector<int32_t> result_fixedpoint_multiplier,
                                                                          std::vector<int32_t> result_shift, int32_t result_offset_after_shift, int32_t min, int32_t max);
template SimpleTensor<int8_t> gemmlowp_quantize_down_scale_by_fixedpoint(const SimpleTensor<int32_t> &a, std::vector<int32_t> result_fixedpoint_multiplier,
                                                                         std::vector<int32_t> result_shift, int32_t result_offset_after_shift, int32_t min, int32_t max);
template SimpleTensor<int8_t> gemmlowp_quantize_down_scale_by_fixedpoint(const SimpleTensor<int32_t> &a, const SimpleTensor<int32_t> &b,
                                                                         std::vector<int32_t> result_fixedpoint_multiplier,
                                                                         std::vector<int32_t> result_shift, int32_t result_offset_after_shift, int32_t min, int32_t max);
template SimpleTensor<int16_t> gemmlowp_quantize_down_scale_by_fixedpoint(const SimpleTensor<int32_t> &a, std::vector<int32_t> result_fixedpoint_multiplier,
                                                                          std::vector<int32_t> result_shift, int32_t result_offset_after_shift, int32_t min, int32_t max);
template SimpleTensor<int16_t> gemmlowp_quantize_down_scale_by_fixedpoint(const SimpleTensor<int32_t> &a, const SimpleTensor<int32_t> &b,
                                                                          std::vector<int32_t> result_fixedpoint_multiplier,
                                                                          std::vector<int32_t> result_shift, int32_t result_offset_after_shift, int32_t min, int32_t max);
template SimpleTensor<uint8_t> gemmlowp_quantize_down_scale(const SimpleTensor<int32_t> &a, int32_t result_offset, std::vector<int32_t> result_mult_int,
                                                            std::vector<int32_t> result_shift, int32_t min, int32_t max);
template SimpleTensor<uint8_t> gemmlowp_quantize_down_scale(const SimpleTensor<int32_t> &a, const SimpleTensor<int32_t> &b, int32_t result_offset, std::vector<int32_t> result_mult_int,
                                                            std::vector<int32_t> result_shift, int32_t min, int32_t max);
template SimpleTensor<int8_t> gemmlowp_quantize_down_scale(const SimpleTensor<int32_t> &a, int32_t result_offset, std::vector<int32_t> result_mult_int,
                                                           std::vector<int32_t> result_shift, int32_t min, int32_t max);
template SimpleTensor<int8_t> gemmlowp_quantize_down_scale(const SimpleTensor<int32_t> &a, const SimpleTensor<int32_t> &b, int32_t result_offset, std::vector<int32_t> result_mult_int,
                                                           std::vector<int32_t> result_shift, int32_t min, int32_t max);
template SimpleTensor<int32_t> gemmlowp_matrix_multiply_core(const SimpleTensor<int8_t> &a, const SimpleTensor<int8_t> &b, TensorShape shape_c, int32_t a_offset, int32_t b_offset);
template SimpleTensor<int32_t> gemmlowp_matrix_multiply_core(const SimpleTensor<uint8_t> &a, const SimpleTensor<uint8_t> &b, TensorShape shape_c, int32_t a_offset, int32_t b_offset);
template SimpleTensor<int32_t> gemmlowp<int32_t, int8_t, int8_t>(const SimpleTensor<int8_t> &a, const SimpleTensor<int8_t> &b, TensorShape shape_c);
template SimpleTensor<int32_t> gemmlowp<int32_t, uint8_t, uint8_t>(const SimpleTensor<uint8_t> &a, const SimpleTensor<uint8_t> &b, TensorShape shape_c);
template SimpleTensor<int32_t> gemmlowp<int32_t, uint8_t, int8_t>(const SimpleTensor<uint8_t> &a, const SimpleTensor<int8_t> &b, TensorShape shape_c);
} // namespace reference
} // namespace validation
} // namespace test
} // namespace arm_compute