xref: /aosp_15_r20/external/ComputeLibrary/tests/validation/reference/GEMMLowp.cpp (revision c217d954acce2dbc11938adb493fc0abd69584f3)
/*
 * Copyright (c) 2017-2020 Arm Limited.
 *
 * SPDX-License-Identifier: MIT
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
#include "GEMMLowp.h"

#include "arm_compute/core/Types.h"
#include "tests/validation/reference/UtilsQuantizedAsymm.h"

#include "support/ToolchainSupport.h"

#include <algorithm>
#include <cstdint>
#include <limits>
#include <type_traits>
#include <vector>
33 namespace arm_compute
34 {
35 namespace test
36 {
37 namespace validation
38 {
39 namespace reference
40 {
41 namespace
42 {
43 template <typename T>
44 struct DataTypeExtractor
45 {
data_typearm_compute::test::validation::reference::__anon8a60d57b0111::DataTypeExtractor46     static DataType data_type()
47     {
48         DataType data_type = DataType::UNKNOWN;
49         if(std::is_same<T, int8_t>::value)
50         {
51             data_type = DataType::QASYMM8_SIGNED;
52         }
53         else if(std::is_same<T, uint8_t>::value)
54         {
55             data_type = DataType::QASYMM8;
56         }
57         else if(std::is_same<T, int16_t>::value)
58         {
59             data_type = DataType::QSYMM16;
60         }
61         return data_type;
62     }
63 };
64 
65 template <typename TIn, typename TOut>
quantize_down_scale(const SimpleTensor<TIn> * in,const SimpleTensor<TIn> * bias,SimpleTensor<TOut> * dst,int32_t result_offset,std::vector<int32_t> result_mult_int,std::vector<int32_t> result_shift,int32_t min,int32_t max)66 void quantize_down_scale(const SimpleTensor<TIn> *in, const SimpleTensor<TIn> *bias, SimpleTensor<TOut> *dst, int32_t result_offset, std::vector<int32_t> result_mult_int,
67                          std::vector<int32_t> result_shift, int32_t min, int32_t max)
68 {
69     const int  cols_in        = in->shape().x();
70     const bool is_per_channel = result_mult_int.size() > 1;
71 
72 #if defined(_OPENMP)
73     #pragma omp parallel for
74 #endif /* _OPENMP */
75     for(int i = 0; i < in->num_elements(); ++i)
76     {
77         int32_t result = ((*in)[i] + result_offset);
78 
79         if(bias != nullptr)
80         {
81             result += (*bias)[i % cols_in];
82         }
83 
84         result *= (is_per_channel) ? result_mult_int[i % cols_in] : result_mult_int[0];
85 
86         result >>= (is_per_channel) ? result_shift[i % cols_in] : result_shift[0];
87 
88         // Bounded ReLu
89         if(min != max)
90         {
91             result = std::max(min, std::min(max, result));
92         }
93 
94         (*dst)[i] = static_cast<TOut>(std::max<TIn>(std::numeric_limits<TOut>::lowest(),
95                                                     std::min<TIn>(std::numeric_limits<TOut>::max(), result)));
96     }
97 }
98 
99 template <typename TIn, typename TOut>
quantize_down_scale_by_fixedpoint(const SimpleTensor<TIn> * in,const SimpleTensor<TIn> * bias,SimpleTensor<TOut> * dst,std::vector<int32_t> result_fixedpoint_multiplier,std::vector<int32_t> result_shift,int32_t result_offset_after_shift,int32_t min,int32_t max)100 void quantize_down_scale_by_fixedpoint(const SimpleTensor<TIn> *in, const SimpleTensor<TIn> *bias, SimpleTensor<TOut> *dst, std::vector<int32_t> result_fixedpoint_multiplier,
101                                        std::vector<int32_t> result_shift, int32_t result_offset_after_shift, int32_t min, int32_t max)
102 {
103     const int  cols_in        = in->shape().x();
104     const bool is_per_channel = result_fixedpoint_multiplier.size() > 1;
105 
106 #if defined(_OPENMP)
107     #pragma omp parallel for
108 #endif /* _OPENMP */
109     for(int i = 0; i < in->num_elements(); ++i)
110     {
111         TIn result = (*in)[i];
112 
113         if(bias != nullptr)
114         {
115             result += (*bias)[i % cols_in];
116         }
117 
118         // Fixed point multiplication
119         const int32_t multiplier = (is_per_channel) ? result_fixedpoint_multiplier[i % cols_in] : result_fixedpoint_multiplier[0];
120         const int32_t shift      = (is_per_channel) ? result_shift[i % cols_in] : result_shift[0];
121 
122         if(shift < 0)
123         {
124             result = asymm_int_mult(result * (1 << (-shift)), multiplier);
125         }
126         else
127         {
128             result = asymm_rounding_divide_by_pow2(asymm_int_mult(result, multiplier), shift);
129         }
130         result += result_offset_after_shift;
131 
132         // Bounded ReLu
133         if(min != max)
134         {
135             result = std::max(min, std::min(max, result));
136         }
137 
138         (*dst)[i] = static_cast<TOut>(std::max<TIn>(std::numeric_limits<TOut>::lowest(),
139                                                     std::min<TIn>(std::numeric_limits<TOut>::max(), result)));
140     }
141 }
142 
143 template <typename TIn, typename TOut>
quantize_down_scale_by_float(const SimpleTensor<TIn> * in,const SimpleTensor<TIn> * bias,SimpleTensor<TOut> * dst,std::vector<float_t> result_real_multiplier,int32_t result_offset,int32_t min,int32_t max)144 void quantize_down_scale_by_float(const SimpleTensor<TIn> *in, const SimpleTensor<TIn> *bias, SimpleTensor<TOut> *dst, std::vector<float_t> result_real_multiplier,
145                                   int32_t result_offset, int32_t min, int32_t max)
146 {
147     const int  cols_in        = in->shape().x();
148     const bool is_per_channel = result_real_multiplier.size() > 1;
149 
150 #if defined(_OPENMP)
151     #pragma omp parallel for
152 #endif /* _OPENMP */
153     for(int i = 0; i < in->num_elements(); ++i)
154     {
155         TIn result = (*in)[i];
156 
157         if(bias != nullptr)
158         {
159             result += (*bias)[i % cols_in];
160         }
161 
162         // Float multiplication
163         const float_t multiplier = (is_per_channel) ? result_real_multiplier[i % cols_in] : result_real_multiplier[0];
164 
165         float_t result_f = static_cast<float_t>(result) * multiplier + static_cast<float_t>(result_offset);
166         result           = static_cast<TIn>(support::cpp11::round(result_f));
167 
168         // Bounded ReLu
169         if(min != max)
170         {
171             result = std::max(min, std::min(max, result));
172         }
173 
174         (*dst)[i] = static_cast<TOut>(std::max<TIn>(std::numeric_limits<TOut>::lowest(),
175                                                     std::min<TIn>(std::numeric_limits<TOut>::max(), result)));
176     }
177 }
178 } // namespace
179 
180 template <typename T_out, typename T_in, typename T_in_1>
gemmlowp_matrix_multiply_core(const SimpleTensor<T_in> & a,const SimpleTensor<T_in_1> & b,TensorShape shape_c,int32_t a_offset,int32_t b_offset)181 SimpleTensor<T_out> gemmlowp_matrix_multiply_core(const SimpleTensor<T_in> &a, const SimpleTensor<T_in_1> &b, TensorShape shape_c, int32_t a_offset, int32_t b_offset)
182 {
183     static_assert(std::is_same<typename std::decay<T_out>::type, int32_t>::value, "Only int32_t is allowed for the output");
184 
185     DataType            dt = std::is_same<T_out, int32_t>::value ? DataType::S32 : DataType::U32;
186     SimpleTensor<T_out> c(shape_c, dt);
187 
188     const int K = a.shape().x();
189     const int M = a.shape().y();
190     const int N = b.shape().x();
191     const int D = a.shape().z(); // Number of matrices in a batch
192 
193     const int a_stride_z = K * M;
194     // Do not slide the matrix B along the 3rd dimension in case matrix B has less than 3 dimensions
195     const int b_stride_z = b.shape().num_dimensions() > 2 ? N * K : 0;
196     const int c_stride_z = N * M;
197 
198     std::vector<T_out> acc;
199     acc.resize(N);
200 
201     for(int depth = 0; depth < D; ++depth)
202     {
203         const int base_addr_a = depth * a_stride_z;
204         const int base_addr_b = depth * b_stride_z;
205         const int base_addr_c = depth * c_stride_z;
206 
207         for(int i = 0; i < M; ++i)
208         {
209             for(int j = 0; j < N; ++j)
210             {
211                 acc[j] = 0;
212             }
213             for(int k = 0; k < K; ++k)
214             {
215                 const T_out tmp_a = a_offset + static_cast<T_out>(a[base_addr_a + k + i * K]);
216                 for(int j = 0; j < N; ++j)
217                 {
218                     const T_out tmp_b       = b_offset + static_cast<T_out>(b[base_addr_b + j + k * N]);
219                     const T_out mult_as_int = tmp_a * tmp_b;
220                     acc[j] += mult_as_int;
221                 }
222             }
223             for(int j = 0; j < N; ++j)
224             {
225                 c[base_addr_c + j + i * N] = acc[j];
226             }
227         }
228     }
229 
230     return c;
231 }
232 
233 // used to validate assembly kernels which don't know anything about offsets
234 template <typename T1, typename T2, typename T3>
gemmlowp(const SimpleTensor<T2> & a,const SimpleTensor<T3> & b,TensorShape shape_c)235 SimpleTensor<T1> gemmlowp(const SimpleTensor<T2> &a, const SimpleTensor<T3> &b, TensorShape shape_c)
236 {
237     return gemmlowp_matrix_multiply_core<T1, T2, T3>(a, b, shape_c, 0, 0);
238 }
239 
240 template <typename TIn, typename TOut>
gemmlowp_quantize_down_scale(const SimpleTensor<TIn> & in,int32_t result_offset,std::vector<int32_t> result_mult_int,std::vector<int32_t> result_shift,int32_t min,int32_t max)241 SimpleTensor<TOut> gemmlowp_quantize_down_scale(const SimpleTensor<TIn> &in, int32_t result_offset, std::vector<int32_t> result_mult_int, std::vector<int32_t> result_shift,
242                                                 int32_t min, int32_t max)
243 {
244     SimpleTensor<TOut> dst(in.shape(), DataTypeExtractor<TOut>::data_type());
245 
246     quantize_down_scale<TIn, TOut>(&in, nullptr, &dst, result_offset, result_mult_int, result_shift, min, max);
247 
248     return dst;
249 }
250 
251 template <typename TIn, typename TOut>
gemmlowp_quantize_down_scale(const SimpleTensor<TIn> & in,const SimpleTensor<TIn> & bias,int32_t result_offset,std::vector<int32_t> result_mult_int,std::vector<int32_t> result_shift,int32_t min,int32_t max)252 SimpleTensor<TOut> gemmlowp_quantize_down_scale(const SimpleTensor<TIn> &in, const SimpleTensor<TIn> &bias, int32_t result_offset, std::vector<int32_t> result_mult_int,
253                                                 std::vector<int32_t> result_shift, int32_t min, int32_t max)
254 {
255     SimpleTensor<TOut> dst(in.shape(), DataTypeExtractor<TOut>::data_type());
256 
257     quantize_down_scale<TIn, TOut>(&in, &bias, &dst, result_offset, result_mult_int, result_shift, min, max);
258 
259     return dst;
260 }
261 
262 template <typename TIn, typename TOut>
gemmlowp_quantize_down_scale_by_fixedpoint(const SimpleTensor<TIn> & in,std::vector<int32_t> result_fixedpoint_multiplier,std::vector<int32_t> result_shift,int32_t result_offset_after_shift,int32_t min,int32_t max)263 SimpleTensor<TOut> gemmlowp_quantize_down_scale_by_fixedpoint(const SimpleTensor<TIn> &in, std::vector<int32_t> result_fixedpoint_multiplier, std::vector<int32_t> result_shift,
264                                                               int32_t result_offset_after_shift, int32_t min, int32_t max)
265 {
266     SimpleTensor<TOut> dst(in.shape(), DataTypeExtractor<TOut>::data_type());
267 
268     quantize_down_scale_by_fixedpoint<TIn, TOut>(&in, nullptr, &dst, result_fixedpoint_multiplier, result_shift, result_offset_after_shift, min, max);
269 
270     return dst;
271 }
272 
273 template <typename TIn, typename TOut>
gemmlowp_quantize_down_scale_by_fixedpoint(const SimpleTensor<TIn> & in,const SimpleTensor<TIn> & bias,std::vector<int32_t> result_fixedpoint_multiplier,std::vector<int32_t> result_shift,int32_t result_offset_after_shift,int32_t min,int32_t max)274 SimpleTensor<TOut> gemmlowp_quantize_down_scale_by_fixedpoint(const SimpleTensor<TIn> &in, const SimpleTensor<TIn> &bias, std::vector<int32_t> result_fixedpoint_multiplier,
275                                                               std::vector<int32_t> result_shift, int32_t result_offset_after_shift, int32_t min, int32_t max)
276 {
277     SimpleTensor<TOut> dst(in.shape(), DataTypeExtractor<TOut>::data_type());
278 
279     quantize_down_scale_by_fixedpoint<TIn, TOut>(&in, &bias, &dst, result_fixedpoint_multiplier, result_shift, result_offset_after_shift, min, max);
280 
281     return dst;
282 }
283 
284 template <typename TIn, typename TOut>
gemmlowp_quantize_down_scale_by_float(const SimpleTensor<TIn> & in,const SimpleTensor<TIn> & bias,std::vector<float_t> result_real_multiplier,int32_t result_offset,int32_t min,int32_t max)285 SimpleTensor<TOut> gemmlowp_quantize_down_scale_by_float(const SimpleTensor<TIn> &in, const SimpleTensor<TIn> &bias,
286                                                          std::vector<float_t> result_real_multiplier, int32_t result_offset, int32_t min, int32_t max)
287 {
288     SimpleTensor<TOut> dst(in.shape(), DataTypeExtractor<TOut>::data_type());
289 
290     quantize_down_scale_by_float<TIn, TOut>(&in, &bias, &dst, result_real_multiplier, result_offset, min, max);
291 
292     return dst;
293 }
294 
295 template <typename TIn, typename TOut>
gemmlowp_quantize_down_scale_by_float(const SimpleTensor<TIn> & in,std::vector<float_t> result_real_multiplier,int32_t result_offset,int32_t min,int32_t max)296 SimpleTensor<TOut> gemmlowp_quantize_down_scale_by_float(const SimpleTensor<TIn> &in,
297                                                          std::vector<float_t> result_real_multiplier, int32_t result_offset, int32_t min, int32_t max)
298 {
299     SimpleTensor<TOut> dst(in.shape(), DataTypeExtractor<TOut>::data_type());
300 
301     quantize_down_scale_by_float<TIn, TOut>(&in, nullptr, &dst, result_real_multiplier, result_offset, min, max);
302 
303     return dst;
304 }
305 
306 template SimpleTensor<uint8_t> gemmlowp_quantize_down_scale_by_float(const SimpleTensor<int32_t> &a, const SimpleTensor<int32_t> &b,
307                                                                      std::vector<float_t> result_real_multiplier, int32_t result_offset, int32_t min, int32_t max);
308 template SimpleTensor<uint8_t> gemmlowp_quantize_down_scale_by_float(const SimpleTensor<int32_t> &a,
309                                                                      std::vector<float_t> result_real_multiplier, int32_t result_offset, int32_t min, int32_t max);
310 template SimpleTensor<int8_t> gemmlowp_quantize_down_scale_by_float(const SimpleTensor<int32_t> &a, const SimpleTensor<int32_t> &b,
311                                                                     std::vector<float_t> result_real_multiplier, int32_t result_offset, int32_t min, int32_t max);
312 template SimpleTensor<int8_t> gemmlowp_quantize_down_scale_by_float(const SimpleTensor<int32_t> &a,
313                                                                     std::vector<float_t> result_real_multiplier, int32_t result_offset, int32_t min, int32_t max);
314 template SimpleTensor<uint8_t> gemmlowp_quantize_down_scale_by_fixedpoint(const SimpleTensor<int32_t> &a, std::vector<int32_t> result_fixedpoint_multiplier,
315                                                                           std::vector<int32_t> result_shift, int32_t result_offset_after_shift, int32_t min, int32_t max);
316 template SimpleTensor<uint8_t> gemmlowp_quantize_down_scale_by_fixedpoint(const SimpleTensor<int32_t> &a, const SimpleTensor<int32_t> &b,
317                                                                           std::vector<int32_t> result_fixedpoint_multiplier,
318                                                                           std::vector<int32_t> result_shift, int32_t result_offset_after_shift, int32_t min, int32_t max);
319 template SimpleTensor<int8_t> gemmlowp_quantize_down_scale_by_fixedpoint(const SimpleTensor<int32_t> &a, std::vector<int32_t> result_fixedpoint_multiplier,
320                                                                          std::vector<int32_t> result_shift, int32_t result_offset_after_shift, int32_t min, int32_t max);
321 template SimpleTensor<int8_t> gemmlowp_quantize_down_scale_by_fixedpoint(const SimpleTensor<int32_t> &a, const SimpleTensor<int32_t> &b,
322                                                                          std::vector<int32_t> result_fixedpoint_multiplier,
323                                                                          std::vector<int32_t> result_shift, int32_t result_offset_after_shift, int32_t min, int32_t max);
324 template SimpleTensor<int16_t> gemmlowp_quantize_down_scale_by_fixedpoint(const SimpleTensor<int32_t> &a, std::vector<int32_t> result_fixedpoint_multiplier,
325                                                                           std::vector<int32_t> result_shift, int32_t result_offset_after_shift, int32_t min, int32_t max);
326 template SimpleTensor<int16_t> gemmlowp_quantize_down_scale_by_fixedpoint(const SimpleTensor<int32_t> &a, const SimpleTensor<int32_t> &b,
327                                                                           std::vector<int32_t> result_fixedpoint_multiplier,
328                                                                           std::vector<int32_t> result_shift, int32_t result_offset_after_shift, int32_t min, int32_t max);
329 template SimpleTensor<uint8_t> gemmlowp_quantize_down_scale(const SimpleTensor<int32_t> &a, int32_t result_offset, std::vector<int32_t> result_mult_int,
330                                                             std::vector<int32_t> result_shift, int32_t min, int32_t max);
331 template SimpleTensor<uint8_t> gemmlowp_quantize_down_scale(const SimpleTensor<int32_t> &a, const SimpleTensor<int32_t> &b, int32_t result_offset, std::vector<int32_t> result_mult_int,
332                                                             std::vector<int32_t> result_shift, int32_t min, int32_t max);
333 template SimpleTensor<int8_t> gemmlowp_quantize_down_scale(const SimpleTensor<int32_t> &a, int32_t result_offset, std::vector<int32_t> result_mult_int,
334                                                            std::vector<int32_t> result_shift, int32_t min, int32_t max);
335 template SimpleTensor<int8_t> gemmlowp_quantize_down_scale(const SimpleTensor<int32_t> &a, const SimpleTensor<int32_t> &b, int32_t result_offset, std::vector<int32_t> result_mult_int,
336                                                            std::vector<int32_t> result_shift, int32_t min, int32_t max);
337 template SimpleTensor<int32_t> gemmlowp_matrix_multiply_core(const SimpleTensor<int8_t> &a, const SimpleTensor<int8_t> &b, TensorShape shape_c, int32_t a_offset, int32_t b_offset);
338 template SimpleTensor<int32_t> gemmlowp_matrix_multiply_core(const SimpleTensor<uint8_t> &a, const SimpleTensor<uint8_t> &b, TensorShape shape_c, int32_t a_offset, int32_t b_offset);
339 template SimpleTensor<int32_t> gemmlowp<int32_t, int8_t, int8_t>(const SimpleTensor<int8_t> &a, const SimpleTensor<int8_t> &b, TensorShape shape_c);
340 template SimpleTensor<int32_t> gemmlowp<int32_t, uint8_t, uint8_t>(const SimpleTensor<uint8_t> &a, const SimpleTensor<uint8_t> &b, TensorShape shape_c);
341 template SimpleTensor<int32_t> gemmlowp<int32_t, uint8_t, int8_t>(const SimpleTensor<uint8_t> &a, const SimpleTensor<int8_t> &b, TensorShape shape_c);
342 } // namespace reference
343 } // namespace validation
344 } // namespace test
345 } // namespace arm_compute
346