/*
 * Copyright (c) 2017-2021 Arm Limited.
 *
 * SPDX-License-Identifier: MIT
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
#include "src/cpu/kernels/CpuGemmLowpMatrixReductionKernel.h"

#include "arm_compute/core/ITensor.h"
#include "arm_compute/core/KernelDescriptors.h"
#include "arm_compute/core/TensorInfo.h"
#include "src/core/NEON/wrapper/wrapper.h"
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"

namespace arm_compute
{
namespace cpu
{
namespace kernels
{
namespace
{
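// Validates the matrix A (LHS) row-sum reduction arguments: the reshaped-input path is not
// supported, the input must be an 8-bit quantized type and, if the output is already initialized,
// it must be an S32 vector with one element per row of the input.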
Status validate_arguments_matrix_a_reduction(const ITensorInfo *src, const ITensorInfo *dst, const GEMMLowpReductionKernelInfo &info)
{
    ARM_COMPUTE_UNUSED(info);
    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst);
    ARM_COMPUTE_RETURN_ERROR_ON_MSG(info.is_reshaped, "Reduction on a reshaped input is not supported");
    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::QSYMM8, DataType::QSYMM8_PER_CHANNEL);

    if(dst->total_size() > 0)
    {
        ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 1, DataType::S32);
        ARM_COMPUTE_RETURN_ERROR_ON_MSG(dst->dimension(0) != src->dimension(1), "Output vector must have length equal to the number of rows of the input matrix");
    }
    return Status{};
}
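// Validates the matrix B (RHS) column-sum reduction arguments: same requirements as above, except
// that the output length must match the number of columns of the input.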
Status validate_arguments_matrix_b_reduction(const ITensorInfo *src, const ITensorInfo *dst, const GEMMLowpReductionKernelInfo &info)
{
    ARM_COMPUTE_UNUSED(info);
    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst);
    ARM_COMPUTE_RETURN_ERROR_ON_MSG(info.is_reshaped, "Reduction on a reshaped input is not supported");
    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::QSYMM8, DataType::QSYMM8_PER_CHANNEL);

    if(dst->total_size() > 0)
    {
        ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 1, DataType::S32);
        ARM_COMPUTE_RETURN_ERROR_ON_MSG(dst->dimension(0) != src->dimension(0), "Output vector must have length equal to the number of columns of the input matrix");
    }
    return Status{};
}
} // namespace

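// CpuGemmLowpMatrixAReductionKernel computes the sum of each row of matrix A (the LHS of the lowp
// GEMM), producing an S32 vector with one entry per row. These row sums feed the GEMMLowp offset
// contribution stage.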
void CpuGemmLowpMatrixAReductionKernel::configure(const ITensorInfo *src, ITensorInfo *dst, const GEMMLowpReductionKernelInfo &info)
{
    // Perform validate step
    ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst);
    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments_matrix_a_reduction(src, dst, info));
    _k             = info.k;
    _scalar        = info.scalar;
    _mul_by_scalar = info.mul_by_scalar;

    switch(src->data_type())
    {
        case DataType::QASYMM8:
            _func = &CpuGemmLowpMatrixAReductionKernel::run_internal<uint8_t>;
            break;
        case DataType::QASYMM8_SIGNED:
        case DataType::QSYMM8:
        case DataType::QSYMM8_PER_CHANNEL:
            _func = &CpuGemmLowpMatrixAReductionKernel::run_internal<int8_t>;
            break;
        default:
            ARM_COMPUTE_ERROR("Unsupported data type");
    }

    // Output auto initialization if not yet initialized
    auto_init_if_empty(*dst, TensorShape(src->dimension(1)), 1, DataType::S32);

    Window win = calculate_max_window(*dst, Steps(1));
    ICpuKernel::configure(win);
}

Status CpuGemmLowpMatrixAReductionKernel::validate(const ITensorInfo *src, const ITensorInfo *dst, const GEMMLowpReductionKernelInfo &info)
{
    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_matrix_a_reduction(src, dst, info));
    return Status{};
}

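// Row-sum worker for matrix A: each point of the (collapsed) output window corresponds to one row
// of A in one batch. The main loop accumulates 16 8-bit values per iteration, widening to 16-bit
// and then 32-bit lanes; leftover values are added in a scalar loop, the vector lanes are reduced
// to a single S32 value and, if requested, the row sum is multiplied by the configured scalar.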
template <typename T>
void CpuGemmLowpMatrixAReductionKernel::run_internal(const ITensor *src, ITensor *dst, const arm_compute::Window &window)
{
    // Intermediate and final accumulator types
    using TIAcc = wrapper::traits::promote_t<T>;
    using TAcc  = wrapper::traits::promote_t<TIAcc>;

    Window collapsed_window = window.collapse_if_possible(IKernel::window(), Window::DimY);

    Window win_input(collapsed_window);
    win_input.set(Window::DimX, Window::Dimension(0, 0, 0));
    win_input.set(Window::DimY, Window::Dimension(0, 0, 0));
    win_input.set(Window::DimZ, Window::Dimension(0, 0, 0));

    Iterator in(src, win_input);
    Iterator out(dst, collapsed_window);

    execute_window_loop(collapsed_window, [&](const Coordinates & id)
    {
        auto vsum_row = wrapper::vdup_n(static_cast<TAcc>(0), wrapper::traits::vector_128_tag{});
        TAcc sum_row  = 0;

        const T *matrix_a = reinterpret_cast<const T *>((in.ptr() + id.x() * src->info()->strides_in_bytes()[1] + id.y() * src->info()->strides_in_bytes()[2]));

#if __arm__
        asm volatile("PLD [%0, #128*4]" ::"r"(matrix_a));
#endif /* __arm__ */

        int i = 0;
        // This for loop accumulates 16 values of the row per iteration
        for(; i <= (_k - 16); i += 16)
        {
            const auto a0_d8 = wrapper::vloadq(matrix_a + i);

            // Partial accumulation in 16 bit
            const auto tmp_sum0 = wrapper::vaddl(wrapper::vgetlow(a0_d8), wrapper::vgethigh(a0_d8));

            // Accumulate to 32 bit
            vsum_row = wrapper::vadd(vsum_row, wrapper::vpaddl(tmp_sum0));
        }

        // This for loop performs the leftover accumulations
        for(; i < _k; ++i)
        {
            sum_row += static_cast<TAcc>(matrix_a[i]);
        }

#if defined(__aarch64__)
        // Reduction operation available on 64 bit architectures only
        sum_row += wrapper::vaddv(vsum_row);
#else  // __aarch64__
        auto tmp = wrapper::vpadd(wrapper::vgethigh(vsum_row), wrapper::vgetlow(vsum_row));
        tmp      = wrapper::vpadd(tmp, tmp);

        sum_row += wrapper::vgetlane(tmp, 0);
#endif // __aarch64__

        // Multiply by scalar if necessary
        if(_mul_by_scalar)
        {
            sum_row *= _scalar;
        }

        *(reinterpret_cast<int32_t *>(out.ptr())) = static_cast<int32_t>(sum_row);
    },
    in, out);
}

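// Forwards the thread's window to the data-type specific worker selected in configure().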
void CpuGemmLowpMatrixAReductionKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info)
{
    ARM_COMPUTE_UNUSED(info);
    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICpuKernel::window(), window);

    auto src = tensors.get_const_tensor(TensorType::ACL_SRC);
    auto dst = tensors.get_tensor(TensorType::ACL_DST);

    (this->*_func)(src, dst, window);
}

const char *CpuGemmLowpMatrixAReductionKernel::name() const
{
    return "CpuGemmLowpMatrixAReductionKernel";
}

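// CpuGemmLowpMatrixBReductionKernel computes the sum of each column of matrix B (the RHS of the
// lowp GEMM), producing an S32 vector with one entry per column. These column sums feed the
// GEMMLowp offset contribution stage.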
void CpuGemmLowpMatrixBReductionKernel::configure(const ITensorInfo *src, ITensorInfo *dst, const GEMMLowpReductionKernelInfo &info)
{
    ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst);
    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments_matrix_b_reduction(src, dst, info));

    _k             = info.k;
    _scalar        = info.scalar;
    _mul_by_scalar = info.mul_by_scalar;

    // The kernel processes 16 output elements (columns of B) per iteration
    constexpr unsigned int num_elems_processed_per_iteration = 16;

    switch(src->data_type())
    {
        case DataType::QASYMM8:
            _func = &CpuGemmLowpMatrixBReductionKernel::run_internal<uint8_t>;
            break;
        case DataType::QASYMM8_SIGNED:
        case DataType::QSYMM8:
        case DataType::QSYMM8_PER_CHANNEL:
            _func = &CpuGemmLowpMatrixBReductionKernel::run_internal<int8_t>;
            break;
        default:
            ARM_COMPUTE_ERROR("Unsupported data type");
    }

    // Output auto initialization if not yet initialized
    auto_init_if_empty(*dst, TensorShape(src->dimension(0)), 1, DataType::S32);

    // Configure kernel window
    Window win = calculate_max_window_horizontal(*dst, Steps(num_elems_processed_per_iteration));
    ICpuKernel::configure(win);
}

Status CpuGemmLowpMatrixBReductionKernel::validate(const ITensorInfo *src, const ITensorInfo *dst, const GEMMLowpReductionKernelInfo &info)
{
    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_matrix_b_reduction(src, dst, info));
    return Status{};
}

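// Column-sum worker for matrix B: the output columns are split statically across threads in blocks
// of 16 (thread t starts at column 16 * t and advances by 16 * num_threads). For each block, four
// rows of B are loaded per iteration and accumulated, widening the 8-bit values to 16-bit and then
// to 32-bit lanes; leftover rows are added one at a time. in_b_stride is the stride between rows in
// bytes, which equals the element stride because the input types are 8 bit. Blocks that run past
// the matrix width store their leftover columns with a scalar loop.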
template <typename T>
void CpuGemmLowpMatrixBReductionKernel::run_internal(const ITensor *src, ITensor *dst, const Window &window, const ThreadInfo &info)
{
    // Intermediate and final accumulator types
    using TIAcc = wrapper::traits::promote_t<T>;
    using TAcc  = wrapper::traits::promote_t<TIAcc>;

    Window     collapsed_window = window.collapse_if_possible(IKernel::window(), Window::DimY);
    const auto vec_scalar       = wrapper::vdup_n(static_cast<TAcc>(_scalar), wrapper::traits::vector_128_tag{});

    const auto width_matrix_b = static_cast<int>(src->info()->dimension(0));
    const auto in_b_stride    = static_cast<int>(src->info()->strides_in_bytes()[1]);

    // The implementation computes 16 elements per iteration
    const int window_start_x = 16 * info.thread_id;
    const int window_step_x  = 16 * info.num_threads;
    // Make sure (window_end_x - window_start_x) is a multiple of window_step_x
    const int window_end_x = ceil_to_multiple(width_matrix_b - window_start_x, window_step_x) + window_start_x;

    Window win_out(collapsed_window);
    win_out.set(Window::DimX, Window::Dimension(window_start_x, window_end_x, window_step_x));

    Window win_in(win_out);
    win_in.set(Window::DimY, Window::Dimension(0, 0, 0));
    win_in.set(Window::DimZ, Window::Dimension(0, 0, 0));

    Iterator inb(src, win_in);
    Iterator out(dst, win_out);

    execute_window_loop(win_out, [&](const Coordinates & id)
    {
        if(id.x() > width_matrix_b)
        {
            return;
        }

        // Note: the accumulation is done in the promoted type of the input, i.e. 32-bit lanes for the 8-bit inputs
        typename wrapper::traits::neon_bitvector<TAcc, wrapper::traits::BitWidth::W128>::type sum_col[4] =
        {
            wrapper::vdup_n(static_cast<TAcc>(0), wrapper::traits::vector_128_tag{}),
            wrapper::vdup_n(static_cast<TAcc>(0), wrapper::traits::vector_128_tag{}),
            wrapper::vdup_n(static_cast<TAcc>(0), wrapper::traits::vector_128_tag{}),
            wrapper::vdup_n(static_cast<TAcc>(0), wrapper::traits::vector_128_tag{})
        };

        const auto *matrix_b = reinterpret_cast<const T *>(inb.ptr() + id.y() * src->info()->strides_in_bytes()[2]);

#if __arm__
        asm volatile("PLD [%0, #128*4]" ::"r"(matrix_b));
        asm volatile("PLD [%0, #128*4]" ::"r"(matrix_b + in_b_stride));
#endif /* __arm__ */

        int i = 0;
        // This for loop accumulates 4 rows of matrix B per iteration
        for(; i <= (_k - 4); i += 4)
        {
            const auto b0_u8 = wrapper::vloadq(matrix_b + 0 * in_b_stride);
            const auto b1_u8 = wrapper::vloadq(matrix_b + 1 * in_b_stride);
            const auto b2_u8 = wrapper::vloadq(matrix_b + 2 * in_b_stride);
            const auto b3_u8 = wrapper::vloadq(matrix_b + 3 * in_b_stride);

#if __arm__
            asm volatile("PLD [%0, #128*1]" ::"r"(matrix_b + 1 * in_b_stride));
            asm volatile("PLD [%0, #128*1]" ::"r"(matrix_b + 2 * in_b_stride));
            asm volatile("PLD [%0, #128*1]" ::"r"(matrix_b + 3 * in_b_stride));
            asm volatile("PLD [%0, #128*1]" ::"r"(matrix_b + 4 * in_b_stride));
#endif /* __arm__ */

            // Partial accumulation in 16 bit
            typename wrapper::traits::neon_bitvector<TIAcc, wrapper::traits::BitWidth::W128>::type tmp_sum[2] =
            {
                wrapper::vdup_n(static_cast<TIAcc>(0), wrapper::traits::vector_128_tag{}),
                wrapper::vdup_n(static_cast<TIAcc>(0), wrapper::traits::vector_128_tag{})
            };

            tmp_sum[0] = wrapper::vaddw(tmp_sum[0], wrapper::vgetlow(b1_u8));
            tmp_sum[0] = wrapper::vaddw(tmp_sum[0], wrapper::vgetlow(b0_u8));
            tmp_sum[0] = wrapper::vaddw(tmp_sum[0], wrapper::vgetlow(b2_u8));
            tmp_sum[0] = wrapper::vaddw(tmp_sum[0], wrapper::vgetlow(b3_u8));
            tmp_sum[1] = wrapper::vaddw(tmp_sum[1], wrapper::vgethigh(b0_u8));
            tmp_sum[1] = wrapper::vaddw(tmp_sum[1], wrapper::vgethigh(b1_u8));
            tmp_sum[1] = wrapper::vaddw(tmp_sum[1], wrapper::vgethigh(b2_u8));
            tmp_sum[1] = wrapper::vaddw(tmp_sum[1], wrapper::vgethigh(b3_u8));

            // Accumulate to 32 bit
            sum_col[0] = wrapper::vaddw(sum_col[0], wrapper::vgetlow(tmp_sum[0]));
            sum_col[1] = wrapper::vaddw(sum_col[1], wrapper::vgethigh(tmp_sum[0]));
            sum_col[2] = wrapper::vaddw(sum_col[2], wrapper::vgetlow(tmp_sum[1]));
            sum_col[3] = wrapper::vaddw(sum_col[3], wrapper::vgethigh(tmp_sum[1]));

            matrix_b += 4 * in_b_stride;
        }

        // This for loop performs the leftover accumulations
        for(; i < _k; ++i)
        {
            const auto b0_b8 = wrapper::vloadq(matrix_b + 0 * in_b_stride);

            // Widen the 8-bit values to 16 bit
            const typename wrapper::traits::neon_bitvector<TIAcc, wrapper::traits::BitWidth::W128>::type b0_b16[2]
            {
                wrapper::vmovl(wrapper::vgetlow(b0_b8)),
                wrapper::vmovl(wrapper::vgethigh(b0_b8))
            };

            // Accumulate to 32 bit
            sum_col[0] = wrapper::vaddw(sum_col[0], wrapper::vgetlow(b0_b16[0]));
            sum_col[1] = wrapper::vaddw(sum_col[1], wrapper::vgethigh(b0_b16[0]));
            sum_col[2] = wrapper::vaddw(sum_col[2], wrapper::vgetlow(b0_b16[1]));
            sum_col[3] = wrapper::vaddw(sum_col[3], wrapper::vgethigh(b0_b16[1]));

            matrix_b += in_b_stride;
        }

        // Multiply by scalar if necessary
        if(_mul_by_scalar)
        {
            sum_col[0] = wrapper::vmul(sum_col[0], vec_scalar);
            sum_col[1] = wrapper::vmul(sum_col[1], vec_scalar);
            sum_col[2] = wrapper::vmul(sum_col[2], vec_scalar);
            sum_col[3] = wrapper::vmul(sum_col[3], vec_scalar);
        }

        auto vector_sum_col = reinterpret_cast<int32_t *>(out.ptr());
        if(id.x() + 16 < width_matrix_b)
        {
            wrapper::vstore(vector_sum_col + 0, wrapper::vreinterpret(sum_col[0]));
            wrapper::vstore(vector_sum_col + 4, wrapper::vreinterpret(sum_col[1]));
            wrapper::vstore(vector_sum_col + 8, wrapper::vreinterpret(sum_col[2]));
            wrapper::vstore(vector_sum_col + 12, wrapper::vreinterpret(sum_col[3]));
        }
        else
        {
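            // Store only the leftover columns that fall inside the matrix, one element at a time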
            auto left_over = width_matrix_b - id.x();
            for(auto k = 0; k < 4 && left_over; ++k)
            {
                for(auto j = 0; j < 4 && left_over; ++j, --left_over)
                {
                    *(vector_sum_col + k * 4 + j) = sum_col[k][j];
                }
            }
        }
    },
    inb, out);
}

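// Forwards the thread's window and ThreadInfo to the data-type specific worker; unlike the A
// reduction, the B reduction uses the thread id and thread count to split the columns itself.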
void CpuGemmLowpMatrixBReductionKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info)
{
    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICpuKernel::window(), window);

    auto src = tensors.get_const_tensor(TensorType::ACL_SRC);
    auto dst = tensors.get_tensor(TensorType::ACL_DST);

    (this->*_func)(src, dst, window, info);
}

const char *CpuGemmLowpMatrixBReductionKernel::name() const
{
    return "CpuGemmLowpMatrixBReductionKernel";
}
} // namespace kernels
} // namespace cpu
} // namespace arm_compute