xref: /aosp_15_r20/external/libvpx/vpx_dsp/arm/highbd_convolve8_neon.h (revision fb1b10ab9aebc7c7068eedab379b749d7e3900be)
1 /*
2  *  Copyright (c) 2024 The WebM project authors. All Rights Reserved.
3  *
4  *  Use of this source code is governed by a BSD-style license
5  *  that can be found in the LICENSE file in the root of the source
6  *  tree. An additional intellectual property rights grant can be found
7  *  in the file PATENTS.  All contributing project authors may
8  *  be found in the AUTHORS file in the root of the source tree.
9  */
10 
11 #ifndef VPX_VPX_DSP_ARM_HIGHBD_CONVOLVE8_NEON_H_
12 #define VPX_VPX_DSP_ARM_HIGHBD_CONVOLVE8_NEON_H_
13 
14 #include <arm_neon.h>
15 
// Applies a 4-tap filter to four lanes of 16-bit samples.
//
// s0..s3:  the four input vectors, one per filter tap.
// filters: the four tap coefficients, held in lanes 0..3.
// max:     per-lane upper bound applied to the result.
//
// Returns min(round_shift_sat(s0*f0 + s1*f1 + s2*f2 + s3*f3, FILTER_BITS),
// max) for each lane.
static INLINE uint16x4_t highbd_convolve4_4_neon(
    const int16x4_t s0, const int16x4_t s1, const int16x4_t s2,
    const int16x4_t s3, const int16x4_t filters, const uint16x4_t max) {
  // Widening multiply-accumulate: 16-bit products are summed in 32-bit lanes.
  const int32x4_t acc0 = vmull_lane_s16(s0, filters, 0);
  const int32x4_t acc1 = vmlal_lane_s16(acc0, s1, filters, 1);
  const int32x4_t acc2 = vmlal_lane_s16(acc1, s2, filters, 2);
  const int32x4_t acc3 = vmlal_lane_s16(acc2, s3, filters, 3);

  // Rounding right shift with saturation to unsigned 16 bits (negative sums
  // become 0), followed by the clamp against max.
  return vmin_u16(vqrshrun_n_s32(acc3, FILTER_BITS), max);
}
27 
// Applies a 4-tap filter to eight lanes of 16-bit samples.
//
// s0..s3:  the four input vectors, one per filter tap.
// filters: the four tap coefficients, held in lanes 0..3.
// max:     per-lane upper bound applied to the result.
//
// The inputs are processed as two independent 4-lane halves because the
// widening multiply produces 32-bit lanes; the halves are narrowed back to
// 16 bits and recombined before clamping.
static INLINE uint16x8_t highbd_convolve4_8_neon(
    const int16x8_t s0, const int16x8_t s1, const int16x8_t s2,
    const int16x8_t s3, const int16x4_t filters, const uint16x8_t max) {
  // Split every input into its lower and upper four lanes.
  const int16x4_t s0_lo = vget_low_s16(s0), s0_hi = vget_high_s16(s0);
  const int16x4_t s1_lo = vget_low_s16(s1), s1_hi = vget_high_s16(s1);
  const int16x4_t s2_lo = vget_low_s16(s2), s2_hi = vget_high_s16(s2);
  const int16x4_t s3_lo = vget_low_s16(s3), s3_hi = vget_high_s16(s3);

  // Widening multiply-accumulate, one tap at a time, for both halves.
  int32x4_t acc_lo = vmull_lane_s16(s0_lo, filters, 0);
  int32x4_t acc_hi = vmull_lane_s16(s0_hi, filters, 0);
  acc_lo = vmlal_lane_s16(acc_lo, s1_lo, filters, 1);
  acc_hi = vmlal_lane_s16(acc_hi, s1_hi, filters, 1);
  acc_lo = vmlal_lane_s16(acc_lo, s2_lo, filters, 2);
  acc_hi = vmlal_lane_s16(acc_hi, s2_hi, filters, 2);
  acc_lo = vmlal_lane_s16(acc_lo, s3_lo, filters, 3);
  acc_hi = vmlal_lane_s16(acc_hi, s3_hi, filters, 3);

  // Rounding right shift with saturation to unsigned 16 bits for each half.
  const uint16x4_t out_lo = vqrshrun_n_s32(acc_lo, FILTER_BITS);
  const uint16x4_t out_hi = vqrshrun_n_s32(acc_hi, FILTER_BITS);

  // Rejoin the halves and clamp against max.
  return vminq_u16(vcombine_u16(out_lo, out_hi), max);
}
45 
46 #endif  // VPX_VPX_DSP_ARM_HIGHBD_CONVOLVE8_NEON_H_
47