1 /*
2 * Copyright (c) 2024 The WebM project authors. All Rights Reserved.
3 *
4 * Use of this source code is governed by a BSD-style license
5 * that can be found in the LICENSE file in the root of the source
6 * tree. An additional intellectual property rights grant can be found
7 * in the file PATENTS. All contributing project authors may
8 * be found in the AUTHORS file in the root of the source tree.
9 */
10
11 #ifndef VPX_VPX_DSP_ARM_HIGHBD_CONVOLVE8_NEON_H_
12 #define VPX_VPX_DSP_ARM_HIGHBD_CONVOLVE8_NEON_H_
13
14 #include <arm_neon.h>
15
// Applies a 4-tap filter to four lanes of 16-bit samples.
//
// s0..s3 are the four input vectors; each one is multiplied by the matching
// lane (0..3) of `filters` and the products are accumulated at 32-bit width.
// The accumulator is then shifted right by FILTER_BITS with rounding,
// saturated to unsigned 16-bit, and finally limited per lane to `max`
// (presumably the largest pixel value for the current bit depth - confirm
// against callers).
static INLINE uint16x4_t highbd_convolve4_4_neon(
    const int16x4_t s0, const int16x4_t s1, const int16x4_t s2,
    const int16x4_t s3, const int16x4_t filters, const uint16x4_t max) {
  // Widening multiply for tap 0, then widening multiply-accumulate for the
  // remaining taps, so intermediate sums are held in 32 bits.
  const int32x4_t tap0 = vmull_lane_s16(s0, filters, 0);
  const int32x4_t tap01 = vmlal_lane_s16(tap0, s1, filters, 1);
  const int32x4_t tap012 = vmlal_lane_s16(tap01, s2, filters, 2);
  const int32x4_t acc = vmlal_lane_s16(tap012, s3, filters, 3);

  // Negative sums saturate to 0 in the unsigned narrowing shift; the upper
  // bound is enforced by the per-lane minimum against `max`.
  return vmin_u16(vqrshrun_n_s32(acc, FILTER_BITS), max);
}
27
// Applies a 4-tap filter to eight lanes of 16-bit samples.
//
// The eight-lane inputs s0..s3 are processed as two independent four-lane
// halves. In each half, input k is multiplied by lane k of `filters` and the
// products are accumulated at 32-bit width. Both accumulators are shifted
// right by FILTER_BITS with rounding, saturated to unsigned 16-bit, joined
// back into one eight-lane vector, and limited per lane to `max` (presumably
// the largest pixel value for the current bit depth - confirm against
// callers).
static INLINE uint16x8_t highbd_convolve4_8_neon(
    const int16x8_t s0, const int16x8_t s1, const int16x8_t s2,
    const int16x8_t s3, const int16x4_t filters, const uint16x8_t max) {
  // Tap 0: widening multiply starts both 32-bit accumulators.
  int32x4_t acc_lo = vmull_lane_s16(vget_low_s16(s0), filters, 0);
  int32x4_t acc_hi = vmull_lane_s16(vget_high_s16(s0), filters, 0);

  // Taps 1..3: widening multiply-accumulate; the two halves never interact.
  acc_lo = vmlal_lane_s16(acc_lo, vget_low_s16(s1), filters, 1);
  acc_hi = vmlal_lane_s16(acc_hi, vget_high_s16(s1), filters, 1);

  acc_lo = vmlal_lane_s16(acc_lo, vget_low_s16(s2), filters, 2);
  acc_hi = vmlal_lane_s16(acc_hi, vget_high_s16(s2), filters, 2);

  acc_lo = vmlal_lane_s16(acc_lo, vget_low_s16(s3), filters, 3);
  acc_hi = vmlal_lane_s16(acc_hi, vget_high_s16(s3), filters, 3);

  // Round, shift and narrow each half; negative sums saturate to 0.
  const uint16x4_t out_lo = vqrshrun_n_s32(acc_lo, FILTER_BITS);
  const uint16x4_t out_hi = vqrshrun_n_s32(acc_hi, FILTER_BITS);

  // Reassemble the eight lanes and enforce the upper bound.
  return vminq_u16(vcombine_u16(out_lo, out_hi), max);
}
45
46 #endif // VPX_VPX_DSP_ARM_HIGHBD_CONVOLVE8_NEON_H_
47