/*
 *  Copyright (c) 2017 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#ifndef VPX_VPX_DSP_ARM_SUM_NEON_H_
#define VPX_VPX_DSP_ARM_SUM_NEON_H_

#include <arm_neon.h>

#include "./vpx_config.h"
#include "vpx/vpx_integer.h"

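// Helpers for horizontal (across-lane) addition of NEON vectors. On AArch64
// the dedicated across-vector reduction instructions are used; on Armv7 the
// reductions are built from pairwise additions.

// Sums the low four 8-bit lanes of a. The upper four lanes are assumed to be
// zero: the AArch64 path sums all eight lanes, so the two paths agree only
// under that assumption.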
static INLINE uint16_t horizontal_add_uint8x4(const uint8x8_t a) {
#if VPX_ARCH_AARCH64
  return vaddlv_u8(a);
#else
  const uint16x4_t b = vpaddl_u8(a);
  const uint16x4_t c = vpadd_u16(b, b);
  return vget_lane_u16(c, 0);
#endif
}

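// Widens and sums all eight 8-bit lanes of a into a uint16_t.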
static INLINE uint16_t horizontal_add_uint8x8(const uint8x8_t a) {
#if VPX_ARCH_AARCH64
  return vaddlv_u8(a);
#else
  const uint16x4_t b = vpaddl_u8(a);
  const uint16x4_t c = vpadd_u16(b, b);
  const uint16x4_t d = vpadd_u16(c, c);
  return vget_lane_u16(d, 0);
#endif
}

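// Widens and sums all sixteen 8-bit lanes of a into a uint16_t. A typical
// use is the final reduction of a SAD-style computation, e.g. (illustrative
// sketch only; vabdq_u8 and the operands are not part of this file):
//   const uint8x16_t diff = vabdq_u8(src_row, ref_row);
//   const uint16_t sad = horizontal_add_uint8x16(diff);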
static INLINE uint16_t horizontal_add_uint8x16(const uint8x16_t a) {
#if VPX_ARCH_AARCH64
  return vaddlvq_u8(a);
#else
  const uint16x8_t b = vpaddlq_u8(a);
  const uint16x4_t c = vadd_u16(vget_low_u16(b), vget_high_u16(b));
  const uint16x4_t d = vpadd_u16(c, c);
  const uint16x4_t e = vpadd_u16(d, d);
  return vget_lane_u16(e, 0);
#endif
}

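// Sums the four unsigned 16-bit lanes of a; the result wraps modulo 2^16.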
static INLINE uint16_t horizontal_add_uint16x4(const uint16x4_t a) {
#if VPX_ARCH_AARCH64
  return vaddv_u16(a);
#else
  const uint16x4_t b = vpadd_u16(a, a);
  const uint16x4_t c = vpadd_u16(b, b);
  return vget_lane_u16(c, 0);
#endif
}

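// Widens and sums all eight signed 16-bit lanes of a into an int32_t.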
static INLINE int32_t horizontal_add_int16x8(const int16x8_t a) {
#if VPX_ARCH_AARCH64
  return vaddlvq_s16(a);
#else
  const int32x4_t b = vpaddlq_s16(a);
  const int64x2_t c = vpaddlq_s32(b);
  const int32x2_t d = vadd_s32(vreinterpret_s32_s64(vget_low_s64(c)),
                               vreinterpret_s32_s64(vget_high_s64(c)));
  return vget_lane_s32(d, 0);
#endif
}

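// Widens and sums all eight unsigned 16-bit lanes of a into a uint32_t.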
static INLINE uint32_t horizontal_add_uint16x8(const uint16x8_t a) {
#if VPX_ARCH_AARCH64
  return vaddlvq_u16(a);
#else
  const uint32x4_t b = vpaddlq_u16(a);
  const uint64x2_t c = vpaddlq_u32(b);
  const uint32x2_t d = vadd_u32(vreinterpret_u32_u64(vget_low_u64(c)),
                                vreinterpret_u32_u64(vget_high_u64(c)));
  return vget_lane_u32(d, 0);
#endif
}

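// Sums each of the four uint16x8_t vectors; lane i of the result holds the
// widened total of sum[i]. Partial sums are accumulated in 16-bit lanes, so
// each vector's total is assumed to fit in 16 bits.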
static INLINE uint32x4_t horizontal_add_4d_uint16x8(const uint16x8_t sum[4]) {
#if VPX_ARCH_AARCH64
  const uint16x8_t a0 = vpaddq_u16(sum[0], sum[1]);
  const uint16x8_t a1 = vpaddq_u16(sum[2], sum[3]);
  const uint16x8_t b0 = vpaddq_u16(a0, a1);
  return vpaddlq_u16(b0);
#else
  const uint16x4_t a0 = vadd_u16(vget_low_u16(sum[0]), vget_high_u16(sum[0]));
  const uint16x4_t a1 = vadd_u16(vget_low_u16(sum[1]), vget_high_u16(sum[1]));
  const uint16x4_t a2 = vadd_u16(vget_low_u16(sum[2]), vget_high_u16(sum[2]));
  const uint16x4_t a3 = vadd_u16(vget_low_u16(sum[3]), vget_high_u16(sum[3]));
  const uint16x4_t b0 = vpadd_u16(a0, a1);
  const uint16x4_t b1 = vpadd_u16(a2, a3);
  return vpaddlq_u16(vcombine_u16(b0, b1));
#endif
}

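// Widens and sums all sixteen 16-bit lanes of vec_lo and vec_hi into a
// uint32_t.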
static INLINE uint32_t horizontal_long_add_uint16x8(const uint16x8_t vec_lo,
                                                    const uint16x8_t vec_hi) {
#if VPX_ARCH_AARCH64
  return vaddlvq_u16(vec_lo) + vaddlvq_u16(vec_hi);
#else
  const uint32x4_t vec_l_lo =
      vaddl_u16(vget_low_u16(vec_lo), vget_high_u16(vec_lo));
  const uint32x4_t vec_l_hi =
      vaddl_u16(vget_low_u16(vec_hi), vget_high_u16(vec_hi));
  const uint32x4_t a = vaddq_u32(vec_l_lo, vec_l_hi);
  const uint64x2_t b = vpaddlq_u32(a);
  const uint32x2_t c = vadd_u32(vreinterpret_u32_u64(vget_low_u64(b)),
                                vreinterpret_u32_u64(vget_high_u64(b)));
  return vget_lane_u32(c, 0);
#endif
}

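// Sums each pair (sum_lo[i], sum_hi[i]) with 32-bit accumulation; lane i of
// the result holds the total of the i-th pair.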
static INLINE uint32x4_t horizontal_long_add_4d_uint16x8(
    const uint16x8_t sum_lo[4], const uint16x8_t sum_hi[4]) {
  const uint32x4_t a0 = vpaddlq_u16(sum_lo[0]);
  const uint32x4_t a1 = vpaddlq_u16(sum_lo[1]);
  const uint32x4_t a2 = vpaddlq_u16(sum_lo[2]);
  const uint32x4_t a3 = vpaddlq_u16(sum_lo[3]);
  const uint32x4_t b0 = vpadalq_u16(a0, sum_hi[0]);
  const uint32x4_t b1 = vpadalq_u16(a1, sum_hi[1]);
  const uint32x4_t b2 = vpadalq_u16(a2, sum_hi[2]);
  const uint32x4_t b3 = vpadalq_u16(a3, sum_hi[3]);
#if VPX_ARCH_AARCH64
  const uint32x4_t c0 = vpaddq_u32(b0, b1);
  const uint32x4_t c1 = vpaddq_u32(b2, b3);
  return vpaddq_u32(c0, c1);
#else
  const uint32x2_t c0 = vadd_u32(vget_low_u32(b0), vget_high_u32(b0));
  const uint32x2_t c1 = vadd_u32(vget_low_u32(b1), vget_high_u32(b1));
  const uint32x2_t c2 = vadd_u32(vget_low_u32(b2), vget_high_u32(b2));
  const uint32x2_t c3 = vadd_u32(vget_low_u32(b3), vget_high_u32(b3));
  const uint32x2_t d0 = vpadd_u32(c0, c1);
  const uint32x2_t d1 = vpadd_u32(c2, c3);
  return vcombine_u32(d0, d1);
#endif
}

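// Sums the two signed 32-bit lanes of a.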
static INLINE int32_t horizontal_add_int32x2(const int32x2_t a) {
#if VPX_ARCH_AARCH64
  return vaddv_s32(a);
#else
  return vget_lane_s32(a, 0) + vget_lane_s32(a, 1);
#endif
}

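// Sums the two unsigned 32-bit lanes of a.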
static INLINE uint32_t horizontal_add_uint32x2(const uint32x2_t a) {
#if VPX_ARCH_AARCH64
  return vaddv_u32(a);
#else
  const uint64x1_t b = vpaddl_u32(a);
  return vget_lane_u32(vreinterpret_u32_u64(b), 0);
#endif
}

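// Sums the four signed 32-bit lanes of a into an int32_t.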
static INLINE int32_t horizontal_add_int32x4(const int32x4_t a) {
#if VPX_ARCH_AARCH64
  return vaddvq_s32(a);
#else
  const int64x2_t b = vpaddlq_s32(a);
  const int32x2_t c = vadd_s32(vreinterpret_s32_s64(vget_low_s64(b)),
                               vreinterpret_s32_s64(vget_high_s64(b)));
  return vget_lane_s32(c, 0);
#endif
}

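// Sums the four unsigned 32-bit lanes of a into a uint32_t.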
static INLINE uint32_t horizontal_add_uint32x4(const uint32x4_t a) {
#if VPX_ARCH_AARCH64
  return vaddvq_u32(a);
#else
  const uint64x2_t b = vpaddlq_u32(a);
  const uint32x2_t c = vadd_u32(vreinterpret_u32_u64(vget_low_u64(b)),
                                vreinterpret_u32_u64(vget_high_u64(b)));
  return vget_lane_u32(c, 0);
#endif
}

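// Sums each of the four uint32x4_t vectors; lane i of the result holds the
// total of sum[i].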
static INLINE uint32x4_t horizontal_add_4d_uint32x4(const uint32x4_t sum[4]) {
#if VPX_ARCH_AARCH64
  uint32x4_t res01 = vpaddq_u32(sum[0], sum[1]);
  uint32x4_t res23 = vpaddq_u32(sum[2], sum[3]);
  return vpaddq_u32(res01, res23);
#else
  uint32x4_t res = vdupq_n_u32(0);
  res = vsetq_lane_u32(horizontal_add_uint32x4(sum[0]), res, 0);
  res = vsetq_lane_u32(horizontal_add_uint32x4(sum[1]), res, 1);
  res = vsetq_lane_u32(horizontal_add_uint32x4(sum[2]), res, 2);
  res = vsetq_lane_u32(horizontal_add_uint32x4(sum[3]), res, 3);
  return res;
#endif
}

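// Widens and sums the four 32-bit lanes of a into a uint64_t.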
static INLINE uint64_t horizontal_long_add_uint32x4(const uint32x4_t a) {
#if VPX_ARCH_AARCH64
  return vaddlvq_u32(a);
#else
  const uint64x2_t b = vpaddlq_u32(a);
  return vgetq_lane_u64(b, 0) + vgetq_lane_u64(b, 1);
#endif
}

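// Sums the two signed 64-bit lanes of a.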
static INLINE int64_t horizontal_add_int64x2(const int64x2_t a) {
#if VPX_ARCH_AARCH64
  return vaddvq_s64(a);
#else
  return vgetq_lane_s64(a, 0) + vgetq_lane_s64(a, 1);
#endif
}

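// Sums the two unsigned 64-bit lanes of a.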
static INLINE uint64_t horizontal_add_uint64x2(const uint64x2_t a) {
#if VPX_ARCH_AARCH64
  return vaddvq_u64(a);
#else
  return vgetq_lane_u64(a, 0) + vgetq_lane_u64(a, 1);
#endif
}

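// Widens and sums the eight 32-bit lanes of a[0] and a[1] into a uint64_t.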
static INLINE uint64_t horizontal_long_add_uint32x4_x2(const uint32x4_t a[2]) {
  return horizontal_long_add_uint32x4(a[0]) +
         horizontal_long_add_uint32x4(a[1]);
}

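// Widens and sums the sixteen 32-bit lanes of a[0]..a[3] into a uint64_t.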
static INLINE uint64_t horizontal_long_add_uint32x4_x4(const uint32x4_t a[4]) {
  uint64x2_t sum = vpaddlq_u32(a[0]);
  sum = vpadalq_u32(sum, a[1]);
  sum = vpadalq_u32(sum, a[2]);
  sum = vpadalq_u32(sum, a[3]);

  return horizontal_add_uint64x2(sum);
}

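// Widens and sums the 32-bit lanes of a[0]..a[7] into a uint64_t, using two
// independent accumulators to shorten the dependency chain.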
static INLINE uint64_t horizontal_long_add_uint32x4_x8(const uint32x4_t a[8]) {
  uint64x2_t sum[2];
  sum[0] = vpaddlq_u32(a[0]);
  sum[1] = vpaddlq_u32(a[1]);
  sum[0] = vpadalq_u32(sum[0], a[2]);
  sum[1] = vpadalq_u32(sum[1], a[3]);
  sum[0] = vpadalq_u32(sum[0], a[4]);
  sum[1] = vpadalq_u32(sum[1], a[5]);
  sum[0] = vpadalq_u32(sum[0], a[6]);
  sum[1] = vpadalq_u32(sum[1], a[7]);

  return horizontal_add_uint64x2(vaddq_u64(sum[0], sum[1]));
}

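// As above, for sixteen vectors a[0]..a[15].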
static INLINE uint64_t
horizontal_long_add_uint32x4_x16(const uint32x4_t a[16]) {
  uint64x2_t sum[2];
  sum[0] = vpaddlq_u32(a[0]);
  sum[1] = vpaddlq_u32(a[1]);
  sum[0] = vpadalq_u32(sum[0], a[2]);
  sum[1] = vpadalq_u32(sum[1], a[3]);
  sum[0] = vpadalq_u32(sum[0], a[4]);
  sum[1] = vpadalq_u32(sum[1], a[5]);
  sum[0] = vpadalq_u32(sum[0], a[6]);
  sum[1] = vpadalq_u32(sum[1], a[7]);
  sum[0] = vpadalq_u32(sum[0], a[8]);
  sum[1] = vpadalq_u32(sum[1], a[9]);
  sum[0] = vpadalq_u32(sum[0], a[10]);
  sum[1] = vpadalq_u32(sum[1], a[11]);
  sum[0] = vpadalq_u32(sum[0], a[12]);
  sum[1] = vpadalq_u32(sum[1], a[13]);
  sum[0] = vpadalq_u32(sum[0], a[14]);
  sum[1] = vpadalq_u32(sum[1], a[15]);

  return horizontal_add_uint64x2(vaddq_u64(sum[0], sum[1]));
}

#endif  // VPX_VPX_DSP_ARM_SUM_NEON_H_