/*
 * Copyright (c) 2017 The WebM project authors. All Rights Reserved.
 *
 * Use of this source code is governed by a BSD-style license
 * that can be found in the LICENSE file in the root of the source
 * tree. An additional intellectual property rights grant can be found
 * in the file PATENTS. All contributing project authors may
 * be found in the AUTHORS file in the root of the source tree.
 */

#ifndef VPX_VPX_DSP_ARM_SUM_NEON_H_
#define VPX_VPX_DSP_ARM_SUM_NEON_H_

#include <arm_neon.h>

#include "./vpx_config.h"
#include "vpx/vpx_integer.h"

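/* Across-lane ("horizontal") reductions of NEON vectors, using the
 * AArch64-only vaddv/vaddlv across-vector reduction intrinsics where
 * available and pairwise-add fallbacks on 32-bit Arm.
 *
 * Illustrative sketch only (not code from this repository; src, ref and the
 * strides are hypothetical): reducing an 8x8 sum of absolute differences
 * accumulated per lane down to a scalar.
 *
 *   uint16x8_t acc = vdupq_n_u16(0);
 *   for (int i = 0; i < 8; ++i) {
 *     const uint8x8_t s = vld1_u8(src + i * src_stride);
 *     const uint8x8_t r = vld1_u8(ref + i * ref_stride);
 *     acc = vabal_u8(acc, s, r);  // acc += |s - r|, widened to 16 bits
 *   }
 *   const uint32_t sad = horizontal_add_uint16x8(acc);
 */

// Sum 4 uint8 values held in the low half of a, widening to uint16. The
// AArch64 path sums all 8 lanes, so the high lanes are assumed to be zero.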
static INLINE uint16_t horizontal_add_uint8x4(const uint8x8_t a) {
#if VPX_ARCH_AARCH64
  return vaddlv_u8(a);
#else
  const uint16x4_t b = vpaddl_u8(a);
  const uint16x4_t c = vpadd_u16(b, b);
  return vget_lane_u16(c, 0);
#endif
}

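// Sum all 8 uint8 lanes of a, widening the result to uint16.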
static INLINE uint16_t horizontal_add_uint8x8(const uint8x8_t a) {
#if VPX_ARCH_AARCH64
  return vaddlv_u8(a);
#else
  const uint16x4_t b = vpaddl_u8(a);
  const uint16x4_t c = vpadd_u16(b, b);
  const uint16x4_t d = vpadd_u16(c, c);
  return vget_lane_u16(d, 0);
#endif
}

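// Sum all 16 uint8 lanes of a, widening the result to uint16.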
static INLINE uint16_t horizontal_add_uint8x16(const uint8x16_t a) {
#if VPX_ARCH_AARCH64
  return vaddlvq_u8(a);
#else
  const uint16x8_t b = vpaddlq_u8(a);
  const uint16x4_t c = vadd_u16(vget_low_u16(b), vget_high_u16(b));
  const uint16x4_t d = vpadd_u16(c, c);
  const uint16x4_t e = vpadd_u16(d, d);
  return vget_lane_u16(e, 0);
#endif
}

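// Sum the 4 uint16 lanes of a into a uint16 result.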
static INLINE uint16_t horizontal_add_uint16x4(const uint16x4_t a) {
#if VPX_ARCH_AARCH64
  return vaddv_u16(a);
#else
  const uint16x4_t b = vpadd_u16(a, a);
  const uint16x4_t c = vpadd_u16(b, b);
  return vget_lane_u16(c, 0);
#endif
}

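// Sum the 8 int16 lanes of a, widening the result to int32.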
static INLINE int32_t horizontal_add_int16x8(const int16x8_t a) {
#if VPX_ARCH_AARCH64
  return vaddlvq_s16(a);
#else
  const int32x4_t b = vpaddlq_s16(a);
  const int64x2_t c = vpaddlq_s32(b);
  const int32x2_t d = vadd_s32(vreinterpret_s32_s64(vget_low_s64(c)),
                               vreinterpret_s32_s64(vget_high_s64(c)));
  return vget_lane_s32(d, 0);
#endif
}

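// Sum the 8 uint16 lanes of a, widening the result to uint32.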
static INLINE uint32_t horizontal_add_uint16x8(const uint16x8_t a) {
#if VPX_ARCH_AARCH64
  return vaddlvq_u16(a);
#else
  const uint32x4_t b = vpaddlq_u16(a);
  const uint64x2_t c = vpaddlq_u32(b);
  const uint32x2_t d = vadd_u32(vreinterpret_u32_u64(vget_low_u64(c)),
                                vreinterpret_u32_u64(vget_high_u64(c)));
  return vget_lane_u32(d, 0);
#endif
}

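// Reduce four uint16x8_t accumulators at once: lane i of the result is the
// widened sum of the 8 lanes of sum[i].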
static INLINE uint32x4_t horizontal_add_4d_uint16x8(const uint16x8_t sum[4]) {
#if VPX_ARCH_AARCH64
  const uint16x8_t a0 = vpaddq_u16(sum[0], sum[1]);
  const uint16x8_t a1 = vpaddq_u16(sum[2], sum[3]);
  const uint16x8_t b0 = vpaddq_u16(a0, a1);
  return vpaddlq_u16(b0);
#else
  const uint16x4_t a0 = vadd_u16(vget_low_u16(sum[0]), vget_high_u16(sum[0]));
  const uint16x4_t a1 = vadd_u16(vget_low_u16(sum[1]), vget_high_u16(sum[1]));
  const uint16x4_t a2 = vadd_u16(vget_low_u16(sum[2]), vget_high_u16(sum[2]));
  const uint16x4_t a3 = vadd_u16(vget_low_u16(sum[3]), vget_high_u16(sum[3]));
  const uint16x4_t b0 = vpadd_u16(a0, a1);
  const uint16x4_t b1 = vpadd_u16(a2, a3);
  return vpaddlq_u16(vcombine_u16(b0, b1));
#endif
}

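// Sum all 16 uint16 lanes of vec_lo and vec_hi, widening the result to
// uint32.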
static INLINE uint32_t horizontal_long_add_uint16x8(const uint16x8_t vec_lo,
                                                    const uint16x8_t vec_hi) {
#if VPX_ARCH_AARCH64
  return vaddlvq_u16(vec_lo) + vaddlvq_u16(vec_hi);
#else
  const uint32x4_t vec_l_lo =
      vaddl_u16(vget_low_u16(vec_lo), vget_high_u16(vec_lo));
  const uint32x4_t vec_l_hi =
      vaddl_u16(vget_low_u16(vec_hi), vget_high_u16(vec_hi));
  const uint32x4_t a = vaddq_u32(vec_l_lo, vec_l_hi);
  const uint64x2_t b = vpaddlq_u32(a);
  const uint32x2_t c = vadd_u32(vreinterpret_u32_u64(vget_low_u64(b)),
                                vreinterpret_u32_u64(vget_high_u64(b)));
  return vget_lane_u32(c, 0);
#endif
}

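// Reduce four pairs of uint16x8_t accumulators at once: lane i of the result
// is the widened sum of the 16 lanes of sum_lo[i] and sum_hi[i].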
static INLINE uint32x4_t horizontal_long_add_4d_uint16x8(
    const uint16x8_t sum_lo[4], const uint16x8_t sum_hi[4]) {
  const uint32x4_t a0 = vpaddlq_u16(sum_lo[0]);
  const uint32x4_t a1 = vpaddlq_u16(sum_lo[1]);
  const uint32x4_t a2 = vpaddlq_u16(sum_lo[2]);
  const uint32x4_t a3 = vpaddlq_u16(sum_lo[3]);
  const uint32x4_t b0 = vpadalq_u16(a0, sum_hi[0]);
  const uint32x4_t b1 = vpadalq_u16(a1, sum_hi[1]);
  const uint32x4_t b2 = vpadalq_u16(a2, sum_hi[2]);
  const uint32x4_t b3 = vpadalq_u16(a3, sum_hi[3]);
#if VPX_ARCH_AARCH64
  const uint32x4_t c0 = vpaddq_u32(b0, b1);
  const uint32x4_t c1 = vpaddq_u32(b2, b3);
  return vpaddq_u32(c0, c1);
#else
  const uint32x2_t c0 = vadd_u32(vget_low_u32(b0), vget_high_u32(b0));
  const uint32x2_t c1 = vadd_u32(vget_low_u32(b1), vget_high_u32(b1));
  const uint32x2_t c2 = vadd_u32(vget_low_u32(b2), vget_high_u32(b2));
  const uint32x2_t c3 = vadd_u32(vget_low_u32(b3), vget_high_u32(b3));
  const uint32x2_t d0 = vpadd_u32(c0, c1);
  const uint32x2_t d1 = vpadd_u32(c2, c3);
  return vcombine_u32(d0, d1);
#endif
}

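// Sum the 2 int32 lanes of a.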
static INLINE int32_t horizontal_add_int32x2(const int32x2_t a) {
#if VPX_ARCH_AARCH64
  return vaddv_s32(a);
#else
  return vget_lane_s32(a, 0) + vget_lane_s32(a, 1);
#endif
}

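// Sum the 2 uint32 lanes of a.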
static INLINE uint32_t horizontal_add_uint32x2(const uint32x2_t a) {
#if VPX_ARCH_AARCH64
  return vaddv_u32(a);
#else
  const uint64x1_t b = vpaddl_u32(a);
  return vget_lane_u32(vreinterpret_u32_u64(b), 0);
#endif
}

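// Sum the 4 int32 lanes of a.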
static INLINE int32_t horizontal_add_int32x4(const int32x4_t a) {
#if VPX_ARCH_AARCH64
  return vaddvq_s32(a);
#else
  const int64x2_t b = vpaddlq_s32(a);
  const int32x2_t c = vadd_s32(vreinterpret_s32_s64(vget_low_s64(b)),
                               vreinterpret_s32_s64(vget_high_s64(b)));
  return vget_lane_s32(c, 0);
#endif
}

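// Sum the 4 uint32 lanes of a.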
static INLINE uint32_t horizontal_add_uint32x4(const uint32x4_t a) {
#if VPX_ARCH_AARCH64
  return vaddvq_u32(a);
#else
  const uint64x2_t b = vpaddlq_u32(a);
  const uint32x2_t c = vadd_u32(vreinterpret_u32_u64(vget_low_u64(b)),
                                vreinterpret_u32_u64(vget_high_u64(b)));
  return vget_lane_u32(c, 0);
#endif
}

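// Reduce four uint32x4_t accumulators at once: lane i of the result is the
// sum of the 4 lanes of sum[i].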
static INLINE uint32x4_t horizontal_add_4d_uint32x4(const uint32x4_t sum[4]) {
#if VPX_ARCH_AARCH64
  uint32x4_t res01 = vpaddq_u32(sum[0], sum[1]);
  uint32x4_t res23 = vpaddq_u32(sum[2], sum[3]);
  return vpaddq_u32(res01, res23);
#else
  uint32x4_t res = vdupq_n_u32(0);
  res = vsetq_lane_u32(horizontal_add_uint32x4(sum[0]), res, 0);
  res = vsetq_lane_u32(horizontal_add_uint32x4(sum[1]), res, 1);
  res = vsetq_lane_u32(horizontal_add_uint32x4(sum[2]), res, 2);
  res = vsetq_lane_u32(horizontal_add_uint32x4(sum[3]), res, 3);
  return res;
#endif
}

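// Sum the 4 uint32 lanes of a, widening the result to uint64.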
static INLINE uint64_t horizontal_long_add_uint32x4(const uint32x4_t a) {
#if VPX_ARCH_AARCH64
  return vaddlvq_u32(a);
#else
  const uint64x2_t b = vpaddlq_u32(a);
  return vgetq_lane_u64(b, 0) + vgetq_lane_u64(b, 1);
#endif
}

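// Sum the 2 int64 lanes of a.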
static INLINE int64_t horizontal_add_int64x2(const int64x2_t a) {
#if VPX_ARCH_AARCH64
  return vaddvq_s64(a);
#else
  return vgetq_lane_s64(a, 0) + vgetq_lane_s64(a, 1);
#endif
}

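// Sum the 2 uint64 lanes of a.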
static INLINE uint64_t horizontal_add_uint64x2(const uint64x2_t a) {
#if VPX_ARCH_AARCH64
  return vaddvq_u64(a);
#else
  return vgetq_lane_u64(a, 0) + vgetq_lane_u64(a, 1);
#endif
}

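// Sum all lanes of an array of 2 uint32x4_t vectors, widening to uint64.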
static INLINE uint64_t horizontal_long_add_uint32x4_x2(const uint32x4_t a[2]) {
  return horizontal_long_add_uint32x4(a[0]) +
         horizontal_long_add_uint32x4(a[1]);
}

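// Sum all lanes of an array of 4 uint32x4_t vectors, widening to uint64.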
static INLINE uint64_t horizontal_long_add_uint32x4_x4(const uint32x4_t a[4]) {
  uint64x2_t sum = vpaddlq_u32(a[0]);
  sum = vpadalq_u32(sum, a[1]);
  sum = vpadalq_u32(sum, a[2]);
  sum = vpadalq_u32(sum, a[3]);

  return horizontal_add_uint64x2(sum);
}

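// Sum all lanes of an array of 8 uint32x4_t vectors, widening to uint64. The
// work is split across two accumulators so the widening adds do not form one
// long dependency chain.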
static INLINE uint64_t horizontal_long_add_uint32x4_x8(const uint32x4_t a[8]) {
  uint64x2_t sum[2];
  sum[0] = vpaddlq_u32(a[0]);
  sum[1] = vpaddlq_u32(a[1]);
  sum[0] = vpadalq_u32(sum[0], a[2]);
  sum[1] = vpadalq_u32(sum[1], a[3]);
  sum[0] = vpadalq_u32(sum[0], a[4]);
  sum[1] = vpadalq_u32(sum[1], a[5]);
  sum[0] = vpadalq_u32(sum[0], a[6]);
  sum[1] = vpadalq_u32(sum[1], a[7]);

  return horizontal_add_uint64x2(vaddq_u64(sum[0], sum[1]));
}

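// Sum all lanes of an array of 16 uint32x4_t vectors, widening to uint64.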
static INLINE uint64_t
horizontal_long_add_uint32x4_x16(const uint32x4_t a[16]) {
  uint64x2_t sum[2];
  sum[0] = vpaddlq_u32(a[0]);
  sum[1] = vpaddlq_u32(a[1]);
  sum[0] = vpadalq_u32(sum[0], a[2]);
  sum[1] = vpadalq_u32(sum[1], a[3]);
  sum[0] = vpadalq_u32(sum[0], a[4]);
  sum[1] = vpadalq_u32(sum[1], a[5]);
  sum[0] = vpadalq_u32(sum[0], a[6]);
  sum[1] = vpadalq_u32(sum[1], a[7]);
  sum[0] = vpadalq_u32(sum[0], a[8]);
  sum[1] = vpadalq_u32(sum[1], a[9]);
  sum[0] = vpadalq_u32(sum[0], a[10]);
  sum[1] = vpadalq_u32(sum[1], a[11]);
  sum[0] = vpadalq_u32(sum[0], a[12]);
  sum[1] = vpadalq_u32(sum[1], a[13]);
  sum[0] = vpadalq_u32(sum[0], a[14]);
  sum[1] = vpadalq_u32(sum[1], a[15]);

  return horizontal_add_uint64x2(vaddq_u64(sum[0], sum[1]));
}

#endif  // VPX_VPX_DSP_ARM_SUM_NEON_H_