/*
 *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include <arm_neon.h>

#include "./vpx_config.h"
#include "./vpx_dsp_rtcd.h"

#include "vpx/vpx_integer.h"
#include "vpx_dsp/arm/mem_neon.h"
#include "vpx_dsp/arm/sum_neon.h"

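/* Sum of absolute differences (SAD) over a 64-pixel-wide block of height h.
 *
 * Scalar equivalent (a sketch for reference only, not part of the library):
 *
 *   unsigned int sad = 0;
 *   for (int y = 0; y < h; y++) {
 *     for (int x = 0; x < 64; x++) sad += abs(src_ptr[x] - ref_ptr[x]);
 *     src_ptr += src_stride;
 *     ref_ptr += ref_stride;
 *   }
 *
 * The NEON version computes per-byte absolute differences with vabdq_u8 and
 * folds them into four uint16x8_t accumulators with the pairwise
 * widening-accumulate vpadalq_u8. Each 16-bit lane gathers at most
 * 2 * 255 = 510 per row, so even for the tallest 64-wide block (h = 64) the
 * running total of 32640 fits in 16 bits; the accumulators are widened to
 * 32 bits only once, after the loop.
 */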
static INLINE unsigned int sad64xh_neon(const uint8_t *src_ptr, int src_stride,
                                        const uint8_t *ref_ptr, int ref_stride,
                                        int h) {
  uint16x8_t sum[4] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0),
                        vdupq_n_u16(0) };
  uint32x4_t sum_u32;

  int i = h;
  do {
    uint8x16_t s0, s1, s2, s3, r0, r1, r2, r3;
    uint8x16_t diff0, diff1, diff2, diff3;

    s0 = vld1q_u8(src_ptr);
    r0 = vld1q_u8(ref_ptr);
    diff0 = vabdq_u8(s0, r0);
    sum[0] = vpadalq_u8(sum[0], diff0);

    s1 = vld1q_u8(src_ptr + 16);
    r1 = vld1q_u8(ref_ptr + 16);
    diff1 = vabdq_u8(s1, r1);
    sum[1] = vpadalq_u8(sum[1], diff1);

    s2 = vld1q_u8(src_ptr + 32);
    r2 = vld1q_u8(ref_ptr + 32);
    diff2 = vabdq_u8(s2, r2);
    sum[2] = vpadalq_u8(sum[2], diff2);

    s3 = vld1q_u8(src_ptr + 48);
    r3 = vld1q_u8(ref_ptr + 48);
    diff3 = vabdq_u8(s3, r3);
    sum[3] = vpadalq_u8(sum[3], diff3);

    src_ptr += src_stride;
    ref_ptr += ref_stride;
  } while (--i != 0);

  sum_u32 = vpaddlq_u16(sum[0]);
  sum_u32 = vpadalq_u16(sum_u32, sum[1]);
  sum_u32 = vpadalq_u16(sum_u32, sum[2]);
  sum_u32 = vpadalq_u16(sum_u32, sum[3]);

  return horizontal_add_uint32x4(sum_u32);
}

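/* 32-wide SAD: two 16-byte chunks per row. Here each row's partial sums are
 * widened to 32 bits inside the loop (vpaddlq_u8 followed by vpadalq_u16),
 * so the accumulator cannot overflow for any practical h.
 */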
static INLINE unsigned int sad32xh_neon(const uint8_t *src_ptr, int src_stride,
                                        const uint8_t *ref_ptr, int ref_stride,
                                        int h) {
  uint32x4_t sum = vdupq_n_u32(0);

  int i = h;
  do {
    uint8x16_t s0 = vld1q_u8(src_ptr);
    uint8x16_t r0 = vld1q_u8(ref_ptr);
    uint8x16_t diff0 = vabdq_u8(s0, r0);
    uint16x8_t sum0 = vpaddlq_u8(diff0);

    uint8x16_t s1 = vld1q_u8(src_ptr + 16);
    uint8x16_t r1 = vld1q_u8(ref_ptr + 16);
    uint8x16_t diff1 = vabdq_u8(s1, r1);
    uint16x8_t sum1 = vpaddlq_u8(diff1);

    sum = vpadalq_u16(sum, sum0);
    sum = vpadalq_u16(sum, sum1);

    src_ptr += src_stride;
    ref_ptr += ref_stride;
  } while (--i != 0);

  return horizontal_add_uint32x4(sum);
}

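/* 16-wide SAD: a single uint16x8_t accumulator suffices, since each lane
 * collects at most 510 per row and the tallest 16-wide block is 16x32
 * (maximum total 16320 < 65535).
 */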
static INLINE unsigned int sad16xh_neon(const uint8_t *src_ptr, int src_stride,
                                        const uint8_t *ref_ptr, int ref_stride,
                                        int h) {
  uint16x8_t sum = vdupq_n_u16(0);

  int i = h;
  do {
    uint8x16_t s = vld1q_u8(src_ptr);
    uint8x16_t r = vld1q_u8(ref_ptr);

    uint8x16_t diff = vabdq_u8(s, r);
    sum = vpadalq_u8(sum, diff);

    src_ptr += src_stride;
    ref_ptr += ref_stride;
  } while (--i != 0);

  return horizontal_add_uint16x8(sum);
}

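/* 8-wide SAD: vabal_u8 fuses the absolute difference and the widening
 * accumulate, adding |s - r| for each byte directly into the 16-bit lanes.
 */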
static INLINE unsigned int sad8xh_neon(const uint8_t *src_ptr, int src_stride,
                                       const uint8_t *ref_ptr, int ref_stride,
                                       int h) {
  uint16x8_t sum = vdupq_n_u16(0);

  int i = h;
  do {
    uint8x8_t s = vld1_u8(src_ptr);
    uint8x8_t r = vld1_u8(ref_ptr);

    sum = vabal_u8(sum, s, r);

    src_ptr += src_stride;
    ref_ptr += ref_stride;
  } while (--i != 0);

  return horizontal_add_uint16x8(sum);
}

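/* 4-wide SAD: load_unaligned_u8 packs two 4-byte rows into one uint8x8_t,
 * so each iteration consumes two rows; h is assumed to be even.
 */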
static INLINE unsigned int sad4xh_neon(const uint8_t *src_ptr, int src_stride,
                                       const uint8_t *ref_ptr, int ref_stride,
                                       int h) {
  uint16x8_t sum = vdupq_n_u16(0);

  int i = h / 2;
  do {
    uint8x8_t s = load_unaligned_u8(src_ptr, src_stride);
    uint8x8_t r = load_unaligned_u8(ref_ptr, ref_stride);

    sum = vabal_u8(sum, s, r);

    src_ptr += 2 * src_stride;
    ref_ptr += 2 * ref_stride;
  } while (--i != 0);

  return horizontal_add_uint16x8(sum);
}

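/* Instantiate the vpx_sad<w>x<h>_neon() entry points for every supported
 * block size, dispatching to the width-specialized helpers above.
 */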
#define SAD_WXH_NEON(w, h)                                                   \
  unsigned int vpx_sad##w##x##h##_neon(const uint8_t *src, int src_stride,   \
                                       const uint8_t *ref, int ref_stride) { \
    return sad##w##xh_neon(src, src_stride, ref, ref_stride, (h));           \
  }

SAD_WXH_NEON(4, 4)
SAD_WXH_NEON(4, 8)

SAD_WXH_NEON(8, 4)
SAD_WXH_NEON(8, 8)
SAD_WXH_NEON(8, 16)

SAD_WXH_NEON(16, 8)
SAD_WXH_NEON(16, 16)
SAD_WXH_NEON(16, 32)

SAD_WXH_NEON(32, 16)
SAD_WXH_NEON(32, 32)
SAD_WXH_NEON(32, 64)

SAD_WXH_NEON(64, 32)
SAD_WXH_NEON(64, 64)

#undef SAD_WXH_NEON

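/* The "skip" variants compute the SAD over every other row (both strides
 * doubled, height halved) and scale the result by 2, trading accuracy for
 * roughly half the work during motion search.
 */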
#define SAD_SKIP_WXH_NEON(w, h)                                                \
  unsigned int vpx_sad_skip_##w##x##h##_neon(                                  \
      const uint8_t *src, int src_stride, const uint8_t *ref,                  \
      int ref_stride) {                                                        \
    return 2 *                                                                 \
           sad##w##xh_neon(src, 2 * src_stride, ref, 2 * ref_stride, (h) / 2); \
  }

SAD_SKIP_WXH_NEON(4, 4)
SAD_SKIP_WXH_NEON(4, 8)

SAD_SKIP_WXH_NEON(8, 4)
SAD_SKIP_WXH_NEON(8, 8)
SAD_SKIP_WXH_NEON(8, 16)

SAD_SKIP_WXH_NEON(16, 8)
SAD_SKIP_WXH_NEON(16, 16)
SAD_SKIP_WXH_NEON(16, 32)

SAD_SKIP_WXH_NEON(32, 16)
SAD_SKIP_WXH_NEON(32, 32)
SAD_SKIP_WXH_NEON(32, 64)

SAD_SKIP_WXH_NEON(64, 32)
SAD_SKIP_WXH_NEON(64, 64)

#undef SAD_SKIP_WXH_NEON

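/* Averaging (compound-prediction) SAD: the reference is first averaged with
 * second_pred using the rounding halving add vrhadd(q)_u8, i.e.
 * avg = (ref + pred + 1) >> 1, and the SAD is taken against that average.
 *
 * Scalar equivalent (a sketch for reference only, not part of the library):
 *
 *   unsigned int sad = 0;
 *   for (int y = 0; y < h; y++) {
 *     for (int x = 0; x < w; x++) {
 *       const int avg = (ref_ptr[x] + second_pred[x] + 1) >> 1;
 *       sad += abs(src_ptr[x] - avg);
 *     }
 *     src_ptr += src_stride;
 *     ref_ptr += ref_stride;
 *     second_pred += w;  // rows of second_pred are contiguous
 *   }
 */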
static INLINE unsigned int sad64xh_avg_neon(const uint8_t *src_ptr,
                                            int src_stride,
                                            const uint8_t *ref_ptr,
                                            int ref_stride, int h,
                                            const uint8_t *second_pred) {
  uint16x8_t sum[4] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0),
                        vdupq_n_u16(0) };
  uint32x4_t sum_u32;

  int i = h;
  do {
    uint8x16_t s0, s1, s2, s3, r0, r1, r2, r3, p0, p1, p2, p3;
    uint8x16_t avg0, avg1, avg2, avg3, diff0, diff1, diff2, diff3;

    s0 = vld1q_u8(src_ptr);
    r0 = vld1q_u8(ref_ptr);
    p0 = vld1q_u8(second_pred);
    avg0 = vrhaddq_u8(r0, p0);
    diff0 = vabdq_u8(s0, avg0);
    sum[0] = vpadalq_u8(sum[0], diff0);

    s1 = vld1q_u8(src_ptr + 16);
    r1 = vld1q_u8(ref_ptr + 16);
    p1 = vld1q_u8(second_pred + 16);
    avg1 = vrhaddq_u8(r1, p1);
    diff1 = vabdq_u8(s1, avg1);
    sum[1] = vpadalq_u8(sum[1], diff1);

    s2 = vld1q_u8(src_ptr + 32);
    r2 = vld1q_u8(ref_ptr + 32);
    p2 = vld1q_u8(second_pred + 32);
    avg2 = vrhaddq_u8(r2, p2);
    diff2 = vabdq_u8(s2, avg2);
    sum[2] = vpadalq_u8(sum[2], diff2);

    s3 = vld1q_u8(src_ptr + 48);
    r3 = vld1q_u8(ref_ptr + 48);
    p3 = vld1q_u8(second_pred + 48);
    avg3 = vrhaddq_u8(r3, p3);
    diff3 = vabdq_u8(s3, avg3);
    sum[3] = vpadalq_u8(sum[3], diff3);

    src_ptr += src_stride;
    ref_ptr += ref_stride;
    second_pred += 64;
  } while (--i != 0);

  sum_u32 = vpaddlq_u16(sum[0]);
  sum_u32 = vpadalq_u16(sum_u32, sum[1]);
  sum_u32 = vpadalq_u16(sum_u32, sum[2]);
  sum_u32 = vpadalq_u16(sum_u32, sum[3]);

  return horizontal_add_uint32x4(sum_u32);
}

static INLINE unsigned int sad32xh_avg_neon(const uint8_t *src_ptr,
                                            int src_stride,
                                            const uint8_t *ref_ptr,
                                            int ref_stride, int h,
                                            const uint8_t *second_pred) {
  uint32x4_t sum = vdupq_n_u32(0);

  int i = h;
  do {
    uint8x16_t s0 = vld1q_u8(src_ptr);
    uint8x16_t r0 = vld1q_u8(ref_ptr);
    uint8x16_t p0 = vld1q_u8(second_pred);
    uint8x16_t avg0 = vrhaddq_u8(r0, p0);
    uint8x16_t diff0 = vabdq_u8(s0, avg0);
    uint16x8_t sum0 = vpaddlq_u8(diff0);

    uint8x16_t s1 = vld1q_u8(src_ptr + 16);
    uint8x16_t r1 = vld1q_u8(ref_ptr + 16);
    uint8x16_t p1 = vld1q_u8(second_pred + 16);
    uint8x16_t avg1 = vrhaddq_u8(r1, p1);
    uint8x16_t diff1 = vabdq_u8(s1, avg1);
    uint16x8_t sum1 = vpaddlq_u8(diff1);

    sum = vpadalq_u16(sum, sum0);
    sum = vpadalq_u16(sum, sum1);

    src_ptr += src_stride;
    ref_ptr += ref_stride;
    second_pred += 32;
  } while (--i != 0);

  return horizontal_add_uint32x4(sum);
}

static INLINE unsigned int sad16xh_avg_neon(const uint8_t *src_ptr,
                                            int src_stride,
                                            const uint8_t *ref_ptr,
                                            int ref_stride, int h,
                                            const uint8_t *second_pred) {
  uint16x8_t sum = vdupq_n_u16(0);

  int i = h;
  do {
    uint8x16_t s = vld1q_u8(src_ptr);
    uint8x16_t r = vld1q_u8(ref_ptr);
    uint8x16_t p = vld1q_u8(second_pred);

    uint8x16_t avg = vrhaddq_u8(r, p);
    uint8x16_t diff = vabdq_u8(s, avg);
    sum = vpadalq_u8(sum, diff);

    src_ptr += src_stride;
    ref_ptr += ref_stride;
    second_pred += 16;
  } while (--i != 0);

  return horizontal_add_uint16x8(sum);
}

static INLINE unsigned int sad8xh_avg_neon(const uint8_t *src_ptr,
                                           int src_stride,
                                           const uint8_t *ref_ptr,
                                           int ref_stride, int h,
                                           const uint8_t *second_pred) {
  uint16x8_t sum = vdupq_n_u16(0);

  int i = h;
  do {
    uint8x8_t s = vld1_u8(src_ptr);
    uint8x8_t r = vld1_u8(ref_ptr);
    uint8x8_t p = vld1_u8(second_pred);

    uint8x8_t avg = vrhadd_u8(r, p);
    sum = vabal_u8(sum, s, avg);

    src_ptr += src_stride;
    ref_ptr += ref_stride;
    second_pred += 8;
  } while (--i != 0);

  return horizontal_add_uint16x8(sum);
}

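/* 4-wide averaging SAD: as in sad4xh_neon, two rows are processed per
 * iteration; the matching two 4-byte second_pred rows are contiguous, so a
 * single 8-byte vld1_u8 covers both.
 */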
static INLINE unsigned int sad4xh_avg_neon(const uint8_t *src_ptr,
                                           int src_stride,
                                           const uint8_t *ref_ptr,
                                           int ref_stride, int h,
                                           const uint8_t *second_pred) {
  uint16x8_t sum = vdupq_n_u16(0);

  int i = h / 2;
  do {
    uint8x8_t s = load_unaligned_u8(src_ptr, src_stride);
    uint8x8_t r = load_unaligned_u8(ref_ptr, ref_stride);
    uint8x8_t p = vld1_u8(second_pred);

    uint8x8_t avg = vrhadd_u8(r, p);
    sum = vabal_u8(sum, s, avg);

    src_ptr += 2 * src_stride;
    ref_ptr += 2 * ref_stride;
    second_pred += 8;
  } while (--i != 0);

  return horizontal_add_uint16x8(sum);
}

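/* Instantiate the vpx_sad<w>x<h>_avg_neon() entry points for every supported
 * block size.
 */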
#define SAD_WXH_AVG_NEON(w, h)                                              \
  uint32_t vpx_sad##w##x##h##_avg_neon(const uint8_t *src, int src_stride,  \
                                       const uint8_t *ref, int ref_stride,  \
                                       const uint8_t *second_pred) {        \
    return sad##w##xh_avg_neon(src, src_stride, ref, ref_stride, (h),       \
                               second_pred);                                \
  }

SAD_WXH_AVG_NEON(4, 4)
SAD_WXH_AVG_NEON(4, 8)

SAD_WXH_AVG_NEON(8, 4)
SAD_WXH_AVG_NEON(8, 8)
SAD_WXH_AVG_NEON(8, 16)

SAD_WXH_AVG_NEON(16, 8)
SAD_WXH_AVG_NEON(16, 16)
SAD_WXH_AVG_NEON(16, 32)

SAD_WXH_AVG_NEON(32, 16)
SAD_WXH_AVG_NEON(32, 32)
SAD_WXH_AVG_NEON(32, 64)

SAD_WXH_AVG_NEON(64, 32)
SAD_WXH_AVG_NEON(64, 64)

#undef SAD_WXH_AVG_NEON