/* xref: /aosp_15_r20/external/libvpx/vpx_dsp/arm/sad_neon.c (revision fb1b10ab9aebc7c7068eedab379b749d7e3900be) */
/*
 *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include <arm_neon.h>

#include "./vpx_config.h"
#include "./vpx_dsp_rtcd.h"

#include "vpx/vpx_integer.h"
#include "vpx_dsp/arm/mem_neon.h"
#include "vpx_dsp/arm/sum_neon.h"

// SAD for a 64-pixel-wide block of height h. Each 16-byte segment gets its
// own u16 accumulator so no lane can overflow (per row a lane absorbs at
// most 2 * 255; h <= 64 here), and the four accumulators are widened to u32
// only once at the end.
static INLINE unsigned int sad64xh_neon(const uint8_t *src_ptr, int src_stride,
                                        const uint8_t *ref_ptr, int ref_stride,
                                        int h) {
  uint16x8_t acc[4] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0),
                        vdupq_n_u16(0) };
  uint32x4_t total;
  int row, seg;

  for (row = 0; row < h; row++) {
    for (seg = 0; seg < 4; seg++) {
      const uint8x16_t s = vld1q_u8(src_ptr + 16 * seg);
      const uint8x16_t r = vld1q_u8(ref_ptr + 16 * seg);
      // Absolute difference, then pairwise widening accumulate into u16.
      acc[seg] = vpadalq_u8(acc[seg], vabdq_u8(s, r));
    }
    src_ptr += src_stride;
    ref_ptr += ref_stride;
  }

  total = vpaddlq_u16(acc[0]);
  total = vpadalq_u16(total, acc[1]);
  total = vpadalq_u16(total, acc[2]);
  total = vpadalq_u16(total, acc[3]);

  return horizontal_add_uint32x4(total);
}

// SAD for a 32-pixel-wide block of height h. Row differences are widened to
// u16 pairs immediately and folded into a single u32 accumulator, so the
// running total cannot overflow for any supported h.
static INLINE unsigned int sad32xh_neon(const uint8_t *src_ptr, int src_stride,
                                        const uint8_t *ref_ptr, int ref_stride,
                                        int h) {
  uint32x4_t acc = vdupq_n_u32(0);
  int row;

  for (row = 0; row < h; row++) {
    const uint8x16_t s_lo = vld1q_u8(src_ptr);
    const uint8x16_t r_lo = vld1q_u8(ref_ptr);
    const uint8x16_t s_hi = vld1q_u8(src_ptr + 16);
    const uint8x16_t r_hi = vld1q_u8(ref_ptr + 16);

    acc = vpadalq_u16(acc, vpaddlq_u8(vabdq_u8(s_lo, r_lo)));
    acc = vpadalq_u16(acc, vpaddlq_u8(vabdq_u8(s_hi, r_hi)));

    src_ptr += src_stride;
    ref_ptr += ref_stride;
  }

  return horizontal_add_uint32x4(acc);
}

// SAD for a 16-pixel-wide block of height h. A single u16x8 accumulator is
// sufficient: each lane absorbs at most 2 * 255 per row, and the largest h
// instantiated below is 32, well under the u16 limit.
static INLINE unsigned int sad16xh_neon(const uint8_t *src_ptr, int src_stride,
                                        const uint8_t *ref_ptr, int ref_stride,
                                        int h) {
  uint16x8_t acc = vdupq_n_u16(0);
  int row;

  for (row = 0; row < h; row++) {
    const uint8x16_t s = vld1q_u8(src_ptr);
    const uint8x16_t r = vld1q_u8(ref_ptr);

    acc = vpadalq_u8(acc, vabdq_u8(s, r));

    src_ptr += src_stride;
    ref_ptr += ref_stride;
  }

  return horizontal_add_uint16x8(acc);
}

// SAD for an 8-pixel-wide block of height h. vabal_u8 fuses the absolute
// difference and the widening accumulate into one instruction per row.
static INLINE unsigned int sad8xh_neon(const uint8_t *src_ptr, int src_stride,
                                       const uint8_t *ref_ptr, int ref_stride,
                                       int h) {
  uint16x8_t acc = vdupq_n_u16(0);
  int row;

  for (row = 0; row < h; row++) {
    acc = vabal_u8(acc, vld1_u8(src_ptr), vld1_u8(ref_ptr));

    src_ptr += src_stride;
    ref_ptr += ref_stride;
  }

  return horizontal_add_uint16x8(acc);
}

// SAD for a 4-pixel-wide block of height h (h must be even; callers pass
// h in {4, 8}). load_unaligned_u8 packs two stride-separated 4-byte rows
// into one 8-byte vector, so each iteration handles a pair of rows.
static INLINE unsigned int sad4xh_neon(const uint8_t *src_ptr, int src_stride,
                                       const uint8_t *ref_ptr, int ref_stride,
                                       int h) {
  uint16x8_t acc = vdupq_n_u16(0);
  const int row_pairs = h / 2;
  int pair;

  for (pair = 0; pair < row_pairs; pair++) {
    const uint8x8_t s = load_unaligned_u8(src_ptr, src_stride);
    const uint8x8_t r = load_unaligned_u8(ref_ptr, ref_stride);

    acc = vabal_u8(acc, s, r);

    src_ptr += 2 * src_stride;
    ref_ptr += 2 * ref_stride;
  }

  return horizontal_add_uint16x8(acc);
}

// Instantiate the public vpx_sad<w>x<h>_neon() entry points expected by the
// RTCD dispatch table, one per supported block size.
#define SAD_WXH_NEON(w, h)                                                   \
  unsigned int vpx_sad##w##x##h##_neon(const uint8_t *src, int src_stride,   \
                                       const uint8_t *ref, int ref_stride) { \
    return sad##w##xh_neon(src, src_stride, ref, ref_stride, (h));           \
  }

SAD_WXH_NEON(4, 4)
SAD_WXH_NEON(4, 8)

SAD_WXH_NEON(8, 4)
SAD_WXH_NEON(8, 8)
SAD_WXH_NEON(8, 16)

SAD_WXH_NEON(16, 8)
SAD_WXH_NEON(16, 16)
SAD_WXH_NEON(16, 32)

SAD_WXH_NEON(32, 16)
SAD_WXH_NEON(32, 32)
SAD_WXH_NEON(32, 64)

SAD_WXH_NEON(64, 32)
SAD_WXH_NEON(64, 64)

#undef SAD_WXH_NEON

// "Skip" variants sample every other row (doubled strides, half the height)
// and scale the result by 2 — a cheaper SAD approximation for motion search.
#define SAD_SKIP_WXH_NEON(w, h)                                                \
  unsigned int vpx_sad_skip_##w##x##h##_neon(                                  \
      const uint8_t *src, int src_stride, const uint8_t *ref,                  \
      int ref_stride) {                                                        \
    return 2 *                                                                 \
           sad##w##xh_neon(src, 2 * src_stride, ref, 2 * ref_stride, (h) / 2); \
  }

SAD_SKIP_WXH_NEON(4, 4)
SAD_SKIP_WXH_NEON(4, 8)

SAD_SKIP_WXH_NEON(8, 4)
SAD_SKIP_WXH_NEON(8, 8)
SAD_SKIP_WXH_NEON(8, 16)

SAD_SKIP_WXH_NEON(16, 8)
SAD_SKIP_WXH_NEON(16, 16)
SAD_SKIP_WXH_NEON(16, 32)

SAD_SKIP_WXH_NEON(32, 16)
SAD_SKIP_WXH_NEON(32, 32)
SAD_SKIP_WXH_NEON(32, 64)

SAD_SKIP_WXH_NEON(64, 32)
SAD_SKIP_WXH_NEON(64, 64)

#undef SAD_SKIP_WXH_NEON

// SAD of src against the rounding average of ref and second_pred (compound
// prediction), 64 pixels wide. second_pred is a contiguous 64-wide block,
// hence the fixed += 64 advance. Per-segment u16 accumulators keep every
// lane below overflow before the final widening to u32.
static INLINE unsigned int sad64xh_avg_neon(const uint8_t *src_ptr,
                                            int src_stride,
                                            const uint8_t *ref_ptr,
                                            int ref_stride, int h,
                                            const uint8_t *second_pred) {
  uint16x8_t acc[4] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0),
                        vdupq_n_u16(0) };
  uint32x4_t total;
  int row, seg;

  for (row = 0; row < h; row++) {
    for (seg = 0; seg < 4; seg++) {
      const uint8x16_t s = vld1q_u8(src_ptr + 16 * seg);
      const uint8x16_t r = vld1q_u8(ref_ptr + 16 * seg);
      const uint8x16_t p = vld1q_u8(second_pred + 16 * seg);
      // Rounding halving add forms the compound predictor.
      const uint8x16_t avg = vrhaddq_u8(r, p);
      acc[seg] = vpadalq_u8(acc[seg], vabdq_u8(s, avg));
    }
    src_ptr += src_stride;
    ref_ptr += ref_stride;
    second_pred += 64;
  }

  total = vpaddlq_u16(acc[0]);
  total = vpadalq_u16(total, acc[1]);
  total = vpadalq_u16(total, acc[2]);
  total = vpadalq_u16(total, acc[3]);

  return horizontal_add_uint32x4(total);
}

// SAD of src against the rounding average of ref and second_pred, 32 pixels
// wide. Differences are widened to u16 per row and folded into a u32
// accumulator so no supported h can overflow.
static INLINE unsigned int sad32xh_avg_neon(const uint8_t *src_ptr,
                                            int src_stride,
                                            const uint8_t *ref_ptr,
                                            int ref_stride, int h,
                                            const uint8_t *second_pred) {
  uint32x4_t acc = vdupq_n_u32(0);
  int row;

  for (row = 0; row < h; row++) {
    const uint8x16_t s_lo = vld1q_u8(src_ptr);
    const uint8x16_t r_lo = vld1q_u8(ref_ptr);
    const uint8x16_t p_lo = vld1q_u8(second_pred);
    const uint8x16_t s_hi = vld1q_u8(src_ptr + 16);
    const uint8x16_t r_hi = vld1q_u8(ref_ptr + 16);
    const uint8x16_t p_hi = vld1q_u8(second_pred + 16);

    acc = vpadalq_u16(acc, vpaddlq_u8(vabdq_u8(s_lo, vrhaddq_u8(r_lo, p_lo))));
    acc = vpadalq_u16(acc, vpaddlq_u8(vabdq_u8(s_hi, vrhaddq_u8(r_hi, p_hi))));

    src_ptr += src_stride;
    ref_ptr += ref_stride;
    second_pred += 32;  // second_pred is a contiguous 32-wide block.
  }

  return horizontal_add_uint32x4(acc);
}

// SAD of src against the rounding average of ref and second_pred, 16 pixels
// wide. A u16 accumulator suffices for the heights instantiated below.
static INLINE unsigned int sad16xh_avg_neon(const uint8_t *src_ptr,
                                            int src_stride,
                                            const uint8_t *ref_ptr,
                                            int ref_stride, int h,
                                            const uint8_t *second_pred) {
  uint16x8_t acc = vdupq_n_u16(0);
  int row;

  for (row = 0; row < h; row++) {
    const uint8x16_t s = vld1q_u8(src_ptr);
    const uint8x16_t r = vld1q_u8(ref_ptr);
    const uint8x16_t p = vld1q_u8(second_pred);
    const uint8x16_t avg = vrhaddq_u8(r, p);

    acc = vpadalq_u8(acc, vabdq_u8(s, avg));

    src_ptr += src_stride;
    ref_ptr += ref_stride;
    second_pred += 16;  // second_pred is a contiguous 16-wide block.
  }

  return horizontal_add_uint16x8(acc);
}

// SAD of src against the rounding average of ref and second_pred, 8 pixels
// wide. vabal_u8 fuses the abs-diff with the widening accumulate.
static INLINE unsigned int sad8xh_avg_neon(const uint8_t *src_ptr,
                                           int src_stride,
                                           const uint8_t *ref_ptr,
                                           int ref_stride, int h,
                                           const uint8_t *second_pred) {
  uint16x8_t acc = vdupq_n_u16(0);
  int row;

  for (row = 0; row < h; row++) {
    const uint8x8_t s = vld1_u8(src_ptr);
    const uint8x8_t r = vld1_u8(ref_ptr);
    const uint8x8_t p = vld1_u8(second_pred);

    acc = vabal_u8(acc, s, vrhadd_u8(r, p));

    src_ptr += src_stride;
    ref_ptr += ref_stride;
    second_pred += 8;  // second_pred is a contiguous 8-wide block.
  }

  return horizontal_add_uint16x8(acc);
}

// SAD of src against the rounding average of ref and second_pred, 4 pixels
// wide (h must be even; callers pass h in {4, 8}). Two stride-separated
// rows are packed into one 8-byte vector per iteration; second_pred rows
// are contiguous, so its two rows are the next 8 bytes.
static INLINE unsigned int sad4xh_avg_neon(const uint8_t *src_ptr,
                                           int src_stride,
                                           const uint8_t *ref_ptr,
                                           int ref_stride, int h,
                                           const uint8_t *second_pred) {
  uint16x8_t acc = vdupq_n_u16(0);
  const int row_pairs = h / 2;
  int pair;

  for (pair = 0; pair < row_pairs; pair++) {
    const uint8x8_t s = load_unaligned_u8(src_ptr, src_stride);
    const uint8x8_t r = load_unaligned_u8(ref_ptr, ref_stride);
    const uint8x8_t p = vld1_u8(second_pred);

    acc = vabal_u8(acc, s, vrhadd_u8(r, p));

    src_ptr += 2 * src_stride;
    ref_ptr += 2 * ref_stride;
    second_pred += 8;
  }

  return horizontal_add_uint16x8(acc);
}

// Instantiate the public vpx_sad<w>x<h>_avg_neon() entry points (SAD against
// the average of ref and a second predictor) for every supported block size.
#define SAD_WXH_AVG_NEON(w, h)                                             \
  uint32_t vpx_sad##w##x##h##_avg_neon(const uint8_t *src, int src_stride, \
                                       const uint8_t *ref, int ref_stride, \
                                       const uint8_t *second_pred) {       \
    return sad##w##xh_avg_neon(src, src_stride, ref, ref_stride, (h),      \
                               second_pred);                               \
  }

SAD_WXH_AVG_NEON(4, 4)
SAD_WXH_AVG_NEON(4, 8)

SAD_WXH_AVG_NEON(8, 4)
SAD_WXH_AVG_NEON(8, 8)
SAD_WXH_AVG_NEON(8, 16)

SAD_WXH_AVG_NEON(16, 8)
SAD_WXH_AVG_NEON(16, 16)
SAD_WXH_AVG_NEON(16, 32)

SAD_WXH_AVG_NEON(32, 16)
SAD_WXH_AVG_NEON(32, 32)
SAD_WXH_AVG_NEON(32, 64)

SAD_WXH_AVG_NEON(64, 32)
SAD_WXH_AVG_NEON(64, 64)

#undef SAD_WXH_AVG_NEON
392