1 /*
2 * Copyright (c) 2023 The WebM project authors. All rights reserved.
3 * Copyright (c) 2023, Alliance for Open Media. All rights reserved.
4 *
5 * This source code is subject to the terms of the BSD 2 Clause License and
6 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
7 * was not distributed with this source code in the LICENSE file, you can
8 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
9 * Media Patent License 1.0 was not distributed with this source code in the
10 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
11 */
12
13 #include <arm_neon.h>
14
15 #include "config/aom_config.h"
16 #include "config/aom_dsp_rtcd.h"
17
18 #include "aom/aom_integer.h"
19 #include "aom_dsp/arm/mem_neon.h"
20 #include "aom_dsp/arm/sum_neon.h"
21
// Computes the SAD of a 4-wide, h-row high bit-depth block.
// Each row's absolute differences are widened straight into 32-bit lanes
// (vabal_u16), so the accumulator cannot overflow for any supported h.
static inline uint32_t highbd_sad4xh_small_neon(const uint8_t *src_ptr,
                                                int src_stride,
                                                const uint8_t *ref_ptr,
                                                int ref_stride, int h) {
  // High bit-depth buffers are passed as uint8_t* and converted here.
  const uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr);
  const uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr);
  uint32x4_t acc = vdupq_n_u32(0);

  for (int row = h; row > 0; row--) {
    const uint16x4_t s = vld1_u16(src);
    const uint16x4_t r = vld1_u16(ref);
    acc = vabal_u16(acc, s, r);

    src += src_stride;
    ref += ref_stride;
  }

  return horizontal_add_u32x4(acc);
}
42
// Computes the SAD of an 8-wide, h-row high bit-depth block.
// NOTE: the absolute differences are accumulated in 16-bit lanes (vabaq_u16),
// so this "small" variant is only safe while h * max|diff| fits in a
// uint16_t; all instantiations below use it with h <= 16 (and the skip
// variants with h/2 <= 16). Larger heights use highbd_sad8xh_large_neon.
static inline uint32_t highbd_sad8xh_small_neon(const uint8_t *src_ptr,
                                                int src_stride,
                                                const uint8_t *ref_ptr,
                                                int ref_stride, int h) {
  // High bit-depth buffers are passed as uint8_t* and converted here.
  const uint16_t *src16_ptr = CONVERT_TO_SHORTPTR(src_ptr);
  const uint16_t *ref16_ptr = CONVERT_TO_SHORTPTR(ref_ptr);
  uint16x8_t sum = vdupq_n_u16(0);

  int i = h;
  do {
    uint16x8_t s = vld1q_u16(src16_ptr);
    uint16x8_t r = vld1q_u16(ref16_ptr);
    // Accumulate |s - r| per 16-bit lane.
    sum = vabaq_u16(sum, s, r);

    src16_ptr += src_stride;
    ref16_ptr += ref_stride;
  } while (--i != 0);

  return horizontal_add_u16x8(sum);
}
63
64 #if !CONFIG_REALTIME_ONLY
// Computes the SAD of an 8-wide, h-row high bit-depth block.
// Unlike the "small" variant, each row's 16-bit absolute differences are
// pairwise-widened into a 32-bit accumulator, so arbitrarily large h is safe.
static inline uint32_t highbd_sad8xh_large_neon(const uint8_t *src_ptr,
                                                int src_stride,
                                                const uint8_t *ref_ptr,
                                                int ref_stride, int h) {
  const uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr);
  const uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr);
  uint32x4_t acc = vdupq_n_u32(0);

  for (int row = h; row > 0; row--) {
    const uint16x8_t s = vld1q_u16(src);
    const uint16x8_t r = vld1q_u16(ref);
    const uint16x8_t abs_diff = vabdq_u16(s, r);
    // Widen the eight 16-bit differences into four 32-bit lanes.
    acc = vpadalq_u16(acc, abs_diff);

    src += src_stride;
    ref += ref_stride;
  }

  return horizontal_add_u32x4(acc);
}
86 #endif // !CONFIG_REALTIME_ONLY
87
// Computes the SAD of a 16-wide, h-row high bit-depth block.
// The two 8-lane halves of each row feed two independent 32-bit
// accumulators, avoiding a serial dependency between them.
static inline uint32_t highbd_sad16xh_large_neon(const uint8_t *src_ptr,
                                                 int src_stride,
                                                 const uint8_t *ref_ptr,
                                                 int ref_stride, int h) {
  const uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr);
  const uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr);
  uint32x4_t acc_lo = vdupq_n_u32(0);
  uint32x4_t acc_hi = vdupq_n_u32(0);

  for (int row = h; row > 0; row--) {
    const uint16x8_t s_lo = vld1q_u16(src);
    const uint16x8_t r_lo = vld1q_u16(ref);
    acc_lo = vpadalq_u16(acc_lo, vabdq_u16(s_lo, r_lo));

    const uint16x8_t s_hi = vld1q_u16(src + 8);
    const uint16x8_t r_hi = vld1q_u16(ref + 8);
    acc_hi = vpadalq_u16(acc_hi, vabdq_u16(s_hi, r_hi));

    src += src_stride;
    ref += ref_stride;
  }

  return horizontal_add_u32x4(vaddq_u32(acc_lo, acc_hi));
}
115
// Computes the SAD of a w-wide, h-row high bit-depth block.
// Requires w to be a positive multiple of 32: the inner loop consumes 32
// pixels per iteration. Four 32-bit accumulators break up the dependency
// chain across the four 8-lane segments of each 32-pixel strip.
static inline uint32_t highbd_sadwxh_large_neon(const uint8_t *src_ptr,
                                                int src_stride,
                                                const uint8_t *ref_ptr,
                                                int ref_stride, int w, int h) {
  // High bit-depth buffers are passed as uint8_t* and converted here.
  const uint16_t *src16_ptr = CONVERT_TO_SHORTPTR(src_ptr);
  const uint16_t *ref16_ptr = CONVERT_TO_SHORTPTR(ref_ptr);
  uint32x4_t sum[4] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0),
                        vdupq_n_u32(0) };

  int i = h;
  do {
    int j = 0;
    do {
      uint16x8_t s0 = vld1q_u16(src16_ptr + j);
      uint16x8_t r0 = vld1q_u16(ref16_ptr + j);
      uint16x8_t diff0 = vabdq_u16(s0, r0);
      sum[0] = vpadalq_u16(sum[0], diff0);

      uint16x8_t s1 = vld1q_u16(src16_ptr + j + 8);
      uint16x8_t r1 = vld1q_u16(ref16_ptr + j + 8);
      uint16x8_t diff1 = vabdq_u16(s1, r1);
      sum[1] = vpadalq_u16(sum[1], diff1);

      uint16x8_t s2 = vld1q_u16(src16_ptr + j + 16);
      uint16x8_t r2 = vld1q_u16(ref16_ptr + j + 16);
      uint16x8_t diff2 = vabdq_u16(s2, r2);
      sum[2] = vpadalq_u16(sum[2], diff2);

      uint16x8_t s3 = vld1q_u16(src16_ptr + j + 24);
      uint16x8_t r3 = vld1q_u16(ref16_ptr + j + 24);
      uint16x8_t diff3 = vabdq_u16(s3, r3);
      sum[3] = vpadalq_u16(sum[3], diff3);

      j += 32;
    } while (j < w);

    src16_ptr += src_stride;
    ref16_ptr += ref_stride;
  } while (--i != 0);

  // Reduce the four partial sums to a single scalar.
  sum[0] = vaddq_u32(sum[0], sum[1]);
  sum[2] = vaddq_u32(sum[2], sum[3]);
  sum[0] = vaddq_u32(sum[0], sum[2]);

  return horizontal_add_u32x4(sum[0]);
}
162
// SAD for 128-wide blocks: thin wrapper fixing w = 128.
static inline unsigned int highbd_sad128xh_large_neon(const uint8_t *src_ptr,
                                                      int src_stride,
                                                      const uint8_t *ref_ptr,
                                                      int ref_stride, int h) {
  return highbd_sadwxh_large_neon(src_ptr, src_stride, ref_ptr, ref_stride, 128,
                                  h);
}
170
// SAD for 64-wide blocks: thin wrapper fixing w = 64.
static inline unsigned int highbd_sad64xh_large_neon(const uint8_t *src_ptr,
                                                     int src_stride,
                                                     const uint8_t *ref_ptr,
                                                     int ref_stride, int h) {
  return highbd_sadwxh_large_neon(src_ptr, src_stride, ref_ptr, ref_stride, 64,
                                  h);
}
178
// SAD for 32-wide blocks: thin wrapper fixing w = 32.
static inline unsigned int highbd_sad32xh_large_neon(const uint8_t *src_ptr,
                                                     int src_stride,
                                                     const uint8_t *ref_ptr,
                                                     int ref_stride, int h) {
  return highbd_sadwxh_large_neon(src_ptr, src_stride, ref_ptr, ref_stride, 32,
                                  h);
}
186
// Emits the public aom_highbd_sad<w>x<h>_neon() entry point in terms of the
// "small" helper (16-bit accumulation for 8-wide; safe only for small h).
#define HBD_SAD_WXH_SMALL_NEON(w, h)                                      \
  unsigned int aom_highbd_sad##w##x##h##_neon(                            \
      const uint8_t *src, int src_stride, const uint8_t *ref,             \
      int ref_stride) {                                                   \
    return highbd_sad##w##xh_small_neon(src, src_stride, ref, ref_stride, \
                                        (h));                             \
  }
194
// Emits the public aom_highbd_sad<w>x<h>_neon() entry point in terms of the
// "large" helper (32-bit accumulation; safe for any block height).
#define HBD_SAD_WXH_LARGE_NEON(w, h)                                      \
  unsigned int aom_highbd_sad##w##x##h##_neon(                            \
      const uint8_t *src, int src_stride, const uint8_t *ref,             \
      int ref_stride) {                                                   \
    return highbd_sad##w##xh_large_neon(src, src_stride, ref, ref_stride, \
                                        (h));                             \
  }
202
// Instantiate every aom_highbd_sad<w>x<h>_neon() entry point. 4- and 8-wide
// blocks with small heights use the 16-bit-accumulator ("small") kernels;
// everything else needs the 32-bit ("large") kernels.
HBD_SAD_WXH_SMALL_NEON(4, 4)
HBD_SAD_WXH_SMALL_NEON(4, 8)

HBD_SAD_WXH_SMALL_NEON(8, 4)
HBD_SAD_WXH_SMALL_NEON(8, 8)
HBD_SAD_WXH_SMALL_NEON(8, 16)

HBD_SAD_WXH_LARGE_NEON(16, 8)
HBD_SAD_WXH_LARGE_NEON(16, 16)
HBD_SAD_WXH_LARGE_NEON(16, 32)

HBD_SAD_WXH_LARGE_NEON(32, 16)
HBD_SAD_WXH_LARGE_NEON(32, 32)
HBD_SAD_WXH_LARGE_NEON(32, 64)

HBD_SAD_WXH_LARGE_NEON(64, 32)
HBD_SAD_WXH_LARGE_NEON(64, 64)
HBD_SAD_WXH_LARGE_NEON(64, 128)

HBD_SAD_WXH_LARGE_NEON(128, 64)
HBD_SAD_WXH_LARGE_NEON(128, 128)

// Non-square extended-partition sizes are only needed outside realtime-only
// builds.
#if !CONFIG_REALTIME_ONLY
HBD_SAD_WXH_SMALL_NEON(4, 16)

HBD_SAD_WXH_LARGE_NEON(8, 32)

HBD_SAD_WXH_LARGE_NEON(16, 4)
HBD_SAD_WXH_LARGE_NEON(16, 64)

HBD_SAD_WXH_LARGE_NEON(32, 8)

HBD_SAD_WXH_LARGE_NEON(64, 16)
#endif  // !CONFIG_REALTIME_ONLY
237
// "Skip" SAD: sample every other row by doubling both strides and halving h,
// then scale the partial SAD by 2 to approximate the full-block SAD.
#define HBD_SAD_SKIP_WXH_SMALL_NEON(w, h)                             \
  unsigned int aom_highbd_sad_skip_##w##x##h##_neon(                  \
      const uint8_t *src, int src_stride, const uint8_t *ref,         \
      int ref_stride) {                                               \
    return 2 * highbd_sad##w##xh_small_neon(src, 2 * src_stride, ref, \
                                            2 * ref_stride, (h) / 2); \
  }
245
// "Skip" SAD over the "large" (32-bit accumulator) kernels; see the small
// variant above for the row-skipping scheme.
#define HBD_SAD_SKIP_WXH_LARGE_NEON(w, h)                             \
  unsigned int aom_highbd_sad_skip_##w##x##h##_neon(                  \
      const uint8_t *src, int src_stride, const uint8_t *ref,         \
      int ref_stride) {                                               \
    return 2 * highbd_sad##w##xh_large_neon(src, 2 * src_stride, ref, \
                                            2 * ref_stride, (h) / 2); \
  }
253
// Instantiate every aom_highbd_sad_skip_<w>x<h>_neon() entry point. The
// small/large split mirrors the non-skip table above (note the halved row
// count keeps the 16-bit accumulators within range for the small kernels).
HBD_SAD_SKIP_WXH_SMALL_NEON(4, 4)
HBD_SAD_SKIP_WXH_SMALL_NEON(4, 8)

HBD_SAD_SKIP_WXH_SMALL_NEON(8, 4)
HBD_SAD_SKIP_WXH_SMALL_NEON(8, 8)
HBD_SAD_SKIP_WXH_SMALL_NEON(8, 16)

HBD_SAD_SKIP_WXH_LARGE_NEON(16, 8)
HBD_SAD_SKIP_WXH_LARGE_NEON(16, 16)
HBD_SAD_SKIP_WXH_LARGE_NEON(16, 32)

HBD_SAD_SKIP_WXH_LARGE_NEON(32, 16)
HBD_SAD_SKIP_WXH_LARGE_NEON(32, 32)
HBD_SAD_SKIP_WXH_LARGE_NEON(32, 64)

HBD_SAD_SKIP_WXH_LARGE_NEON(64, 32)
HBD_SAD_SKIP_WXH_LARGE_NEON(64, 64)
HBD_SAD_SKIP_WXH_LARGE_NEON(64, 128)

HBD_SAD_SKIP_WXH_LARGE_NEON(128, 64)
HBD_SAD_SKIP_WXH_LARGE_NEON(128, 128)

// Non-square extended-partition sizes are only needed outside realtime-only
// builds.
#if !CONFIG_REALTIME_ONLY
HBD_SAD_SKIP_WXH_SMALL_NEON(4, 16)

HBD_SAD_SKIP_WXH_SMALL_NEON(8, 32)

HBD_SAD_SKIP_WXH_LARGE_NEON(16, 4)
HBD_SAD_SKIP_WXH_LARGE_NEON(16, 64)

HBD_SAD_SKIP_WXH_LARGE_NEON(32, 8)

HBD_SAD_SKIP_WXH_LARGE_NEON(64, 16)
#endif  // !CONFIG_REALTIME_ONLY
288
// SAD of a 4-wide, h-row high bit-depth block measured against the rounding
// average (vrhadd) of the reference and second_pred. second_pred is a
// contiguous 4-wide buffer, advanced 4 pixels per row.
static inline uint32_t highbd_sad4xh_avg_neon(const uint8_t *src_ptr,
                                              int src_stride,
                                              const uint8_t *ref_ptr,
                                              int ref_stride, int h,
                                              const uint8_t *second_pred) {
  const uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr);
  const uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr);
  const uint16_t *pred = CONVERT_TO_SHORTPTR(second_pred);
  uint32x4_t acc = vdupq_n_u32(0);

  for (int row = h; row > 0; row--) {
    const uint16x4_t s = vld1_u16(src);
    const uint16x4_t r = vld1_u16(ref);
    const uint16x4_t p = vld1_u16(pred);

    // Compare the source against the rounded average of ref and pred.
    const uint16x4_t avg = vrhadd_u16(r, p);
    acc = vabal_u16(acc, s, avg);

    src += src_stride;
    ref += ref_stride;
    pred += 4;
  }

  return horizontal_add_u32x4(acc);
}
315
// SAD of an 8-wide, h-row high bit-depth block measured against the rounding
// average (vrhaddq) of the reference and second_pred. second_pred is a
// contiguous 8-wide buffer, advanced 8 pixels per row. Differences are
// pairwise-widened into a 32-bit accumulator, so any h is safe.
static inline uint32_t highbd_sad8xh_avg_neon(const uint8_t *src_ptr,
                                              int src_stride,
                                              const uint8_t *ref_ptr,
                                              int ref_stride, int h,
                                              const uint8_t *second_pred) {
  const uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr);
  const uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr);
  const uint16_t *pred = CONVERT_TO_SHORTPTR(second_pred);
  uint32x4_t acc = vdupq_n_u32(0);

  for (int row = h; row > 0; row--) {
    const uint16x8_t s = vld1q_u16(src);
    const uint16x8_t r = vld1q_u16(ref);
    const uint16x8_t p = vld1q_u16(pred);

    // Compare the source against the rounded average of ref and pred.
    const uint16x8_t avg = vrhaddq_u16(r, p);
    acc = vpadalq_u16(acc, vabdq_u16(s, avg));

    src += src_stride;
    ref += ref_stride;
    pred += 8;
  }

  return horizontal_add_u32x4(acc);
}
343
// SAD of a 16-wide, h-row high bit-depth block measured against the rounding
// average (vrhaddq) of the reference and second_pred. second_pred is a
// contiguous 16-wide buffer, advanced 16 pixels per row.
//
// Fix: the original declared all vector temporaries C89-style at the top of
// the loop body, inconsistent with the declare-at-first-use style used by
// the other kernels in this file (e.g. highbd_sad16xh_large_neon). The
// temporaries are now declared where they are initialized; behavior is
// unchanged.
static inline uint32_t highbd_sad16xh_avg_neon(const uint8_t *src_ptr,
                                               int src_stride,
                                               const uint8_t *ref_ptr,
                                               int ref_stride, int h,
                                               const uint8_t *second_pred) {
  const uint16_t *src16_ptr = CONVERT_TO_SHORTPTR(src_ptr);
  const uint16_t *ref16_ptr = CONVERT_TO_SHORTPTR(ref_ptr);
  const uint16_t *pred16_ptr = CONVERT_TO_SHORTPTR(second_pred);
  // Two accumulators shorten the dependency chain across the row halves.
  uint32x4_t sum[2] = { vdupq_n_u32(0), vdupq_n_u32(0) };

  int i = h;
  do {
    uint16x8_t s0 = vld1q_u16(src16_ptr);
    uint16x8_t r0 = vld1q_u16(ref16_ptr);
    uint16x8_t p0 = vld1q_u16(pred16_ptr);
    uint16x8_t avg0 = vrhaddq_u16(r0, p0);
    uint16x8_t diff0 = vabdq_u16(s0, avg0);
    sum[0] = vpadalq_u16(sum[0], diff0);

    uint16x8_t s1 = vld1q_u16(src16_ptr + 8);
    uint16x8_t r1 = vld1q_u16(ref16_ptr + 8);
    uint16x8_t p1 = vld1q_u16(pred16_ptr + 8);
    uint16x8_t avg1 = vrhaddq_u16(r1, p1);
    uint16x8_t diff1 = vabdq_u16(s1, avg1);
    sum[1] = vpadalq_u16(sum[1], diff1);

    src16_ptr += src_stride;
    ref16_ptr += ref_stride;
    pred16_ptr += 16;
  } while (--i != 0);

  sum[0] = vaddq_u32(sum[0], sum[1]);
  return horizontal_add_u32x4(sum[0]);
}
381
// SAD of a w-wide, h-row high bit-depth block measured against the rounding
// average (vrhaddq) of the reference and second_pred. Requires w to be a
// positive multiple of 32 (the inner loop consumes 32 pixels per iteration);
// second_pred is a contiguous w-wide buffer, advanced w pixels per row.
//
// Fix: the original declared all vector temporaries C89-style at the top of
// the inner loop body, inconsistent with the declare-at-first-use style used
// by highbd_sadwxh_large_neon above. The temporaries are now declared where
// they are initialized; behavior is unchanged.
static inline uint32_t highbd_sadwxh_avg_neon(const uint8_t *src_ptr,
                                              int src_stride,
                                              const uint8_t *ref_ptr,
                                              int ref_stride, int w, int h,
                                              const uint8_t *second_pred) {
  const uint16_t *src16_ptr = CONVERT_TO_SHORTPTR(src_ptr);
  const uint16_t *ref16_ptr = CONVERT_TO_SHORTPTR(ref_ptr);
  const uint16_t *pred16_ptr = CONVERT_TO_SHORTPTR(second_pred);
  // Four accumulators break up the dependency chain across the four 8-lane
  // segments of each 32-pixel strip.
  uint32x4_t sum[4] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0),
                        vdupq_n_u32(0) };

  int i = h;
  do {
    int j = 0;
    do {
      uint16x8_t s0 = vld1q_u16(src16_ptr + j);
      uint16x8_t r0 = vld1q_u16(ref16_ptr + j);
      uint16x8_t p0 = vld1q_u16(pred16_ptr + j);
      uint16x8_t avg0 = vrhaddq_u16(r0, p0);
      uint16x8_t diff0 = vabdq_u16(s0, avg0);
      sum[0] = vpadalq_u16(sum[0], diff0);

      uint16x8_t s1 = vld1q_u16(src16_ptr + j + 8);
      uint16x8_t r1 = vld1q_u16(ref16_ptr + j + 8);
      uint16x8_t p1 = vld1q_u16(pred16_ptr + j + 8);
      uint16x8_t avg1 = vrhaddq_u16(r1, p1);
      uint16x8_t diff1 = vabdq_u16(s1, avg1);
      sum[1] = vpadalq_u16(sum[1], diff1);

      uint16x8_t s2 = vld1q_u16(src16_ptr + j + 16);
      uint16x8_t r2 = vld1q_u16(ref16_ptr + j + 16);
      uint16x8_t p2 = vld1q_u16(pred16_ptr + j + 16);
      uint16x8_t avg2 = vrhaddq_u16(r2, p2);
      uint16x8_t diff2 = vabdq_u16(s2, avg2);
      sum[2] = vpadalq_u16(sum[2], diff2);

      uint16x8_t s3 = vld1q_u16(src16_ptr + j + 24);
      uint16x8_t r3 = vld1q_u16(ref16_ptr + j + 24);
      uint16x8_t p3 = vld1q_u16(pred16_ptr + j + 24);
      uint16x8_t avg3 = vrhaddq_u16(r3, p3);
      uint16x8_t diff3 = vabdq_u16(s3, avg3);
      sum[3] = vpadalq_u16(sum[3], diff3);

      j += 32;
    } while (j < w);

    src16_ptr += src_stride;
    ref16_ptr += ref_stride;
    pred16_ptr += w;
  } while (--i != 0);

  // Reduce the four partial sums to a single scalar.
  sum[0] = vaddq_u32(sum[0], sum[1]);
  sum[2] = vaddq_u32(sum[2], sum[3]);
  sum[0] = vaddq_u32(sum[0], sum[2]);

  return horizontal_add_u32x4(sum[0]);
}
442
// Compound-average SAD for 128-wide blocks: thin wrapper fixing w = 128.
static inline unsigned int highbd_sad128xh_avg_neon(
    const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr,
    int ref_stride, int h, const uint8_t *second_pred) {
  return highbd_sadwxh_avg_neon(src_ptr, src_stride, ref_ptr, ref_stride, 128,
                                h, second_pred);
}
449
// Compound-average SAD for 64-wide blocks: thin wrapper fixing w = 64.
static inline unsigned int highbd_sad64xh_avg_neon(const uint8_t *src_ptr,
                                                   int src_stride,
                                                   const uint8_t *ref_ptr,
                                                   int ref_stride, int h,
                                                   const uint8_t *second_pred) {
  return highbd_sadwxh_avg_neon(src_ptr, src_stride, ref_ptr, ref_stride, 64, h,
                                second_pred);
}
458
// Compound-average SAD for 32-wide blocks: thin wrapper fixing w = 32.
static inline unsigned int highbd_sad32xh_avg_neon(const uint8_t *src_ptr,
                                                   int src_stride,
                                                   const uint8_t *ref_ptr,
                                                   int ref_stride, int h,
                                                   const uint8_t *second_pred) {
  return highbd_sadwxh_avg_neon(src_ptr, src_stride, ref_ptr, ref_stride, 32, h,
                                second_pred);
}
467
// Emits the public aom_highbd_sad<w>x<h>_avg_neon() entry point, which
// measures SAD against the rounded average of ref and second_pred.
#define HBD_SAD_WXH_AVG_NEON(w, h)                                          \
  uint32_t aom_highbd_sad##w##x##h##_avg_neon(                              \
      const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \
      const uint8_t *second_pred) {                                         \
    return highbd_sad##w##xh_avg_neon(src, src_stride, ref, ref_stride, (h), \
                                      second_pred);                         \
  }
475
// Instantiate every aom_highbd_sad<w>x<h>_avg_neon() entry point. All avg
// kernels accumulate in 32 bits, so no small/large split is needed here.
HBD_SAD_WXH_AVG_NEON(4, 4)
HBD_SAD_WXH_AVG_NEON(4, 8)

HBD_SAD_WXH_AVG_NEON(8, 4)
HBD_SAD_WXH_AVG_NEON(8, 8)
HBD_SAD_WXH_AVG_NEON(8, 16)

HBD_SAD_WXH_AVG_NEON(16, 8)
HBD_SAD_WXH_AVG_NEON(16, 16)
HBD_SAD_WXH_AVG_NEON(16, 32)

HBD_SAD_WXH_AVG_NEON(32, 16)
HBD_SAD_WXH_AVG_NEON(32, 32)
HBD_SAD_WXH_AVG_NEON(32, 64)

HBD_SAD_WXH_AVG_NEON(64, 32)
HBD_SAD_WXH_AVG_NEON(64, 64)
HBD_SAD_WXH_AVG_NEON(64, 128)

HBD_SAD_WXH_AVG_NEON(128, 64)
HBD_SAD_WXH_AVG_NEON(128, 128)

// Non-square extended-partition sizes are only needed outside realtime-only
// builds.
#if !CONFIG_REALTIME_ONLY
HBD_SAD_WXH_AVG_NEON(4, 16)

HBD_SAD_WXH_AVG_NEON(8, 32)

HBD_SAD_WXH_AVG_NEON(16, 4)
HBD_SAD_WXH_AVG_NEON(16, 64)

HBD_SAD_WXH_AVG_NEON(32, 8)

HBD_SAD_WXH_AVG_NEON(64, 16)
#endif  // !CONFIG_REALTIME_ONLY
510