xref: /aosp_15_r20/external/libaom/aom_dsp/arm/highbd_sad_neon.c (revision 77c1e3ccc04c968bd2bc212e87364f250e820521)
/*
 * Copyright (c) 2023 The WebM project authors. All rights reserved.
 * Copyright (c) 2023, Alliance for Open Media. All rights reserved.
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */
12 
13 #include <arm_neon.h>
14 
15 #include "config/aom_config.h"
16 #include "config/aom_dsp_rtcd.h"
17 
18 #include "aom/aom_integer.h"
19 #include "aom_dsp/arm/mem_neon.h"
20 #include "aom_dsp/arm/sum_neon.h"
21 
highbd_sad4xh_small_neon(const uint8_t * src_ptr,int src_stride,const uint8_t * ref_ptr,int ref_stride,int h)22 static inline uint32_t highbd_sad4xh_small_neon(const uint8_t *src_ptr,
23                                                 int src_stride,
24                                                 const uint8_t *ref_ptr,
25                                                 int ref_stride, int h) {
26   const uint16_t *src16_ptr = CONVERT_TO_SHORTPTR(src_ptr);
27   const uint16_t *ref16_ptr = CONVERT_TO_SHORTPTR(ref_ptr);
28   uint32x4_t sum = vdupq_n_u32(0);
29 
30   int i = h;
31   do {
32     uint16x4_t s = vld1_u16(src16_ptr);
33     uint16x4_t r = vld1_u16(ref16_ptr);
34     sum = vabal_u16(sum, s, r);
35 
36     src16_ptr += src_stride;
37     ref16_ptr += ref_stride;
38   } while (--i != 0);
39 
40   return horizontal_add_u32x4(sum);
41 }
42 
highbd_sad8xh_small_neon(const uint8_t * src_ptr,int src_stride,const uint8_t * ref_ptr,int ref_stride,int h)43 static inline uint32_t highbd_sad8xh_small_neon(const uint8_t *src_ptr,
44                                                 int src_stride,
45                                                 const uint8_t *ref_ptr,
46                                                 int ref_stride, int h) {
47   const uint16_t *src16_ptr = CONVERT_TO_SHORTPTR(src_ptr);
48   const uint16_t *ref16_ptr = CONVERT_TO_SHORTPTR(ref_ptr);
49   uint16x8_t sum = vdupq_n_u16(0);
50 
51   int i = h;
52   do {
53     uint16x8_t s = vld1q_u16(src16_ptr);
54     uint16x8_t r = vld1q_u16(ref16_ptr);
55     sum = vabaq_u16(sum, s, r);
56 
57     src16_ptr += src_stride;
58     ref16_ptr += ref_stride;
59   } while (--i != 0);
60 
61   return horizontal_add_u16x8(sum);
62 }
63 
64 #if !CONFIG_REALTIME_ONLY
highbd_sad8xh_large_neon(const uint8_t * src_ptr,int src_stride,const uint8_t * ref_ptr,int ref_stride,int h)65 static inline uint32_t highbd_sad8xh_large_neon(const uint8_t *src_ptr,
66                                                 int src_stride,
67                                                 const uint8_t *ref_ptr,
68                                                 int ref_stride, int h) {
69   const uint16_t *src16_ptr = CONVERT_TO_SHORTPTR(src_ptr);
70   const uint16_t *ref16_ptr = CONVERT_TO_SHORTPTR(ref_ptr);
71   uint32x4_t sum_u32 = vdupq_n_u32(0);
72 
73   int i = h;
74   do {
75     uint16x8_t s = vld1q_u16(src16_ptr);
76     uint16x8_t r = vld1q_u16(ref16_ptr);
77     uint16x8_t sum_u16 = vabdq_u16(s, r);
78     sum_u32 = vpadalq_u16(sum_u32, sum_u16);
79 
80     src16_ptr += src_stride;
81     ref16_ptr += ref_stride;
82   } while (--i != 0);
83 
84   return horizontal_add_u32x4(sum_u32);
85 }
86 #endif  // !CONFIG_REALTIME_ONLY
87 
highbd_sad16xh_large_neon(const uint8_t * src_ptr,int src_stride,const uint8_t * ref_ptr,int ref_stride,int h)88 static inline uint32_t highbd_sad16xh_large_neon(const uint8_t *src_ptr,
89                                                  int src_stride,
90                                                  const uint8_t *ref_ptr,
91                                                  int ref_stride, int h) {
92   const uint16_t *src16_ptr = CONVERT_TO_SHORTPTR(src_ptr);
93   const uint16_t *ref16_ptr = CONVERT_TO_SHORTPTR(ref_ptr);
94   uint32x4_t sum[2] = { vdupq_n_u32(0), vdupq_n_u32(0) };
95 
96   int i = h;
97   do {
98     uint16x8_t s0 = vld1q_u16(src16_ptr);
99     uint16x8_t r0 = vld1q_u16(ref16_ptr);
100     uint16x8_t diff0 = vabdq_u16(s0, r0);
101     sum[0] = vpadalq_u16(sum[0], diff0);
102 
103     uint16x8_t s1 = vld1q_u16(src16_ptr + 8);
104     uint16x8_t r1 = vld1q_u16(ref16_ptr + 8);
105     uint16x8_t diff1 = vabdq_u16(s1, r1);
106     sum[1] = vpadalq_u16(sum[1], diff1);
107 
108     src16_ptr += src_stride;
109     ref16_ptr += ref_stride;
110   } while (--i != 0);
111 
112   sum[0] = vaddq_u32(sum[0], sum[1]);
113   return horizontal_add_u32x4(sum[0]);
114 }
115 
highbd_sadwxh_large_neon(const uint8_t * src_ptr,int src_stride,const uint8_t * ref_ptr,int ref_stride,int w,int h)116 static inline uint32_t highbd_sadwxh_large_neon(const uint8_t *src_ptr,
117                                                 int src_stride,
118                                                 const uint8_t *ref_ptr,
119                                                 int ref_stride, int w, int h) {
120   const uint16_t *src16_ptr = CONVERT_TO_SHORTPTR(src_ptr);
121   const uint16_t *ref16_ptr = CONVERT_TO_SHORTPTR(ref_ptr);
122   uint32x4_t sum[4] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0),
123                         vdupq_n_u32(0) };
124 
125   int i = h;
126   do {
127     int j = 0;
128     do {
129       uint16x8_t s0 = vld1q_u16(src16_ptr + j);
130       uint16x8_t r0 = vld1q_u16(ref16_ptr + j);
131       uint16x8_t diff0 = vabdq_u16(s0, r0);
132       sum[0] = vpadalq_u16(sum[0], diff0);
133 
134       uint16x8_t s1 = vld1q_u16(src16_ptr + j + 8);
135       uint16x8_t r1 = vld1q_u16(ref16_ptr + j + 8);
136       uint16x8_t diff1 = vabdq_u16(s1, r1);
137       sum[1] = vpadalq_u16(sum[1], diff1);
138 
139       uint16x8_t s2 = vld1q_u16(src16_ptr + j + 16);
140       uint16x8_t r2 = vld1q_u16(ref16_ptr + j + 16);
141       uint16x8_t diff2 = vabdq_u16(s2, r2);
142       sum[2] = vpadalq_u16(sum[2], diff2);
143 
144       uint16x8_t s3 = vld1q_u16(src16_ptr + j + 24);
145       uint16x8_t r3 = vld1q_u16(ref16_ptr + j + 24);
146       uint16x8_t diff3 = vabdq_u16(s3, r3);
147       sum[3] = vpadalq_u16(sum[3], diff3);
148 
149       j += 32;
150     } while (j < w);
151 
152     src16_ptr += src_stride;
153     ref16_ptr += ref_stride;
154   } while (--i != 0);
155 
156   sum[0] = vaddq_u32(sum[0], sum[1]);
157   sum[2] = vaddq_u32(sum[2], sum[3]);
158   sum[0] = vaddq_u32(sum[0], sum[2]);
159 
160   return horizontal_add_u32x4(sum[0]);
161 }
162 
// SAD for blocks 128 samples wide and h rows tall: forwards to the
// generic-width kernel with the width fixed at 128.
static inline unsigned int highbd_sad128xh_large_neon(const uint8_t *src_ptr,
                                                      int src_stride,
                                                      const uint8_t *ref_ptr,
                                                      int ref_stride, int h) {
  const int width = 128;
  return highbd_sadwxh_large_neon(src_ptr, src_stride, ref_ptr, ref_stride,
                                  width, h);
}
170 
// SAD for blocks 64 samples wide and h rows tall: forwards to the
// generic-width kernel with the width fixed at 64.
static inline unsigned int highbd_sad64xh_large_neon(const uint8_t *src_ptr,
                                                     int src_stride,
                                                     const uint8_t *ref_ptr,
                                                     int ref_stride, int h) {
  const int width = 64;
  return highbd_sadwxh_large_neon(src_ptr, src_stride, ref_ptr, ref_stride,
                                  width, h);
}
178 
// SAD for blocks 32 samples wide and h rows tall: forwards to the
// generic-width kernel with the width fixed at 32.
static inline unsigned int highbd_sad32xh_large_neon(const uint8_t *src_ptr,
                                                     int src_stride,
                                                     const uint8_t *ref_ptr,
                                                     int ref_stride, int h) {
  const int width = 32;
  return highbd_sadwxh_large_neon(src_ptr, src_stride, ref_ptr, ref_stride,
                                  width, h);
}
186 
// Defines the public function aom_highbd_sad<w>x<h>_neon(), which runs the
// highbd_sad<w>xh_small_neon() kernel over all h rows of the block.
#define HBD_SAD_WXH_SMALL_NEON(w, h)                                         \
  unsigned int aom_highbd_sad##w##x##h##_neon(                               \
      const uint8_t *src, int src_stride, const uint8_t *ref,                \
      int ref_stride) {                                                      \
    const unsigned int sad =                                                 \
        highbd_sad##w##xh_small_neon(src, src_stride, ref, ref_stride, (h)); \
    return sad;                                                              \
  }
194 
// Defines the public function aom_highbd_sad<w>x<h>_neon(), which runs the
// highbd_sad<w>xh_large_neon() kernel over all h rows of the block.
#define HBD_SAD_WXH_LARGE_NEON(w, h)                                         \
  unsigned int aom_highbd_sad##w##x##h##_neon(                               \
      const uint8_t *src, int src_stride, const uint8_t *ref,                \
      int ref_stride) {                                                      \
    const unsigned int sad =                                                 \
        highbd_sad##w##xh_large_neon(src, src_stride, ref, ref_stride, (h)); \
    return sad;                                                              \
  }
202 
203 HBD_SAD_WXH_SMALL_NEON(4, 4)
204 HBD_SAD_WXH_SMALL_NEON(4, 8)
205 
206 HBD_SAD_WXH_SMALL_NEON(8, 4)
207 HBD_SAD_WXH_SMALL_NEON(8, 8)
208 HBD_SAD_WXH_SMALL_NEON(8, 16)
209 
210 HBD_SAD_WXH_LARGE_NEON(16, 8)
211 HBD_SAD_WXH_LARGE_NEON(16, 16)
212 HBD_SAD_WXH_LARGE_NEON(16, 32)
213 
214 HBD_SAD_WXH_LARGE_NEON(32, 16)
215 HBD_SAD_WXH_LARGE_NEON(32, 32)
216 HBD_SAD_WXH_LARGE_NEON(32, 64)
217 
218 HBD_SAD_WXH_LARGE_NEON(64, 32)
219 HBD_SAD_WXH_LARGE_NEON(64, 64)
220 HBD_SAD_WXH_LARGE_NEON(64, 128)
221 
222 HBD_SAD_WXH_LARGE_NEON(128, 64)
223 HBD_SAD_WXH_LARGE_NEON(128, 128)
224 
225 #if !CONFIG_REALTIME_ONLY
226 HBD_SAD_WXH_SMALL_NEON(4, 16)
227 
228 HBD_SAD_WXH_LARGE_NEON(8, 32)
229 
230 HBD_SAD_WXH_LARGE_NEON(16, 4)
231 HBD_SAD_WXH_LARGE_NEON(16, 64)
232 
233 HBD_SAD_WXH_LARGE_NEON(32, 8)
234 
235 HBD_SAD_WXH_LARGE_NEON(64, 16)
236 #endif  // !CONFIG_REALTIME_ONLY
237 
// Defines the public function aom_highbd_sad_skip_<w>x<h>_neon(). It runs the
// highbd_sad<w>xh_small_neon() kernel on every other row (both strides
// doubled, h / 2 rows) and returns twice that partial sum.
#define HBD_SAD_SKIP_WXH_SMALL_NEON(w, h)                             \
  unsigned int aom_highbd_sad_skip_##w##x##h##_neon(                  \
      const uint8_t *src, int src_stride, const uint8_t *ref,         \
      int ref_stride) {                                               \
    const uint32_t half_sad = highbd_sad##w##xh_small_neon(           \
        src, 2 * src_stride, ref, 2 * ref_stride, (h) / 2);           \
    return 2 * half_sad;                                              \
  }
245 
// Defines the public function aom_highbd_sad_skip_<w>x<h>_neon(). It runs the
// highbd_sad<w>xh_large_neon() kernel on every other row (both strides
// doubled, h / 2 rows) and returns twice that partial sum.
#define HBD_SAD_SKIP_WXH_LARGE_NEON(w, h)                             \
  unsigned int aom_highbd_sad_skip_##w##x##h##_neon(                  \
      const uint8_t *src, int src_stride, const uint8_t *ref,         \
      int ref_stride) {                                               \
    const uint32_t half_sad = highbd_sad##w##xh_large_neon(           \
        src, 2 * src_stride, ref, 2 * ref_stride, (h) / 2);           \
    return 2 * half_sad;                                              \
  }
253 
254 HBD_SAD_SKIP_WXH_SMALL_NEON(4, 4)
255 HBD_SAD_SKIP_WXH_SMALL_NEON(4, 8)
256 
257 HBD_SAD_SKIP_WXH_SMALL_NEON(8, 4)
258 HBD_SAD_SKIP_WXH_SMALL_NEON(8, 8)
259 HBD_SAD_SKIP_WXH_SMALL_NEON(8, 16)
260 
261 HBD_SAD_SKIP_WXH_LARGE_NEON(16, 8)
262 HBD_SAD_SKIP_WXH_LARGE_NEON(16, 16)
263 HBD_SAD_SKIP_WXH_LARGE_NEON(16, 32)
264 
265 HBD_SAD_SKIP_WXH_LARGE_NEON(32, 16)
266 HBD_SAD_SKIP_WXH_LARGE_NEON(32, 32)
267 HBD_SAD_SKIP_WXH_LARGE_NEON(32, 64)
268 
269 HBD_SAD_SKIP_WXH_LARGE_NEON(64, 32)
270 HBD_SAD_SKIP_WXH_LARGE_NEON(64, 64)
271 HBD_SAD_SKIP_WXH_LARGE_NEON(64, 128)
272 
273 HBD_SAD_SKIP_WXH_LARGE_NEON(128, 64)
274 HBD_SAD_SKIP_WXH_LARGE_NEON(128, 128)
275 
276 #if !CONFIG_REALTIME_ONLY
277 HBD_SAD_SKIP_WXH_SMALL_NEON(4, 16)
278 
279 HBD_SAD_SKIP_WXH_SMALL_NEON(8, 32)
280 
281 HBD_SAD_SKIP_WXH_LARGE_NEON(16, 4)
282 HBD_SAD_SKIP_WXH_LARGE_NEON(16, 64)
283 
284 HBD_SAD_SKIP_WXH_LARGE_NEON(32, 8)
285 
286 HBD_SAD_SKIP_WXH_LARGE_NEON(64, 16)
287 #endif  // !CONFIG_REALTIME_ONLY
288 
highbd_sad4xh_avg_neon(const uint8_t * src_ptr,int src_stride,const uint8_t * ref_ptr,int ref_stride,int h,const uint8_t * second_pred)289 static inline uint32_t highbd_sad4xh_avg_neon(const uint8_t *src_ptr,
290                                               int src_stride,
291                                               const uint8_t *ref_ptr,
292                                               int ref_stride, int h,
293                                               const uint8_t *second_pred) {
294   const uint16_t *src16_ptr = CONVERT_TO_SHORTPTR(src_ptr);
295   const uint16_t *ref16_ptr = CONVERT_TO_SHORTPTR(ref_ptr);
296   const uint16_t *pred16_ptr = CONVERT_TO_SHORTPTR(second_pred);
297   uint32x4_t sum = vdupq_n_u32(0);
298 
299   int i = h;
300   do {
301     uint16x4_t s = vld1_u16(src16_ptr);
302     uint16x4_t r = vld1_u16(ref16_ptr);
303     uint16x4_t p = vld1_u16(pred16_ptr);
304 
305     uint16x4_t avg = vrhadd_u16(r, p);
306     sum = vabal_u16(sum, s, avg);
307 
308     src16_ptr += src_stride;
309     ref16_ptr += ref_stride;
310     pred16_ptr += 4;
311   } while (--i != 0);
312 
313   return horizontal_add_u32x4(sum);
314 }
315 
highbd_sad8xh_avg_neon(const uint8_t * src_ptr,int src_stride,const uint8_t * ref_ptr,int ref_stride,int h,const uint8_t * second_pred)316 static inline uint32_t highbd_sad8xh_avg_neon(const uint8_t *src_ptr,
317                                               int src_stride,
318                                               const uint8_t *ref_ptr,
319                                               int ref_stride, int h,
320                                               const uint8_t *second_pred) {
321   const uint16_t *src16_ptr = CONVERT_TO_SHORTPTR(src_ptr);
322   const uint16_t *ref16_ptr = CONVERT_TO_SHORTPTR(ref_ptr);
323   const uint16_t *pred16_ptr = CONVERT_TO_SHORTPTR(second_pred);
324   uint32x4_t sum = vdupq_n_u32(0);
325 
326   int i = h;
327   do {
328     uint16x8_t s = vld1q_u16(src16_ptr);
329     uint16x8_t r = vld1q_u16(ref16_ptr);
330     uint16x8_t p = vld1q_u16(pred16_ptr);
331 
332     uint16x8_t avg = vrhaddq_u16(r, p);
333     uint16x8_t diff = vabdq_u16(s, avg);
334     sum = vpadalq_u16(sum, diff);
335 
336     src16_ptr += src_stride;
337     ref16_ptr += ref_stride;
338     pred16_ptr += 8;
339   } while (--i != 0);
340 
341   return horizontal_add_u32x4(sum);
342 }
343 
highbd_sad16xh_avg_neon(const uint8_t * src_ptr,int src_stride,const uint8_t * ref_ptr,int ref_stride,int h,const uint8_t * second_pred)344 static inline uint32_t highbd_sad16xh_avg_neon(const uint8_t *src_ptr,
345                                                int src_stride,
346                                                const uint8_t *ref_ptr,
347                                                int ref_stride, int h,
348                                                const uint8_t *second_pred) {
349   const uint16_t *src16_ptr = CONVERT_TO_SHORTPTR(src_ptr);
350   const uint16_t *ref16_ptr = CONVERT_TO_SHORTPTR(ref_ptr);
351   const uint16_t *pred16_ptr = CONVERT_TO_SHORTPTR(second_pred);
352   uint32x4_t sum[2] = { vdupq_n_u32(0), vdupq_n_u32(0) };
353 
354   int i = h;
355   do {
356     uint16x8_t s0, s1, r0, r1, p0, p1;
357     uint16x8_t avg0, avg1, diff0, diff1;
358 
359     s0 = vld1q_u16(src16_ptr);
360     r0 = vld1q_u16(ref16_ptr);
361     p0 = vld1q_u16(pred16_ptr);
362     avg0 = vrhaddq_u16(r0, p0);
363     diff0 = vabdq_u16(s0, avg0);
364     sum[0] = vpadalq_u16(sum[0], diff0);
365 
366     s1 = vld1q_u16(src16_ptr + 8);
367     r1 = vld1q_u16(ref16_ptr + 8);
368     p1 = vld1q_u16(pred16_ptr + 8);
369     avg1 = vrhaddq_u16(r1, p1);
370     diff1 = vabdq_u16(s1, avg1);
371     sum[1] = vpadalq_u16(sum[1], diff1);
372 
373     src16_ptr += src_stride;
374     ref16_ptr += ref_stride;
375     pred16_ptr += 16;
376   } while (--i != 0);
377 
378   sum[0] = vaddq_u32(sum[0], sum[1]);
379   return horizontal_add_u32x4(sum[0]);
380 }
381 
highbd_sadwxh_avg_neon(const uint8_t * src_ptr,int src_stride,const uint8_t * ref_ptr,int ref_stride,int w,int h,const uint8_t * second_pred)382 static inline uint32_t highbd_sadwxh_avg_neon(const uint8_t *src_ptr,
383                                               int src_stride,
384                                               const uint8_t *ref_ptr,
385                                               int ref_stride, int w, int h,
386                                               const uint8_t *second_pred) {
387   const uint16_t *src16_ptr = CONVERT_TO_SHORTPTR(src_ptr);
388   const uint16_t *ref16_ptr = CONVERT_TO_SHORTPTR(ref_ptr);
389   const uint16_t *pred16_ptr = CONVERT_TO_SHORTPTR(second_pred);
390   uint32x4_t sum[4] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0),
391                         vdupq_n_u32(0) };
392 
393   int i = h;
394   do {
395     int j = 0;
396     do {
397       uint16x8_t s0, s1, s2, s3, r0, r1, r2, r3, p0, p1, p2, p3;
398       uint16x8_t avg0, avg1, avg2, avg3, diff0, diff1, diff2, diff3;
399 
400       s0 = vld1q_u16(src16_ptr + j);
401       r0 = vld1q_u16(ref16_ptr + j);
402       p0 = vld1q_u16(pred16_ptr + j);
403       avg0 = vrhaddq_u16(r0, p0);
404       diff0 = vabdq_u16(s0, avg0);
405       sum[0] = vpadalq_u16(sum[0], diff0);
406 
407       s1 = vld1q_u16(src16_ptr + j + 8);
408       r1 = vld1q_u16(ref16_ptr + j + 8);
409       p1 = vld1q_u16(pred16_ptr + j + 8);
410       avg1 = vrhaddq_u16(r1, p1);
411       diff1 = vabdq_u16(s1, avg1);
412       sum[1] = vpadalq_u16(sum[1], diff1);
413 
414       s2 = vld1q_u16(src16_ptr + j + 16);
415       r2 = vld1q_u16(ref16_ptr + j + 16);
416       p2 = vld1q_u16(pred16_ptr + j + 16);
417       avg2 = vrhaddq_u16(r2, p2);
418       diff2 = vabdq_u16(s2, avg2);
419       sum[2] = vpadalq_u16(sum[2], diff2);
420 
421       s3 = vld1q_u16(src16_ptr + j + 24);
422       r3 = vld1q_u16(ref16_ptr + j + 24);
423       p3 = vld1q_u16(pred16_ptr + j + 24);
424       avg3 = vrhaddq_u16(r3, p3);
425       diff3 = vabdq_u16(s3, avg3);
426       sum[3] = vpadalq_u16(sum[3], diff3);
427 
428       j += 32;
429     } while (j < w);
430 
431     src16_ptr += src_stride;
432     ref16_ptr += ref_stride;
433     pred16_ptr += w;
434   } while (--i != 0);
435 
436   sum[0] = vaddq_u32(sum[0], sum[1]);
437   sum[2] = vaddq_u32(sum[2], sum[3]);
438   sum[0] = vaddq_u32(sum[0], sum[2]);
439 
440   return horizontal_add_u32x4(sum[0]);
441 }
442 
// Averaging SAD for blocks 128 samples wide and h rows tall: forwards to the
// generic-width averaging kernel with the width fixed at 128.
static inline unsigned int highbd_sad128xh_avg_neon(
    const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr,
    int ref_stride, int h, const uint8_t *second_pred) {
  const int width = 128;
  return highbd_sadwxh_avg_neon(src_ptr, src_stride, ref_ptr, ref_stride,
                                width, h, second_pred);
}
449 
// Averaging SAD for blocks 64 samples wide and h rows tall: forwards to the
// generic-width averaging kernel with the width fixed at 64.
static inline unsigned int highbd_sad64xh_avg_neon(const uint8_t *src_ptr,
                                                   int src_stride,
                                                   const uint8_t *ref_ptr,
                                                   int ref_stride, int h,
                                                   const uint8_t *second_pred) {
  const int width = 64;
  return highbd_sadwxh_avg_neon(src_ptr, src_stride, ref_ptr, ref_stride,
                                width, h, second_pred);
}
458 
// Averaging SAD for blocks 32 samples wide and h rows tall: forwards to the
// generic-width averaging kernel with the width fixed at 32.
static inline unsigned int highbd_sad32xh_avg_neon(const uint8_t *src_ptr,
                                                   int src_stride,
                                                   const uint8_t *ref_ptr,
                                                   int ref_stride, int h,
                                                   const uint8_t *second_pred) {
  const int width = 32;
  return highbd_sadwxh_avg_neon(src_ptr, src_stride, ref_ptr, ref_stride,
                                width, h, second_pred);
}
467 
// Defines the public function aom_highbd_sad<w>x<h>_avg_neon(), which runs
// the highbd_sad<w>xh_avg_neon() kernel over all h rows of the block.
#define HBD_SAD_WXH_AVG_NEON(w, h)                                            \
  uint32_t aom_highbd_sad##w##x##h##_avg_neon(                                \
      const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \
      const uint8_t *second_pred) {                                           \
    const uint32_t sad = highbd_sad##w##xh_avg_neon(                          \
        src, src_stride, ref, ref_stride, (h), second_pred);                  \
    return sad;                                                               \
  }
475 
476 HBD_SAD_WXH_AVG_NEON(4, 4)
477 HBD_SAD_WXH_AVG_NEON(4, 8)
478 
479 HBD_SAD_WXH_AVG_NEON(8, 4)
480 HBD_SAD_WXH_AVG_NEON(8, 8)
481 HBD_SAD_WXH_AVG_NEON(8, 16)
482 
483 HBD_SAD_WXH_AVG_NEON(16, 8)
484 HBD_SAD_WXH_AVG_NEON(16, 16)
485 HBD_SAD_WXH_AVG_NEON(16, 32)
486 
487 HBD_SAD_WXH_AVG_NEON(32, 16)
488 HBD_SAD_WXH_AVG_NEON(32, 32)
489 HBD_SAD_WXH_AVG_NEON(32, 64)
490 
491 HBD_SAD_WXH_AVG_NEON(64, 32)
492 HBD_SAD_WXH_AVG_NEON(64, 64)
493 HBD_SAD_WXH_AVG_NEON(64, 128)
494 
495 HBD_SAD_WXH_AVG_NEON(128, 64)
496 HBD_SAD_WXH_AVG_NEON(128, 128)
497 
498 #if !CONFIG_REALTIME_ONLY
499 HBD_SAD_WXH_AVG_NEON(4, 16)
500 
501 HBD_SAD_WXH_AVG_NEON(8, 32)
502 
503 HBD_SAD_WXH_AVG_NEON(16, 4)
504 HBD_SAD_WXH_AVG_NEON(16, 64)
505 
506 HBD_SAD_WXH_AVG_NEON(32, 8)
507 
508 HBD_SAD_WXH_AVG_NEON(64, 16)
509 #endif  // !CONFIG_REALTIME_ONLY
510