xref: /aosp_15_r20/external/libaom/aom_dsp/arm/highbd_subpel_variance_neon.c (revision 77c1e3ccc04c968bd2bc212e87364f250e820521)
1 /*
2  * Copyright (c) 2023 The WebM project authors. All rights reserved.
3  * Copyright (c) 2023, Alliance for Open Media. All rights reserved.
4  *
5  * This source code is subject to the terms of the BSD 2 Clause License and
6  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
7  * was not distributed with this source code in the LICENSE file, you can
8  * obtain it at www.aomedia.org/license/software. If the Alliance for Open
9  * Media Patent License 1.0 was not distributed with this source code in the
10  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
11  */
12 
13 #include <arm_neon.h>
14 
15 #include "config/aom_config.h"
16 #include "config/aom_dsp_rtcd.h"
17 
18 #include "aom_dsp/aom_filter.h"
19 #include "aom_dsp/arm/dist_wtd_avg_neon.h"
20 #include "aom_dsp/arm/mem_neon.h"
21 #include "aom_dsp/arm/sum_neon.h"
22 #include "aom_dsp/variance.h"
23 
24 // The bilinear filters look like this:
25 //
26 // {{ 128,  0 }, { 112, 16 }, { 96, 32 }, { 80,  48 },
27 //  {  64, 64 }, {  48, 80 }, { 32, 96 }, { 16, 112 }}
28 //
29 // We can factor out the highest common factor, such that the sum of both
30 // weights will be 8 instead of 128. The benefits of this are two-fold:
31 //
32 // 1) We can infer the filter values from the filter_offset parameter in the
33 // bilinear filter functions below - we don't have to actually load the values
34 // from memory:
35 // f0 = 8 - filter_offset
36 // f1 = filter_offset
37 //
38 // 2) Scaling the pixel values by 8, instead of 128 enables us to operate on
39 // 16-bit data types at all times, rather than widening out to 32-bit and
40 // requiring double the number of data processing instructions. (12-bit * 8 =
41 // 15-bit.)
42 
// 2-tap bilinear filter pass for blocks that are exactly 4 samples wide.
//
// For every output row, four samples at src_ptr are blended with the four
// samples at src_ptr + pixel_step using the weights (8 - filter_offset) and
// filter_offset, then divided by 8 with rounding (vrshr_n_u16 by 3).
// pixel_step selects the direction: the callers in this file pass 1 for a
// horizontal pass and the row stride for a vertical pass.
//
// Source rows are src_stride samples apart; the destination is written
// densely, 4 samples per row. The loop body runs before the row counter is
// tested, so dst_height must be at least 1.
static void highbd_var_filter_block2d_bil_w4(const uint16_t *src_ptr,
                                             uint16_t *dst_ptr, int src_stride,
                                             int pixel_step, int dst_height,
                                             int filter_offset) {
  const uint16x4_t tap0 = vdup_n_u16(8 - filter_offset);
  const uint16x4_t tap1 = vdup_n_u16(filter_offset);
  int rows_left = dst_height;

  do {
    const uint16x4_t px0 = load_unaligned_u16_4x1(src_ptr);
    const uint16x4_t px1 = load_unaligned_u16_4x1(src_ptr + pixel_step);

    // px0 * tap0 + px1 * tap1, kept in 16 bits (see the note on filter
    // scaling at the top of this file).
    const uint16x4_t weighted = vmla_u16(vmul_u16(px0, tap0), px1, tap1);

    vst1_u16(dst_ptr, vrshr_n_u16(weighted, 3));

    dst_ptr += 4;
    src_ptr += src_stride;
  } while (--rows_left != 0);
}
66 
// 2-tap bilinear filter pass for blocks whose width is a multiple of 8, with
// any non-zero height.
//
// Each row is processed 8 samples at a time: the vector at src_ptr + col is
// blended with the vector at src_ptr + col + pixel_step using the weights
// (8 - filter_offset) and filter_offset, then divided by 8 with rounding.
// Source rows are src_stride samples apart; destination rows are packed
// dst_width samples apart. Both loops test their condition after the body, so
// at least one 8-sample vector and one row are always processed.
static void highbd_var_filter_block2d_bil_large(const uint16_t *src_ptr,
                                                uint16_t *dst_ptr,
                                                int src_stride, int pixel_step,
                                                int dst_width, int dst_height,
                                                int filter_offset) {
  const uint16x8_t tap0 = vdupq_n_u16(8 - filter_offset);
  const uint16x8_t tap1 = vdupq_n_u16(filter_offset);
  int rows_left = dst_height;

  do {
    int col = 0;
    do {
      const uint16_t *const px = src_ptr + col;
      const uint16x8_t px0 = vld1q_u16(px);
      const uint16x8_t px1 = vld1q_u16(px + pixel_step);

      // px0 * tap0 + px1 * tap1 in 16-bit lanes, then rounding shift by 3.
      const uint16x8_t weighted = vmlaq_u16(vmulq_u16(px0, tap0), px1, tap1);
      vst1q_u16(dst_ptr + col, vrshrq_n_u16(weighted, 3));

      col += 8;
    } while (col < dst_width);

    dst_ptr += dst_width;
    src_ptr += src_stride;
  } while (--rows_left != 0);
}
96 
// Fixed-width (8) entry point for highbd_var_filter_block2d_bil_large. The
// WxH macros in this file select a kernel by pasting the block width onto the
// "highbd_var_filter_block2d_bil_w" prefix, hence one wrapper per width.
static void highbd_var_filter_block2d_bil_w8(const uint16_t *src_ptr,
                                             uint16_t *dst_ptr, int src_stride,
                                             int pixel_step, int dst_height,
                                             int filter_offset) {
  const int block_width = 8;
  highbd_var_filter_block2d_bil_large(src_ptr, dst_ptr, src_stride, pixel_step,
                                      block_width, dst_height, filter_offset);
}
104 
// Fixed-width (16) entry point for highbd_var_filter_block2d_bil_large. The
// WxH macros in this file select a kernel by pasting the block width onto the
// "highbd_var_filter_block2d_bil_w" prefix, hence one wrapper per width.
static void highbd_var_filter_block2d_bil_w16(const uint16_t *src_ptr,
                                              uint16_t *dst_ptr, int src_stride,
                                              int pixel_step, int dst_height,
                                              int filter_offset) {
  const int block_width = 16;
  highbd_var_filter_block2d_bil_large(src_ptr, dst_ptr, src_stride, pixel_step,
                                      block_width, dst_height, filter_offset);
}
112 
// Fixed-width (32) entry point for highbd_var_filter_block2d_bil_large. The
// WxH macros in this file select a kernel by pasting the block width onto the
// "highbd_var_filter_block2d_bil_w" prefix, hence one wrapper per width.
static void highbd_var_filter_block2d_bil_w32(const uint16_t *src_ptr,
                                              uint16_t *dst_ptr, int src_stride,
                                              int pixel_step, int dst_height,
                                              int filter_offset) {
  const int block_width = 32;
  highbd_var_filter_block2d_bil_large(src_ptr, dst_ptr, src_stride, pixel_step,
                                      block_width, dst_height, filter_offset);
}
120 
// Fixed-width (64) entry point for highbd_var_filter_block2d_bil_large. The
// WxH macros in this file select a kernel by pasting the block width onto the
// "highbd_var_filter_block2d_bil_w" prefix, hence one wrapper per width.
static void highbd_var_filter_block2d_bil_w64(const uint16_t *src_ptr,
                                              uint16_t *dst_ptr, int src_stride,
                                              int pixel_step, int dst_height,
                                              int filter_offset) {
  const int block_width = 64;
  highbd_var_filter_block2d_bil_large(src_ptr, dst_ptr, src_stride, pixel_step,
                                      block_width, dst_height, filter_offset);
}
128 
// Fixed-width (128) entry point for highbd_var_filter_block2d_bil_large. The
// WxH macros in this file select a kernel by pasting the block width onto the
// "highbd_var_filter_block2d_bil_w" prefix, hence one wrapper per width.
static void highbd_var_filter_block2d_bil_w128(const uint16_t *src_ptr,
                                               uint16_t *dst_ptr,
                                               int src_stride, int pixel_step,
                                               int dst_height,
                                               int filter_offset) {
  const int block_width = 128;
  highbd_var_filter_block2d_bil_large(src_ptr, dst_ptr, src_stride, pixel_step,
                                      block_width, dst_height, filter_offset);
}
137 
// Half-pel filter pass: every output sample is the rounding average
// (vrhaddq_u16) of the sample at src_ptr + col and the sample pixel_step
// further on. This is what the bilinear filter reduces to when both taps are
// equal (filter offset 4), without needing any multiplies.
//
// Source rows are src_stride samples apart; destination rows are packed
// dst_width samples apart. dst_height must be at least 1 because the row
// counter is tested after the loop body.
static void highbd_var_filter_block2d_avg(const uint16_t *src_ptr,
                                          uint16_t *dst_ptr, int src_stride,
                                          int pixel_step, int dst_width,
                                          int dst_height) {
  // We only specialize on the filter values for large block sizes (>= 16x16.)
  assert(dst_width >= 16 && dst_width % 16 == 0);

  int rows_left = dst_height;
  do {
    int col = 0;
    do {
      const uint16_t *const px = src_ptr + col;
      const uint16x8_t px0 = vld1q_u16(px);
      const uint16x8_t px1 = vld1q_u16(px + pixel_step);

      vst1q_u16(dst_ptr + col, vrhaddq_u16(px0, px1));

      col += 8;
    } while (col < dst_width);

    dst_ptr += dst_width;
    src_ptr += src_stride;
  } while (--rows_left != 0);
}
162 
// Generates aom_highbd_<bitdepth>_sub_pixel_variance<w>x<h>_neon().
//
// The generated function always runs both bilinear passes (there is no
// shortcut for offsets 0 or 4):
//   1) a horizontal pass (pixel_step 1) over h + 1 source rows; the extra row
//      is needed because the vertical pass reads the row below each output,
//   2) a vertical pass (pixel_step w) that reduces those to h rows.
// The filtered w x h block, stored with a stride of w, is then handed to
// aom_highbd_<bitdepth>_variance<w>x<h>() together with ref, and that
// function's return value is returned. Both scratch buffers live on the stack.
#define HBD_SUBPEL_VARIANCE_WXH_NEON(bitdepth, w, h)                          \
  unsigned int aom_highbd_##bitdepth##_sub_pixel_variance##w##x##h##_neon(    \
      const uint8_t *src, int src_stride, int xoffset, int yoffset,           \
      const uint8_t *ref, int ref_stride, uint32_t *sse) {                    \
    const uint16_t *const src16 = CONVERT_TO_SHORTPTR(src);                   \
    uint16_t horiz[w * (h + 1)];                                              \
    uint16_t filtered[w * h];                                                 \
                                                                              \
    /* Pass 1: blend each sample with its right-hand neighbour. */            \
    highbd_var_filter_block2d_bil_w##w(src16, horiz, src_stride, 1, (h + 1),  \
                                       xoffset);                              \
    /* Pass 2: blend each row with the row w samples further on. */           \
    highbd_var_filter_block2d_bil_w##w(horiz, filtered, w, w, h, yoffset);    \
                                                                              \
    return aom_highbd_##bitdepth##_variance##w##x##h(                         \
        CONVERT_TO_BYTEPTR(filtered), w, ref, ref_stride, sse);               \
  }
178 
// Generates aom_highbd_<bitdepth>_sub_pixel_variance<w>x<h>_neon() with
// dedicated paths for the cheap sub-pixel offsets, per direction:
//   offset == 0 -> that pass is skipped (the taps would be {8, 0}),
//   offset == 4 -> that pass is a rounding average (the taps would be {4, 4}),
//   otherwise   -> the general bilinear kernel for width w.
//
// When a vertical pass follows a horizontal one, the horizontal pass produces
// h + 1 rows because the vertical pass reads the row below each output row.
// The final pass always writes exactly h rows of w samples, so its scratch
// buffer is sized w * h in every branch. All scratch buffers live on the
// stack; the filtered block is passed to
// aom_highbd_<bitdepth>_variance<w>x<h>() with a stride of w and that
// function's return value is returned.
//
// highbd_var_filter_block2d_avg() asserts a width that is a multiple of 16,
// so this macro must only be instantiated for such widths.
#define HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(bitdepth, w, h)               \
  unsigned int aom_highbd_##bitdepth##_sub_pixel_variance##w##x##h##_neon(     \
      const uint8_t *src, int src_stride, int xoffset, int yoffset,            \
      const uint8_t *ref, int ref_stride, unsigned int *sse) {                 \
    uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src);                              \
                                                                               \
    if (xoffset == 0) {                                                        \
      /* No horizontal filtering: at most one (vertical) pass. */              \
      if (yoffset == 0) {                                                      \
        /* Full-pel position: compare the source block directly. */            \
        return aom_highbd_##bitdepth##_variance##w##x##h(                      \
            CONVERT_TO_BYTEPTR(src_ptr), src_stride, ref, ref_stride, sse);    \
      } else if (yoffset == 4) {                                               \
        uint16_t tmp[w * h];                                                   \
        highbd_var_filter_block2d_avg(src_ptr, tmp, src_stride, src_stride, w, \
                                      h);                                      \
        return aom_highbd_##bitdepth##_variance##w##x##h(                      \
            CONVERT_TO_BYTEPTR(tmp), w, ref, ref_stride, sse);                 \
      } else {                                                                 \
        uint16_t tmp[w * h];                                                   \
        highbd_var_filter_block2d_bil_w##w(src_ptr, tmp, src_stride,           \
                                           src_stride, h, yoffset);            \
        return aom_highbd_##bitdepth##_variance##w##x##h(                      \
            CONVERT_TO_BYTEPTR(tmp), w, ref, ref_stride, sse);                 \
      }                                                                        \
    } else if (xoffset == 4) {                                                 \
      /* Horizontal half-pel: the first pass is a plain average. */            \
      uint16_t tmp0[w * (h + 1)];                                              \
      if (yoffset == 0) {                                                      \
        highbd_var_filter_block2d_avg(src_ptr, tmp0, src_stride, 1, w, h);     \
        return aom_highbd_##bitdepth##_variance##w##x##h(                      \
            CONVERT_TO_BYTEPTR(tmp0), w, ref, ref_stride, sse);                \
      } else if (yoffset == 4) {                                               \
        /* The second pass writes h rows only, so w * h samples suffice. */    \
        uint16_t tmp1[w * h];                                                  \
        highbd_var_filter_block2d_avg(src_ptr, tmp0, src_stride, 1, w,         \
                                      (h + 1));                                \
        highbd_var_filter_block2d_avg(tmp0, tmp1, w, w, w, h);                 \
        return aom_highbd_##bitdepth##_variance##w##x##h(                      \
            CONVERT_TO_BYTEPTR(tmp1), w, ref, ref_stride, sse);                \
      } else {                                                                 \
        /* The second pass writes h rows only, so w * h samples suffice. */    \
        uint16_t tmp1[w * h];                                                  \
        highbd_var_filter_block2d_avg(src_ptr, tmp0, src_stride, 1, w,         \
                                      (h + 1));                                \
        highbd_var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset);      \
        return aom_highbd_##bitdepth##_variance##w##x##h(                      \
            CONVERT_TO_BYTEPTR(tmp1), w, ref, ref_stride, sse);                \
      }                                                                        \
    } else {                                                                   \
      /* General horizontal offset: the first pass is the bilinear kernel. */  \
      uint16_t tmp0[w * (h + 1)];                                              \
      if (yoffset == 0) {                                                      \
        highbd_var_filter_block2d_bil_w##w(src_ptr, tmp0, src_stride, 1, h,    \
                                           xoffset);                           \
        return aom_highbd_##bitdepth##_variance##w##x##h(                      \
            CONVERT_TO_BYTEPTR(tmp0), w, ref, ref_stride, sse);                \
      } else if (yoffset == 4) {                                               \
        uint16_t tmp1[w * h];                                                  \
        highbd_var_filter_block2d_bil_w##w(src_ptr, tmp0, src_stride, 1,       \
                                           (h + 1), xoffset);                  \
        highbd_var_filter_block2d_avg(tmp0, tmp1, w, w, w, h);                 \
        return aom_highbd_##bitdepth##_variance##w##x##h(                      \
            CONVERT_TO_BYTEPTR(tmp1), w, ref, ref_stride, sse);                \
      } else {                                                                 \
        uint16_t tmp1[w * h];                                                  \
        highbd_var_filter_block2d_bil_w##w(src_ptr, tmp0, src_stride, 1,       \
                                           (h + 1), xoffset);                  \
        highbd_var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset);      \
        return aom_highbd_##bitdepth##_variance##w##x##h(                      \
            CONVERT_TO_BYTEPTR(tmp1), w, ref, ref_stride, sse);                \
      }                                                                        \
    }                                                                          \
  }
247 
248 // 8-bit
249 HBD_SUBPEL_VARIANCE_WXH_NEON(8, 4, 4)
250 HBD_SUBPEL_VARIANCE_WXH_NEON(8, 4, 8)
251 
252 HBD_SUBPEL_VARIANCE_WXH_NEON(8, 8, 4)
253 HBD_SUBPEL_VARIANCE_WXH_NEON(8, 8, 8)
254 HBD_SUBPEL_VARIANCE_WXH_NEON(8, 8, 16)
255 
256 HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(8, 16, 8)
257 HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(8, 16, 16)
258 HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(8, 16, 32)
259 
260 HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(8, 32, 16)
261 HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(8, 32, 32)
262 HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(8, 32, 64)
263 
264 HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(8, 64, 32)
265 HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(8, 64, 64)
266 HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(8, 64, 128)
267 
268 HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(8, 128, 64)
269 HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(8, 128, 128)
270 
271 #if !CONFIG_REALTIME_ONLY
272 HBD_SUBPEL_VARIANCE_WXH_NEON(8, 4, 16)
273 
274 HBD_SUBPEL_VARIANCE_WXH_NEON(8, 8, 32)
275 
276 HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(8, 16, 4)
277 HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(8, 16, 64)
278 
279 HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(8, 32, 8)
280 
281 HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(8, 64, 16)
282 #endif  // !CONFIG_REALTIME_ONLY
283 
284 // 10-bit
285 HBD_SUBPEL_VARIANCE_WXH_NEON(10, 4, 4)
286 HBD_SUBPEL_VARIANCE_WXH_NEON(10, 4, 8)
287 
288 HBD_SUBPEL_VARIANCE_WXH_NEON(10, 8, 4)
289 HBD_SUBPEL_VARIANCE_WXH_NEON(10, 8, 8)
290 HBD_SUBPEL_VARIANCE_WXH_NEON(10, 8, 16)
291 
292 HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(10, 16, 8)
293 HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(10, 16, 16)
294 HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(10, 16, 32)
295 
296 HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(10, 32, 16)
297 HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(10, 32, 32)
298 HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(10, 32, 64)
299 
300 HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(10, 64, 32)
301 HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(10, 64, 64)
302 HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(10, 64, 128)
303 
304 HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(10, 128, 64)
305 HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(10, 128, 128)
306 
307 #if !CONFIG_REALTIME_ONLY
308 HBD_SUBPEL_VARIANCE_WXH_NEON(10, 4, 16)
309 
310 HBD_SUBPEL_VARIANCE_WXH_NEON(10, 8, 32)
311 
312 HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(10, 16, 4)
313 HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(10, 16, 64)
314 
315 HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(10, 32, 8)
316 
317 HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(10, 64, 16)
318 #endif  // !CONFIG_REALTIME_ONLY
319 
320 // 12-bit
321 HBD_SUBPEL_VARIANCE_WXH_NEON(12, 4, 4)
322 HBD_SUBPEL_VARIANCE_WXH_NEON(12, 4, 8)
323 
324 HBD_SUBPEL_VARIANCE_WXH_NEON(12, 8, 4)
325 HBD_SUBPEL_VARIANCE_WXH_NEON(12, 8, 8)
326 HBD_SUBPEL_VARIANCE_WXH_NEON(12, 8, 16)
327 
328 HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(12, 16, 8)
329 HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(12, 16, 16)
330 HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(12, 16, 32)
331 
332 HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(12, 32, 16)
333 HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(12, 32, 32)
334 HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(12, 32, 64)
335 
336 HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(12, 64, 32)
337 HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(12, 64, 64)
338 HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(12, 64, 128)
339 
340 HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(12, 128, 64)
341 HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(12, 128, 128)
342 
343 #if !CONFIG_REALTIME_ONLY
344 HBD_SUBPEL_VARIANCE_WXH_NEON(12, 4, 16)
345 
346 HBD_SUBPEL_VARIANCE_WXH_NEON(12, 8, 32)
347 
348 HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(12, 16, 4)
349 HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(12, 16, 64)
350 
351 HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(12, 32, 8)
352 
353 HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(12, 64, 16)
354 #endif  // !CONFIG_REALTIME_ONLY
355 
// Bilinear filter pass fused with the second-prediction average, for blocks
// that are exactly 4 samples wide.
//
// Each output row is the 2-tap blend of src_ptr and src_ptr + pixel_step
// (weights 8 - filter_offset and filter_offset, rounding shift by 3), which
// is then rounding-averaged (vrhadd_u16) with four samples of second_pred.
// second_pred and dst_ptr are both read/written densely, 4 samples per row;
// source rows are src_stride samples apart. dst_height must be at least 1.
static void highbd_avg_pred_var_filter_block2d_bil_w4(
    const uint16_t *src_ptr, uint16_t *dst_ptr, int src_stride, int pixel_step,
    int dst_height, int filter_offset, const uint16_t *second_pred) {
  const uint16x4_t tap0 = vdup_n_u16(8 - filter_offset);
  const uint16x4_t tap1 = vdup_n_u16(filter_offset);
  int rows_left = dst_height;

  do {
    const uint16x4_t px0 = load_unaligned_u16_4x1(src_ptr);
    const uint16x4_t px1 = load_unaligned_u16_4x1(src_ptr + pixel_step);
    const uint16x4_t pred = vld1_u16(second_pred);

    const uint16x4_t weighted = vmla_u16(vmul_u16(px0, tap0), px1, tap1);
    const uint16x4_t filtered = vrshr_n_u16(weighted, 3);

    vst1_u16(dst_ptr, vrhadd_u16(filtered, pred));

    second_pred += 4;
    dst_ptr += 4;
    src_ptr += src_stride;
  } while (--rows_left != 0);
}
381 
// Bilinear filter pass fused with the second-prediction average, for blocks
// processed 8 samples at a time.
//
// Each 8-sample vector is the 2-tap blend of src_ptr + col and
// src_ptr + col + pixel_step (weights 8 - filter_offset and filter_offset,
// rounding shift by 3), rounding-averaged (vrhaddq_u16) with the next 8
// samples of second_pred. second_pred is consumed as one contiguous stream:
// it advances by 8 per vector and is not re-based per row. Source rows are
// src_stride samples apart and destination rows dst_width samples apart.
static void highbd_avg_pred_var_filter_block2d_bil_large(
    const uint16_t *src_ptr, uint16_t *dst_ptr, int src_stride, int pixel_step,
    int dst_width, int dst_height, int filter_offset,
    const uint16_t *second_pred) {
  const uint16x8_t tap0 = vdupq_n_u16(8 - filter_offset);
  const uint16x8_t tap1 = vdupq_n_u16(filter_offset);
  int rows_left = dst_height;

  do {
    int col = 0;
    do {
      const uint16_t *const px = src_ptr + col;
      const uint16x8_t px0 = vld1q_u16(px);
      const uint16x8_t px1 = vld1q_u16(px + pixel_step);
      const uint16x8_t pred = vld1q_u16(second_pred);

      const uint16x8_t weighted = vmlaq_u16(vmulq_u16(px0, tap0), px1, tap1);
      const uint16x8_t filtered = vrshrq_n_u16(weighted, 3);

      vst1q_u16(dst_ptr + col, vrhaddq_u16(filtered, pred));

      second_pred += 8;
      col += 8;
    } while (col < dst_width);

    dst_ptr += dst_width;
    src_ptr += src_stride;
  } while (--rows_left != 0);
}
412 
// Fixed-width (8) entry point for
// highbd_avg_pred_var_filter_block2d_bil_large, so that macros can pick a
// kernel by pasting the block width onto the function-name prefix.
static void highbd_avg_pred_var_filter_block2d_bil_w8(
    const uint16_t *src_ptr, uint16_t *dst_ptr, int src_stride, int pixel_step,
    int dst_height, int filter_offset, const uint16_t *second_pred) {
  const int block_width = 8;
  highbd_avg_pred_var_filter_block2d_bil_large(
      src_ptr, dst_ptr, src_stride, pixel_step, block_width, dst_height,
      filter_offset, second_pred);
}
420 
// Fixed-width (16) entry point for
// highbd_avg_pred_var_filter_block2d_bil_large, so that macros can pick a
// kernel by pasting the block width onto the function-name prefix.
static void highbd_avg_pred_var_filter_block2d_bil_w16(
    const uint16_t *src_ptr, uint16_t *dst_ptr, int src_stride, int pixel_step,
    int dst_height, int filter_offset, const uint16_t *second_pred) {
  const int block_width = 16;
  highbd_avg_pred_var_filter_block2d_bil_large(
      src_ptr, dst_ptr, src_stride, pixel_step, block_width, dst_height,
      filter_offset, second_pred);
}
428 
// Fixed-width (32) entry point for
// highbd_avg_pred_var_filter_block2d_bil_large, so that macros can pick a
// kernel by pasting the block width onto the function-name prefix.
static void highbd_avg_pred_var_filter_block2d_bil_w32(
    const uint16_t *src_ptr, uint16_t *dst_ptr, int src_stride, int pixel_step,
    int dst_height, int filter_offset, const uint16_t *second_pred) {
  const int block_width = 32;
  highbd_avg_pred_var_filter_block2d_bil_large(
      src_ptr, dst_ptr, src_stride, pixel_step, block_width, dst_height,
      filter_offset, second_pred);
}
436 
// Fixed-width (64) entry point for
// highbd_avg_pred_var_filter_block2d_bil_large, so that macros can pick a
// kernel by pasting the block width onto the function-name prefix.
static void highbd_avg_pred_var_filter_block2d_bil_w64(
    const uint16_t *src_ptr, uint16_t *dst_ptr, int src_stride, int pixel_step,
    int dst_height, int filter_offset, const uint16_t *second_pred) {
  const int block_width = 64;
  highbd_avg_pred_var_filter_block2d_bil_large(
      src_ptr, dst_ptr, src_stride, pixel_step, block_width, dst_height,
      filter_offset, second_pred);
}
444 
// Fixed-width (128) entry point for
// highbd_avg_pred_var_filter_block2d_bil_large, so that macros can pick a
// kernel by pasting the block width onto the function-name prefix.
static void highbd_avg_pred_var_filter_block2d_bil_w128(
    const uint16_t *src_ptr, uint16_t *dst_ptr, int src_stride, int pixel_step,
    int dst_height, int filter_offset, const uint16_t *second_pred) {
  const int block_width = 128;
  highbd_avg_pred_var_filter_block2d_bil_large(
      src_ptr, dst_ptr, src_stride, pixel_step, block_width, dst_height,
      filter_offset, second_pred);
}
452 
// Half-pel filter pass fused with the second-prediction average.
//
// Each 8-sample vector is the rounding average of src_ptr + col and
// src_ptr + col + pixel_step, which is then rounding-averaged again with the
// next 8 samples of second_pred. second_pred is consumed as one contiguous
// stream (advanced by 8 per vector). Source rows are src_stride samples
// apart and destination rows dst_width samples apart. dst_height must be at
// least 1 because the row counter is tested after the loop body.
static void highbd_avg_pred_var_filter_block2d_avg(
    const uint16_t *src_ptr, uint16_t *dst_ptr, int src_stride, int pixel_step,
    int dst_width, int dst_height, const uint16_t *second_pred) {
  // We only specialize on the filter values for large block sizes (>= 16x16.)
  assert(dst_width >= 16 && dst_width % 16 == 0);

  int rows_left = dst_height;
  do {
    int col = 0;
    do {
      const uint16_t *const px = src_ptr + col;
      const uint16x8_t px0 = vld1q_u16(px);
      const uint16x8_t px1 = vld1q_u16(px + pixel_step);
      const uint16x8_t half_pel = vrhaddq_u16(px0, px1);

      const uint16x8_t pred = vld1q_u16(second_pred);
      vst1q_u16(dst_ptr + col, vrhaddq_u16(half_pel, pred));

      second_pred += 8;
      col += 8;
    } while (col < dst_width);

    dst_ptr += dst_width;
    src_ptr += src_stride;
  } while (--rows_left != 0);
}
482 
// Implementation of aom_highbd_comp_avg_pred for blocks having width >= 16.
//
// No sub-pixel filtering is applied: each 8-sample vector of the source is
// rounding-averaged (vrhaddq_u16) with the next 8 samples of second_pred and
// stored. second_pred is consumed as one contiguous stream; source rows are
// src_stride samples apart and destination rows dst_width samples apart.
// dst_height must be at least 1 because the row counter is tested after the
// loop body.
static void highbd_avg_pred(const uint16_t *src_ptr, uint16_t *dst_ptr,
                            int src_stride, int dst_width, int dst_height,
                            const uint16_t *second_pred) {
  // This path is only used for large block sizes (width a multiple of 16).
  assert(dst_width >= 16 && dst_width % 16 == 0);

  int rows_left = dst_height;
  do {
    int col = 0;
    do {
      const uint16x8_t px = vld1q_u16(src_ptr + col);
      const uint16x8_t pred = vld1q_u16(second_pred);

      vst1q_u16(dst_ptr + col, vrhaddq_u16(px, pred));

      second_pred += 8;
      col += 8;
    } while (col < dst_width);

    dst_ptr += dst_width;
    src_ptr += src_stride;
  } while (--rows_left != 0);
}
510 
// Generates aom_highbd_<bitdepth>_sub_pixel_avg_variance<w>x<h>_neon().
//
// The generated function always runs both bilinear passes (there is no
// shortcut for offsets 0 or 4):
//   1) a horizontal pass (pixel_step 1) over h + 1 source rows; the extra row
//      is needed because the vertical pass reads the row below each output,
//   2) a vertical pass (pixel_step w) over h rows that also averages the
//      result with second_pred.
// The resulting w x h block, stored with a stride of w, is handed to
// aom_highbd_<bitdepth>_variance<w>x<h>() together with ref, and that
// function's return value is returned. Both scratch buffers live on the stack.
#define HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(bitdepth, w, h)                      \
  uint32_t aom_highbd_##bitdepth##_sub_pixel_avg_variance##w##x##h##_neon(    \
      const uint8_t *src, int src_stride, int xoffset, int yoffset,           \
      const uint8_t *ref, int ref_stride, uint32_t *sse,                      \
      const uint8_t *second_pred) {                                           \
    const uint16_t *const src16 = CONVERT_TO_SHORTPTR(src);                   \
    uint16_t horiz[w * (h + 1)];                                              \
    uint16_t averaged[w * h];                                                 \
                                                                              \
    /* Pass 1: blend each sample with its right-hand neighbour. */            \
    highbd_var_filter_block2d_bil_w##w(src16, horiz, src_stride, 1, (h + 1),  \
                                       xoffset);                              \
    /* Pass 2: vertical blend fused with the second_pred average. */          \
    highbd_avg_pred_var_filter_block2d_bil_w##w(                              \
        horiz, averaged, w, w, h, yoffset, CONVERT_TO_SHORTPTR(second_pred)); \
                                                                              \
    return aom_highbd_##bitdepth##_variance##w##x##h(                         \
        CONVERT_TO_BYTEPTR(averaged), w, ref, ref_stride, sse);               \
  }
528 
529 #define HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(bitdepth, w, h)           \
530   unsigned int aom_highbd_##bitdepth##_sub_pixel_avg_variance##w##x##h##_neon( \
531       const uint8_t *src, int source_stride, int xoffset, int yoffset,         \
532       const uint8_t *ref, int ref_stride, uint32_t *sse,                       \
533       const uint8_t *second_pred) {                                            \
534     uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src);                              \
535                                                                                \
536     if (xoffset == 0) {                                                        \
537       uint16_t tmp[w * h];                                                     \
538       if (yoffset == 0) {                                                      \
539         highbd_avg_pred(src_ptr, tmp, source_stride, w, h,                     \
540                         CONVERT_TO_SHORTPTR(second_pred));                     \
541         return aom_highbd_##bitdepth##_variance##w##x##h(                      \
542             CONVERT_TO_BYTEPTR(tmp), w, ref, ref_stride, sse);                 \
543       } else if (yoffset == 4) {                                               \
544         highbd_avg_pred_var_filter_block2d_avg(                                \
545             src_ptr, tmp, source_stride, source_stride, w, h,                  \
546             CONVERT_TO_SHORTPTR(second_pred));                                 \
547         return aom_highbd_##bitdepth##_variance##w##x##h(                      \
548             CONVERT_TO_BYTEPTR(tmp), w, ref, ref_stride, sse);                 \
549       } else {                                                                 \
550         highbd_avg_pred_var_filter_block2d_bil_w##w(                           \
551             src_ptr, tmp, source_stride, source_stride, h, yoffset,            \
552             CONVERT_TO_SHORTPTR(second_pred));                                 \
553         return aom_highbd_##bitdepth##_variance##w##x##h(                      \
554             CONVERT_TO_BYTEPTR(tmp), w, ref, ref_stride, sse);                 \
555       }                                                                        \
556     } else if (xoffset == 4) {                                                 \
557       uint16_t tmp0[w * (h + 1)];                                              \
558       if (yoffset == 0) {                                                      \
559         highbd_avg_pred_var_filter_block2d_avg(                                \
560             src_ptr, tmp0, source_stride, 1, w, h,                             \
561             CONVERT_TO_SHORTPTR(second_pred));                                 \
562         return aom_highbd_##bitdepth##_variance##w##x##h(                      \
563             CONVERT_TO_BYTEPTR(tmp0), w, ref, ref_stride, sse);                \
564       } else if (yoffset == 4) {                                               \
565         uint16_t tmp1[w * (h + 1)];                                            \
566         highbd_var_filter_block2d_avg(src_ptr, tmp0, source_stride, 1, w,      \
567                                       (h + 1));                                \
568         highbd_avg_pred_var_filter_block2d_avg(                                \
569             tmp0, tmp1, w, w, w, h, CONVERT_TO_SHORTPTR(second_pred));         \
570         return aom_highbd_##bitdepth##_variance##w##x##h(                      \
571             CONVERT_TO_BYTEPTR(tmp1), w, ref, ref_stride, sse);                \
572       } else {                                                                 \
573         uint16_t tmp1[w * (h + 1)];                                            \
574         highbd_var_filter_block2d_avg(src_ptr, tmp0, source_stride, 1, w,      \
575                                       (h + 1));                                \
576         highbd_avg_pred_var_filter_block2d_bil_w##w(                           \
577             tmp0, tmp1, w, w, h, yoffset, CONVERT_TO_SHORTPTR(second_pred));   \
578         return aom_highbd_##bitdepth##_variance##w##x##h(                      \
579             CONVERT_TO_BYTEPTR(tmp1), w, ref, ref_stride, sse);                \
580       }                                                                        \
581     } else {                                                                   \
582       uint16_t tmp0[w * (h + 1)];                                              \
583       if (yoffset == 0) {                                                      \
584         highbd_avg_pred_var_filter_block2d_bil_w##w(                           \
585             src_ptr, tmp0, source_stride, 1, h, xoffset,                       \
586             CONVERT_TO_SHORTPTR(second_pred));                                 \
587         return aom_highbd_##bitdepth##_variance##w##x##h(                      \
588             CONVERT_TO_BYTEPTR(tmp0), w, ref, ref_stride, sse);                \
589       } else if (yoffset == 4) {                                               \
590         uint16_t tmp1[w * h];                                                  \
591         highbd_var_filter_block2d_bil_w##w(src_ptr, tmp0, source_stride, 1,    \
592                                            (h + 1), xoffset);                  \
593         highbd_avg_pred_var_filter_block2d_avg(                                \
594             tmp0, tmp1, w, w, w, h, CONVERT_TO_SHORTPTR(second_pred));         \
595         return aom_highbd_##bitdepth##_variance##w##x##h(                      \
596             CONVERT_TO_BYTEPTR(tmp1), w, ref, ref_stride, sse);                \
597       } else {                                                                 \
598         uint16_t tmp1[w * h];                                                  \
599         highbd_var_filter_block2d_bil_w##w(src_ptr, tmp0, source_stride, 1,    \
600                                            (h + 1), xoffset);                  \
601         highbd_avg_pred_var_filter_block2d_bil_w##w(                           \
602             tmp0, tmp1, w, w, h, yoffset, CONVERT_TO_SHORTPTR(second_pred));   \
603         return aom_highbd_##bitdepth##_variance##w##x##h(                      \
604             CONVERT_TO_BYTEPTR(tmp1), w, ref, ref_stride, sse);                \
605       }                                                                        \
606     }                                                                          \
607   }
608 
609 // 8-bit
// One macro expansion per block size; the arguments read as
// (bit depth, width, height). Widths 4 and 8 go through
// HBD_SUBPEL_AVG_VARIANCE_WXH_NEON, widths of 16 and above go through
// HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON.
610 HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 4, 4)
611 HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 4, 8)
612 
613 HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 8, 4)
614 HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 8, 8)
615 HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 8, 16)
616 
617 HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 16, 8)
618 HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 16, 16)
619 HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 16, 32)
620 
621 HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 32, 16)
622 HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 32, 32)
623 HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 32, 64)
624 
625 HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 64, 32)
626 HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 64, 64)
627 HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 64, 128)
628 
629 HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 128, 64)
630 HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 128, 128)
631 
// The block sizes below are compiled out when CONFIG_REALTIME_ONLY is set.
632 #if !CONFIG_REALTIME_ONLY
633 HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 4, 16)
634 
635 HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 8, 32)
636 
637 HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 16, 4)
638 HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 16, 64)
639 
640 HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 32, 8)
641 
642 HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 64, 16)
643 #endif  // !CONFIG_REALTIME_ONLY
644 
645 // 10-bit
// One macro expansion per block size; the arguments read as
// (bit depth, width, height). Widths 4 and 8 go through
// HBD_SUBPEL_AVG_VARIANCE_WXH_NEON, widths of 16 and above go through
// HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON.
646 HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 4, 4)
647 HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 4, 8)
648 
649 HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 8, 4)
650 HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 8, 8)
651 HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 8, 16)
652 
653 HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 16, 8)
654 HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 16, 16)
655 HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 16, 32)
656 
657 HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 32, 16)
658 HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 32, 32)
659 HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 32, 64)
660 
661 HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 64, 32)
662 HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 64, 64)
663 HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 64, 128)
664 
665 HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 128, 64)
666 HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 128, 128)
667 
// The block sizes below are compiled out when CONFIG_REALTIME_ONLY is set.
668 #if !CONFIG_REALTIME_ONLY
669 HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 4, 16)
670 
671 HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 8, 32)
672 
673 HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 16, 4)
674 HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 16, 64)
675 
676 HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 32, 8)
677 
678 HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 64, 16)
679 #endif  // !CONFIG_REALTIME_ONLY
680 
681 // 12-bit
// One macro expansion per block size; the arguments read as
// (bit depth, width, height). Widths 4 and 8 go through
// HBD_SUBPEL_AVG_VARIANCE_WXH_NEON, widths of 16 and above go through
// HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON.
682 HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 4, 4)
683 HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 4, 8)
684 
685 HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 8, 4)
686 HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 8, 8)
687 HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 8, 16)
688 
689 HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 16, 8)
690 HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 16, 16)
691 HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 16, 32)
692 
693 HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 32, 16)
694 HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 32, 32)
695 HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 32, 64)
696 
697 HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 64, 32)
698 HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 64, 64)
699 HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 64, 128)
700 
701 HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 128, 64)
702 HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 128, 128)
703 
// The block sizes below are compiled out when CONFIG_REALTIME_ONLY is set.
704 #if !CONFIG_REALTIME_ONLY
705 HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 4, 16)
706 
707 HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 8, 32)
708 
709 HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 16, 4)
710 HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 16, 64)
711 
712 HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 32, 8)
713 
714 HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 64, 16)
715 #endif  // !CONFIG_REALTIME_ONLY
716 
// Generic masked sub-pixel variance.
//
// Defines aom_highbd_<bitdepth>_masked_sub_pixel_variance<w>x<h>_neon(). The
// source block is always run through both bilinear passes (xoffset first with
// a pixel step of 1, then yoffset with a pixel step of one row), the result is
// handed to aom_highbd_comp_mask_pred_neon() together with second_pred, msk,
// msk_stride and invert_mask, and that output is compared against ref by
// aom_highbd_<bitdepth>_variance<w>x<h>(), which supplies the return value and
// writes *sse.
#define HBD_MASKED_SUBPEL_VARIANCE_WXH_NEON(bitdepth, w, h)                    \
  unsigned int                                                                 \
      aom_highbd_##bitdepth##_masked_sub_pixel_variance##w##x##h##_neon(       \
          const uint8_t *src, int src_stride, int xoffset, int yoffset,        \
          const uint8_t *ref, int ref_stride, const uint8_t *second_pred,      \
          const uint8_t *msk, int msk_stride, int invert_mask,                 \
          unsigned int *sse) {                                                 \
    uint16_t *const src16 = CONVERT_TO_SHORTPTR(src);                          \
    /* Horizontal output; the extra row feeds the vertical pass. */            \
    uint16_t horiz[w * (h + 1)];                                               \
    /* Output of the vertical pass. */                                         \
    uint16_t filtered[w * (h + 1)];                                            \
    /* Output of aom_highbd_comp_mask_pred_neon(). */                          \
    uint16_t blended[w * h];                                                   \
    highbd_var_filter_block2d_bil_w##w(src16, horiz, src_stride, 1, h + 1,     \
                                       xoffset);                               \
    highbd_var_filter_block2d_bil_w##w(horiz, filtered, w, w, h, yoffset);     \
    aom_highbd_comp_mask_pred_neon(CONVERT_TO_BYTEPTR(blended), second_pred,   \
                                   w, h, CONVERT_TO_BYTEPTR(filtered), w, msk, \
                                   msk_stride, invert_mask);                   \
    return aom_highbd_##bitdepth##_variance##w##x##h(                          \
        CONVERT_TO_BYTEPTR(blended), w, ref, ref_stride, sse);                 \
  }
737 
// Masked sub-pixel variance with dedicated paths for filter offsets 0 and 4.
//
// Defines aom_highbd_<bitdepth>_masked_sub_pixel_variance<w>x<h>_neon().
// xoffset selects the horizontal filter and yoffset the vertical one. An
// offset of 0 skips filtering in that direction, an offset of 4 uses
// highbd_var_filter_block2d_avg(), and any other offset uses the bilinear
// kernel highbd_var_filter_block2d_bil_w<w>(). The filtered block is passed to
// aom_highbd_comp_mask_pred_neon() together with second_pred, msk, msk_stride
// and invert_mask, and that output is compared against ref by
// aom_highbd_<bitdepth>_variance<w>x<h>(), which supplies the return value and
// writes *sse.
//
// Scratch buffers that may receive h + 1 rows (horizontal output feeding a
// vertical pass) hold w * (h + 1) samples; buffers that receive the final
// filter stage or the mask-pred output hold w * h samples.
#define HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(bitdepth, w, h)        \
  unsigned int                                                                 \
      aom_highbd_##bitdepth##_masked_sub_pixel_variance##w##x##h##_neon(       \
          const uint8_t *src, int src_stride, int xoffset, int yoffset,        \
          const uint8_t *ref, int ref_stride, const uint8_t *second_pred,      \
          const uint8_t *msk, int msk_stride, int invert_mask,                 \
          unsigned int *sse) {                                                 \
    uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src);                              \
    if (xoffset == 0) {                                                        \
      /* No horizontal filter: only the vertical offset matters. */            \
      uint16_t tmp0[w * h];                                                    \
      if (yoffset == 0) {                                                      \
        /* No filtering at all: use the source block as is. */                 \
        aom_highbd_comp_mask_pred_neon(CONVERT_TO_BYTEPTR(tmp0), second_pred,  \
                                       w, h, src, src_stride, msk, msk_stride, \
                                       invert_mask);                           \
        return aom_highbd_##bitdepth##_variance##w##x##h(                      \
            CONVERT_TO_BYTEPTR(tmp0), w, ref, ref_stride, sse);                \
      } else if (yoffset == 4) {                                               \
        /* Vertical offset 4: averaging kernel, pixel step of one row. */      \
        uint16_t tmp1[w * h];                                                  \
        highbd_var_filter_block2d_avg(src_ptr, tmp0, src_stride, src_stride,   \
                                      w, h);                                   \
        aom_highbd_comp_mask_pred_neon(CONVERT_TO_BYTEPTR(tmp1), second_pred,  \
                                       w, h, CONVERT_TO_BYTEPTR(tmp0), w, msk, \
                                       msk_stride, invert_mask);               \
        return aom_highbd_##bitdepth##_variance##w##x##h(                      \
            CONVERT_TO_BYTEPTR(tmp1), w, ref, ref_stride, sse);                \
      } else {                                                                 \
        /* General vertical bilinear filter. */                                \
        uint16_t tmp1[w * h];                                                  \
        highbd_var_filter_block2d_bil_w##w(src_ptr, tmp0, src_stride,          \
                                           src_stride, h, yoffset);            \
        aom_highbd_comp_mask_pred_neon(CONVERT_TO_BYTEPTR(tmp1), second_pred,  \
                                       w, h, CONVERT_TO_BYTEPTR(tmp0), w, msk, \
                                       msk_stride, invert_mask);               \
        return aom_highbd_##bitdepth##_variance##w##x##h(                      \
            CONVERT_TO_BYTEPTR(tmp1), w, ref, ref_stride, sse);                \
      }                                                                        \
    } else if (xoffset == 4) {                                                 \
      /* Horizontal offset 4: averaging kernel, pixel step of 1. */            \
      uint16_t tmp0[w * (h + 1)];                                              \
      if (yoffset == 0) {                                                      \
        uint16_t tmp1[w * h];                                                  \
        highbd_var_filter_block2d_avg(src_ptr, tmp0, src_stride, 1, w, h);     \
        aom_highbd_comp_mask_pred_neon(CONVERT_TO_BYTEPTR(tmp1), second_pred,  \
                                       w, h, CONVERT_TO_BYTEPTR(tmp0), w, msk, \
                                       msk_stride, invert_mask);               \
        return aom_highbd_##bitdepth##_variance##w##x##h(                      \
            CONVERT_TO_BYTEPTR(tmp1), w, ref, ref_stride, sse);                \
      } else if (yoffset == 4) {                                               \
        uint16_t tmp1[w * h];                                                  \
        uint16_t tmp2[w * h];                                                  \
        /* Filter h + 1 rows: the vertical pass reads one row below. */        \
        highbd_var_filter_block2d_avg(src_ptr, tmp0, src_stride, 1, w,         \
                                      (h + 1));                                \
        highbd_var_filter_block2d_avg(tmp0, tmp1, w, w, w, h);                 \
        aom_highbd_comp_mask_pred_neon(CONVERT_TO_BYTEPTR(tmp2), second_pred,  \
                                       w, h, CONVERT_TO_BYTEPTR(tmp1), w, msk, \
                                       msk_stride, invert_mask);               \
        return aom_highbd_##bitdepth##_variance##w##x##h(                      \
            CONVERT_TO_BYTEPTR(tmp2), w, ref, ref_stride, sse);                \
      } else {                                                                 \
        uint16_t tmp1[w * h];                                                  \
        uint16_t tmp2[w * h];                                                  \
        highbd_var_filter_block2d_avg(src_ptr, tmp0, src_stride, 1, w,         \
                                      (h + 1));                                \
        highbd_var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset);      \
        aom_highbd_comp_mask_pred_neon(CONVERT_TO_BYTEPTR(tmp2), second_pred,  \
                                       w, h, CONVERT_TO_BYTEPTR(tmp1), w, msk, \
                                       msk_stride, invert_mask);               \
        return aom_highbd_##bitdepth##_variance##w##x##h(                      \
            CONVERT_TO_BYTEPTR(tmp2), w, ref, ref_stride, sse);                \
      }                                                                        \
    } else {                                                                   \
      /* General horizontal bilinear filter. */                                \
      if (yoffset == 0) {                                                      \
        uint16_t tmp0[w * h];                                                  \
        uint16_t tmp1[w * h];                                                  \
        highbd_var_filter_block2d_bil_w##w(src_ptr, tmp0, src_stride, 1, h,    \
                                           xoffset);                           \
        aom_highbd_comp_mask_pred_neon(CONVERT_TO_BYTEPTR(tmp1), second_pred,  \
                                       w, h, CONVERT_TO_BYTEPTR(tmp0), w, msk, \
                                       msk_stride, invert_mask);               \
        return aom_highbd_##bitdepth##_variance##w##x##h(                      \
            CONVERT_TO_BYTEPTR(tmp1), w, ref, ref_stride, sse);                \
      } else if (yoffset == 4) {                                               \
        uint16_t tmp0[w * (h + 1)];                                            \
        uint16_t tmp1[w * h];                                                  \
        uint16_t tmp2[w * h];                                                  \
        highbd_var_filter_block2d_bil_w##w(src_ptr, tmp0, src_stride, 1,       \
                                           (h + 1), xoffset);                  \
        highbd_var_filter_block2d_avg(tmp0, tmp1, w, w, w, h);                 \
        aom_highbd_comp_mask_pred_neon(CONVERT_TO_BYTEPTR(tmp2), second_pred,  \
                                       w, h, CONVERT_TO_BYTEPTR(tmp1), w, msk, \
                                       msk_stride, invert_mask);               \
        return aom_highbd_##bitdepth##_variance##w##x##h(                      \
            CONVERT_TO_BYTEPTR(tmp2), w, ref, ref_stride, sse);                \
      } else {                                                                 \
        uint16_t tmp0[w * (h + 1)];                                            \
        /* tmp1 receives h rows from the vertical pass. */                     \
        uint16_t tmp1[w * h];                                                  \
        uint16_t tmp2[w * h];                                                  \
        highbd_var_filter_block2d_bil_w##w(src_ptr, tmp0, src_stride, 1,       \
                                           (h + 1), xoffset);                  \
        highbd_var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset);      \
        aom_highbd_comp_mask_pred_neon(CONVERT_TO_BYTEPTR(tmp2), second_pred,  \
                                       w, h, CONVERT_TO_BYTEPTR(tmp1), w, msk, \
                                       msk_stride, invert_mask);               \
        return aom_highbd_##bitdepth##_variance##w##x##h(                      \
            CONVERT_TO_BYTEPTR(tmp2), w, ref, ref_stride, sse);                \
      }                                                                        \
    }                                                                          \
  }
844 
845 // 8-bit
// One masked sub-pixel variance function per (bit depth, width, height).
// Widths 4 and 8 use HBD_MASKED_SUBPEL_VARIANCE_WXH_NEON, which always runs
// both bilinear passes; widths of 16 and above use
// HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON, which has dedicated paths
// for filter offsets 0 and 4.
846 HBD_MASKED_SUBPEL_VARIANCE_WXH_NEON(8, 4, 4)
847 HBD_MASKED_SUBPEL_VARIANCE_WXH_NEON(8, 4, 8)
848 
849 HBD_MASKED_SUBPEL_VARIANCE_WXH_NEON(8, 8, 4)
850 HBD_MASKED_SUBPEL_VARIANCE_WXH_NEON(8, 8, 8)
851 HBD_MASKED_SUBPEL_VARIANCE_WXH_NEON(8, 8, 16)
852 
853 HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(8, 16, 8)
854 HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(8, 16, 16)
855 HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(8, 16, 32)
856 
857 HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(8, 32, 16)
858 HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(8, 32, 32)
859 HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(8, 32, 64)
860 
861 HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(8, 64, 32)
862 HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(8, 64, 64)
863 HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(8, 64, 128)
864 
865 HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(8, 128, 64)
866 HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(8, 128, 128)
867 
// The block sizes below are compiled out when CONFIG_REALTIME_ONLY is set.
868 #if !CONFIG_REALTIME_ONLY
869 HBD_MASKED_SUBPEL_VARIANCE_WXH_NEON(8, 4, 16)
870 
871 HBD_MASKED_SUBPEL_VARIANCE_WXH_NEON(8, 8, 32)
872 
873 HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(8, 16, 4)
874 HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(8, 16, 64)
875 
876 HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(8, 32, 8)
877 
878 HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(8, 64, 16)
879 #endif  // !CONFIG_REALTIME_ONLY
880 
881 // 10-bit
// One masked sub-pixel variance function per (bit depth, width, height).
// Widths 4 and 8 use HBD_MASKED_SUBPEL_VARIANCE_WXH_NEON, which always runs
// both bilinear passes; widths of 16 and above use
// HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON, which has dedicated paths
// for filter offsets 0 and 4.
882 HBD_MASKED_SUBPEL_VARIANCE_WXH_NEON(10, 4, 4)
883 HBD_MASKED_SUBPEL_VARIANCE_WXH_NEON(10, 4, 8)
884 
885 HBD_MASKED_SUBPEL_VARIANCE_WXH_NEON(10, 8, 4)
886 HBD_MASKED_SUBPEL_VARIANCE_WXH_NEON(10, 8, 8)
887 HBD_MASKED_SUBPEL_VARIANCE_WXH_NEON(10, 8, 16)
888 
889 HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(10, 16, 8)
890 HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(10, 16, 16)
891 HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(10, 16, 32)
892 
893 HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(10, 32, 16)
894 HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(10, 32, 32)
895 HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(10, 32, 64)
896 
897 HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(10, 64, 32)
898 HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(10, 64, 64)
899 HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(10, 64, 128)
900 
901 HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(10, 128, 64)
902 HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(10, 128, 128)
903 
// The block sizes below are compiled out when CONFIG_REALTIME_ONLY is set.
904 #if !CONFIG_REALTIME_ONLY
905 HBD_MASKED_SUBPEL_VARIANCE_WXH_NEON(10, 4, 16)
906 
907 HBD_MASKED_SUBPEL_VARIANCE_WXH_NEON(10, 8, 32)
908 
909 HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(10, 16, 4)
910 HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(10, 16, 64)
911 
912 HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(10, 32, 8)
913 
914 HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(10, 64, 16)
915 #endif  // !CONFIG_REALTIME_ONLY
916 
917 // 12-bit
// One masked sub-pixel variance function per (bit depth, width, height).
// Widths 4 and 8 use HBD_MASKED_SUBPEL_VARIANCE_WXH_NEON, which always runs
// both bilinear passes; widths of 16 and above use
// HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON, which has dedicated paths
// for filter offsets 0 and 4.
918 HBD_MASKED_SUBPEL_VARIANCE_WXH_NEON(12, 4, 4)
919 HBD_MASKED_SUBPEL_VARIANCE_WXH_NEON(12, 4, 8)
920 
921 HBD_MASKED_SUBPEL_VARIANCE_WXH_NEON(12, 8, 4)
922 HBD_MASKED_SUBPEL_VARIANCE_WXH_NEON(12, 8, 8)
923 HBD_MASKED_SUBPEL_VARIANCE_WXH_NEON(12, 8, 16)
924 
925 HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(12, 16, 8)
926 HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(12, 16, 16)
927 HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(12, 16, 32)
928 
929 HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(12, 32, 16)
930 HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(12, 32, 32)
931 HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(12, 32, 64)
932 
933 HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(12, 64, 32)
934 HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(12, 64, 64)
935 HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(12, 64, 128)
936 
937 HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(12, 128, 64)
938 HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(12, 128, 128)
939 
// The block sizes below are compiled out when CONFIG_REALTIME_ONLY is set.
940 #if !CONFIG_REALTIME_ONLY
941 HBD_MASKED_SUBPEL_VARIANCE_WXH_NEON(12, 4, 16)
942 
943 HBD_MASKED_SUBPEL_VARIANCE_WXH_NEON(12, 8, 32)
944 
945 HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(12, 16, 4)
946 HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(12, 16, 64)
947 
948 HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(12, 32, 8)
949 
950 HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(12, 64, 16)
951 #endif  // !CONFIG_REALTIME_ONLY
952 
953 #if !CONFIG_REALTIME_ONLY
// Generic OBMC sub-pixel variance.
//
// Defines aom_highbd_<bitdepth>_obmc_sub_pixel_variance<w>x<h>_neon(). The
// prediction block is always run through both bilinear passes (xoffset first
// with a pixel step of 1, then yoffset with a pixel step of one row) and the
// result is forwarded, with wsrc, mask and sse, to
// aom_highbd_<bitdepth>_obmc_variance<w>x<h>_neon(), whose return value is
// returned unchanged.
#define HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(bitdepth, w, h)                 \
  unsigned int                                                               \
      aom_highbd_##bitdepth##_obmc_sub_pixel_variance##w##x##h##_neon(       \
          const uint8_t *pre, int pre_stride, int xoffset, int yoffset,      \
          const int32_t *wsrc, const int32_t *mask, unsigned int *sse) {     \
    /* Horizontal output; the extra row feeds the vertical pass. */          \
    uint16_t horiz[w * (h + 1)];                                             \
    /* Output of the vertical pass. */                                       \
    uint16_t interp[w * h];                                                  \
    uint16_t *const pre16 = CONVERT_TO_SHORTPTR(pre);                        \
    highbd_var_filter_block2d_bil_w##w(pre16, horiz, pre_stride, 1, (h + 1), \
                                       xoffset);                             \
    highbd_var_filter_block2d_bil_w##w(horiz, interp, w, w, h, yoffset);     \
    return aom_highbd_##bitdepth##_obmc_variance##w##x##h##_neon(            \
        CONVERT_TO_BYTEPTR(interp), w, wsrc, mask, sse);                     \
  }
968 
// OBMC sub-pixel variance with dedicated paths for filter offsets 0 and 4.
//
// Defines aom_highbd_<bitdepth>_obmc_sub_pixel_variance<w>x<h>_neon().
// xoffset selects the horizontal filter and yoffset the vertical one. An
// offset of 0 skips filtering in that direction, an offset of 4 uses
// highbd_var_filter_block2d_avg(), and any other offset uses the bilinear
// kernel highbd_var_filter_block2d_bil_w<w>(). The filtered block (or pre
// itself when both offsets are 0) is forwarded, with wsrc, mask and sse, to
// aom_highbd_<bitdepth>_obmc_variance<w>x<h>_neon(), whose return value is
// returned unchanged.
//
// tmp0 holds w * (h + 1) samples whenever it may receive the h + 1 rows that
// feed a vertical pass; tmp1 only ever receives h rows and holds w * h.
#define SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(bitdepth, w, h)       \
  unsigned int                                                                 \
      aom_highbd_##bitdepth##_obmc_sub_pixel_variance##w##x##h##_neon(         \
          const uint8_t *pre, int pre_stride, int xoffset, int yoffset,        \
          const int32_t *wsrc, const int32_t *mask, unsigned int *sse) {       \
    uint16_t *pre_ptr = CONVERT_TO_SHORTPTR(pre);                              \
    if (xoffset == 0) {                                                        \
      /* No horizontal filter: only the vertical offset matters. */            \
      if (yoffset == 0) {                                                      \
        /* No filtering at all: use the prediction block as is. */             \
        return aom_highbd_##bitdepth##_obmc_variance##w##x##h##_neon(          \
            pre, pre_stride, wsrc, mask, sse);                                 \
      } else if (yoffset == 4) {                                               \
        /* Vertical offset 4: averaging kernel, pixel step of one row. */      \
        uint16_t tmp[w * h];                                                   \
        highbd_var_filter_block2d_avg(pre_ptr, tmp, pre_stride, pre_stride, w, \
                                      h);                                      \
        return aom_highbd_##bitdepth##_obmc_variance##w##x##h##_neon(          \
            CONVERT_TO_BYTEPTR(tmp), w, wsrc, mask, sse);                      \
      } else {                                                                 \
        /* General vertical bilinear filter. */                                \
        uint16_t tmp[w * h];                                                   \
        highbd_var_filter_block2d_bil_w##w(pre_ptr, tmp, pre_stride,           \
                                           pre_stride, h, yoffset);            \
        return aom_highbd_##bitdepth##_obmc_variance##w##x##h##_neon(          \
            CONVERT_TO_BYTEPTR(tmp), w, wsrc, mask, sse);                      \
      }                                                                        \
    } else if (xoffset == 4) {                                                 \
      /* Horizontal offset 4: averaging kernel, pixel step of 1. */            \
      uint16_t tmp0[w * (h + 1)];                                              \
      if (yoffset == 0) {                                                      \
        highbd_var_filter_block2d_avg(pre_ptr, tmp0, pre_stride, 1, w, h);     \
        return aom_highbd_##bitdepth##_obmc_variance##w##x##h##_neon(          \
            CONVERT_TO_BYTEPTR(tmp0), w, wsrc, mask, sse);                     \
      } else if (yoffset == 4) {                                               \
        /* tmp1 receives h rows from the vertical pass. */                     \
        uint16_t tmp1[w * h];                                                  \
        highbd_var_filter_block2d_avg(pre_ptr, tmp0, pre_stride, 1, w, h + 1); \
        highbd_var_filter_block2d_avg(tmp0, tmp1, w, w, w, h);                 \
        return aom_highbd_##bitdepth##_obmc_variance##w##x##h##_neon(          \
            CONVERT_TO_BYTEPTR(tmp1), w, wsrc, mask, sse);                     \
      } else {                                                                 \
        /* tmp1 receives h rows from the vertical pass. */                     \
        uint16_t tmp1[w * h];                                                  \
        highbd_var_filter_block2d_avg(pre_ptr, tmp0, pre_stride, 1, w, h + 1); \
        highbd_var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset);      \
        return aom_highbd_##bitdepth##_obmc_variance##w##x##h##_neon(          \
            CONVERT_TO_BYTEPTR(tmp1), w, wsrc, mask, sse);                     \
      }                                                                        \
    } else {                                                                   \
      /* General horizontal bilinear filter. */                                \
      uint16_t tmp0[w * (h + 1)];                                              \
      if (yoffset == 0) {                                                      \
        highbd_var_filter_block2d_bil_w##w(pre_ptr, tmp0, pre_stride, 1, h,    \
                                           xoffset);                           \
        return aom_highbd_##bitdepth##_obmc_variance##w##x##h##_neon(          \
            CONVERT_TO_BYTEPTR(tmp0), w, wsrc, mask, sse);                     \
      } else if (yoffset == 4) {                                               \
        uint16_t tmp1[w * h];                                                  \
        highbd_var_filter_block2d_bil_w##w(pre_ptr, tmp0, pre_stride, 1,       \
                                           h + 1, xoffset);                    \
        highbd_var_filter_block2d_avg(tmp0, tmp1, w, w, w, h);                 \
        return aom_highbd_##bitdepth##_obmc_variance##w##x##h##_neon(          \
            CONVERT_TO_BYTEPTR(tmp1), w, wsrc, mask, sse);                     \
      } else {                                                                 \
        uint16_t tmp1[w * h];                                                  \
        highbd_var_filter_block2d_bil_w##w(pre_ptr, tmp0, pre_stride, 1,       \
                                           h + 1, xoffset);                    \
        highbd_var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset);      \
        return aom_highbd_##bitdepth##_obmc_variance##w##x##h##_neon(          \
            CONVERT_TO_BYTEPTR(tmp1), w, wsrc, mask, sse);                     \
      }                                                                        \
    }                                                                          \
  }
1035 
1036 // 8-bit
// One OBMC sub-pixel variance function per (bit depth, width, height).
// Widths 4 and 8, plus 16x4 and 16x8, use HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON,
// which always runs both bilinear passes; 16x16 and larger use
// SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON, which has dedicated paths
// for filter offsets 0 and 4.
1037 HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(8, 4, 4)
1038 HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(8, 4, 8)
1039 HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(8, 4, 16)
1040 
1041 HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(8, 8, 4)
1042 HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(8, 8, 8)
1043 HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(8, 8, 16)
1044 HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(8, 8, 32)
1045 
1046 HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(8, 16, 4)
1047 HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(8, 16, 8)
1048 SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(8, 16, 16)
1049 SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(8, 16, 32)
1050 SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(8, 16, 64)
1051 
1052 SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(8, 32, 8)
1053 SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(8, 32, 16)
1054 SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(8, 32, 32)
1055 SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(8, 32, 64)
1056 
1057 SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(8, 64, 16)
1058 SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(8, 64, 32)
1059 SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(8, 64, 64)
1060 SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(8, 64, 128)
1061 
1062 SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(8, 128, 64)
1063 SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(8, 128, 128)
1064 
1065 // 10-bit
// One OBMC sub-pixel variance function per (bit depth, width, height).
// Widths 4 and 8, plus 16x4 and 16x8, use HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON,
// which always runs both bilinear passes; 16x16 and larger use
// SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON, which has dedicated paths
// for filter offsets 0 and 4.
1066 HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(10, 4, 4)
1067 HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(10, 4, 8)
1068 HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(10, 4, 16)
1069 
1070 HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(10, 8, 4)
1071 HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(10, 8, 8)
1072 HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(10, 8, 16)
1073 HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(10, 8, 32)
1074 
1075 HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(10, 16, 4)
1076 HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(10, 16, 8)
1077 SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(10, 16, 16)
1078 SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(10, 16, 32)
1079 SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(10, 16, 64)
1080 
1081 SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(10, 32, 8)
1082 SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(10, 32, 16)
1083 SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(10, 32, 32)
1084 SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(10, 32, 64)
1085 
1086 SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(10, 64, 16)
1087 SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(10, 64, 32)
1088 SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(10, 64, 64)
1089 SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(10, 64, 128)
1090 
1091 SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(10, 128, 64)
1092 SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(10, 128, 128)
1093 
1094 // 12-bit
// One OBMC sub-pixel variance function per (bit depth, width, height).
// Widths 4 and 8, plus 16x4 and 16x8, use HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON,
// which always runs both bilinear passes; 16x16 and larger use
// SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON, which has dedicated paths
// for filter offsets 0 and 4.
1095 HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(12, 4, 4)
1096 HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(12, 4, 8)
1097 HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(12, 4, 16)
1098 
1099 HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(12, 8, 4)
1100 HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(12, 8, 8)
1101 HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(12, 8, 16)
1102 HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(12, 8, 32)
1103 
1104 HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(12, 16, 4)
1105 HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(12, 16, 8)
1106 SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(12, 16, 16)
1107 SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(12, 16, 32)
1108 SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(12, 16, 64)
1109 
1110 SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(12, 32, 8)
1111 SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(12, 32, 16)
1112 SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(12, 32, 32)
1113 SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(12, 32, 64)
1114 
1115 SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(12, 64, 16)
1116 SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(12, 64, 32)
1117 SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(12, 64, 64)
1118 SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(12, 64, 128)
1119 
1120 SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(12, 128, 64)
1121 SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(12, 128, 128)
1122 #endif  // !CONFIG_REALTIME_ONLY
1123 
highbd_dist_wtd_avg_pred(const uint16_t * src_ptr,uint16_t * dst_ptr,int src_stride,int dst_width,int dst_height,const uint16_t * second_pred,const DIST_WTD_COMP_PARAMS * jcp_param)1124 static void highbd_dist_wtd_avg_pred(const uint16_t *src_ptr, uint16_t *dst_ptr,
1125                                      int src_stride, int dst_width,
1126                                      int dst_height,
1127                                      const uint16_t *second_pred,
1128                                      const DIST_WTD_COMP_PARAMS *jcp_param) {
1129   // We only specialise on the filter values for large block sizes (>= 16x16.)
1130   assert(dst_width >= 16 && dst_width % 16 == 0);
1131   const uint16x8_t fwd_offset = vdupq_n_u16(jcp_param->fwd_offset);
1132   const uint16x8_t bck_offset = vdupq_n_u16(jcp_param->bck_offset);
1133 
1134   int i = dst_height;
1135   do {
1136     int j = 0;
1137     do {
1138       uint16x8_t s = vld1q_u16(src_ptr + j);
1139       uint16x8_t p = vld1q_u16(second_pred);
1140 
1141       uint16x8_t avg = dist_wtd_avg_u16x8(s, p, fwd_offset, bck_offset);
1142 
1143       vst1q_u16(dst_ptr + j, avg);
1144 
1145       second_pred += 8;
1146       j += 8;
1147     } while (j < dst_width);
1148 
1149     src_ptr += src_stride;
1150     dst_ptr += dst_width;
1151   } while (--i != 0);
1152 }
1153 
highbd_dist_wtd_avg_pred_var_filter_block2d_avg(const uint16_t * src_ptr,uint16_t * dst_ptr,int src_stride,int pixel_step,int dst_width,int dst_height,const uint16_t * second_pred,const DIST_WTD_COMP_PARAMS * jcp_param)1154 static void highbd_dist_wtd_avg_pred_var_filter_block2d_avg(
1155     const uint16_t *src_ptr, uint16_t *dst_ptr, int src_stride, int pixel_step,
1156     int dst_width, int dst_height, const uint16_t *second_pred,
1157     const DIST_WTD_COMP_PARAMS *jcp_param) {
1158   // We only specialise on the filter values for large block sizes (>= 16x16.)
1159   assert(dst_width >= 16 && dst_width % 16 == 0);
1160   const uint16x8_t fwd_offset = vdupq_n_u16(jcp_param->fwd_offset);
1161   const uint16x8_t bck_offset = vdupq_n_u16(jcp_param->bck_offset);
1162 
1163   int i = dst_height;
1164   do {
1165     int j = 0;
1166     do {
1167       uint16x8_t s0 = vld1q_u16(src_ptr + j);
1168       uint16x8_t s1 = vld1q_u16(src_ptr + j + pixel_step);
1169       uint16x8_t p = vld1q_u16(second_pred);
1170       uint16x8_t avg = vrhaddq_u16(s0, s1);
1171       avg = dist_wtd_avg_u16x8(avg, p, fwd_offset, bck_offset);
1172 
1173       vst1q_u16(dst_ptr + j, avg);
1174 
1175       second_pred += 8;
1176       j += 8;
1177     } while (j < dst_width);
1178 
1179     src_ptr += src_stride;
1180     dst_ptr += dst_width;
1181   } while (--i != 0);
1182 }
1183 
highbd_dist_wtd_avg_pred_var_filter_block2d_bil_w4(const uint16_t * src_ptr,uint16_t * dst_ptr,int src_stride,int pixel_step,int dst_height,int filter_offset,const uint16_t * second_pred,const DIST_WTD_COMP_PARAMS * jcp_param)1184 static void highbd_dist_wtd_avg_pred_var_filter_block2d_bil_w4(
1185     const uint16_t *src_ptr, uint16_t *dst_ptr, int src_stride, int pixel_step,
1186     int dst_height, int filter_offset, const uint16_t *second_pred,
1187     const DIST_WTD_COMP_PARAMS *jcp_param) {
1188   const uint16x4_t fwd_offset = vdup_n_u16(jcp_param->fwd_offset);
1189   const uint16x4_t bck_offset = vdup_n_u16(jcp_param->bck_offset);
1190   const uint16x4_t f0 = vdup_n_u16(8 - filter_offset);
1191   const uint16x4_t f1 = vdup_n_u16(filter_offset);
1192 
1193   int i = dst_height;
1194   do {
1195     uint16x4_t s0 = load_unaligned_u16_4x1(src_ptr);
1196     uint16x4_t s1 = load_unaligned_u16_4x1(src_ptr + pixel_step);
1197     uint16x4_t p = vld1_u16(second_pred);
1198 
1199     uint16x4_t blend = vmul_u16(s0, f0);
1200     blend = vmla_u16(blend, s1, f1);
1201     blend = vrshr_n_u16(blend, 3);
1202 
1203     uint16x4_t avg = dist_wtd_avg_u16x4(blend, p, fwd_offset, bck_offset);
1204 
1205     vst1_u16(dst_ptr, avg);
1206 
1207     src_ptr += src_stride;
1208     dst_ptr += 4;
1209     second_pred += 4;
1210   } while (--i != 0);
1211 }
1212 
1213 // Combine bilinear filter with aom_dist_wtd_comp_avg_pred for large blocks.
highbd_dist_wtd_avg_pred_var_filter_block2d_bil_large(const uint16_t * src_ptr,uint16_t * dst_ptr,int src_stride,int pixel_step,int dst_width,int dst_height,int filter_offset,const uint16_t * second_pred,const DIST_WTD_COMP_PARAMS * jcp_param)1214 static void highbd_dist_wtd_avg_pred_var_filter_block2d_bil_large(
1215     const uint16_t *src_ptr, uint16_t *dst_ptr, int src_stride, int pixel_step,
1216     int dst_width, int dst_height, int filter_offset,
1217     const uint16_t *second_pred, const DIST_WTD_COMP_PARAMS *jcp_param) {
1218   const uint16x8_t fwd_offset = vdupq_n_u16(jcp_param->fwd_offset);
1219   const uint16x8_t bck_offset = vdupq_n_u16(jcp_param->bck_offset);
1220   const uint16x8_t f0 = vdupq_n_u16(8 - filter_offset);
1221   const uint16x8_t f1 = vdupq_n_u16(filter_offset);
1222 
1223   int i = dst_height;
1224   do {
1225     int j = 0;
1226     do {
1227       uint16x8_t s0 = vld1q_u16(src_ptr + j);
1228       uint16x8_t s1 = vld1q_u16(src_ptr + j + pixel_step);
1229       uint16x8_t p = vld1q_u16(second_pred);
1230 
1231       uint16x8_t blend = vmulq_u16(s0, f0);
1232       blend = vmlaq_u16(blend, s1, f1);
1233       blend = vrshrq_n_u16(blend, 3);
1234 
1235       uint16x8_t avg = dist_wtd_avg_u16x8(blend, p, fwd_offset, bck_offset);
1236 
1237       vst1q_u16(dst_ptr + j, avg);
1238 
1239       second_pred += 8;
1240       j += 8;
1241     } while (j < dst_width);
1242 
1243     src_ptr += src_stride;
1244     dst_ptr += dst_width;
1245   } while (--i != 0);
1246 }
1247 
// Width-8 entry point: forwards to the generic bilinear + distance-weighted
// blend routine with the block width fixed at 8.
static void highbd_dist_wtd_avg_pred_var_filter_block2d_bil_w8(
    const uint16_t *src_ptr, uint16_t *dst_ptr, int src_stride, int pixel_step,
    int dst_height, int filter_offset, const uint16_t *second_pred,
    const DIST_WTD_COMP_PARAMS *jcp_param) {
  const int block_width = 8;
  highbd_dist_wtd_avg_pred_var_filter_block2d_bil_large(
      src_ptr, dst_ptr, src_stride, pixel_step, block_width, dst_height,
      filter_offset, second_pred, jcp_param);
}
1256 
1257 // Combine bilinear filter with aom_comp_avg_pred for blocks having width 16.
highbd_dist_wtd_avg_pred_var_filter_block2d_bil_w16(const uint16_t * src_ptr,uint16_t * dst_ptr,int src_stride,int pixel_step,int dst_height,int filter_offset,const uint16_t * second_pred,const DIST_WTD_COMP_PARAMS * jcp_param)1258 static void highbd_dist_wtd_avg_pred_var_filter_block2d_bil_w16(
1259     const uint16_t *src_ptr, uint16_t *dst_ptr, int src_stride, int pixel_step,
1260     int dst_height, int filter_offset, const uint16_t *second_pred,
1261     const DIST_WTD_COMP_PARAMS *jcp_param) {
1262   highbd_dist_wtd_avg_pred_var_filter_block2d_bil_large(
1263       src_ptr, dst_ptr, src_stride, pixel_step, 16, dst_height, filter_offset,
1264       second_pred, jcp_param);
1265 }
1266 
1267 // Combine bilinear filter with aom_comp_avg_pred for blocks having width 32.
highbd_dist_wtd_avg_pred_var_filter_block2d_bil_w32(const uint16_t * src_ptr,uint16_t * dst_ptr,int src_stride,int pixel_step,int dst_height,int filter_offset,const uint16_t * second_pred,const DIST_WTD_COMP_PARAMS * jcp_param)1268 static void highbd_dist_wtd_avg_pred_var_filter_block2d_bil_w32(
1269     const uint16_t *src_ptr, uint16_t *dst_ptr, int src_stride, int pixel_step,
1270     int dst_height, int filter_offset, const uint16_t *second_pred,
1271     const DIST_WTD_COMP_PARAMS *jcp_param) {
1272   highbd_dist_wtd_avg_pred_var_filter_block2d_bil_large(
1273       src_ptr, dst_ptr, src_stride, pixel_step, 32, dst_height, filter_offset,
1274       second_pred, jcp_param);
1275 }
1276 
1277 // Combine bilinear filter with aom_comp_avg_pred for blocks having width 64.
highbd_dist_wtd_avg_pred_var_filter_block2d_bil_w64(const uint16_t * src_ptr,uint16_t * dst_ptr,int src_stride,int pixel_step,int dst_height,int filter_offset,const uint16_t * second_pred,const DIST_WTD_COMP_PARAMS * jcp_param)1278 static void highbd_dist_wtd_avg_pred_var_filter_block2d_bil_w64(
1279     const uint16_t *src_ptr, uint16_t *dst_ptr, int src_stride, int pixel_step,
1280     int dst_height, int filter_offset, const uint16_t *second_pred,
1281     const DIST_WTD_COMP_PARAMS *jcp_param) {
1282   highbd_dist_wtd_avg_pred_var_filter_block2d_bil_large(
1283       src_ptr, dst_ptr, src_stride, pixel_step, 64, dst_height, filter_offset,
1284       second_pred, jcp_param);
1285 }
1286 
1287 // Combine bilinear filter with aom_comp_avg_pred for blocks having width 128.
highbd_dist_wtd_avg_pred_var_filter_block2d_bil_w128(const uint16_t * src_ptr,uint16_t * dst_ptr,int src_stride,int pixel_step,int dst_height,int filter_offset,const uint16_t * second_pred,const DIST_WTD_COMP_PARAMS * jcp_param)1288 static void highbd_dist_wtd_avg_pred_var_filter_block2d_bil_w128(
1289     const uint16_t *src_ptr, uint16_t *dst_ptr, int src_stride, int pixel_step,
1290     int dst_height, int filter_offset, const uint16_t *second_pred,
1291     const DIST_WTD_COMP_PARAMS *jcp_param) {
1292   highbd_dist_wtd_avg_pred_var_filter_block2d_bil_large(
1293       src_ptr, dst_ptr, src_stride, pixel_step, 128, dst_height, filter_offset,
1294       second_pred, jcp_param);
1295 }
1296 
// Defines aom_highbd_<bitdepth>_dist_wtd_sub_pixel_avg_variance<w>x<h>_neon
// using the generic two-pass scheme:
//   1. Bilinear filter along x (pixel step 1) for h + 1 rows into `horiz`.
//   2. Bilinear filter along y (pixel step w) fused with the blend against
//      `second_pred`, producing h rows in `blended`.
// The variance of `blended` against `ref_ptr` is then returned, with the sum
// of squared errors written through `sse`.
#define HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(bitdepth, w, h)              \
  unsigned int                                                                 \
      aom_highbd_##bitdepth##_dist_wtd_sub_pixel_avg_variance##w##x##h##_neon( \
          const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, \
          const uint8_t *ref_ptr, int ref_stride, uint32_t *sse,               \
          const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS *jcp_param) { \
    uint16_t *src16 = CONVERT_TO_SHORTPTR(src_ptr);                            \
    uint16_t *pred16 = CONVERT_TO_SHORTPTR(second_pred);                       \
    uint16_t horiz[w * (h + 1)];                                               \
    uint16_t blended[w * h];                                                   \
    highbd_var_filter_block2d_bil_w##w(src16, horiz, source_stride, 1, h + 1,  \
                                       xoffset);                               \
    highbd_dist_wtd_avg_pred_var_filter_block2d_bil_w##w(                      \
        horiz, blended, w, w, h, yoffset, pred16, jcp_param);                  \
    return aom_highbd_##bitdepth##_variance##w##x##h(                          \
        CONVERT_TO_BYTEPTR(blended), w, ref_ptr, ref_stride, sse);             \
  }
1314 
// Defines aom_highbd_<bitdepth>_dist_wtd_sub_pixel_avg_variance<w>x<h>_neon
// with cheaper kernels selected for the 0 and 4 (half-pel) filter offsets.
//
// Buffers:
//   tmp0 - output of the x pass; h + 1 rows, because the y pass reads one row
//          below each output row.
//   tmp1 - blended prediction handed to the variance kernel; exactly h rows.
//
// Kernel selection:
//   yoffset == 0: a single x pass (plain copy for xoffset == 0, two-tap
//                 average for xoffset == 4, bilinear otherwise), fused with
//                 the blend against `second_pred`.
//   xoffset == 0: a single y pass (pixel step == source_stride; average for
//                 yoffset == 4, bilinear otherwise), fused with the blend.
//   otherwise:    x pass into tmp0 (average for xoffset == 4, bilinear
//                 otherwise), then y pass over tmp0 (average for yoffset == 4,
//                 bilinear otherwise) fused with the blend.
//
// Every path writes its final prediction to tmp1, so the variance against
// `ref_ptr` is computed at a single exit point; the sum of squared errors is
// written through `sse`.
#define SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(bitdepth, w, h)  \
  unsigned int                                                                 \
      aom_highbd_##bitdepth##_dist_wtd_sub_pixel_avg_variance##w##x##h##_neon( \
          const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, \
          const uint8_t *ref_ptr, int ref_stride, uint32_t *sse,               \
          const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS *jcp_param) { \
    uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr);                              \
    uint16_t *second = CONVERT_TO_SHORTPTR(second_pred);                       \
    uint16_t tmp0[w * (h + 1)];                                                \
    uint16_t tmp1[w * h];                                                      \
    if (yoffset == 0) {                                                        \
      if (xoffset == 0) {                                                      \
        highbd_dist_wtd_avg_pred(src, tmp1, source_stride, w, h, second,       \
                                 jcp_param);                                   \
      } else if (xoffset == 4) {                                               \
        highbd_dist_wtd_avg_pred_var_filter_block2d_avg(                       \
            src, tmp1, source_stride, 1, w, h, second, jcp_param);             \
      } else {                                                                 \
        highbd_dist_wtd_avg_pred_var_filter_block2d_bil_w##w(                  \
            src, tmp1, source_stride, 1, h, xoffset, second, jcp_param);       \
      }                                                                        \
    } else if (xoffset == 0) {                                                 \
      if (yoffset == 4) {                                                      \
        highbd_dist_wtd_avg_pred_var_filter_block2d_avg(                       \
            src, tmp1, source_stride, source_stride, w, h, second, jcp_param); \
      } else {                                                                 \
        highbd_dist_wtd_avg_pred_var_filter_block2d_bil_w##w(                  \
            src, tmp1, source_stride, source_stride, h, yoffset, second,       \
            jcp_param);                                                        \
      }                                                                        \
    } else {                                                                   \
      if (xoffset == 4) {                                                      \
        highbd_var_filter_block2d_avg(src, tmp0, source_stride, 1, w, h + 1);  \
      } else {                                                                 \
        highbd_var_filter_block2d_bil_w##w(src, tmp0, source_stride, 1, h + 1, \
                                           xoffset);                           \
      }                                                                        \
      if (yoffset == 4) {                                                      \
        highbd_dist_wtd_avg_pred_var_filter_block2d_avg(tmp0, tmp1, w, w, w,   \
                                                        h, second, jcp_param); \
      } else {                                                                 \
        highbd_dist_wtd_avg_pred_var_filter_block2d_bil_w##w(                  \
            tmp0, tmp1, w, w, h, yoffset, second, jcp_param);                  \
      }                                                                        \
    }                                                                          \
    return aom_highbd_##bitdepth##_variance##w##x##h(                          \
        CONVERT_TO_BYTEPTR(tmp1), w, ref_ptr, ref_stride, sse);                \
  }
1390 
1391 // 8-bit
// Each invocation below defines one function named
// aom_highbd_8_dist_wtd_sub_pixel_avg_variance<w>x<h>_neon. Widths 4 and 8 use
// the generic two-pass macro; widths 16 and above use the SPECIALIZED macro.
1392 HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 4, 4)
1393 HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 4, 8)
1394 
1395 HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 8, 4)
1396 HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 8, 8)
1397 HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 8, 16)
1398 
1399 SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 16, 8)
1400 SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 16, 16)
1401 SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 16, 32)
1402 
1403 SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 32, 16)
1404 SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 32, 32)
1405 SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 32, 64)
1406 
1407 SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 64, 32)
1408 SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 64, 64)
1409 SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 64, 128)
1410 
1411 SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 128, 64)
1412 SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 128, 128)
1413 
// The block sizes below are compiled only when CONFIG_REALTIME_ONLY is 0.
1414 #if !CONFIG_REALTIME_ONLY
1415 HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 4, 16)
1416 
1417 HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 8, 32)
1418 
1419 SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 16, 4)
1420 SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 16, 64)
1421 
1422 SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 32, 8)
1423 
1424 SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 64, 16)
1425 #endif  // !CONFIG_REALTIME_ONLY
1426 
1427 // 10-bit
// Each invocation below defines one function named
// aom_highbd_10_dist_wtd_sub_pixel_avg_variance<w>x<h>_neon. Widths 4 and 8
// use the generic two-pass macro; widths 16 and above use the SPECIALIZED
// macro.
1428 HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 4, 4)
1429 HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 4, 8)
1430 
1431 HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 8, 4)
1432 HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 8, 8)
1433 HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 8, 16)
1434 
1435 SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 16, 8)
1436 SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 16, 16)
1437 SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 16, 32)
1438 
1439 SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 32, 16)
1440 SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 32, 32)
1441 SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 32, 64)
1442 
1443 SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 64, 32)
1444 SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 64, 64)
1445 SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 64, 128)
1446 
1447 SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 128, 64)
1448 SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 128, 128)
1449 
// The block sizes below are compiled only when CONFIG_REALTIME_ONLY is 0.
1450 #if !CONFIG_REALTIME_ONLY
1451 HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 4, 16)
1452 
1453 HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 8, 32)
1454 
1455 SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 16, 4)
1456 SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 16, 64)
1457 
1458 SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 32, 8)
1459 
1460 SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 64, 16)
1461 #endif  // !CONFIG_REALTIME_ONLY
1462 
1463 // 12-bit
// Each invocation below defines one function named
// aom_highbd_12_dist_wtd_sub_pixel_avg_variance<w>x<h>_neon. Widths 4 and 8
// use the generic two-pass macro; widths 16 and above use the SPECIALIZED
// macro.
1464 HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 4, 4)
1465 HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 4, 8)
1466 
1467 HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 8, 4)
1468 HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 8, 8)
1469 HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 8, 16)
1470 
1471 SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 16, 8)
1472 SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 16, 16)
1473 SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 16, 32)
1474 
1475 SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 32, 16)
1476 SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 32, 32)
1477 SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 32, 64)
1478 
1479 SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 64, 32)
1480 SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 64, 64)
1481 SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 64, 128)
1482 
1483 SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 128, 64)
1484 SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 128, 128)
1485 
// The block sizes below are compiled only when CONFIG_REALTIME_ONLY is 0.
1486 #if !CONFIG_REALTIME_ONLY
1487 HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 4, 16)
1488 
1489 HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 8, 32)
1490 
1491 SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 16, 4)
1492 SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 16, 64)
1493 
1494 SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 32, 8)
1495 
1496 SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 64, 16)
1497 #endif  // !CONFIG_REALTIME_ONLY
1498