/*
 * Copyright (c) 2023 The WebM project authors. All rights reserved.
 * Copyright (c) 2023, Alliance for Open Media. All rights reserved.
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */

#include <arm_neon.h>
#include <assert.h>

#include "config/aom_config.h"
#include "config/aom_dsp_rtcd.h"

#include "aom_dsp/aom_filter.h"
#include "aom_dsp/arm/dist_wtd_avg_neon.h"
#include "aom_dsp/arm/mem_neon.h"
#include "aom_dsp/arm/sum_neon.h"
#include "aom_dsp/variance.h"

// The bilinear filters look like this:
//
// {{ 128, 0 }, { 112, 16 }, { 96, 32 }, { 80, 48 },
//  { 64, 64 }, { 48, 80 }, { 32, 96 }, { 16, 112 }}
//
// We can factor out the highest common factor, such that the sum of both
// weights will be 8 instead of 128. The benefits of this are two-fold:
//
// 1) We can infer the filter values from the filter_offset parameter in the
// bilinear filter functions below - we don't have to actually load the values
// from memory:
// f0 = 8 - filter_offset
// f1 = filter_offset
//
// 2) Scaling the pixel values by 8 instead of 128 enables us to operate on
// 16-bit data types at all times, rather than widening out to 32-bit and
// requiring double the number of data processing instructions. (12-bit * 8 =
// 15-bit.)

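// For example, with filter_offset == 3 the kernels below compute, per pixel:
//   out = (5 * s0 + 3 * s1 + 4) >> 3
// where the "+ 4" and ">> 3" come from the rounding shift performed by the
// vrshr(q)_n_u16(blend, 3) calls.
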
// Process a block exactly 4 wide and any height.
static void highbd_var_filter_block2d_bil_w4(const uint16_t *src_ptr,
                                             uint16_t *dst_ptr, int src_stride,
                                             int pixel_step, int dst_height,
                                             int filter_offset) {
  const uint16x4_t f0 = vdup_n_u16(8 - filter_offset);
  const uint16x4_t f1 = vdup_n_u16(filter_offset);

  int i = dst_height;
  do {
    uint16x4_t s0 = load_unaligned_u16_4x1(src_ptr);
    uint16x4_t s1 = load_unaligned_u16_4x1(src_ptr + pixel_step);

    uint16x4_t blend = vmul_u16(s0, f0);
    blend = vmla_u16(blend, s1, f1);
    blend = vrshr_n_u16(blend, 3);

    vst1_u16(dst_ptr, blend);

    src_ptr += src_stride;
    dst_ptr += 4;
  } while (--i != 0);
}

// Process a block which is a multiple of 8 wide and any height.
static void highbd_var_filter_block2d_bil_large(const uint16_t *src_ptr,
                                                uint16_t *dst_ptr,
                                                int src_stride, int pixel_step,
                                                int dst_width, int dst_height,
                                                int filter_offset) {
  const uint16x8_t f0 = vdupq_n_u16(8 - filter_offset);
  const uint16x8_t f1 = vdupq_n_u16(filter_offset);

  int i = dst_height;
  do {
    int j = 0;
    do {
      uint16x8_t s0 = vld1q_u16(src_ptr + j);
      uint16x8_t s1 = vld1q_u16(src_ptr + j + pixel_step);

      uint16x8_t blend = vmulq_u16(s0, f0);
      blend = vmlaq_u16(blend, s1, f1);
      blend = vrshrq_n_u16(blend, 3);

      vst1q_u16(dst_ptr + j, blend);

      j += 8;
    } while (j < dst_width);

    src_ptr += src_stride;
    dst_ptr += dst_width;
  } while (--i != 0);
}

static void highbd_var_filter_block2d_bil_w8(const uint16_t *src_ptr,
                                             uint16_t *dst_ptr, int src_stride,
                                             int pixel_step, int dst_height,
                                             int filter_offset) {
  highbd_var_filter_block2d_bil_large(src_ptr, dst_ptr, src_stride, pixel_step,
                                      8, dst_height, filter_offset);
}

static void highbd_var_filter_block2d_bil_w16(const uint16_t *src_ptr,
                                              uint16_t *dst_ptr,
                                              int src_stride, int pixel_step,
                                              int dst_height,
                                              int filter_offset) {
  highbd_var_filter_block2d_bil_large(src_ptr, dst_ptr, src_stride, pixel_step,
                                      16, dst_height, filter_offset);
}

static void highbd_var_filter_block2d_bil_w32(const uint16_t *src_ptr,
                                              uint16_t *dst_ptr,
                                              int src_stride, int pixel_step,
                                              int dst_height,
                                              int filter_offset) {
  highbd_var_filter_block2d_bil_large(src_ptr, dst_ptr, src_stride, pixel_step,
                                      32, dst_height, filter_offset);
}

static void highbd_var_filter_block2d_bil_w64(const uint16_t *src_ptr,
                                              uint16_t *dst_ptr,
                                              int src_stride, int pixel_step,
                                              int dst_height,
                                              int filter_offset) {
  highbd_var_filter_block2d_bil_large(src_ptr, dst_ptr, src_stride, pixel_step,
                                      64, dst_height, filter_offset);
}

static void highbd_var_filter_block2d_bil_w128(const uint16_t *src_ptr,
                                               uint16_t *dst_ptr,
                                               int src_stride, int pixel_step,
                                               int dst_height,
                                               int filter_offset) {
  highbd_var_filter_block2d_bil_large(src_ptr, dst_ptr, src_stride, pixel_step,
                                      128, dst_height, filter_offset);
}

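// Note: vrhaddq_u16(s0, s1) computes (s0 + s1 + 1) >> 1, so the half-pel
// (offset == 4) paths reduce to a single rounding halving add per vector
// instead of the full multiply-accumulate bilinear blend.
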
static void highbd_var_filter_block2d_avg(const uint16_t *src_ptr,
                                          uint16_t *dst_ptr, int src_stride,
                                          int pixel_step, int dst_width,
                                          int dst_height) {
  int i = dst_height;

  // We only specialize on the filter values for large block sizes (>= 16x16.)
  assert(dst_width >= 16 && dst_width % 16 == 0);

  do {
    int j = 0;
    do {
      uint16x8_t s0 = vld1q_u16(src_ptr + j);
      uint16x8_t s1 = vld1q_u16(src_ptr + j + pixel_step);
      uint16x8_t avg = vrhaddq_u16(s0, s1);
      vst1q_u16(dst_ptr + j, avg);

      j += 8;
    } while (j < dst_width);

    src_ptr += src_stride;
    dst_ptr += dst_width;
  } while (--i != 0);
}

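// Each sub-pixel variance function below applies the bilinear filter in two
// passes: first horizontally, over h + 1 rows so that the second pass has one
// row of look-ahead, then vertically, before handing the filtered block to
// the corresponding highbd variance kernel.
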
#define HBD_SUBPEL_VARIANCE_WXH_NEON(bitdepth, w, h) \
  unsigned int aom_highbd_##bitdepth##_sub_pixel_variance##w##x##h##_neon( \
      const uint8_t *src, int src_stride, int xoffset, int yoffset, \
      const uint8_t *ref, int ref_stride, uint32_t *sse) { \
    uint16_t tmp0[w * (h + 1)]; \
    uint16_t tmp1[w * h]; \
    uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src); \
\
    highbd_var_filter_block2d_bil_w##w(src_ptr, tmp0, src_stride, 1, (h + 1), \
                                       xoffset); \
    highbd_var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset); \
\
    return aom_highbd_##bitdepth##_variance##w##x##h(CONVERT_TO_BYTEPTR(tmp1), \
                                                     w, ref, ref_stride, sse); \
  }

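// Specialized version for when xoffset and/or yoffset is 0 or 4 (full-pel and
// exact half-pel): an offset of 0 skips that filter pass entirely, an offset
// of 4 uses the cheaper rounding-halving-add path (the weights are equal),
// and any other offset falls back to the general bilinear filter.
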
#define HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(bitdepth, w, h) \
  unsigned int aom_highbd_##bitdepth##_sub_pixel_variance##w##x##h##_neon( \
      const uint8_t *src, int src_stride, int xoffset, int yoffset, \
      const uint8_t *ref, int ref_stride, unsigned int *sse) { \
    uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src); \
\
    if (xoffset == 0) { \
      if (yoffset == 0) { \
        return aom_highbd_##bitdepth##_variance##w##x##h( \
            CONVERT_TO_BYTEPTR(src_ptr), src_stride, ref, ref_stride, sse); \
      } else if (yoffset == 4) { \
        uint16_t tmp[w * h]; \
        highbd_var_filter_block2d_avg(src_ptr, tmp, src_stride, src_stride, w, \
                                      h); \
        return aom_highbd_##bitdepth##_variance##w##x##h( \
            CONVERT_TO_BYTEPTR(tmp), w, ref, ref_stride, sse); \
      } else { \
        uint16_t tmp[w * h]; \
        highbd_var_filter_block2d_bil_w##w(src_ptr, tmp, src_stride, \
                                           src_stride, h, yoffset); \
        return aom_highbd_##bitdepth##_variance##w##x##h( \
            CONVERT_TO_BYTEPTR(tmp), w, ref, ref_stride, sse); \
      } \
    } else if (xoffset == 4) { \
      uint16_t tmp0[w * (h + 1)]; \
      if (yoffset == 0) { \
        highbd_var_filter_block2d_avg(src_ptr, tmp0, src_stride, 1, w, h); \
        return aom_highbd_##bitdepth##_variance##w##x##h( \
            CONVERT_TO_BYTEPTR(tmp0), w, ref, ref_stride, sse); \
      } else if (yoffset == 4) { \
        uint16_t tmp1[w * (h + 1)]; \
        highbd_var_filter_block2d_avg(src_ptr, tmp0, src_stride, 1, w, \
                                      (h + 1)); \
        highbd_var_filter_block2d_avg(tmp0, tmp1, w, w, w, h); \
        return aom_highbd_##bitdepth##_variance##w##x##h( \
            CONVERT_TO_BYTEPTR(tmp1), w, ref, ref_stride, sse); \
      } else { \
        uint16_t tmp1[w * (h + 1)]; \
        highbd_var_filter_block2d_avg(src_ptr, tmp0, src_stride, 1, w, \
                                      (h + 1)); \
        highbd_var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset); \
        return aom_highbd_##bitdepth##_variance##w##x##h( \
            CONVERT_TO_BYTEPTR(tmp1), w, ref, ref_stride, sse); \
      } \
    } else { \
      uint16_t tmp0[w * (h + 1)]; \
      if (yoffset == 0) { \
        highbd_var_filter_block2d_bil_w##w(src_ptr, tmp0, src_stride, 1, h, \
                                           xoffset); \
        return aom_highbd_##bitdepth##_variance##w##x##h( \
            CONVERT_TO_BYTEPTR(tmp0), w, ref, ref_stride, sse); \
      } else if (yoffset == 4) { \
        uint16_t tmp1[w * h]; \
        highbd_var_filter_block2d_bil_w##w(src_ptr, tmp0, src_stride, 1, \
                                           (h + 1), xoffset); \
        highbd_var_filter_block2d_avg(tmp0, tmp1, w, w, w, h); \
        return aom_highbd_##bitdepth##_variance##w##x##h( \
            CONVERT_TO_BYTEPTR(tmp1), w, ref, ref_stride, sse); \
      } else { \
        uint16_t tmp1[w * h]; \
        highbd_var_filter_block2d_bil_w##w(src_ptr, tmp0, src_stride, 1, \
                                           (h + 1), xoffset); \
        highbd_var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset); \
        return aom_highbd_##bitdepth##_variance##w##x##h( \
            CONVERT_TO_BYTEPTR(tmp1), w, ref, ref_stride, sse); \
      } \
    } \
  }

// 8-bit
HBD_SUBPEL_VARIANCE_WXH_NEON(8, 4, 4)
HBD_SUBPEL_VARIANCE_WXH_NEON(8, 4, 8)

HBD_SUBPEL_VARIANCE_WXH_NEON(8, 8, 4)
HBD_SUBPEL_VARIANCE_WXH_NEON(8, 8, 8)
HBD_SUBPEL_VARIANCE_WXH_NEON(8, 8, 16)

HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(8, 16, 8)
HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(8, 16, 16)
HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(8, 16, 32)

HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(8, 32, 16)
HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(8, 32, 32)
HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(8, 32, 64)

HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(8, 64, 32)
HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(8, 64, 64)
HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(8, 64, 128)

HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(8, 128, 64)
HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(8, 128, 128)

#if !CONFIG_REALTIME_ONLY
HBD_SUBPEL_VARIANCE_WXH_NEON(8, 4, 16)

HBD_SUBPEL_VARIANCE_WXH_NEON(8, 8, 32)

HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(8, 16, 4)
HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(8, 16, 64)

HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(8, 32, 8)

HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(8, 64, 16)
#endif  // !CONFIG_REALTIME_ONLY

// 10-bit
HBD_SUBPEL_VARIANCE_WXH_NEON(10, 4, 4)
HBD_SUBPEL_VARIANCE_WXH_NEON(10, 4, 8)

HBD_SUBPEL_VARIANCE_WXH_NEON(10, 8, 4)
HBD_SUBPEL_VARIANCE_WXH_NEON(10, 8, 8)
HBD_SUBPEL_VARIANCE_WXH_NEON(10, 8, 16)

HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(10, 16, 8)
HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(10, 16, 16)
HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(10, 16, 32)

HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(10, 32, 16)
HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(10, 32, 32)
HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(10, 32, 64)

HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(10, 64, 32)
HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(10, 64, 64)
HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(10, 64, 128)

HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(10, 128, 64)
HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(10, 128, 128)

#if !CONFIG_REALTIME_ONLY
HBD_SUBPEL_VARIANCE_WXH_NEON(10, 4, 16)

HBD_SUBPEL_VARIANCE_WXH_NEON(10, 8, 32)

HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(10, 16, 4)
HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(10, 16, 64)

HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(10, 32, 8)

HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(10, 64, 16)
#endif  // !CONFIG_REALTIME_ONLY

// 12-bit
HBD_SUBPEL_VARIANCE_WXH_NEON(12, 4, 4)
HBD_SUBPEL_VARIANCE_WXH_NEON(12, 4, 8)

HBD_SUBPEL_VARIANCE_WXH_NEON(12, 8, 4)
HBD_SUBPEL_VARIANCE_WXH_NEON(12, 8, 8)
HBD_SUBPEL_VARIANCE_WXH_NEON(12, 8, 16)

HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(12, 16, 8)
HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(12, 16, 16)
HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(12, 16, 32)

HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(12, 32, 16)
HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(12, 32, 32)
HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(12, 32, 64)

HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(12, 64, 32)
HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(12, 64, 64)
HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(12, 64, 128)

HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(12, 128, 64)
HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(12, 128, 128)

#if !CONFIG_REALTIME_ONLY
HBD_SUBPEL_VARIANCE_WXH_NEON(12, 4, 16)

HBD_SUBPEL_VARIANCE_WXH_NEON(12, 8, 32)

HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(12, 16, 4)
HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(12, 16, 64)

HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(12, 32, 8)

HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(12, 64, 16)
#endif  // !CONFIG_REALTIME_ONLY

// Combine bilinear filter with aom_highbd_comp_avg_pred for blocks having
// width 4.
static void highbd_avg_pred_var_filter_block2d_bil_w4(
    const uint16_t *src_ptr, uint16_t *dst_ptr, int src_stride, int pixel_step,
    int dst_height, int filter_offset, const uint16_t *second_pred) {
  const uint16x4_t f0 = vdup_n_u16(8 - filter_offset);
  const uint16x4_t f1 = vdup_n_u16(filter_offset);

  int i = dst_height;
  do {
    uint16x4_t s0 = load_unaligned_u16_4x1(src_ptr);
    uint16x4_t s1 = load_unaligned_u16_4x1(src_ptr + pixel_step);
    uint16x4_t p = vld1_u16(second_pred);

    uint16x4_t blend = vmul_u16(s0, f0);
    blend = vmla_u16(blend, s1, f1);
    blend = vrshr_n_u16(blend, 3);

    vst1_u16(dst_ptr, vrhadd_u16(blend, p));

    src_ptr += src_stride;
    dst_ptr += 4;
    second_pred += 4;
  } while (--i != 0);
}

// Combine bilinear filter with aom_highbd_comp_avg_pred for large blocks.
static void highbd_avg_pred_var_filter_block2d_bil_large(
    const uint16_t *src_ptr, uint16_t *dst_ptr, int src_stride, int pixel_step,
    int dst_width, int dst_height, int filter_offset,
    const uint16_t *second_pred) {
  const uint16x8_t f0 = vdupq_n_u16(8 - filter_offset);
  const uint16x8_t f1 = vdupq_n_u16(filter_offset);

  int i = dst_height;
  do {
    int j = 0;
    do {
      uint16x8_t s0 = vld1q_u16(src_ptr + j);
      uint16x8_t s1 = vld1q_u16(src_ptr + j + pixel_step);
      uint16x8_t p = vld1q_u16(second_pred);

      uint16x8_t blend = vmulq_u16(s0, f0);
      blend = vmlaq_u16(blend, s1, f1);
      blend = vrshrq_n_u16(blend, 3);

      vst1q_u16(dst_ptr + j, vrhaddq_u16(blend, p));

      j += 8;
      second_pred += 8;
    } while (j < dst_width);

    src_ptr += src_stride;
    dst_ptr += dst_width;
  } while (--i != 0);
}

static void highbd_avg_pred_var_filter_block2d_bil_w8(
    const uint16_t *src_ptr, uint16_t *dst_ptr, int src_stride, int pixel_step,
    int dst_height, int filter_offset, const uint16_t *second_pred) {
  highbd_avg_pred_var_filter_block2d_bil_large(src_ptr, dst_ptr, src_stride,
                                               pixel_step, 8, dst_height,
                                               filter_offset, second_pred);
}

static void highbd_avg_pred_var_filter_block2d_bil_w16(
    const uint16_t *src_ptr, uint16_t *dst_ptr, int src_stride, int pixel_step,
    int dst_height, int filter_offset, const uint16_t *second_pred) {
  highbd_avg_pred_var_filter_block2d_bil_large(src_ptr, dst_ptr, src_stride,
                                               pixel_step, 16, dst_height,
                                               filter_offset, second_pred);
}

static void highbd_avg_pred_var_filter_block2d_bil_w32(
    const uint16_t *src_ptr, uint16_t *dst_ptr, int src_stride, int pixel_step,
    int dst_height, int filter_offset, const uint16_t *second_pred) {
  highbd_avg_pred_var_filter_block2d_bil_large(src_ptr, dst_ptr, src_stride,
                                               pixel_step, 32, dst_height,
                                               filter_offset, second_pred);
}

static void highbd_avg_pred_var_filter_block2d_bil_w64(
    const uint16_t *src_ptr, uint16_t *dst_ptr, int src_stride, int pixel_step,
    int dst_height, int filter_offset, const uint16_t *second_pred) {
  highbd_avg_pred_var_filter_block2d_bil_large(src_ptr, dst_ptr, src_stride,
                                               pixel_step, 64, dst_height,
                                               filter_offset, second_pred);
}

static void highbd_avg_pred_var_filter_block2d_bil_w128(
    const uint16_t *src_ptr, uint16_t *dst_ptr, int src_stride, int pixel_step,
    int dst_height, int filter_offset, const uint16_t *second_pred) {
  highbd_avg_pred_var_filter_block2d_bil_large(src_ptr, dst_ptr, src_stride,
                                               pixel_step, 128, dst_height,
                                               filter_offset, second_pred);
}

// Combine averaging subpel filter with aom_highbd_comp_avg_pred.
static void highbd_avg_pred_var_filter_block2d_avg(
    const uint16_t *src_ptr, uint16_t *dst_ptr, int src_stride, int pixel_step,
    int dst_width, int dst_height, const uint16_t *second_pred) {
  int i = dst_height;

  // We only specialize on the filter values for large block sizes (>= 16x16.)
  assert(dst_width >= 16 && dst_width % 16 == 0);

  do {
    int j = 0;
    do {
      uint16x8_t s0 = vld1q_u16(src_ptr + j);
      uint16x8_t s1 = vld1q_u16(src_ptr + j + pixel_step);
      uint16x8_t avg = vrhaddq_u16(s0, s1);

      uint16x8_t p = vld1q_u16(second_pred);
      avg = vrhaddq_u16(avg, p);

      vst1q_u16(dst_ptr + j, avg);

      j += 8;
      second_pred += 8;
    } while (j < dst_width);

    src_ptr += src_stride;
    dst_ptr += dst_width;
  } while (--i != 0);
}

// Implementation of aom_highbd_comp_avg_pred for blocks having width >= 16.
static void highbd_avg_pred(const uint16_t *src_ptr, uint16_t *dst_ptr,
                            int src_stride, int dst_width, int dst_height,
                            const uint16_t *second_pred) {
  int i = dst_height;

  // We only specialize on the filter values for large block sizes (>= 16x16.)
  assert(dst_width >= 16 && dst_width % 16 == 0);

  do {
    int j = 0;
    do {
      uint16x8_t s = vld1q_u16(src_ptr + j);
      uint16x8_t p = vld1q_u16(second_pred);

      uint16x8_t avg = vrhaddq_u16(s, p);

      vst1q_u16(dst_ptr + j, avg);

      j += 8;
      second_pred += 8;
    } while (j < dst_width);

    src_ptr += src_stride;
    dst_ptr += dst_width;
  } while (--i != 0);
}

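// The "avg" variants below additionally average the sub-pixel interpolated
// block with second_pred (compound prediction) before computing the variance,
// folding aom_highbd_comp_avg_pred into the final filtering pass.
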
#define HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(bitdepth, w, h) \
  uint32_t aom_highbd_##bitdepth##_sub_pixel_avg_variance##w##x##h##_neon( \
      const uint8_t *src, int src_stride, int xoffset, int yoffset, \
      const uint8_t *ref, int ref_stride, uint32_t *sse, \
      const uint8_t *second_pred) { \
    uint16_t tmp0[w * (h + 1)]; \
    uint16_t tmp1[w * h]; \
    uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src); \
\
    highbd_var_filter_block2d_bil_w##w(src_ptr, tmp0, src_stride, 1, (h + 1), \
                                       xoffset); \
    highbd_avg_pred_var_filter_block2d_bil_w##w( \
        tmp0, tmp1, w, w, h, yoffset, CONVERT_TO_SHORTPTR(second_pred)); \
\
    return aom_highbd_##bitdepth##_variance##w##x##h(CONVERT_TO_BYTEPTR(tmp1), \
                                                     w, ref, ref_stride, sse); \
  }

#define HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(bitdepth, w, h) \
  unsigned int aom_highbd_##bitdepth##_sub_pixel_avg_variance##w##x##h##_neon( \
      const uint8_t *src, int source_stride, int xoffset, int yoffset, \
      const uint8_t *ref, int ref_stride, uint32_t *sse, \
      const uint8_t *second_pred) { \
    uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src); \
\
    if (xoffset == 0) { \
      uint16_t tmp[w * h]; \
      if (yoffset == 0) { \
        highbd_avg_pred(src_ptr, tmp, source_stride, w, h, \
                        CONVERT_TO_SHORTPTR(second_pred)); \
        return aom_highbd_##bitdepth##_variance##w##x##h( \
            CONVERT_TO_BYTEPTR(tmp), w, ref, ref_stride, sse); \
      } else if (yoffset == 4) { \
        highbd_avg_pred_var_filter_block2d_avg( \
            src_ptr, tmp, source_stride, source_stride, w, h, \
            CONVERT_TO_SHORTPTR(second_pred)); \
        return aom_highbd_##bitdepth##_variance##w##x##h( \
            CONVERT_TO_BYTEPTR(tmp), w, ref, ref_stride, sse); \
      } else { \
        highbd_avg_pred_var_filter_block2d_bil_w##w( \
            src_ptr, tmp, source_stride, source_stride, h, yoffset, \
            CONVERT_TO_SHORTPTR(second_pred)); \
        return aom_highbd_##bitdepth##_variance##w##x##h( \
            CONVERT_TO_BYTEPTR(tmp), w, ref, ref_stride, sse); \
      } \
    } else if (xoffset == 4) { \
      uint16_t tmp0[w * (h + 1)]; \
      if (yoffset == 0) { \
        highbd_avg_pred_var_filter_block2d_avg( \
            src_ptr, tmp0, source_stride, 1, w, h, \
            CONVERT_TO_SHORTPTR(second_pred)); \
        return aom_highbd_##bitdepth##_variance##w##x##h( \
            CONVERT_TO_BYTEPTR(tmp0), w, ref, ref_stride, sse); \
      } else if (yoffset == 4) { \
        uint16_t tmp1[w * (h + 1)]; \
        highbd_var_filter_block2d_avg(src_ptr, tmp0, source_stride, 1, w, \
                                      (h + 1)); \
        highbd_avg_pred_var_filter_block2d_avg( \
            tmp0, tmp1, w, w, w, h, CONVERT_TO_SHORTPTR(second_pred)); \
        return aom_highbd_##bitdepth##_variance##w##x##h( \
            CONVERT_TO_BYTEPTR(tmp1), w, ref, ref_stride, sse); \
      } else { \
        uint16_t tmp1[w * (h + 1)]; \
        highbd_var_filter_block2d_avg(src_ptr, tmp0, source_stride, 1, w, \
                                      (h + 1)); \
        highbd_avg_pred_var_filter_block2d_bil_w##w( \
            tmp0, tmp1, w, w, h, yoffset, CONVERT_TO_SHORTPTR(second_pred)); \
        return aom_highbd_##bitdepth##_variance##w##x##h( \
            CONVERT_TO_BYTEPTR(tmp1), w, ref, ref_stride, sse); \
      } \
    } else { \
      uint16_t tmp0[w * (h + 1)]; \
      if (yoffset == 0) { \
        highbd_avg_pred_var_filter_block2d_bil_w##w( \
            src_ptr, tmp0, source_stride, 1, h, xoffset, \
            CONVERT_TO_SHORTPTR(second_pred)); \
        return aom_highbd_##bitdepth##_variance##w##x##h( \
            CONVERT_TO_BYTEPTR(tmp0), w, ref, ref_stride, sse); \
      } else if (yoffset == 4) { \
        uint16_t tmp1[w * h]; \
        highbd_var_filter_block2d_bil_w##w(src_ptr, tmp0, source_stride, 1, \
                                           (h + 1), xoffset); \
        highbd_avg_pred_var_filter_block2d_avg( \
            tmp0, tmp1, w, w, w, h, CONVERT_TO_SHORTPTR(second_pred)); \
        return aom_highbd_##bitdepth##_variance##w##x##h( \
            CONVERT_TO_BYTEPTR(tmp1), w, ref, ref_stride, sse); \
      } else { \
        uint16_t tmp1[w * h]; \
        highbd_var_filter_block2d_bil_w##w(src_ptr, tmp0, source_stride, 1, \
                                           (h + 1), xoffset); \
        highbd_avg_pred_var_filter_block2d_bil_w##w( \
            tmp0, tmp1, w, w, h, yoffset, CONVERT_TO_SHORTPTR(second_pred)); \
        return aom_highbd_##bitdepth##_variance##w##x##h( \
            CONVERT_TO_BYTEPTR(tmp1), w, ref, ref_stride, sse); \
      } \
    } \
  }

// 8-bit
HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 4, 4)
HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 4, 8)

HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 8, 4)
HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 8, 8)
HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 8, 16)

HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 16, 8)
HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 16, 16)
HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 16, 32)

HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 32, 16)
HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 32, 32)
HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 32, 64)

HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 64, 32)
HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 64, 64)
HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 64, 128)

HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 128, 64)
HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 128, 128)

#if !CONFIG_REALTIME_ONLY
HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 4, 16)

HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 8, 32)

HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 16, 4)
HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 16, 64)

HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 32, 8)

HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 64, 16)
#endif  // !CONFIG_REALTIME_ONLY

// 10-bit
HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 4, 4)
HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 4, 8)

HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 8, 4)
HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 8, 8)
HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 8, 16)

HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 16, 8)
HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 16, 16)
HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 16, 32)

HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 32, 16)
HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 32, 32)
HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 32, 64)

HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 64, 32)
HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 64, 64)
HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 64, 128)

HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 128, 64)
HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 128, 128)

#if !CONFIG_REALTIME_ONLY
HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 4, 16)

HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 8, 32)

HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 16, 4)
HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 16, 64)

HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 32, 8)

HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 64, 16)
#endif  // !CONFIG_REALTIME_ONLY

// 12-bit
HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 4, 4)
HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 4, 8)

HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 8, 4)
HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 8, 8)
HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 8, 16)

HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 16, 8)
HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 16, 16)
HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 16, 32)

HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 32, 16)
HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 32, 32)
HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 32, 64)

HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 64, 32)
HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 64, 64)
HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 64, 128)

HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 128, 64)
HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 128, 128)

#if !CONFIG_REALTIME_ONLY
HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 4, 16)

HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 8, 32)

HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 16, 4)
HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 16, 64)

HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 32, 8)

HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 64, 16)
#endif  // !CONFIG_REALTIME_ONLY

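// The masked variants compute the sub-pixel interpolated block as above and
// then blend it with second_pred via aom_highbd_comp_mask_pred_neon (under
// msk/invert_mask) before the variance is taken.
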
#define HBD_MASKED_SUBPEL_VARIANCE_WXH_NEON(bitdepth, w, h) \
  unsigned int \
  aom_highbd_##bitdepth##_masked_sub_pixel_variance##w##x##h##_neon( \
      const uint8_t *src, int src_stride, int xoffset, int yoffset, \
      const uint8_t *ref, int ref_stride, const uint8_t *second_pred, \
      const uint8_t *msk, int msk_stride, int invert_mask, \
      unsigned int *sse) { \
    uint16_t tmp0[w * (h + 1)]; \
    uint16_t tmp1[w * (h + 1)]; \
    uint16_t tmp2[w * h]; \
    uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src); \
    highbd_var_filter_block2d_bil_w##w(src_ptr, tmp0, src_stride, 1, (h + 1), \
                                       xoffset); \
    highbd_var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset); \
    aom_highbd_comp_mask_pred_neon(CONVERT_TO_BYTEPTR(tmp2), second_pred, w, \
                                   h, CONVERT_TO_BYTEPTR(tmp1), w, msk, \
                                   msk_stride, invert_mask); \
    return aom_highbd_##bitdepth##_variance##w##x##h(CONVERT_TO_BYTEPTR(tmp2), \
                                                     w, ref, ref_stride, sse); \
  }

#define HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(bitdepth, w, h) \
  unsigned int \
  aom_highbd_##bitdepth##_masked_sub_pixel_variance##w##x##h##_neon( \
      const uint8_t *src, int src_stride, int xoffset, int yoffset, \
      const uint8_t *ref, int ref_stride, const uint8_t *second_pred, \
      const uint8_t *msk, int msk_stride, int invert_mask, \
      unsigned int *sse) { \
    uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src); \
    if (xoffset == 0) { \
      uint16_t tmp0[w * h]; \
      if (yoffset == 0) { \
        aom_highbd_comp_mask_pred_neon(CONVERT_TO_BYTEPTR(tmp0), second_pred, \
                                       w, h, src, src_stride, msk, msk_stride, \
                                       invert_mask); \
        return aom_highbd_##bitdepth##_variance##w##x##h( \
            CONVERT_TO_BYTEPTR(tmp0), w, ref, ref_stride, sse); \
      } else if (yoffset == 4) { \
        uint16_t tmp1[w * h]; \
        highbd_var_filter_block2d_avg(src_ptr, tmp0, src_stride, src_stride, \
                                      w, h); \
        aom_highbd_comp_mask_pred_neon(CONVERT_TO_BYTEPTR(tmp1), second_pred, \
                                       w, h, CONVERT_TO_BYTEPTR(tmp0), w, msk, \
                                       msk_stride, invert_mask); \
        return aom_highbd_##bitdepth##_variance##w##x##h( \
            CONVERT_TO_BYTEPTR(tmp1), w, ref, ref_stride, sse); \
      } else { \
        uint16_t tmp1[w * h]; \
        highbd_var_filter_block2d_bil_w##w(src_ptr, tmp0, src_stride, \
                                           src_stride, h, yoffset); \
        aom_highbd_comp_mask_pred_neon(CONVERT_TO_BYTEPTR(tmp1), second_pred, \
                                       w, h, CONVERT_TO_BYTEPTR(tmp0), w, msk, \
                                       msk_stride, invert_mask); \
        return aom_highbd_##bitdepth##_variance##w##x##h( \
            CONVERT_TO_BYTEPTR(tmp1), w, ref, ref_stride, sse); \
      } \
    } else if (xoffset == 4) { \
      uint16_t tmp0[w * (h + 1)]; \
      if (yoffset == 0) { \
        uint16_t tmp1[w * h]; \
        highbd_var_filter_block2d_avg(src_ptr, tmp0, src_stride, 1, w, h); \
        aom_highbd_comp_mask_pred_neon(CONVERT_TO_BYTEPTR(tmp1), second_pred, \
                                       w, h, CONVERT_TO_BYTEPTR(tmp0), w, msk, \
                                       msk_stride, invert_mask); \
        return aom_highbd_##bitdepth##_variance##w##x##h( \
            CONVERT_TO_BYTEPTR(tmp1), w, ref, ref_stride, sse); \
      } else if (yoffset == 4) { \
        uint16_t tmp1[w * h]; \
        uint16_t tmp2[w * h]; \
        highbd_var_filter_block2d_avg(src_ptr, tmp0, src_stride, 1, w, \
                                      (h + 1)); \
        highbd_var_filter_block2d_avg(tmp0, tmp1, w, w, w, h); \
        aom_highbd_comp_mask_pred_neon(CONVERT_TO_BYTEPTR(tmp2), second_pred, \
                                       w, h, CONVERT_TO_BYTEPTR(tmp1), w, msk, \
                                       msk_stride, invert_mask); \
        return aom_highbd_##bitdepth##_variance##w##x##h( \
            CONVERT_TO_BYTEPTR(tmp2), w, ref, ref_stride, sse); \
      } else { \
        uint16_t tmp1[w * h]; \
        uint16_t tmp2[w * h]; \
        highbd_var_filter_block2d_avg(src_ptr, tmp0, src_stride, 1, w, \
                                      (h + 1)); \
        highbd_var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset); \
        aom_highbd_comp_mask_pred_neon(CONVERT_TO_BYTEPTR(tmp2), second_pred, \
                                       w, h, CONVERT_TO_BYTEPTR(tmp1), w, msk, \
                                       msk_stride, invert_mask); \
        return aom_highbd_##bitdepth##_variance##w##x##h( \
            CONVERT_TO_BYTEPTR(tmp2), w, ref, ref_stride, sse); \
      } \
    } else { \
      if (yoffset == 0) { \
        uint16_t tmp0[w * h]; \
        uint16_t tmp1[w * h]; \
        highbd_var_filter_block2d_bil_w##w(src_ptr, tmp0, src_stride, 1, h, \
                                           xoffset); \
        aom_highbd_comp_mask_pred_neon(CONVERT_TO_BYTEPTR(tmp1), second_pred, \
                                       w, h, CONVERT_TO_BYTEPTR(tmp0), w, msk, \
                                       msk_stride, invert_mask); \
        return aom_highbd_##bitdepth##_variance##w##x##h( \
            CONVERT_TO_BYTEPTR(tmp1), w, ref, ref_stride, sse); \
      } else if (yoffset == 4) { \
        uint16_t tmp0[w * (h + 1)]; \
        uint16_t tmp1[w * h]; \
        uint16_t tmp2[w * h]; \
        highbd_var_filter_block2d_bil_w##w(src_ptr, tmp0, src_stride, 1, \
                                           (h + 1), xoffset); \
        highbd_var_filter_block2d_avg(tmp0, tmp1, w, w, w, h); \
        aom_highbd_comp_mask_pred_neon(CONVERT_TO_BYTEPTR(tmp2), second_pred, \
                                       w, h, CONVERT_TO_BYTEPTR(tmp1), w, msk, \
                                       msk_stride, invert_mask); \
        return aom_highbd_##bitdepth##_variance##w##x##h( \
            CONVERT_TO_BYTEPTR(tmp2), w, ref, ref_stride, sse); \
      } else { \
        uint16_t tmp0[w * (h + 1)]; \
        uint16_t tmp1[w * (h + 1)]; \
        uint16_t tmp2[w * h]; \
        highbd_var_filter_block2d_bil_w##w(src_ptr, tmp0, src_stride, 1, \
                                           (h + 1), xoffset); \
        highbd_var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset); \
        aom_highbd_comp_mask_pred_neon(CONVERT_TO_BYTEPTR(tmp2), second_pred, \
                                       w, h, CONVERT_TO_BYTEPTR(tmp1), w, msk, \
                                       msk_stride, invert_mask); \
        return aom_highbd_##bitdepth##_variance##w##x##h( \
            CONVERT_TO_BYTEPTR(tmp2), w, ref, ref_stride, sse); \
      } \
    } \
  }

// 8-bit
HBD_MASKED_SUBPEL_VARIANCE_WXH_NEON(8, 4, 4)
HBD_MASKED_SUBPEL_VARIANCE_WXH_NEON(8, 4, 8)

HBD_MASKED_SUBPEL_VARIANCE_WXH_NEON(8, 8, 4)
HBD_MASKED_SUBPEL_VARIANCE_WXH_NEON(8, 8, 8)
HBD_MASKED_SUBPEL_VARIANCE_WXH_NEON(8, 8, 16)

HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(8, 16, 8)
HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(8, 16, 16)
HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(8, 16, 32)

HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(8, 32, 16)
HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(8, 32, 32)
HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(8, 32, 64)

HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(8, 64, 32)
HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(8, 64, 64)
HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(8, 64, 128)

HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(8, 128, 64)
HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(8, 128, 128)

#if !CONFIG_REALTIME_ONLY
HBD_MASKED_SUBPEL_VARIANCE_WXH_NEON(8, 4, 16)

HBD_MASKED_SUBPEL_VARIANCE_WXH_NEON(8, 8, 32)

HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(8, 16, 4)
HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(8, 16, 64)

HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(8, 32, 8)

HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(8, 64, 16)
#endif  // !CONFIG_REALTIME_ONLY

// 10-bit
HBD_MASKED_SUBPEL_VARIANCE_WXH_NEON(10, 4, 4)
HBD_MASKED_SUBPEL_VARIANCE_WXH_NEON(10, 4, 8)

HBD_MASKED_SUBPEL_VARIANCE_WXH_NEON(10, 8, 4)
HBD_MASKED_SUBPEL_VARIANCE_WXH_NEON(10, 8, 8)
HBD_MASKED_SUBPEL_VARIANCE_WXH_NEON(10, 8, 16)

HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(10, 16, 8)
HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(10, 16, 16)
HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(10, 16, 32)

HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(10, 32, 16)
HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(10, 32, 32)
HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(10, 32, 64)

HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(10, 64, 32)
HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(10, 64, 64)
HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(10, 64, 128)

HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(10, 128, 64)
HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(10, 128, 128)

#if !CONFIG_REALTIME_ONLY
HBD_MASKED_SUBPEL_VARIANCE_WXH_NEON(10, 4, 16)

HBD_MASKED_SUBPEL_VARIANCE_WXH_NEON(10, 8, 32)

HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(10, 16, 4)
HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(10, 16, 64)

HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(10, 32, 8)

HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(10, 64, 16)
#endif  // !CONFIG_REALTIME_ONLY

// 12-bit
HBD_MASKED_SUBPEL_VARIANCE_WXH_NEON(12, 4, 4)
HBD_MASKED_SUBPEL_VARIANCE_WXH_NEON(12, 4, 8)

HBD_MASKED_SUBPEL_VARIANCE_WXH_NEON(12, 8, 4)
HBD_MASKED_SUBPEL_VARIANCE_WXH_NEON(12, 8, 8)
HBD_MASKED_SUBPEL_VARIANCE_WXH_NEON(12, 8, 16)

HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(12, 16, 8)
HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(12, 16, 16)
HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(12, 16, 32)

HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(12, 32, 16)
HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(12, 32, 32)
HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(12, 32, 64)

HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(12, 64, 32)
HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(12, 64, 64)
HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(12, 64, 128)

HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(12, 128, 64)
HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(12, 128, 128)

#if !CONFIG_REALTIME_ONLY
HBD_MASKED_SUBPEL_VARIANCE_WXH_NEON(12, 4, 16)

HBD_MASKED_SUBPEL_VARIANCE_WXH_NEON(12, 8, 32)

HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(12, 16, 4)
HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(12, 16, 64)

HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(12, 32, 8)

HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(12, 64, 16)
#endif  // !CONFIG_REALTIME_ONLY

#if !CONFIG_REALTIME_ONLY
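// OBMC (overlapped block motion compensation) sub-pixel variance: the
// interpolated prediction is compared against the weighted source (wsrc)
// under the OBMC mask rather than against a plain reference block.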
#define HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(bitdepth, w, h) \
  unsigned int \
  aom_highbd_##bitdepth##_obmc_sub_pixel_variance##w##x##h##_neon( \
      const uint8_t *pre, int pre_stride, int xoffset, int yoffset, \
      const int32_t *wsrc, const int32_t *mask, unsigned int *sse) { \
    uint16_t *pre_ptr = CONVERT_TO_SHORTPTR(pre); \
    uint16_t tmp0[w * (h + 1)]; \
    uint16_t tmp1[w * h]; \
    highbd_var_filter_block2d_bil_w##w(pre_ptr, tmp0, pre_stride, 1, h + 1, \
                                       xoffset); \
    highbd_var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset); \
    return aom_highbd_##bitdepth##_obmc_variance##w##x##h##_neon( \
        CONVERT_TO_BYTEPTR(tmp1), w, wsrc, mask, sse); \
  }

#define SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(bitdepth, w, h) \
  unsigned int \
  aom_highbd_##bitdepth##_obmc_sub_pixel_variance##w##x##h##_neon( \
      const uint8_t *pre, int pre_stride, int xoffset, int yoffset, \
      const int32_t *wsrc, const int32_t *mask, unsigned int *sse) { \
    uint16_t *pre_ptr = CONVERT_TO_SHORTPTR(pre); \
    if (xoffset == 0) { \
      if (yoffset == 0) { \
        return aom_highbd_##bitdepth##_obmc_variance##w##x##h##_neon( \
            pre, pre_stride, wsrc, mask, sse); \
      } else if (yoffset == 4) { \
        uint16_t tmp[w * h]; \
        highbd_var_filter_block2d_avg(pre_ptr, tmp, pre_stride, pre_stride, w, \
                                      h); \
        return aom_highbd_##bitdepth##_obmc_variance##w##x##h##_neon( \
            CONVERT_TO_BYTEPTR(tmp), w, wsrc, mask, sse); \
      } else { \
        uint16_t tmp[w * h]; \
        highbd_var_filter_block2d_bil_w##w(pre_ptr, tmp, pre_stride, \
                                           pre_stride, h, yoffset); \
        return aom_highbd_##bitdepth##_obmc_variance##w##x##h##_neon( \
            CONVERT_TO_BYTEPTR(tmp), w, wsrc, mask, sse); \
      } \
    } else if (xoffset == 4) { \
      uint16_t tmp0[w * (h + 1)]; \
      if (yoffset == 0) { \
        highbd_var_filter_block2d_avg(pre_ptr, tmp0, pre_stride, 1, w, h); \
        return aom_highbd_##bitdepth##_obmc_variance##w##x##h##_neon( \
            CONVERT_TO_BYTEPTR(tmp0), w, wsrc, mask, sse); \
      } else if (yoffset == 4) { \
        uint16_t tmp1[w * (h + 1)]; \
        highbd_var_filter_block2d_avg(pre_ptr, tmp0, pre_stride, 1, w, h + 1); \
        highbd_var_filter_block2d_avg(tmp0, tmp1, w, w, w, h); \
        return aom_highbd_##bitdepth##_obmc_variance##w##x##h##_neon( \
            CONVERT_TO_BYTEPTR(tmp1), w, wsrc, mask, sse); \
      } else { \
        uint16_t tmp1[w * (h + 1)]; \
        highbd_var_filter_block2d_avg(pre_ptr, tmp0, pre_stride, 1, w, h + 1); \
        highbd_var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset); \
        return aom_highbd_##bitdepth##_obmc_variance##w##x##h##_neon( \
            CONVERT_TO_BYTEPTR(tmp1), w, wsrc, mask, sse); \
      } \
    } else { \
      uint16_t tmp0[w * (h + 1)]; \
      if (yoffset == 0) { \
        highbd_var_filter_block2d_bil_w##w(pre_ptr, tmp0, pre_stride, 1, h, \
                                           xoffset); \
        return aom_highbd_##bitdepth##_obmc_variance##w##x##h##_neon( \
            CONVERT_TO_BYTEPTR(tmp0), w, wsrc, mask, sse); \
      } else if (yoffset == 4) { \
        uint16_t tmp1[w * h]; \
        highbd_var_filter_block2d_bil_w##w(pre_ptr, tmp0, pre_stride, 1, \
                                           h + 1, xoffset); \
        highbd_var_filter_block2d_avg(tmp0, tmp1, w, w, w, h); \
        return aom_highbd_##bitdepth##_obmc_variance##w##x##h##_neon( \
            CONVERT_TO_BYTEPTR(tmp1), w, wsrc, mask, sse); \
      } else { \
        uint16_t tmp1[w * h]; \
        highbd_var_filter_block2d_bil_w##w(pre_ptr, tmp0, pre_stride, 1, \
                                           h + 1, xoffset); \
        highbd_var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset); \
        return aom_highbd_##bitdepth##_obmc_variance##w##x##h##_neon( \
            CONVERT_TO_BYTEPTR(tmp1), w, wsrc, mask, sse); \
      } \
    } \
  }

// 8-bit
HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(8, 4, 4)
HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(8, 4, 8)
HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(8, 4, 16)

HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(8, 8, 4)
HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(8, 8, 8)
HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(8, 8, 16)
HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(8, 8, 32)

HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(8, 16, 4)
HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(8, 16, 8)
SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(8, 16, 16)
SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(8, 16, 32)
SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(8, 16, 64)

SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(8, 32, 8)
SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(8, 32, 16)
SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(8, 32, 32)
SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(8, 32, 64)

SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(8, 64, 16)
SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(8, 64, 32)
SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(8, 64, 64)
SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(8, 64, 128)

SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(8, 128, 64)
SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(8, 128, 128)

// 10-bit
HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(10, 4, 4)
HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(10, 4, 8)
HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(10, 4, 16)

HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(10, 8, 4)
HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(10, 8, 8)
HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(10, 8, 16)
HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(10, 8, 32)

HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(10, 16, 4)
HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(10, 16, 8)
SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(10, 16, 16)
SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(10, 16, 32)
SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(10, 16, 64)

SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(10, 32, 8)
SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(10, 32, 16)
SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(10, 32, 32)
SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(10, 32, 64)

SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(10, 64, 16)
SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(10, 64, 32)
SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(10, 64, 64)
SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(10, 64, 128)

SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(10, 128, 64)
SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(10, 128, 128)

// 12-bit
HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(12, 4, 4)
HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(12, 4, 8)
HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(12, 4, 16)

HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(12, 8, 4)
HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(12, 8, 8)
HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(12, 8, 16)
HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(12, 8, 32)

HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(12, 16, 4)
HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(12, 16, 8)
SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(12, 16, 16)
SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(12, 16, 32)
SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(12, 16, 64)

SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(12, 32, 8)
SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(12, 32, 16)
SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(12, 32, 32)
SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(12, 32, 64)

SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(12, 64, 16)
SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(12, 64, 32)
SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(12, 64, 64)
SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(12, 64, 128)

SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(12, 128, 64)
SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(12, 128, 128)
#endif  // !CONFIG_REALTIME_ONLY

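// Distance-weighted compound averaging. dist_wtd_avg_u16x4/8() (see
// dist_wtd_avg_neon.h) computes fwd_offset * s + bck_offset * p, rounded and
// narrowed by DIST_PRECISION_BITS; the two offsets come from jcp_param and
// sum to 1 << DIST_PRECISION_BITS, so the result stays in 16 bits.
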
static void highbd_dist_wtd_avg_pred(const uint16_t *src_ptr,
                                     uint16_t *dst_ptr, int src_stride,
                                     int dst_width, int dst_height,
                                     const uint16_t *second_pred,
                                     const DIST_WTD_COMP_PARAMS *jcp_param) {
  // We only specialise on the filter values for large block sizes (>= 16x16.)
  assert(dst_width >= 16 && dst_width % 16 == 0);
  const uint16x8_t fwd_offset = vdupq_n_u16(jcp_param->fwd_offset);
  const uint16x8_t bck_offset = vdupq_n_u16(jcp_param->bck_offset);

  int i = dst_height;
  do {
    int j = 0;
    do {
      uint16x8_t s = vld1q_u16(src_ptr + j);
      uint16x8_t p = vld1q_u16(second_pred);

      uint16x8_t avg = dist_wtd_avg_u16x8(s, p, fwd_offset, bck_offset);

      vst1q_u16(dst_ptr + j, avg);

      second_pred += 8;
      j += 8;
    } while (j < dst_width);

    src_ptr += src_stride;
    dst_ptr += dst_width;
  } while (--i != 0);
}

static void highbd_dist_wtd_avg_pred_var_filter_block2d_avg(
    const uint16_t *src_ptr, uint16_t *dst_ptr, int src_stride, int pixel_step,
    int dst_width, int dst_height, const uint16_t *second_pred,
    const DIST_WTD_COMP_PARAMS *jcp_param) {
  // We only specialise on the filter values for large block sizes (>= 16x16.)
  assert(dst_width >= 16 && dst_width % 16 == 0);
  const uint16x8_t fwd_offset = vdupq_n_u16(jcp_param->fwd_offset);
  const uint16x8_t bck_offset = vdupq_n_u16(jcp_param->bck_offset);

  int i = dst_height;
  do {
    int j = 0;
    do {
      uint16x8_t s0 = vld1q_u16(src_ptr + j);
      uint16x8_t s1 = vld1q_u16(src_ptr + j + pixel_step);
      uint16x8_t p = vld1q_u16(second_pred);
      uint16x8_t avg = vrhaddq_u16(s0, s1);
      avg = dist_wtd_avg_u16x8(avg, p, fwd_offset, bck_offset);

      vst1q_u16(dst_ptr + j, avg);

      second_pred += 8;
      j += 8;
    } while (j < dst_width);

    src_ptr += src_stride;
    dst_ptr += dst_width;
  } while (--i != 0);
}

// Combine bilinear filter with aom_dist_wtd_comp_avg_pred for blocks having
// width 4.
static void highbd_dist_wtd_avg_pred_var_filter_block2d_bil_w4(
    const uint16_t *src_ptr, uint16_t *dst_ptr, int src_stride, int pixel_step,
    int dst_height, int filter_offset, const uint16_t *second_pred,
    const DIST_WTD_COMP_PARAMS *jcp_param) {
  const uint16x4_t fwd_offset = vdup_n_u16(jcp_param->fwd_offset);
  const uint16x4_t bck_offset = vdup_n_u16(jcp_param->bck_offset);
  const uint16x4_t f0 = vdup_n_u16(8 - filter_offset);
  const uint16x4_t f1 = vdup_n_u16(filter_offset);

  int i = dst_height;
  do {
    uint16x4_t s0 = load_unaligned_u16_4x1(src_ptr);
    uint16x4_t s1 = load_unaligned_u16_4x1(src_ptr + pixel_step);
    uint16x4_t p = vld1_u16(second_pred);

    uint16x4_t blend = vmul_u16(s0, f0);
    blend = vmla_u16(blend, s1, f1);
    blend = vrshr_n_u16(blend, 3);

    uint16x4_t avg = dist_wtd_avg_u16x4(blend, p, fwd_offset, bck_offset);

    vst1_u16(dst_ptr, avg);

    src_ptr += src_stride;
    dst_ptr += 4;
    second_pred += 4;
  } while (--i != 0);
}

// Combine bilinear filter with aom_dist_wtd_comp_avg_pred for large blocks.
static void highbd_dist_wtd_avg_pred_var_filter_block2d_bil_large(
    const uint16_t *src_ptr, uint16_t *dst_ptr, int src_stride, int pixel_step,
    int dst_width, int dst_height, int filter_offset,
    const uint16_t *second_pred, const DIST_WTD_COMP_PARAMS *jcp_param) {
  const uint16x8_t fwd_offset = vdupq_n_u16(jcp_param->fwd_offset);
  const uint16x8_t bck_offset = vdupq_n_u16(jcp_param->bck_offset);
  const uint16x8_t f0 = vdupq_n_u16(8 - filter_offset);
  const uint16x8_t f1 = vdupq_n_u16(filter_offset);

  int i = dst_height;
  do {
    int j = 0;
    do {
      uint16x8_t s0 = vld1q_u16(src_ptr + j);
      uint16x8_t s1 = vld1q_u16(src_ptr + j + pixel_step);
      uint16x8_t p = vld1q_u16(second_pred);

      uint16x8_t blend = vmulq_u16(s0, f0);
      blend = vmlaq_u16(blend, s1, f1);
      blend = vrshrq_n_u16(blend, 3);

      uint16x8_t avg = dist_wtd_avg_u16x8(blend, p, fwd_offset, bck_offset);

      vst1q_u16(dst_ptr + j, avg);

      second_pred += 8;
      j += 8;
    } while (j < dst_width);

    src_ptr += src_stride;
    dst_ptr += dst_width;
  } while (--i != 0);
}

// Combine bilinear filter with aom_dist_wtd_comp_avg_pred for blocks having
// width 8.
static void highbd_dist_wtd_avg_pred_var_filter_block2d_bil_w8(
    const uint16_t *src_ptr, uint16_t *dst_ptr, int src_stride, int pixel_step,
    int dst_height, int filter_offset, const uint16_t *second_pred,
    const DIST_WTD_COMP_PARAMS *jcp_param) {
  highbd_dist_wtd_avg_pred_var_filter_block2d_bil_large(
      src_ptr, dst_ptr, src_stride, pixel_step, 8, dst_height, filter_offset,
      second_pred, jcp_param);
}

// Combine bilinear filter with aom_dist_wtd_comp_avg_pred for blocks having
// width 16.
static void highbd_dist_wtd_avg_pred_var_filter_block2d_bil_w16(
    const uint16_t *src_ptr, uint16_t *dst_ptr, int src_stride, int pixel_step,
    int dst_height, int filter_offset, const uint16_t *second_pred,
    const DIST_WTD_COMP_PARAMS *jcp_param) {
  highbd_dist_wtd_avg_pred_var_filter_block2d_bil_large(
      src_ptr, dst_ptr, src_stride, pixel_step, 16, dst_height, filter_offset,
      second_pred, jcp_param);
}

// Combine bilinear filter with aom_dist_wtd_comp_avg_pred for blocks having
// width 32.
static void highbd_dist_wtd_avg_pred_var_filter_block2d_bil_w32(
    const uint16_t *src_ptr, uint16_t *dst_ptr, int src_stride, int pixel_step,
    int dst_height, int filter_offset, const uint16_t *second_pred,
    const DIST_WTD_COMP_PARAMS *jcp_param) {
  highbd_dist_wtd_avg_pred_var_filter_block2d_bil_large(
      src_ptr, dst_ptr, src_stride, pixel_step, 32, dst_height, filter_offset,
      second_pred, jcp_param);
}

// Combine bilinear filter with aom_dist_wtd_comp_avg_pred for blocks having
// width 64.
static void highbd_dist_wtd_avg_pred_var_filter_block2d_bil_w64(
    const uint16_t *src_ptr, uint16_t *dst_ptr, int src_stride, int pixel_step,
    int dst_height, int filter_offset, const uint16_t *second_pred,
    const DIST_WTD_COMP_PARAMS *jcp_param) {
  highbd_dist_wtd_avg_pred_var_filter_block2d_bil_large(
      src_ptr, dst_ptr, src_stride, pixel_step, 64, dst_height, filter_offset,
      second_pred, jcp_param);
}

// Combine bilinear filter with aom_dist_wtd_comp_avg_pred for blocks having
// width 128.
static void highbd_dist_wtd_avg_pred_var_filter_block2d_bil_w128(
    const uint16_t *src_ptr, uint16_t *dst_ptr, int src_stride, int pixel_step,
    int dst_height, int filter_offset, const uint16_t *second_pred,
    const DIST_WTD_COMP_PARAMS *jcp_param) {
  highbd_dist_wtd_avg_pred_var_filter_block2d_bil_large(
      src_ptr, dst_ptr, src_stride, pixel_step, 128, dst_height, filter_offset,
      second_pred, jcp_param);
}

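// Sub-pixel avg variance using the distance-weighted compound average above
// in place of the simple 1:1 average used by the plain "avg" variants.
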
#define HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(bitdepth, w, h) \
  unsigned int \
  aom_highbd_##bitdepth##_dist_wtd_sub_pixel_avg_variance##w##x##h##_neon( \
      const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, \
      const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, \
      const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS *jcp_param) { \
    uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); \
    uint16_t *second = CONVERT_TO_SHORTPTR(second_pred); \
    uint16_t tmp0[w * (h + 1)]; \
    uint16_t tmp1[w * h]; \
    highbd_var_filter_block2d_bil_w##w(src, tmp0, source_stride, 1, h + 1, \
                                       xoffset); \
    highbd_dist_wtd_avg_pred_var_filter_block2d_bil_w##w( \
        tmp0, tmp1, w, w, h, yoffset, second, jcp_param); \
    return aom_highbd_##bitdepth##_variance##w##x##h( \
        CONVERT_TO_BYTEPTR(tmp1), w, ref_ptr, ref_stride, sse); \
  }

#define SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(bitdepth, w, h) \
  unsigned int \
  aom_highbd_##bitdepth##_dist_wtd_sub_pixel_avg_variance##w##x##h##_neon( \
      const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, \
      const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, \
      const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS *jcp_param) { \
    uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); \
    uint16_t *second = CONVERT_TO_SHORTPTR(second_pred); \
    if (xoffset == 0) { \
      uint16_t tmp[w * h]; \
      if (yoffset == 0) { \
        highbd_dist_wtd_avg_pred(src, tmp, source_stride, w, h, second, \
                                 jcp_param); \
        return aom_highbd_##bitdepth##_variance##w##x##h( \
            CONVERT_TO_BYTEPTR(tmp), w, ref_ptr, ref_stride, sse); \
      } else if (yoffset == 4) { \
        highbd_dist_wtd_avg_pred_var_filter_block2d_avg( \
            src, tmp, source_stride, source_stride, w, h, second, jcp_param); \
        return aom_highbd_##bitdepth##_variance##w##x##h( \
            CONVERT_TO_BYTEPTR(tmp), w, ref_ptr, ref_stride, sse); \
      } else { \
        highbd_dist_wtd_avg_pred_var_filter_block2d_bil_w##w( \
            src, tmp, source_stride, source_stride, h, yoffset, second, \
            jcp_param); \
        return aom_highbd_##bitdepth##_variance##w##x##h( \
            CONVERT_TO_BYTEPTR(tmp), w, ref_ptr, ref_stride, sse); \
      } \
    } else if (xoffset == 4) { \
      uint16_t tmp0[w * (h + 1)]; \
      if (yoffset == 0) { \
        highbd_dist_wtd_avg_pred_var_filter_block2d_avg( \
            src, tmp0, source_stride, 1, w, h, second, jcp_param); \
        return aom_highbd_##bitdepth##_variance##w##x##h( \
            CONVERT_TO_BYTEPTR(tmp0), w, ref_ptr, ref_stride, sse); \
      } else if (yoffset == 4) { \
        uint16_t tmp1[w * (h + 1)]; \
        highbd_var_filter_block2d_avg(src, tmp0, source_stride, 1, w, h + 1); \
        highbd_dist_wtd_avg_pred_var_filter_block2d_avg(tmp0, tmp1, w, w, w, \
                                                        h, second, jcp_param); \
        return aom_highbd_##bitdepth##_variance##w##x##h( \
            CONVERT_TO_BYTEPTR(tmp1), w, ref_ptr, ref_stride, sse); \
      } else { \
        uint16_t tmp1[w * (h + 1)]; \
        highbd_var_filter_block2d_avg(src, tmp0, source_stride, 1, w, h + 1); \
        highbd_dist_wtd_avg_pred_var_filter_block2d_bil_w##w( \
            tmp0, tmp1, w, w, h, yoffset, second, jcp_param); \
        return aom_highbd_##bitdepth##_variance##w##x##h( \
            CONVERT_TO_BYTEPTR(tmp1), w, ref_ptr, ref_stride, sse); \
      } \
    } else { \
      uint16_t tmp0[w * (h + 1)]; \
      if (yoffset == 0) { \
        highbd_dist_wtd_avg_pred_var_filter_block2d_bil_w##w( \
            src, tmp0, source_stride, 1, h, xoffset, second, jcp_param); \
        return aom_highbd_##bitdepth##_variance##w##x##h( \
            CONVERT_TO_BYTEPTR(tmp0), w, ref_ptr, ref_stride, sse); \
      } else if (yoffset == 4) { \
        uint16_t tmp1[w * h]; \
        highbd_var_filter_block2d_bil_w##w(src, tmp0, source_stride, 1, h + 1, \
                                           xoffset); \
        highbd_dist_wtd_avg_pred_var_filter_block2d_avg(tmp0, tmp1, w, w, w, \
                                                        h, second, jcp_param); \
        return aom_highbd_##bitdepth##_variance##w##x##h( \
            CONVERT_TO_BYTEPTR(tmp1), w, ref_ptr, ref_stride, sse); \
      } else { \
        uint16_t tmp1[w * h]; \
        highbd_var_filter_block2d_bil_w##w(src, tmp0, source_stride, 1, h + 1, \
                                           xoffset); \
        highbd_dist_wtd_avg_pred_var_filter_block2d_bil_w##w( \
            tmp0, tmp1, w, w, h, yoffset, second, jcp_param); \
        return aom_highbd_##bitdepth##_variance##w##x##h( \
            CONVERT_TO_BYTEPTR(tmp1), w, ref_ptr, ref_stride, sse); \
      } \
    } \
  }
1390
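// The specialized variant exploits the eighth-pel offset encoding: an offset
// of 0 is a full-pel position that needs no filtering at all, and an offset
// of 4 is the half-pel position, where both filter taps are equal and the
// filter collapses to a plain average of two neighboring pixels (the *_avg
// helpers, with pixel_step = 1 for horizontal and pixel_step = source_stride
// for vertical). Only the remaining offsets pay for the full bilinear
// multiply-accumulate path.
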
// 8-bit
HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 4, 4)
HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 4, 8)

HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 8, 4)
HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 8, 8)
HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 8, 16)

SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 16, 8)
SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 16, 16)
SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 16, 32)

SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 32, 16)
SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 32, 32)
SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 32, 64)

SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 64, 32)
SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 64, 64)
SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 64, 128)

SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 128, 64)
SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 128, 128)

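// The 4:1 and 1:4 aspect-ratio blocks below are presumably only reachable
// through the extended partition types, which realtime-only builds leave out.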
#if !CONFIG_REALTIME_ONLY
HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 4, 16)

HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 8, 32)

SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 16, 4)
SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 16, 64)

SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 32, 8)

SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 64, 16)
#endif  // !CONFIG_REALTIME_ONLY

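// A minimal usage sketch (hypothetical, uninitialised buffers and an assumed
// weight pair summing to 16; real callers reach these functions through the
// RTCD dispatch tables rather than directly):
//
//   uint16_t src16[17 * 17], ref16[16 * 16], pred16[16 * 16];
//   uint32_t sse;
//   DIST_WTD_COMP_PARAMS jcp_param;
//   jcp_param.fwd_offset = 9;  // assumed example weights
//   jcp_param.bck_offset = 7;
//   unsigned int var = aom_highbd_8_dist_wtd_sub_pixel_avg_variance16x16_neon(
//       CONVERT_TO_BYTEPTR(src16), /*source_stride=*/17, /*xoffset=*/2,
//       /*yoffset=*/5, CONVERT_TO_BYTEPTR(ref16), /*ref_stride=*/16, &sse,
//       CONVERT_TO_BYTEPTR(pred16), &jcp_param);
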
// 10-bit
HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 4, 4)
HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 4, 8)

HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 8, 4)
HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 8, 8)
HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 8, 16)

SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 16, 8)
SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 16, 16)
SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 16, 32)

SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 32, 16)
SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 32, 32)
SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 32, 64)

SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 64, 32)
SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 64, 64)
SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 64, 128)

SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 128, 64)
SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 128, 128)

#if !CONFIG_REALTIME_ONLY
HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 4, 16)

HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 8, 32)

SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 16, 4)
SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 16, 64)

SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 32, 8)

SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 64, 16)
#endif  // !CONFIG_REALTIME_ONLY

// 12-bit
HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 4, 4)
HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 4, 8)

HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 8, 4)
HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 8, 8)
HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 8, 16)

SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 16, 8)
SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 16, 16)
SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 16, 32)

SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 32, 16)
SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 32, 32)
SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 32, 64)

SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 64, 32)
SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 64, 64)
SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 64, 128)

SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 128, 64)
SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 128, 128)

#if !CONFIG_REALTIME_ONLY
HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 4, 16)

HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 8, 32)

SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 16, 4)
SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 16, 64)

SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 32, 8)

SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 64, 16)
#endif  // !CONFIG_REALTIME_ONLY