/*
 * Copyright (c) 2023 The WebM project authors. All Rights Reserved.
 *
 * Use of this source code is governed by a BSD-style license
 * that can be found in the LICENSE file in the root of the source
 * tree. An additional intellectual property rights grant can be found
 * in the file PATENTS. All contributing project authors may
 * be found in the AUTHORS file in the root of the source tree.
 */

#include <arm_neon.h>
#include <assert.h>

#include "./vpx_dsp_rtcd.h"
#include "./vpx_config.h"

#include "vpx/vpx_integer.h"
#include "vpx_dsp/arm/mem_neon.h"
// The bilinear filters look like this:
//
// {{ 128, 0 }, { 112, 16 }, { 96, 32 }, { 80, 48 },
//  { 64, 64 }, { 48, 80 }, { 32, 96 }, { 16, 112 }}
//
// We can factor out the highest common factor, such that the sum of both
// weights will be 8 instead of 128. The benefits of this are two-fold:
//
// 1) We can infer the filter values from the filter_offset parameter in the
// bilinear filter functions below - we don't have to actually load the values
// from memory:
// f0 = 8 - filter_offset
// f1 = filter_offset
//
// 2) Scaling the pixel values by 8 instead of 128 enables us to operate on
// 16-bit data types at all times, rather than widening out to 32-bit and
// requiring double the number of data processing instructions. (12-bit * 8 =
// 15-bit.)
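//
// For example, filter_offset == 2 gives weights f0 = 6 and f1 = 2 (the
// { 96, 32 } entry divided by 16), so a pixel pair (a, b) blends to
// (6 * a + 2 * b + 4) >> 3, where the + 4 is the rounding added by
// vrshr_n_u16(blend, 3) below. Even for 12-bit input the accumulator peaks
// at 4095 * 8 = 32760, comfortably within an unsigned 16-bit lane.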

// Process a block exactly 4 wide and any height.
static void highbd_var_filter_block2d_bil_w4(const uint16_t *src_ptr,
                                             uint16_t *dst_ptr, int src_stride,
                                             int pixel_step, int dst_height,
                                             int filter_offset) {
  const uint16x4_t f0 = vdup_n_u16(8 - filter_offset);
  const uint16x4_t f1 = vdup_n_u16(filter_offset);

  int i = dst_height;
  do {
    uint16x4_t s0 = load_unaligned_u16(src_ptr);
    uint16x4_t s1 = load_unaligned_u16(src_ptr + pixel_step);

    uint16x4_t blend = vmul_u16(s0, f0);
    blend = vmla_u16(blend, s1, f1);
    blend = vrshr_n_u16(blend, 3);

    vst1_u16(dst_ptr, blend);

    src_ptr += src_stride;
    dst_ptr += 4;
  } while (--i != 0);
}

// Process a block whose width is a multiple of 8, with any height.
static void highbd_var_filter_block2d_bil_large(const uint16_t *src_ptr,
                                                uint16_t *dst_ptr,
                                                int src_stride, int pixel_step,
                                                int dst_width, int dst_height,
                                                int filter_offset) {
  const uint16x8_t f0 = vdupq_n_u16(8 - filter_offset);
  const uint16x8_t f1 = vdupq_n_u16(filter_offset);

  int i = dst_height;
  do {
    int j = 0;
    do {
      uint16x8_t s0 = vld1q_u16(src_ptr + j);
      uint16x8_t s1 = vld1q_u16(src_ptr + j + pixel_step);

      uint16x8_t blend = vmulq_u16(s0, f0);
      blend = vmlaq_u16(blend, s1, f1);
      blend = vrshrq_n_u16(blend, 3);

      vst1q_u16(dst_ptr + j, blend);

      j += 8;
    } while (j < dst_width);

    src_ptr += src_stride;
    dst_ptr += dst_width;
  } while (--i != 0);
}

static void highbd_var_filter_block2d_bil_w8(const uint16_t *src_ptr,
                                             uint16_t *dst_ptr, int src_stride,
                                             int pixel_step, int dst_height,
                                             int filter_offset) {
  highbd_var_filter_block2d_bil_large(src_ptr, dst_ptr, src_stride, pixel_step,
                                      8, dst_height, filter_offset);
}
static void highbd_var_filter_block2d_bil_w16(const uint16_t *src_ptr,
                                              uint16_t *dst_ptr,
                                              int src_stride, int pixel_step,
                                              int dst_height,
                                              int filter_offset) {
  highbd_var_filter_block2d_bil_large(src_ptr, dst_ptr, src_stride, pixel_step,
                                      16, dst_height, filter_offset);
}
static void highbd_var_filter_block2d_bil_w32(const uint16_t *src_ptr,
                                              uint16_t *dst_ptr,
                                              int src_stride, int pixel_step,
                                              int dst_height,
                                              int filter_offset) {
  highbd_var_filter_block2d_bil_large(src_ptr, dst_ptr, src_stride, pixel_step,
                                      32, dst_height, filter_offset);
}
static void highbd_var_filter_block2d_bil_w64(const uint16_t *src_ptr,
                                              uint16_t *dst_ptr,
                                              int src_stride, int pixel_step,
                                              int dst_height,
                                              int filter_offset) {
  highbd_var_filter_block2d_bil_large(src_ptr, dst_ptr, src_stride, pixel_step,
                                      64, dst_height, filter_offset);
}

static void highbd_var_filter_block2d_avg(const uint16_t *src_ptr,
                                          uint16_t *dst_ptr, int src_stride,
                                          int pixel_step, int dst_width,
                                          int dst_height) {
  int i = dst_height;

  // We only specialize on the filter values for large block sizes (>= 16x16).
  assert(dst_width >= 16 && dst_width % 16 == 0);

  do {
    int j = 0;
    do {
      uint16x8_t s0 = vld1q_u16(src_ptr + j);
      uint16x8_t s1 = vld1q_u16(src_ptr + j + pixel_step);
      uint16x8_t avg = vrhaddq_u16(s0, s1);
      vst1q_u16(dst_ptr + j, avg);

      j += 8;
    } while (j < dst_width);

    src_ptr += src_stride;
    dst_ptr += dst_width;
  } while (--i != 0);
}
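
// Note that vrhaddq_u16(s0, s1) computes (s0 + s1 + 1) >> 1, which is exactly
// the reduced-scale half-pel filter { 4, 4 }: (4 * a + 4 * b + 4) >> 3. This
// is why the xoffset == 4 and yoffset == 4 special cases below can use a
// plain rounding average instead of the general bilinear path.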

#define HBD_SUBPEL_VARIANCE_WXH_NEON(bitdepth, w, h)                          \
  unsigned int vpx_highbd_##bitdepth##_sub_pixel_variance##w##x##h##_neon(    \
      const uint8_t *src, int src_stride, int xoffset, int yoffset,           \
      const uint8_t *ref, int ref_stride, uint32_t *sse) {                    \
    uint16_t tmp0[w * (h + 1)];                                               \
    uint16_t tmp1[w * h];                                                     \
    uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src);                             \
                                                                              \
    highbd_var_filter_block2d_bil_w##w(src_ptr, tmp0, src_stride, 1, (h + 1), \
                                       xoffset);                              \
    highbd_var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset);         \
                                                                              \
    return vpx_highbd_##bitdepth##_variance##w##x##h(CONVERT_TO_BYTEPTR(tmp1),\
                                                     w, ref, ref_stride, sse);\
  }
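
// As a sketch of the expansion, the HBD_SUBPEL_VARIANCE_WXH_NEON(10, 8, 8)
// instantiation below defines vpx_highbd_10_sub_pixel_variance8x8_neon: the
// first pass filters an 8-wide, 9-high region horizontally (pixel_step == 1)
// into tmp0, the second pass filters tmp0 vertically (pixel_step == w) into
// the final 8x8 block tmp1, and the existing vpx_highbd_10_variance8x8
// kernel then computes the variance and SSE.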

#define HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(bitdepth, w, h)              \
  unsigned int vpx_highbd_##bitdepth##_sub_pixel_variance##w##x##h##_neon(    \
      const uint8_t *src, int src_stride, int xoffset, int yoffset,           \
      const uint8_t *ref, int ref_stride, unsigned int *sse) {                \
    uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src);                             \
                                                                              \
    if (xoffset == 0) {                                                       \
      if (yoffset == 0) {                                                     \
        return vpx_highbd_##bitdepth##_variance##w##x##h(                     \
            CONVERT_TO_BYTEPTR(src_ptr), src_stride, ref, ref_stride, sse);   \
      } else if (yoffset == 4) {                                              \
        uint16_t tmp[w * h];                                                  \
        highbd_var_filter_block2d_avg(src_ptr, tmp, src_stride, src_stride,   \
                                      w, h);                                  \
        return vpx_highbd_##bitdepth##_variance##w##x##h(                     \
            CONVERT_TO_BYTEPTR(tmp), w, ref, ref_stride, sse);                \
      } else {                                                                \
        uint16_t tmp[w * h];                                                  \
        highbd_var_filter_block2d_bil_w##w(src_ptr, tmp, src_stride,          \
                                           src_stride, h, yoffset);           \
        return vpx_highbd_##bitdepth##_variance##w##x##h(                     \
            CONVERT_TO_BYTEPTR(tmp), w, ref, ref_stride, sse);                \
      }                                                                       \
    } else if (xoffset == 4) {                                                \
      uint16_t tmp0[w * (h + 1)];                                             \
      if (yoffset == 0) {                                                     \
        highbd_var_filter_block2d_avg(src_ptr, tmp0, src_stride, 1, w, h);    \
        return vpx_highbd_##bitdepth##_variance##w##x##h(                     \
            CONVERT_TO_BYTEPTR(tmp0), w, ref, ref_stride, sse);               \
      } else if (yoffset == 4) {                                              \
        uint16_t tmp1[w * (h + 1)];                                           \
        highbd_var_filter_block2d_avg(src_ptr, tmp0, src_stride, 1, w,        \
                                      (h + 1));                               \
        highbd_var_filter_block2d_avg(tmp0, tmp1, w, w, w, h);                \
        return vpx_highbd_##bitdepth##_variance##w##x##h(                     \
            CONVERT_TO_BYTEPTR(tmp1), w, ref, ref_stride, sse);               \
      } else {                                                                \
        uint16_t tmp1[w * (h + 1)];                                           \
        highbd_var_filter_block2d_avg(src_ptr, tmp0, src_stride, 1, w,        \
                                      (h + 1));                               \
        highbd_var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset);     \
        return vpx_highbd_##bitdepth##_variance##w##x##h(                     \
            CONVERT_TO_BYTEPTR(tmp1), w, ref, ref_stride, sse);               \
      }                                                                       \
    } else {                                                                  \
      uint16_t tmp0[w * (h + 1)];                                             \
      if (yoffset == 0) {                                                     \
        highbd_var_filter_block2d_bil_w##w(src_ptr, tmp0, src_stride, 1, h,   \
                                           xoffset);                          \
        return vpx_highbd_##bitdepth##_variance##w##x##h(                     \
            CONVERT_TO_BYTEPTR(tmp0), w, ref, ref_stride, sse);               \
      } else if (yoffset == 4) {                                              \
        uint16_t tmp1[w * h];                                                 \
        highbd_var_filter_block2d_bil_w##w(src_ptr, tmp0, src_stride, 1,      \
                                           (h + 1), xoffset);                 \
        highbd_var_filter_block2d_avg(tmp0, tmp1, w, w, w, h);                \
        return vpx_highbd_##bitdepth##_variance##w##x##h(                     \
            CONVERT_TO_BYTEPTR(tmp1), w, ref, ref_stride, sse);               \
      } else {                                                                \
        uint16_t tmp1[w * h];                                                 \
        highbd_var_filter_block2d_bil_w##w(src_ptr, tmp0, src_stride, 1,      \
                                           (h + 1), xoffset);                 \
        highbd_var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset);     \
        return vpx_highbd_##bitdepth##_variance##w##x##h(                     \
            CONVERT_TO_BYTEPTR(tmp1), w, ref, ref_stride, sse);               \
      }                                                                       \
    }                                                                         \
  }
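
// The specialized variant above serves large blocks (width >= 16), where the
// offset fast paths pay off: an offset of 0 needs no filtering at all, an
// offset of 4 reduces to the rounding average, and only the remaining offsets
// (1-3, 5-7) fall through to the general bilinear filter.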

// 8-bit
HBD_SUBPEL_VARIANCE_WXH_NEON(8, 4, 4)
HBD_SUBPEL_VARIANCE_WXH_NEON(8, 4, 8)

HBD_SUBPEL_VARIANCE_WXH_NEON(8, 8, 4)
HBD_SUBPEL_VARIANCE_WXH_NEON(8, 8, 8)
HBD_SUBPEL_VARIANCE_WXH_NEON(8, 8, 16)

HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(8, 16, 8)
HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(8, 16, 16)
HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(8, 16, 32)

HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(8, 32, 16)
HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(8, 32, 32)
HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(8, 32, 64)

HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(8, 64, 32)
HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(8, 64, 64)

// 10-bit
HBD_SUBPEL_VARIANCE_WXH_NEON(10, 4, 4)
HBD_SUBPEL_VARIANCE_WXH_NEON(10, 4, 8)

HBD_SUBPEL_VARIANCE_WXH_NEON(10, 8, 4)
HBD_SUBPEL_VARIANCE_WXH_NEON(10, 8, 8)
HBD_SUBPEL_VARIANCE_WXH_NEON(10, 8, 16)

HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(10, 16, 8)
HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(10, 16, 16)
HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(10, 16, 32)

HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(10, 32, 16)
HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(10, 32, 32)
HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(10, 32, 64)

HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(10, 64, 32)
HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(10, 64, 64)

// 12-bit
HBD_SUBPEL_VARIANCE_WXH_NEON(12, 4, 4)
HBD_SUBPEL_VARIANCE_WXH_NEON(12, 4, 8)

HBD_SUBPEL_VARIANCE_WXH_NEON(12, 8, 4)
HBD_SUBPEL_VARIANCE_WXH_NEON(12, 8, 8)
HBD_SUBPEL_VARIANCE_WXH_NEON(12, 8, 16)

HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(12, 16, 8)
HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(12, 16, 16)
HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(12, 16, 32)

HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(12, 32, 16)
HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(12, 32, 32)
HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(12, 32, 64)

HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(12, 64, 32)
HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(12, 64, 64)

// Combine bilinear filter with vpx_highbd_comp_avg_pred for blocks having
// width 4.
static void highbd_avg_pred_var_filter_block2d_bil_w4(
    const uint16_t *src_ptr, uint16_t *dst_ptr, int src_stride, int pixel_step,
    int dst_height, int filter_offset, const uint16_t *second_pred) {
  const uint16x4_t f0 = vdup_n_u16(8 - filter_offset);
  const uint16x4_t f1 = vdup_n_u16(filter_offset);

  int i = dst_height;
  do {
    uint16x4_t s0 = load_unaligned_u16(src_ptr);
    uint16x4_t s1 = load_unaligned_u16(src_ptr + pixel_step);
    uint16x4_t p = vld1_u16(second_pred);

    uint16x4_t blend = vmul_u16(s0, f0);
    blend = vmla_u16(blend, s1, f1);
    blend = vrshr_n_u16(blend, 3);

    vst1_u16(dst_ptr, vrhadd_u16(blend, p));

    src_ptr += src_stride;
    dst_ptr += 4;
    second_pred += 4;
  } while (--i != 0);
}

// Combine bilinear filter with vpx_highbd_comp_avg_pred for large blocks.
static void highbd_avg_pred_var_filter_block2d_bil_large(
    const uint16_t *src_ptr, uint16_t *dst_ptr, int src_stride, int pixel_step,
    int dst_width, int dst_height, int filter_offset,
    const uint16_t *second_pred) {
  const uint16x8_t f0 = vdupq_n_u16(8 - filter_offset);
  const uint16x8_t f1 = vdupq_n_u16(filter_offset);

  int i = dst_height;
  do {
    int j = 0;
    do {
      uint16x8_t s0 = vld1q_u16(src_ptr + j);
      uint16x8_t s1 = vld1q_u16(src_ptr + j + pixel_step);
      uint16x8_t p = vld1q_u16(second_pred);

      uint16x8_t blend = vmulq_u16(s0, f0);
      blend = vmlaq_u16(blend, s1, f1);
      blend = vrshrq_n_u16(blend, 3);

      vst1q_u16(dst_ptr + j, vrhaddq_u16(blend, p));

      j += 8;
      second_pred += 8;
    } while (j < dst_width);

    src_ptr += src_stride;
    dst_ptr += dst_width;
  } while (--i != 0);
}

static void highbd_avg_pred_var_filter_block2d_bil_w8(
    const uint16_t *src_ptr, uint16_t *dst_ptr, int src_stride, int pixel_step,
    int dst_height, int filter_offset, const uint16_t *second_pred) {
  highbd_avg_pred_var_filter_block2d_bil_large(src_ptr, dst_ptr, src_stride,
                                               pixel_step, 8, dst_height,
                                               filter_offset, second_pred);
}
static void highbd_avg_pred_var_filter_block2d_bil_w16(
    const uint16_t *src_ptr, uint16_t *dst_ptr, int src_stride, int pixel_step,
    int dst_height, int filter_offset, const uint16_t *second_pred) {
  highbd_avg_pred_var_filter_block2d_bil_large(src_ptr, dst_ptr, src_stride,
                                               pixel_step, 16, dst_height,
                                               filter_offset, second_pred);
}
static void highbd_avg_pred_var_filter_block2d_bil_w32(
    const uint16_t *src_ptr, uint16_t *dst_ptr, int src_stride, int pixel_step,
    int dst_height, int filter_offset, const uint16_t *second_pred) {
  highbd_avg_pred_var_filter_block2d_bil_large(src_ptr, dst_ptr, src_stride,
                                               pixel_step, 32, dst_height,
                                               filter_offset, second_pred);
}
static void highbd_avg_pred_var_filter_block2d_bil_w64(
    const uint16_t *src_ptr, uint16_t *dst_ptr, int src_stride, int pixel_step,
    int dst_height, int filter_offset, const uint16_t *second_pred) {
  highbd_avg_pred_var_filter_block2d_bil_large(src_ptr, dst_ptr, src_stride,
                                               pixel_step, 64, dst_height,
                                               filter_offset, second_pred);
}

// Combine averaging subpel filter with vpx_highbd_comp_avg_pred.
static void highbd_avg_pred_var_filter_block2d_avg(
    const uint16_t *src_ptr, uint16_t *dst_ptr, int src_stride, int pixel_step,
    int dst_width, int dst_height, const uint16_t *second_pred) {
  int i = dst_height;

  // We only specialize on the filter values for large block sizes (>= 16x16).
  assert(dst_width >= 16 && dst_width % 16 == 0);

  do {
    int j = 0;
    do {
      uint16x8_t s0 = vld1q_u16(src_ptr + j);
      uint16x8_t s1 = vld1q_u16(src_ptr + j + pixel_step);
      uint16x8_t avg = vrhaddq_u16(s0, s1);

      uint16x8_t p = vld1q_u16(second_pred);
      avg = vrhaddq_u16(avg, p);

      vst1q_u16(dst_ptr + j, avg);

      j += 8;
      second_pred += 8;
    } while (j < dst_width);

    src_ptr += src_stride;
    dst_ptr += dst_width;
  } while (--i != 0);
}

// Implementation of vpx_highbd_comp_avg_pred for blocks having width >= 16.
static void highbd_avg_pred(const uint16_t *src_ptr, uint16_t *dst_ptr,
                            int src_stride, int dst_width, int dst_height,
                            const uint16_t *second_pred) {
  int i = dst_height;

  // We only specialize on the filter values for large block sizes (>= 16x16).
  assert(dst_width >= 16 && dst_width % 16 == 0);

  do {
    int j = 0;
    do {
      uint16x8_t s = vld1q_u16(src_ptr + j);
      uint16x8_t p = vld1q_u16(second_pred);

      uint16x8_t avg = vrhaddq_u16(s, p);

      vst1q_u16(dst_ptr + j, avg);

      j += 8;
      second_pred += 8;
    } while (j < dst_width);

    src_ptr += src_stride;
    dst_ptr += dst_width;
  } while (--i != 0);
}

#define HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(bitdepth, w, h)                      \
  uint32_t vpx_highbd_##bitdepth##_sub_pixel_avg_variance##w##x##h##_neon(    \
      const uint8_t *src, int src_stride, int xoffset, int yoffset,           \
      const uint8_t *ref, int ref_stride, uint32_t *sse,                      \
      const uint8_t *second_pred) {                                           \
    uint16_t tmp0[w * (h + 1)];                                               \
    uint16_t tmp1[w * h];                                                     \
    uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src);                             \
                                                                              \
    highbd_var_filter_block2d_bil_w##w(src_ptr, tmp0, src_stride, 1, (h + 1), \
                                       xoffset);                              \
    highbd_avg_pred_var_filter_block2d_bil_w##w(                              \
        tmp0, tmp1, w, w, h, yoffset, CONVERT_TO_SHORTPTR(second_pred));      \
                                                                              \
    return vpx_highbd_##bitdepth##_variance##w##x##h(CONVERT_TO_BYTEPTR(tmp1),\
                                                     w, ref, ref_stride, sse);\
  }
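
// As with HBD_SUBPEL_VARIANCE_WXH_NEON, the first pass filters horizontally;
// the second pass then filters vertically and averages with second_pred in a
// single step, so vpx_highbd_comp_avg_pred never needs a separate pass over
// the intermediate block.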

#define HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(bitdepth, w, h)          \
  unsigned int vpx_highbd_##bitdepth##_sub_pixel_avg_variance##w##x##h##_neon(\
      const uint8_t *src, int source_stride, int xoffset, int yoffset,        \
      const uint8_t *ref, int ref_stride, unsigned int *sse,                  \
      const uint8_t *second_pred) {                                           \
    uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src);                             \
                                                                              \
    if (xoffset == 0) {                                                       \
      uint16_t tmp[w * h];                                                    \
      if (yoffset == 0) {                                                     \
        highbd_avg_pred(src_ptr, tmp, source_stride, w, h,                    \
                        CONVERT_TO_SHORTPTR(second_pred));                    \
        return vpx_highbd_##bitdepth##_variance##w##x##h(                     \
            CONVERT_TO_BYTEPTR(tmp), w, ref, ref_stride, sse);                \
      } else if (yoffset == 4) {                                              \
        highbd_avg_pred_var_filter_block2d_avg(                               \
            src_ptr, tmp, source_stride, source_stride, w, h,                 \
            CONVERT_TO_SHORTPTR(second_pred));                                \
        return vpx_highbd_##bitdepth##_variance##w##x##h(                     \
            CONVERT_TO_BYTEPTR(tmp), w, ref, ref_stride, sse);                \
      } else {                                                                \
        highbd_avg_pred_var_filter_block2d_bil_w##w(                          \
            src_ptr, tmp, source_stride, source_stride, h, yoffset,           \
            CONVERT_TO_SHORTPTR(second_pred));                                \
        return vpx_highbd_##bitdepth##_variance##w##x##h(                     \
            CONVERT_TO_BYTEPTR(tmp), w, ref, ref_stride, sse);                \
      }                                                                       \
    } else if (xoffset == 4) {                                                \
      uint16_t tmp0[w * (h + 1)];                                             \
      if (yoffset == 0) {                                                     \
        highbd_avg_pred_var_filter_block2d_avg(                               \
            src_ptr, tmp0, source_stride, 1, w, h,                            \
            CONVERT_TO_SHORTPTR(second_pred));                                \
        return vpx_highbd_##bitdepth##_variance##w##x##h(                     \
            CONVERT_TO_BYTEPTR(tmp0), w, ref, ref_stride, sse);               \
      } else if (yoffset == 4) {                                              \
        uint16_t tmp1[w * (h + 1)];                                           \
        highbd_var_filter_block2d_avg(src_ptr, tmp0, source_stride, 1, w,     \
                                      (h + 1));                               \
        highbd_avg_pred_var_filter_block2d_avg(                               \
            tmp0, tmp1, w, w, w, h, CONVERT_TO_SHORTPTR(second_pred));        \
        return vpx_highbd_##bitdepth##_variance##w##x##h(                     \
            CONVERT_TO_BYTEPTR(tmp1), w, ref, ref_stride, sse);               \
      } else {                                                                \
        uint16_t tmp1[w * (h + 1)];                                           \
        highbd_var_filter_block2d_avg(src_ptr, tmp0, source_stride, 1, w,     \
                                      (h + 1));                               \
        highbd_avg_pred_var_filter_block2d_bil_w##w(                          \
            tmp0, tmp1, w, w, h, yoffset, CONVERT_TO_SHORTPTR(second_pred));  \
        return vpx_highbd_##bitdepth##_variance##w##x##h(                     \
            CONVERT_TO_BYTEPTR(tmp1), w, ref, ref_stride, sse);               \
      }                                                                       \
    } else {                                                                  \
      uint16_t tmp0[w * (h + 1)];                                             \
      if (yoffset == 0) {                                                     \
        highbd_avg_pred_var_filter_block2d_bil_w##w(                          \
            src_ptr, tmp0, source_stride, 1, h, xoffset,                      \
            CONVERT_TO_SHORTPTR(second_pred));                                \
        return vpx_highbd_##bitdepth##_variance##w##x##h(                     \
            CONVERT_TO_BYTEPTR(tmp0), w, ref, ref_stride, sse);               \
      } else if (yoffset == 4) {                                              \
        uint16_t tmp1[w * h];                                                 \
        highbd_var_filter_block2d_bil_w##w(src_ptr, tmp0, source_stride, 1,   \
                                           (h + 1), xoffset);                 \
        highbd_avg_pred_var_filter_block2d_avg(                               \
            tmp0, tmp1, w, w, w, h, CONVERT_TO_SHORTPTR(second_pred));        \
        return vpx_highbd_##bitdepth##_variance##w##x##h(                     \
            CONVERT_TO_BYTEPTR(tmp1), w, ref, ref_stride, sse);               \
      } else {                                                                \
        uint16_t tmp1[w * h];                                                 \
        highbd_var_filter_block2d_bil_w##w(src_ptr, tmp0, source_stride, 1,   \
                                           (h + 1), xoffset);                 \
        highbd_avg_pred_var_filter_block2d_bil_w##w(                          \
            tmp0, tmp1, w, w, h, yoffset, CONVERT_TO_SHORTPTR(second_pred));  \
        return vpx_highbd_##bitdepth##_variance##w##x##h(                     \
            CONVERT_TO_BYTEPTR(tmp1), w, ref, ref_stride, sse);               \
      }                                                                       \
    }                                                                         \
  }
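
// The specialized avg variant mirrors HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON
// above: offsets of 0 and 4 take the copy and half-pel fast paths, with
// second_pred folded into whichever helper produces the final w x h block.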

// 8-bit
HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 4, 4)
HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 4, 8)

HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 8, 4)
HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 8, 8)
HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 8, 16)

HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 16, 8)
HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 16, 16)
HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 16, 32)

HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 32, 16)
HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 32, 32)
HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 32, 64)

HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 64, 32)
HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 64, 64)

// 10-bit
HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 4, 4)
HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 4, 8)

HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 8, 4)
HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 8, 8)
HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 8, 16)

HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 16, 8)
HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 16, 16)
HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 16, 32)

HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 32, 16)
HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 32, 32)
HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 32, 64)

HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 64, 32)
HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 64, 64)

// 12-bit
HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 4, 4)
HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 4, 8)

HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 8, 4)
HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 8, 8)
HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 8, 16)

HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 16, 8)
HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 16, 16)
HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 16, 32)

HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 32, 16)
HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 32, 32)
HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 32, 64)

HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 64, 32)
HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 64, 64)