xref: /aosp_15_r20/external/libvpx/vpx_dsp/arm/highbd_subpel_variance_neon.c (revision fb1b10ab9aebc7c7068eedab379b749d7e3900be)
1 /*
2  *  Copyright (c) 2023 The WebM project authors. All Rights Reserved.
3  *
4  *  Use of this source code is governed by a BSD-style license
5  *  that can be found in the LICENSE file in the root of the source
6  *  tree. An additional intellectual property rights grant can be found
7  *  in the file PATENTS.  All contributing project authors may
8  *  be found in the AUTHORS file in the root of the source tree.
9  */
10 
11 #include <arm_neon.h>
12 #include <assert.h>
13 
14 #include "./vpx_dsp_rtcd.h"
15 #include "./vpx_config.h"
16 
17 #include "vpx/vpx_integer.h"
18 #include "vpx_dsp/arm/mem_neon.h"
19 
20 // The bilinear filters look like this:
21 //
22 // {{ 128,  0 }, { 112, 16 }, { 96, 32 }, { 80,  48 },
23 //  {  64, 64 }, {  48, 80 }, { 32, 96 }, { 16, 112 }}
24 //
25 // We can factor out the highest common multiple, such that the sum of both
26 // weights will be 8 instead of 128. The benefits of this are two-fold:
27 //
28 // 1) We can infer the filter values from the filter_offset parameter in the
29 // bilinear filter functions below - we don't have to actually load the values
30 // from memory:
31 // f0 = 8 - filter_offset
32 // f1 = filter_offset
33 //
34 // 2) Scaling the pixel values by 8, instead of 128 enables us to operate on
35 // 16-bit data types at all times, rather than widening out to 32-bit and
36 // requiring double the number of data processing instructions. (12-bit * 8 =
37 // 15-bit.)
38 
39 // Process a block exactly 4 wide and any height.
highbd_var_filter_block2d_bil_w4(const uint16_t * src_ptr,uint16_t * dst_ptr,int src_stride,int pixel_step,int dst_height,int filter_offset)40 static void highbd_var_filter_block2d_bil_w4(const uint16_t *src_ptr,
41                                              uint16_t *dst_ptr, int src_stride,
42                                              int pixel_step, int dst_height,
43                                              int filter_offset) {
44   const uint16x4_t f0 = vdup_n_u16(8 - filter_offset);
45   const uint16x4_t f1 = vdup_n_u16(filter_offset);
46 
47   int i = dst_height;
48   do {
49     uint16x4_t s0 = load_unaligned_u16(src_ptr);
50     uint16x4_t s1 = load_unaligned_u16(src_ptr + pixel_step);
51 
52     uint16x4_t blend = vmul_u16(s0, f0);
53     blend = vmla_u16(blend, s1, f1);
54     blend = vrshr_n_u16(blend, 3);
55 
56     vst1_u16(dst_ptr, blend);
57 
58     src_ptr += src_stride;
59     dst_ptr += 4;
60   } while (--i != 0);
61 }
62 
63 // Process a block which is a multiple of 8 and any height.
highbd_var_filter_block2d_bil_large(const uint16_t * src_ptr,uint16_t * dst_ptr,int src_stride,int pixel_step,int dst_width,int dst_height,int filter_offset)64 static void highbd_var_filter_block2d_bil_large(const uint16_t *src_ptr,
65                                                 uint16_t *dst_ptr,
66                                                 int src_stride, int pixel_step,
67                                                 int dst_width, int dst_height,
68                                                 int filter_offset) {
69   const uint16x8_t f0 = vdupq_n_u16(8 - filter_offset);
70   const uint16x8_t f1 = vdupq_n_u16(filter_offset);
71 
72   int i = dst_height;
73   do {
74     int j = 0;
75     do {
76       uint16x8_t s0 = vld1q_u16(src_ptr + j);
77       uint16x8_t s1 = vld1q_u16(src_ptr + j + pixel_step);
78 
79       uint16x8_t blend = vmulq_u16(s0, f0);
80       blend = vmlaq_u16(blend, s1, f1);
81       blend = vrshrq_n_u16(blend, 3);
82 
83       vst1q_u16(dst_ptr + j, blend);
84 
85       j += 8;
86     } while (j < dst_width);
87 
88     src_ptr += src_stride;
89     dst_ptr += dst_width;
90   } while (--i != 0);
91 }
92 
// Width-8 bilinear filter. Forwards to highbd_var_filter_block2d_bil_large()
// with dst_width fixed at 8, giving the variance macros a per-width name they
// can select by token-pasting the block width.
static void highbd_var_filter_block2d_bil_w8(const uint16_t *src_ptr,
                                             uint16_t *dst_ptr, int src_stride,
                                             int pixel_step, int dst_height,
                                             int filter_offset) {
  highbd_var_filter_block2d_bil_large(src_ptr, dst_ptr, src_stride, pixel_step,
                                      8, dst_height, filter_offset);
}
// Width-16 bilinear filter. Forwards to highbd_var_filter_block2d_bil_large()
// with dst_width fixed at 16, giving the variance macros a per-width name they
// can select by token-pasting the block width.
static void highbd_var_filter_block2d_bil_w16(const uint16_t *src_ptr,
                                              uint16_t *dst_ptr, int src_stride,
                                              int pixel_step, int dst_height,
                                              int filter_offset) {
  highbd_var_filter_block2d_bil_large(src_ptr, dst_ptr, src_stride, pixel_step,
                                      16, dst_height, filter_offset);
}
// Width-32 bilinear filter. Forwards to highbd_var_filter_block2d_bil_large()
// with dst_width fixed at 32, giving the variance macros a per-width name they
// can select by token-pasting the block width.
static void highbd_var_filter_block2d_bil_w32(const uint16_t *src_ptr,
                                              uint16_t *dst_ptr, int src_stride,
                                              int pixel_step, int dst_height,
                                              int filter_offset) {
  highbd_var_filter_block2d_bil_large(src_ptr, dst_ptr, src_stride, pixel_step,
                                      32, dst_height, filter_offset);
}
// Width-64 bilinear filter. Forwards to highbd_var_filter_block2d_bil_large()
// with dst_width fixed at 64, giving the variance macros a per-width name they
// can select by token-pasting the block width.
static void highbd_var_filter_block2d_bil_w64(const uint16_t *src_ptr,
                                              uint16_t *dst_ptr, int src_stride,
                                              int pixel_step, int dst_height,
                                              int filter_offset) {
  highbd_var_filter_block2d_bil_large(src_ptr, dst_ptr, src_stride, pixel_step,
                                      64, dst_height, filter_offset);
}
121 
highbd_var_filter_block2d_avg(const uint16_t * src_ptr,uint16_t * dst_ptr,int src_stride,int pixel_step,int dst_width,int dst_height)122 static void highbd_var_filter_block2d_avg(const uint16_t *src_ptr,
123                                           uint16_t *dst_ptr, int src_stride,
124                                           int pixel_step, int dst_width,
125                                           int dst_height) {
126   int i = dst_height;
127 
128   // We only specialize on the filter values for large block sizes (>= 16x16.)
129   assert(dst_width >= 16 && dst_width % 16 == 0);
130 
131   do {
132     int j = 0;
133     do {
134       uint16x8_t s0 = vld1q_u16(src_ptr + j);
135       uint16x8_t s1 = vld1q_u16(src_ptr + j + pixel_step);
136       uint16x8_t avg = vrhaddq_u16(s0, s1);
137       vst1q_u16(dst_ptr + j, avg);
138 
139       j += 8;
140     } while (j < dst_width);
141 
142     src_ptr += src_stride;
143     dst_ptr += dst_width;
144   } while (--i != 0);
145 }
146 
// Defines vpx_highbd_<bitdepth>_sub_pixel_variance<w>x<h>_neon().
//
// The generated function always runs both filter passes through
// highbd_var_filter_block2d_bil_w<w>():
//   1. a horizontal pass (pixel_step 1, weight xoffset) over h + 1 source rows
//      into tmp0 - the extra row is the second tap of the last output row of
//      the vertical pass;
//   2. a vertical pass (pixel_step w, weight yoffset) over tmp0 into tmp1,
//      producing h rows of w pixels.
// The filtered block is then passed, with a stride of w, to
// vpx_highbd_<bitdepth>_variance<w>x<h>() along with ref, ref_stride and sse,
// and that function's return value is returned unchanged.
//
// src is converted with CONVERT_TO_SHORTPTR before filtering and tmp1 is
// wrapped with CONVERT_TO_BYTEPTR before being handed on; both macros are
// defined outside this file.
#define HBD_SUBPEL_VARIANCE_WXH_NEON(bitdepth, w, h)                           \
  unsigned int vpx_highbd_##bitdepth##_sub_pixel_variance##w##x##h##_neon(     \
      const uint8_t *src, int src_stride, int xoffset, int yoffset,            \
      const uint8_t *ref, int ref_stride, uint32_t *sse) {                     \
    uint16_t tmp0[w * (h + 1)];                                                \
    uint16_t tmp1[w * h];                                                      \
    uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src);                              \
                                                                               \
    highbd_var_filter_block2d_bil_w##w(src_ptr, tmp0, src_stride, 1, (h + 1),  \
                                       xoffset);                               \
    highbd_var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset);          \
                                                                               \
    return vpx_highbd_##bitdepth##_variance##w##x##h(CONVERT_TO_BYTEPTR(tmp1), \
                                                     w, ref, ref_stride, sse); \
  }
162 
// Defines vpx_highbd_<bitdepth>_sub_pixel_variance<w>x<h>_neon() with fast
// paths chosen from the sub-pixel offsets. The averaging helper it relies on
// requires w to be a multiple of 16.
//
// For each direction (xoffset horizontally, yoffset vertically):
//   offset == 0  no filtering is needed in that direction, so the pass is
//                skipped;
//   offset == 4  both bilinear weights are equal, so the cheaper
//                highbd_var_filter_block2d_avg() is used;
//   otherwise    the general highbd_var_filter_block2d_bil_w<w>() is used.
//
// When a vertical pass follows a horizontal one, the horizontal pass produces
// h + 1 rows into tmp0 (the extra row is the second tap of the last output
// row) and the vertical pass reduces them to h rows in tmp1. When only one
// pass is needed it reads the source directly and produces h rows.
//
// The resulting block (or the untouched source when both offsets are 0) is
// passed to vpx_highbd_<bitdepth>_variance<w>x<h>() along with ref, ref_stride
// and sse, and that function's return value is returned unchanged.
//
// src is converted with CONVERT_TO_SHORTPTR before filtering and buffers are
// wrapped with CONVERT_TO_BYTEPTR before being handed on; both macros are
// defined outside this file.
#define HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(bitdepth, w, h)               \
  unsigned int vpx_highbd_##bitdepth##_sub_pixel_variance##w##x##h##_neon(     \
      const uint8_t *src, int src_stride, int xoffset, int yoffset,            \
      const uint8_t *ref, int ref_stride, unsigned int *sse) {                 \
    uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src);                              \
                                                                               \
    if (xoffset == 0) {                                                        \
      if (yoffset == 0) {                                                      \
        return vpx_highbd_##bitdepth##_variance##w##x##h(                      \
            CONVERT_TO_BYTEPTR(src_ptr), src_stride, ref, ref_stride, sse);    \
      } else if (yoffset == 4) {                                               \
        uint16_t tmp[w * h];                                                   \
        highbd_var_filter_block2d_avg(src_ptr, tmp, src_stride, src_stride, w, \
                                      h);                                      \
        return vpx_highbd_##bitdepth##_variance##w##x##h(                      \
            CONVERT_TO_BYTEPTR(tmp), w, ref, ref_stride, sse);                 \
      } else {                                                                 \
        uint16_t tmp[w * h];                                                   \
        highbd_var_filter_block2d_bil_w##w(src_ptr, tmp, src_stride,           \
                                           src_stride, h, yoffset);            \
        return vpx_highbd_##bitdepth##_variance##w##x##h(                      \
            CONVERT_TO_BYTEPTR(tmp), w, ref, ref_stride, sse);                 \
      }                                                                        \
    } else if (xoffset == 4) {                                                 \
      uint16_t tmp0[w * (h + 1)];                                              \
      if (yoffset == 0) {                                                      \
        highbd_var_filter_block2d_avg(src_ptr, tmp0, src_stride, 1, w, h);     \
        return vpx_highbd_##bitdepth##_variance##w##x##h(                      \
            CONVERT_TO_BYTEPTR(tmp0), w, ref, ref_stride, sse);                \
      } else if (yoffset == 4) {                                               \
        uint16_t tmp1[w * h];                                                  \
        highbd_var_filter_block2d_avg(src_ptr, tmp0, src_stride, 1, w,         \
                                      (h + 1));                                \
        highbd_var_filter_block2d_avg(tmp0, tmp1, w, w, w, h);                 \
        return vpx_highbd_##bitdepth##_variance##w##x##h(                      \
            CONVERT_TO_BYTEPTR(tmp1), w, ref, ref_stride, sse);                \
      } else {                                                                 \
        uint16_t tmp1[w * h];                                                  \
        highbd_var_filter_block2d_avg(src_ptr, tmp0, src_stride, 1, w,         \
                                      (h + 1));                                \
        highbd_var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset);      \
        return vpx_highbd_##bitdepth##_variance##w##x##h(                      \
            CONVERT_TO_BYTEPTR(tmp1), w, ref, ref_stride, sse);                \
      }                                                                        \
    } else {                                                                   \
      uint16_t tmp0[w * (h + 1)];                                              \
      if (yoffset == 0) {                                                      \
        highbd_var_filter_block2d_bil_w##w(src_ptr, tmp0, src_stride, 1, h,    \
                                           xoffset);                           \
        return vpx_highbd_##bitdepth##_variance##w##x##h(                      \
            CONVERT_TO_BYTEPTR(tmp0), w, ref, ref_stride, sse);                \
      } else if (yoffset == 4) {                                               \
        uint16_t tmp1[w * h];                                                  \
        highbd_var_filter_block2d_bil_w##w(src_ptr, tmp0, src_stride, 1,       \
                                           (h + 1), xoffset);                  \
        highbd_var_filter_block2d_avg(tmp0, tmp1, w, w, w, h);                 \
        return vpx_highbd_##bitdepth##_variance##w##x##h(                      \
            CONVERT_TO_BYTEPTR(tmp1), w, ref, ref_stride, sse);                \
      } else {                                                                 \
        uint16_t tmp1[w * h];                                                  \
        highbd_var_filter_block2d_bil_w##w(src_ptr, tmp0, src_stride, 1,       \
                                           (h + 1), xoffset);                  \
        highbd_var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset);      \
        return vpx_highbd_##bitdepth##_variance##w##x##h(                      \
            CONVERT_TO_BYTEPTR(tmp1), w, ref, ref_stride, sse);                \
      }                                                                        \
    }                                                                          \
  }
231 
232 // 8-bit
233 HBD_SUBPEL_VARIANCE_WXH_NEON(8, 4, 4)
234 HBD_SUBPEL_VARIANCE_WXH_NEON(8, 4, 8)
235 
236 HBD_SUBPEL_VARIANCE_WXH_NEON(8, 8, 4)
237 HBD_SUBPEL_VARIANCE_WXH_NEON(8, 8, 8)
238 HBD_SUBPEL_VARIANCE_WXH_NEON(8, 8, 16)
239 
240 HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(8, 16, 8)
241 HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(8, 16, 16)
242 HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(8, 16, 32)
243 
244 HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(8, 32, 16)
245 HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(8, 32, 32)
246 HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(8, 32, 64)
247 
248 HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(8, 64, 32)
249 HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(8, 64, 64)
250 
251 // 10-bit
252 HBD_SUBPEL_VARIANCE_WXH_NEON(10, 4, 4)
253 HBD_SUBPEL_VARIANCE_WXH_NEON(10, 4, 8)
254 
255 HBD_SUBPEL_VARIANCE_WXH_NEON(10, 8, 4)
256 HBD_SUBPEL_VARIANCE_WXH_NEON(10, 8, 8)
257 HBD_SUBPEL_VARIANCE_WXH_NEON(10, 8, 16)
258 
259 HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(10, 16, 8)
260 HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(10, 16, 16)
261 HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(10, 16, 32)
262 
263 HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(10, 32, 16)
264 HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(10, 32, 32)
265 HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(10, 32, 64)
266 
267 HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(10, 64, 32)
268 HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(10, 64, 64)
269 
270 // 12-bit
271 HBD_SUBPEL_VARIANCE_WXH_NEON(12, 4, 4)
272 HBD_SUBPEL_VARIANCE_WXH_NEON(12, 4, 8)
273 
274 HBD_SUBPEL_VARIANCE_WXH_NEON(12, 8, 4)
275 HBD_SUBPEL_VARIANCE_WXH_NEON(12, 8, 8)
276 HBD_SUBPEL_VARIANCE_WXH_NEON(12, 8, 16)
277 
278 HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(12, 16, 8)
279 HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(12, 16, 16)
280 HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(12, 16, 32)
281 
282 HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(12, 32, 16)
283 HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(12, 32, 32)
284 HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(12, 32, 64)
285 
286 HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(12, 64, 32)
287 HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(12, 64, 64)
288 
289 // Combine bilinear filter with vpx_highbd_comp_avg_pred for blocks having
290 // width 4.
highbd_avg_pred_var_filter_block2d_bil_w4(const uint16_t * src_ptr,uint16_t * dst_ptr,int src_stride,int pixel_step,int dst_height,int filter_offset,const uint16_t * second_pred)291 static void highbd_avg_pred_var_filter_block2d_bil_w4(
292     const uint16_t *src_ptr, uint16_t *dst_ptr, int src_stride, int pixel_step,
293     int dst_height, int filter_offset, const uint16_t *second_pred) {
294   const uint16x4_t f0 = vdup_n_u16(8 - filter_offset);
295   const uint16x4_t f1 = vdup_n_u16(filter_offset);
296 
297   int i = dst_height;
298   do {
299     uint16x4_t s0 = load_unaligned_u16(src_ptr);
300     uint16x4_t s1 = load_unaligned_u16(src_ptr + pixel_step);
301     uint16x4_t p = vld1_u16(second_pred);
302 
303     uint16x4_t blend = vmul_u16(s0, f0);
304     blend = vmla_u16(blend, s1, f1);
305     blend = vrshr_n_u16(blend, 3);
306 
307     vst1_u16(dst_ptr, vrhadd_u16(blend, p));
308 
309     src_ptr += src_stride;
310     dst_ptr += 4;
311     second_pred += 4;
312   } while (--i != 0);
313 }
314 
315 // Combine bilinear filter with vpx_highbd_comp_avg_pred for large blocks.
highbd_avg_pred_var_filter_block2d_bil_large(const uint16_t * src_ptr,uint16_t * dst_ptr,int src_stride,int pixel_step,int dst_width,int dst_height,int filter_offset,const uint16_t * second_pred)316 static void highbd_avg_pred_var_filter_block2d_bil_large(
317     const uint16_t *src_ptr, uint16_t *dst_ptr, int src_stride, int pixel_step,
318     int dst_width, int dst_height, int filter_offset,
319     const uint16_t *second_pred) {
320   const uint16x8_t f0 = vdupq_n_u16(8 - filter_offset);
321   const uint16x8_t f1 = vdupq_n_u16(filter_offset);
322 
323   int i = dst_height;
324   do {
325     int j = 0;
326     do {
327       uint16x8_t s0 = vld1q_u16(src_ptr + j);
328       uint16x8_t s1 = vld1q_u16(src_ptr + j + pixel_step);
329       uint16x8_t p = vld1q_u16(second_pred);
330 
331       uint16x8_t blend = vmulq_u16(s0, f0);
332       blend = vmlaq_u16(blend, s1, f1);
333       blend = vrshrq_n_u16(blend, 3);
334 
335       vst1q_u16(dst_ptr + j, vrhaddq_u16(blend, p));
336 
337       j += 8;
338       second_pred += 8;
339     } while (j < dst_width);
340 
341     src_ptr += src_stride;
342     dst_ptr += dst_width;
343   } while (--i != 0);
344 }
345 
// Width-8 bilinear filter fused with the second_pred average. Forwards to
// highbd_avg_pred_var_filter_block2d_bil_large() with dst_width fixed at 8,
// giving the variance macros a per-width name they can select by
// token-pasting the block width.
static void highbd_avg_pred_var_filter_block2d_bil_w8(
    const uint16_t *src_ptr, uint16_t *dst_ptr, int src_stride, int pixel_step,
    int dst_height, int filter_offset, const uint16_t *second_pred) {
  highbd_avg_pred_var_filter_block2d_bil_large(src_ptr, dst_ptr, src_stride,
                                               pixel_step, 8, dst_height,
                                               filter_offset, second_pred);
}
// Width-16 bilinear filter fused with the second_pred average. Forwards to
// highbd_avg_pred_var_filter_block2d_bil_large() with dst_width fixed at 16,
// giving the variance macros a per-width name they can select by
// token-pasting the block width.
static void highbd_avg_pred_var_filter_block2d_bil_w16(
    const uint16_t *src_ptr, uint16_t *dst_ptr, int src_stride, int pixel_step,
    int dst_height, int filter_offset, const uint16_t *second_pred) {
  highbd_avg_pred_var_filter_block2d_bil_large(src_ptr, dst_ptr, src_stride,
                                               pixel_step, 16, dst_height,
                                               filter_offset, second_pred);
}
// Width-32 bilinear filter fused with the second_pred average. Forwards to
// highbd_avg_pred_var_filter_block2d_bil_large() with dst_width fixed at 32,
// giving the variance macros a per-width name they can select by
// token-pasting the block width.
static void highbd_avg_pred_var_filter_block2d_bil_w32(
    const uint16_t *src_ptr, uint16_t *dst_ptr, int src_stride, int pixel_step,
    int dst_height, int filter_offset, const uint16_t *second_pred) {
  highbd_avg_pred_var_filter_block2d_bil_large(src_ptr, dst_ptr, src_stride,
                                               pixel_step, 32, dst_height,
                                               filter_offset, second_pred);
}
// Width-64 bilinear filter fused with the second_pred average. Forwards to
// highbd_avg_pred_var_filter_block2d_bil_large() with dst_width fixed at 64,
// giving the variance macros a per-width name they can select by
// token-pasting the block width.
static void highbd_avg_pred_var_filter_block2d_bil_w64(
    const uint16_t *src_ptr, uint16_t *dst_ptr, int src_stride, int pixel_step,
    int dst_height, int filter_offset, const uint16_t *second_pred) {
  highbd_avg_pred_var_filter_block2d_bil_large(src_ptr, dst_ptr, src_stride,
                                               pixel_step, 64, dst_height,
                                               filter_offset, second_pred);
}
374 
375 // Combine averaging subpel filter with vpx_highbd_comp_avg_pred.
highbd_avg_pred_var_filter_block2d_avg(const uint16_t * src_ptr,uint16_t * dst_ptr,int src_stride,int pixel_step,int dst_width,int dst_height,const uint16_t * second_pred)376 static void highbd_avg_pred_var_filter_block2d_avg(
377     const uint16_t *src_ptr, uint16_t *dst_ptr, int src_stride, int pixel_step,
378     int dst_width, int dst_height, const uint16_t *second_pred) {
379   int i = dst_height;
380 
381   // We only specialize on the filter values for large block sizes (>= 16x16.)
382   assert(dst_width >= 16 && dst_width % 16 == 0);
383 
384   do {
385     int j = 0;
386     do {
387       uint16x8_t s0 = vld1q_u16(src_ptr + j);
388       uint16x8_t s1 = vld1q_u16(src_ptr + j + pixel_step);
389       uint16x8_t avg = vrhaddq_u16(s0, s1);
390 
391       uint16x8_t p = vld1q_u16(second_pred);
392       avg = vrhaddq_u16(avg, p);
393 
394       vst1q_u16(dst_ptr + j, avg);
395 
396       j += 8;
397       second_pred += 8;
398     } while (j < dst_width);
399 
400     src_ptr += src_stride;
401     dst_ptr += dst_width;
402   } while (--i != 0);
403 }
404 
405 // Implementation of vpx_highbd_comp_avg_pred for blocks having width >= 16.
highbd_avg_pred(const uint16_t * src_ptr,uint16_t * dst_ptr,int src_stride,int dst_width,int dst_height,const uint16_t * second_pred)406 static void highbd_avg_pred(const uint16_t *src_ptr, uint16_t *dst_ptr,
407                             int src_stride, int dst_width, int dst_height,
408                             const uint16_t *second_pred) {
409   int i = dst_height;
410 
411   // We only specialize on the filter values for large block sizes (>= 16x16.)
412   assert(dst_width >= 16 && dst_width % 16 == 0);
413 
414   do {
415     int j = 0;
416     do {
417       uint16x8_t s = vld1q_u16(src_ptr + j);
418       uint16x8_t p = vld1q_u16(second_pred);
419 
420       uint16x8_t avg = vrhaddq_u16(s, p);
421 
422       vst1q_u16(dst_ptr + j, avg);
423 
424       j += 8;
425       second_pred += 8;
426     } while (j < dst_width);
427 
428     src_ptr += src_stride;
429     dst_ptr += dst_width;
430   } while (--i != 0);
431 }
432 
// Defines vpx_highbd_<bitdepth>_sub_pixel_avg_variance<w>x<h>_neon().
//
// The generated function always runs both filter passes:
//   1. highbd_var_filter_block2d_bil_w<w>() horizontally (pixel_step 1, weight
//      xoffset) over h + 1 source rows into tmp0 - the extra row is the second
//      tap of the last output row of the vertical pass;
//   2. highbd_avg_pred_var_filter_block2d_bil_w<w>() vertically (pixel_step w,
//      weight yoffset) over tmp0 into tmp1, which also averages each filtered
//      pixel with second_pred.
// The resulting block is then passed, with a stride of w, to
// vpx_highbd_<bitdepth>_variance<w>x<h>() along with ref, ref_stride and sse,
// and that function's return value is returned unchanged.
//
// src and second_pred are converted with CONVERT_TO_SHORTPTR before use and
// tmp1 is wrapped with CONVERT_TO_BYTEPTR before being handed on; both macros
// are defined outside this file.
#define HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(bitdepth, w, h)                       \
  uint32_t vpx_highbd_##bitdepth##_sub_pixel_avg_variance##w##x##h##_neon(     \
      const uint8_t *src, int src_stride, int xoffset, int yoffset,            \
      const uint8_t *ref, int ref_stride, uint32_t *sse,                       \
      const uint8_t *second_pred) {                                            \
    uint16_t tmp0[w * (h + 1)];                                                \
    uint16_t tmp1[w * h];                                                      \
    uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src);                              \
                                                                               \
    highbd_var_filter_block2d_bil_w##w(src_ptr, tmp0, src_stride, 1, (h + 1),  \
                                       xoffset);                               \
    highbd_avg_pred_var_filter_block2d_bil_w##w(                               \
        tmp0, tmp1, w, w, h, yoffset, CONVERT_TO_SHORTPTR(second_pred));       \
                                                                               \
    return vpx_highbd_##bitdepth##_variance##w##x##h(CONVERT_TO_BYTEPTR(tmp1), \
                                                     w, ref, ref_stride, sse); \
  }
450 
// Defines vpx_highbd_<bitdepth>_sub_pixel_avg_variance<w>x<h>_neon() with fast
// paths chosen from the sub-pixel offsets. The averaging helpers it relies on
// require w to be a multiple of 16.
//
// For each direction (xoffset horizontally, yoffset vertically):
//   offset == 0  no filtering is needed in that direction, so the pass is
//                skipped;
//   offset == 4  both bilinear weights are equal, so an averaging helper is
//                used;
//   otherwise    a general bilinear helper is used.
//
// The average with second_pred is always fused into the last pass that runs
// (the highbd_avg_pred_* helpers); when both offsets are 0, highbd_avg_pred()
// performs it on its own. When a vertical pass follows a horizontal one, the
// horizontal pass produces h + 1 rows into tmp0 (the extra row is the second
// tap of the last output row) and the vertical pass reduces them to h rows in
// tmp1.
//
// The resulting block is passed, with a stride of w, to
// vpx_highbd_<bitdepth>_variance<w>x<h>() along with ref, ref_stride and sse,
// and that function's return value is returned unchanged.
//
// src and second_pred are converted with CONVERT_TO_SHORTPTR before use and
// buffers are wrapped with CONVERT_TO_BYTEPTR before being handed on; both
// macros are defined outside this file.
#define HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(bitdepth, w, h)           \
  unsigned int vpx_highbd_##bitdepth##_sub_pixel_avg_variance##w##x##h##_neon( \
      const uint8_t *src, int source_stride, int xoffset, int yoffset,         \
      const uint8_t *ref, int ref_stride, unsigned int *sse,                   \
      const uint8_t *second_pred) {                                            \
    uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src);                              \
                                                                               \
    if (xoffset == 0) {                                                        \
      uint16_t tmp[w * h];                                                     \
      if (yoffset == 0) {                                                      \
        highbd_avg_pred(src_ptr, tmp, source_stride, w, h,                     \
                        CONVERT_TO_SHORTPTR(second_pred));                     \
        return vpx_highbd_##bitdepth##_variance##w##x##h(                      \
            CONVERT_TO_BYTEPTR(tmp), w, ref, ref_stride, sse);                 \
      } else if (yoffset == 4) {                                               \
        highbd_avg_pred_var_filter_block2d_avg(                                \
            src_ptr, tmp, source_stride, source_stride, w, h,                  \
            CONVERT_TO_SHORTPTR(second_pred));                                 \
        return vpx_highbd_##bitdepth##_variance##w##x##h(                      \
            CONVERT_TO_BYTEPTR(tmp), w, ref, ref_stride, sse);                 \
      } else {                                                                 \
        highbd_avg_pred_var_filter_block2d_bil_w##w(                           \
            src_ptr, tmp, source_stride, source_stride, h, yoffset,            \
            CONVERT_TO_SHORTPTR(second_pred));                                 \
        return vpx_highbd_##bitdepth##_variance##w##x##h(                      \
            CONVERT_TO_BYTEPTR(tmp), w, ref, ref_stride, sse);                 \
      }                                                                        \
    } else if (xoffset == 4) {                                                 \
      uint16_t tmp0[w * (h + 1)];                                              \
      if (yoffset == 0) {                                                      \
        highbd_avg_pred_var_filter_block2d_avg(                                \
            src_ptr, tmp0, source_stride, 1, w, h,                             \
            CONVERT_TO_SHORTPTR(second_pred));                                 \
        return vpx_highbd_##bitdepth##_variance##w##x##h(                      \
            CONVERT_TO_BYTEPTR(tmp0), w, ref, ref_stride, sse);                \
      } else if (yoffset == 4) {                                               \
        uint16_t tmp1[w * h];                                                  \
        highbd_var_filter_block2d_avg(src_ptr, tmp0, source_stride, 1, w,      \
                                      (h + 1));                                \
        highbd_avg_pred_var_filter_block2d_avg(                                \
            tmp0, tmp1, w, w, w, h, CONVERT_TO_SHORTPTR(second_pred));         \
        return vpx_highbd_##bitdepth##_variance##w##x##h(                      \
            CONVERT_TO_BYTEPTR(tmp1), w, ref, ref_stride, sse);                \
      } else {                                                                 \
        uint16_t tmp1[w * h];                                                  \
        highbd_var_filter_block2d_avg(src_ptr, tmp0, source_stride, 1, w,      \
                                      (h + 1));                                \
        highbd_avg_pred_var_filter_block2d_bil_w##w(                           \
            tmp0, tmp1, w, w, h, yoffset, CONVERT_TO_SHORTPTR(second_pred));   \
        return vpx_highbd_##bitdepth##_variance##w##x##h(                      \
            CONVERT_TO_BYTEPTR(tmp1), w, ref, ref_stride, sse);                \
      }                                                                        \
    } else {                                                                   \
      uint16_t tmp0[w * (h + 1)];                                              \
      if (yoffset == 0) {                                                      \
        highbd_avg_pred_var_filter_block2d_bil_w##w(                           \
            src_ptr, tmp0, source_stride, 1, h, xoffset,                       \
            CONVERT_TO_SHORTPTR(second_pred));                                 \
        return vpx_highbd_##bitdepth##_variance##w##x##h(                      \
            CONVERT_TO_BYTEPTR(tmp0), w, ref, ref_stride, sse);                \
      } else if (yoffset == 4) {                                               \
        uint16_t tmp1[w * h];                                                  \
        highbd_var_filter_block2d_bil_w##w(src_ptr, tmp0, source_stride, 1,    \
                                           (h + 1), xoffset);                  \
        highbd_avg_pred_var_filter_block2d_avg(                                \
            tmp0, tmp1, w, w, w, h, CONVERT_TO_SHORTPTR(second_pred));         \
        return vpx_highbd_##bitdepth##_variance##w##x##h(                      \
            CONVERT_TO_BYTEPTR(tmp1), w, ref, ref_stride, sse);                \
      } else {                                                                 \
        uint16_t tmp1[w * h];                                                  \
        highbd_var_filter_block2d_bil_w##w(src_ptr, tmp0, source_stride, 1,    \
                                           (h + 1), xoffset);                  \
        highbd_avg_pred_var_filter_block2d_bil_w##w(                           \
            tmp0, tmp1, w, w, h, yoffset, CONVERT_TO_SHORTPTR(second_pred));   \
        return vpx_highbd_##bitdepth##_variance##w##x##h(                      \
            CONVERT_TO_BYTEPTR(tmp1), w, ref, ref_stride, sse);                \
      }                                                                        \
    }                                                                          \
  }
530 
531 // 8-bit
532 HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 4, 4)
533 HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 4, 8)
534 
535 HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 8, 4)
536 HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 8, 8)
537 HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 8, 16)
538 
539 HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 16, 8)
540 HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 16, 16)
541 HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 16, 32)
542 
543 HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 32, 16)
544 HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 32, 32)
545 HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 32, 64)
546 
547 HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 64, 32)
548 HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 64, 64)
549 
550 // 10-bit
551 HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 4, 4)
552 HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 4, 8)
553 
554 HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 8, 4)
555 HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 8, 8)
556 HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 8, 16)
557 
558 HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 16, 8)
559 HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 16, 16)
560 HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 16, 32)
561 
562 HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 32, 16)
563 HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 32, 32)
564 HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 32, 64)
565 
566 HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 64, 32)
567 HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 64, 64)
568 
569 // 12-bit
570 HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 4, 4)
571 HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 4, 8)
572 
573 HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 8, 4)
574 HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 8, 8)
575 HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 8, 16)
576 
577 HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 16, 8)
578 HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 16, 16)
579 HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 16, 32)
580 
581 HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 32, 16)
582 HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 32, 32)
583 HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 32, 64)
584 
585 HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 64, 32)
586 HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 64, 64)
587