xref: /aosp_15_r20/external/libaom/aom_dsp/arm/subpel_variance_neon.c (revision 77c1e3ccc04c968bd2bc212e87364f250e820521)
1 /*
2  * Copyright (c) 2016, Alliance for Open Media. All rights reserved.
3  *
4  * This source code is subject to the terms of the BSD 2 Clause License and
5  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
6  * was not distributed with this source code in the LICENSE file, you can
7  * obtain it at www.aomedia.org/license/software. If the Alliance for Open
8  * Media Patent License 1.0 was not distributed with this source code in the
9  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
10  */
11 
12 #include <arm_neon.h>
13 
14 #include "config/aom_dsp_rtcd.h"
15 #include "config/aom_config.h"
16 
17 #include "aom_ports/mem.h"
18 #include "aom/aom_integer.h"
19 
20 #include "aom_dsp/variance.h"
21 #include "aom_dsp/arm/dist_wtd_avg_neon.h"
22 #include "aom_dsp/arm/mem_neon.h"
23 
// Two-tap bilinear filter for 4-pixel-wide blocks.
//
// Each output byte is (s0 * (8 - filter_offset) + s1 * filter_offset + 4) >> 3,
// where s1 is the sample pixel_step bytes after s0.
// One iteration consumes two source rows and stores eight output bytes, so
// dst_height has to be a positive even number: the counter drops by two and
// the loop stops only when it hits exactly zero.
// NOTE(review): load_unaligned_u8() is declared elsewhere (mem_neon.h);
// presumably it packs four bytes from each of two rows src_stride apart --
// confirm against that header.
static void var_filter_block2d_bil_w4(const uint8_t *src_ptr, uint8_t *dst_ptr,
                                      int src_stride, int pixel_step,
                                      int dst_height, int filter_offset) {
  const uint8x8_t w0 = vdup_n_u8(8 - filter_offset);
  const uint8x8_t w1 = vdup_n_u8(filter_offset);
  int rows_left = dst_height;

  do {
    const uint8x8_t a = load_unaligned_u8(src_ptr, src_stride);
    const uint8x8_t b = load_unaligned_u8(src_ptr + pixel_step, src_stride);
    // Widening multiply-accumulate, then rounding narrow back to 8 bits.
    const uint16x8_t acc = vmlal_u8(vmull_u8(a, w0), b, w1);
    vst1_u8(dst_ptr, vrshrn_n_u16(acc, 3));

    src_ptr += 2 * src_stride;
    dst_ptr += 8;
    rows_left -= 2;
  } while (rows_left != 0);
}
44 
// Two-tap bilinear filter for 8-pixel-wide blocks.
//
// Processes one row of eight pixels per iteration and writes the result
// contiguously (eight bytes per row) to dst_ptr. Each output byte is
// (s0 * (8 - filter_offset) + s1 * filter_offset + 4) >> 3, where s1 is the
// sample pixel_step bytes after s0. dst_height must be positive: the body runs
// before the counter is tested.
static void var_filter_block2d_bil_w8(const uint8_t *src_ptr, uint8_t *dst_ptr,
                                      int src_stride, int pixel_step,
                                      int dst_height, int filter_offset) {
  const uint8x8_t w0 = vdup_n_u8(8 - filter_offset);
  const uint8x8_t w1 = vdup_n_u8(filter_offset);
  int rows_left = dst_height;

  do {
    const uint8x8_t a = vld1_u8(src_ptr);
    const uint8x8_t b = vld1_u8(src_ptr + pixel_step);
    // Widening multiply-accumulate, then rounding narrow back to 8 bits.
    const uint16x8_t acc = vmlal_u8(vmull_u8(a, w0), b, w1);
    vst1_u8(dst_ptr, vrshrn_n_u16(acc, 3));

    src_ptr += src_stride;
    dst_ptr += 8;
    rows_left--;
  } while (rows_left != 0);
}
64 
// Two-tap bilinear filter for wide blocks, processed 16 pixels at a time.
//
// Each output byte is (s0 * (8 - filter_offset) + s1 * filter_offset + 4) >> 3,
// where s1 is the sample pixel_step bytes after s0. Output rows are written
// back to back with a stride of dst_width. The column loop always handles a
// full 16-byte vector, so dst_width is expected to be a multiple of 16, and
// dst_height must be positive (both loops test their counter after the body).
static void var_filter_block2d_bil_large(const uint8_t *src_ptr,
                                         uint8_t *dst_ptr, int src_stride,
                                         int pixel_step, int dst_width,
                                         int dst_height, int filter_offset) {
  const uint8x8_t w0 = vdup_n_u8(8 - filter_offset);
  const uint8x8_t w1 = vdup_n_u8(filter_offset);
  int rows_left = dst_height;

  do {
    const uint8_t *in = src_ptr;
    uint8_t *out = dst_ptr;
    int cols_left = dst_width;

    do {
      const uint8x16_t a = vld1q_u8(in);
      const uint8x16_t b = vld1q_u8(in + pixel_step);
      // Filter the low and high halves separately in 16-bit precision.
      const uint16x8_t lo =
          vmlal_u8(vmull_u8(vget_low_u8(a), w0), vget_low_u8(b), w1);
      const uint16x8_t hi =
          vmlal_u8(vmull_u8(vget_high_u8(a), w0), vget_high_u8(b), w1);
      vst1q_u8(out, vcombine_u8(vrshrn_n_u16(lo, 3), vrshrn_n_u16(hi, 3)));

      in += 16;
      out += 16;
      cols_left -= 16;
    } while (cols_left > 0);

    src_ptr += src_stride;
    dst_ptr += dst_width;
    rows_left--;
  } while (rows_left != 0);
}
93 
// Bilinear filter for 16-pixel-wide blocks: forwards to the wide-block kernel
// with the width fixed at 16.
static void var_filter_block2d_bil_w16(const uint8_t *src_ptr, uint8_t *dst_ptr,
                                       int src_stride, int pixel_step,
                                       int dst_height, int filter_offset) {
  const int dst_width = 16;
  var_filter_block2d_bil_large(src_ptr, dst_ptr, src_stride, pixel_step,
                               dst_width, dst_height, filter_offset);
}
100 
// Bilinear filter for 32-pixel-wide blocks: forwards to the wide-block kernel
// with the width fixed at 32.
static void var_filter_block2d_bil_w32(const uint8_t *src_ptr, uint8_t *dst_ptr,
                                       int src_stride, int pixel_step,
                                       int dst_height, int filter_offset) {
  const int dst_width = 32;
  var_filter_block2d_bil_large(src_ptr, dst_ptr, src_stride, pixel_step,
                               dst_width, dst_height, filter_offset);
}
107 
// Bilinear filter for 64-pixel-wide blocks: forwards to the wide-block kernel
// with the width fixed at 64.
static void var_filter_block2d_bil_w64(const uint8_t *src_ptr, uint8_t *dst_ptr,
                                       int src_stride, int pixel_step,
                                       int dst_height, int filter_offset) {
  const int dst_width = 64;
  var_filter_block2d_bil_large(src_ptr, dst_ptr, src_stride, pixel_step,
                               dst_width, dst_height, filter_offset);
}
114 
// Bilinear filter for 128-pixel-wide blocks: forwards to the wide-block kernel
// with the width fixed at 128.
static void var_filter_block2d_bil_w128(const uint8_t *src_ptr,
                                        uint8_t *dst_ptr, int src_stride,
                                        int pixel_step, int dst_height,
                                        int filter_offset) {
  const int dst_width = 128;
  var_filter_block2d_bil_large(src_ptr, dst_ptr, src_stride, pixel_step,
                               dst_width, dst_height, filter_offset);
}
122 
// Half-sample filter: rounding average of each pixel with the one pixel_step
// bytes after it, i.e. (s0 + s1 + 1) >> 1, 16 pixels at a time.
//
// Output rows are stored back to back with a stride of dst_width. The width
// must be a positive multiple of 16 (asserted), since this path is only used
// for the larger block sizes. dst_height must be positive.
static void var_filter_block2d_avg(const uint8_t *src_ptr, uint8_t *dst_ptr,
                                   int src_stride, int pixel_step,
                                   int dst_width, int dst_height) {
  // We only specialise on the filter values for large block sizes (>= 16x16.)
  assert(dst_width >= 16 && dst_width % 16 == 0);

  int rows_left = dst_height;
  do {
    const uint8_t *in = src_ptr;
    uint8_t *out = dst_ptr;
    int cols_left = dst_width;

    do {
      const uint8x16_t a = vld1q_u8(in);
      const uint8x16_t b = vld1q_u8(in + pixel_step);
      vst1q_u8(out, vrhaddq_u8(a, b));

      in += 16;
      out += 16;
      cols_left -= 16;
    } while (cols_left > 0);

    src_ptr += src_stride;
    dst_ptr += dst_width;
    rows_left--;
  } while (rows_left != 0);
}
145 
// Defines aom_sub_pixel_variance<w>x<h>_neon() using the generic two-pass
// bilinear filter.
//
// Pass 1 filters horizontally (pixel_step 1) with xoffset and produces
// h + padding rows; pass 2 filters those rows vertically (pixel_step w) with
// yoffset down to h rows. The filtered block is then handed to
// aom_variance<w>x<h>(), whose result is returned and which receives sse.
#define SUBPEL_VARIANCE_WXH_NEON(w, h, padding)                            \
  unsigned int aom_sub_pixel_variance##w##x##h##_neon(                     \
      const uint8_t *src, int src_stride, int xoffset, int yoffset,        \
      const uint8_t *ref, int ref_stride, uint32_t *sse) {                 \
    uint8_t horiz[w * (h + padding)];                                      \
    uint8_t filtered[w * h];                                               \
    var_filter_block2d_bil_w##w(src, horiz, src_stride, 1, (h + padding),  \
                                xoffset);                                  \
    var_filter_block2d_bil_w##w(horiz, filtered, w, w, h, yoffset);        \
    return aom_variance##w##x##h(filtered, w, ref, ref_stride, sse);       \
  }
157 
// Defines aom_sub_pixel_variance<w>x<h>_neon() with fast paths for the
// offsets 0 (no filtering in that direction) and 4 (half-sample, handled by
// the rounding-average kernel). Any other offset uses the bilinear kernel.
//
// The horizontal pass writes h + padding rows into tmp0 whenever a vertical
// pass follows, because the vertical pass reads one row below each output
// row. The vertical pass only ever writes w * h bytes, so every second-stage
// buffer is sized w * h.
// var_filter_block2d_avg() asserts a width that is a multiple of 16, so this
// macro is only suitable for w >= 16.
#define SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(w, h, padding)                  \
  unsigned int aom_sub_pixel_variance##w##x##h##_neon(                       \
      const uint8_t *src, int src_stride, int xoffset, int yoffset,          \
      const uint8_t *ref, int ref_stride, unsigned int *sse) {               \
    if (xoffset == 0) {                                                      \
      if (yoffset == 0) {                                                    \
        return aom_variance##w##x##h(src, src_stride, ref, ref_stride, sse); \
      } else if (yoffset == 4) {                                             \
        uint8_t tmp[w * h];                                                  \
        var_filter_block2d_avg(src, tmp, src_stride, src_stride, w, h);      \
        return aom_variance##w##x##h(tmp, w, ref, ref_stride, sse);          \
      } else {                                                               \
        uint8_t tmp[w * h];                                                  \
        var_filter_block2d_bil_w##w(src, tmp, src_stride, src_stride, h,     \
                                    yoffset);                                \
        return aom_variance##w##x##h(tmp, w, ref, ref_stride, sse);          \
      }                                                                      \
    } else if (xoffset == 4) {                                               \
      uint8_t tmp0[w * (h + padding)];                                       \
      if (yoffset == 0) {                                                    \
        var_filter_block2d_avg(src, tmp0, src_stride, 1, w, h);              \
        return aom_variance##w##x##h(tmp0, w, ref, ref_stride, sse);         \
      } else if (yoffset == 4) {                                             \
        uint8_t tmp1[w * h];                                                 \
        var_filter_block2d_avg(src, tmp0, src_stride, 1, w, (h + padding));  \
        var_filter_block2d_avg(tmp0, tmp1, w, w, w, h);                      \
        return aom_variance##w##x##h(tmp1, w, ref, ref_stride, sse);         \
      } else {                                                               \
        uint8_t tmp1[w * h];                                                 \
        var_filter_block2d_avg(src, tmp0, src_stride, 1, w, (h + padding));  \
        var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset);           \
        return aom_variance##w##x##h(tmp1, w, ref, ref_stride, sse);         \
      }                                                                      \
    } else {                                                                 \
      uint8_t tmp0[w * (h + padding)];                                       \
      if (yoffset == 0) {                                                    \
        var_filter_block2d_bil_w##w(src, tmp0, src_stride, 1, h, xoffset);   \
        return aom_variance##w##x##h(tmp0, w, ref, ref_stride, sse);         \
      } else if (yoffset == 4) {                                             \
        uint8_t tmp1[w * h];                                                 \
        var_filter_block2d_bil_w##w(src, tmp0, src_stride, 1, (h + padding), \
                                    xoffset);                                \
        var_filter_block2d_avg(tmp0, tmp1, w, w, w, h);                      \
        return aom_variance##w##x##h(tmp1, w, ref, ref_stride, sse);         \
      } else {                                                               \
        uint8_t tmp1[w * h];                                                 \
        var_filter_block2d_bil_w##w(src, tmp0, src_stride, 1, (h + padding), \
                                    xoffset);                                \
        var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset);           \
        return aom_variance##w##x##h(tmp1, w, ref, ref_stride, sse);         \
      }                                                                      \
    }                                                                        \
  }
211 
// Width 4: two padding rows, since the width-4 kernel filters rows in pairs
// and therefore needs an even row count.
SUBPEL_VARIANCE_WXH_NEON(4, 4, 2)
SUBPEL_VARIANCE_WXH_NEON(4, 8, 2)

// Width 8: generic two-pass path, one padding row.
SUBPEL_VARIANCE_WXH_NEON(8, 4, 1)
SUBPEL_VARIANCE_WXH_NEON(8, 8, 1)
SUBPEL_VARIANCE_WXH_NEON(8, 16, 1)

// Width 16: 16x8 stays on the generic path, taller blocks get the fast paths.
SUBPEL_VARIANCE_WXH_NEON(16, 8, 1)
SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(16, 16, 1)
SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(16, 32, 1)

// Width 32.
SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(32, 16, 1)
SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(32, 32, 1)
SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(32, 64, 1)

// Width 64.
SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(64, 32, 1)
SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(64, 64, 1)
SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(64, 128, 1)

// Width 128.
SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(128, 64, 1)
SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(128, 128, 1)

// Realtime mode doesn't use 4x rectangular blocks.
#if !CONFIG_REALTIME_ONLY

SUBPEL_VARIANCE_WXH_NEON(4, 16, 2)

SUBPEL_VARIANCE_WXH_NEON(8, 32, 1)

SUBPEL_VARIANCE_WXH_NEON(16, 4, 1)
SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(16, 64, 1)

SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(32, 8, 1)

SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(64, 16, 1)

#endif  // !CONFIG_REALTIME_ONLY

// The generator macros are not needed past this point.
#undef SUBPEL_VARIANCE_WXH_NEON
#undef SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON
252 
// Bilinear filter fused with the aom_comp_avg_pred step, for 4-wide blocks.
//
// The filtered value (s0 * (8 - filter_offset) + s1 * filter_offset + 4) >> 3
// is rounding-averaged with the matching byte of second_pred. Two rows (eight
// bytes) of output and of second_pred are consumed per iteration, so
// dst_height must be a positive even number: the loop stops only when the
// counter reaches exactly zero.
// NOTE(review): load_unaligned_u8() is declared elsewhere (mem_neon.h);
// presumably it packs four bytes from each of two rows -- confirm there.
static void avg_pred_var_filter_block2d_bil_w4(const uint8_t *src_ptr,
                                               uint8_t *dst_ptr, int src_stride,
                                               int pixel_step, int dst_height,
                                               int filter_offset,
                                               const uint8_t *second_pred) {
  const uint8x8_t w0 = vdup_n_u8(8 - filter_offset);
  const uint8x8_t w1 = vdup_n_u8(filter_offset);
  int rows_left = dst_height;

  do {
    const uint8x8_t a = load_unaligned_u8(src_ptr, src_stride);
    const uint8x8_t b = load_unaligned_u8(src_ptr + pixel_step, src_stride);
    const uint8x8_t pred = vld1_u8(second_pred);
    const uint16x8_t acc = vmlal_u8(vmull_u8(a, w0), b, w1);
    const uint8x8_t filtered = vrshrn_n_u16(acc, 3);

    vst1_u8(dst_ptr, vrhadd_u8(filtered, pred));

    src_ptr += 2 * src_stride;
    dst_ptr += 8;
    second_pred += 8;
    rows_left -= 2;
  } while (rows_left != 0);
}
281 
// Bilinear filter fused with the aom_dist_wtd_comp_avg_pred step, for 4-wide
// blocks.
//
// The filtered value (s0 * (8 - filter_offset) + s1 * filter_offset + 4) >> 3
// is combined with second_pred by dist_wtd_avg_u8x8(), using the fwd_offset
// and bck_offset weights taken from jcp_param. Two rows (eight bytes) are
// produced per iteration, so dst_height must be a positive even number.
static void dist_wtd_avg_pred_var_filter_block2d_bil_w4(
    const uint8_t *src_ptr, uint8_t *dst_ptr, int src_stride, int pixel_step,
    int dst_height, int filter_offset, const uint8_t *second_pred,
    const DIST_WTD_COMP_PARAMS *jcp_param) {
  const uint8x8_t w0 = vdup_n_u8(8 - filter_offset);
  const uint8x8_t w1 = vdup_n_u8(filter_offset);
  const uint8x8_t fwd = vdup_n_u8(jcp_param->fwd_offset);
  const uint8x8_t bck = vdup_n_u8(jcp_param->bck_offset);
  int rows_left = dst_height;

  do {
    const uint8x8_t a = load_unaligned_u8(src_ptr, src_stride);
    const uint8x8_t b = load_unaligned_u8(src_ptr + pixel_step, src_stride);
    const uint8x8_t pred = vld1_u8(second_pred);
    const uint16x8_t acc = vmlal_u8(vmull_u8(a, w0), b, w1);
    const uint8x8_t filtered = vrshrn_n_u16(acc, 3);

    vst1_u8(dst_ptr, dist_wtd_avg_u8x8(filtered, pred, fwd, bck));

    src_ptr += 2 * src_stride;
    dst_ptr += 8;
    second_pred += 8;
    rows_left -= 2;
  } while (rows_left != 0);
}
311 
// Bilinear filter fused with the aom_comp_avg_pred step, for 8-wide blocks.
//
// Per row: the filtered value
// (s0 * (8 - filter_offset) + s1 * filter_offset + 4) >> 3 is rounding-averaged
// with the matching byte of second_pred. Both dst_ptr and second_pred are
// walked contiguously, eight bytes per row. The body always runs at least
// once.
static void avg_pred_var_filter_block2d_bil_w8(const uint8_t *src_ptr,
                                               uint8_t *dst_ptr, int src_stride,
                                               int pixel_step, int dst_height,
                                               int filter_offset,
                                               const uint8_t *second_pred) {
  const uint8x8_t w0 = vdup_n_u8(8 - filter_offset);
  const uint8x8_t w1 = vdup_n_u8(filter_offset);
  int rows_left = dst_height;

  do {
    const uint8x8_t a = vld1_u8(src_ptr);
    const uint8x8_t b = vld1_u8(src_ptr + pixel_step);
    const uint8x8_t pred = vld1_u8(second_pred);
    const uint16x8_t acc = vmlal_u8(vmull_u8(a, w0), b, w1);
    const uint8x8_t filtered = vrshrn_n_u16(acc, 3);

    vst1_u8(dst_ptr, vrhadd_u8(filtered, pred));

    src_ptr += src_stride;
    dst_ptr += 8;
    second_pred += 8;
    rows_left--;
  } while (rows_left > 0);
}
339 
// Bilinear filter fused with the aom_dist_wtd_comp_avg_pred step, for 8-wide
// blocks.
//
// Per row: the filtered value
// (s0 * (8 - filter_offset) + s1 * filter_offset + 4) >> 3 is combined with
// second_pred by dist_wtd_avg_u8x8(), using the fwd_offset and bck_offset
// weights taken from jcp_param. dst_ptr and second_pred are both walked
// contiguously, eight bytes per row.
static void dist_wtd_avg_pred_var_filter_block2d_bil_w8(
    const uint8_t *src_ptr, uint8_t *dst_ptr, int src_stride, int pixel_step,
    int dst_height, int filter_offset, const uint8_t *second_pred,
    const DIST_WTD_COMP_PARAMS *jcp_param) {
  const uint8x8_t w0 = vdup_n_u8(8 - filter_offset);
  const uint8x8_t w1 = vdup_n_u8(filter_offset);
  const uint8x8_t fwd = vdup_n_u8(jcp_param->fwd_offset);
  const uint8x8_t bck = vdup_n_u8(jcp_param->bck_offset);
  int rows_left = dst_height;

  do {
    const uint8x8_t a = vld1_u8(src_ptr);
    const uint8x8_t b = vld1_u8(src_ptr + pixel_step);
    const uint8x8_t pred = vld1_u8(second_pred);
    const uint16x8_t acc = vmlal_u8(vmull_u8(a, w0), b, w1);
    const uint8x8_t filtered = vrshrn_n_u16(acc, 3);

    vst1_u8(dst_ptr, dist_wtd_avg_u8x8(filtered, pred, fwd, bck));

    src_ptr += src_stride;
    dst_ptr += 8;
    second_pred += 8;
    rows_left--;
  } while (rows_left > 0);
}
368 
// Bilinear filter fused with the aom_comp_avg_pred step, for wide blocks
// processed 16 pixels at a time.
//
// The filtered value (s0 * (8 - filter_offset) + s1 * filter_offset + 4) >> 3
// is rounding-averaged with second_pred. second_pred is read linearly (16
// bytes per vector, never rewound), so it is treated as a contiguous
// dst_width x dst_height buffer; output rows use a stride of dst_width.
static void avg_pred_var_filter_block2d_bil_large(
    const uint8_t *src_ptr, uint8_t *dst_ptr, int src_stride, int pixel_step,
    int dst_width, int dst_height, int filter_offset,
    const uint8_t *second_pred) {
  const uint8x8_t w0 = vdup_n_u8(8 - filter_offset);
  const uint8x8_t w1 = vdup_n_u8(filter_offset);
  int rows_left = dst_height;

  do {
    const uint8_t *in = src_ptr;
    uint8_t *out = dst_ptr;
    int cols_left = dst_width;

    do {
      const uint8x16_t a = vld1q_u8(in);
      const uint8x16_t b = vld1q_u8(in + pixel_step);
      const uint8x16_t pred = vld1q_u8(second_pred);
      const uint16x8_t lo =
          vmlal_u8(vmull_u8(vget_low_u8(a), w0), vget_low_u8(b), w1);
      const uint16x8_t hi =
          vmlal_u8(vmull_u8(vget_high_u8(a), w0), vget_high_u8(b), w1);
      const uint8x16_t filtered =
          vcombine_u8(vrshrn_n_u16(lo, 3), vrshrn_n_u16(hi, 3));

      vst1q_u8(out, vrhaddq_u8(filtered, pred));

      in += 16;
      out += 16;
      second_pred += 16;
      cols_left -= 16;
    } while (cols_left > 0);

    src_ptr += src_stride;
    dst_ptr += dst_width;
    rows_left--;
  } while (rows_left != 0);
}
403 
// Bilinear filter fused with the aom_dist_wtd_comp_avg_pred step, for wide
// blocks processed 16 pixels at a time.
//
// The filtered value (s0 * (8 - filter_offset) + s1 * filter_offset + 4) >> 3
// is combined with second_pred by dist_wtd_avg_u8x16(), using the fwd_offset
// and bck_offset weights taken from jcp_param. second_pred is read linearly
// (16 bytes per vector, never rewound); output rows use a stride of dst_width.
static void dist_wtd_avg_pred_var_filter_block2d_bil_large(
    const uint8_t *src_ptr, uint8_t *dst_ptr, int src_stride, int pixel_step,
    int dst_width, int dst_height, int filter_offset,
    const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS *jcp_param) {
  const uint8x8_t w0 = vdup_n_u8(8 - filter_offset);
  const uint8x8_t w1 = vdup_n_u8(filter_offset);
  const uint8x16_t fwd = vdupq_n_u8(jcp_param->fwd_offset);
  const uint8x16_t bck = vdupq_n_u8(jcp_param->bck_offset);
  int rows_left = dst_height;

  do {
    const uint8_t *in = src_ptr;
    uint8_t *out = dst_ptr;
    int cols_left = dst_width;

    do {
      const uint8x16_t a = vld1q_u8(in);
      const uint8x16_t b = vld1q_u8(in + pixel_step);
      const uint8x16_t pred = vld1q_u8(second_pred);
      const uint16x8_t lo =
          vmlal_u8(vmull_u8(vget_low_u8(a), w0), vget_low_u8(b), w1);
      const uint16x8_t hi =
          vmlal_u8(vmull_u8(vget_high_u8(a), w0), vget_high_u8(b), w1);
      const uint8x16_t filtered =
          vcombine_u8(vrshrn_n_u16(lo, 3), vrshrn_n_u16(hi, 3));

      vst1q_u8(out, dist_wtd_avg_u8x16(filtered, pred, fwd, bck));

      in += 16;
      out += 16;
      second_pred += 16;
      cols_left -= 16;
    } while (cols_left > 0);

    src_ptr += src_stride;
    dst_ptr += dst_width;
    rows_left--;
  } while (rows_left != 0);
}
440 
// Bilinear filter plus aom_comp_avg_pred for 16-pixel-wide blocks: forwards to
// the wide-block kernel with the width fixed at 16.
static void avg_pred_var_filter_block2d_bil_w16(
    const uint8_t *src_ptr, uint8_t *dst_ptr, int src_stride, int pixel_step,
    int dst_height, int filter_offset, const uint8_t *second_pred) {
  const int dst_width = 16;
  avg_pred_var_filter_block2d_bil_large(src_ptr, dst_ptr, src_stride,
                                        pixel_step, dst_width, dst_height,
                                        filter_offset, second_pred);
}
449 
// Bilinear filter plus aom_comp_avg_pred for 32-pixel-wide blocks: forwards to
// the wide-block kernel with the width fixed at 32.
static void avg_pred_var_filter_block2d_bil_w32(
    const uint8_t *src_ptr, uint8_t *dst_ptr, int src_stride, int pixel_step,
    int dst_height, int filter_offset, const uint8_t *second_pred) {
  const int dst_width = 32;
  avg_pred_var_filter_block2d_bil_large(src_ptr, dst_ptr, src_stride,
                                        pixel_step, dst_width, dst_height,
                                        filter_offset, second_pred);
}
458 
// Bilinear filter plus aom_comp_avg_pred for 64-pixel-wide blocks: forwards to
// the wide-block kernel with the width fixed at 64.
static void avg_pred_var_filter_block2d_bil_w64(
    const uint8_t *src_ptr, uint8_t *dst_ptr, int src_stride, int pixel_step,
    int dst_height, int filter_offset, const uint8_t *second_pred) {
  const int dst_width = 64;
  avg_pred_var_filter_block2d_bil_large(src_ptr, dst_ptr, src_stride,
                                        pixel_step, dst_width, dst_height,
                                        filter_offset, second_pred);
}
467 
// Bilinear filter plus aom_comp_avg_pred for 128-pixel-wide blocks: forwards
// to the wide-block kernel with the width fixed at 128.
static void avg_pred_var_filter_block2d_bil_w128(
    const uint8_t *src_ptr, uint8_t *dst_ptr, int src_stride, int pixel_step,
    int dst_height, int filter_offset, const uint8_t *second_pred) {
  const int dst_width = 128;
  avg_pred_var_filter_block2d_bil_large(src_ptr, dst_ptr, src_stride,
                                        pixel_step, dst_width, dst_height,
                                        filter_offset, second_pred);
}
476 
// Combine bilinear filter with aom_dist_wtd_comp_avg_pred for blocks having
// width 16. Forwards to the wide-block distance-weighted kernel with the width
// fixed at 16; jcp_param supplies the forward/backward weights used there.
static void dist_wtd_avg_pred_var_filter_block2d_bil_w16(
    const uint8_t *src_ptr, uint8_t *dst_ptr, int src_stride, int pixel_step,
    int dst_height, int filter_offset, const uint8_t *second_pred,
    const DIST_WTD_COMP_PARAMS *jcp_param) {
  const int dst_width = 16;
  dist_wtd_avg_pred_var_filter_block2d_bil_large(
      src_ptr, dst_ptr, src_stride, pixel_step, dst_width, dst_height,
      filter_offset, second_pred, jcp_param);
}
486 
// Combine bilinear filter with aom_dist_wtd_comp_avg_pred for blocks having
// width 32. Forwards to the wide-block distance-weighted kernel with the width
// fixed at 32; jcp_param supplies the forward/backward weights used there.
static void dist_wtd_avg_pred_var_filter_block2d_bil_w32(
    const uint8_t *src_ptr, uint8_t *dst_ptr, int src_stride, int pixel_step,
    int dst_height, int filter_offset, const uint8_t *second_pred,
    const DIST_WTD_COMP_PARAMS *jcp_param) {
  const int dst_width = 32;
  dist_wtd_avg_pred_var_filter_block2d_bil_large(
      src_ptr, dst_ptr, src_stride, pixel_step, dst_width, dst_height,
      filter_offset, second_pred, jcp_param);
}
496 
// Combine bilinear filter with aom_dist_wtd_comp_avg_pred for blocks having
// width 64. Forwards to the wide-block distance-weighted kernel with the width
// fixed at 64; jcp_param supplies the forward/backward weights used there.
static void dist_wtd_avg_pred_var_filter_block2d_bil_w64(
    const uint8_t *src_ptr, uint8_t *dst_ptr, int src_stride, int pixel_step,
    int dst_height, int filter_offset, const uint8_t *second_pred,
    const DIST_WTD_COMP_PARAMS *jcp_param) {
  const int dst_width = 64;
  dist_wtd_avg_pred_var_filter_block2d_bil_large(
      src_ptr, dst_ptr, src_stride, pixel_step, dst_width, dst_height,
      filter_offset, second_pred, jcp_param);
}
506 
// Combine bilinear filter with aom_dist_wtd_comp_avg_pred for blocks having
// width 128. Forwards to the wide-block distance-weighted kernel with the
// width fixed at 128; jcp_param supplies the forward/backward weights used
// there.
static void dist_wtd_avg_pred_var_filter_block2d_bil_w128(
    const uint8_t *src_ptr, uint8_t *dst_ptr, int src_stride, int pixel_step,
    int dst_height, int filter_offset, const uint8_t *second_pred,
    const DIST_WTD_COMP_PARAMS *jcp_param) {
  const int dst_width = 128;
  dist_wtd_avg_pred_var_filter_block2d_bil_large(
      src_ptr, dst_ptr, src_stride, pixel_step, dst_width, dst_height,
      filter_offset, second_pred, jcp_param);
}
516 
// Half-sample averaging filter fused with the aom_comp_avg_pred step.
//
// Each output byte is the rounding average of second_pred with the rounding
// average of s0 and s1 (s1 being pixel_step bytes after s0). second_pred is
// read linearly, 16 bytes per vector; output rows use a stride of dst_width.
// The width must be a positive multiple of 16 (asserted).
static void avg_pred_var_filter_block2d_avg(const uint8_t *src_ptr,
                                            uint8_t *dst_ptr, int src_stride,
                                            int pixel_step, int dst_width,
                                            int dst_height,
                                            const uint8_t *second_pred) {
  // We only specialise on the filter values for large block sizes (>= 16x16.)
  assert(dst_width >= 16 && dst_width % 16 == 0);

  int rows_left = dst_height;
  do {
    const uint8_t *in = src_ptr;
    uint8_t *out = dst_ptr;
    int cols_left = dst_width;

    do {
      const uint8x16_t a = vld1q_u8(in);
      const uint8x16_t b = vld1q_u8(in + pixel_step);
      const uint8x16_t pred = vld1q_u8(second_pred);
      const uint8x16_t filtered = vrhaddq_u8(a, b);

      vst1q_u8(out, vrhaddq_u8(filtered, pred));

      in += 16;
      out += 16;
      second_pred += 16;
      cols_left -= 16;
    } while (cols_left > 0);

    src_ptr += src_stride;
    dst_ptr += dst_width;
    rows_left--;
  } while (rows_left != 0);
}
547 
// Half-sample averaging filter fused with the aom_dist_wtd_comp_avg_pred step.
//
// The rounding average of s0 and s1 (s1 being pixel_step bytes after s0) is
// combined with second_pred by dist_wtd_avg_u8x16(), using the fwd_offset and
// bck_offset weights taken from jcp_param. second_pred is read linearly, 16
// bytes per vector; output rows use a stride of dst_width. The width must be a
// positive multiple of 16 (asserted).
static void dist_wtd_avg_pred_var_filter_block2d_avg(
    const uint8_t *src_ptr, uint8_t *dst_ptr, int src_stride, int pixel_step,
    int dst_width, int dst_height, const uint8_t *second_pred,
    const DIST_WTD_COMP_PARAMS *jcp_param) {
  // We only specialise on the filter values for large block sizes (>= 16x16.)
  assert(dst_width >= 16 && dst_width % 16 == 0);
  const uint8x16_t fwd = vdupq_n_u8(jcp_param->fwd_offset);
  const uint8x16_t bck = vdupq_n_u8(jcp_param->bck_offset);

  int rows_left = dst_height;
  do {
    const uint8_t *in = src_ptr;
    uint8_t *out = dst_ptr;
    int cols_left = dst_width;

    do {
      const uint8x16_t a = vld1q_u8(in);
      const uint8x16_t b = vld1q_u8(in + pixel_step);
      const uint8x16_t pred = vld1q_u8(second_pred);
      const uint8x16_t filtered = vrhaddq_u8(a, b);

      vst1q_u8(out, dist_wtd_avg_u8x16(filtered, pred, fwd, bck));

      in += 16;
      out += 16;
      second_pred += 16;
      cols_left -= 16;
    } while (cols_left > 0);

    src_ptr += src_stride;
    dst_ptr += dst_width;
    rows_left--;
  } while (rows_left != 0);
}
578 
// aom_comp_avg_pred equivalent for blocks having width >= 16: no sub-pixel
// filtering, just the rounding average (s + p + 1) >> 1 of the source with
// second_pred.
//
// second_pred is read linearly, 16 bytes per vector; output rows use a stride
// of dst_width. The width must be a positive multiple of 16 (asserted).
static void avg_pred(const uint8_t *src_ptr, uint8_t *dst_ptr, int src_stride,
                     int dst_width, int dst_height,
                     const uint8_t *second_pred) {
  // We only specialise on the filter values for large block sizes (>= 16x16.)
  assert(dst_width >= 16 && dst_width % 16 == 0);

  int rows_left = dst_height;
  do {
    const uint8_t *in = src_ptr;
    uint8_t *out = dst_ptr;
    int cols_left = dst_width;

    do {
      const uint8x16_t pix = vld1q_u8(in);
      const uint8x16_t pred = vld1q_u8(second_pred);

      vst1q_u8(out, vrhaddq_u8(pix, pred));

      in += 16;
      out += 16;
      second_pred += 16;
      cols_left -= 16;
    } while (cols_left > 0);

    src_ptr += src_stride;
    dst_ptr += dst_width;
    rows_left--;
  } while (rows_left != 0);
}
605 
// aom_dist_wtd_comp_avg_pred equivalent for blocks having width >= 16: no
// sub-pixel filtering, the source is combined with second_pred by
// dist_wtd_avg_u8x16() using the fwd_offset and bck_offset weights taken from
// jcp_param.
//
// second_pred is read linearly, 16 bytes per vector; output rows use a stride
// of dst_width. The width must be a positive multiple of 16 (asserted).
static void dist_wtd_avg_pred(const uint8_t *src_ptr, uint8_t *dst_ptr,
                              int src_stride, int dst_width, int dst_height,
                              const uint8_t *second_pred,
                              const DIST_WTD_COMP_PARAMS *jcp_param) {
  // We only specialise on the filter values for large block sizes (>= 16x16.)
  assert(dst_width >= 16 && dst_width % 16 == 0);
  const uint8x16_t fwd = vdupq_n_u8(jcp_param->fwd_offset);
  const uint8x16_t bck = vdupq_n_u8(jcp_param->bck_offset);

  int rows_left = dst_height;
  do {
    const uint8_t *in = src_ptr;
    uint8_t *out = dst_ptr;
    int cols_left = dst_width;

    do {
      const uint8x16_t pix = vld1q_u8(in);
      const uint8x16_t pred = vld1q_u8(second_pred);

      vst1q_u8(out, dist_wtd_avg_u8x16(pix, pred, fwd, bck));

      in += 16;
      out += 16;
      second_pred += 16;
      cols_left -= 16;
    } while (cols_left > 0);

    src_ptr += src_stride;
    dst_ptr += dst_width;
    rows_left--;
  } while (rows_left != 0);
}
635 
// Generates aom_sub_pixel_avg_variance<w>x<h>_neon() for the block sizes that
// take the generic two-pass path.
//
// Pass 1 runs var_filter_block2d_bil_w<w> over src with a pixel step of 1
// (the horizontal neighbour) and xoffset as the filter offset. It produces
// h + padding rows so that pass 2 has the extra row(s) it reads below the
// block. Pass 2 runs avg_pred_var_filter_block2d_bil_w<w> over that scratch
// buffer (stride and pixel step both w, filter offset yoffset) together with
// second_pred. The outcome is handed to aom_variance<w>x<h>() against ref,
// with sse passed straight through.
#define SUBPEL_AVG_VARIANCE_WXH_NEON(w, h, padding) \
  unsigned int aom_sub_pixel_avg_variance##w##x##h##_neon( \
      const uint8_t *src, int source_stride, int xoffset, int yoffset, \
      const uint8_t *ref, int ref_stride, uint32_t *sse, \
      const uint8_t *second_pred) { \
    uint8_t horiz[(w) * ((h) + (padding))]; \
    uint8_t blend[(w) * (h)]; \
    var_filter_block2d_bil_w##w(src, horiz, source_stride, 1, \
                                (h) + (padding), xoffset); \
    avg_pred_var_filter_block2d_bil_w##w(horiz, blend, w, w, h, yoffset, \
                                         second_pred); \
    return aom_variance##w##x##h(blend, w, ref, ref_stride, sse); \
  }
649 
// Generates aom_sub_pixel_avg_variance<w>x<h>_neon() for the larger block
// sizes, picking a cheaper kernel when an offset is 0 or 4.
//
// For each direction: offset 0 skips the filter pass, offset 4 uses the
// *_avg kernels, and any other offset uses the bilinear *_bil_w<w> kernels.
// Whichever kernel runs last is the avg_pred* variant, which also receives
// second_pred. The resulting block is handed to aom_variance<w>x<h>() against
// ref. Comments inside the macro body must stay in block-comment form because
// of the line continuations.
#define SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(w, h, padding) \
  unsigned int aom_sub_pixel_avg_variance##w##x##h##_neon( \
      const uint8_t *src, int source_stride, int xoffset, int yoffset, \
      const uint8_t *ref, int ref_stride, unsigned int *sse, \
      const uint8_t *second_pred) { \
    if (xoffset == 0) { \
      /* No horizontal pass: the vertical stage reads src directly. */ \
      uint8_t blend[(w) * (h)]; \
      if (yoffset == 0) { \
        avg_pred(src, blend, source_stride, w, h, second_pred); \
      } else if (yoffset == 4) { \
        avg_pred_var_filter_block2d_avg(src, blend, source_stride, \
                                        source_stride, w, h, second_pred); \
      } else { \
        avg_pred_var_filter_block2d_bil_w##w(src, blend, source_stride, \
                                             source_stride, h, yoffset, \
                                             second_pred); \
      } \
      return aom_variance##w##x##h(blend, w, ref, ref_stride, sse); \
    } \
    \
    uint8_t horiz[(w) * ((h) + (padding))]; \
    if (xoffset == 4) { \
      if (yoffset == 0) { \
        /* Horizontal stage is the last one, so it takes second_pred. */ \
        avg_pred_var_filter_block2d_avg(src, horiz, source_stride, 1, w, h, \
                                        second_pred); \
        return aom_variance##w##x##h(horiz, w, ref, ref_stride, sse); \
      } \
      uint8_t blend[(w) * ((h) + (padding))]; \
      var_filter_block2d_avg(src, horiz, source_stride, 1, w, \
                             (h) + (padding)); \
      if (yoffset == 4) { \
        avg_pred_var_filter_block2d_avg(horiz, blend, w, w, w, h, \
                                        second_pred); \
      } else { \
        avg_pred_var_filter_block2d_bil_w##w(horiz, blend, w, w, h, yoffset, \
                                             second_pred); \
      } \
      return aom_variance##w##x##h(blend, w, ref, ref_stride, sse); \
    } \
    \
    /* Remaining case: general bilinear filter in the horizontal pass. */ \
    if (yoffset == 0) { \
      avg_pred_var_filter_block2d_bil_w##w(src, horiz, source_stride, 1, h, \
                                           xoffset, second_pred); \
      return aom_variance##w##x##h(horiz, w, ref, ref_stride, sse); \
    } \
    uint8_t blend[(w) * (h)]; \
    var_filter_block2d_bil_w##w(src, horiz, source_stride, 1, \
                                (h) + (padding), xoffset); \
    if (yoffset == 4) { \
      avg_pred_var_filter_block2d_avg(horiz, blend, w, w, w, h, second_pred); \
    } else { \
      avg_pred_var_filter_block2d_bil_w##w(horiz, blend, w, w, h, yoffset, \
                                           second_pred); \
    } \
    return aom_variance##w##x##h(blend, w, ref, ref_stride, sse); \
  }
709 
710 SUBPEL_AVG_VARIANCE_WXH_NEON(4, 4, 2)
711 SUBPEL_AVG_VARIANCE_WXH_NEON(4, 8, 2)
712 
713 SUBPEL_AVG_VARIANCE_WXH_NEON(8, 4, 1)
714 SUBPEL_AVG_VARIANCE_WXH_NEON(8, 8, 1)
715 SUBPEL_AVG_VARIANCE_WXH_NEON(8, 16, 1)
716 
717 SUBPEL_AVG_VARIANCE_WXH_NEON(16, 8, 1)
718 SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(16, 16, 1)
719 SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(16, 32, 1)
720 
721 SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(32, 16, 1)
722 SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(32, 32, 1)
723 SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(32, 64, 1)
724 
725 SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(64, 32, 1)
726 SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(64, 64, 1)
727 SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(64, 128, 1)
728 
729 SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(128, 64, 1)
730 SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(128, 128, 1)
731 
732 #if !CONFIG_REALTIME_ONLY
733 
734 SUBPEL_AVG_VARIANCE_WXH_NEON(4, 16, 2)
735 
736 SUBPEL_AVG_VARIANCE_WXH_NEON(8, 32, 1)
737 
738 SUBPEL_AVG_VARIANCE_WXH_NEON(16, 4, 1)
739 SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(16, 64, 1)
740 
741 SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(32, 8, 1)
742 
743 SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(64, 16, 1)
744 
745 #endif  // !CONFIG_REALTIME_ONLY
746 
747 #undef SUBPEL_AVG_VARIANCE_WXH_NEON
748 #undef SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON
749 
// Generates aom_dist_wtd_sub_pixel_avg_variance<w>x<h>_neon() for the block
// sizes that take the generic two-pass path.
//
// Pass 1 runs var_filter_block2d_bil_w<w> over src with a pixel step of 1
// (the horizontal neighbour) and xoffset as the filter offset, producing
// h + padding rows. Pass 2 runs
// dist_wtd_avg_pred_var_filter_block2d_bil_w<w> over that scratch buffer
// (stride and pixel step both w, filter offset yoffset) together with
// second_pred and jcp_param. The outcome is handed to aom_variance<w>x<h>()
// against ref, with sse passed straight through.
#define DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(w, h, padding) \
  unsigned int aom_dist_wtd_sub_pixel_avg_variance##w##x##h##_neon( \
      const uint8_t *src, int source_stride, int xoffset, int yoffset, \
      const uint8_t *ref, int ref_stride, uint32_t *sse, \
      const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS *jcp_param) { \
    uint8_t horiz[(w) * ((h) + (padding))]; \
    uint8_t blend[(w) * (h)]; \
    var_filter_block2d_bil_w##w(src, horiz, source_stride, 1, \
                                (h) + (padding), xoffset); \
    dist_wtd_avg_pred_var_filter_block2d_bil_w##w( \
        horiz, blend, w, w, h, yoffset, second_pred, jcp_param); \
    return aom_variance##w##x##h(blend, w, ref, ref_stride, sse); \
  }
763 
// Generates aom_dist_wtd_sub_pixel_avg_variance<w>x<h>_neon() for the larger
// block sizes, picking a cheaper kernel when an offset is 0 or 4.
//
// For each direction: offset 0 skips the filter pass, offset 4 uses the
// *_avg kernels, and any other offset uses the bilinear *_bil_w<w> kernels.
// Whichever kernel runs last is the dist_wtd_avg_pred* variant, which also
// receives second_pred and jcp_param. The resulting block is handed to
// aom_variance<w>x<h>() against ref. Comments inside the macro body must stay
// in block-comment form because of the line continuations.
#define SPECIALIZED_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(w, h, padding) \
  unsigned int aom_dist_wtd_sub_pixel_avg_variance##w##x##h##_neon( \
      const uint8_t *src, int source_stride, int xoffset, int yoffset, \
      const uint8_t *ref, int ref_stride, unsigned int *sse, \
      const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS *jcp_param) { \
    if (xoffset == 0) { \
      /* No horizontal pass: the vertical stage reads src directly. */ \
      uint8_t blend[(w) * (h)]; \
      if (yoffset == 0) { \
        dist_wtd_avg_pred(src, blend, source_stride, w, h, second_pred, \
                          jcp_param); \
      } else if (yoffset == 4) { \
        dist_wtd_avg_pred_var_filter_block2d_avg( \
            src, blend, source_stride, source_stride, w, h, second_pred, \
            jcp_param); \
      } else { \
        dist_wtd_avg_pred_var_filter_block2d_bil_w##w( \
            src, blend, source_stride, source_stride, h, yoffset, \
            second_pred, jcp_param); \
      } \
      return aom_variance##w##x##h(blend, w, ref, ref_stride, sse); \
    } \
    \
    uint8_t horiz[(w) * ((h) + (padding))]; \
    if (xoffset == 4) { \
      if (yoffset == 0) { \
        /* Horizontal stage is the last one, so it takes second_pred. */ \
        dist_wtd_avg_pred_var_filter_block2d_avg( \
            src, horiz, source_stride, 1, w, h, second_pred, jcp_param); \
        return aom_variance##w##x##h(horiz, w, ref, ref_stride, sse); \
      } \
      uint8_t blend[(w) * ((h) + (padding))]; \
      var_filter_block2d_avg(src, horiz, source_stride, 1, w, \
                             (h) + (padding)); \
      if (yoffset == 4) { \
        dist_wtd_avg_pred_var_filter_block2d_avg( \
            horiz, blend, w, w, w, h, second_pred, jcp_param); \
      } else { \
        dist_wtd_avg_pred_var_filter_block2d_bil_w##w( \
            horiz, blend, w, w, h, yoffset, second_pred, jcp_param); \
      } \
      return aom_variance##w##x##h(blend, w, ref, ref_stride, sse); \
    } \
    \
    /* Remaining case: general bilinear filter in the horizontal pass. */ \
    if (yoffset == 0) { \
      dist_wtd_avg_pred_var_filter_block2d_bil_w##w( \
          src, horiz, source_stride, 1, h, xoffset, second_pred, jcp_param); \
      return aom_variance##w##x##h(horiz, w, ref, ref_stride, sse); \
    } \
    uint8_t blend[(w) * (h)]; \
    var_filter_block2d_bil_w##w(src, horiz, source_stride, 1, \
                                (h) + (padding), xoffset); \
    if (yoffset == 4) { \
      dist_wtd_avg_pred_var_filter_block2d_avg( \
          horiz, blend, w, w, w, h, second_pred, jcp_param); \
    } else { \
      dist_wtd_avg_pred_var_filter_block2d_bil_w##w( \
          horiz, blend, w, w, h, yoffset, second_pred, jcp_param); \
    } \
    return aom_variance##w##x##h(blend, w, ref, ref_stride, sse); \
  }
828 
829 DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(4, 4, 2)
830 DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(4, 8, 2)
831 
832 DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 4, 1)
833 DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 8, 1)
834 DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 16, 1)
835 
836 DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(16, 8, 1)
837 SPECIALIZED_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(16, 16, 1)
838 SPECIALIZED_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(16, 32, 1)
839 
840 SPECIALIZED_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(32, 16, 1)
841 SPECIALIZED_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(32, 32, 1)
842 SPECIALIZED_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(32, 64, 1)
843 
844 SPECIALIZED_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(64, 32, 1)
845 SPECIALIZED_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(64, 64, 1)
846 SPECIALIZED_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(64, 128, 1)
847 
848 SPECIALIZED_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(128, 64, 1)
849 SPECIALIZED_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(128, 128, 1)
850 
851 #if !CONFIG_REALTIME_ONLY
852 
853 DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(4, 16, 2)
854 
855 DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 32, 1)
856 
857 DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(16, 4, 1)
858 SPECIALIZED_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(16, 64, 1)
859 
860 SPECIALIZED_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(32, 8, 1)
861 
862 SPECIALIZED_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(64, 16, 1)
863 
864 #endif  // !CONFIG_REALTIME_ONLY
865 
866 #undef DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON
867 #undef SPECIALIZED_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON
868 
869 #if !CONFIG_REALTIME_ONLY
870 
// Generates aom_obmc_sub_pixel_variance<w>x<h>_neon() for the block sizes
// that take the generic two-pass path.
//
// Pass 1 runs var_filter_block2d_bil_w<w> over pre with a pixel step of 1
// (the horizontal neighbour) and xoffset as the filter offset, producing
// h + padding rows. Pass 2 runs the same kernel over that scratch buffer with
// a pixel step of w (the row below) and yoffset as the filter offset. The
// filtered block is handed to aom_obmc_variance<w>x<h>() together with wsrc,
// mask and sse.
#define OBMC_SUBPEL_VARIANCE_WXH_NEON(w, h, padding) \
  unsigned int aom_obmc_sub_pixel_variance##w##x##h##_neon( \
      const uint8_t *pre, int pre_stride, int xoffset, int yoffset, \
      const int32_t *wsrc, const int32_t *mask, unsigned int *sse) { \
    uint8_t horiz[(w) * ((h) + (padding))]; \
    uint8_t vert[(w) * (h)]; \
    var_filter_block2d_bil_w##w(pre, horiz, pre_stride, 1, (h) + (padding), \
                                xoffset); \
    var_filter_block2d_bil_w##w(horiz, vert, w, w, h, yoffset); \
    return aom_obmc_variance##w##x##h(vert, w, wsrc, mask, sse); \
  }
882 
// Generates aom_obmc_sub_pixel_variance<w>x<h>_neon() for the larger block
// sizes, picking a cheaper kernel when an offset is 0 or 4.
//
// For each direction: offset 0 skips the filter pass, offset 4 uses
// var_filter_block2d_avg, and any other offset uses the bilinear
// var_filter_block2d_bil_w<w> kernel. The filtered block (or pre itself when
// both offsets are 0) is handed to aom_obmc_variance<w>x<h>_neon() together
// with wsrc, mask and sse. Comments inside the macro body must stay in
// block-comment form because of the line continuations.
#define SPECIALIZED_OBMC_SUBPEL_VARIANCE_WXH_NEON(w, h, padding) \
  unsigned int aom_obmc_sub_pixel_variance##w##x##h##_neon( \
      const uint8_t *pre, int pre_stride, int xoffset, int yoffset, \
      const int32_t *wsrc, const int32_t *mask, unsigned int *sse) { \
    if (xoffset == 0) { \
      if (yoffset == 0) { \
        /* Nothing to filter: measure pre in place. */ \
        return aom_obmc_variance##w##x##h##_neon(pre, pre_stride, wsrc, mask, \
                                                 sse); \
      } \
      uint8_t vert[(w) * (h)]; \
      if (yoffset == 4) { \
        var_filter_block2d_avg(pre, vert, pre_stride, pre_stride, w, h); \
      } else { \
        var_filter_block2d_bil_w##w(pre, vert, pre_stride, pre_stride, h, \
                                    yoffset); \
      } \
      return aom_obmc_variance##w##x##h##_neon(vert, w, wsrc, mask, sse); \
    } \
    \
    uint8_t horiz[(w) * ((h) + (padding))]; \
    if (xoffset == 4) { \
      if (yoffset == 0) { \
        var_filter_block2d_avg(pre, horiz, pre_stride, 1, w, h); \
        return aom_obmc_variance##w##x##h##_neon(horiz, w, wsrc, mask, sse); \
      } \
      uint8_t vert[(w) * ((h) + (padding))]; \
      var_filter_block2d_avg(pre, horiz, pre_stride, 1, w, (h) + (padding)); \
      if (yoffset == 4) { \
        var_filter_block2d_avg(horiz, vert, w, w, w, h); \
      } else { \
        var_filter_block2d_bil_w##w(horiz, vert, w, w, h, yoffset); \
      } \
      return aom_obmc_variance##w##x##h##_neon(vert, w, wsrc, mask, sse); \
    } \
    \
    /* Remaining case: general bilinear filter in the horizontal pass. */ \
    if (yoffset == 0) { \
      var_filter_block2d_bil_w##w(pre, horiz, pre_stride, 1, h, xoffset); \
      return aom_obmc_variance##w##x##h##_neon(horiz, w, wsrc, mask, sse); \
    } \
    uint8_t vert[(w) * (h)]; \
    var_filter_block2d_bil_w##w(pre, horiz, pre_stride, 1, (h) + (padding), \
                                xoffset); \
    if (yoffset == 4) { \
      var_filter_block2d_avg(horiz, vert, w, w, w, h); \
    } else { \
      var_filter_block2d_bil_w##w(horiz, vert, w, w, h, yoffset); \
    } \
    return aom_obmc_variance##w##x##h##_neon(vert, w, wsrc, mask, sse); \
  }
937 
938 OBMC_SUBPEL_VARIANCE_WXH_NEON(4, 4, 2)
939 OBMC_SUBPEL_VARIANCE_WXH_NEON(4, 8, 2)
940 OBMC_SUBPEL_VARIANCE_WXH_NEON(4, 16, 2)
941 
942 OBMC_SUBPEL_VARIANCE_WXH_NEON(8, 4, 1)
943 OBMC_SUBPEL_VARIANCE_WXH_NEON(8, 8, 1)
944 OBMC_SUBPEL_VARIANCE_WXH_NEON(8, 16, 1)
945 OBMC_SUBPEL_VARIANCE_WXH_NEON(8, 32, 1)
946 
947 OBMC_SUBPEL_VARIANCE_WXH_NEON(16, 4, 1)
948 OBMC_SUBPEL_VARIANCE_WXH_NEON(16, 8, 1)
949 SPECIALIZED_OBMC_SUBPEL_VARIANCE_WXH_NEON(16, 16, 1)
950 SPECIALIZED_OBMC_SUBPEL_VARIANCE_WXH_NEON(16, 32, 1)
951 SPECIALIZED_OBMC_SUBPEL_VARIANCE_WXH_NEON(16, 64, 1)
952 
953 SPECIALIZED_OBMC_SUBPEL_VARIANCE_WXH_NEON(32, 8, 1)
954 SPECIALIZED_OBMC_SUBPEL_VARIANCE_WXH_NEON(32, 16, 1)
955 SPECIALIZED_OBMC_SUBPEL_VARIANCE_WXH_NEON(32, 32, 1)
956 SPECIALIZED_OBMC_SUBPEL_VARIANCE_WXH_NEON(32, 64, 1)
957 
958 SPECIALIZED_OBMC_SUBPEL_VARIANCE_WXH_NEON(64, 16, 1)
959 SPECIALIZED_OBMC_SUBPEL_VARIANCE_WXH_NEON(64, 32, 1)
960 SPECIALIZED_OBMC_SUBPEL_VARIANCE_WXH_NEON(64, 64, 1)
961 SPECIALIZED_OBMC_SUBPEL_VARIANCE_WXH_NEON(64, 128, 1)
962 
963 SPECIALIZED_OBMC_SUBPEL_VARIANCE_WXH_NEON(128, 64, 1)
964 SPECIALIZED_OBMC_SUBPEL_VARIANCE_WXH_NEON(128, 128, 1)
965 
966 #undef OBMC_SUBPEL_VARIANCE_WXH_NEON
967 #undef SPECIALIZED_OBMC_SUBPEL_VARIANCE_WXH_NEON
968 #endif  // !CONFIG_REALTIME_ONLY
969 
// Generates aom_masked_sub_pixel_variance<w>x<h>_neon() for the block sizes
// that take the generic path.
//
// Pass 1 runs var_filter_block2d_bil_w<w> over src with a pixel step of 1
// (the horizontal neighbour) and xoffset as the filter offset, producing
// h + padding rows. Pass 2 runs the same kernel over that scratch buffer with
// a pixel step of w (the row below) and yoffset as the filter offset. The
// filtered block is then combined with second_pred by
// aom_comp_mask_pred_neon() using msk / msk_stride / invert_mask, and the
// combined block is handed to aom_variance<w>x<h>() against ref.
#define MASKED_SUBPEL_VARIANCE_WXH_NEON(w, h, padding) \
  unsigned int aom_masked_sub_pixel_variance##w##x##h##_neon( \
      const uint8_t *src, int src_stride, int xoffset, int yoffset, \
      const uint8_t *ref, int ref_stride, const uint8_t *second_pred, \
      const uint8_t *msk, int msk_stride, int invert_mask, \
      unsigned int *sse) { \
    uint8_t horiz[(w) * ((h) + (padding))]; \
    uint8_t vert[(w) * (h)]; \
    uint8_t masked[(w) * (h)]; \
    var_filter_block2d_bil_w##w(src, horiz, src_stride, 1, (h) + (padding), \
                                xoffset); \
    var_filter_block2d_bil_w##w(horiz, vert, w, w, h, yoffset); \
    aom_comp_mask_pred_neon(masked, second_pred, w, h, vert, w, msk, \
                            msk_stride, invert_mask); \
    return aom_variance##w##x##h(masked, w, ref, ref_stride, sse); \
  }
986 
// Generates aom_masked_sub_pixel_variance<w>x<h>_neon() for the larger block
// sizes, picking a cheaper kernel when an offset is 0 or 4.
//
// For each direction: offset 0 skips the filter pass, offset 4 uses
// var_filter_block2d_avg, and any other offset uses the bilinear
// var_filter_block2d_bil_w<w> kernel. The filtered block (or src itself when
// both offsets are 0) is combined with second_pred by
// aom_comp_mask_pred_neon() using msk / msk_stride / invert_mask, and the
// combined block is handed to aom_variance<w>x<h>() against ref. Comments
// inside the macro body must stay in block-comment form because of the line
// continuations.
#define SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(w, h, padding) \
  unsigned int aom_masked_sub_pixel_variance##w##x##h##_neon( \
      const uint8_t *src, int src_stride, int xoffset, int yoffset, \
      const uint8_t *ref, int ref_stride, const uint8_t *second_pred, \
      const uint8_t *msk, int msk_stride, int invert_mask, \
      unsigned int *sse) { \
    /* Output of the mask blend; every path ends by measuring this. */ \
    uint8_t masked[(w) * (h)]; \
    \
    if (xoffset == 0) { \
      if (yoffset == 0) { \
        /* Nothing to filter: blend src directly. */ \
        aom_comp_mask_pred_neon(masked, second_pred, w, h, src, src_stride, \
                                msk, msk_stride, invert_mask); \
        return aom_variance##w##x##h(masked, w, ref, ref_stride, sse); \
      } \
      uint8_t vert[(w) * (h)]; \
      if (yoffset == 4) { \
        var_filter_block2d_avg(src, vert, src_stride, src_stride, w, h); \
      } else { \
        var_filter_block2d_bil_w##w(src, vert, src_stride, src_stride, h, \
                                    yoffset); \
      } \
      aom_comp_mask_pred_neon(masked, second_pred, w, h, vert, w, msk, \
                              msk_stride, invert_mask); \
      return aom_variance##w##x##h(masked, w, ref, ref_stride, sse); \
    } \
    \
    if (xoffset == 4) { \
      uint8_t horiz[(w) * ((h) + (padding))]; \
      if (yoffset == 0) { \
        var_filter_block2d_avg(src, horiz, src_stride, 1, w, h); \
        aom_comp_mask_pred_neon(masked, second_pred, w, h, horiz, w, msk, \
                                msk_stride, invert_mask); \
        return aom_variance##w##x##h(masked, w, ref, ref_stride, sse); \
      } \
      uint8_t vert[(w) * (h)]; \
      var_filter_block2d_avg(src, horiz, src_stride, 1, w, (h) + (padding)); \
      if (yoffset == 4) { \
        var_filter_block2d_avg(horiz, vert, w, w, w, h); \
      } else { \
        var_filter_block2d_bil_w##w(horiz, vert, w, w, h, yoffset); \
      } \
      aom_comp_mask_pred_neon(masked, second_pred, w, h, vert, w, msk, \
                              msk_stride, invert_mask); \
      return aom_variance##w##x##h(masked, w, ref, ref_stride, sse); \
    } \
    \
    /* Remaining case: general bilinear filter in the horizontal pass. */ \
    if (yoffset == 0) { \
      uint8_t horiz[(w) * (h)]; \
      var_filter_block2d_bil_w##w(src, horiz, src_stride, 1, h, xoffset); \
      aom_comp_mask_pred_neon(masked, second_pred, w, h, horiz, w, msk, \
                              msk_stride, invert_mask); \
      return aom_variance##w##x##h(masked, w, ref, ref_stride, sse); \
    } \
    uint8_t horiz[(w) * ((h) + (padding))]; \
    var_filter_block2d_bil_w##w(src, horiz, src_stride, 1, (h) + (padding), \
                                xoffset); \
    if (yoffset == 4) { \
      uint8_t vert[(w) * (h)]; \
      var_filter_block2d_avg(horiz, vert, w, w, w, h); \
      aom_comp_mask_pred_neon(masked, second_pred, w, h, vert, w, msk, \
                              msk_stride, invert_mask); \
    } else { \
      uint8_t vert[(w) * ((h) + (padding))]; \
      var_filter_block2d_bil_w##w(horiz, vert, w, w, h, yoffset); \
      aom_comp_mask_pred_neon(masked, second_pred, w, h, vert, w, msk, \
                              msk_stride, invert_mask); \
    } \
    return aom_variance##w##x##h(masked, w, ref, ref_stride, sse); \
  }
1069 
1070 MASKED_SUBPEL_VARIANCE_WXH_NEON(4, 4, 2)
1071 MASKED_SUBPEL_VARIANCE_WXH_NEON(4, 8, 2)
1072 
1073 MASKED_SUBPEL_VARIANCE_WXH_NEON(8, 4, 1)
1074 MASKED_SUBPEL_VARIANCE_WXH_NEON(8, 8, 1)
1075 MASKED_SUBPEL_VARIANCE_WXH_NEON(8, 16, 1)
1076 
1077 MASKED_SUBPEL_VARIANCE_WXH_NEON(16, 8, 1)
1078 SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(16, 16, 1)
1079 SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(16, 32, 1)
1080 
1081 SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(32, 16, 1)
1082 SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(32, 32, 1)
1083 SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(32, 64, 1)
1084 
1085 SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(64, 32, 1)
1086 SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(64, 64, 1)
1087 SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(64, 128, 1)
1088 
1089 SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(128, 64, 1)
1090 SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(128, 128, 1)
1091 
1092 // Realtime mode doesn't use 4x rectangular blocks.
1093 #if !CONFIG_REALTIME_ONLY
1094 MASKED_SUBPEL_VARIANCE_WXH_NEON(4, 16, 2)
1095 MASKED_SUBPEL_VARIANCE_WXH_NEON(8, 32, 1)
1096 MASKED_SUBPEL_VARIANCE_WXH_NEON(16, 4, 1)
1097 SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(16, 64, 1)
1098 SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(32, 8, 1)
1099 SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(64, 16, 1)
1100 #endif  // !CONFIG_REALTIME_ONLY
1101 
1102 #undef MASKED_SUBPEL_VARIANCE_WXH_NEON
1103 #undef SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON
1104