1 /*
2 * Copyright (c) 2016, Alliance for Open Media. All rights reserved.
3 *
4 * This source code is subject to the terms of the BSD 2 Clause License and
5 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
6 * was not distributed with this source code in the LICENSE file, you can
7 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
8 * Media Patent License 1.0 was not distributed with this source code in the
9 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
10 */
11
#include <arm_neon.h>
#include <assert.h>

#include "config/aom_config.h"
#include "config/aom_dsp_rtcd.h"

#include "aom/aom_integer.h"
#include "aom_ports/mem.h"

#include "aom_dsp/variance.h"
#include "aom_dsp/arm/dist_wtd_avg_neon.h"
#include "aom_dsp/arm/mem_neon.h"
23
// Apply a 2-tap bilinear filter to a 4-pixel-wide block. The taps sum to 8,
// so the accumulated result is narrowed back to 8 bits with a rounding >> 3.
static void var_filter_block2d_bil_w4(const uint8_t *src_ptr, uint8_t *dst_ptr,
                                      int src_stride, int pixel_step,
                                      int dst_height, int filter_offset) {
  const uint8x8_t filt0 = vdup_n_u8(8 - filter_offset);
  const uint8x8_t filt1 = vdup_n_u8(filter_offset);

  // Width-4 blocks: two rows are packed into one 8-byte vector per iteration.
  int rows = dst_height;
  do {
    uint8x8_t a = load_unaligned_u8(src_ptr, src_stride);
    uint8x8_t b = load_unaligned_u8(src_ptr + pixel_step, src_stride);
    uint16x8_t sum = vmlal_u8(vmull_u8(a, filt0), b, filt1);
    vst1_u8(dst_ptr, vrshrn_n_u16(sum, 3));

    src_ptr += 2 * src_stride;
    dst_ptr += 2 * 4;
    rows -= 2;
  } while (rows != 0);
}
44
// Apply a 2-tap bilinear filter to an 8-pixel-wide block, one row per
// iteration. Taps are (8 - offset, offset); rounding >> 3 restores 8 bits.
static void var_filter_block2d_bil_w8(const uint8_t *src_ptr, uint8_t *dst_ptr,
                                      int src_stride, int pixel_step,
                                      int dst_height, int filter_offset) {
  const uint8x8_t filt0 = vdup_n_u8(8 - filter_offset);
  const uint8x8_t filt1 = vdup_n_u8(filter_offset);

  int rows = dst_height;
  do {
    uint8x8_t a = vld1_u8(src_ptr);
    uint8x8_t b = vld1_u8(src_ptr + pixel_step);
    uint16x8_t sum = vmlal_u8(vmull_u8(a, filt0), b, filt1);
    vst1_u8(dst_ptr, vrshrn_n_u16(sum, 3));

    src_ptr += src_stride;
    dst_ptr += 8;
  } while (--rows != 0);
}
64
// Apply a 2-tap bilinear filter to a block whose width is a multiple of 16,
// processing 16 pixels per inner-loop iteration.
static void var_filter_block2d_bil_large(const uint8_t *src_ptr,
                                         uint8_t *dst_ptr, int src_stride,
                                         int pixel_step, int dst_width,
                                         int dst_height, int filter_offset) {
  const uint8x8_t filt0 = vdup_n_u8(8 - filter_offset);
  const uint8x8_t filt1 = vdup_n_u8(filter_offset);

  int rows = dst_height;
  do {
    int col = 0;
    do {
      uint8x16_t a = vld1q_u8(src_ptr + col);
      uint8x16_t b = vld1q_u8(src_ptr + col + pixel_step);
      // Widen to 16 bits per half while blending, then narrow with rounding.
      uint16x8_t sum_lo =
          vmlal_u8(vmull_u8(vget_low_u8(a), filt0), vget_low_u8(b), filt1);
      uint16x8_t sum_hi =
          vmlal_u8(vmull_u8(vget_high_u8(a), filt0), vget_high_u8(b), filt1);
      uint8x16_t out =
          vcombine_u8(vrshrn_n_u16(sum_lo, 3), vrshrn_n_u16(sum_hi, 3));
      vst1q_u8(dst_ptr + col, out);

      col += 16;
    } while (col < dst_width);

    src_ptr += src_stride;
    dst_ptr += dst_width;
  } while (--rows != 0);
}
93
// Bilinear filter for 16-pixel-wide blocks: fixes dst_width = 16 for the
// generic large-block kernel.
static void var_filter_block2d_bil_w16(const uint8_t *src_ptr, uint8_t *dst_ptr,
                                       int src_stride, int pixel_step,
                                       int dst_height, int filter_offset) {
  var_filter_block2d_bil_large(src_ptr, dst_ptr, src_stride, pixel_step, 16,
                               dst_height, filter_offset);
}
100
// Bilinear filter for 32-pixel-wide blocks: fixes dst_width = 32 for the
// generic large-block kernel.
static void var_filter_block2d_bil_w32(const uint8_t *src_ptr, uint8_t *dst_ptr,
                                       int src_stride, int pixel_step,
                                       int dst_height, int filter_offset) {
  var_filter_block2d_bil_large(src_ptr, dst_ptr, src_stride, pixel_step, 32,
                               dst_height, filter_offset);
}
107
// Bilinear filter for 64-pixel-wide blocks: fixes dst_width = 64 for the
// generic large-block kernel.
static void var_filter_block2d_bil_w64(const uint8_t *src_ptr, uint8_t *dst_ptr,
                                       int src_stride, int pixel_step,
                                       int dst_height, int filter_offset) {
  var_filter_block2d_bil_large(src_ptr, dst_ptr, src_stride, pixel_step, 64,
                               dst_height, filter_offset);
}
114
// Bilinear filter for 128-pixel-wide blocks: fixes dst_width = 128 for the
// generic large-block kernel.
static void var_filter_block2d_bil_w128(const uint8_t *src_ptr,
                                        uint8_t *dst_ptr, int src_stride,
                                        int pixel_step, int dst_height,
                                        int filter_offset) {
  var_filter_block2d_bil_large(src_ptr, dst_ptr, src_stride, pixel_step, 128,
                               dst_height, filter_offset);
}
122
// Half-pel (offset == 4) special case: the bilinear taps are (4, 4), which
// reduces to a rounding average of the two source pixels.
static void var_filter_block2d_avg(const uint8_t *src_ptr, uint8_t *dst_ptr,
                                   int src_stride, int pixel_step,
                                   int dst_width, int dst_height) {
  // We only specialise on the filter values for large block sizes (>= 16x16.)
  assert(dst_width >= 16 && dst_width % 16 == 0);

  int rows = dst_height;
  do {
    int col = 0;
    do {
      uint8x16_t a = vld1q_u8(src_ptr + col);
      uint8x16_t b = vld1q_u8(src_ptr + col + pixel_step);
      vst1q_u8(dst_ptr + col, vrhaddq_u8(a, b));

      col += 16;
    } while (col < dst_width);

    src_ptr += src_stride;
    dst_ptr += dst_width;
  } while (--rows != 0);
}
145
// General sub-pixel variance: filter horizontally into tmp0 (producing
// `padding` extra rows so the vertical pass can read one pixel_step below),
// filter vertically into tmp1, then run the full-pel variance kernel.
#define SUBPEL_VARIANCE_WXH_NEON(w, h, padding)                          \
  unsigned int aom_sub_pixel_variance##w##x##h##_neon(                   \
      const uint8_t *src, int src_stride, int xoffset, int yoffset,      \
      const uint8_t *ref, int ref_stride, uint32_t *sse) {               \
    uint8_t tmp0[w * (h + padding)];                                     \
    uint8_t tmp1[w * h];                                                 \
    var_filter_block2d_bil_w##w(src, tmp0, src_stride, 1, (h + padding), \
                                xoffset);                                \
    var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset);           \
    return aom_variance##w##x##h(tmp1, w, ref, ref_stride, sse);         \
  }
157
// Specialised sub-pixel variance for large blocks: offsets 0 (full-pel) and
// 4 (half-pel) are dispatched to cheaper kernels (direct variance / rounding
// average) instead of the general bilinear filter.
#define SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(w, h, padding)                  \
  unsigned int aom_sub_pixel_variance##w##x##h##_neon(                       \
      const uint8_t *src, int src_stride, int xoffset, int yoffset,          \
      const uint8_t *ref, int ref_stride, unsigned int *sse) {               \
    if (xoffset == 0) {                                                      \
      if (yoffset == 0) {                                                    \
        return aom_variance##w##x##h(src, src_stride, ref, ref_stride, sse); \
      } else if (yoffset == 4) {                                             \
        uint8_t tmp[w * h];                                                  \
        var_filter_block2d_avg(src, tmp, src_stride, src_stride, w, h);      \
        return aom_variance##w##x##h(tmp, w, ref, ref_stride, sse);          \
      } else {                                                               \
        uint8_t tmp[w * h];                                                  \
        var_filter_block2d_bil_w##w(src, tmp, src_stride, src_stride, h,     \
                                    yoffset);                                \
        return aom_variance##w##x##h(tmp, w, ref, ref_stride, sse);          \
      }                                                                      \
    } else if (xoffset == 4) {                                               \
      uint8_t tmp0[w * (h + padding)];                                       \
      if (yoffset == 0) {                                                    \
        var_filter_block2d_avg(src, tmp0, src_stride, 1, w, h);              \
        return aom_variance##w##x##h(tmp0, w, ref, ref_stride, sse);         \
      } else if (yoffset == 4) {                                             \
        uint8_t tmp1[w * (h + padding)];                                     \
        var_filter_block2d_avg(src, tmp0, src_stride, 1, w, (h + padding));  \
        var_filter_block2d_avg(tmp0, tmp1, w, w, w, h);                      \
        return aom_variance##w##x##h(tmp1, w, ref, ref_stride, sse);         \
      } else {                                                               \
        uint8_t tmp1[w * (h + padding)];                                     \
        var_filter_block2d_avg(src, tmp0, src_stride, 1, w, (h + padding));  \
        var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset);           \
        return aom_variance##w##x##h(tmp1, w, ref, ref_stride, sse);         \
      }                                                                      \
    } else {                                                                 \
      uint8_t tmp0[w * (h + padding)];                                       \
      if (yoffset == 0) {                                                    \
        var_filter_block2d_bil_w##w(src, tmp0, src_stride, 1, h, xoffset);   \
        return aom_variance##w##x##h(tmp0, w, ref, ref_stride, sse);         \
      } else if (yoffset == 4) {                                             \
        uint8_t tmp1[w * h];                                                 \
        var_filter_block2d_bil_w##w(src, tmp0, src_stride, 1, (h + padding), \
                                    xoffset);                                \
        var_filter_block2d_avg(tmp0, tmp1, w, w, w, h);                      \
        return aom_variance##w##x##h(tmp1, w, ref, ref_stride, sse);         \
      } else {                                                               \
        uint8_t tmp1[w * h];                                                 \
        var_filter_block2d_bil_w##w(src, tmp0, src_stride, 1, (h + padding), \
                                    xoffset);                                \
        var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset);           \
        return aom_variance##w##x##h(tmp1, w, ref, ref_stride, sse);         \
      }                                                                      \
    }                                                                        \
  }
211
// Instantiate the sub-pixel variance kernels for every supported block size.
// Width-4 blocks need 2 rows of padding (two rows per vector); wider blocks
// need 1.
SUBPEL_VARIANCE_WXH_NEON(4, 4, 2)
SUBPEL_VARIANCE_WXH_NEON(4, 8, 2)

SUBPEL_VARIANCE_WXH_NEON(8, 4, 1)
SUBPEL_VARIANCE_WXH_NEON(8, 8, 1)
SUBPEL_VARIANCE_WXH_NEON(8, 16, 1)

SUBPEL_VARIANCE_WXH_NEON(16, 8, 1)
SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(16, 16, 1)
SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(16, 32, 1)

SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(32, 16, 1)
SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(32, 32, 1)
SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(32, 64, 1)

SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(64, 32, 1)
SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(64, 64, 1)
SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(64, 128, 1)

SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(128, 64, 1)
SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(128, 128, 1)

// Realtime mode doesn't use 4x rectangular blocks.
#if !CONFIG_REALTIME_ONLY

SUBPEL_VARIANCE_WXH_NEON(4, 16, 2)

SUBPEL_VARIANCE_WXH_NEON(8, 32, 1)

SUBPEL_VARIANCE_WXH_NEON(16, 4, 1)
SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(16, 64, 1)

SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(32, 8, 1)

SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(64, 16, 1)

#endif  // !CONFIG_REALTIME_ONLY

#undef SUBPEL_VARIANCE_WXH_NEON
#undef SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON
252
// Combine bilinear filter with aom_comp_avg_pred for blocks having width 4.
static void avg_pred_var_filter_block2d_bil_w4(const uint8_t *src_ptr,
                                               uint8_t *dst_ptr, int src_stride,
                                               int pixel_step, int dst_height,
                                               int filter_offset,
                                               const uint8_t *second_pred) {
  const uint8x8_t filt0 = vdup_n_u8(8 - filter_offset);
  const uint8x8_t filt1 = vdup_n_u8(filter_offset);

  // Two 4-pixel rows are processed per iteration.
  int rows = dst_height;
  do {
    uint8x8_t a = load_unaligned_u8(src_ptr, src_stride);
    uint8x8_t b = load_unaligned_u8(src_ptr + pixel_step, src_stride);
    uint16x8_t sum = vmlal_u8(vmull_u8(a, filt0), b, filt1);
    uint8x8_t filtered = vrshrn_n_u16(sum, 3);

    // Rounding average of the filtered pixels with the compound prediction.
    uint8x8_t pred = vld1_u8(second_pred);
    vst1_u8(dst_ptr, vrhadd_u8(filtered, pred));

    src_ptr += 2 * src_stride;
    dst_ptr += 2 * 4;
    second_pred += 2 * 4;
    rows -= 2;
  } while (rows != 0);
}
281
// Combine bilinear filter with aom_dist_wtd_comp_avg_pred for blocks having
// width 4.
static void dist_wtd_avg_pred_var_filter_block2d_bil_w4(
    const uint8_t *src_ptr, uint8_t *dst_ptr, int src_stride, int pixel_step,
    int dst_height, int filter_offset, const uint8_t *second_pred,
    const DIST_WTD_COMP_PARAMS *jcp_param) {
  // Distance weights for the compound average.
  const uint8x8_t fwd_offset = vdup_n_u8(jcp_param->fwd_offset);
  const uint8x8_t bck_offset = vdup_n_u8(jcp_param->bck_offset);
  const uint8x8_t filt0 = vdup_n_u8(8 - filter_offset);
  const uint8x8_t filt1 = vdup_n_u8(filter_offset);

  // Two 4-pixel rows are processed per iteration.
  int rows = dst_height;
  do {
    uint8x8_t a = load_unaligned_u8(src_ptr, src_stride);
    uint8x8_t b = load_unaligned_u8(src_ptr + pixel_step, src_stride);
    uint8x8_t pred = vld1_u8(second_pred);
    uint16x8_t sum = vmlal_u8(vmull_u8(a, filt0), b, filt1);
    uint8x8_t filtered = vrshrn_n_u16(sum, 3);
    vst1_u8(dst_ptr,
            dist_wtd_avg_u8x8(filtered, pred, fwd_offset, bck_offset));

    src_ptr += 2 * src_stride;
    dst_ptr += 2 * 4;
    second_pred += 2 * 4;
    rows -= 2;
  } while (rows != 0);
}
311
// Combine bilinear filter with aom_comp_avg_pred for blocks having width 8.
static void avg_pred_var_filter_block2d_bil_w8(const uint8_t *src_ptr,
                                               uint8_t *dst_ptr, int src_stride,
                                               int pixel_step, int dst_height,
                                               int filter_offset,
                                               const uint8_t *second_pred) {
  const uint8x8_t filt0 = vdup_n_u8(8 - filter_offset);
  const uint8x8_t filt1 = vdup_n_u8(filter_offset);

  int rows = dst_height;
  do {
    uint8x8_t a = vld1_u8(src_ptr);
    uint8x8_t b = vld1_u8(src_ptr + pixel_step);
    uint16x8_t sum = vmlal_u8(vmull_u8(a, filt0), b, filt1);
    uint8x8_t filtered = vrshrn_n_u16(sum, 3);

    // Rounding average of the filtered pixels with the compound prediction.
    uint8x8_t pred = vld1_u8(second_pred);
    vst1_u8(dst_ptr, vrhadd_u8(filtered, pred));

    src_ptr += src_stride;
    dst_ptr += 8;
    second_pred += 8;
  } while (--rows != 0);
}
339
// Combine bilinear filter with aom_dist_wtd_comp_avg_pred for blocks having
// width 8.
static void dist_wtd_avg_pred_var_filter_block2d_bil_w8(
    const uint8_t *src_ptr, uint8_t *dst_ptr, int src_stride, int pixel_step,
    int dst_height, int filter_offset, const uint8_t *second_pred,
    const DIST_WTD_COMP_PARAMS *jcp_param) {
  // Distance weights for the compound average.
  const uint8x8_t fwd_offset = vdup_n_u8(jcp_param->fwd_offset);
  const uint8x8_t bck_offset = vdup_n_u8(jcp_param->bck_offset);
  const uint8x8_t filt0 = vdup_n_u8(8 - filter_offset);
  const uint8x8_t filt1 = vdup_n_u8(filter_offset);

  int rows = dst_height;
  do {
    uint8x8_t a = vld1_u8(src_ptr);
    uint8x8_t b = vld1_u8(src_ptr + pixel_step);
    uint8x8_t pred = vld1_u8(second_pred);
    uint16x8_t sum = vmlal_u8(vmull_u8(a, filt0), b, filt1);
    uint8x8_t filtered = vrshrn_n_u16(sum, 3);
    vst1_u8(dst_ptr,
            dist_wtd_avg_u8x8(filtered, pred, fwd_offset, bck_offset));

    src_ptr += src_stride;
    dst_ptr += 8;
    second_pred += 8;
  } while (--rows != 0);
}
368
// Combine bilinear filter with aom_comp_avg_pred for large blocks.
// dst_width must be a multiple of 16 (16 pixels per inner iteration).
static void avg_pred_var_filter_block2d_bil_large(
    const uint8_t *src_ptr, uint8_t *dst_ptr, int src_stride, int pixel_step,
    int dst_width, int dst_height, int filter_offset,
    const uint8_t *second_pred) {
  // Bilinear taps (8 - offset, offset); results are narrowed with a
  // rounding >> 3.
  const uint8x8_t f0 = vdup_n_u8(8 - filter_offset);
  const uint8x8_t f1 = vdup_n_u8(filter_offset);

  int i = dst_height;
  do {
    int j = 0;
    do {
      uint8x16_t s0 = vld1q_u8(src_ptr + j);
      uint8x16_t s1 = vld1q_u8(src_ptr + j + pixel_step);
      // Blend each 8-pixel half in 16-bit precision.
      uint16x8_t blend_l = vmull_u8(vget_low_u8(s0), f0);
      blend_l = vmlal_u8(blend_l, vget_low_u8(s1), f1);
      uint16x8_t blend_h = vmull_u8(vget_high_u8(s0), f0);
      blend_h = vmlal_u8(blend_h, vget_high_u8(s1), f1);
      uint8x16_t blend_u8 =
          vcombine_u8(vrshrn_n_u16(blend_l, 3), vrshrn_n_u16(blend_h, 3));

      // Rounding average with the compound prediction.
      uint8x16_t p = vld1q_u8(second_pred);
      uint8x16_t avg = vrhaddq_u8(blend_u8, p);

      vst1q_u8(dst_ptr + j, avg);

      j += 16;
      // second_pred is contiguous (stride == dst_width), so it advances in
      // the inner loop.
      second_pred += 16;
    } while (j < dst_width);

    src_ptr += src_stride;
    dst_ptr += dst_width;
  } while (--i != 0);
}
403
// Combine bilinear filter with aom_dist_wtd_comp_avg_pred for large blocks.
// dst_width must be a multiple of 16 (16 pixels per inner iteration).
static void dist_wtd_avg_pred_var_filter_block2d_bil_large(
    const uint8_t *src_ptr, uint8_t *dst_ptr, int src_stride, int pixel_step,
    int dst_width, int dst_height, int filter_offset,
    const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS *jcp_param) {
  // Distance weights consumed by dist_wtd_avg_u8x16.
  const uint8x16_t fwd_offset = vdupq_n_u8(jcp_param->fwd_offset);
  const uint8x16_t bck_offset = vdupq_n_u8(jcp_param->bck_offset);
  const uint8x8_t f0 = vdup_n_u8(8 - filter_offset);
  const uint8x8_t f1 = vdup_n_u8(filter_offset);

  int i = dst_height;
  do {
    int j = 0;
    do {
      uint8x16_t s0 = vld1q_u8(src_ptr + j);
      uint8x16_t s1 = vld1q_u8(src_ptr + j + pixel_step);
      // Blend each 8-pixel half in 16-bit precision, then narrow with
      // rounding.
      uint16x8_t blend_l = vmull_u8(vget_low_u8(s0), f0);
      blend_l = vmlal_u8(blend_l, vget_low_u8(s1), f1);
      uint16x8_t blend_h = vmull_u8(vget_high_u8(s0), f0);
      blend_h = vmlal_u8(blend_h, vget_high_u8(s1), f1);
      uint8x16_t blend_u8 =
          vcombine_u8(vrshrn_n_u16(blend_l, 3), vrshrn_n_u16(blend_h, 3));

      // Distance-weighted average with the compound prediction.
      uint8x16_t p = vld1q_u8(second_pred);
      uint8x16_t avg = dist_wtd_avg_u8x16(blend_u8, p, fwd_offset, bck_offset);

      vst1q_u8(dst_ptr + j, avg);

      j += 16;
      // second_pred is contiguous (stride == dst_width).
      second_pred += 16;
    } while (j < dst_width);

    src_ptr += src_stride;
    dst_ptr += dst_width;
  } while (--i != 0);
}
440
// Combine bilinear filter with aom_comp_avg_pred for blocks having width 16.
// Thin wrapper fixing dst_width = 16 for the generic large-block kernel.
static void avg_pred_var_filter_block2d_bil_w16(
    const uint8_t *src_ptr, uint8_t *dst_ptr, int src_stride, int pixel_step,
    int dst_height, int filter_offset, const uint8_t *second_pred) {
  avg_pred_var_filter_block2d_bil_large(src_ptr, dst_ptr, src_stride,
                                        pixel_step, 16, dst_height,
                                        filter_offset, second_pred);
}
449
// Combine bilinear filter with aom_comp_avg_pred for blocks having width 32.
// Thin wrapper fixing dst_width = 32 for the generic large-block kernel.
static void avg_pred_var_filter_block2d_bil_w32(
    const uint8_t *src_ptr, uint8_t *dst_ptr, int src_stride, int pixel_step,
    int dst_height, int filter_offset, const uint8_t *second_pred) {
  avg_pred_var_filter_block2d_bil_large(src_ptr, dst_ptr, src_stride,
                                        pixel_step, 32, dst_height,
                                        filter_offset, second_pred);
}
458
// Combine bilinear filter with aom_comp_avg_pred for blocks having width 64.
// Thin wrapper fixing dst_width = 64 for the generic large-block kernel.
static void avg_pred_var_filter_block2d_bil_w64(
    const uint8_t *src_ptr, uint8_t *dst_ptr, int src_stride, int pixel_step,
    int dst_height, int filter_offset, const uint8_t *second_pred) {
  avg_pred_var_filter_block2d_bil_large(src_ptr, dst_ptr, src_stride,
                                        pixel_step, 64, dst_height,
                                        filter_offset, second_pred);
}
467
// Combine bilinear filter with aom_comp_avg_pred for blocks having width 128.
// Thin wrapper fixing dst_width = 128 for the generic large-block kernel.
static void avg_pred_var_filter_block2d_bil_w128(
    const uint8_t *src_ptr, uint8_t *dst_ptr, int src_stride, int pixel_step,
    int dst_height, int filter_offset, const uint8_t *second_pred) {
  avg_pred_var_filter_block2d_bil_large(src_ptr, dst_ptr, src_stride,
                                        pixel_step, 128, dst_height,
                                        filter_offset, second_pred);
}
476
// Combine bilinear filter with aom_dist_wtd_comp_avg_pred for blocks having
// width 16. Thin wrapper fixing dst_width = 16 for the large-block kernel.
static void dist_wtd_avg_pred_var_filter_block2d_bil_w16(
    const uint8_t *src_ptr, uint8_t *dst_ptr, int src_stride, int pixel_step,
    int dst_height, int filter_offset, const uint8_t *second_pred,
    const DIST_WTD_COMP_PARAMS *jcp_param) {
  dist_wtd_avg_pred_var_filter_block2d_bil_large(
      src_ptr, dst_ptr, src_stride, pixel_step, 16, dst_height, filter_offset,
      second_pred, jcp_param);
}
486
// Combine bilinear filter with aom_dist_wtd_comp_avg_pred for blocks having
// width 32. Thin wrapper fixing dst_width = 32 for the large-block kernel.
static void dist_wtd_avg_pred_var_filter_block2d_bil_w32(
    const uint8_t *src_ptr, uint8_t *dst_ptr, int src_stride, int pixel_step,
    int dst_height, int filter_offset, const uint8_t *second_pred,
    const DIST_WTD_COMP_PARAMS *jcp_param) {
  dist_wtd_avg_pred_var_filter_block2d_bil_large(
      src_ptr, dst_ptr, src_stride, pixel_step, 32, dst_height, filter_offset,
      second_pred, jcp_param);
}
496
// Combine bilinear filter with aom_dist_wtd_comp_avg_pred for blocks having
// width 64. Thin wrapper fixing dst_width = 64 for the large-block kernel.
static void dist_wtd_avg_pred_var_filter_block2d_bil_w64(
    const uint8_t *src_ptr, uint8_t *dst_ptr, int src_stride, int pixel_step,
    int dst_height, int filter_offset, const uint8_t *second_pred,
    const DIST_WTD_COMP_PARAMS *jcp_param) {
  dist_wtd_avg_pred_var_filter_block2d_bil_large(
      src_ptr, dst_ptr, src_stride, pixel_step, 64, dst_height, filter_offset,
      second_pred, jcp_param);
}
506
// Combine bilinear filter with aom_dist_wtd_comp_avg_pred for blocks having
// width 128. Thin wrapper fixing dst_width = 128 for the large-block kernel.
static void dist_wtd_avg_pred_var_filter_block2d_bil_w128(
    const uint8_t *src_ptr, uint8_t *dst_ptr, int src_stride, int pixel_step,
    int dst_height, int filter_offset, const uint8_t *second_pred,
    const DIST_WTD_COMP_PARAMS *jcp_param) {
  dist_wtd_avg_pred_var_filter_block2d_bil_large(
      src_ptr, dst_ptr, src_stride, pixel_step, 128, dst_height, filter_offset,
      second_pred, jcp_param);
}
516
// Combine averaging subpel filter with aom_comp_avg_pred.
static void avg_pred_var_filter_block2d_avg(const uint8_t *src_ptr,
                                            uint8_t *dst_ptr, int src_stride,
                                            int pixel_step, int dst_width,
                                            int dst_height,
                                            const uint8_t *second_pred) {
  // We only specialise on the filter values for large block sizes (>= 16x16.)
  assert(dst_width >= 16 && dst_width % 16 == 0);

  int rows = dst_height;
  do {
    int col = 0;
    do {
      // Half-pel filter reduces to a rounding average; then average again
      // with the compound prediction.
      uint8x16_t a = vld1q_u8(src_ptr + col);
      uint8x16_t b = vld1q_u8(src_ptr + col + pixel_step);
      uint8x16_t filtered = vrhaddq_u8(a, b);

      uint8x16_t pred = vld1q_u8(second_pred);
      vst1q_u8(dst_ptr + col, vrhaddq_u8(filtered, pred));

      col += 16;
      second_pred += 16;
    } while (col < dst_width);

    src_ptr += src_stride;
    dst_ptr += dst_width;
  } while (--rows != 0);
}
547
// Combine averaging subpel filter with aom_dist_wtd_comp_avg_pred.
static void dist_wtd_avg_pred_var_filter_block2d_avg(
    const uint8_t *src_ptr, uint8_t *dst_ptr, int src_stride, int pixel_step,
    int dst_width, int dst_height, const uint8_t *second_pred,
    const DIST_WTD_COMP_PARAMS *jcp_param) {
  // We only specialise on the filter values for large block sizes (>= 16x16.)
  assert(dst_width >= 16 && dst_width % 16 == 0);
  // Distance weights consumed by dist_wtd_avg_u8x16.
  const uint8x16_t fwd_offset = vdupq_n_u8(jcp_param->fwd_offset);
  const uint8x16_t bck_offset = vdupq_n_u8(jcp_param->bck_offset);

  int i = dst_height;
  do {
    int j = 0;
    do {
      uint8x16_t s0 = vld1q_u8(src_ptr + j);
      uint8x16_t s1 = vld1q_u8(src_ptr + j + pixel_step);
      uint8x16_t p = vld1q_u8(second_pred);
      // Half-pel filter is a rounding average, followed by the
      // distance-weighted compound average.
      uint8x16_t avg = vrhaddq_u8(s0, s1);
      avg = dist_wtd_avg_u8x16(avg, p, fwd_offset, bck_offset);

      vst1q_u8(dst_ptr + j, avg);

      j += 16;
      // second_pred is contiguous (stride == dst_width).
      second_pred += 16;
    } while (j < dst_width);

    src_ptr += src_stride;
    dst_ptr += dst_width;
  } while (--i != 0);
}
578
// Implementation of aom_comp_avg_pred for blocks having width >= 16.
static void avg_pred(const uint8_t *src_ptr, uint8_t *dst_ptr, int src_stride,
                     int dst_width, int dst_height,
                     const uint8_t *second_pred) {
  // We only specialise on the filter values for large block sizes (>= 16x16.)
  assert(dst_width >= 16 && dst_width % 16 == 0);

  int rows = dst_height;
  do {
    int col = 0;
    do {
      // Rounding average of source and compound prediction.
      uint8x16_t s = vld1q_u8(src_ptr + col);
      uint8x16_t pred = vld1q_u8(second_pred);
      vst1q_u8(dst_ptr + col, vrhaddq_u8(s, pred));

      col += 16;
      second_pred += 16;
    } while (col < dst_width);

    src_ptr += src_stride;
    dst_ptr += dst_width;
  } while (--rows != 0);
}
605
// Implementation of aom_dist_wtd_comp_avg_pred for blocks having width >= 16.
static void dist_wtd_avg_pred(const uint8_t *src_ptr, uint8_t *dst_ptr,
                              int src_stride, int dst_width, int dst_height,
                              const uint8_t *second_pred,
                              const DIST_WTD_COMP_PARAMS *jcp_param) {
  // We only specialise on the filter values for large block sizes (>= 16x16.)
  assert(dst_width >= 16 && dst_width % 16 == 0);
  // Distance weights for the compound average.
  const uint8x16_t fwd_offset = vdupq_n_u8(jcp_param->fwd_offset);
  const uint8x16_t bck_offset = vdupq_n_u8(jcp_param->bck_offset);

  int rows = dst_height;
  do {
    int col = 0;
    do {
      uint8x16_t s = vld1q_u8(src_ptr + col);
      uint8x16_t pred = vld1q_u8(second_pred);
      vst1q_u8(dst_ptr + col,
               dist_wtd_avg_u8x16(s, pred, fwd_offset, bck_offset));

      col += 16;
      second_pred += 16;
    } while (col < dst_width);

    src_ptr += src_stride;
    dst_ptr += dst_width;
  } while (--rows != 0);
}
635
// General compound sub-pixel variance: horizontal bilinear pass into tmp0
// (with `padding` extra rows), vertical pass fused with the compound average
// into tmp1, then the full-pel variance kernel.
#define SUBPEL_AVG_VARIANCE_WXH_NEON(w, h, padding)                         \
  unsigned int aom_sub_pixel_avg_variance##w##x##h##_neon(                  \
      const uint8_t *src, int source_stride, int xoffset, int yoffset,      \
      const uint8_t *ref, int ref_stride, uint32_t *sse,                    \
      const uint8_t *second_pred) {                                         \
    uint8_t tmp0[w * (h + padding)];                                        \
    uint8_t tmp1[w * h];                                                    \
    var_filter_block2d_bil_w##w(src, tmp0, source_stride, 1, (h + padding), \
                                xoffset);                                   \
    avg_pred_var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset,      \
                                         second_pred);                      \
    return aom_variance##w##x##h(tmp1, w, ref, ref_stride, sse);            \
  }
649
// Specialised compound sub-pixel variance for large blocks: offsets 0 and 4
// use the cheaper copy/average kernels fused with the compound average.
#define SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(w, h, padding)               \
  unsigned int aom_sub_pixel_avg_variance##w##x##h##_neon(                    \
      const uint8_t *src, int source_stride, int xoffset, int yoffset,        \
      const uint8_t *ref, int ref_stride, unsigned int *sse,                  \
      const uint8_t *second_pred) {                                           \
    if (xoffset == 0) {                                                       \
      uint8_t tmp[w * h];                                                     \
      if (yoffset == 0) {                                                     \
        avg_pred(src, tmp, source_stride, w, h, second_pred);                 \
        return aom_variance##w##x##h(tmp, w, ref, ref_stride, sse);           \
      } else if (yoffset == 4) {                                              \
        avg_pred_var_filter_block2d_avg(src, tmp, source_stride,              \
                                        source_stride, w, h, second_pred);    \
        return aom_variance##w##x##h(tmp, w, ref, ref_stride, sse);           \
      } else {                                                                \
        avg_pred_var_filter_block2d_bil_w##w(                                 \
            src, tmp, source_stride, source_stride, h, yoffset, second_pred); \
        return aom_variance##w##x##h(tmp, w, ref, ref_stride, sse);           \
      }                                                                       \
    } else if (xoffset == 4) {                                                \
      uint8_t tmp0[w * (h + padding)];                                        \
      if (yoffset == 0) {                                                     \
        avg_pred_var_filter_block2d_avg(src, tmp0, source_stride, 1, w, h,    \
                                        second_pred);                         \
        return aom_variance##w##x##h(tmp0, w, ref, ref_stride, sse);          \
      } else if (yoffset == 4) {                                              \
        uint8_t tmp1[w * (h + padding)];                                      \
        var_filter_block2d_avg(src, tmp0, source_stride, 1, w, (h + padding)); \
        avg_pred_var_filter_block2d_avg(tmp0, tmp1, w, w, w, h, second_pred); \
        return aom_variance##w##x##h(tmp1, w, ref, ref_stride, sse);          \
      } else {                                                                \
        uint8_t tmp1[w * (h + padding)];                                      \
        var_filter_block2d_avg(src, tmp0, source_stride, 1, w, (h + padding)); \
        avg_pred_var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset,    \
                                             second_pred);                    \
        return aom_variance##w##x##h(tmp1, w, ref, ref_stride, sse);          \
      }                                                                       \
    } else {                                                                  \
      uint8_t tmp0[w * (h + padding)];                                        \
      if (yoffset == 0) {                                                     \
        avg_pred_var_filter_block2d_bil_w##w(src, tmp0, source_stride, 1, h,  \
                                             xoffset, second_pred);           \
        return aom_variance##w##x##h(tmp0, w, ref, ref_stride, sse);          \
      } else if (yoffset == 4) {                                              \
        uint8_t tmp1[w * h];                                                  \
        var_filter_block2d_bil_w##w(src, tmp0, source_stride, 1,              \
                                    (h + padding), xoffset);                  \
        avg_pred_var_filter_block2d_avg(tmp0, tmp1, w, w, w, h, second_pred); \
        return aom_variance##w##x##h(tmp1, w, ref, ref_stride, sse);          \
      } else {                                                                \
        uint8_t tmp1[w * h];                                                  \
        var_filter_block2d_bil_w##w(src, tmp0, source_stride, 1,              \
                                    (h + padding), xoffset);                  \
        avg_pred_var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset,    \
                                             second_pred);                    \
        return aom_variance##w##x##h(tmp1, w, ref, ref_stride, sse);          \
      }                                                                       \
    }                                                                         \
  }
709
// Instantiate the compound (second-prediction) sub-pixel variance kernels
// for every supported block size.
SUBPEL_AVG_VARIANCE_WXH_NEON(4, 4, 2)
SUBPEL_AVG_VARIANCE_WXH_NEON(4, 8, 2)

SUBPEL_AVG_VARIANCE_WXH_NEON(8, 4, 1)
SUBPEL_AVG_VARIANCE_WXH_NEON(8, 8, 1)
SUBPEL_AVG_VARIANCE_WXH_NEON(8, 16, 1)

SUBPEL_AVG_VARIANCE_WXH_NEON(16, 8, 1)
SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(16, 16, 1)
SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(16, 32, 1)

SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(32, 16, 1)
SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(32, 32, 1)
SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(32, 64, 1)

SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(64, 32, 1)
SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(64, 64, 1)
SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(64, 128, 1)

SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(128, 64, 1)
SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(128, 128, 1)

// Realtime mode doesn't use 4x rectangular blocks.
#if !CONFIG_REALTIME_ONLY

SUBPEL_AVG_VARIANCE_WXH_NEON(4, 16, 2)

SUBPEL_AVG_VARIANCE_WXH_NEON(8, 32, 1)

SUBPEL_AVG_VARIANCE_WXH_NEON(16, 4, 1)
SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(16, 64, 1)

SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(32, 8, 1)

SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(64, 16, 1)

#endif  // !CONFIG_REALTIME_ONLY

#undef SUBPEL_AVG_VARIANCE_WXH_NEON
#undef SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON
749
// Generic dist-weighted sub-pixel avg variance path: always run the
// horizontal bilinear filter (producing `padding` extra rows), then the
// vertical bilinear filter fused with the distance-weighted average
// against `second_pred`, and finally take the variance of the result
// against `ref`.
#define DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(w, h, padding)                \
  unsigned int aom_dist_wtd_sub_pixel_avg_variance##w##x##h##_neon(         \
      const uint8_t *src, int source_stride, int xoffset, int yoffset,      \
      const uint8_t *ref, int ref_stride, uint32_t *sse,                    \
      const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS *jcp_param) {  \
    uint8_t tmp0[w * (h + padding)];                                        \
    uint8_t tmp1[w * h];                                                    \
    var_filter_block2d_bil_w##w(src, tmp0, source_stride, 1, (h + padding), \
                                xoffset);                                   \
    dist_wtd_avg_pred_var_filter_block2d_bil_w##w(                          \
        tmp0, tmp1, w, w, h, yoffset, second_pred, jcp_param);              \
    return aom_variance##w##x##h(tmp1, w, ref, ref_stride, sse);            \
  }
763
// Specialized dist-weighted path for larger blocks: dispatch on
// (xoffset, yoffset) so that zero offsets skip filtering in that
// direction and half-pel offsets (== 4) use the cheaper 2-tap averaging
// kernel; only other offsets fall back to the general bilinear kernels.
// The distance-weighted average with `second_pred` is fused into the
// final filtering stage of each branch.
#define SPECIALIZED_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(w, h, padding)      \
  unsigned int aom_dist_wtd_sub_pixel_avg_variance##w##x##h##_neon(           \
      const uint8_t *src, int source_stride, int xoffset, int yoffset,        \
      const uint8_t *ref, int ref_stride, unsigned int *sse,                  \
      const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS *jcp_param) {    \
    if (xoffset == 0) {                                                       \
      uint8_t tmp[w * h];                                                     \
      if (yoffset == 0) {                                                     \
        /* No filtering at all: just the distance-weighted average. */        \
        dist_wtd_avg_pred(src, tmp, source_stride, w, h, second_pred,         \
                          jcp_param);                                         \
        return aom_variance##w##x##h(tmp, w, ref, ref_stride, sse);           \
      } else if (yoffset == 4) {                                              \
        dist_wtd_avg_pred_var_filter_block2d_avg(src, tmp, source_stride,     \
                                                 source_stride, w, h,         \
                                                 second_pred, jcp_param);     \
        return aom_variance##w##x##h(tmp, w, ref, ref_stride, sse);           \
      } else {                                                                \
        dist_wtd_avg_pred_var_filter_block2d_bil_w##w(                        \
            src, tmp, source_stride, source_stride, h, yoffset, second_pred,  \
            jcp_param);                                                       \
        return aom_variance##w##x##h(tmp, w, ref, ref_stride, sse);           \
      }                                                                       \
    } else if (xoffset == 4) {                                                \
      uint8_t tmp0[w * (h + padding)];                                        \
      if (yoffset == 0) {                                                     \
        dist_wtd_avg_pred_var_filter_block2d_avg(                             \
            src, tmp0, source_stride, 1, w, h, second_pred, jcp_param);       \
        return aom_variance##w##x##h(tmp0, w, ref, ref_stride, sse);          \
      } else if (yoffset == 4) {                                              \
        /* Only h rows are written to tmp1, so w * h bytes suffice. */        \
        uint8_t tmp1[w * h];                                                  \
        var_filter_block2d_avg(src, tmp0, source_stride, 1, w, (h + padding)); \
        dist_wtd_avg_pred_var_filter_block2d_avg(tmp0, tmp1, w, w, w, h,      \
                                                 second_pred, jcp_param);     \
        return aom_variance##w##x##h(tmp1, w, ref, ref_stride, sse);          \
      } else {                                                                \
        uint8_t tmp1[w * h];                                                  \
        var_filter_block2d_avg(src, tmp0, source_stride, 1, w, (h + padding)); \
        dist_wtd_avg_pred_var_filter_block2d_bil_w##w(                        \
            tmp0, tmp1, w, w, h, yoffset, second_pred, jcp_param);            \
        return aom_variance##w##x##h(tmp1, w, ref, ref_stride, sse);          \
      }                                                                       \
    } else {                                                                  \
      uint8_t tmp0[w * (h + padding)];                                        \
      if (yoffset == 0) {                                                     \
        dist_wtd_avg_pred_var_filter_block2d_bil_w##w(                        \
            src, tmp0, source_stride, 1, h, xoffset, second_pred, jcp_param); \
        return aom_variance##w##x##h(tmp0, w, ref, ref_stride, sse);          \
      } else if (yoffset == 4) {                                              \
        uint8_t tmp1[w * h];                                                  \
        var_filter_block2d_bil_w##w(src, tmp0, source_stride, 1,              \
                                    (h + padding), xoffset);                  \
        dist_wtd_avg_pred_var_filter_block2d_avg(tmp0, tmp1, w, w, w, h,      \
                                                 second_pred, jcp_param);     \
        return aom_variance##w##x##h(tmp1, w, ref, ref_stride, sse);          \
      } else {                                                                \
        uint8_t tmp1[w * h];                                                  \
        var_filter_block2d_bil_w##w(src, tmp0, source_stride, 1,              \
                                    (h + padding), xoffset);                  \
        dist_wtd_avg_pred_var_filter_block2d_bil_w##w(                        \
            tmp0, tmp1, w, w, h, yoffset, second_pred, jcp_param);            \
        return aom_variance##w##x##h(tmp1, w, ref, ref_stride, sse);          \
      }                                                                       \
    }                                                                         \
  }
828
// Instantiations. Blocks up to 16x8 use the generic always-filter macro;
// 16x16 and larger use the specialized macro that branches on the offsets.
DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(4, 4, 2)
DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(4, 8, 2)

DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 4, 1)
DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 8, 1)
DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 16, 1)

DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(16, 8, 1)
SPECIALIZED_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(16, 16, 1)
SPECIALIZED_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(16, 32, 1)

SPECIALIZED_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(32, 16, 1)
SPECIALIZED_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(32, 32, 1)
SPECIALIZED_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(32, 64, 1)

SPECIALIZED_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(64, 32, 1)
SPECIALIZED_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(64, 64, 1)
SPECIALIZED_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(64, 128, 1)

SPECIALIZED_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(128, 64, 1)
SPECIALIZED_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(128, 128, 1)

// Realtime mode doesn't use 4:1/1:4 rectangular blocks.
#if !CONFIG_REALTIME_ONLY

DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(4, 16, 2)

DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 32, 1)

DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(16, 4, 1)
SPECIALIZED_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(16, 64, 1)

SPECIALIZED_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(32, 8, 1)

SPECIALIZED_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(64, 16, 1)

#endif  // !CONFIG_REALTIME_ONLY

#undef DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON
#undef SPECIALIZED_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON
868
869 #if !CONFIG_REALTIME_ONLY
870
// Generic OBMC sub-pixel variance path: always apply the horizontal then
// vertical bilinear filters before computing the OBMC variance.
// `padding` is the number of extra rows the horizontal pass must produce
// as input for the vertical pass (2 for width 4, otherwise 1).
// NOTE: macro parameters are parenthesized in the expansion
// ((h + padding)) for hygiene, matching the other macros in this file.
#define OBMC_SUBPEL_VARIANCE_WXH_NEON(w, h, padding)                     \
  unsigned int aom_obmc_sub_pixel_variance##w##x##h##_neon(              \
      const uint8_t *pre, int pre_stride, int xoffset, int yoffset,      \
      const int32_t *wsrc, const int32_t *mask, unsigned int *sse) {     \
    uint8_t tmp0[w * (h + padding)];                                     \
    uint8_t tmp1[w * h];                                                 \
    var_filter_block2d_bil_w##w(pre, tmp0, pre_stride, 1, (h + padding), \
                                xoffset);                                \
    var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset);           \
    return aom_obmc_variance##w##x##h(tmp1, w, wsrc, mask, sse);         \
  }
882
// Specialized OBMC path: dispatch on (xoffset, yoffset) so that zero
// offsets skip filtering in that direction entirely and half-pel offsets
// (== 4) use the cheaper 2-tap averaging kernel; only other offsets use
// the general bilinear kernels.
#define SPECIALIZED_OBMC_SUBPEL_VARIANCE_WXH_NEON(w, h, padding)            \
  unsigned int aom_obmc_sub_pixel_variance##w##x##h##_neon(                 \
      const uint8_t *pre, int pre_stride, int xoffset, int yoffset,         \
      const int32_t *wsrc, const int32_t *mask, unsigned int *sse) {        \
    if (xoffset == 0) {                                                     \
      if (yoffset == 0) {                                                   \
        /* No filtering needed: compute variance on pre directly. */        \
        return aom_obmc_variance##w##x##h##_neon(pre, pre_stride, wsrc,     \
                                                 mask, sse);                \
      } else if (yoffset == 4) {                                            \
        uint8_t tmp[w * h];                                                 \
        var_filter_block2d_avg(pre, tmp, pre_stride, pre_stride, w, h);     \
        return aom_obmc_variance##w##x##h##_neon(tmp, w, wsrc, mask, sse);  \
      } else {                                                              \
        uint8_t tmp[w * h];                                                 \
        var_filter_block2d_bil_w##w(pre, tmp, pre_stride, pre_stride, h,    \
                                    yoffset);                               \
        return aom_obmc_variance##w##x##h##_neon(tmp, w, wsrc, mask, sse);  \
      }                                                                     \
    } else if (xoffset == 4) {                                              \
      uint8_t tmp0[w * (h + padding)];                                      \
      if (yoffset == 0) {                                                   \
        var_filter_block2d_avg(pre, tmp0, pre_stride, 1, w, h);             \
        return aom_obmc_variance##w##x##h##_neon(tmp0, w, wsrc, mask, sse); \
      } else if (yoffset == 4) {                                            \
        /* Only h rows are written to tmp1, so w * h bytes suffice. */      \
        uint8_t tmp1[w * h];                                                \
        var_filter_block2d_avg(pre, tmp0, pre_stride, 1, w, (h + padding)); \
        var_filter_block2d_avg(tmp0, tmp1, w, w, w, h);                     \
        return aom_obmc_variance##w##x##h##_neon(tmp1, w, wsrc, mask, sse); \
      } else {                                                              \
        uint8_t tmp1[w * h];                                                \
        var_filter_block2d_avg(pre, tmp0, pre_stride, 1, w, (h + padding)); \
        var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset);          \
        return aom_obmc_variance##w##x##h##_neon(tmp1, w, wsrc, mask, sse); \
      }                                                                     \
    } else {                                                                \
      uint8_t tmp0[w * (h + padding)];                                      \
      if (yoffset == 0) {                                                   \
        var_filter_block2d_bil_w##w(pre, tmp0, pre_stride, 1, h, xoffset);  \
        return aom_obmc_variance##w##x##h##_neon(tmp0, w, wsrc, mask, sse); \
      } else if (yoffset == 4) {                                            \
        uint8_t tmp1[w * h];                                                \
        var_filter_block2d_bil_w##w(pre, tmp0, pre_stride, 1,               \
                                    (h + padding), xoffset);                \
        var_filter_block2d_avg(tmp0, tmp1, w, w, w, h);                     \
        return aom_obmc_variance##w##x##h##_neon(tmp1, w, wsrc, mask, sse); \
      } else {                                                              \
        uint8_t tmp1[w * h];                                                \
        var_filter_block2d_bil_w##w(pre, tmp0, pre_stride, 1,               \
                                    (h + padding), xoffset);                \
        var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset);          \
        return aom_obmc_variance##w##x##h##_neon(tmp1, w, wsrc, mask, sse); \
      }                                                                     \
    }                                                                       \
  }
937
// Instantiations. Blocks up to 16x8 use the generic always-filter macro;
// 16x16 and larger use the specialized macro that branches on the offsets.
OBMC_SUBPEL_VARIANCE_WXH_NEON(4, 4, 2)
OBMC_SUBPEL_VARIANCE_WXH_NEON(4, 8, 2)
OBMC_SUBPEL_VARIANCE_WXH_NEON(4, 16, 2)

OBMC_SUBPEL_VARIANCE_WXH_NEON(8, 4, 1)
OBMC_SUBPEL_VARIANCE_WXH_NEON(8, 8, 1)
OBMC_SUBPEL_VARIANCE_WXH_NEON(8, 16, 1)
OBMC_SUBPEL_VARIANCE_WXH_NEON(8, 32, 1)

OBMC_SUBPEL_VARIANCE_WXH_NEON(16, 4, 1)
OBMC_SUBPEL_VARIANCE_WXH_NEON(16, 8, 1)
SPECIALIZED_OBMC_SUBPEL_VARIANCE_WXH_NEON(16, 16, 1)
SPECIALIZED_OBMC_SUBPEL_VARIANCE_WXH_NEON(16, 32, 1)
SPECIALIZED_OBMC_SUBPEL_VARIANCE_WXH_NEON(16, 64, 1)

SPECIALIZED_OBMC_SUBPEL_VARIANCE_WXH_NEON(32, 8, 1)
SPECIALIZED_OBMC_SUBPEL_VARIANCE_WXH_NEON(32, 16, 1)
SPECIALIZED_OBMC_SUBPEL_VARIANCE_WXH_NEON(32, 32, 1)
SPECIALIZED_OBMC_SUBPEL_VARIANCE_WXH_NEON(32, 64, 1)

SPECIALIZED_OBMC_SUBPEL_VARIANCE_WXH_NEON(64, 16, 1)
SPECIALIZED_OBMC_SUBPEL_VARIANCE_WXH_NEON(64, 32, 1)
SPECIALIZED_OBMC_SUBPEL_VARIANCE_WXH_NEON(64, 64, 1)
SPECIALIZED_OBMC_SUBPEL_VARIANCE_WXH_NEON(64, 128, 1)

SPECIALIZED_OBMC_SUBPEL_VARIANCE_WXH_NEON(128, 64, 1)
SPECIALIZED_OBMC_SUBPEL_VARIANCE_WXH_NEON(128, 128, 1)

#undef OBMC_SUBPEL_VARIANCE_WXH_NEON
#undef SPECIALIZED_OBMC_SUBPEL_VARIANCE_WXH_NEON
#endif  // !CONFIG_REALTIME_ONLY
969
// Generic masked sub-pixel variance path: apply the horizontal then
// vertical bilinear filters, blend the filtered prediction with
// `second_pred` under `msk` via aom_comp_mask_pred_neon, then take the
// variance of the blended block against `ref`. `padding` is the number
// of extra rows produced by the horizontal pass (2 for width 4,
// otherwise 1).
#define MASKED_SUBPEL_VARIANCE_WXH_NEON(w, h, padding)                        \
  unsigned int aom_masked_sub_pixel_variance##w##x##h##_neon(                 \
      const uint8_t *src, int src_stride, int xoffset, int yoffset,           \
      const uint8_t *ref, int ref_stride, const uint8_t *second_pred,         \
      const uint8_t *msk, int msk_stride, int invert_mask,                    \
      unsigned int *sse) {                                                    \
    uint8_t tmp0[w * (h + padding)];                                          \
    uint8_t tmp1[w * h];                                                      \
    uint8_t tmp2[w * h];                                                      \
    var_filter_block2d_bil_w##w(src, tmp0, src_stride, 1, (h + padding),      \
                                xoffset);                                     \
    var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset);                \
    aom_comp_mask_pred_neon(tmp2, second_pred, w, h, tmp1, w, msk, msk_stride, \
                            invert_mask);                                     \
    return aom_variance##w##x##h(tmp2, w, ref, ref_stride, sse);              \
  }
986
// Specialized masked path: dispatch on (xoffset, yoffset) so that zero
// offsets skip filtering in that direction and half-pel offsets (== 4)
// use the cheaper 2-tap averaging kernel; only other offsets fall back
// to the general bilinear kernels. The filtered prediction is blended
// with `second_pred` under `msk` before taking the variance against
// `ref`.
#define SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(w, h, padding)            \
  unsigned int aom_masked_sub_pixel_variance##w##x##h##_neon(                 \
      const uint8_t *src, int src_stride, int xoffset, int yoffset,           \
      const uint8_t *ref, int ref_stride, const uint8_t *second_pred,         \
      const uint8_t *msk, int msk_stride, int invert_mask,                    \
      unsigned int *sse) {                                                    \
    if (xoffset == 0) {                                                       \
      uint8_t tmp0[w * h];                                                    \
      if (yoffset == 0) {                                                     \
        /* No filtering at all: blend src with second_pred directly. */       \
        aom_comp_mask_pred_neon(tmp0, second_pred, w, h, src, src_stride,     \
                                msk, msk_stride, invert_mask);                \
        return aom_variance##w##x##h(tmp0, w, ref, ref_stride, sse);          \
      } else if (yoffset == 4) {                                              \
        uint8_t tmp1[w * h];                                                  \
        var_filter_block2d_avg(src, tmp0, src_stride, src_stride, w, h);      \
        aom_comp_mask_pred_neon(tmp1, second_pred, w, h, tmp0, w, msk,        \
                                msk_stride, invert_mask);                     \
        return aom_variance##w##x##h(tmp1, w, ref, ref_stride, sse);          \
      } else {                                                                \
        uint8_t tmp1[w * h];                                                  \
        var_filter_block2d_bil_w##w(src, tmp0, src_stride, src_stride, h,     \
                                    yoffset);                                 \
        aom_comp_mask_pred_neon(tmp1, second_pred, w, h, tmp0, w, msk,        \
                                msk_stride, invert_mask);                     \
        return aom_variance##w##x##h(tmp1, w, ref, ref_stride, sse);          \
      }                                                                       \
    } else if (xoffset == 4) {                                                \
      uint8_t tmp0[w * (h + padding)];                                        \
      if (yoffset == 0) {                                                     \
        uint8_t tmp1[w * h];                                                  \
        var_filter_block2d_avg(src, tmp0, src_stride, 1, w, h);               \
        aom_comp_mask_pred_neon(tmp1, second_pred, w, h, tmp0, w, msk,        \
                                msk_stride, invert_mask);                     \
        return aom_variance##w##x##h(tmp1, w, ref, ref_stride, sse);          \
      } else if (yoffset == 4) {                                              \
        uint8_t tmp1[w * h];                                                  \
        uint8_t tmp2[w * h];                                                  \
        var_filter_block2d_avg(src, tmp0, src_stride, 1, w, (h + padding));   \
        var_filter_block2d_avg(tmp0, tmp1, w, w, w, h);                       \
        aom_comp_mask_pred_neon(tmp2, second_pred, w, h, tmp1, w, msk,        \
                                msk_stride, invert_mask);                     \
        return aom_variance##w##x##h(tmp2, w, ref, ref_stride, sse);          \
      } else {                                                                \
        uint8_t tmp1[w * h];                                                  \
        uint8_t tmp2[w * h];                                                  \
        var_filter_block2d_avg(src, tmp0, src_stride, 1, w, (h + padding));   \
        var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset);            \
        aom_comp_mask_pred_neon(tmp2, second_pred, w, h, tmp1, w, msk,        \
                                msk_stride, invert_mask);                     \
        return aom_variance##w##x##h(tmp2, w, ref, ref_stride, sse);          \
      }                                                                       \
    } else {                                                                  \
      if (yoffset == 0) {                                                     \
        uint8_t tmp0[w * h];                                                  \
        uint8_t tmp1[w * h];                                                  \
        var_filter_block2d_bil_w##w(src, tmp0, src_stride, 1, h, xoffset);    \
        aom_comp_mask_pred_neon(tmp1, second_pred, w, h, tmp0, w, msk,        \
                                msk_stride, invert_mask);                     \
        return aom_variance##w##x##h(tmp1, w, ref, ref_stride, sse);          \
      } else if (yoffset == 4) {                                              \
        uint8_t tmp0[w * (h + padding)];                                      \
        uint8_t tmp1[w * h];                                                  \
        uint8_t tmp2[w * h];                                                  \
        var_filter_block2d_bil_w##w(src, tmp0, src_stride, 1, (h + padding),  \
                                    xoffset);                                 \
        var_filter_block2d_avg(tmp0, tmp1, w, w, w, h);                       \
        aom_comp_mask_pred_neon(tmp2, second_pred, w, h, tmp1, w, msk,        \
                                msk_stride, invert_mask);                     \
        return aom_variance##w##x##h(tmp2, w, ref, ref_stride, sse);          \
      } else {                                                                \
        uint8_t tmp0[w * (h + padding)];                                      \
        /* Only h rows are written to tmp1, so w * h bytes suffice. */        \
        uint8_t tmp1[w * h];                                                  \
        uint8_t tmp2[w * h];                                                  \
        var_filter_block2d_bil_w##w(src, tmp0, src_stride, 1, (h + padding),  \
                                    xoffset);                                 \
        var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset);            \
        aom_comp_mask_pred_neon(tmp2, second_pred, w, h, tmp1, w, msk,        \
                                msk_stride, invert_mask);                     \
        return aom_variance##w##x##h(tmp2, w, ref, ref_stride, sse);          \
      }                                                                       \
    }                                                                         \
  }
1069
// Instantiations. Blocks up to 16x8 use the generic always-filter macro;
// 16x16 and larger use the specialized macro that branches on the offsets.
MASKED_SUBPEL_VARIANCE_WXH_NEON(4, 4, 2)
MASKED_SUBPEL_VARIANCE_WXH_NEON(4, 8, 2)

MASKED_SUBPEL_VARIANCE_WXH_NEON(8, 4, 1)
MASKED_SUBPEL_VARIANCE_WXH_NEON(8, 8, 1)
MASKED_SUBPEL_VARIANCE_WXH_NEON(8, 16, 1)

MASKED_SUBPEL_VARIANCE_WXH_NEON(16, 8, 1)
SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(16, 16, 1)
SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(16, 32, 1)

SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(32, 16, 1)
SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(32, 32, 1)
SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(32, 64, 1)

SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(64, 32, 1)
SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(64, 64, 1)
SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(64, 128, 1)

SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(128, 64, 1)
SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(128, 128, 1)

// Realtime mode doesn't use 4x rectangular blocks.
#if !CONFIG_REALTIME_ONLY
MASKED_SUBPEL_VARIANCE_WXH_NEON(4, 16, 2)
MASKED_SUBPEL_VARIANCE_WXH_NEON(8, 32, 1)
MASKED_SUBPEL_VARIANCE_WXH_NEON(16, 4, 1)
SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(16, 64, 1)
SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(32, 8, 1)
SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(64, 16, 1)
#endif  // !CONFIG_REALTIME_ONLY

#undef MASKED_SUBPEL_VARIANCE_WXH_NEON
#undef SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON
1104