/*
 *  Copyright (c) 2016 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include <arm_neon.h>
#include <assert.h>

#include "./vpx_config.h"
#include "./vpx_dsp_rtcd.h"
#include "vpx/vpx_integer.h"
#include "vpx_dsp/arm/highbd_convolve8_neon.h"
#include "vpx_dsp/arm/mem_neon.h"
#include "vpx_dsp/arm/transpose_neon.h"
#include "vpx_dsp/vpx_dsp_common.h"
#include "vpx_dsp/vpx_filter.h"
#include "vpx_ports/mem.h"

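// Filter a single group of 4 high-bitdepth samples with the 8-tap kernel in
// 'filters'. The accumulated sum is rounded and narrowed by FILTER_BITS and
// clamped to 'max', the largest value representable at the current bit depth.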
static INLINE uint16x4_t
highbd_convolve8_4(const int16x4_t s0, const int16x4_t s1, const int16x4_t s2,
                   const int16x4_t s3, const int16x4_t s4, const int16x4_t s5,
                   const int16x4_t s6, const int16x4_t s7,
                   const int16x8_t filters, const uint16x4_t max) {
  const int16x4_t filters_lo = vget_low_s16(filters);
  const int16x4_t filters_hi = vget_high_s16(filters);

  int32x4_t sum = vmull_lane_s16(s0, filters_lo, 0);
  sum = vmlal_lane_s16(sum, s1, filters_lo, 1);
  sum = vmlal_lane_s16(sum, s2, filters_lo, 2);
  sum = vmlal_lane_s16(sum, s3, filters_lo, 3);
  sum = vmlal_lane_s16(sum, s4, filters_hi, 0);
  sum = vmlal_lane_s16(sum, s5, filters_hi, 1);
  sum = vmlal_lane_s16(sum, s6, filters_hi, 2);
  sum = vmlal_lane_s16(sum, s7, filters_hi, 3);

  uint16x4_t res = vqrshrun_n_s32(sum, FILTER_BITS);
  return vmin_u16(res, max);
}

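// As highbd_convolve8_4, but producing 8 output samples per call: the low and
// high halves of each 8-wide input vector are accumulated separately and then
// recombined before clamping to 'max'.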
static INLINE uint16x8_t
highbd_convolve8_8(const int16x8_t s0, const int16x8_t s1, const int16x8_t s2,
                   const int16x8_t s3, const int16x8_t s4, const int16x8_t s5,
                   const int16x8_t s6, const int16x8_t s7,
                   const int16x8_t filters, const uint16x8_t max) {
  const int16x4_t filters_lo = vget_low_s16(filters);
  const int16x4_t filters_hi = vget_high_s16(filters);

  int32x4_t sum0 = vmull_lane_s16(vget_low_s16(s0), filters_lo, 0);
  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s1), filters_lo, 1);
  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s2), filters_lo, 2);
  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s3), filters_lo, 3);
  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s4), filters_hi, 0);
  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s5), filters_hi, 1);
  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s6), filters_hi, 2);
  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s7), filters_hi, 3);

  int32x4_t sum1 = vmull_lane_s16(vget_high_s16(s0), filters_lo, 0);
  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s1), filters_lo, 1);
  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s2), filters_lo, 2);
  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s3), filters_lo, 3);
  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s4), filters_hi, 0);
  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s5), filters_hi, 1);
  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s6), filters_hi, 2);
  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s7), filters_hi, 3);

  uint16x8_t res = vcombine_u16(vqrshrun_n_s32(sum0, FILTER_BITS),
                                vqrshrun_n_s32(sum1, FILTER_BITS));
  return vminq_u16(res, max);
}

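// Horizontal 4-tap convolution. Blocks of width 4 are processed 4 rows at a
// time; wider blocks are processed in 8-wide columns, also 4 rows at a time.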
static INLINE void highbd_convolve_4tap_horiz_neon(
    const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst,
    ptrdiff_t dst_stride, int w, int h, const int16x4_t filter, int bd) {
  if (w == 4) {
    const uint16x4_t max = vdup_n_u16((1 << bd) - 1);
    const int16_t *s = (const int16_t *)src;
    uint16_t *d = dst;

    do {
      int16x4_t s0[4], s1[4], s2[4], s3[4];
      load_s16_4x4(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3]);
      load_s16_4x4(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3]);
      load_s16_4x4(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3]);
      load_s16_4x4(s + 3 * src_stride, 1, &s3[0], &s3[1], &s3[2], &s3[3]);

      uint16x4_t d0 =
          highbd_convolve4_4_neon(s0[0], s0[1], s0[2], s0[3], filter, max);
      uint16x4_t d1 =
          highbd_convolve4_4_neon(s1[0], s1[1], s1[2], s1[3], filter, max);
      uint16x4_t d2 =
          highbd_convolve4_4_neon(s2[0], s2[1], s2[2], s2[3], filter, max);
      uint16x4_t d3 =
          highbd_convolve4_4_neon(s3[0], s3[1], s3[2], s3[3], filter, max);

      store_u16_4x4(d, dst_stride, d0, d1, d2, d3);

      s += 4 * src_stride;
      d += 4 * dst_stride;
      h -= 4;
    } while (h != 0);
  } else {
    const uint16x8_t max = vdupq_n_u16((1 << bd) - 1);

    do {
      const int16_t *s = (const int16_t *)src;
      uint16_t *d = dst;
      int width = w;

      do {
        int16x8_t s0[4], s1[4], s2[4], s3[4];
        load_s16_8x4(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3]);
        load_s16_8x4(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3]);
        load_s16_8x4(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3]);
        load_s16_8x4(s + 3 * src_stride, 1, &s3[0], &s3[1], &s3[2], &s3[3]);

        uint16x8_t d0 =
            highbd_convolve4_8_neon(s0[0], s0[1], s0[2], s0[3], filter, max);
        uint16x8_t d1 =
            highbd_convolve4_8_neon(s1[0], s1[1], s1[2], s1[3], filter, max);
        uint16x8_t d2 =
            highbd_convolve4_8_neon(s2[0], s2[1], s2[2], s2[3], filter, max);
        uint16x8_t d3 =
            highbd_convolve4_8_neon(s3[0], s3[1], s3[2], s3[3], filter, max);

        store_u16_8x4(d, dst_stride, d0, d1, d2, d3);

        s += 8;
        d += 8;
        width -= 8;
      } while (width != 0);
      src += 4 * src_stride;
      dst += 4 * dst_stride;
      h -= 4;
    } while (h != 0);
  }
}

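// Horizontal 8-tap convolution, structured like the 4-tap version above but
// loading 8 consecutive samples per output position.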
static INLINE void highbd_convolve_8tap_horiz_neon(
    const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst,
    ptrdiff_t dst_stride, int w, int h, const int16x8_t filter, int bd) {
  if (w == 4) {
    const uint16x4_t max = vdup_n_u16((1 << bd) - 1);
    const int16_t *s = (const int16_t *)src;
    uint16_t *d = dst;

    do {
      int16x4_t s0[8], s1[8], s2[8], s3[8];
      load_s16_4x8(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3],
                   &s0[4], &s0[5], &s0[6], &s0[7]);
      load_s16_4x8(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3],
                   &s1[4], &s1[5], &s1[6], &s1[7]);
      load_s16_4x8(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3],
                   &s2[4], &s2[5], &s2[6], &s2[7]);
      load_s16_4x8(s + 3 * src_stride, 1, &s3[0], &s3[1], &s3[2], &s3[3],
                   &s3[4], &s3[5], &s3[6], &s3[7]);

      uint16x4_t d0 = highbd_convolve8_4(s0[0], s0[1], s0[2], s0[3], s0[4],
                                         s0[5], s0[6], s0[7], filter, max);
      uint16x4_t d1 = highbd_convolve8_4(s1[0], s1[1], s1[2], s1[3], s1[4],
                                         s1[5], s1[6], s1[7], filter, max);
      uint16x4_t d2 = highbd_convolve8_4(s2[0], s2[1], s2[2], s2[3], s2[4],
                                         s2[5], s2[6], s2[7], filter, max);
      uint16x4_t d3 = highbd_convolve8_4(s3[0], s3[1], s3[2], s3[3], s3[4],
                                         s3[5], s3[6], s3[7], filter, max);

      store_u16_4x4(d, dst_stride, d0, d1, d2, d3);

      s += 4 * src_stride;
      d += 4 * dst_stride;
      h -= 4;
    } while (h != 0);
  } else {
    const uint16x8_t max = vdupq_n_u16((1 << bd) - 1);

    do {
      const int16_t *s = (const int16_t *)src;
      uint16_t *d = dst;
      int width = w;

      do {
        int16x8_t s0[8], s1[8], s2[8], s3[8];
        load_s16_8x8(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3],
                     &s0[4], &s0[5], &s0[6], &s0[7]);
        load_s16_8x8(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3],
                     &s1[4], &s1[5], &s1[6], &s1[7]);
        load_s16_8x8(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3],
                     &s2[4], &s2[5], &s2[6], &s2[7]);
        load_s16_8x8(s + 3 * src_stride, 1, &s3[0], &s3[1], &s3[2], &s3[3],
                     &s3[4], &s3[5], &s3[6], &s3[7]);

        uint16x8_t d0 = highbd_convolve8_8(s0[0], s0[1], s0[2], s0[3], s0[4],
                                           s0[5], s0[6], s0[7], filter, max);
        uint16x8_t d1 = highbd_convolve8_8(s1[0], s1[1], s1[2], s1[3], s1[4],
                                           s1[5], s1[6], s1[7], filter, max);
        uint16x8_t d2 = highbd_convolve8_8(s2[0], s2[1], s2[2], s2[3], s2[4],
                                           s2[5], s2[6], s2[7], filter, max);
        uint16x8_t d3 = highbd_convolve8_8(s3[0], s3[1], s3[2], s3[3], s3[4],
                                           s3[5], s3[6], s3[7], filter, max);

        store_u16_8x4(d, dst_stride, d0, d1, d2, d3);

        s += 8;
        d += 8;
        width -= 8;
      } while (width != 0);
      src += 4 * src_stride;
      dst += 4 * dst_stride;
      h -= 4;
    } while (h != 0);
  }
}

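// Horizontal entry point. Scaled convolutions (x_step_q4 != 16) fall back to
// the C implementation; otherwise the 4-tap or 8-tap kernel is chosen based on
// the selected filter's tap count.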
void vpx_highbd_convolve8_horiz_neon(const uint16_t *src, ptrdiff_t src_stride,
                                     uint16_t *dst, ptrdiff_t dst_stride,
                                     const InterpKernel *filter, int x0_q4,
                                     int x_step_q4, int y0_q4, int y_step_q4,
                                     int w, int h, int bd) {
  if (x_step_q4 != 16) {
    vpx_highbd_convolve8_horiz_c(src, src_stride, dst, dst_stride, filter,
                                 x0_q4, x_step_q4, y0_q4, y_step_q4, w, h, bd);
    return;
  }

  assert((intptr_t)dst % 4 == 0);
  assert(dst_stride % 4 == 0);
  assert(x_step_q4 == 16);

  (void)x_step_q4;
  (void)y0_q4;
  (void)y_step_q4;

  if (vpx_get_filter_taps(filter[x0_q4]) <= 4) {
    const int16x4_t x_filter_4tap = vld1_s16(filter[x0_q4] + 2);
    highbd_convolve_4tap_horiz_neon(src - 1, src_stride, dst, dst_stride, w, h,
                                    x_filter_4tap, bd);
  } else {
    const int16x8_t x_filter_8tap = vld1q_s16(filter[x0_q4]);
    highbd_convolve_8tap_horiz_neon(src - 3, src_stride, dst, dst_stride, w, h,
                                    x_filter_8tap, bd);
  }
}

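// Horizontal convolve-and-average: the filtered result is rounding-halved
// (vrhadd) with the pixels already in 'dst'. This path always uses the full
// 8-tap filter.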
void vpx_highbd_convolve8_avg_horiz_neon(const uint16_t *src,
                                         ptrdiff_t src_stride, uint16_t *dst,
                                         ptrdiff_t dst_stride,
                                         const InterpKernel *filter, int x0_q4,
                                         int x_step_q4, int y0_q4,
                                         int y_step_q4, int w, int h, int bd) {
  if (x_step_q4 != 16) {
    vpx_highbd_convolve8_avg_horiz_c(src, src_stride, dst, dst_stride, filter,
                                     x0_q4, x_step_q4, y0_q4, y_step_q4, w, h,
                                     bd);
    return;
  }

  assert((intptr_t)dst % 4 == 0);
  assert(dst_stride % 4 == 0);

  const int16x8_t filters = vld1q_s16(filter[x0_q4]);

  src -= 3;

  if (w == 4) {
    const uint16x4_t max = vdup_n_u16((1 << bd) - 1);
    const int16_t *s = (const int16_t *)src;
    uint16_t *d = dst;

    do {
      int16x4_t s0[8], s1[8], s2[8], s3[8];
      load_s16_4x8(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3],
                   &s0[4], &s0[5], &s0[6], &s0[7]);
      load_s16_4x8(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3],
                   &s1[4], &s1[5], &s1[6], &s1[7]);
      load_s16_4x8(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3],
                   &s2[4], &s2[5], &s2[6], &s2[7]);
      load_s16_4x8(s + 3 * src_stride, 1, &s3[0], &s3[1], &s3[2], &s3[3],
                   &s3[4], &s3[5], &s3[6], &s3[7]);

      uint16x4_t d0 = highbd_convolve8_4(s0[0], s0[1], s0[2], s0[3], s0[4],
                                         s0[5], s0[6], s0[7], filters, max);
      uint16x4_t d1 = highbd_convolve8_4(s1[0], s1[1], s1[2], s1[3], s1[4],
                                         s1[5], s1[6], s1[7], filters, max);
      uint16x4_t d2 = highbd_convolve8_4(s2[0], s2[1], s2[2], s2[3], s2[4],
                                         s2[5], s2[6], s2[7], filters, max);
      uint16x4_t d3 = highbd_convolve8_4(s3[0], s3[1], s3[2], s3[3], s3[4],
                                         s3[5], s3[6], s3[7], filters, max);

      d0 = vrhadd_u16(d0, vld1_u16(d + 0 * dst_stride));
      d1 = vrhadd_u16(d1, vld1_u16(d + 1 * dst_stride));
      d2 = vrhadd_u16(d2, vld1_u16(d + 2 * dst_stride));
      d3 = vrhadd_u16(d3, vld1_u16(d + 3 * dst_stride));

      store_u16_4x4(d, dst_stride, d0, d1, d2, d3);

      s += 4 * src_stride;
      d += 4 * dst_stride;
      h -= 4;
    } while (h != 0);
  } else {
    const uint16x8_t max = vdupq_n_u16((1 << bd) - 1);

    do {
      const int16_t *s = (const int16_t *)src;
      uint16_t *d = dst;
      int width = w;

      do {
        int16x8_t s0[8], s1[8], s2[8], s3[8];
        load_s16_8x8(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3],
                     &s0[4], &s0[5], &s0[6], &s0[7]);
        load_s16_8x8(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3],
                     &s1[4], &s1[5], &s1[6], &s1[7]);
        load_s16_8x8(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3],
                     &s2[4], &s2[5], &s2[6], &s2[7]);
        load_s16_8x8(s + 3 * src_stride, 1, &s3[0], &s3[1], &s3[2], &s3[3],
                     &s3[4], &s3[5], &s3[6], &s3[7]);

        uint16x8_t d0 = highbd_convolve8_8(s0[0], s0[1], s0[2], s0[3], s0[4],
                                           s0[5], s0[6], s0[7], filters, max);
        uint16x8_t d1 = highbd_convolve8_8(s1[0], s1[1], s1[2], s1[3], s1[4],
                                           s1[5], s1[6], s1[7], filters, max);
        uint16x8_t d2 = highbd_convolve8_8(s2[0], s2[1], s2[2], s2[3], s2[4],
                                           s2[5], s2[6], s2[7], filters, max);
        uint16x8_t d3 = highbd_convolve8_8(s3[0], s3[1], s3[2], s3[3], s3[4],
                                           s3[5], s3[6], s3[7], filters, max);

        d0 = vrhaddq_u16(d0, vld1q_u16(d + 0 * dst_stride));
        d1 = vrhaddq_u16(d1, vld1q_u16(d + 1 * dst_stride));
        d2 = vrhaddq_u16(d2, vld1q_u16(d + 2 * dst_stride));
        d3 = vrhaddq_u16(d3, vld1q_u16(d + 3 * dst_stride));

        store_u16_8x4(d, dst_stride, d0, d1, d2, d3);

        s += 8;
        d += 8;
        width -= 8;
      } while (width != 0);
      src += 4 * src_stride;
      dst += 4 * dst_stride;
      h -= 4;
    } while (h != 0);
  }
}

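// Vertical 4-tap convolution. Three rows of context are loaded before the
// loop and rotated through as each group of four output rows is produced.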
static INLINE void highbd_convolve_4tap_vert_neon(
    const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst,
    ptrdiff_t dst_stride, int w, int h, const int16x4_t filter, int bd) {
  if (w == 4) {
    const uint16x4_t max = vdup_n_u16((1 << bd) - 1);
    const int16_t *s = (const int16_t *)src;
    uint16_t *d = dst;

    int16x4_t s0, s1, s2;
    load_s16_4x3(s, src_stride, &s0, &s1, &s2);

    s += 3 * src_stride;

    do {
      int16x4_t s3, s4, s5, s6;
      load_s16_4x4(s, src_stride, &s3, &s4, &s5, &s6);

      uint16x4_t d0 = highbd_convolve4_4_neon(s0, s1, s2, s3, filter, max);
      uint16x4_t d1 = highbd_convolve4_4_neon(s1, s2, s3, s4, filter, max);
      uint16x4_t d2 = highbd_convolve4_4_neon(s2, s3, s4, s5, filter, max);
      uint16x4_t d3 = highbd_convolve4_4_neon(s3, s4, s5, s6, filter, max);

      store_u16_4x4(d, dst_stride, d0, d1, d2, d3);

      s0 = s4;
      s1 = s5;
      s2 = s6;
      s += 4 * src_stride;
      d += 4 * dst_stride;
      h -= 4;
    } while (h != 0);
  } else {
    const uint16x8_t max = vdupq_n_u16((1 << bd) - 1);

    do {
      const int16_t *s = (const int16_t *)src;
      uint16_t *d = dst;
      int height = h;

      int16x8_t s0, s1, s2;
      load_s16_8x3(s, src_stride, &s0, &s1, &s2);

      s += 3 * src_stride;

      do {
        int16x8_t s3, s4, s5, s6;
        load_s16_8x4(s, src_stride, &s3, &s4, &s5, &s6);

        uint16x8_t d0 = highbd_convolve4_8_neon(s0, s1, s2, s3, filter, max);
        uint16x8_t d1 = highbd_convolve4_8_neon(s1, s2, s3, s4, filter, max);
        uint16x8_t d2 = highbd_convolve4_8_neon(s2, s3, s4, s5, filter, max);
        uint16x8_t d3 = highbd_convolve4_8_neon(s3, s4, s5, s6, filter, max);

        store_u16_8x4(d, dst_stride, d0, d1, d2, d3);

        s0 = s4;
        s1 = s5;
        s2 = s6;
        s += 4 * src_stride;
        d += 4 * dst_stride;
        height -= 4;
      } while (height != 0);
      src += 8;
      dst += 8;
      w -= 8;
    } while (w != 0);
  }
}

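// Vertical 8-tap convolution: as above, but with seven rows of context
// carried between iterations.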
static INLINE void highbd_convolve_8tap_vert_neon(
    const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst,
    ptrdiff_t dst_stride, int w, int h, const int16x8_t filter, int bd) {
  if (w == 4) {
    const uint16x4_t max = vdup_n_u16((1 << bd) - 1);
    const int16_t *s = (const int16_t *)src;
    uint16_t *d = dst;

    int16x4_t s0, s1, s2, s3, s4, s5, s6;
    load_s16_4x7(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6);

    s += 7 * src_stride;

    do {
      int16x4_t s7, s8, s9, s10;
      load_s16_4x4(s, src_stride, &s7, &s8, &s9, &s10);

      uint16x4_t d0 =
          highbd_convolve8_4(s0, s1, s2, s3, s4, s5, s6, s7, filter, max);
      uint16x4_t d1 =
          highbd_convolve8_4(s1, s2, s3, s4, s5, s6, s7, s8, filter, max);
      uint16x4_t d2 =
          highbd_convolve8_4(s2, s3, s4, s5, s6, s7, s8, s9, filter, max);
      uint16x4_t d3 =
          highbd_convolve8_4(s3, s4, s5, s6, s7, s8, s9, s10, filter, max);

      store_u16_4x4(d, dst_stride, d0, d1, d2, d3);

      s0 = s4;
      s1 = s5;
      s2 = s6;
      s3 = s7;
      s4 = s8;
      s5 = s9;
      s6 = s10;
      s += 4 * src_stride;
      d += 4 * dst_stride;
      h -= 4;
    } while (h != 0);
  } else {
    const uint16x8_t max = vdupq_n_u16((1 << bd) - 1);

    do {
      const int16_t *s = (const int16_t *)src;
      uint16_t *d = dst;
      int height = h;

      int16x8_t s0, s1, s2, s3, s4, s5, s6;
      load_s16_8x7(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6);

      s += 7 * src_stride;

      do {
        int16x8_t s7, s8, s9, s10;
        load_s16_8x4(s, src_stride, &s7, &s8, &s9, &s10);

        uint16x8_t d0 =
            highbd_convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filter, max);
        uint16x8_t d1 =
            highbd_convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filter, max);
        uint16x8_t d2 =
            highbd_convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filter, max);
        uint16x8_t d3 =
            highbd_convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filter, max);

        store_u16_8x4(d, dst_stride, d0, d1, d2, d3);

        s0 = s4;
        s1 = s5;
        s2 = s6;
        s3 = s7;
        s4 = s8;
        s5 = s9;
        s6 = s10;
        s += 4 * src_stride;
        d += 4 * dst_stride;
        height -= 4;
      } while (height != 0);
      src += 8;
      dst += 8;
      w -= 8;
    } while (w != 0);
  }
}

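// Vertical entry point. Scaled convolutions (y_step_q4 != 16) fall back to
// the C implementation; otherwise the source is offset upwards to cover the
// filter support and the 4-tap or 8-tap kernel is used.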
void vpx_highbd_convolve8_vert_neon(const uint16_t *src, ptrdiff_t src_stride,
                                    uint16_t *dst, ptrdiff_t dst_stride,
                                    const InterpKernel *filter, int x0_q4,
                                    int x_step_q4, int y0_q4, int y_step_q4,
                                    int w, int h, int bd) {
  if (y_step_q4 != 16) {
    vpx_highbd_convolve8_vert_c(src, src_stride, dst, dst_stride, filter, x0_q4,
                                x_step_q4, y0_q4, y_step_q4, w, h, bd);
    return;
  }

  assert((intptr_t)dst % 4 == 0);
  assert(dst_stride % 4 == 0);
  assert(y_step_q4 == 16);

  (void)x_step_q4;
  (void)y0_q4;
  (void)y_step_q4;

  if (vpx_get_filter_taps(filter[y0_q4]) <= 4) {
    const int16x4_t y_filter_4tap = vld1_s16(filter[y0_q4] + 2);
    highbd_convolve_4tap_vert_neon(src - src_stride, src_stride, dst,
                                   dst_stride, w, h, y_filter_4tap, bd);
  } else {
    const int16x8_t y_filter_8tap = vld1q_s16(filter[y0_q4]);
    highbd_convolve_8tap_vert_neon(src - 3 * src_stride, src_stride, dst,
                                   dst_stride, w, h, y_filter_8tap, bd);
  }
}

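// Vertical convolve-and-average using the 8-tap filter; each output row is
// rounding-halved with the existing destination row before being stored.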
void vpx_highbd_convolve8_avg_vert_neon(const uint16_t *src,
                                        ptrdiff_t src_stride, uint16_t *dst,
                                        ptrdiff_t dst_stride,
                                        const InterpKernel *filter, int x0_q4,
                                        int x_step_q4, int y0_q4, int y_step_q4,
                                        int w, int h, int bd) {
  if (y_step_q4 != 16) {
    vpx_highbd_convolve8_avg_vert_c(src, src_stride, dst, dst_stride, filter,
                                    x0_q4, x_step_q4, y0_q4, y_step_q4, w, h,
                                    bd);
    return;
  }

  assert((intptr_t)dst % 4 == 0);
  assert(dst_stride % 4 == 0);

  const int16x8_t filters = vld1q_s16(filter[y0_q4]);

  src -= 3 * src_stride;

  if (w == 4) {
    const uint16x4_t max = vdup_n_u16((1 << bd) - 1);
    const int16_t *s = (const int16_t *)src;
    uint16_t *d = dst;

    int16x4_t s0, s1, s2, s3, s4, s5, s6;
    load_s16_4x7(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6);

    s += 7 * src_stride;

    do {
      int16x4_t s7, s8, s9, s10;
      load_s16_4x4(s, src_stride, &s7, &s8, &s9, &s10);

      uint16x4_t d0 =
          highbd_convolve8_4(s0, s1, s2, s3, s4, s5, s6, s7, filters, max);
      uint16x4_t d1 =
          highbd_convolve8_4(s1, s2, s3, s4, s5, s6, s7, s8, filters, max);
      uint16x4_t d2 =
          highbd_convolve8_4(s2, s3, s4, s5, s6, s7, s8, s9, filters, max);
      uint16x4_t d3 =
          highbd_convolve8_4(s3, s4, s5, s6, s7, s8, s9, s10, filters, max);

      d0 = vrhadd_u16(d0, vld1_u16(d + 0 * dst_stride));
      d1 = vrhadd_u16(d1, vld1_u16(d + 1 * dst_stride));
      d2 = vrhadd_u16(d2, vld1_u16(d + 2 * dst_stride));
      d3 = vrhadd_u16(d3, vld1_u16(d + 3 * dst_stride));

      store_u16_4x4(d, dst_stride, d0, d1, d2, d3);

      s0 = s4;
      s1 = s5;
      s2 = s6;
      s3 = s7;
      s4 = s8;
      s5 = s9;
      s6 = s10;
      s += 4 * src_stride;
      d += 4 * dst_stride;
      h -= 4;
    } while (h != 0);
  } else {
    const uint16x8_t max = vdupq_n_u16((1 << bd) - 1);

    do {
      const int16_t *s = (const int16_t *)src;
      uint16_t *d = dst;
      int height = h;

      int16x8_t s0, s1, s2, s3, s4, s5, s6;
      load_s16_8x7(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6);

      s += 7 * src_stride;

      do {
        int16x8_t s7, s8, s9, s10;
        load_s16_8x4(s, src_stride, &s7, &s8, &s9, &s10);

        uint16x8_t d0 =
            highbd_convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters, max);
        uint16x8_t d1 =
            highbd_convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters, max);
        uint16x8_t d2 =
            highbd_convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters, max);
        uint16x8_t d3 =
            highbd_convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters, max);

        d0 = vrhaddq_u16(d0, vld1q_u16(d + 0 * dst_stride));
        d1 = vrhaddq_u16(d1, vld1q_u16(d + 1 * dst_stride));
        d2 = vrhaddq_u16(d2, vld1q_u16(d + 2 * dst_stride));
        d3 = vrhaddq_u16(d3, vld1q_u16(d + 3 * dst_stride));

        store_u16_8x4(d, dst_stride, d0, d1, d2, d3);

        s0 = s4;
        s1 = s5;
        s2 = s6;
        s3 = s7;
        s4 = s8;
        s5 = s9;
        s6 = s10;
        s += 4 * src_stride;
        d += 4 * dst_stride;
        height -= 4;
      } while (height != 0);
      src += 8;
      dst += 8;
      w -= 8;
    } while (w != 0);
  }
}

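// Combined horizontal + vertical 4-tap convolution. Each source row is
// filtered horizontally first, and the intermediate rows are kept in
// registers so the vertical pass can reuse them as it slides down the block.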
static INLINE void highbd_convolve_2d_4tap_neon(
    const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst,
    ptrdiff_t dst_stride, int w, int h, const int16x4_t x_filter,
    const int16x4_t y_filter, int bd) {
  if (w == 4) {
    const uint16x4_t max = vdup_n_u16((1 << bd) - 1);
    const int16_t *s = (const int16_t *)src;
    uint16_t *d = dst;

    int16x4_t h_s0[4], h_s1[4], h_s2[4];
    load_s16_4x4(s + 0 * src_stride, 1, &h_s0[0], &h_s0[1], &h_s0[2],
                 &h_s0[3]);
    load_s16_4x4(s + 1 * src_stride, 1, &h_s1[0], &h_s1[1], &h_s1[2],
                 &h_s1[3]);
    load_s16_4x4(s + 2 * src_stride, 1, &h_s2[0], &h_s2[1], &h_s2[2],
                 &h_s2[3]);

    int16x4_t v_s0 = vreinterpret_s16_u16(highbd_convolve4_4_neon(
        h_s0[0], h_s0[1], h_s0[2], h_s0[3], x_filter, max));
    int16x4_t v_s1 = vreinterpret_s16_u16(highbd_convolve4_4_neon(
        h_s1[0], h_s1[1], h_s1[2], h_s1[3], x_filter, max));
    int16x4_t v_s2 = vreinterpret_s16_u16(highbd_convolve4_4_neon(
        h_s2[0], h_s2[1], h_s2[2], h_s2[3], x_filter, max));

    s += 3 * src_stride;

    do {
      int16x4_t h_s3[4], h_s4[4], h_s5[4], h_s6[4];
      load_s16_4x4(s + 0 * src_stride, 1, &h_s3[0], &h_s3[1], &h_s3[2],
                   &h_s3[3]);
      load_s16_4x4(s + 1 * src_stride, 1, &h_s4[0], &h_s4[1], &h_s4[2],
                   &h_s4[3]);
      load_s16_4x4(s + 2 * src_stride, 1, &h_s5[0], &h_s5[1], &h_s5[2],
                   &h_s5[3]);
      load_s16_4x4(s + 3 * src_stride, 1, &h_s6[0], &h_s6[1], &h_s6[2],
                   &h_s6[3]);

      int16x4_t v_s3 = vreinterpret_s16_u16(highbd_convolve4_4_neon(
          h_s3[0], h_s3[1], h_s3[2], h_s3[3], x_filter, max));
      int16x4_t v_s4 = vreinterpret_s16_u16(highbd_convolve4_4_neon(
          h_s4[0], h_s4[1], h_s4[2], h_s4[3], x_filter, max));
      int16x4_t v_s5 = vreinterpret_s16_u16(highbd_convolve4_4_neon(
          h_s5[0], h_s5[1], h_s5[2], h_s5[3], x_filter, max));
      int16x4_t v_s6 = vreinterpret_s16_u16(highbd_convolve4_4_neon(
          h_s6[0], h_s6[1], h_s6[2], h_s6[3], x_filter, max));

      uint16x4_t d0 =
          highbd_convolve4_4_neon(v_s0, v_s1, v_s2, v_s3, y_filter, max);
      uint16x4_t d1 =
          highbd_convolve4_4_neon(v_s1, v_s2, v_s3, v_s4, y_filter, max);
      uint16x4_t d2 =
          highbd_convolve4_4_neon(v_s2, v_s3, v_s4, v_s5, y_filter, max);
      uint16x4_t d3 =
          highbd_convolve4_4_neon(v_s3, v_s4, v_s5, v_s6, y_filter, max);

      store_u16_4x4(d, dst_stride, d0, d1, d2, d3);

      v_s0 = v_s4;
      v_s1 = v_s5;
      v_s2 = v_s6;
      s += 4 * src_stride;
      d += 4 * dst_stride;
      h -= 4;
    } while (h != 0);

    return;
  }

  const uint16x8_t max = vdupq_n_u16((1 << bd) - 1);

  do {
    const int16_t *s = (const int16_t *)src;
    uint16_t *d = dst;
    int height = h;

    int16x8_t h_s0[4], h_s1[4], h_s2[4];
    load_s16_8x4(s + 0 * src_stride, 1, &h_s0[0], &h_s0[1], &h_s0[2],
                 &h_s0[3]);
    load_s16_8x4(s + 1 * src_stride, 1, &h_s1[0], &h_s1[1], &h_s1[2],
                 &h_s1[3]);
    load_s16_8x4(s + 2 * src_stride, 1, &h_s2[0], &h_s2[1], &h_s2[2],
                 &h_s2[3]);

    int16x8_t v_s0 = vreinterpretq_s16_u16(highbd_convolve4_8_neon(
        h_s0[0], h_s0[1], h_s0[2], h_s0[3], x_filter, max));
    int16x8_t v_s1 = vreinterpretq_s16_u16(highbd_convolve4_8_neon(
        h_s1[0], h_s1[1], h_s1[2], h_s1[3], x_filter, max));
    int16x8_t v_s2 = vreinterpretq_s16_u16(highbd_convolve4_8_neon(
        h_s2[0], h_s2[1], h_s2[2], h_s2[3], x_filter, max));

    s += 3 * src_stride;

    do {
      int16x8_t h_s3[4], h_s4[4], h_s5[4], h_s6[4];
      load_s16_8x4(s + 0 * src_stride, 1, &h_s3[0], &h_s3[1], &h_s3[2],
                   &h_s3[3]);
      load_s16_8x4(s + 1 * src_stride, 1, &h_s4[0], &h_s4[1], &h_s4[2],
                   &h_s4[3]);
      load_s16_8x4(s + 2 * src_stride, 1, &h_s5[0], &h_s5[1], &h_s5[2],
                   &h_s5[3]);
      load_s16_8x4(s + 3 * src_stride, 1, &h_s6[0], &h_s6[1], &h_s6[2],
                   &h_s6[3]);

      int16x8_t v_s3 = vreinterpretq_s16_u16(highbd_convolve4_8_neon(
          h_s3[0], h_s3[1], h_s3[2], h_s3[3], x_filter, max));
      int16x8_t v_s4 = vreinterpretq_s16_u16(highbd_convolve4_8_neon(
          h_s4[0], h_s4[1], h_s4[2], h_s4[3], x_filter, max));
      int16x8_t v_s5 = vreinterpretq_s16_u16(highbd_convolve4_8_neon(
          h_s5[0], h_s5[1], h_s5[2], h_s5[3], x_filter, max));
      int16x8_t v_s6 = vreinterpretq_s16_u16(highbd_convolve4_8_neon(
          h_s6[0], h_s6[1], h_s6[2], h_s6[3], x_filter, max));

      uint16x8_t d0 =
          highbd_convolve4_8_neon(v_s0, v_s1, v_s2, v_s3, y_filter, max);
      uint16x8_t d1 =
          highbd_convolve4_8_neon(v_s1, v_s2, v_s3, v_s4, y_filter, max);
      uint16x8_t d2 =
          highbd_convolve4_8_neon(v_s2, v_s3, v_s4, v_s5, y_filter, max);
      uint16x8_t d3 =
          highbd_convolve4_8_neon(v_s3, v_s4, v_s5, v_s6, y_filter, max);

      store_u16_8x4(d, dst_stride, d0, d1, d2, d3);

      v_s0 = v_s4;
      v_s1 = v_s5;
      v_s2 = v_s6;
      s += 4 * src_stride;
      d += 4 * dst_stride;
      height -= 4;
    } while (height != 0);
    src += 8;
    dst += 8;
    w -= 8;
  } while (w != 0);
}

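// Combined horizontal + vertical 8-tap convolution, keeping seven rows of
// horizontally filtered intermediate data live between iterations.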
static INLINE void highbd_convolve_2d_8tap_neon(
    const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst,
    ptrdiff_t dst_stride, int w, int h, const int16x8_t x_filter,
    const int16x8_t y_filter, int bd) {
  if (w == 4) {
    const uint16x4_t max = vdup_n_u16((1 << bd) - 1);
    const int16_t *s = (const int16_t *)src;
    uint16_t *d = dst;

    int16x4_t h_s0[8], h_s1[8], h_s2[8], h_s3[8], h_s4[8], h_s5[8], h_s6[8];
    load_s16_4x8(s + 0 * src_stride, 1, &h_s0[0], &h_s0[1], &h_s0[2], &h_s0[3],
                 &h_s0[4], &h_s0[5], &h_s0[6], &h_s0[7]);
    load_s16_4x8(s + 1 * src_stride, 1, &h_s1[0], &h_s1[1], &h_s1[2], &h_s1[3],
                 &h_s1[4], &h_s1[5], &h_s1[6], &h_s1[7]);
    load_s16_4x8(s + 2 * src_stride, 1, &h_s2[0], &h_s2[1], &h_s2[2], &h_s2[3],
                 &h_s2[4], &h_s2[5], &h_s2[6], &h_s2[7]);
    load_s16_4x8(s + 3 * src_stride, 1, &h_s3[0], &h_s3[1], &h_s3[2], &h_s3[3],
                 &h_s3[4], &h_s3[5], &h_s3[6], &h_s3[7]);
    load_s16_4x8(s + 4 * src_stride, 1, &h_s4[0], &h_s4[1], &h_s4[2], &h_s4[3],
                 &h_s4[4], &h_s4[5], &h_s4[6], &h_s4[7]);
    load_s16_4x8(s + 5 * src_stride, 1, &h_s5[0], &h_s5[1], &h_s5[2], &h_s5[3],
                 &h_s5[4], &h_s5[5], &h_s5[6], &h_s5[7]);
    load_s16_4x8(s + 6 * src_stride, 1, &h_s6[0], &h_s6[1], &h_s6[2], &h_s6[3],
                 &h_s6[4], &h_s6[5], &h_s6[6], &h_s6[7]);

    int16x4_t v_s0 = vreinterpret_s16_u16(
        highbd_convolve8_4(h_s0[0], h_s0[1], h_s0[2], h_s0[3], h_s0[4],
                           h_s0[5], h_s0[6], h_s0[7], x_filter, max));
    int16x4_t v_s1 = vreinterpret_s16_u16(
        highbd_convolve8_4(h_s1[0], h_s1[1], h_s1[2], h_s1[3], h_s1[4],
                           h_s1[5], h_s1[6], h_s1[7], x_filter, max));
    int16x4_t v_s2 = vreinterpret_s16_u16(
        highbd_convolve8_4(h_s2[0], h_s2[1], h_s2[2], h_s2[3], h_s2[4],
                           h_s2[5], h_s2[6], h_s2[7], x_filter, max));
    int16x4_t v_s3 = vreinterpret_s16_u16(
        highbd_convolve8_4(h_s3[0], h_s3[1], h_s3[2], h_s3[3], h_s3[4],
                           h_s3[5], h_s3[6], h_s3[7], x_filter, max));
    int16x4_t v_s4 = vreinterpret_s16_u16(
        highbd_convolve8_4(h_s4[0], h_s4[1], h_s4[2], h_s4[3], h_s4[4],
                           h_s4[5], h_s4[6], h_s4[7], x_filter, max));
    int16x4_t v_s5 = vreinterpret_s16_u16(
        highbd_convolve8_4(h_s5[0], h_s5[1], h_s5[2], h_s5[3], h_s5[4],
                           h_s5[5], h_s5[6], h_s5[7], x_filter, max));
    int16x4_t v_s6 = vreinterpret_s16_u16(
        highbd_convolve8_4(h_s6[0], h_s6[1], h_s6[2], h_s6[3], h_s6[4],
                           h_s6[5], h_s6[6], h_s6[7], x_filter, max));

    s += 7 * src_stride;

    do {
      int16x4_t h_s7[8], h_s8[8], h_s9[8], h_s10[8];
      load_s16_4x8(s + 0 * src_stride, 1, &h_s7[0], &h_s7[1], &h_s7[2],
                   &h_s7[3], &h_s7[4], &h_s7[5], &h_s7[6], &h_s7[7]);
      load_s16_4x8(s + 1 * src_stride, 1, &h_s8[0], &h_s8[1], &h_s8[2],
                   &h_s8[3], &h_s8[4], &h_s8[5], &h_s8[6], &h_s8[7]);
      load_s16_4x8(s + 2 * src_stride, 1, &h_s9[0], &h_s9[1], &h_s9[2],
                   &h_s9[3], &h_s9[4], &h_s9[5], &h_s9[6], &h_s9[7]);
      load_s16_4x8(s + 3 * src_stride, 1, &h_s10[0], &h_s10[1], &h_s10[2],
                   &h_s10[3], &h_s10[4], &h_s10[5], &h_s10[6], &h_s10[7]);

      int16x4_t v_s7 = vreinterpret_s16_u16(
          highbd_convolve8_4(h_s7[0], h_s7[1], h_s7[2], h_s7[3], h_s7[4],
                             h_s7[5], h_s7[6], h_s7[7], x_filter, max));
      int16x4_t v_s8 = vreinterpret_s16_u16(
          highbd_convolve8_4(h_s8[0], h_s8[1], h_s8[2], h_s8[3], h_s8[4],
                             h_s8[5], h_s8[6], h_s8[7], x_filter, max));
      int16x4_t v_s9 = vreinterpret_s16_u16(
          highbd_convolve8_4(h_s9[0], h_s9[1], h_s9[2], h_s9[3], h_s9[4],
                             h_s9[5], h_s9[6], h_s9[7], x_filter, max));
      int16x4_t v_s10 = vreinterpret_s16_u16(
          highbd_convolve8_4(h_s10[0], h_s10[1], h_s10[2], h_s10[3], h_s10[4],
                             h_s10[5], h_s10[6], h_s10[7], x_filter, max));

      uint16x4_t d0 = highbd_convolve8_4(v_s0, v_s1, v_s2, v_s3, v_s4, v_s5,
                                         v_s6, v_s7, y_filter, max);
      uint16x4_t d1 = highbd_convolve8_4(v_s1, v_s2, v_s3, v_s4, v_s5, v_s6,
                                         v_s7, v_s8, y_filter, max);
      uint16x4_t d2 = highbd_convolve8_4(v_s2, v_s3, v_s4, v_s5, v_s6, v_s7,
                                         v_s8, v_s9, y_filter, max);
      uint16x4_t d3 = highbd_convolve8_4(v_s3, v_s4, v_s5, v_s6, v_s7, v_s8,
                                         v_s9, v_s10, y_filter, max);

      store_u16_4x4(d, dst_stride, d0, d1, d2, d3);

      v_s0 = v_s4;
      v_s1 = v_s5;
      v_s2 = v_s6;
      v_s3 = v_s7;
      v_s4 = v_s8;
      v_s5 = v_s9;
      v_s6 = v_s10;
      s += 4 * src_stride;
      d += 4 * dst_stride;
      h -= 4;
    } while (h != 0);

    return;
  }

  const uint16x8_t max = vdupq_n_u16((1 << bd) - 1);

  do {
    const int16_t *s = (const int16_t *)src;
    uint16_t *d = dst;
    int height = h;

    int16x8_t h_s0[8], h_s1[8], h_s2[8], h_s3[8], h_s4[8], h_s5[8], h_s6[8];
    load_s16_8x8(s + 0 * src_stride, 1, &h_s0[0], &h_s0[1], &h_s0[2], &h_s0[3],
                 &h_s0[4], &h_s0[5], &h_s0[6], &h_s0[7]);
    load_s16_8x8(s + 1 * src_stride, 1, &h_s1[0], &h_s1[1], &h_s1[2], &h_s1[3],
                 &h_s1[4], &h_s1[5], &h_s1[6], &h_s1[7]);
    load_s16_8x8(s + 2 * src_stride, 1, &h_s2[0], &h_s2[1], &h_s2[2], &h_s2[3],
                 &h_s2[4], &h_s2[5], &h_s2[6], &h_s2[7]);
    load_s16_8x8(s + 3 * src_stride, 1, &h_s3[0], &h_s3[1], &h_s3[2], &h_s3[3],
                 &h_s3[4], &h_s3[5], &h_s3[6], &h_s3[7]);
    load_s16_8x8(s + 4 * src_stride, 1, &h_s4[0], &h_s4[1], &h_s4[2], &h_s4[3],
                 &h_s4[4], &h_s4[5], &h_s4[6], &h_s4[7]);
    load_s16_8x8(s + 5 * src_stride, 1, &h_s5[0], &h_s5[1], &h_s5[2], &h_s5[3],
                 &h_s5[4], &h_s5[5], &h_s5[6], &h_s5[7]);
    load_s16_8x8(s + 6 * src_stride, 1, &h_s6[0], &h_s6[1], &h_s6[2], &h_s6[3],
                 &h_s6[4], &h_s6[5], &h_s6[6], &h_s6[7]);

    int16x8_t v_s0 = vreinterpretq_s16_u16(
        highbd_convolve8_8(h_s0[0], h_s0[1], h_s0[2], h_s0[3], h_s0[4], h_s0[5],
                           h_s0[6], h_s0[7], x_filter, max));
    int16x8_t v_s1 = vreinterpretq_s16_u16(
        highbd_convolve8_8(h_s1[0], h_s1[1], h_s1[2], h_s1[3], h_s1[4], h_s1[5],
                           h_s1[6], h_s1[7], x_filter, max));
    int16x8_t v_s2 = vreinterpretq_s16_u16(
        highbd_convolve8_8(h_s2[0], h_s2[1], h_s2[2], h_s2[3], h_s2[4], h_s2[5],
                           h_s2[6], h_s2[7], x_filter, max));
    int16x8_t v_s3 = vreinterpretq_s16_u16(
        highbd_convolve8_8(h_s3[0], h_s3[1], h_s3[2], h_s3[3], h_s3[4], h_s3[5],
                           h_s3[6], h_s3[7], x_filter, max));
    int16x8_t v_s4 = vreinterpretq_s16_u16(
        highbd_convolve8_8(h_s4[0], h_s4[1], h_s4[2], h_s4[3], h_s4[4], h_s4[5],
                           h_s4[6], h_s4[7], x_filter, max));
    int16x8_t v_s5 = vreinterpretq_s16_u16(
        highbd_convolve8_8(h_s5[0], h_s5[1], h_s5[2], h_s5[3], h_s5[4], h_s5[5],
                           h_s5[6], h_s5[7], x_filter, max));
    int16x8_t v_s6 = vreinterpretq_s16_u16(
        highbd_convolve8_8(h_s6[0], h_s6[1], h_s6[2], h_s6[3], h_s6[4], h_s6[5],
                           h_s6[6], h_s6[7], x_filter, max));

    s += 7 * src_stride;

    do {
      int16x8_t h_s7[8], h_s8[8], h_s9[8], h_s10[8];
      load_s16_8x8(s + 0 * src_stride, 1, &h_s7[0], &h_s7[1], &h_s7[2],
                   &h_s7[3], &h_s7[4], &h_s7[5], &h_s7[6], &h_s7[7]);
      load_s16_8x8(s + 1 * src_stride, 1, &h_s8[0], &h_s8[1], &h_s8[2],
                   &h_s8[3], &h_s8[4], &h_s8[5], &h_s8[6], &h_s8[7]);
      load_s16_8x8(s + 2 * src_stride, 1, &h_s9[0], &h_s9[1], &h_s9[2],
                   &h_s9[3], &h_s9[4], &h_s9[5], &h_s9[6], &h_s9[7]);
      load_s16_8x8(s + 3 * src_stride, 1, &h_s10[0], &h_s10[1], &h_s10[2],
                   &h_s10[3], &h_s10[4], &h_s10[5], &h_s10[6], &h_s10[7]);

      int16x8_t v_s7 = vreinterpretq_s16_u16(
          highbd_convolve8_8(h_s7[0], h_s7[1], h_s7[2], h_s7[3], h_s7[4],
                             h_s7[5], h_s7[6], h_s7[7], x_filter, max));
      int16x8_t v_s8 = vreinterpretq_s16_u16(
          highbd_convolve8_8(h_s8[0], h_s8[1], h_s8[2], h_s8[3], h_s8[4],
                             h_s8[5], h_s8[6], h_s8[7], x_filter, max));
      int16x8_t v_s9 = vreinterpretq_s16_u16(
          highbd_convolve8_8(h_s9[0], h_s9[1], h_s9[2], h_s9[3], h_s9[4],
                             h_s9[5], h_s9[6], h_s9[7], x_filter, max));
      int16x8_t v_s10 = vreinterpretq_s16_u16(
          highbd_convolve8_8(h_s10[0], h_s10[1], h_s10[2], h_s10[3], h_s10[4],
                             h_s10[5], h_s10[6], h_s10[7], x_filter, max));

      uint16x8_t d0 = highbd_convolve8_8(v_s0, v_s1, v_s2, v_s3, v_s4, v_s5,
                                         v_s6, v_s7, y_filter, max);
      uint16x8_t d1 = highbd_convolve8_8(v_s1, v_s2, v_s3, v_s4, v_s5, v_s6,
                                         v_s7, v_s8, y_filter, max);
      uint16x8_t d2 = highbd_convolve8_8(v_s2, v_s3, v_s4, v_s5, v_s6, v_s7,
                                         v_s8, v_s9, y_filter, max);
      uint16x8_t d3 = highbd_convolve8_8(v_s3, v_s4, v_s5, v_s6, v_s7, v_s8,
                                         v_s9, v_s10, y_filter, max);

      store_u16_8x4(d, dst_stride, d0, d1, d2, d3);

      v_s0 = v_s4;
      v_s1 = v_s5;
      v_s2 = v_s6;
      v_s3 = v_s7;
      v_s4 = v_s8;
      v_s5 = v_s9;
      v_s6 = v_s10;
      s += 4 * src_stride;
      d += 4 * dst_stride;
      height -= 4;
    } while (height != 0);
    src += 8;
    dst += 8;
    w -= 8;
  } while (w != 0);
}

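// 2D entry point. Scaled convolutions fall back to the C implementation;
// otherwise 'src' is offset to cover the filter support in both directions
// and the combined 4-tap or 8-tap kernel is run.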
void vpx_highbd_convolve8_neon(const uint16_t *src, ptrdiff_t src_stride,
                               uint16_t *dst, ptrdiff_t dst_stride,
                               const InterpKernel *filter, int x0_q4,
                               int x_step_q4, int y0_q4, int y_step_q4, int w,
                               int h, int bd) {
  if (x_step_q4 != 16 || y_step_q4 != 16) {
    vpx_highbd_convolve8_c(src, src_stride, dst, dst_stride, filter, x0_q4,
                           x_step_q4, y0_q4, y_step_q4, w, h, bd);
    return;
  }

  const int x_filter_taps = vpx_get_filter_taps(filter[x0_q4]) <= 4 ? 4 : 8;
  const int y_filter_taps = vpx_get_filter_taps(filter[y0_q4]) <= 4 ? 4 : 8;
  // Account for needing filter_taps / 2 - 1 lines prior and filter_taps / 2
  // lines post both horizontally and vertically.
  const ptrdiff_t horiz_offset = x_filter_taps / 2 - 1;
  const ptrdiff_t vert_offset = (y_filter_taps / 2 - 1) * src_stride;

  if (x_filter_taps == 4 && y_filter_taps == 4) {
    const int16x4_t x_filter = vld1_s16(filter[x0_q4] + 2);
    const int16x4_t y_filter = vld1_s16(filter[y0_q4] + 2);

    highbd_convolve_2d_4tap_neon(src - horiz_offset - vert_offset, src_stride,
                                 dst, dst_stride, w, h, x_filter, y_filter, bd);
    return;
  }

  const int16x8_t x_filter = vld1q_s16(filter[x0_q4]);
  const int16x8_t y_filter = vld1q_s16(filter[y0_q4]);

  highbd_convolve_2d_8tap_neon(src - horiz_offset - vert_offset, src_stride,
                               dst, dst_stride, w, h, x_filter, y_filter, bd);
}

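// 2D convolve-and-average: 8-tap filtering in both directions, with the final
// result rounding-halved against the destination block.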
void vpx_highbd_convolve8_avg_neon(const uint16_t *src, ptrdiff_t src_stride,
                                   uint16_t *dst, ptrdiff_t dst_stride,
                                   const InterpKernel *filter, int x0_q4,
                                   int x_step_q4, int y0_q4, int y_step_q4,
                                   int w, int h, int bd) {
  if (x_step_q4 != 16 || y_step_q4 != 16) {
    vpx_highbd_convolve8_avg_c(src, src_stride, dst, dst_stride, filter, x0_q4,
                               x_step_q4, y0_q4, y_step_q4, w, h, bd);
    return;
  }

  // Averaging convolution always uses an 8-tap filter.
  const ptrdiff_t horiz_offset = SUBPEL_TAPS / 2 - 1;
  const ptrdiff_t vert_offset = (SUBPEL_TAPS / 2 - 1) * src_stride;
  // Account for needing SUBPEL_TAPS / 2 - 1 lines prior and SUBPEL_TAPS / 2
  // lines post both horizontally and vertically.
  src = src - horiz_offset - vert_offset;

  const int16x8_t x_filter = vld1q_s16(filter[x0_q4]);
  const int16x8_t y_filter = vld1q_s16(filter[y0_q4]);

  if (w == 4) {
    const uint16x4_t max = vdup_n_u16((1 << bd) - 1);
    const int16_t *s = (const int16_t *)src;
    uint16_t *d = dst;

    int16x4_t h_s0[8], h_s1[8], h_s2[8], h_s3[8], h_s4[8], h_s5[8], h_s6[8];
    load_s16_4x8(s + 0 * src_stride, 1, &h_s0[0], &h_s0[1], &h_s0[2], &h_s0[3],
                 &h_s0[4], &h_s0[5], &h_s0[6], &h_s0[7]);
    load_s16_4x8(s + 1 * src_stride, 1, &h_s1[0], &h_s1[1], &h_s1[2], &h_s1[3],
                 &h_s1[4], &h_s1[5], &h_s1[6], &h_s1[7]);
    load_s16_4x8(s + 2 * src_stride, 1, &h_s2[0], &h_s2[1], &h_s2[2], &h_s2[3],
                 &h_s2[4], &h_s2[5], &h_s2[6], &h_s2[7]);
    load_s16_4x8(s + 3 * src_stride, 1, &h_s3[0], &h_s3[1], &h_s3[2], &h_s3[3],
                 &h_s3[4], &h_s3[5], &h_s3[6], &h_s3[7]);
    load_s16_4x8(s + 4 * src_stride, 1, &h_s4[0], &h_s4[1], &h_s4[2], &h_s4[3],
                 &h_s4[4], &h_s4[5], &h_s4[6], &h_s4[7]);
    load_s16_4x8(s + 5 * src_stride, 1, &h_s5[0], &h_s5[1], &h_s5[2], &h_s5[3],
                 &h_s5[4], &h_s5[5], &h_s5[6], &h_s5[7]);
    load_s16_4x8(s + 6 * src_stride, 1, &h_s6[0], &h_s6[1], &h_s6[2], &h_s6[3],
                 &h_s6[4], &h_s6[5], &h_s6[6], &h_s6[7]);

    int16x4_t v_s0 = vreinterpret_s16_u16(
        highbd_convolve8_4(h_s0[0], h_s0[1], h_s0[2], h_s0[3], h_s0[4],
                           h_s0[5], h_s0[6], h_s0[7], x_filter, max));
    int16x4_t v_s1 = vreinterpret_s16_u16(
        highbd_convolve8_4(h_s1[0], h_s1[1], h_s1[2], h_s1[3], h_s1[4],
                           h_s1[5], h_s1[6], h_s1[7], x_filter, max));
    int16x4_t v_s2 = vreinterpret_s16_u16(
        highbd_convolve8_4(h_s2[0], h_s2[1], h_s2[2], h_s2[3], h_s2[4],
                           h_s2[5], h_s2[6], h_s2[7], x_filter, max));
    int16x4_t v_s3 = vreinterpret_s16_u16(
        highbd_convolve8_4(h_s3[0], h_s3[1], h_s3[2], h_s3[3], h_s3[4],
                           h_s3[5], h_s3[6], h_s3[7], x_filter, max));
    int16x4_t v_s4 = vreinterpret_s16_u16(
        highbd_convolve8_4(h_s4[0], h_s4[1], h_s4[2], h_s4[3], h_s4[4],
                           h_s4[5], h_s4[6], h_s4[7], x_filter, max));
    int16x4_t v_s5 = vreinterpret_s16_u16(
        highbd_convolve8_4(h_s5[0], h_s5[1], h_s5[2], h_s5[3], h_s5[4],
                           h_s5[5], h_s5[6], h_s5[7], x_filter, max));
    int16x4_t v_s6 = vreinterpret_s16_u16(
        highbd_convolve8_4(h_s6[0], h_s6[1], h_s6[2], h_s6[3], h_s6[4],
                           h_s6[5], h_s6[6], h_s6[7], x_filter, max));

    s += 7 * src_stride;

    do {
      int16x4_t h_s7[8], h_s8[8], h_s9[8], h_s10[8];
      load_s16_4x8(s + 0 * src_stride, 1, &h_s7[0], &h_s7[1], &h_s7[2],
                   &h_s7[3], &h_s7[4], &h_s7[5], &h_s7[6], &h_s7[7]);
      load_s16_4x8(s + 1 * src_stride, 1, &h_s8[0], &h_s8[1], &h_s8[2],
                   &h_s8[3], &h_s8[4], &h_s8[5], &h_s8[6], &h_s8[7]);
      load_s16_4x8(s + 2 * src_stride, 1, &h_s9[0], &h_s9[1], &h_s9[2],
                   &h_s9[3], &h_s9[4], &h_s9[5], &h_s9[6], &h_s9[7]);
      load_s16_4x8(s + 3 * src_stride, 1, &h_s10[0], &h_s10[1], &h_s10[2],
                   &h_s10[3], &h_s10[4], &h_s10[5], &h_s10[6], &h_s10[7]);

      int16x4_t v_s7 = vreinterpret_s16_u16(
          highbd_convolve8_4(h_s7[0], h_s7[1], h_s7[2], h_s7[3], h_s7[4],
                             h_s7[5], h_s7[6], h_s7[7], x_filter, max));
      int16x4_t v_s8 = vreinterpret_s16_u16(
          highbd_convolve8_4(h_s8[0], h_s8[1], h_s8[2], h_s8[3], h_s8[4],
                             h_s8[5], h_s8[6], h_s8[7], x_filter, max));
      int16x4_t v_s9 = vreinterpret_s16_u16(
          highbd_convolve8_4(h_s9[0], h_s9[1], h_s9[2], h_s9[3], h_s9[4],
                             h_s9[5], h_s9[6], h_s9[7], x_filter, max));
      int16x4_t v_s10 = vreinterpret_s16_u16(
          highbd_convolve8_4(h_s10[0], h_s10[1], h_s10[2], h_s10[3], h_s10[4],
                             h_s10[5], h_s10[6], h_s10[7], x_filter, max));

      uint16x4_t d0 = highbd_convolve8_4(v_s0, v_s1, v_s2, v_s3, v_s4, v_s5,
                                         v_s6, v_s7, y_filter, max);
      uint16x4_t d1 = highbd_convolve8_4(v_s1, v_s2, v_s3, v_s4, v_s5, v_s6,
                                         v_s7, v_s8, y_filter, max);
      uint16x4_t d2 = highbd_convolve8_4(v_s2, v_s3, v_s4, v_s5, v_s6, v_s7,
                                         v_s8, v_s9, y_filter, max);
      uint16x4_t d3 = highbd_convolve8_4(v_s3, v_s4, v_s5, v_s6, v_s7, v_s8,
                                         v_s9, v_s10, y_filter, max);

      d0 = vrhadd_u16(d0, vld1_u16(d + 0 * dst_stride));
      d1 = vrhadd_u16(d1, vld1_u16(d + 1 * dst_stride));
      d2 = vrhadd_u16(d2, vld1_u16(d + 2 * dst_stride));
      d3 = vrhadd_u16(d3, vld1_u16(d + 3 * dst_stride));

      store_u16_4x4(d, dst_stride, d0, d1, d2, d3);

      v_s0 = v_s4;
      v_s1 = v_s5;
      v_s2 = v_s6;
      v_s3 = v_s7;
      v_s4 = v_s8;
      v_s5 = v_s9;
      v_s6 = v_s10;
      s += 4 * src_stride;
      d += 4 * dst_stride;
      h -= 4;
    } while (h != 0);

    return;
  }

  const uint16x8_t max = vdupq_n_u16((1 << bd) - 1);

  do {
    const int16_t *s = (const int16_t *)src;
    uint16_t *d = dst;
    int height = h;

    int16x8_t h_s0[8], h_s1[8], h_s2[8], h_s3[8], h_s4[8], h_s5[8], h_s6[8];
    load_s16_8x8(s + 0 * src_stride, 1, &h_s0[0], &h_s0[1], &h_s0[2], &h_s0[3],
                 &h_s0[4], &h_s0[5], &h_s0[6], &h_s0[7]);
    load_s16_8x8(s + 1 * src_stride, 1, &h_s1[0], &h_s1[1], &h_s1[2], &h_s1[3],
                 &h_s1[4], &h_s1[5], &h_s1[6], &h_s1[7]);
    load_s16_8x8(s + 2 * src_stride, 1, &h_s2[0], &h_s2[1], &h_s2[2], &h_s2[3],
                 &h_s2[4], &h_s2[5], &h_s2[6], &h_s2[7]);
    load_s16_8x8(s + 3 * src_stride, 1, &h_s3[0], &h_s3[1], &h_s3[2], &h_s3[3],
                 &h_s3[4], &h_s3[5], &h_s3[6], &h_s3[7]);
    load_s16_8x8(s + 4 * src_stride, 1, &h_s4[0], &h_s4[1], &h_s4[2], &h_s4[3],
                 &h_s4[4], &h_s4[5], &h_s4[6], &h_s4[7]);
    load_s16_8x8(s + 5 * src_stride, 1, &h_s5[0], &h_s5[1], &h_s5[2], &h_s5[3],
                 &h_s5[4], &h_s5[5], &h_s5[6], &h_s5[7]);
    load_s16_8x8(s + 6 * src_stride, 1, &h_s6[0], &h_s6[1], &h_s6[2], &h_s6[3],
                 &h_s6[4], &h_s6[5], &h_s6[6], &h_s6[7]);

    int16x8_t v_s0 = vreinterpretq_s16_u16(
        highbd_convolve8_8(h_s0[0], h_s0[1], h_s0[2], h_s0[3], h_s0[4], h_s0[5],
                           h_s0[6], h_s0[7], x_filter, max));
    int16x8_t v_s1 = vreinterpretq_s16_u16(
        highbd_convolve8_8(h_s1[0], h_s1[1], h_s1[2], h_s1[3], h_s1[4], h_s1[5],
                           h_s1[6], h_s1[7], x_filter, max));
    int16x8_t v_s2 = vreinterpretq_s16_u16(
        highbd_convolve8_8(h_s2[0], h_s2[1], h_s2[2], h_s2[3], h_s2[4], h_s2[5],
                           h_s2[6], h_s2[7], x_filter, max));
    int16x8_t v_s3 = vreinterpretq_s16_u16(
        highbd_convolve8_8(h_s3[0], h_s3[1], h_s3[2], h_s3[3], h_s3[4], h_s3[5],
                           h_s3[6], h_s3[7], x_filter, max));
    int16x8_t v_s4 = vreinterpretq_s16_u16(
        highbd_convolve8_8(h_s4[0], h_s4[1], h_s4[2], h_s4[3], h_s4[4], h_s4[5],
                           h_s4[6], h_s4[7], x_filter, max));
    int16x8_t v_s5 = vreinterpretq_s16_u16(
        highbd_convolve8_8(h_s5[0], h_s5[1], h_s5[2], h_s5[3], h_s5[4], h_s5[5],
                           h_s5[6], h_s5[7], x_filter, max));
    int16x8_t v_s6 = vreinterpretq_s16_u16(
        highbd_convolve8_8(h_s6[0], h_s6[1], h_s6[2], h_s6[3], h_s6[4], h_s6[5],
                           h_s6[6], h_s6[7], x_filter, max));

    s += 7 * src_stride;

    do {
      int16x8_t h_s7[8], h_s8[8], h_s9[8], h_s10[8];
      load_s16_8x8(s + 0 * src_stride, 1, &h_s7[0], &h_s7[1], &h_s7[2],
                   &h_s7[3], &h_s7[4], &h_s7[5], &h_s7[6], &h_s7[7]);
      load_s16_8x8(s + 1 * src_stride, 1, &h_s8[0], &h_s8[1], &h_s8[2],
                   &h_s8[3], &h_s8[4], &h_s8[5], &h_s8[6], &h_s8[7]);
      load_s16_8x8(s + 2 * src_stride, 1, &h_s9[0], &h_s9[1], &h_s9[2],
                   &h_s9[3], &h_s9[4], &h_s9[5], &h_s9[6], &h_s9[7]);
      load_s16_8x8(s + 3 * src_stride, 1, &h_s10[0], &h_s10[1], &h_s10[2],
                   &h_s10[3], &h_s10[4], &h_s10[5], &h_s10[6], &h_s10[7]);

      int16x8_t v_s7 = vreinterpretq_s16_u16(
          highbd_convolve8_8(h_s7[0], h_s7[1], h_s7[2], h_s7[3], h_s7[4],
                             h_s7[5], h_s7[6], h_s7[7], x_filter, max));
      int16x8_t v_s8 = vreinterpretq_s16_u16(
          highbd_convolve8_8(h_s8[0], h_s8[1], h_s8[2], h_s8[3], h_s8[4],
                             h_s8[5], h_s8[6], h_s8[7], x_filter, max));
      int16x8_t v_s9 = vreinterpretq_s16_u16(
          highbd_convolve8_8(h_s9[0], h_s9[1], h_s9[2], h_s9[3], h_s9[4],
                             h_s9[5], h_s9[6], h_s9[7], x_filter, max));
      int16x8_t v_s10 = vreinterpretq_s16_u16(
          highbd_convolve8_8(h_s10[0], h_s10[1], h_s10[2], h_s10[3], h_s10[4],
                             h_s10[5], h_s10[6], h_s10[7], x_filter, max));

      uint16x8_t d0 = highbd_convolve8_8(v_s0, v_s1, v_s2, v_s3, v_s4, v_s5,
                                         v_s6, v_s7, y_filter, max);
      uint16x8_t d1 = highbd_convolve8_8(v_s1, v_s2, v_s3, v_s4, v_s5, v_s6,
                                         v_s7, v_s8, y_filter, max);
      uint16x8_t d2 = highbd_convolve8_8(v_s2, v_s3, v_s4, v_s5, v_s6, v_s7,
                                         v_s8, v_s9, y_filter, max);
      uint16x8_t d3 = highbd_convolve8_8(v_s3, v_s4, v_s5, v_s6, v_s7, v_s8,
                                         v_s9, v_s10, y_filter, max);

      d0 = vrhaddq_u16(d0, vld1q_u16(d + 0 * dst_stride));
      d1 = vrhaddq_u16(d1, vld1q_u16(d + 1 * dst_stride));
      d2 = vrhaddq_u16(d2, vld1q_u16(d + 2 * dst_stride));
      d3 = vrhaddq_u16(d3, vld1q_u16(d + 3 * dst_stride));

      store_u16_8x4(d, dst_stride, d0, d1, d2, d3);

      v_s0 = v_s4;
      v_s1 = v_s5;
      v_s2 = v_s6;
      v_s3 = v_s7;
      v_s4 = v_s8;
      v_s5 = v_s9;
      v_s6 = v_s10;
      s += 4 * src_stride;
      d += 4 * dst_stride;
      height -= 4;
    } while (height != 0);
    src += 8;
    dst += 8;
    w -= 8;
  } while (w != 0);
}