/*
 *  Copyright (c) 2016 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include <arm_neon.h>
#include <assert.h>

#include "./vpx_config.h"
#include "./vpx_dsp_rtcd.h"
#include "vpx/vpx_integer.h"
#include "vpx_dsp/arm/highbd_convolve8_neon.h"
#include "vpx_dsp/arm/mem_neon.h"
#include "vpx_dsp/arm/transpose_neon.h"
#include "vpx_dsp/vpx_dsp_common.h"
#include "vpx_dsp/vpx_filter.h"
#include "vpx_ports/mem.h"

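// Apply an 8-tap filter to 4 pixels held in s0-s7, narrow the accumulator
// with a saturating rounding shift by FILTER_BITS, and clamp to [0, max].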
static INLINE uint16x4_t
highbd_convolve8_4(const int16x4_t s0, const int16x4_t s1, const int16x4_t s2,
                   const int16x4_t s3, const int16x4_t s4, const int16x4_t s5,
                   const int16x4_t s6, const int16x4_t s7,
                   const int16x8_t filters, const uint16x4_t max) {
  const int16x4_t filters_lo = vget_low_s16(filters);
  const int16x4_t filters_hi = vget_high_s16(filters);

  int32x4_t sum = vmull_lane_s16(s0, filters_lo, 0);
  sum = vmlal_lane_s16(sum, s1, filters_lo, 1);
  sum = vmlal_lane_s16(sum, s2, filters_lo, 2);
  sum = vmlal_lane_s16(sum, s3, filters_lo, 3);
  sum = vmlal_lane_s16(sum, s4, filters_hi, 0);
  sum = vmlal_lane_s16(sum, s5, filters_hi, 1);
  sum = vmlal_lane_s16(sum, s6, filters_hi, 2);
  sum = vmlal_lane_s16(sum, s7, filters_hi, 3);

  uint16x4_t res = vqrshrun_n_s32(sum, FILTER_BITS);
  return vmin_u16(res, max);
}

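// As highbd_convolve8_4, but filters 8 pixels per call by accumulating the
// low and high halves of each source vector separately.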
static INLINE uint16x8_t
highbd_convolve8_8(const int16x8_t s0, const int16x8_t s1, const int16x8_t s2,
                   const int16x8_t s3, const int16x8_t s4, const int16x8_t s5,
                   const int16x8_t s6, const int16x8_t s7,
                   const int16x8_t filters, const uint16x8_t max) {
  const int16x4_t filters_lo = vget_low_s16(filters);
  const int16x4_t filters_hi = vget_high_s16(filters);

  int32x4_t sum0 = vmull_lane_s16(vget_low_s16(s0), filters_lo, 0);
  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s1), filters_lo, 1);
  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s2), filters_lo, 2);
  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s3), filters_lo, 3);
  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s4), filters_hi, 0);
  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s5), filters_hi, 1);
  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s6), filters_hi, 2);
  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s7), filters_hi, 3);

  int32x4_t sum1 = vmull_lane_s16(vget_high_s16(s0), filters_lo, 0);
  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s1), filters_lo, 1);
  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s2), filters_lo, 2);
  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s3), filters_lo, 3);
  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s4), filters_hi, 0);
  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s5), filters_hi, 1);
  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s6), filters_hi, 2);
  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s7), filters_hi, 3);

  uint16x8_t res = vcombine_u16(vqrshrun_n_s32(sum0, FILTER_BITS),
                                vqrshrun_n_s32(sum1, FILTER_BITS));
  return vminq_u16(res, max);
}

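// Horizontal 4-tap convolution, producing 4 output rows per iteration.
// Width-4 blocks use 4-lane vectors; wider blocks are filtered 8 columns at
// a time.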
static INLINE void highbd_convolve_4tap_horiz_neon(
    const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst,
    ptrdiff_t dst_stride, int w, int h, const int16x4_t filter, int bd) {
  if (w == 4) {
    const uint16x4_t max = vdup_n_u16((1 << bd) - 1);
    const int16_t *s = (const int16_t *)src;
    uint16_t *d = dst;

    do {
      int16x4_t s0[4], s1[4], s2[4], s3[4];
      load_s16_4x4(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3]);
      load_s16_4x4(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3]);
      load_s16_4x4(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3]);
      load_s16_4x4(s + 3 * src_stride, 1, &s3[0], &s3[1], &s3[2], &s3[3]);

      uint16x4_t d0 =
          highbd_convolve4_4_neon(s0[0], s0[1], s0[2], s0[3], filter, max);
      uint16x4_t d1 =
          highbd_convolve4_4_neon(s1[0], s1[1], s1[2], s1[3], filter, max);
      uint16x4_t d2 =
          highbd_convolve4_4_neon(s2[0], s2[1], s2[2], s2[3], filter, max);
      uint16x4_t d3 =
          highbd_convolve4_4_neon(s3[0], s3[1], s3[2], s3[3], filter, max);

      store_u16_4x4(d, dst_stride, d0, d1, d2, d3);

      s += 4 * src_stride;
      d += 4 * dst_stride;
      h -= 4;
    } while (h != 0);
  } else {
    const uint16x8_t max = vdupq_n_u16((1 << bd) - 1);

    do {
      const int16_t *s = (const int16_t *)src;
      uint16_t *d = dst;
      int width = w;

      do {
        int16x8_t s0[4], s1[4], s2[4], s3[4];
        load_s16_8x4(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3]);
        load_s16_8x4(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3]);
        load_s16_8x4(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3]);
        load_s16_8x4(s + 3 * src_stride, 1, &s3[0], &s3[1], &s3[2], &s3[3]);

        uint16x8_t d0 =
            highbd_convolve4_8_neon(s0[0], s0[1], s0[2], s0[3], filter, max);
        uint16x8_t d1 =
            highbd_convolve4_8_neon(s1[0], s1[1], s1[2], s1[3], filter, max);
        uint16x8_t d2 =
            highbd_convolve4_8_neon(s2[0], s2[1], s2[2], s2[3], filter, max);
        uint16x8_t d3 =
            highbd_convolve4_8_neon(s3[0], s3[1], s3[2], s3[3], filter, max);

        store_u16_8x4(d, dst_stride, d0, d1, d2, d3);

        s += 8;
        d += 8;
        width -= 8;
      } while (width != 0);
      src += 4 * src_stride;
      dst += 4 * dst_stride;
      h -= 4;
    } while (h != 0);
  }
}

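// Horizontal 8-tap convolution, producing 4 output rows per iteration.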
static INLINE void highbd_convolve_8tap_horiz_neon(
    const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst,
    ptrdiff_t dst_stride, int w, int h, const int16x8_t filter, int bd) {
  if (w == 4) {
    const uint16x4_t max = vdup_n_u16((1 << bd) - 1);
    const int16_t *s = (const int16_t *)src;
    uint16_t *d = dst;

    do {
      int16x4_t s0[8], s1[8], s2[8], s3[8];
      load_s16_4x8(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3],
                   &s0[4], &s0[5], &s0[6], &s0[7]);
      load_s16_4x8(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3],
                   &s1[4], &s1[5], &s1[6], &s1[7]);
      load_s16_4x8(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3],
                   &s2[4], &s2[5], &s2[6], &s2[7]);
      load_s16_4x8(s + 3 * src_stride, 1, &s3[0], &s3[1], &s3[2], &s3[3],
                   &s3[4], &s3[5], &s3[6], &s3[7]);

      uint16x4_t d0 = highbd_convolve8_4(s0[0], s0[1], s0[2], s0[3], s0[4],
                                         s0[5], s0[6], s0[7], filter, max);
      uint16x4_t d1 = highbd_convolve8_4(s1[0], s1[1], s1[2], s1[3], s1[4],
                                         s1[5], s1[6], s1[7], filter, max);
      uint16x4_t d2 = highbd_convolve8_4(s2[0], s2[1], s2[2], s2[3], s2[4],
                                         s2[5], s2[6], s2[7], filter, max);
      uint16x4_t d3 = highbd_convolve8_4(s3[0], s3[1], s3[2], s3[3], s3[4],
                                         s3[5], s3[6], s3[7], filter, max);

      store_u16_4x4(d, dst_stride, d0, d1, d2, d3);

      s += 4 * src_stride;
      d += 4 * dst_stride;
      h -= 4;
    } while (h != 0);
  } else {
    const uint16x8_t max = vdupq_n_u16((1 << bd) - 1);

    do {
      const int16_t *s = (const int16_t *)src;
      uint16_t *d = dst;
      int width = w;

      do {
        int16x8_t s0[8], s1[8], s2[8], s3[8];
        load_s16_8x8(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3],
                     &s0[4], &s0[5], &s0[6], &s0[7]);
        load_s16_8x8(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3],
                     &s1[4], &s1[5], &s1[6], &s1[7]);
        load_s16_8x8(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3],
                     &s2[4], &s2[5], &s2[6], &s2[7]);
        load_s16_8x8(s + 3 * src_stride, 1, &s3[0], &s3[1], &s3[2], &s3[3],
                     &s3[4], &s3[5], &s3[6], &s3[7]);

        uint16x8_t d0 = highbd_convolve8_8(s0[0], s0[1], s0[2], s0[3], s0[4],
                                           s0[5], s0[6], s0[7], filter, max);
        uint16x8_t d1 = highbd_convolve8_8(s1[0], s1[1], s1[2], s1[3], s1[4],
                                           s1[5], s1[6], s1[7], filter, max);
        uint16x8_t d2 = highbd_convolve8_8(s2[0], s2[1], s2[2], s2[3], s2[4],
                                           s2[5], s2[6], s2[7], filter, max);
        uint16x8_t d3 = highbd_convolve8_8(s3[0], s3[1], s3[2], s3[3], s3[4],
                                           s3[5], s3[6], s3[7], filter, max);

        store_u16_8x4(d, dst_stride, d0, d1, d2, d3);

        s += 8;
        d += 8;
        width -= 8;
      } while (width != 0);
      src += 4 * src_stride;
      dst += 4 * dst_stride;
      h -= 4;
    } while (h != 0);
  }
}

void vpx_highbd_convolve8_horiz_neon(const uint16_t *src, ptrdiff_t src_stride,
                                     uint16_t *dst, ptrdiff_t dst_stride,
                                     const InterpKernel *filter, int x0_q4,
                                     int x_step_q4, int y0_q4, int y_step_q4,
                                     int w, int h, int bd) {
  if (x_step_q4 != 16) {
    vpx_highbd_convolve8_horiz_c(src, src_stride, dst, dst_stride, filter,
                                 x0_q4, x_step_q4, y0_q4, y_step_q4, w, h, bd);
    return;
  }

  assert((intptr_t)dst % 4 == 0);
  assert(dst_stride % 4 == 0);
  assert(x_step_q4 == 16);

  (void)x_step_q4;
  (void)y0_q4;
  (void)y_step_q4;

  if (vpx_get_filter_taps(filter[x0_q4]) <= 4) {
    const int16x4_t x_filter_4tap = vld1_s16(filter[x0_q4] + 2);
    highbd_convolve_4tap_horiz_neon(src - 1, src_stride, dst, dst_stride, w, h,
                                    x_filter_4tap, bd);
  } else {
    const int16x8_t x_filter_8tap = vld1q_s16(filter[x0_q4]);
    highbd_convolve_8tap_horiz_neon(src - 3, src_stride, dst, dst_stride, w, h,
                                    x_filter_8tap, bd);
  }
}

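// As vpx_highbd_convolve8_horiz_neon, but the filtered result is averaged
// with the existing destination pixels (rounding halving add) before storing.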
void vpx_highbd_convolve8_avg_horiz_neon(const uint16_t *src,
                                         ptrdiff_t src_stride, uint16_t *dst,
                                         ptrdiff_t dst_stride,
                                         const InterpKernel *filter, int x0_q4,
                                         int x_step_q4, int y0_q4,
                                         int y_step_q4, int w, int h, int bd) {
  if (x_step_q4 != 16) {
    vpx_highbd_convolve8_avg_horiz_c(src, src_stride, dst, dst_stride, filter,
                                     x0_q4, x_step_q4, y0_q4, y_step_q4, w, h,
                                     bd);
    return;
  }

  assert((intptr_t)dst % 4 == 0);
  assert(dst_stride % 4 == 0);

  const int16x8_t filters = vld1q_s16(filter[x0_q4]);

  src -= 3;

  if (w == 4) {
    const uint16x4_t max = vdup_n_u16((1 << bd) - 1);
    const int16_t *s = (const int16_t *)src;
    uint16_t *d = dst;

    do {
      int16x4_t s0[8], s1[8], s2[8], s3[8];
      load_s16_4x8(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3],
                   &s0[4], &s0[5], &s0[6], &s0[7]);
      load_s16_4x8(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3],
                   &s1[4], &s1[5], &s1[6], &s1[7]);
      load_s16_4x8(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3],
                   &s2[4], &s2[5], &s2[6], &s2[7]);
      load_s16_4x8(s + 3 * src_stride, 1, &s3[0], &s3[1], &s3[2], &s3[3],
                   &s3[4], &s3[5], &s3[6], &s3[7]);

      uint16x4_t d0 = highbd_convolve8_4(s0[0], s0[1], s0[2], s0[3], s0[4],
                                         s0[5], s0[6], s0[7], filters, max);
      uint16x4_t d1 = highbd_convolve8_4(s1[0], s1[1], s1[2], s1[3], s1[4],
                                         s1[5], s1[6], s1[7], filters, max);
      uint16x4_t d2 = highbd_convolve8_4(s2[0], s2[1], s2[2], s2[3], s2[4],
                                         s2[5], s2[6], s2[7], filters, max);
      uint16x4_t d3 = highbd_convolve8_4(s3[0], s3[1], s3[2], s3[3], s3[4],
                                         s3[5], s3[6], s3[7], filters, max);

      d0 = vrhadd_u16(d0, vld1_u16(d + 0 * dst_stride));
      d1 = vrhadd_u16(d1, vld1_u16(d + 1 * dst_stride));
      d2 = vrhadd_u16(d2, vld1_u16(d + 2 * dst_stride));
      d3 = vrhadd_u16(d3, vld1_u16(d + 3 * dst_stride));

      store_u16_4x4(d, dst_stride, d0, d1, d2, d3);

      s += 4 * src_stride;
      d += 4 * dst_stride;
      h -= 4;
    } while (h != 0);
  } else {
    const uint16x8_t max = vdupq_n_u16((1 << bd) - 1);

    do {
      const int16_t *s = (const int16_t *)src;
      uint16_t *d = dst;
      int width = w;

      do {
        int16x8_t s0[8], s1[8], s2[8], s3[8];
        load_s16_8x8(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3],
                     &s0[4], &s0[5], &s0[6], &s0[7]);
        load_s16_8x8(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3],
                     &s1[4], &s1[5], &s1[6], &s1[7]);
        load_s16_8x8(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3],
                     &s2[4], &s2[5], &s2[6], &s2[7]);
        load_s16_8x8(s + 3 * src_stride, 1, &s3[0], &s3[1], &s3[2], &s3[3],
                     &s3[4], &s3[5], &s3[6], &s3[7]);

        uint16x8_t d0 = highbd_convolve8_8(s0[0], s0[1], s0[2], s0[3], s0[4],
                                           s0[5], s0[6], s0[7], filters, max);
        uint16x8_t d1 = highbd_convolve8_8(s1[0], s1[1], s1[2], s1[3], s1[4],
                                           s1[5], s1[6], s1[7], filters, max);
        uint16x8_t d2 = highbd_convolve8_8(s2[0], s2[1], s2[2], s2[3], s2[4],
                                           s2[5], s2[6], s2[7], filters, max);
        uint16x8_t d3 = highbd_convolve8_8(s3[0], s3[1], s3[2], s3[3], s3[4],
                                           s3[5], s3[6], s3[7], filters, max);

        d0 = vrhaddq_u16(d0, vld1q_u16(d + 0 * dst_stride));
        d1 = vrhaddq_u16(d1, vld1q_u16(d + 1 * dst_stride));
        d2 = vrhaddq_u16(d2, vld1q_u16(d + 2 * dst_stride));
        d3 = vrhaddq_u16(d3, vld1q_u16(d + 3 * dst_stride));

        store_u16_8x4(d, dst_stride, d0, d1, d2, d3);

        s += 8;
        d += 8;
        width -= 8;
      } while (width != 0);
      src += 4 * src_stride;
      dst += 4 * dst_stride;
      h -= 4;
    } while (h != 0);
  }
}

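// Vertical 4-tap convolution. Three source rows are kept in registers and
// rotated between iterations, so each new group of 4 output rows needs only
// 4 fresh row loads.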
static INLINE void highbd_convolve_4tap_vert_neon(
    const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst,
    ptrdiff_t dst_stride, int w, int h, const int16x4_t filter, int bd) {
  if (w == 4) {
    const uint16x4_t max = vdup_n_u16((1 << bd) - 1);
    const int16_t *s = (const int16_t *)src;
    uint16_t *d = dst;

    int16x4_t s0, s1, s2;
    load_s16_4x3(s, src_stride, &s0, &s1, &s2);

    s += 3 * src_stride;

    do {
      int16x4_t s3, s4, s5, s6;
      load_s16_4x4(s, src_stride, &s3, &s4, &s5, &s6);

      uint16x4_t d0 = highbd_convolve4_4_neon(s0, s1, s2, s3, filter, max);
      uint16x4_t d1 = highbd_convolve4_4_neon(s1, s2, s3, s4, filter, max);
      uint16x4_t d2 = highbd_convolve4_4_neon(s2, s3, s4, s5, filter, max);
      uint16x4_t d3 = highbd_convolve4_4_neon(s3, s4, s5, s6, filter, max);

      store_u16_4x4(d, dst_stride, d0, d1, d2, d3);

      s0 = s4;
      s1 = s5;
      s2 = s6;
      s += 4 * src_stride;
      d += 4 * dst_stride;
      h -= 4;
    } while (h != 0);
  } else {
    const uint16x8_t max = vdupq_n_u16((1 << bd) - 1);

    do {
      const int16_t *s = (const int16_t *)src;
      uint16_t *d = dst;
      int height = h;

      int16x8_t s0, s1, s2;
      load_s16_8x3(s, src_stride, &s0, &s1, &s2);

      s += 3 * src_stride;

      do {
        int16x8_t s3, s4, s5, s6;
        load_s16_8x4(s, src_stride, &s3, &s4, &s5, &s6);

        uint16x8_t d0 = highbd_convolve4_8_neon(s0, s1, s2, s3, filter, max);
        uint16x8_t d1 = highbd_convolve4_8_neon(s1, s2, s3, s4, filter, max);
        uint16x8_t d2 = highbd_convolve4_8_neon(s2, s3, s4, s5, filter, max);
        uint16x8_t d3 = highbd_convolve4_8_neon(s3, s4, s5, s6, filter, max);

        store_u16_8x4(d, dst_stride, d0, d1, d2, d3);

        s0 = s4;
        s1 = s5;
        s2 = s6;
        s += 4 * src_stride;
        d += 4 * dst_stride;
        height -= 4;
      } while (height != 0);
      src += 8;
      dst += 8;
      w -= 8;
    } while (w != 0);
  }
}

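// Vertical 8-tap convolution: seven source rows stay live in registers and
// are rotated as the filter window slides down 4 rows per iteration.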
static INLINE void highbd_convolve_8tap_vert_neon(
    const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst,
    ptrdiff_t dst_stride, int w, int h, const int16x8_t filter, int bd) {
  if (w == 4) {
    const uint16x4_t max = vdup_n_u16((1 << bd) - 1);
    const int16_t *s = (const int16_t *)src;
    uint16_t *d = dst;

    int16x4_t s0, s1, s2, s3, s4, s5, s6;
    load_s16_4x7(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6);

    s += 7 * src_stride;

    do {
      int16x4_t s7, s8, s9, s10;
      load_s16_4x4(s, src_stride, &s7, &s8, &s9, &s10);

      uint16x4_t d0 =
          highbd_convolve8_4(s0, s1, s2, s3, s4, s5, s6, s7, filter, max);
      uint16x4_t d1 =
          highbd_convolve8_4(s1, s2, s3, s4, s5, s6, s7, s8, filter, max);
      uint16x4_t d2 =
          highbd_convolve8_4(s2, s3, s4, s5, s6, s7, s8, s9, filter, max);
      uint16x4_t d3 =
          highbd_convolve8_4(s3, s4, s5, s6, s7, s8, s9, s10, filter, max);

      store_u16_4x4(d, dst_stride, d0, d1, d2, d3);

      s0 = s4;
      s1 = s5;
      s2 = s6;
      s3 = s7;
      s4 = s8;
      s5 = s9;
      s6 = s10;
      s += 4 * src_stride;
      d += 4 * dst_stride;
      h -= 4;
    } while (h != 0);
  } else {
    const uint16x8_t max = vdupq_n_u16((1 << bd) - 1);

    do {
      const int16_t *s = (const int16_t *)src;
      uint16_t *d = dst;
      int height = h;

      int16x8_t s0, s1, s2, s3, s4, s5, s6;
      load_s16_8x7(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6);

      s += 7 * src_stride;

      do {
        int16x8_t s7, s8, s9, s10;
        load_s16_8x4(s, src_stride, &s7, &s8, &s9, &s10);

        uint16x8_t d0 =
            highbd_convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filter, max);
        uint16x8_t d1 =
            highbd_convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filter, max);
        uint16x8_t d2 =
            highbd_convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filter, max);
        uint16x8_t d3 =
            highbd_convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filter, max);

        store_u16_8x4(d, dst_stride, d0, d1, d2, d3);

        s0 = s4;
        s1 = s5;
        s2 = s6;
        s3 = s7;
        s4 = s8;
        s5 = s9;
        s6 = s10;
        s += 4 * src_stride;
        d += 4 * dst_stride;
        height -= 4;
      } while (height != 0);
      src += 8;
      dst += 8;
      w -= 8;
    } while (w != 0);
  }
}

void vpx_highbd_convolve8_vert_neon(const uint16_t *src, ptrdiff_t src_stride,
                                    uint16_t *dst, ptrdiff_t dst_stride,
                                    const InterpKernel *filter, int x0_q4,
                                    int x_step_q4, int y0_q4, int y_step_q4,
                                    int w, int h, int bd) {
  if (y_step_q4 != 16) {
    vpx_highbd_convolve8_vert_c(src, src_stride, dst, dst_stride, filter, x0_q4,
                                x_step_q4, y0_q4, y_step_q4, w, h, bd);
    return;
  }

  assert((intptr_t)dst % 4 == 0);
  assert(dst_stride % 4 == 0);
  assert(y_step_q4 == 16);

  (void)x_step_q4;
  (void)y0_q4;
  (void)y_step_q4;

  if (vpx_get_filter_taps(filter[y0_q4]) <= 4) {
    const int16x4_t y_filter_4tap = vld1_s16(filter[y0_q4] + 2);
    highbd_convolve_4tap_vert_neon(src - src_stride, src_stride, dst,
                                   dst_stride, w, h, y_filter_4tap, bd);
  } else {
    const int16x8_t y_filter_8tap = vld1q_s16(filter[y0_q4]);
    highbd_convolve_8tap_vert_neon(src - 3 * src_stride, src_stride, dst,
                                   dst_stride, w, h, y_filter_8tap, bd);
  }
}

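// As vpx_highbd_convolve8_vert_neon, but averages the filtered result with
// the existing destination pixels before storing.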
void vpx_highbd_convolve8_avg_vert_neon(const uint16_t *src,
                                        ptrdiff_t src_stride, uint16_t *dst,
                                        ptrdiff_t dst_stride,
                                        const InterpKernel *filter, int x0_q4,
                                        int x_step_q4, int y0_q4, int y_step_q4,
                                        int w, int h, int bd) {
  if (y_step_q4 != 16) {
    vpx_highbd_convolve8_avg_vert_c(src, src_stride, dst, dst_stride, filter,
                                    x0_q4, x_step_q4, y0_q4, y_step_q4, w, h,
                                    bd);
    return;
  }

  assert((intptr_t)dst % 4 == 0);
  assert(dst_stride % 4 == 0);

  const int16x8_t filters = vld1q_s16(filter[y0_q4]);

  src -= 3 * src_stride;

  if (w == 4) {
    const uint16x4_t max = vdup_n_u16((1 << bd) - 1);
    const int16_t *s = (const int16_t *)src;
    uint16_t *d = dst;

    int16x4_t s0, s1, s2, s3, s4, s5, s6;
    load_s16_4x7(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6);

    s += 7 * src_stride;

    do {
      int16x4_t s7, s8, s9, s10;
      load_s16_4x4(s, src_stride, &s7, &s8, &s9, &s10);

      uint16x4_t d0 =
          highbd_convolve8_4(s0, s1, s2, s3, s4, s5, s6, s7, filters, max);
      uint16x4_t d1 =
          highbd_convolve8_4(s1, s2, s3, s4, s5, s6, s7, s8, filters, max);
      uint16x4_t d2 =
          highbd_convolve8_4(s2, s3, s4, s5, s6, s7, s8, s9, filters, max);
      uint16x4_t d3 =
          highbd_convolve8_4(s3, s4, s5, s6, s7, s8, s9, s10, filters, max);

      d0 = vrhadd_u16(d0, vld1_u16(d + 0 * dst_stride));
      d1 = vrhadd_u16(d1, vld1_u16(d + 1 * dst_stride));
      d2 = vrhadd_u16(d2, vld1_u16(d + 2 * dst_stride));
      d3 = vrhadd_u16(d3, vld1_u16(d + 3 * dst_stride));

      store_u16_4x4(d, dst_stride, d0, d1, d2, d3);

      s0 = s4;
      s1 = s5;
      s2 = s6;
      s3 = s7;
      s4 = s8;
      s5 = s9;
      s6 = s10;
      s += 4 * src_stride;
      d += 4 * dst_stride;
      h -= 4;
    } while (h != 0);
  } else {
    const uint16x8_t max = vdupq_n_u16((1 << bd) - 1);

    do {
      const int16_t *s = (const int16_t *)src;
      uint16_t *d = dst;
      int height = h;

      int16x8_t s0, s1, s2, s3, s4, s5, s6;
      load_s16_8x7(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6);

      s += 7 * src_stride;

      do {
        int16x8_t s7, s8, s9, s10;
        load_s16_8x4(s, src_stride, &s7, &s8, &s9, &s10);

        uint16x8_t d0 =
            highbd_convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters, max);
        uint16x8_t d1 =
            highbd_convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters, max);
        uint16x8_t d2 =
            highbd_convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters, max);
        uint16x8_t d3 =
            highbd_convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters, max);

        d0 = vrhaddq_u16(d0, vld1q_u16(d + 0 * dst_stride));
        d1 = vrhaddq_u16(d1, vld1q_u16(d + 1 * dst_stride));
        d2 = vrhaddq_u16(d2, vld1q_u16(d + 2 * dst_stride));
        d3 = vrhaddq_u16(d3, vld1q_u16(d + 3 * dst_stride));

        store_u16_8x4(d, dst_stride, d0, d1, d2, d3);

        s0 = s4;
        s1 = s5;
        s2 = s6;
        s3 = s7;
        s4 = s8;
        s5 = s9;
        s6 = s10;
        s += 4 * src_stride;
        d += 4 * dst_stride;
        height -= 4;
      } while (height != 0);
      src += 8;
      dst += 8;
      w -= 8;
    } while (w != 0);
  }
}

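// Combined horizontal + vertical 4-tap convolution. Each source row is
// filtered horizontally once and the clamped intermediate rows are fed
// straight into the vertical filter, avoiding a scratch buffer.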
static INLINE void highbd_convolve_2d_4tap_neon(
    const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst,
    ptrdiff_t dst_stride, int w, int h, const int16x4_t x_filter,
    const int16x4_t y_filter, int bd) {
  if (w == 4) {
    const uint16x4_t max = vdup_n_u16((1 << bd) - 1);
    const int16_t *s = (const int16_t *)src;
    uint16_t *d = dst;

    int16x4_t h_s0[4], h_s1[4], h_s2[4];
    load_s16_4x4(s + 0 * src_stride, 1, &h_s0[0], &h_s0[1], &h_s0[2], &h_s0[3]);
    load_s16_4x4(s + 1 * src_stride, 1, &h_s1[0], &h_s1[1], &h_s1[2], &h_s1[3]);
    load_s16_4x4(s + 2 * src_stride, 1, &h_s2[0], &h_s2[1], &h_s2[2], &h_s2[3]);

    int16x4_t v_s0 = vreinterpret_s16_u16(highbd_convolve4_4_neon(
        h_s0[0], h_s0[1], h_s0[2], h_s0[3], x_filter, max));
    int16x4_t v_s1 = vreinterpret_s16_u16(highbd_convolve4_4_neon(
        h_s1[0], h_s1[1], h_s1[2], h_s1[3], x_filter, max));
    int16x4_t v_s2 = vreinterpret_s16_u16(highbd_convolve4_4_neon(
        h_s2[0], h_s2[1], h_s2[2], h_s2[3], x_filter, max));

    s += 3 * src_stride;

    do {
      int16x4_t h_s3[4], h_s4[4], h_s5[4], h_s6[4];
      load_s16_4x4(s + 0 * src_stride, 1, &h_s3[0], &h_s3[1], &h_s3[2],
                   &h_s3[3]);
      load_s16_4x4(s + 1 * src_stride, 1, &h_s4[0], &h_s4[1], &h_s4[2],
                   &h_s4[3]);
      load_s16_4x4(s + 2 * src_stride, 1, &h_s5[0], &h_s5[1], &h_s5[2],
                   &h_s5[3]);
      load_s16_4x4(s + 3 * src_stride, 1, &h_s6[0], &h_s6[1], &h_s6[2],
                   &h_s6[3]);

      int16x4_t v_s3 = vreinterpret_s16_u16(highbd_convolve4_4_neon(
          h_s3[0], h_s3[1], h_s3[2], h_s3[3], x_filter, max));
      int16x4_t v_s4 = vreinterpret_s16_u16(highbd_convolve4_4_neon(
          h_s4[0], h_s4[1], h_s4[2], h_s4[3], x_filter, max));
      int16x4_t v_s5 = vreinterpret_s16_u16(highbd_convolve4_4_neon(
          h_s5[0], h_s5[1], h_s5[2], h_s5[3], x_filter, max));
      int16x4_t v_s6 = vreinterpret_s16_u16(highbd_convolve4_4_neon(
          h_s6[0], h_s6[1], h_s6[2], h_s6[3], x_filter, max));

      uint16x4_t d0 =
          highbd_convolve4_4_neon(v_s0, v_s1, v_s2, v_s3, y_filter, max);
      uint16x4_t d1 =
          highbd_convolve4_4_neon(v_s1, v_s2, v_s3, v_s4, y_filter, max);
      uint16x4_t d2 =
          highbd_convolve4_4_neon(v_s2, v_s3, v_s4, v_s5, y_filter, max);
      uint16x4_t d3 =
          highbd_convolve4_4_neon(v_s3, v_s4, v_s5, v_s6, y_filter, max);

      store_u16_4x4(d, dst_stride, d0, d1, d2, d3);

      v_s0 = v_s4;
      v_s1 = v_s5;
      v_s2 = v_s6;
      s += 4 * src_stride;
      d += 4 * dst_stride;
      h -= 4;
    } while (h != 0);

    return;
  }

  const uint16x8_t max = vdupq_n_u16((1 << bd) - 1);

  do {
    const int16_t *s = (const int16_t *)src;
    uint16_t *d = dst;
    int height = h;

    int16x8_t h_s0[4], h_s1[4], h_s2[4];
    load_s16_8x4(s + 0 * src_stride, 1, &h_s0[0], &h_s0[1], &h_s0[2], &h_s0[3]);
    load_s16_8x4(s + 1 * src_stride, 1, &h_s1[0], &h_s1[1], &h_s1[2], &h_s1[3]);
    load_s16_8x4(s + 2 * src_stride, 1, &h_s2[0], &h_s2[1], &h_s2[2], &h_s2[3]);

    int16x8_t v_s0 = vreinterpretq_s16_u16(highbd_convolve4_8_neon(
        h_s0[0], h_s0[1], h_s0[2], h_s0[3], x_filter, max));
    int16x8_t v_s1 = vreinterpretq_s16_u16(highbd_convolve4_8_neon(
        h_s1[0], h_s1[1], h_s1[2], h_s1[3], x_filter, max));
    int16x8_t v_s2 = vreinterpretq_s16_u16(highbd_convolve4_8_neon(
        h_s2[0], h_s2[1], h_s2[2], h_s2[3], x_filter, max));

    s += 3 * src_stride;

    do {
      int16x8_t h_s3[4], h_s4[4], h_s5[4], h_s6[4];
      load_s16_8x4(s + 0 * src_stride, 1, &h_s3[0], &h_s3[1], &h_s3[2],
                   &h_s3[3]);
      load_s16_8x4(s + 1 * src_stride, 1, &h_s4[0], &h_s4[1], &h_s4[2],
                   &h_s4[3]);
      load_s16_8x4(s + 2 * src_stride, 1, &h_s5[0], &h_s5[1], &h_s5[2],
                   &h_s5[3]);
      load_s16_8x4(s + 3 * src_stride, 1, &h_s6[0], &h_s6[1], &h_s6[2],
                   &h_s6[3]);

      int16x8_t v_s3 = vreinterpretq_s16_u16(highbd_convolve4_8_neon(
          h_s3[0], h_s3[1], h_s3[2], h_s3[3], x_filter, max));
      int16x8_t v_s4 = vreinterpretq_s16_u16(highbd_convolve4_8_neon(
          h_s4[0], h_s4[1], h_s4[2], h_s4[3], x_filter, max));
      int16x8_t v_s5 = vreinterpretq_s16_u16(highbd_convolve4_8_neon(
          h_s5[0], h_s5[1], h_s5[2], h_s5[3], x_filter, max));
      int16x8_t v_s6 = vreinterpretq_s16_u16(highbd_convolve4_8_neon(
          h_s6[0], h_s6[1], h_s6[2], h_s6[3], x_filter, max));

      uint16x8_t d0 =
          highbd_convolve4_8_neon(v_s0, v_s1, v_s2, v_s3, y_filter, max);
      uint16x8_t d1 =
          highbd_convolve4_8_neon(v_s1, v_s2, v_s3, v_s4, y_filter, max);
      uint16x8_t d2 =
          highbd_convolve4_8_neon(v_s2, v_s3, v_s4, v_s5, y_filter, max);
      uint16x8_t d3 =
          highbd_convolve4_8_neon(v_s3, v_s4, v_s5, v_s6, y_filter, max);

      store_u16_8x4(d, dst_stride, d0, d1, d2, d3);

      v_s0 = v_s4;
      v_s1 = v_s5;
      v_s2 = v_s6;
      s += 4 * src_stride;
      d += 4 * dst_stride;
      height -= 4;
    } while (height != 0);
    src += 8;
    dst += 8;
    w -= 8;
  } while (w != 0);
}

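// Combined horizontal + vertical 8-tap convolution: seven horizontally
// filtered rows are held in registers as the vertical filter window slides
// down 4 rows per iteration.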
static INLINE void highbd_convolve_2d_8tap_neon(
    const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst,
    ptrdiff_t dst_stride, int w, int h, const int16x8_t x_filter,
    const int16x8_t y_filter, int bd) {
  if (w == 4) {
    const uint16x4_t max = vdup_n_u16((1 << bd) - 1);
    const int16_t *s = (const int16_t *)src;
    uint16_t *d = dst;

    int16x4_t h_s0[8], h_s1[8], h_s2[8], h_s3[8], h_s4[8], h_s5[8], h_s6[8];
    load_s16_4x8(s + 0 * src_stride, 1, &h_s0[0], &h_s0[1], &h_s0[2], &h_s0[3],
                 &h_s0[4], &h_s0[5], &h_s0[6], &h_s0[7]);
    load_s16_4x8(s + 1 * src_stride, 1, &h_s1[0], &h_s1[1], &h_s1[2], &h_s1[3],
                 &h_s1[4], &h_s1[5], &h_s1[6], &h_s1[7]);
    load_s16_4x8(s + 2 * src_stride, 1, &h_s2[0], &h_s2[1], &h_s2[2], &h_s2[3],
                 &h_s2[4], &h_s2[5], &h_s2[6], &h_s2[7]);
    load_s16_4x8(s + 3 * src_stride, 1, &h_s3[0], &h_s3[1], &h_s3[2], &h_s3[3],
                 &h_s3[4], &h_s3[5], &h_s3[6], &h_s3[7]);
    load_s16_4x8(s + 4 * src_stride, 1, &h_s4[0], &h_s4[1], &h_s4[2], &h_s4[3],
                 &h_s4[4], &h_s4[5], &h_s4[6], &h_s4[7]);
    load_s16_4x8(s + 5 * src_stride, 1, &h_s5[0], &h_s5[1], &h_s5[2], &h_s5[3],
                 &h_s5[4], &h_s5[5], &h_s5[6], &h_s5[7]);
    load_s16_4x8(s + 6 * src_stride, 1, &h_s6[0], &h_s6[1], &h_s6[2], &h_s6[3],
                 &h_s6[4], &h_s6[5], &h_s6[6], &h_s6[7]);

    int16x4_t v_s0 = vreinterpret_s16_u16(
        highbd_convolve8_4(h_s0[0], h_s0[1], h_s0[2], h_s0[3], h_s0[4], h_s0[5],
                           h_s0[6], h_s0[7], x_filter, max));
    int16x4_t v_s1 = vreinterpret_s16_u16(
        highbd_convolve8_4(h_s1[0], h_s1[1], h_s1[2], h_s1[3], h_s1[4], h_s1[5],
                           h_s1[6], h_s1[7], x_filter, max));
    int16x4_t v_s2 = vreinterpret_s16_u16(
        highbd_convolve8_4(h_s2[0], h_s2[1], h_s2[2], h_s2[3], h_s2[4], h_s2[5],
                           h_s2[6], h_s2[7], x_filter, max));
    int16x4_t v_s3 = vreinterpret_s16_u16(
        highbd_convolve8_4(h_s3[0], h_s3[1], h_s3[2], h_s3[3], h_s3[4], h_s3[5],
                           h_s3[6], h_s3[7], x_filter, max));
    int16x4_t v_s4 = vreinterpret_s16_u16(
        highbd_convolve8_4(h_s4[0], h_s4[1], h_s4[2], h_s4[3], h_s4[4], h_s4[5],
                           h_s4[6], h_s4[7], x_filter, max));
    int16x4_t v_s5 = vreinterpret_s16_u16(
        highbd_convolve8_4(h_s5[0], h_s5[1], h_s5[2], h_s5[3], h_s5[4], h_s5[5],
                           h_s5[6], h_s5[7], x_filter, max));
    int16x4_t v_s6 = vreinterpret_s16_u16(
        highbd_convolve8_4(h_s6[0], h_s6[1], h_s6[2], h_s6[3], h_s6[4], h_s6[5],
                           h_s6[6], h_s6[7], x_filter, max));

    s += 7 * src_stride;

    do {
      int16x4_t h_s7[8], h_s8[8], h_s9[8], h_s10[8];
      load_s16_4x8(s + 0 * src_stride, 1, &h_s7[0], &h_s7[1], &h_s7[2],
                   &h_s7[3], &h_s7[4], &h_s7[5], &h_s7[6], &h_s7[7]);
      load_s16_4x8(s + 1 * src_stride, 1, &h_s8[0], &h_s8[1], &h_s8[2],
                   &h_s8[3], &h_s8[4], &h_s8[5], &h_s8[6], &h_s8[7]);
      load_s16_4x8(s + 2 * src_stride, 1, &h_s9[0], &h_s9[1], &h_s9[2],
                   &h_s9[3], &h_s9[4], &h_s9[5], &h_s9[6], &h_s9[7]);
      load_s16_4x8(s + 3 * src_stride, 1, &h_s10[0], &h_s10[1], &h_s10[2],
                   &h_s10[3], &h_s10[4], &h_s10[5], &h_s10[6], &h_s10[7]);

      int16x4_t v_s7 = vreinterpret_s16_u16(
          highbd_convolve8_4(h_s7[0], h_s7[1], h_s7[2], h_s7[3], h_s7[4],
                             h_s7[5], h_s7[6], h_s7[7], x_filter, max));
      int16x4_t v_s8 = vreinterpret_s16_u16(
          highbd_convolve8_4(h_s8[0], h_s8[1], h_s8[2], h_s8[3], h_s8[4],
                             h_s8[5], h_s8[6], h_s8[7], x_filter, max));
      int16x4_t v_s9 = vreinterpret_s16_u16(
          highbd_convolve8_4(h_s9[0], h_s9[1], h_s9[2], h_s9[3], h_s9[4],
                             h_s9[5], h_s9[6], h_s9[7], x_filter, max));
      int16x4_t v_s10 = vreinterpret_s16_u16(
          highbd_convolve8_4(h_s10[0], h_s10[1], h_s10[2], h_s10[3], h_s10[4],
                             h_s10[5], h_s10[6], h_s10[7], x_filter, max));

      uint16x4_t d0 = highbd_convolve8_4(v_s0, v_s1, v_s2, v_s3, v_s4, v_s5,
                                         v_s6, v_s7, y_filter, max);
      uint16x4_t d1 = highbd_convolve8_4(v_s1, v_s2, v_s3, v_s4, v_s5, v_s6,
                                         v_s7, v_s8, y_filter, max);
      uint16x4_t d2 = highbd_convolve8_4(v_s2, v_s3, v_s4, v_s5, v_s6, v_s7,
                                         v_s8, v_s9, y_filter, max);
      uint16x4_t d3 = highbd_convolve8_4(v_s3, v_s4, v_s5, v_s6, v_s7, v_s8,
                                         v_s9, v_s10, y_filter, max);

      store_u16_4x4(d, dst_stride, d0, d1, d2, d3);

      v_s0 = v_s4;
      v_s1 = v_s5;
      v_s2 = v_s6;
      v_s3 = v_s7;
      v_s4 = v_s8;
      v_s5 = v_s9;
      v_s6 = v_s10;
      s += 4 * src_stride;
      d += 4 * dst_stride;
      h -= 4;
    } while (h != 0);

    return;
  }

  const uint16x8_t max = vdupq_n_u16((1 << bd) - 1);

  do {
    const int16_t *s = (const int16_t *)src;
    uint16_t *d = dst;
    int height = h;

    int16x8_t h_s0[8], h_s1[8], h_s2[8], h_s3[8], h_s4[8], h_s5[8], h_s6[8];
    load_s16_8x8(s + 0 * src_stride, 1, &h_s0[0], &h_s0[1], &h_s0[2], &h_s0[3],
                 &h_s0[4], &h_s0[5], &h_s0[6], &h_s0[7]);
    load_s16_8x8(s + 1 * src_stride, 1, &h_s1[0], &h_s1[1], &h_s1[2], &h_s1[3],
                 &h_s1[4], &h_s1[5], &h_s1[6], &h_s1[7]);
    load_s16_8x8(s + 2 * src_stride, 1, &h_s2[0], &h_s2[1], &h_s2[2], &h_s2[3],
                 &h_s2[4], &h_s2[5], &h_s2[6], &h_s2[7]);
    load_s16_8x8(s + 3 * src_stride, 1, &h_s3[0], &h_s3[1], &h_s3[2], &h_s3[3],
                 &h_s3[4], &h_s3[5], &h_s3[6], &h_s3[7]);
    load_s16_8x8(s + 4 * src_stride, 1, &h_s4[0], &h_s4[1], &h_s4[2], &h_s4[3],
                 &h_s4[4], &h_s4[5], &h_s4[6], &h_s4[7]);
    load_s16_8x8(s + 5 * src_stride, 1, &h_s5[0], &h_s5[1], &h_s5[2], &h_s5[3],
                 &h_s5[4], &h_s5[5], &h_s5[6], &h_s5[7]);
    load_s16_8x8(s + 6 * src_stride, 1, &h_s6[0], &h_s6[1], &h_s6[2], &h_s6[3],
                 &h_s6[4], &h_s6[5], &h_s6[6], &h_s6[7]);

    int16x8_t v_s0 = vreinterpretq_s16_u16(
        highbd_convolve8_8(h_s0[0], h_s0[1], h_s0[2], h_s0[3], h_s0[4], h_s0[5],
                           h_s0[6], h_s0[7], x_filter, max));
    int16x8_t v_s1 = vreinterpretq_s16_u16(
        highbd_convolve8_8(h_s1[0], h_s1[1], h_s1[2], h_s1[3], h_s1[4], h_s1[5],
                           h_s1[6], h_s1[7], x_filter, max));
    int16x8_t v_s2 = vreinterpretq_s16_u16(
        highbd_convolve8_8(h_s2[0], h_s2[1], h_s2[2], h_s2[3], h_s2[4], h_s2[5],
                           h_s2[6], h_s2[7], x_filter, max));
    int16x8_t v_s3 = vreinterpretq_s16_u16(
        highbd_convolve8_8(h_s3[0], h_s3[1], h_s3[2], h_s3[3], h_s3[4], h_s3[5],
                           h_s3[6], h_s3[7], x_filter, max));
    int16x8_t v_s4 = vreinterpretq_s16_u16(
        highbd_convolve8_8(h_s4[0], h_s4[1], h_s4[2], h_s4[3], h_s4[4], h_s4[5],
                           h_s4[6], h_s4[7], x_filter, max));
    int16x8_t v_s5 = vreinterpretq_s16_u16(
        highbd_convolve8_8(h_s5[0], h_s5[1], h_s5[2], h_s5[3], h_s5[4], h_s5[5],
                           h_s5[6], h_s5[7], x_filter, max));
    int16x8_t v_s6 = vreinterpretq_s16_u16(
        highbd_convolve8_8(h_s6[0], h_s6[1], h_s6[2], h_s6[3], h_s6[4], h_s6[5],
                           h_s6[6], h_s6[7], x_filter, max));

    s += 7 * src_stride;

    do {
      int16x8_t h_s7[8], h_s8[8], h_s9[8], h_s10[8];
      load_s16_8x8(s + 0 * src_stride, 1, &h_s7[0], &h_s7[1], &h_s7[2],
                   &h_s7[3], &h_s7[4], &h_s7[5], &h_s7[6], &h_s7[7]);
      load_s16_8x8(s + 1 * src_stride, 1, &h_s8[0], &h_s8[1], &h_s8[2],
                   &h_s8[3], &h_s8[4], &h_s8[5], &h_s8[6], &h_s8[7]);
      load_s16_8x8(s + 2 * src_stride, 1, &h_s9[0], &h_s9[1], &h_s9[2],
                   &h_s9[3], &h_s9[4], &h_s9[5], &h_s9[6], &h_s9[7]);
      load_s16_8x8(s + 3 * src_stride, 1, &h_s10[0], &h_s10[1], &h_s10[2],
                   &h_s10[3], &h_s10[4], &h_s10[5], &h_s10[6], &h_s10[7]);

      int16x8_t v_s7 = vreinterpretq_s16_u16(
          highbd_convolve8_8(h_s7[0], h_s7[1], h_s7[2], h_s7[3], h_s7[4],
                             h_s7[5], h_s7[6], h_s7[7], x_filter, max));
      int16x8_t v_s8 = vreinterpretq_s16_u16(
          highbd_convolve8_8(h_s8[0], h_s8[1], h_s8[2], h_s8[3], h_s8[4],
                             h_s8[5], h_s8[6], h_s8[7], x_filter, max));
      int16x8_t v_s9 = vreinterpretq_s16_u16(
          highbd_convolve8_8(h_s9[0], h_s9[1], h_s9[2], h_s9[3], h_s9[4],
                             h_s9[5], h_s9[6], h_s9[7], x_filter, max));
      int16x8_t v_s10 = vreinterpretq_s16_u16(
          highbd_convolve8_8(h_s10[0], h_s10[1], h_s10[2], h_s10[3], h_s10[4],
                             h_s10[5], h_s10[6], h_s10[7], x_filter, max));

      uint16x8_t d0 = highbd_convolve8_8(v_s0, v_s1, v_s2, v_s3, v_s4, v_s5,
                                         v_s6, v_s7, y_filter, max);
      uint16x8_t d1 = highbd_convolve8_8(v_s1, v_s2, v_s3, v_s4, v_s5, v_s6,
                                         v_s7, v_s8, y_filter, max);
      uint16x8_t d2 = highbd_convolve8_8(v_s2, v_s3, v_s4, v_s5, v_s6, v_s7,
                                         v_s8, v_s9, y_filter, max);
      uint16x8_t d3 = highbd_convolve8_8(v_s3, v_s4, v_s5, v_s6, v_s7, v_s8,
                                         v_s9, v_s10, y_filter, max);

      store_u16_8x4(d, dst_stride, d0, d1, d2, d3);

      v_s0 = v_s4;
      v_s1 = v_s5;
      v_s2 = v_s6;
      v_s3 = v_s7;
      v_s4 = v_s8;
      v_s5 = v_s9;
      v_s6 = v_s10;
      s += 4 * src_stride;
      d += 4 * dst_stride;
      height -= 4;
    } while (height != 0);
    src += 8;
    dst += 8;
    w -= 8;
  } while (w != 0);
}

void vpx_highbd_convolve8_neon(const uint16_t *src, ptrdiff_t src_stride,
                               uint16_t *dst, ptrdiff_t dst_stride,
                               const InterpKernel *filter, int x0_q4,
                               int x_step_q4, int y0_q4, int y_step_q4, int w,
                               int h, int bd) {
  if (x_step_q4 != 16 || y_step_q4 != 16) {
    vpx_highbd_convolve8_c(src, src_stride, dst, dst_stride, filter, x0_q4,
                           x_step_q4, y0_q4, y_step_q4, w, h, bd);
    return;
  }

  const int x_filter_taps = vpx_get_filter_taps(filter[x0_q4]) <= 4 ? 4 : 8;
  const int y_filter_taps = vpx_get_filter_taps(filter[y0_q4]) <= 4 ? 4 : 8;
  // Account for needing filter_taps / 2 - 1 lines prior and filter_taps / 2
  // lines post both horizontally and vertically.
  const ptrdiff_t horiz_offset = x_filter_taps / 2 - 1;
  const ptrdiff_t vert_offset = (y_filter_taps / 2 - 1) * src_stride;

  if (x_filter_taps == 4 && y_filter_taps == 4) {
    const int16x4_t x_filter = vld1_s16(filter[x0_q4] + 2);
    const int16x4_t y_filter = vld1_s16(filter[y0_q4] + 2);

    highbd_convolve_2d_4tap_neon(src - horiz_offset - vert_offset, src_stride,
                                 dst, dst_stride, w, h, x_filter, y_filter, bd);
    return;
  }

  const int16x8_t x_filter = vld1q_s16(filter[x0_q4]);
  const int16x8_t y_filter = vld1q_s16(filter[y0_q4]);

  highbd_convolve_2d_8tap_neon(src - horiz_offset - vert_offset, src_stride,
                               dst, dst_stride, w, h, x_filter, y_filter, bd);
}

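// 2D (horizontal then vertical) 8-tap convolution with averaging: the final
// filtered result is averaged with the existing destination pixels before
// storing.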
void vpx_highbd_convolve8_avg_neon(const uint16_t *src, ptrdiff_t src_stride,
                                   uint16_t *dst, ptrdiff_t dst_stride,
                                   const InterpKernel *filter, int x0_q4,
                                   int x_step_q4, int y0_q4, int y_step_q4,
                                   int w, int h, int bd) {
  if (x_step_q4 != 16 || y_step_q4 != 16) {
    vpx_highbd_convolve8_avg_c(src, src_stride, dst, dst_stride, filter, x0_q4,
                               x_step_q4, y0_q4, y_step_q4, w, h, bd);
    return;
  }

  // Averaging convolution always uses an 8-tap filter.
  const ptrdiff_t horiz_offset = SUBPEL_TAPS / 2 - 1;
  const ptrdiff_t vert_offset = (SUBPEL_TAPS / 2 - 1) * src_stride;
  // Account for needing SUBPEL_TAPS / 2 - 1 lines prior and SUBPEL_TAPS / 2
  // lines post both horizontally and vertically.
  src = src - horiz_offset - vert_offset;

  const int16x8_t x_filter = vld1q_s16(filter[x0_q4]);
  const int16x8_t y_filter = vld1q_s16(filter[y0_q4]);

  if (w == 4) {
    const uint16x4_t max = vdup_n_u16((1 << bd) - 1);
    const int16_t *s = (const int16_t *)src;
    uint16_t *d = dst;

    int16x4_t h_s0[8], h_s1[8], h_s2[8], h_s3[8], h_s4[8], h_s5[8], h_s6[8];
    load_s16_4x8(s + 0 * src_stride, 1, &h_s0[0], &h_s0[1], &h_s0[2], &h_s0[3],
                 &h_s0[4], &h_s0[5], &h_s0[6], &h_s0[7]);
    load_s16_4x8(s + 1 * src_stride, 1, &h_s1[0], &h_s1[1], &h_s1[2], &h_s1[3],
                 &h_s1[4], &h_s1[5], &h_s1[6], &h_s1[7]);
    load_s16_4x8(s + 2 * src_stride, 1, &h_s2[0], &h_s2[1], &h_s2[2], &h_s2[3],
                 &h_s2[4], &h_s2[5], &h_s2[6], &h_s2[7]);
    load_s16_4x8(s + 3 * src_stride, 1, &h_s3[0], &h_s3[1], &h_s3[2], &h_s3[3],
                 &h_s3[4], &h_s3[5], &h_s3[6], &h_s3[7]);
    load_s16_4x8(s + 4 * src_stride, 1, &h_s4[0], &h_s4[1], &h_s4[2], &h_s4[3],
                 &h_s4[4], &h_s4[5], &h_s4[6], &h_s4[7]);
    load_s16_4x8(s + 5 * src_stride, 1, &h_s5[0], &h_s5[1], &h_s5[2], &h_s5[3],
                 &h_s5[4], &h_s5[5], &h_s5[6], &h_s5[7]);
    load_s16_4x8(s + 6 * src_stride, 1, &h_s6[0], &h_s6[1], &h_s6[2], &h_s6[3],
                 &h_s6[4], &h_s6[5], &h_s6[6], &h_s6[7]);

    int16x4_t v_s0 = vreinterpret_s16_u16(
        highbd_convolve8_4(h_s0[0], h_s0[1], h_s0[2], h_s0[3], h_s0[4], h_s0[5],
                           h_s0[6], h_s0[7], x_filter, max));
    int16x4_t v_s1 = vreinterpret_s16_u16(
        highbd_convolve8_4(h_s1[0], h_s1[1], h_s1[2], h_s1[3], h_s1[4], h_s1[5],
                           h_s1[6], h_s1[7], x_filter, max));
    int16x4_t v_s2 = vreinterpret_s16_u16(
        highbd_convolve8_4(h_s2[0], h_s2[1], h_s2[2], h_s2[3], h_s2[4], h_s2[5],
                           h_s2[6], h_s2[7], x_filter, max));
    int16x4_t v_s3 = vreinterpret_s16_u16(
        highbd_convolve8_4(h_s3[0], h_s3[1], h_s3[2], h_s3[3], h_s3[4], h_s3[5],
                           h_s3[6], h_s3[7], x_filter, max));
    int16x4_t v_s4 = vreinterpret_s16_u16(
        highbd_convolve8_4(h_s4[0], h_s4[1], h_s4[2], h_s4[3], h_s4[4], h_s4[5],
                           h_s4[6], h_s4[7], x_filter, max));
    int16x4_t v_s5 = vreinterpret_s16_u16(
        highbd_convolve8_4(h_s5[0], h_s5[1], h_s5[2], h_s5[3], h_s5[4], h_s5[5],
                           h_s5[6], h_s5[7], x_filter, max));
    int16x4_t v_s6 = vreinterpret_s16_u16(
        highbd_convolve8_4(h_s6[0], h_s6[1], h_s6[2], h_s6[3], h_s6[4], h_s6[5],
                           h_s6[6], h_s6[7], x_filter, max));

    s += 7 * src_stride;

    do {
      int16x4_t h_s7[8], h_s8[8], h_s9[8], h_s10[8];
      load_s16_4x8(s + 0 * src_stride, 1, &h_s7[0], &h_s7[1], &h_s7[2],
                   &h_s7[3], &h_s7[4], &h_s7[5], &h_s7[6], &h_s7[7]);
      load_s16_4x8(s + 1 * src_stride, 1, &h_s8[0], &h_s8[1], &h_s8[2],
                   &h_s8[3], &h_s8[4], &h_s8[5], &h_s8[6], &h_s8[7]);
      load_s16_4x8(s + 2 * src_stride, 1, &h_s9[0], &h_s9[1], &h_s9[2],
                   &h_s9[3], &h_s9[4], &h_s9[5], &h_s9[6], &h_s9[7]);
      load_s16_4x8(s + 3 * src_stride, 1, &h_s10[0], &h_s10[1], &h_s10[2],
                   &h_s10[3], &h_s10[4], &h_s10[5], &h_s10[6], &h_s10[7]);

      int16x4_t v_s7 = vreinterpret_s16_u16(
          highbd_convolve8_4(h_s7[0], h_s7[1], h_s7[2], h_s7[3], h_s7[4],
                             h_s7[5], h_s7[6], h_s7[7], x_filter, max));
      int16x4_t v_s8 = vreinterpret_s16_u16(
          highbd_convolve8_4(h_s8[0], h_s8[1], h_s8[2], h_s8[3], h_s8[4],
                             h_s8[5], h_s8[6], h_s8[7], x_filter, max));
      int16x4_t v_s9 = vreinterpret_s16_u16(
          highbd_convolve8_4(h_s9[0], h_s9[1], h_s9[2], h_s9[3], h_s9[4],
                             h_s9[5], h_s9[6], h_s9[7], x_filter, max));
      int16x4_t v_s10 = vreinterpret_s16_u16(
          highbd_convolve8_4(h_s10[0], h_s10[1], h_s10[2], h_s10[3], h_s10[4],
                             h_s10[5], h_s10[6], h_s10[7], x_filter, max));

      uint16x4_t d0 = highbd_convolve8_4(v_s0, v_s1, v_s2, v_s3, v_s4, v_s5,
                                         v_s6, v_s7, y_filter, max);
      uint16x4_t d1 = highbd_convolve8_4(v_s1, v_s2, v_s3, v_s4, v_s5, v_s6,
                                         v_s7, v_s8, y_filter, max);
      uint16x4_t d2 = highbd_convolve8_4(v_s2, v_s3, v_s4, v_s5, v_s6, v_s7,
                                         v_s8, v_s9, y_filter, max);
      uint16x4_t d3 = highbd_convolve8_4(v_s3, v_s4, v_s5, v_s6, v_s7, v_s8,
                                         v_s9, v_s10, y_filter, max);

      d0 = vrhadd_u16(d0, vld1_u16(d + 0 * dst_stride));
      d1 = vrhadd_u16(d1, vld1_u16(d + 1 * dst_stride));
      d2 = vrhadd_u16(d2, vld1_u16(d + 2 * dst_stride));
      d3 = vrhadd_u16(d3, vld1_u16(d + 3 * dst_stride));

      store_u16_4x4(d, dst_stride, d0, d1, d2, d3);

      v_s0 = v_s4;
      v_s1 = v_s5;
      v_s2 = v_s6;
      v_s3 = v_s7;
      v_s4 = v_s8;
      v_s5 = v_s9;
      v_s6 = v_s10;
      s += 4 * src_stride;
      d += 4 * dst_stride;
      h -= 4;
    } while (h != 0);

    return;
  }

  const uint16x8_t max = vdupq_n_u16((1 << bd) - 1);

  do {
    const int16_t *s = (const int16_t *)src;
    uint16_t *d = dst;
    int height = h;

    int16x8_t h_s0[8], h_s1[8], h_s2[8], h_s3[8], h_s4[8], h_s5[8], h_s6[8];
    load_s16_8x8(s + 0 * src_stride, 1, &h_s0[0], &h_s0[1], &h_s0[2], &h_s0[3],
                 &h_s0[4], &h_s0[5], &h_s0[6], &h_s0[7]);
    load_s16_8x8(s + 1 * src_stride, 1, &h_s1[0], &h_s1[1], &h_s1[2], &h_s1[3],
                 &h_s1[4], &h_s1[5], &h_s1[6], &h_s1[7]);
    load_s16_8x8(s + 2 * src_stride, 1, &h_s2[0], &h_s2[1], &h_s2[2], &h_s2[3],
                 &h_s2[4], &h_s2[5], &h_s2[6], &h_s2[7]);
    load_s16_8x8(s + 3 * src_stride, 1, &h_s3[0], &h_s3[1], &h_s3[2], &h_s3[3],
                 &h_s3[4], &h_s3[5], &h_s3[6], &h_s3[7]);
    load_s16_8x8(s + 4 * src_stride, 1, &h_s4[0], &h_s4[1], &h_s4[2], &h_s4[3],
                 &h_s4[4], &h_s4[5], &h_s4[6], &h_s4[7]);
    load_s16_8x8(s + 5 * src_stride, 1, &h_s5[0], &h_s5[1], &h_s5[2], &h_s5[3],
                 &h_s5[4], &h_s5[5], &h_s5[6], &h_s5[7]);
    load_s16_8x8(s + 6 * src_stride, 1, &h_s6[0], &h_s6[1], &h_s6[2], &h_s6[3],
                 &h_s6[4], &h_s6[5], &h_s6[6], &h_s6[7]);

    int16x8_t v_s0 = vreinterpretq_s16_u16(
        highbd_convolve8_8(h_s0[0], h_s0[1], h_s0[2], h_s0[3], h_s0[4], h_s0[5],
                           h_s0[6], h_s0[7], x_filter, max));
    int16x8_t v_s1 = vreinterpretq_s16_u16(
        highbd_convolve8_8(h_s1[0], h_s1[1], h_s1[2], h_s1[3], h_s1[4], h_s1[5],
                           h_s1[6], h_s1[7], x_filter, max));
    int16x8_t v_s2 = vreinterpretq_s16_u16(
        highbd_convolve8_8(h_s2[0], h_s2[1], h_s2[2], h_s2[3], h_s2[4], h_s2[5],
                           h_s2[6], h_s2[7], x_filter, max));
    int16x8_t v_s3 = vreinterpretq_s16_u16(
        highbd_convolve8_8(h_s3[0], h_s3[1], h_s3[2], h_s3[3], h_s3[4], h_s3[5],
                           h_s3[6], h_s3[7], x_filter, max));
    int16x8_t v_s4 = vreinterpretq_s16_u16(
        highbd_convolve8_8(h_s4[0], h_s4[1], h_s4[2], h_s4[3], h_s4[4], h_s4[5],
                           h_s4[6], h_s4[7], x_filter, max));
    int16x8_t v_s5 = vreinterpretq_s16_u16(
        highbd_convolve8_8(h_s5[0], h_s5[1], h_s5[2], h_s5[3], h_s5[4], h_s5[5],
                           h_s5[6], h_s5[7], x_filter, max));
    int16x8_t v_s6 = vreinterpretq_s16_u16(
        highbd_convolve8_8(h_s6[0], h_s6[1], h_s6[2], h_s6[3], h_s6[4], h_s6[5],
                           h_s6[6], h_s6[7], x_filter, max));

    s += 7 * src_stride;

    do {
      int16x8_t h_s7[8], h_s8[8], h_s9[8], h_s10[8];
      load_s16_8x8(s + 0 * src_stride, 1, &h_s7[0], &h_s7[1], &h_s7[2],
                   &h_s7[3], &h_s7[4], &h_s7[5], &h_s7[6], &h_s7[7]);
      load_s16_8x8(s + 1 * src_stride, 1, &h_s8[0], &h_s8[1], &h_s8[2],
                   &h_s8[3], &h_s8[4], &h_s8[5], &h_s8[6], &h_s8[7]);
      load_s16_8x8(s + 2 * src_stride, 1, &h_s9[0], &h_s9[1], &h_s9[2],
                   &h_s9[3], &h_s9[4], &h_s9[5], &h_s9[6], &h_s9[7]);
      load_s16_8x8(s + 3 * src_stride, 1, &h_s10[0], &h_s10[1], &h_s10[2],
                   &h_s10[3], &h_s10[4], &h_s10[5], &h_s10[6], &h_s10[7]);

      int16x8_t v_s7 = vreinterpretq_s16_u16(
          highbd_convolve8_8(h_s7[0], h_s7[1], h_s7[2], h_s7[3], h_s7[4],
                             h_s7[5], h_s7[6], h_s7[7], x_filter, max));
      int16x8_t v_s8 = vreinterpretq_s16_u16(
          highbd_convolve8_8(h_s8[0], h_s8[1], h_s8[2], h_s8[3], h_s8[4],
                             h_s8[5], h_s8[6], h_s8[7], x_filter, max));
      int16x8_t v_s9 = vreinterpretq_s16_u16(
          highbd_convolve8_8(h_s9[0], h_s9[1], h_s9[2], h_s9[3], h_s9[4],
                             h_s9[5], h_s9[6], h_s9[7], x_filter, max));
      int16x8_t v_s10 = vreinterpretq_s16_u16(
          highbd_convolve8_8(h_s10[0], h_s10[1], h_s10[2], h_s10[3], h_s10[4],
                             h_s10[5], h_s10[6], h_s10[7], x_filter, max));

      uint16x8_t d0 = highbd_convolve8_8(v_s0, v_s1, v_s2, v_s3, v_s4, v_s5,
                                         v_s6, v_s7, y_filter, max);
      uint16x8_t d1 = highbd_convolve8_8(v_s1, v_s2, v_s3, v_s4, v_s5, v_s6,
                                         v_s7, v_s8, y_filter, max);
      uint16x8_t d2 = highbd_convolve8_8(v_s2, v_s3, v_s4, v_s5, v_s6, v_s7,
                                         v_s8, v_s9, y_filter, max);
      uint16x8_t d3 = highbd_convolve8_8(v_s3, v_s4, v_s5, v_s6, v_s7, v_s8,
                                         v_s9, v_s10, y_filter, max);

      d0 = vrhaddq_u16(d0, vld1q_u16(d + 0 * dst_stride));
      d1 = vrhaddq_u16(d1, vld1q_u16(d + 1 * dst_stride));
      d2 = vrhaddq_u16(d2, vld1q_u16(d + 2 * dst_stride));
      d3 = vrhaddq_u16(d3, vld1q_u16(d + 3 * dst_stride));

      store_u16_8x4(d, dst_stride, d0, d1, d2, d3);

      v_s0 = v_s4;
      v_s1 = v_s5;
      v_s2 = v_s6;
      v_s3 = v_s7;
      v_s4 = v_s8;
      v_s5 = v_s9;
      v_s6 = v_s10;
      s += 4 * src_stride;
      d += 4 * dst_stride;
      height -= 4;
    } while (height != 0);
    src += 8;
    dst += 8;
    w -= 8;
  } while (w != 0);
}