/*
 *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include <arm_neon.h>
#include <assert.h>

#include "./vpx_config.h"
#include "./vpx_dsp_rtcd.h"
#include "vpx/vpx_integer.h"
#include "vpx_dsp/arm/mem_neon.h"
#include "vpx_dsp/arm/transpose_neon.h"
#include "vpx_dsp/arm/vpx_convolve8_neon.h"
#include "vpx_dsp/vpx_filter.h"
#include "vpx_ports/mem.h"

static INLINE void convolve_4tap_horiz_neon(const uint8_t *src,
                                            ptrdiff_t src_stride, uint8_t *dst,
                                            ptrdiff_t dst_stride, int w, int h,
                                            const int16x8_t filter) {
  // 4-tap and bilinear filter values are even, so halve them to reduce
  // intermediate precision requirements.
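  // For instance, the half-pel bilinear kernel { 0, 0, 0, 64, 64, 0, 0, 0 }
  // halves to { 0, 0, 0, 32, 32, 0, 0, 0 }; convolve4_8 is assumed to
  // compensate with a correspondingly smaller final shift.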
  const uint8x8_t x_filter =
      vshrn_n_u16(vreinterpretq_u16_s16(vabsq_s16(filter)), 1);

  // Neon does not have lane-referencing multiply or multiply-accumulate
  // instructions that operate on vectors of 8-bit elements. This means we have
  // to duplicate filter taps into a whole vector and use standard multiply /
  // multiply-accumulate instructions.
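  // Only taps 2-5 of the 8-entry kernel are referenced: the 4-tap and bilinear
  // kernels keep their non-zero coefficients in the middle of the kernel,
  // which is also why the caller starts the convolution at src - 1 rather
  // than src - 3.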
  const uint8x8_t filter_taps[4] = { vdup_lane_u8(x_filter, 2),
                                     vdup_lane_u8(x_filter, 3),
                                     vdup_lane_u8(x_filter, 4),
                                     vdup_lane_u8(x_filter, 5) };

  if (w == 4) {
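    // Process two rows per iteration: each 8-lane vector packs four pixels
    // from two consecutive rows (loaded with the row stride).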
    do {
      uint8x8_t s01[4];

      s01[0] = load_unaligned_u8(src + 0, src_stride);
      s01[1] = load_unaligned_u8(src + 1, src_stride);
      s01[2] = load_unaligned_u8(src + 2, src_stride);
      s01[3] = load_unaligned_u8(src + 3, src_stride);

      uint8x8_t d01 = convolve4_8(s01[0], s01[1], s01[2], s01[3], filter_taps);

      store_unaligned_u8(dst, dst_stride, d01);

      src += 2 * src_stride;
      dst += 2 * dst_stride;
      h -= 2;
    } while (h > 0);
  } else {
    do {
      const uint8_t *s = src;
      uint8_t *d = dst;
      int width = w;

      do {
        uint8x8_t s0[4], s1[4];

        s0[0] = vld1_u8(s + 0);
        s0[1] = vld1_u8(s + 1);
        s0[2] = vld1_u8(s + 2);
        s0[3] = vld1_u8(s + 3);

        s1[0] = vld1_u8(s + src_stride + 0);
        s1[1] = vld1_u8(s + src_stride + 1);
        s1[2] = vld1_u8(s + src_stride + 2);
        s1[3] = vld1_u8(s + src_stride + 3);

        uint8x8_t d0 = convolve4_8(s0[0], s0[1], s0[2], s0[3], filter_taps);
        uint8x8_t d1 = convolve4_8(s1[0], s1[1], s1[2], s1[3], filter_taps);

        vst1_u8(d, d0);
        vst1_u8(d + dst_stride, d1);
        s += 8;
        d += 8;
        width -= 8;
      } while (width != 0);
      src += 2 * src_stride;
      dst += 2 * dst_stride;
      h -= 2;
    } while (h > 0);
  }
}

static INLINE void convolve_8tap_horiz_neon(const uint8_t *src,
                                            ptrdiff_t src_stride, uint8_t *dst,
                                            ptrdiff_t dst_stride, int w, int h,
                                            const int16x8_t filter) {
  if (h == 4) {
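    // Transpose the four input rows into columns so the horizontal filter can
    // be applied with whole-vector multiplies, just as in the vertical
    // convolution; each int16x4_t below holds one column of four rows.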
    uint8x8_t t0, t1, t2, t3;
    load_u8_8x4(src, src_stride, &t0, &t1, &t2, &t3);

    transpose_u8_8x4(&t0, &t1, &t2, &t3);
    int16x4_t s0 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
    int16x4_t s1 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
    int16x4_t s2 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t2)));
    int16x4_t s3 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t3)));
    int16x4_t s4 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
    int16x4_t s5 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
    int16x4_t s6 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t2)));

    src += 7;

    do {
      uint8x8_t t7, t8, t9, t10;
      load_u8_8x4(src, src_stride, &t7, &t8, &t9, &t10);

      transpose_u8_8x4(&t7, &t8, &t9, &t10);
      int16x4_t s7 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t7)));
      int16x4_t s8 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t8)));
      int16x4_t s9 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t9)));
      int16x4_t s10 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t10)));

      int16x4_t d0 = convolve8_4(s0, s1, s2, s3, s4, s5, s6, s7, filter);
      int16x4_t d1 = convolve8_4(s1, s2, s3, s4, s5, s6, s7, s8, filter);
      int16x4_t d2 = convolve8_4(s2, s3, s4, s5, s6, s7, s8, s9, filter);
      int16x4_t d3 = convolve8_4(s3, s4, s5, s6, s7, s8, s9, s10, filter);
      uint8x8_t d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS);
      uint8x8_t d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS);

      transpose_u8_4x4(&d01, &d23);

      store_u8(dst + 0 * dst_stride, 2 * dst_stride, d01);
      store_u8(dst + 1 * dst_stride, 2 * dst_stride, d23);

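      // Slide the window of input columns along by four for the next four
      // output columns.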
      s0 = s4;
      s1 = s5;
      s2 = s6;
      s3 = s7;
      s4 = s8;
      s5 = s9;
      s6 = s10;
      src += 4;
      dst += 4;
      w -= 4;
    } while (w != 0);
  } else {
    if (w == 4) {
      do {
        uint8x8_t t0, t1, t2, t3, t4, t5, t6, t7;
        load_u8_8x8(src, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);

        transpose_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
        int16x8_t s0 = vreinterpretq_s16_u16(vmovl_u8(t0));
        int16x8_t s1 = vreinterpretq_s16_u16(vmovl_u8(t1));
        int16x8_t s2 = vreinterpretq_s16_u16(vmovl_u8(t2));
        int16x8_t s3 = vreinterpretq_s16_u16(vmovl_u8(t3));
        int16x8_t s4 = vreinterpretq_s16_u16(vmovl_u8(t4));
        int16x8_t s5 = vreinterpretq_s16_u16(vmovl_u8(t5));
        int16x8_t s6 = vreinterpretq_s16_u16(vmovl_u8(t6));

        load_u8_8x8(src + 7, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6,
                    &t7);

        transpose_u8_4x8(&t0, &t1, &t2, &t3, t4, t5, t6, t7);
        int16x8_t s7 = vreinterpretq_s16_u16(vmovl_u8(t0));
        int16x8_t s8 = vreinterpretq_s16_u16(vmovl_u8(t1));
        int16x8_t s9 = vreinterpretq_s16_u16(vmovl_u8(t2));
        int16x8_t s10 = vreinterpretq_s16_u16(vmovl_u8(t3));

        uint8x8_t d04 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filter);
        uint8x8_t d15 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filter);
        uint8x8_t d26 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filter);
        uint8x8_t d37 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filter);

        transpose_u8_8x4(&d04, &d15, &d26, &d37);

        store_u8(dst + 0 * dst_stride, 4 * dst_stride, d04);
        store_u8(dst + 1 * dst_stride, 4 * dst_stride, d15);
        store_u8(dst + 2 * dst_stride, 4 * dst_stride, d26);
        store_u8(dst + 3 * dst_stride, 4 * dst_stride, d37);

        src += 8 * src_stride;
        dst += 8 * dst_stride;
        h -= 8;
      } while (h > 0);
    } else {
      do {
        uint8x8_t t0, t1, t2, t3, t4, t5, t6, t7;
        load_u8_8x8(src, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);

        transpose_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
        int16x8_t s0 = vreinterpretq_s16_u16(vmovl_u8(t0));
        int16x8_t s1 = vreinterpretq_s16_u16(vmovl_u8(t1));
        int16x8_t s2 = vreinterpretq_s16_u16(vmovl_u8(t2));
        int16x8_t s3 = vreinterpretq_s16_u16(vmovl_u8(t3));
        int16x8_t s4 = vreinterpretq_s16_u16(vmovl_u8(t4));
        int16x8_t s5 = vreinterpretq_s16_u16(vmovl_u8(t5));
        int16x8_t s6 = vreinterpretq_s16_u16(vmovl_u8(t6));

        const uint8_t *s = src + 7;
        uint8_t *d = dst;
        int width = w;

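        // Each inner iteration filters an 8x8 tile: load and transpose eight
        // new input columns, convolve, transpose back and store eight rows.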
        do {
          uint8x8_t t8, t9, t10, t11, t12, t13, t14, t15;
          load_u8_8x8(s, src_stride, &t8, &t9, &t10, &t11, &t12, &t13, &t14,
                      &t15);

          transpose_u8_8x8(&t8, &t9, &t10, &t11, &t12, &t13, &t14, &t15);
          int16x8_t s7 = vreinterpretq_s16_u16(vmovl_u8(t8));
          int16x8_t s8 = vreinterpretq_s16_u16(vmovl_u8(t9));
          int16x8_t s9 = vreinterpretq_s16_u16(vmovl_u8(t10));
          int16x8_t s10 = vreinterpretq_s16_u16(vmovl_u8(t11));
          int16x8_t s11 = vreinterpretq_s16_u16(vmovl_u8(t12));
          int16x8_t s12 = vreinterpretq_s16_u16(vmovl_u8(t13));
          int16x8_t s13 = vreinterpretq_s16_u16(vmovl_u8(t14));
          int16x8_t s14 = vreinterpretq_s16_u16(vmovl_u8(t15));

          uint8x8_t d0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filter);
          uint8x8_t d1 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filter);
          uint8x8_t d2 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filter);
          uint8x8_t d3 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filter);
          uint8x8_t d4 = convolve8_8(s4, s5, s6, s7, s8, s9, s10, s11, filter);
          uint8x8_t d5 = convolve8_8(s5, s6, s7, s8, s9, s10, s11, s12, filter);
          uint8x8_t d6 =
              convolve8_8(s6, s7, s8, s9, s10, s11, s12, s13, filter);
          uint8x8_t d7 =
              convolve8_8(s7, s8, s9, s10, s11, s12, s13, s14, filter);

          transpose_u8_8x8(&d0, &d1, &d2, &d3, &d4, &d5, &d6, &d7);

          store_u8_8x8(d, dst_stride, d0, d1, d2, d3, d4, d5, d6, d7);

          s0 = s8;
          s1 = s9;
          s2 = s10;
          s3 = s11;
          s4 = s12;
          s5 = s13;
          s6 = s14;
          s += 8;
          d += 8;
          width -= 8;
        } while (width != 0);
        src += 8 * src_stride;
        dst += 8 * dst_stride;
        h -= 8;
      } while (h > 0);
    }
  }
}

void vpx_convolve8_horiz_neon(const uint8_t *src, ptrdiff_t src_stride,
                              uint8_t *dst, ptrdiff_t dst_stride,
                              const InterpKernel *filter, int x0_q4,
                              int x_step_q4, int y0_q4, int y_step_q4, int w,
                              int h) {
  assert((intptr_t)dst % 4 == 0);
  assert(dst_stride % 4 == 0);
  assert(x_step_q4 == 16);

  (void)x_step_q4;
  (void)y0_q4;
  (void)y_step_q4;

  const int16x8_t x_filter = vld1q_s16(filter[x0_q4]);

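  // The address offset accounts for the filter's left context: an 8-tap
  // kernel needs 3 pixels to the left of the output position, while the
  // 4-tap kernels (taps stored at indices 2-5) need only 1.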
  if (vpx_get_filter_taps(filter[x0_q4]) <= 4) {
    convolve_4tap_horiz_neon(src - 1, src_stride, dst, dst_stride, w, h,
                             x_filter);
  } else {
    convolve_8tap_horiz_neon(src - 3, src_stride, dst, dst_stride, w, h,
                             x_filter);
  }
}

void vpx_convolve8_avg_horiz_neon(const uint8_t *src, ptrdiff_t src_stride,
                                  uint8_t *dst, ptrdiff_t dst_stride,
                                  const InterpKernel *filter, int x0_q4,
                                  int x_step_q4, int y0_q4, int y_step_q4,
                                  int w, int h) {
  const int16x8_t filters = vld1q_s16(filter[x0_q4]);

  assert((intptr_t)dst % 4 == 0);
  assert(dst_stride % 4 == 0);
  assert(x_step_q4 == 16);

  (void)x_step_q4;
  (void)y0_q4;
  (void)y_step_q4;

  src -= 3;

  if (h == 4) {
    uint8x8_t t0, t1, t2, t3;
    load_u8_8x4(src, src_stride, &t0, &t1, &t2, &t3);

    transpose_u8_8x4(&t0, &t1, &t2, &t3);
    int16x4_t s0 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
    int16x4_t s1 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
    int16x4_t s2 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t2)));
    int16x4_t s3 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t3)));
    int16x4_t s4 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
    int16x4_t s5 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
    int16x4_t s6 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t2)));

    src += 7;

    do {
      uint8x8_t t7, t8, t9, t10;
      load_u8_8x4(src, src_stride, &t7, &t8, &t9, &t10);

      transpose_u8_8x4(&t7, &t8, &t9, &t10);
      int16x4_t s7 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t7)));
      int16x4_t s8 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t8)));
      int16x4_t s9 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t9)));
      int16x4_t s10 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t10)));

      int16x4_t d0 = convolve8_4(s0, s1, s2, s3, s4, s5, s6, s7, filters);
      int16x4_t d1 = convolve8_4(s1, s2, s3, s4, s5, s6, s7, s8, filters);
      int16x4_t d2 = convolve8_4(s2, s3, s4, s5, s6, s7, s8, s9, filters);
      int16x4_t d3 = convolve8_4(s3, s4, s5, s6, s7, s8, s9, s10, filters);
      uint8x8_t d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS);
      uint8x8_t d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS);

      transpose_u8_4x4(&d01, &d23);

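      // Average the filtered result with the pixels already in dst using a
      // rounding halving add, as required by the "avg" variants.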
      uint8x8_t dd01 = load_u8(dst + 0 * dst_stride, 2 * dst_stride);
      uint8x8_t dd23 = load_u8(dst + 1 * dst_stride, 2 * dst_stride);

      d01 = vrhadd_u8(d01, dd01);
      d23 = vrhadd_u8(d23, dd23);

      store_u8(dst + 0 * dst_stride, 2 * dst_stride, d01);
      store_u8(dst + 1 * dst_stride, 2 * dst_stride, d23);

      s0 = s4;
      s1 = s5;
      s2 = s6;
      s3 = s7;
      s4 = s8;
      s5 = s9;
      s6 = s10;
      src += 4;
      dst += 4;
      w -= 4;
    } while (w != 0);
  } else {
    if (w == 4) {
      do {
        uint8x8_t t0, t1, t2, t3, t4, t5, t6, t7;
        load_u8_8x8(src, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);

        transpose_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
        int16x8_t s0 = vreinterpretq_s16_u16(vmovl_u8(t0));
        int16x8_t s1 = vreinterpretq_s16_u16(vmovl_u8(t1));
        int16x8_t s2 = vreinterpretq_s16_u16(vmovl_u8(t2));
        int16x8_t s3 = vreinterpretq_s16_u16(vmovl_u8(t3));
        int16x8_t s4 = vreinterpretq_s16_u16(vmovl_u8(t4));
        int16x8_t s5 = vreinterpretq_s16_u16(vmovl_u8(t5));
        int16x8_t s6 = vreinterpretq_s16_u16(vmovl_u8(t6));

        load_u8_8x8(src + 7, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6,
                    &t7);

        transpose_u8_4x8(&t0, &t1, &t2, &t3, t4, t5, t6, t7);
        int16x8_t s7 = vreinterpretq_s16_u16(vmovl_u8(t0));
        int16x8_t s8 = vreinterpretq_s16_u16(vmovl_u8(t1));
        int16x8_t s9 = vreinterpretq_s16_u16(vmovl_u8(t2));
        int16x8_t s10 = vreinterpretq_s16_u16(vmovl_u8(t3));

        uint8x8_t d04 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters);
        uint8x8_t d15 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters);
        uint8x8_t d26 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters);
        uint8x8_t d37 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters);

        transpose_u8_8x4(&d04, &d15, &d26, &d37);

        uint8x8_t dd04 = load_u8(dst + 0 * dst_stride, 4 * dst_stride);
        uint8x8_t dd15 = load_u8(dst + 1 * dst_stride, 4 * dst_stride);
        uint8x8_t dd26 = load_u8(dst + 2 * dst_stride, 4 * dst_stride);
        uint8x8_t dd37 = load_u8(dst + 3 * dst_stride, 4 * dst_stride);

        d04 = vrhadd_u8(d04, dd04);
        d15 = vrhadd_u8(d15, dd15);
        d26 = vrhadd_u8(d26, dd26);
        d37 = vrhadd_u8(d37, dd37);

        store_u8(dst + 0 * dst_stride, 4 * dst_stride, d04);
        store_u8(dst + 1 * dst_stride, 4 * dst_stride, d15);
        store_u8(dst + 2 * dst_stride, 4 * dst_stride, d26);
        store_u8(dst + 3 * dst_stride, 4 * dst_stride, d37);

        src += 8 * src_stride;
        dst += 8 * dst_stride;
        h -= 8;
      } while (h != 0);
    } else {
      do {
        uint8x8_t t0, t1, t2, t3, t4, t5, t6, t7;
        load_u8_8x8(src, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);

        transpose_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
        int16x8_t s0 = vreinterpretq_s16_u16(vmovl_u8(t0));
        int16x8_t s1 = vreinterpretq_s16_u16(vmovl_u8(t1));
        int16x8_t s2 = vreinterpretq_s16_u16(vmovl_u8(t2));
        int16x8_t s3 = vreinterpretq_s16_u16(vmovl_u8(t3));
        int16x8_t s4 = vreinterpretq_s16_u16(vmovl_u8(t4));
        int16x8_t s5 = vreinterpretq_s16_u16(vmovl_u8(t5));
        int16x8_t s6 = vreinterpretq_s16_u16(vmovl_u8(t6));

        const uint8_t *s = src + 7;
        uint8_t *d = dst;
        int width = w;

        do {
          uint8x8_t t8, t9, t10, t11, t12, t13, t14, t15;
          load_u8_8x8(s, src_stride, &t8, &t9, &t10, &t11, &t12, &t13, &t14,
                      &t15);

          transpose_u8_8x8(&t8, &t9, &t10, &t11, &t12, &t13, &t14, &t15);
          int16x8_t s7 = vreinterpretq_s16_u16(vmovl_u8(t8));
          int16x8_t s8 = vreinterpretq_s16_u16(vmovl_u8(t9));
          int16x8_t s9 = vreinterpretq_s16_u16(vmovl_u8(t10));
          int16x8_t s10 = vreinterpretq_s16_u16(vmovl_u8(t11));
          int16x8_t s11 = vreinterpretq_s16_u16(vmovl_u8(t12));
          int16x8_t s12 = vreinterpretq_s16_u16(vmovl_u8(t13));
          int16x8_t s13 = vreinterpretq_s16_u16(vmovl_u8(t14));
          int16x8_t s14 = vreinterpretq_s16_u16(vmovl_u8(t15));

          uint8x8_t d0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters);
          uint8x8_t d1 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters);
          uint8x8_t d2 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters);
          uint8x8_t d3 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters);
          uint8x8_t d4 = convolve8_8(s4, s5, s6, s7, s8, s9, s10, s11, filters);
          uint8x8_t d5 =
              convolve8_8(s5, s6, s7, s8, s9, s10, s11, s12, filters);
          uint8x8_t d6 =
              convolve8_8(s6, s7, s8, s9, s10, s11, s12, s13, filters);
          uint8x8_t d7 =
              convolve8_8(s7, s8, s9, s10, s11, s12, s13, s14, filters);

          transpose_u8_8x8(&d0, &d1, &d2, &d3, &d4, &d5, &d6, &d7);

          d0 = vrhadd_u8(d0, vld1_u8(d + 0 * dst_stride));
          d1 = vrhadd_u8(d1, vld1_u8(d + 1 * dst_stride));
          d2 = vrhadd_u8(d2, vld1_u8(d + 2 * dst_stride));
          d3 = vrhadd_u8(d3, vld1_u8(d + 3 * dst_stride));
          d4 = vrhadd_u8(d4, vld1_u8(d + 4 * dst_stride));
          d5 = vrhadd_u8(d5, vld1_u8(d + 5 * dst_stride));
          d6 = vrhadd_u8(d6, vld1_u8(d + 6 * dst_stride));
          d7 = vrhadd_u8(d7, vld1_u8(d + 7 * dst_stride));

          store_u8_8x8(d, dst_stride, d0, d1, d2, d3, d4, d5, d6, d7);

          s0 = s8;
          s1 = s9;
          s2 = s10;
          s3 = s11;
          s4 = s12;
          s5 = s13;
          s6 = s14;
          s += 8;
          d += 8;
          width -= 8;
        } while (width != 0);
        src += 8 * src_stride;
        dst += 8 * dst_stride;
        h -= 8;
      } while (h != 0);
    }
  }
}

static INLINE void convolve_8tap_vert_neon(const uint8_t *src,
                                           ptrdiff_t src_stride, uint8_t *dst,
                                           ptrdiff_t dst_stride, int w, int h,
                                           const int16x8_t filter) {
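  // Load the 7 rows of context above the first output row before entering the
  // loop; each iteration then loads 4 new rows and produces 4 output rows.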
  if (w == 4) {
    uint8x8_t t0, t1, t2, t3, t4, t5, t6;
    load_u8_8x7(src, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6);
    int16x4_t s0 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t0)));
    int16x4_t s1 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t1)));
    int16x4_t s2 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t2)));
    int16x4_t s3 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t3)));
    int16x4_t s4 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t4)));
    int16x4_t s5 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t5)));
    int16x4_t s6 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t6)));

    src += 7 * src_stride;

    do {
      uint8x8_t t7, t8, t9, t10;
      load_u8_8x4(src, src_stride, &t7, &t8, &t9, &t10);
      int16x4_t s7 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t7)));
      int16x4_t s8 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t8)));
      int16x4_t s9 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t9)));
      int16x4_t s10 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t10)));

      int16x4_t d0 = convolve8_4(s0, s1, s2, s3, s4, s5, s6, s7, filter);
      int16x4_t d1 = convolve8_4(s1, s2, s3, s4, s5, s6, s7, s8, filter);
      int16x4_t d2 = convolve8_4(s2, s3, s4, s5, s6, s7, s8, s9, filter);
      int16x4_t d3 = convolve8_4(s3, s4, s5, s6, s7, s8, s9, s10, filter);
      uint8x8_t d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS);
      uint8x8_t d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS);

      store_u8(dst + 0 * dst_stride, dst_stride, d01);
      store_u8(dst + 2 * dst_stride, dst_stride, d23);

      s0 = s4;
      s1 = s5;
      s2 = s6;
      s3 = s7;
      s4 = s8;
      s5 = s9;
      s6 = s10;
      src += 4 * src_stride;
      dst += 4 * dst_stride;
      h -= 4;
    } while (h != 0);
  } else {
    do {
      uint8x8_t t0, t1, t2, t3, t4, t5, t6;
      load_u8_8x7(src, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6);
      int16x8_t s0 = vreinterpretq_s16_u16(vmovl_u8(t0));
      int16x8_t s1 = vreinterpretq_s16_u16(vmovl_u8(t1));
      int16x8_t s2 = vreinterpretq_s16_u16(vmovl_u8(t2));
      int16x8_t s3 = vreinterpretq_s16_u16(vmovl_u8(t3));
      int16x8_t s4 = vreinterpretq_s16_u16(vmovl_u8(t4));
      int16x8_t s5 = vreinterpretq_s16_u16(vmovl_u8(t5));
      int16x8_t s6 = vreinterpretq_s16_u16(vmovl_u8(t6));

      const uint8_t *s = src + 7 * src_stride;
      uint8_t *d = dst;
      int height = h;

      do {
        uint8x8_t t7, t8, t9, t10;
        load_u8_8x4(s, src_stride, &t7, &t8, &t9, &t10);
        int16x8_t s7 = vreinterpretq_s16_u16(vmovl_u8(t7));
        int16x8_t s8 = vreinterpretq_s16_u16(vmovl_u8(t8));
        int16x8_t s9 = vreinterpretq_s16_u16(vmovl_u8(t9));
        int16x8_t s10 = vreinterpretq_s16_u16(vmovl_u8(t10));

        uint8x8_t d0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filter);
        uint8x8_t d1 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filter);
        uint8x8_t d2 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filter);
        uint8x8_t d3 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filter);

        store_u8_8x4(d, dst_stride, d0, d1, d2, d3);

        s0 = s4;
        s1 = s5;
        s2 = s6;
        s3 = s7;
        s4 = s8;
        s5 = s9;
        s6 = s10;
        s += 4 * src_stride;
        d += 4 * dst_stride;
        height -= 4;
      } while (height != 0);
      src += 8;
      dst += 8;
      w -= 8;
    } while (w != 0);
  }
}

void vpx_convolve8_vert_neon(const uint8_t *src, ptrdiff_t src_stride,
                             uint8_t *dst, ptrdiff_t dst_stride,
                             const InterpKernel *filter, int x0_q4,
                             int x_step_q4, int y0_q4, int y_step_q4, int w,
                             int h) {
  assert((intptr_t)dst % 4 == 0);
  assert(dst_stride % 4 == 0);
  assert(y_step_q4 == 16);

  (void)x0_q4;
  (void)x_step_q4;
  (void)y_step_q4;

  const int16x8_t y_filter = vld1q_s16(filter[y0_q4]);

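  // Step back by the number of rows of context the filter needs above the
  // output row: 1 for the 4-tap kernels, 3 for the 8-tap kernels.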
  if (vpx_get_filter_taps(filter[y0_q4]) <= 4) {
    convolve_4tap_vert_neon(src - src_stride, src_stride, dst, dst_stride, w, h,
                            y_filter);
  } else {
    convolve_8tap_vert_neon(src - 3 * src_stride, src_stride, dst, dst_stride,
                            w, h, y_filter);
  }
}

void vpx_convolve8_avg_vert_neon(const uint8_t *src, ptrdiff_t src_stride,
                                 uint8_t *dst, ptrdiff_t dst_stride,
                                 const InterpKernel *filter, int x0_q4,
                                 int x_step_q4, int y0_q4, int y_step_q4, int w,
                                 int h) {
  const int16x8_t filters = vld1q_s16(filter[y0_q4]);

  assert((intptr_t)dst % 4 == 0);
  assert(dst_stride % 4 == 0);
  assert(y_step_q4 == 16);

  (void)x0_q4;
  (void)x_step_q4;
  (void)y_step_q4;

  src -= 3 * src_stride;

  if (w == 4) {
    uint8x8_t t0, t1, t2, t3, t4, t5, t6;
    load_u8_8x7(src, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6);
    int16x4_t s0 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t0)));
    int16x4_t s1 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t1)));
    int16x4_t s2 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t2)));
    int16x4_t s3 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t3)));
    int16x4_t s4 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t4)));
    int16x4_t s5 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t5)));
    int16x4_t s6 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t6)));

    src += 7 * src_stride;

    do {
      uint8x8_t t7, t8, t9, t10;
      load_u8_8x4(src, src_stride, &t7, &t8, &t9, &t10);
      int16x4_t s7 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t7)));
      int16x4_t s8 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t8)));
      int16x4_t s9 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t9)));
      int16x4_t s10 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t10)));

      int16x4_t d0 = convolve8_4(s0, s1, s2, s3, s4, s5, s6, s7, filters);
      int16x4_t d1 = convolve8_4(s1, s2, s3, s4, s5, s6, s7, s8, filters);
      int16x4_t d2 = convolve8_4(s2, s3, s4, s5, s6, s7, s8, s9, filters);
      int16x4_t d3 = convolve8_4(s3, s4, s5, s6, s7, s8, s9, s10, filters);
      uint8x8_t d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS);
      uint8x8_t d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS);

      uint8x8_t dd01 = load_u8(dst + 0 * dst_stride, dst_stride);
      uint8x8_t dd23 = load_u8(dst + 2 * dst_stride, dst_stride);

      d01 = vrhadd_u8(d01, dd01);
      d23 = vrhadd_u8(d23, dd23);

      store_u8(dst + 0 * dst_stride, dst_stride, d01);
      store_u8(dst + 2 * dst_stride, dst_stride, d23);

      s0 = s4;
      s1 = s5;
      s2 = s6;
      s3 = s7;
      s4 = s8;
      s5 = s9;
      s6 = s10;
      src += 4 * src_stride;
      dst += 4 * dst_stride;
      h -= 4;
    } while (h != 0);
  } else {
    do {
      uint8x8_t t0, t1, t2, t3, t4, t5, t6;
      load_u8_8x7(src, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6);
      int16x8_t s0 = vreinterpretq_s16_u16(vmovl_u8(t0));
      int16x8_t s1 = vreinterpretq_s16_u16(vmovl_u8(t1));
      int16x8_t s2 = vreinterpretq_s16_u16(vmovl_u8(t2));
      int16x8_t s3 = vreinterpretq_s16_u16(vmovl_u8(t3));
      int16x8_t s4 = vreinterpretq_s16_u16(vmovl_u8(t4));
      int16x8_t s5 = vreinterpretq_s16_u16(vmovl_u8(t5));
      int16x8_t s6 = vreinterpretq_s16_u16(vmovl_u8(t6));

      const uint8_t *s = src + 7 * src_stride;
      uint8_t *d = dst;
      int height = h;

      do {
        uint8x8_t t7, t8, t9, t10;
        load_u8_8x4(s, src_stride, &t7, &t8, &t9, &t10);
        int16x8_t s7 = vreinterpretq_s16_u16(vmovl_u8(t7));
        int16x8_t s8 = vreinterpretq_s16_u16(vmovl_u8(t8));
        int16x8_t s9 = vreinterpretq_s16_u16(vmovl_u8(t9));
        int16x8_t s10 = vreinterpretq_s16_u16(vmovl_u8(t10));

        uint8x8_t d0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters);
        uint8x8_t d1 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters);
        uint8x8_t d2 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters);
        uint8x8_t d3 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters);

        d0 = vrhadd_u8(d0, vld1_u8(d + 0 * dst_stride));
        d1 = vrhadd_u8(d1, vld1_u8(d + 1 * dst_stride));
        d2 = vrhadd_u8(d2, vld1_u8(d + 2 * dst_stride));
        d3 = vrhadd_u8(d3, vld1_u8(d + 3 * dst_stride));

        store_u8_8x4(d, dst_stride, d0, d1, d2, d3);

        s0 = s4;
        s1 = s5;
        s2 = s6;
        s3 = s7;
        s4 = s8;
        s5 = s9;
        s6 = s10;
        height -= 4;
        s += 4 * src_stride;
        d += 4 * dst_stride;
      } while (height != 0);
      src += 8;
      dst += 8;
      w -= 8;
    } while (w != 0);
  }
}