/*
 *  Copyright (c) 2021 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include <arm_neon.h>
#include <assert.h>

#include "./vpx_config.h"
#include "./vpx_dsp_rtcd.h"
#include "vpx/vpx_integer.h"
#include "vpx_dsp/arm/mem_neon.h"
#include "vpx_dsp/arm/transpose_neon.h"
#include "vpx_dsp/arm/vpx_convolve8_neon.h"
#include "vpx_dsp/vpx_filter.h"
#include "vpx_ports/mem.h"

// Filter values always sum to 128.
#define FILTER_SUM 128

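// Permute indices for the horizontal convolution kernels. Each 16-byte row
// of the table gathers four overlapping 4-sample windows from a source
// vector, so a single vdotq_lane_s32 produces four adjacent output pixels.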
DECLARE_ALIGNED(16, static const uint8_t, dot_prod_permute_tbl[48]) = {
  0, 1, 2,  3,  1, 2,  3,  4,  2,  3,  4,  5,  3,  4,  5,  6,
  4, 5, 6,  7,  5, 6,  7,  8,  6,  7,  8,  9,  7,  8,  9,  10,
  8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14
};

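// Used with vqtbl2q_s8: indices 0-15 select bytes from the first input
// vector of the two-vector table lookup, indices 16-31 from the second.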
DECLARE_ALIGNED(16, static const uint8_t, dot_prod_merge_block_tbl[48]) = {
  // Shift left and insert new last column in transposed 4x4 block.
  1, 2, 3, 16, 5, 6, 7, 20, 9, 10, 11, 24, 13, 14, 15, 28,
  // Shift left and insert two new columns in transposed 4x4 block.
  2, 3, 16, 17, 6, 7, 20, 21, 10, 11, 24, 25, 14, 15, 28, 29,
  // Shift left and insert three new columns in transposed 4x4 block.
  3, 16, 17, 18, 7, 20, 21, 22, 11, 24, 25, 26, 15, 28, 29, 30
};

static INLINE int16x4_t convolve4_4_h(const uint8x16_t samples,
                                      const int8x8_t filters,
                                      const uint8x16_t permute_tbl) {
  // Transform sample range to [-128, 127] for 8-bit signed dot product.
  int8x16_t samples_128 =
      vreinterpretq_s8_u8(vsubq_u8(samples, vdupq_n_u8(128)));

  // Permute samples ready for dot product.
  // { 0,  1,  2,  3,  1,  2,  3,  4,  2,  3,  4,  5,  3,  4,  5,  6 }
  int8x16_t perm_samples = vqtbl1q_s8(samples_128, permute_tbl);

  // Accumulate into 128 * FILTER_SUM to account for range transform. (Divide
  // by 2 since we halved the filter values.)
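  // This relies on the identity:
  //   sum(f[k] * (s[k] - 128)) + 128 * sum(f[k]) == sum(f[k] * s[k])
  // so seeding the accumulator with 128 * sum(f[k]) undoes the -128 bias.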
  int32x4_t acc = vdupq_n_s32(128 * FILTER_SUM / 2);
  int32x4_t sum = vdotq_lane_s32(acc, perm_samples, filters, 0);

  // Further narrowing and packing is performed by the caller.
  return vmovn_s32(sum);
}

static INLINE uint8x8_t convolve4_8_h(const uint8x16_t samples,
                                      const int8x8_t filters,
                                      const uint8x16x2_t permute_tbl) {
  // Transform sample range to [-128, 127] for 8-bit signed dot product.
  int8x16_t samples_128 =
      vreinterpretq_s8_u8(vsubq_u8(samples, vdupq_n_u8(128)));

  // Permute samples ready for dot product.
  // { 0,  1,  2,  3,  1,  2,  3,  4,  2,  3,  4,  5,  3,  4,  5,  6 }
  // { 4,  5,  6,  7,  5,  6,  7,  8,  6,  7,  8,  9,  7,  8,  9, 10 }
  int8x16_t perm_samples[2] = { vqtbl1q_s8(samples_128, permute_tbl.val[0]),
                                vqtbl1q_s8(samples_128, permute_tbl.val[1]) };

  // Accumulate into 128 * FILTER_SUM to account for range transform. (Divide
  // by 2 since we halved the filter values.)
  int32x4_t acc = vdupq_n_s32(128 * FILTER_SUM / 2);
  // First 4 output values.
  int32x4_t sum0 = vdotq_lane_s32(acc, perm_samples[0], filters, 0);
  // Second 4 output values.
  int32x4_t sum1 = vdotq_lane_s32(acc, perm_samples[1], filters, 0);

  // Narrow and re-pack.
  int16x8_t sum = vcombine_s16(vmovn_s32(sum0), vmovn_s32(sum1));
  // We halved the filter values so -1 from right shift.
  return vqrshrun_n_s16(sum, FILTER_BITS - 1);
}

static INLINE int16x4_t convolve8_4_h(const uint8x16_t samples,
                                      const int8x8_t filters,
                                      const uint8x16x2_t permute_tbl) {
  // Transform sample range to [-128, 127] for 8-bit signed dot product.
  int8x16_t samples_128 =
      vreinterpretq_s8_u8(vsubq_u8(samples, vdupq_n_u8(128)));

  // Permute samples ready for dot product.
  // { 0,  1,  2,  3,  1,  2,  3,  4,  2,  3,  4,  5,  3,  4,  5,  6 }
  // { 4,  5,  6,  7,  5,  6,  7,  8,  6,  7,  8,  9,  7,  8,  9, 10 }
  int8x16_t perm_samples[2] = { vqtbl1q_s8(samples_128, permute_tbl.val[0]),
                                vqtbl1q_s8(samples_128, permute_tbl.val[1]) };

  // Accumulate into 128 * FILTER_SUM to account for range transform.
  int32x4_t acc = vdupq_n_s32(128 * FILTER_SUM);
  int32x4_t sum = vdotq_lane_s32(acc, perm_samples[0], filters, 0);
  sum = vdotq_lane_s32(sum, perm_samples[1], filters, 1);

  // Further narrowing and packing is performed by the caller.
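  // Halving here (shift right by one) keeps the narrowed sum within int16_t
  // range; the caller compensates by rounding with FILTER_BITS - 1 instead
  // of FILTER_BITS.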
  return vshrn_n_s32(sum, 1);
}

static INLINE uint8x8_t convolve8_8_h(const uint8x16_t samples,
                                      const int8x8_t filters,
                                      const uint8x16x3_t permute_tbl) {
  // Transform sample range to [-128, 127] for 8-bit signed dot product.
  int8x16_t samples_128 =
      vreinterpretq_s8_u8(vsubq_u8(samples, vdupq_n_u8(128)));

  // Permute samples ready for dot product.
  // { 0,  1,  2,  3,  1,  2,  3,  4,  2,  3,  4,  5,  3,  4,  5,  6 }
  // { 4,  5,  6,  7,  5,  6,  7,  8,  6,  7,  8,  9,  7,  8,  9, 10 }
  // { 8,  9, 10, 11,  9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 }
  int8x16_t perm_samples[3] = { vqtbl1q_s8(samples_128, permute_tbl.val[0]),
                                vqtbl1q_s8(samples_128, permute_tbl.val[1]),
                                vqtbl1q_s8(samples_128, permute_tbl.val[2]) };

  // Accumulate into 128 * FILTER_SUM to account for range transform.
  int32x4_t acc = vdupq_n_s32(128 * FILTER_SUM);
  // First 4 output values.
  int32x4_t sum0 = vdotq_lane_s32(acc, perm_samples[0], filters, 0);
  sum0 = vdotq_lane_s32(sum0, perm_samples[1], filters, 1);
  // Second 4 output values.
  int32x4_t sum1 = vdotq_lane_s32(acc, perm_samples[1], filters, 0);
  sum1 = vdotq_lane_s32(sum1, perm_samples[2], filters, 1);

  // Narrow and re-pack.
  int16x8_t sum = vcombine_s16(vshrn_n_s32(sum0, 1), vshrn_n_s32(sum1, 1));
  return vqrshrun_n_s16(sum, FILTER_BITS - 1);
}

static INLINE void convolve_4tap_horiz_neon_dotprod(
    const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
    ptrdiff_t dst_stride, int w, int h, const int8x8_t filter) {
  if (w == 4) {
    const uint8x16_t permute_tbl = vld1q_u8(dot_prod_permute_tbl);

    do {
      uint8x16_t s0, s1, s2, s3;
      load_u8_16x4(src, src_stride, &s0, &s1, &s2, &s3);

      int16x4_t t0 = convolve4_4_h(s0, filter, permute_tbl);
      int16x4_t t1 = convolve4_4_h(s1, filter, permute_tbl);
      int16x4_t t2 = convolve4_4_h(s2, filter, permute_tbl);
      int16x4_t t3 = convolve4_4_h(s3, filter, permute_tbl);
      // We halved the filter values so -1 from right shift.
      uint8x8_t d01 = vqrshrun_n_s16(vcombine_s16(t0, t1), FILTER_BITS - 1);
      uint8x8_t d23 = vqrshrun_n_s16(vcombine_s16(t2, t3), FILTER_BITS - 1);

      store_u8(dst + 0 * dst_stride, dst_stride, d01);
      store_u8(dst + 2 * dst_stride, dst_stride, d23);

      src += 4 * src_stride;
      dst += 4 * dst_stride;
      h -= 4;
    } while (h != 0);
  } else {
    const uint8x16x2_t permute_tbl = vld1q_u8_x2(dot_prod_permute_tbl);

    do {
      const uint8_t *s = src;
      uint8_t *d = dst;
      int width = w;

      do {
        uint8x16_t s0, s1, s2, s3;
        load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3);

        uint8x8_t d0 = convolve4_8_h(s0, filter, permute_tbl);
        uint8x8_t d1 = convolve4_8_h(s1, filter, permute_tbl);
        uint8x8_t d2 = convolve4_8_h(s2, filter, permute_tbl);
        uint8x8_t d3 = convolve4_8_h(s3, filter, permute_tbl);

        store_u8_8x4(d, dst_stride, d0, d1, d2, d3);

        s += 8;
        d += 8;
        width -= 8;
      } while (width != 0);
      src += 4 * src_stride;
      dst += 4 * dst_stride;
      h -= 4;
    } while (h != 0);
  }
}

static INLINE void convolve_8tap_horiz_neon_dotprod(
    const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
    ptrdiff_t dst_stride, int w, int h, const int8x8_t filter) {
  if (w == 4) {
    const uint8x16x2_t permute_tbl = vld1q_u8_x2(dot_prod_permute_tbl);

    do {
      uint8x16_t s0, s1, s2, s3;
      load_u8_16x4(src, src_stride, &s0, &s1, &s2, &s3);

      int16x4_t t0 = convolve8_4_h(s0, filter, permute_tbl);
      int16x4_t t1 = convolve8_4_h(s1, filter, permute_tbl);
      int16x4_t t2 = convolve8_4_h(s2, filter, permute_tbl);
      int16x4_t t3 = convolve8_4_h(s3, filter, permute_tbl);
      uint8x8_t d01 = vqrshrun_n_s16(vcombine_s16(t0, t1), FILTER_BITS - 1);
      uint8x8_t d23 = vqrshrun_n_s16(vcombine_s16(t2, t3), FILTER_BITS - 1);

      store_u8(dst + 0 * dst_stride, dst_stride, d01);
      store_u8(dst + 2 * dst_stride, dst_stride, d23);

      src += 4 * src_stride;
      dst += 4 * dst_stride;
      h -= 4;
    } while (h != 0);
  } else {
    const uint8x16x3_t permute_tbl = vld1q_u8_x3(dot_prod_permute_tbl);

    do {
      const uint8_t *s = src;
      uint8_t *d = dst;
      int width = w;

      do {
        uint8x16_t s0, s1, s2, s3;
        load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3);

        uint8x8_t d0 = convolve8_8_h(s0, filter, permute_tbl);
        uint8x8_t d1 = convolve8_8_h(s1, filter, permute_tbl);
        uint8x8_t d2 = convolve8_8_h(s2, filter, permute_tbl);
        uint8x8_t d3 = convolve8_8_h(s3, filter, permute_tbl);

        store_u8_8x4(d, dst_stride, d0, d1, d2, d3);

        s += 8;
        d += 8;
        width -= 8;
      } while (width != 0);
      src += 4 * src_stride;
      dst += 4 * dst_stride;
      h -= 4;
    } while (h != 0);
  }
}

void vpx_convolve8_horiz_neon_dotprod(const uint8_t *src, ptrdiff_t src_stride,
                                      uint8_t *dst, ptrdiff_t dst_stride,
                                      const InterpKernel *filter, int x0_q4,
                                      int x_step_q4, int y0_q4, int y_step_q4,
                                      int w, int h) {
  assert((intptr_t)dst % 4 == 0);
  assert(dst_stride % 4 == 0);
  assert(x_step_q4 == 16);

  (void)x_step_q4;
  (void)y0_q4;
  (void)y_step_q4;

  if (vpx_get_filter_taps(filter[x0_q4]) <= 4) {
    // Load 4-tap filter into first 4 elements of the vector.
    // All 4-tap and bilinear filter values are even, so halve them to reduce
    // intermediate precision requirements.
    const int16x4_t x_filter = vld1_s16(filter[x0_q4] + 2);
    const int8x8_t x_filter_4tap =
        vshrn_n_s16(vcombine_s16(x_filter, vdup_n_s16(0)), 1);

    convolve_4tap_horiz_neon_dotprod(src - 1, src_stride, dst, dst_stride, w, h,
                                     x_filter_4tap);

  } else {
    const int8x8_t x_filter_8tap = vmovn_s16(vld1q_s16(filter[x0_q4]));

    convolve_8tap_horiz_neon_dotprod(src - 3, src_stride, dst, dst_stride, w, h,
                                     x_filter_8tap);
  }
}

void vpx_convolve8_avg_horiz_neon_dotprod(const uint8_t *src,
                                          ptrdiff_t src_stride, uint8_t *dst,
                                          ptrdiff_t dst_stride,
                                          const InterpKernel *filter, int x0_q4,
                                          int x_step_q4, int y0_q4,
                                          int y_step_q4, int w, int h) {
  const int8x8_t filters = vmovn_s16(vld1q_s16(filter[x0_q4]));

  assert((intptr_t)dst % 4 == 0);
  assert(dst_stride % 4 == 0);
  assert(x_step_q4 == 16);

  (void)x_step_q4;
  (void)y0_q4;
  (void)y_step_q4;

  src -= 3;

  if (w == 4) {
    const uint8x16x2_t permute_tbl = vld1q_u8_x2(dot_prod_permute_tbl);

    do {
      uint8x16_t s0, s1, s2, s3;
      load_u8_16x4(src, src_stride, &s0, &s1, &s2, &s3);

      int16x4_t t0 = convolve8_4_h(s0, filters, permute_tbl);
      int16x4_t t1 = convolve8_4_h(s1, filters, permute_tbl);
      int16x4_t t2 = convolve8_4_h(s2, filters, permute_tbl);
      int16x4_t t3 = convolve8_4_h(s3, filters, permute_tbl);
      uint8x8_t d01 = vqrshrun_n_s16(vcombine_s16(t0, t1), FILTER_BITS - 1);
      uint8x8_t d23 = vqrshrun_n_s16(vcombine_s16(t2, t3), FILTER_BITS - 1);

      uint8x8_t dd01 = load_u8(dst + 0 * dst_stride, dst_stride);
      uint8x8_t dd23 = load_u8(dst + 2 * dst_stride, dst_stride);

      d01 = vrhadd_u8(d01, dd01);
      d23 = vrhadd_u8(d23, dd23);

      store_u8(dst + 0 * dst_stride, dst_stride, d01);
      store_u8(dst + 2 * dst_stride, dst_stride, d23);

      src += 4 * src_stride;
      dst += 4 * dst_stride;
      h -= 4;
    } while (h != 0);
  } else {
    const uint8x16x3_t permute_tbl = vld1q_u8_x3(dot_prod_permute_tbl);

    do {
      const uint8_t *s = src;
      uint8_t *d = dst;
      int width = w;

      do {
        uint8x16_t s0, s1, s2, s3;
        load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3);

        uint8x8_t d0 = convolve8_8_h(s0, filters, permute_tbl);
        uint8x8_t d1 = convolve8_8_h(s1, filters, permute_tbl);
        uint8x8_t d2 = convolve8_8_h(s2, filters, permute_tbl);
        uint8x8_t d3 = convolve8_8_h(s3, filters, permute_tbl);

        uint8x8_t dd0, dd1, dd2, dd3;
        load_u8_8x4(d, dst_stride, &dd0, &dd1, &dd2, &dd3);

        d0 = vrhadd_u8(d0, dd0);
        d1 = vrhadd_u8(d1, dd1);
        d2 = vrhadd_u8(d2, dd2);
        d3 = vrhadd_u8(d3, dd3);

        store_u8_8x4(d, dst_stride, d0, d1, d2, d3);

        s += 8;
        d += 8;
        width -= 8;
      } while (width != 0);
      src += 4 * src_stride;
      dst += 4 * dst_stride;
      h -= 4;
    } while (h != 0);
  }
}

static INLINE void transpose_concat_4x4(int8x8_t a0, int8x8_t a1, int8x8_t a2,
                                        int8x8_t a3, int8x16_t *b) {
  // Transpose 8-bit elements and concatenate result rows as follows:
  // a0: 00, 01, 02, 03, XX, XX, XX, XX
  // a1: 10, 11, 12, 13, XX, XX, XX, XX
  // a2: 20, 21, 22, 23, XX, XX, XX, XX
  // a3: 30, 31, 32, 33, XX, XX, XX, XX
  //
  // b: 00, 10, 20, 30, 01, 11, 21, 31, 02, 12, 22, 32, 03, 13, 23, 33

  int8x16_t a0q = vcombine_s8(a0, vdup_n_s8(0));
  int8x16_t a1q = vcombine_s8(a1, vdup_n_s8(0));
  int8x16_t a2q = vcombine_s8(a2, vdup_n_s8(0));
  int8x16_t a3q = vcombine_s8(a3, vdup_n_s8(0));

  int8x16_t a01 = vzipq_s8(a0q, a1q).val[0];
  int8x16_t a23 = vzipq_s8(a2q, a3q).val[0];

  int16x8_t a0123 =
      vzipq_s16(vreinterpretq_s16_s8(a01), vreinterpretq_s16_s8(a23)).val[0];

  *b = vreinterpretq_s8_s16(a0123);
}

static INLINE void transpose_concat_8x4(int8x8_t a0, int8x8_t a1, int8x8_t a2,
                                        int8x8_t a3, int8x16_t *b0,
                                        int8x16_t *b1) {
  // Transpose 8-bit elements and concatenate result rows as follows:
  // a0: 00, 01, 02, 03, 04, 05, 06, 07
  // a1: 10, 11, 12, 13, 14, 15, 16, 17
  // a2: 20, 21, 22, 23, 24, 25, 26, 27
  // a3: 30, 31, 32, 33, 34, 35, 36, 37
  //
  // b0: 00, 10, 20, 30, 01, 11, 21, 31, 02, 12, 22, 32, 03, 13, 23, 33
  // b1: 04, 14, 24, 34, 05, 15, 25, 35, 06, 16, 26, 36, 07, 17, 27, 37

  int8x16_t a0q = vcombine_s8(a0, vdup_n_s8(0));
  int8x16_t a1q = vcombine_s8(a1, vdup_n_s8(0));
  int8x16_t a2q = vcombine_s8(a2, vdup_n_s8(0));
  int8x16_t a3q = vcombine_s8(a3, vdup_n_s8(0));

  int8x16_t a01 = vzipq_s8(a0q, a1q).val[0];
  int8x16_t a23 = vzipq_s8(a2q, a3q).val[0];

  int16x8x2_t a0123 =
      vzipq_s16(vreinterpretq_s16_s8(a01), vreinterpretq_s16_s8(a23));

  *b0 = vreinterpretq_s8_s16(a0123.val[0]);
  *b1 = vreinterpretq_s8_s16(a0123.val[1]);
}

static INLINE int16x4_t convolve8_4_v(const int8x16_t samples_lo,
                                      const int8x16_t samples_hi,
                                      const int8x8_t filters) {
  // The sample range transform and permutation are performed by the caller.

  // Accumulate into 128 * FILTER_SUM to account for range transform.
  int32x4_t acc = vdupq_n_s32(128 * FILTER_SUM);
  int32x4_t sum = vdotq_lane_s32(acc, samples_lo, filters, 0);
  sum = vdotq_lane_s32(sum, samples_hi, filters, 1);

  // Further narrowing and packing is performed by the caller.
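  // As in convolve8_4_h, the halving shift below keeps the narrowed sum
  // within int16_t range; the caller rounds with FILTER_BITS - 1.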
  return vshrn_n_s32(sum, 1);
}

static INLINE uint8x8_t convolve8_8_v(const int8x16_t samples0_lo,
                                      const int8x16_t samples0_hi,
                                      const int8x16_t samples1_lo,
                                      const int8x16_t samples1_hi,
                                      const int8x8_t filters) {
  // The sample range transform and permutation are performed by the caller.

  // Accumulate into 128 * FILTER_SUM to account for range transform.
  int32x4_t acc = vdupq_n_s32(128 * FILTER_SUM);
  // First 4 output values.
  int32x4_t sum0 = vdotq_lane_s32(acc, samples0_lo, filters, 0);
  sum0 = vdotq_lane_s32(sum0, samples0_hi, filters, 1);
  // Second 4 output values.
  int32x4_t sum1 = vdotq_lane_s32(acc, samples1_lo, filters, 0);
  sum1 = vdotq_lane_s32(sum1, samples1_hi, filters, 1);

  // Narrow and re-pack.
  int16x8_t sum = vcombine_s16(vshrn_n_s32(sum0, 1), vshrn_n_s32(sum1, 1));
  return vqrshrun_n_s16(sum, FILTER_BITS - 1);
}

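// Vertical 8-tap convolution: rows are transposed into the dot-product
// sample layout on entry, then each loop iteration only transposes the four
// newly loaded rows and derives the three overlapping blocks from the
// previous iteration's data via dot_prod_merge_block_tbl, avoiding a full
// re-transpose per iteration.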
static INLINE void convolve_8tap_vert_neon_dotprod(
    const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
    ptrdiff_t dst_stride, int w, int h, const int8x8_t filter) {
  const uint8x16x3_t merge_block_tbl = vld1q_u8_x3(dot_prod_merge_block_tbl);

  if (w == 4) {
    uint8x8_t t0, t1, t2, t3, t4, t5, t6;
    load_u8_8x7(src, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6);
    src += 7 * src_stride;

    // Transform sample range to [-128, 127] for 8-bit signed dot product.
    int8x8_t s0 = vreinterpret_s8_u8(vsub_u8(t0, vdup_n_u8(128)));
    int8x8_t s1 = vreinterpret_s8_u8(vsub_u8(t1, vdup_n_u8(128)));
    int8x8_t s2 = vreinterpret_s8_u8(vsub_u8(t2, vdup_n_u8(128)));
    int8x8_t s3 = vreinterpret_s8_u8(vsub_u8(t3, vdup_n_u8(128)));
    int8x8_t s4 = vreinterpret_s8_u8(vsub_u8(t4, vdup_n_u8(128)));
    int8x8_t s5 = vreinterpret_s8_u8(vsub_u8(t5, vdup_n_u8(128)));
    int8x8_t s6 = vreinterpret_s8_u8(vsub_u8(t6, vdup_n_u8(128)));

    // This operation combines a conventional transpose and the sample permute
    // (see horizontal case) required before computing the dot product.
    int8x16_t s0123, s1234, s2345, s3456;
    transpose_concat_4x4(s0, s1, s2, s3, &s0123);
    transpose_concat_4x4(s1, s2, s3, s4, &s1234);
    transpose_concat_4x4(s2, s3, s4, s5, &s2345);
    transpose_concat_4x4(s3, s4, s5, s6, &s3456);

    do {
      uint8x8_t t7, t8, t9, t10;
      load_u8_8x4(src, src_stride, &t7, &t8, &t9, &t10);

      int8x8_t s7 = vreinterpret_s8_u8(vsub_u8(t7, vdup_n_u8(128)));
      int8x8_t s8 = vreinterpret_s8_u8(vsub_u8(t8, vdup_n_u8(128)));
      int8x8_t s9 = vreinterpret_s8_u8(vsub_u8(t9, vdup_n_u8(128)));
      int8x8_t s10 = vreinterpret_s8_u8(vsub_u8(t10, vdup_n_u8(128)));

      int8x16_t s78910;
      transpose_concat_4x4(s7, s8, s9, s10, &s78910);

      // Merge new data into block from previous iteration.
      int8x16x2_t samples_LUT = { { s3456, s78910 } };
      int8x16_t s4567 = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[0]);
      int8x16_t s5678 = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[1]);
      int8x16_t s6789 = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[2]);

      int16x4_t d0 = convolve8_4_v(s0123, s4567, filter);
      int16x4_t d1 = convolve8_4_v(s1234, s5678, filter);
      int16x4_t d2 = convolve8_4_v(s2345, s6789, filter);
      int16x4_t d3 = convolve8_4_v(s3456, s78910, filter);
      uint8x8_t d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS - 1);
      uint8x8_t d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS - 1);

      store_u8(dst + 0 * dst_stride, dst_stride, d01);
      store_u8(dst + 2 * dst_stride, dst_stride, d23);

      // Prepare block for next iteration - re-using as much as possible.
      // Shuffle everything up four rows.
      s0123 = s4567;
      s1234 = s5678;
      s2345 = s6789;
      s3456 = s78910;

      src += 4 * src_stride;
      dst += 4 * dst_stride;
      h -= 4;
    } while (h != 0);
  } else {
    do {
      const uint8_t *s = src;
      uint8_t *d = dst;
      int height = h;

      uint8x8_t t0, t1, t2, t3, t4, t5, t6;
      load_u8_8x7(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6);
      s += 7 * src_stride;

      // Transform sample range to [-128, 127] for 8-bit signed dot product.
      int8x8_t s0 = vreinterpret_s8_u8(vsub_u8(t0, vdup_n_u8(128)));
      int8x8_t s1 = vreinterpret_s8_u8(vsub_u8(t1, vdup_n_u8(128)));
      int8x8_t s2 = vreinterpret_s8_u8(vsub_u8(t2, vdup_n_u8(128)));
      int8x8_t s3 = vreinterpret_s8_u8(vsub_u8(t3, vdup_n_u8(128)));
      int8x8_t s4 = vreinterpret_s8_u8(vsub_u8(t4, vdup_n_u8(128)));
      int8x8_t s5 = vreinterpret_s8_u8(vsub_u8(t5, vdup_n_u8(128)));
      int8x8_t s6 = vreinterpret_s8_u8(vsub_u8(t6, vdup_n_u8(128)));

      // This operation combines a conventional transpose and the sample permute
      // (see horizontal case) required before computing the dot product.
      int8x16_t s0123_lo, s0123_hi, s1234_lo, s1234_hi, s2345_lo, s2345_hi,
          s3456_lo, s3456_hi;
      transpose_concat_8x4(s0, s1, s2, s3, &s0123_lo, &s0123_hi);
      transpose_concat_8x4(s1, s2, s3, s4, &s1234_lo, &s1234_hi);
      transpose_concat_8x4(s2, s3, s4, s5, &s2345_lo, &s2345_hi);
      transpose_concat_8x4(s3, s4, s5, s6, &s3456_lo, &s3456_hi);

      do {
        uint8x8_t t7, t8, t9, t10;
        load_u8_8x4(s, src_stride, &t7, &t8, &t9, &t10);

        int8x8_t s7 = vreinterpret_s8_u8(vsub_u8(t7, vdup_n_u8(128)));
        int8x8_t s8 = vreinterpret_s8_u8(vsub_u8(t8, vdup_n_u8(128)));
        int8x8_t s9 = vreinterpret_s8_u8(vsub_u8(t9, vdup_n_u8(128)));
        int8x8_t s10 = vreinterpret_s8_u8(vsub_u8(t10, vdup_n_u8(128)));

        int8x16_t s78910_lo, s78910_hi;
        transpose_concat_8x4(s7, s8, s9, s10, &s78910_lo, &s78910_hi);

        // Merge new data into block from previous iteration.
        int8x16x2_t samples_LUT = { { s3456_lo, s78910_lo } };
        int8x16_t s4567_lo = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[0]);
        int8x16_t s5678_lo = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[1]);
        int8x16_t s6789_lo = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[2]);

        samples_LUT.val[0] = s3456_hi;
        samples_LUT.val[1] = s78910_hi;
        int8x16_t s4567_hi = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[0]);
        int8x16_t s5678_hi = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[1]);
        int8x16_t s6789_hi = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[2]);

        uint8x8_t d0 =
            convolve8_8_v(s0123_lo, s4567_lo, s0123_hi, s4567_hi, filter);
        uint8x8_t d1 =
            convolve8_8_v(s1234_lo, s5678_lo, s1234_hi, s5678_hi, filter);
        uint8x8_t d2 =
            convolve8_8_v(s2345_lo, s6789_lo, s2345_hi, s6789_hi, filter);
        uint8x8_t d3 =
            convolve8_8_v(s3456_lo, s78910_lo, s3456_hi, s78910_hi, filter);

        store_u8_8x4(d, dst_stride, d0, d1, d2, d3);

        // Prepare block for next iteration - re-using as much as possible.
        // Shuffle everything up four rows.
        s0123_lo = s4567_lo;
        s0123_hi = s4567_hi;
        s1234_lo = s5678_lo;
        s1234_hi = s5678_hi;
        s2345_lo = s6789_lo;
        s2345_hi = s6789_hi;
        s3456_lo = s78910_lo;
        s3456_hi = s78910_hi;

        s += 4 * src_stride;
        d += 4 * dst_stride;
        height -= 4;
      } while (height != 0);
      src += 8;
      dst += 8;
      w -= 8;
    } while (w != 0);
  }
}

void vpx_convolve8_vert_neon_dotprod(const uint8_t *src, ptrdiff_t src_stride,
                                     uint8_t *dst, ptrdiff_t dst_stride,
                                     const InterpKernel *filter, int x0_q4,
                                     int x_step_q4, int y0_q4, int y_step_q4,
                                     int w, int h) {
  assert((intptr_t)dst % 4 == 0);
  assert(dst_stride % 4 == 0);
  assert(y_step_q4 == 16);

  (void)x0_q4;
  (void)x_step_q4;
  (void)y_step_q4;

  if (vpx_get_filter_taps(filter[y0_q4]) <= 4) {
    const int16x8_t y_filter = vld1q_s16(filter[y0_q4]);

    convolve_4tap_vert_neon(src - src_stride, src_stride, dst, dst_stride, w, h,
                            y_filter);
  } else {
    const int8x8_t y_filter = vmovn_s16(vld1q_s16(filter[y0_q4]));

    convolve_8tap_vert_neon_dotprod(src - 3 * src_stride, src_stride, dst,
                                    dst_stride, w, h, y_filter);
  }
}

void vpx_convolve8_avg_vert_neon_dotprod(const uint8_t *src,
                                         ptrdiff_t src_stride, uint8_t *dst,
                                         ptrdiff_t dst_stride,
                                         const InterpKernel *filter, int x0_q4,
                                         int x_step_q4, int y0_q4,
                                         int y_step_q4, int w, int h) {
  const int8x8_t filters = vmovn_s16(vld1q_s16(filter[y0_q4]));
  const uint8x16x3_t merge_block_tbl = vld1q_u8_x3(dot_prod_merge_block_tbl);

  assert((intptr_t)dst % 4 == 0);
  assert(dst_stride % 4 == 0);
  assert(y_step_q4 == 16);

  (void)x0_q4;
  (void)x_step_q4;
  (void)y_step_q4;

  src -= 3 * src_stride;

  if (w == 4) {
    uint8x8_t t0, t1, t2, t3, t4, t5, t6;
    load_u8_8x7(src, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6);
    src += 7 * src_stride;

    // Transform sample range to [-128, 127] for 8-bit signed dot product.
    int8x8_t s0 = vreinterpret_s8_u8(vsub_u8(t0, vdup_n_u8(128)));
    int8x8_t s1 = vreinterpret_s8_u8(vsub_u8(t1, vdup_n_u8(128)));
    int8x8_t s2 = vreinterpret_s8_u8(vsub_u8(t2, vdup_n_u8(128)));
    int8x8_t s3 = vreinterpret_s8_u8(vsub_u8(t3, vdup_n_u8(128)));
    int8x8_t s4 = vreinterpret_s8_u8(vsub_u8(t4, vdup_n_u8(128)));
    int8x8_t s5 = vreinterpret_s8_u8(vsub_u8(t5, vdup_n_u8(128)));
    int8x8_t s6 = vreinterpret_s8_u8(vsub_u8(t6, vdup_n_u8(128)));

    // This operation combines a conventional transpose and the sample permute
    // (see horizontal case) required before computing the dot product.
    int8x16_t s0123, s1234, s2345, s3456;
    transpose_concat_4x4(s0, s1, s2, s3, &s0123);
    transpose_concat_4x4(s1, s2, s3, s4, &s1234);
    transpose_concat_4x4(s2, s3, s4, s5, &s2345);
    transpose_concat_4x4(s3, s4, s5, s6, &s3456);

    do {
      uint8x8_t t7, t8, t9, t10;
      load_u8_8x4(src, src_stride, &t7, &t8, &t9, &t10);

      int8x8_t s7 = vreinterpret_s8_u8(vsub_u8(t7, vdup_n_u8(128)));
      int8x8_t s8 = vreinterpret_s8_u8(vsub_u8(t8, vdup_n_u8(128)));
      int8x8_t s9 = vreinterpret_s8_u8(vsub_u8(t9, vdup_n_u8(128)));
      int8x8_t s10 = vreinterpret_s8_u8(vsub_u8(t10, vdup_n_u8(128)));

      int8x16_t s78910;
      transpose_concat_4x4(s7, s8, s9, s10, &s78910);

      // Merge new data into block from previous iteration.
      int8x16x2_t samples_LUT = { { s3456, s78910 } };
      int8x16_t s4567 = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[0]);
      int8x16_t s5678 = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[1]);
      int8x16_t s6789 = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[2]);

      int16x4_t d0 = convolve8_4_v(s0123, s4567, filters);
      int16x4_t d1 = convolve8_4_v(s1234, s5678, filters);
      int16x4_t d2 = convolve8_4_v(s2345, s6789, filters);
      int16x4_t d3 = convolve8_4_v(s3456, s78910, filters);
      uint8x8_t d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS - 1);
      uint8x8_t d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS - 1);

      uint8x8_t dd01 = load_u8(dst + 0 * dst_stride, dst_stride);
      uint8x8_t dd23 = load_u8(dst + 2 * dst_stride, dst_stride);

      d01 = vrhadd_u8(d01, dd01);
      d23 = vrhadd_u8(d23, dd23);

      store_u8(dst + 0 * dst_stride, dst_stride, d01);
      store_u8(dst + 2 * dst_stride, dst_stride, d23);

      // Prepare block for next iteration - re-using as much as possible.
      // Shuffle everything up four rows.
      s0123 = s4567;
      s1234 = s5678;
      s2345 = s6789;
      s3456 = s78910;

      src += 4 * src_stride;
      dst += 4 * dst_stride;
      h -= 4;
    } while (h != 0);
  } else {
    do {
      const uint8_t *s = src;
      uint8_t *d = dst;
      int height = h;

      uint8x8_t t0, t1, t2, t3, t4, t5, t6;
      load_u8_8x7(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6);
      s += 7 * src_stride;

      // Transform sample range to [-128, 127] for 8-bit signed dot product.
      int8x8_t s0 = vreinterpret_s8_u8(vsub_u8(t0, vdup_n_u8(128)));
      int8x8_t s1 = vreinterpret_s8_u8(vsub_u8(t1, vdup_n_u8(128)));
      int8x8_t s2 = vreinterpret_s8_u8(vsub_u8(t2, vdup_n_u8(128)));
      int8x8_t s3 = vreinterpret_s8_u8(vsub_u8(t3, vdup_n_u8(128)));
      int8x8_t s4 = vreinterpret_s8_u8(vsub_u8(t4, vdup_n_u8(128)));
      int8x8_t s5 = vreinterpret_s8_u8(vsub_u8(t5, vdup_n_u8(128)));
      int8x8_t s6 = vreinterpret_s8_u8(vsub_u8(t6, vdup_n_u8(128)));

      // This operation combines a conventional transpose and the sample permute
      // (see horizontal case) required before computing the dot product.
      int8x16_t s0123_lo, s0123_hi, s1234_lo, s1234_hi, s2345_lo, s2345_hi,
          s3456_lo, s3456_hi;
      transpose_concat_8x4(s0, s1, s2, s3, &s0123_lo, &s0123_hi);
      transpose_concat_8x4(s1, s2, s3, s4, &s1234_lo, &s1234_hi);
      transpose_concat_8x4(s2, s3, s4, s5, &s2345_lo, &s2345_hi);
      transpose_concat_8x4(s3, s4, s5, s6, &s3456_lo, &s3456_hi);

      do {
        uint8x8_t t7, t8, t9, t10;
        load_u8_8x4(s, src_stride, &t7, &t8, &t9, &t10);

        int8x8_t s7 = vreinterpret_s8_u8(vsub_u8(t7, vdup_n_u8(128)));
        int8x8_t s8 = vreinterpret_s8_u8(vsub_u8(t8, vdup_n_u8(128)));
        int8x8_t s9 = vreinterpret_s8_u8(vsub_u8(t9, vdup_n_u8(128)));
        int8x8_t s10 = vreinterpret_s8_u8(vsub_u8(t10, vdup_n_u8(128)));

        int8x16_t s78910_lo, s78910_hi;
        transpose_concat_8x4(s7, s8, s9, s10, &s78910_lo, &s78910_hi);

        // Merge new data into block from previous iteration.
        int8x16x2_t samples_LUT = { { s3456_lo, s78910_lo } };
        int8x16_t s4567_lo = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[0]);
        int8x16_t s5678_lo = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[1]);
        int8x16_t s6789_lo = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[2]);

        samples_LUT.val[0] = s3456_hi;
        samples_LUT.val[1] = s78910_hi;
        int8x16_t s4567_hi = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[0]);
        int8x16_t s5678_hi = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[1]);
        int8x16_t s6789_hi = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[2]);

        uint8x8_t d0 =
            convolve8_8_v(s0123_lo, s4567_lo, s0123_hi, s4567_hi, filters);
        uint8x8_t d1 =
            convolve8_8_v(s1234_lo, s5678_lo, s1234_hi, s5678_hi, filters);
        uint8x8_t d2 =
            convolve8_8_v(s2345_lo, s6789_lo, s2345_hi, s6789_hi, filters);
        uint8x8_t d3 =
            convolve8_8_v(s3456_lo, s78910_lo, s3456_hi, s78910_hi, filters);

        uint8x8_t dd0, dd1, dd2, dd3;
        load_u8_8x4(d, dst_stride, &dd0, &dd1, &dd2, &dd3);

        d0 = vrhadd_u8(d0, dd0);
        d1 = vrhadd_u8(d1, dd1);
        d2 = vrhadd_u8(d2, dd2);
        d3 = vrhadd_u8(d3, dd3);

        store_u8_8x4(d, dst_stride, d0, d1, d2, d3);

        // Prepare block for next iteration - re-using as much as possible.
        // Shuffle everything up four rows.
        s0123_lo = s4567_lo;
        s0123_hi = s4567_hi;
        s1234_lo = s5678_lo;
        s1234_hi = s5678_hi;
        s2345_lo = s6789_lo;
        s2345_hi = s6789_hi;
        s3456_lo = s78910_lo;
        s3456_hi = s78910_hi;

        s += 4 * src_stride;
        d += 4 * dst_stride;
        height -= 4;
      } while (height != 0);
      src += 8;
      dst += 8;
      w -= 8;
    } while (w != 0);
  }
}

static INLINE void convolve_4tap_2d_neon_dotprod(const uint8_t *src,
                                                 ptrdiff_t src_stride,
                                                 uint8_t *dst,
                                                 ptrdiff_t dst_stride, int w,
                                                 int h, const int8x8_t x_filter,
                                                 const uint8x8_t y_filter) {
  // Neon does not have lane-referencing multiply or multiply-accumulate
  // instructions that operate on vectors of 8-bit elements. This means we have
  // to duplicate filter taps into a whole vector and use standard multiply /
  // multiply-accumulate instructions.
  const uint8x8_t y_filter_taps[4] = { vdup_lane_u8(y_filter, 2),
                                       vdup_lane_u8(y_filter, 3),
                                       vdup_lane_u8(y_filter, 4),
                                       vdup_lane_u8(y_filter, 5) };

  if (w == 4) {
    const uint8x16_t permute_tbl = vld1q_u8(dot_prod_permute_tbl);

    uint8x16_t h_s0, h_s1, h_s2;
    load_u8_16x3(src, src_stride, &h_s0, &h_s1, &h_s2);

    int16x4_t t0 = convolve4_4_h(h_s0, x_filter, permute_tbl);
    int16x4_t t1 = convolve4_4_h(h_s1, x_filter, permute_tbl);
    int16x4_t t2 = convolve4_4_h(h_s2, x_filter, permute_tbl);
    // We halved the filter values so -1 from right shift.
    uint8x8_t v_s01 = vqrshrun_n_s16(vcombine_s16(t0, t1), FILTER_BITS - 1);
    uint8x8_t v_s12 = vqrshrun_n_s16(vcombine_s16(t1, t2), FILTER_BITS - 1);

    src += 3 * src_stride;

    do {
      uint8x16_t h_s3, h_s4, h_s5, h_s6;
      load_u8_16x4(src, src_stride, &h_s3, &h_s4, &h_s5, &h_s6);

      int16x4_t t3 = convolve4_4_h(h_s3, x_filter, permute_tbl);
      int16x4_t t4 = convolve4_4_h(h_s4, x_filter, permute_tbl);
      int16x4_t t5 = convolve4_4_h(h_s5, x_filter, permute_tbl);
      int16x4_t t6 = convolve4_4_h(h_s6, x_filter, permute_tbl);
      // We halved the filter values so -1 from right shift.
      uint8x8_t v_s34 = vqrshrun_n_s16(vcombine_s16(t3, t4), FILTER_BITS - 1);
      uint8x8_t v_s56 = vqrshrun_n_s16(vcombine_s16(t5, t6), FILTER_BITS - 1);
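      // Each v_sNM vector packs two consecutive 4-pixel rows (N and M). A
      // 4-byte vext of adjacent pairs forms the remaining row pairs (rows
      // 2+3 and 4+5) without recomputing them.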
      uint8x8_t v_s23 = vext_u8(v_s12, v_s34, 4);
      uint8x8_t v_s45 = vext_u8(v_s34, v_s56, 4);

      uint8x8_t d01 = convolve4_8(v_s01, v_s12, v_s23, v_s34, y_filter_taps);
      uint8x8_t d23 = convolve4_8(v_s23, v_s34, v_s45, v_s56, y_filter_taps);

      store_unaligned_u8(dst + 0 * dst_stride, dst_stride, d01);
      store_unaligned_u8(dst + 2 * dst_stride, dst_stride, d23);

      v_s01 = v_s45;
      v_s12 = v_s56;
      src += 4 * src_stride;
      dst += 4 * dst_stride;
      h -= 4;
    } while (h != 0);
  } else {
    const uint8x16x2_t permute_tbl = vld1q_u8_x2(dot_prod_permute_tbl);

    do {
      const uint8_t *s = src;
      uint8_t *d = dst;
      int height = h;

      uint8x16_t h_s0, h_s1, h_s2;
      load_u8_16x3(s, src_stride, &h_s0, &h_s1, &h_s2);

      uint8x8_t v_s0 = convolve4_8_h(h_s0, x_filter, permute_tbl);
      uint8x8_t v_s1 = convolve4_8_h(h_s1, x_filter, permute_tbl);
      uint8x8_t v_s2 = convolve4_8_h(h_s2, x_filter, permute_tbl);

      s += 3 * src_stride;

      do {
        uint8x16_t h_s3, h_s4, h_s5, h_s6;
        load_u8_16x4(s, src_stride, &h_s3, &h_s4, &h_s5, &h_s6);

        uint8x8_t v_s3 = convolve4_8_h(h_s3, x_filter, permute_tbl);
        uint8x8_t v_s4 = convolve4_8_h(h_s4, x_filter, permute_tbl);
        uint8x8_t v_s5 = convolve4_8_h(h_s5, x_filter, permute_tbl);
        uint8x8_t v_s6 = convolve4_8_h(h_s6, x_filter, permute_tbl);

        uint8x8_t d0 = convolve4_8(v_s0, v_s1, v_s2, v_s3, y_filter_taps);
        uint8x8_t d1 = convolve4_8(v_s1, v_s2, v_s3, v_s4, y_filter_taps);
        uint8x8_t d2 = convolve4_8(v_s2, v_s3, v_s4, v_s5, y_filter_taps);
        uint8x8_t d3 = convolve4_8(v_s3, v_s4, v_s5, v_s6, y_filter_taps);

        store_u8_8x4(d, dst_stride, d0, d1, d2, d3);

        v_s0 = v_s4;
        v_s1 = v_s5;
        v_s2 = v_s6;
        s += 4 * src_stride;
        d += 4 * dst_stride;
        height -= 4;
      } while (height != 0);
      src += 8;
      dst += 8;
      w -= 8;
    } while (w != 0);
  }
}

static INLINE void convolve_8tap_2d_horiz_neon_dotprod(
    const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
    ptrdiff_t dst_stride, int w, int h, const int8x8_t filter) {
  if (w == 4) {
    const uint8x16x2_t permute_tbl = vld1q_u8_x2(dot_prod_permute_tbl);

    do {
      uint8x16_t s0, s1, s2, s3;
      load_u8_16x4(src, src_stride, &s0, &s1, &s2, &s3);

      int16x4_t d0 = convolve8_4_h(s0, filter, permute_tbl);
      int16x4_t d1 = convolve8_4_h(s1, filter, permute_tbl);
      int16x4_t d2 = convolve8_4_h(s2, filter, permute_tbl);
      int16x4_t d3 = convolve8_4_h(s3, filter, permute_tbl);
      uint8x8_t d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS - 1);
      uint8x8_t d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS - 1);

      store_u8(dst + 0 * dst_stride, dst_stride, d01);
      store_u8(dst + 2 * dst_stride, dst_stride, d23);

      src += 4 * src_stride;
      dst += 4 * dst_stride;
      h -= 4;
    } while (h > 3);

    // Process final three rows (h % 4 == 3). See vpx_convolve8_neon_dotprod()
    // below for further details on possible values of block height.
    uint8x16_t s0, s1, s2;
    load_u8_16x3(src, src_stride, &s0, &s1, &s2);

    int16x4_t d0 = convolve8_4_h(s0, filter, permute_tbl);
    int16x4_t d1 = convolve8_4_h(s1, filter, permute_tbl);
    int16x4_t d2 = convolve8_4_h(s2, filter, permute_tbl);
    uint8x8_t d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS - 1);
    uint8x8_t d23 =
        vqrshrun_n_s16(vcombine_s16(d2, vdup_n_s16(0)), FILTER_BITS - 1);

    store_u8(dst + 0 * dst_stride, dst_stride, d01);
    store_u8_4x1(dst + 2 * dst_stride, d23);
  } else {
    const uint8x16x3_t permute_tbl = vld1q_u8_x3(dot_prod_permute_tbl);

    do {
      const uint8_t *s = src;
      uint8_t *d = dst;
      int width = w;

      do {
        uint8x16_t s0, s1, s2, s3;
        load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3);

        uint8x8_t d0 = convolve8_8_h(s0, filter, permute_tbl);
        uint8x8_t d1 = convolve8_8_h(s1, filter, permute_tbl);
        uint8x8_t d2 = convolve8_8_h(s2, filter, permute_tbl);
        uint8x8_t d3 = convolve8_8_h(s3, filter, permute_tbl);

        store_u8_8x4(d, dst_stride, d0, d1, d2, d3);

        s += 8;
        d += 8;
        width -= 8;
      } while (width > 0);
      src += 4 * src_stride;
      dst += 4 * dst_stride;
      h -= 4;
    } while (h > 3);

    // Process final three rows (h % 4 == 3). See vpx_convolve8_neon_dotprod()
    // below for further details on possible values of block height.
    const uint8_t *s = src;
    uint8_t *d = dst;
    int width = w;

    do {
      uint8x16_t s0, s1, s2;
      load_u8_16x3(s, src_stride, &s0, &s1, &s2);

      uint8x8_t d0 = convolve8_8_h(s0, filter, permute_tbl);
      uint8x8_t d1 = convolve8_8_h(s1, filter, permute_tbl);
      uint8x8_t d2 = convolve8_8_h(s2, filter, permute_tbl);

      store_u8_8x3(d, dst_stride, d0, d1, d2);

      s += 8;
      d += 8;
      width -= 8;
    } while (width > 0);
  }
}

void vpx_convolve8_neon_dotprod(const uint8_t *src, ptrdiff_t src_stride,
                                uint8_t *dst, ptrdiff_t dst_stride,
                                const InterpKernel *filter, int x0_q4,
                                int x_step_q4, int y0_q4, int y_step_q4, int w,
                                int h) {
  assert(x_step_q4 == 16);
  assert(y_step_q4 == 16);

  (void)x_step_q4;
  (void)y_step_q4;

  const int x_filter_taps = vpx_get_filter_taps(filter[x0_q4]) <= 4 ? 4 : 8;
  const int y_filter_taps = vpx_get_filter_taps(filter[y0_q4]) <= 4 ? 4 : 8;
  // Account for needing filter_taps / 2 - 1 lines prior and filter_taps / 2
  // lines post both horizontally and vertically.
  const ptrdiff_t horiz_offset = x_filter_taps / 2 - 1;
  const ptrdiff_t vert_offset = (y_filter_taps / 2 - 1) * src_stride;

  if (x_filter_taps == 4 && y_filter_taps == 4) {
    const int16x4_t x_filter = vld1_s16(filter[x0_q4] + 2);
    const int16x8_t y_filter = vld1q_s16(filter[y0_q4]);

    // 4-tap and bilinear filter values are even, so halve them to reduce
    // intermediate precision requirements.
    const int8x8_t x_filter_4tap =
        vshrn_n_s16(vcombine_s16(x_filter, vdup_n_s16(0)), 1);
    const uint8x8_t y_filter_4tap =
        vshrn_n_u16(vreinterpretq_u16_s16(vabsq_s16(y_filter)), 1);

    convolve_4tap_2d_neon_dotprod(src - horiz_offset - vert_offset, src_stride,
                                  dst, dst_stride, w, h, x_filter_4tap,
                                  y_filter_4tap);
    return;
  }

  // Given our constraints: w <= 64, h <= 64, taps <= 8 we can reduce the
  // maximum buffer size to 64 * (64 + 7).
  DECLARE_ALIGNED(32, uint8_t, im_block[64 * 71]);
  const int im_stride = 64;
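  // h is a multiple of 4, so im_height % 4 == 3; this is the three-row tail
  // handled at the end of convolve_8tap_2d_horiz_neon_dotprod() above.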
  const int im_height = h + SUBPEL_TAPS - 1;

  const int8x8_t x_filter_8tap = vmovn_s16(vld1q_s16(filter[x0_q4]));
  const int8x8_t y_filter_8tap = vmovn_s16(vld1q_s16(filter[y0_q4]));

  convolve_8tap_2d_horiz_neon_dotprod(src - horiz_offset - vert_offset,
                                      src_stride, im_block, im_stride, w,
                                      im_height, x_filter_8tap);

  convolve_8tap_vert_neon_dotprod(im_block, im_stride, dst, dst_stride, w, h,
                                  y_filter_8tap);
}

void vpx_convolve8_avg_neon_dotprod(const uint8_t *src, ptrdiff_t src_stride,
                                    uint8_t *dst, ptrdiff_t dst_stride,
                                    const InterpKernel *filter, int x0_q4,
                                    int x_step_q4, int y0_q4, int y_step_q4,
                                    int w, int h) {
  DECLARE_ALIGNED(32, uint8_t, im_block[64 * 71]);
  const int im_stride = 64;

  // Averaging convolution always uses an 8-tap filter.
  // Account for the vertical phase needing 3 lines prior and 4 lines post.
  const int im_height = h + SUBPEL_TAPS - 1;
  const ptrdiff_t offset = SUBPEL_TAPS / 2 - 1;

  assert(y_step_q4 == 16);
  assert(x_step_q4 == 16);

  const int8x8_t x_filter_8tap = vmovn_s16(vld1q_s16(filter[x0_q4]));

  convolve_8tap_2d_horiz_neon_dotprod(src - offset - offset * src_stride,
                                      src_stride, im_block, im_stride, w,
                                      im_height, x_filter_8tap);

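  // vpx_convolve8_avg_vert_neon_dotprod() rewinds src by 3 * src_stride
  // internally, so start it offset (3) rows into the intermediate block.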
  vpx_convolve8_avg_vert_neon_dotprod(im_block + offset * im_stride, im_stride,
                                      dst, dst_stride, filter, x0_q4, x_step_q4,
                                      y0_q4, y_step_q4, w, h);
}