/*
 * Copyright (c) 2014 The WebM project authors. All rights reserved.
 * Copyright (c) 2023, Alliance for Open Media. All rights reserved.
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */

#include <arm_neon.h>
#include <assert.h>
#include <string.h>

#include "config/aom_config.h"

#include "aom/aom_integer.h"
#include "aom_dsp/aom_dsp_common.h"
#include "aom_dsp/aom_filter.h"
#include "aom_dsp/arm/aom_convolve8_neon.h"
#include "aom_dsp/arm/aom_filter.h"
#include "aom_dsp/arm/mem_neon.h"
#include "aom_dsp/arm/transpose_neon.h"
#include "aom_ports/mem.h"

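// Permute table for the 6-tap (matrix-multiply) horizontal paths. Each half
// gathers two overlapping 8-byte rows of samples, offset by two pixels, into
// one 128-bit register: the 2x8 operand consumed by vusmmlaq_s32 below.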
DECLARE_ALIGNED(16, static const uint8_t, kMatMulPermuteTbl[32]) = {
  // clang-format off
  0,  1,  2,  3,  4,  5,  6,  7,  2,  3,  4,  5,  6,  7,  8,  9,
  4,  5,  6,  7,  8,  9, 10, 11,  6,  7,  8,  9, 10, 11, 12, 13
  // clang-format on
};

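// Permute table for the 8-tap (dot-product) horizontal paths. Each row
// gathers four overlapping 4-byte sample windows, one per output pixel, so a
// single vusdotq_lane_s32 applies four filter taps to four adjacent outputs.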
DECLARE_ALIGNED(16, static const uint8_t, kDotProdPermuteTbl[48]) = {
  0, 1, 2,  3,  1, 2,  3,  4,  2,  3,  4,  5,  3,  4,  5,  6,
  4, 5, 6,  7,  5, 6,  7,  8,  6,  7,  8,  9,  7,  8,  9,  10,
  8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14
};

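// Used with vqtbl2q_u8, which indexes a 32-byte table: indices 0-15 select
// bytes from samples_LUT.val[0] (the previous transposed block) and indices
// 16-31 from val[1] (the incoming block). Row one below, for example, turns
// an { s3, s4, s5, s6 } block plus an { s7, s8, s9, s10 } block into
// { s4, s5, s6, s7 }; rows two and three shift by two and three rows.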
DECLARE_ALIGNED(16, static const uint8_t, kDotProdMergeBlockTbl[48]) = {
  // Shift left and insert new last column in transposed 4x4 block.
  1, 2, 3, 16, 5, 6, 7, 20, 9, 10, 11, 24, 13, 14, 15, 28,
  // Shift left and insert two new columns in transposed 4x4 block.
  2, 3, 16, 17, 6, 7, 20, 21, 10, 11, 24, 25, 14, 15, 28, 29,
  // Shift left and insert three new columns in transposed 4x4 block.
  3, 16, 17, 18, 7, 20, 21, 22, 11, 24, 25, 26, 15, 28, 29, 30
};

static inline int16x4_t convolve8_4_h(const uint8x16_t samples,
                                      const int8x8_t filters,
                                      const uint8x16x2_t permute_tbl) {
  // Permute samples ready for dot product.
  // { 0,  1,  2,  3,  1,  2,  3,  4,  2,  3,  4,  5,  3,  4,  5,  6 }
  // { 4,  5,  6,  7,  5,  6,  7,  8,  6,  7,  8,  9,  7,  8,  9, 10 }
  uint8x16_t permuted_samples[2] = { vqtbl1q_u8(samples, permute_tbl.val[0]),
                                     vqtbl1q_u8(samples, permute_tbl.val[1]) };

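  // Each vusdotq_lane_s32 call multiplies every 4-byte group of unsigned
  // samples by four signed filter taps (lane 0 holds taps 0-3, lane 1 taps
  // 4-7), accumulating into the matching 32-bit lane; the two calls together
  // apply all eight taps to four adjacent output pixels.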
  int32x4_t sum =
      vusdotq_lane_s32(vdupq_n_s32(0), permuted_samples[0], filters, 0);
  sum = vusdotq_lane_s32(sum, permuted_samples[1], filters, 1);

  // Further narrowing and packing is performed by the caller.
  return vqmovn_s32(sum);
}

static inline uint8x8_t convolve8_8_h(const uint8x16_t samples,
                                      const int8x8_t filters,
                                      const uint8x16x3_t permute_tbl) {
  // Permute samples ready for dot product.
  // { 0,  1,  2,  3,  1,  2,  3,  4,  2,  3,  4,  5,  3,  4,  5,  6 }
  // { 4,  5,  6,  7,  5,  6,  7,  8,  6,  7,  8,  9,  7,  8,  9, 10 }
  // { 8,  9, 10, 11,  9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 }
  uint8x16_t permuted_samples[3] = { vqtbl1q_u8(samples, permute_tbl.val[0]),
                                     vqtbl1q_u8(samples, permute_tbl.val[1]),
                                     vqtbl1q_u8(samples, permute_tbl.val[2]) };

  // First 4 output values.
  int32x4_t sum0 =
      vusdotq_lane_s32(vdupq_n_s32(0), permuted_samples[0], filters, 0);
  sum0 = vusdotq_lane_s32(sum0, permuted_samples[1], filters, 1);
  // Second 4 output values.
  int32x4_t sum1 =
      vusdotq_lane_s32(vdupq_n_s32(0), permuted_samples[1], filters, 0);
  sum1 = vusdotq_lane_s32(sum1, permuted_samples[2], filters, 1);

  // Narrow and re-pack.
  int16x8_t sum = vcombine_s16(vqmovn_s32(sum0), vqmovn_s32(sum1));
  return vqrshrun_n_s16(sum, FILTER_BITS);
}

static inline void convolve8_horiz_8tap_neon_i8mm(
    const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
    ptrdiff_t dst_stride, const int16_t *filter_x, int w, int h) {
  const int8x8_t filter = vmovn_s16(vld1q_s16(filter_x));

  if (w == 4) {
    const uint8x16x2_t perm_tbl = vld1q_u8_x2(kDotProdPermuteTbl);
    do {
      uint8x16_t s0, s1, s2, s3;
      load_u8_16x4(src, src_stride, &s0, &s1, &s2, &s3);

      int16x4_t d0 = convolve8_4_h(s0, filter, perm_tbl);
      int16x4_t d1 = convolve8_4_h(s1, filter, perm_tbl);
      int16x4_t d2 = convolve8_4_h(s2, filter, perm_tbl);
      int16x4_t d3 = convolve8_4_h(s3, filter, perm_tbl);
      uint8x8_t d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS);
      uint8x8_t d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS);

      store_u8x4_strided_x2(dst + 0 * dst_stride, dst_stride, d01);
      store_u8x4_strided_x2(dst + 2 * dst_stride, dst_stride, d23);

      src += 4 * src_stride;
      dst += 4 * dst_stride;
      h -= 4;
    } while (h > 0);
  } else {
    const uint8x16x3_t perm_tbl = vld1q_u8_x3(kDotProdPermuteTbl);

    do {
      int width = w;
      const uint8_t *s = src;
      uint8_t *d = dst;
      do {
        uint8x16_t s0, s1, s2, s3;
        load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3);

        uint8x8_t d0 = convolve8_8_h(s0, filter, perm_tbl);
        uint8x8_t d1 = convolve8_8_h(s1, filter, perm_tbl);
        uint8x8_t d2 = convolve8_8_h(s2, filter, perm_tbl);
        uint8x8_t d3 = convolve8_8_h(s3, filter, perm_tbl);

        store_u8_8x4(d, dst_stride, d0, d1, d2, d3);

        s += 8;
        d += 8;
        width -= 8;
      } while (width != 0);
      src += 4 * src_stride;
      dst += 4 * dst_stride;
      h -= 4;
    } while (h > 0);
  }
}

static inline int16x4_t convolve6_4_h(const uint8x16_t samples,
                                      const int8x16_t filter,
                                      const uint8x16_t permute_tbl) {
  // Permute samples ready for matrix multiply.
  // { 0,  1,  2,  3,  4,  5,  6,  7,  2,  3,  4,  5,  6,  7,  8,  9 }
  uint8x16_t perm_samples = vqtbl1q_u8(samples, permute_tbl);

  // These instructions multiply a 2x8 matrix (samples) by an 8x2 matrix
  // (filter), destructively accumulating into the destination register.
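  // Concretely: with perm_samples holding rows A = { s0..s7 } and
  // B = { s2..s9 }, and the staggered filter rows F0 = { f0..f5, 0, 0 } and
  // F1 = { 0, f0..f5, 0 }, the 2x2 result is { A.F0, A.F1, B.F0, B.F1 },
  // i.e. the four convolution outputs for pixels 0-3 in order.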
  int32x4_t sum = vusmmlaq_s32(vdupq_n_s32(0), perm_samples, filter);

  // Further narrowing and packing is performed by the caller.
  return vmovn_s32(sum);
}

static inline uint8x8_t convolve6_8_h(const uint8x16_t samples,
                                      const int8x16_t filter,
                                      const uint8x16x2_t permute_tbl) {
  // Permute samples ready for matrix multiply.
  // { 0,  1,  2,  3,  4,  5,  6,  7,  2,  3,  4,  5,  6,  7,  8,  9 }
  // { 4,  5,  6,  7,  8,  9, 10, 11,  6,  7,  8,  9, 10, 11, 12, 13 }
  uint8x16_t perm_samples[2] = { vqtbl1q_u8(samples, permute_tbl.val[0]),
                                 vqtbl1q_u8(samples, permute_tbl.val[1]) };

  // These instructions multiply a 2x8 matrix (samples) by an 8x2 matrix
  // (filter), destructively accumulating into the destination register.
  int32x4_t sum0123 = vusmmlaq_s32(vdupq_n_s32(0), perm_samples[0], filter);
  int32x4_t sum4567 = vusmmlaq_s32(vdupq_n_s32(0), perm_samples[1], filter);

  // Narrow and re-pack.
  int16x8_t sum = vcombine_s16(vmovn_s32(sum0123), vmovn_s32(sum4567));
  // We halved the filter values, so subtract one from the right shift.
  return vqrshrun_n_s16(sum, FILTER_BITS - 1);
}

static inline void convolve8_horiz_6tap_neon_i8mm(
    const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
    ptrdiff_t dst_stride, const int16_t *filter_x, int width, int height) {
  // Filter values are even, so halve them to reduce intermediate precision
  // requirements.
  const int8x8_t x_filter = vshrn_n_s16(vld1q_s16(filter_x), 1);
  // Stagger the filter for use with the matrix multiply instructions.
  // { f0, f1, f2, f3, f4, f5,  0,  0,  0, f0, f1, f2, f3, f4, f5,  0 }
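  // vext_s8 rotates the 8-tap vector { 0, f0..f5, 0 } left by one byte to
  // give { f0..f5, 0, 0 }; combining it with the unrotated copy yields the
  // two filter rows expected as the 8x2 USMMLA operand.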
  const int8x16_t filter =
      vcombine_s8(vext_s8(x_filter, x_filter, 1), x_filter);

  if (width == 4) {
    const uint8x16_t perm_tbl = vld1q_u8(kMatMulPermuteTbl);
    do {
      uint8x16_t s0, s1, s2, s3;
      load_u8_16x4(src, src_stride, &s0, &s1, &s2, &s3);

      int16x4_t t0 = convolve6_4_h(s0, filter, perm_tbl);
      int16x4_t t1 = convolve6_4_h(s1, filter, perm_tbl);
      int16x4_t t2 = convolve6_4_h(s2, filter, perm_tbl);
      int16x4_t t3 = convolve6_4_h(s3, filter, perm_tbl);
      // We halved the filter values, so subtract one from the right shift.
      uint8x8_t d01 = vqrshrun_n_s16(vcombine_s16(t0, t1), FILTER_BITS - 1);
      uint8x8_t d23 = vqrshrun_n_s16(vcombine_s16(t2, t3), FILTER_BITS - 1);

      store_u8x4_strided_x2(dst + 0 * dst_stride, dst_stride, d01);
      store_u8x4_strided_x2(dst + 2 * dst_stride, dst_stride, d23);

      src += 4 * src_stride;
      dst += 4 * dst_stride;
      height -= 4;
    } while (height > 0);
  } else {
    const uint8x16x2_t perm_tbl = vld1q_u8_x2(kMatMulPermuteTbl);

    do {
      int w = width;
      const uint8_t *s = src;
      uint8_t *d = dst;
      do {
        uint8x16_t s0, s1, s2, s3;
        load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3);

        uint8x8_t d0 = convolve6_8_h(s0, filter, perm_tbl);
        uint8x8_t d1 = convolve6_8_h(s1, filter, perm_tbl);
        uint8x8_t d2 = convolve6_8_h(s2, filter, perm_tbl);
        uint8x8_t d3 = convolve6_8_h(s3, filter, perm_tbl);

        store_u8_8x4(d, dst_stride, d0, d1, d2, d3);

        s += 8;
        d += 8;
        w -= 8;
      } while (w != 0);
      src += 4 * src_stride;
      dst += 4 * dst_stride;
      height -= 4;
    } while (height > 0);
  }
}

void aom_convolve8_horiz_neon_i8mm(const uint8_t *src, ptrdiff_t src_stride,
                                   uint8_t *dst, ptrdiff_t dst_stride,
                                   const int16_t *filter_x, int x_step_q4,
                                   const int16_t *filter_y, int y_step_q4,
                                   int w, int h) {
  assert((intptr_t)dst % 4 == 0);
  assert(dst_stride % 4 == 0);

  (void)x_step_q4;
  (void)filter_y;
  (void)y_step_q4;

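  // The widest (8-tap) filter needs SUBPEL_TAPS / 2 - 1 = 3 columns to the
  // left of the output pixel; the narrower paths below add back the columns
  // they do not use (+3 for 2-tap, +1 for 6-tap).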
  src -= ((SUBPEL_TAPS / 2) - 1);

  int filter_taps = get_filter_taps_convolve8(filter_x);

  if (filter_taps == 2) {
    convolve8_horiz_2tap_neon(src + 3, src_stride, dst, dst_stride, filter_x, w,
                              h);
  } else if (filter_taps <= 6) {
    convolve8_horiz_6tap_neon_i8mm(src + 1, src_stride, dst, dst_stride,
                                   filter_x, w, h);
  } else {
    convolve8_horiz_8tap_neon_i8mm(src, src_stride, dst, dst_stride, filter_x,
                                   w, h);
  }
}

static inline void transpose_concat_4x4(uint8x8_t a0, uint8x8_t a1,
                                        uint8x8_t a2, uint8x8_t a3,
                                        uint8x16_t *b) {
  // Transpose 8-bit elements and concatenate result rows as follows:
  // a0: 00, 01, 02, 03, XX, XX, XX, XX
  // a1: 10, 11, 12, 13, XX, XX, XX, XX
  // a2: 20, 21, 22, 23, XX, XX, XX, XX
  // a3: 30, 31, 32, 33, XX, XX, XX, XX
  //
  // b: 00, 10, 20, 30, 01, 11, 21, 31, 02, 12, 22, 32, 03, 13, 23, 33

  uint8x16_t a0q = vcombine_u8(a0, vdup_n_u8(0));
  uint8x16_t a1q = vcombine_u8(a1, vdup_n_u8(0));
  uint8x16_t a2q = vcombine_u8(a2, vdup_n_u8(0));
  uint8x16_t a3q = vcombine_u8(a3, vdup_n_u8(0));

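  // Two interleave stages implement the 4x4 transpose: vzipq_u8 interleaves
  // bytes of adjacent rows, then vzipq_u16 interleaves the resulting 16-bit
  // pairs, leaving one transposed column in each 4-byte group.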
  uint8x16_t a01 = vzipq_u8(a0q, a1q).val[0];
  uint8x16_t a23 = vzipq_u8(a2q, a3q).val[0];

  uint16x8_t a0123 =
      vzipq_u16(vreinterpretq_u16_u8(a01), vreinterpretq_u16_u8(a23)).val[0];

  *b = vreinterpretq_u8_u16(a0123);
}

static inline void transpose_concat_8x4(uint8x8_t a0, uint8x8_t a1,
                                        uint8x8_t a2, uint8x8_t a3,
                                        uint8x16_t *b0, uint8x16_t *b1) {
  // Transpose 8-bit elements and concatenate result rows as follows:
  // a0: 00, 01, 02, 03, 04, 05, 06, 07
  // a1: 10, 11, 12, 13, 14, 15, 16, 17
  // a2: 20, 21, 22, 23, 24, 25, 26, 27
  // a3: 30, 31, 32, 33, 34, 35, 36, 37
  //
  // b0: 00, 10, 20, 30, 01, 11, 21, 31, 02, 12, 22, 32, 03, 13, 23, 33
  // b1: 04, 14, 24, 34, 05, 15, 25, 35, 06, 16, 26, 36, 07, 17, 27, 37

  uint8x16_t a0q = vcombine_u8(a0, vdup_n_u8(0));
  uint8x16_t a1q = vcombine_u8(a1, vdup_n_u8(0));
  uint8x16_t a2q = vcombine_u8(a2, vdup_n_u8(0));
  uint8x16_t a3q = vcombine_u8(a3, vdup_n_u8(0));

  uint8x16_t a01 = vzipq_u8(a0q, a1q).val[0];
  uint8x16_t a23 = vzipq_u8(a2q, a3q).val[0];

  uint16x8x2_t a0123 =
      vzipq_u16(vreinterpretq_u16_u8(a01), vreinterpretq_u16_u8(a23));

  *b0 = vreinterpretq_u8_u16(a0123.val[0]);
  *b1 = vreinterpretq_u8_u16(a0123.val[1]);
}

static inline int16x4_t convolve8_4_v(const uint8x16_t samples_lo,
                                      const uint8x16_t samples_hi,
                                      const int8x8_t filters) {
  // Sample permutation is performed by the caller.
  int32x4_t sum = vusdotq_lane_s32(vdupq_n_s32(0), samples_lo, filters, 0);
  sum = vusdotq_lane_s32(sum, samples_hi, filters, 1);

  // Further narrowing and packing is performed by the caller.
  return vqmovn_s32(sum);
}

static inline uint8x8_t convolve8_8_v(const uint8x16_t samples0_lo,
                                      const uint8x16_t samples0_hi,
                                      const uint8x16_t samples1_lo,
                                      const uint8x16_t samples1_hi,
                                      const int8x8_t filters) {
  // Sample permutation is performed by the caller.

  // First 4 output values.
  int32x4_t sum0 = vusdotq_lane_s32(vdupq_n_s32(0), samples0_lo, filters, 0);
  sum0 = vusdotq_lane_s32(sum0, samples0_hi, filters, 1);
  // Second 4 output values.
  int32x4_t sum1 = vusdotq_lane_s32(vdupq_n_s32(0), samples1_lo, filters, 0);
  sum1 = vusdotq_lane_s32(sum1, samples1_hi, filters, 1);

  // Narrow and re-pack.
  int16x8_t sum = vcombine_s16(vqmovn_s32(sum0), vqmovn_s32(sum1));
  return vqrshrun_n_s16(sum, FILTER_BITS);
}

static inline void convolve8_vert_8tap_neon_i8mm(
    const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
    ptrdiff_t dst_stride, const int16_t *filter_y, int w, int h) {
  const int8x8_t filter = vmovn_s16(vld1q_s16(filter_y));
  const uint8x16x3_t merge_block_tbl = vld1q_u8_x3(kDotProdMergeBlockTbl);
  uint8x16x2_t samples_LUT;

  if (w == 4) {
    uint8x8_t s0, s1, s2, s3, s4, s5, s6;
    load_u8_8x7(src, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6);
    src += 7 * src_stride;

    // This operation combines a conventional transpose and the sample permute
    // (see horizontal case) required before computing the dot product.
    uint8x16_t s0123, s1234, s2345, s3456;
    transpose_concat_4x4(s0, s1, s2, s3, &s0123);
    transpose_concat_4x4(s1, s2, s3, s4, &s1234);
    transpose_concat_4x4(s2, s3, s4, s5, &s2345);
    transpose_concat_4x4(s3, s4, s5, s6, &s3456);

    do {
      uint8x8_t s7, s8, s9, s10;
      load_u8_8x4(src, src_stride, &s7, &s8, &s9, &s10);

      uint8x16_t s4567, s5678, s6789, s78910;
      transpose_concat_4x4(s7, s8, s9, s10, &s78910);

      // Merge new data into block from previous iteration.
      samples_LUT.val[0] = s3456;
      samples_LUT.val[1] = s78910;
      s4567 = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[0]);
      s5678 = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[1]);
      s6789 = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[2]);
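      // Only the newest rows (s7..s10) needed a fresh transpose; s4567,
      // s5678 and s6789 are rebuilt from the previous block and the new
      // block purely with table lookups.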

      int16x4_t d0 = convolve8_4_v(s0123, s4567, filter);
      int16x4_t d1 = convolve8_4_v(s1234, s5678, filter);
      int16x4_t d2 = convolve8_4_v(s2345, s6789, filter);
      int16x4_t d3 = convolve8_4_v(s3456, s78910, filter);
      uint8x8_t d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS);
      uint8x8_t d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS);

      store_u8x4_strided_x2(dst + 0 * dst_stride, dst_stride, d01);
      store_u8x4_strided_x2(dst + 2 * dst_stride, dst_stride, d23);

      // Prepare block for next iteration - re-using as much as possible.
      // Shuffle everything up four rows.
      s0123 = s4567;
      s1234 = s5678;
      s2345 = s6789;
      s3456 = s78910;

      src += 4 * src_stride;
      dst += 4 * dst_stride;
      h -= 4;
    } while (h != 0);
  } else {
    do {
      int height = h;
      const uint8_t *s = src;
      uint8_t *d = dst;

      uint8x8_t s0, s1, s2, s3, s4, s5, s6;
      load_u8_8x7(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6);
      s += 7 * src_stride;

      // This operation combines a conventional transpose and the sample permute
      // (see horizontal case) required before computing the dot product.
      uint8x16_t s0123_lo, s0123_hi, s1234_lo, s1234_hi, s2345_lo, s2345_hi,
          s3456_lo, s3456_hi;
      transpose_concat_8x4(s0, s1, s2, s3, &s0123_lo, &s0123_hi);
      transpose_concat_8x4(s1, s2, s3, s4, &s1234_lo, &s1234_hi);
      transpose_concat_8x4(s2, s3, s4, s5, &s2345_lo, &s2345_hi);
      transpose_concat_8x4(s3, s4, s5, s6, &s3456_lo, &s3456_hi);

      do {
        uint8x8_t s7, s8, s9, s10;
        load_u8_8x4(s, src_stride, &s7, &s8, &s9, &s10);

        uint8x16_t s4567_lo, s4567_hi, s5678_lo, s5678_hi, s6789_lo, s6789_hi,
            s78910_lo, s78910_hi;
        transpose_concat_8x4(s7, s8, s9, s10, &s78910_lo, &s78910_hi);

        // Merge new data into block from previous iteration.
        samples_LUT.val[0] = s3456_lo;
        samples_LUT.val[1] = s78910_lo;
        s4567_lo = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[0]);
        s5678_lo = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[1]);
        s6789_lo = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[2]);

        samples_LUT.val[0] = s3456_hi;
        samples_LUT.val[1] = s78910_hi;
        s4567_hi = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[0]);
        s5678_hi = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[1]);
        s6789_hi = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[2]);

        uint8x8_t d0 =
            convolve8_8_v(s0123_lo, s4567_lo, s0123_hi, s4567_hi, filter);
        uint8x8_t d1 =
            convolve8_8_v(s1234_lo, s5678_lo, s1234_hi, s5678_hi, filter);
        uint8x8_t d2 =
            convolve8_8_v(s2345_lo, s6789_lo, s2345_hi, s6789_hi, filter);
        uint8x8_t d3 =
            convolve8_8_v(s3456_lo, s78910_lo, s3456_hi, s78910_hi, filter);

        store_u8_8x4(d, dst_stride, d0, d1, d2, d3);

        // Prepare block for next iteration - re-using as much as possible.
        // Shuffle everything up four rows.
        s0123_lo = s4567_lo;
        s0123_hi = s4567_hi;
        s1234_lo = s5678_lo;
        s1234_hi = s5678_hi;
        s2345_lo = s6789_lo;
        s2345_hi = s6789_hi;
        s3456_lo = s78910_lo;
        s3456_hi = s78910_hi;

        s += 4 * src_stride;
        d += 4 * dst_stride;
        height -= 4;
      } while (height != 0);
      src += 8;
      dst += 8;
      w -= 8;
    } while (w != 0);
  }
}

void aom_convolve8_vert_neon_i8mm(const uint8_t *src, ptrdiff_t src_stride,
                                  uint8_t *dst, ptrdiff_t dst_stride,
                                  const int16_t *filter_x, int x_step_q4,
                                  const int16_t *filter_y, int y_step_q4, int w,
                                  int h) {
  assert((intptr_t)dst % 4 == 0);
  assert(dst_stride % 4 == 0);

  (void)filter_x;
  (void)x_step_q4;
  (void)y_step_q4;

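  // As in the horizontal case, start at the first row of the widest (8-tap)
  // window; the 2- and 4-tap paths below skip ahead past the rows their
  // shorter filters do not use.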
  src -= ((SUBPEL_TAPS / 2) - 1) * src_stride;

  int filter_taps = get_filter_taps_convolve8(filter_y);

  if (filter_taps == 2) {
    convolve8_vert_2tap_neon(src + 3 * src_stride, src_stride, dst, dst_stride,
                             filter_y, w, h);
  } else if (filter_taps == 4) {
    convolve8_vert_4tap_neon(src + 2 * src_stride, src_stride, dst, dst_stride,
                             filter_y, w, h);
  } else {
    convolve8_vert_8tap_neon_i8mm(src, src_stride, dst, dst_stride, filter_y, w,
                                  h);
  }
}