/*
 * Copyright (c) 2014 The WebM project authors. All rights reserved.
 * Copyright (c) 2023, Alliance for Open Media. All rights reserved.
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */

#include <arm_neon.h>
#include <assert.h>
#include <string.h>

#include "config/aom_config.h"

#include "aom/aom_integer.h"
#include "aom_dsp/aom_dsp_common.h"
#include "aom_dsp/aom_filter.h"
#include "aom_dsp/arm/aom_convolve8_neon.h"
#include "aom_dsp/arm/aom_filter.h"
#include "aom_dsp/arm/mem_neon.h"
#include "aom_dsp/arm/transpose_neon.h"
#include "aom_ports/mem.h"

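// Permute indices for the usmmla (matrix-multiply) horizontal paths: each
// 16-byte row gathers two overlapping 8-sample windows so that a single
// matrix multiply produces adjacent output pixels.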
DECLARE_ALIGNED(16, static const uint8_t, kMatMulPermuteTbl[32]) = {
  // clang-format off
  0, 1, 2, 3, 4, 5, 6, 7, 2, 3, 4, 5, 6, 7, 8, 9,
  4, 5, 6, 7, 8, 9, 10, 11, 6, 7, 8, 9, 10, 11, 12, 13
  // clang-format on
};

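// Permute indices for the usdot horizontal paths: each 16-byte row gathers
// four overlapping 4-sample windows to feed the 4-way dot-product lanes.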
DECLARE_ALIGNED(16, static const uint8_t, kDotProdPermuteTbl[48]) = {
  0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6,
  4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10,
  8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14
};

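// Merge tables for the vertical paths: each row shifts a transposed 4x4 block
// left and appends columns from a second block (indices 16-31 select from the
// second register of a vqtbl2q_u8 lookup).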
DECLARE_ALIGNED(16, static const uint8_t, kDotProdMergeBlockTbl[48]) = {
  // Shift left and insert new last column in transposed 4x4 block.
  1, 2, 3, 16, 5, 6, 7, 20, 9, 10, 11, 24, 13, 14, 15, 28,
  // Shift left and insert two new columns in transposed 4x4 block.
  2, 3, 16, 17, 6, 7, 20, 21, 10, 11, 24, 25, 14, 15, 28, 29,
  // Shift left and insert three new columns in transposed 4x4 block.
  3, 16, 17, 18, 7, 20, 21, 22, 11, 24, 25, 26, 15, 28, 29, 30
};

static inline int16x4_t convolve8_4_h(const uint8x16_t samples,
                                      const int8x8_t filters,
                                      const uint8x16x2_t permute_tbl) {
  // Permute samples ready for dot product.
  // { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 }
  // { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 }
  uint8x16_t permuted_samples[2] = { vqtbl1q_u8(samples, permute_tbl.val[0]),
                                     vqtbl1q_u8(samples, permute_tbl.val[1]) };

  int32x4_t sum =
      vusdotq_lane_s32(vdupq_n_s32(0), permuted_samples[0], filters, 0);
  sum = vusdotq_lane_s32(sum, permuted_samples[1], filters, 1);

  // Further narrowing and packing is performed by the caller.
  return vqmovn_s32(sum);
}

static inline uint8x8_t convolve8_8_h(const uint8x16_t samples,
                                      const int8x8_t filters,
                                      const uint8x16x3_t permute_tbl) {
  // Permute samples ready for dot product.
  // { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 }
  // { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 }
  // { 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 }
  uint8x16_t permuted_samples[3] = { vqtbl1q_u8(samples, permute_tbl.val[0]),
                                     vqtbl1q_u8(samples, permute_tbl.val[1]),
                                     vqtbl1q_u8(samples, permute_tbl.val[2]) };

  // First 4 output values.
  int32x4_t sum0 =
      vusdotq_lane_s32(vdupq_n_s32(0), permuted_samples[0], filters, 0);
  sum0 = vusdotq_lane_s32(sum0, permuted_samples[1], filters, 1);
  // Second 4 output values.
  int32x4_t sum1 =
      vusdotq_lane_s32(vdupq_n_s32(0), permuted_samples[1], filters, 0);
  sum1 = vusdotq_lane_s32(sum1, permuted_samples[2], filters, 1);

  // Narrow and re-pack.
  int16x8_t sum = vcombine_s16(vqmovn_s32(sum0), vqmovn_s32(sum1));
  return vqrshrun_n_s16(sum, FILTER_BITS);
}

static inline void convolve8_horiz_8tap_neon_i8mm(
    const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
    ptrdiff_t dst_stride, const int16_t *filter_x, int w, int h) {
  const int8x8_t filter = vmovn_s16(vld1q_s16(filter_x));

  if (w == 4) {
    const uint8x16x2_t perm_tbl = vld1q_u8_x2(kDotProdPermuteTbl);
    do {
      uint8x16_t s0, s1, s2, s3;
      load_u8_16x4(src, src_stride, &s0, &s1, &s2, &s3);

      int16x4_t d0 = convolve8_4_h(s0, filter, perm_tbl);
      int16x4_t d1 = convolve8_4_h(s1, filter, perm_tbl);
      int16x4_t d2 = convolve8_4_h(s2, filter, perm_tbl);
      int16x4_t d3 = convolve8_4_h(s3, filter, perm_tbl);
      uint8x8_t d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS);
      uint8x8_t d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS);

      store_u8x4_strided_x2(dst + 0 * dst_stride, dst_stride, d01);
      store_u8x4_strided_x2(dst + 2 * dst_stride, dst_stride, d23);

      src += 4 * src_stride;
      dst += 4 * dst_stride;
      h -= 4;
    } while (h > 0);
  } else {
    const uint8x16x3_t perm_tbl = vld1q_u8_x3(kDotProdPermuteTbl);

    do {
      int width = w;
      const uint8_t *s = src;
      uint8_t *d = dst;
      do {
        uint8x16_t s0, s1, s2, s3;
        load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3);

        uint8x8_t d0 = convolve8_8_h(s0, filter, perm_tbl);
        uint8x8_t d1 = convolve8_8_h(s1, filter, perm_tbl);
        uint8x8_t d2 = convolve8_8_h(s2, filter, perm_tbl);
        uint8x8_t d3 = convolve8_8_h(s3, filter, perm_tbl);

        store_u8_8x4(d, dst_stride, d0, d1, d2, d3);

        s += 8;
        d += 8;
        width -= 8;
      } while (width != 0);
      src += 4 * src_stride;
      dst += 4 * dst_stride;
      h -= 4;
    } while (h > 0);
  }
}

static inline int16x4_t convolve6_4_h(const uint8x16_t samples,
                                      const int8x16_t filter,
                                      const uint8x16_t permute_tbl) {
  // Permute samples ready for matrix multiply.
  // { 0, 1, 2, 3, 4, 5, 6, 7, 2, 3, 4, 5, 6, 7, 8, 9 }
  uint8x16_t perm_samples = vqtbl1q_u8(samples, permute_tbl);

  // These instructions multiply a 2x8 matrix (samples) by an 8x2 matrix
  // (filter), destructively accumulating into the destination register.
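  // The resulting 2x2 s32 matrix holds the block's four adjacent output
  // pixels.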
  int32x4_t sum = vusmmlaq_s32(vdupq_n_s32(0), perm_samples, filter);

  // Further narrowing and packing is performed by the caller.
  return vmovn_s32(sum);
}

static inline uint8x8_t convolve6_8_h(const uint8x16_t samples,
                                      const int8x16_t filter,
                                      const uint8x16x2_t permute_tbl) {
  // Permute samples ready for matrix multiply.
  // { 0, 1, 2, 3, 4, 5, 6, 7, 2, 3, 4, 5, 6, 7, 8, 9 }
  // { 4, 5, 6, 7, 8, 9, 10, 11, 6, 7, 8, 9, 10, 11, 12, 13 }
  uint8x16_t perm_samples[2] = { vqtbl1q_u8(samples, permute_tbl.val[0]),
                                 vqtbl1q_u8(samples, permute_tbl.val[1]) };

  // These instructions multiply a 2x8 matrix (samples) by an 8x2 matrix
  // (filter), destructively accumulating into the destination register.
  int32x4_t sum0123 = vusmmlaq_s32(vdupq_n_s32(0), perm_samples[0], filter);
  int32x4_t sum4567 = vusmmlaq_s32(vdupq_n_s32(0), perm_samples[1], filter);

  // Narrow and re-pack.
  int16x8_t sum = vcombine_s16(vmovn_s32(sum0123), vmovn_s32(sum4567));
  // We halved the filter values so -1 from right shift.
  return vqrshrun_n_s16(sum, FILTER_BITS - 1);
}

static inline void convolve8_horiz_6tap_neon_i8mm(
    const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
    ptrdiff_t dst_stride, const int16_t *filter_x, int width, int height) {
  // Filter values are even, so halve to reduce intermediate precision reqs.
  const int8x8_t x_filter = vshrn_n_s16(vld1q_s16(filter_x), 1);
  // Stagger the filter for use with the matrix multiply instructions.
  // { f0, f1, f2, f3, f4, f5, 0, 0, 0, f0, f1, f2, f3, f4, f5, 0 }
  const int8x16_t filter =
      vcombine_s8(vext_s8(x_filter, x_filter, 1), x_filter);

  if (width == 4) {
    const uint8x16_t perm_tbl = vld1q_u8(kMatMulPermuteTbl);
    do {
      uint8x16_t s0, s1, s2, s3;
      load_u8_16x4(src, src_stride, &s0, &s1, &s2, &s3);

      int16x4_t t0 = convolve6_4_h(s0, filter, perm_tbl);
      int16x4_t t1 = convolve6_4_h(s1, filter, perm_tbl);
      int16x4_t t2 = convolve6_4_h(s2, filter, perm_tbl);
      int16x4_t t3 = convolve6_4_h(s3, filter, perm_tbl);
      // We halved the filter values so -1 from right shift.
      uint8x8_t d01 = vqrshrun_n_s16(vcombine_s16(t0, t1), FILTER_BITS - 1);
      uint8x8_t d23 = vqrshrun_n_s16(vcombine_s16(t2, t3), FILTER_BITS - 1);

      store_u8x4_strided_x2(dst + 0 * dst_stride, dst_stride, d01);
      store_u8x4_strided_x2(dst + 2 * dst_stride, dst_stride, d23);

      src += 4 * src_stride;
      dst += 4 * dst_stride;
      height -= 4;
    } while (height > 0);
  } else {
    const uint8x16x2_t perm_tbl = vld1q_u8_x2(kMatMulPermuteTbl);

    do {
      int w = width;
      const uint8_t *s = src;
      uint8_t *d = dst;
      do {
        uint8x16_t s0, s1, s2, s3;
        load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3);

        uint8x8_t d0 = convolve6_8_h(s0, filter, perm_tbl);
        uint8x8_t d1 = convolve6_8_h(s1, filter, perm_tbl);
        uint8x8_t d2 = convolve6_8_h(s2, filter, perm_tbl);
        uint8x8_t d3 = convolve6_8_h(s3, filter, perm_tbl);

        store_u8_8x4(d, dst_stride, d0, d1, d2, d3);

        s += 8;
        d += 8;
        w -= 8;
      } while (w != 0);
      src += 4 * src_stride;
      dst += 4 * dst_stride;
      height -= 4;
    } while (height > 0);
  }
}

void aom_convolve8_horiz_neon_i8mm(const uint8_t *src, ptrdiff_t src_stride,
                                   uint8_t *dst, ptrdiff_t dst_stride,
                                   const int16_t *filter_x, int x_step_q4,
                                   const int16_t *filter_y, int y_step_q4,
                                   int w, int h) {
  assert((intptr_t)dst % 4 == 0);
  assert(dst_stride % 4 == 0);

  (void)x_step_q4;
  (void)filter_y;
  (void)y_step_q4;

  src -= ((SUBPEL_TAPS / 2) - 1);

  int filter_taps = get_filter_taps_convolve8(filter_x);

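  // The shorter-kernel paths re-advance src because their non-zero taps sit
  // in the middle of the 8-tap coefficient array: offset 3 for the 2-tap
  // filter and offset 1 for the 6-tap (and 4-tap) filters.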
  if (filter_taps == 2) {
    convolve8_horiz_2tap_neon(src + 3, src_stride, dst, dst_stride, filter_x, w,
                              h);
  } else if (filter_taps <= 6) {
    convolve8_horiz_6tap_neon_i8mm(src + 1, src_stride, dst, dst_stride,
                                   filter_x, w, h);
  } else {
    convolve8_horiz_8tap_neon_i8mm(src, src_stride, dst, dst_stride, filter_x,
                                   w, h);
  }
}

static inline void transpose_concat_4x4(uint8x8_t a0, uint8x8_t a1,
                                        uint8x8_t a2, uint8x8_t a3,
                                        uint8x16_t *b) {
  // Transpose 8-bit elements and concatenate result rows as follows:
  // a0: 00, 01, 02, 03, XX, XX, XX, XX
  // a1: 10, 11, 12, 13, XX, XX, XX, XX
  // a2: 20, 21, 22, 23, XX, XX, XX, XX
  // a3: 30, 31, 32, 33, XX, XX, XX, XX
  //
  // b: 00, 10, 20, 30, 01, 11, 21, 31, 02, 12, 22, 32, 03, 13, 23, 33

  uint8x16_t a0q = vcombine_u8(a0, vdup_n_u8(0));
  uint8x16_t a1q = vcombine_u8(a1, vdup_n_u8(0));
  uint8x16_t a2q = vcombine_u8(a2, vdup_n_u8(0));
  uint8x16_t a3q = vcombine_u8(a3, vdup_n_u8(0));

  uint8x16_t a01 = vzipq_u8(a0q, a1q).val[0];
  uint8x16_t a23 = vzipq_u8(a2q, a3q).val[0];

  uint16x8_t a0123 =
      vzipq_u16(vreinterpretq_u16_u8(a01), vreinterpretq_u16_u8(a23)).val[0];

  *b = vreinterpretq_u8_u16(a0123);
}

static inline void transpose_concat_8x4(uint8x8_t a0, uint8x8_t a1,
                                        uint8x8_t a2, uint8x8_t a3,
                                        uint8x16_t *b0, uint8x16_t *b1) {
  // Transpose 8-bit elements and concatenate result rows as follows:
  // a0: 00, 01, 02, 03, 04, 05, 06, 07
  // a1: 10, 11, 12, 13, 14, 15, 16, 17
  // a2: 20, 21, 22, 23, 24, 25, 26, 27
  // a3: 30, 31, 32, 33, 34, 35, 36, 37
  //
  // b0: 00, 10, 20, 30, 01, 11, 21, 31, 02, 12, 22, 32, 03, 13, 23, 33
  // b1: 04, 14, 24, 34, 05, 15, 25, 35, 06, 16, 26, 36, 07, 17, 27, 37

  uint8x16_t a0q = vcombine_u8(a0, vdup_n_u8(0));
  uint8x16_t a1q = vcombine_u8(a1, vdup_n_u8(0));
  uint8x16_t a2q = vcombine_u8(a2, vdup_n_u8(0));
  uint8x16_t a3q = vcombine_u8(a3, vdup_n_u8(0));

  uint8x16_t a01 = vzipq_u8(a0q, a1q).val[0];
  uint8x16_t a23 = vzipq_u8(a2q, a3q).val[0];

  uint16x8x2_t a0123 =
      vzipq_u16(vreinterpretq_u16_u8(a01), vreinterpretq_u16_u8(a23));

  *b0 = vreinterpretq_u8_u16(a0123.val[0]);
  *b1 = vreinterpretq_u8_u16(a0123.val[1]);
}

static inline int16x4_t convolve8_4_v(const uint8x16_t samples_lo,
                                      const uint8x16_t samples_hi,
                                      const int8x8_t filters) {
  // Sample permutation is performed by the caller.
  int32x4_t sum = vusdotq_lane_s32(vdupq_n_s32(0), samples_lo, filters, 0);
  sum = vusdotq_lane_s32(sum, samples_hi, filters, 1);

  // Further narrowing and packing is performed by the caller.
  return vqmovn_s32(sum);
}

static inline uint8x8_t convolve8_8_v(const uint8x16_t samples0_lo,
                                      const uint8x16_t samples0_hi,
                                      const uint8x16_t samples1_lo,
                                      const uint8x16_t samples1_hi,
                                      const int8x8_t filters) {
  // Sample permutation is performed by the caller.

  // First 4 output values.
  int32x4_t sum0 = vusdotq_lane_s32(vdupq_n_s32(0), samples0_lo, filters, 0);
  sum0 = vusdotq_lane_s32(sum0, samples0_hi, filters, 1);
  // Second 4 output values.
  int32x4_t sum1 = vusdotq_lane_s32(vdupq_n_s32(0), samples1_lo, filters, 0);
  sum1 = vusdotq_lane_s32(sum1, samples1_hi, filters, 1);

  // Narrow and re-pack.
  int16x8_t sum = vcombine_s16(vqmovn_s32(sum0), vqmovn_s32(sum1));
  return vqrshrun_n_s16(sum, FILTER_BITS);
}

static inline void convolve8_vert_8tap_neon_i8mm(
    const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
    ptrdiff_t dst_stride, const int16_t *filter_y, int w, int h) {
  const int8x8_t filter = vmovn_s16(vld1q_s16(filter_y));
  const uint8x16x3_t merge_block_tbl = vld1q_u8_x3(kDotProdMergeBlockTbl);
  uint8x16x2_t samples_LUT;
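  // samples_LUT pairs the previously transposed block with the newly loaded
  // rows; vqtbl2q_u8 treats the pair as one 32-byte table, so the merge table
  // lookups below slide the sample window forward without a full
  // re-transpose.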

  if (w == 4) {
    uint8x8_t s0, s1, s2, s3, s4, s5, s6;
    load_u8_8x7(src, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6);
    src += 7 * src_stride;

    // This operation combines a conventional transpose and the sample permute
    // (see horizontal case) required before computing the dot product.
    uint8x16_t s0123, s1234, s2345, s3456;
    transpose_concat_4x4(s0, s1, s2, s3, &s0123);
    transpose_concat_4x4(s1, s2, s3, s4, &s1234);
    transpose_concat_4x4(s2, s3, s4, s5, &s2345);
    transpose_concat_4x4(s3, s4, s5, s6, &s3456);

    do {
      uint8x8_t s7, s8, s9, s10;
      load_u8_8x4(src, src_stride, &s7, &s8, &s9, &s10);

      uint8x16_t s4567, s5678, s6789, s78910;
      transpose_concat_4x4(s7, s8, s9, s10, &s78910);

      // Merge new data into block from previous iteration.
      samples_LUT.val[0] = s3456;
      samples_LUT.val[1] = s78910;
      s4567 = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[0]);
      s5678 = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[1]);
      s6789 = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[2]);

      int16x4_t d0 = convolve8_4_v(s0123, s4567, filter);
      int16x4_t d1 = convolve8_4_v(s1234, s5678, filter);
      int16x4_t d2 = convolve8_4_v(s2345, s6789, filter);
      int16x4_t d3 = convolve8_4_v(s3456, s78910, filter);
      uint8x8_t d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS);
      uint8x8_t d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS);

      store_u8x4_strided_x2(dst + 0 * dst_stride, dst_stride, d01);
      store_u8x4_strided_x2(dst + 2 * dst_stride, dst_stride, d23);

      // Prepare block for next iteration - re-using as much as possible.
      // Shuffle everything up four rows.
      s0123 = s4567;
      s1234 = s5678;
      s2345 = s6789;
      s3456 = s78910;

      src += 4 * src_stride;
      dst += 4 * dst_stride;
      h -= 4;
    } while (h != 0);
  } else {
    do {
      int height = h;
      const uint8_t *s = src;
      uint8_t *d = dst;

      uint8x8_t s0, s1, s2, s3, s4, s5, s6;
      load_u8_8x7(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6);
      s += 7 * src_stride;

      // This operation combines a conventional transpose and the sample
      // permute (see horizontal case) required before computing the dot
      // product.
      uint8x16_t s0123_lo, s0123_hi, s1234_lo, s1234_hi, s2345_lo, s2345_hi,
          s3456_lo, s3456_hi;
      transpose_concat_8x4(s0, s1, s2, s3, &s0123_lo, &s0123_hi);
      transpose_concat_8x4(s1, s2, s3, s4, &s1234_lo, &s1234_hi);
      transpose_concat_8x4(s2, s3, s4, s5, &s2345_lo, &s2345_hi);
      transpose_concat_8x4(s3, s4, s5, s6, &s3456_lo, &s3456_hi);

      do {
        uint8x8_t s7, s8, s9, s10;
        load_u8_8x4(s, src_stride, &s7, &s8, &s9, &s10);

        uint8x16_t s4567_lo, s4567_hi, s5678_lo, s5678_hi, s6789_lo, s6789_hi,
            s78910_lo, s78910_hi;
        transpose_concat_8x4(s7, s8, s9, s10, &s78910_lo, &s78910_hi);

        // Merge new data into block from previous iteration.
        samples_LUT.val[0] = s3456_lo;
        samples_LUT.val[1] = s78910_lo;
        s4567_lo = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[0]);
        s5678_lo = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[1]);
        s6789_lo = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[2]);

        samples_LUT.val[0] = s3456_hi;
        samples_LUT.val[1] = s78910_hi;
        s4567_hi = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[0]);
        s5678_hi = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[1]);
        s6789_hi = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[2]);

        uint8x8_t d0 =
            convolve8_8_v(s0123_lo, s4567_lo, s0123_hi, s4567_hi, filter);
        uint8x8_t d1 =
            convolve8_8_v(s1234_lo, s5678_lo, s1234_hi, s5678_hi, filter);
        uint8x8_t d2 =
            convolve8_8_v(s2345_lo, s6789_lo, s2345_hi, s6789_hi, filter);
        uint8x8_t d3 =
            convolve8_8_v(s3456_lo, s78910_lo, s3456_hi, s78910_hi, filter);

        store_u8_8x4(d, dst_stride, d0, d1, d2, d3);

        // Prepare block for next iteration - re-using as much as possible.
        // Shuffle everything up four rows.
        s0123_lo = s4567_lo;
        s0123_hi = s4567_hi;
        s1234_lo = s5678_lo;
        s1234_hi = s5678_hi;
        s2345_lo = s6789_lo;
        s2345_hi = s6789_hi;
        s3456_lo = s78910_lo;
        s3456_hi = s78910_hi;

        s += 4 * src_stride;
        d += 4 * dst_stride;
        height -= 4;
      } while (height != 0);
      src += 8;
      dst += 8;
      w -= 8;
    } while (w != 0);
  }
}

void aom_convolve8_vert_neon_i8mm(const uint8_t *src, ptrdiff_t src_stride,
                                  uint8_t *dst, ptrdiff_t dst_stride,
                                  const int16_t *filter_x, int x_step_q4,
                                  const int16_t *filter_y, int y_step_q4, int w,
                                  int h) {
  assert((intptr_t)dst % 4 == 0);
  assert(dst_stride % 4 == 0);

  (void)filter_x;
  (void)x_step_q4;
  (void)y_step_q4;

  src -= ((SUBPEL_TAPS / 2) - 1) * src_stride;

  int filter_taps = get_filter_taps_convolve8(filter_y);

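  // As in the horizontal case, the shorter-kernel paths re-advance src (by 3
  // rows for the 2-tap filter, 2 rows for the 4-tap filters) because their
  // non-zero taps are centred within the 8-tap coefficient array.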
  if (filter_taps == 2) {
    convolve8_vert_2tap_neon(src + 3 * src_stride, src_stride, dst, dst_stride,
                             filter_y, w, h);
  } else if (filter_taps == 4) {
    convolve8_vert_4tap_neon(src + 2 * src_stride, src_stride, dst, dst_stride,
                             filter_y, w, h);
  } else {
    convolve8_vert_8tap_neon_i8mm(src, src_stride, dst, dst_stride, filter_y, w,
                                  h);
  }
}