xref: /aosp_15_r20/external/libaom/av1/common/arm/highbd_convolve_sve2.c (revision 77c1e3ccc04c968bd2bc212e87364f250e820521)
1 /*
2  * Copyright (c) 2024, Alliance for Open Media. All rights reserved.
3  *
4  * This source code is subject to the terms of the BSD 2 Clause License and
5  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
6  * was not distributed with this source code in the LICENSE file, you can
7  * obtain it at www.aomedia.org/license/software. If the Alliance for Open
8  * Media Patent License 1.0 was not distributed with this source code in the
9  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
10  */
11 
12 #include <assert.h>
13 #include <arm_neon.h>
14 
15 #include "config/aom_config.h"
16 #include "config/av1_rtcd.h"
17 
18 #include "aom_dsp/aom_dsp_common.h"
19 #include "aom_dsp/arm/aom_neon_sve_bridge.h"
20 #include "aom_dsp/arm/aom_neon_sve2_bridge.h"
21 #include "aom_dsp/arm/mem_neon.h"
22 #include "aom_ports/mem.h"
23 #include "av1/common/convolve.h"
24 #include "av1/common/filter.h"
25 #include "av1/common/arm/highbd_convolve_sve2.h"
26 
27 DECLARE_ALIGNED(16, static const uint16_t, kDotProdTbl[32]) = {
28   0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6,
29   4, 5, 6, 7, 5, 6, 7, 0, 6, 7, 0, 1, 7, 0, 1, 2,
30 };
31 
convolve12_4_x(int16x8_t s0,int16x8_t s1,int16x8_t filter_0_7,int16x8_t filter_4_11,const int64x2_t offset,uint16x8x4_t permute_tbl,uint16x4_t max)32 static inline uint16x4_t convolve12_4_x(
33     int16x8_t s0, int16x8_t s1, int16x8_t filter_0_7, int16x8_t filter_4_11,
34     const int64x2_t offset, uint16x8x4_t permute_tbl, uint16x4_t max) {
35   int16x8_t permuted_samples[6];
36   permuted_samples[0] = aom_tbl_s16(s0, permute_tbl.val[0]);
37   permuted_samples[1] = aom_tbl_s16(s0, permute_tbl.val[1]);
38   permuted_samples[2] = aom_tbl2_s16(s0, s1, permute_tbl.val[2]);
39   permuted_samples[3] = aom_tbl2_s16(s0, s1, permute_tbl.val[3]);
40   permuted_samples[4] = aom_tbl_s16(s1, permute_tbl.val[0]);
41   permuted_samples[5] = aom_tbl_s16(s1, permute_tbl.val[1]);
42 
43   int64x2_t sum01 =
44       aom_svdot_lane_s16(offset, permuted_samples[0], filter_0_7, 0);
45   sum01 = aom_svdot_lane_s16(sum01, permuted_samples[2], filter_0_7, 1);
46   sum01 = aom_svdot_lane_s16(sum01, permuted_samples[4], filter_4_11, 1);
47 
48   int64x2_t sum23 =
49       aom_svdot_lane_s16(offset, permuted_samples[1], filter_0_7, 0);
50   sum23 = aom_svdot_lane_s16(sum23, permuted_samples[3], filter_0_7, 1);
51   sum23 = aom_svdot_lane_s16(sum23, permuted_samples[5], filter_4_11, 1);
52 
53   int32x4_t res0123 = vcombine_s32(vmovn_s64(sum01), vmovn_s64(sum23));
54   uint16x4_t res = vqrshrun_n_s32(res0123, FILTER_BITS);
55 
56   return vmin_u16(res, max);
57 }
58 
convolve12_8_x(int16x8_t s0,int16x8_t s1,int16x8_t s2,int16x8_t filter_0_7,int16x8_t filter_4_11,int64x2_t offset,uint16x8x4_t permute_tbl,uint16x8_t max)59 static inline uint16x8_t convolve12_8_x(int16x8_t s0, int16x8_t s1,
60                                         int16x8_t s2, int16x8_t filter_0_7,
61                                         int16x8_t filter_4_11, int64x2_t offset,
62                                         uint16x8x4_t permute_tbl,
63                                         uint16x8_t max) {
64   int16x8_t permuted_samples[8];
65   permuted_samples[0] = aom_tbl_s16(s0, permute_tbl.val[0]);
66   permuted_samples[1] = aom_tbl_s16(s0, permute_tbl.val[1]);
67   permuted_samples[2] = aom_tbl2_s16(s0, s1, permute_tbl.val[2]);
68   permuted_samples[3] = aom_tbl2_s16(s0, s1, permute_tbl.val[3]);
69   permuted_samples[4] = aom_tbl_s16(s1, permute_tbl.val[0]);
70   permuted_samples[5] = aom_tbl_s16(s1, permute_tbl.val[1]);
71   permuted_samples[6] = aom_tbl2_s16(s1, s2, permute_tbl.val[2]);
72   permuted_samples[7] = aom_tbl2_s16(s1, s2, permute_tbl.val[3]);
73 
74   int64x2_t sum01 =
75       aom_svdot_lane_s16(offset, permuted_samples[0], filter_0_7, 0);
76   sum01 = aom_svdot_lane_s16(sum01, permuted_samples[2], filter_0_7, 1);
77   sum01 = aom_svdot_lane_s16(sum01, permuted_samples[4], filter_4_11, 1);
78 
79   int64x2_t sum23 =
80       aom_svdot_lane_s16(offset, permuted_samples[1], filter_0_7, 0);
81   sum23 = aom_svdot_lane_s16(sum23, permuted_samples[3], filter_0_7, 1);
82   sum23 = aom_svdot_lane_s16(sum23, permuted_samples[5], filter_4_11, 1);
83 
84   int64x2_t sum45 =
85       aom_svdot_lane_s16(offset, permuted_samples[2], filter_0_7, 0);
86   sum45 = aom_svdot_lane_s16(sum45, permuted_samples[4], filter_0_7, 1);
87   sum45 = aom_svdot_lane_s16(sum45, permuted_samples[6], filter_4_11, 1);
88 
89   int64x2_t sum67 =
90       aom_svdot_lane_s16(offset, permuted_samples[3], filter_0_7, 0);
91   sum67 = aom_svdot_lane_s16(sum67, permuted_samples[5], filter_0_7, 1);
92   sum67 = aom_svdot_lane_s16(sum67, permuted_samples[7], filter_4_11, 1);
93 
94   int32x4_t sum0123 = vcombine_s32(vmovn_s64(sum01), vmovn_s64(sum23));
95   int32x4_t sum4567 = vcombine_s32(vmovn_s64(sum45), vmovn_s64(sum67));
96 
97   uint16x8_t res = vcombine_u16(vqrshrun_n_s32(sum0123, FILTER_BITS),
98                                 vqrshrun_n_s32(sum4567, FILTER_BITS));
99 
100   return vminq_u16(res, max);
101 }
102 
// Horizontal single-reference convolution with a 12-tap kernel.
//
// Processes the block four rows at a time: a dedicated path for width == 4,
// otherwise 8 columns per inner iteration. Both loops are do/while, so the
// caller must supply a height that is a positive multiple of 4 and a width
// that is either 4 or a positive multiple of 8.
//
//   src / dst      first input sample for the kernel / first output pixel.
//   y_filter_ptr   pointer to the 12 filter taps (the name is historical; the
//                  dispatcher passes the horizontal kernel).
//   conv_params    only round_0 is read.
//   bd             bit depth; outputs are clamped to (1 << bd) - 1.
static inline void highbd_convolve_x_sr_12tap_sve2(
    const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride,
    int width, int height, const int16_t *y_filter_ptr,
    ConvolveParams *conv_params, int bd) {
  const int16x8_t taps_0_7 = vld1q_s16(y_filter_ptr);
  const int16x8_t taps_4_11 = vld1q_s16(y_filter_ptr + 4);

  // This shim allows to do only one rounding shift instead of two: the
  // round_0 rounding constant is seeded into every accumulator.
  const int64x2_t round_offset = vdupq_n_s64(1 << (conv_params->round_0 - 1));

  uint16x8x4_t tbl = vld1q_u16_x4(kDotProdTbl);
  // Scale indices by size of the true vector length to avoid reading from an
  // 'undefined' portion of a vector on a system with SVE vectors > 128-bit.
  // Only the wrapped entries (lane 7 of row 2; lanes 2, 3, 5, 6, 7 of row 3)
  // receive the svcnth() offset.
  const uint16x8_t fixup_row2 = vreinterpretq_u16_u64(vcombine_u64(
      vdup_n_u64(0), vdup_n_u64(svcnth() * 0x0001000000000000ULL)));
  const uint16x8_t fixup_row3 = vreinterpretq_u16_u64(
      vcombine_u64(vdup_n_u64(svcnth() * 0x0001000100000000ULL),
                   vdup_n_u64(svcnth() * 0x0001000100010000ULL)));
  tbl.val[2] = vaddq_u16(tbl.val[2], fixup_row2);
  tbl.val[3] = vaddq_u16(tbl.val[3], fixup_row3);

  if (width == 4) {
    const uint16x4_t max = vdup_n_u16((1 << bd) - 1);
    const int16_t *s = (const int16_t *)src;

    do {
      // Two vectors (16 samples) per row for each of the four rows.
      int16x8_t row_lo[4], row_hi[4];
      load_s16_8x4(s, src_stride, &row_lo[0], &row_lo[1], &row_lo[2],
                   &row_lo[3]);
      load_s16_8x4(s + 8, src_stride, &row_hi[0], &row_hi[1], &row_hi[2],
                   &row_hi[3]);

      uint16x4_t out[4];
      for (int r = 0; r < 4; ++r) {
        out[r] = convolve12_4_x(row_lo[r], row_hi[r], taps_0_7, taps_4_11,
                                round_offset, tbl, max);
      }

      store_u16_4x4(dst, dst_stride, out[0], out[1], out[2], out[3]);

      s += 4 * src_stride;
      dst += 4 * dst_stride;
      height -= 4;
    } while (height != 0);
  } else {
    const uint16x8_t max = vdupq_n_u16((1 << bd) - 1);

    do {
      const int16_t *s = (const int16_t *)src;
      uint16_t *d = dst;
      int cols_left = width;

      do {
        // Three vectors (24 samples) per row for each of the four rows.
        int16x8_t row_a[4], row_b[4], row_c[4];
        load_s16_8x4(s, src_stride, &row_a[0], &row_a[1], &row_a[2],
                     &row_a[3]);
        load_s16_8x4(s + 8, src_stride, &row_b[0], &row_b[1], &row_b[2],
                     &row_b[3]);
        load_s16_8x4(s + 16, src_stride, &row_c[0], &row_c[1], &row_c[2],
                     &row_c[3]);

        uint16x8_t out[4];
        for (int r = 0; r < 4; ++r) {
          out[r] = convolve12_8_x(row_a[r], row_b[r], row_c[r], taps_0_7,
                                  taps_4_11, round_offset, tbl, max);
        }

        store_u16_8x4(d, dst_stride, out[0], out[1], out[2], out[3]);

        s += 8;
        d += 8;
        cols_left -= 8;
      } while (cols_left != 0);

      src += 4 * src_stride;
      dst += 4 * dst_stride;
      height -= 4;
    } while (height != 0);
  }
}
184 
convolve8_8_x(int16x8_t s0[8],int16x8_t filter,int64x2_t offset,uint16x8_t max)185 static inline uint16x8_t convolve8_8_x(int16x8_t s0[8], int16x8_t filter,
186                                        int64x2_t offset, uint16x8_t max) {
187   int64x2_t sum[8];
188   sum[0] = aom_sdotq_s16(offset, s0[0], filter);
189   sum[1] = aom_sdotq_s16(offset, s0[1], filter);
190   sum[2] = aom_sdotq_s16(offset, s0[2], filter);
191   sum[3] = aom_sdotq_s16(offset, s0[3], filter);
192   sum[4] = aom_sdotq_s16(offset, s0[4], filter);
193   sum[5] = aom_sdotq_s16(offset, s0[5], filter);
194   sum[6] = aom_sdotq_s16(offset, s0[6], filter);
195   sum[7] = aom_sdotq_s16(offset, s0[7], filter);
196 
197   sum[0] = vpaddq_s64(sum[0], sum[1]);
198   sum[2] = vpaddq_s64(sum[2], sum[3]);
199   sum[4] = vpaddq_s64(sum[4], sum[5]);
200   sum[6] = vpaddq_s64(sum[6], sum[7]);
201 
202   int32x4_t sum0123 = vcombine_s32(vmovn_s64(sum[0]), vmovn_s64(sum[2]));
203   int32x4_t sum4567 = vcombine_s32(vmovn_s64(sum[4]), vmovn_s64(sum[6]));
204 
205   uint16x8_t res = vcombine_u16(vqrshrun_n_s32(sum0123, FILTER_BITS),
206                                 vqrshrun_n_s32(sum4567, FILTER_BITS));
207 
208   return vminq_u16(res, max);
209 }
210 
// Horizontal single-reference convolution with an 8-tap kernel.
//
// Processes 8 columns by 4 rows per inner iteration. Both loops are do/while
// with no narrow-block path, so the caller must supply a width that is a
// positive multiple of 8 and a height that is a positive multiple of 4.
//
//   src / dst      first input sample for the kernel / first output pixel.
//   y_filter_ptr   pointer to the 8 filter taps (the name is historical; the
//                  dispatcher passes the horizontal kernel).
//   conv_params    only round_0 is read.
//   bd             bit depth; outputs are clamped to (1 << bd) - 1.
static inline void highbd_convolve_x_sr_8tap_sve2(
    const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride,
    int width, int height, const int16_t *y_filter_ptr,
    ConvolveParams *conv_params, int bd) {
  const uint16x8_t max = vdupq_n_u16((1 << bd) - 1);
  // This shim allows to do only one rounding shift instead of two.
  const int64_t offset = 1 << (conv_params->round_0 - 1);
  // The rounding constant lives in lane 0 only: convolve8_8_x adds the two
  // lanes of each accumulator together, so it must be counted once.
  // Built with vdup_n_s64 instead of a scalar-to-vector cast, which is a
  // compiler extension rather than an ACLE-defined conversion.
  const int64x2_t offset_lo = vcombine_s64(vdup_n_s64(offset), vdup_n_s64(0));

  const int16x8_t filter = vld1q_s16(y_filter_ptr);

  do {
    const int16_t *s = (const int16_t *)src;
    uint16_t *d = dst;
    int w = width;

    do {
      // For every row, load eight overlapping 8-sample windows starting at
      // consecutive sample positions (stride of one element).
      int16x8_t s0[8], s1[8], s2[8], s3[8];
      load_s16_8x8(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3],
                   &s0[4], &s0[5], &s0[6], &s0[7]);
      load_s16_8x8(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3],
                   &s1[4], &s1[5], &s1[6], &s1[7]);
      load_s16_8x8(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3],
                   &s2[4], &s2[5], &s2[6], &s2[7]);
      load_s16_8x8(s + 3 * src_stride, 1, &s3[0], &s3[1], &s3[2], &s3[3],
                   &s3[4], &s3[5], &s3[6], &s3[7]);

      uint16x8_t d0 = convolve8_8_x(s0, filter, offset_lo, max);
      uint16x8_t d1 = convolve8_8_x(s1, filter, offset_lo, max);
      uint16x8_t d2 = convolve8_8_x(s2, filter, offset_lo, max);
      uint16x8_t d3 = convolve8_8_x(s3, filter, offset_lo, max);

      store_u16_8x4(d, dst_stride, d0, d1, d2, d3);

      s += 8;
      d += 8;
      w -= 8;
    } while (w != 0);
    src += 4 * src_stride;
    dst += 4 * dst_stride;
    height -= 4;
  } while (height != 0);
}
254 
255 // clang-format off
256 DECLARE_ALIGNED(16, static const uint16_t, kDeinterleaveTbl[8]) = {
257   0, 2, 4, 6, 1, 3, 5, 7,
258 };
259 // clang-format on
260 
convolve4_4_x(int16x8_t s0,int16x8_t filter,int64x2_t offset,uint16x8x2_t permute_tbl,uint16x4_t max)261 static inline uint16x4_t convolve4_4_x(int16x8_t s0, int16x8_t filter,
262                                        int64x2_t offset,
263                                        uint16x8x2_t permute_tbl,
264                                        uint16x4_t max) {
265   int16x8_t permuted_samples0 = aom_tbl_s16(s0, permute_tbl.val[0]);
266   int16x8_t permuted_samples1 = aom_tbl_s16(s0, permute_tbl.val[1]);
267 
268   int64x2_t sum01 = aom_svdot_lane_s16(offset, permuted_samples0, filter, 0);
269   int64x2_t sum23 = aom_svdot_lane_s16(offset, permuted_samples1, filter, 0);
270 
271   int32x4_t sum0123 = vcombine_s32(vmovn_s64(sum01), vmovn_s64(sum23));
272   uint16x4_t res = vqrshrun_n_s32(sum0123, FILTER_BITS);
273 
274   return vmin_u16(res, max);
275 }
276 
convolve4_8_x(int16x8_t s0[4],int16x8_t filter,int64x2_t offset,uint16x8_t tbl,uint16x8_t max)277 static inline uint16x8_t convolve4_8_x(int16x8_t s0[4], int16x8_t filter,
278                                        int64x2_t offset, uint16x8_t tbl,
279                                        uint16x8_t max) {
280   int64x2_t sum04 = aom_svdot_lane_s16(offset, s0[0], filter, 0);
281   int64x2_t sum15 = aom_svdot_lane_s16(offset, s0[1], filter, 0);
282   int64x2_t sum26 = aom_svdot_lane_s16(offset, s0[2], filter, 0);
283   int64x2_t sum37 = aom_svdot_lane_s16(offset, s0[3], filter, 0);
284 
285   int32x4_t sum0415 = vcombine_s32(vmovn_s64(sum04), vmovn_s64(sum15));
286   int32x4_t sum2637 = vcombine_s32(vmovn_s64(sum26), vmovn_s64(sum37));
287 
288   uint16x8_t res = vcombine_u16(vqrshrun_n_s32(sum0415, FILTER_BITS),
289                                 vqrshrun_n_s32(sum2637, FILTER_BITS));
290   res = aom_tbl_u16(res, tbl);
291 
292   return vminq_u16(res, max);
293 }
294 
// Horizontal single-reference convolution with a 4-tap kernel.
//
// Processes four rows at a time: a dedicated path for width == 4, otherwise
// 8 columns per inner iteration. Both loops are do/while, so the caller must
// supply a height that is a positive multiple of 4 and a width that is either
// 4 or a positive multiple of 8.
//
//   src / dst      first input sample for the kernel / first output pixel.
//   x_filter_ptr   start of the 8-entry kernel; only entries 2..5 are read.
//   conv_params    only round_0 is read.
//   bd             bit depth; outputs are clamped to (1 << bd) - 1.
static inline void highbd_convolve_x_sr_4tap_sve2(
    const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride,
    int width, int height, const int16_t *x_filter_ptr,
    ConvolveParams *conv_params, int bd) {
  // Middle four taps in the low half, zeros in the high half.
  const int16x8_t filter =
      vcombine_s16(vld1_s16(x_filter_ptr + 2), vdup_n_s16(0));

  // This shim allows to do only one rounding shift instead of two.
  const int64x2_t round_offset = vdupq_n_s64(1 << (conv_params->round_0 - 1));

  if (width == 4) {
    const uint16x4_t max = vdup_n_u16((1 << bd) - 1);
    const uint16x8x2_t permute_tbl = vld1q_u16_x2(kDotProdTbl);
    const int16_t *s = (const int16_t *)(src);

    do {
      // One 8-sample vector per row.
      int16x8_t row[4];
      load_s16_8x4(s, src_stride, &row[0], &row[1], &row[2], &row[3]);

      uint16x4_t out[4];
      for (int r = 0; r < 4; ++r) {
        out[r] = convolve4_4_x(row[r], filter, round_offset, permute_tbl, max);
      }

      store_u16_4x4(dst, dst_stride, out[0], out[1], out[2], out[3]);

      s += 4 * src_stride;
      dst += 4 * dst_stride;
      height -= 4;
    } while (height != 0);
  } else {
    const uint16x8_t max = vdupq_n_u16((1 << bd) - 1);
    const uint16x8_t deinterleave = vld1q_u16(kDeinterleaveTbl);

    do {
      const int16_t *s = (const int16_t *)(src);
      uint16_t *d = dst;
      int cols_left = width;

      do {
        uint16x8_t out[4];
        for (int r = 0; r < 4; ++r) {
          // Four overlapping 8-sample windows starting at consecutive
          // sample positions of row r (stride of one element).
          int16x8_t win[4];
          load_s16_8x4(s + r * src_stride, 1, &win[0], &win[1], &win[2],
                       &win[3]);
          out[r] = convolve4_8_x(win, filter, round_offset, deinterleave, max);
        }

        store_u16_8x4(d, dst_stride, out[0], out[1], out[2], out[3]);

        s += 8;
        d += 8;
        cols_left -= 8;
      } while (cols_left != 0);

      src += 4 * src_stride;
      dst += 4 * dst_stride;
      height -= 4;
    } while (height != 0);
  }
}
359 
// SVE2 entry point for the high-bitdepth horizontal single-reference
// convolution. Picks an implementation from the block size and the tap count
// reported by get_filter_tap():
//   - w == 2 or h == 2      -> C reference implementation
//   - 6 taps                -> Neon implementation
//   - 12 taps / 8 taps      -> dedicated SVE2 kernels in this file
//   - anything else         -> SVE2 4-tap kernel
void av1_highbd_convolve_x_sr_sve2(const uint16_t *src, int src_stride,
                                   uint16_t *dst, int dst_stride, int w, int h,
                                   const InterpFilterParams *filter_params_x,
                                   const int subpel_x_qn,
                                   ConvolveParams *conv_params, int bd) {
  // The vector kernels below work on blocks of at least 4x4.
  if (w == 2 || h == 2) {
    av1_highbd_convolve_x_sr_c(src, src_stride, dst, dst_stride, w, h,
                               filter_params_x, subpel_x_qn, conv_params, bd);
    return;
  }

  const int x_filter_taps = get_filter_tap(filter_params_x, subpel_x_qn);

  // The Neon routine receives the original, un-offset source pointer.
  if (x_filter_taps == 6) {
    av1_highbd_convolve_x_sr_neon(src, src_stride, dst, dst_stride, w, h,
                                  filter_params_x, subpel_x_qn, conv_params,
                                  bd);
    return;
  }

  const int16_t *x_filter_ptr = av1_get_interp_filter_subpel_kernel(
      filter_params_x, subpel_x_qn & SUBPEL_MASK);

  // Step back to the first sample covered by the filter footprint.
  const int horiz_offset = filter_params_x->taps / 2 - 1;
  src -= horiz_offset;

  switch (x_filter_taps) {
    case 12:
      highbd_convolve_x_sr_12tap_sve2(src, src_stride, dst, dst_stride, w, h,
                                      x_filter_ptr, conv_params, bd);
      break;
    case 8:
      highbd_convolve_x_sr_8tap_sve2(src, src_stride, dst, dst_stride, w, h,
                                     x_filter_ptr, conv_params, bd);
      break;
    default:
      // The 4-tap kernel reads kernel entries 2..5, so the source pointer is
      // advanced by the same two samples.
      highbd_convolve_x_sr_4tap_sve2(src + 2, src_stride, dst, dst_stride, w,
                                     h, x_filter_ptr, conv_params, bd);
      break;
  }
}
401 
highbd_convolve12_4_y(int16x8_t s0[2],int16x8_t s1[2],int16x8_t s2[2],int16x8_t filter_0_7,int16x8_t filter_4_11,uint16x4_t max)402 static inline uint16x4_t highbd_convolve12_4_y(int16x8_t s0[2], int16x8_t s1[2],
403                                                int16x8_t s2[2],
404                                                int16x8_t filter_0_7,
405                                                int16x8_t filter_4_11,
406                                                uint16x4_t max) {
407   int64x2_t sum[2];
408 
409   sum[0] = aom_svdot_lane_s16(vdupq_n_s64(0), s0[0], filter_0_7, 0);
410   sum[0] = aom_svdot_lane_s16(sum[0], s1[0], filter_0_7, 1);
411   sum[0] = aom_svdot_lane_s16(sum[0], s2[0], filter_4_11, 1);
412 
413   sum[1] = aom_svdot_lane_s16(vdupq_n_s64(0), s0[1], filter_0_7, 0);
414   sum[1] = aom_svdot_lane_s16(sum[1], s1[1], filter_0_7, 1);
415   sum[1] = aom_svdot_lane_s16(sum[1], s2[1], filter_4_11, 1);
416 
417   int32x4_t res_s32 = vcombine_s32(vmovn_s64(sum[0]), vmovn_s64(sum[1]));
418 
419   uint16x4_t res = vqrshrun_n_s32(res_s32, FILTER_BITS);
420 
421   return vmin_u16(res, max);
422 }
423 
highbd_convolve_y_sr_12tap_sve2(const uint16_t * src,int src_stride,uint16_t * dst,int dst_stride,int width,int height,const int16_t * y_filter_ptr,int bd)424 static inline void highbd_convolve_y_sr_12tap_sve2(
425     const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride,
426     int width, int height, const int16_t *y_filter_ptr, int bd) {
427   const int16x8_t y_filter_0_7 = vld1q_s16(y_filter_ptr);
428   const int16x8_t y_filter_4_11 = vld1q_s16(y_filter_ptr + 4);
429 
430   uint16x8x3_t merge_block_tbl = vld1q_u16_x3(kDotProdMergeBlockTbl);
431   // Scale indices by size of the true vector length to avoid reading from an
432   // 'undefined' portion of a vector on a system with SVE vectors > 128-bit.
433   uint16x8_t correction0 =
434       vreinterpretq_u16_u64(vdupq_n_u64(svcnth() * 0x0001000000000000ULL));
435   merge_block_tbl.val[0] = vaddq_u16(merge_block_tbl.val[0], correction0);
436 
437   uint16x8_t correction1 =
438       vreinterpretq_u16_u64(vdupq_n_u64(svcnth() * 0x0001000100000000ULL));
439   merge_block_tbl.val[1] = vaddq_u16(merge_block_tbl.val[1], correction1);
440 
441   uint16x8_t correction2 =
442       vreinterpretq_u16_u64(vdupq_n_u64(svcnth() * 0x0001000100010000ULL));
443   merge_block_tbl.val[2] = vaddq_u16(merge_block_tbl.val[2], correction2);
444 
445   const uint16x4_t max = vdup_n_u16((1 << bd) - 1);
446 
447   do {
448     int16_t *s = (int16_t *)src;
449     uint16_t *d = dst;
450     int h = height;
451 
452     int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA;
453     load_s16_4x11(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7, &s8,
454                   &s9, &sA);
455     s += 11 * src_stride;
456 
457     int16x8_t s0123[2], s1234[2], s2345[2], s3456[2], s4567[2], s5678[2],
458         s6789[2], s789A[2];
459     transpose_concat_4x4(s0, s1, s2, s3, s0123);
460     transpose_concat_4x4(s1, s2, s3, s4, s1234);
461     transpose_concat_4x4(s2, s3, s4, s5, s2345);
462     transpose_concat_4x4(s3, s4, s5, s6, s3456);
463     transpose_concat_4x4(s4, s5, s6, s7, s4567);
464     transpose_concat_4x4(s5, s6, s7, s8, s5678);
465     transpose_concat_4x4(s6, s7, s8, s9, s6789);
466     transpose_concat_4x4(s7, s8, s9, sA, s789A);
467 
468     do {
469       int16x4_t sB, sC, sD, sE;
470       load_s16_4x4(s, src_stride, &sB, &sC, &sD, &sE);
471 
472       int16x8_t s89AB[2], s9ABC[2], sABCD[2], sBCDE[2];
473       transpose_concat_4x4(sB, sC, sD, sE, sBCDE);
474 
475       // Use the above transpose and reuse data from the previous loop to get
476       // the rest.
477       aom_tbl2x2_s16(s789A, sBCDE, merge_block_tbl.val[0], s89AB);
478       aom_tbl2x2_s16(s789A, sBCDE, merge_block_tbl.val[1], s9ABC);
479       aom_tbl2x2_s16(s789A, sBCDE, merge_block_tbl.val[2], sABCD);
480 
481       uint16x4_t d0 = highbd_convolve12_4_y(s0123, s4567, s89AB, y_filter_0_7,
482                                             y_filter_4_11, max);
483       uint16x4_t d1 = highbd_convolve12_4_y(s1234, s5678, s9ABC, y_filter_0_7,
484                                             y_filter_4_11, max);
485       uint16x4_t d2 = highbd_convolve12_4_y(s2345, s6789, sABCD, y_filter_0_7,
486                                             y_filter_4_11, max);
487       uint16x4_t d3 = highbd_convolve12_4_y(s3456, s789A, sBCDE, y_filter_0_7,
488                                             y_filter_4_11, max);
489 
490       store_u16_4x4(d, dst_stride, d0, d1, d2, d3);
491 
492       // Prepare block for next iteration - re-using as much as possible.
493       // Shuffle everything up four rows.
494       s0123[0] = s4567[0];
495       s0123[1] = s4567[1];
496       s1234[0] = s5678[0];
497       s1234[1] = s5678[1];
498       s2345[0] = s6789[0];
499       s2345[1] = s6789[1];
500       s3456[0] = s789A[0];
501       s3456[1] = s789A[1];
502       s4567[0] = s89AB[0];
503       s4567[1] = s89AB[1];
504       s5678[0] = s9ABC[0];
505       s5678[1] = s9ABC[1];
506       s6789[0] = sABCD[0];
507       s6789[1] = sABCD[1];
508       s789A[0] = sBCDE[0];
509       s789A[1] = sBCDE[1];
510 
511       s += 4 * src_stride;
512       d += 4 * dst_stride;
513       h -= 4;
514     } while (h != 0);
515     src += 4;
516     dst += 4;
517     width -= 4;
518   } while (width != 0);
519 }
520 
highbd_convolve8_4_y(int16x8_t samples_lo[2],int16x8_t samples_hi[2],int16x8_t filter,uint16x4_t max)521 static inline uint16x4_t highbd_convolve8_4_y(int16x8_t samples_lo[2],
522                                               int16x8_t samples_hi[2],
523                                               int16x8_t filter,
524                                               uint16x4_t max) {
525   int64x2_t sum01 =
526       aom_svdot_lane_s16(vdupq_n_s64(0), samples_lo[0], filter, 0);
527   sum01 = aom_svdot_lane_s16(sum01, samples_hi[0], filter, 1);
528 
529   int64x2_t sum23 =
530       aom_svdot_lane_s16(vdupq_n_s64(0), samples_lo[1], filter, 0);
531   sum23 = aom_svdot_lane_s16(sum23, samples_hi[1], filter, 1);
532 
533   int32x4_t sum0123 = vcombine_s32(vmovn_s64(sum01), vmovn_s64(sum23));
534   uint16x4_t res = vqrshrun_n_s32(sum0123, FILTER_BITS);
535   return vmin_u16(res, max);
536 }
537 
highbd_convolve8_8_y(int16x8_t samples_lo[4],int16x8_t samples_hi[4],int16x8_t filter,uint16x8_t max)538 static inline uint16x8_t highbd_convolve8_8_y(int16x8_t samples_lo[4],
539                                               int16x8_t samples_hi[4],
540                                               int16x8_t filter,
541                                               uint16x8_t max) {
542   int64x2_t sum01 =
543       aom_svdot_lane_s16(vdupq_n_s64(0), samples_lo[0], filter, 0);
544   sum01 = aom_svdot_lane_s16(sum01, samples_hi[0], filter, 1);
545 
546   int64x2_t sum23 =
547       aom_svdot_lane_s16(vdupq_n_s64(0), samples_lo[1], filter, 0);
548   sum23 = aom_svdot_lane_s16(sum23, samples_hi[1], filter, 1);
549 
550   int64x2_t sum45 =
551       aom_svdot_lane_s16(vdupq_n_s64(0), samples_lo[2], filter, 0);
552   sum45 = aom_svdot_lane_s16(sum45, samples_hi[2], filter, 1);
553 
554   int64x2_t sum67 =
555       aom_svdot_lane_s16(vdupq_n_s64(0), samples_lo[3], filter, 0);
556   sum67 = aom_svdot_lane_s16(sum67, samples_hi[3], filter, 1);
557 
558   int32x4_t sum0123 = vcombine_s32(vmovn_s64(sum01), vmovn_s64(sum23));
559   int32x4_t sum4567 = vcombine_s32(vmovn_s64(sum45), vmovn_s64(sum67));
560   uint16x8_t res = vcombine_u16(vqrshrun_n_s32(sum0123, FILTER_BITS),
561                                 vqrshrun_n_s32(sum4567, FILTER_BITS));
562   return vminq_u16(res, max);
563 }
564 
highbd_convolve_y_sr_8tap_sve2(const uint16_t * src,ptrdiff_t src_stride,uint16_t * dst,ptrdiff_t dst_stride,int width,int height,const int16_t * filter_y,int bd)565 static void highbd_convolve_y_sr_8tap_sve2(const uint16_t *src,
566                                            ptrdiff_t src_stride, uint16_t *dst,
567                                            ptrdiff_t dst_stride, int width,
568                                            int height, const int16_t *filter_y,
569                                            int bd) {
570   assert(width >= 4 && height >= 4);
571 
572   const int16x8_t y_filter = vld1q_s16(filter_y);
573 
574   uint16x8x3_t merge_block_tbl = vld1q_u16_x3(kDotProdMergeBlockTbl);
575   // Scale indices by size of the true vector length to avoid reading from an
576   // 'undefined' portion of a vector on a system with SVE vectors > 128-bit.
577   uint16x8_t correction0 =
578       vreinterpretq_u16_u64(vdupq_n_u64(svcnth() * 0x0001000000000000ULL));
579   merge_block_tbl.val[0] = vaddq_u16(merge_block_tbl.val[0], correction0);
580 
581   uint16x8_t correction1 =
582       vreinterpretq_u16_u64(vdupq_n_u64(svcnth() * 0x0001000100000000ULL));
583   merge_block_tbl.val[1] = vaddq_u16(merge_block_tbl.val[1], correction1);
584 
585   uint16x8_t correction2 =
586       vreinterpretq_u16_u64(vdupq_n_u64(svcnth() * 0x0001000100010000ULL));
587   merge_block_tbl.val[2] = vaddq_u16(merge_block_tbl.val[2], correction2);
588 
589   if (width == 4) {
590     const uint16x4_t max = vdup_n_u16((1 << bd) - 1);
591     int16_t *s = (int16_t *)src;
592 
593     int16x4_t s0, s1, s2, s3, s4, s5, s6;
594     load_s16_4x7(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6);
595     s += 7 * src_stride;
596 
597     // This operation combines a conventional transpose and the sample permute
598     // required before computing the dot product.
599     int16x8_t s0123[2], s1234[2], s2345[2], s3456[2];
600     transpose_concat_4x4(s0, s1, s2, s3, s0123);
601     transpose_concat_4x4(s1, s2, s3, s4, s1234);
602     transpose_concat_4x4(s2, s3, s4, s5, s2345);
603     transpose_concat_4x4(s3, s4, s5, s6, s3456);
604 
605     do {
606       int16x4_t s7, s8, s9, s10;
607       load_s16_4x4(s, src_stride, &s7, &s8, &s9, &s10);
608 
609       int16x8_t s4567[2], s5678[2], s6789[2], s789A[2];
610       // Transpose and shuffle the 4 lines that were loaded.
611       transpose_concat_4x4(s7, s8, s9, s10, s789A);
612 
613       // Merge new data into block from previous iteration.
614       aom_tbl2x2_s16(s3456, s789A, merge_block_tbl.val[0], s4567);
615       aom_tbl2x2_s16(s3456, s789A, merge_block_tbl.val[1], s5678);
616       aom_tbl2x2_s16(s3456, s789A, merge_block_tbl.val[2], s6789);
617 
618       uint16x4_t d0 = highbd_convolve8_4_y(s0123, s4567, y_filter, max);
619       uint16x4_t d1 = highbd_convolve8_4_y(s1234, s5678, y_filter, max);
620       uint16x4_t d2 = highbd_convolve8_4_y(s2345, s6789, y_filter, max);
621       uint16x4_t d3 = highbd_convolve8_4_y(s3456, s789A, y_filter, max);
622 
623       store_u16_4x4(dst, dst_stride, d0, d1, d2, d3);
624 
625       // Prepare block for next iteration - re-using as much as possible.
626       // Shuffle everything up four rows.
627       s0123[0] = s4567[0];
628       s0123[1] = s4567[1];
629       s1234[0] = s5678[0];
630       s1234[1] = s5678[1];
631       s2345[0] = s6789[0];
632       s2345[1] = s6789[1];
633       s3456[0] = s789A[0];
634       s3456[1] = s789A[1];
635       s += 4 * src_stride;
636       dst += 4 * dst_stride;
637       height -= 4;
638     } while (height != 0);
639   } else {
640     const uint16x8_t max = vdupq_n_u16((1 << bd) - 1);
641 
642     do {
643       int h = height;
644       int16_t *s = (int16_t *)src;
645       uint16_t *d = dst;
646 
647       int16x8_t s0, s1, s2, s3, s4, s5, s6;
648       load_s16_8x7(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6);
649       s += 7 * src_stride;
650 
651       // This operation combines a conventional transpose and the sample permute
652       // required before computing the dot product.
653       int16x8_t s0123[4], s1234[4], s2345[4], s3456[4];
654       transpose_concat_8x4(s0, s1, s2, s3, s0123);
655       transpose_concat_8x4(s1, s2, s3, s4, s1234);
656       transpose_concat_8x4(s2, s3, s4, s5, s2345);
657       transpose_concat_8x4(s3, s4, s5, s6, s3456);
658 
659       do {
660         int16x8_t s7, s8, s9, s10;
661         load_s16_8x4(s, src_stride, &s7, &s8, &s9, &s10);
662 
663         int16x8_t s4567[4], s5678[4], s6789[4], s789A[4];
664         // Transpose and shuffle the 4 lines that were loaded.
665         transpose_concat_8x4(s7, s8, s9, s10, s789A);
666 
667         // Merge new data into block from previous iteration.
668         aom_tbl2x4_s16(s3456, s789A, merge_block_tbl.val[0], s4567);
669         aom_tbl2x4_s16(s3456, s789A, merge_block_tbl.val[1], s5678);
670         aom_tbl2x4_s16(s3456, s789A, merge_block_tbl.val[2], s6789);
671 
672         uint16x8_t d0 = highbd_convolve8_8_y(s0123, s4567, y_filter, max);
673         uint16x8_t d1 = highbd_convolve8_8_y(s1234, s5678, y_filter, max);
674         uint16x8_t d2 = highbd_convolve8_8_y(s2345, s6789, y_filter, max);
675         uint16x8_t d3 = highbd_convolve8_8_y(s3456, s789A, y_filter, max);
676 
677         store_u16_8x4(d, dst_stride, d0, d1, d2, d3);
678 
679         // Prepare block for next iteration - re-using as much as possible.
680         // Shuffle everything up four rows.
681         s0123[0] = s4567[0];
682         s0123[1] = s4567[1];
683         s0123[2] = s4567[2];
684         s0123[3] = s4567[3];
685         s1234[0] = s5678[0];
686         s1234[1] = s5678[1];
687         s1234[2] = s5678[2];
688         s1234[3] = s5678[3];
689         s2345[0] = s6789[0];
690         s2345[1] = s6789[1];
691         s2345[2] = s6789[2];
692         s2345[3] = s6789[3];
693         s3456[0] = s789A[0];
694         s3456[1] = s789A[1];
695         s3456[2] = s789A[2];
696         s3456[3] = s789A[3];
697 
698         s += 4 * src_stride;
699         d += 4 * dst_stride;
700         h -= 4;
701       } while (h != 0);
702       src += 8;
703       dst += 8;
704       width -= 8;
705     } while (width != 0);
706   }
707 }
708 
// Vertical 4-tap filter producing four output pixels.
//
// Each element of samples is dotted against lane 0 of filter, starting from a
// zero accumulator. The 64-bit sums are narrowed to 32 bits, rounded and
// shifted right by FILTER_BITS with unsigned saturation, then limited to max.
static inline uint16x4_t highbd_convolve4_4_y(int16x8_t samples[2],
                                              int16x8_t filter,
                                              uint16x4_t max) {
  const int64x2_t zero = vdupq_n_s64(0);
  const int64x2_t acc_lo = aom_svdot_lane_s16(zero, samples[0], filter, 0);
  const int64x2_t acc_hi = aom_svdot_lane_s16(zero, samples[1], filter, 0);

  const int32x4_t acc = vcombine_s32(vmovn_s64(acc_lo), vmovn_s64(acc_hi));
  return vmin_u16(vqrshrun_n_s32(acc, FILTER_BITS), max);
}
719 
// Vertical 4-tap filter producing eight output pixels.
//
// The four elements of samples are each dotted against lane 0 of filter with
// a zero accumulator. The narrowed sums are rounded and shifted right by
// FILTER_BITS with unsigned saturation, and the result is limited to max.
static inline uint16x8_t highbd_convolve4_8_y(int16x8_t samples[4],
                                              int16x8_t filter,
                                              uint16x8_t max) {
  const int64x2_t zero = vdupq_n_s64(0);

  int32x2_t narrowed[4];
  for (int i = 0; i < 4; ++i) {
    narrowed[i] = vmovn_s64(aom_svdot_lane_s16(zero, samples[i], filter, 0));
  }

  const int32x4_t acc_lo = vcombine_s32(narrowed[0], narrowed[1]);
  const int32x4_t acc_hi = vcombine_s32(narrowed[2], narrowed[3]);
  const uint16x8_t out = vcombine_u16(vqrshrun_n_s32(acc_lo, FILTER_BITS),
                                      vqrshrun_n_s32(acc_hi, FILTER_BITS));
  return vminq_u16(out, max);
}
734 
// Vertical single-reference convolution using the four taps stored at
// filter_y[2..5].
//
// Output is written four rows per iteration. A width of 4 is processed with
// 64-bit vectors; any other width is processed in strips of eight columns.
// Results are limited to (1 << bd) - 1.
static void highbd_convolve_y_sr_4tap_sve2(const uint16_t *src,
                                           ptrdiff_t src_stride, uint16_t *dst,
                                           ptrdiff_t dst_stride, int width,
                                           int height, const int16_t *filter_y,
                                           int bd) {
  assert(width >= 4 && height >= 4);

  // The taps fill the low half of the vector; the high half is zero.
  const int16x4_t taps = vld1_s16(filter_y + 2);
  const int16x8_t y_filter = vcombine_s16(taps, vdup_n_s16(0));

  if (width == 4) {
    const uint16x4_t max = vdup_n_u16((1 << bd) - 1);
    int16_t *in = (int16_t *)src;

    // rows[0..2] are carried between iterations; rows[3..6] are reloaded.
    int16x4_t rows[7];
    load_s16_4x3(in, src_stride, &rows[0], &rows[1], &rows[2]);
    in += 3 * src_stride;

    do {
      load_s16_4x4(in, src_stride, &rows[3], &rows[4], &rows[5], &rows[6]);

      uint16x4_t out[4];
      for (int i = 0; i < 4; ++i) {
        // Combined transpose and sample permute needed ahead of the dot
        // product, applied to the four rows feeding output row i.
        int16x8_t block[2];
        transpose_concat_4x4(rows[i], rows[i + 1], rows[i + 2], rows[i + 3],
                             block);
        out[i] = highbd_convolve4_4_y(block, y_filter, max);
      }

      store_u16_4x4(dst, dst_stride, out[0], out[1], out[2], out[3]);

      // Slide the window down by four rows.
      rows[0] = rows[4];
      rows[1] = rows[5];
      rows[2] = rows[6];

      in += 4 * src_stride;
      dst += 4 * dst_stride;
      height -= 4;
    } while (height != 0);
    return;
  }

  const uint16x8_t max = vdupq_n_u16((1 << bd) - 1);

  do {
    int rows_left = height;
    int16_t *in = (int16_t *)src;
    uint16_t *out_ptr = dst;

    // rows[0..2] are carried between iterations; rows[3..6] are reloaded.
    int16x8_t rows[7];
    load_s16_8x3(in, src_stride, &rows[0], &rows[1], &rows[2]);
    in += 3 * src_stride;

    do {
      load_s16_8x4(in, src_stride, &rows[3], &rows[4], &rows[5], &rows[6]);

      uint16x8_t out[4];
      for (int i = 0; i < 4; ++i) {
        // Combined transpose and sample permute needed ahead of the dot
        // product, applied to the four rows feeding output row i.
        int16x8_t block[4];
        transpose_concat_8x4(rows[i], rows[i + 1], rows[i + 2], rows[i + 3],
                             block);
        out[i] = highbd_convolve4_8_y(block, y_filter, max);
      }

      store_u16_8x4(out_ptr, dst_stride, out[0], out[1], out[2], out[3]);

      // Slide the window down by four rows.
      rows[0] = rows[4];
      rows[1] = rows[5];
      rows[2] = rows[6];

      in += 4 * src_stride;
      out_ptr += 4 * dst_stride;
      rows_left -= 4;
    } while (rows_left != 0);

    // Advance to the next strip of eight columns.
    src += 8;
    dst += 8;
    width -= 8;
  } while (width != 0);
}
827 
av1_highbd_convolve_y_sr_sve2(const uint16_t * src,int src_stride,uint16_t * dst,int dst_stride,int w,int h,const InterpFilterParams * filter_params_y,const int subpel_y_qn,int bd)828 void av1_highbd_convolve_y_sr_sve2(const uint16_t *src, int src_stride,
829                                    uint16_t *dst, int dst_stride, int w, int h,
830                                    const InterpFilterParams *filter_params_y,
831                                    const int subpel_y_qn, int bd) {
832   if (w == 2 || h == 2) {
833     av1_highbd_convolve_y_sr_c(src, src_stride, dst, dst_stride, w, h,
834                                filter_params_y, subpel_y_qn, bd);
835     return;
836   }
837   const int y_filter_taps = get_filter_tap(filter_params_y, subpel_y_qn);
838 
839   if (y_filter_taps == 6) {
840     av1_highbd_convolve_y_sr_neon(src, src_stride, dst, dst_stride, w, h,
841                                   filter_params_y, subpel_y_qn, bd);
842     return;
843   }
844 
845   const int vert_offset = filter_params_y->taps / 2 - 1;
846   const int16_t *y_filter_ptr = av1_get_interp_filter_subpel_kernel(
847       filter_params_y, subpel_y_qn & SUBPEL_MASK);
848 
849   src -= vert_offset * src_stride;
850 
851   if (y_filter_taps > 8) {
852     highbd_convolve_y_sr_12tap_sve2(src, src_stride, dst, dst_stride, w, h,
853                                     y_filter_ptr, bd);
854     return;
855   }
856 
857   if (y_filter_taps == 4) {
858     highbd_convolve_y_sr_4tap_sve2(src + 2 * src_stride, src_stride, dst,
859                                    dst_stride, w, h, y_filter_ptr, bd);
860     return;
861   }
862 
863   highbd_convolve_y_sr_8tap_sve2(src, src_stride, dst, dst_stride, w, h,
864                                  y_filter_ptr, bd);
865 }
866 
// Horizontal 12-tap filter for the first pass of the 2D convolution,
// producing four output pixels.
//
// s0 and s1 are permuted with the four index vectors in permute_tbl. Each
// accumulator starts at offset and receives three dot products: lanes 0 and 1
// of filter_0_7, then lane 1 of filter_4_11. The narrowed sums go through a
// saturating rounding shift by shift and are saturated to unsigned 16-bit.
static inline uint16x4_t convolve12_4_2d_h(
    int16x8_t s0, int16x8_t s1, int16x8_t filter_0_7, int16x8_t filter_4_11,
    const int64x2_t offset, int32x4_t shift, uint16x8x4_t permute_tbl) {
  int16x8_t perm[6];
  perm[0] = aom_tbl_s16(s0, permute_tbl.val[0]);
  perm[1] = aom_tbl_s16(s0, permute_tbl.val[1]);
  perm[2] = aom_tbl2_s16(s0, s1, permute_tbl.val[2]);
  perm[3] = aom_tbl2_s16(s0, s1, permute_tbl.val[3]);
  perm[4] = aom_tbl_s16(s1, permute_tbl.val[0]);
  perm[5] = aom_tbl_s16(s1, permute_tbl.val[1]);

  // acc[j] combines perm[j], perm[j + 2] and perm[j + 4].
  int64x2_t acc[2];
  for (int j = 0; j < 2; ++j) {
    acc[j] = aom_svdot_lane_s16(offset, perm[j], filter_0_7, 0);
    acc[j] = aom_svdot_lane_s16(acc[j], perm[j + 2], filter_0_7, 1);
    acc[j] = aom_svdot_lane_s16(acc[j], perm[j + 4], filter_4_11, 1);
  }

  const int32x4_t sum = vcombine_s32(vmovn_s64(acc[0]), vmovn_s64(acc[1]));
  return vqmovun_s32(vqrshlq_s32(sum, shift));
}
892 
// Horizontal 12-tap filter for the first pass of the 2D convolution,
// producing eight output pixels.
//
// s0, s1 and s2 are permuted with the four index vectors in permute_tbl. Each
// of the four accumulators starts at offset and receives three dot products:
// lanes 0 and 1 of filter_0_7, then lane 1 of filter_4_11. The narrowed sums
// go through a saturating rounding shift by shift and are saturated to
// unsigned 16-bit.
static inline uint16x8_t convolve12_8_2d_h(int16x8_t s0, int16x8_t s1,
                                           int16x8_t s2, int16x8_t filter_0_7,
                                           int16x8_t filter_4_11,
                                           int64x2_t offset, int32x4_t shift,
                                           uint16x8x4_t permute_tbl) {
  int16x8_t perm[8];
  perm[0] = aom_tbl_s16(s0, permute_tbl.val[0]);
  perm[1] = aom_tbl_s16(s0, permute_tbl.val[1]);
  perm[2] = aom_tbl2_s16(s0, s1, permute_tbl.val[2]);
  perm[3] = aom_tbl2_s16(s0, s1, permute_tbl.val[3]);
  perm[4] = aom_tbl_s16(s1, permute_tbl.val[0]);
  perm[5] = aom_tbl_s16(s1, permute_tbl.val[1]);
  perm[6] = aom_tbl2_s16(s1, s2, permute_tbl.val[2]);
  perm[7] = aom_tbl2_s16(s1, s2, permute_tbl.val[3]);

  // acc[j] combines perm[j], perm[j + 2] and perm[j + 4].
  int64x2_t acc[4];
  for (int j = 0; j < 4; ++j) {
    acc[j] = aom_svdot_lane_s16(offset, perm[j], filter_0_7, 0);
    acc[j] = aom_svdot_lane_s16(acc[j], perm[j + 2], filter_0_7, 1);
    acc[j] = aom_svdot_lane_s16(acc[j], perm[j + 4], filter_4_11, 1);
  }

  const int32x4_t sum_lo = vcombine_s32(vmovn_s64(acc[0]), vmovn_s64(acc[1]));
  const int32x4_t sum_hi = vcombine_s32(vmovn_s64(acc[2]), vmovn_s64(acc[3]));

  return vcombine_u16(vqmovun_s32(vqrshlq_s32(sum_lo, shift)),
                      vqmovun_s32(vqrshlq_s32(sum_hi, shift)));
}
936 
// Horizontal 12-tap pass of the 2D single-reference convolution.
//
// Four rows are processed per iteration. Every accumulator is seeded with
// x_offset, and results are shifted right (with rounding) by
// conv_params->round_0. The taps are read from y_filter_ptr[0..11]. The row
// loops run while height, reduced by four each pass, is still positive.
static inline void highbd_convolve_2d_sr_horiz_12tap_sve2(
    const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride,
    int width, int height, const int16_t *y_filter_ptr,
    ConvolveParams *conv_params, const int x_offset) {
  const int64x2_t offset = vdupq_n_s64(x_offset);
  const int32x4_t shift = vdupq_n_s32(-conv_params->round_0);

  const int16x8_t taps_0_7 = vld1q_s16(y_filter_ptr);
  const int16x8_t taps_4_11 = vld1q_s16(y_filter_ptr + 4);

  uint16x8x4_t permute_tbl = vld1q_u16_x4(kDotProdTbl);

  // Indices that select from the second table vector are scaled by the true
  // vector length so that they never address the 'undefined' part of a vector
  // on systems where SVE vectors are wider than 128 bits.
  const uint64_t lanes = svcnth();
  const uint64x1_t fix2_lo = vdup_n_u64(0);
  const uint64x1_t fix2_hi = vdup_n_u64(lanes * 0x0001000000000000ULL);
  permute_tbl.val[2] =
      vaddq_u16(permute_tbl.val[2],
                vreinterpretq_u16_u64(vcombine_u64(fix2_lo, fix2_hi)));

  const uint64x1_t fix3_lo = vdup_n_u64(lanes * 0x0001000100000000ULL);
  const uint64x1_t fix3_hi = vdup_n_u64(lanes * 0x0001000100010000ULL);
  permute_tbl.val[3] =
      vaddq_u16(permute_tbl.val[3],
                vreinterpretq_u16_u64(vcombine_u64(fix3_lo, fix3_hi)));

  if (width == 4) {
    const int16_t *in = (const int16_t *)src;

    do {
      // lo[] is loaded from the row starts, hi[] from eight elements on.
      int16x8_t lo[4], hi[4];
      load_s16_8x4(in, src_stride, &lo[0], &lo[1], &lo[2], &lo[3]);
      load_s16_8x4(in + 8, src_stride, &hi[0], &hi[1], &hi[2], &hi[3]);

      uint16x4_t out[4];
      for (int i = 0; i < 4; ++i) {
        out[i] = convolve12_4_2d_h(lo[i], hi[i], taps_0_7, taps_4_11, offset,
                                   shift, permute_tbl);
      }

      store_u16_4x4(dst, dst_stride, out[0], out[1], out[2], out[3]);

      dst += 4 * dst_stride;
      in += 4 * src_stride;
      height -= 4;
    } while (height > 0);
    return;
  }

  do {
    const int16_t *in = (const int16_t *)src;
    uint16_t *out_ptr = dst;
    int cols_left = width;

    do {
      // a[], b[] and c[] are loaded at element offsets 0, 8 and 16.
      int16x8_t a[4], b[4], c[4];
      load_s16_8x4(in, src_stride, &a[0], &a[1], &a[2], &a[3]);
      load_s16_8x4(in + 8, src_stride, &b[0], &b[1], &b[2], &b[3]);
      load_s16_8x4(in + 16, src_stride, &c[0], &c[1], &c[2], &c[3]);

      uint16x8_t out[4];
      for (int i = 0; i < 4; ++i) {
        out[i] = convolve12_8_2d_h(a[i], b[i], c[i], taps_0_7, taps_4_11,
                                   offset, shift, permute_tbl);
      }

      store_u16_8x4(out_ptr, dst_stride, out[0], out[1], out[2], out[3]);

      in += 8;
      out_ptr += 8;
      cols_left -= 8;
    } while (cols_left != 0);

    src += 4 * src_stride;
    dst += 4 * dst_stride;
    height -= 4;
  } while (height > 0);
}
1019 
// Horizontal 8-tap filter for the first pass of the 2D convolution,
// producing eight output pixels.
//
// Each of the eight input vectors is dotted with filter on top of offset.
// Adjacent results are pairwise-added, narrowed to 32 bits, passed through a
// saturating rounding shift by shift and saturated to unsigned 16-bit.
static inline uint16x8_t convolve8_8_2d_h(int16x8_t s0[8], int16x8_t filter,
                                          int64x2_t offset, int32x4_t shift) {
  int64x2_t dot[8];
  for (int i = 0; i < 8; ++i) {
    dot[i] = aom_sdotq_s16(offset, s0[i], filter);
  }

  const int64x2_t pair01 = vpaddq_s64(dot[0], dot[1]);
  const int64x2_t pair23 = vpaddq_s64(dot[2], dot[3]);
  const int64x2_t pair45 = vpaddq_s64(dot[4], dot[5]);
  const int64x2_t pair67 = vpaddq_s64(dot[6], dot[7]);

  const int32x4_t sum_lo = vcombine_s32(vmovn_s64(pair01), vmovn_s64(pair23));
  const int32x4_t sum_hi = vcombine_s32(vmovn_s64(pair45), vmovn_s64(pair67));

  return vcombine_u16(vqmovun_s32(vqrshlq_s32(sum_lo, shift)),
                      vqmovun_s32(vqrshlq_s32(sum_hi, shift)));
}
1045 
// Horizontal 8-tap pass of the 2D single-reference convolution.
//
// Works on tiles of eight columns by four rows. The accumulator seed holds
// x_offset in its low 64-bit lane only; the high lane is zero.
// Results are shifted right (with rounding) by conv_params->round_0. The row
// loop runs while height, reduced by four each pass, is still positive.
static inline void highbd_convolve_2d_sr_horiz_8tap_sve2(
    const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride,
    int width, int height, const int16_t *y_filter_ptr,
    ConvolveParams *conv_params, const int x_offset) {
  const int64x2_t offset_lo =
      vcombine_s64(vdup_n_s64(x_offset), vdup_n_s64(0));
  const int32x4_t shift = vdupq_n_s32(-conv_params->round_0);
  const int16x8_t filter = vld1q_s16(y_filter_ptr);

  do {
    const int16_t *in = (const int16_t *)src;
    uint16_t *out_ptr = dst;
    int cols_left = width;

    do {
      uint16x8_t out[4];
      for (int r = 0; r < 4; ++r) {
        // Eight loads from row r, each one element further along than the
        // last (the load helper is called with a stride of 1).
        int16x8_t win[8];
        load_s16_8x8(in + r * src_stride, 1, &win[0], &win[1], &win[2],
                     &win[3], &win[4], &win[5], &win[6], &win[7]);
        out[r] = convolve8_8_2d_h(win, filter, offset_lo, shift);
      }

      store_u16_8x4(out_ptr, dst_stride, out[0], out[1], out[2], out[3]);

      in += 8;
      out_ptr += 8;
      cols_left -= 8;
    } while (cols_left != 0);

    src += 4 * src_stride;
    dst += 4 * dst_stride;
    height -= 4;
  } while (height > 0);
}
1088 
// Horizontal 4-tap filter for the first pass of the 2D convolution,
// producing four output pixels.
//
// s0 is permuted with both index vectors of permute_tbl. Each permuted vector
// is dotted with lane 0 of filter on top of offset. The narrowed sums go
// through a saturating rounding shift by shift and are saturated to unsigned
// 16-bit.
static inline uint16x4_t convolve4_4_2d_h(int16x8_t s0, int16x8_t filter,
                                          int64x2_t offset, int32x4_t shift,
                                          uint16x8x2_t permute_tbl) {
  const int16x8_t perm_lo = aom_tbl_s16(s0, permute_tbl.val[0]);
  const int16x8_t perm_hi = aom_tbl_s16(s0, permute_tbl.val[1]);

  const int64x2_t acc_lo = aom_svdot_lane_s16(offset, perm_lo, filter, 0);
  const int64x2_t acc_hi = aom_svdot_lane_s16(offset, perm_hi, filter, 0);

  const int32x4_t sum = vcombine_s32(vmovn_s64(acc_lo), vmovn_s64(acc_hi));
  return vqmovun_s32(vqrshlq_s32(sum, shift));
}
1102 
// Horizontal 4-tap filter for the first pass of the 2D convolution,
// producing eight output pixels.
//
// Only s0[0..3] are read. Each is dotted with lane 0 of filter on top of
// offset. The narrowed sums go through a saturating rounding shift by shift
// and are saturated to unsigned 16-bit. The combined vector is finally
// reordered with the index vector tbl.
static inline uint16x8_t convolve4_8_2d_h(int16x8_t s0[8], int16x8_t filter,
                                          int64x2_t offset, int32x4_t shift,
                                          uint16x8_t tbl) {
  int32x2_t narrowed[4];
  for (int i = 0; i < 4; ++i) {
    narrowed[i] = vmovn_s64(aom_svdot_lane_s16(offset, s0[i], filter, 0));
  }

  const int32x4_t sum_lo = vcombine_s32(narrowed[0], narrowed[1]);
  const int32x4_t sum_hi = vcombine_s32(narrowed[2], narrowed[3]);

  const uint16x8_t packed =
      vcombine_u16(vqmovun_s32(vqrshlq_s32(sum_lo, shift)),
                   vqmovun_s32(vqrshlq_s32(sum_hi, shift)));
  return aom_tbl_u16(packed, tbl);
}
1120 
// Horizontal 4-tap pass of the 2D single-reference convolution.
//
// The taps are read from x_filter_ptr[2..5] and zero-extended to a full
// vector. Four rows are processed per iteration. Accumulators are seeded with
// x_offset and results are shifted right (with rounding) by
// conv_params->round_0. The row loops run while height, reduced by four each
// pass, is still positive.
static inline void highbd_convolve_2d_sr_horiz_4tap_sve2(
    const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride,
    int width, int height, const int16_t *x_filter_ptr,
    ConvolveParams *conv_params, const int x_offset) {
  const int64x2_t offset = vdupq_n_s64(x_offset);
  const int32x4_t shift = vdupq_n_s32(-conv_params->round_0);

  const int16x8_t filter =
      vcombine_s16(vld1_s16(x_filter_ptr + 2), vdup_n_s16(0));

  if (width == 4) {
    const int16_t *in = (const int16_t *)(src);
    const uint16x8x2_t permute_tbl = vld1q_u16_x2(kDotProdTbl);

    do {
      int16x8_t row[4];
      load_s16_8x4(in, src_stride, &row[0], &row[1], &row[2], &row[3]);

      uint16x4_t out[4];
      for (int i = 0; i < 4; ++i) {
        out[i] = convolve4_4_2d_h(row[i], filter, offset, shift, permute_tbl);
      }

      store_u16_4x4(dst, dst_stride, out[0], out[1], out[2], out[3]);

      in += 4 * src_stride;
      dst += 4 * dst_stride;
      height -= 4;
    } while (height > 0);
    return;
  }

  const uint16x8_t idx = vld1q_u16(kDeinterleaveTbl);

  do {
    const int16_t *in = (const int16_t *)(src);
    uint16_t *out_ptr = dst;
    int cols_left = width;

    do {
      uint16x8_t out[4];
      for (int r = 0; r < 4; ++r) {
        // Eight loads from row r, each one element further along than the
        // last (the load helper is called with a stride of 1).
        int16x8_t win[8];
        load_s16_8x8(in + r * src_stride, 1, &win[0], &win[1], &win[2],
                     &win[3], &win[4], &win[5], &win[6], &win[7]);
        out[r] = convolve4_8_2d_h(win, filter, offset, shift, idx);
      }

      store_u16_8x4(out_ptr, dst_stride, out[0], out[1], out[2], out[3]);

      in += 8;
      out_ptr += 8;
      cols_left -= 8;
    } while (cols_left != 0);

    src += 4 * src_stride;
    dst += 4 * dst_stride;
    height -= 4;
  } while (height > 0);
}
1187 
// Vertical 12-tap filter for the second pass of the 2D convolution,
// producing four output pixels.
//
// Each accumulator starts at offset and receives three dot products: s0 with
// lane 0 of filter_0_7, s1 with lane 1 of filter_0_7, and s2 with lane 1 of
// filter_4_11. The narrowed sums are shifted by shift (no rounding is applied
// by the shift itself), saturated to unsigned 16-bit and limited to max.
static inline uint16x4_t highbd_convolve12_4_2d_v(
    int16x8_t s0[2], int16x8_t s1[2], int16x8_t s2[2], int16x8_t filter_0_7,
    int16x8_t filter_4_11, int32x4_t shift, int64x2_t offset, uint16x4_t max) {
  int64x2_t acc[2];
  for (int j = 0; j < 2; ++j) {
    acc[j] = aom_svdot_lane_s16(offset, s0[j], filter_0_7, 0);
    acc[j] = aom_svdot_lane_s16(acc[j], s1[j], filter_0_7, 1);
    acc[j] = aom_svdot_lane_s16(acc[j], s2[j], filter_4_11, 1);
  }

  const int32x4_t sum = vcombine_s32(vmovn_s64(acc[0]), vmovn_s64(acc[1]));
  const uint16x4_t out = vqmovun_s32(vshlq_s32(sum, shift));

  return vmin_u16(out, max);
}
1206 
// Vertical 12-tap pass of the 2D single-reference convolution.
//
// The image is processed in strips of four columns, and each strip four rows
// at a time. Accumulators are seeded with y_offset, results are shifted right
// by conv_params->round_1 and limited to (1 << bd) - 1.
static inline void highbd_convolve_2d_sr_vert_12tap_sve2(
    const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride,
    int width, int height, const int16_t *y_filter_ptr,
    ConvolveParams *conv_params, int bd, const int y_offset) {
  const int64x2_t offset = vdupq_n_s64(y_offset);
  const int32x4_t shift = vdupq_n_s32(-conv_params->round_1);

  const int16x8_t taps_0_7 = vld1q_s16(y_filter_ptr);
  const int16x8_t taps_4_11 = vld1q_s16(y_filter_ptr + 4);

  uint16x8x3_t merge_block_tbl = vld1q_u16_x3(kDotProdMergeBlockTbl);

  // Indices that select from the second table vector are scaled by the true
  // vector length so that they never address the 'undefined' part of a vector
  // on systems where SVE vectors are wider than 128 bits.
  const uint64_t lanes = svcnth();
  merge_block_tbl.val[0] = vaddq_u16(
      merge_block_tbl.val[0],
      vreinterpretq_u16_u64(vdupq_n_u64(lanes * 0x0001000000000000ULL)));
  merge_block_tbl.val[1] = vaddq_u16(
      merge_block_tbl.val[1],
      vreinterpretq_u16_u64(vdupq_n_u64(lanes * 0x0001000100000000ULL)));
  merge_block_tbl.val[2] = vaddq_u16(
      merge_block_tbl.val[2],
      vreinterpretq_u16_u64(vdupq_n_u64(lanes * 0x0001000100010000ULL)));

  const uint16x4_t max = vdup_n_u16((1 << bd) - 1);

  do {
    int16_t *in = (int16_t *)src;
    uint16_t *out_ptr = dst;
    int rows_left = height;

    int16x4_t row[11];
    load_s16_4x11(in, src_stride, &row[0], &row[1], &row[2], &row[3], &row[4],
                  &row[5], &row[6], &row[7], &row[8], &row[9], &row[10]);
    in += 11 * src_stride;

    // win[k] is built from rows k..k+3. The helper combines a conventional
    // transpose with the sample permute needed ahead of the dot product.
    int16x8_t win[8][2];
    for (int k = 0; k < 8; ++k) {
      transpose_concat_4x4(row[k], row[k + 1], row[k + 2], row[k + 3], win[k]);
    }

    do {
      int16x4_t n0, n1, n2, n3;
      load_s16_4x4(in, src_stride, &n0, &n1, &n2, &n3);

      // next[3] comes straight from the four new rows; next[0..2] are merged
      // from the last block of the previous iteration and next[3].
      int16x8_t next[4][2];
      transpose_concat_4x4(n0, n1, n2, n3, next[3]);
      aom_tbl2x2_s16(win[7], next[3], merge_block_tbl.val[0], next[0]);
      aom_tbl2x2_s16(win[7], next[3], merge_block_tbl.val[1], next[1]);
      aom_tbl2x2_s16(win[7], next[3], merge_block_tbl.val[2], next[2]);

      uint16x4_t out[4];
      for (int k = 0; k < 4; ++k) {
        out[k] = highbd_convolve12_4_2d_v(win[k], win[k + 4], next[k],
                                          taps_0_7, taps_4_11, shift, offset,
                                          max);
      }

      store_u16_4x4(out_ptr, dst_stride, out[0], out[1], out[2], out[3]);

      // Prepare the blocks for the next iteration by moving everything up
      // four rows, reusing as much as possible.
      for (int k = 0; k < 4; ++k) {
        win[k][0] = win[k + 4][0];
        win[k][1] = win[k + 4][1];
        win[k + 4][0] = next[k][0];
        win[k + 4][1] = next[k][1];
      }

      in += 4 * src_stride;
      out_ptr += 4 * dst_stride;
      rows_left -= 4;
    } while (rows_left != 0);

    // Advance to the next strip of four columns.
    src += 4;
    dst += 4;
    width -= 4;
  } while (width != 0);
}
1309 
// Vertical 8-tap filter for the second pass of the 2D convolution, producing
// four output pixels.
//
// Each accumulator starts at offset and receives samples_lo dotted with
// lane 0 of filter and samples_hi dotted with lane 1. The narrowed sums are
// shifted by shift, saturated to unsigned 16-bit and limited to max.
static inline uint16x4_t highbd_convolve8_4_2d_v(
    int16x8_t samples_lo[2], int16x8_t samples_hi[2], int16x8_t filter,
    int32x4_t shift, int64x2_t offset, uint16x4_t max) {
  int64x2_t acc[2];
  for (int j = 0; j < 2; ++j) {
    acc[j] = aom_svdot_lane_s16(offset, samples_lo[j], filter, 0);
    acc[j] = aom_svdot_lane_s16(acc[j], samples_hi[j], filter, 1);
  }

  const int32x4_t sum = vcombine_s32(vmovn_s64(acc[0]), vmovn_s64(acc[1]));
  const uint16x4_t out = vqmovun_s32(vshlq_s32(sum, shift));
  return vmin_u16(out, max);
}
1325 
// Vertical 8-tap filter for the second pass of the 2D convolution, producing
// eight output pixels.
//
// Each of the four accumulators starts at offset and receives samples_lo
// dotted with lane 0 of filter and samples_hi dotted with lane 1. The
// narrowed sums are shifted by shift, saturated to unsigned 16-bit and
// limited to max.
static inline uint16x8_t highbd_convolve8_8_2d_v(
    int16x8_t samples_lo[4], int16x8_t samples_hi[4], int16x8_t filter,
    int32x4_t shift, int64x2_t offset, uint16x8_t max) {
  int64x2_t acc[4];
  for (int j = 0; j < 4; ++j) {
    acc[j] = aom_svdot_lane_s16(offset, samples_lo[j], filter, 0);
    acc[j] = aom_svdot_lane_s16(acc[j], samples_hi[j], filter, 1);
  }

  const int32x4_t sum_lo = vcombine_s32(vmovn_s64(acc[0]), vmovn_s64(acc[1]));
  const int32x4_t sum_hi = vcombine_s32(vmovn_s64(acc[2]), vmovn_s64(acc[3]));

  const uint16x8_t out = vcombine_u16(vqmovun_s32(vshlq_s32(sum_lo, shift)),
                                      vqmovun_s32(vshlq_s32(sum_hi, shift)));
  return vminq_u16(out, max);
}
1350 
// Vertical pass of the 2D convolution for an 8-tap filter.
//
// Reads 16-bit samples from 'src' (reinterpreted as int16_t), filters them
// vertically with the eight taps at 'filter_y' and writes the result to 'dst',
// clamped to (1 << bd) - 1. 'y_offset' seeds every accumulator and the sums
// are shifted right by conv_params->round_1.
//
// A width of exactly 4 takes the 4-lane path; any other width is processed in
// strips of 8 columns. Rows are consumed four at a time.
// NOTE(review): the loops terminate on "!= 0", so width is presumably a
// multiple of 8 (when not 4) and height a multiple of 4 - confirm at callers.
static void highbd_convolve_2d_sr_vert_8tap_sve2(
    const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst,
    ptrdiff_t dst_stride, int width, int height, const int16_t *filter_y,
    ConvolveParams *conv_params, int bd, const int y_offset) {
  assert(width >= 4 && height >= 4);
  const int16x8_t y_filter = vld1q_s16(filter_y);
  const int32x4_t shift = vdupq_n_s32(-conv_params->round_1);
  const int64x2_t offset = vdupq_n_s64(y_offset);

  // Scale indices by size of the true vector length to avoid reading from an
  // 'undefined' portion of a vector on a system with SVE vectors > 128-bit.
  // The multiplier of each table selects which 16-bit index positions within
  // every 64-bit group receive the correction.
  const uint64_t vl_halfwords = svcnth();
  uint16x8x3_t merge_tbl = vld1q_u16_x3(kDotProdMergeBlockTbl);
  merge_tbl.val[0] = vaddq_u16(
      merge_tbl.val[0],
      vreinterpretq_u16_u64(vdupq_n_u64(vl_halfwords * 0x0001000000000000ULL)));
  merge_tbl.val[1] = vaddq_u16(
      merge_tbl.val[1],
      vreinterpretq_u16_u64(vdupq_n_u64(vl_halfwords * 0x0001000100000000ULL)));
  merge_tbl.val[2] = vaddq_u16(
      merge_tbl.val[2],
      vreinterpretq_u16_u64(vdupq_n_u64(vl_halfwords * 0x0001000100010000ULL)));

  if (width == 4) {
    const uint16x4_t max = vdup_n_u16((1 << bd) - 1);
    int16_t *s = (int16_t *)src;

    // Prime the pipeline with the first seven rows.
    int16x4_t r0, r1, r2, r3, r4, r5, r6;
    load_s16_4x7(s, src_stride, &r0, &r1, &r2, &r3, &r4, &r5, &r6);
    s += 7 * src_stride;

    // This operation combines a conventional transpose and the sample permute
    // required before computing the dot product.
    int16x8_t blk0123[2], blk1234[2], blk2345[2], blk3456[2];
    transpose_concat_4x4(r0, r1, r2, r3, blk0123);
    transpose_concat_4x4(r1, r2, r3, r4, blk1234);
    transpose_concat_4x4(r2, r3, r4, r5, blk2345);
    transpose_concat_4x4(r3, r4, r5, r6, blk3456);

    do {
      int16x4_t r7, r8, r9, rA;
      load_s16_4x4(s, src_stride, &r7, &r8, &r9, &rA);

      // Transpose and shuffle the 4 lines that were loaded.
      int16x8_t blk4567[2], blk5678[2], blk6789[2], blk789A[2];
      transpose_concat_4x4(r7, r8, r9, rA, blk789A);

      // Merge new data into block from previous iteration.
      aom_tbl2x2_s16(blk3456, blk789A, merge_tbl.val[0], blk4567);
      aom_tbl2x2_s16(blk3456, blk789A, merge_tbl.val[1], blk5678);
      aom_tbl2x2_s16(blk3456, blk789A, merge_tbl.val[2], blk6789);

      const uint16x4_t out0 = highbd_convolve8_4_2d_v(
          blk0123, blk4567, y_filter, shift, offset, max);
      const uint16x4_t out1 = highbd_convolve8_4_2d_v(
          blk1234, blk5678, y_filter, shift, offset, max);
      const uint16x4_t out2 = highbd_convolve8_4_2d_v(
          blk2345, blk6789, y_filter, shift, offset, max);
      const uint16x4_t out3 = highbd_convolve8_4_2d_v(
          blk3456, blk789A, y_filter, shift, offset, max);

      store_u16_4x4(dst, dst_stride, out0, out1, out2, out3);

      // Prepare block for next iteration - re-using as much as possible.
      // Shuffle everything up four rows.
      for (int i = 0; i < 2; ++i) {
        blk0123[i] = blk4567[i];
        blk1234[i] = blk5678[i];
        blk2345[i] = blk6789[i];
        blk3456[i] = blk789A[i];
      }

      s += 4 * src_stride;
      dst += 4 * dst_stride;
      height -= 4;
    } while (height != 0);
  } else {
    const uint16x8_t max = vdupq_n_u16((1 << bd) - 1);

    // One pass of this loop handles a strip of eight columns.
    do {
      int rows_left = height;
      int16_t *s = (int16_t *)src;
      uint16_t *d = dst;

      // Prime the pipeline with the first seven rows of the strip.
      int16x8_t r0, r1, r2, r3, r4, r5, r6;
      load_s16_8x7(s, src_stride, &r0, &r1, &r2, &r3, &r4, &r5, &r6);
      s += 7 * src_stride;

      // This operation combines a conventional transpose and the sample permute
      // required before computing the dot product.
      int16x8_t blk0123[4], blk1234[4], blk2345[4], blk3456[4];
      transpose_concat_8x4(r0, r1, r2, r3, blk0123);
      transpose_concat_8x4(r1, r2, r3, r4, blk1234);
      transpose_concat_8x4(r2, r3, r4, r5, blk2345);
      transpose_concat_8x4(r3, r4, r5, r6, blk3456);

      do {
        int16x8_t r7, r8, r9, rA;
        load_s16_8x4(s, src_stride, &r7, &r8, &r9, &rA);

        // Transpose and shuffle the 4 lines that were loaded.
        int16x8_t blk4567[4], blk5678[4], blk6789[4], blk789A[4];
        transpose_concat_8x4(r7, r8, r9, rA, blk789A);

        // Merge new data into block from previous iteration.
        aom_tbl2x4_s16(blk3456, blk789A, merge_tbl.val[0], blk4567);
        aom_tbl2x4_s16(blk3456, blk789A, merge_tbl.val[1], blk5678);
        aom_tbl2x4_s16(blk3456, blk789A, merge_tbl.val[2], blk6789);

        const uint16x8_t out0 = highbd_convolve8_8_2d_v(
            blk0123, blk4567, y_filter, shift, offset, max);
        const uint16x8_t out1 = highbd_convolve8_8_2d_v(
            blk1234, blk5678, y_filter, shift, offset, max);
        const uint16x8_t out2 = highbd_convolve8_8_2d_v(
            blk2345, blk6789, y_filter, shift, offset, max);
        const uint16x8_t out3 = highbd_convolve8_8_2d_v(
            blk3456, blk789A, y_filter, shift, offset, max);

        store_u16_8x4(d, dst_stride, out0, out1, out2, out3);

        // Prepare block for next iteration - re-using as much as possible.
        // Shuffle everything up four rows.
        for (int i = 0; i < 4; ++i) {
          blk0123[i] = blk4567[i];
          blk1234[i] = blk5678[i];
          blk2345[i] = blk6789[i];
          blk3456[i] = blk789A[i];
        }

        s += 4 * src_stride;
        d += 4 * dst_stride;
        rows_left -= 4;
      } while (rows_left != 0);

      src += 8;
      dst += 8;
      width -= 8;
    } while (width != 0);
  }
}
1503 
highbd_convolve4_4_2d_v(int16x8_t samples[2],int16x8_t filter,int32x4_t shift,int64x2_t offset,uint16x4_t max)1504 static inline uint16x4_t highbd_convolve4_4_2d_v(int16x8_t samples[2],
1505                                                  int16x8_t filter,
1506                                                  int32x4_t shift,
1507                                                  int64x2_t offset,
1508                                                  uint16x4_t max) {
1509   int64x2_t sum01 = aom_svdot_lane_s16(offset, samples[0], filter, 0);
1510   int64x2_t sum23 = aom_svdot_lane_s16(offset, samples[1], filter, 0);
1511 
1512   int32x4_t sum0123 = vcombine_s32(vmovn_s64(sum01), vmovn_s64(sum23));
1513   sum0123 = vshlq_s32(sum0123, shift);
1514 
1515   uint16x4_t res = vqmovun_s32(sum0123);
1516   return vmin_u16(res, max);
1517 }
1518 
highbd_convolve4_8_2d_v(int16x8_t samples[4],int16x8_t filter,int32x4_t shift,int64x2_t offset,uint16x8_t max)1519 static inline uint16x8_t highbd_convolve4_8_2d_v(int16x8_t samples[4],
1520                                                  int16x8_t filter,
1521                                                  int32x4_t shift,
1522                                                  int64x2_t offset,
1523                                                  uint16x8_t max) {
1524   int64x2_t sum01 = aom_svdot_lane_s16(offset, samples[0], filter, 0);
1525   int64x2_t sum23 = aom_svdot_lane_s16(offset, samples[1], filter, 0);
1526   int64x2_t sum45 = aom_svdot_lane_s16(offset, samples[2], filter, 0);
1527   int64x2_t sum67 = aom_svdot_lane_s16(offset, samples[3], filter, 0);
1528 
1529   int32x4_t sum0123 = vcombine_s32(vmovn_s64(sum01), vmovn_s64(sum23));
1530   int32x4_t sum4567 = vcombine_s32(vmovn_s64(sum45), vmovn_s64(sum67));
1531 
1532   sum0123 = vshlq_s32(sum0123, shift);
1533   sum4567 = vshlq_s32(sum4567, shift);
1534 
1535   uint16x8_t res = vcombine_u16(vqmovun_s32(sum0123), vqmovun_s32(sum4567));
1536   return vminq_u16(res, max);
1537 }
1538 
// Vertical pass of the 2D convolution for a 4-tap filter.
//
// Reads 16-bit samples from 'src' (reinterpreted as int16_t), filters them
// vertically with the four taps starting at filter_y[2] and writes the result
// to 'dst', clamped to (1 << bd) - 1. 'y_offset' seeds every accumulator and
// the sums are shifted right by conv_params->round_1.
//
// A width of exactly 4 takes the 4-lane path; any other width is processed in
// strips of 8 columns. Rows are consumed four at a time.
// NOTE(review): the loops terminate on "!= 0", so width is presumably a
// multiple of 8 (when not 4) and height a multiple of 4 - confirm at callers.
static void highbd_convolve_2d_sr_vert_4tap_sve2(
    const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst,
    ptrdiff_t dst_stride, int width, int height, const int16_t *filter_y,
    ConvolveParams *conv_params, int bd, const int y_offset) {
  assert(width >= 4 && height >= 4);

  // Taps filter_y[2..5] occupy the low half of the vector; the high half is
  // zero-filled.
  const int16x8_t y_filter =
      vcombine_s16(vld1_s16(filter_y + 2), vdup_n_s16(0));
  const int32x4_t shift = vdupq_n_s32(-conv_params->round_1);
  const int64x2_t offset = vdupq_n_s64(y_offset);

  if (width == 4) {
    const uint16x4_t max = vdup_n_u16((1 << bd) - 1);
    int16_t *s = (int16_t *)(src);

    // Prime the pipeline with the first three rows.
    int16x4_t r0, r1, r2;
    load_s16_4x3(s, src_stride, &r0, &r1, &r2);
    s += 3 * src_stride;

    do {
      int16x4_t r3, r4, r5, r6;
      load_s16_4x4(s, src_stride, &r3, &r4, &r5, &r6);

      // This operation combines a conventional transpose and the sample permute
      // required before computing the dot product.
      int16x8_t blk0123[2], blk1234[2], blk2345[2], blk3456[2];
      transpose_concat_4x4(r0, r1, r2, r3, blk0123);
      transpose_concat_4x4(r1, r2, r3, r4, blk1234);
      transpose_concat_4x4(r2, r3, r4, r5, blk2345);
      transpose_concat_4x4(r3, r4, r5, r6, blk3456);

      const uint16x4_t out0 =
          highbd_convolve4_4_2d_v(blk0123, y_filter, shift, offset, max);
      const uint16x4_t out1 =
          highbd_convolve4_4_2d_v(blk1234, y_filter, shift, offset, max);
      const uint16x4_t out2 =
          highbd_convolve4_4_2d_v(blk2345, y_filter, shift, offset, max);
      const uint16x4_t out3 =
          highbd_convolve4_4_2d_v(blk3456, y_filter, shift, offset, max);

      store_u16_4x4(dst, dst_stride, out0, out1, out2, out3);

      // The last three rows just loaded become the first three rows of the
      // next iteration.
      r0 = r4;
      r1 = r5;
      r2 = r6;

      s += 4 * src_stride;
      dst += 4 * dst_stride;
      height -= 4;
    } while (height != 0);
  } else {
    const uint16x8_t max = vdupq_n_u16((1 << bd) - 1);

    // One pass of this loop handles a strip of eight columns.
    do {
      int rows_left = height;
      int16_t *s = (int16_t *)(src);
      uint16_t *d = dst;

      // Prime the pipeline with the first three rows of the strip.
      int16x8_t r0, r1, r2;
      load_s16_8x3(s, src_stride, &r0, &r1, &r2);
      s += 3 * src_stride;

      do {
        int16x8_t r3, r4, r5, r6;
        load_s16_8x4(s, src_stride, &r3, &r4, &r5, &r6);

        // This operation combines a conventional transpose and the sample
        // permute required before computing the dot product.
        int16x8_t blk0123[4], blk1234[4], blk2345[4], blk3456[4];
        transpose_concat_8x4(r0, r1, r2, r3, blk0123);
        transpose_concat_8x4(r1, r2, r3, r4, blk1234);
        transpose_concat_8x4(r2, r3, r4, r5, blk2345);
        transpose_concat_8x4(r3, r4, r5, r6, blk3456);

        const uint16x8_t out0 =
            highbd_convolve4_8_2d_v(blk0123, y_filter, shift, offset, max);
        const uint16x8_t out1 =
            highbd_convolve4_8_2d_v(blk1234, y_filter, shift, offset, max);
        const uint16x8_t out2 =
            highbd_convolve4_8_2d_v(blk2345, y_filter, shift, offset, max);
        const uint16x8_t out3 =
            highbd_convolve4_8_2d_v(blk3456, y_filter, shift, offset, max);

        store_u16_8x4(d, dst_stride, out0, out1, out2, out3);

        // The last three rows just loaded become the first three rows of the
        // next iteration.
        r0 = r4;
        r1 = r5;
        r2 = r6;

        s += 4 * src_stride;
        d += 4 * dst_stride;
        rows_left -= 4;
      } while (rows_left != 0);

      src += 8;
      dst += 8;
      width -= 8;
    } while (width != 0);
  }
}
1640 
// SVE2 entry point for the high bit-depth 2D single-reference convolution.
//
// Dispatch:
//  - w == 2 or h == 2: forwarded to av1_highbd_convolve_2d_sr_c.
//  - a 6-tap filter in either direction: forwarded to
//    av1_highbd_convolve_2d_sr_neon.
//  - otherwise a horizontal pass writes an intermediate block which a vertical
//    pass then filters into 'dst'. Horizontal filters with more than 8 taps
//    use the 12-tap horizontal and vertical kernels; otherwise each direction
//    independently picks its 4-tap kernel (taps <= 4) or its 8-tap kernel.
void av1_highbd_convolve_2d_sr_sve2(const uint16_t *src, int src_stride,
                                    uint16_t *dst, int dst_stride, int w, int h,
                                    const InterpFilterParams *filter_params_x,
                                    const InterpFilterParams *filter_params_y,
                                    const int subpel_x_qn,
                                    const int subpel_y_qn,
                                    ConvolveParams *conv_params, int bd) {
  if (w == 2 || h == 2) {
    av1_highbd_convolve_2d_sr_c(src, src_stride, dst, dst_stride, w, h,
                                filter_params_x, filter_params_y, subpel_x_qn,
                                subpel_y_qn, conv_params, bd);
    return;
  }

  const int x_filter_taps = get_filter_tap(filter_params_x, subpel_x_qn);
  const int y_filter_taps = get_filter_tap(filter_params_y, subpel_y_qn);

  if (x_filter_taps == 6 || y_filter_taps == 6) {
    av1_highbd_convolve_2d_sr_neon(src, src_stride, dst, dst_stride, w, h,
                                   filter_params_x, filter_params_y,
                                   subpel_x_qn, subpel_y_qn, conv_params, bd);
    return;
  }

  // Intermediate buffer: output of the horizontal pass, input of the vertical
  // pass.
  DECLARE_ALIGNED(16, uint16_t,
                  im_block[(MAX_SB_SIZE + MAX_FILTER_TAP) * MAX_SB_SIZE]);
  const int im_stride = MAX_SB_SIZE;

  // Tap counts below 4 are treated as 4 when deriving the source offsets and
  // the intermediate height.
  const int clamped_x_taps = x_filter_taps >= 4 ? x_filter_taps : 4;
  const int clamped_y_taps = y_filter_taps >= 4 ? y_filter_taps : 4;
  const int horiz_offset = clamped_x_taps / 2 - 1;
  const int vert_offset = clamped_y_taps / 2 - 1;
  const int im_h = h + clamped_y_taps - 1;

  const int x_offset = (1 << (bd + FILTER_BITS - 1));
  const int y_offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
  // Folding the rounding term (1 << (conv_params->round_1 - 1)) into the
  // vertical offset lets the vertical kernels use a plain shift rather than a
  // rounding saturating shift.
  const int y_offset =
      (1 << (conv_params->round_1 - 1)) - (1 << (y_offset_bits - 1));

  // Step back so that the filter window starts at the first required sample.
  const uint16_t *src_ptr = src - vert_offset * src_stride - horiz_offset;

  const int16_t *x_filter_ptr = av1_get_interp_filter_subpel_kernel(
      filter_params_x, subpel_x_qn & SUBPEL_MASK);
  const int16_t *y_filter_ptr = av1_get_interp_filter_subpel_kernel(
      filter_params_y, subpel_y_qn & SUBPEL_MASK);

  if (x_filter_taps > 8) {
    highbd_convolve_2d_sr_horiz_12tap_sve2(src_ptr, src_stride, im_block,
                                           im_stride, w, im_h, x_filter_ptr,
                                           conv_params, x_offset);

    highbd_convolve_2d_sr_vert_12tap_sve2(im_block, im_stride, dst, dst_stride,
                                          w, h, y_filter_ptr, conv_params, bd,
                                          y_offset);
  } else {
    if (x_filter_taps <= 4) {
      highbd_convolve_2d_sr_horiz_4tap_sve2(src_ptr, src_stride, im_block,
                                            im_stride, w, im_h, x_filter_ptr,
                                            conv_params, x_offset);
    } else {
      highbd_convolve_2d_sr_horiz_8tap_sve2(src_ptr, src_stride, im_block,
                                            im_stride, w, im_h, x_filter_ptr,
                                            conv_params, x_offset);
    }

    if (y_filter_taps <= 4) {
      highbd_convolve_2d_sr_vert_4tap_sve2(im_block, im_stride, dst, dst_stride,
                                           w, h, y_filter_ptr, conv_params, bd,
                                           y_offset);
    } else {
      highbd_convolve_2d_sr_vert_8tap_sve2(im_block, im_stride, dst, dst_stride,
                                           w, h, y_filter_ptr, conv_params, bd,
                                           y_offset);
    }
  }
}
1719