/*
 * Copyright (c) 2018, Alliance for Open Media. All rights reserved
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */

#ifndef THIRD_PARTY_SVT_AV1_CONVOLVE_AVX2_H_
#define THIRD_PARTY_SVT_AV1_CONVOLVE_AVX2_H_

#include "EbMemory_AVX2.h"
#include "EbMemory_SSE4_1.h"
#include "synonyms.h"

#include "aom_dsp/aom_filter.h"
#include "aom_dsp/x86/convolve_avx2.h"
#include "aom_dsp/x86/mem_sse2.h"

static inline void populate_coeffs_4tap_avx2(const __m128i coeffs_128,
                                             __m256i coeffs[2]) {
  const __m256i coeffs_256 = _mm256_broadcastsi128_si256(coeffs_128);

  // coeffs 2 3 2 3 2 3 2 3
  coeffs[0] = _mm256_shuffle_epi8(coeffs_256, _mm256_set1_epi16(0x0604u));
  // coeffs 4 5 4 5 4 5 4 5
  coeffs[1] = _mm256_shuffle_epi8(coeffs_256, _mm256_set1_epi16(0x0a08u));
}
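
// Note on the pshufb control words used above and below: each 16-bit lane of
// the control (e.g. 0x0604) names two source byte indices (low byte 0x04,
// high byte 0x06). Those are the low bytes of the 16-bit filter taps 2 and 3,
// so every 16-bit lane of the result packs one pair of 8-bit (halved) taps,
// ready to be multiplied with pixel pairs by _mm256_maddubs_epi16().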

static inline void populate_coeffs_6tap_avx2(const __m128i coeffs_128,
                                             __m256i coeffs[3]) {
  const __m256i coeffs_256 = _mm256_broadcastsi128_si256(coeffs_128);

  // coeffs 1 2 1 2 1 2 1 2
  coeffs[0] = _mm256_shuffle_epi8(coeffs_256, _mm256_set1_epi16(0x0402u));
  // coeffs 3 4 3 4 3 4 3 4
  coeffs[1] = _mm256_shuffle_epi8(coeffs_256, _mm256_set1_epi16(0x0806u));
  // coeffs 5 6 5 6 5 6 5 6
  coeffs[2] = _mm256_shuffle_epi8(coeffs_256, _mm256_set1_epi16(0x0C0Au));
}

static inline void populate_coeffs_8tap_avx2(const __m128i coeffs_128,
                                             __m256i coeffs[4]) {
  const __m256i coeffs_256 = _mm256_broadcastsi128_si256(coeffs_128);

  // coeffs 0 1 0 1 0 1 0 1
  coeffs[0] = _mm256_shuffle_epi8(coeffs_256, _mm256_set1_epi16(0x0200u));
  // coeffs 2 3 2 3 2 3 2 3
  coeffs[1] = _mm256_shuffle_epi8(coeffs_256, _mm256_set1_epi16(0x0604u));
  // coeffs 4 5 4 5 4 5 4 5
  coeffs[2] = _mm256_shuffle_epi8(coeffs_256, _mm256_set1_epi16(0x0a08u));
  // coeffs 6 7 6 7 6 7 6 7
  coeffs[3] = _mm256_shuffle_epi8(coeffs_256, _mm256_set1_epi16(0x0e0cu));
}

static inline void prepare_half_coeffs_2tap_ssse3(
    const InterpFilterParams *const filter_params, const int32_t subpel_q4,
    __m128i *const coeffs /* [1] */) {
  const int16_t *const filter = av1_get_interp_filter_subpel_kernel(
      filter_params, subpel_q4 & SUBPEL_MASK);
  const __m128i coeffs_8 = _mm_cvtsi32_si128(loadu_int32(filter + 3));

  // Right shift all filter coefficients by 1 to reduce the bits required.
  // This extra right shift will be taken care of at the end while rounding
  // the result.
  // Since all filter coefficients are even, this change will not affect the
  // end result.
  assert(_mm_test_all_zeros(_mm_and_si128(coeffs_8, _mm_set1_epi16(1)),
                            _mm_set1_epi16((short)0xffff)));

  const __m128i coeffs_1 = _mm_srai_epi16(coeffs_8, 1);

  // coeffs 3 4 3 4 3 4 3 4
  *coeffs = _mm_shuffle_epi8(coeffs_1, _mm_set1_epi16(0x0200u));
}
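
// The halved 8-bit taps produced by the prepare_half_coeffs_* helpers feed
// the _mm_maddubs_epi16()/_mm256_maddubs_epi16() based convolve_* helpers
// below (unsigned pixels times signed taps). The bit dropped here is
// compensated later by rounding with a shift that is one smaller than usual,
// e.g. sr_y_round_*() shifts by FILTER_BITS - 1 instead of FILTER_BITS.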

static inline void prepare_half_coeffs_4tap_ssse3(
    const InterpFilterParams *const filter_params, const int32_t subpel_q4,
    __m128i *const coeffs /* [2] */) {
  const int16_t *const filter = av1_get_interp_filter_subpel_kernel(
      filter_params, subpel_q4 & SUBPEL_MASK);
  const __m128i coeffs_8 = _mm_loadu_si128((__m128i *)filter);

  // Right shift all filter coefficients by 1 to reduce the bits required.
  // This extra right shift will be taken care of at the end while rounding
  // the result.
  // Since all filter coefficients are even, this change will not affect the
  // end result.
  assert(_mm_test_all_zeros(_mm_and_si128(coeffs_8, _mm_set1_epi16(1)),
                            _mm_set1_epi16((short)0xffff)));

  const __m128i coeffs_1 = _mm_srai_epi16(coeffs_8, 1);

  // coeffs 2 3 2 3 2 3 2 3
  coeffs[0] = _mm_shuffle_epi8(coeffs_1, _mm_set1_epi16(0x0604u));
  // coeffs 4 5 4 5 4 5 4 5
  coeffs[1] = _mm_shuffle_epi8(coeffs_1, _mm_set1_epi16(0x0a08u));
}

static inline void prepare_half_coeffs_6tap_ssse3(
    const InterpFilterParams *const filter_params, const int32_t subpel_q4,
    __m128i *const coeffs /* [3] */) {
  const int16_t *const filter = av1_get_interp_filter_subpel_kernel(
      filter_params, subpel_q4 & SUBPEL_MASK);
  const __m128i coeffs_8 = _mm_loadu_si128((__m128i *)filter);

  // Right shift all filter coefficients by 1 to reduce the bits required.
  // This extra right shift will be taken care of at the end while rounding
  // the result.
  // Since all filter coefficients are even, this change will not affect the
  // end result.
  assert(_mm_test_all_zeros(_mm_and_si128(coeffs_8, _mm_set1_epi16(1)),
                            _mm_set1_epi16((short)0xffff)));

  const __m128i coeffs_1 = _mm_srai_epi16(coeffs_8, 1);

  // coeffs 1 2 1 2 1 2 1 2
  coeffs[0] = _mm_shuffle_epi8(coeffs_1, _mm_set1_epi16(0x0402u));
  // coeffs 3 4 3 4 3 4 3 4
  coeffs[1] = _mm_shuffle_epi8(coeffs_1, _mm_set1_epi16(0x0806u));
  // coeffs 5 6 5 6 5 6 5 6
  coeffs[2] = _mm_shuffle_epi8(coeffs_1, _mm_set1_epi16(0x0C0Au));
}

static inline void prepare_half_coeffs_8tap_ssse3(
    const InterpFilterParams *const filter_params, const int32_t subpel_q4,
    __m128i *const coeffs /* [4] */) {
  const int16_t *const filter = av1_get_interp_filter_subpel_kernel(
      filter_params, subpel_q4 & SUBPEL_MASK);
  const __m128i coeffs_8 = _mm_loadu_si128((__m128i *)filter);

  // Right shift all filter coefficients by 1 to reduce the bits required.
  // This extra right shift will be taken care of at the end while rounding
  // the result.
  // Since all filter coefficients are even, this change will not affect the
  // end result.
  assert(_mm_test_all_zeros(_mm_and_si128(coeffs_8, _mm_set1_epi16(1)),
                            _mm_set1_epi16((short)0xffff)));

  const __m128i coeffs_1 = _mm_srai_epi16(coeffs_8, 1);

  // coeffs 0 1 0 1 0 1 0 1
  coeffs[0] = _mm_shuffle_epi8(coeffs_1, _mm_set1_epi16(0x0200u));
  // coeffs 2 3 2 3 2 3 2 3
  coeffs[1] = _mm_shuffle_epi8(coeffs_1, _mm_set1_epi16(0x0604u));
  // coeffs 4 5 4 5 4 5 4 5
  coeffs[2] = _mm_shuffle_epi8(coeffs_1, _mm_set1_epi16(0x0a08u));
  // coeffs 6 7 6 7 6 7 6 7
  coeffs[3] = _mm_shuffle_epi8(coeffs_1, _mm_set1_epi16(0x0e0cu));
}

static inline void prepare_half_coeffs_2tap_avx2(
    const InterpFilterParams *const filter_params, const int32_t subpel_q4,
    __m256i *const coeffs /* [1] */) {
  const int16_t *const filter = av1_get_interp_filter_subpel_kernel(
      filter_params, subpel_q4 & SUBPEL_MASK);
  const __m128i coeffs_8 = _mm_cvtsi32_si128(loadu_int32(filter + 3));
  const __m256i filter_coeffs = _mm256_broadcastsi128_si256(coeffs_8);

  // Right shift all filter coefficients by 1 to reduce the bits required.
  // This extra right shift will be taken care of at the end while rounding
  // the result.
  // Since all filter coefficients are even, this change will not affect the
  // end result.
  assert(_mm_test_all_zeros(_mm_and_si128(coeffs_8, _mm_set1_epi16(1)),
                            _mm_set1_epi16((short)0xffff)));

  const __m256i coeffs_1 = _mm256_srai_epi16(filter_coeffs, 1);

  // coeffs 3 4 3 4 3 4 3 4
  *coeffs = _mm256_shuffle_epi8(coeffs_1, _mm256_set1_epi16(0x0200u));
}

static inline void prepare_half_coeffs_4tap_avx2(
    const InterpFilterParams *const filter_params, const int32_t subpel_q4,
    __m256i *const coeffs /* [2] */) {
  const int16_t *const filter = av1_get_interp_filter_subpel_kernel(
      filter_params, subpel_q4 & SUBPEL_MASK);
  const __m128i coeffs_8 = _mm_loadu_si128((__m128i *)filter);

  // Right shift all filter coefficients by 1 to reduce the bits required.
  // This extra right shift will be taken care of at the end while rounding
  // the result.
  // Since all filter coefficients are even, this change will not affect the
  // end result.
  assert(_mm_test_all_zeros(_mm_and_si128(coeffs_8, _mm_set1_epi16(1)),
                            _mm_set1_epi16((short)0xffff)));
  const __m128i coeffs_1 = _mm_srai_epi16(coeffs_8, 1);
  populate_coeffs_4tap_avx2(coeffs_1, coeffs);
}

static inline void prepare_half_coeffs_6tap_avx2(
    const InterpFilterParams *const filter_params, const int32_t subpel_q4,
    __m256i *const coeffs /* [3] */) {
  const int16_t *const filter = av1_get_interp_filter_subpel_kernel(
      filter_params, subpel_q4 & SUBPEL_MASK);
  const __m128i coeffs_8 = _mm_loadu_si128((__m128i *)filter);

  // Right shift all filter coefficients by 1 to reduce the bits required.
  // This extra right shift will be taken care of at the end while rounding
  // the result.
  // Since all filter coefficients are even, this change will not affect the
  // end result.
  assert(_mm_test_all_zeros(_mm_and_si128(coeffs_8, _mm_set1_epi16(1)),
                            _mm_set1_epi16((short)0xffff)));
  const __m128i coeffs_1 = _mm_srai_epi16(coeffs_8, 1);
  populate_coeffs_6tap_avx2(coeffs_1, coeffs);
}

static inline void prepare_half_coeffs_8tap_avx2(
    const InterpFilterParams *const filter_params, const int32_t subpel_q4,
    __m256i *const coeffs /* [4] */) {
  const int16_t *const filter = av1_get_interp_filter_subpel_kernel(
      filter_params, subpel_q4 & SUBPEL_MASK);
  const __m128i coeffs_8 = _mm_loadu_si128((__m128i *)filter);

  // Right shift all filter coefficients by 1 to reduce the bits required.
  // This extra right shift will be taken care of at the end while rounding
  // the result.
  // Since all filter coefficients are even, this change will not affect the
  // end result.
  assert(_mm_test_all_zeros(_mm_and_si128(coeffs_8, _mm_set1_epi16(1)),
                            _mm_set1_epi16((short)0xffff)));
  const __m128i coeffs_1 = _mm_srai_epi16(coeffs_8, 1);
  populate_coeffs_8tap_avx2(coeffs_1, coeffs);
}

static inline void prepare_coeffs_2tap_sse2(
    const InterpFilterParams *const filter_params, const int32_t subpel_q4,
    __m128i *const coeffs /* [1] */) {
  const int16_t *filter = av1_get_interp_filter_subpel_kernel(
      filter_params, subpel_q4 & SUBPEL_MASK);

  const __m128i coeff = _mm_cvtsi32_si128(loadu_int32(filter + 3));

  // coeffs 3 4 3 4 3 4 3 4
  coeffs[0] = _mm_shuffle_epi32(coeff, 0x00);
}

static inline void prepare_coeffs_4tap_sse2(
    const InterpFilterParams *const filter_params, const int32_t subpel_q4,
    __m128i *const coeffs /* [2] */) {
  const int16_t *filter = av1_get_interp_filter_subpel_kernel(
      filter_params, subpel_q4 & SUBPEL_MASK);

  const __m128i coeff = _mm_loadu_si128((__m128i *)filter);

  // coeffs 2 3 2 3 2 3 2 3
  coeffs[0] = _mm_shuffle_epi32(coeff, 0x55);
  // coeffs 4 5 4 5 4 5 4 5
  coeffs[1] = _mm_shuffle_epi32(coeff, 0xaa);
}

static inline void prepare_coeffs_6tap_ssse3(
    const InterpFilterParams *const filter_params, const int32_t subpel_q4,
    __m128i *const coeffs /* [3] */) {
  const int16_t *const filter = av1_get_interp_filter_subpel_kernel(
      filter_params, subpel_q4 & SUBPEL_MASK);
  const __m128i coeff = _mm_loadu_si128((__m128i *)filter);

  // coeffs 1 2 1 2 1 2 1 2
  coeffs[0] = _mm_shuffle_epi8(coeff, _mm_set1_epi32(0x05040302u));
  // coeffs 3 4 3 4 3 4 3 4
  coeffs[1] = _mm_shuffle_epi8(coeff, _mm_set1_epi32(0x09080706u));
  // coeffs 5 6 5 6 5 6 5 6
  coeffs[2] = _mm_shuffle_epi8(coeff, _mm_set1_epi32(0x0D0C0B0Au));
}

static inline void prepare_coeffs_8tap_sse2(
    const InterpFilterParams *const filter_params, const int32_t subpel_q4,
    __m128i *const coeffs /* [4] */) {
  const int16_t *filter = av1_get_interp_filter_subpel_kernel(
      filter_params, subpel_q4 & SUBPEL_MASK);

  const __m128i coeff = _mm_loadu_si128((__m128i *)filter);

  // coeffs 0 1 0 1 0 1 0 1
  coeffs[0] = _mm_shuffle_epi32(coeff, 0x00);
  // coeffs 2 3 2 3 2 3 2 3
  coeffs[1] = _mm_shuffle_epi32(coeff, 0x55);
  // coeffs 4 5 4 5 4 5 4 5
  coeffs[2] = _mm_shuffle_epi32(coeff, 0xaa);
  // coeffs 6 7 6 7 6 7 6 7
  coeffs[3] = _mm_shuffle_epi32(coeff, 0xff);
}

static inline void prepare_coeffs_2tap_avx2(
    const InterpFilterParams *const filter_params, const int32_t subpel_q4,
    __m256i *const coeffs /* [1] */) {
  const int16_t *filter = av1_get_interp_filter_subpel_kernel(
      filter_params, subpel_q4 & SUBPEL_MASK);

  const __m128i coeff_8 = _mm_cvtsi32_si128(loadu_int32(filter + 3));
  const __m256i coeff = _mm256_broadcastsi128_si256(coeff_8);

  // coeffs 3 4 3 4 3 4 3 4
  coeffs[0] = _mm256_shuffle_epi32(coeff, 0x00);
}

static inline void prepare_coeffs_4tap_avx2(
    const InterpFilterParams *const filter_params, const int32_t subpel_q4,
    __m256i *const coeffs /* [2] */) {
  const int16_t *filter = av1_get_interp_filter_subpel_kernel(
      filter_params, subpel_q4 & SUBPEL_MASK);

  const __m128i coeff_8 = _mm_loadu_si128((__m128i *)filter);
  const __m256i coeff = _mm256_broadcastsi128_si256(coeff_8);

  // coeffs 2 3 2 3 2 3 2 3
  coeffs[0] = _mm256_shuffle_epi32(coeff, 0x55);
  // coeffs 4 5 4 5 4 5 4 5
  coeffs[1] = _mm256_shuffle_epi32(coeff, 0xaa);
}

static inline void prepare_coeffs_6tap_avx2(
    const InterpFilterParams *const filter_params, const int32_t subpel_q4,
    __m256i *const coeffs /* [3] */) {
  const int16_t *const filter = av1_get_interp_filter_subpel_kernel(
      filter_params, subpel_q4 & SUBPEL_MASK);
  const __m128i coeffs_8 = _mm_loadu_si128((__m128i *)filter);
  const __m256i coeff = _mm256_broadcastsi128_si256(coeffs_8);

  // coeffs 1 2 1 2 1 2 1 2
  coeffs[0] = _mm256_shuffle_epi8(coeff, _mm256_set1_epi32(0x05040302u));
  // coeffs 3 4 3 4 3 4 3 4
  coeffs[1] = _mm256_shuffle_epi8(coeff, _mm256_set1_epi32(0x09080706u));
  // coeffs 5 6 5 6 5 6 5 6
  coeffs[2] = _mm256_shuffle_epi8(coeff, _mm256_set1_epi32(0x0D0C0B0Au));
}

static inline void prepare_coeffs_8tap_avx2(
    const InterpFilterParams *const filter_params, const int32_t subpel_q4,
    __m256i *const coeffs /* [4] */) {
  const int16_t *filter = av1_get_interp_filter_subpel_kernel(
      filter_params, subpel_q4 & SUBPEL_MASK);

  const __m128i coeff_8 = _mm_loadu_si128((__m128i *)filter);
  const __m256i coeff = _mm256_broadcastsi128_si256(coeff_8);

  // coeffs 0 1 0 1 0 1 0 1
  coeffs[0] = _mm256_shuffle_epi32(coeff, 0x00);
  // coeffs 2 3 2 3 2 3 2 3
  coeffs[1] = _mm256_shuffle_epi32(coeff, 0x55);
  // coeffs 4 5 4 5 4 5 4 5
  coeffs[2] = _mm256_shuffle_epi32(coeff, 0xaa);
  // coeffs 6 7 6 7 6 7 6 7
  coeffs[3] = _mm256_shuffle_epi32(coeff, 0xff);
}
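
// Unlike the prepare_half_coeffs_* helpers, the prepare_coeffs_* helpers keep
// the full 16-bit taps, packed as one adjacent tap pair per 32-bit lane. They
// pair with the convolve16_* helpers below, which use madd_epi16 to multiply
// 16-bit samples by 16-bit taps and accumulate into 32-bit lanes.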

static inline void load_16bit_5rows_avx2(const int16_t *const src,
                                         const ptrdiff_t stride,
                                         __m256i dst[5]) {
  dst[0] = _mm256_loadu_si256((__m256i *)(src + 0 * stride));
  dst[1] = _mm256_loadu_si256((__m256i *)(src + 1 * stride));
  dst[2] = _mm256_loadu_si256((__m256i *)(src + 2 * stride));
  dst[3] = _mm256_loadu_si256((__m256i *)(src + 3 * stride));
  dst[4] = _mm256_loadu_si256((__m256i *)(src + 4 * stride));
}

static inline void load_16bit_7rows_avx2(const int16_t *const src,
                                         const ptrdiff_t stride,
                                         __m256i dst[7]) {
  dst[0] = _mm256_loadu_si256((__m256i *)(src + 0 * stride));
  dst[1] = _mm256_loadu_si256((__m256i *)(src + 1 * stride));
  dst[2] = _mm256_loadu_si256((__m256i *)(src + 2 * stride));
  dst[3] = _mm256_loadu_si256((__m256i *)(src + 3 * stride));
  dst[4] = _mm256_loadu_si256((__m256i *)(src + 4 * stride));
  dst[5] = _mm256_loadu_si256((__m256i *)(src + 5 * stride));
  dst[6] = _mm256_loadu_si256((__m256i *)(src + 6 * stride));
}

static AOM_FORCE_INLINE void load_16bit_8rows_avx2(const int16_t *const src,
                                                   const ptrdiff_t stride,
                                                   __m256i dst[8]) {
  dst[0] = _mm256_loadu_si256((__m256i *)(src + 0 * stride));
  dst[1] = _mm256_loadu_si256((__m256i *)(src + 1 * stride));
  dst[2] = _mm256_loadu_si256((__m256i *)(src + 2 * stride));
  dst[3] = _mm256_loadu_si256((__m256i *)(src + 3 * stride));
  dst[4] = _mm256_loadu_si256((__m256i *)(src + 4 * stride));
  dst[5] = _mm256_loadu_si256((__m256i *)(src + 5 * stride));
  dst[6] = _mm256_loadu_si256((__m256i *)(src + 6 * stride));
  dst[7] = _mm256_loadu_si256((__m256i *)(src + 7 * stride));
}

static AOM_FORCE_INLINE void loadu_unpack_16bit_5rows_avx2(
    const int16_t *const src, const ptrdiff_t stride, __m256i s_256[5],
    __m256i ss_256[5], __m256i tt_256[5]) {
  s_256[0] = _mm256_loadu_si256((__m256i *)(src + 0 * stride));
  s_256[1] = _mm256_loadu_si256((__m256i *)(src + 1 * stride));
  s_256[2] = _mm256_loadu_si256((__m256i *)(src + 2 * stride));
  s_256[3] = _mm256_loadu_si256((__m256i *)(src + 3 * stride));
  s_256[4] = _mm256_loadu_si256((__m256i *)(src + 4 * stride));

  ss_256[0] = _mm256_unpacklo_epi16(s_256[0], s_256[1]);
  ss_256[1] = _mm256_unpacklo_epi16(s_256[2], s_256[3]);
  ss_256[3] = _mm256_unpackhi_epi16(s_256[0], s_256[1]);
  ss_256[4] = _mm256_unpackhi_epi16(s_256[2], s_256[3]);

  tt_256[0] = _mm256_unpacklo_epi16(s_256[1], s_256[2]);
  tt_256[1] = _mm256_unpacklo_epi16(s_256[3], s_256[4]);
  tt_256[3] = _mm256_unpackhi_epi16(s_256[1], s_256[2]);
  tt_256[4] = _mm256_unpackhi_epi16(s_256[3], s_256[4]);
}

static AOM_FORCE_INLINE void loadu_unpack_16bit_3rows_avx2(
    const int16_t *const src, const ptrdiff_t stride, __m256i s_256[3],
    __m256i ss_256[3], __m256i tt_256[3]) {
  s_256[0] = _mm256_loadu_si256((__m256i *)(src + 0 * stride));
  s_256[1] = _mm256_loadu_si256((__m256i *)(src + 1 * stride));
  s_256[2] = _mm256_loadu_si256((__m256i *)(src + 2 * stride));

  ss_256[0] = _mm256_unpacklo_epi16(s_256[0], s_256[1]);
  ss_256[2] = _mm256_unpackhi_epi16(s_256[0], s_256[1]);

  tt_256[0] = _mm256_unpacklo_epi16(s_256[1], s_256[2]);
  tt_256[2] = _mm256_unpackhi_epi16(s_256[1], s_256[2]);
}

static inline void convolve_8tap_unpack_avx2(const __m256i s[6],
                                             __m256i ss[7]) {
  ss[0] = _mm256_unpacklo_epi16(s[0], s[1]);
  ss[1] = _mm256_unpacklo_epi16(s[2], s[3]);
  ss[2] = _mm256_unpacklo_epi16(s[4], s[5]);
  ss[4] = _mm256_unpackhi_epi16(s[0], s[1]);
  ss[5] = _mm256_unpackhi_epi16(s[2], s[3]);
  ss[6] = _mm256_unpackhi_epi16(s[4], s[5]);
}
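
// Note the unpack helpers above deliberately skip some output slots (for
// example ss[3] in convolve_8tap_unpack_avx2 and ss_256[2]/tt_256[2] in
// loadu_unpack_16bit_5rows_avx2); callers are expected to fill those with the
// interleave of rows loaded later in the filtering loop.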

static inline __m128i convolve_2tap_ssse3(const __m128i ss[1],
                                          const __m128i coeffs[1]) {
  return _mm_maddubs_epi16(ss[0], coeffs[0]);
}

static inline __m128i convolve_4tap_ssse3(const __m128i ss[2],
                                          const __m128i coeffs[2]) {
  const __m128i res_23 = _mm_maddubs_epi16(ss[0], coeffs[0]);
  const __m128i res_45 = _mm_maddubs_epi16(ss[1], coeffs[1]);
  return _mm_add_epi16(res_23, res_45);
}

static inline __m128i convolve_6tap_ssse3(const __m128i ss[3],
                                          const __m128i coeffs[3]) {
  const __m128i res_12 = _mm_maddubs_epi16(ss[0], coeffs[0]);
  const __m128i res_34 = _mm_maddubs_epi16(ss[1], coeffs[1]);
  const __m128i res_56 = _mm_maddubs_epi16(ss[2], coeffs[2]);
  const __m128i res_1256 = _mm_add_epi16(res_12, res_56);
  return _mm_add_epi16(res_1256, res_34);
}

static inline __m128i convolve_8tap_ssse3(const __m128i ss[4],
                                          const __m128i coeffs[4]) {
  const __m128i res_01 = _mm_maddubs_epi16(ss[0], coeffs[0]);
  const __m128i res_23 = _mm_maddubs_epi16(ss[1], coeffs[1]);
  const __m128i res_45 = _mm_maddubs_epi16(ss[2], coeffs[2]);
  const __m128i res_67 = _mm_maddubs_epi16(ss[3], coeffs[3]);
  const __m128i res_0145 = _mm_add_epi16(res_01, res_45);
  const __m128i res_2367 = _mm_add_epi16(res_23, res_67);
  return _mm_add_epi16(res_0145, res_2367);
}

static inline __m256i convolve_2tap_avx2(const __m256i ss[1],
                                         const __m256i coeffs[1]) {
  return _mm256_maddubs_epi16(ss[0], coeffs[0]);
}

static inline __m256i convolve_4tap_avx2(const __m256i ss[2],
                                         const __m256i coeffs[2]) {
  const __m256i res_23 = _mm256_maddubs_epi16(ss[0], coeffs[0]);
  const __m256i res_45 = _mm256_maddubs_epi16(ss[1], coeffs[1]);
  return _mm256_add_epi16(res_23, res_45);
}

static inline __m256i convolve_6tap_avx2(const __m256i ss[3],
                                         const __m256i coeffs[3]) {
  const __m256i res_01 = _mm256_maddubs_epi16(ss[0], coeffs[0]);
  const __m256i res_23 = _mm256_maddubs_epi16(ss[1], coeffs[1]);
  const __m256i res_45 = _mm256_maddubs_epi16(ss[2], coeffs[2]);
  const __m256i res_0145 = _mm256_add_epi16(res_01, res_45);
  return _mm256_add_epi16(res_0145, res_23);
}

static inline __m256i convolve_8tap_avx2(const __m256i ss[4],
                                         const __m256i coeffs[4]) {
  const __m256i res_01 = _mm256_maddubs_epi16(ss[0], coeffs[0]);
  const __m256i res_23 = _mm256_maddubs_epi16(ss[1], coeffs[1]);
  const __m256i res_45 = _mm256_maddubs_epi16(ss[2], coeffs[2]);
  const __m256i res_67 = _mm256_maddubs_epi16(ss[3], coeffs[3]);
  const __m256i res_0145 = _mm256_add_epi16(res_01, res_45);
  const __m256i res_2367 = _mm256_add_epi16(res_23, res_67);
  return _mm256_add_epi16(res_0145, res_2367);
}
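
// The convolve_*tap_* helpers above consume pixel pairs prepared by the
// pshufb/unpack code: maddubs multiplies each unsigned 8-bit pixel by its
// signed 8-bit (halved) tap and sums adjacent products, so each call handles
// two taps per lane and the partial sums are then combined with 16-bit adds.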

static inline __m128i convolve16_2tap_sse2(const __m128i ss[1],
                                           const __m128i coeffs[1]) {
  return _mm_madd_epi16(ss[0], coeffs[0]);
}

static inline __m128i convolve16_4tap_sse2(const __m128i ss[2],
                                           const __m128i coeffs[2]) {
  const __m128i res_01 = _mm_madd_epi16(ss[0], coeffs[0]);
  const __m128i res_23 = _mm_madd_epi16(ss[1], coeffs[1]);
  return _mm_add_epi32(res_01, res_23);
}

static inline __m128i convolve16_6tap_sse2(const __m128i ss[3],
                                           const __m128i coeffs[3]) {
  const __m128i res_01 = _mm_madd_epi16(ss[0], coeffs[0]);
  const __m128i res_23 = _mm_madd_epi16(ss[1], coeffs[1]);
  const __m128i res_45 = _mm_madd_epi16(ss[2], coeffs[2]);
  const __m128i res_0123 = _mm_add_epi32(res_01, res_23);
  return _mm_add_epi32(res_0123, res_45);
}

static inline __m128i convolve16_8tap_sse2(const __m128i ss[4],
                                           const __m128i coeffs[4]) {
  const __m128i res_01 = _mm_madd_epi16(ss[0], coeffs[0]);
  const __m128i res_23 = _mm_madd_epi16(ss[1], coeffs[1]);
  const __m128i res_45 = _mm_madd_epi16(ss[2], coeffs[2]);
  const __m128i res_67 = _mm_madd_epi16(ss[3], coeffs[3]);
  const __m128i res_0123 = _mm_add_epi32(res_01, res_23);
  const __m128i res_4567 = _mm_add_epi32(res_45, res_67);
  return _mm_add_epi32(res_0123, res_4567);
}

static inline __m256i convolve16_2tap_avx2(const __m256i ss[1],
                                           const __m256i coeffs[1]) {
  return _mm256_madd_epi16(ss[0], coeffs[0]);
}

static inline __m256i convolve16_4tap_avx2(const __m256i ss[2],
                                           const __m256i coeffs[2]) {
  const __m256i res_1 = _mm256_madd_epi16(ss[0], coeffs[0]);
  const __m256i res_2 = _mm256_madd_epi16(ss[1], coeffs[1]);
  return _mm256_add_epi32(res_1, res_2);
}

static inline __m256i convolve16_6tap_avx2(const __m256i ss[3],
                                           const __m256i coeffs[3]) {
  const __m256i res_01 = _mm256_madd_epi16(ss[0], coeffs[0]);
  const __m256i res_23 = _mm256_madd_epi16(ss[1], coeffs[1]);
  const __m256i res_45 = _mm256_madd_epi16(ss[2], coeffs[2]);
  const __m256i res_0123 = _mm256_add_epi32(res_01, res_23);
  return _mm256_add_epi32(res_0123, res_45);
}

static inline __m256i convolve16_8tap_avx2(const __m256i ss[4],
                                           const __m256i coeffs[4]) {
  const __m256i res_01 = _mm256_madd_epi16(ss[0], coeffs[0]);
  const __m256i res_23 = _mm256_madd_epi16(ss[1], coeffs[1]);
  const __m256i res_45 = _mm256_madd_epi16(ss[2], coeffs[2]);
  const __m256i res_67 = _mm256_madd_epi16(ss[3], coeffs[3]);
  const __m256i res_0123 = _mm256_add_epi32(res_01, res_23);
  const __m256i res_4567 = _mm256_add_epi32(res_45, res_67);
  return _mm256_add_epi32(res_0123, res_4567);
}

static inline __m256i x_convolve_4tap_avx2(const __m256i data,
                                           const __m256i coeffs[2],
                                           const __m256i filt[2]) {
  __m256i ss[2];

  ss[0] = _mm256_shuffle_epi8(data, filt[0]);
  ss[1] = _mm256_shuffle_epi8(data, filt[1]);

  return convolve_4tap_avx2(ss, coeffs);
}

static inline __m256i x_convolve_6tap_avx2(const __m256i data,
                                           const __m256i coeffs[3],
                                           const __m256i filt[3]) {
  __m256i ss[3];

  ss[0] = _mm256_shuffle_epi8(data, filt[0]);
  ss[1] = _mm256_shuffle_epi8(data, filt[1]);
  ss[2] = _mm256_shuffle_epi8(data, filt[2]);

  return convolve_6tap_avx2(ss, coeffs);
}

static inline __m256i x_convolve_8tap_avx2(const __m256i data,
                                           const __m256i coeffs[4],
                                           const __m256i filt[4]) {
  __m256i ss[4];

  ss[0] = _mm256_shuffle_epi8(data, filt[0]);
  ss[1] = _mm256_shuffle_epi8(data, filt[1]);
  ss[2] = _mm256_shuffle_epi8(data, filt[2]);
  ss[3] = _mm256_shuffle_epi8(data, filt[3]);

  return convolve_8tap_avx2(ss, coeffs);
}

static inline __m256i sr_y_round_avx2(const __m256i src) {
  const __m256i round = _mm256_set1_epi16(32);
  const __m256i dst = _mm256_add_epi16(src, round);
  return _mm256_srai_epi16(dst, FILTER_BITS - 1);
}
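
// sr_y_round_avx2() adds 32 = 1 << (FILTER_BITS - 2) and then arithmetic
// right shifts by FILTER_BITS - 1 (FILTER_BITS is 7 in AV1), i.e. it rounds
// to nearest while shifting by one bit less than usual, which undoes the
// coefficient halving performed in the prepare_half_coeffs_* helpers.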

static inline __m128i xy_x_round_sse2(const __m128i src) {
  const __m128i round = _mm_set1_epi16(2);
  const __m128i dst = _mm_add_epi16(src, round);
  return _mm_srai_epi16(dst, 2);
}

static inline __m256i xy_x_round_avx2(const __m256i src) {
  const __m256i round = _mm256_set1_epi16(2);
  const __m256i dst = _mm256_add_epi16(src, round);
  return _mm256_srai_epi16(dst, 2);
}

static inline void xy_x_round_store_2x2_sse2(const __m128i res,
                                             int16_t *const dst) {
  const __m128i d = xy_x_round_sse2(res);
  _mm_storel_epi64((__m128i *)dst, d);
}

static inline void xy_x_round_store_4x2_sse2(const __m128i res,
                                             int16_t *const dst) {
  const __m128i d = xy_x_round_sse2(res);
  _mm_storeu_si128((__m128i *)dst, d);
}

static inline void xy_x_round_store_8x2_sse2(const __m128i res[2],
                                             int16_t *const dst) {
  __m128i r[2];

  r[0] = xy_x_round_sse2(res[0]);
  r[1] = xy_x_round_sse2(res[1]);
  _mm_storeu_si128((__m128i *)dst, r[0]);
  _mm_storeu_si128((__m128i *)(dst + 8), r[1]);
}

static inline void xy_x_round_store_8x2_avx2(const __m256i res,
                                             int16_t *const dst) {
  const __m256i d = xy_x_round_avx2(res);
  _mm256_storeu_si256((__m256i *)dst, d);
}

static inline void xy_x_round_store_32_avx2(const __m256i res[2],
                                            int16_t *const dst) {
  __m256i r[2];

  r[0] = xy_x_round_avx2(res[0]);
  r[1] = xy_x_round_avx2(res[1]);
  const __m256i d0 =
      _mm256_inserti128_si256(r[0], _mm256_castsi256_si128(r[1]), 1);
  const __m256i d1 =
      _mm256_inserti128_si256(r[1], _mm256_extracti128_si256(r[0], 1), 0);
  _mm256_storeu_si256((__m256i *)dst, d0);
  _mm256_storeu_si256((__m256i *)(dst + 16), d1);
}
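
// In xy_x_round_store_32_avx2(), the two 128-bit moves regroup the results
// before storing: d0 gathers the low 128-bit halves of r[0] and r[1] and d1
// gathers the high halves, compensating for the fact that the two convolve
// results cover interleaved 8-pixel groups of the 32-pixel row.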

static inline __m128i xy_y_round_sse2(const __m128i src) {
  const __m128i round = _mm_set1_epi32(1024);
  const __m128i dst = _mm_add_epi32(src, round);
  return _mm_srai_epi32(dst, 11);
}

static inline __m128i xy_y_round_half_pel_sse2(const __m128i src) {
  const __m128i round = _mm_set1_epi16(16);
  const __m128i dst = _mm_add_epi16(src, round);
  return _mm_srai_epi16(dst, 5);
}

static inline __m256i xy_y_round_avx2(const __m256i src) {
  const __m256i round = _mm256_set1_epi32(1024);
  const __m256i dst = _mm256_add_epi32(src, round);
  return _mm256_srai_epi32(dst, 11);
}

static inline __m256i xy_y_round_16_avx2(const __m256i r[2]) {
  const __m256i r0 = xy_y_round_avx2(r[0]);
  const __m256i r1 = xy_y_round_avx2(r[1]);
  return _mm256_packs_epi32(r0, r1);
}

static inline __m256i xy_y_round_half_pel_avx2(const __m256i src) {
  const __m256i round = _mm256_set1_epi16(16);
  const __m256i dst = _mm256_add_epi16(src, round);
  return _mm256_srai_epi16(dst, 5);
}

static inline void pack_store_2x2_sse2(const __m128i res, uint8_t *const dst,
                                       const ptrdiff_t stride) {
  const __m128i d = _mm_packus_epi16(res, res);
  *(int16_t *)dst = (int16_t)_mm_cvtsi128_si32(d);
  *(int16_t *)(dst + stride) = (int16_t)_mm_extract_epi16(d, 1);
}

static inline void pack_store_4x2_sse2(const __m128i res, uint8_t *const dst,
                                       const ptrdiff_t stride) {
  const __m128i d = _mm_packus_epi16(res, res);
  store_u8_4x2_sse2(d, dst, stride);
}

static inline void pack_store_4x2_avx2(const __m256i res, uint8_t *const dst,
                                       const ptrdiff_t stride) {
  const __m256i d = _mm256_packus_epi16(res, res);
  const __m128i d0 = _mm256_castsi256_si128(d);
  const __m128i d1 = _mm256_extracti128_si256(d, 1);

  xx_storel_32(dst, d0);
  xx_storel_32(dst + stride, d1);
}

static inline void pack_store_8x2_avx2(const __m256i res, uint8_t *const dst,
                                       const ptrdiff_t stride) {
  const __m256i d = _mm256_packus_epi16(res, res);
  const __m128i d0 = _mm256_castsi256_si128(d);
  const __m128i d1 = _mm256_extracti128_si256(d, 1);
  _mm_storel_epi64((__m128i *)dst, d0);
  _mm_storel_epi64((__m128i *)(dst + stride), d1);
}

static inline void pack_store_16x2_avx2(const __m256i res0, const __m256i res1,
                                        uint8_t *const dst,
                                        const ptrdiff_t stride) {
  const __m256i d = _mm256_packus_epi16(res0, res1);
  storeu_u8_16x2_avx2(d, dst, stride);
}

static inline void xy_y_pack_store_16x2_avx2(const __m256i res0,
                                             const __m256i res1,
                                             uint8_t *const dst,
                                             const ptrdiff_t stride) {
  const __m256i t = _mm256_packus_epi16(res0, res1);
  const __m256i d = _mm256_permute4x64_epi64(t, 0xD8);
  storeu_u8_16x2_avx2(d, dst, stride);
}

static inline void pack_store_32_avx2(const __m256i res0, const __m256i res1,
                                      uint8_t *const dst) {
  const __m256i t = _mm256_packus_epi16(res0, res1);
  const __m256i d = _mm256_permute4x64_epi64(t, 0xD8);
  _mm256_storeu_si256((__m256i *)dst, d);
}
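
// _mm256_packus_epi16() packs within each 128-bit lane, so when the two
// source registers hold consecutive pixel groups the packed bytes come out in
// lane-interleaved order; _mm256_permute4x64_epi64(t, 0xD8) (qword order
// 0, 2, 1, 3) restores a linear byte order before storing.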

static inline void xy_y_round_store_2x2_sse2(const __m128i res,
                                             uint8_t *const dst,
                                             const ptrdiff_t stride) {
  const __m128i r = xy_y_round_sse2(res);
  const __m128i rr = _mm_packs_epi32(r, r);
  pack_store_2x2_sse2(rr, dst, stride);
}

static inline void xy_y_round_store_4x2_avx2(const __m256i res,
                                             uint8_t *const dst,
                                             const ptrdiff_t stride) {
  const __m256i r = xy_y_round_avx2(res);
  const __m256i rr = _mm256_packs_epi32(r, r);
  pack_store_4x2_avx2(rr, dst, stride);
}

static inline void xy_y_pack_store_32_avx2(const __m256i res0,
                                           const __m256i res1,
                                           uint8_t *const dst) {
  const __m256i d = _mm256_packus_epi16(res0, res1);
  // d = _mm256_permute4x64_epi64(d, 0xD8);
  _mm256_storeu_si256((__m256i *)dst, d);
}

static inline void xy_y_round_store_32_avx2(const __m256i r0[2],
                                            const __m256i r1[2],
                                            uint8_t *const dst) {
  const __m256i ra = xy_y_round_16_avx2(r0);
  const __m256i rb = xy_y_round_16_avx2(r1);
  xy_y_pack_store_32_avx2(ra, rb, dst);
}

static inline void convolve_store_32_avx2(const __m256i res0,
                                          const __m256i res1,
                                          uint8_t *const dst) {
  const __m256i d = _mm256_packus_epi16(res0, res1);
  _mm256_storeu_si256((__m256i *)dst, d);
}

static inline __m128i sr_x_round_sse2(const __m128i src) {
  const __m128i round = _mm_set1_epi16(34);
  const __m128i dst = _mm_add_epi16(src, round);
  return _mm_srai_epi16(dst, 6);
}

static inline __m256i sr_x_round_avx2(const __m256i src) {
  const __m256i round = _mm256_set1_epi16(34);
  const __m256i dst = _mm256_add_epi16(src, round);
  return _mm256_srai_epi16(dst, 6);
}

static inline __m128i sr_y_round_sse2(const __m128i src) {
  const __m128i round = _mm_set1_epi16(32);
  const __m128i dst = _mm_add_epi16(src, round);
  return _mm_srai_epi16(dst, FILTER_BITS - 1);
}

static inline void sr_x_round_store_8x2_avx2(const __m256i res,
                                             uint8_t *const dst,
                                             const ptrdiff_t dst_stride) {
  const __m256i r = sr_x_round_avx2(res);
  pack_store_8x2_avx2(r, dst, dst_stride);
}

static inline void sr_x_round_store_16x2_avx2(const __m256i res[2],
                                              uint8_t *const dst,
                                              const ptrdiff_t dst_stride) {
  __m256i r[2];

  r[0] = sr_x_round_avx2(res[0]);
  r[1] = sr_x_round_avx2(res[1]);
  pack_store_16x2_avx2(r[0], r[1], dst, dst_stride);
}

static inline void sr_x_round_store_32_avx2(const __m256i res[2],
                                            uint8_t *const dst) {
  __m256i r[2];

  r[0] = sr_x_round_avx2(res[0]);
  r[1] = sr_x_round_avx2(res[1]);
  convolve_store_32_avx2(r[0], r[1], dst);
}

static inline void sr_y_round_store_8x2_avx2(const __m256i res,
                                             uint8_t *const dst,
                                             const ptrdiff_t dst_stride) {
  const __m256i r = sr_y_round_avx2(res);
  pack_store_8x2_avx2(r, dst, dst_stride);
}

static inline void sr_y_round_store_16x2_avx2(const __m256i res[2],
                                              uint8_t *const dst,
                                              const ptrdiff_t dst_stride) {
  __m256i r[2];

  r[0] = sr_y_round_avx2(res[0]);
  r[1] = sr_y_round_avx2(res[1]);
  pack_store_16x2_avx2(r[0], r[1], dst, dst_stride);
}

static inline void sr_y_2tap_32_avg_avx2(const uint8_t *const src,
                                         const __m256i s0, __m256i *const s1,
                                         uint8_t *const dst) {
  *s1 = _mm256_loadu_si256((__m256i *)src);
  const __m256i d = _mm256_avg_epu8(s0, *s1);
  _mm256_storeu_si256((__m256i *)dst, d);
}

static inline void sr_x_2tap_32_avg_avx2(const uint8_t *const src,
                                         uint8_t *const dst) {
  const __m256i s0 = _mm256_loadu_si256((__m256i *)src);
  const __m256i s1 = _mm256_loadu_si256((__m256i *)(src + 1));
  const __m256i d = _mm256_avg_epu8(s0, s1);
  _mm256_storeu_si256((__m256i *)dst, d);
}

static inline __m128i x_convolve_2tap_2x2_sse4_1(const uint8_t *const src,
                                                 const ptrdiff_t stride,
                                                 const __m128i coeffs[1]) {
  const __m128i sfl =
      _mm_setr_epi8(0, 1, 1, 2, 4, 5, 5, 6, 0, 0, 0, 0, 0, 0, 0, 0);
  const __m128i s_128 = load_u8_4x2_sse4_1(src, stride);
  const __m128i ss = _mm_shuffle_epi8(s_128, sfl);
  return convolve_2tap_ssse3(&ss, coeffs);
}
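
// In x_convolve_2tap_2x2_sse4_1() the shuffle pairs each pixel with its right
// neighbour: bytes (0,1) and (1,2) are the two output positions of the first
// row, and bytes (4,5) and (5,6) the two positions of the second row (the two
// rows were packed 4 bytes apart by load_u8_4x2_sse4_1).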

static inline __m128i x_convolve_2tap_4x2_ssse3(const uint8_t *const src,
                                                const ptrdiff_t stride,
                                                const __m128i coeffs[1]) {
  const __m128i sfl =
      _mm_setr_epi8(0, 1, 1, 2, 2, 3, 3, 4, 8, 9, 9, 10, 10, 11, 11, 12);
  const __m128i s_128 = load_u8_8x2_sse2(src, stride);
  const __m128i ss = _mm_shuffle_epi8(s_128, sfl);
  return convolve_2tap_ssse3(&ss, coeffs);
}

static inline void x_convolve_2tap_8x2_ssse3(const uint8_t *const src,
                                             const ptrdiff_t stride,
                                             const __m128i coeffs[1],
                                             __m128i r[2]) {
  __m128i ss[2];
  const __m128i s00 = _mm_loadu_si128((__m128i *)src);
  const __m128i s10 = _mm_loadu_si128((__m128i *)(src + stride));
  const __m128i s01 = _mm_srli_si128(s00, 1);
  const __m128i s11 = _mm_srli_si128(s10, 1);
  ss[0] = _mm_unpacklo_epi8(s00, s01);
  ss[1] = _mm_unpacklo_epi8(s10, s11);

  r[0] = convolve_2tap_ssse3(&ss[0], coeffs);
  r[1] = convolve_2tap_ssse3(&ss[1], coeffs);
}

static inline __m256i x_convolve_2tap_8x2_avx2(const uint8_t *const src,
                                               const ptrdiff_t stride,
                                               const __m256i coeffs[1]) {
  __m128i s_128[2][2];
  __m256i s_256[2];

  s_128[0][0] = _mm_loadu_si128((__m128i *)src);
  s_128[1][0] = _mm_loadu_si128((__m128i *)(src + stride));
  s_128[0][1] = _mm_srli_si128(s_128[0][0], 1);
  s_128[1][1] = _mm_srli_si128(s_128[1][0], 1);
  s_256[0] = _mm256_setr_m128i(s_128[0][0], s_128[1][0]);
  s_256[1] = _mm256_setr_m128i(s_128[0][1], s_128[1][1]);
  const __m256i ss = _mm256_unpacklo_epi8(s_256[0], s_256[1]);
  return convolve_2tap_avx2(&ss, coeffs);
}

static inline void x_convolve_2tap_16x2_avx2(const uint8_t *const src,
                                             const ptrdiff_t stride,
                                             const __m256i coeffs[1],
                                             __m256i r[2]) {
  const __m256i s0_256 = loadu_8bit_16x2_avx2(src, stride);
  const __m256i s1_256 = loadu_8bit_16x2_avx2(src + 1, stride);
  const __m256i s0 = _mm256_unpacklo_epi8(s0_256, s1_256);
  const __m256i s1 = _mm256_unpackhi_epi8(s0_256, s1_256);
  r[0] = convolve_2tap_avx2(&s0, coeffs);
  r[1] = convolve_2tap_avx2(&s1, coeffs);
}

static inline void x_convolve_2tap_32_avx2(const uint8_t *const src,
                                           const __m256i coeffs[1],
                                           __m256i r[2]) {
  const __m256i s0 = _mm256_loadu_si256((__m256i *)src);
  const __m256i s1 = _mm256_loadu_si256((__m256i *)(src + 1));
  const __m256i ss0 = _mm256_unpacklo_epi8(s0, s1);
  const __m256i ss1 = _mm256_unpackhi_epi8(s0, s1);

  r[0] = convolve_2tap_avx2(&ss0, coeffs);
  r[1] = convolve_2tap_avx2(&ss1, coeffs);
}

static inline __m128i x_convolve_4tap_2x2_ssse3(const uint8_t *const src,
                                                const ptrdiff_t stride,
                                                const __m128i coeffs[2]) {
  const __m128i sfl0 =
      _mm_setr_epi8(0, 1, 1, 2, 8, 9, 9, 10, 0, 0, 0, 0, 0, 0, 0, 0);
  const __m128i sfl1 =
      _mm_setr_epi8(2, 3, 3, 4, 10, 11, 11, 12, 0, 0, 0, 0, 0, 0, 0, 0);
  const __m128i s = load_u8_8x2_sse2(src, stride);
  __m128i ss[2];

  ss[0] = _mm_shuffle_epi8(s, sfl0);
  ss[1] = _mm_shuffle_epi8(s, sfl1);
  return convolve_4tap_ssse3(ss, coeffs);
}

static inline __m128i x_convolve_4tap_4x2_ssse3(const uint8_t *const src,
                                                const ptrdiff_t stride,
                                                const __m128i coeffs[2]) {
  const __m128i s = load_u8_8x2_sse2(src, stride);
  const __m128i sfl0 =
      _mm_setr_epi8(0, 1, 1, 2, 2, 3, 3, 4, 8, 9, 9, 10, 10, 11, 11, 12);
  const __m128i sfl1 =
      _mm_setr_epi8(2, 3, 3, 4, 4, 5, 5, 6, 10, 11, 11, 12, 12, 13, 13, 14);
  __m128i ss[2];

  ss[0] = _mm_shuffle_epi8(s, sfl0);
  ss[1] = _mm_shuffle_epi8(s, sfl1);
  return convolve_4tap_ssse3(ss, coeffs);
}

static inline __m256i x_convolve_4tap_8x2_avx2(const uint8_t *const src,
                                               const ptrdiff_t stride,
                                               const __m256i coeffs[2],
                                               const __m256i filt[2]) {
  const __m256i s_256 = loadu_8bit_16x2_avx2(src, stride);
  return x_convolve_4tap_avx2(s_256, coeffs, filt);
}

static inline void x_convolve_4tap_16x2_avx2(const uint8_t *const src,
                                             const int32_t src_stride,
                                             const __m256i coeffs[2],
                                             const __m256i filt[2],
                                             __m256i r[2]) {
  r[0] = x_convolve_4tap_8x2_avx2(src + 0, src_stride, coeffs, filt);
  r[1] = x_convolve_4tap_8x2_avx2(src + 8, src_stride, coeffs, filt);
}

static inline void x_convolve_4tap_32_avx2(const uint8_t *const src,
                                           const __m256i coeffs[2],
                                           const __m256i filt[2],
                                           __m256i r[2]) {
  const __m256i s0_256 = _mm256_loadu_si256((__m256i *)src);
  const __m256i s1_256 = _mm256_loadu_si256((__m256i *)(src + 8));

  r[0] = x_convolve_4tap_avx2(s0_256, coeffs, filt);
  r[1] = x_convolve_4tap_avx2(s1_256, coeffs, filt);
}

static inline __m128i x_convolve_6tap_2x2_ssse3(const uint8_t *const src,
                                                const ptrdiff_t stride,
                                                const __m128i coeffs[3]) {
  const __m128i sfl0 =
      _mm_setr_epi8(0, 1, 1, 2, 8, 9, 9, 10, 0, 0, 0, 0, 0, 0, 0, 0);
  const __m128i sfl1 =
      _mm_setr_epi8(2, 3, 3, 4, 10, 11, 11, 12, 0, 0, 0, 0, 0, 0, 0, 0);
  const __m128i sfl2 =
      _mm_setr_epi8(4, 5, 5, 6, 12, 13, 13, 14, 0, 0, 0, 0, 0, 0, 0, 0);

  const __m128i s = load_u8_8x2_sse2(src, stride);
  __m128i ss[3];

  ss[0] = _mm_shuffle_epi8(s, sfl0);
  ss[1] = _mm_shuffle_epi8(s, sfl1);
  ss[2] = _mm_shuffle_epi8(s, sfl2);
  return convolve_6tap_ssse3(ss, coeffs);
}

static inline __m128i x_convolve_6tap_4x2_ssse3(const uint8_t *const src,
                                                const ptrdiff_t stride,
                                                const __m128i coeffs[3]) {
  const __m128i s = load_u8_8x2_sse2(src, stride);
  const __m128i sfl0 =
      _mm_setr_epi8(0, 1, 1, 2, 8, 9, 9, 10, 0, 0, 0, 0, 0, 0, 0, 0);
  const __m128i sfl1 =
      _mm_setr_epi8(2, 3, 3, 4, 10, 11, 11, 12, 0, 0, 0, 0, 0, 0, 0, 0);
  const __m128i sfl2 =
      _mm_setr_epi8(4, 5, 5, 6, 12, 13, 13, 14, 0, 0, 0, 0, 0, 0, 0, 0);
  __m128i ss[3];

  ss[0] = _mm_shuffle_epi8(s, sfl0);
  ss[1] = _mm_shuffle_epi8(s, sfl1);
  ss[2] = _mm_shuffle_epi8(s, sfl2);
  return convolve_6tap_ssse3(ss, coeffs);
}

static inline __m256i x_convolve_6tap_8x2_avx2(const uint8_t *const src,
                                               const ptrdiff_t stride,
                                               const __m256i coeffs[3],
                                               const __m256i filt[3]) {
  const __m256i s_256 = loadu_8bit_16x2_avx2(src, stride);
  return x_convolve_6tap_avx2(s_256, coeffs, filt);
}

static inline void x_convolve_6tap_16x2_avx2(const uint8_t *const src,
                                             const int32_t src_stride,
                                             const __m256i coeffs[3],
                                             const __m256i filt[3],
                                             __m256i r[2]) {
  r[0] = x_convolve_6tap_8x2_avx2(src + 0, src_stride, coeffs, filt);
  r[1] = x_convolve_6tap_8x2_avx2(src + 8, src_stride, coeffs, filt);
}

static inline void x_convolve_6tap_32_avx2(const uint8_t *const src,
                                           const __m256i coeffs[3],
                                           const __m256i filt[3],
                                           __m256i r[2]) {
  const __m256i s0_256 = _mm256_loadu_si256((__m256i *)src);
  const __m256i s1_256 = _mm256_loadu_si256((__m256i *)(src + 8));

  r[0] = x_convolve_6tap_avx2(s0_256, coeffs, filt);
  r[1] = x_convolve_6tap_avx2(s1_256, coeffs, filt);
}

x_convolve_8tap_8x2_avx2(const uint8_t * const src,const ptrdiff_t stride,const __m256i coeffs[4],const __m256i filt[4])1056 static inline __m256i x_convolve_8tap_8x2_avx2(const uint8_t *const src,
1057                                                const ptrdiff_t stride,
1058                                                const __m256i coeffs[4],
1059                                                const __m256i filt[4]) {
1060   const __m256i s_256 = loadu_8bit_16x2_avx2(src, stride);
1061   return x_convolve_8tap_avx2(s_256, coeffs, filt);
1062 }
1063 
1064 static AOM_FORCE_INLINE void x_convolve_8tap_16x2_avx2(const uint8_t *const src,
1065                                                        const int32_t src_stride,
1066                                                        const __m256i coeffs[4],
1067                                                        const __m256i filt[4],
1068                                                        __m256i r[2]) {
1069   r[0] = x_convolve_8tap_8x2_avx2(src + 0, src_stride, coeffs, filt);
1070   r[1] = x_convolve_8tap_8x2_avx2(src + 8, src_stride, coeffs, filt);
1071 }
1072 
1073 static AOM_FORCE_INLINE void x_convolve_8tap_32_avx2(const uint8_t *const src,
1074                                                      const __m256i coeffs[4],
1075                                                      const __m256i filt[4],
1076                                                      __m256i r[2]) {
1077   const __m256i s0_256 = _mm256_loadu_si256((__m256i *)src);
1078   const __m256i s1_256 = _mm256_loadu_si256((__m256i *)(src + 8));
1079 
1080   r[0] = x_convolve_8tap_avx2(s0_256, coeffs, filt);
1081   r[1] = x_convolve_8tap_avx2(s1_256, coeffs, filt);
1082 }
1083 
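// Vertical (y) filtering helpers. Each call produces two output rows and
// keeps a small rolling cache of previously loaded rows (s_16/s_32/s_64/
// s_128) so every source row is loaded from memory only once: the two newest
// rows are loaded here, interleaved with the cached ones, and written back
// into the cache for the next iteration.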
1084 static inline __m128i y_convolve_2tap_2x2_ssse3(const uint8_t *const src,
1085                                                 const ptrdiff_t stride,
1086                                                 const __m128i coeffs[1],
1087                                                 __m128i s_16[2]) {
1088   __m128i s_128[2];
1089 
1090   s_16[1] = _mm_cvtsi32_si128(*(int16_t *)(src + stride));
1091   s_128[0] = _mm_unpacklo_epi16(s_16[0], s_16[1]);
1092   s_16[0] = _mm_cvtsi32_si128(*(int16_t *)(src + 2 * stride));
1093   s_128[1] = _mm_unpacklo_epi16(s_16[1], s_16[0]);
1094   const __m128i ss = _mm_unpacklo_epi8(s_128[0], s_128[1]);
1095   return convolve_2tap_ssse3(&ss, coeffs);
1096 }
1097 
1098 static inline __m128i y_convolve_2tap_4x2_ssse3(const uint8_t *const src,
1099                                                 const ptrdiff_t stride,
1100                                                 const __m128i coeffs[1],
1101                                                 __m128i s_32[2]) {
1102   __m128i s_128[2];
1103 
1104   s_32[1] = _mm_cvtsi32_si128(loadu_int32(src + stride));
1105   s_128[0] = _mm_unpacklo_epi32(s_32[0], s_32[1]);
1106   s_32[0] = _mm_cvtsi32_si128(loadu_int32(src + 2 * stride));
1107   s_128[1] = _mm_unpacklo_epi32(s_32[1], s_32[0]);
1108   const __m128i ss = _mm_unpacklo_epi8(s_128[0], s_128[1]);
1109   return convolve_2tap_ssse3(&ss, coeffs);
1110 }
1111 
1112 static inline __m256i y_convolve_2tap_8x2_avx2(const uint8_t *const src,
1113                                                const ptrdiff_t stride,
1114                                                const __m256i coeffs[1],
1115                                                __m128i s_64[2]) {
1116   __m256i s_256[2];
1117 
1118   s_64[1] = _mm_loadl_epi64((__m128i *)(src + stride));
1119   s_256[0] = _mm256_setr_m128i(s_64[0], s_64[1]);
1120   s_64[0] = _mm_loadl_epi64((__m128i *)(src + 2 * stride));
1121   s_256[1] = _mm256_setr_m128i(s_64[1], s_64[0]);
1122   const __m256i ss = _mm256_unpacklo_epi8(s_256[0], s_256[1]);
1123   return convolve_2tap_avx2(&ss, coeffs);
1124 }
1125 
1126 static inline void y_convolve_2tap_16x2_avx2(const uint8_t *const src,
1127                                              const ptrdiff_t stride,
1128                                              const __m256i coeffs[1],
1129                                              __m128i s_128[2], __m256i r[2]) {
1130   __m256i s_256[2];
1131 
1132   s_128[1] = _mm_loadu_si128((__m128i *)(src + stride));
1133   s_256[0] = _mm256_setr_m128i(s_128[0], s_128[1]);
1134   s_128[0] = _mm_loadu_si128((__m128i *)(src + 2 * stride));
1135   s_256[1] = _mm256_setr_m128i(s_128[1], s_128[0]);
1136   const __m256i ss0 = _mm256_unpacklo_epi8(s_256[0], s_256[1]);
1137   const __m256i ss1 = _mm256_unpackhi_epi8(s_256[0], s_256[1]);
1138   r[0] = convolve_2tap_avx2(&ss0, coeffs);
1139   r[1] = convolve_2tap_avx2(&ss1, coeffs);
1140 }
1141 
1142 static inline void y_convolve_2tap_32_avx2(const uint8_t *const src,
1143                                            const __m256i coeffs[1],
1144                                            const __m256i s0, __m256i *const s1,
1145                                            __m256i r[2]) {
1146   *s1 = _mm256_loadu_si256((__m256i *)src);
1147   const __m256i ss0 = _mm256_unpacklo_epi8(s0, *s1);
1148   const __m256i ss1 = _mm256_unpackhi_epi8(s0, *s1);
1149   r[0] = convolve_2tap_avx2(&ss0, coeffs);
1150   r[1] = convolve_2tap_avx2(&ss1, coeffs);
1151 }
1152 
1153 static inline __m128i y_convolve_4tap_2x2_ssse3(const uint8_t *const src,
1154                                                 const ptrdiff_t stride,
1155                                                 const __m128i coeffs[2],
1156                                                 __m128i s_16[4],
1157                                                 __m128i ss_128[2]) {
1158   s_16[3] = _mm_cvtsi32_si128(loadu_int16(src + stride));
1159   const __m128i src23 = _mm_unpacklo_epi16(s_16[2], s_16[3]);
1160   s_16[2] = _mm_cvtsi32_si128(loadu_int16(src + 2 * stride));
1161   const __m128i src34 = _mm_unpacklo_epi16(s_16[3], s_16[2]);
1162   ss_128[1] = _mm_unpacklo_epi8(src23, src34);
1163   return convolve_4tap_ssse3(ss_128, coeffs);
1164 }
1165 
1166 static inline __m128i y_convolve_4tap_4x2_ssse3(const uint8_t *const src,
1167                                                 const ptrdiff_t stride,
1168                                                 const __m128i coeffs[2],
1169                                                 __m128i s_32[4],
1170                                                 __m128i ss_128[2]) {
1171   s_32[3] = _mm_cvtsi32_si128(loadu_int32(src + stride));
1172   const __m128i src23 = _mm_unpacklo_epi32(s_32[2], s_32[3]);
1173   s_32[2] = _mm_cvtsi32_si128(loadu_int32(src + 2 * stride));
1174   const __m128i src34 = _mm_unpacklo_epi32(s_32[3], s_32[2]);
1175   ss_128[1] = _mm_unpacklo_epi8(src23, src34);
1176   return convolve_4tap_ssse3(ss_128, coeffs);
1177 }
1178 
1179 static inline __m256i y_convolve_4tap_8x2_avx2(const uint8_t *const src,
1180                                                const ptrdiff_t stride,
1181                                                const __m256i coeffs[2],
1182                                                __m128i s_64[4],
1183                                                __m256i ss_256[2]) {
1184   s_64[3] = _mm_loadl_epi64((__m128i *)(src + stride));
1185   const __m256i src23 = _mm256_setr_m128i(s_64[2], s_64[3]);
1186   s_64[2] = _mm_loadl_epi64((__m128i *)(src + 2 * stride));
1187   const __m256i src34 = _mm256_setr_m128i(s_64[3], s_64[2]);
1188   ss_256[1] = _mm256_unpacklo_epi8(src23, src34);
1189   return convolve_4tap_avx2(ss_256, coeffs);
1190 }
1191 
1192 static inline void y_convolve_4tap_16x2_avx2(const uint8_t *const src,
1193                                              const ptrdiff_t stride,
1194                                              const __m256i coeffs[2],
1195                                              __m128i s_128[4],
1196                                              __m256i ss_256[4], __m256i r[2]) {
1197   s_128[3] = _mm_loadu_si128((__m128i *)(src + stride));
1198   const __m256i src23 = _mm256_setr_m128i(s_128[2], s_128[3]);
1199   s_128[2] = _mm_loadu_si128((__m128i *)(src + 2 * stride));
1200   const __m256i src34 = _mm256_setr_m128i(s_128[3], s_128[2]);
1201   ss_256[1] = _mm256_unpacklo_epi8(src23, src34);
1202   ss_256[3] = _mm256_unpackhi_epi8(src23, src34);
1203   r[0] = convolve_4tap_avx2(ss_256, coeffs);
1204   r[1] = convolve_4tap_avx2(ss_256 + 2, coeffs);
1205 }
1206 
1207 static inline __m128i y_convolve_6tap_2x2_ssse3(const uint8_t *const src,
1208                                                 const ptrdiff_t stride,
1209                                                 const __m128i coeffs[3],
1210                                                 __m128i s_16[6],
1211                                                 __m128i ss_128[3]) {
1212   s_16[5] = _mm_cvtsi32_si128(loadu_int16(src + 3 * stride));
1213   const __m128i src45 = _mm_unpacklo_epi16(s_16[4], s_16[5]);
1214   s_16[4] = _mm_cvtsi32_si128(loadu_int16(src + 4 * stride));
1215   const __m128i src56 = _mm_unpacklo_epi16(s_16[5], s_16[4]);
1216   ss_128[2] = _mm_unpacklo_epi8(src45, src56);
1217   return convolve_6tap_ssse3(ss_128, coeffs);
1218 }
1219 
1220 static inline void y_convolve_4tap_32x2_avx2(
1221     const uint8_t *const src, const ptrdiff_t stride, const __m256i coeffs[2],
1222     __m256i s_256[4], __m256i ss_256[4], __m256i tt_256[4], __m256i r[4]) {
1223   s_256[3] = _mm256_loadu_si256((__m256i *)(src + 1 * stride));
1224   ss_256[1] = _mm256_unpacklo_epi8(s_256[2], s_256[3]);
1225   ss_256[3] = _mm256_unpackhi_epi8(s_256[2], s_256[3]);
1226   s_256[2] = _mm256_loadu_si256((__m256i *)(src + 2 * stride));
1227   tt_256[1] = _mm256_unpacklo_epi8(s_256[3], s_256[2]);
1228   tt_256[3] = _mm256_unpackhi_epi8(s_256[3], s_256[2]);
1229   r[0] = convolve_4tap_avx2(ss_256 + 0, coeffs);
1230   r[1] = convolve_4tap_avx2(ss_256 + 2, coeffs);
1231   r[2] = convolve_4tap_avx2(tt_256 + 0, coeffs);
1232   r[3] = convolve_4tap_avx2(tt_256 + 2, coeffs);
1233 }
1234 
1235 static inline __m128i y_convolve_6tap_4x2_ssse3(const uint8_t *const src,
1236                                                 const ptrdiff_t stride,
1237                                                 const __m128i coeffs[3],
1238                                                 __m128i s_32[6],
1239                                                 __m128i ss_128[3]) {
1240   s_32[5] = _mm_cvtsi32_si128(loadu_int32(src + 3 * stride));
1241   const __m128i src45 = _mm_unpacklo_epi32(s_32[4], s_32[5]);
1242   s_32[4] = _mm_cvtsi32_si128(loadu_int32(src + 4 * stride));
1243   const __m128i src56 = _mm_unpacklo_epi32(s_32[5], s_32[4]);
1244   ss_128[2] = _mm_unpacklo_epi8(src45, src56);
1245   return convolve_6tap_ssse3(ss_128, coeffs);
1246 }
1247 
1248 static inline __m256i y_convolve_6tap_8x2_avx2(const uint8_t *const src,
1249                                                const ptrdiff_t stride,
1250                                                const __m256i coeffs[3],
1251                                                __m128i s_64[6],
1252                                                __m256i ss_256[3]) {
1253   s_64[5] = _mm_loadl_epi64((__m128i *)(src + 3 * stride));
1254   const __m256i src45 = _mm256_setr_m128i(s_64[4], s_64[5]);
1255   s_64[4] = _mm_loadl_epi64((__m128i *)(src + 4 * stride));
1256   const __m256i src56 = _mm256_setr_m128i(s_64[5], s_64[4]);
1257   ss_256[2] = _mm256_unpacklo_epi8(src45, src56);
1258   return convolve_6tap_avx2(ss_256, coeffs);
1259 }
1260 
1261 static inline void y_convolve_6tap_16x2_avx2(const uint8_t *const src,
1262                                              const ptrdiff_t stride,
1263                                              const __m256i coeffs[3],
1264                                              __m128i s_128[6],
1265                                              __m256i ss_256[6], __m256i r[2]) {
1266   s_128[5] = _mm_loadu_si128((__m128i *)(src + 3 * stride));
1267   const __m256i src45 = _mm256_setr_m128i(s_128[4], s_128[5]);
1268   s_128[4] = _mm_loadu_si128((__m128i *)(src + 4 * stride));
1269   const __m256i src56 = _mm256_setr_m128i(s_128[5], s_128[4]);
1270   ss_256[2] = _mm256_unpacklo_epi8(src45, src56);
1271   ss_256[5] = _mm256_unpackhi_epi8(src45, src56);
1272   r[0] = convolve_6tap_avx2(ss_256, coeffs);
1273   r[1] = convolve_6tap_avx2(ss_256 + 3, coeffs);
1274 }
1275 
1276 static inline void y_convolve_6tap_32x2_avx2(
1277     const uint8_t *const src, const ptrdiff_t stride, const __m256i coeffs[3],
1278     __m256i s_256[6], __m256i ss_256[6], __m256i tt_256[6], __m256i r[4]) {
1279   s_256[5] = _mm256_loadu_si256((__m256i *)(src + 3 * stride));
1280   ss_256[2] = _mm256_unpacklo_epi8(s_256[4], s_256[5]);
1281   ss_256[5] = _mm256_unpackhi_epi8(s_256[4], s_256[5]);
1282   s_256[4] = _mm256_loadu_si256((__m256i *)(src + 4 * stride));
1283   tt_256[2] = _mm256_unpacklo_epi8(s_256[5], s_256[4]);
1284   tt_256[5] = _mm256_unpackhi_epi8(s_256[5], s_256[4]);
1285   r[0] = convolve_6tap_avx2(ss_256 + 0, coeffs);
1286   r[1] = convolve_6tap_avx2(ss_256 + 3, coeffs);
1287   r[2] = convolve_6tap_avx2(tt_256 + 0, coeffs);
1288   r[3] = convolve_6tap_avx2(tt_256 + 3, coeffs);
1289 }
1290 
1291 static inline __m128i y_convolve_8tap_2x2_ssse3(const uint8_t *const src,
1292                                                 const ptrdiff_t stride,
1293                                                 const __m128i coeffs[4],
1294                                                 __m128i s_16[8],
1295                                                 __m128i ss_128[4]) {
1296   s_16[7] = _mm_cvtsi32_si128(loadu_int16(src + 7 * stride));
1297   const __m128i src67 = _mm_unpacklo_epi16(s_16[6], s_16[7]);
1298   s_16[6] = _mm_cvtsi32_si128(loadu_int16(src + 8 * stride));
1299   const __m128i src78 = _mm_unpacklo_epi16(s_16[7], s_16[6]);
1300   ss_128[3] = _mm_unpacklo_epi8(src67, src78);
1301   return convolve_8tap_ssse3(ss_128, coeffs);
1302 }
1303 
1304 static inline __m128i y_convolve_8tap_4x2_ssse3(const uint8_t *const src,
1305                                                 const ptrdiff_t stride,
1306                                                 const __m128i coeffs[4],
1307                                                 __m128i s_32[8],
1308                                                 __m128i ss_128[4]) {
1309   s_32[7] = _mm_cvtsi32_si128(loadu_int32(src + 7 * stride));
1310   const __m128i src67 = _mm_unpacklo_epi32(s_32[6], s_32[7]);
1311   s_32[6] = _mm_cvtsi32_si128(loadu_int32(src + 8 * stride));
1312   const __m128i src78 = _mm_unpacklo_epi32(s_32[7], s_32[6]);
1313   ss_128[3] = _mm_unpacklo_epi8(src67, src78);
1314   return convolve_8tap_ssse3(ss_128, coeffs);
1315 }
1316 
1317 static inline __m256i y_convolve_8tap_8x2_avx2(const uint8_t *const src,
1318                                                const ptrdiff_t stride,
1319                                                const __m256i coeffs[4],
1320                                                __m128i s_64[8],
1321                                                __m256i ss_256[4]) {
1322   s_64[7] = _mm_loadl_epi64((__m128i *)(src + 7 * stride));
1323   const __m256i src67 = _mm256_setr_m128i(s_64[6], s_64[7]);
1324   s_64[6] = _mm_loadl_epi64((__m128i *)(src + 8 * stride));
1325   const __m256i src78 = _mm256_setr_m128i(s_64[7], s_64[6]);
1326   ss_256[3] = _mm256_unpacklo_epi8(src67, src78);
1327   return convolve_8tap_avx2(ss_256, coeffs);
1328 }
1329 
1330 static inline void y_convolve_8tap_16x2_avx2(const uint8_t *const src,
1331                                              const ptrdiff_t stride,
1332                                              const __m256i coeffs[4],
1333                                              __m128i s_128[8],
1334                                              __m256i ss_256[8], __m256i r[2]) {
1335   s_128[7] = _mm_loadu_si128((__m128i *)(src + 7 * stride));
1336   const __m256i src67 = _mm256_setr_m128i(s_128[6], s_128[7]);
1337   s_128[6] = _mm_loadu_si128((__m128i *)(src + 8 * stride));
1338   const __m256i src78 = _mm256_setr_m128i(s_128[7], s_128[6]);
1339   ss_256[3] = _mm256_unpacklo_epi8(src67, src78);
1340   ss_256[7] = _mm256_unpackhi_epi8(src67, src78);
1341   r[0] = convolve_8tap_avx2(ss_256, coeffs);
1342   r[1] = convolve_8tap_avx2(ss_256 + 4, coeffs);
1343 }
1344 
1345 static inline void y_convolve_8tap_32x2_avx2(
1346     const uint8_t *const src, const ptrdiff_t stride, const __m256i coeffs[4],
1347     __m256i s_256[8], __m256i ss_256[8], __m256i tt_256[8], __m256i r[4]) {
1348   s_256[7] = _mm256_loadu_si256((__m256i *)(src + 7 * stride));
1349   ss_256[3] = _mm256_unpacklo_epi8(s_256[6], s_256[7]);
1350   ss_256[7] = _mm256_unpackhi_epi8(s_256[6], s_256[7]);
1351   s_256[6] = _mm256_loadu_si256((__m256i *)(src + 8 * stride));
1352   tt_256[3] = _mm256_unpacklo_epi8(s_256[7], s_256[6]);
1353   tt_256[7] = _mm256_unpackhi_epi8(s_256[7], s_256[6]);
1354   r[0] = convolve_8tap_avx2(ss_256 + 0, coeffs);
1355   r[1] = convolve_8tap_avx2(ss_256 + 4, coeffs);
1356   r[2] = convolve_8tap_avx2(tt_256 + 0, coeffs);
1357   r[3] = convolve_8tap_avx2(tt_256 + 4, coeffs);
1358 }
1359 
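// Horizontal pass of the 2-D (xy) filter: each xy_x_*_32 helper filters 32
// pixels of one row, applies the first-stage rounding (xy_x_round_avx2) and
// stores the 16-bit intermediate row that the vertical xy_y pass below
// consumes.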
1360 static inline void xy_x_convolve_2tap_32_avx2(const uint8_t *const src,
1361                                               const __m256i coeffs[1],
1362                                               __m256i r[2]) {
1363   const __m256i s0 = _mm256_loadu_si256((__m256i *)src);
1364   const __m256i s1 = _mm256_loadu_si256((__m256i *)(src + 1));
1365   const __m256i ss0 = _mm256_unpacklo_epi8(s0, s1);
1366   const __m256i ss1 = _mm256_unpackhi_epi8(s0, s1);
1367 
1368   r[0] = convolve_2tap_avx2(&ss0, coeffs);
1369   r[1] = convolve_2tap_avx2(&ss1, coeffs);
1370 }
1371 
1372 static inline void xy_x_2tap_32_avx2(const uint8_t *const src,
1373                                      const __m256i coeffs[1],
1374                                      int16_t *const dst) {
1375   __m256i r[2];
1376 
1377   xy_x_convolve_2tap_32_avx2(src, coeffs, r);
1378   const __m256i d0 = xy_x_round_avx2(r[0]);
1379   const __m256i d1 = xy_x_round_avx2(r[1]);
1380   _mm256_storeu_si256((__m256i *)dst, d0);
1381   _mm256_storeu_si256((__m256i *)(dst + 16), d1);
1382 }
1383 
1384 static inline void xy_x_4tap_32_avx2(const uint8_t *const src,
1385                                      const __m256i coeffs[2],
1386                                      const __m256i filt[2],
1387                                      int16_t *const dst) {
1388   __m256i r[2];
1389 
1390   x_convolve_4tap_32_avx2(src, coeffs, filt, r);
1391   const __m256i d0 = xy_x_round_avx2(r[0]);
1392   const __m256i d1 = xy_x_round_avx2(r[1]);
1393   _mm256_storeu_si256((__m256i *)dst, d0);
1394   _mm256_storeu_si256((__m256i *)(dst + 16), d1);
1395 }
1396 
1397 static inline void xy_x_6tap_32_avx2(const uint8_t *const src,
1398                                      const __m256i coeffs[3],
1399                                      const __m256i filt[3],
1400                                      int16_t *const dst) {
1401   __m256i r[2];
1402 
1403   x_convolve_6tap_32_avx2(src, coeffs, filt, r);
1404   const __m256i d0 = xy_x_round_avx2(r[0]);
1405   const __m256i d1 = xy_x_round_avx2(r[1]);
1406   _mm256_storeu_si256((__m256i *)dst, d0);
1407   _mm256_storeu_si256((__m256i *)(dst + 16), d1);
1408 }
1409 
1410 static inline void xy_x_8tap_32_avx2(const uint8_t *const src,
1411                                      const __m256i coeffs[4],
1412                                      const __m256i filt[4],
1413                                      int16_t *const dst) {
1414   __m256i r[2];
1415 
1416   x_convolve_8tap_32_avx2(src, coeffs, filt, r);
1417   const __m256i d0 = xy_x_round_avx2(r[0]);
1418   const __m256i d1 = xy_x_round_avx2(r[1]);
1419   _mm256_storeu_si256((__m256i *)dst, d0);
1420   _mm256_storeu_si256((__m256i *)(dst + 16), d1);
1421 }
1422 
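// Vertical pass of the 2-D (xy) filter, operating on the 16-bit intermediate
// buffer produced by the xy_x helpers above. The *_half_pel_* variants
// exploit the symmetry of the half-pel filter: mirrored rows are pre-added so
// fewer multiplies are needed (the 2-tap half-pel case reduces to a plain
// add followed by rounding).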
1423 static inline __m128i xy_y_convolve_2tap_2x2_sse2(const int16_t *const src,
1424                                                   __m128i s_32[2],
1425                                                   const __m128i coeffs[1]) {
1426   __m128i s_128[2];
1427 
1428   s_32[1] = _mm_cvtsi32_si128(loadu_int32(src + 2));
1429   s_128[0] = _mm_unpacklo_epi32(s_32[0], s_32[1]);
1430   s_32[0] = _mm_cvtsi32_si128(loadu_int32(src + 2 * 2));
1431   s_128[1] = _mm_unpacklo_epi32(s_32[1], s_32[0]);
1432   const __m128i ss = _mm_unpacklo_epi16(s_128[0], s_128[1]);
1433   return convolve16_2tap_sse2(&ss, coeffs);
1434 }
1435 
1436 static inline __m128i xy_y_convolve_2tap_2x2_half_pel_sse2(
1437     const int16_t *const src, __m128i s_32[2]) {
1438   __m128i s_128[2];
1439 
1440   s_32[1] = _mm_cvtsi32_si128(loadu_int32(src + 2));
1441   s_128[0] = _mm_unpacklo_epi32(s_32[0], s_32[1]);
1442   s_32[0] = _mm_cvtsi32_si128(loadu_int32(src + 2 * 2));
1443   s_128[1] = _mm_unpacklo_epi32(s_32[1], s_32[0]);
1444   return _mm_add_epi16(s_128[0], s_128[1]);
1445 }
1446 
1447 static inline void xy_y_convolve_2tap_4x2_sse2(const int16_t *const src,
1448                                                __m128i s_64[2],
1449                                                const __m128i coeffs[1],
1450                                                __m128i r[2]) {
1451   __m128i s_128[2];
1452 
1453   s_64[1] = _mm_loadl_epi64((__m128i *)(src + 4));
1454   s_128[0] = _mm_unpacklo_epi64(s_64[0], s_64[1]);
1455   s_64[0] = _mm_loadl_epi64((__m128i *)(src + 2 * 4));
1456   s_128[1] = _mm_unpacklo_epi64(s_64[1], s_64[0]);
1457   const __m128i ss0 = _mm_unpacklo_epi16(s_128[0], s_128[1]);
1458   const __m128i ss1 = _mm_unpackhi_epi16(s_128[0], s_128[1]);
1459   r[0] = convolve16_2tap_sse2(&ss0, coeffs);
1460   r[1] = convolve16_2tap_sse2(&ss1, coeffs);
1461 }
1462 
1463 static inline __m128i xy_y_convolve_2tap_4x2_half_pel_sse2(
1464     const int16_t *const src, __m128i s_64[2]) {
1465   __m128i s_128[2];
1466 
1467   s_64[1] = _mm_loadl_epi64((__m128i *)(src + 4));
1468   s_128[0] = _mm_unpacklo_epi64(s_64[0], s_64[1]);
1469   s_64[0] = _mm_loadl_epi64((__m128i *)(src + 2 * 4));
1470   s_128[1] = _mm_unpacklo_epi64(s_64[1], s_64[0]);
1471   return _mm_add_epi16(s_128[0], s_128[1]);
1472 }
1473 
1474 static inline void xy_y_convolve_2tap_16_avx2(const __m256i s0,
1475                                               const __m256i s1,
1476                                               const __m256i coeffs[1],
1477                                               __m256i r[2]) {
1478   const __m256i ss0 = _mm256_unpacklo_epi16(s0, s1);
1479   const __m256i ss1 = _mm256_unpackhi_epi16(s0, s1);
1480   r[0] = convolve16_2tap_avx2(&ss0, coeffs);
1481   r[1] = convolve16_2tap_avx2(&ss1, coeffs);
1482 }
1483 
1484 static inline void xy_y_convolve_2tap_8x2_avx2(const int16_t *const src,
1485                                                __m128i s_128[2],
1486                                                const __m256i coeffs[1],
1487                                                __m256i r[2]) {
1488   __m256i s_256[2];
1489   s_128[1] = _mm_loadu_si128((__m128i *)(src + 8));
1490   s_256[0] = _mm256_setr_m128i(s_128[0], s_128[1]);
1491   s_128[0] = _mm_loadu_si128((__m128i *)(src + 2 * 8));
1492   s_256[1] = _mm256_setr_m128i(s_128[1], s_128[0]);
1493   xy_y_convolve_2tap_16_avx2(s_256[0], s_256[1], coeffs, r);
1494 }
1495 
1496 static inline __m256i xy_y_convolve_2tap_8x2_half_pel_avx2(
1497     const int16_t *const src, __m128i s_128[2]) {
1498   __m256i s_256[2];
1499   s_128[1] = _mm_loadu_si128((__m128i *)(src + 8));
1500   s_256[0] = _mm256_setr_m128i(s_128[0], s_128[1]);
1501   s_128[0] = _mm_loadu_si128((__m128i *)(src + 2 * 8));
1502   s_256[1] = _mm256_setr_m128i(s_128[1], s_128[0]);
1503   return _mm256_add_epi16(s_256[0], s_256[1]);
1504 }
1505 
1506 static inline void xy_y_convolve_2tap_16x2_half_pel_avx2(
1507     const int16_t *const src, __m256i s_256[2], __m256i r[2]) {
1508   s_256[1] = _mm256_loadu_si256((__m256i *)(src + 16));
1509   r[0] = _mm256_add_epi16(s_256[0], s_256[1]);
1510   s_256[0] = _mm256_loadu_si256((__m256i *)(src + 2 * 16));
1511   r[1] = _mm256_add_epi16(s_256[1], s_256[0]);
1512 }
1513 
1514 static inline void xy_y_store_16x2_avx2(const __m256i r[2], uint8_t *const dst,
1515                                         const ptrdiff_t stride) {
1516   const __m256i t = _mm256_packus_epi16(r[0], r[1]);
1517   const __m256i d = _mm256_permute4x64_epi64(t, 0xD8);
1518   storeu_u8_16x2_avx2(d, dst, stride);
1519 }
1520 
1521 static inline void xy_y_convolve_2tap_16x2_avx2(const int16_t *const src,
1522                                                 __m256i s[2],
1523                                                 const __m256i coeffs[1],
1524                                                 __m256i r[4]) {
1525   s[1] = _mm256_loadu_si256((__m256i *)(src + 16));
1526   xy_y_convolve_2tap_16_avx2(s[0], s[1], coeffs, r + 0);
1527   s[0] = _mm256_loadu_si256((__m256i *)(src + 2 * 16));
1528   xy_y_convolve_2tap_16_avx2(s[1], s[0], coeffs, r + 2);
1529 }
1530 
1531 static inline void xy_y_convolve_2tap_32_avx2(const int16_t *const src,
1532                                               const __m256i s0[2],
1533                                               __m256i s1[2],
1534                                               const __m256i coeffs[1],
1535                                               __m256i r[4]) {
1536   s1[0] = _mm256_loadu_si256((__m256i *)src);
1537   s1[1] = _mm256_loadu_si256((__m256i *)(src + 16));
1538   xy_y_convolve_2tap_16_avx2(s0[0], s1[0], coeffs, r + 0);
1539   xy_y_convolve_2tap_16_avx2(s0[1], s1[1], coeffs, r + 2);
1540 }
1541 
1542 static inline void xy_y_convolve_2tap_32_all_avx2(const int16_t *const src,
1543                                                   const __m256i s0[2],
1544                                                   __m256i s1[2],
1545                                                   const __m256i coeffs[1],
1546                                                   uint8_t *const dst) {
1547   __m256i r[4];
1548 
1549   xy_y_convolve_2tap_32_avx2(src, s0, s1, coeffs, r);
1550   xy_y_round_store_32_avx2(r + 0, r + 2, dst);
1551 }
1552 
1553 static inline void xy_y_convolve_2tap_half_pel_32_avx2(const int16_t *const src,
1554                                                        const __m256i s0[2],
1555                                                        __m256i s1[2],
1556                                                        __m256i r[2]) {
1557   s1[0] = _mm256_loadu_si256((__m256i *)src);
1558   s1[1] = _mm256_loadu_si256((__m256i *)(src + 16));
1559   r[0] = _mm256_add_epi16(s0[0], s1[0]);
1560   r[1] = _mm256_add_epi16(s0[1], s1[1]);
1561 }
1562 
1563 static inline void xy_y_convolve_2tap_half_pel_32_all_avx2(
1564     const int16_t *const src, const __m256i s0[2], __m256i s1[2],
1565     uint8_t *const dst) {
1566   __m256i r[2];
1567 
1568   xy_y_convolve_2tap_half_pel_32_avx2(src, s0, s1, r);
1569   r[0] = xy_y_round_half_pel_avx2(r[0]);
1570   r[1] = xy_y_round_half_pel_avx2(r[1]);
1571   xy_y_pack_store_32_avx2(r[0], r[1], dst);
1572 }
1573 
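// For the 4-, 6- and 8-tap xy_y helpers below, ss_*/tt_* hold the already
// interleaved row pairs of the sliding filter window. Each call loads the
// next two rows, convolves, then shifts the window by one row pair so the
// interleaving work is shared across output rows.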
1574 static inline __m128i xy_y_convolve_4tap_2x2_sse2(const int16_t *const src,
1575                                                   __m128i s_32[4],
1576                                                   __m128i ss_128[2],
1577                                                   const __m128i coeffs[2]) {
1578   s_32[3] = _mm_cvtsi32_si128(loadu_int32(src + 3 * 2));
1579   const __m128i src23 = _mm_unpacklo_epi32(s_32[2], s_32[3]);
1580   s_32[2] = _mm_cvtsi32_si128(loadu_int32(src + 4 * 2));
1581   const __m128i src34 = _mm_unpacklo_epi32(s_32[3], s_32[2]);
1582   ss_128[1] = _mm_unpacklo_epi16(src23, src34);
1583   const __m128i r = convolve16_4tap_sse2(ss_128, coeffs);
1584   ss_128[0] = ss_128[1];
1585   return r;
1586 }
1587 
1588 static inline __m256i xy_y_convolve_4tap_4x2_avx2(const int16_t *const src,
1589                                                   __m128i s_64[4],
1590                                                   __m256i ss_256[2],
1591                                                   const __m256i coeffs[2]) {
1592   __m256i s_256[2];
1593   s_64[3] = _mm_loadl_epi64((__m128i *)(src + 3 * 4));
1594   s_256[0] = _mm256_setr_m128i(s_64[2], s_64[3]);
1595   s_64[2] = _mm_loadl_epi64((__m128i *)(src + 4 * 4));
1596   s_256[1] = _mm256_setr_m128i(s_64[3], s_64[2]);
1597   ss_256[1] = _mm256_unpacklo_epi16(s_256[0], s_256[1]);
1598   const __m256i r = convolve16_4tap_avx2(ss_256, coeffs);
1599   ss_256[0] = ss_256[1];
1600   return r;
1601 }
1602 
1603 static inline void xy_y_convolve_4tap_16_avx2(const __m256i *const ss,
1604                                               const __m256i coeffs[2],
1605                                               __m256i r[2]) {
1606   r[0] = convolve16_4tap_avx2(ss, coeffs);
1607   r[1] = convolve16_4tap_avx2(ss + 2, coeffs);
1608 }
1609 
1610 static inline void xy_y_convolve_4tap_8x2_avx2(const int16_t *const src,
1611                                                __m256i ss_256[4],
1612                                                const __m256i coeffs[2],
1613                                                __m256i r[2]) {
1614   __m256i s_256[2];
1615   s_256[0] = _mm256_loadu_si256((__m256i *)(src + 2 * 8));
1616   s_256[1] = _mm256_loadu_si256((__m256i *)(src + 3 * 8));
1617   ss_256[1] = _mm256_unpacklo_epi16(s_256[0], s_256[1]);
1618   ss_256[3] = _mm256_unpackhi_epi16(s_256[0], s_256[1]);
1619   xy_y_convolve_4tap_16_avx2(ss_256, coeffs, r);
1620   ss_256[0] = ss_256[1];
1621   ss_256[2] = ss_256[3];
1622 }
1623 
1624 static inline void xy_y_convolve_4tap_8x2_half_pel_avx2(
1625     const int16_t *const src, const __m256i coeffs[1], __m256i s_256[4],
1626     __m256i r[2]) {
1627   __m256i a_256[2];
1628   s_256[2] = _mm256_loadu_si256((__m256i *)(src + 2 * 8));
1629   s_256[3] = _mm256_loadu_si256((__m256i *)(src + 3 * 8));
1630   a_256[0] = _mm256_add_epi16(s_256[0], s_256[3]);
1631   a_256[1] = _mm256_add_epi16(s_256[1], s_256[2]);
1632   xy_y_convolve_2tap_16_avx2(a_256[0], a_256[1], coeffs, r);
1633   s_256[0] = s_256[2];
1634   s_256[1] = s_256[3];
1635 }
1636 
1637 static inline void xy_y_convolve_4tap_16x2_avx2(
1638     const int16_t *const src, __m256i s_256[4], __m256i ss_256[4],
1639     __m256i tt_256[4], const __m256i coeffs[2], __m256i r[4]) {
1640   s_256[3] = _mm256_loadu_si256((__m256i *)(src + 3 * 16));
1641   ss_256[1] = _mm256_unpacklo_epi16(s_256[2], s_256[3]);
1642   ss_256[3] = _mm256_unpackhi_epi16(s_256[2], s_256[3]);
1643   s_256[2] = _mm256_loadu_si256((__m256i *)(src + 4 * 16));
1644   tt_256[1] = _mm256_unpacklo_epi16(s_256[3], s_256[2]);
1645   tt_256[3] = _mm256_unpackhi_epi16(s_256[3], s_256[2]);
1646   xy_y_convolve_4tap_16_avx2(ss_256, coeffs, r + 0);
1647   xy_y_convolve_4tap_16_avx2(tt_256, coeffs, r + 2);
1648   ss_256[0] = ss_256[1];
1649   ss_256[2] = ss_256[3];
1650   tt_256[0] = tt_256[1];
1651   tt_256[2] = tt_256[3];
1652 }
1653 
1654 static inline void xy_y_convolve_4tap_32x2_avx2(
1655     const int16_t *const src, const ptrdiff_t stride, __m256i s_256[4],
1656     __m256i ss_256[4], __m256i tt_256[4], const __m256i coeffs[2],
1657     __m256i r[4]) {
1658   s_256[3] = _mm256_loadu_si256((__m256i *)(src + 3 * stride));
1659   ss_256[1] = _mm256_unpacklo_epi16(s_256[2], s_256[3]);
1660   ss_256[3] = _mm256_unpackhi_epi16(s_256[2], s_256[3]);
1661   s_256[2] = _mm256_loadu_si256((__m256i *)(src + 4 * stride));
1662   tt_256[1] = _mm256_unpacklo_epi16(s_256[3], s_256[2]);
1663   tt_256[3] = _mm256_unpackhi_epi16(s_256[3], s_256[2]);
1664   xy_y_convolve_4tap_16_avx2(ss_256, coeffs, r + 0);
1665   xy_y_convolve_4tap_16_avx2(tt_256, coeffs, r + 2);
1666   ss_256[0] = ss_256[1];
1667   ss_256[2] = ss_256[3];
1668   tt_256[0] = tt_256[1];
1669   tt_256[2] = tt_256[3];
1670 }
1671 
1672 static inline void xy_y_convolve_4tap_16x2_half_pelavx2(
1673     const int16_t *const src, __m256i s_256[5], const __m256i coeffs[1],
1674     __m256i r[4]) {
1675   __m256i a_256[2];
1676 
1677   s_256[3] = _mm256_loadu_si256((__m256i *)(src + 3 * 16));
1678   s_256[4] = _mm256_loadu_si256((__m256i *)(src + 4 * 16));
1679 
1680   a_256[0] = _mm256_add_epi16(s_256[0], s_256[3]);
1681   a_256[1] = _mm256_add_epi16(s_256[1], s_256[2]);
1682   xy_y_convolve_2tap_16_avx2(a_256[0], a_256[1], coeffs, r + 0);
1683 
1684   a_256[0] = _mm256_add_epi16(s_256[1], s_256[4]);
1685   a_256[1] = _mm256_add_epi16(s_256[2], s_256[3]);
1686   xy_y_convolve_2tap_16_avx2(a_256[0], a_256[1], coeffs, r + 2);
1687 
1688   s_256[0] = s_256[2];
1689   s_256[1] = s_256[3];
1690   s_256[2] = s_256[4];
1691 }
1692 
1693 static inline __m128i xy_y_convolve_6tap_2x2_sse2(const int16_t *const src,
1694                                                   __m128i s_32[6],
1695                                                   __m128i ss_128[3],
1696                                                   const __m128i coeffs[3]) {
1697   s_32[5] = _mm_cvtsi32_si128(loadu_int32(src + 5 * 2));
1698   const __m128i src45 = _mm_unpacklo_epi32(s_32[4], s_32[5]);
1699   s_32[4] = _mm_cvtsi32_si128(loadu_int32(src + 6 * 2));
1700   const __m128i src56 = _mm_unpacklo_epi32(s_32[5], s_32[4]);
1701   ss_128[2] = _mm_unpacklo_epi16(src45, src56);
1702   const __m128i r = convolve16_6tap_sse2(ss_128, coeffs);
1703   ss_128[0] = ss_128[1];
1704   ss_128[1] = ss_128[2];
1705   return r;
1706 }
1707 
1708 static inline __m256i xy_y_convolve_6tap_4x2_avx2(const int16_t *const src,
1709                                                   __m128i s_64[6],
1710                                                   __m256i ss_256[3],
1711                                                   const __m256i coeffs[3]) {
1712   __m256i s_256[2];
1713   s_64[5] = _mm_loadl_epi64((__m128i *)(src + 5 * 4));
1714   s_256[0] = _mm256_setr_m128i(s_64[4], s_64[5]);
1715   s_64[4] = _mm_loadl_epi64((__m128i *)(src + 6 * 4));
1716   s_256[1] = _mm256_setr_m128i(s_64[5], s_64[4]);
1717   ss_256[2] = _mm256_unpacklo_epi16(s_256[0], s_256[1]);
1718   const __m256i r = convolve16_6tap_avx2(ss_256, coeffs);
1719   ss_256[0] = ss_256[1];
1720   ss_256[1] = ss_256[2];
1721   return r;
1722 }
1723 
1724 static inline void xy_y_convolve_6tap_16_avx2(const __m256i ss[6],
1725                                               const __m256i coeffs[3],
1726                                               __m256i r[2]) {
1727   r[0] = convolve16_6tap_avx2(ss, coeffs);
1728   r[1] = convolve16_6tap_avx2(ss + 3, coeffs);
1729 }
1730 
1731 static inline void xy_y_convolve_6tap_8x2_avx2(const int16_t *const src,
1732                                                __m256i ss_256[6],
1733                                                const __m256i coeffs[3],
1734                                                __m256i r[2]) {
1735   __m256i s_256[2];
1736   s_256[0] = _mm256_loadu_si256((__m256i *)(src + 4 * 8));
1737   s_256[1] = _mm256_loadu_si256((__m256i *)(src + 5 * 8));
1738   ss_256[2] = _mm256_unpacklo_epi16(s_256[0], s_256[1]);
1739   ss_256[5] = _mm256_unpackhi_epi16(s_256[0], s_256[1]);
1740   xy_y_convolve_6tap_16_avx2(ss_256, coeffs, r);
1741   ss_256[0] = ss_256[1];
1742   ss_256[1] = ss_256[2];
1743   ss_256[3] = ss_256[4];
1744   ss_256[4] = ss_256[5];
1745 }
1746 
1747 static inline void xy_y_convolve_6tap_8x2_half_pel_avx2(
1748     const int16_t *const src, const __m256i coeffs[2], __m256i s_256[6],
1749     __m256i r[2]) {
1750   __m256i a_256[2], ss_256[4];
1751   s_256[4] = _mm256_loadu_si256((__m256i *)(src + 4 * 8));
1752   s_256[5] = _mm256_loadu_si256((__m256i *)(src + 5 * 8));
1753   a_256[0] = _mm256_add_epi16(s_256[0], s_256[5]);
1754   a_256[1] = _mm256_add_epi16(s_256[1], s_256[4]);
1755   ss_256[0] = _mm256_unpacklo_epi16(a_256[0], a_256[1]);
1756   ss_256[1] = _mm256_unpacklo_epi16(s_256[2], s_256[3]);
1757   ss_256[2] = _mm256_unpackhi_epi16(a_256[0], a_256[1]);
1758   ss_256[3] = _mm256_unpackhi_epi16(s_256[2], s_256[3]);
1759   xy_y_convolve_4tap_16_avx2(ss_256, coeffs, r);
1760   s_256[0] = s_256[2];
1761   s_256[1] = s_256[3];
1762   s_256[2] = s_256[4];
1763   s_256[3] = s_256[5];
1764 }
1765 
1766 static inline void xy_y_convolve_6tap_16x2_avx2(
1767     const int16_t *const src, const ptrdiff_t stride, __m256i s_256[6],
1768     __m256i ss_256[6], __m256i tt_256[6], const __m256i coeffs[3],
1769     __m256i r[4]) {
1770   s_256[5] = _mm256_loadu_si256((__m256i *)(src + 5 * stride));
1771   ss_256[2] = _mm256_unpacklo_epi16(s_256[4], s_256[5]);
1772   ss_256[5] = _mm256_unpackhi_epi16(s_256[4], s_256[5]);
1773   s_256[4] = _mm256_loadu_si256((__m256i *)(src + 6 * stride));
1774   tt_256[2] = _mm256_unpacklo_epi16(s_256[5], s_256[4]);
1775   tt_256[5] = _mm256_unpackhi_epi16(s_256[5], s_256[4]);
1776 
1777   xy_y_convolve_6tap_16_avx2(ss_256, coeffs, r + 0);
1778   xy_y_convolve_6tap_16_avx2(tt_256, coeffs, r + 2);
1779 
1780   ss_256[0] = ss_256[1];
1781   ss_256[1] = ss_256[2];
1782   ss_256[3] = ss_256[4];
1783   ss_256[4] = ss_256[5];
1784 
1785   tt_256[0] = tt_256[1];
1786   tt_256[1] = tt_256[2];
1787   tt_256[3] = tt_256[4];
1788   tt_256[4] = tt_256[5];
1789 }
1790 
1791 static inline void xy_y_convolve_6tap_16x2_half_pel_avx2(
1792     const int16_t *const src, const ptrdiff_t stride, __m256i s_256[6],
1793     __m256i ss_256[4], const __m256i coeffs[2], __m256i r[4]) {
1794   __m256i a_256[2];
1795 
1796   s_256[5] = _mm256_loadu_si256((__m256i *)(src + 5 * stride));
1797   a_256[0] = _mm256_add_epi16(s_256[0], s_256[5]);
1798   a_256[1] = _mm256_add_epi16(s_256[1], s_256[4]);
1799   ss_256[0] = _mm256_unpacklo_epi16(a_256[0], a_256[1]);
1800   ss_256[1] = _mm256_unpacklo_epi16(s_256[2], s_256[3]);
1801   ss_256[2] = _mm256_unpackhi_epi16(a_256[0], a_256[1]);
1802   ss_256[3] = _mm256_unpackhi_epi16(s_256[2], s_256[3]);
1803   xy_y_convolve_4tap_16_avx2(ss_256, coeffs, r + 0);
1804 
1805   a_256[1] = _mm256_add_epi16(s_256[2], s_256[5]);
1806   s_256[0] = s_256[2];
1807   s_256[2] = s_256[4];
1808   s_256[4] = _mm256_loadu_si256((__m256i *)(src + 6 * stride));
1809   a_256[0] = _mm256_add_epi16(s_256[1], s_256[4]);
1810   s_256[1] = s_256[3];
1811   s_256[3] = s_256[5];
1812   ss_256[0] = _mm256_unpacklo_epi16(a_256[0], a_256[1]);
1813   ss_256[1] = _mm256_unpacklo_epi16(s_256[1], s_256[2]);
1814   ss_256[2] = _mm256_unpackhi_epi16(a_256[0], a_256[1]);
1815   ss_256[3] = _mm256_unpackhi_epi16(s_256[1], s_256[2]);
1816   xy_y_convolve_4tap_16_avx2(ss_256, coeffs, r + 2);
1817 }
1818 
1819 static inline __m128i xy_y_convolve_8tap_2x2_sse2(const int16_t *const src,
1820                                                   __m128i s_32[8],
1821                                                   __m128i ss_128[4],
1822                                                   const __m128i coeffs[4]) {
1823   s_32[7] = _mm_cvtsi32_si128(loadu_int32(src + 7 * 2));
1824   const __m128i src67 = _mm_unpacklo_epi32(s_32[6], s_32[7]);
1825   s_32[6] = _mm_cvtsi32_si128(loadu_int32(src + 8 * 2));
1826   const __m128i src78 = _mm_unpacklo_epi32(s_32[7], s_32[6]);
1827   ss_128[3] = _mm_unpacklo_epi16(src67, src78);
1828   const __m128i r = convolve16_8tap_sse2(ss_128, coeffs);
1829   ss_128[0] = ss_128[1];
1830   ss_128[1] = ss_128[2];
1831   ss_128[2] = ss_128[3];
1832   return r;
1833 }
1834 
1835 static inline __m256i xy_y_convolve_8tap_4x2_avx2(const int16_t *const src,
1836                                                   __m128i s_64[8],
1837                                                   __m256i ss_256[4],
1838                                                   const __m256i coeffs[4]) {
1839   __m256i s_256[2];
1840   s_64[7] = _mm_loadl_epi64((__m128i *)(src + 7 * 4));
1841   s_256[0] = _mm256_setr_m128i(s_64[6], s_64[7]);
1842   s_64[6] = _mm_loadl_epi64((__m128i *)(src + 8 * 4));
1843   s_256[1] = _mm256_setr_m128i(s_64[7], s_64[6]);
1844   ss_256[3] = _mm256_unpacklo_epi16(s_256[0], s_256[1]);
1845   const __m256i r = convolve16_8tap_avx2(ss_256, coeffs);
1846   ss_256[0] = ss_256[1];
1847   ss_256[1] = ss_256[2];
1848   ss_256[2] = ss_256[3];
1849   return r;
1850 }
1851 
1852 static inline void xy_y_convolve_8tap_16_avx2(const __m256i *const ss,
1853                                               const __m256i coeffs[4],
1854                                               __m256i r[2]) {
1855   r[0] = convolve16_8tap_avx2(ss, coeffs);
1856   r[1] = convolve16_8tap_avx2(ss + 4, coeffs);
1857 }
1858 
1859 static inline void xy_y_convolve_8tap_8x2_avx2(const int16_t *const src,
1860                                                __m256i ss_256[8],
1861                                                const __m256i coeffs[4],
1862                                                __m256i r[2]) {
1863   __m256i s_256[2];
1864   s_256[0] = _mm256_loadu_si256((__m256i *)(src + 6 * 8));
1865   s_256[1] = _mm256_loadu_si256((__m256i *)(src + 7 * 8));
1866   ss_256[3] = _mm256_unpacklo_epi16(s_256[0], s_256[1]);
1867   ss_256[7] = _mm256_unpackhi_epi16(s_256[0], s_256[1]);
1868   xy_y_convolve_8tap_16_avx2(ss_256, coeffs, r);
1869   ss_256[0] = ss_256[1];
1870   ss_256[1] = ss_256[2];
1871   ss_256[2] = ss_256[3];
1872   ss_256[4] = ss_256[5];
1873   ss_256[5] = ss_256[6];
1874   ss_256[6] = ss_256[7];
1875 }
1876 
1877 static inline void xy_y_convolve_8tap_8x2_half_pel_avx2(
1878     const int16_t *const src, const __m256i coeffs[2], __m256i s_256[8],
1879     __m256i r[2]) {
1880   __m256i a_256[4], ss_256[4];
1881 
1882   s_256[6] = _mm256_loadu_si256((__m256i *)(src + 6 * 8));
1883   s_256[7] = _mm256_loadu_si256((__m256i *)(src + 7 * 8));
1884   a_256[0] = _mm256_add_epi16(s_256[0], s_256[7]);
1885   a_256[1] = _mm256_add_epi16(s_256[1], s_256[6]);
1886   a_256[2] = _mm256_add_epi16(s_256[2], s_256[5]);
1887   a_256[3] = _mm256_add_epi16(s_256[3], s_256[4]);
1888   ss_256[0] = _mm256_unpacklo_epi16(a_256[0], a_256[1]);
1889   ss_256[1] = _mm256_unpacklo_epi16(a_256[2], a_256[3]);
1890   ss_256[2] = _mm256_unpackhi_epi16(a_256[0], a_256[1]);
1891   ss_256[3] = _mm256_unpackhi_epi16(a_256[2], a_256[3]);
1892   xy_y_convolve_4tap_16_avx2(ss_256, coeffs, r);
1893   s_256[0] = s_256[2];
1894   s_256[1] = s_256[3];
1895   s_256[2] = s_256[4];
1896   s_256[3] = s_256[5];
1897   s_256[4] = s_256[6];
1898   s_256[5] = s_256[7];
1899 }
1900 
1901 static AOM_FORCE_INLINE void xy_y_convolve_8tap_16x2_avx2(
1902     const int16_t *const src, const ptrdiff_t stride, const __m256i coeffs[4],
1903     __m256i s_256[8], __m256i ss_256[8], __m256i tt_256[8], __m256i r[4]) {
1904   s_256[7] = _mm256_loadu_si256((__m256i *)(src + 7 * stride));
1905   ss_256[3] = _mm256_unpacklo_epi16(s_256[6], s_256[7]);
1906   ss_256[7] = _mm256_unpackhi_epi16(s_256[6], s_256[7]);
1907   s_256[6] = _mm256_loadu_si256((__m256i *)(src + 8 * stride));
1908   tt_256[3] = _mm256_unpacklo_epi16(s_256[7], s_256[6]);
1909   tt_256[7] = _mm256_unpackhi_epi16(s_256[7], s_256[6]);
1910 
1911   xy_y_convolve_8tap_16_avx2(ss_256, coeffs, r + 0);
1912   xy_y_convolve_8tap_16_avx2(tt_256, coeffs, r + 2);
1913 
1914   ss_256[0] = ss_256[1];
1915   ss_256[1] = ss_256[2];
1916   ss_256[2] = ss_256[3];
1917   ss_256[4] = ss_256[5];
1918   ss_256[5] = ss_256[6];
1919   ss_256[6] = ss_256[7];
1920 
1921   tt_256[0] = tt_256[1];
1922   tt_256[1] = tt_256[2];
1923   tt_256[2] = tt_256[3];
1924   tt_256[4] = tt_256[5];
1925   tt_256[5] = tt_256[6];
1926   tt_256[6] = tt_256[7];
1927 }
1928 
1929 static inline void xy_y_convolve_8tap_16x2_half_pel_avx2(
1930     const int16_t *const src, const ptrdiff_t stride, const __m256i coeffs[4],
1931     __m256i s_256[8], __m256i r[4]) {
1932   __m256i a_256[4], ss_256[4];
1933   s_256[7] = _mm256_loadu_si256((__m256i *)(src + 7 * stride));
1934 
1935   a_256[0] = _mm256_add_epi16(s_256[0], s_256[7]);
1936   a_256[1] = _mm256_add_epi16(s_256[1], s_256[6]);
1937   a_256[2] = _mm256_add_epi16(s_256[2], s_256[5]);
1938   a_256[3] = _mm256_add_epi16(s_256[3], s_256[4]);
1939   ss_256[0] = _mm256_unpacklo_epi16(a_256[0], a_256[1]);
1940   ss_256[1] = _mm256_unpacklo_epi16(a_256[2], a_256[3]);
1941   ss_256[2] = _mm256_unpackhi_epi16(a_256[0], a_256[1]);
1942   ss_256[3] = _mm256_unpackhi_epi16(a_256[2], a_256[3]);
1943 
1944   xy_y_convolve_4tap_16_avx2(ss_256, coeffs, r + 0);
1945 
1946   a_256[1] = _mm256_add_epi16(s_256[2], s_256[7]);
1947   a_256[2] = _mm256_add_epi16(s_256[3], s_256[6]);
1948   a_256[3] = _mm256_add_epi16(s_256[4], s_256[5]);
1949   s_256[0] = s_256[2];
1950   s_256[2] = s_256[4];
1951   s_256[4] = s_256[6];
1952   s_256[6] = _mm256_loadu_si256((__m256i *)(src + 8 * stride));
1953 
1954   a_256[0] = _mm256_add_epi16(s_256[1], s_256[6]);
1955   s_256[1] = s_256[3];
1956   s_256[3] = s_256[5];
1957   s_256[5] = s_256[7];
1958   ss_256[0] = _mm256_unpacklo_epi16(a_256[0], a_256[1]);
1959   ss_256[1] = _mm256_unpacklo_epi16(a_256[2], a_256[3]);
1960   ss_256[2] = _mm256_unpackhi_epi16(a_256[0], a_256[1]);
1961   ss_256[3] = _mm256_unpackhi_epi16(a_256[2], a_256[3]);
1962 
1963   xy_y_convolve_4tap_16_avx2(ss_256, coeffs, r + 2);
1964 }
1965 
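// Rounding / store helpers: round the filter accumulators back to 8-bit
// pixels and write them out. The sr_y_* variants serve the single-reference
// vertical path implemented by av1_convolve_y_sr_specialized_avx2() below.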
1966 static inline void xy_y_round_store_8x2_avx2(const __m256i res[2],
1967                                              uint8_t *const dst,
1968                                              const ptrdiff_t stride) {
1969   const __m256i r = xy_y_round_16_avx2(res);
1970   pack_store_8x2_avx2(r, dst, stride);
1971 }
1972 
1973 static inline void xy_y_round_store_16x2_avx2(const __m256i res[4],
1974                                               uint8_t *const dst,
1975                                               const ptrdiff_t stride) {
1976   const __m256i r0 = xy_y_round_16_avx2(res + 0);
1977   const __m256i r1 = xy_y_round_16_avx2(res + 2);
1978   xy_y_pack_store_16x2_avx2(r0, r1, dst, stride);
1979 }
1980 
1981 static inline void sr_y_round_store_32_avx2(const __m256i res[2],
1982                                             uint8_t *const dst) {
1983   __m256i r[2];
1984 
1985   r[0] = sr_y_round_avx2(res[0]);
1986   r[1] = sr_y_round_avx2(res[1]);
1987   convolve_store_32_avx2(r[0], r[1], dst);
1988 }
1989 
1990 static inline void sr_y_round_store_32x2_avx2(const __m256i res[4],
1991                                               uint8_t *const dst,
1992                                               const int32_t dst_stride) {
1993   sr_y_round_store_32_avx2(res, dst);
1994   sr_y_round_store_32_avx2(res + 2, dst + dst_stride);
1995 }
1996 
1997 static inline void sr_y_2tap_32_avx2(const uint8_t *const src,
1998                                      const __m256i coeffs[1], const __m256i s0,
1999                                      __m256i *const s1, uint8_t *const dst) {
2000   __m256i r[2];
2001   y_convolve_2tap_32_avx2(src, coeffs, s0, s1, r);
2002   sr_y_round_store_32_avx2(r, dst);
2003 }
2004 
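// Single-reference vertical convolution. The filter length (2/4/6/8 taps) is
// derived from filter_params_y and subpel_y_q4, and each tap count branches
// on the block width so narrow blocks stay in SSE registers while blocks of
// 16 pixels and wider use the AVX2 helpers above. For the 2-tap filter,
// subpel_y_q4 == 8 (the half-pel position) is special-cased: both taps are
// then equal, so the filter reduces to averaging adjacent rows.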
2005 static AOM_FORCE_INLINE void av1_convolve_y_sr_specialized_avx2(
2006     const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride,
2007     int32_t w, int32_t h, const InterpFilterParams *filter_params_y,
2008     const int32_t subpel_y_q4) {
2009   int32_t x, y;
2010   __m128i coeffs_128[4];
2011   __m256i coeffs_256[4];
2012 
2013   int vert_tap = get_filter_tap(filter_params_y, subpel_y_q4);
2014 
2015   if (vert_tap == 2) {
2016     // 2-tap vertical filter
2017     const uint8_t *src_ptr = src;
2018 
2019     y = h;
2020 
2021     if (subpel_y_q4 != 8) {
2022       if (w <= 8) {
2023         prepare_half_coeffs_2tap_ssse3(filter_params_y, subpel_y_q4,
2024                                        coeffs_128);
2025 
2026         if (w == 2) {
2027           __m128i s_16[2];
2028 
2029           s_16[0] = _mm_cvtsi32_si128(*(int16_t *)src_ptr);
2030 
2031           do {
2032             const __m128i res = y_convolve_2tap_2x2_ssse3(src_ptr, src_stride,
2033                                                           coeffs_128, s_16);
2034             const __m128i r = sr_y_round_sse2(res);
2035             pack_store_2x2_sse2(r, dst, dst_stride);
2036             src_ptr += 2 * src_stride;
2037             dst += 2 * dst_stride;
2038             y -= 2;
2039           } while (y);
2040         } else if (w == 4) {
2041           __m128i s_32[2];
2042 
2043           s_32[0] = _mm_cvtsi32_si128(loadu_int32(src_ptr));
2044 
2045           do {
2046             const __m128i res = y_convolve_2tap_4x2_ssse3(src_ptr, src_stride,
2047                                                           coeffs_128, s_32);
2048             const __m128i r = sr_y_round_sse2(res);
2049             pack_store_4x2_sse2(r, dst, dst_stride);
2050             src_ptr += 2 * src_stride;
2051             dst += 2 * dst_stride;
2052             y -= 2;
2053           } while (y);
2054         } else {
2055           __m128i s_64[2], s_128[2];
2056 
2057           assert(w == 8);
2058 
2059           s_64[0] = _mm_loadl_epi64((__m128i *)src_ptr);
2060 
2061           do {
2062             // Note: Faster than binding to AVX2 registers.
2063             s_64[1] = _mm_loadl_epi64((__m128i *)(src_ptr + src_stride));
2064             s_128[0] = _mm_unpacklo_epi64(s_64[0], s_64[1]);
2065             s_64[0] = _mm_loadl_epi64((__m128i *)(src_ptr + 2 * src_stride));
2066             s_128[1] = _mm_unpacklo_epi64(s_64[1], s_64[0]);
2067             const __m128i ss0 = _mm_unpacklo_epi8(s_128[0], s_128[1]);
2068             const __m128i ss1 = _mm_unpackhi_epi8(s_128[0], s_128[1]);
2069             const __m128i res0 = convolve_2tap_ssse3(&ss0, coeffs_128);
2070             const __m128i res1 = convolve_2tap_ssse3(&ss1, coeffs_128);
2071             const __m128i r0 = sr_y_round_sse2(res0);
2072             const __m128i r1 = sr_y_round_sse2(res1);
2073             const __m128i d = _mm_packus_epi16(r0, r1);
2074             _mm_storel_epi64((__m128i *)dst, d);
2075             _mm_storeh_epi64((__m128i *)(dst + dst_stride), d);
2076             src_ptr += 2 * src_stride;
2077             dst += 2 * dst_stride;
2078             y -= 2;
2079           } while (y);
2080         }
2081       } else {
2082         prepare_half_coeffs_2tap_avx2(filter_params_y, subpel_y_q4, coeffs_256);
2083 
2084         if (w == 16) {
2085           __m128i s_128[2];
2086 
2087           s_128[0] = _mm_loadu_si128((__m128i *)src_ptr);
2088 
2089           do {
2090             __m256i r[2];
2091 
2092             y_convolve_2tap_16x2_avx2(src_ptr, src_stride, coeffs_256, s_128,
2093                                       r);
2094             sr_y_round_store_16x2_avx2(r, dst, dst_stride);
2095             src_ptr += 2 * src_stride;
2096             dst += 2 * dst_stride;
2097             y -= 2;
2098           } while (y);
2099         } else if (w == 32) {
2100           __m256i s_256[2];
2101 
2102           s_256[0] = _mm256_loadu_si256((__m256i *)src_ptr);
2103 
2104           do {
2105             sr_y_2tap_32_avx2(src_ptr + src_stride, coeffs_256, s_256[0],
2106                               &s_256[1], dst);
2107             sr_y_2tap_32_avx2(src_ptr + 2 * src_stride, coeffs_256, s_256[1],
2108                               &s_256[0], dst + dst_stride);
2109             src_ptr += 2 * src_stride;
2110             dst += 2 * dst_stride;
2111             y -= 2;
2112           } while (y);
2113         } else if (w == 64) {
2114           __m256i s_256[2][2];
2115 
2116           s_256[0][0] = _mm256_loadu_si256((__m256i *)(src_ptr + 0 * 32));
2117           s_256[0][1] = _mm256_loadu_si256((__m256i *)(src_ptr + 1 * 32));
2118 
2119           do {
2120             sr_y_2tap_32_avx2(src_ptr + src_stride, coeffs_256, s_256[0][0],
2121                               &s_256[1][0], dst);
2122             sr_y_2tap_32_avx2(src_ptr + src_stride + 32, coeffs_256,
2123                               s_256[0][1], &s_256[1][1], dst + 32);
2124             sr_y_2tap_32_avx2(src_ptr + 2 * src_stride, coeffs_256, s_256[1][0],
2125                               &s_256[0][0], dst + dst_stride);
2126             sr_y_2tap_32_avx2(src_ptr + 2 * src_stride + 32, coeffs_256,
2127                               s_256[1][1], &s_256[0][1], dst + dst_stride + 32);
2128 
2129             src_ptr += 2 * src_stride;
2130             dst += 2 * dst_stride;
2131             y -= 2;
2132           } while (y);
2133         } else {
2134           __m256i s_256[2][4];
2135 
2136           assert(w == 128);
2137 
2138           s_256[0][0] = _mm256_loadu_si256((__m256i *)(src_ptr + 0 * 32));
2139           s_256[0][1] = _mm256_loadu_si256((__m256i *)(src_ptr + 1 * 32));
2140           s_256[0][2] = _mm256_loadu_si256((__m256i *)(src_ptr + 2 * 32));
2141           s_256[0][3] = _mm256_loadu_si256((__m256i *)(src_ptr + 3 * 32));
2142 
2143           do {
2144             sr_y_2tap_32_avx2(src_ptr + src_stride, coeffs_256, s_256[0][0],
2145                               &s_256[1][0], dst);
2146             sr_y_2tap_32_avx2(src_ptr + src_stride + 1 * 32, coeffs_256,
2147                               s_256[0][1], &s_256[1][1], dst + 1 * 32);
2148             sr_y_2tap_32_avx2(src_ptr + src_stride + 2 * 32, coeffs_256,
2149                               s_256[0][2], &s_256[1][2], dst + 2 * 32);
2150             sr_y_2tap_32_avx2(src_ptr + src_stride + 3 * 32, coeffs_256,
2151                               s_256[0][3], &s_256[1][3], dst + 3 * 32);
2152 
2153             sr_y_2tap_32_avx2(src_ptr + 2 * src_stride, coeffs_256, s_256[1][0],
2154                               &s_256[0][0], dst + dst_stride);
2155             sr_y_2tap_32_avx2(src_ptr + 2 * src_stride + 1 * 32, coeffs_256,
2156                               s_256[1][1], &s_256[0][1],
2157                               dst + dst_stride + 1 * 32);
2158             sr_y_2tap_32_avx2(src_ptr + 2 * src_stride + 2 * 32, coeffs_256,
2159                               s_256[1][2], &s_256[0][2],
2160                               dst + dst_stride + 2 * 32);
2161             sr_y_2tap_32_avx2(src_ptr + 2 * src_stride + 3 * 32, coeffs_256,
2162                               s_256[1][3], &s_256[0][3],
2163                               dst + dst_stride + 3 * 32);
2164 
2165             src_ptr += 2 * src_stride;
2166             dst += 2 * dst_stride;
2167             y -= 2;
2168           } while (y);
2169         }
2170       }
2171     } else {
2172       // average to get half pel
2173       if (w <= 8) {
2174         if (w == 2) {
2175           __m128i s_16[2];
2176 
2177           s_16[0] = _mm_cvtsi32_si128(*(int16_t *)src_ptr);
2178 
2179           do {
2180             s_16[1] = _mm_cvtsi32_si128(*(int16_t *)(src_ptr + src_stride));
2181             const __m128i d0 = _mm_avg_epu8(s_16[0], s_16[1]);
2182             *(int16_t *)dst = (int16_t)_mm_cvtsi128_si32(d0);
2183             s_16[0] = _mm_cvtsi32_si128(*(int16_t *)(src_ptr + 2 * src_stride));
2184             const __m128i d1 = _mm_avg_epu8(s_16[1], s_16[0]);
2185             *(int16_t *)(dst + dst_stride) = (int16_t)_mm_cvtsi128_si32(d1);
2186             src_ptr += 2 * src_stride;
2187             dst += 2 * dst_stride;
2188             y -= 2;
2189           } while (y);
2190         } else if (w == 4) {
2191           __m128i s_32[2];
2192 
2193           s_32[0] = _mm_cvtsi32_si128(loadu_int32(src_ptr));
2194 
2195           do {
2196             s_32[1] = _mm_cvtsi32_si128(loadu_int32(src_ptr + src_stride));
2197             const __m128i d0 = _mm_avg_epu8(s_32[0], s_32[1]);
2198             xx_storel_32(dst, d0);
2199             s_32[0] = _mm_cvtsi32_si128(loadu_int32(src_ptr + 2 * src_stride));
2200             const __m128i d1 = _mm_avg_epu8(s_32[1], s_32[0]);
2201             xx_storel_32(dst + dst_stride, d1);
2202             src_ptr += 2 * src_stride;
2203             dst += 2 * dst_stride;
2204             y -= 2;
2205           } while (y);
2206         } else {
2207           __m128i s_64[2];
2208 
2209           assert(w == 8);
2210 
2211           s_64[0] = _mm_loadl_epi64((__m128i *)src_ptr);
2212 
2213           do {
2214             // Note: Faster to keep these rows in 128-bit registers than in AVX2 registers.
2215             s_64[1] = _mm_loadl_epi64((__m128i *)(src_ptr + src_stride));
2216             const __m128i d0 = _mm_avg_epu8(s_64[0], s_64[1]);
2217             _mm_storel_epi64((__m128i *)dst, d0);
2218             s_64[0] = _mm_loadl_epi64((__m128i *)(src_ptr + 2 * src_stride));
2219             const __m128i d1 = _mm_avg_epu8(s_64[1], s_64[0]);
2220             _mm_storel_epi64((__m128i *)(dst + dst_stride), d1);
2221             src_ptr += 2 * src_stride;
2222             dst += 2 * dst_stride;
2223             y -= 2;
2224           } while (y);
2225         }
2226       } else if (w == 16) {
2227         __m128i s_128[2];
2228 
2229         s_128[0] = _mm_loadu_si128((__m128i *)src_ptr);
2230 
2231         do {
2232           s_128[1] = _mm_loadu_si128((__m128i *)(src_ptr + src_stride));
2233           const __m128i d0 = _mm_avg_epu8(s_128[0], s_128[1]);
2234           _mm_storeu_si128((__m128i *)dst, d0);
2235           s_128[0] = _mm_loadu_si128((__m128i *)(src_ptr + 2 * src_stride));
2236           const __m128i d1 = _mm_avg_epu8(s_128[1], s_128[0]);
2237           _mm_storeu_si128((__m128i *)(dst + dst_stride), d1);
2238           src_ptr += 2 * src_stride;
2239           dst += 2 * dst_stride;
2240           y -= 2;
2241         } while (y);
2242       } else if (w == 32) {
2243         __m256i s_256[2];
2244 
2245         s_256[0] = _mm256_loadu_si256((__m256i *)src_ptr);
2246 
2247         do {
2248           sr_y_2tap_32_avg_avx2(src_ptr + src_stride, s_256[0], &s_256[1], dst);
2249           sr_y_2tap_32_avg_avx2(src_ptr + 2 * src_stride, s_256[1], &s_256[0],
2250                                 dst + dst_stride);
2251           src_ptr += 2 * src_stride;
2252           dst += 2 * dst_stride;
2253           y -= 2;
2254         } while (y);
2255       } else if (w == 64) {
2256         __m256i s_256[2][2];
2257 
2258         s_256[0][0] = _mm256_loadu_si256((__m256i *)(src_ptr + 0 * 32));
2259         s_256[0][1] = _mm256_loadu_si256((__m256i *)(src_ptr + 1 * 32));
2260 
2261         do {
2262           sr_y_2tap_32_avg_avx2(src_ptr + src_stride, s_256[0][0], &s_256[1][0],
2263                                 dst);
2264           sr_y_2tap_32_avg_avx2(src_ptr + src_stride + 32, s_256[0][1],
2265                                 &s_256[1][1], dst + 32);
2266 
2267           sr_y_2tap_32_avg_avx2(src_ptr + 2 * src_stride, s_256[1][0],
2268                                 &s_256[0][0], dst + dst_stride);
2269           sr_y_2tap_32_avg_avx2(src_ptr + 2 * src_stride + 32, s_256[1][1],
2270                                 &s_256[0][1], dst + dst_stride + 32);
2271 
2272           src_ptr += 2 * src_stride;
2273           dst += 2 * dst_stride;
2274           y -= 2;
2275         } while (y);
2276       } else {
2277         __m256i s_256[2][4];
2278 
2279         assert(w == 128);
2280 
2281         s_256[0][0] = _mm256_loadu_si256((__m256i *)(src_ptr + 0 * 32));
2282         s_256[0][1] = _mm256_loadu_si256((__m256i *)(src_ptr + 1 * 32));
2283         s_256[0][2] = _mm256_loadu_si256((__m256i *)(src_ptr + 2 * 32));
2284         s_256[0][3] = _mm256_loadu_si256((__m256i *)(src_ptr + 3 * 32));
2285 
2286         do {
2287           sr_y_2tap_32_avg_avx2(src_ptr + src_stride, s_256[0][0], &s_256[1][0],
2288                                 dst);
2289           sr_y_2tap_32_avg_avx2(src_ptr + src_stride + 1 * 32, s_256[0][1],
2290                                 &s_256[1][1], dst + 1 * 32);
2291           sr_y_2tap_32_avg_avx2(src_ptr + src_stride + 2 * 32, s_256[0][2],
2292                                 &s_256[1][2], dst + 2 * 32);
2293           sr_y_2tap_32_avg_avx2(src_ptr + src_stride + 3 * 32, s_256[0][3],
2294                                 &s_256[1][3], dst + 3 * 32);
2295 
2296           sr_y_2tap_32_avg_avx2(src_ptr + 2 * src_stride, s_256[1][0],
2297                                 &s_256[0][0], dst + dst_stride);
2298           sr_y_2tap_32_avg_avx2(src_ptr + 2 * src_stride + 1 * 32, s_256[1][1],
2299                                 &s_256[0][1], dst + dst_stride + 1 * 32);
2300           sr_y_2tap_32_avg_avx2(src_ptr + 2 * src_stride + 2 * 32, s_256[1][2],
2301                                 &s_256[0][2], dst + dst_stride + 2 * 32);
2302           sr_y_2tap_32_avg_avx2(src_ptr + 2 * src_stride + 3 * 32, s_256[1][3],
2303                                 &s_256[0][3], dst + dst_stride + 3 * 32);
2304 
2305           src_ptr += 2 * src_stride;
2306           dst += 2 * dst_stride;
2307           y -= 2;
2308         } while (y);
2309       }
2310     }
2311   } else if (vert_tap == 4) {
2312     // vert_filt as 4 tap
2313     const uint8_t *src_ptr = src - src_stride;
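    // The 4-tap vertical filter reads one source row above the destination
    // row, hence the -src_stride offset.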
2314 
2315     y = h;
2316 
2317     if (w <= 4) {
2318       prepare_half_coeffs_4tap_ssse3(filter_params_y, subpel_y_q4, coeffs_128);
2319 
2320       if (w == 2) {
2321         __m128i s_16[4], ss_128[2];
2322 
2323         s_16[0] = _mm_cvtsi32_si128(loadu_int16(src_ptr + 0 * src_stride));
2324         s_16[1] = _mm_cvtsi32_si128(loadu_int16(src_ptr + 1 * src_stride));
2325         s_16[2] = _mm_cvtsi32_si128(loadu_int16(src_ptr + 2 * src_stride));
2326 
2327         const __m128i src01 = _mm_unpacklo_epi16(s_16[0], s_16[1]);
2328         const __m128i src12 = _mm_unpacklo_epi16(s_16[1], s_16[2]);
2329 
2330         ss_128[0] = _mm_unpacklo_epi8(src01, src12);
2331 
2332         do {
2333           src_ptr += 2 * src_stride;
2334           const __m128i res = y_convolve_4tap_2x2_ssse3(
2335               src_ptr, src_stride, coeffs_128, s_16, ss_128);
2336           const __m128i r = sr_y_round_sse2(res);
2337           pack_store_2x2_sse2(r, dst, dst_stride);
2338 
2339           ss_128[0] = ss_128[1];
2340           dst += 2 * dst_stride;
2341           y -= 2;
2342         } while (y);
2343       } else {
2344         __m128i s_32[4], ss_128[2];
2345 
2346         assert(w == 4);
2347 
2348         s_32[0] = _mm_cvtsi32_si128(loadu_int32(src_ptr + 0 * src_stride));
2349         s_32[1] = _mm_cvtsi32_si128(loadu_int32(src_ptr + 1 * src_stride));
2350         s_32[2] = _mm_cvtsi32_si128(loadu_int32(src_ptr + 2 * src_stride));
2351 
2352         const __m128i src01 = _mm_unpacklo_epi32(s_32[0], s_32[1]);
2353         const __m128i src12 = _mm_unpacklo_epi32(s_32[1], s_32[2]);
2354 
2355         ss_128[0] = _mm_unpacklo_epi8(src01, src12);
2356 
2357         do {
2358           src_ptr += 2 * src_stride;
2359           const __m128i res = y_convolve_4tap_4x2_ssse3(
2360               src_ptr, src_stride, coeffs_128, s_32, ss_128);
2361           const __m128i r = sr_y_round_sse2(res);
2362           pack_store_4x2_sse2(r, dst, dst_stride);
2363 
2364           ss_128[0] = ss_128[1];
2365           dst += 2 * dst_stride;
2366           y -= 2;
2367         } while (y);
2368       }
2369     } else {
2370       prepare_half_coeffs_4tap_avx2(filter_params_y, subpel_y_q4, coeffs_256);
2371 
2372       if (w == 8) {
2373         __m128i s_64[4];
2374         __m256i ss_256[2];
2375 
2376         s_64[0] = _mm_loadl_epi64((__m128i *)(src_ptr + 0 * src_stride));
2377         s_64[1] = _mm_loadl_epi64((__m128i *)(src_ptr + 1 * src_stride));
2378         s_64[2] = _mm_loadl_epi64((__m128i *)(src_ptr + 2 * src_stride));
2379 
2380         // Load lines a and b. Line a to lower 128, line b to upper 128
2381         const __m256i src01 = _mm256_setr_m128i(s_64[0], s_64[1]);
2382         const __m256i src12 = _mm256_setr_m128i(s_64[1], s_64[2]);
2383 
2384         ss_256[0] = _mm256_unpacklo_epi8(src01, src12);
2385 
2386         do {
2387           src_ptr += 2 * src_stride;
2388           const __m256i res = y_convolve_4tap_8x2_avx2(
2389               src_ptr, src_stride, coeffs_256, s_64, ss_256);
2390           sr_y_round_store_8x2_avx2(res, dst, dst_stride);
2391 
2392           ss_256[0] = ss_256[1];
2393           dst += 2 * dst_stride;
2394           y -= 2;
2395         } while (y);
2396       } else if (w == 16) {
2397         __m128i s_128[4];
2398         __m256i ss_256[4], r[2];
2399 
2400         s_128[0] = _mm_loadu_si128((__m128i *)(src_ptr + 0 * src_stride));
2401         s_128[1] = _mm_loadu_si128((__m128i *)(src_ptr + 1 * src_stride));
2402         s_128[2] = _mm_loadu_si128((__m128i *)(src_ptr + 2 * src_stride));
2403 
2404         // Load lines a and b. Line a to lower 128, line b to upper 128
2405         const __m256i src01 = _mm256_setr_m128i(s_128[0], s_128[1]);
2406         const __m256i src12 = _mm256_setr_m128i(s_128[1], s_128[2]);
2407 
2408         ss_256[0] = _mm256_unpacklo_epi8(src01, src12);
2409         ss_256[2] = _mm256_unpackhi_epi8(src01, src12);
2410 
2411         do {
2412           src_ptr += 2 * src_stride;
2413           y_convolve_4tap_16x2_avx2(src_ptr, src_stride, coeffs_256, s_128,
2414                                     ss_256, r);
2415           sr_y_round_store_16x2_avx2(r, dst, dst_stride);
2416 
2417           ss_256[0] = ss_256[1];
2418           ss_256[2] = ss_256[3];
2419           dst += 2 * dst_stride;
2420           y -= 2;
2421         } while (y);
2422       } else if (w == 32) {
2423         // The AV1 standard never produces a 32x4 case here. This path
2424         // only serves an encoder optimization that subsamples 32x8 to
2425         // 32x4, which then selects the 4-tap filter.
2426 
2427         __m256i s_256[4], ss_256[4], tt_256[4], r[4];
2428 
2429         s_256[0] = _mm256_loadu_si256((__m256i *)(src_ptr + 0 * src_stride));
2430         s_256[1] = _mm256_loadu_si256((__m256i *)(src_ptr + 1 * src_stride));
2431         s_256[2] = _mm256_loadu_si256((__m256i *)(src_ptr + 2 * src_stride));
2432 
2433         ss_256[0] = _mm256_unpacklo_epi8(s_256[0], s_256[1]);
2434         ss_256[2] = _mm256_unpackhi_epi8(s_256[0], s_256[1]);
2435 
2436         tt_256[0] = _mm256_unpacklo_epi8(s_256[1], s_256[2]);
2437         tt_256[2] = _mm256_unpackhi_epi8(s_256[1], s_256[2]);
2438 
2439         do {
2440           src_ptr += 2 * src_stride;
2441           y_convolve_4tap_32x2_avx2(src_ptr, src_stride, coeffs_256, s_256,
2442                                     ss_256, tt_256, r);
2443           sr_y_round_store_32x2_avx2(r, dst, dst_stride);
2444 
2445           ss_256[0] = ss_256[1];
2446           ss_256[2] = ss_256[3];
2447 
2448           tt_256[0] = tt_256[1];
2449           tt_256[2] = tt_256[3];
2450           dst += 2 * dst_stride;
2451           y -= 2;
2452         } while (y);
2453       } else {
2454         assert(!(w % 32));
2455 
2456         __m256i s_256[4], ss_256[4], tt_256[4], r[4];
2457         x = 0;
2458         do {
2459           const uint8_t *s = src_ptr + x;
2460           uint8_t *d = dst + x;
2461           s_256[0] = _mm256_loadu_si256((__m256i *)(s + 0 * src_stride));
2462           s_256[1] = _mm256_loadu_si256((__m256i *)(s + 1 * src_stride));
2463           s_256[2] = _mm256_loadu_si256((__m256i *)(s + 2 * src_stride));
2464 
2465           ss_256[0] = _mm256_unpacklo_epi8(s_256[0], s_256[1]);
2466           ss_256[2] = _mm256_unpackhi_epi8(s_256[0], s_256[1]);
2467 
2468           tt_256[0] = _mm256_unpacklo_epi8(s_256[1], s_256[2]);
2469           tt_256[2] = _mm256_unpackhi_epi8(s_256[1], s_256[2]);
2470 
2471           y = h;
2472           do {
2473             s += 2 * src_stride;
2474             y_convolve_4tap_32x2_avx2(s, src_stride, coeffs_256, s_256, ss_256,
2475                                       tt_256, r);
2476             sr_y_round_store_32x2_avx2(r, d, dst_stride);
2477 
2478             ss_256[0] = ss_256[1];
2479             ss_256[2] = ss_256[3];
2480 
2481             tt_256[0] = tt_256[1];
2482             tt_256[2] = tt_256[3];
2483             d += 2 * dst_stride;
2484             y -= 2;
2485           } while (y);
2486           x += 32;
2487         } while (x < w);
2488       }
2489     }
2490   } else if (vert_tap == 6) {
2491     // vert_filt as 6 tap
2492     const uint8_t *src_ptr = src - 2 * src_stride;
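    // The 6-tap vertical filter reads two source rows above the destination
    // row.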
2493 
2494     if (w <= 4) {
2495       prepare_half_coeffs_6tap_ssse3(filter_params_y, subpel_y_q4, coeffs_128);
2496 
2497       y = h;
2498 
2499       if (w == 2) {
2500         __m128i s_16[6], ss_128[3];
2501 
2502         s_16[0] = _mm_cvtsi32_si128(loadu_int16(src_ptr + 0 * src_stride));
2503         s_16[1] = _mm_cvtsi32_si128(loadu_int16(src_ptr + 1 * src_stride));
2504         s_16[2] = _mm_cvtsi32_si128(loadu_int16(src_ptr + 2 * src_stride));
2505         s_16[3] = _mm_cvtsi32_si128(loadu_int16(src_ptr + 3 * src_stride));
2506         s_16[4] = _mm_cvtsi32_si128(loadu_int16(src_ptr + 4 * src_stride));
2507 
2508         const __m128i src01 = _mm_unpacklo_epi16(s_16[0], s_16[1]);
2509         const __m128i src12 = _mm_unpacklo_epi16(s_16[1], s_16[2]);
2510         const __m128i src23 = _mm_unpacklo_epi16(s_16[2], s_16[3]);
2511         const __m128i src34 = _mm_unpacklo_epi16(s_16[3], s_16[4]);
2512 
2513         ss_128[0] = _mm_unpacklo_epi8(src01, src12);
2514         ss_128[1] = _mm_unpacklo_epi8(src23, src34);
2515 
2516         do {
2517           src_ptr += 2 * src_stride;
2518           const __m128i res = y_convolve_6tap_2x2_ssse3(
2519               src_ptr, src_stride, coeffs_128, s_16, ss_128);
2520           const __m128i r = sr_y_round_sse2(res);
2521           pack_store_2x2_sse2(r, dst, dst_stride);
2522 
2523           ss_128[0] = ss_128[1];
2524           ss_128[1] = ss_128[2];
2525           dst += 2 * dst_stride;
2526           y -= 2;
2527         } while (y);
2528       } else {
2529         __m128i s_32[6], ss_128[3];
2530 
2531         assert(w == 4);
2532 
2533         s_32[0] = _mm_cvtsi32_si128(loadu_int32(src_ptr + 0 * src_stride));
2534         s_32[1] = _mm_cvtsi32_si128(loadu_int32(src_ptr + 1 * src_stride));
2535         s_32[2] = _mm_cvtsi32_si128(loadu_int32(src_ptr + 2 * src_stride));
2536         s_32[3] = _mm_cvtsi32_si128(loadu_int32(src_ptr + 3 * src_stride));
2537         s_32[4] = _mm_cvtsi32_si128(loadu_int32(src_ptr + 4 * src_stride));
2538 
2539         const __m128i src01 = _mm_unpacklo_epi32(s_32[0], s_32[1]);
2540         const __m128i src12 = _mm_unpacklo_epi32(s_32[1], s_32[2]);
2541         const __m128i src23 = _mm_unpacklo_epi32(s_32[2], s_32[3]);
2542         const __m128i src34 = _mm_unpacklo_epi32(s_32[3], s_32[4]);
2543 
2544         ss_128[0] = _mm_unpacklo_epi8(src01, src12);
2545         ss_128[1] = _mm_unpacklo_epi8(src23, src34);
2546 
2547         do {
2548           src_ptr += 2 * src_stride;
2549           const __m128i res = y_convolve_6tap_4x2_ssse3(
2550               src_ptr, src_stride, coeffs_128, s_32, ss_128);
2551           const __m128i r = sr_y_round_sse2(res);
2552           pack_store_4x2_sse2(r, dst, dst_stride);
2553 
2554           ss_128[0] = ss_128[1];
2555           ss_128[1] = ss_128[2];
2556           dst += 2 * dst_stride;
2557           y -= 2;
2558         } while (y);
2559       }
2560     } else {
2561       prepare_half_coeffs_6tap_avx2(filter_params_y, subpel_y_q4, coeffs_256);
2562 
2563       if (w == 8) {
2564         __m128i s_64[6];
2565         __m256i ss_256[3];
2566 
2567         s_64[0] = _mm_loadl_epi64((__m128i *)(src_ptr + 0 * src_stride));
2568         s_64[1] = _mm_loadl_epi64((__m128i *)(src_ptr + 1 * src_stride));
2569         s_64[2] = _mm_loadl_epi64((__m128i *)(src_ptr + 2 * src_stride));
2570         s_64[3] = _mm_loadl_epi64((__m128i *)(src_ptr + 3 * src_stride));
2571         s_64[4] = _mm_loadl_epi64((__m128i *)(src_ptr + 4 * src_stride));
2572 
2573         // Load lines a and b. Line a to lower 128, line b to upper 128
2574         const __m256i src01 = _mm256_setr_m128i(s_64[0], s_64[1]);
2575         const __m256i src12 = _mm256_setr_m128i(s_64[1], s_64[2]);
2576         const __m256i src23 = _mm256_setr_m128i(s_64[2], s_64[3]);
2577         const __m256i src34 = _mm256_setr_m128i(s_64[3], s_64[4]);
2578 
2579         ss_256[0] = _mm256_unpacklo_epi8(src01, src12);
2580         ss_256[1] = _mm256_unpacklo_epi8(src23, src34);
2581 
2582         y = h;
2583         do {
2584           src_ptr += 2 * src_stride;
2585           const __m256i res = y_convolve_6tap_8x2_avx2(
2586               src_ptr, src_stride, coeffs_256, s_64, ss_256);
2587           sr_y_round_store_8x2_avx2(res, dst, dst_stride);
2588 
2589           ss_256[0] = ss_256[1];
2590           ss_256[1] = ss_256[2];
2591           dst += 2 * dst_stride;
2592           y -= 2;
2593         } while (y);
2594       } else if (w == 16) {
2595         __m128i s_128[6];
2596         __m256i ss_256[6], r[2];
2597 
2598         s_128[0] = _mm_loadu_si128((__m128i *)(src_ptr + 0 * src_stride));
2599         s_128[1] = _mm_loadu_si128((__m128i *)(src_ptr + 1 * src_stride));
2600         s_128[2] = _mm_loadu_si128((__m128i *)(src_ptr + 2 * src_stride));
2601         s_128[3] = _mm_loadu_si128((__m128i *)(src_ptr + 3 * src_stride));
2602         s_128[4] = _mm_loadu_si128((__m128i *)(src_ptr + 4 * src_stride));
2603 
2604         // Load lines a and b. Line a to lower 128, line b to upper 128
2605         const __m256i src01 = _mm256_setr_m128i(s_128[0], s_128[1]);
2606         const __m256i src12 = _mm256_setr_m128i(s_128[1], s_128[2]);
2607         const __m256i src23 = _mm256_setr_m128i(s_128[2], s_128[3]);
2608         const __m256i src34 = _mm256_setr_m128i(s_128[3], s_128[4]);
2609 
2610         ss_256[0] = _mm256_unpacklo_epi8(src01, src12);
2611         ss_256[1] = _mm256_unpacklo_epi8(src23, src34);
2612 
2613         ss_256[3] = _mm256_unpackhi_epi8(src01, src12);
2614         ss_256[4] = _mm256_unpackhi_epi8(src23, src34);
2615 
2616         y = h;
2617         do {
2618           src_ptr += 2 * src_stride;
2619           y_convolve_6tap_16x2_avx2(src_ptr, src_stride, coeffs_256, s_128,
2620                                     ss_256, r);
2621           sr_y_round_store_16x2_avx2(r, dst, dst_stride);
2622 
2623           ss_256[0] = ss_256[1];
2624           ss_256[1] = ss_256[2];
2625 
2626           ss_256[3] = ss_256[4];
2627           ss_256[4] = ss_256[5];
2628           dst += 2 * dst_stride;
2629           y -= 2;
2630         } while (y);
2631       } else {
2632         __m256i s_256[6], ss_256[6], tt_256[6], r[4];
2633 
2634         assert(!(w % 32));
2635 
2636         x = 0;
2637         do {
2638           const uint8_t *s = src_ptr + x;
2639           uint8_t *d = dst + x;
2640 
2641           s_256[0] = _mm256_loadu_si256((__m256i *)(s + 0 * src_stride));
2642           s_256[1] = _mm256_loadu_si256((__m256i *)(s + 1 * src_stride));
2643           s_256[2] = _mm256_loadu_si256((__m256i *)(s + 2 * src_stride));
2644           s_256[3] = _mm256_loadu_si256((__m256i *)(s + 3 * src_stride));
2645           s_256[4] = _mm256_loadu_si256((__m256i *)(s + 4 * src_stride));
2646 
2647           ss_256[0] = _mm256_unpacklo_epi8(s_256[0], s_256[1]);
2648           ss_256[1] = _mm256_unpacklo_epi8(s_256[2], s_256[3]);
2649           ss_256[3] = _mm256_unpackhi_epi8(s_256[0], s_256[1]);
2650           ss_256[4] = _mm256_unpackhi_epi8(s_256[2], s_256[3]);
2651 
2652           tt_256[0] = _mm256_unpacklo_epi8(s_256[1], s_256[2]);
2653           tt_256[1] = _mm256_unpacklo_epi8(s_256[3], s_256[4]);
2654           tt_256[3] = _mm256_unpackhi_epi8(s_256[1], s_256[2]);
2655           tt_256[4] = _mm256_unpackhi_epi8(s_256[3], s_256[4]);
2656 
2657           y = h;
2658           do {
2659             s += 2 * src_stride;
2660             y_convolve_6tap_32x2_avx2(s, src_stride, coeffs_256, s_256, ss_256,
2661                                       tt_256, r);
2662             sr_y_round_store_32x2_avx2(r, d, dst_stride);
2663 
2664             ss_256[0] = ss_256[1];
2665             ss_256[1] = ss_256[2];
2666             ss_256[3] = ss_256[4];
2667             ss_256[4] = ss_256[5];
2668 
2669             tt_256[0] = tt_256[1];
2670             tt_256[1] = tt_256[2];
2671             tt_256[3] = tt_256[4];
2672             tt_256[4] = tt_256[5];
2673             d += 2 * dst_stride;
2674             y -= 2;
2675           } while (y);
2676 
2677           x += 32;
2678         } while (x < w);
2679       }
2680     }
2681   } else if (vert_tap == 8) {
2682     // vert_filt as 8 tap
2683     const uint8_t *src_ptr = src - 3 * src_stride;
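    // The 8-tap vertical filter reads three source rows above the destination
    // row.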
2684 
2685     if (w <= 4) {
2686       prepare_half_coeffs_8tap_ssse3(filter_params_y, subpel_y_q4, coeffs_128);
2687 
2688       y = h;
2689 
2690       if (w == 2) {
2691         __m128i s_16[8], ss_128[4];
2692 
2693         s_16[0] = _mm_cvtsi32_si128(loadu_int16(src_ptr + 0 * src_stride));
2694         s_16[1] = _mm_cvtsi32_si128(loadu_int16(src_ptr + 1 * src_stride));
2695         s_16[2] = _mm_cvtsi32_si128(loadu_int16(src_ptr + 2 * src_stride));
2696         s_16[3] = _mm_cvtsi32_si128(loadu_int16(src_ptr + 3 * src_stride));
2697         s_16[4] = _mm_cvtsi32_si128(loadu_int16(src_ptr + 4 * src_stride));
2698         s_16[5] = _mm_cvtsi32_si128(loadu_int16(src_ptr + 5 * src_stride));
2699         s_16[6] = _mm_cvtsi32_si128(loadu_int16(src_ptr + 6 * src_stride));
2700 
2701         const __m128i src01 = _mm_unpacklo_epi16(s_16[0], s_16[1]);
2702         const __m128i src12 = _mm_unpacklo_epi16(s_16[1], s_16[2]);
2703         const __m128i src23 = _mm_unpacklo_epi16(s_16[2], s_16[3]);
2704         const __m128i src34 = _mm_unpacklo_epi16(s_16[3], s_16[4]);
2705         const __m128i src45 = _mm_unpacklo_epi16(s_16[4], s_16[5]);
2706         const __m128i src56 = _mm_unpacklo_epi16(s_16[5], s_16[6]);
2707 
2708         ss_128[0] = _mm_unpacklo_epi8(src01, src12);
2709         ss_128[1] = _mm_unpacklo_epi8(src23, src34);
2710         ss_128[2] = _mm_unpacklo_epi8(src45, src56);
2711 
2712         do {
2713           const __m128i res = y_convolve_8tap_2x2_ssse3(
2714               src_ptr, src_stride, coeffs_128, s_16, ss_128);
2715           const __m128i r = sr_y_round_sse2(res);
2716           pack_store_2x2_sse2(r, dst, dst_stride);
2717           ss_128[0] = ss_128[1];
2718           ss_128[1] = ss_128[2];
2719           ss_128[2] = ss_128[3];
2720           src_ptr += 2 * src_stride;
2721           dst += 2 * dst_stride;
2722           y -= 2;
2723         } while (y);
2724       } else {
2725         __m128i s_32[8], ss_128[4];
2726 
2727         assert(w == 4);
2728 
2729         s_32[0] = _mm_cvtsi32_si128(loadu_int32(src_ptr + 0 * src_stride));
2730         s_32[1] = _mm_cvtsi32_si128(loadu_int32(src_ptr + 1 * src_stride));
2731         s_32[2] = _mm_cvtsi32_si128(loadu_int32(src_ptr + 2 * src_stride));
2732         s_32[3] = _mm_cvtsi32_si128(loadu_int32(src_ptr + 3 * src_stride));
2733         s_32[4] = _mm_cvtsi32_si128(loadu_int32(src_ptr + 4 * src_stride));
2734         s_32[5] = _mm_cvtsi32_si128(loadu_int32(src_ptr + 5 * src_stride));
2735         s_32[6] = _mm_cvtsi32_si128(loadu_int32(src_ptr + 6 * src_stride));
2736 
2737         const __m128i src01 = _mm_unpacklo_epi32(s_32[0], s_32[1]);
2738         const __m128i src12 = _mm_unpacklo_epi32(s_32[1], s_32[2]);
2739         const __m128i src23 = _mm_unpacklo_epi32(s_32[2], s_32[3]);
2740         const __m128i src34 = _mm_unpacklo_epi32(s_32[3], s_32[4]);
2741         const __m128i src45 = _mm_unpacklo_epi32(s_32[4], s_32[5]);
2742         const __m128i src56 = _mm_unpacklo_epi32(s_32[5], s_32[6]);
2743 
2744         ss_128[0] = _mm_unpacklo_epi8(src01, src12);
2745         ss_128[1] = _mm_unpacklo_epi8(src23, src34);
2746         ss_128[2] = _mm_unpacklo_epi8(src45, src56);
2747 
2748         do {
2749           const __m128i res = y_convolve_8tap_4x2_ssse3(
2750               src_ptr, src_stride, coeffs_128, s_32, ss_128);
2751           const __m128i r = sr_y_round_sse2(res);
2752           pack_store_4x2_sse2(r, dst, dst_stride);
2753           ss_128[0] = ss_128[1];
2754           ss_128[1] = ss_128[2];
2755           ss_128[2] = ss_128[3];
2756           src_ptr += 2 * src_stride;
2757           dst += 2 * dst_stride;
2758           y -= 2;
2759         } while (y);
2760       }
2761     } else {
2762       prepare_half_coeffs_8tap_avx2(filter_params_y, subpel_y_q4, coeffs_256);
2763 
2764       if (w == 8) {
2765         __m128i s_64[8];
2766         __m256i ss_256[4];
2767 
2768         s_64[0] = _mm_loadl_epi64((__m128i *)(src_ptr + 0 * src_stride));
2769         s_64[1] = _mm_loadl_epi64((__m128i *)(src_ptr + 1 * src_stride));
2770         s_64[2] = _mm_loadl_epi64((__m128i *)(src_ptr + 2 * src_stride));
2771         s_64[3] = _mm_loadl_epi64((__m128i *)(src_ptr + 3 * src_stride));
2772         s_64[4] = _mm_loadl_epi64((__m128i *)(src_ptr + 4 * src_stride));
2773         s_64[5] = _mm_loadl_epi64((__m128i *)(src_ptr + 5 * src_stride));
2774         s_64[6] = _mm_loadl_epi64((__m128i *)(src_ptr + 6 * src_stride));
2775 
2776         // Load lines a and b. Line a to lower 128, line b to upper 128
2777         const __m256i src01 = _mm256_setr_m128i(s_64[0], s_64[1]);
2778         const __m256i src12 = _mm256_setr_m128i(s_64[1], s_64[2]);
2779         const __m256i src23 = _mm256_setr_m128i(s_64[2], s_64[3]);
2780         const __m256i src34 = _mm256_setr_m128i(s_64[3], s_64[4]);
2781         const __m256i src45 = _mm256_setr_m128i(s_64[4], s_64[5]);
2782         const __m256i src56 = _mm256_setr_m128i(s_64[5], s_64[6]);
2783 
2784         ss_256[0] = _mm256_unpacklo_epi8(src01, src12);
2785         ss_256[1] = _mm256_unpacklo_epi8(src23, src34);
2786         ss_256[2] = _mm256_unpacklo_epi8(src45, src56);
2787 
2788         y = h;
2789         do {
2790           const __m256i res = y_convolve_8tap_8x2_avx2(
2791               src_ptr, src_stride, coeffs_256, s_64, ss_256);
2792           sr_y_round_store_8x2_avx2(res, dst, dst_stride);
2793           ss_256[0] = ss_256[1];
2794           ss_256[1] = ss_256[2];
2795           ss_256[2] = ss_256[3];
2796           src_ptr += 2 * src_stride;
2797           dst += 2 * dst_stride;
2798           y -= 2;
2799         } while (y);
2800       } else if (w == 16) {
2801         __m128i s_128[8];
2802         __m256i ss_256[8], r[2];
2803 
2804         s_128[0] = _mm_loadu_si128((__m128i *)(src_ptr + 0 * src_stride));
2805         s_128[1] = _mm_loadu_si128((__m128i *)(src_ptr + 1 * src_stride));
2806         s_128[2] = _mm_loadu_si128((__m128i *)(src_ptr + 2 * src_stride));
2807         s_128[3] = _mm_loadu_si128((__m128i *)(src_ptr + 3 * src_stride));
2808         s_128[4] = _mm_loadu_si128((__m128i *)(src_ptr + 4 * src_stride));
2809         s_128[5] = _mm_loadu_si128((__m128i *)(src_ptr + 5 * src_stride));
2810         s_128[6] = _mm_loadu_si128((__m128i *)(src_ptr + 6 * src_stride));
2811 
2812         // Load lines a and b. Line a to lower 128, line b to upper 128
2813         const __m256i src01 = _mm256_setr_m128i(s_128[0], s_128[1]);
2814         const __m256i src12 = _mm256_setr_m128i(s_128[1], s_128[2]);
2815         const __m256i src23 = _mm256_setr_m128i(s_128[2], s_128[3]);
2816         const __m256i src34 = _mm256_setr_m128i(s_128[3], s_128[4]);
2817         const __m256i src45 = _mm256_setr_m128i(s_128[4], s_128[5]);
2818         const __m256i src56 = _mm256_setr_m128i(s_128[5], s_128[6]);
2819 
2820         ss_256[0] = _mm256_unpacklo_epi8(src01, src12);
2821         ss_256[1] = _mm256_unpacklo_epi8(src23, src34);
2822         ss_256[2] = _mm256_unpacklo_epi8(src45, src56);
2823 
2824         ss_256[4] = _mm256_unpackhi_epi8(src01, src12);
2825         ss_256[5] = _mm256_unpackhi_epi8(src23, src34);
2826         ss_256[6] = _mm256_unpackhi_epi8(src45, src56);
2827 
2828         y = h;
2829         do {
2830           y_convolve_8tap_16x2_avx2(src_ptr, src_stride, coeffs_256, s_128,
2831                                     ss_256, r);
2832           sr_y_round_store_16x2_avx2(r, dst, dst_stride);
2833 
2834           ss_256[0] = ss_256[1];
2835           ss_256[1] = ss_256[2];
2836           ss_256[2] = ss_256[3];
2837 
2838           ss_256[4] = ss_256[5];
2839           ss_256[5] = ss_256[6];
2840           ss_256[6] = ss_256[7];
2841           src_ptr += 2 * src_stride;
2842           dst += 2 * dst_stride;
2843           y -= 2;
2844         } while (y);
2845       } else {
2846         __m256i s_256[8], ss_256[8], tt_256[8], r[4];
2847 
2848         assert(!(w % 32));
2849 
2850         x = 0;
2851         do {
2852           const uint8_t *s = src_ptr + x;
2853           uint8_t *d = dst + x;
2854 
2855           s_256[0] = _mm256_loadu_si256((__m256i *)(s + 0 * src_stride));
2856           s_256[1] = _mm256_loadu_si256((__m256i *)(s + 1 * src_stride));
2857           s_256[2] = _mm256_loadu_si256((__m256i *)(s + 2 * src_stride));
2858           s_256[3] = _mm256_loadu_si256((__m256i *)(s + 3 * src_stride));
2859           s_256[4] = _mm256_loadu_si256((__m256i *)(s + 4 * src_stride));
2860           s_256[5] = _mm256_loadu_si256((__m256i *)(s + 5 * src_stride));
2861           s_256[6] = _mm256_loadu_si256((__m256i *)(s + 6 * src_stride));
2862 
2863           ss_256[0] = _mm256_unpacklo_epi8(s_256[0], s_256[1]);
2864           ss_256[1] = _mm256_unpacklo_epi8(s_256[2], s_256[3]);
2865           ss_256[2] = _mm256_unpacklo_epi8(s_256[4], s_256[5]);
2866           ss_256[4] = _mm256_unpackhi_epi8(s_256[0], s_256[1]);
2867           ss_256[5] = _mm256_unpackhi_epi8(s_256[2], s_256[3]);
2868           ss_256[6] = _mm256_unpackhi_epi8(s_256[4], s_256[5]);
2869 
2870           tt_256[0] = _mm256_unpacklo_epi8(s_256[1], s_256[2]);
2871           tt_256[1] = _mm256_unpacklo_epi8(s_256[3], s_256[4]);
2872           tt_256[2] = _mm256_unpacklo_epi8(s_256[5], s_256[6]);
2873           tt_256[4] = _mm256_unpackhi_epi8(s_256[1], s_256[2]);
2874           tt_256[5] = _mm256_unpackhi_epi8(s_256[3], s_256[4]);
2875           tt_256[6] = _mm256_unpackhi_epi8(s_256[5], s_256[6]);
2876 
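          // ss_256 holds the interleaved source-row pairs feeding the first
          // output row of each pair, and tt_256 those feeding the second;
          // every iteration loads two new rows, emits two rows, and shifts
          // both windows down.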
2877           y = h;
2878           do {
2879             y_convolve_8tap_32x2_avx2(s, src_stride, coeffs_256, s_256, ss_256,
2880                                       tt_256, r);
2881             sr_y_round_store_32x2_avx2(r, d, dst_stride);
2882 
2883             ss_256[0] = ss_256[1];
2884             ss_256[1] = ss_256[2];
2885             ss_256[2] = ss_256[3];
2886             ss_256[4] = ss_256[5];
2887             ss_256[5] = ss_256[6];
2888             ss_256[6] = ss_256[7];
2889 
2890             tt_256[0] = tt_256[1];
2891             tt_256[1] = tt_256[2];
2892             tt_256[2] = tt_256[3];
2893             tt_256[4] = tt_256[5];
2894             tt_256[5] = tt_256[6];
2895             tt_256[6] = tt_256[7];
2896             s += 2 * src_stride;
2897             d += 2 * dst_stride;
2898             y -= 2;
2899           } while (y);
2900 
2901           x += 32;
2902         } while (x < w);
2903       }
2904     }
2905   }
2906 }
2907 
2908 static inline void sr_x_2tap_32_avx2(const uint8_t *const src,
2909                                      const __m256i coeffs[1],
2910                                      uint8_t *const dst) {
2911   __m256i r[2];
2912 
2913   x_convolve_2tap_32_avx2(src, coeffs, r);
2914   sr_x_round_store_32_avx2(r, dst);
2915 }
2916 
2917 static inline void sr_x_6tap_32_avx2(const uint8_t *const src,
2918                                      const __m256i coeffs[3],
2919                                      const __m256i filt[3],
2920                                      uint8_t *const dst) {
2921   __m256i r[2];
2922 
2923   x_convolve_6tap_32_avx2(src, coeffs, filt, r);
2924   sr_x_round_store_32_avx2(r, dst);
2925 }
2926 
2927 static AOM_FORCE_INLINE void sr_x_8tap_32_avx2(const uint8_t *const src,
2928                                                const __m256i coeffs[4],
2929                                                const __m256i filt[4],
2930                                                uint8_t *const dst) {
2931   __m256i r[2];
2932 
2933   x_convolve_8tap_32_avx2(src, coeffs, filt, r);
2934   sr_x_round_store_32_avx2(r, dst);
2935 }
2936 
2937 static AOM_FORCE_INLINE void av1_convolve_x_sr_specialized_avx2(
2938     const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride,
2939     int32_t w, int32_t h, const InterpFilterParams *filter_params_x,
2940     const int32_t subpel_x_q4, ConvolveParams *conv_params) {
2941   int32_t y = h;
2942   __m128i coeffs_128[4];
2943   __m256i coeffs_256[4];
2944 
2945   assert(conv_params->round_0 == 3);
2946   assert((FILTER_BITS - conv_params->round_1) >= 0 ||
2947          ((conv_params->round_0 + conv_params->round_1) == 2 * FILTER_BITS));
2948   (void)conv_params;
2949 
2950   const int horz_tap = get_filter_tap(filter_params_x, subpel_x_q4);
2951 
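  // Dispatch on the effective tap count of the horizontal filter for this
  // sub-pel phase (2, 4, 6 or 8).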
2952   if (horz_tap == 2) {
2953     // horz_filt as 2 tap
2954     const uint8_t *src_ptr = src;
2955 
2956     if (subpel_x_q4 != 8) {
2957       if (w <= 8) {
2958         prepare_half_coeffs_2tap_ssse3(filter_params_x, subpel_x_q4,
2959                                        coeffs_128);
2960 
2961         if (w == 2) {
2962           do {
2963             const __m128i res =
2964                 x_convolve_2tap_2x2_sse4_1(src_ptr, src_stride, coeffs_128);
2965             const __m128i r = sr_x_round_sse2(res);
2966             pack_store_2x2_sse2(r, dst, dst_stride);
2967             src_ptr += 2 * src_stride;
2968             dst += 2 * dst_stride;
2969             y -= 2;
2970           } while (y);
2971         } else if (w == 4) {
2972           do {
2973             const __m128i res =
2974                 x_convolve_2tap_4x2_ssse3(src_ptr, src_stride, coeffs_128);
2975             const __m128i r = sr_x_round_sse2(res);
2976             pack_store_4x2_sse2(r, dst, dst_stride);
2977             src_ptr += 2 * src_stride;
2978             dst += 2 * dst_stride;
2979             y -= 2;
2980           } while (y);
2981         } else {
2982           assert(w == 8);
2983 
2984           do {
2985             __m128i res[2];
2986 
2987             x_convolve_2tap_8x2_ssse3(src_ptr, src_stride, coeffs_128, res);
2988             res[0] = sr_x_round_sse2(res[0]);
2989             res[1] = sr_x_round_sse2(res[1]);
2990             const __m128i d = _mm_packus_epi16(res[0], res[1]);
2991             _mm_storel_epi64((__m128i *)dst, d);
2992             _mm_storeh_epi64((__m128i *)(dst + dst_stride), d);
2993 
2994             src_ptr += 2 * src_stride;
2995             dst += 2 * dst_stride;
2996             y -= 2;
2997           } while (y);
2998         }
2999       } else {
3000         prepare_half_coeffs_2tap_avx2(filter_params_x, subpel_x_q4, coeffs_256);
3001 
3002         if (w == 16) {
3003           do {
3004             __m256i r[2];
3005 
3006             x_convolve_2tap_16x2_avx2(src_ptr, src_stride, coeffs_256, r);
3007             sr_x_round_store_16x2_avx2(r, dst, dst_stride);
3008             src_ptr += 2 * src_stride;
3009             dst += 2 * dst_stride;
3010             y -= 2;
3011           } while (y);
3012         } else if (w == 32) {
3013           do {
3014             sr_x_2tap_32_avx2(src_ptr, coeffs_256, dst);
3015             src_ptr += src_stride;
3016             dst += dst_stride;
3017           } while (--y);
3018         } else if (w == 64) {
3019           do {
3020             sr_x_2tap_32_avx2(src_ptr + 0 * 32, coeffs_256, dst + 0 * 32);
3021             sr_x_2tap_32_avx2(src_ptr + 1 * 32, coeffs_256, dst + 1 * 32);
3022             src_ptr += src_stride;
3023             dst += dst_stride;
3024           } while (--y);
3025         } else {
3026           assert(w == 128);
3027 
3028           do {
3029             sr_x_2tap_32_avx2(src_ptr + 0 * 32, coeffs_256, dst + 0 * 32);
3030             sr_x_2tap_32_avx2(src_ptr + 1 * 32, coeffs_256, dst + 1 * 32);
3031             sr_x_2tap_32_avx2(src_ptr + 2 * 32, coeffs_256, dst + 2 * 32);
3032             sr_x_2tap_32_avx2(src_ptr + 3 * 32, coeffs_256, dst + 3 * 32);
3033             src_ptr += src_stride;
3034             dst += dst_stride;
3035           } while (--y);
3036         }
3037       }
3038     } else {
3039       // average to get half pel
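      // At the half-pel phase the 2-tap kernel is (64, 64), and
      // (64 * a + 64 * b + 64) >> 7 == (a + b + 1) >> 1, so each output pixel
      // is the byte average of two horizontally adjacent source pixels.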
3040       if (w == 2) {
3041         do {
3042           __m128i s_128;
3043 
3044           s_128 = load_u8_4x2_sse4_1(src_ptr, src_stride);
3045           const __m128i s1 = _mm_srli_si128(s_128, 1);
3046           const __m128i d = _mm_avg_epu8(s_128, s1);
3047           *(uint16_t *)dst = (uint16_t)_mm_cvtsi128_si32(d);
3048           *(uint16_t *)(dst + dst_stride) = _mm_extract_epi16(d, 2);
3049 
3050           src_ptr += 2 * src_stride;
3051           dst += 2 * dst_stride;
3052           y -= 2;
3053         } while (y);
3054       } else if (w == 4) {
3055         do {
3056           __m128i s_128;
3057 
3058           s_128 = load_u8_8x2_sse2(src_ptr, src_stride);
3059           const __m128i s1 = _mm_srli_si128(s_128, 1);
3060           const __m128i d = _mm_avg_epu8(s_128, s1);
3061           xx_storel_32(dst, d);
3062           *(int32_t *)(dst + dst_stride) = _mm_extract_epi32(d, 2);
3063 
3064           src_ptr += 2 * src_stride;
3065           dst += 2 * dst_stride;
3066           y -= 2;
3067         } while (y);
3068       } else if (w == 8) {
3069         do {
3070           const __m128i s00 = _mm_loadu_si128((__m128i *)src_ptr);
3071           const __m128i s10 =
3072               _mm_loadu_si128((__m128i *)(src_ptr + src_stride));
3073           const __m128i s01 = _mm_srli_si128(s00, 1);
3074           const __m128i s11 = _mm_srli_si128(s10, 1);
3075           const __m128i d0 = _mm_avg_epu8(s00, s01);
3076           const __m128i d1 = _mm_avg_epu8(s10, s11);
3077           _mm_storel_epi64((__m128i *)dst, d0);
3078           _mm_storel_epi64((__m128i *)(dst + dst_stride), d1);
3079 
3080           src_ptr += 2 * src_stride;
3081           dst += 2 * dst_stride;
3082           y -= 2;
3083         } while (y);
3084       } else if (w == 16) {
3085         do {
3086           const __m128i s00 = _mm_loadu_si128((__m128i *)src_ptr);
3087           const __m128i s01 = _mm_loadu_si128((__m128i *)(src_ptr + 1));
3088           const __m128i s10 =
3089               _mm_loadu_si128((__m128i *)(src_ptr + src_stride));
3090           const __m128i s11 =
3091               _mm_loadu_si128((__m128i *)(src_ptr + src_stride + 1));
3092           const __m128i d0 = _mm_avg_epu8(s00, s01);
3093           const __m128i d1 = _mm_avg_epu8(s10, s11);
3094           _mm_storeu_si128((__m128i *)dst, d0);
3095           _mm_storeu_si128((__m128i *)(dst + dst_stride), d1);
3096 
3097           src_ptr += 2 * src_stride;
3098           dst += 2 * dst_stride;
3099           y -= 2;
3100         } while (y);
3101       } else if (w == 32) {
3102         do {
3103           sr_x_2tap_32_avg_avx2(src_ptr, dst);
3104           src_ptr += src_stride;
3105           dst += dst_stride;
3106         } while (--y);
3107       } else if (w == 64) {
3108         do {
3109           sr_x_2tap_32_avg_avx2(src_ptr + 0 * 32, dst + 0 * 32);
3110           sr_x_2tap_32_avg_avx2(src_ptr + 1 * 32, dst + 1 * 32);
3111           src_ptr += src_stride;
3112           dst += dst_stride;
3113         } while (--y);
3114       } else {
3115         assert(w == 128);
3116 
3117         do {
3118           sr_x_2tap_32_avg_avx2(src_ptr + 0 * 32, dst + 0 * 32);
3119           sr_x_2tap_32_avg_avx2(src_ptr + 1 * 32, dst + 1 * 32);
3120           sr_x_2tap_32_avg_avx2(src_ptr + 2 * 32, dst + 2 * 32);
3121           sr_x_2tap_32_avg_avx2(src_ptr + 3 * 32, dst + 3 * 32);
3122           src_ptr += src_stride;
3123           dst += dst_stride;
3124         } while (--y);
3125       }
3126     }
3127   } else if (horz_tap == 4) {
3128     // horz_filt as 4 tap
3129     const uint8_t *src_ptr = src - 1;
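    // The 4-tap horizontal filter reads one source pixel to the left of the
    // destination pixel.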
3130 
3131     prepare_half_coeffs_4tap_ssse3(filter_params_x, subpel_x_q4, coeffs_128);
3132 
3133     if (w == 2) {
3134       do {
3135         const __m128i res =
3136             x_convolve_4tap_2x2_ssse3(src_ptr, src_stride, coeffs_128);
3137         const __m128i r = sr_x_round_sse2(res);
3138         pack_store_2x2_sse2(r, dst, dst_stride);
3139         src_ptr += 2 * src_stride;
3140         dst += 2 * dst_stride;
3141         y -= 2;
3142       } while (y);
3143     } else if (w == 4) {
3144       do {
3145         const __m128i res =
3146             x_convolve_4tap_4x2_ssse3(src_ptr, src_stride, coeffs_128);
3147         const __m128i r = sr_x_round_sse2(res);
3148         pack_store_4x2_sse2(r, dst, dst_stride);
3149         src_ptr += 2 * src_stride;
3150         dst += 2 * dst_stride;
3151         y -= 2;
3152       } while (y);
3153     } else if (w == 8) {
3154       // TODO([email protected]): Reuse the old SIMD code here. Need to
3155       // rewrite this for better performance later.
3156       __m256i filt_256[2];
3157       prepare_coeffs_lowbd(filter_params_x, subpel_x_q4, coeffs_256);
3158 
3159       filt_256[0] = _mm256_loadu_si256((__m256i const *)filt1_global_avx2);
3160       filt_256[1] = _mm256_loadu_si256((__m256i const *)filt2_global_avx2);
3161       for (int i = 0; i < h; i += 2) {
3162         const __m256i data = _mm256_permute2x128_si256(
3163             _mm256_castsi128_si256(
3164                 _mm_loadu_si128((__m128i *)(&src_ptr[i * src_stride]))),
3165             _mm256_castsi128_si256(_mm_loadu_si128(
3166                 (__m128i *)(&src_ptr[i * src_stride + src_stride]))),
3167             0x20);
3168 
3169         __m256i res_16b = convolve_lowbd_x_4tap(data, coeffs_256 + 1, filt_256);
3170         res_16b = sr_x_round_avx2(res_16b);
3171 
3172         __m256i res_8b = _mm256_packus_epi16(res_16b, res_16b);
3173 
3174         const __m128i res_0 = _mm256_castsi256_si128(res_8b);
3175         const __m128i res_1 = _mm256_extracti128_si256(res_8b, 1);
3176 
3177         _mm_storel_epi64((__m128i *)&dst[i * dst_stride], res_0);
3178         _mm_storel_epi64((__m128i *)&dst[i * dst_stride + dst_stride], res_1);
3179       }
3180     } else {
3181       assert(!(w % 16));
3182       // TODO([email protected]): Reuse the old SIMD code here. Need to
3183       // rewrite this for better performance later.
3184       __m256i filt_256[2];
3185       prepare_coeffs_lowbd(filter_params_x, subpel_x_q4, coeffs_256);
3186       filt_256[0] = _mm256_loadu_si256((__m256i const *)filt1_global_avx2);
3187       filt_256[1] = _mm256_loadu_si256((__m256i const *)filt2_global_avx2);
3188 
3189       for (int i = 0; i < h; ++i) {
3190         for (int j = 0; j < w; j += 16) {
3191           // After the insert, the lower 128 bits hold source bytes 0..15
3192           // and the upper 128 bits hold source bytes 8..23.
3193           const __m256i data = _mm256_inserti128_si256(
3194               _mm256_loadu_si256((__m256i *)&src_ptr[(i * src_stride) + j]),
3195               _mm_loadu_si128((__m128i *)&src_ptr[(i * src_stride) + (j + 8)]),
3196               1);
3197 
3198           __m256i res_16b =
3199               convolve_lowbd_x_4tap(data, coeffs_256 + 1, filt_256);
3200           res_16b = sr_x_round_avx2(res_16b);
3201 
3202           /* rounding code */
3203           // 8 bit conversion and saturation to uint8
3204           __m256i res_8b = _mm256_packus_epi16(res_16b, res_16b);
3205 
3206           // Store values into the destination buffer
3207           // 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
3208           res_8b = _mm256_permute4x64_epi64(res_8b, 216);
3209           __m128i res = _mm256_castsi256_si128(res_8b);
3210           _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j], res);
3211         }
3212       }
3213     }
3214   } else {
3215     __m256i filt_256[4];
3216 
3217     filt_256[0] = _mm256_loadu_si256((__m256i const *)filt1_global_avx2);
3218     filt_256[1] = _mm256_loadu_si256((__m256i const *)filt2_global_avx2);
3219     filt_256[2] = _mm256_loadu_si256((__m256i const *)filt3_global_avx2);
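    // filt1/2/3_global_avx2 (plus filt4 for the 8-tap path) are byte-shuffle
    // masks that gather the neighboring source pixels for each coefficient
    // pair before the multiply-add.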
3220 
3221     if (horz_tap == 6) {
3222       // horz_filt as 6 tap
3223       const uint8_t *src_ptr = src - 2;
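      // The 6-tap horizontal filter reads two source pixels to the left of
      // the destination pixel.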
3224 
3225       prepare_half_coeffs_6tap_avx2(filter_params_x, subpel_x_q4, coeffs_256);
3226 
3227       if (w == 8) {
3228         do {
3229           const __m256i res = x_convolve_6tap_8x2_avx2(src_ptr, src_stride,
3230                                                        coeffs_256, filt_256);
3231           sr_x_round_store_8x2_avx2(res, dst, dst_stride);
3232           src_ptr += 2 * src_stride;
3233           dst += 2 * dst_stride;
3234           y -= 2;
3235         } while (y);
3236       } else if (w == 16) {
3237         do {
3238           __m256i r[2];
3239 
3240           x_convolve_6tap_16x2_avx2(src_ptr, src_stride, coeffs_256, filt_256,
3241                                     r);
3242           sr_x_round_store_16x2_avx2(r, dst, dst_stride);
3243           src_ptr += 2 * src_stride;
3244           dst += 2 * dst_stride;
3245           y -= 2;
3246         } while (y);
3247       } else if (w == 32) {
3248         do {
3249           sr_x_6tap_32_avx2(src_ptr, coeffs_256, filt_256, dst);
3250           src_ptr += src_stride;
3251           dst += dst_stride;
3252         } while (--y);
3253       } else if (w == 64) {
3254         do {
3255           sr_x_6tap_32_avx2(src_ptr, coeffs_256, filt_256, dst);
3256           sr_x_6tap_32_avx2(src_ptr + 32, coeffs_256, filt_256, dst + 32);
3257           src_ptr += src_stride;
3258           dst += dst_stride;
3259         } while (--y);
3260       } else {
3261         assert(w == 128);
3262 
3263         do {
3264           sr_x_6tap_32_avx2(src_ptr, coeffs_256, filt_256, dst);
3265           sr_x_6tap_32_avx2(src_ptr + 1 * 32, coeffs_256, filt_256,
3266                             dst + 1 * 32);
3267           sr_x_6tap_32_avx2(src_ptr + 2 * 32, coeffs_256, filt_256,
3268                             dst + 2 * 32);
3269           sr_x_6tap_32_avx2(src_ptr + 3 * 32, coeffs_256, filt_256,
3270                             dst + 3 * 32);
3271           src_ptr += src_stride;
3272           dst += dst_stride;
3273         } while (--y);
3274       }
3275     } else if (horz_tap == 8) {
3276       // horz_filt as 8 tap
3277       const uint8_t *src_ptr = src - 3;
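      // The 8-tap horizontal filter reads three source pixels to the left of
      // the destination pixel.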
3278 
3279       filt_256[3] = _mm256_loadu_si256((__m256i const *)filt4_global_avx2);
3280 
3281       prepare_half_coeffs_8tap_avx2(filter_params_x, subpel_x_q4, coeffs_256);
3282 
3283       if (w == 8) {
3284         do {
3285           const __m256i res = x_convolve_8tap_8x2_avx2(src_ptr, src_stride,
3286                                                        coeffs_256, filt_256);
3287           sr_x_round_store_8x2_avx2(res, dst, dst_stride);
3288           src_ptr += 2 * src_stride;
3289           dst += 2 * dst_stride;
3290           y -= 2;
3291         } while (y);
3292       } else if (w == 16) {
3293         do {
3294           __m256i r[2];
3295 
3296           x_convolve_8tap_16x2_avx2(src_ptr, src_stride, coeffs_256, filt_256,
3297                                     r);
3298           sr_x_round_store_16x2_avx2(r, dst, dst_stride);
3299           src_ptr += 2 * src_stride;
3300           dst += 2 * dst_stride;
3301           y -= 2;
3302         } while (y);
3303       } else if (w == 32) {
3304         do {
3305           sr_x_8tap_32_avx2(src_ptr, coeffs_256, filt_256, dst);
3306           src_ptr += src_stride;
3307           dst += dst_stride;
3308         } while (--y);
3309       } else if (w == 64) {
3310         do {
3311           sr_x_8tap_32_avx2(src_ptr, coeffs_256, filt_256, dst);
3312           sr_x_8tap_32_avx2(src_ptr + 32, coeffs_256, filt_256, dst + 32);
3313           src_ptr += src_stride;
3314           dst += dst_stride;
3315         } while (--y);
3316       } else {
3317         assert(w == 128);
3318 
3319         do {
3320           sr_x_8tap_32_avx2(src_ptr, coeffs_256, filt_256, dst);
3321           sr_x_8tap_32_avx2(src_ptr + 1 * 32, coeffs_256, filt_256,
3322                             dst + 1 * 32);
3323           sr_x_8tap_32_avx2(src_ptr + 2 * 32, coeffs_256, filt_256,
3324                             dst + 2 * 32);
3325           sr_x_8tap_32_avx2(src_ptr + 3 * 32, coeffs_256, filt_256,
3326                             dst + 3 * 32);
3327           src_ptr += src_stride;
3328           dst += dst_stride;
3329         } while (--y);
3330       }
3331     }
3332   }
3333 }
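
/* Illustrative call shape for the two dispatchers above. This is a sketch
 * only: the actual call sites live elsewhere in libaom, and the variable
 * names below are assumptions, not part of this header.
 *
 *   av1_convolve_y_sr_specialized_avx2(src, src_stride, dst, dst_stride,
 *                                      w, h, filter_params_y, subpel_y_q4);
 *
 *   av1_convolve_x_sr_specialized_avx2(src, src_stride, dst, dst_stride,
 *                                      w, h, filter_params_x, subpel_x_q4,
 *                                      conv_params);
 */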
3334 
3335 #endif  // THIRD_PARTY_SVT_AV1_CONVOLVE_AVX2_H_
3336