/*
 * Copyright (c) 2018, Alliance for Open Media. All rights reserved
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */

#ifndef THIRD_PARTY_SVT_AV1_CONVOLVE_AVX2_H_
#define THIRD_PARTY_SVT_AV1_CONVOLVE_AVX2_H_

#include "EbMemory_AVX2.h"
#include "EbMemory_SSE4_1.h"
#include "synonyms.h"

#include "aom_dsp/aom_filter.h"
#include "aom_dsp/x86/convolve_avx2.h"
#include "aom_dsp/x86/mem_sse2.h"
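
// Shared SSE2/SSSE3/SSE4.1/AVX2 building blocks for the SVT-AV1-derived
// convolution kernels: preparing 2/4/6/8-tap subpel filter coefficients,
// loading and interleaving source rows, the multiply-accumulate cores, and
// the rounding/packing/store helpers used by the horizontal (x), vertical (y)
// and 2D (xy) paths.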

static inline void populate_coeffs_4tap_avx2(const __m128i coeffs_128,
                                             __m256i coeffs[2]) {
  const __m256i coeffs_256 = _mm256_broadcastsi128_si256(coeffs_128);

  // coeffs 2 3 2 3 2 3 2 3
  coeffs[0] = _mm256_shuffle_epi8(coeffs_256, _mm256_set1_epi16(0x0604u));
  // coeffs 4 5 4 5 4 5 4 5
  coeffs[1] = _mm256_shuffle_epi8(coeffs_256, _mm256_set1_epi16(0x0a08u));
}

static inline void populate_coeffs_6tap_avx2(const __m128i coeffs_128,
                                             __m256i coeffs[3]) {
  const __m256i coeffs_256 = _mm256_broadcastsi128_si256(coeffs_128);

  // coeffs 1 2 1 2 1 2 1 2
  coeffs[0] = _mm256_shuffle_epi8(coeffs_256, _mm256_set1_epi16(0x0402u));
  // coeffs 3 4 3 4 3 4 3 4
  coeffs[1] = _mm256_shuffle_epi8(coeffs_256, _mm256_set1_epi16(0x0806u));
  // coeffs 5 6 5 6 5 6 5 6
  coeffs[2] = _mm256_shuffle_epi8(coeffs_256, _mm256_set1_epi16(0x0C0Au));
}

static inline void populate_coeffs_8tap_avx2(const __m128i coeffs_128,
                                             __m256i coeffs[4]) {
  const __m256i coeffs_256 = _mm256_broadcastsi128_si256(coeffs_128);

  // coeffs 0 1 0 1 0 1 0 1
  coeffs[0] = _mm256_shuffle_epi8(coeffs_256, _mm256_set1_epi16(0x0200u));
  // coeffs 2 3 2 3 2 3 2 3
  coeffs[1] = _mm256_shuffle_epi8(coeffs_256, _mm256_set1_epi16(0x0604u));
  // coeffs 4 5 4 5 4 5 4 5
  coeffs[2] = _mm256_shuffle_epi8(coeffs_256, _mm256_set1_epi16(0x0a08u));
  // coeffs 6 7 6 7 6 7 6 7
  coeffs[3] = _mm256_shuffle_epi8(coeffs_256, _mm256_set1_epi16(0x0e0cu));
}

static inline void prepare_half_coeffs_2tap_ssse3(
    const InterpFilterParams *const filter_params, const int32_t subpel_q4,
    __m128i *const coeffs /* [1] */) {
  const int16_t *const filter = av1_get_interp_filter_subpel_kernel(
      filter_params, subpel_q4 & SUBPEL_MASK);
  const __m128i coeffs_8 = _mm_cvtsi32_si128(loadu_int32(filter + 3));

  // Right shift all filter coefficients by 1 to reduce the bits required.
  // This extra right shift will be taken care of at the end while rounding
  // the result.
  // Since all filter coefficients are even, this change will not affect the
  // end result.
  assert(_mm_test_all_zeros(_mm_and_si128(coeffs_8, _mm_set1_epi16(1)),
                            _mm_set1_epi16((short)0xffff)));

  const __m128i coeffs_1 = _mm_srai_epi16(coeffs_8, 1);

  // coeffs 3 4 3 4 3 4 3 4
  *coeffs = _mm_shuffle_epi8(coeffs_1, _mm_set1_epi16(0x0200u));
}
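// Note on the halved coefficients above: since every tap is even (checked by
// the assert), sum(c[i] * x[i]) == 2 * sum((c[i] >> 1) * x[i]), so the
// products stay within 16 bits and the missing factor of 2 is recovered by
// shifting one bit less when rounding (e.g. FILTER_BITS - 1 in
// sr_y_round_sse2()/sr_y_round_avx2() below).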

static inline void prepare_half_coeffs_4tap_ssse3(
    const InterpFilterParams *const filter_params, const int32_t subpel_q4,
    __m128i *const coeffs /* [2] */) {
  const int16_t *const filter = av1_get_interp_filter_subpel_kernel(
      filter_params, subpel_q4 & SUBPEL_MASK);
  const __m128i coeffs_8 = _mm_loadu_si128((__m128i *)filter);

  // Right shift all filter coefficients by 1 to reduce the bits required.
  // This extra right shift will be taken care of at the end while rounding
  // the result.
  // Since all filter coefficients are even, this change will not affect the
  // end result.
  assert(_mm_test_all_zeros(_mm_and_si128(coeffs_8, _mm_set1_epi16(1)),
                            _mm_set1_epi16((short)0xffff)));

  const __m128i coeffs_1 = _mm_srai_epi16(coeffs_8, 1);

  // coeffs 2 3 2 3 2 3 2 3
  coeffs[0] = _mm_shuffle_epi8(coeffs_1, _mm_set1_epi16(0x0604u));
  // coeffs 4 5 4 5 4 5 4 5
  coeffs[1] = _mm_shuffle_epi8(coeffs_1, _mm_set1_epi16(0x0a08u));
}

static inline void prepare_half_coeffs_6tap_ssse3(
    const InterpFilterParams *const filter_params, const int32_t subpel_q4,
    __m128i *const coeffs /* [3] */) {
  const int16_t *const filter = av1_get_interp_filter_subpel_kernel(
      filter_params, subpel_q4 & SUBPEL_MASK);
  const __m128i coeffs_8 = _mm_loadu_si128((__m128i *)filter);

  // Right shift all filter coefficients by 1 to reduce the bits required.
  // This extra right shift will be taken care of at the end while rounding
  // the result.
  // Since all filter coefficients are even, this change will not affect the
  // end result.
  assert(_mm_test_all_zeros(_mm_and_si128(coeffs_8, _mm_set1_epi16(1)),
                            _mm_set1_epi16((short)0xffff)));

  const __m128i coeffs_1 = _mm_srai_epi16(coeffs_8, 1);

  // coeffs 1 2 1 2 1 2 1 2
  coeffs[0] = _mm_shuffle_epi8(coeffs_1, _mm_set1_epi16(0x0402u));
  // coeffs 3 4 3 4 3 4 3 4
  coeffs[1] = _mm_shuffle_epi8(coeffs_1, _mm_set1_epi16(0x0806u));
  // coeffs 5 6 5 6 5 6 5 6
  coeffs[2] = _mm_shuffle_epi8(coeffs_1, _mm_set1_epi16(0x0C0Au));
}

static inline void prepare_half_coeffs_8tap_ssse3(
    const InterpFilterParams *const filter_params, const int32_t subpel_q4,
    __m128i *const coeffs /* [4] */) {
  const int16_t *const filter = av1_get_interp_filter_subpel_kernel(
      filter_params, subpel_q4 & SUBPEL_MASK);
  const __m128i coeffs_8 = _mm_loadu_si128((__m128i *)filter);

  // Right shift all filter coefficients by 1 to reduce the bits required.
  // This extra right shift will be taken care of at the end while rounding
  // the result.
  // Since all filter coefficients are even, this change will not affect the
  // end result.
  assert(_mm_test_all_zeros(_mm_and_si128(coeffs_8, _mm_set1_epi16(1)),
                            _mm_set1_epi16((short)0xffff)));

  const __m128i coeffs_1 = _mm_srai_epi16(coeffs_8, 1);

  // coeffs 0 1 0 1 0 1 0 1
  coeffs[0] = _mm_shuffle_epi8(coeffs_1, _mm_set1_epi16(0x0200u));
  // coeffs 2 3 2 3 2 3 2 3
  coeffs[1] = _mm_shuffle_epi8(coeffs_1, _mm_set1_epi16(0x0604u));
  // coeffs 4 5 4 5 4 5 4 5
  coeffs[2] = _mm_shuffle_epi8(coeffs_1, _mm_set1_epi16(0x0a08u));
  // coeffs 6 7 6 7 6 7 6 7
  coeffs[3] = _mm_shuffle_epi8(coeffs_1, _mm_set1_epi16(0x0e0cu));
}

static inline void prepare_half_coeffs_2tap_avx2(
    const InterpFilterParams *const filter_params, const int32_t subpel_q4,
    __m256i *const coeffs /* [1] */) {
  const int16_t *const filter = av1_get_interp_filter_subpel_kernel(
      filter_params, subpel_q4 & SUBPEL_MASK);
  const __m128i coeffs_8 = _mm_cvtsi32_si128(loadu_int32(filter + 3));
  const __m256i filter_coeffs = _mm256_broadcastsi128_si256(coeffs_8);

  // Right shift all filter coefficients by 1 to reduce the bits required.
  // This extra right shift will be taken care of at the end while rounding
  // the result.
  // Since all filter coefficients are even, this change will not affect the
  // end result.
  assert(_mm_test_all_zeros(_mm_and_si128(coeffs_8, _mm_set1_epi16(1)),
                            _mm_set1_epi16((short)0xffff)));

  const __m256i coeffs_1 = _mm256_srai_epi16(filter_coeffs, 1);

  // coeffs 3 4 3 4 3 4 3 4
  *coeffs = _mm256_shuffle_epi8(coeffs_1, _mm256_set1_epi16(0x0200u));
}

static inline void prepare_half_coeffs_4tap_avx2(
    const InterpFilterParams *const filter_params, const int32_t subpel_q4,
    __m256i *const coeffs /* [2] */) {
  const int16_t *const filter = av1_get_interp_filter_subpel_kernel(
      filter_params, subpel_q4 & SUBPEL_MASK);
  const __m128i coeffs_8 = _mm_loadu_si128((__m128i *)filter);

  // Right shift all filter coefficients by 1 to reduce the bits required.
  // This extra right shift will be taken care of at the end while rounding
  // the result.
  // Since all filter coefficients are even, this change will not affect the
  // end result.
  assert(_mm_test_all_zeros(_mm_and_si128(coeffs_8, _mm_set1_epi16(1)),
                            _mm_set1_epi16((short)0xffff)));
  const __m128i coeffs_1 = _mm_srai_epi16(coeffs_8, 1);
  populate_coeffs_4tap_avx2(coeffs_1, coeffs);
}

static inline void prepare_half_coeffs_6tap_avx2(
    const InterpFilterParams *const filter_params, const int32_t subpel_q4,
    __m256i *const coeffs /* [3] */) {
  const int16_t *const filter = av1_get_interp_filter_subpel_kernel(
      filter_params, subpel_q4 & SUBPEL_MASK);
  const __m128i coeffs_8 = _mm_loadu_si128((__m128i *)filter);

  // Right shift all filter coefficients by 1 to reduce the bits required.
  // This extra right shift will be taken care of at the end while rounding
  // the result.
  // Since all filter coefficients are even, this change will not affect the
  // end result.
  assert(_mm_test_all_zeros(_mm_and_si128(coeffs_8, _mm_set1_epi16(1)),
                            _mm_set1_epi16((short)0xffff)));
  const __m128i coeffs_1 = _mm_srai_epi16(coeffs_8, 1);
  populate_coeffs_6tap_avx2(coeffs_1, coeffs);
}

static inline void prepare_half_coeffs_8tap_avx2(
    const InterpFilterParams *const filter_params, const int32_t subpel_q4,
    __m256i *const coeffs /* [4] */) {
  const int16_t *const filter = av1_get_interp_filter_subpel_kernel(
      filter_params, subpel_q4 & SUBPEL_MASK);
  const __m128i coeffs_8 = _mm_loadu_si128((__m128i *)filter);

  // Right shift all filter coefficients by 1 to reduce the bits required.
  // This extra right shift will be taken care of at the end while rounding
  // the result.
  // Since all filter coefficients are even, this change will not affect the
  // end result.
  assert(_mm_test_all_zeros(_mm_and_si128(coeffs_8, _mm_set1_epi16(1)),
                            _mm_set1_epi16((short)0xffff)));
  const __m128i coeffs_1 = _mm_srai_epi16(coeffs_8, 1);
  populate_coeffs_8tap_avx2(coeffs_1, coeffs);
}

static inline void prepare_coeffs_2tap_sse2(
    const InterpFilterParams *const filter_params, const int32_t subpel_q4,
    __m128i *const coeffs /* [1] */) {
  const int16_t *filter = av1_get_interp_filter_subpel_kernel(
      filter_params, subpel_q4 & SUBPEL_MASK);

  const __m128i coeff = _mm_cvtsi32_si128(loadu_int32(filter + 3));

  // coeffs 3 4 3 4 3 4 3 4
  coeffs[0] = _mm_shuffle_epi32(coeff, 0x00);
}

static inline void prepare_coeffs_4tap_sse2(
    const InterpFilterParams *const filter_params, const int32_t subpel_q4,
    __m128i *const coeffs /* [2] */) {
  const int16_t *filter = av1_get_interp_filter_subpel_kernel(
      filter_params, subpel_q4 & SUBPEL_MASK);

  const __m128i coeff = _mm_loadu_si128((__m128i *)filter);

  // coeffs 2 3 2 3 2 3 2 3
  coeffs[0] = _mm_shuffle_epi32(coeff, 0x55);
  // coeffs 4 5 4 5 4 5 4 5
  coeffs[1] = _mm_shuffle_epi32(coeff, 0xaa);
}

static inline void prepare_coeffs_6tap_ssse3(
    const InterpFilterParams *const filter_params, const int32_t subpel_q4,
    __m128i *const coeffs /* [3] */) {
  const int16_t *const filter = av1_get_interp_filter_subpel_kernel(
      filter_params, subpel_q4 & SUBPEL_MASK);
  const __m128i coeff = _mm_loadu_si128((__m128i *)filter);

  // coeffs 1 2 1 2 1 2 1 2
  coeffs[0] = _mm_shuffle_epi8(coeff, _mm_set1_epi32(0x05040302u));
  // coeffs 3 4 3 4 3 4 3 4
  coeffs[1] = _mm_shuffle_epi8(coeff, _mm_set1_epi32(0x09080706u));
  // coeffs 5 6 5 6 5 6 5 6
  coeffs[2] = _mm_shuffle_epi8(coeff, _mm_set1_epi32(0x0D0C0B0Au));
}

static inline void prepare_coeffs_8tap_sse2(
    const InterpFilterParams *const filter_params, const int32_t subpel_q4,
    __m128i *const coeffs /* [4] */) {
  const int16_t *filter = av1_get_interp_filter_subpel_kernel(
      filter_params, subpel_q4 & SUBPEL_MASK);

  const __m128i coeff = _mm_loadu_si128((__m128i *)filter);

  // coeffs 0 1 0 1 0 1 0 1
  coeffs[0] = _mm_shuffle_epi32(coeff, 0x00);
  // coeffs 2 3 2 3 2 3 2 3
  coeffs[1] = _mm_shuffle_epi32(coeff, 0x55);
  // coeffs 4 5 4 5 4 5 4 5
  coeffs[2] = _mm_shuffle_epi32(coeff, 0xaa);
  // coeffs 6 7 6 7 6 7 6 7
  coeffs[3] = _mm_shuffle_epi32(coeff, 0xff);
}

static inline void prepare_coeffs_2tap_avx2(
    const InterpFilterParams *const filter_params, const int32_t subpel_q4,
    __m256i *const coeffs /* [1] */) {
  const int16_t *filter = av1_get_interp_filter_subpel_kernel(
      filter_params, subpel_q4 & SUBPEL_MASK);

  const __m128i coeff_8 = _mm_cvtsi32_si128(loadu_int32(filter + 3));
  const __m256i coeff = _mm256_broadcastsi128_si256(coeff_8);

  // coeffs 3 4 3 4 3 4 3 4
  coeffs[0] = _mm256_shuffle_epi32(coeff, 0x00);
}

static inline void prepare_coeffs_4tap_avx2(
    const InterpFilterParams *const filter_params, const int32_t subpel_q4,
    __m256i *const coeffs /* [2] */) {
  const int16_t *filter = av1_get_interp_filter_subpel_kernel(
      filter_params, subpel_q4 & SUBPEL_MASK);

  const __m128i coeff_8 = _mm_loadu_si128((__m128i *)filter);
  const __m256i coeff = _mm256_broadcastsi128_si256(coeff_8);

  // coeffs 2 3 2 3 2 3 2 3
  coeffs[0] = _mm256_shuffle_epi32(coeff, 0x55);
  // coeffs 4 5 4 5 4 5 4 5
  coeffs[1] = _mm256_shuffle_epi32(coeff, 0xaa);
}

static inline void prepare_coeffs_6tap_avx2(
    const InterpFilterParams *const filter_params, const int32_t subpel_q4,
    __m256i *const coeffs /* [3] */) {
  const int16_t *const filter = av1_get_interp_filter_subpel_kernel(
      filter_params, subpel_q4 & SUBPEL_MASK);
  const __m128i coeffs_8 = _mm_loadu_si128((__m128i *)filter);
  const __m256i coeff = _mm256_broadcastsi128_si256(coeffs_8);

  // coeffs 1 2 1 2 1 2 1 2
  coeffs[0] = _mm256_shuffle_epi8(coeff, _mm256_set1_epi32(0x05040302u));
  // coeffs 3 4 3 4 3 4 3 4
  coeffs[1] = _mm256_shuffle_epi8(coeff, _mm256_set1_epi32(0x09080706u));
  // coeffs 5 6 5 6 5 6 5 6
  coeffs[2] = _mm256_shuffle_epi8(coeff, _mm256_set1_epi32(0x0D0C0B0Au));
}

static inline void prepare_coeffs_8tap_avx2(
    const InterpFilterParams *const filter_params, const int32_t subpel_q4,
    __m256i *const coeffs /* [4] */) {
  const int16_t *filter = av1_get_interp_filter_subpel_kernel(
      filter_params, subpel_q4 & SUBPEL_MASK);

  const __m128i coeff_8 = _mm_loadu_si128((__m128i *)filter);
  const __m256i coeff = _mm256_broadcastsi128_si256(coeff_8);

  // coeffs 0 1 0 1 0 1 0 1
  coeffs[0] = _mm256_shuffle_epi32(coeff, 0x00);
  // coeffs 2 3 2 3 2 3 2 3
  coeffs[1] = _mm256_shuffle_epi32(coeff, 0x55);
  // coeffs 4 5 4 5 4 5 4 5
  coeffs[2] = _mm256_shuffle_epi32(coeff, 0xaa);
  // coeffs 6 7 6 7 6 7 6 7
  coeffs[3] = _mm256_shuffle_epi32(coeff, 0xff);
}

static inline void load_16bit_5rows_avx2(const int16_t *const src,
                                         const ptrdiff_t stride,
                                         __m256i dst[5]) {
  dst[0] = _mm256_loadu_si256((__m256i *)(src + 0 * stride));
  dst[1] = _mm256_loadu_si256((__m256i *)(src + 1 * stride));
  dst[2] = _mm256_loadu_si256((__m256i *)(src + 2 * stride));
  dst[3] = _mm256_loadu_si256((__m256i *)(src + 3 * stride));
  dst[4] = _mm256_loadu_si256((__m256i *)(src + 4 * stride));
}

static inline void load_16bit_7rows_avx2(const int16_t *const src,
                                         const ptrdiff_t stride,
                                         __m256i dst[7]) {
  dst[0] = _mm256_loadu_si256((__m256i *)(src + 0 * stride));
  dst[1] = _mm256_loadu_si256((__m256i *)(src + 1 * stride));
  dst[2] = _mm256_loadu_si256((__m256i *)(src + 2 * stride));
  dst[3] = _mm256_loadu_si256((__m256i *)(src + 3 * stride));
  dst[4] = _mm256_loadu_si256((__m256i *)(src + 4 * stride));
  dst[5] = _mm256_loadu_si256((__m256i *)(src + 5 * stride));
  dst[6] = _mm256_loadu_si256((__m256i *)(src + 6 * stride));
}

static AOM_FORCE_INLINE void load_16bit_8rows_avx2(const int16_t *const src,
                                                   const ptrdiff_t stride,
                                                   __m256i dst[8]) {
  dst[0] = _mm256_loadu_si256((__m256i *)(src + 0 * stride));
  dst[1] = _mm256_loadu_si256((__m256i *)(src + 1 * stride));
  dst[2] = _mm256_loadu_si256((__m256i *)(src + 2 * stride));
  dst[3] = _mm256_loadu_si256((__m256i *)(src + 3 * stride));
  dst[4] = _mm256_loadu_si256((__m256i *)(src + 4 * stride));
  dst[5] = _mm256_loadu_si256((__m256i *)(src + 5 * stride));
  dst[6] = _mm256_loadu_si256((__m256i *)(src + 6 * stride));
  dst[7] = _mm256_loadu_si256((__m256i *)(src + 7 * stride));
}

static AOM_FORCE_INLINE void loadu_unpack_16bit_5rows_avx2(
    const int16_t *const src, const ptrdiff_t stride, __m256i s_256[5],
    __m256i ss_256[5], __m256i tt_256[5]) {
  s_256[0] = _mm256_loadu_si256((__m256i *)(src + 0 * stride));
  s_256[1] = _mm256_loadu_si256((__m256i *)(src + 1 * stride));
  s_256[2] = _mm256_loadu_si256((__m256i *)(src + 2 * stride));
  s_256[3] = _mm256_loadu_si256((__m256i *)(src + 3 * stride));
  s_256[4] = _mm256_loadu_si256((__m256i *)(src + 4 * stride));

  ss_256[0] = _mm256_unpacklo_epi16(s_256[0], s_256[1]);
  ss_256[1] = _mm256_unpacklo_epi16(s_256[2], s_256[3]);
  ss_256[3] = _mm256_unpackhi_epi16(s_256[0], s_256[1]);
  ss_256[4] = _mm256_unpackhi_epi16(s_256[2], s_256[3]);

  tt_256[0] = _mm256_unpacklo_epi16(s_256[1], s_256[2]);
  tt_256[1] = _mm256_unpacklo_epi16(s_256[3], s_256[4]);
  tt_256[3] = _mm256_unpackhi_epi16(s_256[1], s_256[2]);
  tt_256[4] = _mm256_unpackhi_epi16(s_256[3], s_256[4]);
}

static AOM_FORCE_INLINE void loadu_unpack_16bit_3rows_avx2(
    const int16_t *const src, const ptrdiff_t stride, __m256i s_256[3],
    __m256i ss_256[3], __m256i tt_256[3]) {
  s_256[0] = _mm256_loadu_si256((__m256i *)(src + 0 * stride));
  s_256[1] = _mm256_loadu_si256((__m256i *)(src + 1 * stride));
  s_256[2] = _mm256_loadu_si256((__m256i *)(src + 2 * stride));

  ss_256[0] = _mm256_unpacklo_epi16(s_256[0], s_256[1]);
  ss_256[2] = _mm256_unpackhi_epi16(s_256[0], s_256[1]);

  tt_256[0] = _mm256_unpacklo_epi16(s_256[1], s_256[2]);
  tt_256[2] = _mm256_unpackhi_epi16(s_256[1], s_256[2]);
}

static inline void convolve_8tap_unpack_avx2(const __m256i s[6],
                                             __m256i ss[7]) {
  ss[0] = _mm256_unpacklo_epi16(s[0], s[1]);
  ss[1] = _mm256_unpacklo_epi16(s[2], s[3]);
  ss[2] = _mm256_unpacklo_epi16(s[4], s[5]);
  ss[4] = _mm256_unpackhi_epi16(s[0], s[1]);
  ss[5] = _mm256_unpackhi_epi16(s[2], s[3]);
  ss[6] = _mm256_unpackhi_epi16(s[4], s[5]);
}

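// The convolve_*_ssse3/avx2 helpers below multiply interleaved unsigned 8-bit
// pixel pairs (ss[]) by the replicated signed 8-bit coefficient pairs built by
// prepare_half_coeffs_*, summing adjacent products into 16 bits with
// _mm_maddubs_epi16()/_mm256_maddubs_epi16().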
static inline __m128i convolve_2tap_ssse3(const __m128i ss[1],
                                          const __m128i coeffs[1]) {
  return _mm_maddubs_epi16(ss[0], coeffs[0]);
}

static inline __m128i convolve_4tap_ssse3(const __m128i ss[2],
                                          const __m128i coeffs[2]) {
  const __m128i res_23 = _mm_maddubs_epi16(ss[0], coeffs[0]);
  const __m128i res_45 = _mm_maddubs_epi16(ss[1], coeffs[1]);
  return _mm_add_epi16(res_23, res_45);
}

static inline __m128i convolve_6tap_ssse3(const __m128i ss[3],
                                          const __m128i coeffs[3]) {
  const __m128i res_12 = _mm_maddubs_epi16(ss[0], coeffs[0]);
  const __m128i res_34 = _mm_maddubs_epi16(ss[1], coeffs[1]);
  const __m128i res_56 = _mm_maddubs_epi16(ss[2], coeffs[2]);
  const __m128i res_1256 = _mm_add_epi16(res_12, res_56);
  return _mm_add_epi16(res_1256, res_34);
}

static inline __m128i convolve_8tap_ssse3(const __m128i ss[4],
                                          const __m128i coeffs[4]) {
  const __m128i res_01 = _mm_maddubs_epi16(ss[0], coeffs[0]);
  const __m128i res_23 = _mm_maddubs_epi16(ss[1], coeffs[1]);
  const __m128i res_45 = _mm_maddubs_epi16(ss[2], coeffs[2]);
  const __m128i res_67 = _mm_maddubs_epi16(ss[3], coeffs[3]);
  const __m128i res_0145 = _mm_add_epi16(res_01, res_45);
  const __m128i res_2367 = _mm_add_epi16(res_23, res_67);
  return _mm_add_epi16(res_0145, res_2367);
}

static inline __m256i convolve_2tap_avx2(const __m256i ss[1],
                                         const __m256i coeffs[1]) {
  return _mm256_maddubs_epi16(ss[0], coeffs[0]);
}

static inline __m256i convolve_4tap_avx2(const __m256i ss[2],
                                         const __m256i coeffs[2]) {
  const __m256i res_23 = _mm256_maddubs_epi16(ss[0], coeffs[0]);
  const __m256i res_45 = _mm256_maddubs_epi16(ss[1], coeffs[1]);
  return _mm256_add_epi16(res_23, res_45);
}

static inline __m256i convolve_6tap_avx2(const __m256i ss[3],
                                         const __m256i coeffs[3]) {
  const __m256i res_01 = _mm256_maddubs_epi16(ss[0], coeffs[0]);
  const __m256i res_23 = _mm256_maddubs_epi16(ss[1], coeffs[1]);
  const __m256i res_45 = _mm256_maddubs_epi16(ss[2], coeffs[2]);
  const __m256i res_0145 = _mm256_add_epi16(res_01, res_45);
  return _mm256_add_epi16(res_0145, res_23);
}

static inline __m256i convolve_8tap_avx2(const __m256i ss[4],
                                         const __m256i coeffs[4]) {
  const __m256i res_01 = _mm256_maddubs_epi16(ss[0], coeffs[0]);
  const __m256i res_23 = _mm256_maddubs_epi16(ss[1], coeffs[1]);
  const __m256i res_45 = _mm256_maddubs_epi16(ss[2], coeffs[2]);
  const __m256i res_67 = _mm256_maddubs_epi16(ss[3], coeffs[3]);
  const __m256i res_0145 = _mm256_add_epi16(res_01, res_45);
  const __m256i res_2367 = _mm256_add_epi16(res_23, res_67);
  return _mm256_add_epi16(res_0145, res_2367);
}

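// The convolve16_* helpers are the 16-bit counterparts: they take interleaved
// 16-bit inputs (e.g. the intermediates produced by the horizontal pass of the
// 2D filter) and the full-precision coefficient pairs from prepare_coeffs_*,
// accumulating into 32 bits with _mm_madd_epi16()/_mm256_madd_epi16().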
static inline __m128i convolve16_2tap_sse2(const __m128i ss[1],
                                           const __m128i coeffs[1]) {
  return _mm_madd_epi16(ss[0], coeffs[0]);
}

static inline __m128i convolve16_4tap_sse2(const __m128i ss[2],
                                           const __m128i coeffs[2]) {
  const __m128i res_01 = _mm_madd_epi16(ss[0], coeffs[0]);
  const __m128i res_23 = _mm_madd_epi16(ss[1], coeffs[1]);
  return _mm_add_epi32(res_01, res_23);
}

static inline __m128i convolve16_6tap_sse2(const __m128i ss[3],
                                           const __m128i coeffs[3]) {
  const __m128i res_01 = _mm_madd_epi16(ss[0], coeffs[0]);
  const __m128i res_23 = _mm_madd_epi16(ss[1], coeffs[1]);
  const __m128i res_45 = _mm_madd_epi16(ss[2], coeffs[2]);
  const __m128i res_0123 = _mm_add_epi32(res_01, res_23);
  return _mm_add_epi32(res_0123, res_45);
}

static inline __m128i convolve16_8tap_sse2(const __m128i ss[4],
                                           const __m128i coeffs[4]) {
  const __m128i res_01 = _mm_madd_epi16(ss[0], coeffs[0]);
  const __m128i res_23 = _mm_madd_epi16(ss[1], coeffs[1]);
  const __m128i res_45 = _mm_madd_epi16(ss[2], coeffs[2]);
  const __m128i res_67 = _mm_madd_epi16(ss[3], coeffs[3]);
  const __m128i res_0123 = _mm_add_epi32(res_01, res_23);
  const __m128i res_4567 = _mm_add_epi32(res_45, res_67);
  return _mm_add_epi32(res_0123, res_4567);
}

static inline __m256i convolve16_2tap_avx2(const __m256i ss[1],
                                           const __m256i coeffs[1]) {
  return _mm256_madd_epi16(ss[0], coeffs[0]);
}

static inline __m256i convolve16_4tap_avx2(const __m256i ss[2],
                                           const __m256i coeffs[2]) {
  const __m256i res_1 = _mm256_madd_epi16(ss[0], coeffs[0]);
  const __m256i res_2 = _mm256_madd_epi16(ss[1], coeffs[1]);
  return _mm256_add_epi32(res_1, res_2);
}

static inline __m256i convolve16_6tap_avx2(const __m256i ss[3],
                                           const __m256i coeffs[3]) {
  const __m256i res_01 = _mm256_madd_epi16(ss[0], coeffs[0]);
  const __m256i res_23 = _mm256_madd_epi16(ss[1], coeffs[1]);
  const __m256i res_45 = _mm256_madd_epi16(ss[2], coeffs[2]);
  const __m256i res_0123 = _mm256_add_epi32(res_01, res_23);
  return _mm256_add_epi32(res_0123, res_45);
}

static inline __m256i convolve16_8tap_avx2(const __m256i ss[4],
                                           const __m256i coeffs[4]) {
  const __m256i res_01 = _mm256_madd_epi16(ss[0], coeffs[0]);
  const __m256i res_23 = _mm256_madd_epi16(ss[1], coeffs[1]);
  const __m256i res_45 = _mm256_madd_epi16(ss[2], coeffs[2]);
  const __m256i res_67 = _mm256_madd_epi16(ss[3], coeffs[3]);
  const __m256i res_0123 = _mm256_add_epi32(res_01, res_23);
  const __m256i res_4567 = _mm256_add_epi32(res_45, res_67);
  return _mm256_add_epi32(res_0123, res_4567);
}

static inline __m256i x_convolve_4tap_avx2(const __m256i data,
                                           const __m256i coeffs[2],
                                           const __m256i filt[2]) {
  __m256i ss[2];

  ss[0] = _mm256_shuffle_epi8(data, filt[0]);
  ss[1] = _mm256_shuffle_epi8(data, filt[1]);

  return convolve_4tap_avx2(ss, coeffs);
}

static inline __m256i x_convolve_6tap_avx2(const __m256i data,
                                           const __m256i coeffs[3],
                                           const __m256i filt[3]) {
  __m256i ss[3];

  ss[0] = _mm256_shuffle_epi8(data, filt[0]);
  ss[1] = _mm256_shuffle_epi8(data, filt[1]);
  ss[2] = _mm256_shuffle_epi8(data, filt[2]);

  return convolve_6tap_avx2(ss, coeffs);
}

static inline __m256i x_convolve_8tap_avx2(const __m256i data,
                                           const __m256i coeffs[4],
                                           const __m256i filt[4]) {
  __m256i ss[4];

  ss[0] = _mm256_shuffle_epi8(data, filt[0]);
  ss[1] = _mm256_shuffle_epi8(data, filt[1]);
  ss[2] = _mm256_shuffle_epi8(data, filt[2]);
  ss[3] = _mm256_shuffle_epi8(data, filt[3]);

  return convolve_8tap_avx2(ss, coeffs);
}

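// Rounding helpers. The shift amounts below already compensate for the
// coefficient halving done in prepare_half_coeffs_* (e.g. the single-pass
// vertical path rounds with FILTER_BITS - 1 instead of FILTER_BITS).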
static inline __m256i sr_y_round_avx2(const __m256i src) {
  const __m256i round = _mm256_set1_epi16(32);
  const __m256i dst = _mm256_add_epi16(src, round);
  return _mm256_srai_epi16(dst, FILTER_BITS - 1);
}

static inline __m128i xy_x_round_sse2(const __m128i src) {
  const __m128i round = _mm_set1_epi16(2);
  const __m128i dst = _mm_add_epi16(src, round);
  return _mm_srai_epi16(dst, 2);
}

static inline __m256i xy_x_round_avx2(const __m256i src) {
  const __m256i round = _mm256_set1_epi16(2);
  const __m256i dst = _mm256_add_epi16(src, round);
  return _mm256_srai_epi16(dst, 2);
}

static inline void xy_x_round_store_2x2_sse2(const __m128i res,
                                             int16_t *const dst) {
  const __m128i d = xy_x_round_sse2(res);
  _mm_storel_epi64((__m128i *)dst, d);
}

static inline void xy_x_round_store_4x2_sse2(const __m128i res,
                                             int16_t *const dst) {
  const __m128i d = xy_x_round_sse2(res);
  _mm_storeu_si128((__m128i *)dst, d);
}

static inline void xy_x_round_store_8x2_sse2(const __m128i res[2],
                                             int16_t *const dst) {
  __m128i r[2];

  r[0] = xy_x_round_sse2(res[0]);
  r[1] = xy_x_round_sse2(res[1]);
  _mm_storeu_si128((__m128i *)dst, r[0]);
  _mm_storeu_si128((__m128i *)(dst + 8), r[1]);
}

static inline void xy_x_round_store_8x2_avx2(const __m256i res,
                                             int16_t *const dst) {
  const __m256i d = xy_x_round_avx2(res);
  _mm256_storeu_si256((__m256i *)dst, d);
}

static inline void xy_x_round_store_32_avx2(const __m256i res[2],
                                            int16_t *const dst) {
  __m256i r[2];

  r[0] = xy_x_round_avx2(res[0]);
  r[1] = xy_x_round_avx2(res[1]);
  const __m256i d0 =
      _mm256_inserti128_si256(r[0], _mm256_castsi256_si128(r[1]), 1);
  const __m256i d1 =
      _mm256_inserti128_si256(r[1], _mm256_extracti128_si256(r[0], 1), 0);
  _mm256_storeu_si256((__m256i *)dst, d0);
  _mm256_storeu_si256((__m256i *)(dst + 16), d1);
}

static inline __m128i xy_y_round_sse2(const __m128i src) {
  const __m128i round = _mm_set1_epi32(1024);
  const __m128i dst = _mm_add_epi32(src, round);
  return _mm_srai_epi32(dst, 11);
}

static inline __m128i xy_y_round_half_pel_sse2(const __m128i src) {
  const __m128i round = _mm_set1_epi16(16);
  const __m128i dst = _mm_add_epi16(src, round);
  return _mm_srai_epi16(dst, 5);
}

static inline __m256i xy_y_round_avx2(const __m256i src) {
  const __m256i round = _mm256_set1_epi32(1024);
  const __m256i dst = _mm256_add_epi32(src, round);
  return _mm256_srai_epi32(dst, 11);
}

static inline __m256i xy_y_round_16_avx2(const __m256i r[2]) {
  const __m256i r0 = xy_y_round_avx2(r[0]);
  const __m256i r1 = xy_y_round_avx2(r[1]);
  return _mm256_packs_epi32(r0, r1);
}

static inline __m256i xy_y_round_half_pel_avx2(const __m256i src) {
  const __m256i round = _mm256_set1_epi16(16);
  const __m256i dst = _mm256_add_epi16(src, round);
  return _mm256_srai_epi16(dst, 5);
}

static inline void pack_store_2x2_sse2(const __m128i res, uint8_t *const dst,
                                       const ptrdiff_t stride) {
  const __m128i d = _mm_packus_epi16(res, res);
  *(int16_t *)dst = (int16_t)_mm_cvtsi128_si32(d);
  *(int16_t *)(dst + stride) = (int16_t)_mm_extract_epi16(d, 1);
}

static inline void pack_store_4x2_sse2(const __m128i res, uint8_t *const dst,
                                       const ptrdiff_t stride) {
  const __m128i d = _mm_packus_epi16(res, res);
  store_u8_4x2_sse2(d, dst, stride);
}

static inline void pack_store_4x2_avx2(const __m256i res, uint8_t *const dst,
                                       const ptrdiff_t stride) {
  const __m256i d = _mm256_packus_epi16(res, res);
  const __m128i d0 = _mm256_castsi256_si128(d);
  const __m128i d1 = _mm256_extracti128_si256(d, 1);

  xx_storel_32(dst, d0);
  xx_storel_32(dst + stride, d1);
}

static inline void pack_store_8x2_avx2(const __m256i res, uint8_t *const dst,
                                       const ptrdiff_t stride) {
  const __m256i d = _mm256_packus_epi16(res, res);
  const __m128i d0 = _mm256_castsi256_si128(d);
  const __m128i d1 = _mm256_extracti128_si256(d, 1);
  _mm_storel_epi64((__m128i *)dst, d0);
  _mm_storel_epi64((__m128i *)(dst + stride), d1);
}

static inline void pack_store_16x2_avx2(const __m256i res0, const __m256i res1,
                                        uint8_t *const dst,
                                        const ptrdiff_t stride) {
  const __m256i d = _mm256_packus_epi16(res0, res1);
  storeu_u8_16x2_avx2(d, dst, stride);
}

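// _mm256_packus_epi16() packs within each 128-bit lane, so the two packed
// results end up interleaved per lane; _mm256_permute4x64_epi64(t, 0xD8)
// reorders the 64-bit quadwords back into sequential byte order before
// storing.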
static inline void xy_y_pack_store_16x2_avx2(const __m256i res0,
                                             const __m256i res1,
                                             uint8_t *const dst,
                                             const ptrdiff_t stride) {
  const __m256i t = _mm256_packus_epi16(res0, res1);
  const __m256i d = _mm256_permute4x64_epi64(t, 0xD8);
  storeu_u8_16x2_avx2(d, dst, stride);
}

static inline void pack_store_32_avx2(const __m256i res0, const __m256i res1,
                                      uint8_t *const dst) {
  const __m256i t = _mm256_packus_epi16(res0, res1);
  const __m256i d = _mm256_permute4x64_epi64(t, 0xD8);
  _mm256_storeu_si256((__m256i *)dst, d);
}

static inline void xy_y_round_store_2x2_sse2(const __m128i res,
                                             uint8_t *const dst,
                                             const ptrdiff_t stride) {
  const __m128i r = xy_y_round_sse2(res);
  const __m128i rr = _mm_packs_epi32(r, r);
  pack_store_2x2_sse2(rr, dst, stride);
}

static inline void xy_y_round_store_4x2_avx2(const __m256i res,
                                             uint8_t *const dst,
                                             const ptrdiff_t stride) {
  const __m256i r = xy_y_round_avx2(res);
  const __m256i rr = _mm256_packs_epi32(r, r);
  pack_store_4x2_avx2(rr, dst, stride);
}

static inline void xy_y_pack_store_32_avx2(const __m256i res0,
                                           const __m256i res1,
                                           uint8_t *const dst) {
  const __m256i d = _mm256_packus_epi16(res0, res1);
  // d = _mm256_permute4x64_epi64(d, 0xD8);
  _mm256_storeu_si256((__m256i *)dst, d);
}

static inline void xy_y_round_store_32_avx2(const __m256i r0[2],
                                            const __m256i r1[2],
                                            uint8_t *const dst) {
  const __m256i ra = xy_y_round_16_avx2(r0);
  const __m256i rb = xy_y_round_16_avx2(r1);
  xy_y_pack_store_32_avx2(ra, rb, dst);
}

static inline void convolve_store_32_avx2(const __m256i res0,
                                          const __m256i res1,
                                          uint8_t *const dst) {
  const __m256i d = _mm256_packus_epi16(res0, res1);
  _mm256_storeu_si256((__m256i *)dst, d);
}

static inline __m128i sr_x_round_sse2(const __m128i src) {
  const __m128i round = _mm_set1_epi16(34);
  const __m128i dst = _mm_add_epi16(src, round);
  return _mm_srai_epi16(dst, 6);
}

static inline __m256i sr_x_round_avx2(const __m256i src) {
  const __m256i round = _mm256_set1_epi16(34);
  const __m256i dst = _mm256_add_epi16(src, round);
  return _mm256_srai_epi16(dst, 6);
}

static inline __m128i sr_y_round_sse2(const __m128i src) {
  const __m128i round = _mm_set1_epi16(32);
  const __m128i dst = _mm_add_epi16(src, round);
  return _mm_srai_epi16(dst, FILTER_BITS - 1);
}

static inline void sr_x_round_store_8x2_avx2(const __m256i res,
                                             uint8_t *const dst,
                                             const ptrdiff_t dst_stride) {
  const __m256i r = sr_x_round_avx2(res);
  pack_store_8x2_avx2(r, dst, dst_stride);
}

static inline void sr_x_round_store_16x2_avx2(const __m256i res[2],
                                              uint8_t *const dst,
                                              const ptrdiff_t dst_stride) {
  __m256i r[2];

  r[0] = sr_x_round_avx2(res[0]);
  r[1] = sr_x_round_avx2(res[1]);
  pack_store_16x2_avx2(r[0], r[1], dst, dst_stride);
}

static inline void sr_x_round_store_32_avx2(const __m256i res[2],
                                            uint8_t *const dst) {
  __m256i r[2];

  r[0] = sr_x_round_avx2(res[0]);
  r[1] = sr_x_round_avx2(res[1]);
  convolve_store_32_avx2(r[0], r[1], dst);
}

static inline void sr_y_round_store_8x2_avx2(const __m256i res,
                                             uint8_t *const dst,
                                             const ptrdiff_t dst_stride) {
  const __m256i r = sr_y_round_avx2(res);
  pack_store_8x2_avx2(r, dst, dst_stride);
}

static inline void sr_y_round_store_16x2_avx2(const __m256i res[2],
                                              uint8_t *const dst,
                                              const ptrdiff_t dst_stride) {
  __m256i r[2];

  r[0] = sr_y_round_avx2(res[0]);
  r[1] = sr_y_round_avx2(res[1]);
  pack_store_16x2_avx2(r[0], r[1], dst, dst_stride);
}

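// 2-tap half-pel special case: with equal taps the filter reduces to
// (a + b + 1) >> 1, which is exactly what _mm256_avg_epu8() computes, so no
// multiply is needed.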
static inline void sr_y_2tap_32_avg_avx2(const uint8_t *const src,
                                         const __m256i s0, __m256i *const s1,
                                         uint8_t *const dst) {
  *s1 = _mm256_loadu_si256((__m256i *)src);
  const __m256i d = _mm256_avg_epu8(s0, *s1);
  _mm256_storeu_si256((__m256i *)dst, d);
}

static inline void sr_x_2tap_32_avg_avx2(const uint8_t *const src,
                                         uint8_t *const dst) {
  const __m256i s0 = _mm256_loadu_si256((__m256i *)src);
  const __m256i s1 = _mm256_loadu_si256((__m256i *)(src + 1));
  const __m256i d = _mm256_avg_epu8(s0, s1);
  _mm256_storeu_si256((__m256i *)dst, d);
}

static inline __m128i x_convolve_2tap_2x2_sse4_1(const uint8_t *const src,
                                                 const ptrdiff_t stride,
                                                 const __m128i coeffs[1]) {
  const __m128i sfl =
      _mm_setr_epi8(0, 1, 1, 2, 4, 5, 5, 6, 0, 0, 0, 0, 0, 0, 0, 0);
  const __m128i s_128 = load_u8_4x2_sse4_1(src, stride);
  const __m128i ss = _mm_shuffle_epi8(s_128, sfl);
  return convolve_2tap_ssse3(&ss, coeffs);
}

static inline __m128i x_convolve_2tap_4x2_ssse3(const uint8_t *const src,
                                                const ptrdiff_t stride,
                                                const __m128i coeffs[1]) {
  const __m128i sfl =
      _mm_setr_epi8(0, 1, 1, 2, 2, 3, 3, 4, 8, 9, 9, 10, 10, 11, 11, 12);
  const __m128i s_128 = load_u8_8x2_sse2(src, stride);
  const __m128i ss = _mm_shuffle_epi8(s_128, sfl);
  return convolve_2tap_ssse3(&ss, coeffs);
}

static inline void x_convolve_2tap_8x2_ssse3(const uint8_t *const src,
                                             const ptrdiff_t stride,
                                             const __m128i coeffs[1],
                                             __m128i r[2]) {
  __m128i ss[2];
  const __m128i s00 = _mm_loadu_si128((__m128i *)src);
  const __m128i s10 = _mm_loadu_si128((__m128i *)(src + stride));
  const __m128i s01 = _mm_srli_si128(s00, 1);
  const __m128i s11 = _mm_srli_si128(s10, 1);
  ss[0] = _mm_unpacklo_epi8(s00, s01);
  ss[1] = _mm_unpacklo_epi8(s10, s11);

  r[0] = convolve_2tap_ssse3(&ss[0], coeffs);
  r[1] = convolve_2tap_ssse3(&ss[1], coeffs);
}

static inline __m256i x_convolve_2tap_8x2_avx2(const uint8_t *const src,
                                               const ptrdiff_t stride,
                                               const __m256i coeffs[1]) {
  __m128i s_128[2][2];
  __m256i s_256[2];

  s_128[0][0] = _mm_loadu_si128((__m128i *)src);
  s_128[1][0] = _mm_loadu_si128((__m128i *)(src + stride));
  s_128[0][1] = _mm_srli_si128(s_128[0][0], 1);
  s_128[1][1] = _mm_srli_si128(s_128[1][0], 1);
  s_256[0] = _mm256_setr_m128i(s_128[0][0], s_128[1][0]);
  s_256[1] = _mm256_setr_m128i(s_128[0][1], s_128[1][1]);
  const __m256i ss = _mm256_unpacklo_epi8(s_256[0], s_256[1]);
  return convolve_2tap_avx2(&ss, coeffs);
}

static inline void x_convolve_2tap_16x2_avx2(const uint8_t *const src,
                                             const ptrdiff_t stride,
                                             const __m256i coeffs[1],
                                             __m256i r[2]) {
  const __m256i s0_256 = loadu_8bit_16x2_avx2(src, stride);
  const __m256i s1_256 = loadu_8bit_16x2_avx2(src + 1, stride);
  const __m256i s0 = _mm256_unpacklo_epi8(s0_256, s1_256);
  const __m256i s1 = _mm256_unpackhi_epi8(s0_256, s1_256);
  r[0] = convolve_2tap_avx2(&s0, coeffs);
  r[1] = convolve_2tap_avx2(&s1, coeffs);
}

static inline void x_convolve_2tap_32_avx2(const uint8_t *const src,
                                           const __m256i coeffs[1],
                                           __m256i r[2]) {
  const __m256i s0 = _mm256_loadu_si256((__m256i *)src);
  const __m256i s1 = _mm256_loadu_si256((__m256i *)(src + 1));
  const __m256i ss0 = _mm256_unpacklo_epi8(s0, s1);
  const __m256i ss1 = _mm256_unpackhi_epi8(s0, s1);

  r[0] = convolve_2tap_avx2(&ss0, coeffs);
  r[1] = convolve_2tap_avx2(&ss1, coeffs);
}

static inline __m128i x_convolve_4tap_2x2_ssse3(const uint8_t *const src,
                                                const ptrdiff_t stride,
                                                const __m128i coeffs[2]) {
  const __m128i sfl0 =
      _mm_setr_epi8(0, 1, 1, 2, 8, 9, 9, 10, 0, 0, 0, 0, 0, 0, 0, 0);
  const __m128i sfl1 =
      _mm_setr_epi8(2, 3, 3, 4, 10, 11, 11, 12, 0, 0, 0, 0, 0, 0, 0, 0);
  const __m128i s = load_u8_8x2_sse2(src, stride);
  __m128i ss[2];

  ss[0] = _mm_shuffle_epi8(s, sfl0);
  ss[1] = _mm_shuffle_epi8(s, sfl1);
  return convolve_4tap_ssse3(ss, coeffs);
}

static inline __m128i x_convolve_4tap_4x2_ssse3(const uint8_t *const src,
                                                const ptrdiff_t stride,
                                                const __m128i coeffs[2]) {
  const __m128i s = load_u8_8x2_sse2(src, stride);
  const __m128i sfl0 =
      _mm_setr_epi8(0, 1, 1, 2, 2, 3, 3, 4, 8, 9, 9, 10, 10, 11, 11, 12);
  const __m128i sfl1 =
      _mm_setr_epi8(2, 3, 3, 4, 4, 5, 5, 6, 10, 11, 11, 12, 12, 13, 13, 14);
  __m128i ss[2];

  ss[0] = _mm_shuffle_epi8(s, sfl0);
  ss[1] = _mm_shuffle_epi8(s, sfl1);
  return convolve_4tap_ssse3(ss, coeffs);
}

static inline __m256i x_convolve_4tap_8x2_avx2(const uint8_t *const src,
                                               const ptrdiff_t stride,
                                               const __m256i coeffs[2],
                                               const __m256i filt[2]) {
  const __m256i s_256 = loadu_8bit_16x2_avx2(src, stride);
  return x_convolve_4tap_avx2(s_256, coeffs, filt);
}

static inline void x_convolve_4tap_16x2_avx2(const uint8_t *const src,
                                             const int32_t src_stride,
                                             const __m256i coeffs[2],
                                             const __m256i filt[2],
                                             __m256i r[2]) {
  r[0] = x_convolve_4tap_8x2_avx2(src + 0, src_stride, coeffs, filt);
  r[1] = x_convolve_4tap_8x2_avx2(src + 8, src_stride, coeffs, filt);
}

static inline void x_convolve_4tap_32_avx2(const uint8_t *const src,
                                           const __m256i coeffs[2],
                                           const __m256i filt[2],
                                           __m256i r[2]) {
  const __m256i s0_256 = _mm256_loadu_si256((__m256i *)src);
  const __m256i s1_256 = _mm256_loadu_si256((__m256i *)(src + 8));

  r[0] = x_convolve_4tap_avx2(s0_256, coeffs, filt);
  r[1] = x_convolve_4tap_avx2(s1_256, coeffs, filt);
}

static inline __m128i x_convolve_6tap_2x2_ssse3(const uint8_t *const src,
                                                const ptrdiff_t stride,
                                                const __m128i coeffs[3]) {
  const __m128i sfl0 =
      _mm_setr_epi8(0, 1, 1, 2, 8, 9, 9, 10, 0, 0, 0, 0, 0, 0, 0, 0);
  const __m128i sfl1 =
      _mm_setr_epi8(2, 3, 3, 4, 10, 11, 11, 12, 0, 0, 0, 0, 0, 0, 0, 0);
  const __m128i sfl2 =
      _mm_setr_epi8(4, 5, 5, 6, 12, 13, 13, 14, 0, 0, 0, 0, 0, 0, 0, 0);

  const __m128i s = load_u8_8x2_sse2(src, stride);
  __m128i ss[3];

  ss[0] = _mm_shuffle_epi8(s, sfl0);
  ss[1] = _mm_shuffle_epi8(s, sfl1);
  ss[2] = _mm_shuffle_epi8(s, sfl2);
  return convolve_6tap_ssse3(ss, coeffs);
}

static inline __m128i x_convolve_6tap_4x2_ssse3(const uint8_t *const src,
                                                const ptrdiff_t stride,
                                                const __m128i coeffs[3]) {
  const __m128i s = load_u8_8x2_sse2(src, stride);
  const __m128i sfl0 =
      _mm_setr_epi8(0, 1, 1, 2, 8, 9, 9, 10, 0, 0, 0, 0, 0, 0, 0, 0);
  const __m128i sfl1 =
      _mm_setr_epi8(2, 3, 3, 4, 10, 11, 11, 12, 0, 0, 0, 0, 0, 0, 0, 0);
  const __m128i sfl2 =
      _mm_setr_epi8(4, 5, 5, 6, 12, 13, 13, 14, 0, 0, 0, 0, 0, 0, 0, 0);
  __m128i ss[3];

  ss[0] = _mm_shuffle_epi8(s, sfl0);
  ss[1] = _mm_shuffle_epi8(s, sfl1);
  ss[2] = _mm_shuffle_epi8(s, sfl2);
  return convolve_6tap_ssse3(ss, coeffs);
}

static inline __m256i x_convolve_6tap_8x2_avx2(const uint8_t *const src,
                                               const ptrdiff_t stride,
                                               const __m256i coeffs[3],
                                               const __m256i filt[3]) {
  const __m256i s_256 = loadu_8bit_16x2_avx2(src, stride);
  return x_convolve_6tap_avx2(s_256, coeffs, filt);
}

static inline void x_convolve_6tap_16x2_avx2(const uint8_t *const src,
                                             const int32_t src_stride,
                                             const __m256i coeffs[3],
                                             const __m256i filt[3],
                                             __m256i r[2]) {
  r[0] = x_convolve_6tap_8x2_avx2(src + 0, src_stride, coeffs, filt);
  r[1] = x_convolve_6tap_8x2_avx2(src + 8, src_stride, coeffs, filt);
}

static inline void x_convolve_6tap_32_avx2(const uint8_t *const src,
                                           const __m256i coeffs[3],
                                           const __m256i filt[3],
                                           __m256i r[2]) {
  const __m256i s0_256 = _mm256_loadu_si256((__m256i *)src);
  const __m256i s1_256 = _mm256_loadu_si256((__m256i *)(src + 8));

  r[0] = x_convolve_6tap_avx2(s0_256, coeffs, filt);
  r[1] = x_convolve_6tap_avx2(s1_256, coeffs, filt);
}

static inline __m256i x_convolve_8tap_8x2_avx2(const uint8_t *const src,
                                               const ptrdiff_t stride,
                                               const __m256i coeffs[4],
                                               const __m256i filt[4]) {
  const __m256i s_256 = loadu_8bit_16x2_avx2(src, stride);
  return x_convolve_8tap_avx2(s_256, coeffs, filt);
}

static AOM_FORCE_INLINE void x_convolve_8tap_16x2_avx2(const uint8_t *const src,
                                                       const int32_t src_stride,
                                                       const __m256i coeffs[4],
                                                       const __m256i filt[4],
                                                       __m256i r[2]) {
  r[0] = x_convolve_8tap_8x2_avx2(src + 0, src_stride, coeffs, filt);
  r[1] = x_convolve_8tap_8x2_avx2(src + 8, src_stride, coeffs, filt);
}

static AOM_FORCE_INLINE void x_convolve_8tap_32_avx2(const uint8_t *const src,
                                                     const __m256i coeffs[4],
                                                     const __m256i filt[4],
                                                     __m256i r[2]) {
  const __m256i s0_256 = _mm256_loadu_si256((__m256i *)src);
  const __m256i s1_256 = _mm256_loadu_si256((__m256i *)(src + 8));

  r[0] = x_convolve_8tap_avx2(s0_256, coeffs, filt);
  r[1] = x_convolve_8tap_avx2(s1_256, coeffs, filt);
}
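
// A minimal usage sketch for the horizontal 8-tap helpers above (illustrative
// only: the loop, the filt[] shuffle masks and all variable names are
// hypothetical and not part of this header; src is assumed to already point at
// the first tap):
//
//   __m256i coeffs[4], filt[4], res[2];
//   prepare_half_coeffs_8tap_avx2(filter_params_x, subpel_x_q4, coeffs);
//   // filt[0..3] hold the byte-shuffle masks consumed by
//   // x_convolve_8tap_avx2() (e.g. loaded from filt_global_avx2).
//   for (int y = 0; y < h; y++, src += src_stride, dst += dst_stride) {
//     x_convolve_8tap_32_avx2(src, coeffs, filt, res);
//     sr_x_round_store_32_avx2(res, dst);
//   }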
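// The y_convolve_* helpers below produce two output rows per call and keep a
// sliding window of previously loaded source rows in the caller-owned
// s_*/ss_*/tt_* arrays, so each call only loads the rows that newly enter the
// window.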
y_convolve_2tap_2x2_ssse3(const uint8_t * const src,const ptrdiff_t stride,const __m128i coeffs[1],__m128i s_16[2])1084 static inline __m128i y_convolve_2tap_2x2_ssse3(const uint8_t *const src,
1085 const ptrdiff_t stride,
1086 const __m128i coeffs[1],
1087 __m128i s_16[2]) {
1088 __m128i s_128[2];
1089
1090 s_16[1] = _mm_cvtsi32_si128(*(int16_t *)(src + stride));
1091 s_128[0] = _mm_unpacklo_epi16(s_16[0], s_16[1]);
1092 s_16[0] = _mm_cvtsi32_si128(*(int16_t *)(src + 2 * stride));
1093 s_128[1] = _mm_unpacklo_epi16(s_16[1], s_16[0]);
1094 const __m128i ss = _mm_unpacklo_epi8(s_128[0], s_128[1]);
1095 return convolve_2tap_ssse3(&ss, coeffs);
1096 }
1097
y_convolve_2tap_4x2_ssse3(const uint8_t * const src,const ptrdiff_t stride,const __m128i coeffs[1],__m128i s_32[2])1098 static inline __m128i y_convolve_2tap_4x2_ssse3(const uint8_t *const src,
1099 const ptrdiff_t stride,
1100 const __m128i coeffs[1],
1101 __m128i s_32[2]) {
1102 __m128i s_128[2];
1103
1104 s_32[1] = _mm_cvtsi32_si128(loadu_int32(src + stride));
1105 s_128[0] = _mm_unpacklo_epi32(s_32[0], s_32[1]);
1106 s_32[0] = _mm_cvtsi32_si128(loadu_int32(src + 2 * stride));
1107 s_128[1] = _mm_unpacklo_epi32(s_32[1], s_32[0]);
1108 const __m128i ss = _mm_unpacklo_epi8(s_128[0], s_128[1]);
1109 return convolve_2tap_ssse3(&ss, coeffs);
1110 }
1111
y_convolve_2tap_8x2_avx2(const uint8_t * const src,const ptrdiff_t stride,const __m256i coeffs[1],__m128i s_64[2])1112 static inline __m256i y_convolve_2tap_8x2_avx2(const uint8_t *const src,
1113 const ptrdiff_t stride,
1114 const __m256i coeffs[1],
1115 __m128i s_64[2]) {
1116 __m256i s_256[2];
1117
1118 s_64[1] = _mm_loadl_epi64((__m128i *)(src + stride));
1119 s_256[0] = _mm256_setr_m128i(s_64[0], s_64[1]);
1120 s_64[0] = _mm_loadl_epi64((__m128i *)(src + 2 * stride));
1121 s_256[1] = _mm256_setr_m128i(s_64[1], s_64[0]);
1122 const __m256i ss = _mm256_unpacklo_epi8(s_256[0], s_256[1]);
1123 return convolve_2tap_avx2(&ss, coeffs);
1124 }
1125
y_convolve_2tap_16x2_avx2(const uint8_t * const src,const ptrdiff_t stride,const __m256i coeffs[1],__m128i s_128[2],__m256i r[2])1126 static inline void y_convolve_2tap_16x2_avx2(const uint8_t *const src,
1127 const ptrdiff_t stride,
1128 const __m256i coeffs[1],
1129 __m128i s_128[2], __m256i r[2]) {
1130 __m256i s_256[2];
1131
1132 s_128[1] = _mm_loadu_si128((__m128i *)(src + stride));
1133 s_256[0] = _mm256_setr_m128i(s_128[0], s_128[1]);
1134 s_128[0] = _mm_loadu_si128((__m128i *)(src + 2 * stride));
1135 s_256[1] = _mm256_setr_m128i(s_128[1], s_128[0]);
1136 const __m256i ss0 = _mm256_unpacklo_epi8(s_256[0], s_256[1]);
1137 const __m256i ss1 = _mm256_unpackhi_epi8(s_256[0], s_256[1]);
1138 r[0] = convolve_2tap_avx2(&ss0, coeffs);
1139 r[1] = convolve_2tap_avx2(&ss1, coeffs);
1140 }
1141
y_convolve_2tap_32_avx2(const uint8_t * const src,const __m256i coeffs[1],const __m256i s0,__m256i * const s1,__m256i r[2])1142 static inline void y_convolve_2tap_32_avx2(const uint8_t *const src,
1143 const __m256i coeffs[1],
1144 const __m256i s0, __m256i *const s1,
1145 __m256i r[2]) {
1146 *s1 = _mm256_loadu_si256((__m256i *)src);
1147 const __m256i ss0 = _mm256_unpacklo_epi8(s0, *s1);
1148 const __m256i ss1 = _mm256_unpackhi_epi8(s0, *s1);
1149 r[0] = convolve_2tap_avx2(&ss0, coeffs);
1150 r[1] = convolve_2tap_avx2(&ss1, coeffs);
1151 }
1152
y_convolve_4tap_2x2_ssse3(const uint8_t * const src,const ptrdiff_t stride,const __m128i coeffs[2],__m128i s_16[4],__m128i ss_128[2])1153 static inline __m128i y_convolve_4tap_2x2_ssse3(const uint8_t *const src,
1154 const ptrdiff_t stride,
1155 const __m128i coeffs[2],
1156 __m128i s_16[4],
1157 __m128i ss_128[2]) {
1158 s_16[3] = _mm_cvtsi32_si128(loadu_int16(src + stride));
1159 const __m128i src23 = _mm_unpacklo_epi16(s_16[2], s_16[3]);
1160 s_16[2] = _mm_cvtsi32_si128(loadu_int16(src + 2 * stride));
1161 const __m128i src34 = _mm_unpacklo_epi16(s_16[3], s_16[2]);
1162 ss_128[1] = _mm_unpacklo_epi8(src23, src34);
1163 return convolve_4tap_ssse3(ss_128, coeffs);
1164 }
1165
y_convolve_4tap_4x2_ssse3(const uint8_t * const src,const ptrdiff_t stride,const __m128i coeffs[2],__m128i s_32[4],__m128i ss_128[2])1166 static inline __m128i y_convolve_4tap_4x2_ssse3(const uint8_t *const src,
1167 const ptrdiff_t stride,
1168 const __m128i coeffs[2],
1169 __m128i s_32[4],
1170 __m128i ss_128[2]) {
1171 s_32[3] = _mm_cvtsi32_si128(loadu_int32(src + stride));
1172 const __m128i src23 = _mm_unpacklo_epi32(s_32[2], s_32[3]);
1173 s_32[2] = _mm_cvtsi32_si128(loadu_int32(src + 2 * stride));
1174 const __m128i src34 = _mm_unpacklo_epi32(s_32[3], s_32[2]);
1175 ss_128[1] = _mm_unpacklo_epi8(src23, src34);
1176 return convolve_4tap_ssse3(ss_128, coeffs);
1177 }
1178
y_convolve_4tap_8x2_avx2(const uint8_t * const src,const ptrdiff_t stride,const __m256i coeffs[2],__m128i s_64[4],__m256i ss_256[2])1179 static inline __m256i y_convolve_4tap_8x2_avx2(const uint8_t *const src,
1180 const ptrdiff_t stride,
1181 const __m256i coeffs[2],
1182 __m128i s_64[4],
1183 __m256i ss_256[2]) {
1184 s_64[3] = _mm_loadl_epi64((__m128i *)(src + stride));
1185 const __m256i src23 = _mm256_setr_m128i(s_64[2], s_64[3]);
1186 s_64[2] = _mm_loadl_epi64((__m128i *)(src + 2 * stride));
1187 const __m256i src34 = _mm256_setr_m128i(s_64[3], s_64[2]);
1188 ss_256[1] = _mm256_unpacklo_epi8(src23, src34);
1189 return convolve_4tap_avx2(ss_256, coeffs);
1190 }
1191
y_convolve_4tap_16x2_avx2(const uint8_t * const src,const ptrdiff_t stride,const __m256i coeffs[2],__m128i s_128[4],__m256i ss_256[4],__m256i r[2])1192 static inline void y_convolve_4tap_16x2_avx2(const uint8_t *const src,
1193 const ptrdiff_t stride,
1194 const __m256i coeffs[2],
1195 __m128i s_128[4],
1196 __m256i ss_256[4], __m256i r[2]) {
1197 s_128[3] = _mm_loadu_si128((__m128i *)(src + stride));
1198 const __m256i src23 = _mm256_setr_m128i(s_128[2], s_128[3]);
1199 s_128[2] = _mm_loadu_si128((__m128i *)(src + 2 * stride));
1200 const __m256i src34 = _mm256_setr_m128i(s_128[3], s_128[2]);
1201 ss_256[1] = _mm256_unpacklo_epi8(src23, src34);
1202 ss_256[3] = _mm256_unpackhi_epi8(src23, src34);
1203 r[0] = convolve_4tap_avx2(ss_256, coeffs);
1204 r[1] = convolve_4tap_avx2(ss_256 + 2, coeffs);
1205 }
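
// Rolling-window pattern for the 4-tap vertical helpers: the s_* arrays keep
// the most recent source rows and the ss_* arrays keep their byte-interleaved
// pairs.  Each call loads two new rows, forms the two newest interleaved
// pairs, convolves two output rows, and leaves the freshest pair in place so
// the caller only has to shift ss_*[1] into ss_*[0] (and ss_*[3] into
// ss_*[2]) before the next iteration.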
1206
1207 static inline __m128i y_convolve_6tap_2x2_ssse3(const uint8_t *const src,
1208 const ptrdiff_t stride,
1209 const __m128i coeffs[3],
1210 __m128i s_16[6],
1211 __m128i ss_128[3]) {
1212 s_16[5] = _mm_cvtsi32_si128(loadu_int16(src + 3 * stride));
1213 const __m128i src45 = _mm_unpacklo_epi16(s_16[4], s_16[5]);
1214 s_16[4] = _mm_cvtsi32_si128(loadu_int16(src + 4 * stride));
1215 const __m128i src56 = _mm_unpacklo_epi16(s_16[5], s_16[4]);
1216 ss_128[2] = _mm_unpacklo_epi8(src45, src56);
1217 return convolve_6tap_ssse3(ss_128, coeffs);
1218 }
1219
1220 static inline void y_convolve_4tap_32x2_avx2(
1221 const uint8_t *const src, const ptrdiff_t stride, const __m256i coeffs[2],
1222 __m256i s_256[4], __m256i ss_256[4], __m256i tt_256[4], __m256i r[4]) {
1223 s_256[3] = _mm256_loadu_si256((__m256i *)(src + 1 * stride));
1224 ss_256[1] = _mm256_unpacklo_epi8(s_256[2], s_256[3]);
1225 ss_256[3] = _mm256_unpackhi_epi8(s_256[2], s_256[3]);
1226 s_256[2] = _mm256_loadu_si256((__m256i *)(src + 2 * stride));
1227 tt_256[1] = _mm256_unpacklo_epi8(s_256[3], s_256[2]);
1228 tt_256[3] = _mm256_unpackhi_epi8(s_256[3], s_256[2]);
1229 r[0] = convolve_4tap_avx2(ss_256 + 0, coeffs);
1230 r[1] = convolve_4tap_avx2(ss_256 + 2, coeffs);
1231 r[2] = convolve_4tap_avx2(tt_256 + 0, coeffs);
1232 r[3] = convolve_4tap_avx2(tt_256 + 2, coeffs);
1233 }
1234
1235 static inline __m128i y_convolve_6tap_4x2_ssse3(const uint8_t *const src,
1236 const ptrdiff_t stride,
1237 const __m128i coeffs[3],
1238 __m128i s_32[6],
1239 __m128i ss_128[3]) {
1240 s_32[5] = _mm_cvtsi32_si128(loadu_int32(src + 3 * stride));
1241 const __m128i src45 = _mm_unpacklo_epi32(s_32[4], s_32[5]);
1242 s_32[4] = _mm_cvtsi32_si128(loadu_int32(src + 4 * stride));
1243 const __m128i src56 = _mm_unpacklo_epi32(s_32[5], s_32[4]);
1244 ss_128[2] = _mm_unpacklo_epi8(src45, src56);
1245 return convolve_6tap_ssse3(ss_128, coeffs);
1246 }
1247
1248 static inline __m256i y_convolve_6tap_8x2_avx2(const uint8_t *const src,
1249 const ptrdiff_t stride,
1250 const __m256i coeffs[3],
1251 __m128i s_64[6],
1252 __m256i ss_256[3]) {
1253 s_64[5] = _mm_loadl_epi64((__m128i *)(src + 3 * stride));
1254 const __m256i src45 = _mm256_setr_m128i(s_64[4], s_64[5]);
1255 s_64[4] = _mm_loadl_epi64((__m128i *)(src + 4 * stride));
1256 const __m256i src56 = _mm256_setr_m128i(s_64[5], s_64[4]);
1257 ss_256[2] = _mm256_unpacklo_epi8(src45, src56);
1258 return convolve_6tap_avx2(ss_256, coeffs);
1259 }
1260
1261 static inline void y_convolve_6tap_16x2_avx2(const uint8_t *const src,
1262 const ptrdiff_t stride,
1263 const __m256i coeffs[3],
1264 __m128i s_128[6],
1265 __m256i ss_256[6], __m256i r[2]) {
1266 s_128[5] = _mm_loadu_si128((__m128i *)(src + 3 * stride));
1267 const __m256i src45 = _mm256_setr_m128i(s_128[4], s_128[5]);
1268 s_128[4] = _mm_loadu_si128((__m128i *)(src + 4 * stride));
1269 const __m256i src56 = _mm256_setr_m128i(s_128[5], s_128[4]);
1270 ss_256[2] = _mm256_unpacklo_epi8(src45, src56);
1271 ss_256[5] = _mm256_unpackhi_epi8(src45, src56);
1272 r[0] = convolve_6tap_avx2(ss_256, coeffs);
1273 r[1] = convolve_6tap_avx2(ss_256 + 3, coeffs);
1274 }
1275
1276 static inline void y_convolve_6tap_32x2_avx2(
1277 const uint8_t *const src, const ptrdiff_t stride, const __m256i coeffs[3],
1278 __m256i s_256[6], __m256i ss_256[6], __m256i tt_256[6], __m256i r[4]) {
1279 s_256[5] = _mm256_loadu_si256((__m256i *)(src + 3 * stride));
1280 ss_256[2] = _mm256_unpacklo_epi8(s_256[4], s_256[5]);
1281 ss_256[5] = _mm256_unpackhi_epi8(s_256[4], s_256[5]);
1282 s_256[4] = _mm256_loadu_si256((__m256i *)(src + 4 * stride));
1283 tt_256[2] = _mm256_unpacklo_epi8(s_256[5], s_256[4]);
1284 tt_256[5] = _mm256_unpackhi_epi8(s_256[5], s_256[4]);
1285 r[0] = convolve_6tap_avx2(ss_256 + 0, coeffs);
1286 r[1] = convolve_6tap_avx2(ss_256 + 3, coeffs);
1287 r[2] = convolve_6tap_avx2(tt_256 + 0, coeffs);
1288 r[3] = convolve_6tap_avx2(tt_256 + 3, coeffs);
1289 }
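
// For the 32-wide 6-tap path, ss_256[0..2] holds the low-half interleaves and
// ss_256[3..5] the high-half interleaves of the current row window, while
// tt_256[] holds the same window advanced by one row.  One call therefore
// yields the four partial-sum vectors needed for two full 32-pixel output
// rows; the caller rotates both windows afterwards.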
1290
1291 static inline __m128i y_convolve_8tap_2x2_ssse3(const uint8_t *const src,
1292 const ptrdiff_t stride,
1293 const __m128i coeffs[4],
1294 __m128i s_16[8],
1295 __m128i ss_128[4]) {
1296 s_16[7] = _mm_cvtsi32_si128(loadu_int16(src + 7 * stride));
1297 const __m128i src67 = _mm_unpacklo_epi16(s_16[6], s_16[7]);
1298 s_16[6] = _mm_cvtsi32_si128(loadu_int16(src + 8 * stride));
1299 const __m128i src78 = _mm_unpacklo_epi16(s_16[7], s_16[6]);
1300 ss_128[3] = _mm_unpacklo_epi8(src67, src78);
1301 return convolve_8tap_ssse3(ss_128, coeffs);
1302 }
1303
1304 static inline __m128i y_convolve_8tap_4x2_ssse3(const uint8_t *const src,
1305 const ptrdiff_t stride,
1306 const __m128i coeffs[4],
1307 __m128i s_32[8],
1308 __m128i ss_128[4]) {
1309 s_32[7] = _mm_cvtsi32_si128(loadu_int32(src + 7 * stride));
1310 const __m128i src67 = _mm_unpacklo_epi32(s_32[6], s_32[7]);
1311 s_32[6] = _mm_cvtsi32_si128(loadu_int32(src + 8 * stride));
1312 const __m128i src78 = _mm_unpacklo_epi32(s_32[7], s_32[6]);
1313 ss_128[3] = _mm_unpacklo_epi8(src67, src78);
1314 return convolve_8tap_ssse3(ss_128, coeffs);
1315 }
1316
1317 static inline __m256i y_convolve_8tap_8x2_avx2(const uint8_t *const src,
1318 const ptrdiff_t stride,
1319 const __m256i coeffs[4],
1320 __m128i s_64[8],
1321 __m256i ss_256[4]) {
1322 s_64[7] = _mm_loadl_epi64((__m128i *)(src + 7 * stride));
1323 const __m256i src67 = _mm256_setr_m128i(s_64[6], s_64[7]);
1324 s_64[6] = _mm_loadl_epi64((__m128i *)(src + 8 * stride));
1325 const __m256i src78 = _mm256_setr_m128i(s_64[7], s_64[6]);
1326 ss_256[3] = _mm256_unpacklo_epi8(src67, src78);
1327 return convolve_8tap_avx2(ss_256, coeffs);
1328 }
1329
1330 static inline void y_convolve_8tap_16x2_avx2(const uint8_t *const src,
1331 const ptrdiff_t stride,
1332 const __m256i coeffs[4],
1333 __m128i s_128[8],
1334 __m256i ss_256[8], __m256i r[2]) {
1335 s_128[7] = _mm_loadu_si128((__m128i *)(src + 7 * stride));
1336 const __m256i src67 = _mm256_setr_m128i(s_128[6], s_128[7]);
1337 s_128[6] = _mm_loadu_si128((__m128i *)(src + 8 * stride));
1338 const __m256i src78 = _mm256_setr_m128i(s_128[7], s_128[6]);
1339 ss_256[3] = _mm256_unpacklo_epi8(src67, src78);
1340 ss_256[7] = _mm256_unpackhi_epi8(src67, src78);
1341 r[0] = convolve_8tap_avx2(ss_256, coeffs);
1342 r[1] = convolve_8tap_avx2(ss_256 + 4, coeffs);
1343 }
1344
1345 static inline void y_convolve_8tap_32x2_avx2(
1346 const uint8_t *const src, const ptrdiff_t stride, const __m256i coeffs[4],
1347 __m256i s_256[8], __m256i ss_256[8], __m256i tt_256[8], __m256i r[4]) {
1348 s_256[7] = _mm256_loadu_si256((__m256i *)(src + 7 * stride));
1349 ss_256[3] = _mm256_unpacklo_epi8(s_256[6], s_256[7]);
1350 ss_256[7] = _mm256_unpackhi_epi8(s_256[6], s_256[7]);
1351 s_256[6] = _mm256_loadu_si256((__m256i *)(src + 8 * stride));
1352 tt_256[3] = _mm256_unpacklo_epi8(s_256[7], s_256[6]);
1353 tt_256[7] = _mm256_unpackhi_epi8(s_256[7], s_256[6]);
1354 r[0] = convolve_8tap_avx2(ss_256 + 0, coeffs);
1355 r[1] = convolve_8tap_avx2(ss_256 + 4, coeffs);
1356 r[2] = convolve_8tap_avx2(tt_256 + 0, coeffs);
1357 r[3] = convolve_8tap_avx2(tt_256 + 4, coeffs);
1358 }
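
// The 8-tap variants are the same scheme stretched to an eight-row window:
// four interleaved row pairs per half, with only the newest pair recomputed
// per call and the older pairs shifted down by the caller.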
1359
1360 static inline void xy_x_convolve_2tap_32_avx2(const uint8_t *const src,
1361 const __m256i coeffs[1],
1362 __m256i r[2]) {
1363 const __m256i s0 = _mm256_loadu_si256((__m256i *)src);
1364 const __m256i s1 = _mm256_loadu_si256((__m256i *)(src + 1));
1365 const __m256i ss0 = _mm256_unpacklo_epi8(s0, s1);
1366 const __m256i ss1 = _mm256_unpackhi_epi8(s0, s1);
1367
1368 r[0] = convolve_2tap_avx2(&ss0, coeffs);
1369 r[1] = convolve_2tap_avx2(&ss1, coeffs);
1370 }
1371
1372 static inline void xy_x_2tap_32_avx2(const uint8_t *const src,
1373 const __m256i coeffs[1],
1374 int16_t *const dst) {
1375 __m256i r[2];
1376
1377 xy_x_convolve_2tap_32_avx2(src, coeffs, r);
1378 const __m256i d0 = xy_x_round_avx2(r[0]);
1379 const __m256i d1 = xy_x_round_avx2(r[1]);
1380 _mm256_storeu_si256((__m256i *)dst, d0);
1381 _mm256_storeu_si256((__m256i *)(dst + 16), d1);
1382 }
1383
1384 static inline void xy_x_4tap_32_avx2(const uint8_t *const src,
1385 const __m256i coeffs[2],
1386 const __m256i filt[2],
1387 int16_t *const dst) {
1388 __m256i r[2];
1389
1390 x_convolve_4tap_32_avx2(src, coeffs, filt, r);
1391 const __m256i d0 = xy_x_round_avx2(r[0]);
1392 const __m256i d1 = xy_x_round_avx2(r[1]);
1393 _mm256_storeu_si256((__m256i *)dst, d0);
1394 _mm256_storeu_si256((__m256i *)(dst + 16), d1);
1395 }
1396
1397 static inline void xy_x_6tap_32_avx2(const uint8_t *const src,
1398 const __m256i coeffs[3],
1399 const __m256i filt[3],
1400 int16_t *const dst) {
1401 __m256i r[2];
1402
1403 x_convolve_6tap_32_avx2(src, coeffs, filt, r);
1404 const __m256i d0 = xy_x_round_avx2(r[0]);
1405 const __m256i d1 = xy_x_round_avx2(r[1]);
1406 _mm256_storeu_si256((__m256i *)dst, d0);
1407 _mm256_storeu_si256((__m256i *)(dst + 16), d1);
1408 }
1409
1410 static inline void xy_x_8tap_32_avx2(const uint8_t *const src,
1411 const __m256i coeffs[4],
1412 const __m256i filt[4],
1413 int16_t *const dst) {
1414 __m256i r[2];
1415
1416 x_convolve_8tap_32_avx2(src, coeffs, filt, r);
1417 const __m256i d0 = xy_x_round_avx2(r[0]);
1418 const __m256i d1 = xy_x_round_avx2(r[1]);
1419 _mm256_storeu_si256((__m256i *)dst, d0);
1420 _mm256_storeu_si256((__m256i *)(dst + 16), d1);
1421 }
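
// The xy_x_* helpers implement the horizontal stage of the separable 2-D
// (xy) filter.  Conceptually, each intermediate sample is
//
//   im[y][x] = round_x(sum_k h[k] * src[y][x + k - offset])
//
// (the `im` name and `offset` are illustrative, not identifiers from this
// file).  Each call produces 32 int16_t intermediates and stores them to the
// scratch row pointed to by `dst`; the xy_y_* helpers below then apply the
// vertical taps to that 16-bit data.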
1422
1423 static inline __m128i xy_y_convolve_2tap_2x2_sse2(const int16_t *const src,
1424 __m128i s_32[2],
1425 const __m128i coeffs[1]) {
1426 __m128i s_128[2];
1427
1428 s_32[1] = _mm_cvtsi32_si128(loadu_int32(src + 2));
1429 s_128[0] = _mm_unpacklo_epi32(s_32[0], s_32[1]);
1430 s_32[0] = _mm_cvtsi32_si128(loadu_int32(src + 2 * 2));
1431 s_128[1] = _mm_unpacklo_epi32(s_32[1], s_32[0]);
1432 const __m128i ss = _mm_unpacklo_epi16(s_128[0], s_128[1]);
1433 return convolve16_2tap_sse2(&ss, coeffs);
1434 }
1435
1436 static inline __m128i xy_y_convolve_2tap_2x2_half_pel_sse2(
1437 const int16_t *const src, __m128i s_32[2]) {
1438 __m128i s_128[2];
1439
1440 s_32[1] = _mm_cvtsi32_si128(loadu_int32(src + 2));
1441 s_128[0] = _mm_unpacklo_epi32(s_32[0], s_32[1]);
1442 s_32[0] = _mm_cvtsi32_si128(loadu_int32(src + 2 * 2));
1443 s_128[1] = _mm_unpacklo_epi32(s_32[1], s_32[0]);
1444 return _mm_add_epi16(s_128[0], s_128[1]);
1445 }
1446
1447 static inline void xy_y_convolve_2tap_4x2_sse2(const int16_t *const src,
1448 __m128i s_64[2],
1449 const __m128i coeffs[1],
1450 __m128i r[2]) {
1451 __m128i s_128[2];
1452
1453 s_64[1] = _mm_loadl_epi64((__m128i *)(src + 4));
1454 s_128[0] = _mm_unpacklo_epi64(s_64[0], s_64[1]);
1455 s_64[0] = _mm_loadl_epi64((__m128i *)(src + 2 * 4));
1456 s_128[1] = _mm_unpacklo_epi64(s_64[1], s_64[0]);
1457 const __m128i ss0 = _mm_unpacklo_epi16(s_128[0], s_128[1]);
1458 const __m128i ss1 = _mm_unpackhi_epi16(s_128[0], s_128[1]);
1459 r[0] = convolve16_2tap_sse2(&ss0, coeffs);
1460 r[1] = convolve16_2tap_sse2(&ss1, coeffs);
1461 }
1462
1463 static inline __m128i xy_y_convolve_2tap_4x2_half_pel_sse2(
1464 const int16_t *const src, __m128i s_64[2]) {
1465 __m128i s_128[2];
1466
1467 s_64[1] = _mm_loadl_epi64((__m128i *)(src + 4));
1468 s_128[0] = _mm_unpacklo_epi64(s_64[0], s_64[1]);
1469 s_64[0] = _mm_loadl_epi64((__m128i *)(src + 2 * 4));
1470 s_128[1] = _mm_unpacklo_epi64(s_64[1], s_64[0]);
1471 return _mm_add_epi16(s_128[0], s_128[1]);
1472 }
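
// The *_half_pel variants rely on the half-pel position having two equal
// taps: instead of multiplying, they simply add the two contributing rows and
// leave the matching half-pel rounding to the caller, saving a multiply per
// output vector.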
1473
1474 static inline void xy_y_convolve_2tap_16_avx2(const __m256i s0,
1475 const __m256i s1,
1476 const __m256i coeffs[1],
1477 __m256i r[2]) {
1478 const __m256i ss0 = _mm256_unpacklo_epi16(s0, s1);
1479 const __m256i ss1 = _mm256_unpackhi_epi16(s0, s1);
1480 r[0] = convolve16_2tap_avx2(&ss0, coeffs);
1481 r[1] = convolve16_2tap_avx2(&ss1, coeffs);
1482 }
1483
1484 static inline void xy_y_convolve_2tap_8x2_avx2(const int16_t *const src,
1485 __m128i s_128[2],
1486 const __m256i coeffs[1],
1487 __m256i r[2]) {
1488 __m256i s_256[2];
1489 s_128[1] = _mm_loadu_si128((__m128i *)(src + 8));
1490 s_256[0] = _mm256_setr_m128i(s_128[0], s_128[1]);
1491 s_128[0] = _mm_loadu_si128((__m128i *)(src + 2 * 8));
1492 s_256[1] = _mm256_setr_m128i(s_128[1], s_128[0]);
1493 xy_y_convolve_2tap_16_avx2(s_256[0], s_256[1], coeffs, r);
1494 }
1495
1496 static inline __m256i xy_y_convolve_2tap_8x2_half_pel_avx2(
1497 const int16_t *const src, __m128i s_128[2]) {
1498 __m256i s_256[2];
1499 s_128[1] = _mm_loadu_si128((__m128i *)(src + 8));
1500 s_256[0] = _mm256_setr_m128i(s_128[0], s_128[1]);
1501 s_128[0] = _mm_loadu_si128((__m128i *)(src + 2 * 8));
1502 s_256[1] = _mm256_setr_m128i(s_128[1], s_128[0]);
1503 return _mm256_add_epi16(s_256[0], s_256[1]);
1504 }
1505
1506 static inline void xy_y_convolve_2tap_16x2_half_pel_avx2(
1507 const int16_t *const src, __m256i s_256[2], __m256i r[2]) {
1508 s_256[1] = _mm256_loadu_si256((__m256i *)(src + 16));
1509 r[0] = _mm256_add_epi16(s_256[0], s_256[1]);
1510 s_256[0] = _mm256_loadu_si256((__m256i *)(src + 2 * 16));
1511 r[1] = _mm256_add_epi16(s_256[1], s_256[0]);
1512 }
1513
1514 static inline void xy_y_store_16x2_avx2(const __m256i r[2], uint8_t *const dst,
1515 const ptrdiff_t stride) {
1516 const __m256i t = _mm256_packus_epi16(r[0], r[1]);
1517 const __m256i d = _mm256_permute4x64_epi64(t, 0xD8);
1518 storeu_u8_16x2_avx2(d, dst, stride);
1519 }
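
// Note on the store above: _mm256_packus_epi16 narrows within each 128-bit
// lane, so after packing, each lane holds half of both output rows.  The
// _mm256_permute4x64_epi64(t, 0xD8) shuffle regroups the 64-bit quads so that
// the low lane holds all 16 bytes of the first row and the high lane all 16
// bytes of the second, ready for storeu_u8_16x2_avx2.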
1520
1521 static inline void xy_y_convolve_2tap_16x2_avx2(const int16_t *const src,
1522 __m256i s[2],
1523 const __m256i coeffs[1],
1524 __m256i r[4]) {
1525 s[1] = _mm256_loadu_si256((__m256i *)(src + 16));
1526 xy_y_convolve_2tap_16_avx2(s[0], s[1], coeffs, r + 0);
1527 s[0] = _mm256_loadu_si256((__m256i *)(src + 2 * 16));
1528 xy_y_convolve_2tap_16_avx2(s[1], s[0], coeffs, r + 2);
1529 }
1530
1531 static inline void xy_y_convolve_2tap_32_avx2(const int16_t *const src,
1532 const __m256i s0[2],
1533 __m256i s1[2],
1534 const __m256i coeffs[1],
1535 __m256i r[4]) {
1536 s1[0] = _mm256_loadu_si256((__m256i *)src);
1537 s1[1] = _mm256_loadu_si256((__m256i *)(src + 16));
1538 xy_y_convolve_2tap_16_avx2(s0[0], s1[0], coeffs, r + 0);
1539 xy_y_convolve_2tap_16_avx2(s0[1], s1[1], coeffs, r + 2);
1540 }
1541
1542 static inline void xy_y_convolve_2tap_32_all_avx2(const int16_t *const src,
1543 const __m256i s0[2],
1544 __m256i s1[2],
1545 const __m256i coeffs[1],
1546 uint8_t *const dst) {
1547 __m256i r[4];
1548
1549 xy_y_convolve_2tap_32_avx2(src, s0, s1, coeffs, r);
1550 xy_y_round_store_32_avx2(r + 0, r + 2, dst);
1551 }
1552
1553 static inline void xy_y_convolve_2tap_half_pel_32_avx2(const int16_t *const src,
1554 const __m256i s0[2],
1555 __m256i s1[2],
1556 __m256i r[2]) {
1557 s1[0] = _mm256_loadu_si256((__m256i *)src);
1558 s1[1] = _mm256_loadu_si256((__m256i *)(src + 16));
1559 r[0] = _mm256_add_epi16(s0[0], s1[0]);
1560 r[1] = _mm256_add_epi16(s0[1], s1[1]);
1561 }
1562
1563 static inline void xy_y_convolve_2tap_half_pel_32_all_avx2(
1564 const int16_t *const src, const __m256i s0[2], __m256i s1[2],
1565 uint8_t *const dst) {
1566 __m256i r[2];
1567
1568 xy_y_convolve_2tap_half_pel_32_avx2(src, s0, s1, r);
1569 r[0] = xy_y_round_half_pel_avx2(r[0]);
1570 r[1] = xy_y_round_half_pel_avx2(r[1]);
1571 xy_y_pack_store_32_avx2(r[0], r[1], dst);
1572 }
1573
1574 static inline __m128i xy_y_convolve_4tap_2x2_sse2(const int16_t *const src,
1575 __m128i s_32[4],
1576 __m128i ss_128[2],
1577 const __m128i coeffs[2]) {
1578 s_32[3] = _mm_cvtsi32_si128(loadu_int32(src + 3 * 2));
1579 const __m128i src23 = _mm_unpacklo_epi32(s_32[2], s_32[3]);
1580 s_32[2] = _mm_cvtsi32_si128(loadu_int32(src + 4 * 2));
1581 const __m128i src34 = _mm_unpacklo_epi32(s_32[3], s_32[2]);
1582 ss_128[1] = _mm_unpacklo_epi16(src23, src34);
1583 const __m128i r = convolve16_4tap_sse2(ss_128, coeffs);
1584 ss_128[0] = ss_128[1];
1585 return r;
1586 }
1587
1588 static inline __m256i xy_y_convolve_4tap_4x2_avx2(const int16_t *const src,
1589 __m128i s_64[4],
1590 __m256i ss_256[2],
1591 const __m256i coeffs[2]) {
1592 __m256i s_256[2];
1593 s_64[3] = _mm_loadl_epi64((__m128i *)(src + 3 * 4));
1594 s_256[0] = _mm256_setr_m128i(s_64[2], s_64[3]);
1595 s_64[2] = _mm_loadl_epi64((__m128i *)(src + 4 * 4));
1596 s_256[1] = _mm256_setr_m128i(s_64[3], s_64[2]);
1597 ss_256[1] = _mm256_unpacklo_epi16(s_256[0], s_256[1]);
1598 const __m256i r = convolve16_4tap_avx2(ss_256, coeffs);
1599 ss_256[0] = ss_256[1];
1600 return r;
1601 }
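
// The xy_y_* helpers form the vertical stage of the 2-D filter.  They run on
// the int16_t intermediates produced by xy_x_*, so row pairs are interleaved
// at 16-bit granularity and convolved with the convolve16_* multiply-add
// kernels; as in the vertical-only path, each call computes only the newest
// interleaved pair and slides the window down by one entry.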
1602
1603 static inline void xy_y_convolve_4tap_16_avx2(const __m256i *const ss,
1604 const __m256i coeffs[2],
1605 __m256i r[2]) {
1606 r[0] = convolve16_4tap_avx2(ss, coeffs);
1607 r[1] = convolve16_4tap_avx2(ss + 2, coeffs);
1608 }
1609
1610 static inline void xy_y_convolve_4tap_8x2_avx2(const int16_t *const src,
1611 __m256i ss_256[4],
1612 const __m256i coeffs[2],
1613 __m256i r[2]) {
1614 __m256i s_256[2];
1615 s_256[0] = _mm256_loadu_si256((__m256i *)(src + 2 * 8));
1616 s_256[1] = _mm256_loadu_si256((__m256i *)(src + 3 * 8));
1617 ss_256[1] = _mm256_unpacklo_epi16(s_256[0], s_256[1]);
1618 ss_256[3] = _mm256_unpackhi_epi16(s_256[0], s_256[1]);
1619 xy_y_convolve_4tap_16_avx2(ss_256, coeffs, r);
1620 ss_256[0] = ss_256[1];
1621 ss_256[2] = ss_256[3];
1622 }
1623
1624 static inline void xy_y_convolve_4tap_8x2_half_pel_avx2(
1625 const int16_t *const src, const __m256i coeffs[1], __m256i s_256[4],
1626 __m256i r[2]) {
1627 __m256i a_256[2];
1628 s_256[2] = _mm256_loadu_si256((__m256i *)(src + 2 * 8));
1629 s_256[3] = _mm256_loadu_si256((__m256i *)(src + 3 * 8));
1630 a_256[0] = _mm256_add_epi16(s_256[0], s_256[3]);
1631 a_256[1] = _mm256_add_epi16(s_256[1], s_256[2]);
1632 xy_y_convolve_2tap_16_avx2(a_256[0], a_256[1], coeffs, r);
1633 s_256[0] = s_256[2];
1634 s_256[1] = s_256[3];
1635 }
1636
1637 static inline void xy_y_convolve_4tap_16x2_avx2(
1638 const int16_t *const src, __m256i s_256[4], __m256i ss_256[4],
1639 __m256i tt_256[4], const __m256i coeffs[2], __m256i r[4]) {
1640 s_256[3] = _mm256_loadu_si256((__m256i *)(src + 3 * 16));
1641 ss_256[1] = _mm256_unpacklo_epi16(s_256[2], s_256[3]);
1642 ss_256[3] = _mm256_unpackhi_epi16(s_256[2], s_256[3]);
1643 s_256[2] = _mm256_loadu_si256((__m256i *)(src + 4 * 16));
1644 tt_256[1] = _mm256_unpacklo_epi16(s_256[3], s_256[2]);
1645 tt_256[3] = _mm256_unpackhi_epi16(s_256[3], s_256[2]);
1646 xy_y_convolve_4tap_16_avx2(ss_256, coeffs, r + 0);
1647 xy_y_convolve_4tap_16_avx2(tt_256, coeffs, r + 2);
1648 ss_256[0] = ss_256[1];
1649 ss_256[2] = ss_256[3];
1650 tt_256[0] = tt_256[1];
1651 tt_256[2] = tt_256[3];
1652 }
1653
1654 static inline void xy_y_convolve_4tap_32x2_avx2(
1655 const int16_t *const src, const ptrdiff_t stride, __m256i s_256[4],
1656 __m256i ss_256[4], __m256i tt_256[4], const __m256i coeffs[2],
1657 __m256i r[4]) {
1658 s_256[3] = _mm256_loadu_si256((__m256i *)(src + 3 * stride));
1659 ss_256[1] = _mm256_unpacklo_epi16(s_256[2], s_256[3]);
1660 ss_256[3] = _mm256_unpackhi_epi16(s_256[2], s_256[3]);
1661 s_256[2] = _mm256_loadu_si256((__m256i *)(src + 4 * stride));
1662 tt_256[1] = _mm256_unpacklo_epi16(s_256[3], s_256[2]);
1663 tt_256[3] = _mm256_unpackhi_epi16(s_256[3], s_256[2]);
1664 xy_y_convolve_4tap_16_avx2(ss_256, coeffs, r + 0);
1665 xy_y_convolve_4tap_16_avx2(tt_256, coeffs, r + 2);
1666 ss_256[0] = ss_256[1];
1667 ss_256[2] = ss_256[3];
1668 tt_256[0] = tt_256[1];
1669 tt_256[2] = tt_256[3];
1670 }
1671
1672 static inline void xy_y_convolve_4tap_16x2_half_pelavx2(
1673 const int16_t *const src, __m256i s_256[5], const __m256i coeffs[1],
1674 __m256i r[4]) {
1675 __m256i a_256[2];
1676
1677 s_256[3] = _mm256_loadu_si256((__m256i *)(src + 3 * 16));
1678 s_256[4] = _mm256_loadu_si256((__m256i *)(src + 4 * 16));
1679
1680 a_256[0] = _mm256_add_epi16(s_256[0], s_256[3]);
1681 a_256[1] = _mm256_add_epi16(s_256[1], s_256[2]);
1682 xy_y_convolve_2tap_16_avx2(a_256[0], a_256[1], coeffs, r + 0);
1683
1684 a_256[0] = _mm256_add_epi16(s_256[1], s_256[4]);
1685 a_256[1] = _mm256_add_epi16(s_256[2], s_256[3]);
1686 xy_y_convolve_2tap_16_avx2(a_256[0], a_256[1], coeffs, r + 2);
1687
1688 s_256[0] = s_256[2];
1689 s_256[1] = s_256[3];
1690 s_256[2] = s_256[4];
1691 }
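
// Half-pel folding: a symmetric kernel satisfies h[k] == h[taps - 1 - k], so
// rows that are mirror images about the filter centre can be added before
// multiplying.  The 4-tap half-pel helpers above fold the window this way and
// finish with a 2-tap multiply on the folded sums, halving the multiply count.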
1692
1693 static inline __m128i xy_y_convolve_6tap_2x2_sse2(const int16_t *const src,
1694 __m128i s_32[6],
1695 __m128i ss_128[3],
1696 const __m128i coeffs[3]) {
1697 s_32[5] = _mm_cvtsi32_si128(loadu_int32(src + 5 * 2));
1698 const __m128i src45 = _mm_unpacklo_epi32(s_32[4], s_32[5]);
1699 s_32[4] = _mm_cvtsi32_si128(loadu_int32(src + 6 * 2));
1700 const __m128i src56 = _mm_unpacklo_epi32(s_32[5], s_32[4]);
1701 ss_128[2] = _mm_unpacklo_epi16(src45, src56);
1702 const __m128i r = convolve16_6tap_sse2(ss_128, coeffs);
1703 ss_128[0] = ss_128[1];
1704 ss_128[1] = ss_128[2];
1705 return r;
1706 }
1707
1708 static inline __m256i xy_y_convolve_6tap_4x2_avx2(const int16_t *const src,
1709 __m128i s_64[6],
1710 __m256i ss_256[3],
1711 const __m256i coeffs[3]) {
1712 __m256i s_256[2];
1713 s_64[5] = _mm_loadl_epi64((__m128i *)(src + 5 * 4));
1714 s_256[0] = _mm256_setr_m128i(s_64[4], s_64[5]);
1715 s_64[4] = _mm_loadl_epi64((__m128i *)(src + 6 * 4));
1716 s_256[1] = _mm256_setr_m128i(s_64[5], s_64[4]);
1717 ss_256[2] = _mm256_unpacklo_epi16(s_256[0], s_256[1]);
1718 const __m256i r = convolve16_6tap_avx2(ss_256, coeffs);
1719 ss_256[0] = ss_256[1];
1720 ss_256[1] = ss_256[2];
1721 return r;
1722 }
1723
1724 static inline void xy_y_convolve_6tap_16_avx2(const __m256i ss[6],
1725 const __m256i coeffs[3],
1726 __m256i r[2]) {
1727 r[0] = convolve16_6tap_avx2(ss, coeffs);
1728 r[1] = convolve16_6tap_avx2(ss + 3, coeffs);
1729 }
1730
1731 static inline void xy_y_convolve_6tap_8x2_avx2(const int16_t *const src,
1732 __m256i ss_256[6],
1733 const __m256i coeffs[3],
1734 __m256i r[2]) {
1735 __m256i s_256[2];
1736 s_256[0] = _mm256_loadu_si256((__m256i *)(src + 4 * 8));
1737 s_256[1] = _mm256_loadu_si256((__m256i *)(src + 5 * 8));
1738 ss_256[2] = _mm256_unpacklo_epi16(s_256[0], s_256[1]);
1739 ss_256[5] = _mm256_unpackhi_epi16(s_256[0], s_256[1]);
1740 xy_y_convolve_6tap_16_avx2(ss_256, coeffs, r);
1741 ss_256[0] = ss_256[1];
1742 ss_256[1] = ss_256[2];
1743 ss_256[3] = ss_256[4];
1744 ss_256[4] = ss_256[5];
1745 }
1746
1747 static inline void xy_y_convolve_6tap_8x2_half_pel_avx2(
1748 const int16_t *const src, const __m256i coeffs[2], __m256i s_256[6],
1749 __m256i r[2]) {
1750 __m256i a_256[2], ss_256[4];
1751 s_256[4] = _mm256_loadu_si256((__m256i *)(src + 4 * 8));
1752 s_256[5] = _mm256_loadu_si256((__m256i *)(src + 5 * 8));
1753 a_256[0] = _mm256_add_epi16(s_256[0], s_256[5]);
1754 a_256[1] = _mm256_add_epi16(s_256[1], s_256[4]);
1755 ss_256[0] = _mm256_unpacklo_epi16(a_256[0], a_256[1]);
1756 ss_256[1] = _mm256_unpacklo_epi16(s_256[2], s_256[3]);
1757 ss_256[2] = _mm256_unpackhi_epi16(a_256[0], a_256[1]);
1758 ss_256[3] = _mm256_unpackhi_epi16(s_256[2], s_256[3]);
1759 xy_y_convolve_4tap_16_avx2(ss_256, coeffs, r);
1760 s_256[0] = s_256[2];
1761 s_256[1] = s_256[3];
1762 s_256[2] = s_256[4];
1763 s_256[3] = s_256[5];
1764 }
1765
1766 static inline void xy_y_convolve_6tap_16x2_avx2(
1767 const int16_t *const src, const ptrdiff_t stride, __m256i s_256[6],
1768 __m256i ss_256[6], __m256i tt_256[6], const __m256i coeffs[3],
1769 __m256i r[4]) {
1770 s_256[5] = _mm256_loadu_si256((__m256i *)(src + 5 * stride));
1771 ss_256[2] = _mm256_unpacklo_epi16(s_256[4], s_256[5]);
1772 ss_256[5] = _mm256_unpackhi_epi16(s_256[4], s_256[5]);
1773 s_256[4] = _mm256_loadu_si256((__m256i *)(src + 6 * stride));
1774 tt_256[2] = _mm256_unpacklo_epi16(s_256[5], s_256[4]);
1775 tt_256[5] = _mm256_unpackhi_epi16(s_256[5], s_256[4]);
1776
1777 xy_y_convolve_6tap_16_avx2(ss_256, coeffs, r + 0);
1778 xy_y_convolve_6tap_16_avx2(tt_256, coeffs, r + 2);
1779
1780 ss_256[0] = ss_256[1];
1781 ss_256[1] = ss_256[2];
1782 ss_256[3] = ss_256[4];
1783 ss_256[4] = ss_256[5];
1784
1785 tt_256[0] = tt_256[1];
1786 tt_256[1] = tt_256[2];
1787 tt_256[3] = tt_256[4];
1788 tt_256[4] = tt_256[5];
1789 }
1790
1791 static inline void xy_y_convolve_6tap_16x2_half_pel_avx2(
1792 const int16_t *const src, const ptrdiff_t stride, __m256i s_256[6],
1793 __m256i ss_256[4], const __m256i coeffs[2], __m256i r[4]) {
1794 __m256i a_256[2];
1795
1796 s_256[5] = _mm256_loadu_si256((__m256i *)(src + 5 * stride));
1797 a_256[0] = _mm256_add_epi16(s_256[0], s_256[5]);
1798 a_256[1] = _mm256_add_epi16(s_256[1], s_256[4]);
1799 ss_256[0] = _mm256_unpacklo_epi16(a_256[0], a_256[1]);
1800 ss_256[1] = _mm256_unpacklo_epi16(s_256[2], s_256[3]);
1801 ss_256[2] = _mm256_unpackhi_epi16(a_256[0], a_256[1]);
1802 ss_256[3] = _mm256_unpackhi_epi16(s_256[2], s_256[3]);
1803 xy_y_convolve_4tap_16_avx2(ss_256, coeffs, r + 0);
1804
1805 a_256[1] = _mm256_add_epi16(s_256[2], s_256[5]);
1806 s_256[0] = s_256[2];
1807 s_256[2] = s_256[4];
1808 s_256[4] = _mm256_loadu_si256((__m256i *)(src + 6 * stride));
1809 a_256[0] = _mm256_add_epi16(s_256[1], s_256[4]);
1810 s_256[1] = s_256[3];
1811 s_256[3] = s_256[5];
1812 ss_256[0] = _mm256_unpacklo_epi16(a_256[0], a_256[1]);
1813 ss_256[1] = _mm256_unpacklo_epi16(s_256[1], s_256[2]);
1814 ss_256[2] = _mm256_unpackhi_epi16(a_256[0], a_256[1]);
1815 ss_256[3] = _mm256_unpackhi_epi16(s_256[1], s_256[2]);
1816 xy_y_convolve_4tap_16_avx2(ss_256, coeffs, r + 2);
1817 }
1818
1819 static inline __m128i xy_y_convolve_8tap_2x2_sse2(const int16_t *const src,
1820 __m128i s_32[8],
1821 __m128i ss_128[4],
1822 const __m128i coeffs[4]) {
1823 s_32[7] = _mm_cvtsi32_si128(loadu_int32(src + 7 * 2));
1824 const __m128i src67 = _mm_unpacklo_epi32(s_32[6], s_32[7]);
1825 s_32[6] = _mm_cvtsi32_si128(loadu_int32(src + 8 * 2));
1826 const __m128i src78 = _mm_unpacklo_epi32(s_32[7], s_32[6]);
1827 ss_128[3] = _mm_unpacklo_epi16(src67, src78);
1828 const __m128i r = convolve16_8tap_sse2(ss_128, coeffs);
1829 ss_128[0] = ss_128[1];
1830 ss_128[1] = ss_128[2];
1831 ss_128[2] = ss_128[3];
1832 return r;
1833 }
1834
1835 static inline __m256i xy_y_convolve_8tap_4x2_avx2(const int16_t *const src,
1836 __m128i s_64[8],
1837 __m256i ss_256[4],
1838 const __m256i coeffs[4]) {
1839 __m256i s_256[2];
1840 s_64[7] = _mm_loadl_epi64((__m128i *)(src + 7 * 4));
1841 s_256[0] = _mm256_setr_m128i(s_64[6], s_64[7]);
1842 s_64[6] = _mm_loadl_epi64((__m128i *)(src + 8 * 4));
1843 s_256[1] = _mm256_setr_m128i(s_64[7], s_64[6]);
1844 ss_256[3] = _mm256_unpacklo_epi16(s_256[0], s_256[1]);
1845 const __m256i r = convolve16_8tap_avx2(ss_256, coeffs);
1846 ss_256[0] = ss_256[1];
1847 ss_256[1] = ss_256[2];
1848 ss_256[2] = ss_256[3];
1849 return r;
1850 }
1851
1852 static inline void xy_y_convolve_8tap_16_avx2(const __m256i *const ss,
1853 const __m256i coeffs[4],
1854 __m256i r[2]) {
1855 r[0] = convolve16_8tap_avx2(ss, coeffs);
1856 r[1] = convolve16_8tap_avx2(ss + 4, coeffs);
1857 }
1858
1859 static inline void xy_y_convolve_8tap_8x2_avx2(const int16_t *const src,
1860 __m256i ss_256[8],
1861 const __m256i coeffs[4],
1862 __m256i r[2]) {
1863 __m256i s_256[2];
1864 s_256[0] = _mm256_loadu_si256((__m256i *)(src + 6 * 8));
1865 s_256[1] = _mm256_loadu_si256((__m256i *)(src + 7 * 8));
1866 ss_256[3] = _mm256_unpacklo_epi16(s_256[0], s_256[1]);
1867 ss_256[7] = _mm256_unpackhi_epi16(s_256[0], s_256[1]);
1868 xy_y_convolve_8tap_16_avx2(ss_256, coeffs, r);
1869 ss_256[0] = ss_256[1];
1870 ss_256[1] = ss_256[2];
1871 ss_256[2] = ss_256[3];
1872 ss_256[4] = ss_256[5];
1873 ss_256[5] = ss_256[6];
1874 ss_256[6] = ss_256[7];
1875 }
1876
1877 static inline void xy_y_convolve_8tap_8x2_half_pel_avx2(
1878 const int16_t *const src, const __m256i coeffs[2], __m256i s_256[8],
1879 __m256i r[2]) {
1880 __m256i a_256[4], ss_256[4];
1881
1882 s_256[6] = _mm256_loadu_si256((__m256i *)(src + 6 * 8));
1883 s_256[7] = _mm256_loadu_si256((__m256i *)(src + 7 * 8));
1884 a_256[0] = _mm256_add_epi16(s_256[0], s_256[7]);
1885 a_256[1] = _mm256_add_epi16(s_256[1], s_256[6]);
1886 a_256[2] = _mm256_add_epi16(s_256[2], s_256[5]);
1887 a_256[3] = _mm256_add_epi16(s_256[3], s_256[4]);
1888 ss_256[0] = _mm256_unpacklo_epi16(a_256[0], a_256[1]);
1889 ss_256[1] = _mm256_unpacklo_epi16(a_256[2], a_256[3]);
1890 ss_256[2] = _mm256_unpackhi_epi16(a_256[0], a_256[1]);
1891 ss_256[3] = _mm256_unpackhi_epi16(a_256[2], a_256[3]);
1892 xy_y_convolve_4tap_16_avx2(ss_256, coeffs, r);
1893 s_256[0] = s_256[2];
1894 s_256[1] = s_256[3];
1895 s_256[2] = s_256[4];
1896 s_256[3] = s_256[5];
1897 s_256[4] = s_256[6];
1898 s_256[5] = s_256[7];
1899 }
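
// The 6-tap and 8-tap half-pel paths use the same folding: mirrored rows are
// pre-added, and the remaining work is a 4-tap multiply over the folded sums.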
1900
1901 static AOM_FORCE_INLINE void xy_y_convolve_8tap_16x2_avx2(
1902 const int16_t *const src, const ptrdiff_t stride, const __m256i coeffs[4],
1903 __m256i s_256[8], __m256i ss_256[8], __m256i tt_256[8], __m256i r[4]) {
1904 s_256[7] = _mm256_loadu_si256((__m256i *)(src + 7 * stride));
1905 ss_256[3] = _mm256_unpacklo_epi16(s_256[6], s_256[7]);
1906 ss_256[7] = _mm256_unpackhi_epi16(s_256[6], s_256[7]);
1907 s_256[6] = _mm256_loadu_si256((__m256i *)(src + 8 * stride));
1908 tt_256[3] = _mm256_unpacklo_epi16(s_256[7], s_256[6]);
1909 tt_256[7] = _mm256_unpackhi_epi16(s_256[7], s_256[6]);
1910
1911 xy_y_convolve_8tap_16_avx2(ss_256, coeffs, r + 0);
1912 xy_y_convolve_8tap_16_avx2(tt_256, coeffs, r + 2);
1913
1914 ss_256[0] = ss_256[1];
1915 ss_256[1] = ss_256[2];
1916 ss_256[2] = ss_256[3];
1917 ss_256[4] = ss_256[5];
1918 ss_256[5] = ss_256[6];
1919 ss_256[6] = ss_256[7];
1920
1921 tt_256[0] = tt_256[1];
1922 tt_256[1] = tt_256[2];
1923 tt_256[2] = tt_256[3];
1924 tt_256[4] = tt_256[5];
1925 tt_256[5] = tt_256[6];
1926 tt_256[6] = tt_256[7];
1927 }
1928
1929 static inline void xy_y_convolve_8tap_16x2_half_pel_avx2(
1930 const int16_t *const src, const ptrdiff_t stride, const __m256i coeffs[4],
1931 __m256i s_256[8], __m256i r[4]) {
1932 __m256i a_256[4], ss_256[4];
1933 s_256[7] = _mm256_loadu_si256((__m256i *)(src + 7 * stride));
1934
1935 a_256[0] = _mm256_add_epi16(s_256[0], s_256[7]);
1936 a_256[1] = _mm256_add_epi16(s_256[1], s_256[6]);
1937 a_256[2] = _mm256_add_epi16(s_256[2], s_256[5]);
1938 a_256[3] = _mm256_add_epi16(s_256[3], s_256[4]);
1939 ss_256[0] = _mm256_unpacklo_epi16(a_256[0], a_256[1]);
1940 ss_256[1] = _mm256_unpacklo_epi16(a_256[2], a_256[3]);
1941 ss_256[2] = _mm256_unpackhi_epi16(a_256[0], a_256[1]);
1942 ss_256[3] = _mm256_unpackhi_epi16(a_256[2], a_256[3]);
1943
1944 xy_y_convolve_4tap_16_avx2(ss_256, coeffs, r + 0);
1945
1946 a_256[1] = _mm256_add_epi16(s_256[2], s_256[7]);
1947 a_256[2] = _mm256_add_epi16(s_256[3], s_256[6]);
1948 a_256[3] = _mm256_add_epi16(s_256[4], s_256[5]);
1949 s_256[0] = s_256[2];
1950 s_256[2] = s_256[4];
1951 s_256[4] = s_256[6];
1952 s_256[6] = _mm256_loadu_si256((__m256i *)(src + 8 * stride));
1953
1954 a_256[0] = _mm256_add_epi16(s_256[1], s_256[6]);
1955 s_256[1] = s_256[3];
1956 s_256[3] = s_256[5];
1957 s_256[5] = s_256[7];
1958 ss_256[0] = _mm256_unpacklo_epi16(a_256[0], a_256[1]);
1959 ss_256[1] = _mm256_unpacklo_epi16(a_256[2], a_256[3]);
1960 ss_256[2] = _mm256_unpackhi_epi16(a_256[0], a_256[1]);
1961 ss_256[3] = _mm256_unpackhi_epi16(a_256[2], a_256[3]);
1962
1963 xy_y_convolve_4tap_16_avx2(ss_256, coeffs, r + 2);
1964 }
1965
1966 static inline void xy_y_round_store_8x2_avx2(const __m256i res[2],
1967 uint8_t *const dst,
1968 const ptrdiff_t stride) {
1969 const __m256i r = xy_y_round_16_avx2(res);
1970 pack_store_8x2_avx2(r, dst, stride);
1971 }
1972
1973 static inline void xy_y_round_store_16x2_avx2(const __m256i res[4],
1974 uint8_t *const dst,
1975 const ptrdiff_t stride) {
1976 const __m256i r0 = xy_y_round_16_avx2(res + 0);
1977 const __m256i r1 = xy_y_round_16_avx2(res + 2);
1978 xy_y_pack_store_16x2_avx2(r0, r1, dst, stride);
1979 }
1980
1981 static inline void sr_y_round_store_32_avx2(const __m256i res[2],
1982 uint8_t *const dst) {
1983 __m256i r[2];
1984
1985 r[0] = sr_y_round_avx2(res[0]);
1986 r[1] = sr_y_round_avx2(res[1]);
1987 convolve_store_32_avx2(r[0], r[1], dst);
1988 }
1989
1990 static inline void sr_y_round_store_32x2_avx2(const __m256i res[4],
1991 uint8_t *const dst,
1992 const int32_t dst_stride) {
1993 sr_y_round_store_32_avx2(res, dst);
1994 sr_y_round_store_32_avx2(res + 2, dst + dst_stride);
1995 }
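
// The sr_y_* helpers finish the vertical-only path used by
// av1_convolve_y_sr_specialized_avx2 below: they apply the final rounding
// shift to the 16-bit partial sums and pack/store the results as 8-bit
// pixels.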
1996
1997 static inline void sr_y_2tap_32_avx2(const uint8_t *const src,
1998 const __m256i coeffs[1], const __m256i s0,
1999 __m256i *const s1, uint8_t *const dst) {
2000 __m256i r[2];
2001 y_convolve_2tap_32_avx2(src, coeffs, s0, s1, r);
2002 sr_y_round_store_32_avx2(r, dst);
2003 }
2004
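// av1_convolve_y_sr_specialized_avx2 is the vertical-only dispatcher: it
// derives the tap count from the filter parameters, then specializes on block
// width.  Widths of 8 and below stay in 128-bit registers, 16 and above use
// 256-bit registers, and the 2-tap half-pel case reduces to plain row
// averaging.  A hedged usage sketch (argument values are illustrative only):
//
//   av1_convolve_y_sr_specialized_avx2(src, src_stride, dst, dst_stride,
//                                      /*w=*/32, /*h=*/16, filter_params_y,
//                                      /*subpel_y_q4=*/5);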
2005 static AOM_FORCE_INLINE void av1_convolve_y_sr_specialized_avx2(
2006 const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride,
2007 int32_t w, int32_t h, const InterpFilterParams *filter_params_y,
2008 const int32_t subpel_y_q4) {
2009 int32_t x, y;
2010 __m128i coeffs_128[4];
2011 __m256i coeffs_256[4];
2012
2013 int vert_tap = get_filter_tap(filter_params_y, subpel_y_q4);
2014
2015 if (vert_tap == 2) {
2016     // Vertical filter is 2-tap.
2017 const uint8_t *src_ptr = src;
2018
2019 y = h;
2020
2021 if (subpel_y_q4 != 8) {
2022 if (w <= 8) {
2023 prepare_half_coeffs_2tap_ssse3(filter_params_y, subpel_y_q4,
2024 coeffs_128);
2025
2026 if (w == 2) {
2027 __m128i s_16[2];
2028
2029 s_16[0] = _mm_cvtsi32_si128(*(int16_t *)src_ptr);
2030
2031 do {
2032 const __m128i res = y_convolve_2tap_2x2_ssse3(src_ptr, src_stride,
2033 coeffs_128, s_16);
2034 const __m128i r = sr_y_round_sse2(res);
2035 pack_store_2x2_sse2(r, dst, dst_stride);
2036 src_ptr += 2 * src_stride;
2037 dst += 2 * dst_stride;
2038 y -= 2;
2039 } while (y);
2040 } else if (w == 4) {
2041 __m128i s_32[2];
2042
2043 s_32[0] = _mm_cvtsi32_si128(loadu_int32(src_ptr));
2044
2045 do {
2046 const __m128i res = y_convolve_2tap_4x2_ssse3(src_ptr, src_stride,
2047 coeffs_128, s_32);
2048 const __m128i r = sr_y_round_sse2(res);
2049 pack_store_4x2_sse2(r, dst, dst_stride);
2050 src_ptr += 2 * src_stride;
2051 dst += 2 * dst_stride;
2052 y -= 2;
2053 } while (y);
2054 } else {
2055 __m128i s_64[2], s_128[2];
2056
2057 assert(w == 8);
2058
2059 s_64[0] = _mm_loadl_epi64((__m128i *)src_ptr);
2060
2061 do {
2062 // Note: Faster than binding to AVX2 registers.
2063 s_64[1] = _mm_loadl_epi64((__m128i *)(src_ptr + src_stride));
2064 s_128[0] = _mm_unpacklo_epi64(s_64[0], s_64[1]);
2065 s_64[0] = _mm_loadl_epi64((__m128i *)(src_ptr + 2 * src_stride));
2066 s_128[1] = _mm_unpacklo_epi64(s_64[1], s_64[0]);
2067 const __m128i ss0 = _mm_unpacklo_epi8(s_128[0], s_128[1]);
2068 const __m128i ss1 = _mm_unpackhi_epi8(s_128[0], s_128[1]);
2069 const __m128i res0 = convolve_2tap_ssse3(&ss0, coeffs_128);
2070 const __m128i res1 = convolve_2tap_ssse3(&ss1, coeffs_128);
2071 const __m128i r0 = sr_y_round_sse2(res0);
2072 const __m128i r1 = sr_y_round_sse2(res1);
2073 const __m128i d = _mm_packus_epi16(r0, r1);
2074 _mm_storel_epi64((__m128i *)dst, d);
2075 _mm_storeh_epi64((__m128i *)(dst + dst_stride), d);
2076 src_ptr += 2 * src_stride;
2077 dst += 2 * dst_stride;
2078 y -= 2;
2079 } while (y);
2080 }
2081 } else {
2082 prepare_half_coeffs_2tap_avx2(filter_params_y, subpel_y_q4, coeffs_256);
2083
2084 if (w == 16) {
2085 __m128i s_128[2];
2086
2087 s_128[0] = _mm_loadu_si128((__m128i *)src_ptr);
2088
2089 do {
2090 __m256i r[2];
2091
2092 y_convolve_2tap_16x2_avx2(src_ptr, src_stride, coeffs_256, s_128,
2093 r);
2094 sr_y_round_store_16x2_avx2(r, dst, dst_stride);
2095 src_ptr += 2 * src_stride;
2096 dst += 2 * dst_stride;
2097 y -= 2;
2098 } while (y);
2099 } else if (w == 32) {
2100 __m256i s_256[2];
2101
2102 s_256[0] = _mm256_loadu_si256((__m256i *)src_ptr);
2103
2104 do {
2105 sr_y_2tap_32_avx2(src_ptr + src_stride, coeffs_256, s_256[0],
2106 &s_256[1], dst);
2107 sr_y_2tap_32_avx2(src_ptr + 2 * src_stride, coeffs_256, s_256[1],
2108 &s_256[0], dst + dst_stride);
2109 src_ptr += 2 * src_stride;
2110 dst += 2 * dst_stride;
2111 y -= 2;
2112 } while (y);
2113 } else if (w == 64) {
2114 __m256i s_256[2][2];
2115
2116 s_256[0][0] = _mm256_loadu_si256((__m256i *)(src_ptr + 0 * 32));
2117 s_256[0][1] = _mm256_loadu_si256((__m256i *)(src_ptr + 1 * 32));
2118
2119 do {
2120 sr_y_2tap_32_avx2(src_ptr + src_stride, coeffs_256, s_256[0][0],
2121 &s_256[1][0], dst);
2122 sr_y_2tap_32_avx2(src_ptr + src_stride + 32, coeffs_256,
2123 s_256[0][1], &s_256[1][1], dst + 32);
2124 sr_y_2tap_32_avx2(src_ptr + 2 * src_stride, coeffs_256, s_256[1][0],
2125 &s_256[0][0], dst + dst_stride);
2126 sr_y_2tap_32_avx2(src_ptr + 2 * src_stride + 32, coeffs_256,
2127 s_256[1][1], &s_256[0][1], dst + dst_stride + 32);
2128
2129 src_ptr += 2 * src_stride;
2130 dst += 2 * dst_stride;
2131 y -= 2;
2132 } while (y);
2133 } else {
2134 __m256i s_256[2][4];
2135
2136 assert(w == 128);
2137
2138 s_256[0][0] = _mm256_loadu_si256((__m256i *)(src_ptr + 0 * 32));
2139 s_256[0][1] = _mm256_loadu_si256((__m256i *)(src_ptr + 1 * 32));
2140 s_256[0][2] = _mm256_loadu_si256((__m256i *)(src_ptr + 2 * 32));
2141 s_256[0][3] = _mm256_loadu_si256((__m256i *)(src_ptr + 3 * 32));
2142
2143 do {
2144 sr_y_2tap_32_avx2(src_ptr + src_stride, coeffs_256, s_256[0][0],
2145 &s_256[1][0], dst);
2146 sr_y_2tap_32_avx2(src_ptr + src_stride + 1 * 32, coeffs_256,
2147 s_256[0][1], &s_256[1][1], dst + 1 * 32);
2148 sr_y_2tap_32_avx2(src_ptr + src_stride + 2 * 32, coeffs_256,
2149 s_256[0][2], &s_256[1][2], dst + 2 * 32);
2150 sr_y_2tap_32_avx2(src_ptr + src_stride + 3 * 32, coeffs_256,
2151 s_256[0][3], &s_256[1][3], dst + 3 * 32);
2152
2153 sr_y_2tap_32_avx2(src_ptr + 2 * src_stride, coeffs_256, s_256[1][0],
2154 &s_256[0][0], dst + dst_stride);
2155 sr_y_2tap_32_avx2(src_ptr + 2 * src_stride + 1 * 32, coeffs_256,
2156 s_256[1][1], &s_256[0][1],
2157 dst + dst_stride + 1 * 32);
2158 sr_y_2tap_32_avx2(src_ptr + 2 * src_stride + 2 * 32, coeffs_256,
2159 s_256[1][2], &s_256[0][2],
2160 dst + dst_stride + 2 * 32);
2161 sr_y_2tap_32_avx2(src_ptr + 2 * src_stride + 3 * 32, coeffs_256,
2162 s_256[1][3], &s_256[0][3],
2163 dst + dst_stride + 3 * 32);
2164
2165 src_ptr += 2 * src_stride;
2166 dst += 2 * dst_stride;
2167 y -= 2;
2168 } while (y);
2169 }
2170 }
2171 } else {
2172       // Half-pel position (subpel_y_q4 == 8): both taps are equal, so just average adjacent rows.
2173 if (w <= 8) {
2174 if (w == 2) {
2175 __m128i s_16[2];
2176
2177 s_16[0] = _mm_cvtsi32_si128(*(int16_t *)src_ptr);
2178
2179 do {
2180 s_16[1] = _mm_cvtsi32_si128(*(int16_t *)(src_ptr + src_stride));
2181 const __m128i d0 = _mm_avg_epu8(s_16[0], s_16[1]);
2182 *(int16_t *)dst = (int16_t)_mm_cvtsi128_si32(d0);
2183 s_16[0] = _mm_cvtsi32_si128(*(int16_t *)(src_ptr + 2 * src_stride));
2184 const __m128i d1 = _mm_avg_epu8(s_16[1], s_16[0]);
2185 *(int16_t *)(dst + dst_stride) = (int16_t)_mm_cvtsi128_si32(d1);
2186 src_ptr += 2 * src_stride;
2187 dst += 2 * dst_stride;
2188 y -= 2;
2189 } while (y);
2190 } else if (w == 4) {
2191 __m128i s_32[2];
2192
2193 s_32[0] = _mm_cvtsi32_si128(loadu_int32(src_ptr));
2194
2195 do {
2196 s_32[1] = _mm_cvtsi32_si128(loadu_int32(src_ptr + src_stride));
2197 const __m128i d0 = _mm_avg_epu8(s_32[0], s_32[1]);
2198 xx_storel_32(dst, d0);
2199 s_32[0] = _mm_cvtsi32_si128(loadu_int32(src_ptr + 2 * src_stride));
2200 const __m128i d1 = _mm_avg_epu8(s_32[1], s_32[0]);
2201 xx_storel_32(dst + dst_stride, d1);
2202 src_ptr += 2 * src_stride;
2203 dst += 2 * dst_stride;
2204 y -= 2;
2205 } while (y);
2206 } else {
2207 __m128i s_64[2];
2208
2209 assert(w == 8);
2210
2211 s_64[0] = _mm_loadl_epi64((__m128i *)src_ptr);
2212
2213 do {
2214 // Note: Faster than binding to AVX2 registers.
2215 s_64[1] = _mm_loadl_epi64((__m128i *)(src_ptr + src_stride));
2216 const __m128i d0 = _mm_avg_epu8(s_64[0], s_64[1]);
2217 _mm_storel_epi64((__m128i *)dst, d0);
2218 s_64[0] = _mm_loadl_epi64((__m128i *)(src_ptr + 2 * src_stride));
2219 const __m128i d1 = _mm_avg_epu8(s_64[1], s_64[0]);
2220 _mm_storel_epi64((__m128i *)(dst + dst_stride), d1);
2221 src_ptr += 2 * src_stride;
2222 dst += 2 * dst_stride;
2223 y -= 2;
2224 } while (y);
2225 }
2226 } else if (w == 16) {
2227 __m128i s_128[2];
2228
2229 s_128[0] = _mm_loadu_si128((__m128i *)src_ptr);
2230
2231 do {
2232 s_128[1] = _mm_loadu_si128((__m128i *)(src_ptr + src_stride));
2233 const __m128i d0 = _mm_avg_epu8(s_128[0], s_128[1]);
2234 _mm_storeu_si128((__m128i *)dst, d0);
2235 s_128[0] = _mm_loadu_si128((__m128i *)(src_ptr + 2 * src_stride));
2236 const __m128i d1 = _mm_avg_epu8(s_128[1], s_128[0]);
2237 _mm_storeu_si128((__m128i *)(dst + dst_stride), d1);
2238 src_ptr += 2 * src_stride;
2239 dst += 2 * dst_stride;
2240 y -= 2;
2241 } while (y);
2242 } else if (w == 32) {
2243 __m256i s_256[2];
2244
2245 s_256[0] = _mm256_loadu_si256((__m256i *)src_ptr);
2246
2247 do {
2248 sr_y_2tap_32_avg_avx2(src_ptr + src_stride, s_256[0], &s_256[1], dst);
2249 sr_y_2tap_32_avg_avx2(src_ptr + 2 * src_stride, s_256[1], &s_256[0],
2250 dst + dst_stride);
2251 src_ptr += 2 * src_stride;
2252 dst += 2 * dst_stride;
2253 y -= 2;
2254 } while (y);
2255 } else if (w == 64) {
2256 __m256i s_256[2][2];
2257
2258 s_256[0][0] = _mm256_loadu_si256((__m256i *)(src_ptr + 0 * 32));
2259 s_256[0][1] = _mm256_loadu_si256((__m256i *)(src_ptr + 1 * 32));
2260
2261 do {
2262 sr_y_2tap_32_avg_avx2(src_ptr + src_stride, s_256[0][0], &s_256[1][0],
2263 dst);
2264 sr_y_2tap_32_avg_avx2(src_ptr + src_stride + 32, s_256[0][1],
2265 &s_256[1][1], dst + 32);
2266
2267 sr_y_2tap_32_avg_avx2(src_ptr + 2 * src_stride, s_256[1][0],
2268 &s_256[0][0], dst + dst_stride);
2269 sr_y_2tap_32_avg_avx2(src_ptr + 2 * src_stride + 32, s_256[1][1],
2270 &s_256[0][1], dst + dst_stride + 32);
2271
2272 src_ptr += 2 * src_stride;
2273 dst += 2 * dst_stride;
2274 y -= 2;
2275 } while (y);
2276 } else {
2277 __m256i s_256[2][4];
2278
2279 assert(w == 128);
2280
2281 s_256[0][0] = _mm256_loadu_si256((__m256i *)(src_ptr + 0 * 32));
2282 s_256[0][1] = _mm256_loadu_si256((__m256i *)(src_ptr + 1 * 32));
2283 s_256[0][2] = _mm256_loadu_si256((__m256i *)(src_ptr + 2 * 32));
2284 s_256[0][3] = _mm256_loadu_si256((__m256i *)(src_ptr + 3 * 32));
2285
2286 do {
2287 sr_y_2tap_32_avg_avx2(src_ptr + src_stride, s_256[0][0], &s_256[1][0],
2288 dst);
2289 sr_y_2tap_32_avg_avx2(src_ptr + src_stride + 1 * 32, s_256[0][1],
2290 &s_256[1][1], dst + 1 * 32);
2291 sr_y_2tap_32_avg_avx2(src_ptr + src_stride + 2 * 32, s_256[0][2],
2292 &s_256[1][2], dst + 2 * 32);
2293 sr_y_2tap_32_avg_avx2(src_ptr + src_stride + 3 * 32, s_256[0][3],
2294 &s_256[1][3], dst + 3 * 32);
2295
2296 sr_y_2tap_32_avg_avx2(src_ptr + 2 * src_stride, s_256[1][0],
2297 &s_256[0][0], dst + dst_stride);
2298 sr_y_2tap_32_avg_avx2(src_ptr + 2 * src_stride + 1 * 32, s_256[1][1],
2299 &s_256[0][1], dst + dst_stride + 1 * 32);
2300 sr_y_2tap_32_avg_avx2(src_ptr + 2 * src_stride + 2 * 32, s_256[1][2],
2301 &s_256[0][2], dst + dst_stride + 2 * 32);
2302 sr_y_2tap_32_avg_avx2(src_ptr + 2 * src_stride + 3 * 32, s_256[1][3],
2303 &s_256[0][3], dst + dst_stride + 3 * 32);
2304
2305 src_ptr += 2 * src_stride;
2306 dst += 2 * dst_stride;
2307 y -= 2;
2308 } while (y);
2309 }
2310 }
2311 } else if (vert_tap == 4) {
2312     // Vertical filter is 4-tap.
2313 const uint8_t *src_ptr = src - src_stride;
2314
2315 y = h;
2316
2317 if (w <= 4) {
2318 prepare_half_coeffs_4tap_ssse3(filter_params_y, subpel_y_q4, coeffs_128);
2319
2320 if (w == 2) {
2321 __m128i s_16[4], ss_128[2];
2322
2323 s_16[0] = _mm_cvtsi32_si128(loadu_int16(src_ptr + 0 * src_stride));
2324 s_16[1] = _mm_cvtsi32_si128(loadu_int16(src_ptr + 1 * src_stride));
2325 s_16[2] = _mm_cvtsi32_si128(loadu_int16(src_ptr + 2 * src_stride));
2326
2327 const __m128i src01 = _mm_unpacklo_epi16(s_16[0], s_16[1]);
2328 const __m128i src12 = _mm_unpacklo_epi16(s_16[1], s_16[2]);
2329
2330 ss_128[0] = _mm_unpacklo_epi8(src01, src12);
2331
2332 do {
2333 src_ptr += 2 * src_stride;
2334 const __m128i res = y_convolve_4tap_2x2_ssse3(
2335 src_ptr, src_stride, coeffs_128, s_16, ss_128);
2336 const __m128i r = sr_y_round_sse2(res);
2337 pack_store_2x2_sse2(r, dst, dst_stride);
2338
2339 ss_128[0] = ss_128[1];
2340 dst += 2 * dst_stride;
2341 y -= 2;
2342 } while (y);
2343 } else {
2344 __m128i s_32[4], ss_128[2];
2345
2346 assert(w == 4);
2347
2348 s_32[0] = _mm_cvtsi32_si128(loadu_int32(src_ptr + 0 * src_stride));
2349 s_32[1] = _mm_cvtsi32_si128(loadu_int32(src_ptr + 1 * src_stride));
2350 s_32[2] = _mm_cvtsi32_si128(loadu_int32(src_ptr + 2 * src_stride));
2351
2352 const __m128i src01 = _mm_unpacklo_epi32(s_32[0], s_32[1]);
2353 const __m128i src12 = _mm_unpacklo_epi32(s_32[1], s_32[2]);
2354
2355 ss_128[0] = _mm_unpacklo_epi8(src01, src12);
2356
2357 do {
2358 src_ptr += 2 * src_stride;
2359 const __m128i res = y_convolve_4tap_4x2_ssse3(
2360 src_ptr, src_stride, coeffs_128, s_32, ss_128);
2361 const __m128i r = sr_y_round_sse2(res);
2362 pack_store_4x2_sse2(r, dst, dst_stride);
2363
2364 ss_128[0] = ss_128[1];
2365 dst += 2 * dst_stride;
2366 y -= 2;
2367 } while (y);
2368 }
2369 } else {
2370 prepare_half_coeffs_4tap_avx2(filter_params_y, subpel_y_q4, coeffs_256);
2371
2372 if (w == 8) {
2373 __m128i s_64[4];
2374 __m256i ss_256[2];
2375
2376 s_64[0] = _mm_loadl_epi64((__m128i *)(src_ptr + 0 * src_stride));
2377 s_64[1] = _mm_loadl_epi64((__m128i *)(src_ptr + 1 * src_stride));
2378 s_64[2] = _mm_loadl_epi64((__m128i *)(src_ptr + 2 * src_stride));
2379
2380 // Load lines a and b. Line a to lower 128, line b to upper 128
2381 const __m256i src01 = _mm256_setr_m128i(s_64[0], s_64[1]);
2382 const __m256i src12 = _mm256_setr_m128i(s_64[1], s_64[2]);
2383
2384 ss_256[0] = _mm256_unpacklo_epi8(src01, src12);
2385
2386 do {
2387 src_ptr += 2 * src_stride;
2388 const __m256i res = y_convolve_4tap_8x2_avx2(
2389 src_ptr, src_stride, coeffs_256, s_64, ss_256);
2390 sr_y_round_store_8x2_avx2(res, dst, dst_stride);
2391
2392 ss_256[0] = ss_256[1];
2393 dst += 2 * dst_stride;
2394 y -= 2;
2395 } while (y);
2396 } else if (w == 16) {
2397 __m128i s_128[4];
2398 __m256i ss_256[4], r[2];
2399
2400 s_128[0] = _mm_loadu_si128((__m128i *)(src_ptr + 0 * src_stride));
2401 s_128[1] = _mm_loadu_si128((__m128i *)(src_ptr + 1 * src_stride));
2402 s_128[2] = _mm_loadu_si128((__m128i *)(src_ptr + 2 * src_stride));
2403
2404 // Load lines a and b. Line a to lower 128, line b to upper 128
2405 const __m256i src01 = _mm256_setr_m128i(s_128[0], s_128[1]);
2406 const __m256i src12 = _mm256_setr_m128i(s_128[1], s_128[2]);
2407
2408 ss_256[0] = _mm256_unpacklo_epi8(src01, src12);
2409 ss_256[2] = _mm256_unpackhi_epi8(src01, src12);
2410
2411 do {
2412 src_ptr += 2 * src_stride;
2413 y_convolve_4tap_16x2_avx2(src_ptr, src_stride, coeffs_256, s_128,
2414 ss_256, r);
2415 sr_y_round_store_16x2_avx2(r, dst, dst_stride);
2416
2417 ss_256[0] = ss_256[1];
2418 ss_256[2] = ss_256[3];
2419 dst += 2 * dst_stride;
2420 y -= 2;
2421 } while (y);
2422 } else if (w == 32) {
2423         // The AV1 standard has no 32x4 case. This path only serves an
2424         // encoder optimization that subsamples 32x8 to 32x4 and thus
2425         // selects the 4-tap filter.
2426
2427 __m256i s_256[4], ss_256[4], tt_256[4], r[4];
2428
2429 s_256[0] = _mm256_loadu_si256((__m256i *)(src_ptr + 0 * src_stride));
2430 s_256[1] = _mm256_loadu_si256((__m256i *)(src_ptr + 1 * src_stride));
2431 s_256[2] = _mm256_loadu_si256((__m256i *)(src_ptr + 2 * src_stride));
2432
2433 ss_256[0] = _mm256_unpacklo_epi8(s_256[0], s_256[1]);
2434 ss_256[2] = _mm256_unpackhi_epi8(s_256[0], s_256[1]);
2435
2436 tt_256[0] = _mm256_unpacklo_epi8(s_256[1], s_256[2]);
2437 tt_256[2] = _mm256_unpackhi_epi8(s_256[1], s_256[2]);
2438
2439 do {
2440 src_ptr += 2 * src_stride;
2441 y_convolve_4tap_32x2_avx2(src_ptr, src_stride, coeffs_256, s_256,
2442 ss_256, tt_256, r);
2443 sr_y_round_store_32x2_avx2(r, dst, dst_stride);
2444
2445 ss_256[0] = ss_256[1];
2446 ss_256[2] = ss_256[3];
2447
2448 tt_256[0] = tt_256[1];
2449 tt_256[2] = tt_256[3];
2450 dst += 2 * dst_stride;
2451 y -= 2;
2452 } while (y);
2453 } else {
2454 assert(!(w % 32));
2455
2456 __m256i s_256[4], ss_256[4], tt_256[4], r[4];
2457 x = 0;
2458 do {
2459 const uint8_t *s = src_ptr + x;
2460 uint8_t *d = dst + x;
2461 s_256[0] = _mm256_loadu_si256((__m256i *)(s + 0 * src_stride));
2462 s_256[1] = _mm256_loadu_si256((__m256i *)(s + 1 * src_stride));
2463 s_256[2] = _mm256_loadu_si256((__m256i *)(s + 2 * src_stride));
2464
2465 ss_256[0] = _mm256_unpacklo_epi8(s_256[0], s_256[1]);
2466 ss_256[2] = _mm256_unpackhi_epi8(s_256[0], s_256[1]);
2467
2468 tt_256[0] = _mm256_unpacklo_epi8(s_256[1], s_256[2]);
2469 tt_256[2] = _mm256_unpackhi_epi8(s_256[1], s_256[2]);
2470
2471 y = h;
2472 do {
2473 s += 2 * src_stride;
2474 y_convolve_4tap_32x2_avx2(s, src_stride, coeffs_256, s_256, ss_256,
2475 tt_256, r);
2476 sr_y_round_store_32x2_avx2(r, d, dst_stride);
2477
2478 ss_256[0] = ss_256[1];
2479 ss_256[2] = ss_256[3];
2480
2481 tt_256[0] = tt_256[1];
2482 tt_256[2] = tt_256[3];
2483 d += 2 * dst_stride;
2484 y -= 2;
2485 } while (y);
2486 x += 32;
2487 } while (x < w);
2488 }
2489 }
2490 } else if (vert_tap == 6) {
2491     // Vertical filter is 6-tap.
2492 const uint8_t *src_ptr = src - 2 * src_stride;
2493
2494 if (w <= 4) {
2495 prepare_half_coeffs_6tap_ssse3(filter_params_y, subpel_y_q4, coeffs_128);
2496
2497 y = h;
2498
2499 if (w == 2) {
2500 __m128i s_16[6], ss_128[3];
2501
2502 s_16[0] = _mm_cvtsi32_si128(loadu_int16(src_ptr + 0 * src_stride));
2503 s_16[1] = _mm_cvtsi32_si128(loadu_int16(src_ptr + 1 * src_stride));
2504 s_16[2] = _mm_cvtsi32_si128(loadu_int16(src_ptr + 2 * src_stride));
2505 s_16[3] = _mm_cvtsi32_si128(loadu_int16(src_ptr + 3 * src_stride));
2506 s_16[4] = _mm_cvtsi32_si128(loadu_int16(src_ptr + 4 * src_stride));
2507
2508 const __m128i src01 = _mm_unpacklo_epi16(s_16[0], s_16[1]);
2509 const __m128i src12 = _mm_unpacklo_epi16(s_16[1], s_16[2]);
2510 const __m128i src23 = _mm_unpacklo_epi16(s_16[2], s_16[3]);
2511 const __m128i src34 = _mm_unpacklo_epi16(s_16[3], s_16[4]);
2512
2513 ss_128[0] = _mm_unpacklo_epi8(src01, src12);
2514 ss_128[1] = _mm_unpacklo_epi8(src23, src34);
2515
2516 do {
2517 src_ptr += 2 * src_stride;
2518 const __m128i res = y_convolve_6tap_2x2_ssse3(
2519 src_ptr, src_stride, coeffs_128, s_16, ss_128);
2520 const __m128i r = sr_y_round_sse2(res);
2521 pack_store_2x2_sse2(r, dst, dst_stride);
2522
2523 ss_128[0] = ss_128[1];
2524 ss_128[1] = ss_128[2];
2525 dst += 2 * dst_stride;
2526 y -= 2;
2527 } while (y);
2528 } else {
2529 __m128i s_32[6], ss_128[3];
2530
2531 assert(w == 4);
2532
2533 s_32[0] = _mm_cvtsi32_si128(loadu_int32(src_ptr + 0 * src_stride));
2534 s_32[1] = _mm_cvtsi32_si128(loadu_int32(src_ptr + 1 * src_stride));
2535 s_32[2] = _mm_cvtsi32_si128(loadu_int32(src_ptr + 2 * src_stride));
2536 s_32[3] = _mm_cvtsi32_si128(loadu_int32(src_ptr + 3 * src_stride));
2537 s_32[4] = _mm_cvtsi32_si128(loadu_int32(src_ptr + 4 * src_stride));
2538
2539 const __m128i src01 = _mm_unpacklo_epi32(s_32[0], s_32[1]);
2540 const __m128i src12 = _mm_unpacklo_epi32(s_32[1], s_32[2]);
2541 const __m128i src23 = _mm_unpacklo_epi32(s_32[2], s_32[3]);
2542 const __m128i src34 = _mm_unpacklo_epi32(s_32[3], s_32[4]);
2543
2544 ss_128[0] = _mm_unpacklo_epi8(src01, src12);
2545 ss_128[1] = _mm_unpacklo_epi8(src23, src34);
2546
2547 do {
2548 src_ptr += 2 * src_stride;
2549 const __m128i res = y_convolve_6tap_4x2_ssse3(
2550 src_ptr, src_stride, coeffs_128, s_32, ss_128);
2551 const __m128i r = sr_y_round_sse2(res);
2552 pack_store_4x2_sse2(r, dst, dst_stride);
2553
2554 ss_128[0] = ss_128[1];
2555 ss_128[1] = ss_128[2];
2556 dst += 2 * dst_stride;
2557 y -= 2;
2558 } while (y);
2559 }
2560 } else {
2561 prepare_half_coeffs_6tap_avx2(filter_params_y, subpel_y_q4, coeffs_256);
2562
2563 if (w == 8) {
2564 __m128i s_64[6];
2565 __m256i ss_256[3];
2566
2567 s_64[0] = _mm_loadl_epi64((__m128i *)(src_ptr + 0 * src_stride));
2568 s_64[1] = _mm_loadl_epi64((__m128i *)(src_ptr + 1 * src_stride));
2569 s_64[2] = _mm_loadl_epi64((__m128i *)(src_ptr + 2 * src_stride));
2570 s_64[3] = _mm_loadl_epi64((__m128i *)(src_ptr + 3 * src_stride));
2571 s_64[4] = _mm_loadl_epi64((__m128i *)(src_ptr + 4 * src_stride));
2572
2573 // Load lines a and b. Line a to lower 128, line b to upper 128
2574 const __m256i src01 = _mm256_setr_m128i(s_64[0], s_64[1]);
2575 const __m256i src12 = _mm256_setr_m128i(s_64[1], s_64[2]);
2576 const __m256i src23 = _mm256_setr_m128i(s_64[2], s_64[3]);
2577 const __m256i src34 = _mm256_setr_m128i(s_64[3], s_64[4]);
2578
2579 ss_256[0] = _mm256_unpacklo_epi8(src01, src12);
2580 ss_256[1] = _mm256_unpacklo_epi8(src23, src34);
2581
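// Each 256-bit register pairs two consecutive rows, one per 128-bit lane,
// so a single filtering pass yields both output rows of the pair.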
2582 y = h;
2583 do {
2584 src_ptr += 2 * src_stride;
2585 const __m256i res = y_convolve_6tap_8x2_avx2(
2586 src_ptr, src_stride, coeffs_256, s_64, ss_256);
2587 sr_y_round_store_8x2_avx2(res, dst, dst_stride);
2588
2589 ss_256[0] = ss_256[1];
2590 ss_256[1] = ss_256[2];
2591 dst += 2 * dst_stride;
2592 y -= 2;
2593 } while (y);
2594 } else if (w == 16) {
2595 __m128i s_128[6];
2596 __m256i ss_256[6], r[2];
2597
2598 s_128[0] = _mm_loadu_si128((__m128i *)(src_ptr + 0 * src_stride));
2599 s_128[1] = _mm_loadu_si128((__m128i *)(src_ptr + 1 * src_stride));
2600 s_128[2] = _mm_loadu_si128((__m128i *)(src_ptr + 2 * src_stride));
2601 s_128[3] = _mm_loadu_si128((__m128i *)(src_ptr + 3 * src_stride));
2602 s_128[4] = _mm_loadu_si128((__m128i *)(src_ptr + 4 * src_stride));
2603
2604 // Load lines a and b. Line a to lower 128, line b to upper 128
2605 const __m256i src01 = _mm256_setr_m128i(s_128[0], s_128[1]);
2606 const __m256i src12 = _mm256_setr_m128i(s_128[1], s_128[2]);
2607 const __m256i src23 = _mm256_setr_m128i(s_128[2], s_128[3]);
2608 const __m256i src34 = _mm256_setr_m128i(s_128[3], s_128[4]);
2609
2610 ss_256[0] = _mm256_unpacklo_epi8(src01, src12);
2611 ss_256[1] = _mm256_unpacklo_epi8(src23, src34);
2612
2613 ss_256[3] = _mm256_unpackhi_epi8(src01, src12);
2614 ss_256[4] = _mm256_unpackhi_epi8(src23, src34);
2615
2616 y = h;
2617 do {
2618 src_ptr += 2 * src_stride;
2619 y_convolve_6tap_16x2_avx2(src_ptr, src_stride, coeffs_256, s_128,
2620 ss_256, r);
2621 sr_y_round_store_16x2_avx2(r, dst, dst_stride);
2622
2623 ss_256[0] = ss_256[1];
2624 ss_256[1] = ss_256[2];
2625
2626 ss_256[3] = ss_256[4];
2627 ss_256[4] = ss_256[5];
2628 dst += 2 * dst_stride;
2629 y -= 2;
2630 } while (y);
2631 } else {
2632 __m256i s_256[6], ss_256[6], tt_256[6], r[4];
2633
2634 assert(!(w % 32));
2635
2636 x = 0;
2637 do {
2638 const uint8_t *s = src_ptr + x;
2639 uint8_t *d = dst + x;
2640
2641 s_256[0] = _mm256_loadu_si256((__m256i *)(s + 0 * src_stride));
2642 s_256[1] = _mm256_loadu_si256((__m256i *)(s + 1 * src_stride));
2643 s_256[2] = _mm256_loadu_si256((__m256i *)(s + 2 * src_stride));
2644 s_256[3] = _mm256_loadu_si256((__m256i *)(s + 3 * src_stride));
2645 s_256[4] = _mm256_loadu_si256((__m256i *)(s + 4 * src_stride));
2646
2647 ss_256[0] = _mm256_unpacklo_epi8(s_256[0], s_256[1]);
2648 ss_256[1] = _mm256_unpacklo_epi8(s_256[2], s_256[3]);
2649 ss_256[3] = _mm256_unpackhi_epi8(s_256[0], s_256[1]);
2650 ss_256[4] = _mm256_unpackhi_epi8(s_256[2], s_256[3]);
2651
2652 tt_256[0] = _mm256_unpacklo_epi8(s_256[1], s_256[2]);
2653 tt_256[1] = _mm256_unpacklo_epi8(s_256[3], s_256[4]);
2654 tt_256[3] = _mm256_unpackhi_epi8(s_256[1], s_256[2]);
2655 tt_256[4] = _mm256_unpackhi_epi8(s_256[3], s_256[4]);
2656
2657 y = h;
2658 do {
2659 s += 2 * src_stride;
2660 y_convolve_6tap_32x2_avx2(s, src_stride, coeffs_256, s_256, ss_256,
2661 tt_256, r);
2662 sr_y_round_store_32x2_avx2(r, d, dst_stride);
2663
2664 ss_256[0] = ss_256[1];
2665 ss_256[1] = ss_256[2];
2666 ss_256[3] = ss_256[4];
2667 ss_256[4] = ss_256[5];
2668
2669 tt_256[0] = tt_256[1];
2670 tt_256[1] = tt_256[2];
2671 tt_256[3] = tt_256[4];
2672 tt_256[4] = tt_256[5];
2673 d += 2 * dst_stride;
2674 y -= 2;
2675 } while (y);
2676
2677 x += 32;
2678 } while (x < w);
2679 }
2680 }
2681 } else if (vert_tap == 8) {
2682 // vert_filt as 8 tap
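// The full 8-tap window spans rows y - 3 .. y + 4, hence the source
// pointer starts three rows above the output row.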
2683 const uint8_t *src_ptr = src - 3 * src_stride;
2684
2685 if (w <= 4) {
2686 prepare_half_coeffs_8tap_ssse3(filter_params_y, subpel_y_q4, coeffs_128);
2687
2688 y = h;
2689
2690 if (w == 2) {
2691 __m128i s_16[8], ss_128[4];
2692
2693 s_16[0] = _mm_cvtsi32_si128(loadu_int16(src_ptr + 0 * src_stride));
2694 s_16[1] = _mm_cvtsi32_si128(loadu_int16(src_ptr + 1 * src_stride));
2695 s_16[2] = _mm_cvtsi32_si128(loadu_int16(src_ptr + 2 * src_stride));
2696 s_16[3] = _mm_cvtsi32_si128(loadu_int16(src_ptr + 3 * src_stride));
2697 s_16[4] = _mm_cvtsi32_si128(loadu_int16(src_ptr + 4 * src_stride));
2698 s_16[5] = _mm_cvtsi32_si128(loadu_int16(src_ptr + 5 * src_stride));
2699 s_16[6] = _mm_cvtsi32_si128(loadu_int16(src_ptr + 6 * src_stride));
2700
2701 const __m128i src01 = _mm_unpacklo_epi16(s_16[0], s_16[1]);
2702 const __m128i src12 = _mm_unpacklo_epi16(s_16[1], s_16[2]);
2703 const __m128i src23 = _mm_unpacklo_epi16(s_16[2], s_16[3]);
2704 const __m128i src34 = _mm_unpacklo_epi16(s_16[3], s_16[4]);
2705 const __m128i src45 = _mm_unpacklo_epi16(s_16[4], s_16[5]);
2706 const __m128i src56 = _mm_unpacklo_epi16(s_16[5], s_16[6]);
2707
2708 ss_128[0] = _mm_unpacklo_epi8(src01, src12);
2709 ss_128[1] = _mm_unpacklo_epi8(src23, src34);
2710 ss_128[2] = _mm_unpacklo_epi8(src45, src56);
2711
2712 do {
2713 const __m128i res = y_convolve_8tap_2x2_ssse3(
2714 src_ptr, src_stride, coeffs_128, s_16, ss_128);
2715 const __m128i r = sr_y_round_sse2(res);
2716 pack_store_2x2_sse2(r, dst, dst_stride);
2717 ss_128[0] = ss_128[1];
2718 ss_128[1] = ss_128[2];
2719 ss_128[2] = ss_128[3];
2720 src_ptr += 2 * src_stride;
2721 dst += 2 * dst_stride;
2722 y -= 2;
2723 } while (y);
2724 } else {
2725 __m128i s_32[8], ss_128[4];
2726
2727 assert(w == 4);
2728
2729 s_32[0] = _mm_cvtsi32_si128(loadu_int32(src_ptr + 0 * src_stride));
2730 s_32[1] = _mm_cvtsi32_si128(loadu_int32(src_ptr + 1 * src_stride));
2731 s_32[2] = _mm_cvtsi32_si128(loadu_int32(src_ptr + 2 * src_stride));
2732 s_32[3] = _mm_cvtsi32_si128(loadu_int32(src_ptr + 3 * src_stride));
2733 s_32[4] = _mm_cvtsi32_si128(loadu_int32(src_ptr + 4 * src_stride));
2734 s_32[5] = _mm_cvtsi32_si128(loadu_int32(src_ptr + 5 * src_stride));
2735 s_32[6] = _mm_cvtsi32_si128(loadu_int32(src_ptr + 6 * src_stride));
2736
2737 const __m128i src01 = _mm_unpacklo_epi32(s_32[0], s_32[1]);
2738 const __m128i src12 = _mm_unpacklo_epi32(s_32[1], s_32[2]);
2739 const __m128i src23 = _mm_unpacklo_epi32(s_32[2], s_32[3]);
2740 const __m128i src34 = _mm_unpacklo_epi32(s_32[3], s_32[4]);
2741 const __m128i src45 = _mm_unpacklo_epi32(s_32[4], s_32[5]);
2742 const __m128i src56 = _mm_unpacklo_epi32(s_32[5], s_32[6]);
2743
2744 ss_128[0] = _mm_unpacklo_epi8(src01, src12);
2745 ss_128[1] = _mm_unpacklo_epi8(src23, src34);
2746 ss_128[2] = _mm_unpacklo_epi8(src45, src56);
2747
2748 do {
2749 const __m128i res = y_convolve_8tap_4x2_ssse3(
2750 src_ptr, src_stride, coeffs_128, s_32, ss_128);
2751 const __m128i r = sr_y_round_sse2(res);
2752 pack_store_4x2_sse2(r, dst, dst_stride);
2753 ss_128[0] = ss_128[1];
2754 ss_128[1] = ss_128[2];
2755 ss_128[2] = ss_128[3];
2756 src_ptr += 2 * src_stride;
2757 dst += 2 * dst_stride;
2758 y -= 2;
2759 } while (y);
2760 }
2761 } else {
2762 prepare_half_coeffs_8tap_avx2(filter_params_y, subpel_y_q4, coeffs_256);
2763
2764 if (w == 8) {
2765 __m128i s_64[8];
2766 __m256i ss_256[4];
2767
2768 s_64[0] = _mm_loadl_epi64((__m128i *)(src_ptr + 0 * src_stride));
2769 s_64[1] = _mm_loadl_epi64((__m128i *)(src_ptr + 1 * src_stride));
2770 s_64[2] = _mm_loadl_epi64((__m128i *)(src_ptr + 2 * src_stride));
2771 s_64[3] = _mm_loadl_epi64((__m128i *)(src_ptr + 3 * src_stride));
2772 s_64[4] = _mm_loadl_epi64((__m128i *)(src_ptr + 4 * src_stride));
2773 s_64[5] = _mm_loadl_epi64((__m128i *)(src_ptr + 5 * src_stride));
2774 s_64[6] = _mm_loadl_epi64((__m128i *)(src_ptr + 6 * src_stride));
2775
2776 // Load lines a and b. Line a to lower 128, line b to upper 128
2777 const __m256i src01 = _mm256_setr_m128i(s_64[0], s_64[1]);
2778 const __m256i src12 = _mm256_setr_m128i(s_64[1], s_64[2]);
2779 const __m256i src23 = _mm256_setr_m128i(s_64[2], s_64[3]);
2780 const __m256i src34 = _mm256_setr_m128i(s_64[3], s_64[4]);
2781 const __m256i src45 = _mm256_setr_m128i(s_64[4], s_64[5]);
2782 const __m256i src56 = _mm256_setr_m128i(s_64[5], s_64[6]);
2783
2784 ss_256[0] = _mm256_unpacklo_epi8(src01, src12);
2785 ss_256[1] = _mm256_unpacklo_epi8(src23, src34);
2786 ss_256[2] = _mm256_unpacklo_epi8(src45, src56);
2787
2788 y = h;
2789 do {
2790 const __m256i res = y_convolve_8tap_8x2_avx2(
2791 src_ptr, src_stride, coeffs_256, s_64, ss_256);
2792 sr_y_round_store_8x2_avx2(res, dst, dst_stride);
2793 ss_256[0] = ss_256[1];
2794 ss_256[1] = ss_256[2];
2795 ss_256[2] = ss_256[3];
2796 src_ptr += 2 * src_stride;
2797 dst += 2 * dst_stride;
2798 y -= 2;
2799 } while (y);
2800 } else if (w == 16) {
2801 __m128i s_128[8];
2802 __m256i ss_256[8], r[2];
2803
2804 s_128[0] = _mm_loadu_si128((__m128i *)(src_ptr + 0 * src_stride));
2805 s_128[1] = _mm_loadu_si128((__m128i *)(src_ptr + 1 * src_stride));
2806 s_128[2] = _mm_loadu_si128((__m128i *)(src_ptr + 2 * src_stride));
2807 s_128[3] = _mm_loadu_si128((__m128i *)(src_ptr + 3 * src_stride));
2808 s_128[4] = _mm_loadu_si128((__m128i *)(src_ptr + 4 * src_stride));
2809 s_128[5] = _mm_loadu_si128((__m128i *)(src_ptr + 5 * src_stride));
2810 s_128[6] = _mm_loadu_si128((__m128i *)(src_ptr + 6 * src_stride));
2811
2812 // Load lines a and b. Line a to lower 128, line b to upper 128
2813 const __m256i src01 = _mm256_setr_m128i(s_128[0], s_128[1]);
2814 const __m256i src12 = _mm256_setr_m128i(s_128[1], s_128[2]);
2815 const __m256i src23 = _mm256_setr_m128i(s_128[2], s_128[3]);
2816 const __m256i src34 = _mm256_setr_m128i(s_128[3], s_128[4]);
2817 const __m256i src45 = _mm256_setr_m128i(s_128[4], s_128[5]);
2818 const __m256i src56 = _mm256_setr_m128i(s_128[5], s_128[6]);
2819
2820 ss_256[0] = _mm256_unpacklo_epi8(src01, src12);
2821 ss_256[1] = _mm256_unpacklo_epi8(src23, src34);
2822 ss_256[2] = _mm256_unpacklo_epi8(src45, src56);
2823
2824 ss_256[4] = _mm256_unpackhi_epi8(src01, src12);
2825 ss_256[5] = _mm256_unpackhi_epi8(src23, src34);
2826 ss_256[6] = _mm256_unpackhi_epi8(src45, src56);
2827
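// ss_256[0..2] carry the interleaved low halves of the 16-pixel rows and
// ss_256[4..6] the high halves; slots 3 and 7 are refilled by the convolve
// kernel with the newly loaded rows each iteration (hence the shifts below).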
2828 y = h;
2829 do {
2830 y_convolve_8tap_16x2_avx2(src_ptr, src_stride, coeffs_256, s_128,
2831 ss_256, r);
2832 sr_y_round_store_16x2_avx2(r, dst, dst_stride);
2833
2834 ss_256[0] = ss_256[1];
2835 ss_256[1] = ss_256[2];
2836 ss_256[2] = ss_256[3];
2837
2838 ss_256[4] = ss_256[5];
2839 ss_256[5] = ss_256[6];
2840 ss_256[6] = ss_256[7];
2841 src_ptr += 2 * src_stride;
2842 dst += 2 * dst_stride;
2843 y -= 2;
2844 } while (y);
2845 } else {
2846 __m256i s_256[8], ss_256[8], tt_256[8], r[4];
2847
2848 assert(!(w % 32));
2849
2850 x = 0;
2851 do {
2852 const uint8_t *s = src_ptr + x;
2853 uint8_t *d = dst + x;
2854
2855 s_256[0] = _mm256_loadu_si256((__m256i *)(s + 0 * src_stride));
2856 s_256[1] = _mm256_loadu_si256((__m256i *)(s + 1 * src_stride));
2857 s_256[2] = _mm256_loadu_si256((__m256i *)(s + 2 * src_stride));
2858 s_256[3] = _mm256_loadu_si256((__m256i *)(s + 3 * src_stride));
2859 s_256[4] = _mm256_loadu_si256((__m256i *)(s + 4 * src_stride));
2860 s_256[5] = _mm256_loadu_si256((__m256i *)(s + 5 * src_stride));
2861 s_256[6] = _mm256_loadu_si256((__m256i *)(s + 6 * src_stride));
2862
2863 ss_256[0] = _mm256_unpacklo_epi8(s_256[0], s_256[1]);
2864 ss_256[1] = _mm256_unpacklo_epi8(s_256[2], s_256[3]);
2865 ss_256[2] = _mm256_unpacklo_epi8(s_256[4], s_256[5]);
2866 ss_256[4] = _mm256_unpackhi_epi8(s_256[0], s_256[1]);
2867 ss_256[5] = _mm256_unpackhi_epi8(s_256[2], s_256[3]);
2868 ss_256[6] = _mm256_unpackhi_epi8(s_256[4], s_256[5]);
2869
2870 tt_256[0] = _mm256_unpacklo_epi8(s_256[1], s_256[2]);
2871 tt_256[1] = _mm256_unpacklo_epi8(s_256[3], s_256[4]);
2872 tt_256[2] = _mm256_unpacklo_epi8(s_256[5], s_256[6]);
2873 tt_256[4] = _mm256_unpackhi_epi8(s_256[1], s_256[2]);
2874 tt_256[5] = _mm256_unpackhi_epi8(s_256[3], s_256[4]);
2875 tt_256[6] = _mm256_unpackhi_epi8(s_256[5], s_256[6]);
2876
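// As in the 16-pixel case, low halves live in slots 0..2 and high halves in
// slots 4..6; ss_256 feeds the first output row of each pair and tt_256 the
// second.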
2877 y = h;
2878 do {
2879 y_convolve_8tap_32x2_avx2(s, src_stride, coeffs_256, s_256, ss_256,
2880 tt_256, r);
2881 sr_y_round_store_32x2_avx2(r, d, dst_stride);
2882
2883 ss_256[0] = ss_256[1];
2884 ss_256[1] = ss_256[2];
2885 ss_256[2] = ss_256[3];
2886 ss_256[4] = ss_256[5];
2887 ss_256[5] = ss_256[6];
2888 ss_256[6] = ss_256[7];
2889
2890 tt_256[0] = tt_256[1];
2891 tt_256[1] = tt_256[2];
2892 tt_256[2] = tt_256[3];
2893 tt_256[4] = tt_256[5];
2894 tt_256[5] = tt_256[6];
2895 tt_256[6] = tt_256[7];
2896 s += 2 * src_stride;
2897 d += 2 * dst_stride;
2898 y -= 2;
2899 } while (y);
2900
2901 x += 32;
2902 } while (x < w);
2903 }
2904 }
2905 }
2906 }
2907
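// The helpers below filter one 32-pixel row horizontally and store the
// rounded 8-bit result; wider block sizes call them once per 32-pixel
// segment.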
sr_x_2tap_32_avx2(const uint8_t * const src,const __m256i coeffs[1],uint8_t * const dst)2908 static inline void sr_x_2tap_32_avx2(const uint8_t *const src,
2909 const __m256i coeffs[1],
2910 uint8_t *const dst) {
2911 __m256i r[2];
2912
2913 x_convolve_2tap_32_avx2(src, coeffs, r);
2914 sr_x_round_store_32_avx2(r, dst);
2915 }
2916
sr_x_6tap_32_avx2(const uint8_t * const src,const __m256i coeffs[3],const __m256i filt[3],uint8_t * const dst)2917 static inline void sr_x_6tap_32_avx2(const uint8_t *const src,
2918 const __m256i coeffs[3],
2919 const __m256i filt[3],
2920 uint8_t *const dst) {
2921 __m256i r[2];
2922
2923 x_convolve_6tap_32_avx2(src, coeffs, filt, r);
2924 sr_x_round_store_32_avx2(r, dst);
2925 }
2926
sr_x_8tap_32_avx2(const uint8_t * const src,const __m256i coeffs[4],const __m256i filt[4],uint8_t * const dst)2927 static AOM_FORCE_INLINE void sr_x_8tap_32_avx2(const uint8_t *const src,
2928 const __m256i coeffs[4],
2929 const __m256i filt[4],
2930 uint8_t *const dst) {
2931 __m256i r[2];
2932
2933 x_convolve_8tap_32_avx2(src, coeffs, filt, r);
2934 sr_x_round_store_32_avx2(r, dst);
2935 }
2936
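// A hypothetical call for an 8-bit horizontal-only convolution might look
// like:
//   av1_convolve_x_sr_specialized_avx2(src, src_stride, dst, dst_stride, w,
//                                      h, filter_params_x, subpel_x_q4,
//                                      &conv_params);
// where conv_params.round_0 == 3, as asserted below.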
av1_convolve_x_sr_specialized_avx2(const uint8_t * src,int32_t src_stride,uint8_t * dst,int32_t dst_stride,int32_t w,int32_t h,const InterpFilterParams * filter_params_x,const int32_t subpel_x_q4,ConvolveParams * conv_params)2937 static AOM_FORCE_INLINE void av1_convolve_x_sr_specialized_avx2(
2938 const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride,
2939 int32_t w, int32_t h, const InterpFilterParams *filter_params_x,
2940 const int32_t subpel_x_q4, ConvolveParams *conv_params) {
2941 int32_t y = h;
2942 __m128i coeffs_128[4];
2943 __m256i coeffs_256[4];
2944
2945 assert(conv_params->round_0 == 3);
2946 assert((FILTER_BITS - conv_params->round_1) >= 0 ||
2947 ((conv_params->round_0 + conv_params->round_1) == 2 * FILTER_BITS));
2948 (void)conv_params;
2949
2950 const int horz_tap = get_filter_tap(filter_params_x, subpel_x_q4);
2951
2952 if (horz_tap == 2) {
2953 // horz_filt as 2 tap
2954 const uint8_t *src_ptr = src;
2955
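// subpel_x_q4 == 8 is the half-pel position, where the two bilinear taps
// are equal; the else branch below therefore replaces the filter with a
// plain pixel average.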
2956 if (subpel_x_q4 != 8) {
2957 if (w <= 8) {
2958 prepare_half_coeffs_2tap_ssse3(filter_params_x, subpel_x_q4,
2959 coeffs_128);
2960
2961 if (w == 2) {
2962 do {
2963 const __m128i res =
2964 x_convolve_2tap_2x2_sse4_1(src_ptr, src_stride, coeffs_128);
2965 const __m128i r = sr_x_round_sse2(res);
2966 pack_store_2x2_sse2(r, dst, dst_stride);
2967 src_ptr += 2 * src_stride;
2968 dst += 2 * dst_stride;
2969 y -= 2;
2970 } while (y);
2971 } else if (w == 4) {
2972 do {
2973 const __m128i res =
2974 x_convolve_2tap_4x2_ssse3(src_ptr, src_stride, coeffs_128);
2975 const __m128i r = sr_x_round_sse2(res);
2976 pack_store_4x2_sse2(r, dst, dst_stride);
2977 src_ptr += 2 * src_stride;
2978 dst += 2 * dst_stride;
2979 y -= 2;
2980 } while (y);
2981 } else {
2982 assert(w == 8);
2983
2984 do {
2985 __m128i res[2];
2986
2987 x_convolve_2tap_8x2_ssse3(src_ptr, src_stride, coeffs_128, res);
2988 res[0] = sr_x_round_sse2(res[0]);
2989 res[1] = sr_x_round_sse2(res[1]);
2990 const __m128i d = _mm_packus_epi16(res[0], res[1]);
2991 _mm_storel_epi64((__m128i *)dst, d);
2992 _mm_storeh_epi64((__m128i *)(dst + dst_stride), d);
2993
2994 src_ptr += 2 * src_stride;
2995 dst += 2 * dst_stride;
2996 y -= 2;
2997 } while (y);
2998 }
2999 } else {
3000 prepare_half_coeffs_2tap_avx2(filter_params_x, subpel_x_q4, coeffs_256);
3001
3002 if (w == 16) {
3003 do {
3004 __m256i r[2];
3005
3006 x_convolve_2tap_16x2_avx2(src_ptr, src_stride, coeffs_256, r);
3007 sr_x_round_store_16x2_avx2(r, dst, dst_stride);
3008 src_ptr += 2 * src_stride;
3009 dst += 2 * dst_stride;
3010 y -= 2;
3011 } while (y);
3012 } else if (w == 32) {
3013 do {
3014 sr_x_2tap_32_avx2(src_ptr, coeffs_256, dst);
3015 src_ptr += src_stride;
3016 dst += dst_stride;
3017 } while (--y);
3018 } else if (w == 64) {
3019 do {
3020 sr_x_2tap_32_avx2(src_ptr + 0 * 32, coeffs_256, dst + 0 * 32);
3021 sr_x_2tap_32_avx2(src_ptr + 1 * 32, coeffs_256, dst + 1 * 32);
3022 src_ptr += src_stride;
3023 dst += dst_stride;
3024 } while (--y);
3025 } else {
3026 assert(w == 128);
3027
3028 do {
3029 sr_x_2tap_32_avx2(src_ptr + 0 * 32, coeffs_256, dst + 0 * 32);
3030 sr_x_2tap_32_avx2(src_ptr + 1 * 32, coeffs_256, dst + 1 * 32);
3031 sr_x_2tap_32_avx2(src_ptr + 2 * 32, coeffs_256, dst + 2 * 32);
3032 sr_x_2tap_32_avx2(src_ptr + 3 * 32, coeffs_256, dst + 3 * 32);
3033 src_ptr += src_stride;
3034 dst += dst_stride;
3035 } while (--y);
3036 }
3037 }
3038 } else {
3039 // Average adjacent pixels to get the half-pel result.
3040 if (w == 2) {
3041 do {
3042 __m128i s_128;
3043
3044 s_128 = load_u8_4x2_sse4_1(src_ptr, src_stride);
3045 const __m128i s1 = _mm_srli_si128(s_128, 1);
3046 const __m128i d = _mm_avg_epu8(s_128, s1);
3047 *(uint16_t *)dst = (uint16_t)_mm_cvtsi128_si32(d);
3048 *(uint16_t *)(dst + dst_stride) = (uint16_t)_mm_extract_epi16(d, 2);
3049
3050 src_ptr += 2 * src_stride;
3051 dst += 2 * dst_stride;
3052 y -= 2;
3053 } while (y);
3054 } else if (w == 4) {
3055 do {
3056 __m128i s_128;
3057
3058 s_128 = load_u8_8x2_sse2(src_ptr, src_stride);
3059 const __m128i s1 = _mm_srli_si128(s_128, 1);
3060 const __m128i d = _mm_avg_epu8(s_128, s1);
3061 xx_storel_32(dst, d);
3062 *(int32_t *)(dst + dst_stride) = _mm_extract_epi32(d, 2);
3063
3064 src_ptr += 2 * src_stride;
3065 dst += 2 * dst_stride;
3066 y -= 2;
3067 } while (y);
3068 } else if (w == 8) {
3069 do {
3070 const __m128i s00 = _mm_loadu_si128((__m128i *)src_ptr);
3071 const __m128i s10 =
3072 _mm_loadu_si128((__m128i *)(src_ptr + src_stride));
3073 const __m128i s01 = _mm_srli_si128(s00, 1);
3074 const __m128i s11 = _mm_srli_si128(s10, 1);
3075 const __m128i d0 = _mm_avg_epu8(s00, s01);
3076 const __m128i d1 = _mm_avg_epu8(s10, s11);
3077 _mm_storel_epi64((__m128i *)dst, d0);
3078 _mm_storel_epi64((__m128i *)(dst + dst_stride), d1);
3079
3080 src_ptr += 2 * src_stride;
3081 dst += 2 * dst_stride;
3082 y -= 2;
3083 } while (y);
3084 } else if (w == 16) {
3085 do {
3086 const __m128i s00 = _mm_loadu_si128((__m128i *)src_ptr);
3087 const __m128i s01 = _mm_loadu_si128((__m128i *)(src_ptr + 1));
3088 const __m128i s10 =
3089 _mm_loadu_si128((__m128i *)(src_ptr + src_stride));
3090 const __m128i s11 =
3091 _mm_loadu_si128((__m128i *)(src_ptr + src_stride + 1));
3092 const __m128i d0 = _mm_avg_epu8(s00, s01);
3093 const __m128i d1 = _mm_avg_epu8(s10, s11);
3094 _mm_storeu_si128((__m128i *)dst, d0);
3095 _mm_storeu_si128((__m128i *)(dst + dst_stride), d1);
3096
3097 src_ptr += 2 * src_stride;
3098 dst += 2 * dst_stride;
3099 y -= 2;
3100 } while (y);
3101 } else if (w == 32) {
3102 do {
3103 sr_x_2tap_32_avg_avx2(src_ptr, dst);
3104 src_ptr += src_stride;
3105 dst += dst_stride;
3106 } while (--y);
3107 } else if (w == 64) {
3108 do {
3109 sr_x_2tap_32_avg_avx2(src_ptr + 0 * 32, dst + 0 * 32);
3110 sr_x_2tap_32_avg_avx2(src_ptr + 1 * 32, dst + 1 * 32);
3111 src_ptr += src_stride;
3112 dst += dst_stride;
3113 } while (--y);
3114 } else {
3115 assert(w == 128);
3116
3117 do {
3118 sr_x_2tap_32_avg_avx2(src_ptr + 0 * 32, dst + 0 * 32);
3119 sr_x_2tap_32_avg_avx2(src_ptr + 1 * 32, dst + 1 * 32);
3120 sr_x_2tap_32_avg_avx2(src_ptr + 2 * 32, dst + 2 * 32);
3121 sr_x_2tap_32_avg_avx2(src_ptr + 3 * 32, dst + 3 * 32);
3122 src_ptr += src_stride;
3123 dst += dst_stride;
3124 } while (--y);
3125 }
3126 }
3127 } else if (horz_tap == 4) {
3128 // horz_filt as 4 tap
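// The 4-tap path uses taps 2..5 of the 8-tap kernel, so the window starts
// one pixel to the left of the output position.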
3129 const uint8_t *src_ptr = src - 1;
3130
3131 prepare_half_coeffs_4tap_ssse3(filter_params_x, subpel_x_q4, coeffs_128);
3132
3133 if (w == 2) {
3134 do {
3135 const __m128i res =
3136 x_convolve_4tap_2x2_ssse3(src_ptr, src_stride, coeffs_128);
3137 const __m128i r = sr_x_round_sse2(res);
3138 pack_store_2x2_sse2(r, dst, dst_stride);
3139 src_ptr += 2 * src_stride;
3140 dst += 2 * dst_stride;
3141 y -= 2;
3142 } while (y);
3143 } else if (w == 4) {
3144 do {
3145 const __m128i res =
3146 x_convolve_4tap_4x2_ssse3(src_ptr, src_stride, coeffs_128);
3147 const __m128i r = sr_x_round_sse2(res);
3148 pack_store_4x2_sse2(r, dst, dst_stride);
3149 src_ptr += 2 * src_stride;
3150 dst += 2 * dst_stride;
3151 y -= 2;
3152 } while (y);
3153 } else if (w == 8) {
3154 // TODO([email protected]): Reuse the old SIMD code here. Need to
3155 // rewrite this for better performance later.
3156 __m256i filt_256[2];
3157 prepare_coeffs_lowbd(filter_params_x, subpel_x_q4, coeffs_256);
3158
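// filt1/filt2_global_avx2 are byte-shuffle masks that gather the adjacent
// pixel pairs consumed by each multiply-add, matching the paired
// coefficients produced by prepare_coeffs_lowbd().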
3159 filt_256[0] = _mm256_loadu_si256((__m256i const *)filt1_global_avx2);
3160 filt_256[1] = _mm256_loadu_si256((__m256i const *)filt2_global_avx2);
3161 for (int i = 0; i < h; i += 2) {
3162 const __m256i data = _mm256_permute2x128_si256(
3163 _mm256_castsi128_si256(
3164 _mm_loadu_si128((__m128i *)(&src_ptr[i * src_stride]))),
3165 _mm256_castsi128_si256(_mm_loadu_si128(
3166 (__m128i *)(&src_ptr[i * src_stride + src_stride]))),
3167 0x20);
3168
3169 __m256i res_16b = convolve_lowbd_x_4tap(data, coeffs_256 + 1, filt_256);
3170 res_16b = sr_x_round_avx2(res_16b);
3171
3172 __m256i res_8b = _mm256_packus_epi16(res_16b, res_16b);
3173
3174 const __m128i res_0 = _mm256_castsi256_si128(res_8b);
3175 const __m128i res_1 = _mm256_extracti128_si256(res_8b, 1);
3176
3177 _mm_storel_epi64((__m128i *)&dst[i * dst_stride], res_0);
3178 _mm_storel_epi64((__m128i *)&dst[i * dst_stride + dst_stride], res_1);
3179 }
3180 } else {
3181 assert(!(w % 16));
3182 // TODO([email protected]): Reuse the old SIMD code here. Need to
3183 // rewrite this for better performance later.
3184 __m256i filt_256[2];
3185 prepare_coeffs_lowbd(filter_params_x, subpel_x_q4, coeffs_256);
3186 filt_256[0] = _mm256_loadu_si256((__m256i const *)filt1_global_avx2);
3187 filt_256[1] = _mm256_loadu_si256((__m256i const *)filt2_global_avx2);
3188
3189 for (int i = 0; i < h; ++i) {
3190 for (int j = 0; j < w; j += 16) {
3191 // 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 8 9 10 11 12 13 14 15 16 17
3192 // 18 19 20 21 22 23
3193 const __m256i data = _mm256_inserti128_si256(
3194 _mm256_loadu_si256((__m256i *)&src_ptr[(i * src_stride) + j]),
3195 _mm_loadu_si128((__m128i *)&src_ptr[(i * src_stride) + (j + 8)]),
3196 1);
3197
3198 __m256i res_16b =
3199 convolve_lowbd_x_4tap(data, coeffs_256 + 1, filt_256);
3200 res_16b = sr_x_round_avx2(res_16b);
3201
3202 // Rounding was applied above; pack to 8 bits with unsigned saturation.
3204 __m256i res_8b = _mm256_packus_epi16(res_16b, res_16b);
3205
3206 // Store values into the destination buffer
3207 // 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
3208 res_8b = _mm256_permute4x64_epi64(res_8b, 216);
3209 __m128i res = _mm256_castsi256_si128(res_8b);
3210 _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j], res);
3211 }
3212 }
3213 }
3214 } else {
3215 __m256i filt_256[4];
3216
3217 filt_256[0] = _mm256_loadu_si256((__m256i const *)filt1_global_avx2);
3218 filt_256[1] = _mm256_loadu_si256((__m256i const *)filt2_global_avx2);
3219 filt_256[2] = _mm256_loadu_si256((__m256i const *)filt3_global_avx2);
3220
3221 if (horz_tap == 6) {
3222 // horz_filt as 6 tap
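// Taps 1..6 of the 8-tap kernel are used, so the window starts two pixels
// to the left of the output position.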
3223 const uint8_t *src_ptr = src - 2;
3224
3225 prepare_half_coeffs_6tap_avx2(filter_params_x, subpel_x_q4, coeffs_256);
3226
3227 if (w == 8) {
3228 do {
3229 const __m256i res = x_convolve_6tap_8x2_avx2(src_ptr, src_stride,
3230 coeffs_256, filt_256);
3231 sr_x_round_store_8x2_avx2(res, dst, dst_stride);
3232 src_ptr += 2 * src_stride;
3233 dst += 2 * dst_stride;
3234 y -= 2;
3235 } while (y);
3236 } else if (w == 16) {
3237 do {
3238 __m256i r[2];
3239
3240 x_convolve_6tap_16x2_avx2(src_ptr, src_stride, coeffs_256, filt_256,
3241 r);
3242 sr_x_round_store_16x2_avx2(r, dst, dst_stride);
3243 src_ptr += 2 * src_stride;
3244 dst += 2 * dst_stride;
3245 y -= 2;
3246 } while (y);
3247 } else if (w == 32) {
3248 do {
3249 sr_x_6tap_32_avx2(src_ptr, coeffs_256, filt_256, dst);
3250 src_ptr += src_stride;
3251 dst += dst_stride;
3252 } while (--y);
3253 } else if (w == 64) {
3254 do {
3255 sr_x_6tap_32_avx2(src_ptr, coeffs_256, filt_256, dst);
3256 sr_x_6tap_32_avx2(src_ptr + 32, coeffs_256, filt_256, dst + 32);
3257 src_ptr += src_stride;
3258 dst += dst_stride;
3259 } while (--y);
3260 } else {
3261 assert(w == 128);
3262
3263 do {
3264 sr_x_6tap_32_avx2(src_ptr, coeffs_256, filt_256, dst);
3265 sr_x_6tap_32_avx2(src_ptr + 1 * 32, coeffs_256, filt_256,
3266 dst + 1 * 32);
3267 sr_x_6tap_32_avx2(src_ptr + 2 * 32, coeffs_256, filt_256,
3268 dst + 2 * 32);
3269 sr_x_6tap_32_avx2(src_ptr + 3 * 32, coeffs_256, filt_256,
3270 dst + 3 * 32);
3271 src_ptr += src_stride;
3272 dst += dst_stride;
3273 } while (--y);
3274 }
3275 } else if (horz_tap == 8) {
3276 // horz_filt as 8 tap
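// All eight taps are used, so the window spans x - 3 .. x + 4 and a fourth
// shuffle mask (filt4) is loaded below.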
3277 const uint8_t *src_ptr = src - 3;
3278
3279 filt_256[3] = _mm256_loadu_si256((__m256i const *)filt4_global_avx2);
3280
3281 prepare_half_coeffs_8tap_avx2(filter_params_x, subpel_x_q4, coeffs_256);
3282
3283 if (w == 8) {
3284 do {
3285 const __m256i res = x_convolve_8tap_8x2_avx2(src_ptr, src_stride,
3286 coeffs_256, filt_256);
3287 sr_x_round_store_8x2_avx2(res, dst, dst_stride);
3288 src_ptr += 2 * src_stride;
3289 dst += 2 * dst_stride;
3290 y -= 2;
3291 } while (y);
3292 } else if (w == 16) {
3293 do {
3294 __m256i r[2];
3295
3296 x_convolve_8tap_16x2_avx2(src_ptr, src_stride, coeffs_256, filt_256,
3297 r);
3298 sr_x_round_store_16x2_avx2(r, dst, dst_stride);
3299 src_ptr += 2 * src_stride;
3300 dst += 2 * dst_stride;
3301 y -= 2;
3302 } while (y);
3303 } else if (w == 32) {
3304 do {
3305 sr_x_8tap_32_avx2(src_ptr, coeffs_256, filt_256, dst);
3306 src_ptr += src_stride;
3307 dst += dst_stride;
3308 } while (--y);
3309 } else if (w == 64) {
3310 do {
3311 sr_x_8tap_32_avx2(src_ptr, coeffs_256, filt_256, dst);
3312 sr_x_8tap_32_avx2(src_ptr + 32, coeffs_256, filt_256, dst + 32);
3313 src_ptr += src_stride;
3314 dst += dst_stride;
3315 } while (--y);
3316 } else {
3317 assert(w == 128);
3318
3319 do {
3320 sr_x_8tap_32_avx2(src_ptr, coeffs_256, filt_256, dst);
3321 sr_x_8tap_32_avx2(src_ptr + 1 * 32, coeffs_256, filt_256,
3322 dst + 1 * 32);
3323 sr_x_8tap_32_avx2(src_ptr + 2 * 32, coeffs_256, filt_256,
3324 dst + 2 * 32);
3325 sr_x_8tap_32_avx2(src_ptr + 3 * 32, coeffs_256, filt_256,
3326 dst + 3 * 32);
3327 src_ptr += src_stride;
3328 dst += dst_stride;
3329 } while (--y);
3330 }
3331 }
3332 }
3333 }
3334
3335 #endif // THIRD_PARTY_SVT_AV1_CONVOLVE_AVX2_H_
3336