/*
 * Copyright (c) 2017, Alliance for Open Media. All rights reserved
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */

#ifndef THIRD_PARTY_SVT_AV1_CONVOLVE_2D_AVX2_H_
#define THIRD_PARTY_SVT_AV1_CONVOLVE_2D_AVX2_H_

#include "convolve_avx2.h"
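
// This header implements the specialized 2D (separable) convolution path: a
// horizontal pass filters source rows into the 16-bit intermediate buffer
// im_block, and a vertical pass filters im_block down to the 8-bit
// destination. Each pass is specialized by filter tap count (2/4/6/8) and by
// block width, processing two output rows per iteration for narrow blocks.

// Horizontal 2-tap pass: SSE4.1/SSSE3 kernels handle w <= 8, AVX2 kernels
// handle w >= 16, and widths of 32 and up are filtered in 32-pixel chunks.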
static void convolve_2d_sr_hor_2tap_avx2(
    const uint8_t *const src, const int32_t src_stride, const int32_t w,
    const int32_t h, const InterpFilterParams *const filter_params_x,
    const int32_t subpel_x_q4, int16_t *const im_block) {
  const uint8_t *src_ptr = src;
  int32_t y = h;
  int16_t *im = im_block;

  if (w <= 8) {
    __m128i coeffs_128;

    prepare_half_coeffs_2tap_ssse3(filter_params_x, subpel_x_q4, &coeffs_128);

    if (w == 2) {
      do {
        const __m128i r =
            x_convolve_2tap_2x2_sse4_1(src_ptr, src_stride, &coeffs_128);
        xy_x_round_store_2x2_sse2(r, im);
        src_ptr += 2 * src_stride;
        im += 2 * 2;
        y -= 2;
      } while (y);
    } else if (w == 4) {
      do {
        const __m128i r =
            x_convolve_2tap_4x2_ssse3(src_ptr, src_stride, &coeffs_128);
        xy_x_round_store_4x2_sse2(r, im);
        src_ptr += 2 * src_stride;
        im += 2 * 4;
        y -= 2;
      } while (y);
    } else {
      assert(w == 8);

      do {
        __m128i r[2];

        x_convolve_2tap_8x2_ssse3(src_ptr, src_stride, &coeffs_128, r);
        xy_x_round_store_8x2_sse2(r, im);
        src_ptr += 2 * src_stride;
        im += 2 * 8;
        y -= 2;
      } while (y);
    }
  } else {
    __m256i coeffs_256;

    prepare_half_coeffs_2tap_avx2(filter_params_x, subpel_x_q4, &coeffs_256);

    if (w == 16) {
      do {
        __m256i r[2];

        x_convolve_2tap_16x2_avx2(src_ptr, src_stride, &coeffs_256, r);
        xy_x_round_store_32_avx2(r, im);
        src_ptr += 2 * src_stride;
        im += 2 * 16;
        y -= 2;
      } while (y);
    } else if (w == 32) {
      do {
        xy_x_2tap_32_avx2(src_ptr, &coeffs_256, im);
        src_ptr += src_stride;
        im += 32;
      } while (--y);
    } else if (w == 64) {
      do {
        xy_x_2tap_32_avx2(src_ptr + 0 * 32, &coeffs_256, im + 0 * 32);
        xy_x_2tap_32_avx2(src_ptr + 1 * 32, &coeffs_256, im + 1 * 32);
        src_ptr += src_stride;
        im += 64;
      } while (--y);
    } else {
      assert(w == 128);

      do {
        xy_x_2tap_32_avx2(src_ptr + 0 * 32, &coeffs_256, im + 0 * 32);
        xy_x_2tap_32_avx2(src_ptr + 1 * 32, &coeffs_256, im + 1 * 32);
        xy_x_2tap_32_avx2(src_ptr + 2 * 32, &coeffs_256, im + 2 * 32);
        xy_x_2tap_32_avx2(src_ptr + 3 * 32, &coeffs_256, im + 3 * 32);
        src_ptr += src_stride;
        im += 128;
      } while (--y);
    }
  }
}
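
// Horizontal 4-tap pass. src is offset by -1 to center the filter window;
// SSSE3 kernels cover w <= 4, and AVX2 kernels using the filt1/filt2 shuffle
// masks cover w >= 8.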
static void convolve_2d_sr_hor_4tap_ssse3(
    const uint8_t *const src, const int32_t src_stride, const int32_t w,
    const int32_t h, const InterpFilterParams *const filter_params_x,
    const int32_t subpel_x_q4, int16_t *const im_block) {
  const uint8_t *src_ptr = src - 1;
  int32_t y = h;
  int16_t *im = im_block;

  if (w <= 4) {
    __m128i coeffs_128[2];

    prepare_half_coeffs_4tap_ssse3(filter_params_x, subpel_x_q4, coeffs_128);
    if (w == 2) {
      do {
        const __m128i r =
            x_convolve_4tap_2x2_ssse3(src_ptr, src_stride, coeffs_128);
        xy_x_round_store_2x2_sse2(r, im);
        src_ptr += 2 * src_stride;
        im += 2 * 2;
        y -= 2;
      } while (y);
    } else if (w == 4) {
      do {
        const __m128i r =
            x_convolve_4tap_4x2_ssse3(src_ptr, src_stride, coeffs_128);
        xy_x_round_store_4x2_sse2(r, im);
        src_ptr += 2 * src_stride;
        im += 2 * 4;
        y -= 2;
      } while (y);
    }
  } else {
    // TODO([email protected]): Add better optimization
    __m256i coeffs_256[2], filt_256[2];

    prepare_half_coeffs_4tap_avx2(filter_params_x, subpel_x_q4, coeffs_256);
    filt_256[0] = _mm256_load_si256((__m256i const *)filt1_global_avx2);
    filt_256[1] = _mm256_load_si256((__m256i const *)filt2_global_avx2);

    if (w == 8) {
      do {
        __m256i res =
            x_convolve_4tap_8x2_avx2(src_ptr, src_stride, coeffs_256, filt_256);
        xy_x_round_store_8x2_avx2(res, im);

        src_ptr += 2 * src_stride;
        im += 2 * 8;
        y -= 2;
      } while (y);
    } else if (w == 16) {
      do {
        __m256i r[2];

        x_convolve_4tap_16x2_avx2(src_ptr, src_stride, coeffs_256, filt_256, r);
        xy_x_round_store_32_avx2(r, im);
        src_ptr += 2 * src_stride;
        im += 2 * 16;
        y -= 2;
      } while (y);
    } else if (w == 32) {
      do {
        xy_x_4tap_32_avx2(src_ptr, coeffs_256, filt_256, im);

        src_ptr += src_stride;
        im += 32;
      } while (--y);
    } else if (w == 64) {
      do {
        xy_x_4tap_32_avx2(src_ptr, coeffs_256, filt_256, im);
        xy_x_4tap_32_avx2(src_ptr + 32, coeffs_256, filt_256, im + 32);
        src_ptr += src_stride;
        im += 64;
      } while (--y);
    } else {
      assert(w == 128);

      do {
        xy_x_4tap_32_avx2(src_ptr, coeffs_256, filt_256, im);
        xy_x_4tap_32_avx2(src_ptr + 32, coeffs_256, filt_256, im + 32);
        xy_x_4tap_32_avx2(src_ptr + 64, coeffs_256, filt_256, im + 64);
        xy_x_4tap_32_avx2(src_ptr + 96, coeffs_256, filt_256, im + 96);
        src_ptr += src_stride;
        im += 128;
      } while (--y);
    }
  }
}
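
// Horizontal 6-tap pass. src is offset by -2 to center the filter window;
// the AVX2 path uses the filt1..filt3 shuffle masks.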
static void convolve_2d_sr_hor_6tap_avx2(
    const uint8_t *const src, const int32_t src_stride, const int32_t w,
    const int32_t h, const InterpFilterParams *const filter_params_x,
    const int32_t subpel_x_q4, int16_t *const im_block) {
  const uint8_t *src_ptr = src - 2;
  int32_t y = h;
  int16_t *im = im_block;

  if (w <= 4) {
    __m128i coeffs_128[3];

    prepare_half_coeffs_6tap_ssse3(filter_params_x, subpel_x_q4, coeffs_128);
    if (w == 2) {
      do {
        const __m128i r =
            x_convolve_6tap_2x2_ssse3(src_ptr, src_stride, coeffs_128);
        xy_x_round_store_2x2_sse2(r, im);
        src_ptr += 2 * src_stride;
        im += 2 * 2;
        y -= 2;
      } while (y);
    } else if (w == 4) {
      do {
        const __m128i r =
            x_convolve_6tap_4x2_ssse3(src_ptr, src_stride, coeffs_128);
        xy_x_round_store_4x2_sse2(r, im);
        src_ptr += 2 * src_stride;
        im += 2 * 4;
        y -= 2;
      } while (y);
    }
  } else {
    __m256i coeffs_256[3], filt_256[3];

    filt_256[0] = _mm256_loadu_si256((__m256i const *)filt1_global_avx2);
    filt_256[1] = _mm256_loadu_si256((__m256i const *)filt2_global_avx2);
    filt_256[2] = _mm256_loadu_si256((__m256i const *)filt3_global_avx2);

    prepare_half_coeffs_6tap_avx2(filter_params_x, subpel_x_q4, coeffs_256);

    if (w == 8) {
      do {
        const __m256i res =
            x_convolve_6tap_8x2_avx2(src_ptr, src_stride, coeffs_256, filt_256);
        xy_x_round_store_8x2_avx2(res, im);

        src_ptr += 2 * src_stride;
        im += 2 * 8;
        y -= 2;
      } while (y);
    } else if (w == 16) {
      do {
        __m256i r[2];

        x_convolve_6tap_16x2_avx2(src_ptr, src_stride, coeffs_256, filt_256, r);
        xy_x_round_store_32_avx2(r, im);
        src_ptr += 2 * src_stride;
        im += 2 * 16;
        y -= 2;
      } while (y);
    } else if (w == 32) {
      do {
        xy_x_6tap_32_avx2(src_ptr, coeffs_256, filt_256, im);
        src_ptr += src_stride;
        im += 32;
      } while (--y);
    } else if (w == 64) {
      do {
        xy_x_6tap_32_avx2(src_ptr, coeffs_256, filt_256, im);
        xy_x_6tap_32_avx2(src_ptr + 32, coeffs_256, filt_256, im + 32);
        src_ptr += src_stride;
        im += 64;
      } while (--y);
    } else {
      assert(w == 128);

      do {
        xy_x_6tap_32_avx2(src_ptr, coeffs_256, filt_256, im);
        xy_x_6tap_32_avx2(src_ptr + 32, coeffs_256, filt_256, im + 32);
        xy_x_6tap_32_avx2(src_ptr + 64, coeffs_256, filt_256, im + 64);
        xy_x_6tap_32_avx2(src_ptr + 96, coeffs_256, filt_256, im + 96);
        src_ptr += src_stride;
        im += 128;
      } while (--y);
    }
  }
}
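
// Horizontal 8-tap pass. src is offset by -3 to center the filter window.
// Only w >= 8 is handled, since AV1 selects shorter filters for smaller
// blocks.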
static void convolve_2d_sr_hor_8tap_avx2(
    const uint8_t *const src, const int32_t src_stride, const int32_t w,
    const int32_t h, const InterpFilterParams *const filter_params_x,
    const int32_t subpel_x_q4, int16_t *const im_block) {
  const uint8_t *src_ptr = src - 3;
  int32_t y = h;
  int16_t *im = im_block;
  __m256i coeffs_256[4], filt_256[4];

  filt_256[0] = _mm256_loadu_si256((__m256i const *)filt1_global_avx2);
  filt_256[1] = _mm256_loadu_si256((__m256i const *)filt2_global_avx2);
  filt_256[2] = _mm256_loadu_si256((__m256i const *)filt3_global_avx2);
  filt_256[3] = _mm256_loadu_si256((__m256i const *)filt4_global_avx2);

  prepare_half_coeffs_8tap_avx2(filter_params_x, subpel_x_q4, coeffs_256);

  if (w == 8) {
    do {
      const __m256i res =
          x_convolve_8tap_8x2_avx2(src_ptr, src_stride, coeffs_256, filt_256);
      xy_x_round_store_8x2_avx2(res, im);
      src_ptr += 2 * src_stride;
      im += 2 * 8;
      y -= 2;
    } while (y);
  } else if (w == 16) {
    do {
      __m256i r[2];

      x_convolve_8tap_16x2_avx2(src_ptr, src_stride, coeffs_256, filt_256, r);
      xy_x_round_store_32_avx2(r, im);
      src_ptr += 2 * src_stride;
      im += 2 * 16;
      y -= 2;
    } while (y);
  } else if (w == 32) {
    do {
      xy_x_8tap_32_avx2(src_ptr, coeffs_256, filt_256, im);
      src_ptr += src_stride;
      im += 32;
    } while (--y);
  } else if (w == 64) {
    do {
      xy_x_8tap_32_avx2(src_ptr, coeffs_256, filt_256, im);
      xy_x_8tap_32_avx2(src_ptr + 32, coeffs_256, filt_256, im + 32);
      src_ptr += src_stride;
      im += 64;
    } while (--y);
  } else {
    assert(w == 128);

    do {
      xy_x_8tap_32_avx2(src_ptr, coeffs_256, filt_256, im);
      xy_x_8tap_32_avx2(src_ptr + 32, coeffs_256, filt_256, im + 32);
      xy_x_8tap_32_avx2(src_ptr + 64, coeffs_256, filt_256, im + 64);
      xy_x_8tap_32_avx2(src_ptr + 96, coeffs_256, filt_256, im + 96);
      src_ptr += src_stride;
      im += 128;
    } while (--y);
  }
}
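
// Vertical 2-tap pass. The previous intermediate row is carried across
// iterations in registers (s_32/s_64/s_128/s_256), so each 2-row iteration
// loads only the new rows before filtering, rounding, and packing to dst.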
static void convolve_2d_sr_ver_2tap_avx2(
    const int16_t *const im_block, const int32_t w, const int32_t h,
    const InterpFilterParams *const filter_params_y, const int32_t subpel_y_q4,
    uint8_t *dst, const int32_t dst_stride) {
  const int16_t *im = im_block;
  int32_t y = h;

  if (w <= 4) {
    __m128i coeffs_128;

    prepare_coeffs_2tap_sse2(filter_params_y, subpel_y_q4, &coeffs_128);

    if (w == 2) {
      __m128i s_32[2];

      s_32[0] = _mm_cvtsi32_si128(*(int32_t *)im);

      do {
        const __m128i res = xy_y_convolve_2tap_2x2_sse2(im, s_32, &coeffs_128);
        xy_y_round_store_2x2_sse2(res, dst, dst_stride);
        im += 2 * 2;
        dst += 2 * dst_stride;
        y -= 2;
      } while (y);
    } else {
      __m128i s_64[2], r[2];

      assert(w == 4);

      s_64[0] = _mm_loadl_epi64((__m128i *)im);

      do {
        xy_y_convolve_2tap_4x2_sse2(im, s_64, &coeffs_128, r);
        r[0] = xy_y_round_sse2(r[0]);
        r[1] = xy_y_round_sse2(r[1]);
        const __m128i rr = _mm_packs_epi32(r[0], r[1]);
        pack_store_4x2_sse2(rr, dst, dst_stride);
        im += 2 * 4;
        dst += 2 * dst_stride;
        y -= 2;
      } while (y);
    }
  } else {
    __m256i coeffs_256;

    prepare_coeffs_2tap_avx2(filter_params_y, subpel_y_q4, &coeffs_256);

    if (w == 8) {
      __m128i s_128[2];
      __m256i r[2];

      s_128[0] = _mm_loadu_si128((__m128i *)im);

      do {
        xy_y_convolve_2tap_8x2_avx2(im, s_128, &coeffs_256, r);
        xy_y_round_store_8x2_avx2(r, dst, dst_stride);
        im += 2 * 8;
        dst += 2 * dst_stride;
        y -= 2;
      } while (y);
    } else if (w == 16) {
      __m256i s_256[2], r[4];

      s_256[0] = _mm256_loadu_si256((__m256i *)im);

      do {
        xy_y_convolve_2tap_16x2_avx2(im, s_256, &coeffs_256, r);
        xy_y_round_store_16x2_avx2(r, dst, dst_stride);
        im += 2 * 16;
        dst += 2 * dst_stride;
        y -= 2;
      } while (y);
    } else if (w == 32) {
      __m256i s_256[2][2];

      s_256[0][0] = _mm256_loadu_si256((__m256i *)(im + 0 * 16));
      s_256[0][1] = _mm256_loadu_si256((__m256i *)(im + 1 * 16));

      do {
        xy_y_convolve_2tap_32_all_avx2(im + 32, s_256[0], s_256[1], &coeffs_256,
                                       dst);
        im += 2 * 32;
        xy_y_convolve_2tap_32_all_avx2(im, s_256[1], s_256[0], &coeffs_256,
                                       dst + dst_stride);
        dst += 2 * dst_stride;
        y -= 2;
      } while (y);
    } else if (w == 64) {
      __m256i s_256[2][4];

      s_256[0][0] = _mm256_loadu_si256((__m256i *)(im + 0 * 16));
      s_256[0][1] = _mm256_loadu_si256((__m256i *)(im + 1 * 16));
      s_256[0][2] = _mm256_loadu_si256((__m256i *)(im + 2 * 16));
      s_256[0][3] = _mm256_loadu_si256((__m256i *)(im + 3 * 16));

      do {
        xy_y_convolve_2tap_32_all_avx2(im + 64, s_256[0] + 0, s_256[1] + 0,
                                       &coeffs_256, dst);
        xy_y_convolve_2tap_32_all_avx2(im + 96, s_256[0] + 2, s_256[1] + 2,
                                       &coeffs_256, dst + 32);
        im += 2 * 64;
        xy_y_convolve_2tap_32_all_avx2(im, s_256[1] + 0, s_256[0] + 0,
                                       &coeffs_256, dst + dst_stride);
        xy_y_convolve_2tap_32_all_avx2(im + 32, s_256[1] + 2, s_256[0] + 2,
                                       &coeffs_256, dst + dst_stride + 32);
        dst += 2 * dst_stride;
        y -= 2;
      } while (y);
    } else {
      __m256i s_256[2][8];

      assert(w == 128);

      load_16bit_8rows_avx2(im, 16, s_256[0]);

      do {
        xy_y_convolve_2tap_32_all_avx2(im + 128, s_256[0] + 0, s_256[1] + 0,
                                       &coeffs_256, dst);
        xy_y_convolve_2tap_32_all_avx2(im + 160, s_256[0] + 2, s_256[1] + 2,
                                       &coeffs_256, dst + 1 * 32);
        xy_y_convolve_2tap_32_all_avx2(im + 192, s_256[0] + 4, s_256[1] + 4,
                                       &coeffs_256, dst + 2 * 32);
        xy_y_convolve_2tap_32_all_avx2(im + 224, s_256[0] + 6, s_256[1] + 6,
                                       &coeffs_256, dst + 3 * 32);
        im += 2 * 128;
        xy_y_convolve_2tap_32_all_avx2(im, s_256[1] + 0, s_256[0] + 0,
                                       &coeffs_256, dst + dst_stride);
        xy_y_convolve_2tap_32_all_avx2(im + 32, s_256[1] + 2, s_256[0] + 2,
                                       &coeffs_256, dst + dst_stride + 1 * 32);
        xy_y_convolve_2tap_32_all_avx2(im + 64, s_256[1] + 4, s_256[0] + 4,
                                       &coeffs_256, dst + dst_stride + 2 * 32);
        xy_y_convolve_2tap_32_all_avx2(im + 96, s_256[1] + 6, s_256[0] + 6,
                                       &coeffs_256, dst + dst_stride + 3 * 32);
        dst += 2 * dst_stride;
        y -= 2;
      } while (y);
    }
  }
}
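
// Vertical 2-tap half-pel pass. With subpel_y_q4 == 8 both taps are equal,
// so the filter reduces to an average of adjacent rows and the coefficient
// multiplies are skipped; filter_params_y and subpel_y_q4 are unused.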
static void convolve_2d_sr_ver_2tap_half_avx2(
    const int16_t *const im_block, const int32_t w, const int32_t h,
    const InterpFilterParams *const filter_params_y, const int32_t subpel_y_q4,
    uint8_t *dst, const int32_t dst_stride) {
  const int16_t *im = im_block;
  int32_t y = h;

  (void)filter_params_y;
  (void)subpel_y_q4;

  if (w == 2) {
    __m128i s_32[2];

    s_32[0] = _mm_cvtsi32_si128(*(int32_t *)im);

    do {
      const __m128i res = xy_y_convolve_2tap_2x2_half_pel_sse2(im, s_32);
      const __m128i r = xy_y_round_half_pel_sse2(res);
      pack_store_2x2_sse2(r, dst, dst_stride);
      im += 2 * 2;
      dst += 2 * dst_stride;
      y -= 2;
    } while (y);
  } else if (w == 4) {
    __m128i s_64[2];

    s_64[0] = _mm_loadl_epi64((__m128i *)im);

    do {
      const __m128i res = xy_y_convolve_2tap_4x2_half_pel_sse2(im, s_64);
      const __m128i r = xy_y_round_half_pel_sse2(res);
      pack_store_4x2_sse2(r, dst, dst_stride);
      im += 2 * 4;
      dst += 2 * dst_stride;
      y -= 2;
    } while (y);
  } else if (w == 8) {
    __m128i s_128[2];

    s_128[0] = _mm_loadu_si128((__m128i *)im);

    do {
      const __m256i res = xy_y_convolve_2tap_8x2_half_pel_avx2(im, s_128);
      const __m256i r = xy_y_round_half_pel_avx2(res);
      pack_store_8x2_avx2(r, dst, dst_stride);
      im += 2 * 8;
      dst += 2 * dst_stride;
      y -= 2;
    } while (y);
  } else if (w == 16) {
    __m256i s_256[2], r[2];

    s_256[0] = _mm256_loadu_si256((__m256i *)im);

    do {
      xy_y_convolve_2tap_16x2_half_pel_avx2(im, s_256, r);
      r[0] = xy_y_round_half_pel_avx2(r[0]);
      r[1] = xy_y_round_half_pel_avx2(r[1]);
      xy_y_pack_store_16x2_avx2(r[0], r[1], dst, dst_stride);
      im += 2 * 16;
      dst += 2 * dst_stride;
      y -= 2;
    } while (y);
  } else if (w == 32) {
    __m256i s_256[2][2];

    s_256[0][0] = _mm256_loadu_si256((__m256i *)(im + 0 * 16));
    s_256[0][1] = _mm256_loadu_si256((__m256i *)(im + 1 * 16));

    do {
      xy_y_convolve_2tap_half_pel_32_all_avx2(im + 32, s_256[0], s_256[1], dst);
      xy_y_convolve_2tap_half_pel_32_all_avx2(im + 2 * 32, s_256[1], s_256[0],
                                              dst + dst_stride);
      im += 2 * 32;
      dst += 2 * dst_stride;
      y -= 2;
    } while (y);
  } else if (w == 64) {
    __m256i s_256[2][4];

    s_256[0][0] = _mm256_loadu_si256((__m256i *)(im + 0 * 16));
    s_256[0][1] = _mm256_loadu_si256((__m256i *)(im + 1 * 16));
    s_256[0][2] = _mm256_loadu_si256((__m256i *)(im + 2 * 16));
    s_256[0][3] = _mm256_loadu_si256((__m256i *)(im + 3 * 16));

    do {
      xy_y_convolve_2tap_half_pel_32_all_avx2(im + 64, s_256[0] + 0,
                                              s_256[1] + 0, dst);
      xy_y_convolve_2tap_half_pel_32_all_avx2(im + 96, s_256[0] + 2,
                                              s_256[1] + 2, dst + 32);
      im += 2 * 64;
      xy_y_convolve_2tap_half_pel_32_all_avx2(im, s_256[1] + 0, s_256[0] + 0,
                                              dst + dst_stride);
      xy_y_convolve_2tap_half_pel_32_all_avx2(
          im + 32, s_256[1] + 2, s_256[0] + 2, dst + dst_stride + 32);
      dst += 2 * dst_stride;
      y -= 2;
    } while (y);
  } else {
    __m256i s_256[2][8];

    assert(w == 128);

    load_16bit_8rows_avx2(im, 16, s_256[0]);

    do {
      xy_y_convolve_2tap_half_pel_32_all_avx2(im + 128, s_256[0] + 0,
                                              s_256[1] + 0, dst);
      xy_y_convolve_2tap_half_pel_32_all_avx2(im + 160, s_256[0] + 2,
                                              s_256[1] + 2, dst + 1 * 32);
      xy_y_convolve_2tap_half_pel_32_all_avx2(im + 192, s_256[0] + 4,
                                              s_256[1] + 4, dst + 2 * 32);
      xy_y_convolve_2tap_half_pel_32_all_avx2(im + 224, s_256[0] + 6,
                                              s_256[1] + 6, dst + 3 * 32);
      im += 2 * 128;
      xy_y_convolve_2tap_half_pel_32_all_avx2(im, s_256[1] + 0, s_256[0] + 0,
                                              dst + dst_stride);
      xy_y_convolve_2tap_half_pel_32_all_avx2(
          im + 32, s_256[1] + 2, s_256[0] + 2, dst + dst_stride + 1 * 32);
      xy_y_convolve_2tap_half_pel_32_all_avx2(
          im + 64, s_256[1] + 4, s_256[0] + 4, dst + dst_stride + 2 * 32);
      xy_y_convolve_2tap_half_pel_32_all_avx2(
          im + 96, s_256[1] + 6, s_256[0] + 6, dst + dst_stride + 3 * 32);
      dst += 2 * dst_stride;
      y -= 2;
    } while (y);
  }
}
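
// Vertical 4-tap pass. Three intermediate rows are preloaded and interleaved
// ahead of the loop, and half-pel filters (subpel_y_q4 == 8) take dedicated
// kernels. The w >= 32 path only arises for OBMC (see the comment inside).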
static void convolve_2d_sr_ver_4tap_avx2(
    const int16_t *const im_block, const int32_t w, const int32_t h,
    const InterpFilterParams *const filter_params_y, const int32_t subpel_y_q4,
    uint8_t *dst, const int32_t dst_stride) {
  const int16_t *im = im_block;
  int32_t y = h;

  if (w == 2) {
    __m128i coeffs_128[2], s_32[4], ss_128[2];

    prepare_coeffs_4tap_sse2(filter_params_y, subpel_y_q4, coeffs_128);

    s_32[0] = _mm_cvtsi32_si128(*(int32_t *)(im + 0 * 2));
    s_32[1] = _mm_cvtsi32_si128(*(int32_t *)(im + 1 * 2));
    s_32[2] = _mm_cvtsi32_si128(*(int32_t *)(im + 2 * 2));

    const __m128i src01 = _mm_unpacklo_epi32(s_32[0], s_32[1]);
    const __m128i src12 = _mm_unpacklo_epi32(s_32[1], s_32[2]);

    ss_128[0] = _mm_unpacklo_epi16(src01, src12);

    do {
      const __m128i res =
          xy_y_convolve_4tap_2x2_sse2(im, s_32, ss_128, coeffs_128);
      xy_y_round_store_2x2_sse2(res, dst, dst_stride);
      im += 2 * 2;
      dst += 2 * dst_stride;
      y -= 2;
    } while (y);
  } else {
    __m256i coeffs_256[2];

    prepare_coeffs_4tap_avx2(filter_params_y, subpel_y_q4, coeffs_256);

    if (w == 4) {
      __m128i s_64[4];
      __m256i s_256[2], ss_256[2];

      s_64[0] = _mm_loadl_epi64((__m128i *)(im + 0 * 4));
      s_64[1] = _mm_loadl_epi64((__m128i *)(im + 1 * 4));
      s_64[2] = _mm_loadl_epi64((__m128i *)(im + 2 * 4));

      // Load lines a and b. Line a to lower 128, line b to upper 128
      s_256[0] = _mm256_setr_m128i(s_64[0], s_64[1]);
      s_256[1] = _mm256_setr_m128i(s_64[1], s_64[2]);

      ss_256[0] = _mm256_unpacklo_epi16(s_256[0], s_256[1]);

      do {
        const __m256i res =
            xy_y_convolve_4tap_4x2_avx2(im, s_64, ss_256, coeffs_256);
        xy_y_round_store_4x2_avx2(res, dst, dst_stride);
        im += 2 * 4;
        dst += 2 * dst_stride;
        y -= 2;
      } while (y);
    } else if (w == 8) {
      __m256i s_256[4], r[2];

      s_256[0] = _mm256_loadu_si256((__m256i *)(im + 0 * 8));
      s_256[1] = _mm256_loadu_si256((__m256i *)(im + 1 * 8));

      if (subpel_y_q4 != 8) {
        __m256i ss_256[4];

        ss_256[0] = _mm256_unpacklo_epi16(s_256[0], s_256[1]);
        ss_256[2] = _mm256_unpackhi_epi16(s_256[0], s_256[1]);

        do {
          xy_y_convolve_4tap_8x2_avx2(im, ss_256, coeffs_256, r);
          xy_y_round_store_8x2_avx2(r, dst, dst_stride);
          im += 2 * 8;
          dst += 2 * dst_stride;
          y -= 2;
        } while (y);
      } else {
        do {
          xy_y_convolve_4tap_8x2_half_pel_avx2(im, coeffs_256, s_256, r);
          xy_y_round_store_8x2_avx2(r, dst, dst_stride);
          im += 2 * 8;
          dst += 2 * dst_stride;
          y -= 2;
        } while (y);
      }
    } else if (w == 16) {
      __m256i s_256[5];

      s_256[0] = _mm256_loadu_si256((__m256i *)(im + 0 * 16));
      s_256[1] = _mm256_loadu_si256((__m256i *)(im + 1 * 16));
      s_256[2] = _mm256_loadu_si256((__m256i *)(im + 2 * 16));

      if (subpel_y_q4 != 8) {
        __m256i ss_256[4], tt_256[4], r[4];

        ss_256[0] = _mm256_unpacklo_epi16(s_256[0], s_256[1]);
        ss_256[2] = _mm256_unpackhi_epi16(s_256[0], s_256[1]);

        tt_256[0] = _mm256_unpacklo_epi16(s_256[1], s_256[2]);
        tt_256[2] = _mm256_unpackhi_epi16(s_256[1], s_256[2]);

        do {
          xy_y_convolve_4tap_16x2_avx2(im, s_256, ss_256, tt_256, coeffs_256,
                                       r);
          xy_y_round_store_16x2_avx2(r, dst, dst_stride);
          im += 2 * 16;
          dst += 2 * dst_stride;
          y -= 2;
        } while (y);
      } else {
        __m256i r[4];

        do {
          xy_y_convolve_4tap_16x2_half_pelavx2(im, s_256, coeffs_256, r);
          xy_y_round_store_16x2_avx2(r, dst, dst_stride);
          im += 2 * 16;
          dst += 2 * dst_stride;
          y -= 2;
        } while (y);
      }
    } else {
      /* Special case for OBMC. Per the AV1 spec, 4-tap filters are not
         used for width (w) > 16, but OBMC halves the block height when
         predicting from the above block; for example, a 32x8 above block
         is predicted as 32x4, which pairs a 4-tap filter with w == 32. */
      int32_t x = 0;

      assert(!(w % 32));

      __m256i s_256[2][4], ss_256[2][4], tt_256[2][4], r0[4], r1[4];
      do {
        const int16_t *s = im + x;
        uint8_t *d = dst + x;

        loadu_unpack_16bit_3rows_avx2(s, w, s_256[0], ss_256[0], tt_256[0]);
        loadu_unpack_16bit_3rows_avx2(s + 16, w, s_256[1], ss_256[1],
                                      tt_256[1]);

        y = h;
        do {
          xy_y_convolve_4tap_32x2_avx2(s, w, s_256[0], ss_256[0], tt_256[0],
                                       coeffs_256, r0);
          xy_y_convolve_4tap_32x2_avx2(s + 16, w, s_256[1], ss_256[1],
                                       tt_256[1], coeffs_256, r1);

          xy_y_round_store_32_avx2(r0 + 0, r1 + 0, d);
          xy_y_round_store_32_avx2(r0 + 2, r1 + 2, d + dst_stride);

          s += 2 * w;
          d += 2 * dst_stride;
          y -= 2;
        } while (y);

        x += 32;
      } while (x < w);
    }
  }
}
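
// Vertical 6-tap pass. Five intermediate rows are preloaded; in the 16x2
// kernels, ss_256 holds the interleaved row pairs starting at row 0 and
// tt_256 the pairs starting at row 1, one set per output row of the loop.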
static void convolve_2d_sr_ver_6tap_avx2(
    const int16_t *const im_block, const int32_t w, const int32_t h,
    const InterpFilterParams *const filter_params_y, const int32_t subpel_y_q4,
    uint8_t *dst, const int32_t dst_stride) {
  const int16_t *im = im_block;
  int32_t y;

  if (w == 2) {
    __m128i coeffs_128[3], s_32[6], ss_128[3];

    prepare_coeffs_6tap_ssse3(filter_params_y, subpel_y_q4, coeffs_128);

    s_32[0] = _mm_cvtsi32_si128(*(int32_t *)(im + 0 * 2));
    s_32[1] = _mm_cvtsi32_si128(*(int32_t *)(im + 1 * 2));
    s_32[2] = _mm_cvtsi32_si128(*(int32_t *)(im + 2 * 2));
    s_32[3] = _mm_cvtsi32_si128(*(int32_t *)(im + 3 * 2));
    s_32[4] = _mm_cvtsi32_si128(*(int32_t *)(im + 4 * 2));

    const __m128i src01 = _mm_unpacklo_epi32(s_32[0], s_32[1]);
    const __m128i src12 = _mm_unpacklo_epi32(s_32[1], s_32[2]);
    const __m128i src23 = _mm_unpacklo_epi32(s_32[2], s_32[3]);
    const __m128i src34 = _mm_unpacklo_epi32(s_32[3], s_32[4]);

    ss_128[0] = _mm_unpacklo_epi16(src01, src12);
    ss_128[1] = _mm_unpacklo_epi16(src23, src34);

    y = h;
    do {
      const __m128i res =
          xy_y_convolve_6tap_2x2_sse2(im, s_32, ss_128, coeffs_128);
      xy_y_round_store_2x2_sse2(res, dst, dst_stride);
      im += 2 * 2;
      dst += 2 * dst_stride;
      y -= 2;
    } while (y);
  } else {
    __m256i coeffs_256[3];

    prepare_coeffs_6tap_avx2(filter_params_y, subpel_y_q4, coeffs_256);

    if (w == 4) {
      __m128i s_64[6];
      __m256i s_256[6], ss_256[3];

      s_64[0] = _mm_loadl_epi64((__m128i *)(im + 0 * 4));
      s_64[1] = _mm_loadl_epi64((__m128i *)(im + 1 * 4));
      s_64[2] = _mm_loadl_epi64((__m128i *)(im + 2 * 4));
      s_64[3] = _mm_loadl_epi64((__m128i *)(im + 3 * 4));
      s_64[4] = _mm_loadl_epi64((__m128i *)(im + 4 * 4));

      // Load lines a and b. Line a to lower 128, line b to upper 128
      s_256[0] = _mm256_setr_m128i(s_64[0], s_64[1]);
      s_256[1] = _mm256_setr_m128i(s_64[1], s_64[2]);
      s_256[2] = _mm256_setr_m128i(s_64[2], s_64[3]);
      s_256[3] = _mm256_setr_m128i(s_64[3], s_64[4]);

      ss_256[0] = _mm256_unpacklo_epi16(s_256[0], s_256[1]);
      ss_256[1] = _mm256_unpacklo_epi16(s_256[2], s_256[3]);

      y = h;
      do {
        const __m256i res =
            xy_y_convolve_6tap_4x2_avx2(im, s_64, ss_256, coeffs_256);
        xy_y_round_store_4x2_avx2(res, dst, dst_stride);
        im += 2 * 4;
        dst += 2 * dst_stride;
        y -= 2;
      } while (y);
    } else if (w == 8) {
      __m256i s_256[6], r[2];

      s_256[0] = _mm256_loadu_si256((__m256i *)(im + 0 * 8));
      s_256[1] = _mm256_loadu_si256((__m256i *)(im + 1 * 8));
      s_256[2] = _mm256_loadu_si256((__m256i *)(im + 2 * 8));
      s_256[3] = _mm256_loadu_si256((__m256i *)(im + 3 * 8));
      y = h;

      if (subpel_y_q4 != 8) {
        __m256i ss_256[6];

        ss_256[0] = _mm256_unpacklo_epi16(s_256[0], s_256[1]);
        ss_256[1] = _mm256_unpacklo_epi16(s_256[2], s_256[3]);

        ss_256[3] = _mm256_unpackhi_epi16(s_256[0], s_256[1]);
        ss_256[4] = _mm256_unpackhi_epi16(s_256[2], s_256[3]);

        do {
          xy_y_convolve_6tap_8x2_avx2(im, ss_256, coeffs_256, r);
          xy_y_round_store_8x2_avx2(r, dst, dst_stride);
          im += 2 * 8;
          dst += 2 * dst_stride;
          y -= 2;
        } while (y);
      } else {
        do {
          xy_y_convolve_6tap_8x2_half_pel_avx2(im, coeffs_256, s_256, r);
          xy_y_round_store_8x2_avx2(r, dst, dst_stride);
          im += 2 * 8;
          dst += 2 * dst_stride;
          y -= 2;
        } while (y);
      }
    } else if (w == 16) {
      __m256i s_256[6];

      s_256[0] = _mm256_loadu_si256((__m256i *)(im + 0 * 16));
      s_256[1] = _mm256_loadu_si256((__m256i *)(im + 1 * 16));
      s_256[2] = _mm256_loadu_si256((__m256i *)(im + 2 * 16));
      s_256[3] = _mm256_loadu_si256((__m256i *)(im + 3 * 16));
      s_256[4] = _mm256_loadu_si256((__m256i *)(im + 4 * 16));
      y = h;

      if (subpel_y_q4 != 8) {
        __m256i ss_256[6], tt_256[6], r[4];

        ss_256[0] = _mm256_unpacklo_epi16(s_256[0], s_256[1]);
        ss_256[1] = _mm256_unpacklo_epi16(s_256[2], s_256[3]);
        ss_256[3] = _mm256_unpackhi_epi16(s_256[0], s_256[1]);
        ss_256[4] = _mm256_unpackhi_epi16(s_256[2], s_256[3]);

        tt_256[0] = _mm256_unpacklo_epi16(s_256[1], s_256[2]);
        tt_256[1] = _mm256_unpacklo_epi16(s_256[3], s_256[4]);
        tt_256[3] = _mm256_unpackhi_epi16(s_256[1], s_256[2]);
        tt_256[4] = _mm256_unpackhi_epi16(s_256[3], s_256[4]);

        do {
          xy_y_convolve_6tap_16x2_avx2(im, 16, s_256, ss_256, tt_256,
                                       coeffs_256, r);
          xy_y_round_store_16x2_avx2(r, dst, dst_stride);
          im += 2 * 16;
          dst += 2 * dst_stride;
          y -= 2;
        } while (y);
      } else {
        __m256i ss_256[4], r[4];

        do {
          xy_y_convolve_6tap_16x2_half_pel_avx2(im, 16, s_256, ss_256,
                                                coeffs_256, r);
          xy_y_round_store_16x2_avx2(r, dst, dst_stride);

          im += 2 * 16;
          dst += 2 * dst_stride;
          y -= 2;
        } while (y);
      }
    } else {
      int32_t x = 0;

      assert(!(w % 32));

      __m256i s_256[2][6], ss_256[2][6], tt_256[2][6], r0[4], r1[4];

      do {
        const int16_t *s = im + x;
        uint8_t *d = dst + x;

        loadu_unpack_16bit_5rows_avx2(s, w, s_256[0], ss_256[0], tt_256[0]);
        loadu_unpack_16bit_5rows_avx2(s + 16, w, s_256[1], ss_256[1],
                                      tt_256[1]);

        y = h;
        do {
          xy_y_convolve_6tap_16x2_avx2(s, w, s_256[0], ss_256[0], tt_256[0],
                                       coeffs_256, r0);
          xy_y_convolve_6tap_16x2_avx2(s + 16, w, s_256[1], ss_256[1],
                                       tt_256[1], coeffs_256, r1);

          xy_y_round_store_32_avx2(r0 + 0, r1 + 0, d);
          xy_y_round_store_32_avx2(r0 + 2, r1 + 2, d + dst_stride);

          s += 2 * w;
          d += 2 * dst_stride;
          y -= 2;
        } while (y);

        x += 32;
      } while (x < w);
    }
  }
}
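
// Vertical 8-tap pass. Seven intermediate rows are preloaded per 16-pixel
// column, and convolve_8tap_unpack_avx2 builds the interleaved pairs consumed
// by the 8-tap kernels; w >= 32 is processed in 32-pixel columns.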
static void convolve_2d_sr_ver_8tap_avx2(
    const int16_t *const im_block, const int32_t w, const int32_t h,
    const InterpFilterParams *const filter_params_y, const int32_t subpel_y_q4,
    uint8_t *dst, const int32_t dst_stride) {
  const int16_t *im = im_block;
  int32_t y;

  if (w == 2) {
    __m128i coeffs_128[4], s_32[8], ss_128[4];

    prepare_coeffs_8tap_sse2(filter_params_y, subpel_y_q4, coeffs_128);

    s_32[0] = _mm_cvtsi32_si128(*(int32_t *)(im + 0 * 2));
    s_32[1] = _mm_cvtsi32_si128(*(int32_t *)(im + 1 * 2));
    s_32[2] = _mm_cvtsi32_si128(*(int32_t *)(im + 2 * 2));
    s_32[3] = _mm_cvtsi32_si128(*(int32_t *)(im + 3 * 2));
    s_32[4] = _mm_cvtsi32_si128(*(int32_t *)(im + 4 * 2));
    s_32[5] = _mm_cvtsi32_si128(*(int32_t *)(im + 5 * 2));
    s_32[6] = _mm_cvtsi32_si128(*(int32_t *)(im + 6 * 2));

    const __m128i src01 = _mm_unpacklo_epi32(s_32[0], s_32[1]);
    const __m128i src12 = _mm_unpacklo_epi32(s_32[1], s_32[2]);
    const __m128i src23 = _mm_unpacklo_epi32(s_32[2], s_32[3]);
    const __m128i src34 = _mm_unpacklo_epi32(s_32[3], s_32[4]);
    const __m128i src45 = _mm_unpacklo_epi32(s_32[4], s_32[5]);
    const __m128i src56 = _mm_unpacklo_epi32(s_32[5], s_32[6]);

    ss_128[0] = _mm_unpacklo_epi16(src01, src12);
    ss_128[1] = _mm_unpacklo_epi16(src23, src34);
    ss_128[2] = _mm_unpacklo_epi16(src45, src56);

    y = h;
    do {
      const __m128i res =
          xy_y_convolve_8tap_2x2_sse2(im, s_32, ss_128, coeffs_128);
      xy_y_round_store_2x2_sse2(res, dst, dst_stride);
      im += 2 * 2;
      dst += 2 * dst_stride;
      y -= 2;
    } while (y);
  } else {
    __m256i coeffs_256[4];

    prepare_coeffs_8tap_avx2(filter_params_y, subpel_y_q4, coeffs_256);

    if (w == 4) {
      __m128i s_64[8];
      __m256i s_256[8], ss_256[4];

      s_64[0] = _mm_loadl_epi64((__m128i *)(im + 0 * 4));
      s_64[1] = _mm_loadl_epi64((__m128i *)(im + 1 * 4));
      s_64[2] = _mm_loadl_epi64((__m128i *)(im + 2 * 4));
      s_64[3] = _mm_loadl_epi64((__m128i *)(im + 3 * 4));
      s_64[4] = _mm_loadl_epi64((__m128i *)(im + 4 * 4));
      s_64[5] = _mm_loadl_epi64((__m128i *)(im + 5 * 4));
      s_64[6] = _mm_loadl_epi64((__m128i *)(im + 6 * 4));

      // Load lines a and b. Line a to lower 128, line b to upper 128
      s_256[0] = _mm256_setr_m128i(s_64[0], s_64[1]);
      s_256[1] = _mm256_setr_m128i(s_64[1], s_64[2]);
      s_256[2] = _mm256_setr_m128i(s_64[2], s_64[3]);
      s_256[3] = _mm256_setr_m128i(s_64[3], s_64[4]);
      s_256[4] = _mm256_setr_m128i(s_64[4], s_64[5]);
      s_256[5] = _mm256_setr_m128i(s_64[5], s_64[6]);

      ss_256[0] = _mm256_unpacklo_epi16(s_256[0], s_256[1]);
      ss_256[1] = _mm256_unpacklo_epi16(s_256[2], s_256[3]);
      ss_256[2] = _mm256_unpacklo_epi16(s_256[4], s_256[5]);

      y = h;
      do {
        const __m256i res =
            xy_y_convolve_8tap_4x2_avx2(im, s_64, ss_256, coeffs_256);
        xy_y_round_store_4x2_avx2(res, dst, dst_stride);
        im += 2 * 4;
        dst += 2 * dst_stride;
        y -= 2;
      } while (y);
    } else if (w == 8) {
      __m256i s_256[8], r[2];

      s_256[0] = _mm256_loadu_si256((__m256i *)(im + 0 * 8));
      s_256[1] = _mm256_loadu_si256((__m256i *)(im + 1 * 8));
      s_256[2] = _mm256_loadu_si256((__m256i *)(im + 2 * 8));
      s_256[3] = _mm256_loadu_si256((__m256i *)(im + 3 * 8));
      s_256[4] = _mm256_loadu_si256((__m256i *)(im + 4 * 8));
      s_256[5] = _mm256_loadu_si256((__m256i *)(im + 5 * 8));
      y = h;

      if (subpel_y_q4 != 8) {
        __m256i ss_256[8];

        convolve_8tap_unpack_avx2(s_256, ss_256);

        do {
          xy_y_convolve_8tap_8x2_avx2(im, ss_256, coeffs_256, r);
          xy_y_round_store_8x2_avx2(r, dst, dst_stride);
          im += 2 * 8;
          dst += 2 * dst_stride;
          y -= 2;
        } while (y);
      } else {
        do {
          xy_y_convolve_8tap_8x2_half_pel_avx2(im, coeffs_256, s_256, r);
          xy_y_round_store_8x2_avx2(r, dst, dst_stride);
          im += 2 * 8;
          dst += 2 * dst_stride;
          y -= 2;
        } while (y);
      }
    } else if (w == 16) {
      __m256i s_256[8], r[4];

      load_16bit_7rows_avx2(im, 16, s_256);
      y = h;

      if (subpel_y_q4 != 8) {
        __m256i ss_256[8], tt_256[8];

        convolve_8tap_unpack_avx2(s_256, ss_256);
        convolve_8tap_unpack_avx2(s_256 + 1, tt_256);

        do {
          xy_y_convolve_8tap_16x2_avx2(im, 16, coeffs_256, s_256, ss_256,
                                       tt_256, r);
          xy_y_round_store_16x2_avx2(r, dst, dst_stride);

          im += 2 * 16;
          dst += 2 * dst_stride;
          y -= 2;
        } while (y);
      } else {
        do {
          xy_y_convolve_8tap_16x2_half_pel_avx2(im, 16, coeffs_256, s_256, r);
          xy_y_round_store_16x2_avx2(r, dst, dst_stride);

          im += 2 * 16;
          dst += 2 * dst_stride;
          y -= 2;
        } while (y);
      }
    } else {
      int32_t x = 0;
      __m256i s_256[2][8], r0[4], r1[4];

      assert(!(w % 32));

      __m256i ss_256[2][8], tt_256[2][8];

      do {
        const int16_t *s = im + x;
        uint8_t *d = dst + x;

        load_16bit_7rows_avx2(s, w, s_256[0]);
        convolve_8tap_unpack_avx2(s_256[0], ss_256[0]);
        convolve_8tap_unpack_avx2(s_256[0] + 1, tt_256[0]);

        load_16bit_7rows_avx2(s + 16, w, s_256[1]);
        convolve_8tap_unpack_avx2(s_256[1], ss_256[1]);
        convolve_8tap_unpack_avx2(s_256[1] + 1, tt_256[1]);

        y = h;
        do {
          xy_y_convolve_8tap_16x2_avx2(s, w, coeffs_256, s_256[0], ss_256[0],
                                       tt_256[0], r0);
          xy_y_convolve_8tap_16x2_avx2(s + 16, w, coeffs_256, s_256[1],
                                       ss_256[1], tt_256[1], r1);
          xy_y_round_store_32_avx2(r0 + 0, r1 + 0, d);
          xy_y_round_store_32_avx2(r0 + 2, r1 + 2, d + dst_stride);

          s += 2 * w;
          d += 2 * dst_stride;
          y -= 2;
        } while (y);

        x += 32;
      } while (x < w);
    }
  }
}
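
// Dispatch tables for the two passes, indexed by filter tap count. The
// vertical index is tap_y - (subpel_y_q4 == 8): a 2-tap half-pel filter maps
// to the multiply-free averaging path at index 1, while 4/6/8-tap half-pel
// filters alias the same function as their full-pel counterparts, which
// check subpel_y_q4 == 8 internally.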
typedef void (*Convolve2dSrHorTapFunc)(
    const uint8_t *const src, const int32_t src_stride, const int32_t w,
    const int32_t h, const InterpFilterParams *const filter_params_x,
    const int32_t subpel_x_q4, int16_t *const im_block);

typedef void (*Convolve2dSrVerTapFunc)(
    const int16_t *const im_block, const int32_t w, const int32_t h,
    const InterpFilterParams *const filter_params_y, const int32_t subpel_y_q4,
    uint8_t *dst, const int32_t dst_stride);

static AOM_FORCE_INLINE void av1_convolve_2d_sr_specialized_avx2(
    const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride,
    int32_t w, int32_t h, const InterpFilterParams *filter_params_x,
    const InterpFilterParams *filter_params_y, const int32_t subpel_x_q4,
    const int32_t subpel_y_q4, ConvolveParams *conv_params) {
  static const Convolve2dSrHorTapFunc
      convolve_2d_sr_hor_tap_func_table[MAX_FILTER_TAP + 1] = {
        NULL,
        NULL,
        convolve_2d_sr_hor_2tap_avx2,
        NULL,
        convolve_2d_sr_hor_4tap_ssse3,
        NULL,
        convolve_2d_sr_hor_6tap_avx2,
        NULL,
        convolve_2d_sr_hor_8tap_avx2
      };
  static const Convolve2dSrVerTapFunc
      convolve_2d_sr_ver_tap_func_table[MAX_FILTER_TAP + 1] = {
        NULL,
        convolve_2d_sr_ver_2tap_half_avx2,
        convolve_2d_sr_ver_2tap_avx2,
        convolve_2d_sr_ver_4tap_avx2,
        convolve_2d_sr_ver_4tap_avx2,
        convolve_2d_sr_ver_6tap_avx2,
        convolve_2d_sr_ver_6tap_avx2,
        convolve_2d_sr_ver_8tap_avx2,
        convolve_2d_sr_ver_8tap_avx2
      };
  const int32_t tap_x = get_filter_tap(filter_params_x, subpel_x_q4);
  const int32_t tap_y = get_filter_tap(filter_params_y, subpel_y_q4);

  assert(tap_x != 12 && tap_y != 12);

  const uint8_t *src_ptr = src - ((tap_y >> 1) - 1) * src_stride;
  // Note: im_block is 8-pixel interlaced for width 32 and up, to avoid data
  // permutation.
  DECLARE_ALIGNED(32, int16_t,
                  im_block[(MAX_SB_SIZE + MAX_FILTER_TAP) * MAX_SB_SIZE]);

  (void)conv_params;

  assert(conv_params->round_0 == 3);
  assert(conv_params->round_1 == 11);

  // horizontal filter
  int32_t hh = h + tap_y;
  assert(!(hh % 2));

  convolve_2d_sr_hor_tap_func_table[tap_x](
      src_ptr, src_stride, w, hh, filter_params_x, subpel_x_q4, im_block);

  // vertical filter
  convolve_2d_sr_ver_tap_func_table[tap_y - (subpel_y_q4 == 8)](
      im_block, w, h, filter_params_y, subpel_y_q4, dst, dst_stride);
}
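
// A minimal call-site sketch, not the actual libaom caller (the real entry
// point is the av1_convolve_2d_sr dispatch); the 16x16 block size, subpel
// values, and use of the get_conv_params() helper are illustrative
// assumptions. conv_params must satisfy the round_0/round_1 asserts above:
//
//   ConvolveParams conv_params =
//       get_conv_params(/*do_average=*/0, /*plane=*/0, /*bd=*/8);
//   av1_convolve_2d_sr_specialized_avx2(src, src_stride, dst, dst_stride,
//                                       /*w=*/16, /*h=*/16, filter_params_x,
//                                       filter_params_y, /*subpel_x_q4=*/5,
//                                       /*subpel_y_q4=*/3, &conv_params);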

#endif  // THIRD_PARTY_SVT_AV1_CONVOLVE_2D_AVX2_H_