xref: /aosp_15_r20/external/libaom/av1/common/x86/highbd_warp_affine_avx2.c (revision 77c1e3ccc04c968bd2bc212e87364f250e820521)
1 /*
2  * Copyright (c) 2020, Alliance for Open Media. All rights reserved.
3  *
4  * This source code is subject to the terms of the BSD 2 Clause License and
5  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
6  * was not distributed with this source code in the LICENSE file, you can
7  * obtain it at www.aomedia.org/license/software. If the Alliance for Open
8  * Media Patent License 1.0 was not distributed with this source code in the
9  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
10  */
11 #include <immintrin.h>
12 
13 #include "config/av1_rtcd.h"
14 
15 #include "av1/common/warped_motion.h"
16 
// AVX2 implementation of the high bit-depth affine warp filter.
//
// The output region is processed in 8x8 tiles. For each tile the code
//   1. projects the tile centre through the affine model in `mat`,
//   2. runs an 8-tap horizontal filter over up to 15 source rows, storing one
//      vector of eight 32-bit results per row in `tmp`,
//   3. runs an 8-tap vertical filter over `tmp` to produce up to 8 output
//      rows, which are rounded and written to `pred` or `conv_params->dst`.
//
// Parameters, as they are used by the code below:
//   mat               mat[0..5] are read to project (src_x, src_y).
//   ref               source samples, addressed as ref[row * stride + col].
//   width, height     bounds used to clamp source columns and rows.
//   stride            row stride of `ref`, in samples.
//   pred              output samples, addressed as pred[row * p_stride + col].
//   p_col, p_row      added to the tile coordinates before projection.
//   p_width, p_height extent of the output region; iterated in steps of 8.
//   p_stride          row stride of `pred`, in samples.
//   subsampling_x/y   coordinates are shifted left by these before projection
//                     and the projected position is shifted right again.
//   bd                bit depth; sets rounding offsets and the clamp maximum.
//   conv_params       supplies round_0, round_1, is_compound, do_average,
//                     use_dist_wtd_comp_avg, fwd_offset, bck_offset, dst and
//                     dst_stride.
//   alpha, beta       horizontal filter phase step per column / per row.
//   gamma, delta      vertical filter phase step per column / per row.
//
// NOTE(review): every store below writes a full 8 samples regardless of how
// much of p_width remains; presumably callers guarantee p_width is a multiple
// of 8 -- confirm against the call sites.
av1_highbd_warp_affine_avx2(const int32_t * mat,const uint16_t * ref,int width,int height,int stride,uint16_t * pred,int p_col,int p_row,int p_width,int p_height,int p_stride,int subsampling_x,int subsampling_y,int bd,ConvolveParams * conv_params,int16_t alpha,int16_t beta,int16_t gamma,int16_t delta)17 void av1_highbd_warp_affine_avx2(const int32_t *mat, const uint16_t *ref,
18                                  int width, int height, int stride,
19                                  uint16_t *pred, int p_col, int p_row,
20                                  int p_width, int p_height, int p_stride,
21                                  int subsampling_x, int subsampling_y, int bd,
22                                  ConvolveParams *conv_params, int16_t alpha,
23                                  int16_t beta, int16_t gamma, int16_t delta) {
     // Horizontal filter output: tmp[k + 7] holds the eight 32-bit results for
     // source row iy4 + k, k in [-7, 7].
24   __m256i tmp[15];
     // Right-shift applied after the horizontal pass.
25   const int reduce_bits_horiz = conv_params->round_0;
     // Right-shift applied after the vertical pass: round_1 for compound
     // prediction, otherwise whatever remains of the 2 * FILTER_BITS scaling.
26   const int reduce_bits_vert = conv_params->is_compound
27                                    ? conv_params->round_1
28                                    : 2 * FILTER_BITS - reduce_bits_horiz;
29   const int max_bits_horiz = bd + FILTER_BITS + 1 - reduce_bits_horiz;
     // Bit positions of the offsets added to the horizontal and vertical sums.
30   const int offset_bits_horiz = bd + FILTER_BITS - 1;
31   const int offset_bits_vert = bd + 2 * FILTER_BITS - reduce_bits_horiz;
     // Final shift used only on the compound-average path.
32   const int round_bits =
33       2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
34   const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
     // max_bits_horiz only feeds mhoriz below, which is itself unused; the
     // casts to void silence unused-variable warnings.
35   (void)max_bits_horiz;
36   assert(IMPLIES(conv_params->is_compound, conv_params->dst != NULL));
37 
38   // Check that, even with 12-bit input, the intermediate values will fit
39   // into an unsigned 16-bit intermediate array.
40   assert(bd + FILTER_BITS + 2 - conv_params->round_0 <= 16);
41 
     // Upper clamp for the compound-average output (bd of 10, 12, else 8 bits).
42   const __m256i clip_pixel =
43       _mm256_set1_epi16(bd == 10 ? 1023 : (bd == 12 ? 4095 : 255));
     // Shift count in vector form, as required by _mm256_sra_epi32.
44   const __m128i reduce_bits_vert_shift = _mm_cvtsi32_si128(reduce_bits_vert);
     // Rounding term (half of 1 << reduce_bits_vert) for the vertical shift.
45   const __m256i reduce_bits_vert_const =
46       _mm256_set1_epi32(((1 << reduce_bits_vert) >> 1));
     // Offset added to the vertical sum on the compound path.
47   const __m256i res_add_const = _mm256_set1_epi32(1 << offset_bits_vert);
     // Negative constant added after averaging on the compound-average path.
48   const __m256i res_sub_const =
49       _mm256_set1_epi32(-(1 << (offset_bits - conv_params->round_1)) -
50                         (1 << (offset_bits - conv_params->round_1 - 1)));
51   __m128i round_bits_shift = _mm_cvtsi32_si128(round_bits);
52   __m256i round_bits_const = _mm256_set1_epi32(((1 << round_bits) >> 1));
53 
     // Weights for the distance-weighted compound average: wt0 multiplies the
     // value already in conv_params->dst, wt1 multiplies the new result.
54   const int w0 = conv_params->fwd_offset;
55   const int w1 = conv_params->bck_offset;
56   const __m256i wt0 = _mm256_set1_epi32(w0);
57   const __m256i wt1 = _mm256_set1_epi32(w1);
58 
     // Rounding term and initial offset for the vectorised horizontal sums.
59   __m256i v_rbhoriz = _mm256_set1_epi32(1 << (reduce_bits_horiz - 1));
60   __m256i v_zeros = _mm256_setzero_si256();
61   int ohoriz = 1 << offset_bits_horiz;
62   int mhoriz = 1 << max_bits_horiz;
63   (void)mhoriz;
     // Horizontal filter phase for the current row/column.
64   int sx;
65 
     // Walk the output region in 8x8 tiles.
66   for (int i = 0; i < p_height; i += 8) {
67     for (int j = 0; j < p_width; j += 8) {
68       // Calculate the center of this 8x8 block,
69       // project to luma coordinates (if in a subsampled chroma plane),
70       // apply the affine transformation,
71       // then convert back to the original coordinates (if necessary)
72       const int32_t src_x = (p_col + j + 4) << subsampling_x;
73       const int32_t src_y = (p_row + i + 4) << subsampling_y;
74       const int64_t dst_x =
75           (int64_t)mat[2] * src_x + (int64_t)mat[3] * src_y + (int64_t)mat[0];
76       const int64_t dst_y =
77           (int64_t)mat[4] * src_x + (int64_t)mat[5] * src_y + (int64_t)mat[1];
78       const int64_t x4 = dst_x >> subsampling_x;
79       const int64_t y4 = dst_y >> subsampling_y;
80 
         // Split the projected position into an integer sample position
         // (ix4, iy4) and the low WARPEDMODEL_PREC_BITS fractional bits
         // (sx4, sy4).
         // NOTE(review): the integer parts are narrowed to int16_t here;
         // confirm that this range is sufficient for every caller.
81       const int16_t ix4 = (int32_t)(x4 >> WARPEDMODEL_PREC_BITS);
82       int32_t sx4 = x4 & ((1 << WARPEDMODEL_PREC_BITS) - 1);
83       const int16_t iy4 = (int32_t)(y4 >> WARPEDMODEL_PREC_BITS);
84       int32_t sy4 = y4 & ((1 << WARPEDMODEL_PREC_BITS) - 1);
85 
         // Step the phases back by four columns and four rows from the tile
         // centre, and add the rounding term and the filter-table offset.
86       sx4 += alpha * (-4) + beta * (-4) + (1 << (WARPEDDIFF_PREC_BITS - 1)) +
87              (WARPEDPIXEL_PREC_SHIFTS << WARPEDDIFF_PREC_BITS);
88       sy4 += gamma * (-4) + delta * (-4) + (1 << (WARPEDDIFF_PREC_BITS - 1)) +
89              (WARPEDPIXEL_PREC_SHIFTS << WARPEDDIFF_PREC_BITS);
90 
         // Clear the low WARP_PARAM_REDUCE_BITS bits of both phases.
91       sx4 &= ~((1 << WARP_PARAM_REDUCE_BITS) - 1);
92       sy4 &= ~((1 << WARP_PARAM_REDUCE_BITS) - 1);
93 
94       // Horizontal filter
         // Each branch fills tmp[0 .. 7 + AOMMIN(8, p_height - i) - 1]; the
         // row loop is cut short when fewer than 8 output rows remain.
         //
         // Case 1: ix4 <= -7. Only column 0 of each (row-clamped) source row
         // is read, and the result is formed directly from that one sample
         // and broadcast to all eight lanes, without applying filter taps.
         // NOTE(review): presumably this equals the filtered value when every
         // tap sees the same sample, which relies on the taps summing to
         // 1 << FILTER_BITS -- confirm against the filter table.
95       if (ix4 <= -7) {
96         for (int k = -7; k < AOMMIN(8, p_height - i); ++k) {
97           int iy = iy4 + k;
98           if (iy < 0)
99             iy = 0;
100           else if (iy > height - 1)
101             iy = height - 1;
102           tmp[k + 7] = _mm256_cvtepi16_epi32(_mm_set1_epi16(
103               (1 << (bd + FILTER_BITS - reduce_bits_horiz - 1)) +
104               ref[iy * stride] * (1 << (FILTER_BITS - reduce_bits_horiz))));
105         }
         // Case 2: ix4 >= width + 6. Same shortcut as case 1, but using the
         // last column (width - 1) of each row.
106       } else if (ix4 >= width + 6) {
107         for (int k = -7; k < AOMMIN(8, p_height - i); ++k) {
108           int iy = iy4 + k;
109           if (iy < 0)
110             iy = 0;
111           else if (iy > height - 1)
112             iy = height - 1;
113           tmp[k + 7] = _mm256_cvtepi16_epi32(
114               _mm_set1_epi16((1 << (bd + FILTER_BITS - reduce_bits_horiz - 1)) +
115                              ref[iy * stride + (width - 1)] *
116                                  (1 << (FILTER_BITS - reduce_bits_horiz))));
117         }
         // Case 3: the 16-sample window [ix4 - 7, ix4 + 8] crosses the left or
         // right edge. Use a scalar filter with every sample column clamped
         // to [0, width - 1].
118       } else if (((ix4 - 7) < 0) || ((ix4 + 9) > width)) {
119         int32_t tmp1[8];
120         for (int k = -7; k < AOMMIN(8, p_height - i); ++k) {
121           const int iy = clamp(iy4 + k, 0, height - 1);
122 
123           sx = sx4 + beta * (k + 4);
124           for (int l = -4; l < 4; ++l) {
125             int ix = ix4 + l - 3;
126             const int offs = sx >> WARPEDDIFF_PREC_BITS;
127             const int16_t *coeffs = av1_warped_filter[offs];
128 
129             int32_t sum = 1 << offset_bits_horiz;
130             for (int m = 0; m < 8; ++m) {
131               const int sample_x = clamp(ix + m, 0, width - 1);
132               sum += ref[iy * stride + sample_x] * coeffs[m];
133             }
134             sum = ROUND_POWER_OF_TWO(sum, reduce_bits_horiz);
                 // Store even output columns (0, 2, 4, 6) in slots 0..3 and
                 // odd columns (1, 3, 5, 7) in slots 4..7, i.e. the same
                 // even/odd split across 128-bit halves that the lane
                 // comments of the vector paths describe.
135             tmp1[(l + 4) / 2 + ((l + 4) % 2) * 4] = sum;
136             sx += alpha;
137           }
138           tmp[k + 7] = _mm256_loadu_si256((__m256i *)tmp1);
139         }
         // Case 4: the whole 16-sample window lies inside the row, so samples
         // are loaded without column clamping. Four variants follow, chosen
         // by whether alpha and/or beta are zero, which determines how often
         // the filter coefficients have to be reloaded.
140       } else {
           // 4a: alpha == 0 and beta == 0. One filter serves every row and
           // column of the tile; load it once and broadcast each tap pair.
141         if (beta == 0 && alpha == 0) {
142           sx = sx4;
143           __m128i v_01 = _mm_loadu_si128(
144               (__m128i *)
145                   av1_warped_filter[sx >>
146                                     WARPEDDIFF_PREC_BITS]);  // A7A6A5A4A3A2A1A0
147           __m256i v_c01 = _mm256_broadcastd_epi32(v_01);     // A1A0A1A0A1A0A1A0
148           __m256i v_c23 = _mm256_broadcastd_epi32(
149               _mm_shuffle_epi32(v_01, 1));  // A3A2A3A2A3A2A3A2
150           __m256i v_c45 = _mm256_broadcastd_epi32(
151               _mm_shuffle_epi32(v_01, 2));  // A5A4A5A4A5A4A5A4
152           __m256i v_c67 = _mm256_broadcastd_epi32(
153               _mm_shuffle_epi32(v_01, 3));  // A7A6A7A6A7A6A7A6
154           for (int k = -7; k < AOMMIN(8, p_height - i); ++k) {
                 // Clamp the source row, then turn it into a sample offset.
155             int iy = iy4 + k;
156             if (iy < 0)
157               iy = 0;
158             else if (iy > height - 1)
159               iy = height - 1;
160             iy = iy * stride;
161 
                 // Load the 16 samples at columns ix4 - 7 .. ix4 + 8 with two
                 // unaligned 128-bit loads.
162             __m256i v_refl = _mm256_inserti128_si256(
163                 _mm256_setzero_si256(),
164                 _mm_loadu_si128((__m128i *)&ref[iy + ix4 - 7]), 0);
165             v_refl = _mm256_inserti128_si256(
166                 v_refl, _mm_loadu_si128((__m128i *)&ref[iy + ix4 + 1]),
167                 1);  // R15 .. R0
168 
                 // Rearrange the samples across the two 128-bit halves for the
                 // per-lane alignr shifts used below.
169             __m256i v_ref = _mm256_permute4x64_epi64(v_refl, 0xEE);
170 
171             __m256i v_refu =
172                 _mm256_alignr_epi8(v_ref, v_refl, 2);  // R8R15R14...R2R1
173             v_refl = _mm256_inserti128_si256(
174                 v_refl, _mm256_extracti128_si256(v_refu, 0), 1);
175             v_refu = _mm256_inserti128_si256(
176                 v_refu, _mm256_extracti128_si256(v_ref, 0), 0);
177 
                 // Accumulate offset + four multiply-add partial sums, one per
                 // tap pair (taps 0-1, 2-3, 4-5, 6-7).
178             __m256i v_sum = _mm256_set1_epi32(ohoriz);
179             __m256i parsum = _mm256_madd_epi16(
180                 v_c01, _mm256_alignr_epi8(v_refu, v_refl,
181                                           0));  // R8R7R6..R1R7R6R5..R1R0
182             __m256i v_sum1 = _mm256_add_epi32(v_sum, parsum);
183 
184             parsum = _mm256_madd_epi16(
185                 v_c23,
186                 _mm256_alignr_epi8(v_refu, v_refl, 4));  // R10R9..R3R9R8..R3R2
187             __m256i v_sum2 = _mm256_add_epi32(v_sum1, parsum);
188             parsum = _mm256_madd_epi16(
189                 v_c45, _mm256_alignr_epi8(v_refu, v_refl,
190                                           8));  // R12R11..R5R11R10..R5R4
191             __m256i v_sum3 = _mm256_add_epi32(v_sum2, parsum);
192             parsum = _mm256_madd_epi16(
193                 v_c67, _mm256_alignr_epi8(v_refu, v_refl,
194                                           12));  // R14R13..R7R13R12..R7R6
195             __m256i v_sum4 = _mm256_add_epi32(v_sum3, parsum);
196 
                 // Round and shift down by reduce_bits_horiz.
197             tmp[k + 7] = _mm256_srai_epi32(_mm256_add_epi32(v_sum4, v_rbhoriz),
198                                            reduce_bits_horiz);
199           }
           // 4b: alpha == 0 only. The filter is the same for all columns of a
           // row but changes from row to row, so it is reloaded per row.
200         } else if (alpha == 0) {
201           for (int k = -7; k < AOMMIN(8, p_height - i); ++k) {
202             int iy = iy4 + k;
203             if (iy < 0)
204               iy = 0;
205             else if (iy > height - 1)
206               iy = height - 1;
207             iy = iy * stride;
208 
209             sx = sx4 + beta * (k + 4);
210 
211             __m128i v_01 = _mm_loadu_si128(
212                 (__m128i *)av1_warped_filter
213                     [sx >> WARPEDDIFF_PREC_BITS]);          // A7A6A5A4A3A2A1A0
214             __m256i v_c01 = _mm256_broadcastd_epi32(v_01);  // A1A0A1A0A1A0A1A0
215             __m256i v_c23 = _mm256_broadcastd_epi32(
216                 _mm_shuffle_epi32(v_01, 1));  // A3A2A3A2A3A2A3A2
217             __m256i v_c45 = _mm256_broadcastd_epi32(
218                 _mm_shuffle_epi32(v_01, 2));  // A5A4A5A4A5A4A5A4
219             __m256i v_c67 = _mm256_broadcastd_epi32(
220                 _mm_shuffle_epi32(v_01, 3));  // A7A6A7A6A7A6A7A6
221 
                 // Sample load and rearrangement: same sequence as in 4a.
222             __m256i v_refl = _mm256_inserti128_si256(
223                 _mm256_setzero_si256(),
224                 _mm_loadu_si128((__m128i *)&ref[iy + ix4 - 7]), 0);
225             v_refl = _mm256_inserti128_si256(
226                 v_refl, _mm_loadu_si128((__m128i *)&ref[iy + ix4 + 1]),
227                 1);  // R15 .. R0
228 
229             __m256i v_ref = _mm256_permute4x64_epi64(v_refl, 0xEE);
230 
231             __m256i v_refu =
232                 _mm256_alignr_epi8(v_ref, v_refl, 2);  // R8R15R14...R2R1
233 
234             v_refl = _mm256_inserti128_si256(
235                 v_refl, _mm256_extracti128_si256(v_refu, 0), 1);
236             v_refu = _mm256_inserti128_si256(
237                 v_refu, _mm256_extracti128_si256(v_ref, 0), 0);
238 
239             __m256i v_sum = _mm256_set1_epi32(ohoriz);
240             __m256i parsum =
241                 _mm256_madd_epi16(v_c01, _mm256_alignr_epi8(v_refu, v_refl, 0));
242             __m256i v_sum1 = _mm256_add_epi32(v_sum, parsum);
243 
244             parsum =
245                 _mm256_madd_epi16(v_c23, _mm256_alignr_epi8(v_refu, v_refl, 4));
246             __m256i v_sum2 = _mm256_add_epi32(v_sum1, parsum);
247             parsum =
248                 _mm256_madd_epi16(v_c45, _mm256_alignr_epi8(v_refu, v_refl, 8));
249             __m256i v_sum3 = _mm256_add_epi32(v_sum2, parsum);
250             parsum = _mm256_madd_epi16(v_c67,
251                                        _mm256_alignr_epi8(v_refu, v_refl, 12));
252             __m256i v_sum4 = _mm256_add_epi32(v_sum3, parsum);
253 
254             tmp[k + 7] = _mm256_srai_epi32(_mm256_add_epi32(v_sum4, v_rbhoriz),
255                                            reduce_bits_horiz);
256           }
           // 4c: beta == 0 only. Each of the eight columns has its own filter
           // (phase sx + n * alpha), but the set is the same for every row,
           // so the eight filters are loaded and transposed once, outside
           // the row loop.
257         } else if (beta == 0) {
258           sx = sx4;
259           __m256i v_coeff01 = _mm256_inserti128_si256(
260               v_zeros,
261               _mm_loadu_si128(
262                   (__m128i *)av1_warped_filter[(sx) >> WARPEDDIFF_PREC_BITS]),
263               0);
264           v_coeff01 = _mm256_inserti128_si256(
265               v_coeff01,
266               _mm_loadu_si128(
267                   (__m128i *)
268                       av1_warped_filter[(sx + alpha) >> WARPEDDIFF_PREC_BITS]),
269               1);  // B7B6..B1B0A7A6..A1A0
270           __m256i v_coeff23 = _mm256_inserti128_si256(
271               v_zeros,
272               _mm_loadu_si128(
273                   (__m128i *)av1_warped_filter[(sx + 2 * alpha) >>
274                                                WARPEDDIFF_PREC_BITS]),
275               0);
276           v_coeff23 = _mm256_inserti128_si256(
277               v_coeff23,
278               _mm_loadu_si128(
279                   (__m128i *)av1_warped_filter[(sx + 3 * alpha) >>
280                                                WARPEDDIFF_PREC_BITS]),
281               1);  // D7D6..D1D0C7C6..C1C0
282           __m256i v_coeff45 = _mm256_inserti128_si256(
283               v_zeros,
284               _mm_loadu_si128(
285                   (__m128i *)av1_warped_filter[(sx + 4 * alpha) >>
286                                                WARPEDDIFF_PREC_BITS]),
287               0);
288           v_coeff45 = _mm256_inserti128_si256(
289               v_coeff45,
290               _mm_loadu_si128(
291                   (__m128i *)av1_warped_filter[(sx + 5 * alpha) >>
292                                                WARPEDDIFF_PREC_BITS]),
293               1);  // F7F6..F1F0E7E6..E1E0
294           __m256i v_coeff67 = _mm256_inserti128_si256(
295               v_zeros,
296               _mm_loadu_si128(
297                   (__m128i *)av1_warped_filter[(sx + 6 * alpha) >>
298                                                WARPEDDIFF_PREC_BITS]),
299               0);
300           v_coeff67 = _mm256_inserti128_si256(
301               v_coeff67,
302               _mm_loadu_si128(
303                   (__m128i *)av1_warped_filter[(sx + 7 * alpha) >>
304                                                WARPEDDIFF_PREC_BITS]),
305               1);  // H7H6..H1H0G7G6..G1G0
306 
               // Transpose: go from one filter per 128-bit half to one tap
               // pair per vector, with filters A..H spread over the lanes.
307           __m256i v_c0123 = _mm256_unpacklo_epi32(
308               v_coeff01,
309               v_coeff23);  // D3D2B3B2D1D0B1B0C3C2A3A2C1C0A1A0
310           __m256i v_c0123u = _mm256_unpackhi_epi32(
311               v_coeff01,
312               v_coeff23);  // D7D6B7B6D5D4B5B4C7C6A7A6C5C4A5A4
313           __m256i v_c4567 = _mm256_unpacklo_epi32(
314               v_coeff45,
315               v_coeff67);  // H3H2F3F2H1H0F1F0G3G2E3E2G1G0E1E0
316           __m256i v_c4567u = _mm256_unpackhi_epi32(
317               v_coeff45,
318               v_coeff67);  // H7H6F7F6H5H4F5F4G7G6E7E6G5G4E5E4
319 
320           __m256i v_c01 = _mm256_unpacklo_epi64(
321               v_c0123, v_c4567);  // H1H0F1F0D1D0B1B0G1G0E1E0C1C0A1A0
322           __m256i v_c23 =
323               _mm256_unpackhi_epi64(v_c0123, v_c4567);  // H3H2 ... A3A2
324           __m256i v_c45 =
325               _mm256_unpacklo_epi64(v_c0123u, v_c4567u);  // H5H4 ... A5A4
326           __m256i v_c67 =
327               _mm256_unpackhi_epi64(v_c0123u, v_c4567u);  // H7H6 ... A7A6
328 
329           for (int k = -7; k < AOMMIN(8, p_height - i); ++k) {
330             int iy = iy4 + k;
331             if (iy < 0)
332               iy = 0;
333             else if (iy > height - 1)
334               iy = height - 1;
335             iy = iy * stride;
336 
                 // Sample load and rearrangement: same sequence as in 4a.
337             __m256i v_refl = _mm256_inserti128_si256(
338                 _mm256_setzero_si256(),
339                 _mm_loadu_si128((__m128i *)&ref[iy + ix4 - 7]), 0);
340             v_refl = _mm256_inserti128_si256(
341                 v_refl, _mm_loadu_si128((__m128i *)&ref[iy + ix4 + 1]),
342                 1);  // R15 .. R0
343 
344             __m256i v_ref = _mm256_permute4x64_epi64(v_refl, 0xEE);
345 
346             __m256i v_refu =
347                 _mm256_alignr_epi8(v_ref, v_refl, 2);  // R8R15R14...R2R1
348 
349             v_refl = _mm256_inserti128_si256(
350                 v_refl, _mm256_extracti128_si256(v_refu, 0), 1);
351             v_refu = _mm256_inserti128_si256(
352                 v_refu, _mm256_extracti128_si256(v_ref, 0), 0);
353 
354             __m256i v_sum = _mm256_set1_epi32(ohoriz);
355             __m256i parsum = _mm256_madd_epi16(
356                 v_c01, _mm256_alignr_epi8(v_refu, v_refl,
357                                           0));  // R8R7R6..R1R7R6R5..R1R0
358             __m256i v_sum1 = _mm256_add_epi32(v_sum, parsum);
359 
360             parsum = _mm256_madd_epi16(
361                 v_c23,
362                 _mm256_alignr_epi8(v_refu, v_refl, 4));  // R10R9..R3R9R8..R3R2
363             __m256i v_sum2 = _mm256_add_epi32(v_sum1, parsum);
364             parsum = _mm256_madd_epi16(
365                 v_c45, _mm256_alignr_epi8(v_refu, v_refl,
366                                           8));  // R12R11..R5R11R10..R5R4
367             __m256i v_sum3 = _mm256_add_epi32(v_sum2, parsum);
368             parsum = _mm256_madd_epi16(
369                 v_c67, _mm256_alignr_epi8(v_refu, v_refl,
370                                           12));  // R14R13..R7R13R12..R7R6
371             __m256i v_sum4 = _mm256_add_epi32(v_sum3, parsum);
372 
373             tmp[k + 7] = _mm256_srai_epi32(_mm256_add_epi32(v_sum4, v_rbhoriz),
374                                            reduce_bits_horiz);
375           }
376 
           // 4d: general case. The filter differs per column and per row, so
           // the eight filters are reloaded and transposed for every row.
377         } else {
378           for (int k = -7; k < AOMMIN(8, p_height - i); ++k) {
379             int iy = iy4 + k;
380             if (iy < 0)
381               iy = 0;
382             else if (iy > height - 1)
383               iy = height - 1;
384             iy = iy * stride;
385 
386             sx = sx4 + beta * (k + 4);
387 
388             __m256i v_coeff01 = _mm256_inserti128_si256(
389                 v_zeros,
390                 _mm_loadu_si128(
391                     (__m128i *)av1_warped_filter[(sx) >> WARPEDDIFF_PREC_BITS]),
392                 0);
393             v_coeff01 = _mm256_inserti128_si256(
394                 v_coeff01,
395                 _mm_loadu_si128(
396                     (__m128i *)av1_warped_filter[(sx + alpha) >>
397                                                  WARPEDDIFF_PREC_BITS]),
398                 1);  // B7B6..B1B0A7A6..A1A0
399             __m256i v_coeff23 = _mm256_inserti128_si256(
400                 v_zeros,
401                 _mm_loadu_si128(
402                     (__m128i *)av1_warped_filter[(sx + 2 * alpha) >>
403                                                  WARPEDDIFF_PREC_BITS]),
404                 0);
405             v_coeff23 = _mm256_inserti128_si256(
406                 v_coeff23,
407                 _mm_loadu_si128(
408                     (__m128i *)av1_warped_filter[(sx + 3 * alpha) >>
409                                                  WARPEDDIFF_PREC_BITS]),
410                 1);  // D7D6..D1D0C7C6..C1C0
411             __m256i v_coeff45 = _mm256_inserti128_si256(
412                 v_zeros,
413                 _mm_loadu_si128(
414                     (__m128i *)av1_warped_filter[(sx + 4 * alpha) >>
415                                                  WARPEDDIFF_PREC_BITS]),
416                 0);
417             v_coeff45 = _mm256_inserti128_si256(
418                 v_coeff45,
419                 _mm_loadu_si128(
420                     (__m128i *)av1_warped_filter[(sx + 5 * alpha) >>
421                                                  WARPEDDIFF_PREC_BITS]),
422                 1);  // F7F6..F1F0E7E6..E1E0
423             __m256i v_coeff67 = _mm256_inserti128_si256(
424                 v_zeros,
425                 _mm_loadu_si128(
426                     (__m128i *)av1_warped_filter[(sx + 6 * alpha) >>
427                                                  WARPEDDIFF_PREC_BITS]),
428                 0);
429             v_coeff67 = _mm256_inserti128_si256(
430                 v_coeff67,
431                 _mm_loadu_si128(
432                     (__m128i *)av1_warped_filter[(sx + 7 * alpha) >>
433                                                  WARPEDDIFF_PREC_BITS]),
434                 1);  // H7H6..H1H0G7G6..G1G0
435 
436             __m256i v_c0123 = _mm256_unpacklo_epi32(
437                 v_coeff01,
438                 v_coeff23);  // D3D2B3B2D1D0B1B0C3C2A3A2C1C0A1A0
439             __m256i v_c0123u = _mm256_unpackhi_epi32(
440                 v_coeff01,
441                 v_coeff23);  // D7D6B7B6D5D4B5B4C7C6A7A6C5C4A5A4
442             __m256i v_c4567 = _mm256_unpacklo_epi32(
443                 v_coeff45,
444                 v_coeff67);  // H3H2F3F2H1H0F1F0G3G2E3E2G1G0E1E0
445             __m256i v_c4567u = _mm256_unpackhi_epi32(
446                 v_coeff45,
447                 v_coeff67);  // H7H6F7F6H5H4F5F4G7G6E7E6G5G4E5E4
448 
449             __m256i v_c01 = _mm256_unpacklo_epi64(
450                 v_c0123, v_c4567);  // H1H0F1F0D1D0B1B0G1G0E1E0C1C0A1A0
451             __m256i v_c23 =
452                 _mm256_unpackhi_epi64(v_c0123, v_c4567);  // H3H2 ... A3A2
453             __m256i v_c45 =
454                 _mm256_unpacklo_epi64(v_c0123u, v_c4567u);  // H5H4 ... A5A4
455             __m256i v_c67 =
456                 _mm256_unpackhi_epi64(v_c0123u, v_c4567u);  // H7H6 ... A7A6
457 
                 // Sample load and rearrangement: same sequence as in 4a.
458             __m256i v_refl = _mm256_inserti128_si256(
459                 _mm256_setzero_si256(),
460                 _mm_loadu_si128((__m128i *)&ref[iy + ix4 - 7]), 0);
461             v_refl = _mm256_inserti128_si256(
462                 v_refl, _mm_loadu_si128((__m128i *)&ref[iy + ix4 + 1]),
463                 1);  // R15 .. R0
464 
465             __m256i v_ref = _mm256_permute4x64_epi64(v_refl, 0xEE);
466 
467             __m256i v_refu =
468                 _mm256_alignr_epi8(v_ref, v_refl, 2);  // R8R15R14...R2R1
469 
470             v_refl = _mm256_inserti128_si256(
471                 v_refl, _mm256_extracti128_si256(v_refu, 0), 1);
472             v_refu = _mm256_inserti128_si256(
473                 v_refu, _mm256_extracti128_si256(v_ref, 0), 0);
474 
475             __m256i v_sum = _mm256_set1_epi32(ohoriz);
476             __m256i parsum =
477                 _mm256_madd_epi16(v_c01, _mm256_alignr_epi8(v_refu, v_refl, 0));
478             __m256i v_sum1 = _mm256_add_epi32(v_sum, parsum);
479 
480             parsum =
481                 _mm256_madd_epi16(v_c23, _mm256_alignr_epi8(v_refu, v_refl, 4));
482             __m256i v_sum2 = _mm256_add_epi32(v_sum1, parsum);
483             parsum =
484                 _mm256_madd_epi16(v_c45, _mm256_alignr_epi8(v_refu, v_refl, 8));
485             __m256i v_sum3 = _mm256_add_epi32(v_sum2, parsum);
486             parsum = _mm256_madd_epi16(v_c67,
487                                        _mm256_alignr_epi8(v_refu, v_refl, 12));
488             __m256i v_sum4 = _mm256_add_epi32(v_sum3, parsum);
489 
490             tmp[k + 7] = _mm256_srai_epi32(_mm256_add_epi32(v_sum4, v_rbhoriz),
491                                            reduce_bits_horiz);
492           }
493         }
494       }
495 
496       // Vertical filter
         // One iteration per output row (row i + k + 4 of the region); the
         // loop is cut short when fewer than 8 output rows remain. Each output
         // row reads the eight consecutive tmp rows src[0..7].
497       for (int k = -4; k < AOMMIN(4, p_height - i - 4); ++k) {
498         int sy = sy4 + delta * (k + 4);
499         const __m256i *src = tmp + (k + 4);
500 
           // Load the eight per-column vertical filters (phase sy + n * gamma)
           // and transpose them into tap-pair vectors, exactly as the
           // horizontal path does with alpha.
501         __m256i v_coeff01 = _mm256_inserti128_si256(
502             v_zeros,
503             _mm_loadu_si128(
504                 (__m128i *)av1_warped_filter[(sy) >> WARPEDDIFF_PREC_BITS]),
505             0);
506         v_coeff01 = _mm256_inserti128_si256(
507             v_coeff01,
508             _mm_loadu_si128(
509                 (__m128i *)
510                     av1_warped_filter[(sy + gamma) >> WARPEDDIFF_PREC_BITS]),
511             1);
512         __m256i v_coeff23 = _mm256_inserti128_si256(
513             v_zeros,
514             _mm_loadu_si128((__m128i *)av1_warped_filter[(sy + 2 * gamma) >>
515                                                          WARPEDDIFF_PREC_BITS]),
516             0);
517         v_coeff23 = _mm256_inserti128_si256(
518             v_coeff23,
519             _mm_loadu_si128((__m128i *)av1_warped_filter[(sy + 3 * gamma) >>
520                                                          WARPEDDIFF_PREC_BITS]),
521             1);
522         __m256i v_coeff45 = _mm256_inserti128_si256(
523             v_zeros,
524             _mm_loadu_si128((__m128i *)av1_warped_filter[(sy + 4 * gamma) >>
525                                                          WARPEDDIFF_PREC_BITS]),
526             0);
527         v_coeff45 = _mm256_inserti128_si256(
528             v_coeff45,
529             _mm_loadu_si128((__m128i *)av1_warped_filter[(sy + 5 * gamma) >>
530                                                          WARPEDDIFF_PREC_BITS]),
531             1);
532         __m256i v_coeff67 = _mm256_inserti128_si256(
533             v_zeros,
534             _mm_loadu_si128((__m128i *)av1_warped_filter[(sy + 6 * gamma) >>
535                                                          WARPEDDIFF_PREC_BITS]),
536             0);
537         v_coeff67 = _mm256_inserti128_si256(
538             v_coeff67,
539             _mm_loadu_si128((__m128i *)av1_warped_filter[(sy + 7 * gamma) >>
540                                                          WARPEDDIFF_PREC_BITS]),
541             1);
542 
543         __m256i v_c0123 = _mm256_unpacklo_epi32(
544             v_coeff01,
545             v_coeff23);  // D3D2B3B2D1D0B1B0C3C2A3A2C1C0A1A0
546         __m256i v_c0123u = _mm256_unpackhi_epi32(
547             v_coeff01,
548             v_coeff23);  // D7D6B7B6D5D4B5B4C7C6A7A6C5C4A5A4
549         __m256i v_c4567 = _mm256_unpacklo_epi32(
550             v_coeff45,
551             v_coeff67);  // H3H2F3F2H1H0F1F0G3G2E3E2G1G0E1E0
552         __m256i v_c4567u = _mm256_unpackhi_epi32(
553             v_coeff45,
554             v_coeff67);  // H7H6F7F6H5H4F5F4G7G6E7E6G5G4E5E4
555 
556         __m256i v_c01 = _mm256_unpacklo_epi64(
557             v_c0123, v_c4567);  // H1H0F1F0D1D0B1B0G1G0E1E0C1C0A1A0
558         __m256i v_c23 =
559             _mm256_unpackhi_epi64(v_c0123, v_c4567);  // H3H2 ... A3A2
560         __m256i v_c45 =
561             _mm256_unpacklo_epi64(v_c0123u, v_c4567u);  // H5H4 ... A5A4
562         __m256i v_c67 =
563             _mm256_unpackhi_epi64(v_c0123u, v_c4567u);  // H7H6 ... A7A6
564 
           // For each pair of tmp rows: interleave the two rows, pack the
           // 32-bit values to 16 bits with unsigned saturation, and
           // multiply-add against the matching tap pair.
565         __m256i v_src01l =
566             _mm256_unpacklo_epi32(src[0], src[1]);  // T13T03T11T01T12T02T10T00
567         __m256i v_src01u =
568             _mm256_unpackhi_epi32(src[0], src[1]);  // T17T07T15T05T16T06T14T04
569         __m256i v_sum =
570             _mm256_madd_epi16(_mm256_packus_epi32(v_src01l, v_src01u),
571                               v_c01);  // S7S5S3S1S6S4S2S0
572 
573         __m256i v_src23l = _mm256_unpacklo_epi32(src[2], src[3]);
574         __m256i v_src23u = _mm256_unpackhi_epi32(src[2], src[3]);
575         v_sum = _mm256_add_epi32(
576             v_sum,
577             _mm256_madd_epi16(_mm256_packus_epi32(v_src23l, v_src23u), v_c23));
578 
579         __m256i v_src45l = _mm256_unpacklo_epi32(src[4], src[5]);
580         __m256i v_src45u = _mm256_unpackhi_epi32(src[4], src[5]);
581         v_sum = _mm256_add_epi32(
582             v_sum,
583             _mm256_madd_epi16(_mm256_packus_epi32(v_src45l, v_src45u), v_c45));
584 
585         __m256i v_src67l = _mm256_unpacklo_epi32(src[6], src[7]);
586         __m256i v_src67u = _mm256_unpackhi_epi32(src[6], src[7]);
587         v_sum = _mm256_add_epi32(
588             v_sum,
589             _mm256_madd_epi16(_mm256_packus_epi32(v_src67l, v_src67u), v_c67));
590 
591         // unpack S7S5S3S1S6S4S2S0 to S7S6S5S4S3S2S1S0
592 
593         __m256i v_suml =
594             _mm256_permute4x64_epi64(v_sum, 0xD8);  // S7S5S6S4S3S1S2S0
595         __m256i v_sumh =
596             _mm256_permute4x64_epi64(v_sum, 0x32);      // S2S0S7S5S2S0S3S1
597         v_sum = _mm256_unpacklo_epi32(v_suml, v_sumh);  // S7S6S5S4S3S2S1S0
598 
           // Compound prediction: results go through conv_params->dst.
599         if (conv_params->is_compound) {
             // Address of this row's 8 values in the compound buffer.
600           __m128i *const p =
601               (__m128i *)&conv_params
602                   ->dst[(i + k + 4) * conv_params->dst_stride + j];
603 
             // Add the vertical offset, then round and shift by
             // reduce_bits_vert.
604           v_sum = _mm256_add_epi32(v_sum, res_add_const);
605           v_sum =
606               _mm256_sra_epi32(_mm256_add_epi32(v_sum, reduce_bits_vert_const),
607                                reduce_bits_vert_shift);
             // Averaging pass: combine with the values already stored in the
             // compound buffer and write the final samples to pred.
608           if (conv_params->do_average) {
609             __m128i *const dst16 = (__m128i *)&pred[(i + k + 4) * p_stride + j];
                 // Existing buffer contents, read as 8 unsigned 16-bit values
                 // and widened to 32 bits.
610             __m256i p_32 = _mm256_cvtepu16_epi32(_mm_loadu_si128(p));
611 
                 // Either a weighted sum (buffer * wt0 + new * wt1) shifted
                 // down by DIST_PRECISION_BITS, or a plain (a + b) >> 1.
612             if (conv_params->use_dist_wtd_comp_avg) {
613               v_sum = _mm256_add_epi32(_mm256_mullo_epi32(p_32, wt0),
614                                        _mm256_mullo_epi32(v_sum, wt1));
615               v_sum = _mm256_srai_epi32(v_sum, DIST_PRECISION_BITS);
616             } else {
617               v_sum = _mm256_srai_epi32(_mm256_add_epi32(p_32, v_sum), 1);
618             }
619 
                 // Add the (negative) res_sub_const, then round and shift by
                 // round_bits.
620             __m256i v_sum1 = _mm256_add_epi32(v_sum, res_sub_const);
621             v_sum1 = _mm256_sra_epi32(
622                 _mm256_add_epi32(v_sum1, round_bits_const), round_bits_shift);
623 
                 // Pack to 16 bits with unsigned saturation, bring the eight
                 // results into the low 128 bits, apply the upper clamp and
                 // store 8 samples to pred.
624             __m256i v_sum16 = _mm256_packus_epi32(v_sum1, v_sum1);
625             v_sum16 = _mm256_permute4x64_epi64(v_sum16, 0xD8);
626             v_sum16 = _mm256_min_epi16(v_sum16, clip_pixel);
627             _mm_storeu_si128(dst16, _mm256_extracti128_si256(v_sum16, 0));
628           } else {
                 // First pass: pack to 16 bits and store the intermediate
                 // values into the compound buffer; pred is not written.
629             v_sum = _mm256_packus_epi32(v_sum, v_sum);
630             __m256i v_sum16 = _mm256_permute4x64_epi64(v_sum, 0xD8);
631             _mm_storeu_si128(p, _mm256_extracti128_si256(v_sum16, 0));
632           }
633         } else {
634           // Round and pack into 16 bits
             // round_const combines a negative term at bit
             // (bd + reduce_bits_vert - 1) with the rounding term for the
             // shift by reduce_bits_vert.
635           const __m256i round_const =
636               _mm256_set1_epi32(-(1 << (bd + reduce_bits_vert - 1)) +
637                                 ((1 << reduce_bits_vert) >> 1));
638 
639           __m256i v_sum1 = _mm256_srai_epi32(
640               _mm256_add_epi32(v_sum, round_const), reduce_bits_vert);
641 
642           v_sum1 = _mm256_packus_epi32(v_sum1, v_sum1);
643           __m256i v_sum16 = _mm256_permute4x64_epi64(v_sum1, 0xD8);
644           // Clamp res_16bit to the range [0, 2^bd - 1]
645           const __m256i max_val = _mm256_set1_epi16((1 << bd) - 1);
646           const __m256i zero = _mm256_setzero_si256();
647           v_sum16 = _mm256_max_epi16(_mm256_min_epi16(v_sum16, max_val), zero);
648 
             // Store 8 output samples for this row of the tile.
649           __m128i *const p = (__m128i *)&pred[(i + k + 4) * p_stride + j];
650 
651           _mm_storeu_si128(p, _mm256_extracti128_si256(v_sum16, 0));
652         }
653       }
654     }
655   }
656 }
657