/*
 * Copyright (c) 2016, Alliance for Open Media. All rights reserved.
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */

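// SSE4.1 implementation of high-bitdepth affine warp prediction
// (av1_highbd_warp_affine).
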
#include <smmintrin.h>

#include "config/av1_rtcd.h"

#include "av1/common/warped_motion.h"

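// Shuffle mask that gathers the low bytes of eight 16-bit pixels into lanes
// 0-7 and the high bytes into lanes 8-15, so that the byte-oriented
// warp_pad_left/right tables can also be applied to high-bitdepth data.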
static const uint8_t warp_highbd_arrange_bytes[16] = { 0,  2,  4,  6, 8, 10,
                                                       12, 14, 1,  3, 5, 7,
                                                       9,  11, 13, 15 };

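// Shuffle masks that broadcast one pair of 16-bit filter coefficients
// (coeffs 0-1, 2-3, 4-5 and 6-7 respectively) across a whole register, for
// the alpha == 0 case where every pixel in a row shares the same filter.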
static const uint8_t highbd_shuffle_alpha0_mask0[16] = {
  0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3
};
static const uint8_t highbd_shuffle_alpha0_mask1[16] = {
  4, 5, 6, 7, 4, 5, 6, 7, 4, 5, 6, 7, 4, 5, 6, 7
};
static const uint8_t highbd_shuffle_alpha0_mask2[16] = { 8,  9,  10, 11, 8,  9,
                                                         10, 11, 8,  9,  10, 11,
                                                         8,  9,  10, 11 };
static const uint8_t highbd_shuffle_alpha0_mask3[16] = { 12, 13, 14, 15, 12, 13,
                                                         14, 15, 12, 13, 14, 15,
                                                         12, 13, 14, 15 };

static inline void highbd_prepare_horizontal_filter_coeff(int alpha, int sx,
                                                          __m128i *coeff) {
  // Filter even-index pixels
  const __m128i tmp_0 =
      _mm_loadu_si128((__m128i *)(av1_warped_filter +
                                  ((sx + 0 * alpha) >> WARPEDDIFF_PREC_BITS)));
  const __m128i tmp_2 =
      _mm_loadu_si128((__m128i *)(av1_warped_filter +
                                  ((sx + 2 * alpha) >> WARPEDDIFF_PREC_BITS)));
  const __m128i tmp_4 =
      _mm_loadu_si128((__m128i *)(av1_warped_filter +
                                  ((sx + 4 * alpha) >> WARPEDDIFF_PREC_BITS)));
  const __m128i tmp_6 =
      _mm_loadu_si128((__m128i *)(av1_warped_filter +
                                  ((sx + 6 * alpha) >> WARPEDDIFF_PREC_BITS)));

  // coeffs 0 1 0 1 2 3 2 3 for pixels 0, 2
  const __m128i tmp_8 = _mm_unpacklo_epi32(tmp_0, tmp_2);
  // coeffs 0 1 0 1 2 3 2 3 for pixels 4, 6
  const __m128i tmp_10 = _mm_unpacklo_epi32(tmp_4, tmp_6);
  // coeffs 4 5 4 5 6 7 6 7 for pixels 0, 2
  const __m128i tmp_12 = _mm_unpackhi_epi32(tmp_0, tmp_2);
  // coeffs 4 5 4 5 6 7 6 7 for pixels 4, 6
  const __m128i tmp_14 = _mm_unpackhi_epi32(tmp_4, tmp_6);

  // coeffs 0 1 0 1 0 1 0 1 for pixels 0, 2, 4, 6
  coeff[0] = _mm_unpacklo_epi64(tmp_8, tmp_10);
  // coeffs 2 3 2 3 2 3 2 3 for pixels 0, 2, 4, 6
  coeff[2] = _mm_unpackhi_epi64(tmp_8, tmp_10);
  // coeffs 4 5 4 5 4 5 4 5 for pixels 0, 2, 4, 6
  coeff[4] = _mm_unpacklo_epi64(tmp_12, tmp_14);
  // coeffs 6 7 6 7 6 7 6 7 for pixels 0, 2, 4, 6
  coeff[6] = _mm_unpackhi_epi64(tmp_12, tmp_14);

  // Filter odd-index pixels
  const __m128i tmp_1 =
      _mm_loadu_si128((__m128i *)(av1_warped_filter +
                                  ((sx + 1 * alpha) >> WARPEDDIFF_PREC_BITS)));
  const __m128i tmp_3 =
      _mm_loadu_si128((__m128i *)(av1_warped_filter +
                                  ((sx + 3 * alpha) >> WARPEDDIFF_PREC_BITS)));
  const __m128i tmp_5 =
      _mm_loadu_si128((__m128i *)(av1_warped_filter +
                                  ((sx + 5 * alpha) >> WARPEDDIFF_PREC_BITS)));
  const __m128i tmp_7 =
      _mm_loadu_si128((__m128i *)(av1_warped_filter +
                                  ((sx + 7 * alpha) >> WARPEDDIFF_PREC_BITS)));

  // coeffs 0 1 0 1 2 3 2 3 for pixels 1, 3
  const __m128i tmp_9 = _mm_unpacklo_epi32(tmp_1, tmp_3);
  // coeffs 0 1 0 1 2 3 2 3 for pixels 5, 7
  const __m128i tmp_11 = _mm_unpacklo_epi32(tmp_5, tmp_7);
  // coeffs 4 5 4 5 6 7 6 7 for pixels 1, 3
  const __m128i tmp_13 = _mm_unpackhi_epi32(tmp_1, tmp_3);
  // coeffs 4 5 4 5 6 7 6 7 for pixels 5, 7
  const __m128i tmp_15 = _mm_unpackhi_epi32(tmp_5, tmp_7);

  // coeffs 0 1 0 1 0 1 0 1 for pixels 1, 3, 5, 7
  coeff[1] = _mm_unpacklo_epi64(tmp_9, tmp_11);
  // coeffs 2 3 2 3 2 3 2 3 for pixels 1, 3, 5, 7
  coeff[3] = _mm_unpackhi_epi64(tmp_9, tmp_11);
  // coeffs 4 5 4 5 4 5 4 5 for pixels 1, 3, 5, 7
  coeff[5] = _mm_unpacklo_epi64(tmp_13, tmp_15);
  // coeffs 6 7 6 7 6 7 6 7 for pixels 1, 3, 5, 7
  coeff[7] = _mm_unpackhi_epi64(tmp_13, tmp_15);
}

static inline void highbd_prepare_horizontal_filter_coeff_alpha0(
    int sx, __m128i *coeff) {
  // With alpha == 0 every pixel in the row uses the same filter, so load it
  // once and broadcast each pair of coefficients to all four pixel lanes.
  const __m128i tmp_0 = _mm_loadu_si128(
      (__m128i *)(av1_warped_filter + (sx >> WARPEDDIFF_PREC_BITS)));

  coeff[0] = _mm_shuffle_epi8(
      tmp_0, _mm_loadu_si128((__m128i *)highbd_shuffle_alpha0_mask0));
  coeff[2] = _mm_shuffle_epi8(
      tmp_0, _mm_loadu_si128((__m128i *)highbd_shuffle_alpha0_mask1));
  coeff[4] = _mm_shuffle_epi8(
      tmp_0, _mm_loadu_si128((__m128i *)highbd_shuffle_alpha0_mask2));
  coeff[6] = _mm_shuffle_epi8(
      tmp_0, _mm_loadu_si128((__m128i *)highbd_shuffle_alpha0_mask3));

  coeff[1] = coeff[0];
  coeff[3] = coeff[2];
  coeff[5] = coeff[4];
  coeff[7] = coeff[6];
}

static inline void highbd_filter_src_pixels(
    const __m128i *src, const __m128i *src2, __m128i *tmp, __m128i *coeff,
    const int offset_bits_horiz, const int reduce_bits_horiz, int k) {
  const __m128i src_1 = *src;
  const __m128i src2_1 = *src2;

  const __m128i round_const = _mm_set1_epi32((1 << offset_bits_horiz) +
                                             ((1 << reduce_bits_horiz) >> 1));

  // res_{2n} accumulates taps 2n and 2n+1 of the even-index output pixels;
  // each _mm_alignr_epi8 advances the 16-bit source window by 2n pixels.
  const __m128i res_0 = _mm_madd_epi16(src_1, coeff[0]);
  const __m128i res_2 =
      _mm_madd_epi16(_mm_alignr_epi8(src2_1, src_1, 4), coeff[2]);
  const __m128i res_4 =
      _mm_madd_epi16(_mm_alignr_epi8(src2_1, src_1, 8), coeff[4]);
  const __m128i res_6 =
      _mm_madd_epi16(_mm_alignr_epi8(src2_1, src_1, 12), coeff[6]);

  __m128i res_even =
      _mm_add_epi32(_mm_add_epi32(res_0, res_4), _mm_add_epi32(res_2, res_6));
  res_even = _mm_sra_epi32(_mm_add_epi32(res_even, round_const),
                           _mm_cvtsi32_si128(reduce_bits_horiz));

  // Likewise for the odd-index output pixels, whose source windows start one
  // pixel (2 bytes) further along.
  const __m128i res_1 =
      _mm_madd_epi16(_mm_alignr_epi8(src2_1, src_1, 2), coeff[1]);
  const __m128i res_3 =
      _mm_madd_epi16(_mm_alignr_epi8(src2_1, src_1, 6), coeff[3]);
  const __m128i res_5 =
      _mm_madd_epi16(_mm_alignr_epi8(src2_1, src_1, 10), coeff[5]);
  const __m128i res_7 =
      _mm_madd_epi16(_mm_alignr_epi8(src2_1, src_1, 14), coeff[7]);

  __m128i res_odd =
      _mm_add_epi32(_mm_add_epi32(res_1, res_5), _mm_add_epi32(res_3, res_7));
  res_odd = _mm_sra_epi32(_mm_add_epi32(res_odd, round_const),
                          _mm_cvtsi32_si128(reduce_bits_horiz));

  // Combine results into one register.
  // We store the columns in the order 0, 2, 4, 6, 1, 3, 5, 7
  // as this order helps with the vertical filter.
  tmp[k + 7] = _mm_packs_epi32(res_even, res_odd);
}

static inline void highbd_horiz_filter(const __m128i *src, const __m128i *src2,
                                       __m128i *tmp, int sx, int alpha, int k,
                                       const int offset_bits_horiz,
                                       const int reduce_bits_horiz) {
  __m128i coeff[8];
  highbd_prepare_horizontal_filter_coeff(alpha, sx, coeff);
  highbd_filter_src_pixels(src, src2, tmp, coeff, offset_bits_horiz,
                           reduce_bits_horiz, k);
}

static inline void highbd_warp_horizontal_filter_alpha0_beta0(
    const uint16_t *ref, __m128i *tmp, int stride, int32_t ix4, int32_t iy4,
    int32_t sx4, int alpha, int beta, int p_height, int height, int i,
    const int offset_bits_horiz, const int reduce_bits_horiz) {
  (void)beta;
  (void)alpha;
  int k;

  __m128i coeff[8];
  highbd_prepare_horizontal_filter_coeff_alpha0(sx4, coeff);

  for (k = -7; k < AOMMIN(8, p_height - i); ++k) {
    int iy = iy4 + k;
    if (iy < 0)
      iy = 0;
    else if (iy > height - 1)
      iy = height - 1;

    // Load source pixels
    const __m128i src =
        _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7));
    const __m128i src2 =
        _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 + 1));
    highbd_filter_src_pixels(&src, &src2, tmp, coeff, offset_bits_horiz,
                             reduce_bits_horiz, k);
  }
}

static inline void highbd_warp_horizontal_filter_alpha0(
    const uint16_t *ref, __m128i *tmp, int stride, int32_t ix4, int32_t iy4,
    int32_t sx4, int alpha, int beta, int p_height, int height, int i,
    const int offset_bits_horiz, const int reduce_bits_horiz) {
  (void)alpha;
  int k;
  for (k = -7; k < AOMMIN(8, p_height - i); ++k) {
    int iy = iy4 + k;
    if (iy < 0)
      iy = 0;
    else if (iy > height - 1)
      iy = height - 1;
    int sx = sx4 + beta * (k + 4);

    // Load source pixels
    const __m128i src =
        _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7));
    const __m128i src2 =
        _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 + 1));

    __m128i coeff[8];
    highbd_prepare_horizontal_filter_coeff_alpha0(sx, coeff);
    highbd_filter_src_pixels(&src, &src2, tmp, coeff, offset_bits_horiz,
                             reduce_bits_horiz, k);
  }
}

static inline void highbd_warp_horizontal_filter_beta0(
    const uint16_t *ref, __m128i *tmp, int stride, int32_t ix4, int32_t iy4,
    int32_t sx4, int alpha, int beta, int p_height, int height, int i,
    const int offset_bits_horiz, const int reduce_bits_horiz) {
  (void)beta;
  int k;
  __m128i coeff[8];
  highbd_prepare_horizontal_filter_coeff(alpha, sx4, coeff);

  for (k = -7; k < AOMMIN(8, p_height - i); ++k) {
    int iy = iy4 + k;
    if (iy < 0)
      iy = 0;
    else if (iy > height - 1)
      iy = height - 1;

    // Load source pixels
    const __m128i src =
        _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7));
    const __m128i src2 =
        _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 + 1));
    highbd_filter_src_pixels(&src, &src2, tmp, coeff, offset_bits_horiz,
                             reduce_bits_horiz, k);
  }
}

static inline void highbd_warp_horizontal_filter(
    const uint16_t *ref, __m128i *tmp, int stride, int32_t ix4, int32_t iy4,
    int32_t sx4, int alpha, int beta, int p_height, int height, int i,
    const int offset_bits_horiz, const int reduce_bits_horiz) {
  int k;
  for (k = -7; k < AOMMIN(8, p_height - i); ++k) {
    int iy = iy4 + k;
    if (iy < 0)
      iy = 0;
    else if (iy > height - 1)
      iy = height - 1;
    int sx = sx4 + beta * (k + 4);

    // Load source pixels
    const __m128i src =
        _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7));
    const __m128i src2 =
        _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 + 1));

    highbd_horiz_filter(&src, &src2, tmp, sx, alpha, k, offset_bits_horiz,
                        reduce_bits_horiz);
  }
}

// Dispatch to the specialized horizontal kernels when alpha and/or beta are
// zero, since those cases can hoist or share the coefficient preparation.
static inline void highbd_prepare_warp_horizontal_filter(
    const uint16_t *ref, __m128i *tmp, int stride, int32_t ix4, int32_t iy4,
    int32_t sx4, int alpha, int beta, int p_height, int height, int i,
    const int offset_bits_horiz, const int reduce_bits_horiz) {
  if (alpha == 0 && beta == 0)
    highbd_warp_horizontal_filter_alpha0_beta0(
        ref, tmp, stride, ix4, iy4, sx4, alpha, beta, p_height, height, i,
        offset_bits_horiz, reduce_bits_horiz);
  else if (alpha == 0 && beta != 0)
    highbd_warp_horizontal_filter_alpha0(ref, tmp, stride, ix4, iy4, sx4, alpha,
                                         beta, p_height, height, i,
                                         offset_bits_horiz, reduce_bits_horiz);
  else if (alpha != 0 && beta == 0)
    highbd_warp_horizontal_filter_beta0(ref, tmp, stride, ix4, iy4, sx4, alpha,
                                        beta, p_height, height, i,
                                        offset_bits_horiz, reduce_bits_horiz);
  else
    highbd_warp_horizontal_filter(ref, tmp, stride, ix4, iy4, sx4, alpha, beta,
                                  p_height, height, i, offset_bits_horiz,
                                  reduce_bits_horiz);
}

void av1_highbd_warp_affine_sse4_1(const int32_t *mat, const uint16_t *ref,
                                   int width, int height, int stride,
                                   uint16_t *pred, int p_col, int p_row,
                                   int p_width, int p_height, int p_stride,
                                   int subsampling_x, int subsampling_y, int bd,
                                   ConvolveParams *conv_params, int16_t alpha,
                                   int16_t beta, int16_t gamma, int16_t delta) {
  __m128i tmp[15];
  int i, j, k;
  const int reduce_bits_horiz = conv_params->round_0;
  const int reduce_bits_vert = conv_params->is_compound
                                   ? conv_params->round_1
                                   : 2 * FILTER_BITS - reduce_bits_horiz;
  const int offset_bits_horiz = bd + FILTER_BITS - 1;
  assert(IMPLIES(conv_params->is_compound, conv_params->dst != NULL));
  assert(!(bd == 12 && reduce_bits_horiz < 5));
  assert(IMPLIES(conv_params->do_average, conv_params->is_compound));

  // Check that, even with 12-bit input, the intermediate values will fit
  // into an unsigned 16-bit intermediate array.
  assert(bd + FILTER_BITS + 2 - conv_params->round_0 <= 16);

  const int offset_bits_vert = bd + 2 * FILTER_BITS - reduce_bits_horiz;
  const __m128i clip_pixel =
      _mm_set1_epi16(bd == 10 ? 1023 : (bd == 12 ? 4095 : 255));
  const __m128i reduce_bits_vert_shift = _mm_cvtsi32_si128(reduce_bits_vert);
  const __m128i reduce_bits_vert_const =
      _mm_set1_epi32(((1 << reduce_bits_vert) >> 1));
  const __m128i res_add_const = _mm_set1_epi32(1 << offset_bits_vert);
  const int round_bits =
      2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
  const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
  const __m128i res_sub_const =
      _mm_set1_epi32(-(1 << (offset_bits - conv_params->round_1)) -
                     (1 << (offset_bits - conv_params->round_1 - 1)));
  __m128i round_bits_shift = _mm_cvtsi32_si128(round_bits);
  __m128i round_bits_const = _mm_set1_epi32(((1 << round_bits) >> 1));

  const int w0 = conv_params->fwd_offset;
  const int w1 = conv_params->bck_offset;
  const __m128i wt0 = _mm_set1_epi32(w0);
  const __m128i wt1 = _mm_set1_epi32(w1);

  /* Note: For this code to work, the left/right frame borders need to be
     extended by at least 13 pixels each. By the time we get here, other
     code will have set up this border, but we allow an explicit check
     for debugging purposes.
  */
  /*for (i = 0; i < height; ++i) {
    for (j = 0; j < 13; ++j) {
      assert(ref[i * stride - 13 + j] == ref[i * stride]);
      assert(ref[i * stride + width + j] == ref[i * stride + (width - 1)]);
    }
  }*/

  for (i = 0; i < p_height; i += 8) {
    for (j = 0; j < p_width; j += 8) {
      const int32_t src_x = (p_col + j + 4) << subsampling_x;
      const int32_t src_y = (p_row + i + 4) << subsampling_y;
      const int64_t dst_x =
          (int64_t)mat[2] * src_x + (int64_t)mat[3] * src_y + (int64_t)mat[0];
      const int64_t dst_y =
          (int64_t)mat[4] * src_x + (int64_t)mat[5] * src_y + (int64_t)mat[1];
      const int64_t x4 = dst_x >> subsampling_x;
      const int64_t y4 = dst_y >> subsampling_y;

      int32_t ix4 = (int32_t)(x4 >> WARPEDMODEL_PREC_BITS);
      int32_t sx4 = x4 & ((1 << WARPEDMODEL_PREC_BITS) - 1);
      int32_t iy4 = (int32_t)(y4 >> WARPEDMODEL_PREC_BITS);
      int32_t sy4 = y4 & ((1 << WARPEDMODEL_PREC_BITS) - 1);

      // Add in all the constant terms, including rounding and offset
      sx4 += alpha * (-4) + beta * (-4) + (1 << (WARPEDDIFF_PREC_BITS - 1)) +
             (WARPEDPIXEL_PREC_SHIFTS << WARPEDDIFF_PREC_BITS);
      sy4 += gamma * (-4) + delta * (-4) + (1 << (WARPEDDIFF_PREC_BITS - 1)) +
             (WARPEDPIXEL_PREC_SHIFTS << WARPEDDIFF_PREC_BITS);

      sx4 &= ~((1 << WARP_PARAM_REDUCE_BITS) - 1);
      sy4 &= ~((1 << WARP_PARAM_REDUCE_BITS) - 1);

      // Horizontal filter
      // If the block is aligned such that, after clamping, every sample
      // would be taken from the leftmost/rightmost column, then we can
      // skip the expensive horizontal filter.
      if (ix4 <= -7) {
        for (k = -7; k < AOMMIN(8, p_height - i); ++k) {
          int iy = iy4 + k;
          if (iy < 0)
            iy = 0;
          else if (iy > height - 1)
            iy = height - 1;
          tmp[k + 7] = _mm_set1_epi16(
              (1 << (bd + FILTER_BITS - reduce_bits_horiz - 1)) +
              ref[iy * stride] * (1 << (FILTER_BITS - reduce_bits_horiz)));
        }
      } else if (ix4 >= width + 6) {
        for (k = -7; k < AOMMIN(8, p_height - i); ++k) {
          int iy = iy4 + k;
          if (iy < 0)
            iy = 0;
          else if (iy > height - 1)
            iy = height - 1;
          tmp[k + 7] =
              _mm_set1_epi16((1 << (bd + FILTER_BITS - reduce_bits_horiz - 1)) +
                             ref[iy * stride + (width - 1)] *
                                 (1 << (FILTER_BITS - reduce_bits_horiz)));
        }
      } else if (((ix4 - 7) < 0) || ((ix4 + 9) > width)) {
        const int out_of_boundary_left = -(ix4 - 6);
        const int out_of_boundary_right = (ix4 + 8) - width;

        for (k = -7; k < AOMMIN(8, p_height - i); ++k) {
          int iy = iy4 + k;
          if (iy < 0)
            iy = 0;
          else if (iy > height - 1)
            iy = height - 1;
          int sx = sx4 + beta * (k + 4);

          // Load source pixels
          const __m128i src =
              _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7));
          const __m128i src2 =
              _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 + 1));

          // The warp_pad_left/right tables are byte shuffles designed for
          // 8-bit pixels, so split each 16-bit pixel into its low and high
          // bytes, pad both halves identically, then re-interleave.
          const __m128i src_01 = _mm_shuffle_epi8(
              src, _mm_loadu_si128((__m128i *)warp_highbd_arrange_bytes));
          const __m128i src2_01 = _mm_shuffle_epi8(
              src2, _mm_loadu_si128((__m128i *)warp_highbd_arrange_bytes));

          __m128i src_lo = _mm_unpacklo_epi64(src_01, src2_01);
          __m128i src_hi = _mm_unpackhi_epi64(src_01, src2_01);

          if (out_of_boundary_left >= 0) {
            const __m128i shuffle_reg_left =
                _mm_loadu_si128((__m128i *)warp_pad_left[out_of_boundary_left]);
            src_lo = _mm_shuffle_epi8(src_lo, shuffle_reg_left);
            src_hi = _mm_shuffle_epi8(src_hi, shuffle_reg_left);
          }

          if (out_of_boundary_right >= 0) {
            const __m128i shuffle_reg_right = _mm_loadu_si128(
                (__m128i *)warp_pad_right[out_of_boundary_right]);
            src_lo = _mm_shuffle_epi8(src_lo, shuffle_reg_right);
            src_hi = _mm_shuffle_epi8(src_hi, shuffle_reg_right);
          }

          const __m128i src_padded = _mm_unpacklo_epi8(src_lo, src_hi);
          const __m128i src2_padded = _mm_unpackhi_epi8(src_lo, src_hi);

          highbd_horiz_filter(&src_padded, &src2_padded, tmp, sx, alpha, k,
                              offset_bits_horiz, reduce_bits_horiz);
        }
      } else {
        highbd_prepare_warp_horizontal_filter(
            ref, tmp, stride, ix4, iy4, sx4, alpha, beta, p_height, height, i,
            offset_bits_horiz, reduce_bits_horiz);
      }

      // Vertical filter
      for (k = -4; k < AOMMIN(4, p_height - i - 4); ++k) {
        int sy = sy4 + delta * (k + 4);

        // Load from tmp and rearrange pairs of consecutive rows into the
        // column order 0 0 2 2 4 4 6 6; 1 1 3 3 5 5 7 7
        const __m128i *src = tmp + (k + 4);
        const __m128i src_0 = _mm_unpacklo_epi16(src[0], src[1]);
        const __m128i src_2 = _mm_unpacklo_epi16(src[2], src[3]);
        const __m128i src_4 = _mm_unpacklo_epi16(src[4], src[5]);
        const __m128i src_6 = _mm_unpacklo_epi16(src[6], src[7]);

        // Filter even-index pixels
        const __m128i tmp_0 = _mm_loadu_si128(
            (__m128i *)(av1_warped_filter +
                        ((sy + 0 * gamma) >> WARPEDDIFF_PREC_BITS)));
        const __m128i tmp_2 = _mm_loadu_si128(
            (__m128i *)(av1_warped_filter +
                        ((sy + 2 * gamma) >> WARPEDDIFF_PREC_BITS)));
        const __m128i tmp_4 = _mm_loadu_si128(
            (__m128i *)(av1_warped_filter +
                        ((sy + 4 * gamma) >> WARPEDDIFF_PREC_BITS)));
        const __m128i tmp_6 = _mm_loadu_si128(
            (__m128i *)(av1_warped_filter +
                        ((sy + 6 * gamma) >> WARPEDDIFF_PREC_BITS)));

        const __m128i tmp_8 = _mm_unpacklo_epi32(tmp_0, tmp_2);
        const __m128i tmp_10 = _mm_unpacklo_epi32(tmp_4, tmp_6);
        const __m128i tmp_12 = _mm_unpackhi_epi32(tmp_0, tmp_2);
        const __m128i tmp_14 = _mm_unpackhi_epi32(tmp_4, tmp_6);

        const __m128i coeff_0 = _mm_unpacklo_epi64(tmp_8, tmp_10);
        const __m128i coeff_2 = _mm_unpackhi_epi64(tmp_8, tmp_10);
        const __m128i coeff_4 = _mm_unpacklo_epi64(tmp_12, tmp_14);
        const __m128i coeff_6 = _mm_unpackhi_epi64(tmp_12, tmp_14);

        const __m128i res_0 = _mm_madd_epi16(src_0, coeff_0);
        const __m128i res_2 = _mm_madd_epi16(src_2, coeff_2);
        const __m128i res_4 = _mm_madd_epi16(src_4, coeff_4);
        const __m128i res_6 = _mm_madd_epi16(src_6, coeff_6);

        const __m128i res_even = _mm_add_epi32(_mm_add_epi32(res_0, res_2),
                                               _mm_add_epi32(res_4, res_6));

        // Filter odd-index pixels
        const __m128i src_1 = _mm_unpackhi_epi16(src[0], src[1]);
        const __m128i src_3 = _mm_unpackhi_epi16(src[2], src[3]);
        const __m128i src_5 = _mm_unpackhi_epi16(src[4], src[5]);
        const __m128i src_7 = _mm_unpackhi_epi16(src[6], src[7]);

        const __m128i tmp_1 = _mm_loadu_si128(
            (__m128i *)(av1_warped_filter +
                        ((sy + 1 * gamma) >> WARPEDDIFF_PREC_BITS)));
        const __m128i tmp_3 = _mm_loadu_si128(
            (__m128i *)(av1_warped_filter +
                        ((sy + 3 * gamma) >> WARPEDDIFF_PREC_BITS)));
        const __m128i tmp_5 = _mm_loadu_si128(
            (__m128i *)(av1_warped_filter +
                        ((sy + 5 * gamma) >> WARPEDDIFF_PREC_BITS)));
        const __m128i tmp_7 = _mm_loadu_si128(
            (__m128i *)(av1_warped_filter +
                        ((sy + 7 * gamma) >> WARPEDDIFF_PREC_BITS)));

        const __m128i tmp_9 = _mm_unpacklo_epi32(tmp_1, tmp_3);
        const __m128i tmp_11 = _mm_unpacklo_epi32(tmp_5, tmp_7);
        const __m128i tmp_13 = _mm_unpackhi_epi32(tmp_1, tmp_3);
        const __m128i tmp_15 = _mm_unpackhi_epi32(tmp_5, tmp_7);

        const __m128i coeff_1 = _mm_unpacklo_epi64(tmp_9, tmp_11);
        const __m128i coeff_3 = _mm_unpackhi_epi64(tmp_9, tmp_11);
        const __m128i coeff_5 = _mm_unpacklo_epi64(tmp_13, tmp_15);
        const __m128i coeff_7 = _mm_unpackhi_epi64(tmp_13, tmp_15);

        const __m128i res_1 = _mm_madd_epi16(src_1, coeff_1);
        const __m128i res_3 = _mm_madd_epi16(src_3, coeff_3);
        const __m128i res_5 = _mm_madd_epi16(src_5, coeff_5);
        const __m128i res_7 = _mm_madd_epi16(src_7, coeff_7);

        const __m128i res_odd = _mm_add_epi32(_mm_add_epi32(res_1, res_3),
                                              _mm_add_epi32(res_5, res_7));

        // Rearrange pixels back into the order 0 ... 7
        __m128i res_lo = _mm_unpacklo_epi32(res_even, res_odd);
        __m128i res_hi = _mm_unpackhi_epi32(res_even, res_odd);

        if (conv_params->is_compound) {
          // Compound path: round to the intermediate compound precision and
          // store to (or average with) the 16-bit dst buffer, four pixels at
          // a time.
          __m128i *const p =
              (__m128i *)&conv_params
                  ->dst[(i + k + 4) * conv_params->dst_stride + j];
          res_lo = _mm_add_epi32(res_lo, res_add_const);
          res_lo = _mm_sra_epi32(_mm_add_epi32(res_lo, reduce_bits_vert_const),
                                 reduce_bits_vert_shift);

          if (conv_params->do_average) {
            __m128i *const dst16 = (__m128i *)&pred[(i + k + 4) * p_stride + j];
            __m128i p_32 = _mm_cvtepu16_epi32(_mm_loadl_epi64(p));

            if (conv_params->use_dist_wtd_comp_avg) {
              res_lo = _mm_add_epi32(_mm_mullo_epi32(p_32, wt0),
                                     _mm_mullo_epi32(res_lo, wt1));
              res_lo = _mm_srai_epi32(res_lo, DIST_PRECISION_BITS);
            } else {
              res_lo = _mm_srai_epi32(_mm_add_epi32(p_32, res_lo), 1);
            }

            __m128i res32_lo = _mm_add_epi32(res_lo, res_sub_const);
            res32_lo = _mm_sra_epi32(_mm_add_epi32(res32_lo, round_bits_const),
                                     round_bits_shift);

            __m128i res16_lo = _mm_packus_epi32(res32_lo, res32_lo);
            res16_lo = _mm_min_epi16(res16_lo, clip_pixel);
            _mm_storel_epi64(dst16, res16_lo);
          } else {
            res_lo = _mm_packus_epi32(res_lo, res_lo);
            _mm_storel_epi64(p, res_lo);
          }
          if (p_width > 4) {
            // Repeat for the upper four pixels of the row.
            __m128i *const p4 =
                (__m128i *)&conv_params
                    ->dst[(i + k + 4) * conv_params->dst_stride + j + 4];

            res_hi = _mm_add_epi32(res_hi, res_add_const);
            res_hi =
                _mm_sra_epi32(_mm_add_epi32(res_hi, reduce_bits_vert_const),
                              reduce_bits_vert_shift);
            if (conv_params->do_average) {
              __m128i *const dst16_4 =
                  (__m128i *)&pred[(i + k + 4) * p_stride + j + 4];
              __m128i p4_32 = _mm_cvtepu16_epi32(_mm_loadl_epi64(p4));

              if (conv_params->use_dist_wtd_comp_avg) {
                res_hi = _mm_add_epi32(_mm_mullo_epi32(p4_32, wt0),
                                       _mm_mullo_epi32(res_hi, wt1));
                res_hi = _mm_srai_epi32(res_hi, DIST_PRECISION_BITS);
              } else {
                res_hi = _mm_srai_epi32(_mm_add_epi32(p4_32, res_hi), 1);
              }

              __m128i res32_hi = _mm_add_epi32(res_hi, res_sub_const);
              res32_hi = _mm_sra_epi32(
                  _mm_add_epi32(res32_hi, round_bits_const), round_bits_shift);
              __m128i res16_hi = _mm_packus_epi32(res32_hi, res32_hi);
              res16_hi = _mm_min_epi16(res16_hi, clip_pixel);
              _mm_storel_epi64(dst16_4, res16_hi);
            } else {
              res_hi = _mm_packus_epi32(res_hi, res_hi);
              _mm_storel_epi64(p4, res_hi);
            }
          }
        } else {
          // Round and pack into 16 bits
          const __m128i round_const =
              _mm_set1_epi32(-(1 << (bd + reduce_bits_vert - 1)) +
                             ((1 << reduce_bits_vert) >> 1));

          const __m128i res_lo_round = _mm_srai_epi32(
              _mm_add_epi32(res_lo, round_const), reduce_bits_vert);
          const __m128i res_hi_round = _mm_srai_epi32(
              _mm_add_epi32(res_hi, round_const), reduce_bits_vert);

          __m128i res_16bit = _mm_packs_epi32(res_lo_round, res_hi_round);
          // Clamp res_16bit to the range [0, 2^bd - 1]
          const __m128i max_val = _mm_set1_epi16((1 << bd) - 1);
          const __m128i zero = _mm_setzero_si128();
          res_16bit = _mm_max_epi16(_mm_min_epi16(res_16bit, max_val), zero);

          // Store, blending with 'pred' if needed
          __m128i *const p = (__m128i *)&pred[(i + k + 4) * p_stride + j];

          // Note: If we're outputting a 4x4 block, we need to be very careful
          // to only output 4 pixels at this point, to avoid encode/decode
          // mismatches when encoding with multiple threads.
          if (p_width == 4) {
            _mm_storel_epi64(p, res_16bit);
          } else {
            _mm_storeu_si128(p, res_16bit);
          }
        }
      }
    }
  }
}
637