1 /*
2 * Copyright (c) 2016, Alliance for Open Media. All rights reserved.
3 *
4 * This source code is subject to the terms of the BSD 2 Clause License and
5 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
6 * was not distributed with this source code in the LICENSE file, you can
7 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
8 * Media Patent License 1.0 was not distributed with this source code in the
9 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
10 */

#include <smmintrin.h>

#include "config/av1_rtcd.h"

#include "av1/common/warped_motion.h"

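// _mm_shuffle_epi8 mask that deinterleaves eight 16-bit pixels: low bytes to
// lanes 0-7, high bytes to lanes 8-15. This lets the 8-bit boundary-padding
// shuffles (warp_pad_left/right) below operate on whole 16-bit pixels.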
static const uint8_t warp_highbd_arrange_bytes[16] = { 0, 2, 4, 6, 8, 10,
                                                       12, 14, 1, 3, 5, 7,
                                                       9, 11, 13, 15 };

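// Broadcast masks for the alpha == 0 case: each replicates one 32-bit pair
// of filter taps across the whole register, since every pixel in the row
// shares the same 8-tap filter.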
static const uint8_t highbd_shuffle_alpha0_mask0[16] = {
  0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3
};
static const uint8_t highbd_shuffle_alpha0_mask1[16] = {
  4, 5, 6, 7, 4, 5, 6, 7, 4, 5, 6, 7, 4, 5, 6, 7
};
static const uint8_t highbd_shuffle_alpha0_mask2[16] = { 8, 9, 10, 11, 8, 9,
                                                         10, 11, 8, 9, 10, 11,
                                                         8, 9, 10, 11 };
static const uint8_t highbd_shuffle_alpha0_mask3[16] = { 12, 13, 14, 15, 12, 13,
                                                         14, 15, 12, 13, 14, 15,
                                                         12, 13, 14, 15 };

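// Load the 8-tap filters for the eight pixel phases sx + m * alpha and
// transpose them so that coeff[2 * t] holds taps {2t, 2t + 1} for the
// even-index pixels and coeff[2 * t + 1] the same taps for the odd-index
// pixels, ready for _mm_madd_epi16.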
static inline void highbd_prepare_horizontal_filter_coeff(int alpha, int sx,
                                                          __m128i *coeff) {
  // Filter even-index pixels
  const __m128i tmp_0 =
      _mm_loadu_si128((__m128i *)(av1_warped_filter +
                                  ((sx + 0 * alpha) >> WARPEDDIFF_PREC_BITS)));
  const __m128i tmp_2 =
      _mm_loadu_si128((__m128i *)(av1_warped_filter +
                                  ((sx + 2 * alpha) >> WARPEDDIFF_PREC_BITS)));
  const __m128i tmp_4 =
      _mm_loadu_si128((__m128i *)(av1_warped_filter +
                                  ((sx + 4 * alpha) >> WARPEDDIFF_PREC_BITS)));
  const __m128i tmp_6 =
      _mm_loadu_si128((__m128i *)(av1_warped_filter +
                                  ((sx + 6 * alpha) >> WARPEDDIFF_PREC_BITS)));

  // coeffs 0 1 0 1 2 3 2 3 for pixels 0, 2
  const __m128i tmp_8 = _mm_unpacklo_epi32(tmp_0, tmp_2);
  // coeffs 0 1 0 1 2 3 2 3 for pixels 4, 6
  const __m128i tmp_10 = _mm_unpacklo_epi32(tmp_4, tmp_6);
  // coeffs 4 5 4 5 6 7 6 7 for pixels 0, 2
  const __m128i tmp_12 = _mm_unpackhi_epi32(tmp_0, tmp_2);
  // coeffs 4 5 4 5 6 7 6 7 for pixels 4, 6
  const __m128i tmp_14 = _mm_unpackhi_epi32(tmp_4, tmp_6);

  // coeffs 0 1 0 1 0 1 0 1 for pixels 0, 2, 4, 6
  coeff[0] = _mm_unpacklo_epi64(tmp_8, tmp_10);
  // coeffs 2 3 2 3 2 3 2 3 for pixels 0, 2, 4, 6
  coeff[2] = _mm_unpackhi_epi64(tmp_8, tmp_10);
  // coeffs 4 5 4 5 4 5 4 5 for pixels 0, 2, 4, 6
  coeff[4] = _mm_unpacklo_epi64(tmp_12, tmp_14);
  // coeffs 6 7 6 7 6 7 6 7 for pixels 0, 2, 4, 6
  coeff[6] = _mm_unpackhi_epi64(tmp_12, tmp_14);

  // Filter odd-index pixels
  const __m128i tmp_1 =
      _mm_loadu_si128((__m128i *)(av1_warped_filter +
                                  ((sx + 1 * alpha) >> WARPEDDIFF_PREC_BITS)));
  const __m128i tmp_3 =
      _mm_loadu_si128((__m128i *)(av1_warped_filter +
                                  ((sx + 3 * alpha) >> WARPEDDIFF_PREC_BITS)));
  const __m128i tmp_5 =
      _mm_loadu_si128((__m128i *)(av1_warped_filter +
                                  ((sx + 5 * alpha) >> WARPEDDIFF_PREC_BITS)));
  const __m128i tmp_7 =
      _mm_loadu_si128((__m128i *)(av1_warped_filter +
                                  ((sx + 7 * alpha) >> WARPEDDIFF_PREC_BITS)));

  const __m128i tmp_9 = _mm_unpacklo_epi32(tmp_1, tmp_3);
  const __m128i tmp_11 = _mm_unpacklo_epi32(tmp_5, tmp_7);
  const __m128i tmp_13 = _mm_unpackhi_epi32(tmp_1, tmp_3);
  const __m128i tmp_15 = _mm_unpackhi_epi32(tmp_5, tmp_7);

  coeff[1] = _mm_unpacklo_epi64(tmp_9, tmp_11);
  coeff[3] = _mm_unpackhi_epi64(tmp_9, tmp_11);
  coeff[5] = _mm_unpacklo_epi64(tmp_13, tmp_15);
  coeff[7] = _mm_unpackhi_epi64(tmp_13, tmp_15);
}

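// When alpha == 0 every pixel in the row uses the same filter, so load it
// once and broadcast each pair of taps instead of doing the full transpose.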
static inline void highbd_prepare_horizontal_filter_coeff_alpha0(
    int sx, __m128i *coeff) {
  // Filter coeff
  const __m128i tmp_0 = _mm_loadu_si128(
      (__m128i *)(av1_warped_filter + (sx >> WARPEDDIFF_PREC_BITS)));

  coeff[0] = _mm_shuffle_epi8(
      tmp_0, _mm_loadu_si128((__m128i *)highbd_shuffle_alpha0_mask0));
  coeff[2] = _mm_shuffle_epi8(
      tmp_0, _mm_loadu_si128((__m128i *)highbd_shuffle_alpha0_mask1));
  coeff[4] = _mm_shuffle_epi8(
      tmp_0, _mm_loadu_si128((__m128i *)highbd_shuffle_alpha0_mask2));
  coeff[6] = _mm_shuffle_epi8(
      tmp_0, _mm_loadu_si128((__m128i *)highbd_shuffle_alpha0_mask3));

  coeff[1] = coeff[0];
  coeff[3] = coeff[2];
  coeff[5] = coeff[4];
  coeff[7] = coeff[6];
}

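// Apply the prepared coefficients to one row of source pixels. src and src2
// together hold pixels ix4 - 7 .. ix4 + 8; _mm_alignr_epi8 by 2 * m bytes
// shifts this window right by m pixels, lining the tap pairs in coeff[m] up
// with the pixels they apply to.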
static inline void highbd_filter_src_pixels(
    const __m128i *src, const __m128i *src2, __m128i *tmp, __m128i *coeff,
    const int offset_bits_horiz, const int reduce_bits_horiz, int k) {
  const __m128i src_1 = *src;
  const __m128i src2_1 = *src2;

  const __m128i round_const = _mm_set1_epi32((1 << offset_bits_horiz) +
                                             ((1 << reduce_bits_horiz) >> 1));

  const __m128i res_0 = _mm_madd_epi16(src_1, coeff[0]);
  const __m128i res_2 =
      _mm_madd_epi16(_mm_alignr_epi8(src2_1, src_1, 4), coeff[2]);
  const __m128i res_4 =
      _mm_madd_epi16(_mm_alignr_epi8(src2_1, src_1, 8), coeff[4]);
  const __m128i res_6 =
      _mm_madd_epi16(_mm_alignr_epi8(src2_1, src_1, 12), coeff[6]);

  __m128i res_even =
      _mm_add_epi32(_mm_add_epi32(res_0, res_4), _mm_add_epi32(res_2, res_6));
  res_even = _mm_sra_epi32(_mm_add_epi32(res_even, round_const),
                           _mm_cvtsi32_si128(reduce_bits_horiz));

  const __m128i res_1 =
      _mm_madd_epi16(_mm_alignr_epi8(src2_1, src_1, 2), coeff[1]);
  const __m128i res_3 =
      _mm_madd_epi16(_mm_alignr_epi8(src2_1, src_1, 6), coeff[3]);
  const __m128i res_5 =
      _mm_madd_epi16(_mm_alignr_epi8(src2_1, src_1, 10), coeff[5]);
  const __m128i res_7 =
      _mm_madd_epi16(_mm_alignr_epi8(src2_1, src_1, 14), coeff[7]);

  __m128i res_odd =
      _mm_add_epi32(_mm_add_epi32(res_1, res_5), _mm_add_epi32(res_3, res_7));
  res_odd = _mm_sra_epi32(_mm_add_epi32(res_odd, round_const),
                          _mm_cvtsi32_si128(reduce_bits_horiz));

  // Combine results into one register.
  // We store the columns in the order 0, 2, 4, 6, 1, 3, 5, 7
  // as this order helps with the vertical filter.
  tmp[k + 7] = _mm_packs_epi32(res_even, res_odd);
}

static inline void highbd_horiz_filter(const __m128i *src, const __m128i *src2,
                                       __m128i *tmp, int sx, int alpha, int k,
                                       const int offset_bits_horiz,
                                       const int reduce_bits_horiz) {
  __m128i coeff[8];
  highbd_prepare_horizontal_filter_coeff(alpha, sx, coeff);
  highbd_filter_src_pixels(src, src2, tmp, coeff, offset_bits_horiz,
                           reduce_bits_horiz, k);
}

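// alpha == 0 && beta == 0: one filter is shared by every pixel of every row
// in the block, so the coefficients are prepared once, outside the row loop.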
static inline void highbd_warp_horizontal_filter_alpha0_beta0(
    const uint16_t *ref, __m128i *tmp, int stride, int32_t ix4, int32_t iy4,
    int32_t sx4, int alpha, int beta, int p_height, int height, int i,
    const int offset_bits_horiz, const int reduce_bits_horiz) {
  (void)beta;
  (void)alpha;
  int k;

  __m128i coeff[8];
  highbd_prepare_horizontal_filter_coeff_alpha0(sx4, coeff);

  for (k = -7; k < AOMMIN(8, p_height - i); ++k) {
    int iy = iy4 + k;
    if (iy < 0)
      iy = 0;
    else if (iy > height - 1)
      iy = height - 1;

    // Load source pixels
    const __m128i src =
        _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7));
    const __m128i src2 =
        _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 + 1));
    highbd_filter_src_pixels(&src, &src2, tmp, coeff, offset_bits_horiz,
                             reduce_bits_horiz, k);
  }
}

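// alpha == 0, beta != 0: the filter phase changes from row to row but is
// constant within a row, so the broadcast coefficients are rebuilt per row.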
static inline void highbd_warp_horizontal_filter_alpha0(
    const uint16_t *ref, __m128i *tmp, int stride, int32_t ix4, int32_t iy4,
    int32_t sx4, int alpha, int beta, int p_height, int height, int i,
    const int offset_bits_horiz, const int reduce_bits_horiz) {
  (void)alpha;
  int k;
  for (k = -7; k < AOMMIN(8, p_height - i); ++k) {
    int iy = iy4 + k;
    if (iy < 0)
      iy = 0;
    else if (iy > height - 1)
      iy = height - 1;
    int sx = sx4 + beta * (k + 4);

    // Load source pixels
    const __m128i src =
        _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7));
    const __m128i src2 =
        _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 + 1));

    __m128i coeff[8];
    highbd_prepare_horizontal_filter_coeff_alpha0(sx, coeff);
    highbd_filter_src_pixels(&src, &src2, tmp, coeff, offset_bits_horiz,
                             reduce_bits_horiz, k);
  }
}

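// alpha != 0, beta == 0: the phase varies across a row but not between rows,
// so the full coefficient transpose is done once and reused for all rows.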
static inline void highbd_warp_horizontal_filter_beta0(
    const uint16_t *ref, __m128i *tmp, int stride, int32_t ix4, int32_t iy4,
    int32_t sx4, int alpha, int beta, int p_height, int height, int i,
    const int offset_bits_horiz, const int reduce_bits_horiz) {
  (void)beta;
  int k;
  __m128i coeff[8];
  highbd_prepare_horizontal_filter_coeff(alpha, sx4, coeff);

  for (k = -7; k < AOMMIN(8, p_height - i); ++k) {
    int iy = iy4 + k;
    if (iy < 0)
      iy = 0;
    else if (iy > height - 1)
      iy = height - 1;

    // Load source pixels
    const __m128i src =
        _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7));
    const __m128i src2 =
        _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 + 1));
    highbd_filter_src_pixels(&src, &src2, tmp, coeff, offset_bits_horiz,
                             reduce_bits_horiz, k);
  }
}

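// General case: the phase varies both across (alpha) and down (beta) the
// block, so the coefficients are recomputed for every row.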
static inline void highbd_warp_horizontal_filter(
    const uint16_t *ref, __m128i *tmp, int stride, int32_t ix4, int32_t iy4,
    int32_t sx4, int alpha, int beta, int p_height, int height, int i,
    const int offset_bits_horiz, const int reduce_bits_horiz) {
  int k;
  for (k = -7; k < AOMMIN(8, p_height - i); ++k) {
    int iy = iy4 + k;
    if (iy < 0)
      iy = 0;
    else if (iy > height - 1)
      iy = height - 1;
    int sx = sx4 + beta * (k + 4);

    // Load source pixels
    const __m128i src =
        _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7));
    const __m128i src2 =
        _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 + 1));

    highbd_horiz_filter(&src, &src2, tmp, sx, alpha, k, offset_bits_horiz,
                        reduce_bits_horiz);
  }
}

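// Dispatch to the cheapest horizontal filter for the given (alpha, beta).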
static inline void highbd_prepare_warp_horizontal_filter(
    const uint16_t *ref, __m128i *tmp, int stride, int32_t ix4, int32_t iy4,
    int32_t sx4, int alpha, int beta, int p_height, int height, int i,
    const int offset_bits_horiz, const int reduce_bits_horiz) {
  if (alpha == 0 && beta == 0)
    highbd_warp_horizontal_filter_alpha0_beta0(
        ref, tmp, stride, ix4, iy4, sx4, alpha, beta, p_height, height, i,
        offset_bits_horiz, reduce_bits_horiz);

  else if (alpha == 0 && beta != 0)
    highbd_warp_horizontal_filter_alpha0(ref, tmp, stride, ix4, iy4, sx4, alpha,
                                         beta, p_height, height, i,
                                         offset_bits_horiz, reduce_bits_horiz);

  else if (alpha != 0 && beta == 0)
    highbd_warp_horizontal_filter_beta0(ref, tmp, stride, ix4, iy4, sx4, alpha,
                                        beta, p_height, height, i,
                                        offset_bits_horiz, reduce_bits_horiz);
  else
    highbd_warp_horizontal_filter(ref, tmp, stride, ix4, iy4, sx4, alpha, beta,
                                  p_height, height, i, offset_bits_horiz,
                                  reduce_bits_horiz);
}

void av1_highbd_warp_affine_sse4_1(const int32_t *mat, const uint16_t *ref,
                                   int width, int height, int stride,
                                   uint16_t *pred, int p_col, int p_row,
                                   int p_width, int p_height, int p_stride,
                                   int subsampling_x, int subsampling_y, int bd,
                                   ConvolveParams *conv_params, int16_t alpha,
                                   int16_t beta, int16_t gamma, int16_t delta) {
  __m128i tmp[15];
  int i, j, k;
  const int reduce_bits_horiz = conv_params->round_0;
  const int reduce_bits_vert = conv_params->is_compound
                                   ? conv_params->round_1
                                   : 2 * FILTER_BITS - reduce_bits_horiz;
  const int offset_bits_horiz = bd + FILTER_BITS - 1;
  assert(IMPLIES(conv_params->is_compound, conv_params->dst != NULL));
  assert(!(bd == 12 && reduce_bits_horiz < 5));
  assert(IMPLIES(conv_params->do_average, conv_params->is_compound));

  // Check that, even with 12-bit input, the intermediate values will fit
  // into an unsigned 16-bit intermediate array.
  assert(bd + FILTER_BITS + 2 - conv_params->round_0 <= 16);

  const int offset_bits_vert = bd + 2 * FILTER_BITS - reduce_bits_horiz;
  const __m128i clip_pixel =
      _mm_set1_epi16(bd == 10 ? 1023 : (bd == 12 ? 4095 : 255));
  const __m128i reduce_bits_vert_shift = _mm_cvtsi32_si128(reduce_bits_vert);
  const __m128i reduce_bits_vert_const =
      _mm_set1_epi32(((1 << reduce_bits_vert) >> 1));
  const __m128i res_add_const = _mm_set1_epi32(1 << offset_bits_vert);
  const int round_bits =
      2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
  const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
  const __m128i res_sub_const =
      _mm_set1_epi32(-(1 << (offset_bits - conv_params->round_1)) -
                     (1 << (offset_bits - conv_params->round_1 - 1)));
  __m128i round_bits_shift = _mm_cvtsi32_si128(round_bits);
  __m128i round_bits_const = _mm_set1_epi32(((1 << round_bits) >> 1));

  const int w0 = conv_params->fwd_offset;
  const int w1 = conv_params->bck_offset;
  const __m128i wt0 = _mm_set1_epi32(w0);
  const __m128i wt1 = _mm_set1_epi32(w1);

  /* Note: For this code to work, the left/right frame borders need to be
     extended by at least 13 pixels each. By the time we get here, other
     code will have set up this border, but we allow an explicit check
     for debugging purposes.
  */
  /*for (i = 0; i < height; ++i) {
    for (j = 0; j < 13; ++j) {
      assert(ref[i * stride - 13 + j] == ref[i * stride]);
      assert(ref[i * stride + width + j] == ref[i * stride + (width - 1)]);
    }
  }*/

  for (i = 0; i < p_height; i += 8) {
    for (j = 0; j < p_width; j += 8) {
      const int32_t src_x = (p_col + j + 4) << subsampling_x;
      const int32_t src_y = (p_row + i + 4) << subsampling_y;
      const int64_t dst_x =
          (int64_t)mat[2] * src_x + (int64_t)mat[3] * src_y + (int64_t)mat[0];
      const int64_t dst_y =
          (int64_t)mat[4] * src_x + (int64_t)mat[5] * src_y + (int64_t)mat[1];
      const int64_t x4 = dst_x >> subsampling_x;
      const int64_t y4 = dst_y >> subsampling_y;

      int32_t ix4 = (int32_t)(x4 >> WARPEDMODEL_PREC_BITS);
      int32_t sx4 = x4 & ((1 << WARPEDMODEL_PREC_BITS) - 1);
      int32_t iy4 = (int32_t)(y4 >> WARPEDMODEL_PREC_BITS);
      int32_t sy4 = y4 & ((1 << WARPEDMODEL_PREC_BITS) - 1);

      // Add in all the constant terms, including rounding and offset
      sx4 += alpha * (-4) + beta * (-4) + (1 << (WARPEDDIFF_PREC_BITS - 1)) +
             (WARPEDPIXEL_PREC_SHIFTS << WARPEDDIFF_PREC_BITS);
      sy4 += gamma * (-4) + delta * (-4) + (1 << (WARPEDDIFF_PREC_BITS - 1)) +
             (WARPEDPIXEL_PREC_SHIFTS << WARPEDDIFF_PREC_BITS);

      sx4 &= ~((1 << WARP_PARAM_REDUCE_BITS) - 1);
      sy4 &= ~((1 << WARP_PARAM_REDUCE_BITS) - 1);

      // Horizontal filter
      // If the block is aligned such that, after clamping, every sample
      // would be taken from the leftmost/rightmost column, then we can
      // skip the expensive horizontal filter.
      if (ix4 <= -7) {
        for (k = -7; k < AOMMIN(8, p_height - i); ++k) {
          int iy = iy4 + k;
          if (iy < 0)
            iy = 0;
          else if (iy > height - 1)
            iy = height - 1;
          tmp[k + 7] = _mm_set1_epi16(
              (1 << (bd + FILTER_BITS - reduce_bits_horiz - 1)) +
              ref[iy * stride] * (1 << (FILTER_BITS - reduce_bits_horiz)));
        }
      } else if (ix4 >= width + 6) {
        for (k = -7; k < AOMMIN(8, p_height - i); ++k) {
          int iy = iy4 + k;
          if (iy < 0)
            iy = 0;
          else if (iy > height - 1)
            iy = height - 1;
          tmp[k + 7] =
              _mm_set1_epi16((1 << (bd + FILTER_BITS - reduce_bits_horiz - 1)) +
                             ref[iy * stride + (width - 1)] *
                                 (1 << (FILTER_BITS - reduce_bits_horiz)));
        }
      } else if (((ix4 - 7) < 0) || ((ix4 + 9) > width)) {
        const int out_of_boundary_left = -(ix4 - 6);
        const int out_of_boundary_right = (ix4 + 8) - width;

        for (k = -7; k < AOMMIN(8, p_height - i); ++k) {
          int iy = iy4 + k;
          if (iy < 0)
            iy = 0;
          else if (iy > height - 1)
            iy = height - 1;
          int sx = sx4 + beta * (k + 4);

          // Load source pixels
          const __m128i src =
              _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7));
          const __m128i src2 =
              _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 + 1));

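          // warp_pad_left/right are byte-shuffle tables written for 8-bit
          // pixels. Split each 16-bit pixel into low-byte and high-byte
          // planes, pad both planes identically, then re-interleave.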
          const __m128i src_01 = _mm_shuffle_epi8(
              src, _mm_loadu_si128((__m128i *)warp_highbd_arrange_bytes));
          const __m128i src2_01 = _mm_shuffle_epi8(
              src2, _mm_loadu_si128((__m128i *)warp_highbd_arrange_bytes));

          __m128i src_lo = _mm_unpacklo_epi64(src_01, src2_01);
          __m128i src_hi = _mm_unpackhi_epi64(src_01, src2_01);

          if (out_of_boundary_left >= 0) {
            const __m128i shuffle_reg_left =
                _mm_loadu_si128((__m128i *)warp_pad_left[out_of_boundary_left]);
            src_lo = _mm_shuffle_epi8(src_lo, shuffle_reg_left);
            src_hi = _mm_shuffle_epi8(src_hi, shuffle_reg_left);
          }

          if (out_of_boundary_right >= 0) {
            const __m128i shuffle_reg_right = _mm_loadu_si128(
                (__m128i *)warp_pad_right[out_of_boundary_right]);
            src_lo = _mm_shuffle_epi8(src_lo, shuffle_reg_right);
            src_hi = _mm_shuffle_epi8(src_hi, shuffle_reg_right);
          }

          const __m128i src_padded = _mm_unpacklo_epi8(src_lo, src_hi);
          const __m128i src2_padded = _mm_unpackhi_epi8(src_lo, src_hi);

          highbd_horiz_filter(&src_padded, &src2_padded, tmp, sx, alpha, k,
                              offset_bits_horiz, reduce_bits_horiz);
        }
      } else {
        highbd_prepare_warp_horizontal_filter(
            ref, tmp, stride, ix4, iy4, sx4, alpha, beta, p_height, height, i,
            offset_bits_horiz, reduce_bits_horiz);
      }

      // Vertical filter
      for (k = -4; k < AOMMIN(4, p_height - i - 4); ++k) {
        int sy = sy4 + delta * (k + 4);

        // Load from tmp and rearrange pairs of consecutive rows into the
        // column order 0 0 2 2 4 4 6 6; 1 1 3 3 5 5 7 7
        const __m128i *src = tmp + (k + 4);
        const __m128i src_0 = _mm_unpacklo_epi16(src[0], src[1]);
        const __m128i src_2 = _mm_unpacklo_epi16(src[2], src[3]);
        const __m128i src_4 = _mm_unpacklo_epi16(src[4], src[5]);
        const __m128i src_6 = _mm_unpacklo_epi16(src[6], src[7]);

        // Filter even-index pixels
        const __m128i tmp_0 = _mm_loadu_si128(
            (__m128i *)(av1_warped_filter +
                        ((sy + 0 * gamma) >> WARPEDDIFF_PREC_BITS)));
        const __m128i tmp_2 = _mm_loadu_si128(
            (__m128i *)(av1_warped_filter +
                        ((sy + 2 * gamma) >> WARPEDDIFF_PREC_BITS)));
        const __m128i tmp_4 = _mm_loadu_si128(
            (__m128i *)(av1_warped_filter +
                        ((sy + 4 * gamma) >> WARPEDDIFF_PREC_BITS)));
        const __m128i tmp_6 = _mm_loadu_si128(
            (__m128i *)(av1_warped_filter +
                        ((sy + 6 * gamma) >> WARPEDDIFF_PREC_BITS)));

        const __m128i tmp_8 = _mm_unpacklo_epi32(tmp_0, tmp_2);
        const __m128i tmp_10 = _mm_unpacklo_epi32(tmp_4, tmp_6);
        const __m128i tmp_12 = _mm_unpackhi_epi32(tmp_0, tmp_2);
        const __m128i tmp_14 = _mm_unpackhi_epi32(tmp_4, tmp_6);

        const __m128i coeff_0 = _mm_unpacklo_epi64(tmp_8, tmp_10);
        const __m128i coeff_2 = _mm_unpackhi_epi64(tmp_8, tmp_10);
        const __m128i coeff_4 = _mm_unpacklo_epi64(tmp_12, tmp_14);
        const __m128i coeff_6 = _mm_unpackhi_epi64(tmp_12, tmp_14);

        const __m128i res_0 = _mm_madd_epi16(src_0, coeff_0);
        const __m128i res_2 = _mm_madd_epi16(src_2, coeff_2);
        const __m128i res_4 = _mm_madd_epi16(src_4, coeff_4);
        const __m128i res_6 = _mm_madd_epi16(src_6, coeff_6);

        const __m128i res_even = _mm_add_epi32(_mm_add_epi32(res_0, res_2),
                                               _mm_add_epi32(res_4, res_6));

        // Filter odd-index pixels
        const __m128i src_1 = _mm_unpackhi_epi16(src[0], src[1]);
        const __m128i src_3 = _mm_unpackhi_epi16(src[2], src[3]);
        const __m128i src_5 = _mm_unpackhi_epi16(src[4], src[5]);
        const __m128i src_7 = _mm_unpackhi_epi16(src[6], src[7]);

        const __m128i tmp_1 = _mm_loadu_si128(
            (__m128i *)(av1_warped_filter +
                        ((sy + 1 * gamma) >> WARPEDDIFF_PREC_BITS)));
        const __m128i tmp_3 = _mm_loadu_si128(
            (__m128i *)(av1_warped_filter +
                        ((sy + 3 * gamma) >> WARPEDDIFF_PREC_BITS)));
        const __m128i tmp_5 = _mm_loadu_si128(
            (__m128i *)(av1_warped_filter +
                        ((sy + 5 * gamma) >> WARPEDDIFF_PREC_BITS)));
        const __m128i tmp_7 = _mm_loadu_si128(
            (__m128i *)(av1_warped_filter +
                        ((sy + 7 * gamma) >> WARPEDDIFF_PREC_BITS)));

        const __m128i tmp_9 = _mm_unpacklo_epi32(tmp_1, tmp_3);
        const __m128i tmp_11 = _mm_unpacklo_epi32(tmp_5, tmp_7);
        const __m128i tmp_13 = _mm_unpackhi_epi32(tmp_1, tmp_3);
        const __m128i tmp_15 = _mm_unpackhi_epi32(tmp_5, tmp_7);

        const __m128i coeff_1 = _mm_unpacklo_epi64(tmp_9, tmp_11);
        const __m128i coeff_3 = _mm_unpackhi_epi64(tmp_9, tmp_11);
        const __m128i coeff_5 = _mm_unpacklo_epi64(tmp_13, tmp_15);
        const __m128i coeff_7 = _mm_unpackhi_epi64(tmp_13, tmp_15);

        const __m128i res_1 = _mm_madd_epi16(src_1, coeff_1);
        const __m128i res_3 = _mm_madd_epi16(src_3, coeff_3);
        const __m128i res_5 = _mm_madd_epi16(src_5, coeff_5);
        const __m128i res_7 = _mm_madd_epi16(src_7, coeff_7);

        const __m128i res_odd = _mm_add_epi32(_mm_add_epi32(res_1, res_3),
                                              _mm_add_epi32(res_5, res_7));

        // Rearrange pixels back into the order 0 ... 7
        __m128i res_lo = _mm_unpacklo_epi32(res_even, res_odd);
        __m128i res_hi = _mm_unpackhi_epi32(res_even, res_odd);

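        // Compound path: round by reduce_bits_vert and either store the
        // intermediates to conv_params->dst or average them (plain or
        // distance-weighted) with the values already there, writing final
        // clipped pixels to 'pred' when do_average is set.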
        if (conv_params->is_compound) {
          __m128i *const p =
              (__m128i *)&conv_params
                  ->dst[(i + k + 4) * conv_params->dst_stride + j];
          res_lo = _mm_add_epi32(res_lo, res_add_const);
          res_lo = _mm_sra_epi32(_mm_add_epi32(res_lo, reduce_bits_vert_const),
                                 reduce_bits_vert_shift);

          if (conv_params->do_average) {
            __m128i *const dst16 = (__m128i *)&pred[(i + k + 4) * p_stride + j];
            __m128i p_32 = _mm_cvtepu16_epi32(_mm_loadl_epi64(p));

            if (conv_params->use_dist_wtd_comp_avg) {
              res_lo = _mm_add_epi32(_mm_mullo_epi32(p_32, wt0),
                                     _mm_mullo_epi32(res_lo, wt1));
              res_lo = _mm_srai_epi32(res_lo, DIST_PRECISION_BITS);
            } else {
              res_lo = _mm_srai_epi32(_mm_add_epi32(p_32, res_lo), 1);
            }

            __m128i res32_lo = _mm_add_epi32(res_lo, res_sub_const);
            res32_lo = _mm_sra_epi32(_mm_add_epi32(res32_lo, round_bits_const),
                                     round_bits_shift);

            __m128i res16_lo = _mm_packus_epi32(res32_lo, res32_lo);
            res16_lo = _mm_min_epi16(res16_lo, clip_pixel);
            _mm_storel_epi64(dst16, res16_lo);
          } else {
            res_lo = _mm_packus_epi32(res_lo, res_lo);
            _mm_storel_epi64(p, res_lo);
          }
          if (p_width > 4) {
            __m128i *const p4 =
                (__m128i *)&conv_params
                    ->dst[(i + k + 4) * conv_params->dst_stride + j + 4];

            res_hi = _mm_add_epi32(res_hi, res_add_const);
            res_hi =
                _mm_sra_epi32(_mm_add_epi32(res_hi, reduce_bits_vert_const),
                              reduce_bits_vert_shift);
            if (conv_params->do_average) {
              __m128i *const dst16_4 =
                  (__m128i *)&pred[(i + k + 4) * p_stride + j + 4];
              __m128i p4_32 = _mm_cvtepu16_epi32(_mm_loadl_epi64(p4));

              if (conv_params->use_dist_wtd_comp_avg) {
                res_hi = _mm_add_epi32(_mm_mullo_epi32(p4_32, wt0),
                                       _mm_mullo_epi32(res_hi, wt1));
                res_hi = _mm_srai_epi32(res_hi, DIST_PRECISION_BITS);
              } else {
                res_hi = _mm_srai_epi32(_mm_add_epi32(p4_32, res_hi), 1);
              }

              __m128i res32_hi = _mm_add_epi32(res_hi, res_sub_const);
              res32_hi = _mm_sra_epi32(
                  _mm_add_epi32(res32_hi, round_bits_const), round_bits_shift);
              __m128i res16_hi = _mm_packus_epi32(res32_hi, res32_hi);
              res16_hi = _mm_min_epi16(res16_hi, clip_pixel);
              _mm_storel_epi64(dst16_4, res16_hi);
            } else {
              res_hi = _mm_packus_epi32(res_hi, res_hi);
              _mm_storel_epi64(p4, res_hi);
            }
          }
        } else {
          // Round and pack into 16 bits
          const __m128i round_const =
              _mm_set1_epi32(-(1 << (bd + reduce_bits_vert - 1)) +
                             ((1 << reduce_bits_vert) >> 1));

          const __m128i res_lo_round = _mm_srai_epi32(
              _mm_add_epi32(res_lo, round_const), reduce_bits_vert);
          const __m128i res_hi_round = _mm_srai_epi32(
              _mm_add_epi32(res_hi, round_const), reduce_bits_vert);

          __m128i res_16bit = _mm_packs_epi32(res_lo_round, res_hi_round);
          // Clamp res_16bit to the range [0, 2^bd - 1]
          const __m128i max_val = _mm_set1_epi16((1 << bd) - 1);
          const __m128i zero = _mm_setzero_si128();
          res_16bit = _mm_max_epi16(_mm_min_epi16(res_16bit, max_val), zero);

          // Store, blending with 'pred' if needed
          __m128i *const p = (__m128i *)&pred[(i + k + 4) * p_stride + j];

          // Note: If we're outputting a 4x4 block, we need to be very careful
          // to only output 4 pixels at this point, to avoid encode/decode
          // mismatches when encoding with multiple threads.
          if (p_width == 4) {
            _mm_storel_epi64(p, res_16bit);
          } else {
            _mm_storeu_si128(p, res_16bit);
          }
        }
      }
    }
  }
}