/*
 * Copyright (c) 2020, Alliance for Open Media. All rights reserved.
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */
#include <assert.h>
#include <immintrin.h>

#include "config/av1_rtcd.h"

#include "av1/common/warped_motion.h"

av1_highbd_warp_affine_avx2(const int32_t * mat,const uint16_t * ref,int width,int height,int stride,uint16_t * pred,int p_col,int p_row,int p_width,int p_height,int p_stride,int subsampling_x,int subsampling_y,int bd,ConvolveParams * conv_params,int16_t alpha,int16_t beta,int16_t gamma,int16_t delta)17 void av1_highbd_warp_affine_avx2(const int32_t *mat, const uint16_t *ref,
18 int width, int height, int stride,
19 uint16_t *pred, int p_col, int p_row,
20 int p_width, int p_height, int p_stride,
21 int subsampling_x, int subsampling_y, int bd,
22 ConvolveParams *conv_params, int16_t alpha,
23 int16_t beta, int16_t gamma, int16_t delta) {
24 __m256i tmp[15];
25 const int reduce_bits_horiz = conv_params->round_0;
26 const int reduce_bits_vert = conv_params->is_compound
27 ? conv_params->round_1
28 : 2 * FILTER_BITS - reduce_bits_horiz;
29 const int max_bits_horiz = bd + FILTER_BITS + 1 - reduce_bits_horiz;
30 const int offset_bits_horiz = bd + FILTER_BITS - 1;
31 const int offset_bits_vert = bd + 2 * FILTER_BITS - reduce_bits_horiz;
32 const int round_bits =
33 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
34 const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
35 (void)max_bits_horiz;
36 assert(IMPLIES(conv_params->is_compound, conv_params->dst != NULL));
37
38 // Check that, even with 12-bit input, the intermediate values will fit
39 // into an unsigned 16-bit intermediate array.
40 assert(bd + FILTER_BITS + 2 - conv_params->round_0 <= 16);
41
42 const __m256i clip_pixel =
43 _mm256_set1_epi16(bd == 10 ? 1023 : (bd == 12 ? 4095 : 255));
44 const __m128i reduce_bits_vert_shift = _mm_cvtsi32_si128(reduce_bits_vert);
45 const __m256i reduce_bits_vert_const =
46 _mm256_set1_epi32(((1 << reduce_bits_vert) >> 1));
47 const __m256i res_add_const = _mm256_set1_epi32(1 << offset_bits_vert);
48 const __m256i res_sub_const =
49 _mm256_set1_epi32(-(1 << (offset_bits - conv_params->round_1)) -
50 (1 << (offset_bits - conv_params->round_1 - 1)));
51 __m128i round_bits_shift = _mm_cvtsi32_si128(round_bits);
52 __m256i round_bits_const = _mm256_set1_epi32(((1 << round_bits) >> 1));
53
54 const int w0 = conv_params->fwd_offset;
55 const int w1 = conv_params->bck_offset;
56 const __m256i wt0 = _mm256_set1_epi32(w0);
57 const __m256i wt1 = _mm256_set1_epi32(w1);
58
59 __m256i v_rbhoriz = _mm256_set1_epi32(1 << (reduce_bits_horiz - 1));
60 __m256i v_zeros = _mm256_setzero_si256();
61 int ohoriz = 1 << offset_bits_horiz;
62 int mhoriz = 1 << max_bits_horiz;
63 (void)mhoriz;
64 int sx;
65
66 for (int i = 0; i < p_height; i += 8) {
67 for (int j = 0; j < p_width; j += 8) {
68 // Calculate the center of this 8x8 block,
69 // project to luma coordinates (if in a subsampled chroma plane),
70 // apply the affine transformation,
71 // then convert back to the original coordinates (if necessary)
72 const int32_t src_x = (p_col + j + 4) << subsampling_x;
73 const int32_t src_y = (p_row + i + 4) << subsampling_y;
74 const int64_t dst_x =
75 (int64_t)mat[2] * src_x + (int64_t)mat[3] * src_y + (int64_t)mat[0];
76 const int64_t dst_y =
77 (int64_t)mat[4] * src_x + (int64_t)mat[5] * src_y + (int64_t)mat[1];
78 const int64_t x4 = dst_x >> subsampling_x;
79 const int64_t y4 = dst_y >> subsampling_y;
80
81 const int16_t ix4 = (int32_t)(x4 >> WARPEDMODEL_PREC_BITS);
82 int32_t sx4 = x4 & ((1 << WARPEDMODEL_PREC_BITS) - 1);
83 const int16_t iy4 = (int32_t)(y4 >> WARPEDMODEL_PREC_BITS);
84 int32_t sy4 = y4 & ((1 << WARPEDMODEL_PREC_BITS) - 1);
85
86 sx4 += alpha * (-4) + beta * (-4) + (1 << (WARPEDDIFF_PREC_BITS - 1)) +
87 (WARPEDPIXEL_PREC_SHIFTS << WARPEDDIFF_PREC_BITS);
88 sy4 += gamma * (-4) + delta * (-4) + (1 << (WARPEDDIFF_PREC_BITS - 1)) +
89 (WARPEDPIXEL_PREC_SHIFTS << WARPEDDIFF_PREC_BITS);
90
91 sx4 &= ~((1 << WARP_PARAM_REDUCE_BITS) - 1);
92 sy4 &= ~((1 << WARP_PARAM_REDUCE_BITS) - 1);
93
94 // Horizontal filter
95 if (ix4 <= -7) {
96 for (int k = -7; k < AOMMIN(8, p_height - i); ++k) {
97 int iy = iy4 + k;
98 if (iy < 0)
99 iy = 0;
100 else if (iy > height - 1)
101 iy = height - 1;
102 tmp[k + 7] = _mm256_cvtepi16_epi32(_mm_set1_epi16(
103 (1 << (bd + FILTER_BITS - reduce_bits_horiz - 1)) +
104 ref[iy * stride] * (1 << (FILTER_BITS - reduce_bits_horiz))));
105 }
106 } else if (ix4 >= width + 6) {
107 for (int k = -7; k < AOMMIN(8, p_height - i); ++k) {
108 int iy = iy4 + k;
109 if (iy < 0)
110 iy = 0;
111 else if (iy > height - 1)
112 iy = height - 1;
113 tmp[k + 7] = _mm256_cvtepi16_epi32(
114 _mm_set1_epi16((1 << (bd + FILTER_BITS - reduce_bits_horiz - 1)) +
115 ref[iy * stride + (width - 1)] *
116 (1 << (FILTER_BITS - reduce_bits_horiz))));
117 }
118 } else if (((ix4 - 7) < 0) || ((ix4 + 9) > width)) {
119 int32_t tmp1[8];
120 for (int k = -7; k < AOMMIN(8, p_height - i); ++k) {
121 const int iy = clamp(iy4 + k, 0, height - 1);
122
123 sx = sx4 + beta * (k + 4);
124 for (int l = -4; l < 4; ++l) {
125 int ix = ix4 + l - 3;
126 const int offs = sx >> WARPEDDIFF_PREC_BITS;
127 const int16_t *coeffs = av1_warped_filter[offs];
128
129 int32_t sum = 1 << offset_bits_horiz;
130 for (int m = 0; m < 8; ++m) {
131 const int sample_x = clamp(ix + m, 0, width - 1);
132 sum += ref[iy * stride + sample_x] * coeffs[m];
133 }
134 sum = ROUND_POWER_OF_TWO(sum, reduce_bits_horiz);
135 tmp1[(l + 4) / 2 + ((l + 4) % 2) * 4] = sum;
136 sx += alpha;
137 }
138 tmp[k + 7] = _mm256_loadu_si256((__m256i *)tmp1);
139 }
140 } else {
141 if (beta == 0 && alpha == 0) {
142 sx = sx4;
143 __m128i v_01 = _mm_loadu_si128(
144 (__m128i *)
145 av1_warped_filter[sx >>
146 WARPEDDIFF_PREC_BITS]); // A7A6A5A4A3A2A1A0
147 __m256i v_c01 = _mm256_broadcastd_epi32(v_01); // A1A0A1A0A1A0A1A0
148 __m256i v_c23 = _mm256_broadcastd_epi32(
149 _mm_shuffle_epi32(v_01, 1)); // A3A2A3A2A3A2A3A2
150 __m256i v_c45 = _mm256_broadcastd_epi32(
151 _mm_shuffle_epi32(v_01, 2)); // A5A4A5A4A5A4A5A4
152 __m256i v_c67 = _mm256_broadcastd_epi32(
153 _mm_shuffle_epi32(v_01, 3)); // A7A6A7A6A7A6A7A6
154 for (int k = -7; k < AOMMIN(8, p_height - i); ++k) {
155 int iy = iy4 + k;
156 if (iy < 0)
157 iy = 0;
158 else if (iy > height - 1)
159 iy = height - 1;
160 iy = iy * stride;
161
162 __m256i v_refl = _mm256_inserti128_si256(
163 _mm256_setzero_si256(),
164 _mm_loadu_si128((__m128i *)&ref[iy + ix4 - 7]), 0);
165 v_refl = _mm256_inserti128_si256(
166 v_refl, _mm_loadu_si128((__m128i *)&ref[iy + ix4 + 1]),
167 1); // R15 .. R0
168
169 __m256i v_ref = _mm256_permute4x64_epi64(v_refl, 0xEE);
170
171 __m256i v_refu =
172 _mm256_alignr_epi8(v_ref, v_refl, 2); // R8R15R14...R2R1
173 v_refl = _mm256_inserti128_si256(
174 v_refl, _mm256_extracti128_si256(v_refu, 0), 1);
175 v_refu = _mm256_inserti128_si256(
176 v_refu, _mm256_extracti128_si256(v_ref, 0), 0);
177
178 __m256i v_sum = _mm256_set1_epi32(ohoriz);
179 __m256i parsum = _mm256_madd_epi16(
180 v_c01, _mm256_alignr_epi8(v_refu, v_refl,
181 0)); // R8R7R6..R1R7R6R5..R1R0
182 __m256i v_sum1 = _mm256_add_epi32(v_sum, parsum);
183
184 parsum = _mm256_madd_epi16(
185 v_c23,
186 _mm256_alignr_epi8(v_refu, v_refl, 4)); // R10R9..R3R9R8..R3R2
187 __m256i v_sum2 = _mm256_add_epi32(v_sum1, parsum);
188 parsum = _mm256_madd_epi16(
189 v_c45, _mm256_alignr_epi8(v_refu, v_refl,
190 8)); // R12R11..R5R11R10..R5R4
191 __m256i v_sum3 = _mm256_add_epi32(v_sum2, parsum);
192 parsum = _mm256_madd_epi16(
193 v_c67, _mm256_alignr_epi8(v_refu, v_refl,
194 12)); // R14R13..R7R13R12..R7R6
195 __m256i v_sum4 = _mm256_add_epi32(v_sum3, parsum);
196
197 tmp[k + 7] = _mm256_srai_epi32(_mm256_add_epi32(v_sum4, v_rbhoriz),
198 reduce_bits_horiz);
199 }
200 } else if (alpha == 0) {
201 for (int k = -7; k < AOMMIN(8, p_height - i); ++k) {
202 int iy = iy4 + k;
203 if (iy < 0)
204 iy = 0;
205 else if (iy > height - 1)
206 iy = height - 1;
207 iy = iy * stride;
208
209 sx = sx4 + beta * (k + 4);
210
211 __m128i v_01 = _mm_loadu_si128(
212 (__m128i *)av1_warped_filter
213 [sx >> WARPEDDIFF_PREC_BITS]); // A7A6A5A4A3A2A1A0
214 __m256i v_c01 = _mm256_broadcastd_epi32(v_01); // A1A0A1A0A1A0A1A0
215 __m256i v_c23 = _mm256_broadcastd_epi32(
216 _mm_shuffle_epi32(v_01, 1)); // A3A2A3A2A3A2A3A2
217 __m256i v_c45 = _mm256_broadcastd_epi32(
218 _mm_shuffle_epi32(v_01, 2)); // A5A4A5A4A5A4A5A4
219 __m256i v_c67 = _mm256_broadcastd_epi32(
220 _mm_shuffle_epi32(v_01, 3)); // A7A6A7A6A7A6A7A6
221
222 __m256i v_refl = _mm256_inserti128_si256(
223 _mm256_setzero_si256(),
224 _mm_loadu_si128((__m128i *)&ref[iy + ix4 - 7]), 0);
225 v_refl = _mm256_inserti128_si256(
226 v_refl, _mm_loadu_si128((__m128i *)&ref[iy + ix4 + 1]),
227 1); // R15 .. R0
228
229 __m256i v_ref = _mm256_permute4x64_epi64(v_refl, 0xEE);
230
231 __m256i v_refu =
232 _mm256_alignr_epi8(v_ref, v_refl, 2); // R8R15R14...R2R1
233
234 v_refl = _mm256_inserti128_si256(
235 v_refl, _mm256_extracti128_si256(v_refu, 0), 1);
236 v_refu = _mm256_inserti128_si256(
237 v_refu, _mm256_extracti128_si256(v_ref, 0), 0);
238
239 __m256i v_sum = _mm256_set1_epi32(ohoriz);
240 __m256i parsum =
241 _mm256_madd_epi16(v_c01, _mm256_alignr_epi8(v_refu, v_refl, 0));
242 __m256i v_sum1 = _mm256_add_epi32(v_sum, parsum);
243
244 parsum =
245 _mm256_madd_epi16(v_c23, _mm256_alignr_epi8(v_refu, v_refl, 4));
246 __m256i v_sum2 = _mm256_add_epi32(v_sum1, parsum);
247 parsum =
248 _mm256_madd_epi16(v_c45, _mm256_alignr_epi8(v_refu, v_refl, 8));
249 __m256i v_sum3 = _mm256_add_epi32(v_sum2, parsum);
250 parsum = _mm256_madd_epi16(v_c67,
251 _mm256_alignr_epi8(v_refu, v_refl, 12));
252 __m256i v_sum4 = _mm256_add_epi32(v_sum3, parsum);
253
254 tmp[k + 7] = _mm256_srai_epi32(_mm256_add_epi32(v_sum4, v_rbhoriz),
255 reduce_bits_horiz);
256 }
257 } else if (beta == 0) {
258 sx = sx4;
259 __m256i v_coeff01 = _mm256_inserti128_si256(
260 v_zeros,
261 _mm_loadu_si128(
262 (__m128i *)av1_warped_filter[(sx) >> WARPEDDIFF_PREC_BITS]),
263 0);
264 v_coeff01 = _mm256_inserti128_si256(
265 v_coeff01,
266 _mm_loadu_si128(
267 (__m128i *)
268 av1_warped_filter[(sx + alpha) >> WARPEDDIFF_PREC_BITS]),
269 1); // B7B6..B1B0A7A6..A1A0
270 __m256i v_coeff23 = _mm256_inserti128_si256(
271 v_zeros,
272 _mm_loadu_si128(
273 (__m128i *)av1_warped_filter[(sx + 2 * alpha) >>
274 WARPEDDIFF_PREC_BITS]),
275 0);
276 v_coeff23 = _mm256_inserti128_si256(
277 v_coeff23,
278 _mm_loadu_si128(
279 (__m128i *)av1_warped_filter[(sx + 3 * alpha) >>
280 WARPEDDIFF_PREC_BITS]),
281 1); // D7D6..D1D0C7C6..C1C0
282 __m256i v_coeff45 = _mm256_inserti128_si256(
283 v_zeros,
284 _mm_loadu_si128(
285 (__m128i *)av1_warped_filter[(sx + 4 * alpha) >>
286 WARPEDDIFF_PREC_BITS]),
287 0);
288 v_coeff45 = _mm256_inserti128_si256(
289 v_coeff45,
290 _mm_loadu_si128(
291 (__m128i *)av1_warped_filter[(sx + 5 * alpha) >>
292 WARPEDDIFF_PREC_BITS]),
293 1); // F7F6..F1F0E7E6..E1E0
294 __m256i v_coeff67 = _mm256_inserti128_si256(
295 v_zeros,
296 _mm_loadu_si128(
297 (__m128i *)av1_warped_filter[(sx + 6 * alpha) >>
298 WARPEDDIFF_PREC_BITS]),
299 0);
300 v_coeff67 = _mm256_inserti128_si256(
301 v_coeff67,
302 _mm_loadu_si128(
303 (__m128i *)av1_warped_filter[(sx + 7 * alpha) >>
304 WARPEDDIFF_PREC_BITS]),
305 1); // H7H6..H1H0G7G6..G1G0
306
307 __m256i v_c0123 = _mm256_unpacklo_epi32(
308 v_coeff01,
309 v_coeff23); // D3D2B3B2D1D0B1B0C3C2A3A2C1C0A1A0
310 __m256i v_c0123u = _mm256_unpackhi_epi32(
311 v_coeff01,
312 v_coeff23); // D7D6B7B6D5D4B5B4C7C6A7A6C5C4A5A4
313 __m256i v_c4567 = _mm256_unpacklo_epi32(
314 v_coeff45,
315 v_coeff67); // H3H2F3F2H1H0F1F0G3G2E3E2G1G0E1E0
316 __m256i v_c4567u = _mm256_unpackhi_epi32(
317 v_coeff45,
318 v_coeff67); // H7H6F7F6H5H4F5F4G7G6E7E6G5G4E5E4
319
320 __m256i v_c01 = _mm256_unpacklo_epi64(
321 v_c0123, v_c4567); // H1H0F1F0D1D0B1B0G1G0E1E0C1C0A1A0
322 __m256i v_c23 =
323 _mm256_unpackhi_epi64(v_c0123, v_c4567); // H3H2 ... A3A2
324 __m256i v_c45 =
325 _mm256_unpacklo_epi64(v_c0123u, v_c4567u); // H5H4 ... A5A4
326 __m256i v_c67 =
327 _mm256_unpackhi_epi64(v_c0123u, v_c4567u); // H7H6 ... A7A6
328
329 for (int k = -7; k < AOMMIN(8, p_height - i); ++k) {
330 int iy = iy4 + k;
331 if (iy < 0)
332 iy = 0;
333 else if (iy > height - 1)
334 iy = height - 1;
335 iy = iy * stride;
336
337 __m256i v_refl = _mm256_inserti128_si256(
338 _mm256_setzero_si256(),
339 _mm_loadu_si128((__m128i *)&ref[iy + ix4 - 7]), 0);
340 v_refl = _mm256_inserti128_si256(
341 v_refl, _mm_loadu_si128((__m128i *)&ref[iy + ix4 + 1]),
342 1); // R15 .. R0
343
344 __m256i v_ref = _mm256_permute4x64_epi64(v_refl, 0xEE);
345
346 __m256i v_refu =
347 _mm256_alignr_epi8(v_ref, v_refl, 2); // R8R15R14...R2R1
348
349 v_refl = _mm256_inserti128_si256(
350 v_refl, _mm256_extracti128_si256(v_refu, 0), 1);
351 v_refu = _mm256_inserti128_si256(
352 v_refu, _mm256_extracti128_si256(v_ref, 0), 0);
353
354 __m256i v_sum = _mm256_set1_epi32(ohoriz);
355 __m256i parsum = _mm256_madd_epi16(
356 v_c01, _mm256_alignr_epi8(v_refu, v_refl,
357 0)); // R8R7R6..R1R7R6R5..R1R0
358 __m256i v_sum1 = _mm256_add_epi32(v_sum, parsum);
359
360 parsum = _mm256_madd_epi16(
361 v_c23,
362 _mm256_alignr_epi8(v_refu, v_refl, 4)); // R10R9..R3R9R8..R3R2
363 __m256i v_sum2 = _mm256_add_epi32(v_sum1, parsum);
364 parsum = _mm256_madd_epi16(
365 v_c45, _mm256_alignr_epi8(v_refu, v_refl,
366 8)); // R12R11..R5R11R10..R5R4
367 __m256i v_sum3 = _mm256_add_epi32(v_sum2, parsum);
368 parsum = _mm256_madd_epi16(
369 v_c67, _mm256_alignr_epi8(v_refu, v_refl,
370 12)); // R14R13..R7R13R12..R7R6
371 __m256i v_sum4 = _mm256_add_epi32(v_sum3, parsum);
372
373 tmp[k + 7] = _mm256_srai_epi32(_mm256_add_epi32(v_sum4, v_rbhoriz),
374 reduce_bits_horiz);
375 }
376
377 } else {
378 for (int k = -7; k < AOMMIN(8, p_height - i); ++k) {
379 int iy = iy4 + k;
380 if (iy < 0)
381 iy = 0;
382 else if (iy > height - 1)
383 iy = height - 1;
384 iy = iy * stride;
385
386 sx = sx4 + beta * (k + 4);
387
388 __m256i v_coeff01 = _mm256_inserti128_si256(
389 v_zeros,
390 _mm_loadu_si128(
391 (__m128i *)av1_warped_filter[(sx) >> WARPEDDIFF_PREC_BITS]),
392 0);
393 v_coeff01 = _mm256_inserti128_si256(
394 v_coeff01,
395 _mm_loadu_si128(
396 (__m128i *)av1_warped_filter[(sx + alpha) >>
397 WARPEDDIFF_PREC_BITS]),
398 1); // B7B6..B1B0A7A6..A1A0
399 __m256i v_coeff23 = _mm256_inserti128_si256(
400 v_zeros,
401 _mm_loadu_si128(
402 (__m128i *)av1_warped_filter[(sx + 2 * alpha) >>
403 WARPEDDIFF_PREC_BITS]),
404 0);
405 v_coeff23 = _mm256_inserti128_si256(
406 v_coeff23,
407 _mm_loadu_si128(
408 (__m128i *)av1_warped_filter[(sx + 3 * alpha) >>
409 WARPEDDIFF_PREC_BITS]),
410 1); // D7D6..D1D0C7C6..C1C0
411 __m256i v_coeff45 = _mm256_inserti128_si256(
412 v_zeros,
413 _mm_loadu_si128(
414 (__m128i *)av1_warped_filter[(sx + 4 * alpha) >>
415 WARPEDDIFF_PREC_BITS]),
416 0);
417 v_coeff45 = _mm256_inserti128_si256(
418 v_coeff45,
419 _mm_loadu_si128(
420 (__m128i *)av1_warped_filter[(sx + 5 * alpha) >>
421 WARPEDDIFF_PREC_BITS]),
422 1); // F7F6..F1F0E7E6..E1E0
423 __m256i v_coeff67 = _mm256_inserti128_si256(
424 v_zeros,
425 _mm_loadu_si128(
426 (__m128i *)av1_warped_filter[(sx + 6 * alpha) >>
427 WARPEDDIFF_PREC_BITS]),
428 0);
429 v_coeff67 = _mm256_inserti128_si256(
430 v_coeff67,
431 _mm_loadu_si128(
432 (__m128i *)av1_warped_filter[(sx + 7 * alpha) >>
433 WARPEDDIFF_PREC_BITS]),
434 1); // H7H6..H1H0G7G6..G1G0
435
436 __m256i v_c0123 = _mm256_unpacklo_epi32(
437 v_coeff01,
438 v_coeff23); // D3D2B3B2D1D0B1B0C3C2A3A2C1C0A1A0
439 __m256i v_c0123u = _mm256_unpackhi_epi32(
440 v_coeff01,
441 v_coeff23); // D7D6B7B6D5D4B5B4C7C6A7A6C5C4A5A4
442 __m256i v_c4567 = _mm256_unpacklo_epi32(
443 v_coeff45,
444 v_coeff67); // H3H2F3F2H1H0F1F0G3G2E3E2G1G0E1E0
445 __m256i v_c4567u = _mm256_unpackhi_epi32(
446 v_coeff45,
447 v_coeff67); // H7H6F7F6H5H4F5F4G7G6E7E6G5G4E5E4
448
449 __m256i v_c01 = _mm256_unpacklo_epi64(
450 v_c0123, v_c4567); // H1H0F1F0D1D0B1B0G1G0E1E0C1C0A1A0
451 __m256i v_c23 =
452 _mm256_unpackhi_epi64(v_c0123, v_c4567); // H3H2 ... A3A2
453 __m256i v_c45 =
454 _mm256_unpacklo_epi64(v_c0123u, v_c4567u); // H5H4 ... A5A4
455 __m256i v_c67 =
456 _mm256_unpackhi_epi64(v_c0123u, v_c4567u); // H7H6 ... A7A6
457
458 __m256i v_refl = _mm256_inserti128_si256(
459 _mm256_setzero_si256(),
460 _mm_loadu_si128((__m128i *)&ref[iy + ix4 - 7]), 0);
461 v_refl = _mm256_inserti128_si256(
462 v_refl, _mm_loadu_si128((__m128i *)&ref[iy + ix4 + 1]),
463 1); // R15 .. R0
464
465 __m256i v_ref = _mm256_permute4x64_epi64(v_refl, 0xEE);
466
467 __m256i v_refu =
468 _mm256_alignr_epi8(v_ref, v_refl, 2); // R8R15R14...R2R1
469
470 v_refl = _mm256_inserti128_si256(
471 v_refl, _mm256_extracti128_si256(v_refu, 0), 1);
472 v_refu = _mm256_inserti128_si256(
473 v_refu, _mm256_extracti128_si256(v_ref, 0), 0);
474
475 __m256i v_sum = _mm256_set1_epi32(ohoriz);
476 __m256i parsum =
477 _mm256_madd_epi16(v_c01, _mm256_alignr_epi8(v_refu, v_refl, 0));
478 __m256i v_sum1 = _mm256_add_epi32(v_sum, parsum);
479
480 parsum =
481 _mm256_madd_epi16(v_c23, _mm256_alignr_epi8(v_refu, v_refl, 4));
482 __m256i v_sum2 = _mm256_add_epi32(v_sum1, parsum);
483 parsum =
484 _mm256_madd_epi16(v_c45, _mm256_alignr_epi8(v_refu, v_refl, 8));
485 __m256i v_sum3 = _mm256_add_epi32(v_sum2, parsum);
486 parsum = _mm256_madd_epi16(v_c67,
487 _mm256_alignr_epi8(v_refu, v_refl, 12));
488 __m256i v_sum4 = _mm256_add_epi32(v_sum3, parsum);
489
490 tmp[k + 7] = _mm256_srai_epi32(_mm256_add_epi32(v_sum4, v_rbhoriz),
491 reduce_bits_horiz);
492 }
493 }
494 }
495
496 // Vertical filter
497 for (int k = -4; k < AOMMIN(4, p_height - i - 4); ++k) {
498 int sy = sy4 + delta * (k + 4);
499 const __m256i *src = tmp + (k + 4);
500
501 __m256i v_coeff01 = _mm256_inserti128_si256(
502 v_zeros,
503 _mm_loadu_si128(
504 (__m128i *)av1_warped_filter[(sy) >> WARPEDDIFF_PREC_BITS]),
505 0);
506 v_coeff01 = _mm256_inserti128_si256(
507 v_coeff01,
508 _mm_loadu_si128(
509 (__m128i *)
510 av1_warped_filter[(sy + gamma) >> WARPEDDIFF_PREC_BITS]),
511 1);
512 __m256i v_coeff23 = _mm256_inserti128_si256(
513 v_zeros,
514 _mm_loadu_si128((__m128i *)av1_warped_filter[(sy + 2 * gamma) >>
515 WARPEDDIFF_PREC_BITS]),
516 0);
517 v_coeff23 = _mm256_inserti128_si256(
518 v_coeff23,
519 _mm_loadu_si128((__m128i *)av1_warped_filter[(sy + 3 * gamma) >>
520 WARPEDDIFF_PREC_BITS]),
521 1);
522 __m256i v_coeff45 = _mm256_inserti128_si256(
523 v_zeros,
524 _mm_loadu_si128((__m128i *)av1_warped_filter[(sy + 4 * gamma) >>
525 WARPEDDIFF_PREC_BITS]),
526 0);
527 v_coeff45 = _mm256_inserti128_si256(
528 v_coeff45,
529 _mm_loadu_si128((__m128i *)av1_warped_filter[(sy + 5 * gamma) >>
530 WARPEDDIFF_PREC_BITS]),
531 1);
532 __m256i v_coeff67 = _mm256_inserti128_si256(
533 v_zeros,
534 _mm_loadu_si128((__m128i *)av1_warped_filter[(sy + 6 * gamma) >>
535 WARPEDDIFF_PREC_BITS]),
536 0);
537 v_coeff67 = _mm256_inserti128_si256(
538 v_coeff67,
539 _mm_loadu_si128((__m128i *)av1_warped_filter[(sy + 7 * gamma) >>
540 WARPEDDIFF_PREC_BITS]),
541 1);
542
543 __m256i v_c0123 = _mm256_unpacklo_epi32(
544 v_coeff01,
545 v_coeff23); // D3D2B3B2D1D0B1B0C3C2A3A2C1C0A1A0
546 __m256i v_c0123u = _mm256_unpackhi_epi32(
547 v_coeff01,
548 v_coeff23); // D7D6B7B6D5D4B5B4C7C6A7A6C5C4A5A4
549 __m256i v_c4567 = _mm256_unpacklo_epi32(
550 v_coeff45,
551 v_coeff67); // H3H2F3F2H1H0F1F0G3G2E3E2G1G0E1E0
552 __m256i v_c4567u = _mm256_unpackhi_epi32(
553 v_coeff45,
554 v_coeff67); // H7H6F7F6H5H4F5F4G7G6E7E6G5G4E5E4
555
556 __m256i v_c01 = _mm256_unpacklo_epi64(
557 v_c0123, v_c4567); // H1H0F1F0D1D0B1B0G1G0E1E0C1C0A1A0
558 __m256i v_c23 =
559 _mm256_unpackhi_epi64(v_c0123, v_c4567); // H3H2 ... A3A2
560 __m256i v_c45 =
561 _mm256_unpacklo_epi64(v_c0123u, v_c4567u); // H5H4 ... A5A4
562 __m256i v_c67 =
563 _mm256_unpackhi_epi64(v_c0123u, v_c4567u); // H7H6 ... A7A6
564
565 __m256i v_src01l =
566 _mm256_unpacklo_epi32(src[0], src[1]); // T13T03T11T01T12T02T10T00
567 __m256i v_src01u =
568 _mm256_unpackhi_epi32(src[0], src[1]); // T17T07T15T05T16T06T14T04
569 __m256i v_sum =
570 _mm256_madd_epi16(_mm256_packus_epi32(v_src01l, v_src01u),
571 v_c01); // S7S5S3S1S6S4S2S0
572
573 __m256i v_src23l = _mm256_unpacklo_epi32(src[2], src[3]);
574 __m256i v_src23u = _mm256_unpackhi_epi32(src[2], src[3]);
575 v_sum = _mm256_add_epi32(
576 v_sum,
577 _mm256_madd_epi16(_mm256_packus_epi32(v_src23l, v_src23u), v_c23));
578
579 __m256i v_src45l = _mm256_unpacklo_epi32(src[4], src[5]);
580 __m256i v_src45u = _mm256_unpackhi_epi32(src[4], src[5]);
581 v_sum = _mm256_add_epi32(
582 v_sum,
583 _mm256_madd_epi16(_mm256_packus_epi32(v_src45l, v_src45u), v_c45));
584
585 __m256i v_src67l = _mm256_unpacklo_epi32(src[6], src[7]);
586 __m256i v_src67u = _mm256_unpackhi_epi32(src[6], src[7]);
587 v_sum = _mm256_add_epi32(
588 v_sum,
589 _mm256_madd_epi16(_mm256_packus_epi32(v_src67l, v_src67u), v_c67));
590
591 // unpack S7S5S3S1S6S4S2S0 to S7S6S5S4S3S2S1S0
592
593 __m256i v_suml =
594 _mm256_permute4x64_epi64(v_sum, 0xD8); // S7S5S6S4S3S1S2S0
595 __m256i v_sumh =
596 _mm256_permute4x64_epi64(v_sum, 0x32); // S2S0S7S5S2S0S3S1
597 v_sum = _mm256_unpacklo_epi32(v_suml, v_sumh); // S7S6S5S4S3S2S1S0
598
599 if (conv_params->is_compound) {
600 __m128i *const p =
601 (__m128i *)&conv_params
602 ->dst[(i + k + 4) * conv_params->dst_stride + j];
603
604 v_sum = _mm256_add_epi32(v_sum, res_add_const);
605 v_sum =
606 _mm256_sra_epi32(_mm256_add_epi32(v_sum, reduce_bits_vert_const),
607 reduce_bits_vert_shift);
608 if (conv_params->do_average) {
609 __m128i *const dst16 = (__m128i *)&pred[(i + k + 4) * p_stride + j];
610 __m256i p_32 = _mm256_cvtepu16_epi32(_mm_loadu_si128(p));
611
612 if (conv_params->use_dist_wtd_comp_avg) {
613 v_sum = _mm256_add_epi32(_mm256_mullo_epi32(p_32, wt0),
614 _mm256_mullo_epi32(v_sum, wt1));
615 v_sum = _mm256_srai_epi32(v_sum, DIST_PRECISION_BITS);
616 } else {
617 v_sum = _mm256_srai_epi32(_mm256_add_epi32(p_32, v_sum), 1);
618 }
619
620 __m256i v_sum1 = _mm256_add_epi32(v_sum, res_sub_const);
621 v_sum1 = _mm256_sra_epi32(
622 _mm256_add_epi32(v_sum1, round_bits_const), round_bits_shift);
623
624 __m256i v_sum16 = _mm256_packus_epi32(v_sum1, v_sum1);
625 v_sum16 = _mm256_permute4x64_epi64(v_sum16, 0xD8);
626 v_sum16 = _mm256_min_epi16(v_sum16, clip_pixel);
627 _mm_storeu_si128(dst16, _mm256_extracti128_si256(v_sum16, 0));
628 } else {
629 v_sum = _mm256_packus_epi32(v_sum, v_sum);
630 __m256i v_sum16 = _mm256_permute4x64_epi64(v_sum, 0xD8);
631 _mm_storeu_si128(p, _mm256_extracti128_si256(v_sum16, 0));
632 }
633 } else {
634 // Round and pack into 8 bits
635 const __m256i round_const =
636 _mm256_set1_epi32(-(1 << (bd + reduce_bits_vert - 1)) +
637 ((1 << reduce_bits_vert) >> 1));
638
639 __m256i v_sum1 = _mm256_srai_epi32(
640 _mm256_add_epi32(v_sum, round_const), reduce_bits_vert);
641
642 v_sum1 = _mm256_packus_epi32(v_sum1, v_sum1);
643 __m256i v_sum16 = _mm256_permute4x64_epi64(v_sum1, 0xD8);
644 // Clamp res_16bit to the range [0, 2^bd - 1]
645 const __m256i max_val = _mm256_set1_epi16((1 << bd) - 1);
646 const __m256i zero = _mm256_setzero_si256();
647 v_sum16 = _mm256_max_epi16(_mm256_min_epi16(v_sum16, max_val), zero);
648
649 __m128i *const p = (__m128i *)&pred[(i + k + 4) * p_stride + j];
650
651 _mm_storeu_si128(p, _mm256_extracti128_si256(v_sum16, 0));
652 }
653 }
654 }
655 }
656 }
657