/*
 * Copyright (c) 2017, Alliance for Open Media. All rights reserved.
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */

#include <emmintrin.h>
#include <smmintrin.h>

#include "config/av1_rtcd.h"

#include "av1/common/warped_motion.h"

/* This is a modified version of 'av1_warped_filter' from warped_motion.c:
   * Each coefficient is stored in 8 bits instead of 16 bits
   * The coefficients are rearranged in the column order 0, 2, 4, 6, 1, 3, 5, 7

     This is done in order to avoid overflow: since the tap with the largest
     coefficient could be any of taps 2, 3, 4 or 5, we can't use the summation
     order ((0 + 1) + (4 + 5)) + ((2 + 3) + (6 + 7)) used in the regular
     convolve functions.

     Instead, we use the summation order
     ((0 + 2) + (4 + 6)) + ((1 + 3) + (5 + 7)).
     The coefficients are rearranged in this table so that they can be loaded
     into that order more quickly.
*/
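/* Illustrative sketch (not part of the build): a scalar model of one output
   pixel of the horizontal pass using the summation order above. The names
   'px' and 'f' are hypothetical; 'px' holds 8 unsigned source pixels and 'f'
   the 8 filter taps in natural order 0..7. Grouping the taps as
   (0,2), (4,6), (1,3), (5,7) pairs each large central tap (2..5) with a
   small one, so each pairwise sum fed to _mm_maddubs_epi16 in
   filter_src_pixels() stays inside int16 range.

   static int warp_hfilter_scalar_sketch(const uint8_t *px, const int8_t *f,
                                         int offset_bits_horiz,
                                         int reduce_bits_horiz) {
     const int s02 = px[0] * f[0] + px[2] * f[2];
     const int s46 = px[4] * f[4] + px[6] * f[6];
     const int s13 = px[1] * f[1] + px[3] * f[3];
     const int s57 = px[5] * f[5] + px[7] * f[7];
     const int round_const =
         (1 << offset_bits_horiz) + ((1 << reduce_bits_horiz) >> 1);
     return ((s02 + s46) + (s13 + s57) + round_const) >> reduce_bits_horiz;
   }
*/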
/* clang-format off */
DECLARE_ALIGNED(8, const int8_t,
                av1_filter_8bit[WARPEDPIXEL_PREC_SHIFTS * 3 + 1][8]) = {
  // [-1, 0)
  { 0, 127, 0, 0, 0, 1, 0, 0}, { 0, 127, 0, 0, -1, 2, 0, 0},
  { 1, 127, -1, 0, -3, 4, 0, 0}, { 1, 126, -2, 0, -4, 6, 1, 0},
  { 1, 126, -3, 0, -5, 8, 1, 0}, { 1, 125, -4, 0, -6, 11, 1, 0},
  { 1, 124, -4, 0, -7, 13, 1, 0}, { 2, 123, -5, 0, -8, 15, 1, 0},
  { 2, 122, -6, 0, -9, 18, 1, 0}, { 2, 121, -6, 0, -10, 20, 1, 0},
  { 2, 120, -7, 0, -11, 22, 2, 0}, { 2, 119, -8, 0, -12, 25, 2, 0},
  { 3, 117, -8, 0, -13, 27, 2, 0}, { 3, 116, -9, 0, -13, 29, 2, 0},
  { 3, 114, -10, 0, -14, 32, 3, 0}, { 3, 113, -10, 0, -15, 35, 2, 0},
  { 3, 111, -11, 0, -15, 37, 3, 0}, { 3, 109, -11, 0, -16, 40, 3, 0},
  { 3, 108, -12, 0, -16, 42, 3, 0}, { 4, 106, -13, 0, -17, 45, 3, 0},
  { 4, 104, -13, 0, -17, 47, 3, 0}, { 4, 102, -14, 0, -17, 50, 3, 0},
  { 4, 100, -14, 0, -17, 52, 3, 0}, { 4, 98, -15, 0, -18, 55, 4, 0},
  { 4, 96, -15, 0, -18, 58, 3, 0}, { 4, 94, -16, 0, -18, 60, 4, 0},
  { 4, 91, -16, 0, -18, 63, 4, 0}, { 4, 89, -16, 0, -18, 65, 4, 0},
  { 4, 87, -17, 0, -18, 68, 4, 0}, { 4, 85, -17, 0, -18, 70, 4, 0},
  { 4, 82, -17, 0, -18, 73, 4, 0}, { 4, 80, -17, 0, -18, 75, 4, 0},
  { 4, 78, -18, 0, -18, 78, 4, 0}, { 4, 75, -18, 0, -17, 80, 4, 0},
  { 4, 73, -18, 0, -17, 82, 4, 0}, { 4, 70, -18, 0, -17, 85, 4, 0},
  { 4, 68, -18, 0, -17, 87, 4, 0}, { 4, 65, -18, 0, -16, 89, 4, 0},
  { 4, 63, -18, 0, -16, 91, 4, 0}, { 4, 60, -18, 0, -16, 94, 4, 0},
  { 3, 58, -18, 0, -15, 96, 4, 0}, { 4, 55, -18, 0, -15, 98, 4, 0},
  { 3, 52, -17, 0, -14, 100, 4, 0}, { 3, 50, -17, 0, -14, 102, 4, 0},
  { 3, 47, -17, 0, -13, 104, 4, 0}, { 3, 45, -17, 0, -13, 106, 4, 0},
  { 3, 42, -16, 0, -12, 108, 3, 0}, { 3, 40, -16, 0, -11, 109, 3, 0},
  { 3, 37, -15, 0, -11, 111, 3, 0}, { 2, 35, -15, 0, -10, 113, 3, 0},
  { 3, 32, -14, 0, -10, 114, 3, 0}, { 2, 29, -13, 0, -9, 116, 3, 0},
  { 2, 27, -13, 0, -8, 117, 3, 0}, { 2, 25, -12, 0, -8, 119, 2, 0},
  { 2, 22, -11, 0, -7, 120, 2, 0}, { 1, 20, -10, 0, -6, 121, 2, 0},
  { 1, 18, -9, 0, -6, 122, 2, 0}, { 1, 15, -8, 0, -5, 123, 2, 0},
  { 1, 13, -7, 0, -4, 124, 1, 0}, { 1, 11, -6, 0, -4, 125, 1, 0},
  { 1, 8, -5, 0, -3, 126, 1, 0}, { 1, 6, -4, 0, -2, 126, 1, 0},
  { 0, 4, -3, 0, -1, 127, 1, 0}, { 0, 2, -1, 0, 0, 127, 0, 0},
  // [0, 1)
  { 0, 0, 1, 0, 0, 127, 0, 0}, { 0, -1, 2, 0, 0, 127, 0, 0},
  { 0, -3, 4, 1, 1, 127, -2, 0}, { 0, -5, 6, 1, 1, 127, -2, 0},
  { 0, -6, 8, 1, 2, 126, -3, 0}, {-1, -7, 11, 2, 2, 126, -4, -1},
  {-1, -8, 13, 2, 3, 125, -5, -1}, {-1, -10, 16, 3, 3, 124, -6, -1},
  {-1, -11, 18, 3, 4, 123, -7, -1}, {-1, -12, 20, 3, 4, 122, -7, -1},
  {-1, -13, 23, 3, 4, 121, -8, -1}, {-2, -14, 25, 4, 5, 120, -9, -1},
  {-1, -15, 27, 4, 5, 119, -10, -1}, {-1, -16, 30, 4, 5, 118, -11, -1},
  {-2, -17, 33, 5, 6, 116, -12, -1}, {-2, -17, 35, 5, 6, 114, -12, -1},
  {-2, -18, 38, 5, 6, 113, -13, -1}, {-2, -19, 41, 6, 7, 111, -14, -2},
  {-2, -19, 43, 6, 7, 110, -15, -2}, {-2, -20, 46, 6, 7, 108, -15, -2},
  {-2, -20, 49, 6, 7, 106, -16, -2}, {-2, -21, 51, 7, 7, 104, -16, -2},
  {-2, -21, 54, 7, 7, 102, -17, -2}, {-2, -21, 56, 7, 8, 100, -18, -2},
  {-2, -22, 59, 7, 8, 98, -18, -2}, {-2, -22, 62, 7, 8, 96, -19, -2},
  {-2, -22, 64, 7, 8, 94, -19, -2}, {-2, -22, 67, 8, 8, 91, -20, -2},
  {-2, -22, 69, 8, 8, 89, -20, -2}, {-2, -22, 72, 8, 8, 87, -21, -2},
  {-2, -21, 74, 8, 8, 84, -21, -2}, {-2, -22, 77, 8, 8, 82, -21, -2},
  {-2, -21, 79, 8, 8, 79, -21, -2}, {-2, -21, 82, 8, 8, 77, -22, -2},
  {-2, -21, 84, 8, 8, 74, -21, -2}, {-2, -21, 87, 8, 8, 72, -22, -2},
  {-2, -20, 89, 8, 8, 69, -22, -2}, {-2, -20, 91, 8, 8, 67, -22, -2},
  {-2, -19, 94, 8, 7, 64, -22, -2}, {-2, -19, 96, 8, 7, 62, -22, -2},
  {-2, -18, 98, 8, 7, 59, -22, -2}, {-2, -18, 100, 8, 7, 56, -21, -2},
  {-2, -17, 102, 7, 7, 54, -21, -2}, {-2, -16, 104, 7, 7, 51, -21, -2},
  {-2, -16, 106, 7, 6, 49, -20, -2}, {-2, -15, 108, 7, 6, 46, -20, -2},
  {-2, -15, 110, 7, 6, 43, -19, -2}, {-2, -14, 111, 7, 6, 41, -19, -2},
  {-1, -13, 113, 6, 5, 38, -18, -2}, {-1, -12, 114, 6, 5, 35, -17, -2},
  {-1, -12, 116, 6, 5, 33, -17, -2}, {-1, -11, 118, 5, 4, 30, -16, -1},
  {-1, -10, 119, 5, 4, 27, -15, -1}, {-1, -9, 120, 5, 4, 25, -14, -2},
  {-1, -8, 121, 4, 3, 23, -13, -1}, {-1, -7, 122, 4, 3, 20, -12, -1},
  {-1, -7, 123, 4, 3, 18, -11, -1}, {-1, -6, 124, 3, 3, 16, -10, -1},
  {-1, -5, 125, 3, 2, 13, -8, -1}, {-1, -4, 126, 2, 2, 11, -7, -1},
  { 0, -3, 126, 2, 1, 8, -6, 0}, { 0, -2, 127, 1, 1, 6, -5, 0},
  { 0, -2, 127, 1, 1, 4, -3, 0}, { 0, 0, 127, 0, 0, 2, -1, 0},
  // [1, 2)
  { 0, 0, 127, 0, 0, 1, 0, 0}, { 0, 0, 127, 0, 0, -1, 2, 0},
  { 0, 1, 127, -1, 0, -3, 4, 0}, { 0, 1, 126, -2, 0, -4, 6, 1},
  { 0, 1, 126, -3, 0, -5, 8, 1}, { 0, 1, 125, -4, 0, -6, 11, 1},
  { 0, 1, 124, -4, 0, -7, 13, 1}, { 0, 2, 123, -5, 0, -8, 15, 1},
  { 0, 2, 122, -6, 0, -9, 18, 1}, { 0, 2, 121, -6, 0, -10, 20, 1},
  { 0, 2, 120, -7, 0, -11, 22, 2}, { 0, 2, 119, -8, 0, -12, 25, 2},
  { 0, 3, 117, -8, 0, -13, 27, 2}, { 0, 3, 116, -9, 0, -13, 29, 2},
  { 0, 3, 114, -10, 0, -14, 32, 3}, { 0, 3, 113, -10, 0, -15, 35, 2},
  { 0, 3, 111, -11, 0, -15, 37, 3}, { 0, 3, 109, -11, 0, -16, 40, 3},
  { 0, 3, 108, -12, 0, -16, 42, 3}, { 0, 4, 106, -13, 0, -17, 45, 3},
  { 0, 4, 104, -13, 0, -17, 47, 3}, { 0, 4, 102, -14, 0, -17, 50, 3},
  { 0, 4, 100, -14, 0, -17, 52, 3}, { 0, 4, 98, -15, 0, -18, 55, 4},
  { 0, 4, 96, -15, 0, -18, 58, 3}, { 0, 4, 94, -16, 0, -18, 60, 4},
  { 0, 4, 91, -16, 0, -18, 63, 4}, { 0, 4, 89, -16, 0, -18, 65, 4},
  { 0, 4, 87, -17, 0, -18, 68, 4}, { 0, 4, 85, -17, 0, -18, 70, 4},
  { 0, 4, 82, -17, 0, -18, 73, 4}, { 0, 4, 80, -17, 0, -18, 75, 4},
  { 0, 4, 78, -18, 0, -18, 78, 4}, { 0, 4, 75, -18, 0, -17, 80, 4},
  { 0, 4, 73, -18, 0, -17, 82, 4}, { 0, 4, 70, -18, 0, -17, 85, 4},
  { 0, 4, 68, -18, 0, -17, 87, 4}, { 0, 4, 65, -18, 0, -16, 89, 4},
  { 0, 4, 63, -18, 0, -16, 91, 4}, { 0, 4, 60, -18, 0, -16, 94, 4},
  { 0, 3, 58, -18, 0, -15, 96, 4}, { 0, 4, 55, -18, 0, -15, 98, 4},
  { 0, 3, 52, -17, 0, -14, 100, 4}, { 0, 3, 50, -17, 0, -14, 102, 4},
  { 0, 3, 47, -17, 0, -13, 104, 4}, { 0, 3, 45, -17, 0, -13, 106, 4},
  { 0, 3, 42, -16, 0, -12, 108, 3}, { 0, 3, 40, -16, 0, -11, 109, 3},
  { 0, 3, 37, -15, 0, -11, 111, 3}, { 0, 2, 35, -15, 0, -10, 113, 3},
  { 0, 3, 32, -14, 0, -10, 114, 3}, { 0, 2, 29, -13, 0, -9, 116, 3},
  { 0, 2, 27, -13, 0, -8, 117, 3}, { 0, 2, 25, -12, 0, -8, 119, 2},
  { 0, 2, 22, -11, 0, -7, 120, 2}, { 0, 1, 20, -10, 0, -6, 121, 2},
  { 0, 1, 18, -9, 0, -6, 122, 2}, { 0, 1, 15, -8, 0, -5, 123, 2},
  { 0, 1, 13, -7, 0, -4, 124, 1}, { 0, 1, 11, -6, 0, -4, 125, 1},
  { 0, 1, 8, -5, 0, -3, 126, 1}, { 0, 1, 6, -4, 0, -2, 126, 1},
  { 0, 0, 4, -3, 0, -1, 127, 1}, { 0, 0, 2, -1, 0, 0, 127, 0},
  // dummy (replicate row index 191)
  { 0, 0, 2, -1, 0, 0, 127, 0},
};
/* clang-format on */

// Shuffle masks: we want to convert a sequence of bytes 0, 1, 2, ..., 15
// in an SSE register into two sequences:
// 0, 2, 2, 4, ..., 12, 12, 14, <don't care>
// 1, 3, 3, 5, ..., 13, 13, 15, <don't care>
DECLARE_ALIGNED(16, static const uint8_t,
                even_mask[16]) = { 0, 2, 2, 4, 4, 6, 6, 8,
                                   8, 10, 10, 12, 12, 14, 14, 0 };

DECLARE_ALIGNED(16, static const uint8_t,
                odd_mask[16]) = { 1, 3, 3, 5, 5, 7, 7, 9,
                                  9, 11, 11, 13, 13, 15, 15, 0 };
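
/* For example, applying 'even_mask' to the 16 source bytes loaded in the
   horizontal filter (columns ix4 - 7 .. ix4 + 8 of a reference row) gives
   0 2 2 4 4 6 6 8 8 10 10 12 12 14 14 x, and 'odd_mask' gives
   1 3 3 5 5 7 7 9 9 11 11 13 13 15 15 x. filter_src_pixels() never consumes
   a byte past index 14, so a single unaligned 16-byte load covers all eight
   8-tap output pixels of a row. */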

DECLARE_ALIGNED(16, static const uint8_t,
                shuffle_alpha0_mask01[16]) = { 0, 1, 0, 1, 0, 1, 0, 1,
                                               0, 1, 0, 1, 0, 1, 0, 1 };

DECLARE_ALIGNED(16, static const uint8_t,
                shuffle_alpha0_mask23[16]) = { 2, 3, 2, 3, 2, 3, 2, 3,
                                               2, 3, 2, 3, 2, 3, 2, 3 };

DECLARE_ALIGNED(16, static const uint8_t,
                shuffle_alpha0_mask45[16]) = { 4, 5, 4, 5, 4, 5, 4, 5,
                                               4, 5, 4, 5, 4, 5, 4, 5 };

DECLARE_ALIGNED(16, static const uint8_t,
                shuffle_alpha0_mask67[16]) = { 6, 7, 6, 7, 6, 7, 6, 7,
                                               6, 7, 6, 7, 6, 7, 6, 7 };

DECLARE_ALIGNED(16, static const uint8_t,
                shuffle_gamma0_mask0[16]) = { 0, 1, 2, 3, 0, 1, 2, 3,
                                              0, 1, 2, 3, 0, 1, 2, 3 };

DECLARE_ALIGNED(16, static const uint8_t,
                shuffle_gamma0_mask1[16]) = { 4, 5, 6, 7, 4, 5, 6, 7,
                                              4, 5, 6, 7, 4, 5, 6, 7 };

DECLARE_ALIGNED(16, static const uint8_t,
                shuffle_gamma0_mask2[16]) = { 8, 9, 10, 11, 8, 9, 10, 11,
                                              8, 9, 10, 11, 8, 9, 10, 11 };

DECLARE_ALIGNED(16, static const uint8_t,
                shuffle_gamma0_mask3[16]) = { 12, 13, 14, 15, 12, 13, 14, 15,
                                              12, 13, 14, 15, 12, 13, 14, 15 };

static inline void filter_src_pixels(__m128i src, __m128i *tmp, __m128i *coeff,
                                     const int offset_bits_horiz,
                                     const int reduce_bits_horiz, int k) {
  const __m128i src_even =
      _mm_shuffle_epi8(src, _mm_load_si128((__m128i *)even_mask));
  const __m128i src_odd =
      _mm_shuffle_epi8(src, _mm_load_si128((__m128i *)odd_mask));
  // The pixel order we need for 'src' is:
  // 0 2 2 4 4 6 6 8 1 3 3 5 5 7 7 9
  const __m128i src_02 = _mm_unpacklo_epi64(src_even, src_odd);
  const __m128i res_02 = _mm_maddubs_epi16(src_02, coeff[0]);
  // 4 6 6 8 8 10 10 12 5 7 7 9 9 11 11 13
  const __m128i src_46 = _mm_unpacklo_epi64(_mm_srli_si128(src_even, 4),
                                            _mm_srli_si128(src_odd, 4));
  const __m128i res_46 = _mm_maddubs_epi16(src_46, coeff[1]);
  // 1 3 3 5 5 7 7 9 2 4 4 6 6 8 8 10
  const __m128i src_13 =
      _mm_unpacklo_epi64(src_odd, _mm_srli_si128(src_even, 2));
  const __m128i res_13 = _mm_maddubs_epi16(src_13, coeff[2]);
  // 5 7 7 9 9 11 11 13 6 8 8 10 10 12 12 14
  const __m128i src_57 = _mm_unpacklo_epi64(_mm_srli_si128(src_odd, 4),
                                            _mm_srli_si128(src_even, 6));
  const __m128i res_57 = _mm_maddubs_epi16(src_57, coeff[3]);

  const __m128i round_const = _mm_set1_epi16((1 << offset_bits_horiz) +
                                             ((1 << reduce_bits_horiz) >> 1));

  // Note: The values res_02 + res_46 and res_13 + res_57 both
  // fit into int16s at this point, but their sum may be too wide to fit
  // into an int16. However, once we also add round_const, the sum of
  // all of these fits into a uint16.
  //
  // The wrapping behaviour of _mm_add_* is used here to make sure we
  // get the correct result despite converting between different
  // (implicit) types.
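  //
  // For example, with bd == 8, FILTER_BITS == 7 and round_0 == 3 (the
  // typical 8-bit settings, so offset_bits_horiz == 14): the positive taps
  // of any filter sum to at most about 175 and the negative taps to at
  // least about -47, so res_even + res_odd lies roughly in
  // [-47 * 255, 175 * 255] = [-11985, 44625], which overflows int16 on the
  // high side. Adding round_const = (1 << 14) + 4 moves this to roughly
  // [4403, 61013], which does fit in a uint16.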
  const __m128i res_even = _mm_add_epi16(res_02, res_46);
  const __m128i res_odd = _mm_add_epi16(res_13, res_57);
  const __m128i res =
      _mm_add_epi16(_mm_add_epi16(res_even, res_odd), round_const);
  tmp[k + 7] = _mm_srl_epi16(res, _mm_cvtsi32_si128(reduce_bits_horiz));
}

static inline void prepare_horizontal_filter_coeff(int alpha, int sx,
                                                   __m128i *coeff) {
  // Filter even-index pixels
  const __m128i tmp_0 = _mm_loadl_epi64(
      (__m128i *)&av1_filter_8bit[(sx + 0 * alpha) >> WARPEDDIFF_PREC_BITS]);
  const __m128i tmp_1 = _mm_loadl_epi64(
      (__m128i *)&av1_filter_8bit[(sx + 1 * alpha) >> WARPEDDIFF_PREC_BITS]);
  const __m128i tmp_2 = _mm_loadl_epi64(
      (__m128i *)&av1_filter_8bit[(sx + 2 * alpha) >> WARPEDDIFF_PREC_BITS]);
  const __m128i tmp_3 = _mm_loadl_epi64(
      (__m128i *)&av1_filter_8bit[(sx + 3 * alpha) >> WARPEDDIFF_PREC_BITS]);
  const __m128i tmp_4 = _mm_loadl_epi64(
      (__m128i *)&av1_filter_8bit[(sx + 4 * alpha) >> WARPEDDIFF_PREC_BITS]);
  const __m128i tmp_5 = _mm_loadl_epi64(
      (__m128i *)&av1_filter_8bit[(sx + 5 * alpha) >> WARPEDDIFF_PREC_BITS]);
  const __m128i tmp_6 = _mm_loadl_epi64(
      (__m128i *)&av1_filter_8bit[(sx + 6 * alpha) >> WARPEDDIFF_PREC_BITS]);
  const __m128i tmp_7 = _mm_loadl_epi64(
      (__m128i *)&av1_filter_8bit[(sx + 7 * alpha) >> WARPEDDIFF_PREC_BITS]);

  // Coeffs 0 2 0 2 4 6 4 6 1 3 1 3 5 7 5 7 for pixels 0 2
  const __m128i tmp_8 = _mm_unpacklo_epi16(tmp_0, tmp_2);
  // Coeffs 0 2 0 2 4 6 4 6 1 3 1 3 5 7 5 7 for pixels 1 3
  const __m128i tmp_9 = _mm_unpacklo_epi16(tmp_1, tmp_3);
  // Coeffs 0 2 0 2 4 6 4 6 1 3 1 3 5 7 5 7 for pixels 4 6
  const __m128i tmp_10 = _mm_unpacklo_epi16(tmp_4, tmp_6);
  // Coeffs 0 2 0 2 4 6 4 6 1 3 1 3 5 7 5 7 for pixels 5 7
  const __m128i tmp_11 = _mm_unpacklo_epi16(tmp_5, tmp_7);

  // Coeffs 0 2 0 2 0 2 0 2 4 6 4 6 4 6 4 6 for pixels 0 2 4 6
  const __m128i tmp_12 = _mm_unpacklo_epi32(tmp_8, tmp_10);
  // Coeffs 1 3 1 3 1 3 1 3 5 7 5 7 5 7 5 7 for pixels 0 2 4 6
  const __m128i tmp_13 = _mm_unpackhi_epi32(tmp_8, tmp_10);
  // Coeffs 0 2 0 2 0 2 0 2 4 6 4 6 4 6 4 6 for pixels 1 3 5 7
  const __m128i tmp_14 = _mm_unpacklo_epi32(tmp_9, tmp_11);
  // Coeffs 1 3 1 3 1 3 1 3 5 7 5 7 5 7 5 7 for pixels 1 3 5 7
  const __m128i tmp_15 = _mm_unpackhi_epi32(tmp_9, tmp_11);

  // Coeffs 0 2 for pixels 0 2 4 6 1 3 5 7
  coeff[0] = _mm_unpacklo_epi64(tmp_12, tmp_14);
  // Coeffs 4 6 for pixels 0 2 4 6 1 3 5 7
  coeff[1] = _mm_unpackhi_epi64(tmp_12, tmp_14);
  // Coeffs 1 3 for pixels 0 2 4 6 1 3 5 7
  coeff[2] = _mm_unpacklo_epi64(tmp_13, tmp_15);
  // Coeffs 5 7 for pixels 0 2 4 6 1 3 5 7
  coeff[3] = _mm_unpackhi_epi64(tmp_13, tmp_15);
}
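
/* Note: the coeff[] layout produced above matches the pixel ordering
   (0 2 4 6 1 3 5 7) built in filter_src_pixels(), so each
   _mm_maddubs_epi16 there multiplies a pixel pair by the taps of that
   pixel's own filter. */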

static inline void prepare_horizontal_filter_coeff_alpha0(int sx,
                                                          __m128i *coeff) {
  // Filter even-index pixels
  const __m128i tmp_0 =
      _mm_loadl_epi64((__m128i *)&av1_filter_8bit[sx >> WARPEDDIFF_PREC_BITS]);

  // Coeffs 0 2 for pixels 0 2 4 6 1 3 5 7
  coeff[0] =
      _mm_shuffle_epi8(tmp_0, _mm_load_si128((__m128i *)shuffle_alpha0_mask01));
  // Coeffs 4 6 for pixels 0 2 4 6 1 3 5 7
  coeff[1] =
      _mm_shuffle_epi8(tmp_0, _mm_load_si128((__m128i *)shuffle_alpha0_mask23));
  // Coeffs 1 3 for pixels 0 2 4 6 1 3 5 7
  coeff[2] =
      _mm_shuffle_epi8(tmp_0, _mm_load_si128((__m128i *)shuffle_alpha0_mask45));
  // Coeffs 5 7 for pixels 0 2 4 6 1 3 5 7
  coeff[3] =
      _mm_shuffle_epi8(tmp_0, _mm_load_si128((__m128i *)shuffle_alpha0_mask67));
}

static inline void horizontal_filter(__m128i src, __m128i *tmp, int sx,
                                     int alpha, int k,
                                     const int offset_bits_horiz,
                                     const int reduce_bits_horiz) {
  __m128i coeff[4];
  prepare_horizontal_filter_coeff(alpha, sx, coeff);
  filter_src_pixels(src, tmp, coeff, offset_bits_horiz, reduce_bits_horiz, k);
}

static inline void warp_horizontal_filter(const uint8_t *ref, __m128i *tmp,
                                          int stride, int32_t ix4, int32_t iy4,
                                          int32_t sx4, int alpha, int beta,
                                          int p_height, int height, int i,
                                          const int offset_bits_horiz,
                                          const int reduce_bits_horiz) {
  int k;
  for (k = -7; k < AOMMIN(8, p_height - i); ++k) {
    int iy = iy4 + k;
    if (iy < 0)
      iy = 0;
    else if (iy > height - 1)
      iy = height - 1;
    int sx = sx4 + beta * (k + 4);

    // Load source pixels
    const __m128i src =
        _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7));
    horizontal_filter(src, tmp, sx, alpha, k, offset_bits_horiz,
                      reduce_bits_horiz);
  }
}

static inline void warp_horizontal_filter_alpha0(
    const uint8_t *ref, __m128i *tmp, int stride, int32_t ix4, int32_t iy4,
    int32_t sx4, int alpha, int beta, int p_height, int height, int i,
    const int offset_bits_horiz, const int reduce_bits_horiz) {
  (void)alpha;
  int k;
  for (k = -7; k < AOMMIN(8, p_height - i); ++k) {
    int iy = iy4 + k;
    if (iy < 0)
      iy = 0;
    else if (iy > height - 1)
      iy = height - 1;
    int sx = sx4 + beta * (k + 4);

    // Load source pixels
    const __m128i src =
        _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7));

    __m128i coeff[4];
    prepare_horizontal_filter_coeff_alpha0(sx, coeff);
    filter_src_pixels(src, tmp, coeff, offset_bits_horiz, reduce_bits_horiz, k);
  }
}

static inline void warp_horizontal_filter_beta0(
    const uint8_t *ref, __m128i *tmp, int stride, int32_t ix4, int32_t iy4,
    int32_t sx4, int alpha, int beta, int p_height, int height, int i,
    const int offset_bits_horiz, const int reduce_bits_horiz) {
  (void)beta;
  int k;
  __m128i coeff[4];
  prepare_horizontal_filter_coeff(alpha, sx4, coeff);

  for (k = -7; k < AOMMIN(8, p_height - i); ++k) {
    int iy = iy4 + k;
    if (iy < 0)
      iy = 0;
    else if (iy > height - 1)
      iy = height - 1;

    // Load source pixels
    const __m128i src =
        _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7));
    filter_src_pixels(src, tmp, coeff, offset_bits_horiz, reduce_bits_horiz, k);
  }
}

static inline void warp_horizontal_filter_alpha0_beta0(
    const uint8_t *ref, __m128i *tmp, int stride, int32_t ix4, int32_t iy4,
    int32_t sx4, int alpha, int beta, int p_height, int height, int i,
    const int offset_bits_horiz, const int reduce_bits_horiz) {
  (void)beta;
  (void)alpha;
  int k;

  __m128i coeff[4];
  prepare_horizontal_filter_coeff_alpha0(sx4, coeff);

  for (k = -7; k < AOMMIN(8, p_height - i); ++k) {
    int iy = iy4 + k;
    if (iy < 0)
      iy = 0;
    else if (iy > height - 1)
      iy = height - 1;

    // Load source pixels
    const __m128i src =
        _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7));
    filter_src_pixels(src, tmp, coeff, offset_bits_horiz, reduce_bits_horiz, k);
  }
}

static inline void unpack_weights_and_set_round_const(
    ConvolveParams *conv_params, const int round_bits, const int offset_bits,
    __m128i *res_sub_const, __m128i *round_bits_const, __m128i *wt) {
  *res_sub_const =
      _mm_set1_epi16(-(1 << (offset_bits - conv_params->round_1)) -
                     (1 << (offset_bits - conv_params->round_1 - 1)));
  *round_bits_const = _mm_set1_epi16(((1 << round_bits) >> 1));

  const int w0 = conv_params->fwd_offset;
  const int w1 = conv_params->bck_offset;
  const __m128i wt0 = _mm_set1_epi16((int16_t)w0);
  const __m128i wt1 = _mm_set1_epi16((int16_t)w1);
  *wt = _mm_unpacklo_epi16(wt0, wt1);
}
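
/* These constants implement the tail end of compound averaging in
   store_vertical_filter_output(): the two offset predictions are combined as
   either (p * w0 + r * w1) >> DIST_PRECISION_BITS or (p + r) >> 1, then
   res_sub_const removes the compound offset carried by the inputs and
   round_bits_const supplies the rounding term for the final shift by
   round_bits back to pixel precision. */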

static inline void prepare_vertical_filter_coeffs(int gamma, int sy,
                                                  __m128i *coeffs) {
  const __m128i tmp_0 =
      _mm_loadu_si128((__m128i *)(av1_warped_filter +
                                  ((sy + 0 * gamma) >> WARPEDDIFF_PREC_BITS)));
  const __m128i tmp_2 =
      _mm_loadu_si128((__m128i *)(av1_warped_filter +
                                  ((sy + 2 * gamma) >> WARPEDDIFF_PREC_BITS)));
  const __m128i tmp_4 =
      _mm_loadu_si128((__m128i *)(av1_warped_filter +
                                  ((sy + 4 * gamma) >> WARPEDDIFF_PREC_BITS)));
  const __m128i tmp_6 =
      _mm_loadu_si128((__m128i *)(av1_warped_filter +
                                  ((sy + 6 * gamma) >> WARPEDDIFF_PREC_BITS)));

  const __m128i tmp_8 = _mm_unpacklo_epi32(tmp_0, tmp_2);
  const __m128i tmp_10 = _mm_unpacklo_epi32(tmp_4, tmp_6);
  const __m128i tmp_12 = _mm_unpackhi_epi32(tmp_0, tmp_2);
  const __m128i tmp_14 = _mm_unpackhi_epi32(tmp_4, tmp_6);

  // even coeffs
  coeffs[0] = _mm_unpacklo_epi64(tmp_8, tmp_10);
  coeffs[1] = _mm_unpackhi_epi64(tmp_8, tmp_10);
  coeffs[2] = _mm_unpacklo_epi64(tmp_12, tmp_14);
  coeffs[3] = _mm_unpackhi_epi64(tmp_12, tmp_14);

  const __m128i tmp_1 =
      _mm_loadu_si128((__m128i *)(av1_warped_filter +
                                  ((sy + 1 * gamma) >> WARPEDDIFF_PREC_BITS)));
  const __m128i tmp_3 =
      _mm_loadu_si128((__m128i *)(av1_warped_filter +
                                  ((sy + 3 * gamma) >> WARPEDDIFF_PREC_BITS)));
  const __m128i tmp_5 =
      _mm_loadu_si128((__m128i *)(av1_warped_filter +
                                  ((sy + 5 * gamma) >> WARPEDDIFF_PREC_BITS)));
  const __m128i tmp_7 =
      _mm_loadu_si128((__m128i *)(av1_warped_filter +
                                  ((sy + 7 * gamma) >> WARPEDDIFF_PREC_BITS)));

  const __m128i tmp_9 = _mm_unpacklo_epi32(tmp_1, tmp_3);
  const __m128i tmp_11 = _mm_unpacklo_epi32(tmp_5, tmp_7);
  const __m128i tmp_13 = _mm_unpackhi_epi32(tmp_1, tmp_3);
  const __m128i tmp_15 = _mm_unpackhi_epi32(tmp_5, tmp_7);

  // odd coeffs
  coeffs[4] = _mm_unpacklo_epi64(tmp_9, tmp_11);
  coeffs[5] = _mm_unpackhi_epi64(tmp_9, tmp_11);
  coeffs[6] = _mm_unpacklo_epi64(tmp_13, tmp_15);
  coeffs[7] = _mm_unpackhi_epi64(tmp_13, tmp_15);
}
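
/* The unpacks above transpose the eight loaded 16-bit filters so that
   coeffs[0..3] hold vertical tap pairs (0,1), (2,3), (4,5), (6,7) for the
   even-index output columns 0 2 4 6, and coeffs[4..7] hold the same tap
   pairs for the odd-index columns 1 3 5 7, matching the row interleaving
   done in filter_src_pixels_vertical(). */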

static inline void prepare_vertical_filter_coeffs_gamma0(int sy,
                                                         __m128i *coeffs) {
  const __m128i tmp_0 = _mm_loadu_si128(
      (__m128i *)(av1_warped_filter + (sy >> WARPEDDIFF_PREC_BITS)));

  // even coeffs
  coeffs[0] =
      _mm_shuffle_epi8(tmp_0, _mm_load_si128((__m128i *)shuffle_gamma0_mask0));
  coeffs[1] =
      _mm_shuffle_epi8(tmp_0, _mm_load_si128((__m128i *)shuffle_gamma0_mask1));
  coeffs[2] =
      _mm_shuffle_epi8(tmp_0, _mm_load_si128((__m128i *)shuffle_gamma0_mask2));
  coeffs[3] =
      _mm_shuffle_epi8(tmp_0, _mm_load_si128((__m128i *)shuffle_gamma0_mask3));

  // odd coeffs
  coeffs[4] = coeffs[0];
  coeffs[5] = coeffs[1];
  coeffs[6] = coeffs[2];
  coeffs[7] = coeffs[3];
}

static inline void filter_src_pixels_vertical(__m128i *tmp, __m128i *coeffs,
                                              __m128i *res_lo, __m128i *res_hi,
                                              int k) {
  // Load from tmp and rearrange pairs of consecutive rows into the
  // column order 0 0 2 2 4 4 6 6; 1 1 3 3 5 5 7 7
  const __m128i *src = tmp + (k + 4);
  const __m128i src_0 = _mm_unpacklo_epi16(src[0], src[1]);
  const __m128i src_2 = _mm_unpacklo_epi16(src[2], src[3]);
  const __m128i src_4 = _mm_unpacklo_epi16(src[4], src[5]);
  const __m128i src_6 = _mm_unpacklo_epi16(src[6], src[7]);

  const __m128i res_0 = _mm_madd_epi16(src_0, coeffs[0]);
  const __m128i res_2 = _mm_madd_epi16(src_2, coeffs[1]);
  const __m128i res_4 = _mm_madd_epi16(src_4, coeffs[2]);
  const __m128i res_6 = _mm_madd_epi16(src_6, coeffs[3]);

  const __m128i res_even =
      _mm_add_epi32(_mm_add_epi32(res_0, res_2), _mm_add_epi32(res_4, res_6));

  // Filter odd-index pixels
  const __m128i src_1 = _mm_unpackhi_epi16(src[0], src[1]);
  const __m128i src_3 = _mm_unpackhi_epi16(src[2], src[3]);
  const __m128i src_5 = _mm_unpackhi_epi16(src[4], src[5]);
  const __m128i src_7 = _mm_unpackhi_epi16(src[6], src[7]);

  const __m128i res_1 = _mm_madd_epi16(src_1, coeffs[4]);
  const __m128i res_3 = _mm_madd_epi16(src_3, coeffs[5]);
  const __m128i res_5 = _mm_madd_epi16(src_5, coeffs[6]);
  const __m128i res_7 = _mm_madd_epi16(src_7, coeffs[7]);

  const __m128i res_odd =
      _mm_add_epi32(_mm_add_epi32(res_1, res_3), _mm_add_epi32(res_5, res_7));

  // Rearrange pixels back into the order 0 ... 7
  *res_lo = _mm_unpacklo_epi32(res_even, res_odd);
  *res_hi = _mm_unpackhi_epi32(res_even, res_odd);
}

static inline void store_vertical_filter_output(
    __m128i *res_lo, __m128i *res_hi, const __m128i *res_add_const,
    const __m128i *wt, const __m128i *res_sub_const, __m128i *round_bits_const,
    uint8_t *pred, ConvolveParams *conv_params, int i, int j, int k,
    const int reduce_bits_vert, int p_stride, int p_width,
    const int round_bits) {
  __m128i res_lo_1 = *res_lo;
  __m128i res_hi_1 = *res_hi;

  if (conv_params->is_compound) {
    __m128i *const p =
        (__m128i *)&conv_params->dst[(i + k + 4) * conv_params->dst_stride + j];
    res_lo_1 = _mm_srai_epi32(_mm_add_epi32(res_lo_1, *res_add_const),
                              reduce_bits_vert);
    const __m128i temp_lo_16 = _mm_packus_epi32(res_lo_1, res_lo_1);
    __m128i res_lo_16;
    if (conv_params->do_average) {
      __m128i *const dst8 = (__m128i *)&pred[(i + k + 4) * p_stride + j];
      const __m128i p_16 = _mm_loadl_epi64(p);

      if (conv_params->use_dist_wtd_comp_avg) {
        const __m128i p_16_lo = _mm_unpacklo_epi16(p_16, temp_lo_16);
        const __m128i wt_res_lo = _mm_madd_epi16(p_16_lo, *wt);
        const __m128i shifted_32 =
            _mm_srai_epi32(wt_res_lo, DIST_PRECISION_BITS);
        res_lo_16 = _mm_packus_epi32(shifted_32, shifted_32);
      } else {
        res_lo_16 = _mm_srai_epi16(_mm_add_epi16(p_16, temp_lo_16), 1);
      }

      res_lo_16 = _mm_add_epi16(res_lo_16, *res_sub_const);

      res_lo_16 = _mm_srai_epi16(_mm_add_epi16(res_lo_16, *round_bits_const),
                                 round_bits);
      __m128i res_8_lo = _mm_packus_epi16(res_lo_16, res_lo_16);
      *(int *)dst8 = _mm_cvtsi128_si32(res_8_lo);
    } else {
      _mm_storel_epi64(p, temp_lo_16);
    }
    if (p_width > 4) {
      __m128i *const p4 =
          (__m128i *)&conv_params
              ->dst[(i + k + 4) * conv_params->dst_stride + j + 4];
      res_hi_1 = _mm_srai_epi32(_mm_add_epi32(res_hi_1, *res_add_const),
                                reduce_bits_vert);
      const __m128i temp_hi_16 = _mm_packus_epi32(res_hi_1, res_hi_1);
      __m128i res_hi_16;

      if (conv_params->do_average) {
        __m128i *const dst8_4 =
            (__m128i *)&pred[(i + k + 4) * p_stride + j + 4];
        const __m128i p4_16 = _mm_loadl_epi64(p4);

        if (conv_params->use_dist_wtd_comp_avg) {
          const __m128i p_16_hi = _mm_unpacklo_epi16(p4_16, temp_hi_16);
          const __m128i wt_res_hi = _mm_madd_epi16(p_16_hi, *wt);
          const __m128i shifted_32 =
              _mm_srai_epi32(wt_res_hi, DIST_PRECISION_BITS);
          res_hi_16 = _mm_packus_epi32(shifted_32, shifted_32);
        } else {
          res_hi_16 = _mm_srai_epi16(_mm_add_epi16(p4_16, temp_hi_16), 1);
        }
        res_hi_16 = _mm_add_epi16(res_hi_16, *res_sub_const);

        res_hi_16 = _mm_srai_epi16(_mm_add_epi16(res_hi_16, *round_bits_const),
                                   round_bits);
        __m128i res_8_hi = _mm_packus_epi16(res_hi_16, res_hi_16);
        *(int *)dst8_4 = _mm_cvtsi128_si32(res_8_hi);

      } else {
        _mm_storel_epi64(p4, temp_hi_16);
      }
    }
  } else {
    const __m128i res_lo_round = _mm_srai_epi32(
        _mm_add_epi32(res_lo_1, *res_add_const), reduce_bits_vert);
    const __m128i res_hi_round = _mm_srai_epi32(
        _mm_add_epi32(res_hi_1, *res_add_const), reduce_bits_vert);

    const __m128i res_16bit = _mm_packs_epi32(res_lo_round, res_hi_round);
    __m128i res_8bit = _mm_packus_epi16(res_16bit, res_16bit);

    // Store, blending with 'pred' if needed
    __m128i *const p = (__m128i *)&pred[(i + k + 4) * p_stride + j];

    // Note: If we're outputting a 4x4 block, we need to be very careful
    // to only output 4 pixels at this point, to avoid encode/decode
    // mismatches when encoding with multiple threads.
    if (p_width == 4) {
      *(int *)p = _mm_cvtsi128_si32(res_8bit);
    } else {
      _mm_storel_epi64(p, res_8bit);
    }
  }
}

static inline void warp_vertical_filter(
    uint8_t *pred, __m128i *tmp, ConvolveParams *conv_params, int16_t gamma,
    int16_t delta, int p_height, int p_stride, int p_width, int i, int j,
    int sy4, const int reduce_bits_vert, const __m128i *res_add_const,
    const int round_bits, const int offset_bits) {
  int k;
  __m128i res_sub_const, round_bits_const, wt;
  unpack_weights_and_set_round_const(conv_params, round_bits, offset_bits,
                                     &res_sub_const, &round_bits_const, &wt);
  // Vertical filter
  for (k = -4; k < AOMMIN(4, p_height - i - 4); ++k) {
    int sy = sy4 + delta * (k + 4);

    __m128i coeffs[8];
    prepare_vertical_filter_coeffs(gamma, sy, coeffs);

    __m128i res_lo;
    __m128i res_hi;
    filter_src_pixels_vertical(tmp, coeffs, &res_lo, &res_hi, k);

    store_vertical_filter_output(&res_lo, &res_hi, res_add_const, &wt,
                                 &res_sub_const, &round_bits_const, pred,
                                 conv_params, i, j, k, reduce_bits_vert,
                                 p_stride, p_width, round_bits);
  }
}

static inline void warp_vertical_filter_gamma0(
    uint8_t *pred, __m128i *tmp, ConvolveParams *conv_params, int16_t gamma,
    int16_t delta, int p_height, int p_stride, int p_width, int i, int j,
    int sy4, const int reduce_bits_vert, const __m128i *res_add_const,
    const int round_bits, const int offset_bits) {
  int k;
  (void)gamma;
  __m128i res_sub_const, round_bits_const, wt;
  unpack_weights_and_set_round_const(conv_params, round_bits, offset_bits,
                                     &res_sub_const, &round_bits_const, &wt);
  // Vertical filter
  for (k = -4; k < AOMMIN(4, p_height - i - 4); ++k) {
    int sy = sy4 + delta * (k + 4);

    __m128i coeffs[8];
    prepare_vertical_filter_coeffs_gamma0(sy, coeffs);

    __m128i res_lo;
    __m128i res_hi;
    filter_src_pixels_vertical(tmp, coeffs, &res_lo, &res_hi, k);

    store_vertical_filter_output(&res_lo, &res_hi, res_add_const, &wt,
                                 &res_sub_const, &round_bits_const, pred,
                                 conv_params, i, j, k, reduce_bits_vert,
                                 p_stride, p_width, round_bits);
  }
}

static inline void warp_vertical_filter_delta0(
    uint8_t *pred, __m128i *tmp, ConvolveParams *conv_params, int16_t gamma,
    int16_t delta, int p_height, int p_stride, int p_width, int i, int j,
    int sy4, const int reduce_bits_vert, const __m128i *res_add_const,
    const int round_bits, const int offset_bits) {
  (void)delta;
  int k;
  __m128i res_sub_const, round_bits_const, wt;
  unpack_weights_and_set_round_const(conv_params, round_bits, offset_bits,
                                     &res_sub_const, &round_bits_const, &wt);

  __m128i coeffs[8];
  prepare_vertical_filter_coeffs(gamma, sy4, coeffs);
  // Vertical filter
  for (k = -4; k < AOMMIN(4, p_height - i - 4); ++k) {
    __m128i res_lo;
    __m128i res_hi;
    filter_src_pixels_vertical(tmp, coeffs, &res_lo, &res_hi, k);

    store_vertical_filter_output(&res_lo, &res_hi, res_add_const, &wt,
                                 &res_sub_const, &round_bits_const, pred,
                                 conv_params, i, j, k, reduce_bits_vert,
                                 p_stride, p_width, round_bits);
  }
}

static inline void warp_vertical_filter_gamma0_delta0(
    uint8_t *pred, __m128i *tmp, ConvolveParams *conv_params, int16_t gamma,
    int16_t delta, int p_height, int p_stride, int p_width, int i, int j,
    int sy4, const int reduce_bits_vert, const __m128i *res_add_const,
    const int round_bits, const int offset_bits) {
  (void)delta;
  (void)gamma;
  int k;
  __m128i res_sub_const, round_bits_const, wt;
  unpack_weights_and_set_round_const(conv_params, round_bits, offset_bits,
                                     &res_sub_const, &round_bits_const, &wt);

  __m128i coeffs[8];
  prepare_vertical_filter_coeffs_gamma0(sy4, coeffs);
  // Vertical filter
  for (k = -4; k < AOMMIN(4, p_height - i - 4); ++k) {
    __m128i res_lo;
    __m128i res_hi;
    filter_src_pixels_vertical(tmp, coeffs, &res_lo, &res_hi, k);

    store_vertical_filter_output(&res_lo, &res_hi, res_add_const, &wt,
                                 &res_sub_const, &round_bits_const, pred,
                                 conv_params, i, j, k, reduce_bits_vert,
                                 p_stride, p_width, round_bits);
  }
}

static inline void prepare_warp_vertical_filter(
    uint8_t *pred, __m128i *tmp, ConvolveParams *conv_params, int16_t gamma,
    int16_t delta, int p_height, int p_stride, int p_width, int i, int j,
    int sy4, const int reduce_bits_vert, const __m128i *res_add_const,
    const int round_bits, const int offset_bits) {
  if (gamma == 0 && delta == 0)
    warp_vertical_filter_gamma0_delta0(
        pred, tmp, conv_params, gamma, delta, p_height, p_stride, p_width, i, j,
        sy4, reduce_bits_vert, res_add_const, round_bits, offset_bits);
  else if (gamma == 0 && delta != 0)
    warp_vertical_filter_gamma0(pred, tmp, conv_params, gamma, delta, p_height,
                                p_stride, p_width, i, j, sy4, reduce_bits_vert,
                                res_add_const, round_bits, offset_bits);
  else if (gamma != 0 && delta == 0)
    warp_vertical_filter_delta0(pred, tmp, conv_params, gamma, delta, p_height,
                                p_stride, p_width, i, j, sy4, reduce_bits_vert,
                                res_add_const, round_bits, offset_bits);
  else
    warp_vertical_filter(pred, tmp, conv_params, gamma, delta, p_height,
                         p_stride, p_width, i, j, sy4, reduce_bits_vert,
                         res_add_const, round_bits, offset_bits);
}

static inline void prepare_warp_horizontal_filter(
    const uint8_t *ref, __m128i *tmp, int stride, int32_t ix4, int32_t iy4,
    int32_t sx4, int alpha, int beta, int p_height, int height, int i,
    const int offset_bits_horiz, const int reduce_bits_horiz) {
  if (alpha == 0 && beta == 0)
    warp_horizontal_filter_alpha0_beta0(ref, tmp, stride, ix4, iy4, sx4, alpha,
                                        beta, p_height, height, i,
                                        offset_bits_horiz, reduce_bits_horiz);
  else if (alpha == 0 && beta != 0)
    warp_horizontal_filter_alpha0(ref, tmp, stride, ix4, iy4, sx4, alpha, beta,
                                  p_height, height, i, offset_bits_horiz,
                                  reduce_bits_horiz);
  else if (alpha != 0 && beta == 0)
    warp_horizontal_filter_beta0(ref, tmp, stride, ix4, iy4, sx4, alpha, beta,
                                 p_height, height, i, offset_bits_horiz,
                                 reduce_bits_horiz);
  else
    warp_horizontal_filter(ref, tmp, stride, ix4, iy4, sx4, alpha, beta,
                           p_height, height, i, offset_bits_horiz,
                           reduce_bits_horiz);
}

void av1_warp_affine_sse4_1(const int32_t *mat, const uint8_t *ref, int width,
                            int height, int stride, uint8_t *pred, int p_col,
                            int p_row, int p_width, int p_height, int p_stride,
                            int subsampling_x, int subsampling_y,
                            ConvolveParams *conv_params, int16_t alpha,
                            int16_t beta, int16_t gamma, int16_t delta) {
  __m128i tmp[15];
  int i, j, k;
  const int bd = 8;
  const int reduce_bits_horiz = conv_params->round_0;
  const int reduce_bits_vert = conv_params->is_compound
                                   ? conv_params->round_1
                                   : 2 * FILTER_BITS - reduce_bits_horiz;
  const int offset_bits_horiz = bd + FILTER_BITS - 1;
  assert(IMPLIES(conv_params->is_compound, conv_params->dst != NULL));

  const int offset_bits_vert = bd + 2 * FILTER_BITS - reduce_bits_horiz;
  const __m128i reduce_bits_vert_const =
      _mm_set1_epi32(((1 << reduce_bits_vert) >> 1));
  const __m128i res_add_const = _mm_set1_epi32(1 << offset_bits_vert);
  const int round_bits =
      2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
  const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
  assert(IMPLIES(conv_params->do_average, conv_params->is_compound));

  /* Note: For this code to work, the left/right frame borders need to be
     extended by at least 13 pixels each. By the time we get here, other
     code will have set up this border, but we allow an explicit check
     for debugging purposes.
  */
  /*for (i = 0; i < height; ++i) {
    for (j = 0; j < 13; ++j) {
      assert(ref[i * stride - 13 + j] == ref[i * stride]);
      assert(ref[i * stride + width + j] == ref[i * stride + (width - 1)]);
    }
  }*/
  __m128i res_add_const_1;
  if (conv_params->is_compound == 1) {
    res_add_const_1 = _mm_add_epi32(reduce_bits_vert_const, res_add_const);
  } else {
    res_add_const_1 = _mm_set1_epi32(-(1 << (bd + reduce_bits_vert - 1)) +
                                     ((1 << reduce_bits_vert) >> 1));
  }
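  /* Roughly speaking: in the compound path the vertical rounding term is
     folded together with the (1 << offset_bits_vert) offset that keeps the
     intermediate dst values non-negative. In the non-compound path,
     -(1 << (bd + reduce_bits_vert - 1)) cancels the offset accumulated by the
     horizontal and vertical passes (the horizontal offset gets scaled by the
     vertical filter gain of 1 << FILTER_BITS), and
     ((1 << reduce_bits_vert) >> 1) is the usual rounding term for the final
     shift by reduce_bits_vert. */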

  for (i = 0; i < p_height; i += 8) {
    for (j = 0; j < p_width; j += 8) {
      const int32_t src_x = (p_col + j + 4) << subsampling_x;
      const int32_t src_y = (p_row + i + 4) << subsampling_y;
      const int64_t dst_x =
          (int64_t)mat[2] * src_x + (int64_t)mat[3] * src_y + (int64_t)mat[0];
      const int64_t dst_y =
          (int64_t)mat[4] * src_x + (int64_t)mat[5] * src_y + (int64_t)mat[1];
      const int64_t x4 = dst_x >> subsampling_x;
      const int64_t y4 = dst_y >> subsampling_y;

      int32_t ix4 = (int32_t)(x4 >> WARPEDMODEL_PREC_BITS);
      int32_t sx4 = x4 & ((1 << WARPEDMODEL_PREC_BITS) - 1);
      int32_t iy4 = (int32_t)(y4 >> WARPEDMODEL_PREC_BITS);
      int32_t sy4 = y4 & ((1 << WARPEDMODEL_PREC_BITS) - 1);

      // Add in all the constant terms, including rounding and offset
      sx4 += alpha * (-4) + beta * (-4) + (1 << (WARPEDDIFF_PREC_BITS - 1)) +
             (WARPEDPIXEL_PREC_SHIFTS << WARPEDDIFF_PREC_BITS);
      sy4 += gamma * (-4) + delta * (-4) + (1 << (WARPEDDIFF_PREC_BITS - 1)) +
             (WARPEDPIXEL_PREC_SHIFTS << WARPEDDIFF_PREC_BITS);

      sx4 &= ~((1 << WARP_PARAM_REDUCE_BITS) - 1);
      sy4 &= ~((1 << WARP_PARAM_REDUCE_BITS) - 1);

      // Horizontal filter
      // If the block is aligned such that, after clamping, every sample
      // would be taken from the leftmost/rightmost column, then we can
      // skip the expensive horizontal filter.
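      // With an 8-tap filter the horizontal pass samples columns
      // ix4 - 7 .. ix4 + 7, so if ix4 <= -7 every sample clamps to column 0
      // and if ix4 >= width + 6 every sample clamps to column width - 1. The
      // filtered rows are then constants: the edge pixel scaled by the filter
      // gain of 128, plus the usual horizontal offset, which is what the two
      // branches below compute directly.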
      if (ix4 <= -7) {
        for (k = -7; k < AOMMIN(8, p_height - i); ++k) {
          int iy = iy4 + k;
          if (iy < 0)
            iy = 0;
          else if (iy > height - 1)
            iy = height - 1;
          tmp[k + 7] = _mm_set1_epi16(
              (1 << (bd + FILTER_BITS - reduce_bits_horiz - 1)) +
              ref[iy * stride] * (1 << (FILTER_BITS - reduce_bits_horiz)));
        }
      } else if (ix4 >= width + 6) {
        for (k = -7; k < AOMMIN(8, p_height - i); ++k) {
          int iy = iy4 + k;
          if (iy < 0)
            iy = 0;
          else if (iy > height - 1)
            iy = height - 1;
          tmp[k + 7] =
              _mm_set1_epi16((1 << (bd + FILTER_BITS - reduce_bits_horiz - 1)) +
                             ref[iy * stride + (width - 1)] *
                                 (1 << (FILTER_BITS - reduce_bits_horiz)));
        }
      } else if (((ix4 - 7) < 0) || ((ix4 + 9) > width)) {
        const int out_of_boundary_left = -(ix4 - 6);
        const int out_of_boundary_right = (ix4 + 8) - width;
        for (k = -7; k < AOMMIN(8, p_height - i); ++k) {
          int iy = iy4 + k;
          if (iy < 0)
            iy = 0;
          else if (iy > height - 1)
            iy = height - 1;
          int sx = sx4 + beta * (k + 4);

          // Load source pixels
          __m128i src =
              _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7));
          if (out_of_boundary_left >= 0) {
            const __m128i shuffle_reg_left =
                _mm_loadu_si128((__m128i *)warp_pad_left[out_of_boundary_left]);
            src = _mm_shuffle_epi8(src, shuffle_reg_left);
          }
          if (out_of_boundary_right >= 0) {
            const __m128i shuffle_reg_right = _mm_loadu_si128(
                (__m128i *)warp_pad_right[out_of_boundary_right]);
            src = _mm_shuffle_epi8(src, shuffle_reg_right);
          }
          horizontal_filter(src, tmp, sx, alpha, k, offset_bits_horiz,
                            reduce_bits_horiz);
        }
      } else {
        prepare_warp_horizontal_filter(ref, tmp, stride, ix4, iy4, sx4, alpha,
                                       beta, p_height, height, i,
                                       offset_bits_horiz, reduce_bits_horiz);
      }

      // Vertical filter
      prepare_warp_vertical_filter(
          pred, tmp, conv_params, gamma, delta, p_height, p_stride, p_width, i,
          j, sy4, reduce_bits_vert, &res_add_const_1, round_bits, offset_bits);
    }
  }
}