/*
 * Copyright (c) 2017, Alliance for Open Media. All rights reserved.
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */

#include <emmintrin.h>
#include <smmintrin.h>

#include "config/av1_rtcd.h"

#include "av1/common/warped_motion.h"

/* This is a modified version of 'av1_warped_filter' from warped_motion.c:
   * Each coefficient is stored in 8 bits instead of 16 bits
   * The coefficients are rearranged in the column order 0, 2, 4, 6, 1, 3, 5, 7

     This is done in order to avoid overflow: since the tap with the largest
     coefficient could be any of taps 2, 3, 4 or 5, we can't use the summation
     order ((0 + 1) + (4 + 5)) + ((2 + 3) + (6 + 7)) used in the regular
     convolve functions.

     Instead, we use the summation order
     ((0 + 2) + (4 + 6)) + ((1 + 3) + (5 + 7)).
     The coefficients are rearranged in this table so that we can get them
     into the correct order more quickly.
*/
/* clang-format off */
DECLARE_ALIGNED(8, const int8_t,
                av1_filter_8bit[WARPEDPIXEL_PREC_SHIFTS * 3 + 1][8]) = {
  // [-1, 0)
  { 0, 127,   0, 0,   0,   1, 0, 0}, { 0, 127,   0, 0,  -1,   2, 0, 0},
  { 1, 127,  -1, 0,  -3,   4, 0, 0}, { 1, 126,  -2, 0,  -4,   6, 1, 0},
  { 1, 126,  -3, 0,  -5,   8, 1, 0}, { 1, 125,  -4, 0,  -6,  11, 1, 0},
  { 1, 124,  -4, 0,  -7,  13, 1, 0}, { 2, 123,  -5, 0,  -8,  15, 1, 0},
  { 2, 122,  -6, 0,  -9,  18, 1, 0}, { 2, 121,  -6, 0, -10,  20, 1, 0},
  { 2, 120,  -7, 0, -11,  22, 2, 0}, { 2, 119,  -8, 0, -12,  25, 2, 0},
  { 3, 117,  -8, 0, -13,  27, 2, 0}, { 3, 116,  -9, 0, -13,  29, 2, 0},
  { 3, 114, -10, 0, -14,  32, 3, 0}, { 3, 113, -10, 0, -15,  35, 2, 0},
  { 3, 111, -11, 0, -15,  37, 3, 0}, { 3, 109, -11, 0, -16,  40, 3, 0},
  { 3, 108, -12, 0, -16,  42, 3, 0}, { 4, 106, -13, 0, -17,  45, 3, 0},
  { 4, 104, -13, 0, -17,  47, 3, 0}, { 4, 102, -14, 0, -17,  50, 3, 0},
  { 4, 100, -14, 0, -17,  52, 3, 0}, { 4,  98, -15, 0, -18,  55, 4, 0},
  { 4,  96, -15, 0, -18,  58, 3, 0}, { 4,  94, -16, 0, -18,  60, 4, 0},
  { 4,  91, -16, 0, -18,  63, 4, 0}, { 4,  89, -16, 0, -18,  65, 4, 0},
  { 4,  87, -17, 0, -18,  68, 4, 0}, { 4,  85, -17, 0, -18,  70, 4, 0},
  { 4,  82, -17, 0, -18,  73, 4, 0}, { 4,  80, -17, 0, -18,  75, 4, 0},
  { 4,  78, -18, 0, -18,  78, 4, 0}, { 4,  75, -18, 0, -17,  80, 4, 0},
  { 4,  73, -18, 0, -17,  82, 4, 0}, { 4,  70, -18, 0, -17,  85, 4, 0},
  { 4,  68, -18, 0, -17,  87, 4, 0}, { 4,  65, -18, 0, -16,  89, 4, 0},
  { 4,  63, -18, 0, -16,  91, 4, 0}, { 4,  60, -18, 0, -16,  94, 4, 0},
  { 3,  58, -18, 0, -15,  96, 4, 0}, { 4,  55, -18, 0, -15,  98, 4, 0},
  { 3,  52, -17, 0, -14, 100, 4, 0}, { 3,  50, -17, 0, -14, 102, 4, 0},
  { 3,  47, -17, 0, -13, 104, 4, 0}, { 3,  45, -17, 0, -13, 106, 4, 0},
  { 3,  42, -16, 0, -12, 108, 3, 0}, { 3,  40, -16, 0, -11, 109, 3, 0},
  { 3,  37, -15, 0, -11, 111, 3, 0}, { 2,  35, -15, 0, -10, 113, 3, 0},
  { 3,  32, -14, 0, -10, 114, 3, 0}, { 2,  29, -13, 0,  -9, 116, 3, 0},
  { 2,  27, -13, 0,  -8, 117, 3, 0}, { 2,  25, -12, 0,  -8, 119, 2, 0},
  { 2,  22, -11, 0,  -7, 120, 2, 0}, { 1,  20, -10, 0,  -6, 121, 2, 0},
  { 1,  18,  -9, 0,  -6, 122, 2, 0}, { 1,  15,  -8, 0,  -5, 123, 2, 0},
  { 1,  13,  -7, 0,  -4, 124, 1, 0}, { 1,  11,  -6, 0,  -4, 125, 1, 0},
  { 1,   8,  -5, 0,  -3, 126, 1, 0}, { 1,   6,  -4, 0,  -2, 126, 1, 0},
  { 0,   4,  -3, 0,  -1, 127, 1, 0}, { 0,   2,  -1, 0,   0, 127, 0, 0},
  // [0, 1)
  { 0,   0,   1, 0, 0, 127,   0,  0}, { 0,  -1,   2, 0, 0, 127,   0,  0},
  { 0,  -3,   4, 1, 1, 127,  -2,  0}, { 0,  -5,   6, 1, 1, 127,  -2,  0},
  { 0,  -6,   8, 1, 2, 126,  -3,  0}, {-1,  -7,  11, 2, 2, 126,  -4, -1},
  {-1,  -8,  13, 2, 3, 125,  -5, -1}, {-1, -10,  16, 3, 3, 124,  -6, -1},
  {-1, -11,  18, 3, 4, 123,  -7, -1}, {-1, -12,  20, 3, 4, 122,  -7, -1},
  {-1, -13,  23, 3, 4, 121,  -8, -1}, {-2, -14,  25, 4, 5, 120,  -9, -1},
  {-1, -15,  27, 4, 5, 119, -10, -1}, {-1, -16,  30, 4, 5, 118, -11, -1},
  {-2, -17,  33, 5, 6, 116, -12, -1}, {-2, -17,  35, 5, 6, 114, -12, -1},
  {-2, -18,  38, 5, 6, 113, -13, -1}, {-2, -19,  41, 6, 7, 111, -14, -2},
  {-2, -19,  43, 6, 7, 110, -15, -2}, {-2, -20,  46, 6, 7, 108, -15, -2},
  {-2, -20,  49, 6, 7, 106, -16, -2}, {-2, -21,  51, 7, 7, 104, -16, -2},
  {-2, -21,  54, 7, 7, 102, -17, -2}, {-2, -21,  56, 7, 8, 100, -18, -2},
  {-2, -22,  59, 7, 8,  98, -18, -2}, {-2, -22,  62, 7, 8,  96, -19, -2},
  {-2, -22,  64, 7, 8,  94, -19, -2}, {-2, -22,  67, 8, 8,  91, -20, -2},
  {-2, -22,  69, 8, 8,  89, -20, -2}, {-2, -22,  72, 8, 8,  87, -21, -2},
  {-2, -21,  74, 8, 8,  84, -21, -2}, {-2, -22,  77, 8, 8,  82, -21, -2},
  {-2, -21,  79, 8, 8,  79, -21, -2}, {-2, -21,  82, 8, 8,  77, -22, -2},
  {-2, -21,  84, 8, 8,  74, -21, -2}, {-2, -21,  87, 8, 8,  72, -22, -2},
  {-2, -20,  89, 8, 8,  69, -22, -2}, {-2, -20,  91, 8, 8,  67, -22, -2},
  {-2, -19,  94, 8, 7,  64, -22, -2}, {-2, -19,  96, 8, 7,  62, -22, -2},
  {-2, -18,  98, 8, 7,  59, -22, -2}, {-2, -18, 100, 8, 7,  56, -21, -2},
  {-2, -17, 102, 7, 7,  54, -21, -2}, {-2, -16, 104, 7, 7,  51, -21, -2},
  {-2, -16, 106, 7, 6,  49, -20, -2}, {-2, -15, 108, 7, 6,  46, -20, -2},
  {-2, -15, 110, 7, 6,  43, -19, -2}, {-2, -14, 111, 7, 6,  41, -19, -2},
  {-1, -13, 113, 6, 5,  38, -18, -2}, {-1, -12, 114, 6, 5,  35, -17, -2},
  {-1, -12, 116, 6, 5,  33, -17, -2}, {-1, -11, 118, 5, 4,  30, -16, -1},
  {-1, -10, 119, 5, 4,  27, -15, -1}, {-1,  -9, 120, 5, 4,  25, -14, -2},
  {-1,  -8, 121, 4, 3,  23, -13, -1}, {-1,  -7, 122, 4, 3,  20, -12, -1},
  {-1,  -7, 123, 4, 3,  18, -11, -1}, {-1,  -6, 124, 3, 3,  16, -10, -1},
  {-1,  -5, 125, 3, 2,  13,  -8, -1}, {-1,  -4, 126, 2, 2,  11,  -7, -1},
  { 0,  -3, 126, 2, 1,   8,  -6,  0}, { 0,  -2, 127, 1, 1,   6,  -5,  0},
  { 0,  -2, 127, 1, 1,   4,  -3,  0}, { 0,   0, 127, 0, 0,   2,  -1,  0},
  // [1, 2)
  { 0, 0, 127,   0, 0,   1,   0, 0}, { 0, 0, 127,   0, 0,  -1,   2, 0},
  { 0, 1, 127,  -1, 0,  -3,   4, 0}, { 0, 1, 126,  -2, 0,  -4,   6, 1},
  { 0, 1, 126,  -3, 0,  -5,   8, 1}, { 0, 1, 125,  -4, 0,  -6,  11, 1},
  { 0, 1, 124,  -4, 0,  -7,  13, 1}, { 0, 2, 123,  -5, 0,  -8,  15, 1},
  { 0, 2, 122,  -6, 0,  -9,  18, 1}, { 0, 2, 121,  -6, 0, -10,  20, 1},
  { 0, 2, 120,  -7, 0, -11,  22, 2}, { 0, 2, 119,  -8, 0, -12,  25, 2},
  { 0, 3, 117,  -8, 0, -13,  27, 2}, { 0, 3, 116,  -9, 0, -13,  29, 2},
  { 0, 3, 114, -10, 0, -14,  32, 3}, { 0, 3, 113, -10, 0, -15,  35, 2},
  { 0, 3, 111, -11, 0, -15,  37, 3}, { 0, 3, 109, -11, 0, -16,  40, 3},
  { 0, 3, 108, -12, 0, -16,  42, 3}, { 0, 4, 106, -13, 0, -17,  45, 3},
  { 0, 4, 104, -13, 0, -17,  47, 3}, { 0, 4, 102, -14, 0, -17,  50, 3},
  { 0, 4, 100, -14, 0, -17,  52, 3}, { 0, 4,  98, -15, 0, -18,  55, 4},
  { 0, 4,  96, -15, 0, -18,  58, 3}, { 0, 4,  94, -16, 0, -18,  60, 4},
  { 0, 4,  91, -16, 0, -18,  63, 4}, { 0, 4,  89, -16, 0, -18,  65, 4},
  { 0, 4,  87, -17, 0, -18,  68, 4}, { 0, 4,  85, -17, 0, -18,  70, 4},
  { 0, 4,  82, -17, 0, -18,  73, 4}, { 0, 4,  80, -17, 0, -18,  75, 4},
  { 0, 4,  78, -18, 0, -18,  78, 4}, { 0, 4,  75, -18, 0, -17,  80, 4},
  { 0, 4,  73, -18, 0, -17,  82, 4}, { 0, 4,  70, -18, 0, -17,  85, 4},
  { 0, 4,  68, -18, 0, -17,  87, 4}, { 0, 4,  65, -18, 0, -16,  89, 4},
  { 0, 4,  63, -18, 0, -16,  91, 4}, { 0, 4,  60, -18, 0, -16,  94, 4},
  { 0, 3,  58, -18, 0, -15,  96, 4}, { 0, 4,  55, -18, 0, -15,  98, 4},
  { 0, 3,  52, -17, 0, -14, 100, 4}, { 0, 3,  50, -17, 0, -14, 102, 4},
  { 0, 3,  47, -17, 0, -13, 104, 4}, { 0, 3,  45, -17, 0, -13, 106, 4},
  { 0, 3,  42, -16, 0, -12, 108, 3}, { 0, 3,  40, -16, 0, -11, 109, 3},
  { 0, 3,  37, -15, 0, -11, 111, 3}, { 0, 2,  35, -15, 0, -10, 113, 3},
  { 0, 3,  32, -14, 0, -10, 114, 3}, { 0, 2,  29, -13, 0,  -9, 116, 3},
  { 0, 2,  27, -13, 0,  -8, 117, 3}, { 0, 2,  25, -12, 0,  -8, 119, 2},
  { 0, 2,  22, -11, 0,  -7, 120, 2}, { 0, 1,  20, -10, 0,  -6, 121, 2},
  { 0, 1,  18,  -9, 0,  -6, 122, 2}, { 0, 1,  15,  -8, 0,  -5, 123, 2},
  { 0, 1,  13,  -7, 0,  -4, 124, 1}, { 0, 1,  11,  -6, 0,  -4, 125, 1},
  { 0, 1,   8,  -5, 0,  -3, 126, 1}, { 0, 1,   6,  -4, 0,  -2, 126, 1},
  { 0, 0,   4,  -3, 0,  -1, 127, 1}, { 0, 0,   2,  -1, 0,   0, 127, 0},
  // dummy (replicate row index 191)
  { 0, 0,   2,  -1, 0,   0, 127, 0},
};
/* clang-format on */
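
/* For reference, a hypothetical scalar sketch (not part of the original
   source) of the summation order the rearranged table enables. 'f' is one row
   of av1_filter_8bit, so f[0..3] hold taps 0, 2, 4, 6 and f[4..7] hold taps
   1, 3, 5, 7; 'p' points at the first source pixel of the 8-tap window. */
static inline int scalar_warp_tap_sum(const uint8_t *p, const int8_t *f) {
  // ((0 + 2) + (4 + 6)) + ((1 + 3) + (5 + 7)), matching the SIMD adds below.
  const int even = (p[0] * f[0] + p[2] * f[1]) + (p[4] * f[2] + p[6] * f[3]);
  const int odd = (p[1] * f[4] + p[3] * f[5]) + (p[5] * f[6] + p[7] * f[7]);
  return even + odd;
}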

// Shuffle masks: we want to convert a sequence of bytes 0, 1, 2, ..., 15
// in an SSE register into two sequences:
// 0, 2, 2, 4, ..., 12, 12, 14, <don't care>
// 1, 3, 3, 5, ..., 13, 13, 15, <don't care>
DECLARE_ALIGNED(16, static const uint8_t,
                even_mask[16]) = { 0, 2,  2,  4,  4,  6,  6,  8,
                                   8, 10, 10, 12, 12, 14, 14, 0 };

DECLARE_ALIGNED(16, static const uint8_t,
                odd_mask[16]) = { 1, 3,  3,  5,  5,  7,  7,  9,
                                  9, 11, 11, 13, 13, 15, 15, 0 };

DECLARE_ALIGNED(16, static const uint8_t,
                shuffle_alpha0_mask01[16]) = { 0, 1, 0, 1, 0, 1, 0, 1,
                                               0, 1, 0, 1, 0, 1, 0, 1 };

DECLARE_ALIGNED(16, static const uint8_t,
                shuffle_alpha0_mask23[16]) = { 2, 3, 2, 3, 2, 3, 2, 3,
                                               2, 3, 2, 3, 2, 3, 2, 3 };

DECLARE_ALIGNED(16, static const uint8_t,
                shuffle_alpha0_mask45[16]) = { 4, 5, 4, 5, 4, 5, 4, 5,
                                               4, 5, 4, 5, 4, 5, 4, 5 };

DECLARE_ALIGNED(16, static const uint8_t,
                shuffle_alpha0_mask67[16]) = { 6, 7, 6, 7, 6, 7, 6, 7,
                                               6, 7, 6, 7, 6, 7, 6, 7 };

DECLARE_ALIGNED(16, static const uint8_t,
                shuffle_gamma0_mask0[16]) = { 0, 1, 2, 3, 0, 1, 2, 3,
                                              0, 1, 2, 3, 0, 1, 2, 3 };

DECLARE_ALIGNED(16, static const uint8_t,
                shuffle_gamma0_mask1[16]) = { 4, 5, 6, 7, 4, 5, 6, 7,
                                              4, 5, 6, 7, 4, 5, 6, 7 };

DECLARE_ALIGNED(16, static const uint8_t,
                shuffle_gamma0_mask2[16]) = { 8, 9, 10, 11, 8, 9, 10, 11,
                                              8, 9, 10, 11, 8, 9, 10, 11 };

DECLARE_ALIGNED(16, static const uint8_t,
                shuffle_gamma0_mask3[16]) = { 12, 13, 14, 15, 12, 13, 14, 15,
                                              12, 13, 14, 15, 12, 13, 14, 15 };

static inline void filter_src_pixels(__m128i src, __m128i *tmp, __m128i *coeff,
                                     const int offset_bits_horiz,
                                     const int reduce_bits_horiz, int k) {
  const __m128i src_even =
      _mm_shuffle_epi8(src, _mm_load_si128((__m128i *)even_mask));
  const __m128i src_odd =
      _mm_shuffle_epi8(src, _mm_load_si128((__m128i *)odd_mask));
  // The pixel order we need for 'src' is:
  // 0 2 2 4 4 6 6 8 1 3 3 5 5 7 7 9
  const __m128i src_02 = _mm_unpacklo_epi64(src_even, src_odd);
  const __m128i res_02 = _mm_maddubs_epi16(src_02, coeff[0]);
  // 4 6 6 8 8 10 10 12 5 7 7 9 9 11 11 13
  const __m128i src_46 = _mm_unpacklo_epi64(_mm_srli_si128(src_even, 4),
                                            _mm_srli_si128(src_odd, 4));
  const __m128i res_46 = _mm_maddubs_epi16(src_46, coeff[1]);
  // 1 3 3 5 5 7 7 9 2 4 4 6 6 8 8 10
  const __m128i src_13 =
      _mm_unpacklo_epi64(src_odd, _mm_srli_si128(src_even, 2));
  const __m128i res_13 = _mm_maddubs_epi16(src_13, coeff[2]);
  // 5 7 7 9 9 11 11 13 6 8 8 10 10 12 12 14
  const __m128i src_57 = _mm_unpacklo_epi64(_mm_srli_si128(src_odd, 4),
                                            _mm_srli_si128(src_even, 6));
  const __m128i res_57 = _mm_maddubs_epi16(src_57, coeff[3]);

  const __m128i round_const = _mm_set1_epi16((1 << offset_bits_horiz) +
                                             ((1 << reduce_bits_horiz) >> 1));

  // Note: The values res_02 + res_46 and res_13 + res_57 both
  // fit into int16s at this point, but their sum may be too wide to fit
  // into an int16. However, once we also add round_const, the sum of
  // all of these fits into a uint16.
  //
  // The wrapping behaviour of _mm_add_* is used here to make sure we
  // get the correct result despite converting between different
  // (implicit) types.
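  //
  // A rough worked bound (illustrative, not from the original source): with
  // 8-bit pixels, the positive taps of any filter row above sum to less than
  // 176 and the negative taps to more than -48, so the raw sum lies within
  // about [-48 * 255, 176 * 255] = [-12240, 44880]. Adding round_const
  // (at least 1 << 14 = 16384, since offset_bits_horiz is
  // bd + FILTER_BITS - 1 = 14 here) shifts this into [0, 1 << 16).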
  const __m128i res_even = _mm_add_epi16(res_02, res_46);
  const __m128i res_odd = _mm_add_epi16(res_13, res_57);
  const __m128i res =
      _mm_add_epi16(_mm_add_epi16(res_even, res_odd), round_const);
  tmp[k + 7] = _mm_srl_epi16(res, _mm_cvtsi32_si128(reduce_bits_horiz));
}

static inline void prepare_horizontal_filter_coeff(int alpha, int sx,
                                                   __m128i *coeff) {
  // Load the 8-bit filter for each of the 8 output pixels
  const __m128i tmp_0 = _mm_loadl_epi64(
      (__m128i *)&av1_filter_8bit[(sx + 0 * alpha) >> WARPEDDIFF_PREC_BITS]);
  const __m128i tmp_1 = _mm_loadl_epi64(
      (__m128i *)&av1_filter_8bit[(sx + 1 * alpha) >> WARPEDDIFF_PREC_BITS]);
  const __m128i tmp_2 = _mm_loadl_epi64(
      (__m128i *)&av1_filter_8bit[(sx + 2 * alpha) >> WARPEDDIFF_PREC_BITS]);
  const __m128i tmp_3 = _mm_loadl_epi64(
      (__m128i *)&av1_filter_8bit[(sx + 3 * alpha) >> WARPEDDIFF_PREC_BITS]);
  const __m128i tmp_4 = _mm_loadl_epi64(
      (__m128i *)&av1_filter_8bit[(sx + 4 * alpha) >> WARPEDDIFF_PREC_BITS]);
  const __m128i tmp_5 = _mm_loadl_epi64(
      (__m128i *)&av1_filter_8bit[(sx + 5 * alpha) >> WARPEDDIFF_PREC_BITS]);
  const __m128i tmp_6 = _mm_loadl_epi64(
      (__m128i *)&av1_filter_8bit[(sx + 6 * alpha) >> WARPEDDIFF_PREC_BITS]);
  const __m128i tmp_7 = _mm_loadl_epi64(
      (__m128i *)&av1_filter_8bit[(sx + 7 * alpha) >> WARPEDDIFF_PREC_BITS]);

  // Coeffs 0 2 0 2 4 6 4 6 1 3 1 3 5 7 5 7 for pixels 0 2
  const __m128i tmp_8 = _mm_unpacklo_epi16(tmp_0, tmp_2);
  // Coeffs 0 2 0 2 4 6 4 6 1 3 1 3 5 7 5 7 for pixels 1 3
  const __m128i tmp_9 = _mm_unpacklo_epi16(tmp_1, tmp_3);
  // Coeffs 0 2 0 2 4 6 4 6 1 3 1 3 5 7 5 7 for pixels 4 6
  const __m128i tmp_10 = _mm_unpacklo_epi16(tmp_4, tmp_6);
  // Coeffs 0 2 0 2 4 6 4 6 1 3 1 3 5 7 5 7 for pixels 5 7
  const __m128i tmp_11 = _mm_unpacklo_epi16(tmp_5, tmp_7);

  // Coeffs 0 2 0 2 0 2 0 2 4 6 4 6 4 6 4 6 for pixels 0 2 4 6
  const __m128i tmp_12 = _mm_unpacklo_epi32(tmp_8, tmp_10);
  // Coeffs 1 3 1 3 1 3 1 3 5 7 5 7 5 7 5 7 for pixels 0 2 4 6
  const __m128i tmp_13 = _mm_unpackhi_epi32(tmp_8, tmp_10);
  // Coeffs 0 2 0 2 0 2 0 2 4 6 4 6 4 6 4 6 for pixels 1 3 5 7
  const __m128i tmp_14 = _mm_unpacklo_epi32(tmp_9, tmp_11);
  // Coeffs 1 3 1 3 1 3 1 3 5 7 5 7 5 7 5 7 for pixels 1 3 5 7
  const __m128i tmp_15 = _mm_unpackhi_epi32(tmp_9, tmp_11);

  // Coeffs 0 2 for pixels 0 2 4 6 1 3 5 7
  coeff[0] = _mm_unpacklo_epi64(tmp_12, tmp_14);
  // Coeffs 4 6 for pixels 0 2 4 6 1 3 5 7
  coeff[1] = _mm_unpackhi_epi64(tmp_12, tmp_14);
  // Coeffs 1 3 for pixels 0 2 4 6 1 3 5 7
  coeff[2] = _mm_unpacklo_epi64(tmp_13, tmp_15);
  // Coeffs 5 7 for pixels 0 2 4 6 1 3 5 7
  coeff[3] = _mm_unpackhi_epi64(tmp_13, tmp_15);
}

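// When alpha == 0 the filter index sx is the same for all eight output
// pixels, so a single 8-byte load plus byte broadcasts replaces the eight
// loads and the transpose above.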
static inline void prepare_horizontal_filter_coeff_alpha0(int sx,
                                                          __m128i *coeff) {
  // Load the single filter shared by all eight output pixels
  const __m128i tmp_0 =
      _mm_loadl_epi64((__m128i *)&av1_filter_8bit[sx >> WARPEDDIFF_PREC_BITS]);

  // Coeffs 0 2 for pixels 0 2 4 6 1 3 5 7
  coeff[0] =
      _mm_shuffle_epi8(tmp_0, _mm_load_si128((__m128i *)shuffle_alpha0_mask01));
  // Coeffs 4 6 for pixels 0 2 4 6 1 3 5 7
  coeff[1] =
      _mm_shuffle_epi8(tmp_0, _mm_load_si128((__m128i *)shuffle_alpha0_mask23));
  // Coeffs 1 3 for pixels 0 2 4 6 1 3 5 7
  coeff[2] =
      _mm_shuffle_epi8(tmp_0, _mm_load_si128((__m128i *)shuffle_alpha0_mask45));
  // Coeffs 5 7 for pixels 0 2 4 6 1 3 5 7
  coeff[3] =
      _mm_shuffle_epi8(tmp_0, _mm_load_si128((__m128i *)shuffle_alpha0_mask67));
}

static inline void horizontal_filter(__m128i src, __m128i *tmp, int sx,
                                     int alpha, int k,
                                     const int offset_bits_horiz,
                                     const int reduce_bits_horiz) {
  __m128i coeff[4];
  prepare_horizontal_filter_coeff(alpha, sx, coeff);
  filter_src_pixels(src, tmp, coeff, offset_bits_horiz, reduce_bits_horiz, k);
}

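// alpha is the per-column (per-output-pixel) step of the horizontal filter
// index sx within a row; beta is its per-row step. The *_alpha0 / *_beta0
// variants below exploit the cases where either step vanishes.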
static inline void warp_horizontal_filter(const uint8_t *ref, __m128i *tmp,
                                          int stride, int32_t ix4, int32_t iy4,
                                          int32_t sx4, int alpha, int beta,
                                          int p_height, int height, int i,
                                          const int offset_bits_horiz,
                                          const int reduce_bits_horiz) {
  int k;
  for (k = -7; k < AOMMIN(8, p_height - i); ++k) {
    int iy = iy4 + k;
    if (iy < 0)
      iy = 0;
    else if (iy > height - 1)
      iy = height - 1;
    int sx = sx4 + beta * (k + 4);

    // Load source pixels
    const __m128i src =
        _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7));
    horizontal_filter(src, tmp, sx, alpha, k, offset_bits_horiz,
                      reduce_bits_horiz);
  }
}

static inline void warp_horizontal_filter_alpha0(
    const uint8_t *ref, __m128i *tmp, int stride, int32_t ix4, int32_t iy4,
    int32_t sx4, int alpha, int beta, int p_height, int height, int i,
    const int offset_bits_horiz, const int reduce_bits_horiz) {
  (void)alpha;
  int k;
  for (k = -7; k < AOMMIN(8, p_height - i); ++k) {
    int iy = iy4 + k;
    if (iy < 0)
      iy = 0;
    else if (iy > height - 1)
      iy = height - 1;
    int sx = sx4 + beta * (k + 4);

    // Load source pixels
    const __m128i src =
        _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7));

    __m128i coeff[4];
    prepare_horizontal_filter_coeff_alpha0(sx, coeff);
    filter_src_pixels(src, tmp, coeff, offset_bits_horiz, reduce_bits_horiz, k);
  }
}

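// beta == 0 means sx is constant across rows, so the coefficients only need
// to be prepared once, outside the row loop.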
static inline void warp_horizontal_filter_beta0(
    const uint8_t *ref, __m128i *tmp, int stride, int32_t ix4, int32_t iy4,
    int32_t sx4, int alpha, int beta, int p_height, int height, int i,
    const int offset_bits_horiz, const int reduce_bits_horiz) {
  (void)beta;
  int k;
  __m128i coeff[4];
  prepare_horizontal_filter_coeff(alpha, sx4, coeff);

  for (k = -7; k < AOMMIN(8, p_height - i); ++k) {
    int iy = iy4 + k;
    if (iy < 0)
      iy = 0;
    else if (iy > height - 1)
      iy = height - 1;

    // Load source pixels
    const __m128i src =
        _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7));
    filter_src_pixels(src, tmp, coeff, offset_bits_horiz, reduce_bits_horiz, k);
  }
}

static inline void warp_horizontal_filter_alpha0_beta0(
    const uint8_t *ref, __m128i *tmp, int stride, int32_t ix4, int32_t iy4,
    int32_t sx4, int alpha, int beta, int p_height, int height, int i,
    const int offset_bits_horiz, const int reduce_bits_horiz) {
  (void)beta;
  (void)alpha;
  int k;

  __m128i coeff[4];
  prepare_horizontal_filter_coeff_alpha0(sx4, coeff);

  for (k = -7; k < AOMMIN(8, p_height - i); ++k) {
    int iy = iy4 + k;
    if (iy < 0)
      iy = 0;
    else if (iy > height - 1)
      iy = height - 1;

    // Load source pixels
    const __m128i src =
        _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7));
    filter_src_pixels(src, tmp, coeff, offset_bits_horiz, reduce_bits_horiz, k);
  }
}

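// Broadcasts the distance weights (w0, w1) interleaved for use with
// _mm_madd_epi16, and builds the constants used to convert an averaged
// compound prediction back to the 8-bit pixel range.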
static inline void unpack_weights_and_set_round_const(
    ConvolveParams *conv_params, const int round_bits, const int offset_bits,
    __m128i *res_sub_const, __m128i *round_bits_const, __m128i *wt) {
  *res_sub_const =
      _mm_set1_epi16(-(1 << (offset_bits - conv_params->round_1)) -
                     (1 << (offset_bits - conv_params->round_1 - 1)));
  *round_bits_const = _mm_set1_epi16(((1 << round_bits) >> 1));

  const int w0 = conv_params->fwd_offset;
  const int w1 = conv_params->bck_offset;
  const __m128i wt0 = _mm_set1_epi16((int16_t)w0);
  const __m128i wt1 = _mm_set1_epi16((int16_t)w1);
  *wt = _mm_unpacklo_epi16(wt0, wt1);
}

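// The vertical pass uses the regular 16-bit av1_warped_filter table. Load the
// 8-tap filter for each of the 8 columns and transpose, so that coeffs[i]
// holds the tap pair (2i, 2i + 1) for the even columns and coeffs[4 + i] the
// same pair for the odd columns, ready for _mm_madd_epi16.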
static inline void prepare_vertical_filter_coeffs(int gamma, int sy,
                                                  __m128i *coeffs) {
  const __m128i tmp_0 =
      _mm_loadu_si128((__m128i *)(av1_warped_filter +
                                  ((sy + 0 * gamma) >> WARPEDDIFF_PREC_BITS)));
  const __m128i tmp_2 =
      _mm_loadu_si128((__m128i *)(av1_warped_filter +
                                  ((sy + 2 * gamma) >> WARPEDDIFF_PREC_BITS)));
  const __m128i tmp_4 =
      _mm_loadu_si128((__m128i *)(av1_warped_filter +
                                  ((sy + 4 * gamma) >> WARPEDDIFF_PREC_BITS)));
  const __m128i tmp_6 =
      _mm_loadu_si128((__m128i *)(av1_warped_filter +
                                  ((sy + 6 * gamma) >> WARPEDDIFF_PREC_BITS)));

  const __m128i tmp_8 = _mm_unpacklo_epi32(tmp_0, tmp_2);
  const __m128i tmp_10 = _mm_unpacklo_epi32(tmp_4, tmp_6);
  const __m128i tmp_12 = _mm_unpackhi_epi32(tmp_0, tmp_2);
  const __m128i tmp_14 = _mm_unpackhi_epi32(tmp_4, tmp_6);

  // even coeffs
  coeffs[0] = _mm_unpacklo_epi64(tmp_8, tmp_10);
  coeffs[1] = _mm_unpackhi_epi64(tmp_8, tmp_10);
  coeffs[2] = _mm_unpacklo_epi64(tmp_12, tmp_14);
  coeffs[3] = _mm_unpackhi_epi64(tmp_12, tmp_14);

  const __m128i tmp_1 =
      _mm_loadu_si128((__m128i *)(av1_warped_filter +
                                  ((sy + 1 * gamma) >> WARPEDDIFF_PREC_BITS)));
  const __m128i tmp_3 =
      _mm_loadu_si128((__m128i *)(av1_warped_filter +
                                  ((sy + 3 * gamma) >> WARPEDDIFF_PREC_BITS)));
  const __m128i tmp_5 =
      _mm_loadu_si128((__m128i *)(av1_warped_filter +
                                  ((sy + 5 * gamma) >> WARPEDDIFF_PREC_BITS)));
  const __m128i tmp_7 =
      _mm_loadu_si128((__m128i *)(av1_warped_filter +
                                  ((sy + 7 * gamma) >> WARPEDDIFF_PREC_BITS)));

  const __m128i tmp_9 = _mm_unpacklo_epi32(tmp_1, tmp_3);
  const __m128i tmp_11 = _mm_unpacklo_epi32(tmp_5, tmp_7);
  const __m128i tmp_13 = _mm_unpackhi_epi32(tmp_1, tmp_3);
  const __m128i tmp_15 = _mm_unpackhi_epi32(tmp_5, tmp_7);

  // odd coeffs
  coeffs[4] = _mm_unpacklo_epi64(tmp_9, tmp_11);
  coeffs[5] = _mm_unpackhi_epi64(tmp_9, tmp_11);
  coeffs[6] = _mm_unpacklo_epi64(tmp_13, tmp_15);
  coeffs[7] = _mm_unpackhi_epi64(tmp_13, tmp_15);
}

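// When gamma == 0 every column shares the same vertical filter, so load it
// once and broadcast each 32-bit tap pair across the register.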
static inline void prepare_vertical_filter_coeffs_gamma0(int sy,
                                                         __m128i *coeffs) {
  const __m128i tmp_0 = _mm_loadu_si128(
      (__m128i *)(av1_warped_filter + (sy >> WARPEDDIFF_PREC_BITS)));

  // even coeffs
  coeffs[0] =
      _mm_shuffle_epi8(tmp_0, _mm_load_si128((__m128i *)shuffle_gamma0_mask0));
  coeffs[1] =
      _mm_shuffle_epi8(tmp_0, _mm_load_si128((__m128i *)shuffle_gamma0_mask1));
  coeffs[2] =
      _mm_shuffle_epi8(tmp_0, _mm_load_si128((__m128i *)shuffle_gamma0_mask2));
  coeffs[3] =
      _mm_shuffle_epi8(tmp_0, _mm_load_si128((__m128i *)shuffle_gamma0_mask3));

  // odd coeffs
  coeffs[4] = coeffs[0];
  coeffs[5] = coeffs[1];
  coeffs[6] = coeffs[2];
  coeffs[7] = coeffs[3];
}

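/* For reference, a hypothetical scalar equivalent (not part of the original
   source) of one output of the vertical pass below: an 8-tap dot product down
   a column of the horizontal-pass results, before rounding. */
static inline int32_t scalar_vertical_tap_sum(const int16_t col[8],
                                              const int16_t f[8]) {
  int32_t sum = 0;
  for (int m = 0; m < 8; ++m) sum += (int32_t)col[m] * f[m];
  return sum;
}
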
static inline void filter_src_pixels_vertical(__m128i *tmp, __m128i *coeffs,
                                              __m128i *res_lo, __m128i *res_hi,
                                              int k) {
  // Load from tmp and rearrange pairs of consecutive rows into the
  // column order 0 0 2 2 4 4 6 6; 1 1 3 3 5 5 7 7
  const __m128i *src = tmp + (k + 4);
  const __m128i src_0 = _mm_unpacklo_epi16(src[0], src[1]);
  const __m128i src_2 = _mm_unpacklo_epi16(src[2], src[3]);
  const __m128i src_4 = _mm_unpacklo_epi16(src[4], src[5]);
  const __m128i src_6 = _mm_unpacklo_epi16(src[6], src[7]);

  const __m128i res_0 = _mm_madd_epi16(src_0, coeffs[0]);
  const __m128i res_2 = _mm_madd_epi16(src_2, coeffs[1]);
  const __m128i res_4 = _mm_madd_epi16(src_4, coeffs[2]);
  const __m128i res_6 = _mm_madd_epi16(src_6, coeffs[3]);

  const __m128i res_even =
      _mm_add_epi32(_mm_add_epi32(res_0, res_2), _mm_add_epi32(res_4, res_6));

  // Filter odd-index pixels
  const __m128i src_1 = _mm_unpackhi_epi16(src[0], src[1]);
  const __m128i src_3 = _mm_unpackhi_epi16(src[2], src[3]);
  const __m128i src_5 = _mm_unpackhi_epi16(src[4], src[5]);
  const __m128i src_7 = _mm_unpackhi_epi16(src[6], src[7]);

  const __m128i res_1 = _mm_madd_epi16(src_1, coeffs[4]);
  const __m128i res_3 = _mm_madd_epi16(src_3, coeffs[5]);
  const __m128i res_5 = _mm_madd_epi16(src_5, coeffs[6]);
  const __m128i res_7 = _mm_madd_epi16(src_7, coeffs[7]);

  const __m128i res_odd =
      _mm_add_epi32(_mm_add_epi32(res_1, res_3), _mm_add_epi32(res_5, res_7));

  // Rearrange pixels back into the order 0 ... 7
  *res_lo = _mm_unpacklo_epi32(res_even, res_odd);
  *res_hi = _mm_unpackhi_epi32(res_even, res_odd);
}

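// Stores one 4- or 8-wide row of vertical-filter output. For compound
// prediction the 16-bit intermediate is either written to conv_params->dst,
// or (when do_average is set) averaged with the value already there and
// written back to 'pred' as 8-bit pixels. Otherwise the result is rounded and
// packed straight to 8-bit pixels in 'pred'.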
static inline void store_vertical_filter_output(
    __m128i *res_lo, __m128i *res_hi, const __m128i *res_add_const,
    const __m128i *wt, const __m128i *res_sub_const, __m128i *round_bits_const,
    uint8_t *pred, ConvolveParams *conv_params, int i, int j, int k,
    const int reduce_bits_vert, int p_stride, int p_width,
    const int round_bits) {
  __m128i res_lo_1 = *res_lo;
  __m128i res_hi_1 = *res_hi;

  if (conv_params->is_compound) {
    __m128i *const p =
        (__m128i *)&conv_params->dst[(i + k + 4) * conv_params->dst_stride + j];
    res_lo_1 = _mm_srai_epi32(_mm_add_epi32(res_lo_1, *res_add_const),
                              reduce_bits_vert);
    const __m128i temp_lo_16 = _mm_packus_epi32(res_lo_1, res_lo_1);
    __m128i res_lo_16;
    if (conv_params->do_average) {
      __m128i *const dst8 = (__m128i *)&pred[(i + k + 4) * p_stride + j];
      const __m128i p_16 = _mm_loadl_epi64(p);

      if (conv_params->use_dist_wtd_comp_avg) {
        const __m128i p_16_lo = _mm_unpacklo_epi16(p_16, temp_lo_16);
        const __m128i wt_res_lo = _mm_madd_epi16(p_16_lo, *wt);
        const __m128i shifted_32 =
            _mm_srai_epi32(wt_res_lo, DIST_PRECISION_BITS);
        res_lo_16 = _mm_packus_epi32(shifted_32, shifted_32);
      } else {
        res_lo_16 = _mm_srai_epi16(_mm_add_epi16(p_16, temp_lo_16), 1);
      }

      res_lo_16 = _mm_add_epi16(res_lo_16, *res_sub_const);

      res_lo_16 = _mm_srai_epi16(_mm_add_epi16(res_lo_16, *round_bits_const),
                                 round_bits);
      __m128i res_8_lo = _mm_packus_epi16(res_lo_16, res_lo_16);
      *(int *)dst8 = _mm_cvtsi128_si32(res_8_lo);
    } else {
      _mm_storel_epi64(p, temp_lo_16);
    }
    if (p_width > 4) {
      __m128i *const p4 =
          (__m128i *)&conv_params
              ->dst[(i + k + 4) * conv_params->dst_stride + j + 4];
      res_hi_1 = _mm_srai_epi32(_mm_add_epi32(res_hi_1, *res_add_const),
                                reduce_bits_vert);
      const __m128i temp_hi_16 = _mm_packus_epi32(res_hi_1, res_hi_1);
      __m128i res_hi_16;

      if (conv_params->do_average) {
        __m128i *const dst8_4 =
            (__m128i *)&pred[(i + k + 4) * p_stride + j + 4];
        const __m128i p4_16 = _mm_loadl_epi64(p4);

        if (conv_params->use_dist_wtd_comp_avg) {
          const __m128i p_16_hi = _mm_unpacklo_epi16(p4_16, temp_hi_16);
          const __m128i wt_res_hi = _mm_madd_epi16(p_16_hi, *wt);
          const __m128i shifted_32 =
              _mm_srai_epi32(wt_res_hi, DIST_PRECISION_BITS);
          res_hi_16 = _mm_packus_epi32(shifted_32, shifted_32);
        } else {
          res_hi_16 = _mm_srai_epi16(_mm_add_epi16(p4_16, temp_hi_16), 1);
        }
        res_hi_16 = _mm_add_epi16(res_hi_16, *res_sub_const);

        res_hi_16 = _mm_srai_epi16(_mm_add_epi16(res_hi_16, *round_bits_const),
                                   round_bits);
        __m128i res_8_hi = _mm_packus_epi16(res_hi_16, res_hi_16);
        *(int *)dst8_4 = _mm_cvtsi128_si32(res_8_hi);

      } else {
        _mm_storel_epi64(p4, temp_hi_16);
      }
    }
  } else {
    const __m128i res_lo_round = _mm_srai_epi32(
        _mm_add_epi32(res_lo_1, *res_add_const), reduce_bits_vert);
    const __m128i res_hi_round = _mm_srai_epi32(
        _mm_add_epi32(res_hi_1, *res_add_const), reduce_bits_vert);

    const __m128i res_16bit = _mm_packs_epi32(res_lo_round, res_hi_round);
    __m128i res_8bit = _mm_packus_epi16(res_16bit, res_16bit);

    // Store, blending with 'pred' if needed
    __m128i *const p = (__m128i *)&pred[(i + k + 4) * p_stride + j];

    // Note: If we're outputting a 4x4 block, we need to be very careful
    // to only output 4 pixels at this point, to avoid encode/decode
    // mismatches when encoding with multiple threads.
    if (p_width == 4) {
      *(int *)p = _mm_cvtsi128_si32(res_8bit);
    } else {
      _mm_storel_epi64(p, res_8bit);
    }
  }
}

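// gamma is the per-column step of the vertical filter index sy within a row;
// delta is its per-row step. The *_gamma0 / *_delta0 variants below exploit
// the cases where either step vanishes.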
static inline void warp_vertical_filter(
    uint8_t *pred, __m128i *tmp, ConvolveParams *conv_params, int16_t gamma,
    int16_t delta, int p_height, int p_stride, int p_width, int i, int j,
    int sy4, const int reduce_bits_vert, const __m128i *res_add_const,
    const int round_bits, const int offset_bits) {
  int k;
  __m128i res_sub_const, round_bits_const, wt;
  unpack_weights_and_set_round_const(conv_params, round_bits, offset_bits,
                                     &res_sub_const, &round_bits_const, &wt);
  // Vertical filter
  for (k = -4; k < AOMMIN(4, p_height - i - 4); ++k) {
    int sy = sy4 + delta * (k + 4);

    __m128i coeffs[8];
    prepare_vertical_filter_coeffs(gamma, sy, coeffs);

    __m128i res_lo;
    __m128i res_hi;
    filter_src_pixels_vertical(tmp, coeffs, &res_lo, &res_hi, k);

    store_vertical_filter_output(&res_lo, &res_hi, res_add_const, &wt,
                                 &res_sub_const, &round_bits_const, pred,
                                 conv_params, i, j, k, reduce_bits_vert,
                                 p_stride, p_width, round_bits);
  }
}

static inline void warp_vertical_filter_gamma0(
    uint8_t *pred, __m128i *tmp, ConvolveParams *conv_params, int16_t gamma,
    int16_t delta, int p_height, int p_stride, int p_width, int i, int j,
    int sy4, const int reduce_bits_vert, const __m128i *res_add_const,
    const int round_bits, const int offset_bits) {
  int k;
  (void)gamma;
  __m128i res_sub_const, round_bits_const, wt;
  unpack_weights_and_set_round_const(conv_params, round_bits, offset_bits,
                                     &res_sub_const, &round_bits_const, &wt);
  // Vertical filter
  for (k = -4; k < AOMMIN(4, p_height - i - 4); ++k) {
    int sy = sy4 + delta * (k + 4);

    __m128i coeffs[8];
    prepare_vertical_filter_coeffs_gamma0(sy, coeffs);

    __m128i res_lo;
    __m128i res_hi;
    filter_src_pixels_vertical(tmp, coeffs, &res_lo, &res_hi, k);

    store_vertical_filter_output(&res_lo, &res_hi, res_add_const, &wt,
                                 &res_sub_const, &round_bits_const, pred,
                                 conv_params, i, j, k, reduce_bits_vert,
                                 p_stride, p_width, round_bits);
  }
}

static inline void warp_vertical_filter_delta0(
    uint8_t *pred, __m128i *tmp, ConvolveParams *conv_params, int16_t gamma,
    int16_t delta, int p_height, int p_stride, int p_width, int i, int j,
    int sy4, const int reduce_bits_vert, const __m128i *res_add_const,
    const int round_bits, const int offset_bits) {
  (void)delta;
  int k;
  __m128i res_sub_const, round_bits_const, wt;
  unpack_weights_and_set_round_const(conv_params, round_bits, offset_bits,
                                     &res_sub_const, &round_bits_const, &wt);

  __m128i coeffs[8];
  prepare_vertical_filter_coeffs(gamma, sy4, coeffs);
  // Vertical filter
  for (k = -4; k < AOMMIN(4, p_height - i - 4); ++k) {
    __m128i res_lo;
    __m128i res_hi;
    filter_src_pixels_vertical(tmp, coeffs, &res_lo, &res_hi, k);

    store_vertical_filter_output(&res_lo, &res_hi, res_add_const, &wt,
                                 &res_sub_const, &round_bits_const, pred,
                                 conv_params, i, j, k, reduce_bits_vert,
                                 p_stride, p_width, round_bits);
  }
}

static inline void warp_vertical_filter_gamma0_delta0(
    uint8_t *pred, __m128i *tmp, ConvolveParams *conv_params, int16_t gamma,
    int16_t delta, int p_height, int p_stride, int p_width, int i, int j,
    int sy4, const int reduce_bits_vert, const __m128i *res_add_const,
    const int round_bits, const int offset_bits) {
  (void)delta;
  (void)gamma;
  int k;
  __m128i res_sub_const, round_bits_const, wt;
  unpack_weights_and_set_round_const(conv_params, round_bits, offset_bits,
                                     &res_sub_const, &round_bits_const, &wt);

  __m128i coeffs[8];
  prepare_vertical_filter_coeffs_gamma0(sy4, coeffs);
  // Vertical filter
  for (k = -4; k < AOMMIN(4, p_height - i - 4); ++k) {
    __m128i res_lo;
    __m128i res_hi;
    filter_src_pixels_vertical(tmp, coeffs, &res_lo, &res_hi, k);

    store_vertical_filter_output(&res_lo, &res_hi, res_add_const, &wt,
                                 &res_sub_const, &round_bits_const, pred,
                                 conv_params, i, j, k, reduce_bits_vert,
                                 p_stride, p_width, round_bits);
  }
}

static inline void prepare_warp_vertical_filter(
    uint8_t *pred, __m128i *tmp, ConvolveParams *conv_params, int16_t gamma,
    int16_t delta, int p_height, int p_stride, int p_width, int i, int j,
    int sy4, const int reduce_bits_vert, const __m128i *res_add_const,
    const int round_bits, const int offset_bits) {
  if (gamma == 0 && delta == 0)
    warp_vertical_filter_gamma0_delta0(
        pred, tmp, conv_params, gamma, delta, p_height, p_stride, p_width, i, j,
        sy4, reduce_bits_vert, res_add_const, round_bits, offset_bits);
  else if (gamma == 0 && delta != 0)
    warp_vertical_filter_gamma0(pred, tmp, conv_params, gamma, delta, p_height,
                                p_stride, p_width, i, j, sy4, reduce_bits_vert,
                                res_add_const, round_bits, offset_bits);
  else if (gamma != 0 && delta == 0)
    warp_vertical_filter_delta0(pred, tmp, conv_params, gamma, delta, p_height,
                                p_stride, p_width, i, j, sy4, reduce_bits_vert,
                                res_add_const, round_bits, offset_bits);
  else
    warp_vertical_filter(pred, tmp, conv_params, gamma, delta, p_height,
                         p_stride, p_width, i, j, sy4, reduce_bits_vert,
                         res_add_const, round_bits, offset_bits);
}

static inline void prepare_warp_horizontal_filter(
    const uint8_t *ref, __m128i *tmp, int stride, int32_t ix4, int32_t iy4,
    int32_t sx4, int alpha, int beta, int p_height, int height, int i,
    const int offset_bits_horiz, const int reduce_bits_horiz) {
  if (alpha == 0 && beta == 0)
    warp_horizontal_filter_alpha0_beta0(ref, tmp, stride, ix4, iy4, sx4, alpha,
                                        beta, p_height, height, i,
                                        offset_bits_horiz, reduce_bits_horiz);
  else if (alpha == 0 && beta != 0)
    warp_horizontal_filter_alpha0(ref, tmp, stride, ix4, iy4, sx4, alpha, beta,
                                  p_height, height, i, offset_bits_horiz,
                                  reduce_bits_horiz);
  else if (alpha != 0 && beta == 0)
    warp_horizontal_filter_beta0(ref, tmp, stride, ix4, iy4, sx4, alpha, beta,
                                 p_height, height, i, offset_bits_horiz,
                                 reduce_bits_horiz);
  else
    warp_horizontal_filter(ref, tmp, stride, ix4, iy4, sx4, alpha, beta,
                           p_height, height, i, offset_bits_horiz,
                           reduce_bits_horiz);
}

void av1_warp_affine_sse4_1(const int32_t *mat, const uint8_t *ref, int width,
                            int height, int stride, uint8_t *pred, int p_col,
                            int p_row, int p_width, int p_height, int p_stride,
                            int subsampling_x, int subsampling_y,
                            ConvolveParams *conv_params, int16_t alpha,
                            int16_t beta, int16_t gamma, int16_t delta) {
  __m128i tmp[15];
  int i, j, k;
  const int bd = 8;
  const int reduce_bits_horiz = conv_params->round_0;
  const int reduce_bits_vert = conv_params->is_compound
                                   ? conv_params->round_1
                                   : 2 * FILTER_BITS - reduce_bits_horiz;
  const int offset_bits_horiz = bd + FILTER_BITS - 1;
  assert(IMPLIES(conv_params->is_compound, conv_params->dst != NULL));

  const int offset_bits_vert = bd + 2 * FILTER_BITS - reduce_bits_horiz;
  const __m128i reduce_bits_vert_const =
      _mm_set1_epi32(((1 << reduce_bits_vert) >> 1));
  const __m128i res_add_const = _mm_set1_epi32(1 << offset_bits_vert);
  const int round_bits =
      2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
  const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
  assert(IMPLIES(conv_params->do_average, conv_params->is_compound));

  /* Note: For this code to work, the left/right frame borders need to be
  extended by at least 13 pixels each. By the time we get here, other
  code will have set up this border, but we allow an explicit check
  for debugging purposes.
  */
  /*for (i = 0; i < height; ++i) {
  for (j = 0; j < 13; ++j) {
  assert(ref[i * stride - 13 + j] == ref[i * stride]);
  assert(ref[i * stride + width + j] == ref[i * stride + (width - 1)]);
  }
  }*/
  __m128i res_add_const_1;
  if (conv_params->is_compound == 1) {
    res_add_const_1 = _mm_add_epi32(reduce_bits_vert_const, res_add_const);
  } else {
    res_add_const_1 = _mm_set1_epi32(-(1 << (bd + reduce_bits_vert - 1)) +
                                     ((1 << reduce_bits_vert) >> 1));
  }

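  // For each 8x8 block of output, map the block centre through the affine
  // model
  //   x' = mat[2] * src_x + mat[3] * src_y + mat[0]
  //   y' = mat[4] * src_x + mat[5] * src_y + mat[1]
  // (with WARPEDMODEL_PREC_BITS fractional bits), then split the result into
  // integer sample positions (ix4, iy4) and fractional filter offsets
  // (sx4, sy4).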
  for (i = 0; i < p_height; i += 8) {
    for (j = 0; j < p_width; j += 8) {
      const int32_t src_x = (p_col + j + 4) << subsampling_x;
      const int32_t src_y = (p_row + i + 4) << subsampling_y;
      const int64_t dst_x =
          (int64_t)mat[2] * src_x + (int64_t)mat[3] * src_y + (int64_t)mat[0];
      const int64_t dst_y =
          (int64_t)mat[4] * src_x + (int64_t)mat[5] * src_y + (int64_t)mat[1];
      const int64_t x4 = dst_x >> subsampling_x;
      const int64_t y4 = dst_y >> subsampling_y;

      int32_t ix4 = (int32_t)(x4 >> WARPEDMODEL_PREC_BITS);
      int32_t sx4 = x4 & ((1 << WARPEDMODEL_PREC_BITS) - 1);
      int32_t iy4 = (int32_t)(y4 >> WARPEDMODEL_PREC_BITS);
      int32_t sy4 = y4 & ((1 << WARPEDMODEL_PREC_BITS) - 1);

      // Add in all the constant terms, including rounding and offset
      sx4 += alpha * (-4) + beta * (-4) + (1 << (WARPEDDIFF_PREC_BITS - 1)) +
             (WARPEDPIXEL_PREC_SHIFTS << WARPEDDIFF_PREC_BITS);
      sy4 += gamma * (-4) + delta * (-4) + (1 << (WARPEDDIFF_PREC_BITS - 1)) +
             (WARPEDPIXEL_PREC_SHIFTS << WARPEDDIFF_PREC_BITS);

      sx4 &= ~((1 << WARP_PARAM_REDUCE_BITS) - 1);
      sy4 &= ~((1 << WARP_PARAM_REDUCE_BITS) - 1);

      // Horizontal filter
      // If the block is aligned such that, after clamping, every sample
      // would be taken from the leftmost/rightmost column, then we can
      // skip the expensive horizontal filter.
      if (ix4 <= -7) {
        for (k = -7; k < AOMMIN(8, p_height - i); ++k) {
          int iy = iy4 + k;
          if (iy < 0)
            iy = 0;
          else if (iy > height - 1)
            iy = height - 1;
          tmp[k + 7] = _mm_set1_epi16(
              (1 << (bd + FILTER_BITS - reduce_bits_horiz - 1)) +
              ref[iy * stride] * (1 << (FILTER_BITS - reduce_bits_horiz)));
        }
      } else if (ix4 >= width + 6) {
        for (k = -7; k < AOMMIN(8, p_height - i); ++k) {
          int iy = iy4 + k;
          if (iy < 0)
            iy = 0;
          else if (iy > height - 1)
            iy = height - 1;
          tmp[k + 7] =
              _mm_set1_epi16((1 << (bd + FILTER_BITS - reduce_bits_horiz - 1)) +
                             ref[iy * stride + (width - 1)] *
                                 (1 << (FILTER_BITS - reduce_bits_horiz)));
        }
      } else if (((ix4 - 7) < 0) || ((ix4 + 9) > width)) {
        const int out_of_boundary_left = -(ix4 - 6);
        const int out_of_boundary_right = (ix4 + 8) - width;
        for (k = -7; k < AOMMIN(8, p_height - i); ++k) {
          int iy = iy4 + k;
          if (iy < 0)
            iy = 0;
          else if (iy > height - 1)
            iy = height - 1;
          int sx = sx4 + beta * (k + 4);

          // Load source pixels
          __m128i src =
              _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7));
          if (out_of_boundary_left >= 0) {
            const __m128i shuffle_reg_left =
                _mm_loadu_si128((__m128i *)warp_pad_left[out_of_boundary_left]);
            src = _mm_shuffle_epi8(src, shuffle_reg_left);
          }
          if (out_of_boundary_right >= 0) {
            const __m128i shuffle_reg_right = _mm_loadu_si128(
                (__m128i *)warp_pad_right[out_of_boundary_right]);
            src = _mm_shuffle_epi8(src, shuffle_reg_right);
          }
          horizontal_filter(src, tmp, sx, alpha, k, offset_bits_horiz,
                            reduce_bits_horiz);
        }
      } else {
        prepare_warp_horizontal_filter(ref, tmp, stride, ix4, iy4, sx4, alpha,
                                       beta, p_height, height, i,
                                       offset_bits_horiz, reduce_bits_horiz);
      }

      // Vertical filter
      prepare_warp_vertical_filter(
          pred, tmp, conv_params, gamma, delta, p_height, p_stride, p_width, i,
          j, sy4, reduce_bits_vert, &res_add_const_1, round_bits, offset_bits);
    }
  }
}
909