// Copyright 2019 The libgav1 Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "src/dsp/intrapred_smooth.h"
#include "src/utils/cpu.h"

#if LIBGAV1_TARGETING_SSE4_1

#include <xmmintrin.h>

#include <cassert>
#include <cstddef>
#include <cstdint>

#include "src/dsp/constants.h"
#include "src/dsp/dsp.h"
#include "src/dsp/x86/common_sse4.h"
#include "src/utils/common.h"
#include "src/utils/constants.h"

namespace libgav1 {
namespace dsp {
namespace low_bitdepth {
namespace {

// Note these constants are duplicated from intrapred.cc to give the compiler
// visibility of the values. This helps reduce loads and aids in the creation
// of the inverse weights.
constexpr uint8_t kSmoothWeights[] = {
#include "src/dsp/smooth_weights.inc"
};
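// The weights are stored contiguously, ordered by block dimension: four
// weights for dimension 4, then eight for dimension 8, sixteen for dimension
// 16, and so on. The weights for dimension d therefore start at offset d - 4,
// which matches the offsets used with kSmoothWeights throughout this file.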

template <int y_mask>
inline void WriteSmoothHorizontalSum4(void* LIBGAV1_RESTRICT const dest,
                                      const __m128i& left,
                                      const __m128i& weights,
                                      const __m128i& scaled_top_right,
                                      const __m128i& round) {
  const __m128i left_y = _mm_shuffle_epi32(left, y_mask);
  const __m128i weighted_left_y = _mm_mullo_epi16(left_y, weights);
  const __m128i pred_sum = _mm_add_epi32(scaled_top_right, weighted_left_y);
  // Equivalent to RightShiftWithRounding(pred[x][y], 8).
  const __m128i pred = _mm_srli_epi32(_mm_add_epi32(pred_sum, round), 8);
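  // The shuffle control 0x0C080400 gathers the low byte of each 32-bit lane
  // (bytes 0, 4, 8 and 12), packing the four results into the low four bytes
  // for the 4-byte store.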
  const __m128i cvtepi32_epi8 = _mm_set1_epi32(0x0C080400);
  Store4(dest, _mm_shuffle_epi8(pred, cvtepi32_epi8));
}

// For SMOOTH_H, |pixels| is the repeated left value for the row. For SMOOTH_V,
// |pixels| is a segment of the top row or the whole top row, and |weights| is
// repeated.
inline __m128i SmoothDirectionalSum8(const __m128i& pixels,
                                     const __m128i& weights,
                                     const __m128i& scaled_corner) {
  const __m128i weighted_px = _mm_mullo_epi16(pixels, weights);
  return _mm_add_epi16(scaled_corner, weighted_px);
}
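// In scalar terms, SmoothDirectionalSum8 computes, per 16-bit lane,
// scaled_corner[x] + pixels[x] * weights[x]; the Write* helpers below then add
// the rounding constant and shift right by the 8-bit weight scale.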

inline void WriteSmoothDirectionalSum8(uint8_t* LIBGAV1_RESTRICT dest,
                                       const __m128i& pixels,
                                       const __m128i& weights,
                                       const __m128i& scaled_corner,
                                       const __m128i& round) {
  const __m128i pred_sum =
      SmoothDirectionalSum8(pixels, weights, scaled_corner);
  // Equivalent to RightShiftWithRounding(pred[x][y], 8).
  const __m128i pred = _mm_srli_epi16(_mm_add_epi16(pred_sum, round), 8);
  StoreLo8(dest, _mm_packus_epi16(pred, pred));
}

// For Horizontal, pixels1 and pixels2 are the same repeated value. For
// Vertical, weights1 and weights2 are the same, and scaled_corner1 and
// scaled_corner2 are the same.
inline void WriteSmoothDirectionalSum16(
    uint8_t* LIBGAV1_RESTRICT dest, const __m128i& pixels1,
    const __m128i& pixels2, const __m128i& weights1, const __m128i& weights2,
    const __m128i& scaled_corner1, const __m128i& scaled_corner2,
    const __m128i& round) {
  const __m128i weighted_px1 = _mm_mullo_epi16(pixels1, weights1);
  const __m128i weighted_px2 = _mm_mullo_epi16(pixels2, weights2);
  const __m128i pred_sum1 = _mm_add_epi16(scaled_corner1, weighted_px1);
  const __m128i pred_sum2 = _mm_add_epi16(scaled_corner2, weighted_px2);
  // Equivalent to RightShiftWithRounding(pred[x][y], 8).
  const __m128i pred1 = _mm_srli_epi16(_mm_add_epi16(pred_sum1, round), 8);
  const __m128i pred2 = _mm_srli_epi16(_mm_add_epi16(pred_sum2, round), 8);
  StoreUnaligned16(dest, _mm_packus_epi16(pred1, pred2));
}
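// In WriteSmoothDirectionalSum16, _mm_packus_epi16 saturates the two
// eight-lane 16-bit results into sixteen bytes, which are written with a
// single unaligned store.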

template <int y_mask>
inline void WriteSmoothPredSum4(uint8_t* LIBGAV1_RESTRICT const dest,
                                const __m128i& top, const __m128i& left,
                                const __m128i& weights_x,
                                const __m128i& weights_y,
                                const __m128i& scaled_bottom_left,
                                const __m128i& scaled_top_right,
                                const __m128i& round) {
  const __m128i left_y = _mm_shuffle_epi32(left, y_mask);
  const __m128i weighted_left_y = _mm_mullo_epi32(left_y, weights_x);
  const __m128i weight_y = _mm_shuffle_epi32(weights_y, y_mask);
  const __m128i weighted_top = _mm_mullo_epi32(weight_y, top);
  const __m128i scaled_bottom_left_y =
      _mm_shuffle_epi32(scaled_bottom_left, y_mask);
  const __m128i col_pred = _mm_add_epi32(scaled_bottom_left_y, weighted_left_y);
  const __m128i row_pred = _mm_add_epi32(scaled_top_right, weighted_top);
  const __m128i pred_sum = _mm_add_epi32(row_pred, col_pred);

  // Equivalent to RightShiftWithRounding(pred[x][y], 9).
  const __m128i pred = _mm_srli_epi32(_mm_add_epi32(pred_sum, round), 9);

  const __m128i cvtepi32_epi8 = _mm_set1_epi32(0x0C080400);
  Store4(dest, _mm_shuffle_epi8(pred, cvtepi32_epi8));
}

// pixels[0]: above and below_pred interleave vector
// pixels[1]: left vector
// pixels[2]: right_pred vector
inline void LoadSmoothPixels4(const uint8_t* LIBGAV1_RESTRICT above,
                              const uint8_t* LIBGAV1_RESTRICT left,
                              const int height, __m128i* pixels) {
  if (height == 4) {
    pixels[1] = Load4(left);
  } else if (height == 8) {
    pixels[1] = LoadLo8(left);
  } else {
    pixels[1] = LoadUnaligned16(left);
  }

  const __m128i bottom_left = _mm_set1_epi16(left[height - 1]);
  const __m128i top = _mm_cvtepu8_epi16(Load4(above));
  pixels[0] = _mm_unpacklo_epi16(top, bottom_left);
  pixels[2] = _mm_set1_epi16(above[3]);
}

// weight_h[0]: weight_h vector
// weight_h[1]: scale - weight_h vector
// weight_h[2]: same as [0], second half for height = 16 only
// weight_h[3]: same as [1], second half for height = 16 only
// weight_w[0]: weights_w and scale - weights_w interleave vector
inline void LoadSmoothWeights4(const uint8_t* LIBGAV1_RESTRICT weight_array,
                               const int height, __m128i* weight_h,
                               __m128i* weight_w) {
  const __m128i scale = _mm_set1_epi16(256);
  const __m128i x_weights = Load4(weight_array);
  weight_h[0] = _mm_cvtepu8_epi16(x_weights);
  weight_h[1] = _mm_sub_epi16(scale, weight_h[0]);
  weight_w[0] = _mm_unpacklo_epi16(weight_h[0], weight_h[1]);

  if (height == 8) {
    const __m128i y_weights = LoadLo8(weight_array + 4);
    weight_h[0] = _mm_cvtepu8_epi16(y_weights);
    weight_h[1] = _mm_sub_epi16(scale, weight_h[0]);
  } else if (height == 16) {
    const __m128i zero = _mm_setzero_si128();
    const __m128i y_weights = LoadUnaligned16(weight_array + 12);
    weight_h[0] = _mm_cvtepu8_epi16(y_weights);
    weight_h[1] = _mm_sub_epi16(scale, weight_h[0]);
    weight_h[2] = _mm_unpackhi_epi8(y_weights, zero);
    weight_h[3] = _mm_sub_epi16(scale, weight_h[2]);
  }
}

inline void WriteSmoothPred4x8(const __m128i* pixel, const __m128i* weights_y,
                               const __m128i* weight_x,
                               uint8_t* LIBGAV1_RESTRICT dst,
                               const ptrdiff_t stride,
                               const bool use_second_half) {
  const __m128i round = _mm_set1_epi32(256);
  const __m128i mask_increment = _mm_set1_epi16(0x0202);
  const __m128i cvtepi32_epi8 = _mm_set1_epi32(0x0C080400);
  const __m128i zero = _mm_setzero_si128();
  const __m128i left = use_second_half ? _mm_unpackhi_epi8(pixel[1], zero)
                                       : _mm_unpacklo_epi8(pixel[1], zero);
  __m128i y_select = _mm_set1_epi16(0x0100);

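  // The control 0x0100 replicates the first 16-bit element of each source to
  // every lane; adding 0x0202 per row advances the selection to the next
  // 16-bit element.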
  for (int i = 0; i < 8; ++i) {
    const __m128i weight_y = _mm_shuffle_epi8(weights_y[0], y_select);
    const __m128i inverted_weight_y = _mm_shuffle_epi8(weights_y[1], y_select);
    const __m128i interleaved_weights =
        _mm_unpacklo_epi16(weight_y, inverted_weight_y);
    __m128i vertical_pred = _mm_madd_epi16(pixel[0], interleaved_weights);

    __m128i horizontal_vect = _mm_shuffle_epi8(left, y_select);
    horizontal_vect = _mm_unpacklo_epi16(horizontal_vect, pixel[2]);
    __m128i sum = _mm_madd_epi16(horizontal_vect, weight_x[0]);

    sum = _mm_add_epi32(vertical_pred, sum);
    sum = _mm_add_epi32(sum, round);
    sum = _mm_srai_epi32(sum, 9);

    sum = _mm_shuffle_epi8(sum, cvtepi32_epi8);
    Store4(dst, sum);
    dst += stride;

    y_select = _mm_add_epi16(y_select, mask_increment);
  }
}

// The interleaving approach has some overhead that causes it to underperform
// in the 4x4 case.
void Smooth4x4_SSE4_1(void* LIBGAV1_RESTRICT const dest, const ptrdiff_t stride,
                      const void* LIBGAV1_RESTRICT top_row,
                      const void* LIBGAV1_RESTRICT left_column) {
  const __m128i top = _mm_cvtepu8_epi32(Load4(top_row));
  const __m128i left = _mm_cvtepu8_epi32(Load4(left_column));
  const __m128i weights = _mm_cvtepu8_epi32(Load4(kSmoothWeights));
  const __m128i scale = _mm_set1_epi32(256);
  // The fourth 32-bit element, top_row[3], is broadcast to all lanes.
  const __m128i top_right = _mm_shuffle_epi32(top, 0xFF);
  // The fourth 32-bit element, left_column[3], is broadcast to all lanes.
  const __m128i bottom_left = _mm_shuffle_epi32(left, 0xFF);
  const __m128i inverted_weights = _mm_sub_epi32(scale, weights);
  const __m128i scaled_top_right = _mm_mullo_epi16(inverted_weights, top_right);
  const __m128i scaled_bottom_left =
      _mm_mullo_epi16(inverted_weights, bottom_left);
  auto* dst = static_cast<uint8_t*>(dest);
  // AV1 spec 7.11.2.6 (3) describes the sum:
  //   smoothPred[y][x:x+3] = weighted_top + scaled_right + weighted_left[y] +
  //                          scaled_bottom[y]
  // This could be a loop, but the shuffle immediates must be compile-time
  // constants.
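  // As a scalar reference (a sketch using the 256 weight scale from above):
  //   pred[y][x] = RightShiftWithRounding(
  //       weights[y] * top[x] + (256 - weights[y]) * left[3] +
  //       weights[x] * left[y] + (256 - weights[x]) * top[3], 9);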
  WriteSmoothPredSum4<0>(dst, top, left, weights, weights, scaled_bottom_left,
                         scaled_top_right, scale);
  dst += stride;
  WriteSmoothPredSum4<0x55>(dst, top, left, weights, weights,
                            scaled_bottom_left, scaled_top_right, scale);
  dst += stride;
  WriteSmoothPredSum4<0xAA>(dst, top, left, weights, weights,
                            scaled_bottom_left, scaled_top_right, scale);
  dst += stride;
  WriteSmoothPredSum4<0xFF>(dst, top, left, weights, weights,
                            scaled_bottom_left, scaled_top_right, scale);
}

void Smooth4x8_SSE4_1(void* LIBGAV1_RESTRICT const dest, const ptrdiff_t stride,
                      const void* LIBGAV1_RESTRICT top_row,
                      const void* LIBGAV1_RESTRICT left_column) {
  const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
  const auto* const top_ptr = static_cast<const uint8_t*>(top_row);
  __m128i weights_x[1];
  __m128i weights_y[2];
  LoadSmoothWeights4(kSmoothWeights, 8, weights_y, weights_x);
  __m128i pixels[3];
  LoadSmoothPixels4(top_ptr, left_ptr, 8, pixels);
  auto* dst = static_cast<uint8_t*>(dest);
  WriteSmoothPred4x8(pixels, weights_y, weights_x, dst, stride, false);
}

void Smooth4x16_SSE4_1(void* LIBGAV1_RESTRICT const dest,
                       const ptrdiff_t stride,
                       const void* LIBGAV1_RESTRICT top_row,
                       const void* LIBGAV1_RESTRICT left_column) {
  const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
  const auto* const top_ptr = static_cast<const uint8_t*>(top_row);
  __m128i weights_x[1];
  __m128i weights_y[4];
  LoadSmoothWeights4(kSmoothWeights, 16, weights_y, weights_x);
  __m128i pixels[3];
  LoadSmoothPixels4(top_ptr, left_ptr, 16, pixels);
  auto* dst = static_cast<uint8_t*>(dest);
  WriteSmoothPred4x8(pixels, weights_y, weights_x, dst, stride, false);
  dst += stride << 3;
  WriteSmoothPred4x8(pixels, &weights_y[2], weights_x, dst, stride, true);
}

// pixels[0]: above and below_pred interleave vector, first half
// pixels[1]: above and below_pred interleave vector, second half
// pixels[2]: left vector
// pixels[3]: right_pred vector
// pixels[4]: above and below_pred interleave vector, first half
// pixels[5]: above and below_pred interleave vector, second half
// pixels[6]: left vector + 16
// pixels[7]: right_pred vector
inline void LoadSmoothPixels8(const uint8_t* LIBGAV1_RESTRICT above,
                              const uint8_t* LIBGAV1_RESTRICT left,
                              const int height, __m128i* pixels) {
  const __m128i bottom_left = _mm_set1_epi16(left[height - 1]);
  __m128i top_row = _mm_cvtepu8_epi16(LoadLo8(above));
  pixels[0] = _mm_unpacklo_epi16(top_row, bottom_left);
  pixels[1] = _mm_unpackhi_epi16(top_row, bottom_left);

  pixels[3] = _mm_set1_epi16(above[7]);

  if (height == 4) {
    pixels[2] = Load4(left);
  } else if (height == 8) {
    pixels[2] = LoadLo8(left);
  } else if (height == 16) {
    pixels[2] = LoadUnaligned16(left);
  } else {
    pixels[2] = LoadUnaligned16(left);
    pixels[4] = pixels[0];
    pixels[5] = pixels[1];
    pixels[6] = LoadUnaligned16(left + 16);
    pixels[7] = pixels[3];
  }
}

// weight_h[0]: weight_h vector
// weight_h[1]: scale - weight_h vector
// weight_h[2]: same as [0], offset 8
// weight_h[3]: same as [1], offset 8
// weight_h[4]: same as [0], offset 16
// weight_h[5]: same as [1], offset 16
// weight_h[6]: same as [0], offset 24
// weight_h[7]: same as [1], offset 24
// weight_w[0]: weights_w and scale - weights_w interleave vector, first half
// weight_w[1]: weights_w and scale - weights_w interleave vector, second half
inline void LoadSmoothWeights8(const uint8_t* LIBGAV1_RESTRICT weight_array,
                               const int height, __m128i* weight_w,
                               __m128i* weight_h) {
  const int offset = (height < 8) ? 0 : 4;
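  // For height >= 8 this loads the dimension-8 weights directly (offset 4);
  // for height 4 the load starts at the dimension-4 weights and the width-8
  // x weights are recovered below by shifting past the first four bytes.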
  __m128i loaded_weights = LoadUnaligned16(&weight_array[offset]);
  weight_h[0] = _mm_cvtepu8_epi16(loaded_weights);
  const __m128i inverter = _mm_set1_epi16(256);
  weight_h[1] = _mm_sub_epi16(inverter, weight_h[0]);

  if (height == 4) {
    loaded_weights = _mm_srli_si128(loaded_weights, 4);
    __m128i weights_x = _mm_cvtepu8_epi16(loaded_weights);
    __m128i inverted_weights_x = _mm_sub_epi16(inverter, weights_x);
    weight_w[0] = _mm_unpacklo_epi16(weights_x, inverted_weights_x);
    weight_w[1] = _mm_unpackhi_epi16(weights_x, inverted_weights_x);
  } else {
    weight_w[0] = _mm_unpacklo_epi16(weight_h[0], weight_h[1]);
    weight_w[1] = _mm_unpackhi_epi16(weight_h[0], weight_h[1]);
  }

  if (height == 16) {
    const __m128i zero = _mm_setzero_si128();
    loaded_weights = LoadUnaligned16(weight_array + 12);
    weight_h[0] = _mm_cvtepu8_epi16(loaded_weights);
    weight_h[1] = _mm_sub_epi16(inverter, weight_h[0]);
    weight_h[2] = _mm_unpackhi_epi8(loaded_weights, zero);
    weight_h[3] = _mm_sub_epi16(inverter, weight_h[2]);
  } else if (height == 32) {
    const __m128i zero = _mm_setzero_si128();
    const __m128i weight_lo = LoadUnaligned16(weight_array + 28);
    weight_h[0] = _mm_cvtepu8_epi16(weight_lo);
    weight_h[1] = _mm_sub_epi16(inverter, weight_h[0]);
    weight_h[2] = _mm_unpackhi_epi8(weight_lo, zero);
    weight_h[3] = _mm_sub_epi16(inverter, weight_h[2]);
    const __m128i weight_hi = LoadUnaligned16(weight_array + 44);
    weight_h[4] = _mm_cvtepu8_epi16(weight_hi);
    weight_h[5] = _mm_sub_epi16(inverter, weight_h[4]);
    weight_h[6] = _mm_unpackhi_epi8(weight_hi, zero);
    weight_h[7] = _mm_sub_epi16(inverter, weight_h[6]);
  }
}

inline void WriteSmoothPred8xH(const __m128i* pixels, const __m128i* weights_x,
                               const __m128i* weights_y, const int height,
                               uint8_t* LIBGAV1_RESTRICT dst,
                               const ptrdiff_t stride,
                               const bool use_second_half) {
  const __m128i round = _mm_set1_epi32(256);
  const __m128i mask_increment = _mm_set1_epi16(0x0202);
  const __m128i cvt_epu16_epi8 = _mm_set_epi32(0, 0, 0xe0c0a08, 0x6040200);

  const __m128i zero = _mm_setzero_si128();
  const __m128i left = use_second_half ? _mm_unpackhi_epi8(pixels[2], zero)
                                       : _mm_unpacklo_epi8(pixels[2], zero);
  __m128i y_select = _mm_set1_epi16(0x100);

  for (int i = 0; i < height; ++i) {
    const __m128i weight_y = _mm_shuffle_epi8(weights_y[0], y_select);
    const __m128i inverted_weight_y = _mm_shuffle_epi8(weights_y[1], y_select);
    const __m128i interleaved_weights =
        _mm_unpacklo_epi16(weight_y, inverted_weight_y);
    const __m128i vertical_sum0 =
        _mm_madd_epi16(pixels[0], interleaved_weights);
    const __m128i vertical_sum1 =
        _mm_madd_epi16(pixels[1], interleaved_weights);

    __m128i horizontal_pixels = _mm_shuffle_epi8(left, y_select);
    horizontal_pixels = _mm_unpacklo_epi16(horizontal_pixels, pixels[3]);
    const __m128i horizontal_sum0 =
        _mm_madd_epi16(horizontal_pixels, weights_x[0]);
    const __m128i horizontal_sum1 =
        _mm_madd_epi16(horizontal_pixels, weights_x[1]);

    __m128i sum0 = _mm_add_epi32(vertical_sum0, horizontal_sum0);
    sum0 = _mm_add_epi32(sum0, round);
    sum0 = _mm_srai_epi32(sum0, 9);

    __m128i sum1 = _mm_add_epi32(vertical_sum1, horizontal_sum1);
    sum1 = _mm_add_epi32(sum1, round);
    sum1 = _mm_srai_epi32(sum1, 9);

    sum0 = _mm_packus_epi16(sum0, sum1);
    sum0 = _mm_shuffle_epi8(sum0, cvt_epu16_epi8);
    StoreLo8(dst, sum0);
    dst += stride;

    y_select = _mm_add_epi16(y_select, mask_increment);
  }
}

void Smooth8x4_SSE4_1(void* LIBGAV1_RESTRICT const dest, const ptrdiff_t stride,
                      const void* LIBGAV1_RESTRICT top_row,
                      const void* LIBGAV1_RESTRICT left_column) {
  const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
  const auto* const top_ptr = static_cast<const uint8_t*>(top_row);
  __m128i pixels[4];
  LoadSmoothPixels8(top_ptr, left_ptr, 4, pixels);

  __m128i weights_x[2], weights_y[2];
  LoadSmoothWeights8(kSmoothWeights, 4, weights_x, weights_y);

  auto* dst = static_cast<uint8_t*>(dest);
  WriteSmoothPred8xH(pixels, weights_x, weights_y, 4, dst, stride, false);
}

void Smooth8x8_SSE4_1(void* LIBGAV1_RESTRICT const dest, const ptrdiff_t stride,
                      const void* LIBGAV1_RESTRICT top_row,
                      const void* LIBGAV1_RESTRICT left_column) {
  const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
  const auto* const top_ptr = static_cast<const uint8_t*>(top_row);

  __m128i pixels[4];
  LoadSmoothPixels8(top_ptr, left_ptr, 8, pixels);

  __m128i weights_x[2], weights_y[2];
  LoadSmoothWeights8(kSmoothWeights, 8, weights_x, weights_y);

  auto* dst = static_cast<uint8_t*>(dest);
  WriteSmoothPred8xH(pixels, weights_x, weights_y, 8, dst, stride, false);
}

void Smooth8x16_SSE4_1(void* LIBGAV1_RESTRICT const dest,
                       const ptrdiff_t stride,
                       const void* LIBGAV1_RESTRICT top_row,
                       const void* LIBGAV1_RESTRICT left_column) {
  const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
  const auto* const top_ptr = static_cast<const uint8_t*>(top_row);
  __m128i pixels[4];
  LoadSmoothPixels8(top_ptr, left_ptr, 16, pixels);

  __m128i weights_x[2], weights_y[4];
  LoadSmoothWeights8(kSmoothWeights, 16, weights_x, weights_y);

  auto* dst = static_cast<uint8_t*>(dest);
  WriteSmoothPred8xH(pixels, weights_x, weights_y, 8, dst, stride, false);
  dst += stride << 3;
  WriteSmoothPred8xH(pixels, weights_x, &weights_y[2], 8, dst, stride, true);
}

void Smooth8x32_SSE4_1(void* LIBGAV1_RESTRICT const dest,
                       const ptrdiff_t stride,
                       const void* LIBGAV1_RESTRICT top_row,
                       const void* LIBGAV1_RESTRICT left_column) {
  const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
  const auto* const top_ptr = static_cast<const uint8_t*>(top_row);
  __m128i pixels[8];
  LoadSmoothPixels8(top_ptr, left_ptr, 32, pixels);

  __m128i weights_x[2], weights_y[8];
  LoadSmoothWeights8(kSmoothWeights, 32, weights_x, weights_y);

  auto* dst = static_cast<uint8_t*>(dest);
  WriteSmoothPred8xH(pixels, weights_x, weights_y, 8, dst, stride, false);
  dst += stride << 3;
  WriteSmoothPred8xH(pixels, weights_x, &weights_y[2], 8, dst, stride, true);
  dst += stride << 3;
  WriteSmoothPred8xH(&pixels[4], weights_x, &weights_y[4], 8, dst, stride,
                     false);
  dst += stride << 3;
  WriteSmoothPred8xH(&pixels[4], weights_x, &weights_y[6], 8, dst, stride,
                     true);
}

template <int width, int height>
void SmoothWxH(void* LIBGAV1_RESTRICT const dest, const ptrdiff_t stride,
               const void* LIBGAV1_RESTRICT const top_row,
               const void* LIBGAV1_RESTRICT const left_column) {
  const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
  const auto* const top_ptr = static_cast<const uint8_t*>(top_row);
  const uint8_t* const sm_weights_h = kSmoothWeights + height - 4;
  const uint8_t* const sm_weights_w = kSmoothWeights + width - 4;
  const __m128i zero = _mm_setzero_si128();
  const __m128i scale_value = _mm_set1_epi16(256);
  const __m128i bottom_left = _mm_cvtsi32_si128(left_ptr[height - 1]);
  const __m128i top_right = _mm_set1_epi16(top_ptr[width - 1]);
  const __m128i round = _mm_set1_epi32(256);
  auto* dst = static_cast<uint8_t*>(dest);
  for (int y = 0; y < height; ++y) {
    const __m128i weights_y = _mm_cvtsi32_si128(sm_weights_h[y]);
    const __m128i left_y = _mm_cvtsi32_si128(left_ptr[y]);
    const __m128i scale_m_weights_y = _mm_sub_epi16(scale_value, weights_y);
    __m128i scaled_bottom_left =
        _mm_mullo_epi16(scale_m_weights_y, bottom_left);
    const __m128i weight_left_y =
        _mm_shuffle_epi32(_mm_unpacklo_epi16(weights_y, left_y), 0);
    scaled_bottom_left = _mm_add_epi32(scaled_bottom_left, round);
    scaled_bottom_left = _mm_shuffle_epi32(scaled_bottom_left, 0);
    for (int x = 0; x < width; x += 8) {
      const __m128i top_x = LoadLo8(top_ptr + x);
      const __m128i weights_x = LoadLo8(sm_weights_w + x);
      const __m128i top_weights_x = _mm_unpacklo_epi8(top_x, weights_x);
      const __m128i top_weights_x_lo = _mm_cvtepu8_epi16(top_weights_x);
      const __m128i top_weights_x_hi = _mm_unpackhi_epi8(top_weights_x, zero);

      // Here opposite weights and pixels are multiplied, where the order of
      // interleaving is indicated in the names.
      __m128i pred_lo = _mm_madd_epi16(top_weights_x_lo, weight_left_y);
      __m128i pred_hi = _mm_madd_epi16(top_weights_x_hi, weight_left_y);

      // |scaled_bottom_left| is always scaled by the same weight each row, so
      // we only derive |scaled_top_right| values here.
      const __m128i inverted_weights_x =
          _mm_sub_epi16(scale_value, _mm_cvtepu8_epi16(weights_x));
      const __m128i scaled_top_right =
          _mm_mullo_epi16(inverted_weights_x, top_right);
      const __m128i scaled_top_right_lo = _mm_cvtepu16_epi32(scaled_top_right);
      const __m128i scaled_top_right_hi =
          _mm_unpackhi_epi16(scaled_top_right, zero);
      pred_lo = _mm_add_epi32(pred_lo, scaled_bottom_left);
      pred_hi = _mm_add_epi32(pred_hi, scaled_bottom_left);
      pred_lo = _mm_add_epi32(pred_lo, scaled_top_right_lo);
      pred_hi = _mm_add_epi32(pred_hi, scaled_top_right_hi);

      // The round value for RightShiftWithRounding was added with
      // |scaled_bottom_left|.
      pred_lo = _mm_srli_epi32(pred_lo, 9);
      pred_hi = _mm_srli_epi32(pred_hi, 9);
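      // After the shift each 32-bit lane holds an 8-bit value, so two rounds
      // of 16-bit packing narrow the eight lanes down to eight output bytes.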
      const __m128i pred = _mm_packus_epi16(pred_lo, pred_hi);
      StoreLo8(dst + x, _mm_packus_epi16(pred, pred));
    }
    dst += stride;
  }
}
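// The dispatch code that instantiates this template is outside this excerpt;
// a 16x16 block, for example, would presumably use SmoothWxH<16, 16>.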

void SmoothHorizontal4x4_SSE4_1(void* LIBGAV1_RESTRICT dest,
                                const ptrdiff_t stride,
                                const void* LIBGAV1_RESTRICT top_row,
                                const void* LIBGAV1_RESTRICT left_column) {
  const auto* const top_ptr = static_cast<const uint8_t*>(top_row);
  const __m128i top_right = _mm_set1_epi32(top_ptr[3]);
  const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
  const __m128i left = _mm_cvtepu8_epi32(Load4(left_ptr));
  const __m128i weights = _mm_cvtepu8_epi32(Load4(kSmoothWeights));
  __m128i scale = _mm_set1_epi32(256);
  const __m128i inverted_weights = _mm_sub_epi32(scale, weights);
  const __m128i scaled_top_right = _mm_mullo_epi16(inverted_weights, top_right);
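  // |scale| is reused as the rounding constant (128 == 1 << 7) for the shift
  // by 8 inside WriteSmoothHorizontalSum4.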
  scale = _mm_set1_epi32(128);
  auto* dst = static_cast<uint8_t*>(dest);
  WriteSmoothHorizontalSum4<0>(dst, left, weights, scaled_top_right, scale);
  dst += stride;
  WriteSmoothHorizontalSum4<0x55>(dst, left, weights, scaled_top_right, scale);
  dst += stride;
  WriteSmoothHorizontalSum4<0xAA>(dst, left, weights, scaled_top_right, scale);
  dst += stride;
  WriteSmoothHorizontalSum4<0xFF>(dst, left, weights, scaled_top_right, scale);
}

void SmoothHorizontal4x8_SSE4_1(
    void* LIBGAV1_RESTRICT const dest, const ptrdiff_t stride,
    const void* LIBGAV1_RESTRICT const top_row,
    const void* LIBGAV1_RESTRICT const left_column) {
  const auto* const top = static_cast<const uint8_t*>(top_row);
  const __m128i top_right = _mm_set1_epi32(top[3]);
  const __m128i weights = _mm_cvtepu8_epi32(Load4(kSmoothWeights));
  __m128i scale = _mm_set1_epi32(256);
  const __m128i inverted_weights = _mm_sub_epi32(scale, weights);
  const __m128i scaled_top_right = _mm_mullo_epi16(inverted_weights, top_right);
  scale = _mm_set1_epi32(128);
  const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
  __m128i left = _mm_cvtepu8_epi32(Load4(left_column));
  auto* dst = static_cast<uint8_t*>(dest);
  WriteSmoothHorizontalSum4<0>(dst, left, weights, scaled_top_right, scale);
  dst += stride;
  WriteSmoothHorizontalSum4<0x55>(dst, left, weights, scaled_top_right, scale);
  dst += stride;
  WriteSmoothHorizontalSum4<0xAA>(dst, left, weights, scaled_top_right, scale);
  dst += stride;
  WriteSmoothHorizontalSum4<0xFF>(dst, left, weights, scaled_top_right, scale);
  dst += stride;

  left = _mm_cvtepu8_epi32(Load4(left_ptr + 4));
  WriteSmoothHorizontalSum4<0>(dst, left, weights, scaled_top_right, scale);
  dst += stride;
  WriteSmoothHorizontalSum4<0x55>(dst, left, weights, scaled_top_right, scale);
  dst += stride;
  WriteSmoothHorizontalSum4<0xAA>(dst, left, weights, scaled_top_right, scale);
  dst += stride;
  WriteSmoothHorizontalSum4<0xFF>(dst, left, weights, scaled_top_right, scale);
}

void SmoothHorizontal4x16_SSE4_1(
    void* LIBGAV1_RESTRICT const dest, const ptrdiff_t stride,
    const void* LIBGAV1_RESTRICT const top_row,
    const void* LIBGAV1_RESTRICT const left_column) {
  const auto* const top = static_cast<const uint8_t*>(top_row);
  const __m128i top_right = _mm_set1_epi32(top[3]);
  const __m128i weights = _mm_cvtepu8_epi32(Load4(kSmoothWeights));
  __m128i scale = _mm_set1_epi32(256);
  const __m128i inverted_weights = _mm_sub_epi32(scale, weights);
  const __m128i scaled_top_right = _mm_mullo_epi16(inverted_weights, top_right);
  scale = _mm_set1_epi32(128);
  const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
  __m128i left = _mm_cvtepu8_epi32(Load4(left_column));
  auto* dst = static_cast<uint8_t*>(dest);
  WriteSmoothHorizontalSum4<0>(dst, left, weights, scaled_top_right, scale);
  dst += stride;
  WriteSmoothHorizontalSum4<0x55>(dst, left, weights, scaled_top_right, scale);
  dst += stride;
  WriteSmoothHorizontalSum4<0xAA>(dst, left, weights, scaled_top_right, scale);
  dst += stride;
  WriteSmoothHorizontalSum4<0xFF>(dst, left, weights, scaled_top_right, scale);
  dst += stride;

  left = _mm_cvtepu8_epi32(Load4(left_ptr + 4));
  WriteSmoothHorizontalSum4<0>(dst, left, weights, scaled_top_right, scale);
  dst += stride;
  WriteSmoothHorizontalSum4<0x55>(dst, left, weights, scaled_top_right, scale);
  dst += stride;
  WriteSmoothHorizontalSum4<0xAA>(dst, left, weights, scaled_top_right, scale);
  dst += stride;
  WriteSmoothHorizontalSum4<0xFF>(dst, left, weights, scaled_top_right, scale);
  dst += stride;

  left = _mm_cvtepu8_epi32(Load4(left_ptr + 8));
  WriteSmoothHorizontalSum4<0>(dst, left, weights, scaled_top_right, scale);
  dst += stride;
  WriteSmoothHorizontalSum4<0x55>(dst, left, weights, scaled_top_right, scale);
  dst += stride;
  WriteSmoothHorizontalSum4<0xAA>(dst, left, weights, scaled_top_right, scale);
  dst += stride;
  WriteSmoothHorizontalSum4<0xFF>(dst, left, weights, scaled_top_right, scale);
  dst += stride;

  left = _mm_cvtepu8_epi32(Load4(left_ptr + 12));
  WriteSmoothHorizontalSum4<0>(dst, left, weights, scaled_top_right, scale);
  dst += stride;
  WriteSmoothHorizontalSum4<0x55>(dst, left, weights, scaled_top_right, scale);
  dst += stride;
  WriteSmoothHorizontalSum4<0xAA>(dst, left, weights, scaled_top_right, scale);
  dst += stride;
  WriteSmoothHorizontalSum4<0xFF>(dst, left, weights, scaled_top_right, scale);
}

void SmoothHorizontal8x4_SSE4_1(
    void* LIBGAV1_RESTRICT const dest, const ptrdiff_t stride,
    const void* LIBGAV1_RESTRICT const top_row,
    const void* LIBGAV1_RESTRICT const left_column) {
  const auto* const top = static_cast<const uint8_t*>(top_row);
  const __m128i top_right = _mm_set1_epi16(top[7]);
  const __m128i left = _mm_cvtepu8_epi16(Load4(left_column));
  const __m128i weights = _mm_cvtepu8_epi16(LoadLo8(kSmoothWeights + 4));
  __m128i scale = _mm_set1_epi16(256);
  const __m128i inverted_weights = _mm_sub_epi16(scale, weights);
  const __m128i scaled_top_right = _mm_mullo_epi16(inverted_weights, top_right);
  scale = _mm_set1_epi16(128);
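  // Each y_select mask replicates one 16-bit left value to every lane:
  // 0x01000100 selects bytes {0,1} (row 0), 0x03020302 bytes {2,3} (row 1),
  // and so on.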
  __m128i y_select = _mm_set1_epi32(0x01000100);
  __m128i left_y = _mm_shuffle_epi8(left, y_select);
  auto* dst = static_cast<uint8_t*>(dest);
  WriteSmoothDirectionalSum8(dst, left_y, weights, scaled_top_right, scale);
  dst += stride;
  y_select = _mm_set1_epi32(0x03020302);
  left_y = _mm_shuffle_epi8(left, y_select);
  WriteSmoothDirectionalSum8(dst, left_y, weights, scaled_top_right, scale);
  dst += stride;
  y_select = _mm_set1_epi32(0x05040504);
  left_y = _mm_shuffle_epi8(left, y_select);
  WriteSmoothDirectionalSum8(dst, left_y, weights, scaled_top_right, scale);
  dst += stride;
  y_select = _mm_set1_epi32(0x07060706);
  left_y = _mm_shuffle_epi8(left, y_select);
  WriteSmoothDirectionalSum8(dst, left_y, weights, scaled_top_right, scale);
}

void SmoothHorizontal8x8_SSE4_1(
    void* LIBGAV1_RESTRICT const dest, const ptrdiff_t stride,
    const void* LIBGAV1_RESTRICT const top_row,
    const void* LIBGAV1_RESTRICT const left_column) {
  const auto* const top = static_cast<const uint8_t*>(top_row);
  const __m128i top_right = _mm_set1_epi16(top[7]);
  const __m128i left = _mm_cvtepu8_epi16(LoadLo8(left_column));
  const __m128i weights = _mm_cvtepu8_epi16(LoadLo8(kSmoothWeights + 4));
  __m128i scale = _mm_set1_epi16(256);
  const __m128i inverted_weights = _mm_sub_epi16(scale, weights);
  const __m128i scaled_top_right = _mm_mullo_epi16(inverted_weights, top_right);
  scale = _mm_set1_epi16(128);
  auto* dst = static_cast<uint8_t*>(dest);
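  // The loop steps through masks 0x01000100, 0x03020302, ... 0x0F0E0F0E,
  // selecting each of the eight 16-bit left values in turn; the bound
  // 0x0F0E0F0F is just past the final mask.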
  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
    const __m128i y_select = _mm_set1_epi32(y_mask);
    const __m128i left_y = _mm_shuffle_epi8(left, y_select);
    WriteSmoothDirectionalSum8(dst, left_y, weights, scaled_top_right, scale);
    dst += stride;
  }
}

void SmoothHorizontal8x16_SSE4_1(
    void* LIBGAV1_RESTRICT const dest, const ptrdiff_t stride,
    const void* LIBGAV1_RESTRICT const top_row,
    const void* LIBGAV1_RESTRICT const left_column) {
  const auto* const top = static_cast<const uint8_t*>(top_row);
  const __m128i top_right = _mm_set1_epi16(top[7]);
  const __m128i weights = _mm_cvtepu8_epi16(LoadLo8(kSmoothWeights + 4));
  __m128i scale = _mm_set1_epi16(256);
  const __m128i inverted_weights = _mm_sub_epi16(scale, weights);
  const __m128i scaled_top_right = _mm_mullo_epi16(inverted_weights, top_right);
  scale = _mm_set1_epi16(128);
  const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
  __m128i left = _mm_cvtepu8_epi16(LoadLo8(left_column));
  auto* dst = static_cast<uint8_t*>(dest);
  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
    const __m128i y_select = _mm_set1_epi32(y_mask);
    const __m128i left_y = _mm_shuffle_epi8(left, y_select);
    WriteSmoothDirectionalSum8(dst, left_y, weights, scaled_top_right, scale);
    dst += stride;
  }
  left = _mm_cvtepu8_epi16(LoadLo8(left_ptr + 8));
  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
    const __m128i y_select = _mm_set1_epi32(y_mask);
    const __m128i left_y = _mm_shuffle_epi8(left, y_select);
    WriteSmoothDirectionalSum8(dst, left_y, weights, scaled_top_right, scale);
    dst += stride;
  }
}

void SmoothHorizontal8x32_SSE4_1(
    void* LIBGAV1_RESTRICT const dest, const ptrdiff_t stride,
    const void* LIBGAV1_RESTRICT const top_row,
    const void* LIBGAV1_RESTRICT const left_column) {
  const auto* const top = static_cast<const uint8_t*>(top_row);
  const __m128i top_right = _mm_set1_epi16(top[7]);
  const __m128i weights = _mm_cvtepu8_epi16(LoadLo8(kSmoothWeights + 4));
  __m128i scale = _mm_set1_epi16(256);
  const __m128i inverted_weights = _mm_sub_epi16(scale, weights);
  const __m128i scaled_top_right = _mm_mullo_epi16(inverted_weights, top_right);
  scale = _mm_set1_epi16(128);
  const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
  __m128i left = _mm_cvtepu8_epi16(LoadLo8(left_column));
  auto* dst = static_cast<uint8_t*>(dest);
  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
    const __m128i y_select = _mm_set1_epi32(y_mask);
    const __m128i left_y = _mm_shuffle_epi8(left, y_select);
    WriteSmoothDirectionalSum8(dst, left_y, weights, scaled_top_right, scale);
    dst += stride;
  }
  left = _mm_cvtepu8_epi16(LoadLo8(left_ptr + 8));
  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
    const __m128i y_select = _mm_set1_epi32(y_mask);
    const __m128i left_y = _mm_shuffle_epi8(left, y_select);
    WriteSmoothDirectionalSum8(dst, left_y, weights, scaled_top_right, scale);
    dst += stride;
  }
  left = _mm_cvtepu8_epi16(LoadLo8(left_ptr + 16));
  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
    const __m128i y_select = _mm_set1_epi32(y_mask);
    const __m128i left_y = _mm_shuffle_epi8(left, y_select);
    WriteSmoothDirectionalSum8(dst, left_y, weights, scaled_top_right, scale);
    dst += stride;
  }
  left = _mm_cvtepu8_epi16(LoadLo8(left_ptr + 24));
  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
    const __m128i y_select = _mm_set1_epi32(y_mask);
    const __m128i left_y = _mm_shuffle_epi8(left, y_select);
    WriteSmoothDirectionalSum8(dst, left_y, weights, scaled_top_right, scale);
    dst += stride;
  }
}

void SmoothHorizontal16x4_SSE4_1(
    void* LIBGAV1_RESTRICT const dest, const ptrdiff_t stride,
    const void* LIBGAV1_RESTRICT const top_row,
    const void* LIBGAV1_RESTRICT const left_column) {
  const auto* const top = static_cast<const uint8_t*>(top_row);
  const __m128i top_right = _mm_set1_epi16(top[15]);
  const __m128i left = _mm_cvtepu8_epi16(Load4(left_column));
  const __m128i weights = LoadUnaligned16(kSmoothWeights + 12);
  __m128i scale = _mm_set1_epi16(256);
  const __m128i weights1 = _mm_cvtepu8_epi16(weights);
  const __m128i weights2 = _mm_cvtepu8_epi16(_mm_srli_si128(weights, 8));
  const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1);
  const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2);
  const __m128i scaled_top_right1 =
      _mm_mullo_epi16(inverted_weights1, top_right);
  const __m128i scaled_top_right2 =
      _mm_mullo_epi16(inverted_weights2, top_right);
  scale = _mm_set1_epi16(128);
  __m128i y_mask = _mm_set1_epi32(0x01000100);
  __m128i left_y = _mm_shuffle_epi8(left, y_mask);
  auto* dst = static_cast<uint8_t*>(dest);
  WriteSmoothDirectionalSum16(dst, left_y, left_y, weights1, weights2,
                              scaled_top_right1, scaled_top_right2, scale);
  dst += stride;
  y_mask = _mm_set1_epi32(0x03020302);
  left_y = _mm_shuffle_epi8(left, y_mask);
  WriteSmoothDirectionalSum16(dst, left_y, left_y, weights1, weights2,
                              scaled_top_right1, scaled_top_right2, scale);
  dst += stride;
  y_mask = _mm_set1_epi32(0x05040504);
  left_y = _mm_shuffle_epi8(left, y_mask);
  WriteSmoothDirectionalSum16(dst, left_y, left_y, weights1, weights2,
                              scaled_top_right1, scaled_top_right2, scale);
  dst += stride;
  y_mask = _mm_set1_epi32(0x07060706);
  left_y = _mm_shuffle_epi8(left, y_mask);
  WriteSmoothDirectionalSum16(dst, left_y, left_y, weights1, weights2,
                              scaled_top_right1, scaled_top_right2, scale);
}

void SmoothHorizontal16x8_SSE4_1(
    void* LIBGAV1_RESTRICT const dest, const ptrdiff_t stride,
    const void* LIBGAV1_RESTRICT const top_row,
    const void* LIBGAV1_RESTRICT const left_column) {
  const auto* const top = static_cast<const uint8_t*>(top_row);
  const __m128i top_right = _mm_set1_epi16(top[15]);
  const __m128i left = _mm_cvtepu8_epi16(LoadLo8(left_column));
  const __m128i weights = LoadUnaligned16(kSmoothWeights + 12);
  __m128i scale = _mm_set1_epi16(256);
  const __m128i weights1 = _mm_cvtepu8_epi16(weights);
  const __m128i weights2 = _mm_cvtepu8_epi16(_mm_srli_si128(weights, 8));
  const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1);
  const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2);
  const __m128i scaled_top_right1 =
      _mm_mullo_epi16(inverted_weights1, top_right);
  const __m128i scaled_top_right2 =
      _mm_mullo_epi16(inverted_weights2, top_right);
  scale = _mm_set1_epi16(128);
  auto* dst = static_cast<uint8_t*>(dest);
  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
    const __m128i y_select = _mm_set1_epi32(y_mask);
    const __m128i left_y = _mm_shuffle_epi8(left, y_select);
    WriteSmoothDirectionalSum16(dst, left_y, left_y, weights1, weights2,
                                scaled_top_right1, scaled_top_right2, scale);
    dst += stride;
  }
}

void SmoothHorizontal16x16_SSE4_1(
    void* LIBGAV1_RESTRICT const dest, const ptrdiff_t stride,
    const void* LIBGAV1_RESTRICT const top_row,
    const void* LIBGAV1_RESTRICT const left_column) {
  const auto* const top = static_cast<const uint8_t*>(top_row);
  const __m128i top_right = _mm_set1_epi16(top[15]);
  const __m128i weights = LoadUnaligned16(kSmoothWeights + 12);
  __m128i scale = _mm_set1_epi16(256);
  const __m128i weights1 = _mm_cvtepu8_epi16(weights);
  const __m128i weights2 = _mm_cvtepu8_epi16(_mm_srli_si128(weights, 8));
  const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1);
  const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2);
  const __m128i scaled_top_right1 =
      _mm_mullo_epi16(inverted_weights1, top_right);
  const __m128i scaled_top_right2 =
      _mm_mullo_epi16(inverted_weights2, top_right);
  scale = _mm_set1_epi16(128);
  const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
  __m128i left = _mm_cvtepu8_epi16(LoadLo8(left_column));
  auto* dst = static_cast<uint8_t*>(dest);
  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
    const __m128i y_select = _mm_set1_epi32(y_mask);
    const __m128i left_y = _mm_shuffle_epi8(left, y_select);
    WriteSmoothDirectionalSum16(dst, left_y, left_y, weights1, weights2,
                                scaled_top_right1, scaled_top_right2, scale);
    dst += stride;
  }
  left = _mm_cvtepu8_epi16(LoadLo8(left_ptr + 8));
  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
    const __m128i y_select = _mm_set1_epi32(y_mask);
    const __m128i left_y = _mm_shuffle_epi8(left, y_select);
    WriteSmoothDirectionalSum16(dst, left_y, left_y, weights1, weights2,
                                scaled_top_right1, scaled_top_right2, scale);
    dst += stride;
  }
}

void SmoothHorizontal16x32_SSE4_1(
    void* LIBGAV1_RESTRICT const dest, const ptrdiff_t stride,
    const void* LIBGAV1_RESTRICT const top_row,
    const void* LIBGAV1_RESTRICT const left_column) {
  const auto* const top = static_cast<const uint8_t*>(top_row);
  const __m128i top_right = _mm_set1_epi16(top[15]);
  const __m128i weights = LoadUnaligned16(kSmoothWeights + 12);
  __m128i scale = _mm_set1_epi16(256);
  const __m128i weights1 = _mm_cvtepu8_epi16(weights);
  const __m128i weights2 = _mm_cvtepu8_epi16(_mm_srli_si128(weights, 8));
  const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1);
  const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2);
  const __m128i scaled_top_right1 =
      _mm_mullo_epi16(inverted_weights1, top_right);
  const __m128i scaled_top_right2 =
      _mm_mullo_epi16(inverted_weights2, top_right);
  scale = _mm_set1_epi16(128);
  const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
  __m128i left = _mm_cvtepu8_epi16(LoadLo8(left_column));
  auto* dst = static_cast<uint8_t*>(dest);
  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
    const __m128i y_select = _mm_set1_epi32(y_mask);
    const __m128i left_y = _mm_shuffle_epi8(left, y_select);
    WriteSmoothDirectionalSum16(dst, left_y, left_y, weights1, weights2,
                                scaled_top_right1, scaled_top_right2, scale);
    dst += stride;
  }
  left = _mm_cvtepu8_epi16(LoadLo8(left_ptr + 8));
  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
    const __m128i y_select = _mm_set1_epi32(y_mask);
    const __m128i left_y = _mm_shuffle_epi8(left, y_select);
    WriteSmoothDirectionalSum16(dst, left_y, left_y, weights1, weights2,
                                scaled_top_right1, scaled_top_right2, scale);
    dst += stride;
  }
  left = _mm_cvtepu8_epi16(LoadLo8(left_ptr + 16));
  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
    const __m128i y_select = _mm_set1_epi32(y_mask);
    const __m128i left_y = _mm_shuffle_epi8(left, y_select);
    WriteSmoothDirectionalSum16(dst, left_y, left_y, weights1, weights2,
                                scaled_top_right1, scaled_top_right2, scale);
    dst += stride;
  }
  left = _mm_cvtepu8_epi16(LoadLo8(left_ptr + 24));
  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
    const __m128i y_select = _mm_set1_epi32(y_mask);
    const __m128i left_y = _mm_shuffle_epi8(left, y_select);
    WriteSmoothDirectionalSum16(dst, left_y, left_y, weights1, weights2,
                                scaled_top_right1, scaled_top_right2, scale);
    dst += stride;
  }
}

void SmoothHorizontal16x64_SSE4_1(
    void* LIBGAV1_RESTRICT const dest, const ptrdiff_t stride,
    const void* LIBGAV1_RESTRICT const top_row,
    const void* LIBGAV1_RESTRICT const left_column) {
  const auto* const top = static_cast<const uint8_t*>(top_row);
  const __m128i top_right = _mm_set1_epi16(top[15]);
  const __m128i weights = LoadUnaligned16(kSmoothWeights + 12);
  __m128i scale = _mm_set1_epi16(256);
  const __m128i weights1 = _mm_cvtepu8_epi16(weights);
  const __m128i weights2 = _mm_cvtepu8_epi16(_mm_srli_si128(weights, 8));
  const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1);
  const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2);
  const __m128i scaled_top_right1 =
      _mm_mullo_epi16(inverted_weights1, top_right);
  const __m128i scaled_top_right2 =
      _mm_mullo_epi16(inverted_weights2, top_right);
  scale = _mm_set1_epi16(128);
  const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
  auto* dst = static_cast<uint8_t*>(dest);
  for (int left_offset = 0; left_offset < 64; left_offset += 8) {
    const __m128i left = _mm_cvtepu8_epi16(LoadLo8(left_ptr + left_offset));
    for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
      const __m128i y_select = _mm_set1_epi32(y_mask);
      const __m128i left_y = _mm_shuffle_epi8(left, y_select);
      WriteSmoothDirectionalSum16(dst, left_y, left_y, weights1, weights2,
                                  scaled_top_right1, scaled_top_right2, scale);
      dst += stride;
    }
  }
}

void SmoothHorizontal32x8_SSE4_1(
    void* LIBGAV1_RESTRICT const dest, const ptrdiff_t stride,
    const void* LIBGAV1_RESTRICT const top_row,
    const void* LIBGAV1_RESTRICT const left_column) {
  const auto* const top = static_cast<const uint8_t*>(top_row);
  const __m128i top_right = _mm_set1_epi16(top[31]);
  const __m128i left = _mm_cvtepu8_epi16(LoadLo8(left_column));
  const __m128i weights_lo = LoadUnaligned16(kSmoothWeights + 28);
  const __m128i weights_hi = LoadUnaligned16(kSmoothWeights + 44);
  __m128i scale = _mm_set1_epi16(256);
  const __m128i weights1 = _mm_cvtepu8_epi16(weights_lo);
  const __m128i weights2 = _mm_cvtepu8_epi16(_mm_srli_si128(weights_lo, 8));
  const __m128i weights3 = _mm_cvtepu8_epi16(weights_hi);
  const __m128i weights4 = _mm_cvtepu8_epi16(_mm_srli_si128(weights_hi, 8));
  const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1);
  const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2);
  const __m128i inverted_weights3 = _mm_sub_epi16(scale, weights3);
  const __m128i inverted_weights4 = _mm_sub_epi16(scale, weights4);
  const __m128i scaled_top_right1 =
      _mm_mullo_epi16(inverted_weights1, top_right);
  const __m128i scaled_top_right2 =
      _mm_mullo_epi16(inverted_weights2, top_right);
  const __m128i scaled_top_right3 =
      _mm_mullo_epi16(inverted_weights3, top_right);
  const __m128i scaled_top_right4 =
      _mm_mullo_epi16(inverted_weights4, top_right);
  scale = _mm_set1_epi16(128);
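  // Each 32-pixel row is written as two 16-pixel halves: |weights1|/|weights2|
  // cover columns 0-15 and |weights3|/|weights4| columns 16-31.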
989   auto* dst = static_cast<uint8_t*>(dest);
990   for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
991     __m128i y_select = _mm_set1_epi32(y_mask);
992     __m128i left_y = _mm_shuffle_epi8(left, y_select);
993     WriteSmoothDirectionalSum16(dst, left_y, left_y, weights1, weights2,
994                                 scaled_top_right1, scaled_top_right2, scale);
995     WriteSmoothDirectionalSum16(dst + 16, left_y, left_y, weights3, weights4,
996                                 scaled_top_right3, scaled_top_right4, scale);
997     dst += stride;
998   }
999 }
1000 
SmoothHorizontal32x16_SSE4_1(void * LIBGAV1_RESTRICT const dest,const ptrdiff_t stride,const void * LIBGAV1_RESTRICT const top_row,const void * LIBGAV1_RESTRICT const left_column)1001 void SmoothHorizontal32x16_SSE4_1(
1002     void* LIBGAV1_RESTRICT const dest, const ptrdiff_t stride,
1003     const void* LIBGAV1_RESTRICT const top_row,
1004     const void* LIBGAV1_RESTRICT const left_column) {
1005   const auto* const top = static_cast<const uint8_t*>(top_row);
1006   const __m128i top_right = _mm_set1_epi16(top[31]);
1007   const __m128i left1 = _mm_cvtepu8_epi16(LoadLo8(left_column));
1008   const __m128i weights_lo = LoadUnaligned16(kSmoothWeights + 28);
1009   const __m128i weights_hi = LoadUnaligned16(kSmoothWeights + 44);
1010   __m128i scale = _mm_set1_epi16(256);
1011   const __m128i weights1 = _mm_cvtepu8_epi16(weights_lo);
1012   const __m128i weights2 = _mm_cvtepu8_epi16(_mm_srli_si128(weights_lo, 8));
1013   const __m128i weights3 = _mm_cvtepu8_epi16(weights_hi);
1014   const __m128i weights4 = _mm_cvtepu8_epi16(_mm_srli_si128(weights_hi, 8));
1015   const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1);
1016   const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2);
1017   const __m128i inverted_weights3 = _mm_sub_epi16(scale, weights3);
1018   const __m128i inverted_weights4 = _mm_sub_epi16(scale, weights4);
1019   const __m128i scaled_top_right1 =
1020       _mm_mullo_epi16(inverted_weights1, top_right);
1021   const __m128i scaled_top_right2 =
1022       _mm_mullo_epi16(inverted_weights2, top_right);
1023   const __m128i scaled_top_right3 =
1024       _mm_mullo_epi16(inverted_weights3, top_right);
1025   const __m128i scaled_top_right4 =
1026       _mm_mullo_epi16(inverted_weights4, top_right);
1027   scale = _mm_set1_epi16(128);
1028   auto* dst = static_cast<uint8_t*>(dest);
1029   for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
1030     __m128i y_select = _mm_set1_epi32(y_mask);
1031     __m128i left_y = _mm_shuffle_epi8(left1, y_select);
1032     WriteSmoothDirectionalSum16(dst, left_y, left_y, weights1, weights2,
1033                                 scaled_top_right1, scaled_top_right2, scale);
1034     WriteSmoothDirectionalSum16(dst + 16, left_y, left_y, weights3, weights4,
1035                                 scaled_top_right3, scaled_top_right4, scale);
1036     dst += stride;
1037   }
1038   const __m128i left2 =
1039       _mm_cvtepu8_epi16(LoadLo8(static_cast<const uint8_t*>(left_column) + 8));
1040   for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
1041     __m128i y_select = _mm_set1_epi32(y_mask);
1042     __m128i left_y = _mm_shuffle_epi8(left2, y_select);
1043     WriteSmoothDirectionalSum16(dst, left_y, left_y, weights1, weights2,
1044                                 scaled_top_right1, scaled_top_right2, scale);
1045     WriteSmoothDirectionalSum16(dst + 16, left_y, left_y, weights3, weights4,
1046                                 scaled_top_right3, scaled_top_right4, scale);
1047     dst += stride;
1048   }
1049 }
1050 
1051 void SmoothHorizontal32x32_SSE4_1(
1052     void* LIBGAV1_RESTRICT const dest, const ptrdiff_t stride,
1053     const void* LIBGAV1_RESTRICT const top_row,
1054     const void* LIBGAV1_RESTRICT const left_column) {
1055   const auto* const top = static_cast<const uint8_t*>(top_row);
1056   const __m128i top_right = _mm_set1_epi16(top[31]);
1057   const __m128i weights_lo = LoadUnaligned16(kSmoothWeights + 28);
1058   const __m128i weights_hi = LoadUnaligned16(kSmoothWeights + 44);
1059   __m128i scale = _mm_set1_epi16(256);
1060   const __m128i weights1 = _mm_cvtepu8_epi16(weights_lo);
1061   const __m128i weights2 = _mm_cvtepu8_epi16(_mm_srli_si128(weights_lo, 8));
1062   const __m128i weights3 = _mm_cvtepu8_epi16(weights_hi);
1063   const __m128i weights4 = _mm_cvtepu8_epi16(_mm_srli_si128(weights_hi, 8));
1064   const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1);
1065   const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2);
1066   const __m128i inverted_weights3 = _mm_sub_epi16(scale, weights3);
1067   const __m128i inverted_weights4 = _mm_sub_epi16(scale, weights4);
1068   const __m128i scaled_top_right1 =
1069       _mm_mullo_epi16(inverted_weights1, top_right);
1070   const __m128i scaled_top_right2 =
1071       _mm_mullo_epi16(inverted_weights2, top_right);
1072   const __m128i scaled_top_right3 =
1073       _mm_mullo_epi16(inverted_weights3, top_right);
1074   const __m128i scaled_top_right4 =
1075       _mm_mullo_epi16(inverted_weights4, top_right);
1076   scale = _mm_set1_epi16(128);
1077   const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
1078   __m128i left = _mm_cvtepu8_epi16(LoadLo8(left_column));
1079   auto* dst = static_cast<uint8_t*>(dest);
1080   for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
1081     __m128i y_select = _mm_set1_epi32(y_mask);
1082     __m128i left_y = _mm_shuffle_epi8(left, y_select);
1083     WriteSmoothDirectionalSum16(dst, left_y, left_y, weights1, weights2,
1084                                 scaled_top_right1, scaled_top_right2, scale);
1085     WriteSmoothDirectionalSum16(dst + 16, left_y, left_y, weights3, weights4,
1086                                 scaled_top_right3, scaled_top_right4, scale);
1087     dst += stride;
1088   }
1089   left = _mm_cvtepu8_epi16(LoadLo8(left_ptr + 8));
1090   for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
1091     __m128i y_select = _mm_set1_epi32(y_mask);
1092     __m128i left_y = _mm_shuffle_epi8(left, y_select);
1093     WriteSmoothDirectionalSum16(dst, left_y, left_y, weights1, weights2,
1094                                 scaled_top_right1, scaled_top_right2, scale);
1095     WriteSmoothDirectionalSum16(dst + 16, left_y, left_y, weights3, weights4,
1096                                 scaled_top_right3, scaled_top_right4, scale);
1097     dst += stride;
1098   }
1099   left = _mm_cvtepu8_epi16(LoadLo8(left_ptr + 16));
1100   for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
1101     __m128i y_select = _mm_set1_epi32(y_mask);
1102     __m128i left_y = _mm_shuffle_epi8(left, y_select);
1103     WriteSmoothDirectionalSum16(dst, left_y, left_y, weights1, weights2,
1104                                 scaled_top_right1, scaled_top_right2, scale);
1105     WriteSmoothDirectionalSum16(dst + 16, left_y, left_y, weights3, weights4,
1106                                 scaled_top_right3, scaled_top_right4, scale);
1107     dst += stride;
1108   }
1109   left = _mm_cvtepu8_epi16(LoadLo8(left_ptr + 24));
1110   for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
1111     __m128i y_select = _mm_set1_epi32(y_mask);
1112     __m128i left_y = _mm_shuffle_epi8(left, y_select);
1113     WriteSmoothDirectionalSum16(dst, left_y, left_y, weights1, weights2,
1114                                 scaled_top_right1, scaled_top_right2, scale);
1115     WriteSmoothDirectionalSum16(dst + 16, left_y, left_y, weights3, weights4,
1116                                 scaled_top_right3, scaled_top_right4, scale);
1117     dst += stride;
1118   }
1119 }
1120 
1121 void SmoothHorizontal32x64_SSE4_1(
1122     void* LIBGAV1_RESTRICT const dest, const ptrdiff_t stride,
1123     const void* LIBGAV1_RESTRICT const top_row,
1124     const void* LIBGAV1_RESTRICT const left_column) {
1125   const auto* const top = static_cast<const uint8_t*>(top_row);
1126   const __m128i top_right = _mm_set1_epi16(top[31]);
1127   const __m128i weights_lo = LoadUnaligned16(kSmoothWeights + 28);
1128   const __m128i weights_hi = LoadUnaligned16(kSmoothWeights + 44);
1129   __m128i scale = _mm_set1_epi16(256);
1130   const __m128i weights1 = _mm_cvtepu8_epi16(weights_lo);
1131   const __m128i weights2 = _mm_cvtepu8_epi16(_mm_srli_si128(weights_lo, 8));
1132   const __m128i weights3 = _mm_cvtepu8_epi16(weights_hi);
1133   const __m128i weights4 = _mm_cvtepu8_epi16(_mm_srli_si128(weights_hi, 8));
1134   const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1);
1135   const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2);
1136   const __m128i inverted_weights3 = _mm_sub_epi16(scale, weights3);
1137   const __m128i inverted_weights4 = _mm_sub_epi16(scale, weights4);
1138   const __m128i scaled_top_right1 =
1139       _mm_mullo_epi16(inverted_weights1, top_right);
1140   const __m128i scaled_top_right2 =
1141       _mm_mullo_epi16(inverted_weights2, top_right);
1142   const __m128i scaled_top_right3 =
1143       _mm_mullo_epi16(inverted_weights3, top_right);
1144   const __m128i scaled_top_right4 =
1145       _mm_mullo_epi16(inverted_weights4, top_right);
1146   scale = _mm_set1_epi16(128);
1147   const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
1148   auto* dst = static_cast<uint8_t*>(dest);
1149   for (int left_offset = 0; left_offset < 64; left_offset += 8) {
1150     const __m128i left = _mm_cvtepu8_epi16(LoadLo8(left_ptr + left_offset));
1151     for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
1152       const __m128i y_select = _mm_set1_epi32(y_mask);
1153       const __m128i left_y = _mm_shuffle_epi8(left, y_select);
1154       WriteSmoothDirectionalSum16(dst, left_y, left_y, weights1, weights2,
1155                                   scaled_top_right1, scaled_top_right2, scale);
1156       WriteSmoothDirectionalSum16(dst + 16, left_y, left_y, weights3, weights4,
1157                                   scaled_top_right3, scaled_top_right4, scale);
1158       dst += stride;
1159     }
1160   }
1161 }
1162 
1163 void SmoothHorizontal64x16_SSE4_1(
1164     void* LIBGAV1_RESTRICT const dest, const ptrdiff_t stride,
1165     const void* LIBGAV1_RESTRICT const top_row,
1166     const void* LIBGAV1_RESTRICT const left_column) {
1167   const auto* const top = static_cast<const uint8_t*>(top_row);
1168   const __m128i top_right = _mm_set1_epi16(top[63]);
1169   const __m128i left1 = _mm_cvtepu8_epi16(LoadLo8(left_column));
1170   const __m128i weights_lolo = LoadUnaligned16(kSmoothWeights + 60);
1171   const __m128i weights_lohi = LoadUnaligned16(kSmoothWeights + 76);
1172   __m128i scale = _mm_set1_epi16(256);
1173   const __m128i weights1 = _mm_cvtepu8_epi16(weights_lolo);
1174   const __m128i weights2 = _mm_cvtepu8_epi16(_mm_srli_si128(weights_lolo, 8));
1175   const __m128i weights3 = _mm_cvtepu8_epi16(weights_lohi);
1176   const __m128i weights4 = _mm_cvtepu8_epi16(_mm_srli_si128(weights_lohi, 8));
1177   const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1);
1178   const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2);
1179   const __m128i inverted_weights3 = _mm_sub_epi16(scale, weights3);
1180   const __m128i inverted_weights4 = _mm_sub_epi16(scale, weights4);
1181   const __m128i scaled_top_right1 =
1182       _mm_mullo_epi16(inverted_weights1, top_right);
1183   const __m128i scaled_top_right2 =
1184       _mm_mullo_epi16(inverted_weights2, top_right);
1185   const __m128i scaled_top_right3 =
1186       _mm_mullo_epi16(inverted_weights3, top_right);
1187   const __m128i scaled_top_right4 =
1188       _mm_mullo_epi16(inverted_weights4, top_right);
1189   const __m128i weights_hilo = LoadUnaligned16(kSmoothWeights + 92);
1190   const __m128i weights_hihi = LoadUnaligned16(kSmoothWeights + 108);
1191   const __m128i weights5 = _mm_cvtepu8_epi16(weights_hilo);
1192   const __m128i weights6 = _mm_cvtepu8_epi16(_mm_srli_si128(weights_hilo, 8));
1193   const __m128i weights7 = _mm_cvtepu8_epi16(weights_hihi);
1194   const __m128i weights8 = _mm_cvtepu8_epi16(_mm_srli_si128(weights_hihi, 8));
1195   const __m128i inverted_weights5 = _mm_sub_epi16(scale, weights5);
1196   const __m128i inverted_weights6 = _mm_sub_epi16(scale, weights6);
1197   const __m128i inverted_weights7 = _mm_sub_epi16(scale, weights7);
1198   const __m128i inverted_weights8 = _mm_sub_epi16(scale, weights8);
1199   const __m128i scaled_top_right5 =
1200       _mm_mullo_epi16(inverted_weights5, top_right);
1201   const __m128i scaled_top_right6 =
1202       _mm_mullo_epi16(inverted_weights6, top_right);
1203   const __m128i scaled_top_right7 =
1204       _mm_mullo_epi16(inverted_weights7, top_right);
1205   const __m128i scaled_top_right8 =
1206       _mm_mullo_epi16(inverted_weights8, top_right);
1207   scale = _mm_set1_epi16(128);
1208   const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
1209   auto* dst = static_cast<uint8_t*>(dest);
1210   for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
1211     __m128i y_select = _mm_set1_epi32(y_mask);
1212     __m128i left_y = _mm_shuffle_epi8(left1, y_select);
1213     WriteSmoothDirectionalSum16(dst, left_y, left_y, weights1, weights2,
1214                                 scaled_top_right1, scaled_top_right2, scale);
1215     WriteSmoothDirectionalSum16(dst + 16, left_y, left_y, weights3, weights4,
1216                                 scaled_top_right3, scaled_top_right4, scale);
1217     WriteSmoothDirectionalSum16(dst + 32, left_y, left_y, weights5, weights6,
1218                                 scaled_top_right5, scaled_top_right6, scale);
1219     WriteSmoothDirectionalSum16(dst + 48, left_y, left_y, weights7, weights8,
1220                                 scaled_top_right7, scaled_top_right8, scale);
1221     dst += stride;
1222   }
1223   const __m128i left2 = _mm_cvtepu8_epi16(LoadLo8(left_ptr + 8));
1224   for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
1225     __m128i y_select = _mm_set1_epi32(y_mask);
1226     __m128i left_y = _mm_shuffle_epi8(left2, y_select);
1227     WriteSmoothDirectionalSum16(dst, left_y, left_y, weights1, weights2,
1228                                 scaled_top_right1, scaled_top_right2, scale);
1229     WriteSmoothDirectionalSum16(dst + 16, left_y, left_y, weights3, weights4,
1230                                 scaled_top_right3, scaled_top_right4, scale);
1231     WriteSmoothDirectionalSum16(dst + 32, left_y, left_y, weights5, weights6,
1232                                 scaled_top_right5, scaled_top_right6, scale);
1233     WriteSmoothDirectionalSum16(dst + 48, left_y, left_y, weights7, weights8,
1234                                 scaled_top_right7, scaled_top_right8, scale);
1235     dst += stride;
1236   }
1237 }
1238 
1239 void SmoothHorizontal64x32_SSE4_1(
1240     void* LIBGAV1_RESTRICT const dest, const ptrdiff_t stride,
1241     const void* LIBGAV1_RESTRICT const top_row,
1242     const void* LIBGAV1_RESTRICT const left_column) {
1243   const auto* const top = static_cast<const uint8_t*>(top_row);
1244   const __m128i top_right = _mm_set1_epi16(top[63]);
1245   const __m128i left1 = _mm_cvtepu8_epi16(LoadLo8(left_column));
1246   const __m128i weights_lolo = LoadUnaligned16(kSmoothWeights + 60);
1247   const __m128i weights_lohi = LoadUnaligned16(kSmoothWeights + 76);
1248   __m128i scale = _mm_set1_epi16(256);
1249   const __m128i weights1 = _mm_cvtepu8_epi16(weights_lolo);
1250   const __m128i weights2 = _mm_cvtepu8_epi16(_mm_srli_si128(weights_lolo, 8));
1251   const __m128i weights3 = _mm_cvtepu8_epi16(weights_lohi);
1252   const __m128i weights4 = _mm_cvtepu8_epi16(_mm_srli_si128(weights_lohi, 8));
1253   const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1);
1254   const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2);
1255   const __m128i inverted_weights3 = _mm_sub_epi16(scale, weights3);
1256   const __m128i inverted_weights4 = _mm_sub_epi16(scale, weights4);
1257   const __m128i scaled_top_right1 =
1258       _mm_mullo_epi16(inverted_weights1, top_right);
1259   const __m128i scaled_top_right2 =
1260       _mm_mullo_epi16(inverted_weights2, top_right);
1261   const __m128i scaled_top_right3 =
1262       _mm_mullo_epi16(inverted_weights3, top_right);
1263   const __m128i scaled_top_right4 =
1264       _mm_mullo_epi16(inverted_weights4, top_right);
1265   const __m128i weights_hilo = LoadUnaligned16(kSmoothWeights + 92);
1266   const __m128i weights_hihi = LoadUnaligned16(kSmoothWeights + 108);
1267   const __m128i weights5 = _mm_cvtepu8_epi16(weights_hilo);
1268   const __m128i weights6 = _mm_cvtepu8_epi16(_mm_srli_si128(weights_hilo, 8));
1269   const __m128i weights7 = _mm_cvtepu8_epi16(weights_hihi);
1270   const __m128i weights8 = _mm_cvtepu8_epi16(_mm_srli_si128(weights_hihi, 8));
1271   const __m128i inverted_weights5 = _mm_sub_epi16(scale, weights5);
1272   const __m128i inverted_weights6 = _mm_sub_epi16(scale, weights6);
1273   const __m128i inverted_weights7 = _mm_sub_epi16(scale, weights7);
1274   const __m128i inverted_weights8 = _mm_sub_epi16(scale, weights8);
1275   const __m128i scaled_top_right5 =
1276       _mm_mullo_epi16(inverted_weights5, top_right);
1277   const __m128i scaled_top_right6 =
1278       _mm_mullo_epi16(inverted_weights6, top_right);
1279   const __m128i scaled_top_right7 =
1280       _mm_mullo_epi16(inverted_weights7, top_right);
1281   const __m128i scaled_top_right8 =
1282       _mm_mullo_epi16(inverted_weights8, top_right);
1283   scale = _mm_set1_epi16(128);
1284   auto* dst = static_cast<uint8_t*>(dest);
1285   for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
1286     const __m128i y_select = _mm_set1_epi32(y_mask);
1287     const __m128i left_y = _mm_shuffle_epi8(left1, y_select);
1288     WriteSmoothDirectionalSum16(dst, left_y, left_y, weights1, weights2,
1289                                 scaled_top_right1, scaled_top_right2, scale);
1290     WriteSmoothDirectionalSum16(dst + 16, left_y, left_y, weights3, weights4,
1291                                 scaled_top_right3, scaled_top_right4, scale);
1292     WriteSmoothDirectionalSum16(dst + 32, left_y, left_y, weights5, weights6,
1293                                 scaled_top_right5, scaled_top_right6, scale);
1294     WriteSmoothDirectionalSum16(dst + 48, left_y, left_y, weights7, weights8,
1295                                 scaled_top_right7, scaled_top_right8, scale);
1296     dst += stride;
1297   }
1298   const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
1299   const __m128i left2 = _mm_cvtepu8_epi16(LoadLo8(left_ptr + 8));
1300   for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
1301     const __m128i y_select = _mm_set1_epi32(y_mask);
1302     const __m128i left_y = _mm_shuffle_epi8(left2, y_select);
1303     WriteSmoothDirectionalSum16(dst, left_y, left_y, weights1, weights2,
1304                                 scaled_top_right1, scaled_top_right2, scale);
1305     WriteSmoothDirectionalSum16(dst + 16, left_y, left_y, weights3, weights4,
1306                                 scaled_top_right3, scaled_top_right4, scale);
1307     WriteSmoothDirectionalSum16(dst + 32, left_y, left_y, weights5, weights6,
1308                                 scaled_top_right5, scaled_top_right6, scale);
1309     WriteSmoothDirectionalSum16(dst + 48, left_y, left_y, weights7, weights8,
1310                                 scaled_top_right7, scaled_top_right8, scale);
1311     dst += stride;
1312   }
1313   const __m128i left3 = _mm_cvtepu8_epi16(LoadLo8(left_ptr + 16));
1314   for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
1315     const __m128i y_select = _mm_set1_epi32(y_mask);
1316     const __m128i left_y = _mm_shuffle_epi8(left3, y_select);
1317     WriteSmoothDirectionalSum16(dst, left_y, left_y, weights1, weights2,
1318                                 scaled_top_right1, scaled_top_right2, scale);
1319     WriteSmoothDirectionalSum16(dst + 16, left_y, left_y, weights3, weights4,
1320                                 scaled_top_right3, scaled_top_right4, scale);
1321     WriteSmoothDirectionalSum16(dst + 32, left_y, left_y, weights5, weights6,
1322                                 scaled_top_right5, scaled_top_right6, scale);
1323     WriteSmoothDirectionalSum16(dst + 48, left_y, left_y, weights7, weights8,
1324                                 scaled_top_right7, scaled_top_right8, scale);
1325     dst += stride;
1326   }
1327   const __m128i left4 = _mm_cvtepu8_epi16(LoadLo8(left_ptr + 24));
1328   for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
1329     const __m128i y_select = _mm_set1_epi32(y_mask);
1330     const __m128i left_y = _mm_shuffle_epi8(left4, y_select);
1331     WriteSmoothDirectionalSum16(dst, left_y, left_y, weights1, weights2,
1332                                 scaled_top_right1, scaled_top_right2, scale);
1333     WriteSmoothDirectionalSum16(dst + 16, left_y, left_y, weights3, weights4,
1334                                 scaled_top_right3, scaled_top_right4, scale);
1335     WriteSmoothDirectionalSum16(dst + 32, left_y, left_y, weights5, weights6,
1336                                 scaled_top_right5, scaled_top_right6, scale);
1337     WriteSmoothDirectionalSum16(dst + 48, left_y, left_y, weights7, weights8,
1338                                 scaled_top_right7, scaled_top_right8, scale);
1339     dst += stride;
1340   }
1341 }
1342 
1343 void SmoothHorizontal64x64_SSE4_1(
1344     void* LIBGAV1_RESTRICT const dest, const ptrdiff_t stride,
1345     const void* LIBGAV1_RESTRICT const top_row,
1346     const void* LIBGAV1_RESTRICT const left_column) {
1347   const auto* const top = static_cast<const uint8_t*>(top_row);
1348   const __m128i top_right = _mm_set1_epi16(top[63]);
1349   const __m128i weights_lolo = LoadUnaligned16(kSmoothWeights + 60);
1350   const __m128i weights_lohi = LoadUnaligned16(kSmoothWeights + 76);
1351   __m128i scale = _mm_set1_epi16(256);
1352   const __m128i weights1 = _mm_cvtepu8_epi16(weights_lolo);
1353   const __m128i weights2 = _mm_cvtepu8_epi16(_mm_srli_si128(weights_lolo, 8));
1354   const __m128i weights3 = _mm_cvtepu8_epi16(weights_lohi);
1355   const __m128i weights4 = _mm_cvtepu8_epi16(_mm_srli_si128(weights_lohi, 8));
1356   const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1);
1357   const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2);
1358   const __m128i inverted_weights3 = _mm_sub_epi16(scale, weights3);
1359   const __m128i inverted_weights4 = _mm_sub_epi16(scale, weights4);
1360   const __m128i scaled_top_right1 =
1361       _mm_mullo_epi16(inverted_weights1, top_right);
1362   const __m128i scaled_top_right2 =
1363       _mm_mullo_epi16(inverted_weights2, top_right);
1364   const __m128i scaled_top_right3 =
1365       _mm_mullo_epi16(inverted_weights3, top_right);
1366   const __m128i scaled_top_right4 =
1367       _mm_mullo_epi16(inverted_weights4, top_right);
1368   const __m128i weights_hilo = LoadUnaligned16(kSmoothWeights + 92);
1369   const __m128i weights_hihi = LoadUnaligned16(kSmoothWeights + 108);
1370   const __m128i weights5 = _mm_cvtepu8_epi16(weights_hilo);
1371   const __m128i weights6 = _mm_cvtepu8_epi16(_mm_srli_si128(weights_hilo, 8));
1372   const __m128i weights7 = _mm_cvtepu8_epi16(weights_hihi);
1373   const __m128i weights8 = _mm_cvtepu8_epi16(_mm_srli_si128(weights_hihi, 8));
1374   const __m128i inverted_weights5 = _mm_sub_epi16(scale, weights5);
1375   const __m128i inverted_weights6 = _mm_sub_epi16(scale, weights6);
1376   const __m128i inverted_weights7 = _mm_sub_epi16(scale, weights7);
1377   const __m128i inverted_weights8 = _mm_sub_epi16(scale, weights8);
1378   const __m128i scaled_top_right5 =
1379       _mm_mullo_epi16(inverted_weights5, top_right);
1380   const __m128i scaled_top_right6 =
1381       _mm_mullo_epi16(inverted_weights6, top_right);
1382   const __m128i scaled_top_right7 =
1383       _mm_mullo_epi16(inverted_weights7, top_right);
1384   const __m128i scaled_top_right8 =
1385       _mm_mullo_epi16(inverted_weights8, top_right);
1386   scale = _mm_set1_epi16(128);
1387   const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
1388   auto* dst = static_cast<uint8_t*>(dest);
1389   for (int left_offset = 0; left_offset < 64; left_offset += 8) {
1390     const __m128i left = _mm_cvtepu8_epi16(LoadLo8(left_ptr + left_offset));
1391     for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
1392       const __m128i y_select = _mm_set1_epi32(y_mask);
1393       const __m128i left_y = _mm_shuffle_epi8(left, y_select);
1394       WriteSmoothDirectionalSum16(dst, left_y, left_y, weights1, weights2,
1395                                   scaled_top_right1, scaled_top_right2, scale);
1396       WriteSmoothDirectionalSum16(dst + 16, left_y, left_y, weights3, weights4,
1397                                   scaled_top_right3, scaled_top_right4, scale);
1398       WriteSmoothDirectionalSum16(dst + 32, left_y, left_y, weights5, weights6,
1399                                   scaled_top_right5, scaled_top_right6, scale);
1400       WriteSmoothDirectionalSum16(dst + 48, left_y, left_y, weights7, weights8,
1401                                   scaled_top_right7, scaled_top_right8, scale);
1402       dst += stride;
1403     }
1404   }
1405 }
1406 
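// The SMOOTH_V kernels are the vertical counterpart of SMOOTH_H:
//   pred[y][x] = RightShiftWithRounding(
//       weights[y] * top[x] + (256 - weights[y]) * bottom_left, 8)
// For the 4xH path the top pixels are interleaved with the replicated
// bottom-left pixel so that a single _mm_madd_epi16 against interleaved
// (w, 256 - w) pairs yields both products, already summed, per pixel.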
1407 inline void LoadSmoothVerticalPixels4(const uint8_t* LIBGAV1_RESTRICT above,
1408                                       const uint8_t* LIBGAV1_RESTRICT left,
1409                                       const int height, __m128i* pixels) {
1410   __m128i top = Load4(above);
1411   const __m128i bottom_left = _mm_set1_epi16(left[height - 1]);
1412   top = _mm_cvtepu8_epi16(top);
1413   pixels[0] = _mm_unpacklo_epi16(top, bottom_left);
1414 }
1415 
1416 // |weight_array| alternates weight vectors from the table with their inverted
1417 // (256-w) counterparts. This is precomputed by the compiler when the weights
1418 // table is visible to this module. Removing this visibility can cut speed by up
1419 // to half in both the 4xH and 8xH predictors.
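// For example, with height == 4 the table entries are kSmoothWeights[0..3] =
// {255, 149, 85, 64} (per the AV1 smooth-weight table), so weights[0] holds
// those values zero-extended to 16 bits and weights[1] holds the inverted
// counterparts {1, 107, 171, 192}.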
1420 inline void LoadSmoothVerticalWeights4(const uint8_t* LIBGAV1_RESTRICT
1421                                            weight_array,
1422                                        const int height, __m128i* weights) {
1423   const __m128i inverter = _mm_set1_epi16(256);
1424 
1425   if (height == 4) {
1426     const __m128i weight = Load4(weight_array);
1427     weights[0] = _mm_cvtepu8_epi16(weight);
1428     weights[1] = _mm_sub_epi16(inverter, weights[0]);
1429   } else if (height == 8) {
1430     const __m128i weight = LoadLo8(weight_array + 4);
1431     weights[0] = _mm_cvtepu8_epi16(weight);
1432     weights[1] = _mm_sub_epi16(inverter, weights[0]);
1433   } else {
1434     const __m128i weight = LoadUnaligned16(weight_array + 12);
1435     const __m128i zero = _mm_setzero_si128();
1436     weights[0] = _mm_cvtepu8_epi16(weight);
1437     weights[1] = _mm_sub_epi16(inverter, weights[0]);
1438     weights[2] = _mm_unpackhi_epi8(weight, zero);
1439     weights[3] = _mm_sub_epi16(inverter, weights[2]);
1440   }
1441 }
1442 
1443 inline void WriteSmoothVertical4xH(const __m128i* pixel, const __m128i* weight,
1444                                    const int height,
1445                                    uint8_t* LIBGAV1_RESTRICT dst,
1446                                    const ptrdiff_t stride) {
1447   const __m128i pred_round = _mm_set1_epi32(128);
1448   const __m128i mask_increment = _mm_set1_epi16(0x0202);
1449   const __m128i cvtepu8_epi32 = _mm_set1_epi32(0xC080400);
1450   __m128i y_select = _mm_set1_epi16(0x0100);
1451 
1452   for (int y = 0; y < height; ++y) {
1453     const __m128i weight_y = _mm_shuffle_epi8(weight[0], y_select);
1454     const __m128i inverted_weight_y = _mm_shuffle_epi8(weight[1], y_select);
1455     const __m128i alternate_weights =
1456         _mm_unpacklo_epi16(weight_y, inverted_weight_y);
1457     // Here the pixel vector is top_row[0], corner, top_row[1], corner, ...
1458     // The madd instruction yields four results of the form:
1459     // (top_row[x] * weight[y] + corner * inverted_weight[y])
1460     __m128i sum = _mm_madd_epi16(pixel[0], alternate_weights);
1461     sum = _mm_add_epi32(sum, pred_round);
1462     sum = _mm_srai_epi32(sum, 8);
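    // Pack the low byte of each 32-bit sum into the low four bytes;
    // 0xC080400 selects bytes 0, 4, 8 and 12.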
1463     sum = _mm_shuffle_epi8(sum, cvtepu8_epi32);
1464     Store4(dst, sum);
1465     dst += stride;
1466     y_select = _mm_add_epi16(y_select, mask_increment);
1467   }
1468 }
1469 
1470 void SmoothVertical4x4_SSE4_1(void* LIBGAV1_RESTRICT const dest,
1471                               const ptrdiff_t stride,
1472                               const void* LIBGAV1_RESTRICT const top_row,
1473                               const void* LIBGAV1_RESTRICT const left_column) {
1474   const auto* const left = static_cast<const uint8_t*>(left_column);
1475   const auto* const above = static_cast<const uint8_t*>(top_row);
1476   auto* dst = static_cast<uint8_t*>(dest);
1477   __m128i pixels;
1478   LoadSmoothVerticalPixels4(above, left, 4, &pixels);
1479 
1480   __m128i weights[2];
1481   LoadSmoothVerticalWeights4(kSmoothWeights, 4, weights);
1482 
1483   WriteSmoothVertical4xH(&pixels, weights, 4, dst, stride);
1484 }
1485 
1486 void SmoothVertical4x8_SSE4_1(void* LIBGAV1_RESTRICT const dest,
1487                               const ptrdiff_t stride,
1488                               const void* LIBGAV1_RESTRICT const top_row,
1489                               const void* LIBGAV1_RESTRICT const left_column) {
1490   const auto* const left = static_cast<const uint8_t*>(left_column);
1491   const auto* const above = static_cast<const uint8_t*>(top_row);
1492   auto* dst = static_cast<uint8_t*>(dest);
1493   __m128i pixels;
1494   LoadSmoothVerticalPixels4(above, left, 8, &pixels);
1495 
1496   __m128i weights[2];
1497   LoadSmoothVerticalWeights4(kSmoothWeights, 8, weights);
1498 
1499   WriteSmoothVertical4xH(&pixels, weights, 8, dst, stride);
1500 }
1501 
1502 void SmoothVertical4x16_SSE4_1(void* LIBGAV1_RESTRICT const dest,
1503                                const ptrdiff_t stride,
1504                                const void* LIBGAV1_RESTRICT const top_row,
1505                                const void* LIBGAV1_RESTRICT const left_column) {
1506   const auto* const left = static_cast<const uint8_t*>(left_column);
1507   const auto* const above = static_cast<const uint8_t*>(top_row);
1508   auto* dst = static_cast<uint8_t*>(dest);
1509   __m128i pixels;
1510   LoadSmoothVerticalPixels4(above, left, 16, &pixels);
1511 
1512   __m128i weights[4];
1513   LoadSmoothVerticalWeights4(kSmoothWeights, 16, weights);
1514 
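  // Height 16 is written as two 8-row passes: weights[0]/weights[1] cover
  // rows 0-7 and weights[2]/weights[3] cover rows 8-15, with |dst| advanced
  // by 8 rows (stride << 3) between passes.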
1515   WriteSmoothVertical4xH(&pixels, weights, 8, dst, stride);
1516   dst += stride << 3;
1517   WriteSmoothVertical4xH(&pixels, &weights[2], 8, dst, stride);
1518 }
1519 
1520 void SmoothVertical8x4_SSE4_1(void* LIBGAV1_RESTRICT const dest,
1521                               const ptrdiff_t stride,
1522                               const void* LIBGAV1_RESTRICT const top_row,
1523                               const void* LIBGAV1_RESTRICT const left_column) {
1524   const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
1525   const __m128i bottom_left = _mm_set1_epi16(left_ptr[3]);
1526   const __m128i weights = _mm_cvtepu8_epi16(Load4(kSmoothWeights));
1527   __m128i scale = _mm_set1_epi16(256);
1528   const __m128i inverted_weights = _mm_sub_epi16(scale, weights);
1529   const __m128i scaled_bottom_left =
1530       _mm_mullo_epi16(inverted_weights, bottom_left);
1531   scale = _mm_set1_epi16(128);
1532 
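  // The four rows are fully unrolled; each y_select constant (0x01000100,
  // 0x03020302, ...) replicates the 16-bit weight for row y across the
  // register.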
1533   auto* dst = static_cast<uint8_t*>(dest);
1534   __m128i y_select = _mm_set1_epi32(0x01000100);
1535   const __m128i top = _mm_cvtepu8_epi16(LoadLo8(top_row));
1536   __m128i weights_y = _mm_shuffle_epi8(weights, y_select);
1537   __m128i scaled_bottom_left_y = _mm_shuffle_epi8(scaled_bottom_left, y_select);
1538   WriteSmoothDirectionalSum8(dst, top, weights_y, scaled_bottom_left_y, scale);
1539   dst += stride;
1540   y_select = _mm_set1_epi32(0x03020302);
1541   weights_y = _mm_shuffle_epi8(weights, y_select);
1542   scaled_bottom_left_y = _mm_shuffle_epi8(scaled_bottom_left, y_select);
1543   WriteSmoothDirectionalSum8(dst, top, weights_y, scaled_bottom_left_y, scale);
1544   dst += stride;
1545   y_select = _mm_set1_epi32(0x05040504);
1546   weights_y = _mm_shuffle_epi8(weights, y_select);
1547   scaled_bottom_left_y = _mm_shuffle_epi8(scaled_bottom_left, y_select);
1548   WriteSmoothDirectionalSum8(dst, top, weights_y, scaled_bottom_left_y, scale);
1549   dst += stride;
1550   y_select = _mm_set1_epi32(0x07060706);
1551   weights_y = _mm_shuffle_epi8(weights, y_select);
1552   scaled_bottom_left_y = _mm_shuffle_epi8(scaled_bottom_left, y_select);
1553   WriteSmoothDirectionalSum8(dst, top, weights_y, scaled_bottom_left_y, scale);
1554 }
1555 
1556 void SmoothVertical8x8_SSE4_1(void* LIBGAV1_RESTRICT const dest,
1557                               const ptrdiff_t stride,
1558                               const void* LIBGAV1_RESTRICT const top_row,
1559                               const void* LIBGAV1_RESTRICT const left_column) {
1560   const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
1561   const __m128i bottom_left = _mm_set1_epi16(left_ptr[7]);
1562   const __m128i weights = _mm_cvtepu8_epi16(LoadLo8(kSmoothWeights + 4));
1563   __m128i scale = _mm_set1_epi16(256);
1564   const __m128i inverted_weights = _mm_sub_epi16(scale, weights);
1565   const __m128i scaled_bottom_left =
1566       _mm_mullo_epi16(inverted_weights, bottom_left);
1567   scale = _mm_set1_epi16(128);
1568   const __m128i top = _mm_cvtepu8_epi16(LoadLo8(top_row));
1569   auto* dst = static_cast<uint8_t*>(dest);
1570   for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
1571     const __m128i y_select = _mm_set1_epi32(y_mask);
1572     const __m128i weights_y = _mm_shuffle_epi8(weights, y_select);
1573     const __m128i scaled_bottom_left_y =
1574         _mm_shuffle_epi8(scaled_bottom_left, y_select);
1575     WriteSmoothDirectionalSum8(dst, top, weights_y, scaled_bottom_left_y,
1576                                scale);
1577     dst += stride;
1578   }
1579 }
1580 
1581 void SmoothVertical8x16_SSE4_1(void* LIBGAV1_RESTRICT const dest,
1582                                const ptrdiff_t stride,
1583                                const void* LIBGAV1_RESTRICT const top_row,
1584                                const void* LIBGAV1_RESTRICT const left_column) {
1585   const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
1586   const __m128i bottom_left = _mm_set1_epi16(left_ptr[15]);
1587   const __m128i weights = LoadUnaligned16(kSmoothWeights + 12);
1588 
1589   const __m128i weights1 = _mm_cvtepu8_epi16(weights);
1590   const __m128i weights2 = _mm_cvtepu8_epi16(_mm_srli_si128(weights, 8));
1591   __m128i scale = _mm_set1_epi16(256);
1592   const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1);
1593   const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2);
1594   const __m128i scaled_bottom_left1 =
1595       _mm_mullo_epi16(inverted_weights1, bottom_left);
1596   const __m128i scaled_bottom_left2 =
1597       _mm_mullo_epi16(inverted_weights2, bottom_left);
1598   scale = _mm_set1_epi16(128);
1599   const __m128i top = _mm_cvtepu8_epi16(LoadLo8(top_row));
1600   auto* dst = static_cast<uint8_t*>(dest);
1601   for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
1602     const __m128i y_select = _mm_set1_epi32(y_mask);
1603     const __m128i weights_y = _mm_shuffle_epi8(weights1, y_select);
1604     const __m128i scaled_bottom_left_y =
1605         _mm_shuffle_epi8(scaled_bottom_left1, y_select);
1606     WriteSmoothDirectionalSum8(dst, top, weights_y, scaled_bottom_left_y,
1607                                scale);
1608     dst += stride;
1609   }
1610   for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
1611     const __m128i y_select = _mm_set1_epi32(y_mask);
1612     const __m128i weights_y = _mm_shuffle_epi8(weights2, y_select);
1613     const __m128i scaled_bottom_left_y =
1614         _mm_shuffle_epi8(scaled_bottom_left2, y_select);
1615     WriteSmoothDirectionalSum8(dst, top, weights_y, scaled_bottom_left_y,
1616                                scale);
1617     dst += stride;
1618   }
1619 }
1620 
1621 void SmoothVertical8x32_SSE4_1(void* LIBGAV1_RESTRICT const dest,
1622                                const ptrdiff_t stride,
1623                                const void* LIBGAV1_RESTRICT const top_row,
1624                                const void* LIBGAV1_RESTRICT const left_column) {
1625   const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
1626   const __m128i zero = _mm_setzero_si128();
1627   const __m128i bottom_left = _mm_set1_epi16(left_ptr[31]);
1628   const __m128i weights_lo = LoadUnaligned16(kSmoothWeights + 28);
1629   const __m128i weights_hi = LoadUnaligned16(kSmoothWeights + 44);
1630   const __m128i weights1 = _mm_cvtepu8_epi16(weights_lo);
1631   const __m128i weights2 = _mm_unpackhi_epi8(weights_lo, zero);
1632   const __m128i weights3 = _mm_cvtepu8_epi16(weights_hi);
1633   const __m128i weights4 = _mm_unpackhi_epi8(weights_hi, zero);
1634   __m128i scale = _mm_set1_epi16(256);
1635   const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1);
1636   const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2);
1637   const __m128i inverted_weights3 = _mm_sub_epi16(scale, weights3);
1638   const __m128i inverted_weights4 = _mm_sub_epi16(scale, weights4);
1639   const __m128i scaled_bottom_left1 =
1640       _mm_mullo_epi16(inverted_weights1, bottom_left);
1641   const __m128i scaled_bottom_left2 =
1642       _mm_mullo_epi16(inverted_weights2, bottom_left);
1643   const __m128i scaled_bottom_left3 =
1644       _mm_mullo_epi16(inverted_weights3, bottom_left);
1645   const __m128i scaled_bottom_left4 =
1646       _mm_mullo_epi16(inverted_weights4, bottom_left);
1647   scale = _mm_set1_epi16(128);
1648   auto* dst = static_cast<uint8_t*>(dest);
1649   const __m128i top = _mm_cvtepu8_epi16(LoadLo8(top_row));
1650   for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
1651     const __m128i y_select = _mm_set1_epi32(y_mask);
1652     const __m128i weights_y = _mm_shuffle_epi8(weights1, y_select);
1653     const __m128i scaled_bottom_left_y =
1654         _mm_shuffle_epi8(scaled_bottom_left1, y_select);
1655     WriteSmoothDirectionalSum8(dst, top, weights_y, scaled_bottom_left_y,
1656                                scale);
1657     dst += stride;
1658   }
1659   for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
1660     const __m128i y_select = _mm_set1_epi32(y_mask);
1661     const __m128i weights_y = _mm_shuffle_epi8(weights2, y_select);
1662     const __m128i scaled_bottom_left_y =
1663         _mm_shuffle_epi8(scaled_bottom_left2, y_select);
1664     WriteSmoothDirectionalSum8(dst, top, weights_y, scaled_bottom_left_y,
1665                                scale);
1666     dst += stride;
1667   }
1668   for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
1669     const __m128i y_select = _mm_set1_epi32(y_mask);
1670     const __m128i weights_y = _mm_shuffle_epi8(weights3, y_select);
1671     const __m128i scaled_bottom_left_y =
1672         _mm_shuffle_epi8(scaled_bottom_left3, y_select);
1673     WriteSmoothDirectionalSum8(dst, top, weights_y, scaled_bottom_left_y,
1674                                scale);
1675     dst += stride;
1676   }
1677   for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
1678     const __m128i y_select = _mm_set1_epi32(y_mask);
1679     const __m128i weights_y = _mm_shuffle_epi8(weights4, y_select);
1680     const __m128i scaled_bottom_left_y =
1681         _mm_shuffle_epi8(scaled_bottom_left4, y_select);
1682     WriteSmoothDirectionalSum8(dst, top, weights_y, scaled_bottom_left_y,
1683                                scale);
1684     dst += stride;
1685   }
1686 }
1687 
1688 void SmoothVertical16x4_SSE4_1(void* LIBGAV1_RESTRICT const dest,
1689                                const ptrdiff_t stride,
1690                                const void* LIBGAV1_RESTRICT const top_row,
1691                                const void* LIBGAV1_RESTRICT const left_column) {
1692   const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
1693   auto* dst = static_cast<uint8_t*>(dest);
1694   const __m128i bottom_left = _mm_set1_epi16(left_ptr[3]);
1695   const __m128i weights = _mm_cvtepu8_epi16(Load4(kSmoothWeights));
1696   __m128i scale = _mm_set1_epi16(256);
1697   const __m128i inverted_weights = _mm_sub_epi16(scale, weights);
1698   const __m128i scaled_bottom_left =
1699       _mm_mullo_epi16(inverted_weights, bottom_left);
1700   scale = _mm_set1_epi16(128);
1701   const __m128i top = LoadUnaligned16(top_row);
1702   const __m128i top_lo = _mm_cvtepu8_epi16(top);
1703   const __m128i top_hi = _mm_cvtepu8_epi16(_mm_srli_si128(top, 8));
1704 
1705   __m128i y_select = _mm_set1_epi32(0x01000100);
1706   __m128i weights_y = _mm_shuffle_epi8(weights, y_select);
1707   __m128i scaled_bottom_left_y = _mm_shuffle_epi8(scaled_bottom_left, y_select);
1708   WriteSmoothDirectionalSum16(dst, top_lo, top_hi, weights_y, weights_y,
1709                               scaled_bottom_left_y, scaled_bottom_left_y,
1710                               scale);
1711   dst += stride;
1712   y_select = _mm_set1_epi32(0x03020302);
1713   weights_y = _mm_shuffle_epi8(weights, y_select);
1714   scaled_bottom_left_y = _mm_shuffle_epi8(scaled_bottom_left, y_select);
1715   WriteSmoothDirectionalSum16(dst, top_lo, top_hi, weights_y, weights_y,
1716                               scaled_bottom_left_y, scaled_bottom_left_y,
1717                               scale);
1718   dst += stride;
1719   y_select = _mm_set1_epi32(0x05040504);
1720   weights_y = _mm_shuffle_epi8(weights, y_select);
1721   scaled_bottom_left_y = _mm_shuffle_epi8(scaled_bottom_left, y_select);
1722   WriteSmoothDirectionalSum16(dst, top_lo, top_hi, weights_y, weights_y,
1723                               scaled_bottom_left_y, scaled_bottom_left_y,
1724                               scale);
1725   dst += stride;
1726   y_select = _mm_set1_epi32(0x07060706);
1727   weights_y = _mm_shuffle_epi8(weights, y_select);
1728   scaled_bottom_left_y = _mm_shuffle_epi8(scaled_bottom_left, y_select);
1729   WriteSmoothDirectionalSum16(dst, top_lo, top_hi, weights_y, weights_y,
1730                               scaled_bottom_left_y, scaled_bottom_left_y,
1731                               scale);
1732 }
1733 
1734 void SmoothVertical16x8_SSE4_1(void* LIBGAV1_RESTRICT const dest,
1735                                const ptrdiff_t stride,
1736                                const void* LIBGAV1_RESTRICT const top_row,
1737                                const void* LIBGAV1_RESTRICT const left_column) {
1738   const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
1739   auto* dst = static_cast<uint8_t*>(dest);
1740   const __m128i bottom_left = _mm_set1_epi16(left_ptr[7]);
1741   const __m128i weights = _mm_cvtepu8_epi16(LoadLo8(kSmoothWeights + 4));
1742   __m128i scale = _mm_set1_epi16(256);
1743   const __m128i inverted_weights = _mm_sub_epi16(scale, weights);
1744   const __m128i scaled_bottom_left =
1745       _mm_mullo_epi16(inverted_weights, bottom_left);
1746   scale = _mm_set1_epi16(128);
1747 
1748   const __m128i top = LoadUnaligned16(top_row);
1749   const __m128i top_lo = _mm_cvtepu8_epi16(top);
1750   const __m128i top_hi = _mm_cvtepu8_epi16(_mm_srli_si128(top, 8));
1751   for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
1752     const __m128i y_select = _mm_set1_epi32(y_mask);
1753     const __m128i weights_y = _mm_shuffle_epi8(weights, y_select);
1754     const __m128i scaled_bottom_left_y =
1755         _mm_shuffle_epi8(scaled_bottom_left, y_select);
1756     WriteSmoothDirectionalSum16(dst, top_lo, top_hi, weights_y, weights_y,
1757                                 scaled_bottom_left_y, scaled_bottom_left_y,
1758                                 scale);
1759     dst += stride;
1760   }
1761 }
1762 
1763 void SmoothVertical16x16_SSE4_1(
1764     void* LIBGAV1_RESTRICT const dest, const ptrdiff_t stride,
1765     const void* LIBGAV1_RESTRICT const top_row,
1766     const void* LIBGAV1_RESTRICT const left_column) {
1767   const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
1768   auto* dst = static_cast<uint8_t*>(dest);
1769   const __m128i bottom_left = _mm_set1_epi16(left_ptr[15]);
1770   const __m128i zero = _mm_setzero_si128();
1771   __m128i scale = _mm_set1_epi16(256);
1772   const __m128i weights = LoadUnaligned16(kSmoothWeights + 12);
1773   const __m128i weights_lo = _mm_cvtepu8_epi16(weights);
1774   const __m128i weights_hi = _mm_unpackhi_epi8(weights, zero);
1775   const __m128i inverted_weights_lo = _mm_sub_epi16(scale, weights_lo);
1776   const __m128i inverted_weights_hi = _mm_sub_epi16(scale, weights_hi);
1777   const __m128i scaled_bottom_left_lo =
1778       _mm_mullo_epi16(inverted_weights_lo, bottom_left);
1779   const __m128i scaled_bottom_left_hi =
1780       _mm_mullo_epi16(inverted_weights_hi, bottom_left);
1781   scale = _mm_set1_epi16(128);
1782 
1783   const __m128i top = LoadUnaligned16(top_row);
1784   const __m128i top_lo = _mm_cvtepu8_epi16(top);
1785   const __m128i top_hi = _mm_unpackhi_epi8(top, zero);
1786   for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
1787     const __m128i y_select = _mm_set1_epi32(y_mask);
1788     const __m128i weights_y = _mm_shuffle_epi8(weights_lo, y_select);
1789     const __m128i scaled_bottom_left_y =
1790         _mm_shuffle_epi8(scaled_bottom_left_lo, y_select);
1791     WriteSmoothDirectionalSum16(dst, top_lo, top_hi, weights_y, weights_y,
1792                                 scaled_bottom_left_y, scaled_bottom_left_y,
1793                                 scale);
1794     dst += stride;
1795   }
1796   for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
1797     const __m128i y_select = _mm_set1_epi32(y_mask);
1798     const __m128i weights_y = _mm_shuffle_epi8(weights_hi, y_select);
1799     const __m128i scaled_bottom_left_y =
1800         _mm_shuffle_epi8(scaled_bottom_left_hi, y_select);
1801     WriteSmoothDirectionalSum16(dst, top_lo, top_hi, weights_y, weights_y,
1802                                 scaled_bottom_left_y, scaled_bottom_left_y,
1803                                 scale);
1804     dst += stride;
1805   }
1806 }
1807 
1808 void SmoothVertical16x32_SSE4_1(
1809     void* LIBGAV1_RESTRICT const dest, const ptrdiff_t stride,
1810     const void* LIBGAV1_RESTRICT const top_row,
1811     const void* LIBGAV1_RESTRICT const left_column) {
1812   const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
1813   auto* dst = static_cast<uint8_t*>(dest);
1814   const __m128i bottom_left = _mm_set1_epi16(left_ptr[31]);
1815   const __m128i weights_lo = LoadUnaligned16(kSmoothWeights + 28);
1816   const __m128i weights_hi = LoadUnaligned16(kSmoothWeights + 44);
1817   __m128i scale = _mm_set1_epi16(256);
1818   const __m128i zero = _mm_setzero_si128();
1819   const __m128i weights1 = _mm_cvtepu8_epi16(weights_lo);
1820   const __m128i weights2 = _mm_unpackhi_epi8(weights_lo, zero);
1821   const __m128i weights3 = _mm_cvtepu8_epi16(weights_hi);
1822   const __m128i weights4 = _mm_unpackhi_epi8(weights_hi, zero);
1823   const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1);
1824   const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2);
1825   const __m128i inverted_weights3 = _mm_sub_epi16(scale, weights3);
1826   const __m128i inverted_weights4 = _mm_sub_epi16(scale, weights4);
1827   const __m128i scaled_bottom_left1 =
1828       _mm_mullo_epi16(inverted_weights1, bottom_left);
1829   const __m128i scaled_bottom_left2 =
1830       _mm_mullo_epi16(inverted_weights2, bottom_left);
1831   const __m128i scaled_bottom_left3 =
1832       _mm_mullo_epi16(inverted_weights3, bottom_left);
1833   const __m128i scaled_bottom_left4 =
1834       _mm_mullo_epi16(inverted_weights4, bottom_left);
1835   scale = _mm_set1_epi16(128);
1836 
1837   const __m128i top = LoadUnaligned16(top_row);
1838   const __m128i top_lo = _mm_cvtepu8_epi16(top);
1839   const __m128i top_hi = _mm_unpackhi_epi8(top, zero);
1840   for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
1841     const __m128i y_select = _mm_set1_epi32(y_mask);
1842     const __m128i weights_y = _mm_shuffle_epi8(weights1, y_select);
1843     const __m128i scaled_bottom_left_y =
1844         _mm_shuffle_epi8(scaled_bottom_left1, y_select);
1845     WriteSmoothDirectionalSum16(dst, top_lo, top_hi, weights_y, weights_y,
1846                                 scaled_bottom_left_y, scaled_bottom_left_y,
1847                                 scale);
1848     dst += stride;
1849   }
1850   for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
1851     const __m128i y_select = _mm_set1_epi32(y_mask);
1852     const __m128i weights_y = _mm_shuffle_epi8(weights2, y_select);
1853     const __m128i scaled_bottom_left_y =
1854         _mm_shuffle_epi8(scaled_bottom_left2, y_select);
1855     WriteSmoothDirectionalSum16(dst, top_lo, top_hi, weights_y, weights_y,
1856                                 scaled_bottom_left_y, scaled_bottom_left_y,
1857                                 scale);
1858     dst += stride;
1859   }
1860   for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
1861     const __m128i y_select = _mm_set1_epi32(y_mask);
1862     const __m128i weights_y = _mm_shuffle_epi8(weights3, y_select);
1863     const __m128i scaled_bottom_left_y =
1864         _mm_shuffle_epi8(scaled_bottom_left3, y_select);
1865     WriteSmoothDirectionalSum16(dst, top_lo, top_hi, weights_y, weights_y,
1866                                 scaled_bottom_left_y, scaled_bottom_left_y,
1867                                 scale);
1868     dst += stride;
1869   }
1870   for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
1871     const __m128i y_select = _mm_set1_epi32(y_mask);
1872     const __m128i weights_y = _mm_shuffle_epi8(weights4, y_select);
1873     const __m128i scaled_bottom_left_y =
1874         _mm_shuffle_epi8(scaled_bottom_left4, y_select);
1875     WriteSmoothDirectionalSum16(dst, top_lo, top_hi, weights_y, weights_y,
1876                                 scaled_bottom_left_y, scaled_bottom_left_y,
1877                                 scale);
1878     dst += stride;
1879   }
1880 }
1881 
1882 void SmoothVertical16x64_SSE4_1(
1883     void* LIBGAV1_RESTRICT const dest, const ptrdiff_t stride,
1884     const void* LIBGAV1_RESTRICT const top_row,
1885     const void* LIBGAV1_RESTRICT const left_column) {
1886   const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
1887   auto* dst = static_cast<uint8_t*>(dest);
1888   const __m128i bottom_left = _mm_set1_epi16(left_ptr[63]);
1889   const __m128i scale = _mm_set1_epi16(256);
1890   const __m128i round = _mm_set1_epi16(128);
1891   const __m128i zero = _mm_setzero_si128();
1892 
1893   const __m128i top = LoadUnaligned16(top_row);
1894   const __m128i top_lo = _mm_cvtepu8_epi16(top);
1895   const __m128i top_hi = _mm_unpackhi_epi8(top, zero);
1896   const uint8_t* weights_base_ptr = kSmoothWeights + 60;
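  // The 64 rows are produced in four 16-row chunks, each loading the next 16
  // entries of the 64-entry weight table (which begins at offset 60).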
1897   for (int left_offset = 0; left_offset < 64; left_offset += 16) {
1898     const __m128i weights = LoadUnaligned16(weights_base_ptr + left_offset);
1899     const __m128i weights_lo = _mm_cvtepu8_epi16(weights);
1900     const __m128i weights_hi = _mm_unpackhi_epi8(weights, zero);
1901     const __m128i inverted_weights_lo = _mm_sub_epi16(scale, weights_lo);
1902     const __m128i inverted_weights_hi = _mm_sub_epi16(scale, weights_hi);
1903     const __m128i scaled_bottom_left_lo =
1904         _mm_mullo_epi16(inverted_weights_lo, bottom_left);
1905     const __m128i scaled_bottom_left_hi =
1906         _mm_mullo_epi16(inverted_weights_hi, bottom_left);
1907 
1908     for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
1909       const __m128i y_select = _mm_set1_epi32(y_mask);
1910       const __m128i weights_y = _mm_shuffle_epi8(weights_lo, y_select);
1911       const __m128i scaled_bottom_left_y =
1912           _mm_shuffle_epi8(scaled_bottom_left_lo, y_select);
1913       WriteSmoothDirectionalSum16(dst, top_lo, top_hi, weights_y, weights_y,
1914                                   scaled_bottom_left_y, scaled_bottom_left_y,
1915                                   round);
1916       dst += stride;
1917     }
1918     for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
1919       const __m128i y_select = _mm_set1_epi32(y_mask);
1920       const __m128i weights_y = _mm_shuffle_epi8(weights_hi, y_select);
1921       const __m128i scaled_bottom_left_y =
1922           _mm_shuffle_epi8(scaled_bottom_left_hi, y_select);
1923       WriteSmoothDirectionalSum16(dst, top_lo, top_hi, weights_y, weights_y,
1924                                   scaled_bottom_left_y, scaled_bottom_left_y,
1925                                   round);
1926       dst += stride;
1927     }
1928   }
1929 }
1930 
1931 void SmoothVertical32x8_SSE4_1(void* LIBGAV1_RESTRICT const dest,
1932                                const ptrdiff_t stride,
1933                                const void* LIBGAV1_RESTRICT const top_row,
1934                                const void* LIBGAV1_RESTRICT const left_column) {
1935   const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
1936   const auto* const top_ptr = static_cast<const uint8_t*>(top_row);
1937   auto* dst = static_cast<uint8_t*>(dest);
1938   const __m128i zero = _mm_setzero_si128();
1939   const __m128i bottom_left = _mm_set1_epi16(left_ptr[7]);
1940   const __m128i top_lo = LoadUnaligned16(top_ptr);
1941   const __m128i top_hi = LoadUnaligned16(top_ptr + 16);
1942   const __m128i top1 = _mm_cvtepu8_epi16(top_lo);
1943   const __m128i top2 = _mm_unpackhi_epi8(top_lo, zero);
1944   const __m128i top3 = _mm_cvtepu8_epi16(top_hi);
1945   const __m128i top4 = _mm_unpackhi_epi8(top_hi, zero);
1946   __m128i scale = _mm_set1_epi16(256);
  const __m128i weights = _mm_cvtepu8_epi16(LoadLo8(kSmoothWeights + 4));
  const __m128i inverted_weights = _mm_sub_epi16(scale, weights);
  const __m128i scaled_bottom_left =
      _mm_mullo_epi16(inverted_weights, bottom_left);
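  // Reuse |scale| as the rounding constant: 128 pairs with the shift by 8 in
  // WriteSmoothDirectionalSum16.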
  scale = _mm_set1_epi16(128);
  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
    __m128i y_select = _mm_set1_epi32(y_mask);
    const __m128i weights_y = _mm_shuffle_epi8(weights, y_select);
    const __m128i scaled_bottom_left_y =
        _mm_shuffle_epi8(scaled_bottom_left, y_select);
    WriteSmoothDirectionalSum16(dst, top1, top2, weights_y, weights_y,
                                scaled_bottom_left_y, scaled_bottom_left_y,
                                scale);
    WriteSmoothDirectionalSum16(dst + 16, top3, top4, weights_y, weights_y,
                                scaled_bottom_left_y, scaled_bottom_left_y,
                                scale);
    dst += stride;
  }
}

void SmoothVertical32x16_SSE4_1(
    void* LIBGAV1_RESTRICT const dest, const ptrdiff_t stride,
    const void* LIBGAV1_RESTRICT const top_row,
    const void* LIBGAV1_RESTRICT const left_column) {
  const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
  const auto* const top_ptr = static_cast<const uint8_t*>(top_row);
  auto* dst = static_cast<uint8_t*>(dest);
  const __m128i zero = _mm_setzero_si128();
  const __m128i bottom_left = _mm_set1_epi16(left_ptr[15]);
  const __m128i top_lo = LoadUnaligned16(top_ptr);
  const __m128i top_hi = LoadUnaligned16(top_ptr + 16);
  const __m128i top1 = _mm_cvtepu8_epi16(top_lo);
  const __m128i top2 = _mm_unpackhi_epi8(top_lo, zero);
  const __m128i top3 = _mm_cvtepu8_epi16(top_hi);
  const __m128i top4 = _mm_unpackhi_epi8(top_hi, zero);
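  // The height-16 weights start at offset 4 + 8 = 12 in kSmoothWeights.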
  const __m128i weights = LoadUnaligned16(kSmoothWeights + 12);
  const __m128i weights1 = _mm_cvtepu8_epi16(weights);
  const __m128i weights2 = _mm_unpackhi_epi8(weights, zero);
  __m128i scale = _mm_set1_epi16(256);
  const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1);
  const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2);
  const __m128i scaled_bottom_left1 =
      _mm_mullo_epi16(inverted_weights1, bottom_left);
  const __m128i scaled_bottom_left2 =
      _mm_mullo_epi16(inverted_weights2, bottom_left);
  scale = _mm_set1_epi16(128);
  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
    __m128i y_select = _mm_set1_epi32(y_mask);
    const __m128i weights_y = _mm_shuffle_epi8(weights1, y_select);
    const __m128i scaled_bottom_left_y =
        _mm_shuffle_epi8(scaled_bottom_left1, y_select);
    WriteSmoothDirectionalSum16(dst, top1, top2, weights_y, weights_y,
                                scaled_bottom_left_y, scaled_bottom_left_y,
                                scale);
    WriteSmoothDirectionalSum16(dst + 16, top3, top4, weights_y, weights_y,
                                scaled_bottom_left_y, scaled_bottom_left_y,
                                scale);
    dst += stride;
  }
  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
    __m128i y_select = _mm_set1_epi32(y_mask);
    const __m128i weights_y = _mm_shuffle_epi8(weights2, y_select);
    const __m128i scaled_bottom_left_y =
        _mm_shuffle_epi8(scaled_bottom_left2, y_select);
    WriteSmoothDirectionalSum16(dst, top1, top2, weights_y, weights_y,
                                scaled_bottom_left_y, scaled_bottom_left_y,
                                scale);
    WriteSmoothDirectionalSum16(dst + 16, top3, top4, weights_y, weights_y,
                                scaled_bottom_left_y, scaled_bottom_left_y,
                                scale);
    dst += stride;
  }
}

void SmoothVertical32x32_SSE4_1(
    void* LIBGAV1_RESTRICT const dest, const ptrdiff_t stride,
    const void* LIBGAV1_RESTRICT const top_row,
    const void* LIBGAV1_RESTRICT const left_column) {
  const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
  auto* dst = static_cast<uint8_t*>(dest);
  const auto* const top_ptr = static_cast<const uint8_t*>(top_row);
  const __m128i bottom_left = _mm_set1_epi16(left_ptr[31]);
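  // The height-32 weights start at offset 4 + 8 + 16 = 28 in kSmoothWeights;
  // the load at +44 picks up their upper 16 entries.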
  const __m128i weights_lo = LoadUnaligned16(kSmoothWeights + 28);
  const __m128i weights_hi = LoadUnaligned16(kSmoothWeights + 44);
  const __m128i zero = _mm_setzero_si128();
  __m128i scale = _mm_set1_epi16(256);
  const __m128i top_lo = LoadUnaligned16(top_ptr);
  const __m128i top_hi = LoadUnaligned16(top_ptr + 16);
  const __m128i top1 = _mm_cvtepu8_epi16(top_lo);
  const __m128i top2 = _mm_unpackhi_epi8(top_lo, zero);
  const __m128i top3 = _mm_cvtepu8_epi16(top_hi);
  const __m128i top4 = _mm_unpackhi_epi8(top_hi, zero);
  const __m128i weights1 = _mm_cvtepu8_epi16(weights_lo);
  const __m128i weights2 = _mm_unpackhi_epi8(weights_lo, zero);
  const __m128i weights3 = _mm_cvtepu8_epi16(weights_hi);
  const __m128i weights4 = _mm_unpackhi_epi8(weights_hi, zero);
  const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1);
  const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2);
  const __m128i inverted_weights3 = _mm_sub_epi16(scale, weights3);
  const __m128i inverted_weights4 = _mm_sub_epi16(scale, weights4);
  const __m128i scaled_bottom_left1 =
      _mm_mullo_epi16(inverted_weights1, bottom_left);
  const __m128i scaled_bottom_left2 =
      _mm_mullo_epi16(inverted_weights2, bottom_left);
  const __m128i scaled_bottom_left3 =
      _mm_mullo_epi16(inverted_weights3, bottom_left);
  const __m128i scaled_bottom_left4 =
      _mm_mullo_epi16(inverted_weights4, bottom_left);
  scale = _mm_set1_epi16(128);
  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
    const __m128i y_select = _mm_set1_epi32(y_mask);
    const __m128i weights_y = _mm_shuffle_epi8(weights1, y_select);
    const __m128i scaled_bottom_left_y =
        _mm_shuffle_epi8(scaled_bottom_left1, y_select);
    WriteSmoothDirectionalSum16(dst, top1, top2, weights_y, weights_y,
                                scaled_bottom_left_y, scaled_bottom_left_y,
                                scale);
    WriteSmoothDirectionalSum16(dst + 16, top3, top4, weights_y, weights_y,
                                scaled_bottom_left_y, scaled_bottom_left_y,
                                scale);
    dst += stride;
  }
  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
    const __m128i y_select = _mm_set1_epi32(y_mask);
    const __m128i weights_y = _mm_shuffle_epi8(weights2, y_select);
    const __m128i scaled_bottom_left_y =
        _mm_shuffle_epi8(scaled_bottom_left2, y_select);
    WriteSmoothDirectionalSum16(dst, top1, top2, weights_y, weights_y,
                                scaled_bottom_left_y, scaled_bottom_left_y,
                                scale);
    WriteSmoothDirectionalSum16(dst + 16, top3, top4, weights_y, weights_y,
                                scaled_bottom_left_y, scaled_bottom_left_y,
                                scale);
    dst += stride;
  }
  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
    const __m128i y_select = _mm_set1_epi32(y_mask);
    const __m128i weights_y = _mm_shuffle_epi8(weights3, y_select);
    const __m128i scaled_bottom_left_y =
        _mm_shuffle_epi8(scaled_bottom_left3, y_select);
    WriteSmoothDirectionalSum16(dst, top1, top2, weights_y, weights_y,
                                scaled_bottom_left_y, scaled_bottom_left_y,
                                scale);
    WriteSmoothDirectionalSum16(dst + 16, top3, top4, weights_y, weights_y,
                                scaled_bottom_left_y, scaled_bottom_left_y,
                                scale);
    dst += stride;
  }
  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
    const __m128i y_select = _mm_set1_epi32(y_mask);
    const __m128i weights_y = _mm_shuffle_epi8(weights4, y_select);
    const __m128i scaled_bottom_left_y =
        _mm_shuffle_epi8(scaled_bottom_left4, y_select);
    WriteSmoothDirectionalSum16(dst, top1, top2, weights_y, weights_y,
                                scaled_bottom_left_y, scaled_bottom_left_y,
                                scale);
    WriteSmoothDirectionalSum16(dst + 16, top3, top4, weights_y, weights_y,
                                scaled_bottom_left_y, scaled_bottom_left_y,
                                scale);
    dst += stride;
  }
}

void SmoothVertical32x64_SSE4_1(
    void* LIBGAV1_RESTRICT const dest, const ptrdiff_t stride,
    const void* LIBGAV1_RESTRICT const top_row,
    const void* LIBGAV1_RESTRICT const left_column) {
  const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
  auto* dst = static_cast<uint8_t*>(dest);
  const auto* const top_ptr = static_cast<const uint8_t*>(top_row);
  const __m128i zero = _mm_setzero_si128();
  const __m128i bottom_left = _mm_set1_epi16(left_ptr[63]);
  const __m128i top_lo = LoadUnaligned16(top_ptr);
  const __m128i top_hi = LoadUnaligned16(top_ptr + 16);
  const __m128i top1 = _mm_cvtepu8_epi16(top_lo);
  const __m128i top2 = _mm_unpackhi_epi8(top_lo, zero);
  const __m128i top3 = _mm_cvtepu8_epi16(top_hi);
  const __m128i top4 = _mm_unpackhi_epi8(top_hi, zero);
  const __m128i scale = _mm_set1_epi16(256);
  const __m128i round = _mm_set1_epi16(128);
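  // The height-64 weights start at offset 60; each outer iteration loads 16
  // of them and renders 16 rows.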
  const uint8_t* weights_base_ptr = kSmoothWeights + 60;
  for (int left_offset = 0; left_offset < 64; left_offset += 16) {
    const __m128i weights = LoadUnaligned16(weights_base_ptr + left_offset);
    const __m128i weights_lo = _mm_cvtepu8_epi16(weights);
    const __m128i weights_hi = _mm_unpackhi_epi8(weights, zero);
    const __m128i inverted_weights_lo = _mm_sub_epi16(scale, weights_lo);
    const __m128i inverted_weights_hi = _mm_sub_epi16(scale, weights_hi);
    const __m128i scaled_bottom_left_lo =
        _mm_mullo_epi16(inverted_weights_lo, bottom_left);
    const __m128i scaled_bottom_left_hi =
        _mm_mullo_epi16(inverted_weights_hi, bottom_left);

    for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
      const __m128i y_select = _mm_set1_epi32(y_mask);
      const __m128i weights_y = _mm_shuffle_epi8(weights_lo, y_select);
      const __m128i scaled_bottom_left_y =
          _mm_shuffle_epi8(scaled_bottom_left_lo, y_select);
      WriteSmoothDirectionalSum16(dst, top1, top2, weights_y, weights_y,
                                  scaled_bottom_left_y, scaled_bottom_left_y,
                                  round);
      WriteSmoothDirectionalSum16(dst + 16, top3, top4, weights_y, weights_y,
                                  scaled_bottom_left_y, scaled_bottom_left_y,
                                  round);
      dst += stride;
    }
    for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
      const __m128i y_select = _mm_set1_epi32(y_mask);
      const __m128i weights_y = _mm_shuffle_epi8(weights_hi, y_select);
      const __m128i scaled_bottom_left_y =
          _mm_shuffle_epi8(scaled_bottom_left_hi, y_select);
      WriteSmoothDirectionalSum16(dst, top1, top2, weights_y, weights_y,
                                  scaled_bottom_left_y, scaled_bottom_left_y,
                                  round);
      WriteSmoothDirectionalSum16(dst + 16, top3, top4, weights_y, weights_y,
                                  scaled_bottom_left_y, scaled_bottom_left_y,
                                  round);
      dst += stride;
    }
  }
}

void SmoothVertical64x16_SSE4_1(
    void* LIBGAV1_RESTRICT const dest, const ptrdiff_t stride,
    const void* LIBGAV1_RESTRICT const top_row,
    const void* LIBGAV1_RESTRICT const left_column) {
  const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
  auto* dst = static_cast<uint8_t*>(dest);
  const auto* const top_ptr = static_cast<const uint8_t*>(top_row);
  const __m128i bottom_left = _mm_set1_epi16(left_ptr[15]);
  __m128i scale = _mm_set1_epi16(256);
  const __m128i zero = _mm_setzero_si128();
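  // Widen the 64 top samples into eight registers of eight 16-bit lanes
  // (top1..top8 below).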
  const __m128i top_lolo = LoadUnaligned16(top_ptr);
  const __m128i top_lohi = LoadUnaligned16(top_ptr + 16);
  const __m128i top1 = _mm_cvtepu8_epi16(top_lolo);
  const __m128i top2 = _mm_unpackhi_epi8(top_lolo, zero);
  const __m128i top3 = _mm_cvtepu8_epi16(top_lohi);
  const __m128i top4 = _mm_unpackhi_epi8(top_lohi, zero);

  const __m128i weights = LoadUnaligned16(kSmoothWeights + 12);
  const __m128i weights1 = _mm_cvtepu8_epi16(weights);
  const __m128i weights2 = _mm_unpackhi_epi8(weights, zero);
  const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1);
  const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2);
  const __m128i top_hilo = LoadUnaligned16(top_ptr + 32);
  const __m128i top_hihi = LoadUnaligned16(top_ptr + 48);
  const __m128i top5 = _mm_cvtepu8_epi16(top_hilo);
  const __m128i top6 = _mm_unpackhi_epi8(top_hilo, zero);
  const __m128i top7 = _mm_cvtepu8_epi16(top_hihi);
  const __m128i top8 = _mm_unpackhi_epi8(top_hihi, zero);
  const __m128i scaled_bottom_left1 =
      _mm_mullo_epi16(inverted_weights1, bottom_left);
  const __m128i scaled_bottom_left2 =
      _mm_mullo_epi16(inverted_weights2, bottom_left);
  scale = _mm_set1_epi16(128);
  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
    const __m128i y_select = _mm_set1_epi32(y_mask);
    const __m128i weights_y = _mm_shuffle_epi8(weights1, y_select);
    const __m128i scaled_bottom_left_y =
        _mm_shuffle_epi8(scaled_bottom_left1, y_select);
    WriteSmoothDirectionalSum16(dst, top1, top2, weights_y, weights_y,
                                scaled_bottom_left_y, scaled_bottom_left_y,
                                scale);
    WriteSmoothDirectionalSum16(dst + 16, top3, top4, weights_y, weights_y,
                                scaled_bottom_left_y, scaled_bottom_left_y,
                                scale);
    WriteSmoothDirectionalSum16(dst + 32, top5, top6, weights_y, weights_y,
                                scaled_bottom_left_y, scaled_bottom_left_y,
                                scale);
    WriteSmoothDirectionalSum16(dst + 48, top7, top8, weights_y, weights_y,
                                scaled_bottom_left_y, scaled_bottom_left_y,
                                scale);
    dst += stride;
  }
  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
    const __m128i y_select = _mm_set1_epi32(y_mask);
    const __m128i weights_y = _mm_shuffle_epi8(weights2, y_select);
    const __m128i scaled_bottom_left_y =
        _mm_shuffle_epi8(scaled_bottom_left2, y_select);
    WriteSmoothDirectionalSum16(dst, top1, top2, weights_y, weights_y,
                                scaled_bottom_left_y, scaled_bottom_left_y,
                                scale);
    WriteSmoothDirectionalSum16(dst + 16, top3, top4, weights_y, weights_y,
                                scaled_bottom_left_y, scaled_bottom_left_y,
                                scale);
    WriteSmoothDirectionalSum16(dst + 32, top5, top6, weights_y, weights_y,
                                scaled_bottom_left_y, scaled_bottom_left_y,
                                scale);
    WriteSmoothDirectionalSum16(dst + 48, top7, top8, weights_y, weights_y,
                                scaled_bottom_left_y, scaled_bottom_left_y,
                                scale);
    dst += stride;
  }
}

void SmoothVertical64x32_SSE4_1(
    void* LIBGAV1_RESTRICT const dest, const ptrdiff_t stride,
    const void* LIBGAV1_RESTRICT const top_row,
    const void* LIBGAV1_RESTRICT const left_column) {
  const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
  auto* dst = static_cast<uint8_t*>(dest);
  const auto* const top_ptr = static_cast<const uint8_t*>(top_row);
  const __m128i zero = _mm_setzero_si128();
  const __m128i bottom_left = _mm_set1_epi16(left_ptr[31]);
  const __m128i top_lolo = LoadUnaligned16(top_ptr);
  const __m128i top_lohi = LoadUnaligned16(top_ptr + 16);
  const __m128i top1 = _mm_cvtepu8_epi16(top_lolo);
  const __m128i top2 = _mm_unpackhi_epi8(top_lolo, zero);
  const __m128i top3 = _mm_cvtepu8_epi16(top_lohi);
  const __m128i top4 = _mm_unpackhi_epi8(top_lohi, zero);
  const __m128i top_hilo = LoadUnaligned16(top_ptr + 32);
  const __m128i top_hihi = LoadUnaligned16(top_ptr + 48);
  const __m128i top5 = _mm_cvtepu8_epi16(top_hilo);
  const __m128i top6 = _mm_unpackhi_epi8(top_hilo, zero);
  const __m128i top7 = _mm_cvtepu8_epi16(top_hihi);
  const __m128i top8 = _mm_unpackhi_epi8(top_hihi, zero);
  const __m128i weights_lo = LoadUnaligned16(kSmoothWeights + 28);
  const __m128i weights_hi = LoadUnaligned16(kSmoothWeights + 44);
  const __m128i weights1 = _mm_cvtepu8_epi16(weights_lo);
  const __m128i weights2 = _mm_unpackhi_epi8(weights_lo, zero);
  const __m128i weights3 = _mm_cvtepu8_epi16(weights_hi);
  const __m128i weights4 = _mm_unpackhi_epi8(weights_hi, zero);
  __m128i scale = _mm_set1_epi16(256);
  const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1);
  const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2);
  const __m128i inverted_weights3 = _mm_sub_epi16(scale, weights3);
  const __m128i inverted_weights4 = _mm_sub_epi16(scale, weights4);
  const __m128i scaled_bottom_left1 =
      _mm_mullo_epi16(inverted_weights1, bottom_left);
  const __m128i scaled_bottom_left2 =
      _mm_mullo_epi16(inverted_weights2, bottom_left);
  const __m128i scaled_bottom_left3 =
      _mm_mullo_epi16(inverted_weights3, bottom_left);
  const __m128i scaled_bottom_left4 =
      _mm_mullo_epi16(inverted_weights4, bottom_left);
  scale = _mm_set1_epi16(128);

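  // Each of the four loops below broadcasts one quarter of the 32 row weights
  // and renders 8 rows of the 64-wide block.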
  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
    const __m128i y_select = _mm_set1_epi32(y_mask);
    const __m128i weights_y = _mm_shuffle_epi8(weights1, y_select);
    const __m128i scaled_bottom_left_y =
        _mm_shuffle_epi8(scaled_bottom_left1, y_select);
    WriteSmoothDirectionalSum16(dst, top1, top2, weights_y, weights_y,
                                scaled_bottom_left_y, scaled_bottom_left_y,
                                scale);
    WriteSmoothDirectionalSum16(dst + 16, top3, top4, weights_y, weights_y,
                                scaled_bottom_left_y, scaled_bottom_left_y,
                                scale);
    WriteSmoothDirectionalSum16(dst + 32, top5, top6, weights_y, weights_y,
                                scaled_bottom_left_y, scaled_bottom_left_y,
                                scale);
    WriteSmoothDirectionalSum16(dst + 48, top7, top8, weights_y, weights_y,
                                scaled_bottom_left_y, scaled_bottom_left_y,
                                scale);
    dst += stride;
  }
  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
    const __m128i y_select = _mm_set1_epi32(y_mask);
    const __m128i weights_y = _mm_shuffle_epi8(weights2, y_select);
    const __m128i scaled_bottom_left_y =
        _mm_shuffle_epi8(scaled_bottom_left2, y_select);
    WriteSmoothDirectionalSum16(dst, top1, top2, weights_y, weights_y,
                                scaled_bottom_left_y, scaled_bottom_left_y,
                                scale);
    WriteSmoothDirectionalSum16(dst + 16, top3, top4, weights_y, weights_y,
                                scaled_bottom_left_y, scaled_bottom_left_y,
                                scale);
    WriteSmoothDirectionalSum16(dst + 32, top5, top6, weights_y, weights_y,
                                scaled_bottom_left_y, scaled_bottom_left_y,
                                scale);
    WriteSmoothDirectionalSum16(dst + 48, top7, top8, weights_y, weights_y,
                                scaled_bottom_left_y, scaled_bottom_left_y,
                                scale);
    dst += stride;
  }
  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
    const __m128i y_select = _mm_set1_epi32(y_mask);
    const __m128i weights_y = _mm_shuffle_epi8(weights3, y_select);
    const __m128i scaled_bottom_left_y =
        _mm_shuffle_epi8(scaled_bottom_left3, y_select);
    WriteSmoothDirectionalSum16(dst, top1, top2, weights_y, weights_y,
                                scaled_bottom_left_y, scaled_bottom_left_y,
                                scale);
    WriteSmoothDirectionalSum16(dst + 16, top3, top4, weights_y, weights_y,
                                scaled_bottom_left_y, scaled_bottom_left_y,
                                scale);
    WriteSmoothDirectionalSum16(dst + 32, top5, top6, weights_y, weights_y,
                                scaled_bottom_left_y, scaled_bottom_left_y,
                                scale);
    WriteSmoothDirectionalSum16(dst + 48, top7, top8, weights_y, weights_y,
                                scaled_bottom_left_y, scaled_bottom_left_y,
                                scale);
    dst += stride;
  }
  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
    const __m128i y_select = _mm_set1_epi32(y_mask);
    const __m128i weights_y = _mm_shuffle_epi8(weights4, y_select);
    const __m128i scaled_bottom_left_y =
        _mm_shuffle_epi8(scaled_bottom_left4, y_select);
    WriteSmoothDirectionalSum16(dst, top1, top2, weights_y, weights_y,
                                scaled_bottom_left_y, scaled_bottom_left_y,
                                scale);
    WriteSmoothDirectionalSum16(dst + 16, top3, top4, weights_y, weights_y,
                                scaled_bottom_left_y, scaled_bottom_left_y,
                                scale);
    WriteSmoothDirectionalSum16(dst + 32, top5, top6, weights_y, weights_y,
                                scaled_bottom_left_y, scaled_bottom_left_y,
                                scale);
    WriteSmoothDirectionalSum16(dst + 48, top7, top8, weights_y, weights_y,
                                scaled_bottom_left_y, scaled_bottom_left_y,
                                scale);
    dst += stride;
  }
}

void SmoothVertical64x64_SSE4_1(
    void* LIBGAV1_RESTRICT const dest, const ptrdiff_t stride,
    const void* LIBGAV1_RESTRICT const top_row,
    const void* LIBGAV1_RESTRICT const left_column) {
  const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
  auto* dst = static_cast<uint8_t*>(dest);
  const auto* const top_ptr = static_cast<const uint8_t*>(top_row);
  const __m128i zero = _mm_setzero_si128();
  const __m128i bottom_left = _mm_set1_epi16(left_ptr[63]);
  const __m128i top_lolo = LoadUnaligned16(top_ptr);
  const __m128i top_lohi = LoadUnaligned16(top_ptr + 16);
  const __m128i top1 = _mm_cvtepu8_epi16(top_lolo);
  const __m128i top2 = _mm_unpackhi_epi8(top_lolo, zero);
  const __m128i top3 = _mm_cvtepu8_epi16(top_lohi);
  const __m128i top4 = _mm_unpackhi_epi8(top_lohi, zero);
  const __m128i top_hilo = LoadUnaligned16(top_ptr + 32);
  const __m128i top_hihi = LoadUnaligned16(top_ptr + 48);
  const __m128i top5 = _mm_cvtepu8_epi16(top_hilo);
  const __m128i top6 = _mm_unpackhi_epi8(top_hilo, zero);
  const __m128i top7 = _mm_cvtepu8_epi16(top_hihi);
  const __m128i top8 = _mm_unpackhi_epi8(top_hihi, zero);
  const __m128i scale = _mm_set1_epi16(256);
  const __m128i round = _mm_set1_epi16(128);
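  // As in SmoothVertical32x64, the height-64 weights start at offset 60 and
  // are consumed 16 rows at a time.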
  const uint8_t* weights_base_ptr = kSmoothWeights + 60;
  for (int left_offset = 0; left_offset < 64; left_offset += 16) {
    const __m128i weights = LoadUnaligned16(weights_base_ptr + left_offset);
    const __m128i weights_lo = _mm_cvtepu8_epi16(weights);
    const __m128i weights_hi = _mm_unpackhi_epi8(weights, zero);
    const __m128i inverted_weights_lo = _mm_sub_epi16(scale, weights_lo);
    const __m128i inverted_weights_hi = _mm_sub_epi16(scale, weights_hi);
    const __m128i scaled_bottom_left_lo =
        _mm_mullo_epi16(inverted_weights_lo, bottom_left);
    const __m128i scaled_bottom_left_hi =
        _mm_mullo_epi16(inverted_weights_hi, bottom_left);
    for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
      const __m128i y_select = _mm_set1_epi32(y_mask);
      const __m128i weights_y = _mm_shuffle_epi8(weights_lo, y_select);
      const __m128i scaled_bottom_left_y =
          _mm_shuffle_epi8(scaled_bottom_left_lo, y_select);
      WriteSmoothDirectionalSum16(dst, top1, top2, weights_y, weights_y,
                                  scaled_bottom_left_y, scaled_bottom_left_y,
                                  round);
      WriteSmoothDirectionalSum16(dst + 16, top3, top4, weights_y, weights_y,
                                  scaled_bottom_left_y, scaled_bottom_left_y,
                                  round);
      WriteSmoothDirectionalSum16(dst + 32, top5, top6, weights_y, weights_y,
                                  scaled_bottom_left_y, scaled_bottom_left_y,
                                  round);
      WriteSmoothDirectionalSum16(dst + 48, top7, top8, weights_y, weights_y,
                                  scaled_bottom_left_y, scaled_bottom_left_y,
                                  round);
      dst += stride;
    }
    for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
      const __m128i y_select = _mm_set1_epi32(y_mask);
      const __m128i weights_y = _mm_shuffle_epi8(weights_hi, y_select);
      const __m128i scaled_bottom_left_y =
          _mm_shuffle_epi8(scaled_bottom_left_hi, y_select);
      WriteSmoothDirectionalSum16(dst, top1, top2, weights_y, weights_y,
                                  scaled_bottom_left_y, scaled_bottom_left_y,
                                  round);
      WriteSmoothDirectionalSum16(dst + 16, top3, top4, weights_y, weights_y,
                                  scaled_bottom_left_y, scaled_bottom_left_y,
                                  round);
      WriteSmoothDirectionalSum16(dst + 32, top5, top6, weights_y, weights_y,
                                  scaled_bottom_left_y, scaled_bottom_left_y,
                                  round);
      WriteSmoothDirectionalSum16(dst + 48, top7, top8, weights_y, weights_y,
                                  scaled_bottom_left_y, scaled_bottom_left_y,
                                  round);
      dst += stride;
    }
  }
}

void Init8bpp() {
  Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth8);
  assert(dsp != nullptr);
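  // Hook the SSE4.1 smooth predictors into the dispatch table for each
  // transform size enabled in this build.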
#if DSP_ENABLED_8BPP_SSE4_1(TransformSize4x4_IntraPredictorSmooth)
  dsp->intra_predictors[kTransformSize4x4][kIntraPredictorSmooth] =
      Smooth4x4_SSE4_1;
#endif
#if DSP_ENABLED_8BPP_SSE4_1(TransformSize4x8_IntraPredictorSmooth)
  dsp->intra_predictors[kTransformSize4x8][kIntraPredictorSmooth] =
      Smooth4x8_SSE4_1;
#endif
#if DSP_ENABLED_8BPP_SSE4_1(TransformSize4x16_IntraPredictorSmooth)
  dsp->intra_predictors[kTransformSize4x16][kIntraPredictorSmooth] =
      Smooth4x16_SSE4_1;
#endif
#if DSP_ENABLED_8BPP_SSE4_1(TransformSize8x4_IntraPredictorSmooth)
  dsp->intra_predictors[kTransformSize8x4][kIntraPredictorSmooth] =
      Smooth8x4_SSE4_1;
#endif
#if DSP_ENABLED_8BPP_SSE4_1(TransformSize8x8_IntraPredictorSmooth)
  dsp->intra_predictors[kTransformSize8x8][kIntraPredictorSmooth] =
      Smooth8x8_SSE4_1;
#endif
#if DSP_ENABLED_8BPP_SSE4_1(TransformSize8x16_IntraPredictorSmooth)
  dsp->intra_predictors[kTransformSize8x16][kIntraPredictorSmooth] =
      Smooth8x16_SSE4_1;
#endif
#if DSP_ENABLED_8BPP_SSE4_1(TransformSize8x32_IntraPredictorSmooth)
  dsp->intra_predictors[kTransformSize8x32][kIntraPredictorSmooth] =
      Smooth8x32_SSE4_1;
#endif
#if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x4_IntraPredictorSmooth)
  dsp->intra_predictors[kTransformSize16x4][kIntraPredictorSmooth] =
      SmoothWxH<16, 4>;
#endif
#if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x8_IntraPredictorSmooth)
  dsp->intra_predictors[kTransformSize16x8][kIntraPredictorSmooth] =
      SmoothWxH<16, 8>;
#endif
#if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x16_IntraPredictorSmooth)
  dsp->intra_predictors[kTransformSize16x16][kIntraPredictorSmooth] =
      SmoothWxH<16, 16>;
#endif
#if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x32_IntraPredictorSmooth)
  dsp->intra_predictors[kTransformSize16x32][kIntraPredictorSmooth] =
      SmoothWxH<16, 32>;
#endif
#if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x64_IntraPredictorSmooth)
  dsp->intra_predictors[kTransformSize16x64][kIntraPredictorSmooth] =
      SmoothWxH<16, 64>;
#endif
#if DSP_ENABLED_8BPP_SSE4_1(TransformSize32x8_IntraPredictorSmooth)
  dsp->intra_predictors[kTransformSize32x8][kIntraPredictorSmooth] =
      SmoothWxH<32, 8>;
#endif
#if DSP_ENABLED_8BPP_SSE4_1(TransformSize32x16_IntraPredictorSmooth)
  dsp->intra_predictors[kTransformSize32x16][kIntraPredictorSmooth] =
      SmoothWxH<32, 16>;
#endif
#if DSP_ENABLED_8BPP_SSE4_1(TransformSize32x32_IntraPredictorSmooth)
  dsp->intra_predictors[kTransformSize32x32][kIntraPredictorSmooth] =
      SmoothWxH<32, 32>;
#endif
#if DSP_ENABLED_8BPP_SSE4_1(TransformSize32x64_IntraPredictorSmooth)
  dsp->intra_predictors[kTransformSize32x64][kIntraPredictorSmooth] =
      SmoothWxH<32, 64>;
#endif
#if DSP_ENABLED_8BPP_SSE4_1(TransformSize64x16_IntraPredictorSmooth)
  dsp->intra_predictors[kTransformSize64x16][kIntraPredictorSmooth] =
      SmoothWxH<64, 16>;
#endif
#if DSP_ENABLED_8BPP_SSE4_1(TransformSize64x32_IntraPredictorSmooth)
  dsp->intra_predictors[kTransformSize64x32][kIntraPredictorSmooth] =
      SmoothWxH<64, 32>;
#endif
#if DSP_ENABLED_8BPP_SSE4_1(TransformSize64x64_IntraPredictorSmooth)
  dsp->intra_predictors[kTransformSize64x64][kIntraPredictorSmooth] =
      SmoothWxH<64, 64>;
#endif
#if DSP_ENABLED_8BPP_SSE4_1(TransformSize4x4_IntraPredictorSmoothVertical)
  dsp->intra_predictors[kTransformSize4x4][kIntraPredictorSmoothVertical] =
      SmoothVertical4x4_SSE4_1;
#endif
#if DSP_ENABLED_8BPP_SSE4_1(TransformSize4x8_IntraPredictorSmoothVertical)
  dsp->intra_predictors[kTransformSize4x8][kIntraPredictorSmoothVertical] =
      SmoothVertical4x8_SSE4_1;
#endif
#if DSP_ENABLED_8BPP_SSE4_1(TransformSize4x16_IntraPredictorSmoothVertical)
  dsp->intra_predictors[kTransformSize4x16][kIntraPredictorSmoothVertical] =
      SmoothVertical4x16_SSE4_1;
#endif
#if DSP_ENABLED_8BPP_SSE4_1(TransformSize8x4_IntraPredictorSmoothVertical)
  dsp->intra_predictors[kTransformSize8x4][kIntraPredictorSmoothVertical] =
      SmoothVertical8x4_SSE4_1;
#endif
#if DSP_ENABLED_8BPP_SSE4_1(TransformSize8x8_IntraPredictorSmoothVertical)
  dsp->intra_predictors[kTransformSize8x8][kIntraPredictorSmoothVertical] =
      SmoothVertical8x8_SSE4_1;
#endif
#if DSP_ENABLED_8BPP_SSE4_1(TransformSize8x16_IntraPredictorSmoothVertical)
  dsp->intra_predictors[kTransformSize8x16][kIntraPredictorSmoothVertical] =
      SmoothVertical8x16_SSE4_1;
#endif
#if DSP_ENABLED_8BPP_SSE4_1(TransformSize8x32_IntraPredictorSmoothVertical)
  dsp->intra_predictors[kTransformSize8x32][kIntraPredictorSmoothVertical] =
      SmoothVertical8x32_SSE4_1;
#endif
#if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x4_IntraPredictorSmoothVertical)
  dsp->intra_predictors[kTransformSize16x4][kIntraPredictorSmoothVertical] =
      SmoothVertical16x4_SSE4_1;
#endif
#if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x8_IntraPredictorSmoothVertical)
  dsp->intra_predictors[kTransformSize16x8][kIntraPredictorSmoothVertical] =
      SmoothVertical16x8_SSE4_1;
#endif
#if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x16_IntraPredictorSmoothVertical)
  dsp->intra_predictors[kTransformSize16x16][kIntraPredictorSmoothVertical] =
      SmoothVertical16x16_SSE4_1;
#endif
#if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x32_IntraPredictorSmoothVertical)
  dsp->intra_predictors[kTransformSize16x32][kIntraPredictorSmoothVertical] =
      SmoothVertical16x32_SSE4_1;
#endif
#if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x64_IntraPredictorSmoothVertical)
  dsp->intra_predictors[kTransformSize16x64][kIntraPredictorSmoothVertical] =
      SmoothVertical16x64_SSE4_1;
#endif
#if DSP_ENABLED_8BPP_SSE4_1(TransformSize32x8_IntraPredictorSmoothVertical)
  dsp->intra_predictors[kTransformSize32x8][kIntraPredictorSmoothVertical] =
      SmoothVertical32x8_SSE4_1;
#endif
#if DSP_ENABLED_8BPP_SSE4_1(TransformSize32x16_IntraPredictorSmoothVertical)
  dsp->intra_predictors[kTransformSize32x16][kIntraPredictorSmoothVertical] =
      SmoothVertical32x16_SSE4_1;
#endif
#if DSP_ENABLED_8BPP_SSE4_1(TransformSize32x32_IntraPredictorSmoothVertical)
  dsp->intra_predictors[kTransformSize32x32][kIntraPredictorSmoothVertical] =
      SmoothVertical32x32_SSE4_1;
#endif
#if DSP_ENABLED_8BPP_SSE4_1(TransformSize32x64_IntraPredictorSmoothVertical)
  dsp->intra_predictors[kTransformSize32x64][kIntraPredictorSmoothVertical] =
      SmoothVertical32x64_SSE4_1;
#endif
#if DSP_ENABLED_8BPP_SSE4_1(TransformSize64x16_IntraPredictorSmoothVertical)
  dsp->intra_predictors[kTransformSize64x16][kIntraPredictorSmoothVertical] =
      SmoothVertical64x16_SSE4_1;
#endif
#if DSP_ENABLED_8BPP_SSE4_1(TransformSize64x32_IntraPredictorSmoothVertical)
  dsp->intra_predictors[kTransformSize64x32][kIntraPredictorSmoothVertical] =
      SmoothVertical64x32_SSE4_1;
#endif
#if DSP_ENABLED_8BPP_SSE4_1(TransformSize64x64_IntraPredictorSmoothVertical)
  dsp->intra_predictors[kTransformSize64x64][kIntraPredictorSmoothVertical] =
      SmoothVertical64x64_SSE4_1;
#endif
#if DSP_ENABLED_8BPP_SSE4_1(TransformSize4x4_IntraPredictorSmoothHorizontal)
  dsp->intra_predictors[kTransformSize4x4][kIntraPredictorSmoothHorizontal] =
      SmoothHorizontal4x4_SSE4_1;
#endif
#if DSP_ENABLED_8BPP_SSE4_1(TransformSize4x8_IntraPredictorSmoothHorizontal)
  dsp->intra_predictors[kTransformSize4x8][kIntraPredictorSmoothHorizontal] =
      SmoothHorizontal4x8_SSE4_1;
#endif
#if DSP_ENABLED_8BPP_SSE4_1(TransformSize4x16_IntraPredictorSmoothHorizontal)
  dsp->intra_predictors[kTransformSize4x16][kIntraPredictorSmoothHorizontal] =
      SmoothHorizontal4x16_SSE4_1;
#endif
#if DSP_ENABLED_8BPP_SSE4_1(TransformSize8x4_IntraPredictorSmoothHorizontal)
  dsp->intra_predictors[kTransformSize8x4][kIntraPredictorSmoothHorizontal] =
      SmoothHorizontal8x4_SSE4_1;
#endif
#if DSP_ENABLED_8BPP_SSE4_1(TransformSize8x8_IntraPredictorSmoothHorizontal)
  dsp->intra_predictors[kTransformSize8x8][kIntraPredictorSmoothHorizontal] =
      SmoothHorizontal8x8_SSE4_1;
#endif
#if DSP_ENABLED_8BPP_SSE4_1(TransformSize8x16_IntraPredictorSmoothHorizontal)
  dsp->intra_predictors[kTransformSize8x16][kIntraPredictorSmoothHorizontal] =
      SmoothHorizontal8x16_SSE4_1;
#endif
#if DSP_ENABLED_8BPP_SSE4_1(TransformSize8x32_IntraPredictorSmoothHorizontal)
  dsp->intra_predictors[kTransformSize8x32][kIntraPredictorSmoothHorizontal] =
      SmoothHorizontal8x32_SSE4_1;
#endif
#if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x4_IntraPredictorSmoothHorizontal)
  dsp->intra_predictors[kTransformSize16x4][kIntraPredictorSmoothHorizontal] =
      SmoothHorizontal16x4_SSE4_1;
#endif
#if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x8_IntraPredictorSmoothHorizontal)
  dsp->intra_predictors[kTransformSize16x8][kIntraPredictorSmoothHorizontal] =
      SmoothHorizontal16x8_SSE4_1;
#endif
#if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x16_IntraPredictorSmoothHorizontal)
  dsp->intra_predictors[kTransformSize16x16][kIntraPredictorSmoothHorizontal] =
      SmoothHorizontal16x16_SSE4_1;
#endif
#if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x32_IntraPredictorSmoothHorizontal)
  dsp->intra_predictors[kTransformSize16x32][kIntraPredictorSmoothHorizontal] =
      SmoothHorizontal16x32_SSE4_1;
#endif
#if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x64_IntraPredictorSmoothHorizontal)
  dsp->intra_predictors[kTransformSize16x64][kIntraPredictorSmoothHorizontal] =
      SmoothHorizontal16x64_SSE4_1;
#endif
#if DSP_ENABLED_8BPP_SSE4_1(TransformSize32x8_IntraPredictorSmoothHorizontal)
  dsp->intra_predictors[kTransformSize32x8][kIntraPredictorSmoothHorizontal] =
      SmoothHorizontal32x8_SSE4_1;
#endif
#if DSP_ENABLED_8BPP_SSE4_1(TransformSize32x16_IntraPredictorSmoothHorizontal)
  dsp->intra_predictors[kTransformSize32x16][kIntraPredictorSmoothHorizontal] =
      SmoothHorizontal32x16_SSE4_1;
#endif
#if DSP_ENABLED_8BPP_SSE4_1(TransformSize32x32_IntraPredictorSmoothHorizontal)
  dsp->intra_predictors[kTransformSize32x32][kIntraPredictorSmoothHorizontal] =
      SmoothHorizontal32x32_SSE4_1;
#endif
#if DSP_ENABLED_8BPP_SSE4_1(TransformSize32x64_IntraPredictorSmoothHorizontal)
  dsp->intra_predictors[kTransformSize32x64][kIntraPredictorSmoothHorizontal] =
      SmoothHorizontal32x64_SSE4_1;
#endif
#if DSP_ENABLED_8BPP_SSE4_1(TransformSize64x16_IntraPredictorSmoothHorizontal)
  dsp->intra_predictors[kTransformSize64x16][kIntraPredictorSmoothHorizontal] =
      SmoothHorizontal64x16_SSE4_1;
#endif
#if DSP_ENABLED_8BPP_SSE4_1(TransformSize64x32_IntraPredictorSmoothHorizontal)
  dsp->intra_predictors[kTransformSize64x32][kIntraPredictorSmoothHorizontal] =
      SmoothHorizontal64x32_SSE4_1;
#endif
#if DSP_ENABLED_8BPP_SSE4_1(TransformSize64x64_IntraPredictorSmoothHorizontal)
  dsp->intra_predictors[kTransformSize64x64][kIntraPredictorSmoothHorizontal] =
      SmoothHorizontal64x64_SSE4_1;
#endif
}

}  // namespace
}  // namespace low_bitdepth

void IntraPredSmoothInit_SSE4_1() { low_bitdepth::Init8bpp(); }

}  // namespace dsp
}  // namespace libgav1

#else  // !LIBGAV1_TARGETING_SSE4_1

namespace libgav1 {
namespace dsp {

void IntraPredSmoothInit_SSE4_1() {}

}  // namespace dsp
}  // namespace libgav1

#endif  // LIBGAV1_TARGETING_SSE4_1