1 /*
2  * Copyright (c) 2017, Alliance for Open Media. All rights reserved.
3  *
4  * This source code is subject to the terms of the BSD 2 Clause License and
5  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
6  * was not distributed with this source code in the LICENSE file, you can
7  * obtain it at www.aomedia.org/license/software. If the Alliance for Open
8  * Media Patent License 1.0 was not distributed with this source code in the
9  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
10  */
11 
12 #include <tmmintrin.h>
13 
14 #include "config/aom_dsp_rtcd.h"
15 
16 #include "aom_dsp/intrapred_common.h"
17 
18 // -----------------------------------------------------------------------------
19 // PAETH_PRED
20 
21 // Return 8 16-bit pixels in one row
22 static inline __m128i paeth_8x1_pred(const __m128i *left, const __m128i *top,
23                                      const __m128i *topleft) {
24   const __m128i base = _mm_sub_epi16(_mm_add_epi16(*top, *left), *topleft);
25 
26   __m128i pl = _mm_abs_epi16(_mm_sub_epi16(base, *left));
27   __m128i pt = _mm_abs_epi16(_mm_sub_epi16(base, *top));
28   __m128i ptl = _mm_abs_epi16(_mm_sub_epi16(base, *topleft));
29 
30   __m128i mask1 = _mm_cmpgt_epi16(pl, pt);
31   mask1 = _mm_or_si128(mask1, _mm_cmpgt_epi16(pl, ptl));
32   __m128i mask2 = _mm_cmpgt_epi16(pt, ptl);
33 
34   pl = _mm_andnot_si128(mask1, *left);
35 
36   ptl = _mm_and_si128(mask2, *topleft);
37   pt = _mm_andnot_si128(mask2, *top);
38   pt = _mm_or_si128(pt, ptl);
39   pt = _mm_and_si128(mask1, pt);
40 
41   return _mm_or_si128(pl, pt);
42 }
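// For reference, each 16-bit lane above follows the usual Paeth rule (a
// scalar sketch of the selection logic only, not code used by the build):
//   base = left + top - top_left;
//   pl = abs(base - left); pt = abs(base - top); ptl = abs(base - top_left);
//   pred = (pl <= pt && pl <= ptl) ? left : (pt <= ptl ? top : top_left);
// mask1 clears the left candidate when it is not the closest to base, and
// mask2 picks between top and top_left.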
43 
44 void aom_paeth_predictor_4x4_ssse3(uint8_t *dst, ptrdiff_t stride,
45                                    const uint8_t *above, const uint8_t *left) {
46   __m128i l = _mm_loadl_epi64((const __m128i *)left);
47   const __m128i t = _mm_loadl_epi64((const __m128i *)above);
48   const __m128i zero = _mm_setzero_si128();
49   const __m128i t16 = _mm_unpacklo_epi8(t, zero);
50   const __m128i tl16 = _mm_set1_epi16((int16_t)above[-1]);
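  // rep is a pshufb control that broadcasts left[row] into every 16-bit lane:
  // the 0x80 byte writes zero, the low byte selects left[row], and adding one
  // per row advances to the next left pixel.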
51   __m128i rep = _mm_set1_epi16((short)0x8000);
52   const __m128i one = _mm_set1_epi16(1);
53 
54   int i;
55   for (i = 0; i < 4; ++i) {
56     const __m128i l16 = _mm_shuffle_epi8(l, rep);
57     const __m128i row = paeth_8x1_pred(&l16, &t16, &tl16);
58 
59     *(int *)dst = _mm_cvtsi128_si32(_mm_packus_epi16(row, row));
60     dst += stride;
61     rep = _mm_add_epi16(rep, one);
62   }
63 }
64 
65 void aom_paeth_predictor_4x8_ssse3(uint8_t *dst, ptrdiff_t stride,
66                                    const uint8_t *above, const uint8_t *left) {
67   __m128i l = _mm_loadl_epi64((const __m128i *)left);
68   const __m128i t = _mm_loadl_epi64((const __m128i *)above);
69   const __m128i zero = _mm_setzero_si128();
70   const __m128i t16 = _mm_unpacklo_epi8(t, zero);
71   const __m128i tl16 = _mm_set1_epi16((int16_t)above[-1]);
72   __m128i rep = _mm_set1_epi16((short)0x8000);
73   const __m128i one = _mm_set1_epi16(1);
74 
75   int i;
76   for (i = 0; i < 8; ++i) {
77     const __m128i l16 = _mm_shuffle_epi8(l, rep);
78     const __m128i row = paeth_8x1_pred(&l16, &t16, &tl16);
79 
80     *(int *)dst = _mm_cvtsi128_si32(_mm_packus_epi16(row, row));
81     dst += stride;
82     rep = _mm_add_epi16(rep, one);
83   }
84 }
85 
86 #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
87 void aom_paeth_predictor_4x16_ssse3(uint8_t *dst, ptrdiff_t stride,
88                                     const uint8_t *above, const uint8_t *left) {
89   __m128i l = _mm_load_si128((const __m128i *)left);
90   const __m128i t = _mm_cvtsi32_si128(((const int *)above)[0]);
91   const __m128i zero = _mm_setzero_si128();
92   const __m128i t16 = _mm_unpacklo_epi8(t, zero);
93   const __m128i tl16 = _mm_set1_epi16((int16_t)above[-1]);
94   __m128i rep = _mm_set1_epi16((short)0x8000);
95   const __m128i one = _mm_set1_epi16(1);
96 
97   for (int i = 0; i < 16; ++i) {
98     const __m128i l16 = _mm_shuffle_epi8(l, rep);
99     const __m128i row = paeth_8x1_pred(&l16, &t16, &tl16);
100 
101     *(int *)dst = _mm_cvtsi128_si32(_mm_packus_epi16(row, row));
102     dst += stride;
103     rep = _mm_add_epi16(rep, one);
104   }
105 }
106 #endif  // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
107 
108 void aom_paeth_predictor_8x4_ssse3(uint8_t *dst, ptrdiff_t stride,
109                                    const uint8_t *above, const uint8_t *left) {
110   __m128i l = _mm_loadl_epi64((const __m128i *)left);
111   const __m128i t = _mm_loadl_epi64((const __m128i *)above);
112   const __m128i zero = _mm_setzero_si128();
113   const __m128i t16 = _mm_unpacklo_epi8(t, zero);
114   const __m128i tl16 = _mm_set1_epi16((int16_t)above[-1]);
115   __m128i rep = _mm_set1_epi16((short)0x8000);
116   const __m128i one = _mm_set1_epi16(1);
117 
118   int i;
119   for (i = 0; i < 4; ++i) {
120     const __m128i l16 = _mm_shuffle_epi8(l, rep);
121     const __m128i row = paeth_8x1_pred(&l16, &t16, &tl16);
122 
123     _mm_storel_epi64((__m128i *)dst, _mm_packus_epi16(row, row));
124     dst += stride;
125     rep = _mm_add_epi16(rep, one);
126   }
127 }
128 
129 void aom_paeth_predictor_8x8_ssse3(uint8_t *dst, ptrdiff_t stride,
130                                    const uint8_t *above, const uint8_t *left) {
131   __m128i l = _mm_loadl_epi64((const __m128i *)left);
132   const __m128i t = _mm_loadl_epi64((const __m128i *)above);
133   const __m128i zero = _mm_setzero_si128();
134   const __m128i t16 = _mm_unpacklo_epi8(t, zero);
135   const __m128i tl16 = _mm_set1_epi16((int16_t)above[-1]);
136   __m128i rep = _mm_set1_epi16((short)0x8000);
137   const __m128i one = _mm_set1_epi16(1);
138 
139   int i;
140   for (i = 0; i < 8; ++i) {
141     const __m128i l16 = _mm_shuffle_epi8(l, rep);
142     const __m128i row = paeth_8x1_pred(&l16, &t16, &tl16);
143 
144     _mm_storel_epi64((__m128i *)dst, _mm_packus_epi16(row, row));
145     dst += stride;
146     rep = _mm_add_epi16(rep, one);
147   }
148 }
149 
150 void aom_paeth_predictor_8x16_ssse3(uint8_t *dst, ptrdiff_t stride,
151                                     const uint8_t *above, const uint8_t *left) {
152   __m128i l = _mm_load_si128((const __m128i *)left);
153   const __m128i t = _mm_loadl_epi64((const __m128i *)above);
154   const __m128i zero = _mm_setzero_si128();
155   const __m128i t16 = _mm_unpacklo_epi8(t, zero);
156   const __m128i tl16 = _mm_set1_epi16((int16_t)above[-1]);
157   __m128i rep = _mm_set1_epi16((short)0x8000);
158   const __m128i one = _mm_set1_epi16(1);
159 
160   int i;
161   for (i = 0; i < 16; ++i) {
162     const __m128i l16 = _mm_shuffle_epi8(l, rep);
163     const __m128i row = paeth_8x1_pred(&l16, &t16, &tl16);
164 
165     _mm_storel_epi64((__m128i *)dst, _mm_packus_epi16(row, row));
166     dst += stride;
167     rep = _mm_add_epi16(rep, one);
168   }
169 }
170 
171 #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
172 void aom_paeth_predictor_8x32_ssse3(uint8_t *dst, ptrdiff_t stride,
173                                     const uint8_t *above, const uint8_t *left) {
174   const __m128i t = _mm_loadl_epi64((const __m128i *)above);
175   const __m128i zero = _mm_setzero_si128();
176   const __m128i t16 = _mm_unpacklo_epi8(t, zero);
177   const __m128i tl16 = _mm_set1_epi16((int16_t)above[-1]);
178   const __m128i one = _mm_set1_epi16(1);
179 
180   for (int j = 0; j < 2; ++j) {
181     const __m128i l = _mm_load_si128((const __m128i *)(left + j * 16));
182     __m128i rep = _mm_set1_epi16((short)0x8000);
183     for (int i = 0; i < 16; ++i) {
184       const __m128i l16 = _mm_shuffle_epi8(l, rep);
185       const __m128i row = paeth_8x1_pred(&l16, &t16, &tl16);
186 
187       _mm_storel_epi64((__m128i *)dst, _mm_packus_epi16(row, row));
188       dst += stride;
189       rep = _mm_add_epi16(rep, one);
190     }
191   }
192 }
193 #endif  // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
194 
195 // Return 16 8-bit pixels in one row
196 static inline __m128i paeth_16x1_pred(const __m128i *left, const __m128i *top0,
197                                       const __m128i *top1,
198                                       const __m128i *topleft) {
199   const __m128i p0 = paeth_8x1_pred(left, top0, topleft);
200   const __m128i p1 = paeth_8x1_pred(left, top1, topleft);
201   return _mm_packus_epi16(p0, p1);
202 }
203 
204 #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
205 void aom_paeth_predictor_16x4_ssse3(uint8_t *dst, ptrdiff_t stride,
206                                     const uint8_t *above, const uint8_t *left) {
207   __m128i l = _mm_cvtsi32_si128(((const int *)left)[0]);
208   const __m128i t = _mm_load_si128((const __m128i *)above);
209   const __m128i zero = _mm_setzero_si128();
210   const __m128i top0 = _mm_unpacklo_epi8(t, zero);
211   const __m128i top1 = _mm_unpackhi_epi8(t, zero);
212   const __m128i tl16 = _mm_set1_epi16((int16_t)above[-1]);
213   __m128i rep = _mm_set1_epi16((short)0x8000);
214   const __m128i one = _mm_set1_epi16(1);
215 
216   for (int i = 0; i < 4; ++i) {
217     const __m128i l16 = _mm_shuffle_epi8(l, rep);
218     const __m128i row = paeth_16x1_pred(&l16, &top0, &top1, &tl16);
219 
220     _mm_store_si128((__m128i *)dst, row);
221     dst += stride;
222     rep = _mm_add_epi16(rep, one);
223   }
224 }
225 #endif  // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
226 
227 void aom_paeth_predictor_16x8_ssse3(uint8_t *dst, ptrdiff_t stride,
228                                     const uint8_t *above, const uint8_t *left) {
229   __m128i l = _mm_loadl_epi64((const __m128i *)left);
230   const __m128i t = _mm_load_si128((const __m128i *)above);
231   const __m128i zero = _mm_setzero_si128();
232   const __m128i top0 = _mm_unpacklo_epi8(t, zero);
233   const __m128i top1 = _mm_unpackhi_epi8(t, zero);
234   const __m128i tl16 = _mm_set1_epi16((int16_t)above[-1]);
235   __m128i rep = _mm_set1_epi16((short)0x8000);
236   const __m128i one = _mm_set1_epi16(1);
237 
238   int i;
239   for (i = 0; i < 8; ++i) {
240     const __m128i l16 = _mm_shuffle_epi8(l, rep);
241     const __m128i row = paeth_16x1_pred(&l16, &top0, &top1, &tl16);
242 
243     _mm_store_si128((__m128i *)dst, row);
244     dst += stride;
245     rep = _mm_add_epi16(rep, one);
246   }
247 }
248 
249 void aom_paeth_predictor_16x16_ssse3(uint8_t *dst, ptrdiff_t stride,
250                                      const uint8_t *above,
251                                      const uint8_t *left) {
252   __m128i l = _mm_load_si128((const __m128i *)left);
253   const __m128i t = _mm_load_si128((const __m128i *)above);
254   const __m128i zero = _mm_setzero_si128();
255   const __m128i top0 = _mm_unpacklo_epi8(t, zero);
256   const __m128i top1 = _mm_unpackhi_epi8(t, zero);
257   const __m128i tl16 = _mm_set1_epi16((int16_t)above[-1]);
258   __m128i rep = _mm_set1_epi16((short)0x8000);
259   const __m128i one = _mm_set1_epi16(1);
260 
261   int i;
262   for (i = 0; i < 16; ++i) {
263     const __m128i l16 = _mm_shuffle_epi8(l, rep);
264     const __m128i row = paeth_16x1_pred(&l16, &top0, &top1, &tl16);
265 
266     _mm_store_si128((__m128i *)dst, row);
267     dst += stride;
268     rep = _mm_add_epi16(rep, one);
269   }
270 }
271 
272 void aom_paeth_predictor_16x32_ssse3(uint8_t *dst, ptrdiff_t stride,
273                                      const uint8_t *above,
274                                      const uint8_t *left) {
275   __m128i l = _mm_load_si128((const __m128i *)left);
276   const __m128i t = _mm_load_si128((const __m128i *)above);
277   const __m128i zero = _mm_setzero_si128();
278   const __m128i top0 = _mm_unpacklo_epi8(t, zero);
279   const __m128i top1 = _mm_unpackhi_epi8(t, zero);
280   const __m128i tl16 = _mm_set1_epi16((int16_t)above[-1]);
281   __m128i rep = _mm_set1_epi16((short)0x8000);
282   const __m128i one = _mm_set1_epi16(1);
283   __m128i l16;
284 
285   int i;
286   for (i = 0; i < 16; ++i) {
287     l16 = _mm_shuffle_epi8(l, rep);
288     const __m128i row = paeth_16x1_pred(&l16, &top0, &top1, &tl16);
289 
290     _mm_store_si128((__m128i *)dst, row);
291     dst += stride;
292     rep = _mm_add_epi16(rep, one);
293   }
294 
295   l = _mm_load_si128((const __m128i *)(left + 16));
296   rep = _mm_set1_epi16((short)0x8000);
297   for (i = 0; i < 16; ++i) {
298     l16 = _mm_shuffle_epi8(l, rep);
299     const __m128i row = paeth_16x1_pred(&l16, &top0, &top1, &tl16);
300 
301     _mm_store_si128((__m128i *)dst, row);
302     dst += stride;
303     rep = _mm_add_epi16(rep, one);
304   }
305 }
306 
307 #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
308 void aom_paeth_predictor_16x64_ssse3(uint8_t *dst, ptrdiff_t stride,
309                                      const uint8_t *above,
310                                      const uint8_t *left) {
311   const __m128i t = _mm_load_si128((const __m128i *)above);
312   const __m128i zero = _mm_setzero_si128();
313   const __m128i top0 = _mm_unpacklo_epi8(t, zero);
314   const __m128i top1 = _mm_unpackhi_epi8(t, zero);
315   const __m128i tl16 = _mm_set1_epi16((int16_t)above[-1]);
316   const __m128i one = _mm_set1_epi16(1);
317 
318   for (int j = 0; j < 4; ++j) {
319     const __m128i l = _mm_load_si128((const __m128i *)(left + j * 16));
320     __m128i rep = _mm_set1_epi16((short)0x8000);
321     for (int i = 0; i < 16; ++i) {
322       const __m128i l16 = _mm_shuffle_epi8(l, rep);
323       const __m128i row = paeth_16x1_pred(&l16, &top0, &top1, &tl16);
324       _mm_store_si128((__m128i *)dst, row);
325       dst += stride;
326       rep = _mm_add_epi16(rep, one);
327     }
328   }
329 }
330 
331 void aom_paeth_predictor_32x8_ssse3(uint8_t *dst, ptrdiff_t stride,
332                                     const uint8_t *above, const uint8_t *left) {
333   const __m128i a = _mm_load_si128((const __m128i *)above);
334   const __m128i b = _mm_load_si128((const __m128i *)(above + 16));
335   const __m128i zero = _mm_setzero_si128();
336   const __m128i al = _mm_unpacklo_epi8(a, zero);
337   const __m128i ah = _mm_unpackhi_epi8(a, zero);
338   const __m128i bl = _mm_unpacklo_epi8(b, zero);
339   const __m128i bh = _mm_unpackhi_epi8(b, zero);
340 
341   const __m128i tl16 = _mm_set1_epi16((int16_t)above[-1]);
342   __m128i rep = _mm_set1_epi16((short)0x8000);
343   const __m128i one = _mm_set1_epi16(1);
344   const __m128i l = _mm_loadl_epi64((const __m128i *)left);
345   __m128i l16;
346 
347   for (int i = 0; i < 8; ++i) {
348     l16 = _mm_shuffle_epi8(l, rep);
349     const __m128i r32l = paeth_16x1_pred(&l16, &al, &ah, &tl16);
350     const __m128i r32h = paeth_16x1_pred(&l16, &bl, &bh, &tl16);
351 
352     _mm_store_si128((__m128i *)dst, r32l);
353     _mm_store_si128((__m128i *)(dst + 16), r32h);
354     dst += stride;
355     rep = _mm_add_epi16(rep, one);
356   }
357 }
358 #endif  // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
359 
360 void aom_paeth_predictor_32x16_ssse3(uint8_t *dst, ptrdiff_t stride,
361                                      const uint8_t *above,
362                                      const uint8_t *left) {
363   const __m128i a = _mm_load_si128((const __m128i *)above);
364   const __m128i b = _mm_load_si128((const __m128i *)(above + 16));
365   const __m128i zero = _mm_setzero_si128();
366   const __m128i al = _mm_unpacklo_epi8(a, zero);
367   const __m128i ah = _mm_unpackhi_epi8(a, zero);
368   const __m128i bl = _mm_unpacklo_epi8(b, zero);
369   const __m128i bh = _mm_unpackhi_epi8(b, zero);
370 
371   const __m128i tl16 = _mm_set1_epi16((int16_t)above[-1]);
372   __m128i rep = _mm_set1_epi16((short)0x8000);
373   const __m128i one = _mm_set1_epi16(1);
374   __m128i l = _mm_load_si128((const __m128i *)left);
375   __m128i l16;
376 
377   int i;
378   for (i = 0; i < 16; ++i) {
379     l16 = _mm_shuffle_epi8(l, rep);
380     const __m128i r32l = paeth_16x1_pred(&l16, &al, &ah, &tl16);
381     const __m128i r32h = paeth_16x1_pred(&l16, &bl, &bh, &tl16);
382 
383     _mm_store_si128((__m128i *)dst, r32l);
384     _mm_store_si128((__m128i *)(dst + 16), r32h);
385     dst += stride;
386     rep = _mm_add_epi16(rep, one);
387   }
388 }
389 
390 void aom_paeth_predictor_32x32_ssse3(uint8_t *dst, ptrdiff_t stride,
391                                      const uint8_t *above,
392                                      const uint8_t *left) {
393   const __m128i a = _mm_load_si128((const __m128i *)above);
394   const __m128i b = _mm_load_si128((const __m128i *)(above + 16));
395   const __m128i zero = _mm_setzero_si128();
396   const __m128i al = _mm_unpacklo_epi8(a, zero);
397   const __m128i ah = _mm_unpackhi_epi8(a, zero);
398   const __m128i bl = _mm_unpacklo_epi8(b, zero);
399   const __m128i bh = _mm_unpackhi_epi8(b, zero);
400 
401   const __m128i tl16 = _mm_set1_epi16((int16_t)above[-1]);
402   __m128i rep = _mm_set1_epi16((short)0x8000);
403   const __m128i one = _mm_set1_epi16(1);
404   __m128i l = _mm_load_si128((const __m128i *)left);
405   __m128i l16;
406 
407   int i;
408   for (i = 0; i < 16; ++i) {
409     l16 = _mm_shuffle_epi8(l, rep);
410     const __m128i r32l = paeth_16x1_pred(&l16, &al, &ah, &tl16);
411     const __m128i r32h = paeth_16x1_pred(&l16, &bl, &bh, &tl16);
412 
413     _mm_store_si128((__m128i *)dst, r32l);
414     _mm_store_si128((__m128i *)(dst + 16), r32h);
415     dst += stride;
416     rep = _mm_add_epi16(rep, one);
417   }
418 
419   rep = _mm_set1_epi16((short)0x8000);
420   l = _mm_load_si128((const __m128i *)(left + 16));
421   for (i = 0; i < 16; ++i) {
422     l16 = _mm_shuffle_epi8(l, rep);
423     const __m128i r32l = paeth_16x1_pred(&l16, &al, &ah, &tl16);
424     const __m128i r32h = paeth_16x1_pred(&l16, &bl, &bh, &tl16);
425 
426     _mm_store_si128((__m128i *)dst, r32l);
427     _mm_store_si128((__m128i *)(dst + 16), r32h);
428     dst += stride;
429     rep = _mm_add_epi16(rep, one);
430   }
431 }
432 
433 void aom_paeth_predictor_32x64_ssse3(uint8_t *dst, ptrdiff_t stride,
434                                      const uint8_t *above,
435                                      const uint8_t *left) {
436   const __m128i a = _mm_load_si128((const __m128i *)above);
437   const __m128i b = _mm_load_si128((const __m128i *)(above + 16));
438   const __m128i zero = _mm_setzero_si128();
439   const __m128i al = _mm_unpacklo_epi8(a, zero);
440   const __m128i ah = _mm_unpackhi_epi8(a, zero);
441   const __m128i bl = _mm_unpacklo_epi8(b, zero);
442   const __m128i bh = _mm_unpackhi_epi8(b, zero);
443 
444   const __m128i tl16 = _mm_set1_epi16((int16_t)above[-1]);
445   const __m128i one = _mm_set1_epi16(1);
446   __m128i l16;
447 
448   int i, j;
449   for (j = 0; j < 4; ++j) {
450     const __m128i l = _mm_load_si128((const __m128i *)(left + j * 16));
451     __m128i rep = _mm_set1_epi16((short)0x8000);
452     for (i = 0; i < 16; ++i) {
453       l16 = _mm_shuffle_epi8(l, rep);
454       const __m128i r32l = paeth_16x1_pred(&l16, &al, &ah, &tl16);
455       const __m128i r32h = paeth_16x1_pred(&l16, &bl, &bh, &tl16);
456 
457       _mm_store_si128((__m128i *)dst, r32l);
458       _mm_store_si128((__m128i *)(dst + 16), r32h);
459       dst += stride;
460       rep = _mm_add_epi16(rep, one);
461     }
462   }
463 }
464 
465 void aom_paeth_predictor_64x32_ssse3(uint8_t *dst, ptrdiff_t stride,
466                                      const uint8_t *above,
467                                      const uint8_t *left) {
468   const __m128i a = _mm_load_si128((const __m128i *)above);
469   const __m128i b = _mm_load_si128((const __m128i *)(above + 16));
470   const __m128i c = _mm_load_si128((const __m128i *)(above + 32));
471   const __m128i d = _mm_load_si128((const __m128i *)(above + 48));
472   const __m128i zero = _mm_setzero_si128();
473   const __m128i al = _mm_unpacklo_epi8(a, zero);
474   const __m128i ah = _mm_unpackhi_epi8(a, zero);
475   const __m128i bl = _mm_unpacklo_epi8(b, zero);
476   const __m128i bh = _mm_unpackhi_epi8(b, zero);
477   const __m128i cl = _mm_unpacklo_epi8(c, zero);
478   const __m128i ch = _mm_unpackhi_epi8(c, zero);
479   const __m128i dl = _mm_unpacklo_epi8(d, zero);
480   const __m128i dh = _mm_unpackhi_epi8(d, zero);
481 
482   const __m128i tl16 = _mm_set1_epi16((int16_t)above[-1]);
483   const __m128i one = _mm_set1_epi16(1);
484   __m128i l16;
485 
486   int i, j;
487   for (j = 0; j < 2; ++j) {
488     const __m128i l = _mm_load_si128((const __m128i *)(left + j * 16));
489     __m128i rep = _mm_set1_epi16((short)0x8000);
490     for (i = 0; i < 16; ++i) {
491       l16 = _mm_shuffle_epi8(l, rep);
492       const __m128i r0 = paeth_16x1_pred(&l16, &al, &ah, &tl16);
493       const __m128i r1 = paeth_16x1_pred(&l16, &bl, &bh, &tl16);
494       const __m128i r2 = paeth_16x1_pred(&l16, &cl, &ch, &tl16);
495       const __m128i r3 = paeth_16x1_pred(&l16, &dl, &dh, &tl16);
496 
497       _mm_store_si128((__m128i *)dst, r0);
498       _mm_store_si128((__m128i *)(dst + 16), r1);
499       _mm_store_si128((__m128i *)(dst + 32), r2);
500       _mm_store_si128((__m128i *)(dst + 48), r3);
501       dst += stride;
502       rep = _mm_add_epi16(rep, one);
503     }
504   }
505 }
506 
507 void aom_paeth_predictor_64x64_ssse3(uint8_t *dst, ptrdiff_t stride,
508                                      const uint8_t *above,
509                                      const uint8_t *left) {
510   const __m128i a = _mm_load_si128((const __m128i *)above);
511   const __m128i b = _mm_load_si128((const __m128i *)(above + 16));
512   const __m128i c = _mm_load_si128((const __m128i *)(above + 32));
513   const __m128i d = _mm_load_si128((const __m128i *)(above + 48));
514   const __m128i zero = _mm_setzero_si128();
515   const __m128i al = _mm_unpacklo_epi8(a, zero);
516   const __m128i ah = _mm_unpackhi_epi8(a, zero);
517   const __m128i bl = _mm_unpacklo_epi8(b, zero);
518   const __m128i bh = _mm_unpackhi_epi8(b, zero);
519   const __m128i cl = _mm_unpacklo_epi8(c, zero);
520   const __m128i ch = _mm_unpackhi_epi8(c, zero);
521   const __m128i dl = _mm_unpacklo_epi8(d, zero);
522   const __m128i dh = _mm_unpackhi_epi8(d, zero);
523 
524   const __m128i tl16 = _mm_set1_epi16((int16_t)above[-1]);
525   const __m128i one = _mm_set1_epi16(1);
526   __m128i l16;
527 
528   int i, j;
529   for (j = 0; j < 4; ++j) {
530     const __m128i l = _mm_load_si128((const __m128i *)(left + j * 16));
531     __m128i rep = _mm_set1_epi16((short)0x8000);
532     for (i = 0; i < 16; ++i) {
533       l16 = _mm_shuffle_epi8(l, rep);
534       const __m128i r0 = paeth_16x1_pred(&l16, &al, &ah, &tl16);
535       const __m128i r1 = paeth_16x1_pred(&l16, &bl, &bh, &tl16);
536       const __m128i r2 = paeth_16x1_pred(&l16, &cl, &ch, &tl16);
537       const __m128i r3 = paeth_16x1_pred(&l16, &dl, &dh, &tl16);
538 
539       _mm_store_si128((__m128i *)dst, r0);
540       _mm_store_si128((__m128i *)(dst + 16), r1);
541       _mm_store_si128((__m128i *)(dst + 32), r2);
542       _mm_store_si128((__m128i *)(dst + 48), r3);
543       dst += stride;
544       rep = _mm_add_epi16(rep, one);
545     }
546   }
547 }
548 
549 #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
550 void aom_paeth_predictor_64x16_ssse3(uint8_t *dst, ptrdiff_t stride,
551                                      const uint8_t *above,
552                                      const uint8_t *left) {
553   const __m128i a = _mm_load_si128((const __m128i *)above);
554   const __m128i b = _mm_load_si128((const __m128i *)(above + 16));
555   const __m128i c = _mm_load_si128((const __m128i *)(above + 32));
556   const __m128i d = _mm_load_si128((const __m128i *)(above + 48));
557   const __m128i zero = _mm_setzero_si128();
558   const __m128i al = _mm_unpacklo_epi8(a, zero);
559   const __m128i ah = _mm_unpackhi_epi8(a, zero);
560   const __m128i bl = _mm_unpacklo_epi8(b, zero);
561   const __m128i bh = _mm_unpackhi_epi8(b, zero);
562   const __m128i cl = _mm_unpacklo_epi8(c, zero);
563   const __m128i ch = _mm_unpackhi_epi8(c, zero);
564   const __m128i dl = _mm_unpacklo_epi8(d, zero);
565   const __m128i dh = _mm_unpackhi_epi8(d, zero);
566 
567   const __m128i tl16 = _mm_set1_epi16((int16_t)above[-1]);
568   const __m128i one = _mm_set1_epi16(1);
569   __m128i l16;
570 
571   int i;
572   const __m128i l = _mm_load_si128((const __m128i *)left);
573   __m128i rep = _mm_set1_epi16((short)0x8000);
574   for (i = 0; i < 16; ++i) {
575     l16 = _mm_shuffle_epi8(l, rep);
576     const __m128i r0 = paeth_16x1_pred(&l16, &al, &ah, &tl16);
577     const __m128i r1 = paeth_16x1_pred(&l16, &bl, &bh, &tl16);
578     const __m128i r2 = paeth_16x1_pred(&l16, &cl, &ch, &tl16);
579     const __m128i r3 = paeth_16x1_pred(&l16, &dl, &dh, &tl16);
580 
581     _mm_store_si128((__m128i *)dst, r0);
582     _mm_store_si128((__m128i *)(dst + 16), r1);
583     _mm_store_si128((__m128i *)(dst + 32), r2);
584     _mm_store_si128((__m128i *)(dst + 48), r3);
585     dst += stride;
586     rep = _mm_add_epi16(rep, one);
587   }
588 }
589 #endif  // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
590 
591 // -----------------------------------------------------------------------------
592 // SMOOTH_PRED
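// Each predictor below computes, per pixel (a restatement of what the vector
// code implements; the scale is (1 << SMOOTH_WEIGHT_LOG2_SCALE) = 256):
//   pred(x, y) = (w_h[y] * top[x] + (256 - w_h[y]) * bottom_left +
//                 w_w[x] * left[y] + (256 - w_w[x]) * top_right + 256) >> 9
// with w_h and w_w taken from smooth_weights[] for the block height and width.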
593 
594 // pixels[0]: above and below_pred interleave vector
595 // pixels[1]: left vector
596 // pixels[2]: right_pred vector
597 static inline void load_pixel_w4(const uint8_t *above, const uint8_t *left,
598                                  int height, __m128i *pixels) {
599   __m128i d = _mm_cvtsi32_si128(((const int *)above)[0]);
600   if (height == 4)
601     pixels[1] = _mm_cvtsi32_si128(((const int *)left)[0]);
602   else if (height == 8)
603     pixels[1] = _mm_loadl_epi64(((const __m128i *)left));
604   else
605     pixels[1] = _mm_loadu_si128(((const __m128i *)left));
606 
607   pixels[2] = _mm_set1_epi16((int16_t)above[3]);
608 
609   const __m128i bp = _mm_set1_epi16((int16_t)left[height - 1]);
610   const __m128i zero = _mm_setzero_si128();
611   d = _mm_unpacklo_epi8(d, zero);
612   pixels[0] = _mm_unpacklo_epi16(d, bp);
613 }
614 
615 // weight_h[0]: weight_h vector
616 // weight_h[1]: scale - weight_h vector
617 // weight_h[2]: same as [0], second half for height = 16 only
618 // weight_h[3]: same as [1], second half for height = 16 only
619 // weight_w[0]: weights_w and scale - weights_w interleave vector
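// Interleaving (w, 256 - w) pairs lets a single _mm_madd_epi16 per row form
// w_w[x] * left[y] + (256 - w_w[x]) * top_right in one instruction.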
620 static inline void load_weight_w4(int height, __m128i *weight_h,
621                                   __m128i *weight_w) {
622   const __m128i zero = _mm_setzero_si128();
623   const __m128i d = _mm_set1_epi16((int16_t)(1 << SMOOTH_WEIGHT_LOG2_SCALE));
624   const __m128i t = _mm_cvtsi32_si128(((const int *)smooth_weights)[0]);
625   weight_h[0] = _mm_unpacklo_epi8(t, zero);
626   weight_h[1] = _mm_sub_epi16(d, weight_h[0]);
627   weight_w[0] = _mm_unpacklo_epi16(weight_h[0], weight_h[1]);
628 
629   if (height == 8) {
630     const __m128i weight = _mm_loadl_epi64((const __m128i *)&smooth_weights[4]);
631     weight_h[0] = _mm_unpacklo_epi8(weight, zero);
632     weight_h[1] = _mm_sub_epi16(d, weight_h[0]);
633   } else if (height == 16) {
634     const __m128i weight =
635         _mm_loadu_si128((const __m128i *)&smooth_weights[12]);
636     weight_h[0] = _mm_unpacklo_epi8(weight, zero);
637     weight_h[1] = _mm_sub_epi16(d, weight_h[0]);
638     weight_h[2] = _mm_unpackhi_epi8(weight, zero);
639     weight_h[3] = _mm_sub_epi16(d, weight_h[2]);
640   }
641 }
642 
643 static inline void smooth_pred_4xh(const __m128i *pixel, const __m128i *wh,
644                                    const __m128i *ww, int h, uint8_t *dst,
645                                    ptrdiff_t stride, int second_half) {
646   const __m128i round = _mm_set1_epi32((1 << SMOOTH_WEIGHT_LOG2_SCALE));
647   const __m128i one = _mm_set1_epi16(1);
648   const __m128i inc = _mm_set1_epi16(0x202);
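  // gat gathers the low byte of each 32-bit sum (bytes 0, 4, 8 and 12) so the
  // four per-pixel results can be written as packed 8-bit pixels.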
649   const __m128i gat = _mm_set1_epi32(0xc080400);
650   __m128i rep = second_half ? _mm_set1_epi16((short)0x8008)
651                             : _mm_set1_epi16((short)0x8000);
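  // d is a pshufb control that broadcasts the 16-bit height weight for the
  // current row; adding 0x0202 each row steps to the next weight.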
652   __m128i d = _mm_set1_epi16(0x100);
653 
654   for (int i = 0; i < h; ++i) {
655     const __m128i wg_wg = _mm_shuffle_epi8(wh[0], d);
656     const __m128i sc_sc = _mm_shuffle_epi8(wh[1], d);
657     const __m128i wh_sc = _mm_unpacklo_epi16(wg_wg, sc_sc);
658     __m128i s = _mm_madd_epi16(pixel[0], wh_sc);
659 
660     __m128i b = _mm_shuffle_epi8(pixel[1], rep);
661     b = _mm_unpacklo_epi16(b, pixel[2]);
662     __m128i sum = _mm_madd_epi16(b, ww[0]);
663 
664     sum = _mm_add_epi32(s, sum);
665     sum = _mm_add_epi32(sum, round);
666     sum = _mm_srai_epi32(sum, 1 + SMOOTH_WEIGHT_LOG2_SCALE);
667 
668     sum = _mm_shuffle_epi8(sum, gat);
669     *(int *)dst = _mm_cvtsi128_si32(sum);
670     dst += stride;
671 
672     rep = _mm_add_epi16(rep, one);
673     d = _mm_add_epi16(d, inc);
674   }
675 }
676 
677 void aom_smooth_predictor_4x4_ssse3(uint8_t *dst, ptrdiff_t stride,
678                                     const uint8_t *above, const uint8_t *left) {
679   __m128i pixels[3];
680   load_pixel_w4(above, left, 4, pixels);
681 
682   __m128i wh[4], ww[2];
683   load_weight_w4(4, wh, ww);
684 
685   smooth_pred_4xh(pixels, wh, ww, 4, dst, stride, 0);
686 }
687 
688 void aom_smooth_predictor_4x8_ssse3(uint8_t *dst, ptrdiff_t stride,
689                                     const uint8_t *above, const uint8_t *left) {
690   __m128i pixels[3];
691   load_pixel_w4(above, left, 8, pixels);
692 
693   __m128i wh[4], ww[2];
694   load_weight_w4(8, wh, ww);
695 
696   smooth_pred_4xh(pixels, wh, ww, 8, dst, stride, 0);
697 }
698 
699 #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
700 void aom_smooth_predictor_4x16_ssse3(uint8_t *dst, ptrdiff_t stride,
701                                      const uint8_t *above,
702                                      const uint8_t *left) {
703   __m128i pixels[3];
704   load_pixel_w4(above, left, 16, pixels);
705 
706   __m128i wh[4], ww[2];
707   load_weight_w4(16, wh, ww);
708 
709   smooth_pred_4xh(pixels, wh, ww, 8, dst, stride, 0);
710   dst += stride << 3;
711   smooth_pred_4xh(pixels, &wh[2], ww, 8, dst, stride, 1);
712 }
713 #endif  // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
714 
715 // pixels[0]: above and below_pred interleave vector, first half
716 // pixels[1]: above and below_pred interleave vector, second half
717 // pixels[2]: left vector
718 // pixels[3]: right_pred vector
719 // pixels[4]: above and below_pred interleave vector, first half
720 // pixels[5]: above and below_pred interleave vector, second half
721 // pixels[6]: left vector + 16
722 // pixels[7]: right_pred vector
723 static inline void load_pixel_w8(const uint8_t *above, const uint8_t *left,
724                                  int height, __m128i *pixels) {
725   const __m128i zero = _mm_setzero_si128();
726   const __m128i bp = _mm_set1_epi16((int16_t)left[height - 1]);
727   __m128i d = _mm_loadl_epi64((const __m128i *)above);
728   d = _mm_unpacklo_epi8(d, zero);
729   pixels[0] = _mm_unpacklo_epi16(d, bp);
730   pixels[1] = _mm_unpackhi_epi16(d, bp);
731 
732   pixels[3] = _mm_set1_epi16((int16_t)above[7]);
733 
734   if (height == 4) {
735     pixels[2] = _mm_cvtsi32_si128(((const int *)left)[0]);
736   } else if (height == 8) {
737     pixels[2] = _mm_loadl_epi64((const __m128i *)left);
738   } else if (height == 16) {
739     pixels[2] = _mm_load_si128((const __m128i *)left);
740   } else {
741     pixels[2] = _mm_load_si128((const __m128i *)left);
742     pixels[4] = pixels[0];
743     pixels[5] = pixels[1];
744     pixels[6] = _mm_load_si128((const __m128i *)(left + 16));
745     pixels[7] = pixels[3];
746   }
747 }
748 
749 // weight_h[0]: weight_h vector
750 // weight_h[1]: scale - weight_h vector
751 // weight_h[2]: same as [0], offset 8
752 // weight_h[3]: same as [1], offset 8
753 // weight_h[4]: same as [0], offset 16
754 // weight_h[5]: same as [1], offset 16
755 // weight_h[6]: same as [0], offset 24
756 // weight_h[7]: same as [1], offset 24
757 // weight_w[0]: weights_w and scale - weights_w interleave vector, first half
758 // weight_w[1]: weights_w and scale - weights_w interleave vector, second half
759 static inline void load_weight_w8(int height, __m128i *weight_h,
760                                   __m128i *weight_w) {
761   const __m128i zero = _mm_setzero_si128();
762   const int we_offset = height < 8 ? 0 : 4;
763   __m128i we = _mm_loadu_si128((const __m128i *)&smooth_weights[we_offset]);
764   weight_h[0] = _mm_unpacklo_epi8(we, zero);
765   const __m128i d = _mm_set1_epi16((int16_t)(1 << SMOOTH_WEIGHT_LOG2_SCALE));
766   weight_h[1] = _mm_sub_epi16(d, weight_h[0]);
767 
768   if (height == 4) {
769     we = _mm_srli_si128(we, 4);
770     __m128i tmp1 = _mm_unpacklo_epi8(we, zero);
771     __m128i tmp2 = _mm_sub_epi16(d, tmp1);
772     weight_w[0] = _mm_unpacklo_epi16(tmp1, tmp2);
773     weight_w[1] = _mm_unpackhi_epi16(tmp1, tmp2);
774   } else {
775     weight_w[0] = _mm_unpacklo_epi16(weight_h[0], weight_h[1]);
776     weight_w[1] = _mm_unpackhi_epi16(weight_h[0], weight_h[1]);
777   }
778 
779   if (height == 16) {
780     we = _mm_loadu_si128((const __m128i *)&smooth_weights[12]);
781     weight_h[0] = _mm_unpacklo_epi8(we, zero);
782     weight_h[1] = _mm_sub_epi16(d, weight_h[0]);
783     weight_h[2] = _mm_unpackhi_epi8(we, zero);
784     weight_h[3] = _mm_sub_epi16(d, weight_h[2]);
785   } else if (height == 32) {
786     const __m128i weight_lo =
787         _mm_loadu_si128((const __m128i *)&smooth_weights[28]);
788     weight_h[0] = _mm_unpacklo_epi8(weight_lo, zero);
789     weight_h[1] = _mm_sub_epi16(d, weight_h[0]);
790     weight_h[2] = _mm_unpackhi_epi8(weight_lo, zero);
791     weight_h[3] = _mm_sub_epi16(d, weight_h[2]);
792     const __m128i weight_hi =
793         _mm_loadu_si128((const __m128i *)&smooth_weights[28 + 16]);
794     weight_h[4] = _mm_unpacklo_epi8(weight_hi, zero);
795     weight_h[5] = _mm_sub_epi16(d, weight_h[4]);
796     weight_h[6] = _mm_unpackhi_epi8(weight_hi, zero);
797     weight_h[7] = _mm_sub_epi16(d, weight_h[6]);
798   }
799 }
800 
801 static inline void smooth_pred_8xh(const __m128i *pixels, const __m128i *wh,
802                                    const __m128i *ww, int h, uint8_t *dst,
803                                    ptrdiff_t stride, int second_half) {
804   const __m128i round = _mm_set1_epi32((1 << SMOOTH_WEIGHT_LOG2_SCALE));
805   const __m128i one = _mm_set1_epi16(1);
806   const __m128i inc = _mm_set1_epi16(0x202);
807   const __m128i gat = _mm_set_epi32(0, 0, 0xe0c0a08, 0x6040200);
808 
809   __m128i rep = second_half ? _mm_set1_epi16((short)0x8008)
810                             : _mm_set1_epi16((short)0x8000);
811   __m128i d = _mm_set1_epi16(0x100);
812 
813   int i;
814   for (i = 0; i < h; ++i) {
815     const __m128i wg_wg = _mm_shuffle_epi8(wh[0], d);
816     const __m128i sc_sc = _mm_shuffle_epi8(wh[1], d);
817     const __m128i wh_sc = _mm_unpacklo_epi16(wg_wg, sc_sc);
818     __m128i s0 = _mm_madd_epi16(pixels[0], wh_sc);
819     __m128i s1 = _mm_madd_epi16(pixels[1], wh_sc);
820 
821     __m128i b = _mm_shuffle_epi8(pixels[2], rep);
822     b = _mm_unpacklo_epi16(b, pixels[3]);
823     __m128i sum0 = _mm_madd_epi16(b, ww[0]);
824     __m128i sum1 = _mm_madd_epi16(b, ww[1]);
825 
826     s0 = _mm_add_epi32(s0, sum0);
827     s0 = _mm_add_epi32(s0, round);
828     s0 = _mm_srai_epi32(s0, 1 + SMOOTH_WEIGHT_LOG2_SCALE);
829 
830     s1 = _mm_add_epi32(s1, sum1);
831     s1 = _mm_add_epi32(s1, round);
832     s1 = _mm_srai_epi32(s1, 1 + SMOOTH_WEIGHT_LOG2_SCALE);
833 
834     sum0 = _mm_packus_epi16(s0, s1);
835     sum0 = _mm_shuffle_epi8(sum0, gat);
836     _mm_storel_epi64((__m128i *)dst, sum0);
837     dst += stride;
838 
839     rep = _mm_add_epi16(rep, one);
840     d = _mm_add_epi16(d, inc);
841   }
842 }
843 
844 void aom_smooth_predictor_8x4_ssse3(uint8_t *dst, ptrdiff_t stride,
845                                     const uint8_t *above, const uint8_t *left) {
846   __m128i pixels[4];
847   load_pixel_w8(above, left, 4, pixels);
848 
849   __m128i wh[4], ww[2];
850   load_weight_w8(4, wh, ww);
851 
852   smooth_pred_8xh(pixels, wh, ww, 4, dst, stride, 0);
853 }
854 
855 void aom_smooth_predictor_8x8_ssse3(uint8_t *dst, ptrdiff_t stride,
856                                     const uint8_t *above, const uint8_t *left) {
857   __m128i pixels[4];
858   load_pixel_w8(above, left, 8, pixels);
859 
860   __m128i wh[4], ww[2];
861   load_weight_w8(8, wh, ww);
862 
863   smooth_pred_8xh(pixels, wh, ww, 8, dst, stride, 0);
864 }
865 
866 void aom_smooth_predictor_8x16_ssse3(uint8_t *dst, ptrdiff_t stride,
867                                      const uint8_t *above,
868                                      const uint8_t *left) {
869   __m128i pixels[4];
870   load_pixel_w8(above, left, 16, pixels);
871 
872   __m128i wh[4], ww[2];
873   load_weight_w8(16, wh, ww);
874 
875   smooth_pred_8xh(pixels, wh, ww, 8, dst, stride, 0);
876   dst += stride << 3;
877   smooth_pred_8xh(pixels, &wh[2], ww, 8, dst, stride, 1);
878 }
879 
880 #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
881 void aom_smooth_predictor_8x32_ssse3(uint8_t *dst, ptrdiff_t stride,
882                                      const uint8_t *above,
883                                      const uint8_t *left) {
884   __m128i pixels[8];
885   load_pixel_w8(above, left, 32, pixels);
886 
887   __m128i wh[8], ww[2];
888   load_weight_w8(32, wh, ww);
889 
890   smooth_pred_8xh(&pixels[0], wh, ww, 8, dst, stride, 0);
891   dst += stride << 3;
892   smooth_pred_8xh(&pixels[0], &wh[2], ww, 8, dst, stride, 1);
893   dst += stride << 3;
894   smooth_pred_8xh(&pixels[4], &wh[4], ww, 8, dst, stride, 0);
895   dst += stride << 3;
896   smooth_pred_8xh(&pixels[4], &wh[6], ww, 8, dst, stride, 1);
897 }
898 #endif  // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
899 
900 // TODO(slavarnway): Visual Studio only supports restrict when /std:c11
901 // (available in 2019+) or greater is specified; __restrict can be used in that
902 // case. This should be moved to rtcd and used consistently between the
903 // function declarations and definitions to avoid warnings in Visual Studio
904 // when defining LIBAOM_RESTRICT to restrict or __restrict.
905 #if defined(_MSC_VER)
906 #define LIBAOM_RESTRICT
907 #else
908 #define LIBAOM_RESTRICT restrict
909 #endif
910 
911 static AOM_FORCE_INLINE __m128i Load4(const void *src) {
912   // With new compilers such as clang 8.0.0 we can use the new _mm_loadu_si32
913   // intrinsic. Both _mm_loadu_si32(src) and the code here are compiled into a
914   // movss instruction.
915   //
916   // Until compiler support of _mm_loadu_si32 is widespread, use of
917   // _mm_loadu_si32 is banned.
918   int val;
919   memcpy(&val, src, sizeof(val));
920   return _mm_cvtsi32_si128(val);
921 }
922 
923 static AOM_FORCE_INLINE __m128i LoadLo8(const void *a) {
924   return _mm_loadl_epi64((const __m128i *)(a));
925 }
926 
927 static AOM_FORCE_INLINE __m128i LoadUnaligned16(const void *a) {
928   return _mm_loadu_si128((const __m128i *)(a));
929 }
930 
931 static AOM_FORCE_INLINE void Store4(void *dst, const __m128i x) {
932   const int val = _mm_cvtsi128_si32(x);
933   memcpy(dst, &val, sizeof(val));
934 }
935 
936 static AOM_FORCE_INLINE void StoreLo8(void *a, const __m128i v) {
937   _mm_storel_epi64((__m128i *)(a), v);
938 }
939 
940 static AOM_FORCE_INLINE void StoreUnaligned16(void *a, const __m128i v) {
941   _mm_storeu_si128((__m128i *)(a), v);
942 }
943 
944 static AOM_FORCE_INLINE __m128i cvtepu8_epi16(__m128i x) {
945   return _mm_unpacklo_epi8((x), _mm_setzero_si128());
946 }
947 
948 static AOM_FORCE_INLINE __m128i cvtepu8_epi32(__m128i x) {
949   const __m128i tmp = _mm_unpacklo_epi8((x), _mm_setzero_si128());
950   return _mm_unpacklo_epi16(tmp, _mm_setzero_si128());
951 }
952 
953 static AOM_FORCE_INLINE __m128i cvtepu16_epi32(__m128i x) {
954   return _mm_unpacklo_epi16((x), _mm_setzero_si128());
955 }
956 
957 static void smooth_predictor_wxh(uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
958                                  const uint8_t *LIBAOM_RESTRICT top_row,
959                                  const uint8_t *LIBAOM_RESTRICT left_column,
960                                  int width, int height) {
961   const uint8_t *const sm_weights_h = smooth_weights + height - 4;
962   const uint8_t *const sm_weights_w = smooth_weights + width - 4;
963   const __m128i zero = _mm_setzero_si128();
964   const __m128i scale_value = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
965   const __m128i bottom_left = _mm_cvtsi32_si128(left_column[height - 1]);
966   const __m128i top_right = _mm_set1_epi16(top_row[width - 1]);
967   const __m128i round = _mm_set1_epi32(1 << SMOOTH_WEIGHT_LOG2_SCALE);
968   for (int y = 0; y < height; ++y) {
969     const __m128i weights_y = _mm_cvtsi32_si128(sm_weights_h[y]);
970     const __m128i left_y = _mm_cvtsi32_si128(left_column[y]);
971     const __m128i scale_m_weights_y = _mm_sub_epi16(scale_value, weights_y);
972     __m128i scaled_bottom_left =
973         _mm_mullo_epi16(scale_m_weights_y, bottom_left);
974     const __m128i weight_left_y =
975         _mm_shuffle_epi32(_mm_unpacklo_epi16(weights_y, left_y), 0);
976     scaled_bottom_left = _mm_add_epi32(scaled_bottom_left, round);
977     scaled_bottom_left = _mm_shuffle_epi32(scaled_bottom_left, 0);
978     for (int x = 0; x < width; x += 8) {
979       const __m128i top_x = LoadLo8(top_row + x);
980       const __m128i weights_x = LoadLo8(sm_weights_w + x);
981       const __m128i top_weights_x = _mm_unpacklo_epi8(top_x, weights_x);
982       const __m128i top_weights_x_lo = cvtepu8_epi16(top_weights_x);
983       const __m128i top_weights_x_hi = _mm_unpackhi_epi8(top_weights_x, zero);
984 
985       // Here opposite weights and pixels are multiplied, where the order of
986       // interleaving is indicated in the names.
987       __m128i pred_lo = _mm_madd_epi16(top_weights_x_lo, weight_left_y);
988       __m128i pred_hi = _mm_madd_epi16(top_weights_x_hi, weight_left_y);
989 
990       // |scaled_bottom_left| is always scaled by the same weight each row, so
991       // we only derive |scaled_top_right| values here.
992       const __m128i inverted_weights_x =
993           _mm_sub_epi16(scale_value, cvtepu8_epi16(weights_x));
994       const __m128i scaled_top_right =
995           _mm_mullo_epi16(inverted_weights_x, top_right);
996       const __m128i scaled_top_right_lo = cvtepu16_epi32(scaled_top_right);
997       const __m128i scaled_top_right_hi =
998           _mm_unpackhi_epi16(scaled_top_right, zero);
999       pred_lo = _mm_add_epi32(pred_lo, scaled_bottom_left);
1000       pred_hi = _mm_add_epi32(pred_hi, scaled_bottom_left);
1001       pred_lo = _mm_add_epi32(pred_lo, scaled_top_right_lo);
1002       pred_hi = _mm_add_epi32(pred_hi, scaled_top_right_hi);
1003 
1004       // The round value for RightShiftWithRounding was added with
1005       // |scaled_bottom_left|.
1006       pred_lo = _mm_srli_epi32(pred_lo, (1 + SMOOTH_WEIGHT_LOG2_SCALE));
1007       pred_hi = _mm_srli_epi32(pred_hi, (1 + SMOOTH_WEIGHT_LOG2_SCALE));
1008       const __m128i pred = _mm_packus_epi16(pred_lo, pred_hi);
1009       StoreLo8(dst + x, _mm_packus_epi16(pred, pred));
1010     }
1011     dst += stride;
1012   }
1013 }
1014 
1015 #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
1016 void aom_smooth_predictor_16x4_ssse3(uint8_t *dst, ptrdiff_t stride,
1017                                      const uint8_t *above,
1018                                      const uint8_t *left) {
1019   smooth_predictor_wxh(dst, stride, above, left, 16, 4);
1020 }
1021 #endif  // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
1022 
1023 void aom_smooth_predictor_16x8_ssse3(uint8_t *dst, ptrdiff_t stride,
1024                                      const uint8_t *above,
1025                                      const uint8_t *left) {
1026   smooth_predictor_wxh(dst, stride, above, left, 16, 8);
1027 }
1028 
1029 void aom_smooth_predictor_16x16_ssse3(uint8_t *dst, ptrdiff_t stride,
1030                                       const uint8_t *above,
1031                                       const uint8_t *left) {
1032   smooth_predictor_wxh(dst, stride, above, left, 16, 16);
1033 }
1034 
1035 void aom_smooth_predictor_16x32_ssse3(uint8_t *dst, ptrdiff_t stride,
1036                                       const uint8_t *above,
1037                                       const uint8_t *left) {
1038   smooth_predictor_wxh(dst, stride, above, left, 16, 32);
1039 }
1040 
1041 #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
1042 void aom_smooth_predictor_16x64_ssse3(uint8_t *dst, ptrdiff_t stride,
1043                                       const uint8_t *above,
1044                                       const uint8_t *left) {
1045   smooth_predictor_wxh(dst, stride, above, left, 16, 64);
1046 }
1047 
1048 void aom_smooth_predictor_32x8_ssse3(uint8_t *dst, ptrdiff_t stride,
1049                                      const uint8_t *above,
1050                                      const uint8_t *left) {
1051   smooth_predictor_wxh(dst, stride, above, left, 32, 8);
1052 }
1053 #endif  // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
1054 
1055 void aom_smooth_predictor_32x16_ssse3(uint8_t *dst, ptrdiff_t stride,
1056                                       const uint8_t *above,
1057                                       const uint8_t *left) {
1058   smooth_predictor_wxh(dst, stride, above, left, 32, 16);
1059 }
1060 
1061 void aom_smooth_predictor_32x32_ssse3(uint8_t *dst, ptrdiff_t stride,
1062                                       const uint8_t *above,
1063                                       const uint8_t *left) {
1064   smooth_predictor_wxh(dst, stride, above, left, 32, 32);
1065 }
1066 
1067 void aom_smooth_predictor_32x64_ssse3(uint8_t *dst, ptrdiff_t stride,
1068                                       const uint8_t *above,
1069                                       const uint8_t *left) {
1070   smooth_predictor_wxh(dst, stride, above, left, 32, 64);
1071 }
1072 
1073 #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
1074 void aom_smooth_predictor_64x16_ssse3(uint8_t *dst, ptrdiff_t stride,
1075                                       const uint8_t *above,
1076                                       const uint8_t *left) {
1077   smooth_predictor_wxh(dst, stride, above, left, 64, 16);
1078 }
1079 #endif  // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
1080 
1081 void aom_smooth_predictor_64x32_ssse3(uint8_t *dst, ptrdiff_t stride,
1082                                       const uint8_t *above,
1083                                       const uint8_t *left) {
1084   smooth_predictor_wxh(dst, stride, above, left, 64, 32);
1085 }
1086 
1087 void aom_smooth_predictor_64x64_ssse3(uint8_t *dst, ptrdiff_t stride,
1088                                       const uint8_t *above,
1089                                       const uint8_t *left) {
1090   smooth_predictor_wxh(dst, stride, above, left, 64, 64);
1091 }
1092 
1093 // -----------------------------------------------------------------------------
1094 // Smooth horizontal/vertical helper functions.
1095 
1096 // For Horizontal, pixels1 and pixels2 are the same repeated value. For
1097 // Vertical, weights1 and weights2 are the same, and scaled_corner1 and
1098 // scaled_corner2 are the same.
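// The write_* helpers below compute
//   pred = (scaled_corner + pixels * weights + round) >> 8
// per 16-bit lane and pack the results to 8-bit pixels.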
1099 static AOM_FORCE_INLINE void write_smooth_directional_sum16(
1100     uint8_t *LIBAOM_RESTRICT dst, const __m128i pixels1, const __m128i pixels2,
1101     const __m128i weights1, const __m128i weights2,
1102     const __m128i scaled_corner1, const __m128i scaled_corner2,
1103     const __m128i round) {
1104   const __m128i weighted_px1 = _mm_mullo_epi16(pixels1, weights1);
1105   const __m128i weighted_px2 = _mm_mullo_epi16(pixels2, weights2);
1106   const __m128i pred_sum1 = _mm_add_epi16(scaled_corner1, weighted_px1);
1107   const __m128i pred_sum2 = _mm_add_epi16(scaled_corner2, weighted_px2);
1108   // Equivalent to RightShiftWithRounding(pred[x][y], 8).
1109   const __m128i pred1 = _mm_srli_epi16(_mm_add_epi16(pred_sum1, round), 8);
1110   const __m128i pred2 = _mm_srli_epi16(_mm_add_epi16(pred_sum2, round), 8);
1111   StoreUnaligned16(dst, _mm_packus_epi16(pred1, pred2));
1112 }
1113 
1114 static AOM_FORCE_INLINE __m128i smooth_directional_sum8(
1115     const __m128i pixels, const __m128i weights, const __m128i scaled_corner) {
1116   const __m128i weighted_px = _mm_mullo_epi16(pixels, weights);
1117   return _mm_add_epi16(scaled_corner, weighted_px);
1118 }
1119 
1120 static AOM_FORCE_INLINE void write_smooth_directional_sum8(
1121     uint8_t *LIBAOM_RESTRICT dst, const __m128i *pixels, const __m128i *weights,
1122     const __m128i *scaled_corner, const __m128i *round) {
1123   const __m128i pred_sum =
1124       smooth_directional_sum8(*pixels, *weights, *scaled_corner);
1125   // Equivalent to RightShiftWithRounding(pred[x][y], 8).
1126   const __m128i pred = _mm_srli_epi16(_mm_add_epi16(pred_sum, *round), 8);
1127   StoreLo8(dst, _mm_packus_epi16(pred, pred));
1128 }
1129 
1130 // -----------------------------------------------------------------------------
1131 // SMOOTH_V_PRED
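// SMOOTH_V blends along the vertical axis only (restating the vector code):
//   pred(x, y) = (w[y] * top[x] + (256 - w[y]) * bottom_left + 128) >> 8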
1132 
1133 static AOM_FORCE_INLINE void load_smooth_vertical_pixels4(
1134     const uint8_t *LIBAOM_RESTRICT above, const uint8_t *LIBAOM_RESTRICT left,
1135     const int height, __m128i *pixels) {
1136   __m128i top = Load4(above);
1137   const __m128i bottom_left = _mm_set1_epi16(left[height - 1]);
1138   top = cvtepu8_epi16(top);
1139   pixels[0] = _mm_unpacklo_epi16(top, bottom_left);
1140 }
1141 
1142 // |weight_array| alternates weight vectors from the table with their inverted
1143 // (256-w) counterparts. This is precomputed by the compiler when the weights
1144 // table is visible to this module. Removing this visibility can cut speed by up
1145 // to half in both 4xH and 8xH transforms.
1146 static AOM_FORCE_INLINE void load_smooth_vertical_weights4(
1147     const uint8_t *LIBAOM_RESTRICT weight_array, const int height,
1148     __m128i *weights) {
1149   const __m128i inverter = _mm_set1_epi16(256);
1150 
1151   if (height == 4) {
1152     const __m128i weight = Load4(weight_array);
1153     weights[0] = cvtepu8_epi16(weight);
1154     weights[1] = _mm_sub_epi16(inverter, weights[0]);
1155   } else if (height == 8) {
1156     const __m128i weight = LoadLo8(weight_array + 4);
1157     weights[0] = cvtepu8_epi16(weight);
1158     weights[1] = _mm_sub_epi16(inverter, weights[0]);
1159   } else {
1160     const __m128i weight = LoadUnaligned16(weight_array + 12);
1161     const __m128i zero = _mm_setzero_si128();
1162     weights[0] = cvtepu8_epi16(weight);
1163     weights[1] = _mm_sub_epi16(inverter, weights[0]);
1164     weights[2] = _mm_unpackhi_epi8(weight, zero);
1165     weights[3] = _mm_sub_epi16(inverter, weights[2]);
1166   }
1167 }
1168 
1169 static AOM_FORCE_INLINE void write_smooth_vertical4xh(
1170     const __m128i *pixel, const __m128i *weight, const int height,
1171     uint8_t *LIBAOM_RESTRICT dst, const ptrdiff_t stride) {
1172   const __m128i pred_round = _mm_set1_epi32(128);
1173   const __m128i mask_increment = _mm_set1_epi16(0x0202);
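  // Despite its name, |cvtepu8_epi32| is a pshufb mask that gathers the low
  // byte of each 32-bit result (bytes 0, 4, 8 and 12), packing the four
  // predictions back down to 8-bit pixels for Store4().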
1174   const __m128i cvtepu8_epi32 = _mm_set1_epi32(0x0C080400);
1175   __m128i y_select = _mm_set1_epi16(0x0100);
1176 
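  // |y_select| starts as the byte pair {0, 1}, a pshufb control selecting the
  // 16-bit weight for row 0; adding 0x0202 each iteration steps it to the
  // next 16-bit lane of |weight|.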
1177   for (int y = 0; y < height; ++y) {
1178     const __m128i weight_y = _mm_shuffle_epi8(weight[0], y_select);
1179     const __m128i inverted_weight_y = _mm_shuffle_epi8(weight[1], y_select);
1180     const __m128i alternate_weights =
1181         _mm_unpacklo_epi16(weight_y, inverted_weight_y);
1182     // Here the pixel vector is top_row[0], corner, top_row[1], corner, ...
1183     // The madd instruction yields four results of the form:
1184     // (top_row[x] * weight[y] + corner * inverted_weight[y])
1185     __m128i sum = _mm_madd_epi16(pixel[0], alternate_weights);
1186     sum = _mm_add_epi32(sum, pred_round);
1187     sum = _mm_srai_epi32(sum, 8);
1188     sum = _mm_shuffle_epi8(sum, cvtepu8_epi32);
1189     Store4(dst, sum);
1190     dst += stride;
1191     y_select = _mm_add_epi16(y_select, mask_increment);
1192   }
1193 }
1194 
1195 void aom_smooth_v_predictor_4x4_ssse3(
1196     uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
1197     const uint8_t *LIBAOM_RESTRICT top_row,
1198     const uint8_t *LIBAOM_RESTRICT left_column) {
1199   __m128i pixels;
1200   load_smooth_vertical_pixels4(top_row, left_column, 4, &pixels);
1201 
1202   __m128i weights[2];
1203   load_smooth_vertical_weights4(smooth_weights, 4, weights);
1204 
1205   write_smooth_vertical4xh(&pixels, weights, 4, dst, stride);
1206 }
1207 
1208 void aom_smooth_v_predictor_4x8_ssse3(
1209     uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
1210     const uint8_t *LIBAOM_RESTRICT top_row,
1211     const uint8_t *LIBAOM_RESTRICT left_column) {
1212   __m128i pixels;
1213   load_smooth_vertical_pixels4(top_row, left_column, 8, &pixels);
1214 
1215   __m128i weights[2];
1216   load_smooth_vertical_weights4(smooth_weights, 8, weights);
1217 
1218   write_smooth_vertical4xh(&pixels, weights, 8, dst, stride);
1219 }
1220 
1221 #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
1222 void aom_smooth_v_predictor_4x16_ssse3(
1223     uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
1224     const uint8_t *LIBAOM_RESTRICT top_row,
1225     const uint8_t *LIBAOM_RESTRICT left_column) {
1226   __m128i pixels;
1227   load_smooth_vertical_pixels4(top_row, left_column, 16, &pixels);
1228 
1229   __m128i weights[4];
1230   load_smooth_vertical_weights4(smooth_weights, 16, weights);
1231 
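  // 16 rows take two passes of 8: weights[0]/[1] cover rows 0-7 and
  // weights[2]/[3] cover rows 8-15, so |dst| advances by 8 rows in between.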
1232   write_smooth_vertical4xh(&pixels, weights, 8, dst, stride);
1233   dst += stride << 3;
1234   write_smooth_vertical4xh(&pixels, &weights[2], 8, dst, stride);
1235 }
1236 #endif  // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
1237 
1238 void aom_smooth_v_predictor_8x4_ssse3(
1239     uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
1240     const uint8_t *LIBAOM_RESTRICT top_row,
1241     const uint8_t *LIBAOM_RESTRICT left_column) {
1242   const __m128i bottom_left = _mm_set1_epi16(left_column[3]);
1243   const __m128i weights = cvtepu8_epi16(Load4(smooth_weights));
1244   const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
1245   const __m128i inverted_weights = _mm_sub_epi16(scale, weights);
1246   const __m128i scaled_bottom_left =
1247       _mm_mullo_epi16(inverted_weights, bottom_left);
1248   const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1));
1249   __m128i y_select = _mm_set1_epi32(0x01000100);
1250   const __m128i top = cvtepu8_epi16(LoadLo8(top_row));
1251   __m128i weights_y = _mm_shuffle_epi8(weights, y_select);
1252   __m128i scaled_bottom_left_y = _mm_shuffle_epi8(scaled_bottom_left, y_select);
1253   write_smooth_directional_sum8(dst, &top, &weights_y, &scaled_bottom_left_y,
1254                                 &round);
1255   dst += stride;
1256   y_select = _mm_set1_epi32(0x03020302);
1257   weights_y = _mm_shuffle_epi8(weights, y_select);
1258   scaled_bottom_left_y = _mm_shuffle_epi8(scaled_bottom_left, y_select);
1259   write_smooth_directional_sum8(dst, &top, &weights_y, &scaled_bottom_left_y,
1260                                 &round);
1261   dst += stride;
1262   y_select = _mm_set1_epi32(0x05040504);
1263   weights_y = _mm_shuffle_epi8(weights, y_select);
1264   scaled_bottom_left_y = _mm_shuffle_epi8(scaled_bottom_left, y_select);
1265   write_smooth_directional_sum8(dst, &top, &weights_y, &scaled_bottom_left_y,
1266                                 &round);
1267   dst += stride;
1268   y_select = _mm_set1_epi32(0x07060706);
1269   weights_y = _mm_shuffle_epi8(weights, y_select);
1270   scaled_bottom_left_y = _mm_shuffle_epi8(scaled_bottom_left, y_select);
1271   write_smooth_directional_sum8(dst, &top, &weights_y, &scaled_bottom_left_y,
1272                                 &round);
1273 }
1274 
1275 void aom_smooth_v_predictor_8x8_ssse3(
1276     uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
1277     const uint8_t *LIBAOM_RESTRICT top_row,
1278     const uint8_t *LIBAOM_RESTRICT left_column) {
1279   const __m128i bottom_left = _mm_set1_epi16(left_column[7]);
1280   const __m128i weights = cvtepu8_epi16(LoadLo8(smooth_weights + 4));
1281   const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
1282   const __m128i inverted_weights = _mm_sub_epi16(scale, weights);
1283   const __m128i scaled_bottom_left =
1284       _mm_mullo_epi16(inverted_weights, bottom_left);
1285   const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1));
1286   const __m128i top = cvtepu8_epi16(LoadLo8(top_row));
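  // |y_mask| is a pshufb control that broadcasts one 16-bit weight lane to the
  // whole register: 0x01000100 selects lane 0, and each 0x02020202 step moves
  // to the next lane, covering the 8 rows held in |weights|.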
1287   for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
1288     const __m128i y_select = _mm_set1_epi32(y_mask);
1289     const __m128i weights_y = _mm_shuffle_epi8(weights, y_select);
1290     const __m128i scaled_bottom_left_y =
1291         _mm_shuffle_epi8(scaled_bottom_left, y_select);
1292     write_smooth_directional_sum8(dst, &top, &weights_y, &scaled_bottom_left_y,
1293                                   &round);
1294     dst += stride;
1295   }
1296 }
1297 
1298 void aom_smooth_v_predictor_8x16_ssse3(
1299     uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
1300     const uint8_t *LIBAOM_RESTRICT top_row,
1301     const uint8_t *LIBAOM_RESTRICT left_column) {
1302   const __m128i bottom_left = _mm_set1_epi16(left_column[15]);
1303   const __m128i weights = LoadUnaligned16(smooth_weights + 12);
1304 
1305   const __m128i weights1 = cvtepu8_epi16(weights);
1306   const __m128i weights2 = cvtepu8_epi16(_mm_srli_si128(weights, 8));
1307   const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
1308   const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1);
1309   const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2);
1310   const __m128i scaled_bottom_left1 =
1311       _mm_mullo_epi16(inverted_weights1, bottom_left);
1312   const __m128i scaled_bottom_left2 =
1313       _mm_mullo_epi16(inverted_weights2, bottom_left);
1314   const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1));
1315   const __m128i top = cvtepu8_epi16(LoadLo8(top_row));
1316   for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
1317     const __m128i y_select = _mm_set1_epi32(y_mask);
1318     const __m128i weights_y = _mm_shuffle_epi8(weights1, y_select);
1319     const __m128i scaled_bottom_left_y =
1320         _mm_shuffle_epi8(scaled_bottom_left1, y_select);
1321     write_smooth_directional_sum8(dst, &top, &weights_y, &scaled_bottom_left_y,
1322                                   &round);
1323     dst += stride;
1324   }
1325   for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
1326     const __m128i y_select = _mm_set1_epi32(y_mask);
1327     const __m128i weights_y = _mm_shuffle_epi8(weights2, y_select);
1328     const __m128i scaled_bottom_left_y =
1329         _mm_shuffle_epi8(scaled_bottom_left2, y_select);
1330     write_smooth_directional_sum8(dst, &top, &weights_y, &scaled_bottom_left_y,
1331                                   &round);
1332     dst += stride;
1333   }
1334 }
1335 
1336 #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
1337 void aom_smooth_v_predictor_8x32_ssse3(
1338     uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
1339     const uint8_t *LIBAOM_RESTRICT top_row,
1340     const uint8_t *LIBAOM_RESTRICT left_column) {
1341   const __m128i zero = _mm_setzero_si128();
1342   const __m128i bottom_left = _mm_set1_epi16(left_column[31]);
1343   const __m128i weights_lo = LoadUnaligned16(smooth_weights + 28);
1344   const __m128i weights_hi = LoadUnaligned16(smooth_weights + 44);
1345   const __m128i weights1 = cvtepu8_epi16(weights_lo);
1346   const __m128i weights2 = _mm_unpackhi_epi8(weights_lo, zero);
1347   const __m128i weights3 = cvtepu8_epi16(weights_hi);
1348   const __m128i weights4 = _mm_unpackhi_epi8(weights_hi, zero);
1349   const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
1350   const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1);
1351   const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2);
1352   const __m128i inverted_weights3 = _mm_sub_epi16(scale, weights3);
1353   const __m128i inverted_weights4 = _mm_sub_epi16(scale, weights4);
1354   const __m128i scaled_bottom_left1 =
1355       _mm_mullo_epi16(inverted_weights1, bottom_left);
1356   const __m128i scaled_bottom_left2 =
1357       _mm_mullo_epi16(inverted_weights2, bottom_left);
1358   const __m128i scaled_bottom_left3 =
1359       _mm_mullo_epi16(inverted_weights3, bottom_left);
1360   const __m128i scaled_bottom_left4 =
1361       _mm_mullo_epi16(inverted_weights4, bottom_left);
1362   const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1));
1363   const __m128i top = cvtepu8_epi16(LoadLo8(top_row));
1364   for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
1365     const __m128i y_select = _mm_set1_epi32(y_mask);
1366     const __m128i weights_y = _mm_shuffle_epi8(weights1, y_select);
1367     const __m128i scaled_bottom_left_y =
1368         _mm_shuffle_epi8(scaled_bottom_left1, y_select);
1369     write_smooth_directional_sum8(dst, &top, &weights_y, &scaled_bottom_left_y,
1370                                   &round);
1371     dst += stride;
1372   }
1373   for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
1374     const __m128i y_select = _mm_set1_epi32(y_mask);
1375     const __m128i weights_y = _mm_shuffle_epi8(weights2, y_select);
1376     const __m128i scaled_bottom_left_y =
1377         _mm_shuffle_epi8(scaled_bottom_left2, y_select);
1378     write_smooth_directional_sum8(dst, &top, &weights_y, &scaled_bottom_left_y,
1379                                   &round);
1380     dst += stride;
1381   }
1382   for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
1383     const __m128i y_select = _mm_set1_epi32(y_mask);
1384     const __m128i weights_y = _mm_shuffle_epi8(weights3, y_select);
1385     const __m128i scaled_bottom_left_y =
1386         _mm_shuffle_epi8(scaled_bottom_left3, y_select);
1387     write_smooth_directional_sum8(dst, &top, &weights_y, &scaled_bottom_left_y,
1388                                   &round);
1389     dst += stride;
1390   }
1391   for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
1392     const __m128i y_select = _mm_set1_epi32(y_mask);
1393     const __m128i weights_y = _mm_shuffle_epi8(weights4, y_select);
1394     const __m128i scaled_bottom_left_y =
1395         _mm_shuffle_epi8(scaled_bottom_left4, y_select);
1396     write_smooth_directional_sum8(dst, &top, &weights_y, &scaled_bottom_left_y,
1397                                   &round);
1398     dst += stride;
1399   }
1400 }
1401 
1402 void aom_smooth_v_predictor_16x4_ssse3(
1403     uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
1404     const uint8_t *LIBAOM_RESTRICT top_row,
1405     const uint8_t *LIBAOM_RESTRICT left_column) {
1406   const __m128i bottom_left = _mm_set1_epi16(left_column[3]);
1407   const __m128i weights = cvtepu8_epi16(Load4(smooth_weights));
1408   const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
1409   const __m128i inverted_weights = _mm_sub_epi16(scale, weights);
1410   const __m128i scaled_bottom_left =
1411       _mm_mullo_epi16(inverted_weights, bottom_left);
1412   const __m128i round = _mm_set1_epi16(128);
1413   const __m128i top = LoadUnaligned16(top_row);
1414   const __m128i top_lo = cvtepu8_epi16(top);
1415   const __m128i top_hi = cvtepu8_epi16(_mm_srli_si128(top, 8));
1416 
1417   __m128i y_select = _mm_set1_epi32(0x01000100);
1418   __m128i weights_y = _mm_shuffle_epi8(weights, y_select);
1419   __m128i scaled_bottom_left_y = _mm_shuffle_epi8(scaled_bottom_left, y_select);
1420   write_smooth_directional_sum16(dst, top_lo, top_hi, weights_y, weights_y,
1421                                  scaled_bottom_left_y, scaled_bottom_left_y,
1422                                  round);
1423   dst += stride;
1424   y_select = _mm_set1_epi32(0x03020302);
1425   weights_y = _mm_shuffle_epi8(weights, y_select);
1426   scaled_bottom_left_y = _mm_shuffle_epi8(scaled_bottom_left, y_select);
1427   write_smooth_directional_sum16(dst, top_lo, top_hi, weights_y, weights_y,
1428                                  scaled_bottom_left_y, scaled_bottom_left_y,
1429                                  round);
1430   dst += stride;
1431   y_select = _mm_set1_epi32(0x05040504);
1432   weights_y = _mm_shuffle_epi8(weights, y_select);
1433   scaled_bottom_left_y = _mm_shuffle_epi8(scaled_bottom_left, y_select);
1434   write_smooth_directional_sum16(dst, top_lo, top_hi, weights_y, weights_y,
1435                                  scaled_bottom_left_y, scaled_bottom_left_y,
1436                                  round);
1437   dst += stride;
1438   y_select = _mm_set1_epi32(0x07060706);
1439   weights_y = _mm_shuffle_epi8(weights, y_select);
1440   scaled_bottom_left_y = _mm_shuffle_epi8(scaled_bottom_left, y_select);
1441   write_smooth_directional_sum16(dst, top_lo, top_hi, weights_y, weights_y,
1442                                  scaled_bottom_left_y, scaled_bottom_left_y,
1443                                  round);
1444 }
1445 #endif  // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
1446 
1447 void aom_smooth_v_predictor_16x8_ssse3(
1448     uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
1449     const uint8_t *LIBAOM_RESTRICT top_row,
1450     const uint8_t *LIBAOM_RESTRICT left_column) {
1451   const __m128i bottom_left = _mm_set1_epi16(left_column[7]);
1452   const __m128i weights = cvtepu8_epi16(LoadLo8(smooth_weights + 4));
1453   const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
1454   const __m128i inverted_weights = _mm_sub_epi16(scale, weights);
1455   const __m128i scaled_bottom_left =
1456       _mm_mullo_epi16(inverted_weights, bottom_left);
1457   const __m128i round = _mm_set1_epi16(128);
1458   const __m128i top = LoadUnaligned16(top_row);
1459   const __m128i top_lo = cvtepu8_epi16(top);
1460   const __m128i top_hi = cvtepu8_epi16(_mm_srli_si128(top, 8));
1461   for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
1462     const __m128i y_select = _mm_set1_epi32(y_mask);
1463     const __m128i weights_y = _mm_shuffle_epi8(weights, y_select);
1464     const __m128i scaled_bottom_left_y =
1465         _mm_shuffle_epi8(scaled_bottom_left, y_select);
1466     write_smooth_directional_sum16(dst, top_lo, top_hi, weights_y, weights_y,
1467                                    scaled_bottom_left_y, scaled_bottom_left_y,
1468                                    round);
1469     dst += stride;
1470   }
1471 }
1472 
1473 void aom_smooth_v_predictor_16x16_ssse3(
1474     uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
1475     const uint8_t *LIBAOM_RESTRICT top_row,
1476     const uint8_t *LIBAOM_RESTRICT left_column) {
1477   const __m128i bottom_left = _mm_set1_epi16(left_column[15]);
1478   const __m128i zero = _mm_setzero_si128();
1479   const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
1480   const __m128i weights = LoadUnaligned16(smooth_weights + 12);
1481   const __m128i weights_lo = cvtepu8_epi16(weights);
1482   const __m128i weights_hi = _mm_unpackhi_epi8(weights, zero);
1483   const __m128i inverted_weights_lo = _mm_sub_epi16(scale, weights_lo);
1484   const __m128i inverted_weights_hi = _mm_sub_epi16(scale, weights_hi);
1485   const __m128i scaled_bottom_left_lo =
1486       _mm_mullo_epi16(inverted_weights_lo, bottom_left);
1487   const __m128i scaled_bottom_left_hi =
1488       _mm_mullo_epi16(inverted_weights_hi, bottom_left);
1489   const __m128i round = _mm_set1_epi16(128);
1490 
1491   const __m128i top = LoadUnaligned16(top_row);
1492   const __m128i top_lo = cvtepu8_epi16(top);
1493   const __m128i top_hi = _mm_unpackhi_epi8(top, zero);
1494   for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
1495     const __m128i y_select = _mm_set1_epi32(y_mask);
1496     const __m128i weights_y = _mm_shuffle_epi8(weights_lo, y_select);
1497     const __m128i scaled_bottom_left_y =
1498         _mm_shuffle_epi8(scaled_bottom_left_lo, y_select);
1499     write_smooth_directional_sum16(dst, top_lo, top_hi, weights_y, weights_y,
1500                                    scaled_bottom_left_y, scaled_bottom_left_y,
1501                                    round);
1502     dst += stride;
1503   }
1504   for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
1505     const __m128i y_select = _mm_set1_epi32(y_mask);
1506     const __m128i weights_y = _mm_shuffle_epi8(weights_hi, y_select);
1507     const __m128i scaled_bottom_left_y =
1508         _mm_shuffle_epi8(scaled_bottom_left_hi, y_select);
1509     write_smooth_directional_sum16(dst, top_lo, top_hi, weights_y, weights_y,
1510                                    scaled_bottom_left_y, scaled_bottom_left_y,
1511                                    round);
1512     dst += stride;
1513   }
1514 }
1515 
1516 void aom_smooth_v_predictor_16x32_ssse3(
1517     uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
1518     const uint8_t *LIBAOM_RESTRICT top_row,
1519     const uint8_t *LIBAOM_RESTRICT left_column) {
1520   const __m128i bottom_left = _mm_set1_epi16(left_column[31]);
1521   const __m128i weights_lo = LoadUnaligned16(smooth_weights + 28);
1522   const __m128i weights_hi = LoadUnaligned16(smooth_weights + 44);
1523   const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
1524   const __m128i zero = _mm_setzero_si128();
1525   const __m128i weights1 = cvtepu8_epi16(weights_lo);
1526   const __m128i weights2 = _mm_unpackhi_epi8(weights_lo, zero);
1527   const __m128i weights3 = cvtepu8_epi16(weights_hi);
1528   const __m128i weights4 = _mm_unpackhi_epi8(weights_hi, zero);
1529   const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1);
1530   const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2);
1531   const __m128i inverted_weights3 = _mm_sub_epi16(scale, weights3);
1532   const __m128i inverted_weights4 = _mm_sub_epi16(scale, weights4);
1533   const __m128i scaled_bottom_left1 =
1534       _mm_mullo_epi16(inverted_weights1, bottom_left);
1535   const __m128i scaled_bottom_left2 =
1536       _mm_mullo_epi16(inverted_weights2, bottom_left);
1537   const __m128i scaled_bottom_left3 =
1538       _mm_mullo_epi16(inverted_weights3, bottom_left);
1539   const __m128i scaled_bottom_left4 =
1540       _mm_mullo_epi16(inverted_weights4, bottom_left);
1541   const __m128i round = _mm_set1_epi16(128);
1542 
1543   const __m128i top = LoadUnaligned16(top_row);
1544   const __m128i top_lo = cvtepu8_epi16(top);
1545   const __m128i top_hi = _mm_unpackhi_epi8(top, zero);
1546   for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
1547     const __m128i y_select = _mm_set1_epi32(y_mask);
1548     const __m128i weights_y = _mm_shuffle_epi8(weights1, y_select);
1549     const __m128i scaled_bottom_left_y =
1550         _mm_shuffle_epi8(scaled_bottom_left1, y_select);
1551     write_smooth_directional_sum16(dst, top_lo, top_hi, weights_y, weights_y,
1552                                    scaled_bottom_left_y, scaled_bottom_left_y,
1553                                    round);
1554     dst += stride;
1555   }
1556   for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
1557     const __m128i y_select = _mm_set1_epi32(y_mask);
1558     const __m128i weights_y = _mm_shuffle_epi8(weights2, y_select);
1559     const __m128i scaled_bottom_left_y =
1560         _mm_shuffle_epi8(scaled_bottom_left2, y_select);
1561     write_smooth_directional_sum16(dst, top_lo, top_hi, weights_y, weights_y,
1562                                    scaled_bottom_left_y, scaled_bottom_left_y,
1563                                    round);
1564     dst += stride;
1565   }
1566   for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
1567     const __m128i y_select = _mm_set1_epi32(y_mask);
1568     const __m128i weights_y = _mm_shuffle_epi8(weights3, y_select);
1569     const __m128i scaled_bottom_left_y =
1570         _mm_shuffle_epi8(scaled_bottom_left3, y_select);
1571     write_smooth_directional_sum16(dst, top_lo, top_hi, weights_y, weights_y,
1572                                    scaled_bottom_left_y, scaled_bottom_left_y,
1573                                    round);
1574     dst += stride;
1575   }
1576   for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
1577     const __m128i y_select = _mm_set1_epi32(y_mask);
1578     const __m128i weights_y = _mm_shuffle_epi8(weights4, y_select);
1579     const __m128i scaled_bottom_left_y =
1580         _mm_shuffle_epi8(scaled_bottom_left4, y_select);
1581     write_smooth_directional_sum16(dst, top_lo, top_hi, weights_y, weights_y,
1582                                    scaled_bottom_left_y, scaled_bottom_left_y,
1583                                    round);
1584     dst += stride;
1585   }
1586 }
1587 
1588 #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
1589 void aom_smooth_v_predictor_16x64_ssse3(
1590     uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
1591     const uint8_t *LIBAOM_RESTRICT top_row,
1592     const uint8_t *LIBAOM_RESTRICT left_column) {
1593   const __m128i bottom_left = _mm_set1_epi16(left_column[63]);
1594   const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
1595   const __m128i round = _mm_set1_epi16(128);
1596   const __m128i zero = _mm_setzero_si128();
1597   const __m128i top = LoadUnaligned16(top_row);
1598   const __m128i top_lo = cvtepu8_epi16(top);
1599   const __m128i top_hi = _mm_unpackhi_epi8(top, zero);
1600   const uint8_t *weights_base_ptr = smooth_weights + 60;
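  // The 64 row weights are consumed 16 at a time; each chunk is split into its
  // low and high halves, which the y_mask loops below use for 8 rows apiece.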
1601   for (int left_offset = 0; left_offset < 64; left_offset += 16) {
1602     const __m128i weights = LoadUnaligned16(weights_base_ptr + left_offset);
1603     const __m128i weights_lo = cvtepu8_epi16(weights);
1604     const __m128i weights_hi = _mm_unpackhi_epi8(weights, zero);
1605     const __m128i inverted_weights_lo = _mm_sub_epi16(scale, weights_lo);
1606     const __m128i inverted_weights_hi = _mm_sub_epi16(scale, weights_hi);
1607     const __m128i scaled_bottom_left_lo =
1608         _mm_mullo_epi16(inverted_weights_lo, bottom_left);
1609     const __m128i scaled_bottom_left_hi =
1610         _mm_mullo_epi16(inverted_weights_hi, bottom_left);
1611 
1612     for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
1613       const __m128i y_select = _mm_set1_epi32(y_mask);
1614       const __m128i weights_y = _mm_shuffle_epi8(weights_lo, y_select);
1615       const __m128i scaled_bottom_left_y =
1616           _mm_shuffle_epi8(scaled_bottom_left_lo, y_select);
1617       write_smooth_directional_sum16(dst, top_lo, top_hi, weights_y, weights_y,
1618                                      scaled_bottom_left_y, scaled_bottom_left_y,
1619                                      round);
1620       dst += stride;
1621     }
1622     for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
1623       const __m128i y_select = _mm_set1_epi32(y_mask);
1624       const __m128i weights_y = _mm_shuffle_epi8(weights_hi, y_select);
1625       const __m128i scaled_bottom_left_y =
1626           _mm_shuffle_epi8(scaled_bottom_left_hi, y_select);
1627       write_smooth_directional_sum16(dst, top_lo, top_hi, weights_y, weights_y,
1628                                      scaled_bottom_left_y, scaled_bottom_left_y,
1629                                      round);
1630       dst += stride;
1631     }
1632   }
1633 }
1634 
1635 void aom_smooth_v_predictor_32x8_ssse3(
1636     uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
1637     const uint8_t *LIBAOM_RESTRICT top_row,
1638     const uint8_t *LIBAOM_RESTRICT left_column) {
1639   const __m128i zero = _mm_setzero_si128();
1640   const __m128i bottom_left = _mm_set1_epi16(left_column[7]);
1641   const __m128i top_lo = LoadUnaligned16(top_row);
1642   const __m128i top_hi = LoadUnaligned16(top_row + 16);
1643   const __m128i top1 = cvtepu8_epi16(top_lo);
1644   const __m128i top2 = _mm_unpackhi_epi8(top_lo, zero);
1645   const __m128i top3 = cvtepu8_epi16(top_hi);
1646   const __m128i top4 = _mm_unpackhi_epi8(top_hi, zero);
1647   const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
1648   const __m128i weights = cvtepu8_epi16(LoadLo8(smooth_weights + 4));
1649   const __m128i inverted_weights = _mm_sub_epi16(scale, weights);
1650   const __m128i scaled_bottom_left =
1651       _mm_mullo_epi16(inverted_weights, bottom_left);
1652   const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1));
1653   for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
1654     __m128i y_select = _mm_set1_epi32(y_mask);
1655     const __m128i weights_y = _mm_shuffle_epi8(weights, y_select);
1656     const __m128i scaled_bottom_left_y =
1657         _mm_shuffle_epi8(scaled_bottom_left, y_select);
1658     write_smooth_directional_sum16(dst, top1, top2, weights_y, weights_y,
1659                                    scaled_bottom_left_y, scaled_bottom_left_y,
1660                                    round);
1661     write_smooth_directional_sum16(dst + 16, top3, top4, weights_y, weights_y,
1662                                    scaled_bottom_left_y, scaled_bottom_left_y,
1663                                    round);
1664     dst += stride;
1665   }
1666 }
1667 #endif  // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
1668 
1669 void aom_smooth_v_predictor_32x16_ssse3(
1670     uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
1671     const uint8_t *LIBAOM_RESTRICT top_row,
1672     const uint8_t *LIBAOM_RESTRICT left_column) {
1673   const __m128i zero = _mm_setzero_si128();
1674   const __m128i bottom_left = _mm_set1_epi16(left_column[15]);
1675   const __m128i top_lo = LoadUnaligned16(top_row);
1676   const __m128i top_hi = LoadUnaligned16(top_row + 16);
1677   const __m128i top1 = cvtepu8_epi16(top_lo);
1678   const __m128i top2 = _mm_unpackhi_epi8(top_lo, zero);
1679   const __m128i top3 = cvtepu8_epi16(top_hi);
1680   const __m128i top4 = _mm_unpackhi_epi8(top_hi, zero);
1681   const __m128i weights = LoadUnaligned16(smooth_weights + 12);
1682   const __m128i weights1 = cvtepu8_epi16(weights);
1683   const __m128i weights2 = _mm_unpackhi_epi8(weights, zero);
1684   const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
1685   const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1);
1686   const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2);
1687   const __m128i scaled_bottom_left1 =
1688       _mm_mullo_epi16(inverted_weights1, bottom_left);
1689   const __m128i scaled_bottom_left2 =
1690       _mm_mullo_epi16(inverted_weights2, bottom_left);
1691   const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1));
1692   for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
1693     __m128i y_select = _mm_set1_epi32(y_mask);
1694     const __m128i weights_y = _mm_shuffle_epi8(weights1, y_select);
1695     const __m128i scaled_bottom_left_y =
1696         _mm_shuffle_epi8(scaled_bottom_left1, y_select);
1697     write_smooth_directional_sum16(dst, top1, top2, weights_y, weights_y,
1698                                    scaled_bottom_left_y, scaled_bottom_left_y,
1699                                    round);
1700     write_smooth_directional_sum16(dst + 16, top3, top4, weights_y, weights_y,
1701                                    scaled_bottom_left_y, scaled_bottom_left_y,
1702                                    round);
1703     dst += stride;
1704   }
1705   for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
1706     __m128i y_select = _mm_set1_epi32(y_mask);
1707     const __m128i weights_y = _mm_shuffle_epi8(weights2, y_select);
1708     const __m128i scaled_bottom_left_y =
1709         _mm_shuffle_epi8(scaled_bottom_left2, y_select);
1710     write_smooth_directional_sum16(dst, top1, top2, weights_y, weights_y,
1711                                    scaled_bottom_left_y, scaled_bottom_left_y,
1712                                    round);
1713     write_smooth_directional_sum16(dst + 16, top3, top4, weights_y, weights_y,
1714                                    scaled_bottom_left_y, scaled_bottom_left_y,
1715                                    round);
1716     dst += stride;
1717   }
1718 }
1719 
1720 void aom_smooth_v_predictor_32x32_ssse3(
1721     uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
1722     const uint8_t *LIBAOM_RESTRICT top_row,
1723     const uint8_t *LIBAOM_RESTRICT left_column) {
1724   const __m128i bottom_left = _mm_set1_epi16(left_column[31]);
1725   const __m128i weights_lo = LoadUnaligned16(smooth_weights + 28);
1726   const __m128i weights_hi = LoadUnaligned16(smooth_weights + 44);
1727   const __m128i zero = _mm_setzero_si128();
1728   const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
1729   const __m128i top_lo = LoadUnaligned16(top_row);
1730   const __m128i top_hi = LoadUnaligned16(top_row + 16);
1731   const __m128i top1 = cvtepu8_epi16(top_lo);
1732   const __m128i top2 = _mm_unpackhi_epi8(top_lo, zero);
1733   const __m128i top3 = cvtepu8_epi16(top_hi);
1734   const __m128i top4 = _mm_unpackhi_epi8(top_hi, zero);
1735   const __m128i weights1 = cvtepu8_epi16(weights_lo);
1736   const __m128i weights2 = _mm_unpackhi_epi8(weights_lo, zero);
1737   const __m128i weights3 = cvtepu8_epi16(weights_hi);
1738   const __m128i weights4 = _mm_unpackhi_epi8(weights_hi, zero);
1739   const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1);
1740   const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2);
1741   const __m128i inverted_weights3 = _mm_sub_epi16(scale, weights3);
1742   const __m128i inverted_weights4 = _mm_sub_epi16(scale, weights4);
1743   const __m128i scaled_bottom_left1 =
1744       _mm_mullo_epi16(inverted_weights1, bottom_left);
1745   const __m128i scaled_bottom_left2 =
1746       _mm_mullo_epi16(inverted_weights2, bottom_left);
1747   const __m128i scaled_bottom_left3 =
1748       _mm_mullo_epi16(inverted_weights3, bottom_left);
1749   const __m128i scaled_bottom_left4 =
1750       _mm_mullo_epi16(inverted_weights4, bottom_left);
1751   const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1));
1752   for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
1753     const __m128i y_select = _mm_set1_epi32(y_mask);
1754     const __m128i weights_y = _mm_shuffle_epi8(weights1, y_select);
1755     const __m128i scaled_bottom_left_y =
1756         _mm_shuffle_epi8(scaled_bottom_left1, y_select);
1757     write_smooth_directional_sum16(dst, top1, top2, weights_y, weights_y,
1758                                    scaled_bottom_left_y, scaled_bottom_left_y,
1759                                    round);
1760     write_smooth_directional_sum16(dst + 16, top3, top4, weights_y, weights_y,
1761                                    scaled_bottom_left_y, scaled_bottom_left_y,
1762                                    round);
1763     dst += stride;
1764   }
1765   for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
1766     const __m128i y_select = _mm_set1_epi32(y_mask);
1767     const __m128i weights_y = _mm_shuffle_epi8(weights2, y_select);
1768     const __m128i scaled_bottom_left_y =
1769         _mm_shuffle_epi8(scaled_bottom_left2, y_select);
1770     write_smooth_directional_sum16(dst, top1, top2, weights_y, weights_y,
1771                                    scaled_bottom_left_y, scaled_bottom_left_y,
1772                                    round);
1773     write_smooth_directional_sum16(dst + 16, top3, top4, weights_y, weights_y,
1774                                    scaled_bottom_left_y, scaled_bottom_left_y,
1775                                    round);
1776     dst += stride;
1777   }
1778   for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
1779     const __m128i y_select = _mm_set1_epi32(y_mask);
1780     const __m128i weights_y = _mm_shuffle_epi8(weights3, y_select);
1781     const __m128i scaled_bottom_left_y =
1782         _mm_shuffle_epi8(scaled_bottom_left3, y_select);
1783     write_smooth_directional_sum16(dst, top1, top2, weights_y, weights_y,
1784                                    scaled_bottom_left_y, scaled_bottom_left_y,
1785                                    round);
1786     write_smooth_directional_sum16(dst + 16, top3, top4, weights_y, weights_y,
1787                                    scaled_bottom_left_y, scaled_bottom_left_y,
1788                                    round);
1789     dst += stride;
1790   }
1791   for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
1792     const __m128i y_select = _mm_set1_epi32(y_mask);
1793     const __m128i weights_y = _mm_shuffle_epi8(weights4, y_select);
1794     const __m128i scaled_bottom_left_y =
1795         _mm_shuffle_epi8(scaled_bottom_left4, y_select);
1796     write_smooth_directional_sum16(dst, top1, top2, weights_y, weights_y,
1797                                    scaled_bottom_left_y, scaled_bottom_left_y,
1798                                    round);
1799     write_smooth_directional_sum16(dst + 16, top3, top4, weights_y, weights_y,
1800                                    scaled_bottom_left_y, scaled_bottom_left_y,
1801                                    round);
1802     dst += stride;
1803   }
1804 }
1805 
1806 void aom_smooth_v_predictor_32x64_ssse3(
1807     uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
1808     const uint8_t *LIBAOM_RESTRICT top_row,
1809     const uint8_t *LIBAOM_RESTRICT left_column) {
1810   const __m128i zero = _mm_setzero_si128();
1811   const __m128i bottom_left = _mm_set1_epi16(left_column[63]);
1812   const __m128i top_lo = LoadUnaligned16(top_row);
1813   const __m128i top_hi = LoadUnaligned16(top_row + 16);
1814   const __m128i top1 = cvtepu8_epi16(top_lo);
1815   const __m128i top2 = _mm_unpackhi_epi8(top_lo, zero);
1816   const __m128i top3 = cvtepu8_epi16(top_hi);
1817   const __m128i top4 = _mm_unpackhi_epi8(top_hi, zero);
1818   const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
1819   const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1));
1820   const uint8_t *weights_base_ptr = smooth_weights + 60;
1821   for (int left_offset = 0; left_offset < 64; left_offset += 16) {
1822     const __m128i weights = LoadUnaligned16(weights_base_ptr + left_offset);
1823     const __m128i weights_lo = cvtepu8_epi16(weights);
1824     const __m128i weights_hi = _mm_unpackhi_epi8(weights, zero);
1825     const __m128i inverted_weights_lo = _mm_sub_epi16(scale, weights_lo);
1826     const __m128i inverted_weights_hi = _mm_sub_epi16(scale, weights_hi);
1827     const __m128i scaled_bottom_left_lo =
1828         _mm_mullo_epi16(inverted_weights_lo, bottom_left);
1829     const __m128i scaled_bottom_left_hi =
1830         _mm_mullo_epi16(inverted_weights_hi, bottom_left);
1831 
1832     for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
1833       const __m128i y_select = _mm_set1_epi32(y_mask);
1834       const __m128i weights_y = _mm_shuffle_epi8(weights_lo, y_select);
1835       const __m128i scaled_bottom_left_y =
1836           _mm_shuffle_epi8(scaled_bottom_left_lo, y_select);
1837       write_smooth_directional_sum16(dst, top1, top2, weights_y, weights_y,
1838                                      scaled_bottom_left_y, scaled_bottom_left_y,
1839                                      round);
1840       write_smooth_directional_sum16(dst + 16, top3, top4, weights_y, weights_y,
1841                                      scaled_bottom_left_y, scaled_bottom_left_y,
1842                                      round);
1843       dst += stride;
1844     }
1845     for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
1846       const __m128i y_select = _mm_set1_epi32(y_mask);
1847       const __m128i weights_y = _mm_shuffle_epi8(weights_hi, y_select);
1848       const __m128i scaled_bottom_left_y =
1849           _mm_shuffle_epi8(scaled_bottom_left_hi, y_select);
1850       write_smooth_directional_sum16(dst, top1, top2, weights_y, weights_y,
1851                                      scaled_bottom_left_y, scaled_bottom_left_y,
1852                                      round);
1853       write_smooth_directional_sum16(dst + 16, top3, top4, weights_y, weights_y,
1854                                      scaled_bottom_left_y, scaled_bottom_left_y,
1855                                      round);
1856       dst += stride;
1857     }
1858   }
1859 }
1860 
1861 #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
1862 void aom_smooth_v_predictor_64x16_ssse3(
1863     uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
1864     const uint8_t *LIBAOM_RESTRICT top_row,
1865     const uint8_t *LIBAOM_RESTRICT left_column) {
1866   const __m128i bottom_left = _mm_set1_epi16(left_column[15]);
1867   const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
1868   const __m128i zero = _mm_setzero_si128();
1869   const __m128i top_lolo = LoadUnaligned16(top_row);
1870   const __m128i top_lohi = LoadUnaligned16(top_row + 16);
1871   const __m128i top1 = cvtepu8_epi16(top_lolo);
1872   const __m128i top2 = _mm_unpackhi_epi8(top_lolo, zero);
1873   const __m128i top3 = cvtepu8_epi16(top_lohi);
1874   const __m128i top4 = _mm_unpackhi_epi8(top_lohi, zero);
1875 
1876   const __m128i weights = LoadUnaligned16(smooth_weights + 12);
1877   const __m128i weights1 = cvtepu8_epi16(weights);
1878   const __m128i weights2 = _mm_unpackhi_epi8(weights, zero);
1879   const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1);
1880   const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2);
1881   const __m128i top_hilo = LoadUnaligned16(top_row + 32);
1882   const __m128i top_hihi = LoadUnaligned16(top_row + 48);
1883   const __m128i top5 = cvtepu8_epi16(top_hilo);
1884   const __m128i top6 = _mm_unpackhi_epi8(top_hilo, zero);
1885   const __m128i top7 = cvtepu8_epi16(top_hihi);
1886   const __m128i top8 = _mm_unpackhi_epi8(top_hihi, zero);
1887   const __m128i scaled_bottom_left1 =
1888       _mm_mullo_epi16(inverted_weights1, bottom_left);
1889   const __m128i scaled_bottom_left2 =
1890       _mm_mullo_epi16(inverted_weights2, bottom_left);
1891   const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1));
1892   for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
1893     const __m128i y_select = _mm_set1_epi32(y_mask);
1894     const __m128i weights_y = _mm_shuffle_epi8(weights1, y_select);
1895     const __m128i scaled_bottom_left_y =
1896         _mm_shuffle_epi8(scaled_bottom_left1, y_select);
1897     write_smooth_directional_sum16(dst, top1, top2, weights_y, weights_y,
1898                                    scaled_bottom_left_y, scaled_bottom_left_y,
1899                                    round);
1900     write_smooth_directional_sum16(dst + 16, top3, top4, weights_y, weights_y,
1901                                    scaled_bottom_left_y, scaled_bottom_left_y,
1902                                    round);
1903     write_smooth_directional_sum16(dst + 32, top5, top6, weights_y, weights_y,
1904                                    scaled_bottom_left_y, scaled_bottom_left_y,
1905                                    round);
1906     write_smooth_directional_sum16(dst + 48, top7, top8, weights_y, weights_y,
1907                                    scaled_bottom_left_y, scaled_bottom_left_y,
1908                                    round);
1909     dst += stride;
1910   }
1911   for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
1912     const __m128i y_select = _mm_set1_epi32(y_mask);
1913     const __m128i weights_y = _mm_shuffle_epi8(weights2, y_select);
1914     const __m128i scaled_bottom_left_y =
1915         _mm_shuffle_epi8(scaled_bottom_left2, y_select);
1916     write_smooth_directional_sum16(dst, top1, top2, weights_y, weights_y,
1917                                    scaled_bottom_left_y, scaled_bottom_left_y,
1918                                    round);
1919     write_smooth_directional_sum16(dst + 16, top3, top4, weights_y, weights_y,
1920                                    scaled_bottom_left_y, scaled_bottom_left_y,
1921                                    round);
1922     write_smooth_directional_sum16(dst + 32, top5, top6, weights_y, weights_y,
1923                                    scaled_bottom_left_y, scaled_bottom_left_y,
1924                                    round);
1925     write_smooth_directional_sum16(dst + 48, top7, top8, weights_y, weights_y,
1926                                    scaled_bottom_left_y, scaled_bottom_left_y,
1927                                    round);
1928     dst += stride;
1929   }
1930 }
1931 #endif  // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
1932 
1933 void aom_smooth_v_predictor_64x32_ssse3(
1934     uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
1935     const uint8_t *LIBAOM_RESTRICT top_row,
1936     const uint8_t *LIBAOM_RESTRICT left_column) {
1937   const __m128i zero = _mm_setzero_si128();
1938   const __m128i bottom_left = _mm_set1_epi16(left_column[31]);
1939   const __m128i top_lolo = LoadUnaligned16(top_row);
1940   const __m128i top_lohi = LoadUnaligned16(top_row + 16);
1941   const __m128i top1 = cvtepu8_epi16(top_lolo);
1942   const __m128i top2 = _mm_unpackhi_epi8(top_lolo, zero);
1943   const __m128i top3 = cvtepu8_epi16(top_lohi);
1944   const __m128i top4 = _mm_unpackhi_epi8(top_lohi, zero);
1945   const __m128i top_hilo = LoadUnaligned16(top_row + 32);
1946   const __m128i top_hihi = LoadUnaligned16(top_row + 48);
1947   const __m128i top5 = cvtepu8_epi16(top_hilo);
1948   const __m128i top6 = _mm_unpackhi_epi8(top_hilo, zero);
1949   const __m128i top7 = cvtepu8_epi16(top_hihi);
1950   const __m128i top8 = _mm_unpackhi_epi8(top_hihi, zero);
1951   const __m128i weights_lo = LoadUnaligned16(smooth_weights + 28);
1952   const __m128i weights_hi = LoadUnaligned16(smooth_weights + 44);
1953   const __m128i weights1 = cvtepu8_epi16(weights_lo);
1954   const __m128i weights2 = _mm_unpackhi_epi8(weights_lo, zero);
1955   const __m128i weights3 = cvtepu8_epi16(weights_hi);
1956   const __m128i weights4 = _mm_unpackhi_epi8(weights_hi, zero);
1957   const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
1958   const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1);
1959   const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2);
1960   const __m128i inverted_weights3 = _mm_sub_epi16(scale, weights3);
1961   const __m128i inverted_weights4 = _mm_sub_epi16(scale, weights4);
1962   const __m128i scaled_bottom_left1 =
1963       _mm_mullo_epi16(inverted_weights1, bottom_left);
1964   const __m128i scaled_bottom_left2 =
1965       _mm_mullo_epi16(inverted_weights2, bottom_left);
1966   const __m128i scaled_bottom_left3 =
1967       _mm_mullo_epi16(inverted_weights3, bottom_left);
1968   const __m128i scaled_bottom_left4 =
1969       _mm_mullo_epi16(inverted_weights4, bottom_left);
1970   const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1));
1971 
1972   for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
1973     const __m128i y_select = _mm_set1_epi32(y_mask);
1974     const __m128i weights_y = _mm_shuffle_epi8(weights1, y_select);
1975     const __m128i scaled_bottom_left_y =
1976         _mm_shuffle_epi8(scaled_bottom_left1, y_select);
1977     write_smooth_directional_sum16(dst, top1, top2, weights_y, weights_y,
1978                                    scaled_bottom_left_y, scaled_bottom_left_y,
1979                                    round);
1980     write_smooth_directional_sum16(dst + 16, top3, top4, weights_y, weights_y,
1981                                    scaled_bottom_left_y, scaled_bottom_left_y,
1982                                    round);
1983     write_smooth_directional_sum16(dst + 32, top5, top6, weights_y, weights_y,
1984                                    scaled_bottom_left_y, scaled_bottom_left_y,
1985                                    round);
1986     write_smooth_directional_sum16(dst + 48, top7, top8, weights_y, weights_y,
1987                                    scaled_bottom_left_y, scaled_bottom_left_y,
1988                                    round);
1989     dst += stride;
1990   }
1991   for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
1992     const __m128i y_select = _mm_set1_epi32(y_mask);
1993     const __m128i weights_y = _mm_shuffle_epi8(weights2, y_select);
1994     const __m128i scaled_bottom_left_y =
1995         _mm_shuffle_epi8(scaled_bottom_left2, y_select);
1996     write_smooth_directional_sum16(dst, top1, top2, weights_y, weights_y,
1997                                    scaled_bottom_left_y, scaled_bottom_left_y,
1998                                    round);
1999     write_smooth_directional_sum16(dst + 16, top3, top4, weights_y, weights_y,
2000                                    scaled_bottom_left_y, scaled_bottom_left_y,
2001                                    round);
2002     write_smooth_directional_sum16(dst + 32, top5, top6, weights_y, weights_y,
2003                                    scaled_bottom_left_y, scaled_bottom_left_y,
2004                                    round);
2005     write_smooth_directional_sum16(dst + 48, top7, top8, weights_y, weights_y,
2006                                    scaled_bottom_left_y, scaled_bottom_left_y,
2007                                    round);
2008     dst += stride;
2009   }
2010   for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
2011     const __m128i y_select = _mm_set1_epi32(y_mask);
2012     const __m128i weights_y = _mm_shuffle_epi8(weights3, y_select);
2013     const __m128i scaled_bottom_left_y =
2014         _mm_shuffle_epi8(scaled_bottom_left3, y_select);
2015     write_smooth_directional_sum16(dst, top1, top2, weights_y, weights_y,
2016                                    scaled_bottom_left_y, scaled_bottom_left_y,
2017                                    round);
2018     write_smooth_directional_sum16(dst + 16, top3, top4, weights_y, weights_y,
2019                                    scaled_bottom_left_y, scaled_bottom_left_y,
2020                                    round);
2021     write_smooth_directional_sum16(dst + 32, top5, top6, weights_y, weights_y,
2022                                    scaled_bottom_left_y, scaled_bottom_left_y,
2023                                    round);
2024     write_smooth_directional_sum16(dst + 48, top7, top8, weights_y, weights_y,
2025                                    scaled_bottom_left_y, scaled_bottom_left_y,
2026                                    round);
2027     dst += stride;
2028   }
2029   for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
2030     const __m128i y_select = _mm_set1_epi32(y_mask);
2031     const __m128i weights_y = _mm_shuffle_epi8(weights4, y_select);
2032     const __m128i scaled_bottom_left_y =
2033         _mm_shuffle_epi8(scaled_bottom_left4, y_select);
2034     write_smooth_directional_sum16(dst, top1, top2, weights_y, weights_y,
2035                                    scaled_bottom_left_y, scaled_bottom_left_y,
2036                                    round);
2037     write_smooth_directional_sum16(dst + 16, top3, top4, weights_y, weights_y,
2038                                    scaled_bottom_left_y, scaled_bottom_left_y,
2039                                    round);
2040     write_smooth_directional_sum16(dst + 32, top5, top6, weights_y, weights_y,
2041                                    scaled_bottom_left_y, scaled_bottom_left_y,
2042                                    round);
2043     write_smooth_directional_sum16(dst + 48, top7, top8, weights_y, weights_y,
2044                                    scaled_bottom_left_y, scaled_bottom_left_y,
2045                                    round);
2046     dst += stride;
2047   }
2048 }
2049 
2050 void aom_smooth_v_predictor_64x64_ssse3(
2051     uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
2052     const uint8_t *LIBAOM_RESTRICT top_row,
2053     const uint8_t *LIBAOM_RESTRICT left_column) {
2054   const __m128i zero = _mm_setzero_si128();
2055   const __m128i bottom_left = _mm_set1_epi16(left_column[63]);
2056   const __m128i top_lolo = LoadUnaligned16(top_row);
2057   const __m128i top_lohi = LoadUnaligned16(top_row + 16);
2058   const __m128i top1 = cvtepu8_epi16(top_lolo);
2059   const __m128i top2 = _mm_unpackhi_epi8(top_lolo, zero);
2060   const __m128i top3 = cvtepu8_epi16(top_lohi);
2061   const __m128i top4 = _mm_unpackhi_epi8(top_lohi, zero);
2062   const __m128i top_hilo = LoadUnaligned16(top_row + 32);
2063   const __m128i top_hihi = LoadUnaligned16(top_row + 48);
2064   const __m128i top5 = cvtepu8_epi16(top_hilo);
2065   const __m128i top6 = _mm_unpackhi_epi8(top_hilo, zero);
2066   const __m128i top7 = cvtepu8_epi16(top_hihi);
2067   const __m128i top8 = _mm_unpackhi_epi8(top_hihi, zero);
2068   const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
2069   const __m128i round = _mm_set1_epi16(128);
2070   const uint8_t *weights_base_ptr = smooth_weights + 60;
2071   for (int left_offset = 0; left_offset < 64; left_offset += 16) {
2072     const __m128i weights = LoadUnaligned16(weights_base_ptr + left_offset);
2073     const __m128i weights_lo = cvtepu8_epi16(weights);
2074     const __m128i weights_hi = _mm_unpackhi_epi8(weights, zero);
2075     const __m128i inverted_weights_lo = _mm_sub_epi16(scale, weights_lo);
2076     const __m128i inverted_weights_hi = _mm_sub_epi16(scale, weights_hi);
2077     const __m128i scaled_bottom_left_lo =
2078         _mm_mullo_epi16(inverted_weights_lo, bottom_left);
2079     const __m128i scaled_bottom_left_hi =
2080         _mm_mullo_epi16(inverted_weights_hi, bottom_left);
2081     for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
2082       const __m128i y_select = _mm_set1_epi32(y_mask);
2083       const __m128i weights_y = _mm_shuffle_epi8(weights_lo, y_select);
2084       const __m128i scaled_bottom_left_y =
2085           _mm_shuffle_epi8(scaled_bottom_left_lo, y_select);
2086       write_smooth_directional_sum16(dst, top1, top2, weights_y, weights_y,
2087                                      scaled_bottom_left_y, scaled_bottom_left_y,
2088                                      round);
2089       write_smooth_directional_sum16(dst + 16, top3, top4, weights_y, weights_y,
2090                                      scaled_bottom_left_y, scaled_bottom_left_y,
2091                                      round);
2092       write_smooth_directional_sum16(dst + 32, top5, top6, weights_y, weights_y,
2093                                      scaled_bottom_left_y, scaled_bottom_left_y,
2094                                      round);
2095       write_smooth_directional_sum16(dst + 48, top7, top8, weights_y, weights_y,
2096                                      scaled_bottom_left_y, scaled_bottom_left_y,
2097                                      round);
2098       dst += stride;
2099     }
2100     for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
2101       const __m128i y_select = _mm_set1_epi32(y_mask);
2102       const __m128i weights_y = _mm_shuffle_epi8(weights_hi, y_select);
2103       const __m128i scaled_bottom_left_y =
2104           _mm_shuffle_epi8(scaled_bottom_left_hi, y_select);
2105       write_smooth_directional_sum16(dst, top1, top2, weights_y, weights_y,
2106                                      scaled_bottom_left_y, scaled_bottom_left_y,
2107                                      round);
2108       write_smooth_directional_sum16(dst + 16, top3, top4, weights_y, weights_y,
2109                                      scaled_bottom_left_y, scaled_bottom_left_y,
2110                                      round);
2111       write_smooth_directional_sum16(dst + 32, top5, top6, weights_y, weights_y,
2112                                      scaled_bottom_left_y, scaled_bottom_left_y,
2113                                      round);
2114       write_smooth_directional_sum16(dst + 48, top7, top8, weights_y, weights_y,
2115                                      scaled_bottom_left_y, scaled_bottom_left_y,
2116                                      round);
2117       dst += stride;
2118     }
2119   }
2120 }
2121 
2122 // -----------------------------------------------------------------------------
2123 // SMOOTH_H_PRED
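//
// A scalar sketch of the arithmetic every SSSE3 path below performs, for a
// block of width |bw| and height |bh| (|bw|, |bh| and |w| are illustrative
// names only; SMOOTH_WEIGHT_LOG2_SCALE is 8, matching the shifts below, and
// the per-column weights for an N-wide block start at smooth_weights[N - 4]):
//
//   const uint8_t *w = smooth_weights + bw - 4;
//   const uint8_t top_right = top_row[bw - 1];
//   for (int y = 0; y < bh; ++y)
//     for (int x = 0; x < bw; ++x)
//       dst[y * stride + x] =
//           (w[x] * left_column[y] + (256 - w[x]) * top_right + 128) >> 8;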
2124 static AOM_FORCE_INLINE void write_smooth_horizontal_sum4(
2125     uint8_t *LIBAOM_RESTRICT dst, const __m128i *left_y, const __m128i *weights,
2126     const __m128i *scaled_top_right, const __m128i *round) {
2127   const __m128i weighted_left_y = _mm_mullo_epi16(*left_y, *weights);
2128   const __m128i pred_sum = _mm_add_epi32(*scaled_top_right, weighted_left_y);
2129   // Equivalent to RightShiftWithRounding(pred[x][y], 8).
2130   const __m128i pred = _mm_srli_epi32(_mm_add_epi32(pred_sum, *round), 8);
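  // 0x0C080400 selects the low byte of each 32-bit lane (bytes 0, 4, 8, 12),
  // packing the four per-pixel results into the 4 bytes written by Store4.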
2131   const __m128i cvtepi32_epi8 = _mm_set1_epi32(0x0C080400);
2132   Store4(dst, _mm_shuffle_epi8(pred, cvtepi32_epi8));
2133 }
2134 
2135 void aom_smooth_h_predictor_4x4_ssse3(
2136     uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
2137     const uint8_t *LIBAOM_RESTRICT top_row,
2138     const uint8_t *LIBAOM_RESTRICT left_column) {
2139   const __m128i top_right = _mm_set1_epi32(top_row[3]);
2140   const __m128i left = cvtepu8_epi32(Load4(left_column));
2141   const __m128i weights = cvtepu8_epi32(Load4(smooth_weights));
2142   const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
2143   const __m128i inverted_weights = _mm_sub_epi32(scale, weights);
2144   const __m128i scaled_top_right = _mm_mullo_epi16(inverted_weights, top_right);
2145   const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1));
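  // _mm_shuffle_epi32 with 0x00, 0x55, 0xaa and 0xff broadcasts left[0],
  // left[1], left[2] and left[3] in turn, one row per call below.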
2146   __m128i left_y = _mm_shuffle_epi32(left, 0);
2147   write_smooth_horizontal_sum4(dst, &left_y, &weights, &scaled_top_right,
2148                                &round);
2149   dst += stride;
2150   left_y = _mm_shuffle_epi32(left, 0x55);
2151   write_smooth_horizontal_sum4(dst, &left_y, &weights, &scaled_top_right,
2152                                &round);
2153   dst += stride;
2154   left_y = _mm_shuffle_epi32(left, 0xaa);
2155   write_smooth_horizontal_sum4(dst, &left_y, &weights, &scaled_top_right,
2156                                &round);
2157   dst += stride;
2158   left_y = _mm_shuffle_epi32(left, 0xff);
2159   write_smooth_horizontal_sum4(dst, &left_y, &weights, &scaled_top_right,
2160                                &round);
2161 }
2162 
2163 void aom_smooth_h_predictor_4x8_ssse3(
2164     uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
2165     const uint8_t *LIBAOM_RESTRICT top_row,
2166     const uint8_t *LIBAOM_RESTRICT left_column) {
2167   const __m128i top_right = _mm_set1_epi32(top_row[3]);
2168   const __m128i weights = cvtepu8_epi32(Load4(smooth_weights));
2169   const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
2170   const __m128i inverted_weights = _mm_sub_epi32(scale, weights);
2171   const __m128i scaled_top_right = _mm_mullo_epi16(inverted_weights, top_right);
2172   const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1));
2173   __m128i left = cvtepu8_epi32(Load4(left_column));
2174   __m128i left_y = _mm_shuffle_epi32(left, 0);
2175   write_smooth_horizontal_sum4(dst, &left_y, &weights, &scaled_top_right,
2176                                &round);
2177   dst += stride;
2178   left_y = _mm_shuffle_epi32(left, 0x55);
2179   write_smooth_horizontal_sum4(dst, &left_y, &weights, &scaled_top_right,
2180                                &round);
2181   dst += stride;
2182   left_y = _mm_shuffle_epi32(left, 0xaa);
2183   write_smooth_horizontal_sum4(dst, &left_y, &weights, &scaled_top_right,
2184                                &round);
2185   dst += stride;
2186   left_y = _mm_shuffle_epi32(left, 0xff);
2187   write_smooth_horizontal_sum4(dst, &left_y, &weights, &scaled_top_right,
2188                                &round);
2189   dst += stride;
2190 
2191   left = cvtepu8_epi32(Load4(left_column + 4));
2192   left_y = _mm_shuffle_epi32(left, 0);
2193   write_smooth_horizontal_sum4(dst, &left_y, &weights, &scaled_top_right,
2194                                &round);
2195   dst += stride;
2196   left_y = _mm_shuffle_epi32(left, 0x55);
2197   write_smooth_horizontal_sum4(dst, &left_y, &weights, &scaled_top_right,
2198                                &round);
2199   dst += stride;
2200   left_y = _mm_shuffle_epi32(left, 0xaa);
2201   write_smooth_horizontal_sum4(dst, &left_y, &weights, &scaled_top_right,
2202                                &round);
2203   dst += stride;
2204   left_y = _mm_shuffle_epi32(left, 0xff);
2205   write_smooth_horizontal_sum4(dst, &left_y, &weights, &scaled_top_right,
2206                                &round);
2207 }
2208 
2209 #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
2210 void aom_smooth_h_predictor_4x16_ssse3(
2211     uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
2212     const uint8_t *LIBAOM_RESTRICT top_row,
2213     const uint8_t *LIBAOM_RESTRICT left_column) {
2214   const __m128i top_right = _mm_set1_epi32(top_row[3]);
2215   const __m128i weights = cvtepu8_epi32(Load4(smooth_weights));
2216   const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
2217   const __m128i inverted_weights = _mm_sub_epi32(scale, weights);
2218   const __m128i scaled_top_right = _mm_mullo_epi16(inverted_weights, top_right);
2219   const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1));
2220   __m128i left = cvtepu8_epi32(Load4(left_column));
2221   __m128i left_y = _mm_shuffle_epi32(left, 0);
2222   write_smooth_horizontal_sum4(dst, &left_y, &weights, &scaled_top_right,
2223                                &round);
2224   dst += stride;
2225   left_y = _mm_shuffle_epi32(left, 0x55);
2226   write_smooth_horizontal_sum4(dst, &left_y, &weights, &scaled_top_right,
2227                                &round);
2228   dst += stride;
2229   left_y = _mm_shuffle_epi32(left, 0xaa);
2230   write_smooth_horizontal_sum4(dst, &left_y, &weights, &scaled_top_right,
2231                                &round);
2232   dst += stride;
2233   left_y = _mm_shuffle_epi32(left, 0xff);
2234   write_smooth_horizontal_sum4(dst, &left_y, &weights, &scaled_top_right,
2235                                &round);
2236   dst += stride;
2237 
2238   left = cvtepu8_epi32(Load4(left_column + 4));
2239   left_y = _mm_shuffle_epi32(left, 0);
2240   write_smooth_horizontal_sum4(dst, &left_y, &weights, &scaled_top_right,
2241                                &round);
2242   dst += stride;
2243   left_y = _mm_shuffle_epi32(left, 0x55);
2244   write_smooth_horizontal_sum4(dst, &left_y, &weights, &scaled_top_right,
2245                                &round);
2246   dst += stride;
2247   left_y = _mm_shuffle_epi32(left, 0xaa);
2248   write_smooth_horizontal_sum4(dst, &left_y, &weights, &scaled_top_right,
2249                                &round);
2250   dst += stride;
2251   left_y = _mm_shuffle_epi32(left, 0xff);
2252   write_smooth_horizontal_sum4(dst, &left_y, &weights, &scaled_top_right,
2253                                &round);
2254   dst += stride;
2255 
2256   left = cvtepu8_epi32(Load4(left_column + 8));
2257   left_y = _mm_shuffle_epi32(left, 0);
2258   write_smooth_horizontal_sum4(dst, &left_y, &weights, &scaled_top_right,
2259                                &round);
2260   dst += stride;
2261   left_y = _mm_shuffle_epi32(left, 0x55);
2262   write_smooth_horizontal_sum4(dst, &left_y, &weights, &scaled_top_right,
2263                                &round);
2264   dst += stride;
2265   left_y = _mm_shuffle_epi32(left, 0xaa);
2266   write_smooth_horizontal_sum4(dst, &left_y, &weights, &scaled_top_right,
2267                                &round);
2268   dst += stride;
2269   left_y = _mm_shuffle_epi32(left, 0xff);
2270   write_smooth_horizontal_sum4(dst, &left_y, &weights, &scaled_top_right,
2271                                &round);
2272   dst += stride;
2273 
2274   left = cvtepu8_epi32(Load4(left_column + 12));
2275   left_y = _mm_shuffle_epi32(left, 0);
2276   write_smooth_horizontal_sum4(dst, &left_y, &weights, &scaled_top_right,
2277                                &round);
2278   dst += stride;
2279   left_y = _mm_shuffle_epi32(left, 0x55);
2280   write_smooth_horizontal_sum4(dst, &left_y, &weights, &scaled_top_right,
2281                                &round);
2282   dst += stride;
2283   left_y = _mm_shuffle_epi32(left, 0xaa);
2284   write_smooth_horizontal_sum4(dst, &left_y, &weights, &scaled_top_right,
2285                                &round);
2286   dst += stride;
2287   left_y = _mm_shuffle_epi32(left, 0xff);
2288   write_smooth_horizontal_sum4(dst, &left_y, &weights, &scaled_top_right,
2289                                &round);
2290 }
2291 #endif  // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
2292 
2293 // For SMOOTH_H, |pixels| is the repeated left value for the row. For SMOOTH_V,
2294 // |pixels| is a segment of the top row or the whole top row, and |weights| is
2295 // repeated.
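// At the SMOOTH_H call sites below, |left_y| is passed as the pixels (one left
// pixel broadcast across the row), the weights vectors carry the per-column
// weights, and |scaled_top_right| supplies the pre-scaled corner contribution.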
2296 void aom_smooth_h_predictor_8x4_ssse3(
2297     uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
2298     const uint8_t *LIBAOM_RESTRICT top_row,
2299     const uint8_t *LIBAOM_RESTRICT left_column) {
2300   const __m128i top_right = _mm_set1_epi16(top_row[7]);
2301   const __m128i left = cvtepu8_epi16(Load4(left_column));
2302   const __m128i weights = cvtepu8_epi16(LoadLo8(smooth_weights + 4));
2303   const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
2304   const __m128i inverted_weights = _mm_sub_epi16(scale, weights);
2305   const __m128i scaled_top_right = _mm_mullo_epi16(inverted_weights, top_right);
2306   const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1));
2307   __m128i y_select = _mm_set1_epi32(0x01000100);
2308   __m128i left_y = _mm_shuffle_epi8(left, y_select);
2309   write_smooth_directional_sum8(dst, &left_y, &weights, &scaled_top_right,
2310                                 &round);
2311   dst += stride;
2312   y_select = _mm_set1_epi32(0x03020302);
2313   left_y = _mm_shuffle_epi8(left, y_select);
2314   write_smooth_directional_sum8(dst, &left_y, &weights, &scaled_top_right,
2315                                 &round);
2316   dst += stride;
2317   y_select = _mm_set1_epi32(0x05040504);
2318   left_y = _mm_shuffle_epi8(left, y_select);
2319   write_smooth_directional_sum8(dst, &left_y, &weights, &scaled_top_right,
2320                                 &round);
2321   dst += stride;
2322   y_select = _mm_set1_epi32(0x07060706);
2323   left_y = _mm_shuffle_epi8(left, y_select);
2324   write_smooth_directional_sum8(dst, &left_y, &weights, &scaled_top_right,
2325                                 &round);
2326 }
2327 
2328 void aom_smooth_h_predictor_8x8_ssse3(
2329     uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
2330     const uint8_t *LIBAOM_RESTRICT top_row,
2331     const uint8_t *LIBAOM_RESTRICT left_column) {
2332   const __m128i top_right = _mm_set1_epi16(top_row[7]);
2333   const __m128i left = cvtepu8_epi16(LoadLo8(left_column));
2334   const __m128i weights = cvtepu8_epi16(LoadLo8(smooth_weights + 4));
2335   const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
2336   const __m128i inverted_weights = _mm_sub_epi16(scale, weights);
2337   const __m128i scaled_top_right = _mm_mullo_epi16(inverted_weights, top_right);
2338   const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1));
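  // y_mask walks the 16-bit lanes of |left|: 0x01000100 broadcasts left[0],
  // 0x03020302 left[1], ..., 0x0F0E0F0E left[7], producing the 8 rows.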
2339   for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
2340     const __m128i y_select = _mm_set1_epi32(y_mask);
2341     const __m128i left_y = _mm_shuffle_epi8(left, y_select);
2342     write_smooth_directional_sum8(dst, &left_y, &weights, &scaled_top_right,
2343                                   &round);
2344     dst += stride;
2345   }
2346 }
2347 
2348 void aom_smooth_h_predictor_8x16_ssse3(
2349     uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
2350     const uint8_t *LIBAOM_RESTRICT top_row,
2351     const uint8_t *LIBAOM_RESTRICT left_column) {
2352   const __m128i top_right = _mm_set1_epi16(top_row[7]);
2353   const __m128i weights = cvtepu8_epi16(LoadLo8(smooth_weights + 4));
2354   const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
2355   const __m128i inverted_weights = _mm_sub_epi16(scale, weights);
2356   const __m128i scaled_top_right = _mm_mullo_epi16(inverted_weights, top_right);
2357   const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1));
2358   __m128i left = cvtepu8_epi16(LoadLo8(left_column));
2359   for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
2360     const __m128i y_select = _mm_set1_epi32(y_mask);
2361     const __m128i left_y = _mm_shuffle_epi8(left, y_select);
2362     write_smooth_directional_sum8(dst, &left_y, &weights, &scaled_top_right,
2363                                   &round);
2364     dst += stride;
2365   }
2366   left = cvtepu8_epi16(LoadLo8(left_column + 8));
2367   for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
2368     const __m128i y_select = _mm_set1_epi32(y_mask);
2369     const __m128i left_y = _mm_shuffle_epi8(left, y_select);
2370     write_smooth_directional_sum8(dst, &left_y, &weights, &scaled_top_right,
2371                                   &round);
2372     dst += stride;
2373   }
2374 }
2375 
2376 #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
2377 void aom_smooth_h_predictor_8x32_ssse3(
2378     uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
2379     const uint8_t *LIBAOM_RESTRICT top_row,
2380     const uint8_t *LIBAOM_RESTRICT left_column) {
2381   const __m128i top_right = _mm_set1_epi16(top_row[7]);
2382   const __m128i weights = cvtepu8_epi16(LoadLo8(smooth_weights + 4));
2383   const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
2384   const __m128i inverted_weights = _mm_sub_epi16(scale, weights);
2385   const __m128i scaled_top_right = _mm_mullo_epi16(inverted_weights, top_right);
2386   const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1));
2387   __m128i left = cvtepu8_epi16(LoadLo8(left_column));
2388   for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
2389     const __m128i y_select = _mm_set1_epi32(y_mask);
2390     const __m128i left_y = _mm_shuffle_epi8(left, y_select);
2391     write_smooth_directional_sum8(dst, &left_y, &weights, &scaled_top_right,
2392                                   &round);
2393     dst += stride;
2394   }
2395   left = cvtepu8_epi16(LoadLo8(left_column + 8));
2396   for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
2397     const __m128i y_select = _mm_set1_epi32(y_mask);
2398     const __m128i left_y = _mm_shuffle_epi8(left, y_select);
2399     write_smooth_directional_sum8(dst, &left_y, &weights, &scaled_top_right,
2400                                   &round);
2401     dst += stride;
2402   }
2403   left = cvtepu8_epi16(LoadLo8(left_column + 16));
2404   for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
2405     const __m128i y_select = _mm_set1_epi32(y_mask);
2406     const __m128i left_y = _mm_shuffle_epi8(left, y_select);
2407     write_smooth_directional_sum8(dst, &left_y, &weights, &scaled_top_right,
2408                                   &round);
2409     dst += stride;
2410   }
2411   left = cvtepu8_epi16(LoadLo8(left_column + 24));
2412   for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
2413     const __m128i y_select = _mm_set1_epi32(y_mask);
2414     const __m128i left_y = _mm_shuffle_epi8(left, y_select);
2415     write_smooth_directional_sum8(dst, &left_y, &weights, &scaled_top_right,
2416                                   &round);
2417     dst += stride;
2418   }
2419 }
2420 
2421 void aom_smooth_h_predictor_16x4_ssse3(
2422     uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
2423     const uint8_t *LIBAOM_RESTRICT top_row,
2424     const uint8_t *LIBAOM_RESTRICT left_column) {
2425   const __m128i top_right = _mm_set1_epi16(top_row[15]);
2426   const __m128i left = cvtepu8_epi16(Load4(left_column));
2427   const __m128i weights = LoadUnaligned16(smooth_weights + 12);
2428   const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
2429   const __m128i weights1 = cvtepu8_epi16(weights);
2430   const __m128i weights2 = cvtepu8_epi16(_mm_srli_si128(weights, 8));
2431   const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1);
2432   const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2);
2433   const __m128i scaled_top_right1 =
2434       _mm_mullo_epi16(inverted_weights1, top_right);
2435   const __m128i scaled_top_right2 =
2436       _mm_mullo_epi16(inverted_weights2, top_right);
2437   const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1));
2438   __m128i y_mask = _mm_set1_epi32(0x01000100);
2439   __m128i left_y = _mm_shuffle_epi8(left, y_mask);
2440   write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2,
2441                                  scaled_top_right1, scaled_top_right2, round);
2442   dst += stride;
2443   y_mask = _mm_set1_epi32(0x03020302);
2444   left_y = _mm_shuffle_epi8(left, y_mask);
2445   write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2,
2446                                  scaled_top_right1, scaled_top_right2, round);
2447   dst += stride;
2448   y_mask = _mm_set1_epi32(0x05040504);
2449   left_y = _mm_shuffle_epi8(left, y_mask);
2450   write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2,
2451                                  scaled_top_right1, scaled_top_right2, round);
2452   dst += stride;
2453   y_mask = _mm_set1_epi32(0x07060706);
2454   left_y = _mm_shuffle_epi8(left, y_mask);
2455   write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2,
2456                                  scaled_top_right1, scaled_top_right2, round);
2457 }
2458 #endif  // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
2459 
2460 void aom_smooth_h_predictor_16x8_ssse3(
2461     uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
2462     const uint8_t *LIBAOM_RESTRICT top_row,
2463     const uint8_t *LIBAOM_RESTRICT left_column) {
2464   const __m128i top_right = _mm_set1_epi16(top_row[15]);
2465   const __m128i left = cvtepu8_epi16(LoadLo8(left_column));
2466   const __m128i weights = LoadUnaligned16(smooth_weights + 12);
2467   const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
2468   const __m128i weights1 = cvtepu8_epi16(weights);
2469   const __m128i weights2 = cvtepu8_epi16(_mm_srli_si128(weights, 8));
2470   const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1);
2471   const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2);
2472   const __m128i scaled_top_right1 =
2473       _mm_mullo_epi16(inverted_weights1, top_right);
2474   const __m128i scaled_top_right2 =
2475       _mm_mullo_epi16(inverted_weights2, top_right);
2476   const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1));
2477   for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
2478     const __m128i y_select = _mm_set1_epi32(y_mask);
2479     const __m128i left_y = _mm_shuffle_epi8(left, y_select);
2480     write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2,
2481                                    scaled_top_right1, scaled_top_right2, round);
2482     dst += stride;
2483   }
2484 }
2485 
2486 void aom_smooth_h_predictor_16x16_ssse3(
2487     uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
2488     const uint8_t *LIBAOM_RESTRICT top_row,
2489     const uint8_t *LIBAOM_RESTRICT left_column) {
2490   const __m128i top_right = _mm_set1_epi16(top_row[15]);
2491   const __m128i weights = LoadUnaligned16(smooth_weights + 12);
2492   const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
2493   const __m128i weights1 = cvtepu8_epi16(weights);
2494   const __m128i weights2 = cvtepu8_epi16(_mm_srli_si128(weights, 8));
2495   const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1);
2496   const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2);
2497   const __m128i scaled_top_right1 =
2498       _mm_mullo_epi16(inverted_weights1, top_right);
2499   const __m128i scaled_top_right2 =
2500       _mm_mullo_epi16(inverted_weights2, top_right);
2501   const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1));
2502   __m128i left = cvtepu8_epi16(LoadLo8(left_column));
2503   for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
2504     const __m128i y_select = _mm_set1_epi32(y_mask);
2505     const __m128i left_y = _mm_shuffle_epi8(left, y_select);
2506     write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2,
2507                                    scaled_top_right1, scaled_top_right2, round);
2508     dst += stride;
2509   }
2510   left = cvtepu8_epi16(LoadLo8(left_column + 8));
2511   for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
2512     const __m128i y_select = _mm_set1_epi32(y_mask);
2513     const __m128i left_y = _mm_shuffle_epi8(left, y_select);
2514     write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2,
2515                                    scaled_top_right1, scaled_top_right2, round);
2516     dst += stride;
2517   }
2518 }
2519 
2520 void aom_smooth_h_predictor_16x32_ssse3(
2521     uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
2522     const uint8_t *LIBAOM_RESTRICT top_row,
2523     const uint8_t *LIBAOM_RESTRICT left_column) {
2524   const __m128i top_right = _mm_set1_epi16(top_row[15]);
2525   const __m128i weights = LoadUnaligned16(smooth_weights + 12);
2526   const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
2527   const __m128i weights1 = cvtepu8_epi16(weights);
2528   const __m128i weights2 = cvtepu8_epi16(_mm_srli_si128(weights, 8));
2529   const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1);
2530   const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2);
2531   const __m128i scaled_top_right1 =
2532       _mm_mullo_epi16(inverted_weights1, top_right);
2533   const __m128i scaled_top_right2 =
2534       _mm_mullo_epi16(inverted_weights2, top_right);
2535   const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1));
2536   __m128i left = cvtepu8_epi16(LoadLo8(left_column));
2537   for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
2538     const __m128i y_select = _mm_set1_epi32(y_mask);
2539     const __m128i left_y = _mm_shuffle_epi8(left, y_select);
2540     write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2,
2541                                    scaled_top_right1, scaled_top_right2, round);
2542     dst += stride;
2543   }
2544   left = cvtepu8_epi16(LoadLo8(left_column + 8));
2545   for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
2546     const __m128i y_select = _mm_set1_epi32(y_mask);
2547     const __m128i left_y = _mm_shuffle_epi8(left, y_select);
2548     write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2,
2549                                    scaled_top_right1, scaled_top_right2, round);
2550     dst += stride;
2551   }
2552   left = cvtepu8_epi16(LoadLo8(left_column + 16));
2553   for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
2554     const __m128i y_select = _mm_set1_epi32(y_mask);
2555     const __m128i left_y = _mm_shuffle_epi8(left, y_select);
2556     write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2,
2557                                    scaled_top_right1, scaled_top_right2, round);
2558     dst += stride;
2559   }
2560   left = cvtepu8_epi16(LoadLo8(left_column + 24));
2561   for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
2562     const __m128i y_select = _mm_set1_epi32(y_mask);
2563     const __m128i left_y = _mm_shuffle_epi8(left, y_select);
2564     write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2,
2565                                    scaled_top_right1, scaled_top_right2, round);
2566     dst += stride;
2567   }
2568 }
2569 
2570 #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
2571 void aom_smooth_h_predictor_16x64_ssse3(
2572     uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
2573     const uint8_t *LIBAOM_RESTRICT top_row,
2574     const uint8_t *LIBAOM_RESTRICT left_column) {
2575   const __m128i top_right = _mm_set1_epi16(top_row[15]);
2576   const __m128i weights = LoadUnaligned16(smooth_weights + 12);
2577   const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
2578   const __m128i weights1 = cvtepu8_epi16(weights);
2579   const __m128i weights2 = cvtepu8_epi16(_mm_srli_si128(weights, 8));
2580   const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1);
2581   const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2);
2582   const __m128i scaled_top_right1 =
2583       _mm_mullo_epi16(inverted_weights1, top_right);
2584   const __m128i scaled_top_right2 =
2585       _mm_mullo_epi16(inverted_weights2, top_right);
2586   const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1));
2587   for (int left_offset = 0; left_offset < 64; left_offset += 8) {
2588     const __m128i left = cvtepu8_epi16(LoadLo8(left_column + left_offset));
2589     for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
2590       const __m128i y_select = _mm_set1_epi32(y_mask);
2591       const __m128i left_y = _mm_shuffle_epi8(left, y_select);
2592       write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2,
2593                                      scaled_top_right1, scaled_top_right2,
2594                                      round);
2595       dst += stride;
2596     }
2597   }
2598 }
2599 
2600 void aom_smooth_h_predictor_32x8_ssse3(
2601     uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
2602     const uint8_t *LIBAOM_RESTRICT top_row,
2603     const uint8_t *LIBAOM_RESTRICT left_column) {
2604   const __m128i top_right = _mm_set1_epi16(top_row[31]);
2605   const __m128i left = cvtepu8_epi16(LoadLo8(left_column));
2606   const __m128i weights_lo = LoadUnaligned16(smooth_weights + 28);
2607   const __m128i weights_hi = LoadUnaligned16(smooth_weights + 44);
2608   const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
2609   const __m128i weights1 = cvtepu8_epi16(weights_lo);
2610   const __m128i weights2 = cvtepu8_epi16(_mm_srli_si128(weights_lo, 8));
2611   const __m128i weights3 = cvtepu8_epi16(weights_hi);
2612   const __m128i weights4 = cvtepu8_epi16(_mm_srli_si128(weights_hi, 8));
2613   const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1);
2614   const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2);
2615   const __m128i inverted_weights3 = _mm_sub_epi16(scale, weights3);
2616   const __m128i inverted_weights4 = _mm_sub_epi16(scale, weights4);
2617   const __m128i scaled_top_right1 =
2618       _mm_mullo_epi16(inverted_weights1, top_right);
2619   const __m128i scaled_top_right2 =
2620       _mm_mullo_epi16(inverted_weights2, top_right);
2621   const __m128i scaled_top_right3 =
2622       _mm_mullo_epi16(inverted_weights3, top_right);
2623   const __m128i scaled_top_right4 =
2624       _mm_mullo_epi16(inverted_weights4, top_right);
2625   const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1));
2626   for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
2627     __m128i y_select = _mm_set1_epi32(y_mask);
2628     __m128i left_y = _mm_shuffle_epi8(left, y_select);
2629     write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2,
2630                                    scaled_top_right1, scaled_top_right2, round);
2631     write_smooth_directional_sum16(dst + 16, left_y, left_y, weights3, weights4,
2632                                    scaled_top_right3, scaled_top_right4, round);
2633     dst += stride;
2634   }
2635 }
2636 #endif  // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
2637 
2638 void aom_smooth_h_predictor_32x16_ssse3(
2639     uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
2640     const uint8_t *LIBAOM_RESTRICT top_row,
2641     const uint8_t *LIBAOM_RESTRICT left_column) {
2642   const __m128i top_right = _mm_set1_epi16(top_row[31]);
2643   const __m128i left1 = cvtepu8_epi16(LoadLo8(left_column));
2644   const __m128i weights_lo = LoadUnaligned16(smooth_weights + 28);
2645   const __m128i weights_hi = LoadUnaligned16(smooth_weights + 44);
2646   const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
2647   const __m128i weights1 = cvtepu8_epi16(weights_lo);
2648   const __m128i weights2 = cvtepu8_epi16(_mm_srli_si128(weights_lo, 8));
2649   const __m128i weights3 = cvtepu8_epi16(weights_hi);
2650   const __m128i weights4 = cvtepu8_epi16(_mm_srli_si128(weights_hi, 8));
2651   const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1);
2652   const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2);
2653   const __m128i inverted_weights3 = _mm_sub_epi16(scale, weights3);
2654   const __m128i inverted_weights4 = _mm_sub_epi16(scale, weights4);
2655   const __m128i scaled_top_right1 =
2656       _mm_mullo_epi16(inverted_weights1, top_right);
2657   const __m128i scaled_top_right2 =
2658       _mm_mullo_epi16(inverted_weights2, top_right);
2659   const __m128i scaled_top_right3 =
2660       _mm_mullo_epi16(inverted_weights3, top_right);
2661   const __m128i scaled_top_right4 =
2662       _mm_mullo_epi16(inverted_weights4, top_right);
2663   const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1));
2664   for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
2665     __m128i y_select = _mm_set1_epi32(y_mask);
2666     __m128i left_y = _mm_shuffle_epi8(left1, y_select);
2667     write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2,
2668                                    scaled_top_right1, scaled_top_right2, round);
2669     write_smooth_directional_sum16(dst + 16, left_y, left_y, weights3, weights4,
2670                                    scaled_top_right3, scaled_top_right4, round);
2671     dst += stride;
2672   }
2673   const __m128i left2 =
2674       cvtepu8_epi16(LoadLo8((const uint8_t *)left_column + 8));
2675   for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
2676     __m128i y_select = _mm_set1_epi32(y_mask);
2677     __m128i left_y = _mm_shuffle_epi8(left2, y_select);
2678     write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2,
2679                                    scaled_top_right1, scaled_top_right2, round);
2680     write_smooth_directional_sum16(dst + 16, left_y, left_y, weights3, weights4,
2681                                    scaled_top_right3, scaled_top_right4, round);
2682     dst += stride;
2683   }
2684 }
2685 
2686 void aom_smooth_h_predictor_32x32_ssse3(
2687     uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
2688     const uint8_t *LIBAOM_RESTRICT top_row,
2689     const uint8_t *LIBAOM_RESTRICT left_column) {
2690   const __m128i top_right = _mm_set1_epi16(top_row[31]);
2691   const __m128i weights_lo = LoadUnaligned16(smooth_weights + 28);
2692   const __m128i weights_hi = LoadUnaligned16(smooth_weights + 44);
2693   const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
2694   const __m128i weights1 = cvtepu8_epi16(weights_lo);
2695   const __m128i weights2 = cvtepu8_epi16(_mm_srli_si128(weights_lo, 8));
2696   const __m128i weights3 = cvtepu8_epi16(weights_hi);
2697   const __m128i weights4 = cvtepu8_epi16(_mm_srli_si128(weights_hi, 8));
2698   const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1);
2699   const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2);
2700   const __m128i inverted_weights3 = _mm_sub_epi16(scale, weights3);
2701   const __m128i inverted_weights4 = _mm_sub_epi16(scale, weights4);
2702   const __m128i scaled_top_right1 =
2703       _mm_mullo_epi16(inverted_weights1, top_right);
2704   const __m128i scaled_top_right2 =
2705       _mm_mullo_epi16(inverted_weights2, top_right);
2706   const __m128i scaled_top_right3 =
2707       _mm_mullo_epi16(inverted_weights3, top_right);
2708   const __m128i scaled_top_right4 =
2709       _mm_mullo_epi16(inverted_weights4, top_right);
2710   const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1));
2711   __m128i left = cvtepu8_epi16(LoadLo8(left_column));
2712   for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
2713     __m128i y_select = _mm_set1_epi32(y_mask);
2714     __m128i left_y = _mm_shuffle_epi8(left, y_select);
2715     write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2,
2716                                    scaled_top_right1, scaled_top_right2, round);
2717     write_smooth_directional_sum16(dst + 16, left_y, left_y, weights3, weights4,
2718                                    scaled_top_right3, scaled_top_right4, round);
2719     dst += stride;
2720   }
2721   left = cvtepu8_epi16(LoadLo8(left_column + 8));
2722   for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
2723     __m128i y_select = _mm_set1_epi32(y_mask);
2724     __m128i left_y = _mm_shuffle_epi8(left, y_select);
2725     write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2,
2726                                    scaled_top_right1, scaled_top_right2, round);
2727     write_smooth_directional_sum16(dst + 16, left_y, left_y, weights3, weights4,
2728                                    scaled_top_right3, scaled_top_right4, round);
2729     dst += stride;
2730   }
2731   left = cvtepu8_epi16(LoadLo8(left_column + 16));
2732   for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
2733     __m128i y_select = _mm_set1_epi32(y_mask);
2734     __m128i left_y = _mm_shuffle_epi8(left, y_select);
2735     write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2,
2736                                    scaled_top_right1, scaled_top_right2, round);
2737     write_smooth_directional_sum16(dst + 16, left_y, left_y, weights3, weights4,
2738                                    scaled_top_right3, scaled_top_right4, round);
2739     dst += stride;
2740   }
2741   left = cvtepu8_epi16(LoadLo8(left_column + 24));
2742   for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
2743     __m128i y_select = _mm_set1_epi32(y_mask);
2744     __m128i left_y = _mm_shuffle_epi8(left, y_select);
2745     write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2,
2746                                    scaled_top_right1, scaled_top_right2, round);
2747     write_smooth_directional_sum16(dst + 16, left_y, left_y, weights3, weights4,
2748                                    scaled_top_right3, scaled_top_right4, round);
2749     dst += stride;
2750   }
2751 }
2752 
2753 void aom_smooth_h_predictor_32x64_ssse3(
2754     uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
2755     const uint8_t *LIBAOM_RESTRICT top_row,
2756     const uint8_t *LIBAOM_RESTRICT left_column) {
2757   const __m128i top_right = _mm_set1_epi16(top_row[31]);
2758   const __m128i weights_lo = LoadUnaligned16(smooth_weights + 28);
2759   const __m128i weights_hi = LoadUnaligned16(smooth_weights + 44);
2760   const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
2761   const __m128i weights1 = cvtepu8_epi16(weights_lo);
2762   const __m128i weights2 = cvtepu8_epi16(_mm_srli_si128(weights_lo, 8));
2763   const __m128i weights3 = cvtepu8_epi16(weights_hi);
2764   const __m128i weights4 = cvtepu8_epi16(_mm_srli_si128(weights_hi, 8));
2765   const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1);
2766   const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2);
2767   const __m128i inverted_weights3 = _mm_sub_epi16(scale, weights3);
2768   const __m128i inverted_weights4 = _mm_sub_epi16(scale, weights4);
2769   const __m128i scaled_top_right1 =
2770       _mm_mullo_epi16(inverted_weights1, top_right);
2771   const __m128i scaled_top_right2 =
2772       _mm_mullo_epi16(inverted_weights2, top_right);
2773   const __m128i scaled_top_right3 =
2774       _mm_mullo_epi16(inverted_weights3, top_right);
2775   const __m128i scaled_top_right4 =
2776       _mm_mullo_epi16(inverted_weights4, top_right);
2777   const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1));
2778   for (int left_offset = 0; left_offset < 64; left_offset += 8) {
2779     const __m128i left = cvtepu8_epi16(LoadLo8(left_column + left_offset));
2780     for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
2781       const __m128i y_select = _mm_set1_epi32(y_mask);
2782       const __m128i left_y = _mm_shuffle_epi8(left, y_select);
2783       write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2,
2784                                      scaled_top_right1, scaled_top_right2,
2785                                      round);
2786       write_smooth_directional_sum16(dst + 16, left_y, left_y, weights3,
2787                                      weights4, scaled_top_right3,
2788                                      scaled_top_right4, round);
2789       dst += stride;
2790     }
2791   }
2792 }
2793 
2794 #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
2795 void aom_smooth_h_predictor_64x16_ssse3(
2796     uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
2797     const uint8_t *LIBAOM_RESTRICT top_row,
2798     const uint8_t *LIBAOM_RESTRICT left_column) {
2799   const __m128i top_right = _mm_set1_epi16(top_row[63]);
2800   const __m128i left1 = cvtepu8_epi16(LoadLo8(left_column));
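  // The 64 column weights occupy smooth_weights[60..123]; they are loaded as
  // four 16-byte chunks and widened into eight vectors of 16-bit lanes.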
2801   const __m128i weights_lolo = LoadUnaligned16(smooth_weights + 60);
2802   const __m128i weights_lohi = LoadUnaligned16(smooth_weights + 76);
2803   const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
2804   const __m128i weights1 = cvtepu8_epi16(weights_lolo);
2805   const __m128i weights2 = cvtepu8_epi16(_mm_srli_si128(weights_lolo, 8));
2806   const __m128i weights3 = cvtepu8_epi16(weights_lohi);
2807   const __m128i weights4 = cvtepu8_epi16(_mm_srli_si128(weights_lohi, 8));
2808   const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1);
2809   const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2);
2810   const __m128i inverted_weights3 = _mm_sub_epi16(scale, weights3);
2811   const __m128i inverted_weights4 = _mm_sub_epi16(scale, weights4);
2812   const __m128i scaled_top_right1 =
2813       _mm_mullo_epi16(inverted_weights1, top_right);
2814   const __m128i scaled_top_right2 =
2815       _mm_mullo_epi16(inverted_weights2, top_right);
2816   const __m128i scaled_top_right3 =
2817       _mm_mullo_epi16(inverted_weights3, top_right);
2818   const __m128i scaled_top_right4 =
2819       _mm_mullo_epi16(inverted_weights4, top_right);
2820   const __m128i weights_hilo = LoadUnaligned16(smooth_weights + 92);
2821   const __m128i weights_hihi = LoadUnaligned16(smooth_weights + 108);
2822   const __m128i weights5 = cvtepu8_epi16(weights_hilo);
2823   const __m128i weights6 = cvtepu8_epi16(_mm_srli_si128(weights_hilo, 8));
2824   const __m128i weights7 = cvtepu8_epi16(weights_hihi);
2825   const __m128i weights8 = cvtepu8_epi16(_mm_srli_si128(weights_hihi, 8));
2826   const __m128i inverted_weights5 = _mm_sub_epi16(scale, weights5);
2827   const __m128i inverted_weights6 = _mm_sub_epi16(scale, weights6);
2828   const __m128i inverted_weights7 = _mm_sub_epi16(scale, weights7);
2829   const __m128i inverted_weights8 = _mm_sub_epi16(scale, weights8);
2830   const __m128i scaled_top_right5 =
2831       _mm_mullo_epi16(inverted_weights5, top_right);
2832   const __m128i scaled_top_right6 =
2833       _mm_mullo_epi16(inverted_weights6, top_right);
2834   const __m128i scaled_top_right7 =
2835       _mm_mullo_epi16(inverted_weights7, top_right);
2836   const __m128i scaled_top_right8 =
2837       _mm_mullo_epi16(inverted_weights8, top_right);
2838   const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1));
2839   for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
2840     __m128i y_select = _mm_set1_epi32(y_mask);
2841     __m128i left_y = _mm_shuffle_epi8(left1, y_select);
2842     write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2,
2843                                    scaled_top_right1, scaled_top_right2, round);
2844     write_smooth_directional_sum16(dst + 16, left_y, left_y, weights3, weights4,
2845                                    scaled_top_right3, scaled_top_right4, round);
2846     write_smooth_directional_sum16(dst + 32, left_y, left_y, weights5, weights6,
2847                                    scaled_top_right5, scaled_top_right6, round);
2848     write_smooth_directional_sum16(dst + 48, left_y, left_y, weights7, weights8,
2849                                    scaled_top_right7, scaled_top_right8, round);
2850     dst += stride;
2851   }
2852   const __m128i left2 = cvtepu8_epi16(LoadLo8(left_column + 8));
2853   for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
2854     __m128i y_select = _mm_set1_epi32(y_mask);
2855     __m128i left_y = _mm_shuffle_epi8(left2, y_select);
2856     write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2,
2857                                    scaled_top_right1, scaled_top_right2, round);
2858     write_smooth_directional_sum16(dst + 16, left_y, left_y, weights3, weights4,
2859                                    scaled_top_right3, scaled_top_right4, round);
2860     write_smooth_directional_sum16(dst + 32, left_y, left_y, weights5, weights6,
2861                                    scaled_top_right5, scaled_top_right6, round);
2862     write_smooth_directional_sum16(dst + 48, left_y, left_y, weights7, weights8,
2863                                    scaled_top_right7, scaled_top_right8, round);
2864     dst += stride;
2865   }
2866 }
2867 #endif  // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
2868 
2869 void aom_smooth_h_predictor_64x32_ssse3(
2870     uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
2871     const uint8_t *LIBAOM_RESTRICT top_row,
2872     const uint8_t *LIBAOM_RESTRICT left_column) {
2873   const __m128i top_right = _mm_set1_epi16(top_row[63]);
2874   const __m128i left1 = cvtepu8_epi16(LoadLo8(left_column));
2875   const __m128i weights_lolo = LoadUnaligned16(smooth_weights + 60);
2876   const __m128i weights_lohi = LoadUnaligned16(smooth_weights + 76);
2877   const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
2878   const __m128i weights1 = cvtepu8_epi16(weights_lolo);
2879   const __m128i weights2 = cvtepu8_epi16(_mm_srli_si128(weights_lolo, 8));
2880   const __m128i weights3 = cvtepu8_epi16(weights_lohi);
2881   const __m128i weights4 = cvtepu8_epi16(_mm_srli_si128(weights_lohi, 8));
2882   const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1);
2883   const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2);
2884   const __m128i inverted_weights3 = _mm_sub_epi16(scale, weights3);
2885   const __m128i inverted_weights4 = _mm_sub_epi16(scale, weights4);
2886   const __m128i scaled_top_right1 =
2887       _mm_mullo_epi16(inverted_weights1, top_right);
2888   const __m128i scaled_top_right2 =
2889       _mm_mullo_epi16(inverted_weights2, top_right);
2890   const __m128i scaled_top_right3 =
2891       _mm_mullo_epi16(inverted_weights3, top_right);
2892   const __m128i scaled_top_right4 =
2893       _mm_mullo_epi16(inverted_weights4, top_right);
2894   const __m128i weights_hilo = LoadUnaligned16(smooth_weights + 92);
2895   const __m128i weights_hihi = LoadUnaligned16(smooth_weights + 108);
2896   const __m128i weights5 = cvtepu8_epi16(weights_hilo);
2897   const __m128i weights6 = cvtepu8_epi16(_mm_srli_si128(weights_hilo, 8));
2898   const __m128i weights7 = cvtepu8_epi16(weights_hihi);
2899   const __m128i weights8 = cvtepu8_epi16(_mm_srli_si128(weights_hihi, 8));
2900   const __m128i inverted_weights5 = _mm_sub_epi16(scale, weights5);
2901   const __m128i inverted_weights6 = _mm_sub_epi16(scale, weights6);
2902   const __m128i inverted_weights7 = _mm_sub_epi16(scale, weights7);
2903   const __m128i inverted_weights8 = _mm_sub_epi16(scale, weights8);
2904   const __m128i scaled_top_right5 =
2905       _mm_mullo_epi16(inverted_weights5, top_right);
2906   const __m128i scaled_top_right6 =
2907       _mm_mullo_epi16(inverted_weights6, top_right);
2908   const __m128i scaled_top_right7 =
2909       _mm_mullo_epi16(inverted_weights7, top_right);
2910   const __m128i scaled_top_right8 =
2911       _mm_mullo_epi16(inverted_weights8, top_right);
2912   const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1));
2913   for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
2914     const __m128i y_select = _mm_set1_epi32(y_mask);
2915     const __m128i left_y = _mm_shuffle_epi8(left1, y_select);
2916     write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2,
2917                                    scaled_top_right1, scaled_top_right2, round);
2918     write_smooth_directional_sum16(dst + 16, left_y, left_y, weights3, weights4,
2919                                    scaled_top_right3, scaled_top_right4, round);
2920     write_smooth_directional_sum16(dst + 32, left_y, left_y, weights5, weights6,
2921                                    scaled_top_right5, scaled_top_right6, round);
2922     write_smooth_directional_sum16(dst + 48, left_y, left_y, weights7, weights8,
2923                                    scaled_top_right7, scaled_top_right8, round);
2924     dst += stride;
2925   }
2926   const __m128i left2 = cvtepu8_epi16(LoadLo8(left_column + 8));
2927   for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
2928     const __m128i y_select = _mm_set1_epi32(y_mask);
2929     const __m128i left_y = _mm_shuffle_epi8(left2, y_select);
2930     write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2,
2931                                    scaled_top_right1, scaled_top_right2, round);
2932     write_smooth_directional_sum16(dst + 16, left_y, left_y, weights3, weights4,
2933                                    scaled_top_right3, scaled_top_right4, round);
2934     write_smooth_directional_sum16(dst + 32, left_y, left_y, weights5, weights6,
2935                                    scaled_top_right5, scaled_top_right6, round);
2936     write_smooth_directional_sum16(dst + 48, left_y, left_y, weights7, weights8,
2937                                    scaled_top_right7, scaled_top_right8, round);
2938     dst += stride;
2939   }
2940   const __m128i left3 = cvtepu8_epi16(LoadLo8(left_column + 16));
2941   for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
2942     const __m128i y_select = _mm_set1_epi32(y_mask);
2943     const __m128i left_y = _mm_shuffle_epi8(left3, y_select);
2944     write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2,
2945                                    scaled_top_right1, scaled_top_right2, round);
2946     write_smooth_directional_sum16(dst + 16, left_y, left_y, weights3, weights4,
2947                                    scaled_top_right3, scaled_top_right4, round);
2948     write_smooth_directional_sum16(dst + 32, left_y, left_y, weights5, weights6,
2949                                    scaled_top_right5, scaled_top_right6, round);
2950     write_smooth_directional_sum16(dst + 48, left_y, left_y, weights7, weights8,
2951                                    scaled_top_right7, scaled_top_right8, round);
2952     dst += stride;
2953   }
2954   const __m128i left4 = cvtepu8_epi16(LoadLo8(left_column + 24));
2955   for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
2956     const __m128i y_select = _mm_set1_epi32(y_mask);
2957     const __m128i left_y = _mm_shuffle_epi8(left4, y_select);
2958     write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2,
2959                                    scaled_top_right1, scaled_top_right2, round);
2960     write_smooth_directional_sum16(dst + 16, left_y, left_y, weights3, weights4,
2961                                    scaled_top_right3, scaled_top_right4, round);
2962     write_smooth_directional_sum16(dst + 32, left_y, left_y, weights5, weights6,
2963                                    scaled_top_right5, scaled_top_right6, round);
2964     write_smooth_directional_sum16(dst + 48, left_y, left_y, weights7, weights8,
2965                                    scaled_top_right7, scaled_top_right8, round);
2966     dst += stride;
2967   }
2968 }
2969 
2970 void aom_smooth_h_predictor_64x64_ssse3(
2971     uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
2972     const uint8_t *LIBAOM_RESTRICT top_row,
2973     const uint8_t *LIBAOM_RESTRICT left_column) {
2974   const __m128i top_right = _mm_set1_epi16(top_row[63]);
2975   const __m128i weights_lolo = LoadUnaligned16(smooth_weights + 60);
2976   const __m128i weights_lohi = LoadUnaligned16(smooth_weights + 76);
2977   const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
2978   const __m128i weights1 = cvtepu8_epi16(weights_lolo);
2979   const __m128i weights2 = cvtepu8_epi16(_mm_srli_si128(weights_lolo, 8));
2980   const __m128i weights3 = cvtepu8_epi16(weights_lohi);
2981   const __m128i weights4 = cvtepu8_epi16(_mm_srli_si128(weights_lohi, 8));
2982   const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1);
2983   const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2);
2984   const __m128i inverted_weights3 = _mm_sub_epi16(scale, weights3);
2985   const __m128i inverted_weights4 = _mm_sub_epi16(scale, weights4);
2986   const __m128i scaled_top_right1 =
2987       _mm_mullo_epi16(inverted_weights1, top_right);
2988   const __m128i scaled_top_right2 =
2989       _mm_mullo_epi16(inverted_weights2, top_right);
2990   const __m128i scaled_top_right3 =
2991       _mm_mullo_epi16(inverted_weights3, top_right);
2992   const __m128i scaled_top_right4 =
2993       _mm_mullo_epi16(inverted_weights4, top_right);
2994   const __m128i weights_hilo = LoadUnaligned16(smooth_weights + 92);
2995   const __m128i weights_hihi = LoadUnaligned16(smooth_weights + 108);
2996   const __m128i weights5 = cvtepu8_epi16(weights_hilo);
2997   const __m128i weights6 = cvtepu8_epi16(_mm_srli_si128(weights_hilo, 8));
2998   const __m128i weights7 = cvtepu8_epi16(weights_hihi);
2999   const __m128i weights8 = cvtepu8_epi16(_mm_srli_si128(weights_hihi, 8));
3000   const __m128i inverted_weights5 = _mm_sub_epi16(scale, weights5);
3001   const __m128i inverted_weights6 = _mm_sub_epi16(scale, weights6);
3002   const __m128i inverted_weights7 = _mm_sub_epi16(scale, weights7);
3003   const __m128i inverted_weights8 = _mm_sub_epi16(scale, weights8);
3004   const __m128i scaled_top_right5 =
3005       _mm_mullo_epi16(inverted_weights5, top_right);
3006   const __m128i scaled_top_right6 =
3007       _mm_mullo_epi16(inverted_weights6, top_right);
3008   const __m128i scaled_top_right7 =
3009       _mm_mullo_epi16(inverted_weights7, top_right);
3010   const __m128i scaled_top_right8 =
3011       _mm_mullo_epi16(inverted_weights8, top_right);
3012   const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1));
3013   for (int left_offset = 0; left_offset < 64; left_offset += 8) {
3014     const __m128i left = cvtepu8_epi16(LoadLo8(left_column + left_offset));
3015     for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
3016       const __m128i y_select = _mm_set1_epi32(y_mask);
3017       const __m128i left_y = _mm_shuffle_epi8(left, y_select);
3018       write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2,
3019                                      scaled_top_right1, scaled_top_right2,
3020                                      round);
3021       write_smooth_directional_sum16(dst + 16, left_y, left_y, weights3,
3022                                      weights4, scaled_top_right3,
3023                                      scaled_top_right4, round);
3024       write_smooth_directional_sum16(dst + 32, left_y, left_y, weights5,
3025                                      weights6, scaled_top_right5,
3026                                      scaled_top_right6, round);
3027       write_smooth_directional_sum16(dst + 48, left_y, left_y, weights7,
3028                                      weights8, scaled_top_right7,
3029                                      scaled_top_right8, round);
3030       dst += stride;
3031     }
3032   }
3033 }
3034