1 /*
2 * Copyright (c) 2017, Alliance for Open Media. All rights reserved.
3 *
4 * This source code is subject to the terms of the BSD 2 Clause License and
5 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
6 * was not distributed with this source code in the LICENSE file, you can
7 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
8 * Media Patent License 1.0 was not distributed with this source code in the
9 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
10 */
11
12 #include <tmmintrin.h>
13
14 #include "config/aom_dsp_rtcd.h"
15
16 #include "aom_dsp/intrapred_common.h"
17
18 // -----------------------------------------------------------------------------
19 // PAETH_PRED
20
21 // Return 8 16-bit pixels in one row
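// Paeth prediction: base = top + left - topleft; each output picks whichever
// of left, top or topleft is closest to base, with ties resolved in that
// order. All arithmetic is done on 16-bit lanes so the intermediate
// base/difference values cannot overflow.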
22 static inline __m128i paeth_8x1_pred(const __m128i *left, const __m128i *top,
23 const __m128i *topleft) {
24 const __m128i base = _mm_sub_epi16(_mm_add_epi16(*top, *left), *topleft);
25
26 __m128i pl = _mm_abs_epi16(_mm_sub_epi16(base, *left));
27 __m128i pt = _mm_abs_epi16(_mm_sub_epi16(base, *top));
28 __m128i ptl = _mm_abs_epi16(_mm_sub_epi16(base, *topleft));
29
30 __m128i mask1 = _mm_cmpgt_epi16(pl, pt);
31 mask1 = _mm_or_si128(mask1, _mm_cmpgt_epi16(pl, ptl));
32 __m128i mask2 = _mm_cmpgt_epi16(pt, ptl);
33
34 pl = _mm_andnot_si128(mask1, *left);
35
36 ptl = _mm_and_si128(mask2, *topleft);
37 pt = _mm_andnot_si128(mask2, *top);
38 pt = _mm_or_si128(pt, ptl);
39 pt = _mm_and_si128(mask1, pt);
40
41 return _mm_or_si128(pl, pt);
42 }
43
44 void aom_paeth_predictor_4x4_ssse3(uint8_t *dst, ptrdiff_t stride,
45 const uint8_t *above, const uint8_t *left) {
46 __m128i l = _mm_loadl_epi64((const __m128i *)left);
47 const __m128i t = _mm_loadl_epi64((const __m128i *)above);
48 const __m128i zero = _mm_setzero_si128();
49 const __m128i t16 = _mm_unpacklo_epi8(t, zero);
50 const __m128i tl16 = _mm_set1_epi16((int16_t)above[-1]);
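  // rep is a pshufb control word: the low byte of each 16-bit lane selects
  // left[i] from |l| and the 0x80 high byte writes zero, so the shuffle below
  // broadcasts left[i] zero-extended to 16 bits. Adding 1 per row advances to
  // the next left pixel. The same pattern is reused by the predictors below.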
51 __m128i rep = _mm_set1_epi16((short)0x8000);
52 const __m128i one = _mm_set1_epi16(1);
53
54 int i;
55 for (i = 0; i < 4; ++i) {
56 const __m128i l16 = _mm_shuffle_epi8(l, rep);
57 const __m128i row = paeth_8x1_pred(&l16, &t16, &tl16);
58
59 *(int *)dst = _mm_cvtsi128_si32(_mm_packus_epi16(row, row));
60 dst += stride;
61 rep = _mm_add_epi16(rep, one);
62 }
63 }
64
65 void aom_paeth_predictor_4x8_ssse3(uint8_t *dst, ptrdiff_t stride,
66 const uint8_t *above, const uint8_t *left) {
67 __m128i l = _mm_loadl_epi64((const __m128i *)left);
68 const __m128i t = _mm_loadl_epi64((const __m128i *)above);
69 const __m128i zero = _mm_setzero_si128();
70 const __m128i t16 = _mm_unpacklo_epi8(t, zero);
71 const __m128i tl16 = _mm_set1_epi16((int16_t)above[-1]);
72 __m128i rep = _mm_set1_epi16((short)0x8000);
73 const __m128i one = _mm_set1_epi16(1);
74
75 int i;
76 for (i = 0; i < 8; ++i) {
77 const __m128i l16 = _mm_shuffle_epi8(l, rep);
78 const __m128i row = paeth_8x1_pred(&l16, &t16, &tl16);
79
80 *(int *)dst = _mm_cvtsi128_si32(_mm_packus_epi16(row, row));
81 dst += stride;
82 rep = _mm_add_epi16(rep, one);
83 }
84 }
85
86 #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
87 void aom_paeth_predictor_4x16_ssse3(uint8_t *dst, ptrdiff_t stride,
88 const uint8_t *above, const uint8_t *left) {
89 __m128i l = _mm_load_si128((const __m128i *)left);
90 const __m128i t = _mm_cvtsi32_si128(((const int *)above)[0]);
91 const __m128i zero = _mm_setzero_si128();
92 const __m128i t16 = _mm_unpacklo_epi8(t, zero);
93 const __m128i tl16 = _mm_set1_epi16((int16_t)above[-1]);
94 __m128i rep = _mm_set1_epi16((short)0x8000);
95 const __m128i one = _mm_set1_epi16(1);
96
97 for (int i = 0; i < 16; ++i) {
98 const __m128i l16 = _mm_shuffle_epi8(l, rep);
99 const __m128i row = paeth_8x1_pred(&l16, &t16, &tl16);
100
101 *(int *)dst = _mm_cvtsi128_si32(_mm_packus_epi16(row, row));
102 dst += stride;
103 rep = _mm_add_epi16(rep, one);
104 }
105 }
106 #endif // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
107
108 void aom_paeth_predictor_8x4_ssse3(uint8_t *dst, ptrdiff_t stride,
109 const uint8_t *above, const uint8_t *left) {
110 __m128i l = _mm_loadl_epi64((const __m128i *)left);
111 const __m128i t = _mm_loadl_epi64((const __m128i *)above);
112 const __m128i zero = _mm_setzero_si128();
113 const __m128i t16 = _mm_unpacklo_epi8(t, zero);
114 const __m128i tl16 = _mm_set1_epi16((int16_t)above[-1]);
115 __m128i rep = _mm_set1_epi16((short)0x8000);
116 const __m128i one = _mm_set1_epi16(1);
117
118 int i;
119 for (i = 0; i < 4; ++i) {
120 const __m128i l16 = _mm_shuffle_epi8(l, rep);
121 const __m128i row = paeth_8x1_pred(&l16, &t16, &tl16);
122
123 _mm_storel_epi64((__m128i *)dst, _mm_packus_epi16(row, row));
124 dst += stride;
125 rep = _mm_add_epi16(rep, one);
126 }
127 }
128
129 void aom_paeth_predictor_8x8_ssse3(uint8_t *dst, ptrdiff_t stride,
130 const uint8_t *above, const uint8_t *left) {
131 __m128i l = _mm_loadl_epi64((const __m128i *)left);
132 const __m128i t = _mm_loadl_epi64((const __m128i *)above);
133 const __m128i zero = _mm_setzero_si128();
134 const __m128i t16 = _mm_unpacklo_epi8(t, zero);
135 const __m128i tl16 = _mm_set1_epi16((int16_t)above[-1]);
136 __m128i rep = _mm_set1_epi16((short)0x8000);
137 const __m128i one = _mm_set1_epi16(1);
138
139 int i;
140 for (i = 0; i < 8; ++i) {
141 const __m128i l16 = _mm_shuffle_epi8(l, rep);
142 const __m128i row = paeth_8x1_pred(&l16, &t16, &tl16);
143
144 _mm_storel_epi64((__m128i *)dst, _mm_packus_epi16(row, row));
145 dst += stride;
146 rep = _mm_add_epi16(rep, one);
147 }
148 }
149
150 void aom_paeth_predictor_8x16_ssse3(uint8_t *dst, ptrdiff_t stride,
151 const uint8_t *above, const uint8_t *left) {
152 __m128i l = _mm_load_si128((const __m128i *)left);
153 const __m128i t = _mm_loadl_epi64((const __m128i *)above);
154 const __m128i zero = _mm_setzero_si128();
155 const __m128i t16 = _mm_unpacklo_epi8(t, zero);
156 const __m128i tl16 = _mm_set1_epi16((int16_t)above[-1]);
157 __m128i rep = _mm_set1_epi16((short)0x8000);
158 const __m128i one = _mm_set1_epi16(1);
159
160 int i;
161 for (i = 0; i < 16; ++i) {
162 const __m128i l16 = _mm_shuffle_epi8(l, rep);
163 const __m128i row = paeth_8x1_pred(&l16, &t16, &tl16);
164
165 _mm_storel_epi64((__m128i *)dst, _mm_packus_epi16(row, row));
166 dst += stride;
167 rep = _mm_add_epi16(rep, one);
168 }
169 }
170
171 #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
172 void aom_paeth_predictor_8x32_ssse3(uint8_t *dst, ptrdiff_t stride,
173 const uint8_t *above, const uint8_t *left) {
174 const __m128i t = _mm_loadl_epi64((const __m128i *)above);
175 const __m128i zero = _mm_setzero_si128();
176 const __m128i t16 = _mm_unpacklo_epi8(t, zero);
177 const __m128i tl16 = _mm_set1_epi16((int16_t)above[-1]);
178 const __m128i one = _mm_set1_epi16(1);
179
180 for (int j = 0; j < 2; ++j) {
181 const __m128i l = _mm_load_si128((const __m128i *)(left + j * 16));
182 __m128i rep = _mm_set1_epi16((short)0x8000);
183 for (int i = 0; i < 16; ++i) {
184 const __m128i l16 = _mm_shuffle_epi8(l, rep);
185 const __m128i row = paeth_8x1_pred(&l16, &t16, &tl16);
186
187 _mm_storel_epi64((__m128i *)dst, _mm_packus_epi16(row, row));
188 dst += stride;
189 rep = _mm_add_epi16(rep, one);
190 }
191 }
192 }
193 #endif // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
194
195 // Return 16 8-bit pixels in one row
196 static inline __m128i paeth_16x1_pred(const __m128i *left, const __m128i *top0,
197 const __m128i *top1,
198 const __m128i *topleft) {
199 const __m128i p0 = paeth_8x1_pred(left, top0, topleft);
200 const __m128i p1 = paeth_8x1_pred(left, top1, topleft);
201 return _mm_packus_epi16(p0, p1);
202 }
203
204 #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
205 void aom_paeth_predictor_16x4_ssse3(uint8_t *dst, ptrdiff_t stride,
206 const uint8_t *above, const uint8_t *left) {
207 __m128i l = _mm_cvtsi32_si128(((const int *)left)[0]);
208 const __m128i t = _mm_load_si128((const __m128i *)above);
209 const __m128i zero = _mm_setzero_si128();
210 const __m128i top0 = _mm_unpacklo_epi8(t, zero);
211 const __m128i top1 = _mm_unpackhi_epi8(t, zero);
212 const __m128i tl16 = _mm_set1_epi16((int16_t)above[-1]);
213 __m128i rep = _mm_set1_epi16((short)0x8000);
214 const __m128i one = _mm_set1_epi16(1);
215
216 for (int i = 0; i < 4; ++i) {
217 const __m128i l16 = _mm_shuffle_epi8(l, rep);
218 const __m128i row = paeth_16x1_pred(&l16, &top0, &top1, &tl16);
219
220 _mm_store_si128((__m128i *)dst, row);
221 dst += stride;
222 rep = _mm_add_epi16(rep, one);
223 }
224 }
225 #endif // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
226
227 void aom_paeth_predictor_16x8_ssse3(uint8_t *dst, ptrdiff_t stride,
228 const uint8_t *above, const uint8_t *left) {
229 __m128i l = _mm_loadl_epi64((const __m128i *)left);
230 const __m128i t = _mm_load_si128((const __m128i *)above);
231 const __m128i zero = _mm_setzero_si128();
232 const __m128i top0 = _mm_unpacklo_epi8(t, zero);
233 const __m128i top1 = _mm_unpackhi_epi8(t, zero);
234 const __m128i tl16 = _mm_set1_epi16((int16_t)above[-1]);
235 __m128i rep = _mm_set1_epi16((short)0x8000);
236 const __m128i one = _mm_set1_epi16(1);
237
238 int i;
239 for (i = 0; i < 8; ++i) {
240 const __m128i l16 = _mm_shuffle_epi8(l, rep);
241 const __m128i row = paeth_16x1_pred(&l16, &top0, &top1, &tl16);
242
243 _mm_store_si128((__m128i *)dst, row);
244 dst += stride;
245 rep = _mm_add_epi16(rep, one);
246 }
247 }
248
249 void aom_paeth_predictor_16x16_ssse3(uint8_t *dst, ptrdiff_t stride,
250 const uint8_t *above,
251 const uint8_t *left) {
252 __m128i l = _mm_load_si128((const __m128i *)left);
253 const __m128i t = _mm_load_si128((const __m128i *)above);
254 const __m128i zero = _mm_setzero_si128();
255 const __m128i top0 = _mm_unpacklo_epi8(t, zero);
256 const __m128i top1 = _mm_unpackhi_epi8(t, zero);
257 const __m128i tl16 = _mm_set1_epi16((int16_t)above[-1]);
258 __m128i rep = _mm_set1_epi16((short)0x8000);
259 const __m128i one = _mm_set1_epi16(1);
260
261 int i;
262 for (i = 0; i < 16; ++i) {
263 const __m128i l16 = _mm_shuffle_epi8(l, rep);
264 const __m128i row = paeth_16x1_pred(&l16, &top0, &top1, &tl16);
265
266 _mm_store_si128((__m128i *)dst, row);
267 dst += stride;
268 rep = _mm_add_epi16(rep, one);
269 }
270 }
271
272 void aom_paeth_predictor_16x32_ssse3(uint8_t *dst, ptrdiff_t stride,
273 const uint8_t *above,
274 const uint8_t *left) {
275 __m128i l = _mm_load_si128((const __m128i *)left);
276 const __m128i t = _mm_load_si128((const __m128i *)above);
277 const __m128i zero = _mm_setzero_si128();
278 const __m128i top0 = _mm_unpacklo_epi8(t, zero);
279 const __m128i top1 = _mm_unpackhi_epi8(t, zero);
280 const __m128i tl16 = _mm_set1_epi16((int16_t)above[-1]);
281 __m128i rep = _mm_set1_epi16((short)0x8000);
282 const __m128i one = _mm_set1_epi16(1);
283 __m128i l16;
284
285 int i;
286 for (i = 0; i < 16; ++i) {
287 l16 = _mm_shuffle_epi8(l, rep);
288 const __m128i row = paeth_16x1_pred(&l16, &top0, &top1, &tl16);
289
290 _mm_store_si128((__m128i *)dst, row);
291 dst += stride;
292 rep = _mm_add_epi16(rep, one);
293 }
294
295 l = _mm_load_si128((const __m128i *)(left + 16));
296 rep = _mm_set1_epi16((short)0x8000);
297 for (i = 0; i < 16; ++i) {
298 l16 = _mm_shuffle_epi8(l, rep);
299 const __m128i row = paeth_16x1_pred(&l16, &top0, &top1, &tl16);
300
301 _mm_store_si128((__m128i *)dst, row);
302 dst += stride;
303 rep = _mm_add_epi16(rep, one);
304 }
305 }
306
307 #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
308 void aom_paeth_predictor_16x64_ssse3(uint8_t *dst, ptrdiff_t stride,
309 const uint8_t *above,
310 const uint8_t *left) {
311 const __m128i t = _mm_load_si128((const __m128i *)above);
312 const __m128i zero = _mm_setzero_si128();
313 const __m128i top0 = _mm_unpacklo_epi8(t, zero);
314 const __m128i top1 = _mm_unpackhi_epi8(t, zero);
315 const __m128i tl16 = _mm_set1_epi16((int16_t)above[-1]);
316 const __m128i one = _mm_set1_epi16(1);
317
318 for (int j = 0; j < 4; ++j) {
319 const __m128i l = _mm_load_si128((const __m128i *)(left + j * 16));
320 __m128i rep = _mm_set1_epi16((short)0x8000);
321 for (int i = 0; i < 16; ++i) {
322 const __m128i l16 = _mm_shuffle_epi8(l, rep);
323 const __m128i row = paeth_16x1_pred(&l16, &top0, &top1, &tl16);
324 _mm_store_si128((__m128i *)dst, row);
325 dst += stride;
326 rep = _mm_add_epi16(rep, one);
327 }
328 }
329 }
330
331 void aom_paeth_predictor_32x8_ssse3(uint8_t *dst, ptrdiff_t stride,
332 const uint8_t *above, const uint8_t *left) {
333 const __m128i a = _mm_load_si128((const __m128i *)above);
334 const __m128i b = _mm_load_si128((const __m128i *)(above + 16));
335 const __m128i zero = _mm_setzero_si128();
336 const __m128i al = _mm_unpacklo_epi8(a, zero);
337 const __m128i ah = _mm_unpackhi_epi8(a, zero);
338 const __m128i bl = _mm_unpacklo_epi8(b, zero);
339 const __m128i bh = _mm_unpackhi_epi8(b, zero);
340
341 const __m128i tl16 = _mm_set1_epi16((int16_t)above[-1]);
342 __m128i rep = _mm_set1_epi16((short)0x8000);
343 const __m128i one = _mm_set1_epi16(1);
344 const __m128i l = _mm_loadl_epi64((const __m128i *)left);
345 __m128i l16;
346
347 for (int i = 0; i < 8; ++i) {
348 l16 = _mm_shuffle_epi8(l, rep);
349 const __m128i r32l = paeth_16x1_pred(&l16, &al, &ah, &tl16);
350 const __m128i r32h = paeth_16x1_pred(&l16, &bl, &bh, &tl16);
351
352 _mm_store_si128((__m128i *)dst, r32l);
353 _mm_store_si128((__m128i *)(dst + 16), r32h);
354 dst += stride;
355 rep = _mm_add_epi16(rep, one);
356 }
357 }
358 #endif // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
359
360 void aom_paeth_predictor_32x16_ssse3(uint8_t *dst, ptrdiff_t stride,
361 const uint8_t *above,
362 const uint8_t *left) {
363 const __m128i a = _mm_load_si128((const __m128i *)above);
364 const __m128i b = _mm_load_si128((const __m128i *)(above + 16));
365 const __m128i zero = _mm_setzero_si128();
366 const __m128i al = _mm_unpacklo_epi8(a, zero);
367 const __m128i ah = _mm_unpackhi_epi8(a, zero);
368 const __m128i bl = _mm_unpacklo_epi8(b, zero);
369 const __m128i bh = _mm_unpackhi_epi8(b, zero);
370
371 const __m128i tl16 = _mm_set1_epi16((int16_t)above[-1]);
372 __m128i rep = _mm_set1_epi16((short)0x8000);
373 const __m128i one = _mm_set1_epi16(1);
374 __m128i l = _mm_load_si128((const __m128i *)left);
375 __m128i l16;
376
377 int i;
378 for (i = 0; i < 16; ++i) {
379 l16 = _mm_shuffle_epi8(l, rep);
380 const __m128i r32l = paeth_16x1_pred(&l16, &al, &ah, &tl16);
381 const __m128i r32h = paeth_16x1_pred(&l16, &bl, &bh, &tl16);
382
383 _mm_store_si128((__m128i *)dst, r32l);
384 _mm_store_si128((__m128i *)(dst + 16), r32h);
385 dst += stride;
386 rep = _mm_add_epi16(rep, one);
387 }
388 }
389
390 void aom_paeth_predictor_32x32_ssse3(uint8_t *dst, ptrdiff_t stride,
391 const uint8_t *above,
392 const uint8_t *left) {
393 const __m128i a = _mm_load_si128((const __m128i *)above);
394 const __m128i b = _mm_load_si128((const __m128i *)(above + 16));
395 const __m128i zero = _mm_setzero_si128();
396 const __m128i al = _mm_unpacklo_epi8(a, zero);
397 const __m128i ah = _mm_unpackhi_epi8(a, zero);
398 const __m128i bl = _mm_unpacklo_epi8(b, zero);
399 const __m128i bh = _mm_unpackhi_epi8(b, zero);
400
401 const __m128i tl16 = _mm_set1_epi16((int16_t)above[-1]);
402 __m128i rep = _mm_set1_epi16((short)0x8000);
403 const __m128i one = _mm_set1_epi16(1);
404 __m128i l = _mm_load_si128((const __m128i *)left);
405 __m128i l16;
406
407 int i;
408 for (i = 0; i < 16; ++i) {
409 l16 = _mm_shuffle_epi8(l, rep);
410 const __m128i r32l = paeth_16x1_pred(&l16, &al, &ah, &tl16);
411 const __m128i r32h = paeth_16x1_pred(&l16, &bl, &bh, &tl16);
412
413 _mm_store_si128((__m128i *)dst, r32l);
414 _mm_store_si128((__m128i *)(dst + 16), r32h);
415 dst += stride;
416 rep = _mm_add_epi16(rep, one);
417 }
418
419 rep = _mm_set1_epi16((short)0x8000);
420 l = _mm_load_si128((const __m128i *)(left + 16));
421 for (i = 0; i < 16; ++i) {
422 l16 = _mm_shuffle_epi8(l, rep);
423 const __m128i r32l = paeth_16x1_pred(&l16, &al, &ah, &tl16);
424 const __m128i r32h = paeth_16x1_pred(&l16, &bl, &bh, &tl16);
425
426 _mm_store_si128((__m128i *)dst, r32l);
427 _mm_store_si128((__m128i *)(dst + 16), r32h);
428 dst += stride;
429 rep = _mm_add_epi16(rep, one);
430 }
431 }
432
433 void aom_paeth_predictor_32x64_ssse3(uint8_t *dst, ptrdiff_t stride,
434 const uint8_t *above,
435 const uint8_t *left) {
436 const __m128i a = _mm_load_si128((const __m128i *)above);
437 const __m128i b = _mm_load_si128((const __m128i *)(above + 16));
438 const __m128i zero = _mm_setzero_si128();
439 const __m128i al = _mm_unpacklo_epi8(a, zero);
440 const __m128i ah = _mm_unpackhi_epi8(a, zero);
441 const __m128i bl = _mm_unpacklo_epi8(b, zero);
442 const __m128i bh = _mm_unpackhi_epi8(b, zero);
443
444 const __m128i tl16 = _mm_set1_epi16((int16_t)above[-1]);
445 const __m128i one = _mm_set1_epi16(1);
446 __m128i l16;
447
448 int i, j;
449 for (j = 0; j < 4; ++j) {
450 const __m128i l = _mm_load_si128((const __m128i *)(left + j * 16));
451 __m128i rep = _mm_set1_epi16((short)0x8000);
452 for (i = 0; i < 16; ++i) {
453 l16 = _mm_shuffle_epi8(l, rep);
454 const __m128i r32l = paeth_16x1_pred(&l16, &al, &ah, &tl16);
455 const __m128i r32h = paeth_16x1_pred(&l16, &bl, &bh, &tl16);
456
457 _mm_store_si128((__m128i *)dst, r32l);
458 _mm_store_si128((__m128i *)(dst + 16), r32h);
459 dst += stride;
460 rep = _mm_add_epi16(rep, one);
461 }
462 }
463 }
464
465 void aom_paeth_predictor_64x32_ssse3(uint8_t *dst, ptrdiff_t stride,
466 const uint8_t *above,
467 const uint8_t *left) {
468 const __m128i a = _mm_load_si128((const __m128i *)above);
469 const __m128i b = _mm_load_si128((const __m128i *)(above + 16));
470 const __m128i c = _mm_load_si128((const __m128i *)(above + 32));
471 const __m128i d = _mm_load_si128((const __m128i *)(above + 48));
472 const __m128i zero = _mm_setzero_si128();
473 const __m128i al = _mm_unpacklo_epi8(a, zero);
474 const __m128i ah = _mm_unpackhi_epi8(a, zero);
475 const __m128i bl = _mm_unpacklo_epi8(b, zero);
476 const __m128i bh = _mm_unpackhi_epi8(b, zero);
477 const __m128i cl = _mm_unpacklo_epi8(c, zero);
478 const __m128i ch = _mm_unpackhi_epi8(c, zero);
479 const __m128i dl = _mm_unpacklo_epi8(d, zero);
480 const __m128i dh = _mm_unpackhi_epi8(d, zero);
481
482 const __m128i tl16 = _mm_set1_epi16((int16_t)above[-1]);
483 const __m128i one = _mm_set1_epi16(1);
484 __m128i l16;
485
486 int i, j;
487 for (j = 0; j < 2; ++j) {
488 const __m128i l = _mm_load_si128((const __m128i *)(left + j * 16));
489 __m128i rep = _mm_set1_epi16((short)0x8000);
490 for (i = 0; i < 16; ++i) {
491 l16 = _mm_shuffle_epi8(l, rep);
492 const __m128i r0 = paeth_16x1_pred(&l16, &al, &ah, &tl16);
493 const __m128i r1 = paeth_16x1_pred(&l16, &bl, &bh, &tl16);
494 const __m128i r2 = paeth_16x1_pred(&l16, &cl, &ch, &tl16);
495 const __m128i r3 = paeth_16x1_pred(&l16, &dl, &dh, &tl16);
496
497 _mm_store_si128((__m128i *)dst, r0);
498 _mm_store_si128((__m128i *)(dst + 16), r1);
499 _mm_store_si128((__m128i *)(dst + 32), r2);
500 _mm_store_si128((__m128i *)(dst + 48), r3);
501 dst += stride;
502 rep = _mm_add_epi16(rep, one);
503 }
504 }
505 }
506
507 void aom_paeth_predictor_64x64_ssse3(uint8_t *dst, ptrdiff_t stride,
508 const uint8_t *above,
509 const uint8_t *left) {
510 const __m128i a = _mm_load_si128((const __m128i *)above);
511 const __m128i b = _mm_load_si128((const __m128i *)(above + 16));
512 const __m128i c = _mm_load_si128((const __m128i *)(above + 32));
513 const __m128i d = _mm_load_si128((const __m128i *)(above + 48));
514 const __m128i zero = _mm_setzero_si128();
515 const __m128i al = _mm_unpacklo_epi8(a, zero);
516 const __m128i ah = _mm_unpackhi_epi8(a, zero);
517 const __m128i bl = _mm_unpacklo_epi8(b, zero);
518 const __m128i bh = _mm_unpackhi_epi8(b, zero);
519 const __m128i cl = _mm_unpacklo_epi8(c, zero);
520 const __m128i ch = _mm_unpackhi_epi8(c, zero);
521 const __m128i dl = _mm_unpacklo_epi8(d, zero);
522 const __m128i dh = _mm_unpackhi_epi8(d, zero);
523
524 const __m128i tl16 = _mm_set1_epi16((int16_t)above[-1]);
525 const __m128i one = _mm_set1_epi16(1);
526 __m128i l16;
527
528 int i, j;
529 for (j = 0; j < 4; ++j) {
530 const __m128i l = _mm_load_si128((const __m128i *)(left + j * 16));
531 __m128i rep = _mm_set1_epi16((short)0x8000);
532 for (i = 0; i < 16; ++i) {
533 l16 = _mm_shuffle_epi8(l, rep);
534 const __m128i r0 = paeth_16x1_pred(&l16, &al, &ah, &tl16);
535 const __m128i r1 = paeth_16x1_pred(&l16, &bl, &bh, &tl16);
536 const __m128i r2 = paeth_16x1_pred(&l16, &cl, &ch, &tl16);
537 const __m128i r3 = paeth_16x1_pred(&l16, &dl, &dh, &tl16);
538
539 _mm_store_si128((__m128i *)dst, r0);
540 _mm_store_si128((__m128i *)(dst + 16), r1);
541 _mm_store_si128((__m128i *)(dst + 32), r2);
542 _mm_store_si128((__m128i *)(dst + 48), r3);
543 dst += stride;
544 rep = _mm_add_epi16(rep, one);
545 }
546 }
547 }
548
549 #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
550 void aom_paeth_predictor_64x16_ssse3(uint8_t *dst, ptrdiff_t stride,
551 const uint8_t *above,
552 const uint8_t *left) {
553 const __m128i a = _mm_load_si128((const __m128i *)above);
554 const __m128i b = _mm_load_si128((const __m128i *)(above + 16));
555 const __m128i c = _mm_load_si128((const __m128i *)(above + 32));
556 const __m128i d = _mm_load_si128((const __m128i *)(above + 48));
557 const __m128i zero = _mm_setzero_si128();
558 const __m128i al = _mm_unpacklo_epi8(a, zero);
559 const __m128i ah = _mm_unpackhi_epi8(a, zero);
560 const __m128i bl = _mm_unpacklo_epi8(b, zero);
561 const __m128i bh = _mm_unpackhi_epi8(b, zero);
562 const __m128i cl = _mm_unpacklo_epi8(c, zero);
563 const __m128i ch = _mm_unpackhi_epi8(c, zero);
564 const __m128i dl = _mm_unpacklo_epi8(d, zero);
565 const __m128i dh = _mm_unpackhi_epi8(d, zero);
566
567 const __m128i tl16 = _mm_set1_epi16((int16_t)above[-1]);
568 const __m128i one = _mm_set1_epi16(1);
569 __m128i l16;
570
571 int i;
572 const __m128i l = _mm_load_si128((const __m128i *)left);
573 __m128i rep = _mm_set1_epi16((short)0x8000);
574 for (i = 0; i < 16; ++i) {
575 l16 = _mm_shuffle_epi8(l, rep);
576 const __m128i r0 = paeth_16x1_pred(&l16, &al, &ah, &tl16);
577 const __m128i r1 = paeth_16x1_pred(&l16, &bl, &bh, &tl16);
578 const __m128i r2 = paeth_16x1_pred(&l16, &cl, &ch, &tl16);
579 const __m128i r3 = paeth_16x1_pred(&l16, &dl, &dh, &tl16);
580
581 _mm_store_si128((__m128i *)dst, r0);
582 _mm_store_si128((__m128i *)(dst + 16), r1);
583 _mm_store_si128((__m128i *)(dst + 32), r2);
584 _mm_store_si128((__m128i *)(dst + 48), r3);
585 dst += stride;
586 rep = _mm_add_epi16(rep, one);
587 }
588 }
589 #endif // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
590
591 // -----------------------------------------------------------------------------
592 // SMOOTH_PRED
593
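// The SMOOTH predictor blends four reference pixels per output:
//   pred(x, y) = (w[y] * top[x] + (256 - w[y]) * bottom_left +
//                 w[x] * left[y] + (256 - w[x]) * top_right + 256) >> 9
// with weights from smooth_weights and 256 == 1 << SMOOTH_WEIGHT_LOG2_SCALE.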
594 // pixels[0]: above and below_pred interleave vector
595 // pixels[1]: left vector
596 // pixels[2]: right_pred vector
597 static inline void load_pixel_w4(const uint8_t *above, const uint8_t *left,
598 int height, __m128i *pixels) {
599 __m128i d = _mm_cvtsi32_si128(((const int *)above)[0]);
600 if (height == 4)
601 pixels[1] = _mm_cvtsi32_si128(((const int *)left)[0]);
602 else if (height == 8)
603 pixels[1] = _mm_loadl_epi64(((const __m128i *)left));
604 else
605 pixels[1] = _mm_loadu_si128(((const __m128i *)left));
606
607 pixels[2] = _mm_set1_epi16((int16_t)above[3]);
608
609 const __m128i bp = _mm_set1_epi16((int16_t)left[height - 1]);
610 const __m128i zero = _mm_setzero_si128();
611 d = _mm_unpacklo_epi8(d, zero);
612 pixels[0] = _mm_unpacklo_epi16(d, bp);
613 }
614
615 // weight_h[0]: weight_h vector
616 // weight_h[1]: scale - weight_h vector
617 // weight_h[2]: same as [0], second half for height = 16 only
618 // weight_h[3]: same as [1], second half for height = 16 only
619 // weight_w[0]: weights_w and scale - weights_w interleave vector
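// Note: the offsets into smooth_weights used here and below (0, 4, 12, 28)
// assume the table concatenates the 4-, 8-, 16- and 32-entry weight rows back
// to back, as defined in intrapred_common.h.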
620 static inline void load_weight_w4(int height, __m128i *weight_h,
621 __m128i *weight_w) {
622 const __m128i zero = _mm_setzero_si128();
623 const __m128i d = _mm_set1_epi16((int16_t)(1 << SMOOTH_WEIGHT_LOG2_SCALE));
624 const __m128i t = _mm_cvtsi32_si128(((const int *)smooth_weights)[0]);
625 weight_h[0] = _mm_unpacklo_epi8(t, zero);
626 weight_h[1] = _mm_sub_epi16(d, weight_h[0]);
627 weight_w[0] = _mm_unpacklo_epi16(weight_h[0], weight_h[1]);
628
629 if (height == 8) {
630 const __m128i weight = _mm_loadl_epi64((const __m128i *)&smooth_weights[4]);
631 weight_h[0] = _mm_unpacklo_epi8(weight, zero);
632 weight_h[1] = _mm_sub_epi16(d, weight_h[0]);
633 } else if (height == 16) {
634 const __m128i weight =
635 _mm_loadu_si128((const __m128i *)&smooth_weights[12]);
636 weight_h[0] = _mm_unpacklo_epi8(weight, zero);
637 weight_h[1] = _mm_sub_epi16(d, weight_h[0]);
638 weight_h[2] = _mm_unpackhi_epi8(weight, zero);
639 weight_h[3] = _mm_sub_epi16(d, weight_h[2]);
640 }
641 }
642
643 static inline void smooth_pred_4xh(const __m128i *pixel, const __m128i *wh,
644 const __m128i *ww, int h, uint8_t *dst,
645 ptrdiff_t stride, int second_half) {
646 const __m128i round = _mm_set1_epi32((1 << SMOOTH_WEIGHT_LOG2_SCALE));
647 const __m128i one = _mm_set1_epi16(1);
648 const __m128i inc = _mm_set1_epi16(0x202);
649 const __m128i gat = _mm_set1_epi32(0xc080400);
650 __m128i rep = second_half ? _mm_set1_epi16((short)0x8008)
651 : _mm_set1_epi16((short)0x8000);
652 __m128i d = _mm_set1_epi16(0x100);
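  // |d| is a pshufb control that broadcasts the i-th 16-bit height weight and
  // advances by two bytes per row; |rep| broadcasts left[i] (left[i + 8] for
  // the second half); |gat| gathers the low byte of each 32-bit sum into the
  // four output bytes.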
653
654 for (int i = 0; i < h; ++i) {
655 const __m128i wg_wg = _mm_shuffle_epi8(wh[0], d);
656 const __m128i sc_sc = _mm_shuffle_epi8(wh[1], d);
657 const __m128i wh_sc = _mm_unpacklo_epi16(wg_wg, sc_sc);
658 __m128i s = _mm_madd_epi16(pixel[0], wh_sc);
659
660 __m128i b = _mm_shuffle_epi8(pixel[1], rep);
661 b = _mm_unpacklo_epi16(b, pixel[2]);
662 __m128i sum = _mm_madd_epi16(b, ww[0]);
663
664 sum = _mm_add_epi32(s, sum);
665 sum = _mm_add_epi32(sum, round);
666 sum = _mm_srai_epi32(sum, 1 + SMOOTH_WEIGHT_LOG2_SCALE);
667
668 sum = _mm_shuffle_epi8(sum, gat);
669 *(int *)dst = _mm_cvtsi128_si32(sum);
670 dst += stride;
671
672 rep = _mm_add_epi16(rep, one);
673 d = _mm_add_epi16(d, inc);
674 }
675 }
676
677 void aom_smooth_predictor_4x4_ssse3(uint8_t *dst, ptrdiff_t stride,
678 const uint8_t *above, const uint8_t *left) {
679 __m128i pixels[3];
680 load_pixel_w4(above, left, 4, pixels);
681
682 __m128i wh[4], ww[2];
683 load_weight_w4(4, wh, ww);
684
685 smooth_pred_4xh(pixels, wh, ww, 4, dst, stride, 0);
686 }
687
688 void aom_smooth_predictor_4x8_ssse3(uint8_t *dst, ptrdiff_t stride,
689 const uint8_t *above, const uint8_t *left) {
690 __m128i pixels[3];
691 load_pixel_w4(above, left, 8, pixels);
692
693 __m128i wh[4], ww[2];
694 load_weight_w4(8, wh, ww);
695
696 smooth_pred_4xh(pixels, wh, ww, 8, dst, stride, 0);
697 }
698
699 #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
700 void aom_smooth_predictor_4x16_ssse3(uint8_t *dst, ptrdiff_t stride,
701 const uint8_t *above,
702 const uint8_t *left) {
703 __m128i pixels[3];
704 load_pixel_w4(above, left, 16, pixels);
705
706 __m128i wh[4], ww[2];
707 load_weight_w4(16, wh, ww);
708
709 smooth_pred_4xh(pixels, wh, ww, 8, dst, stride, 0);
710 dst += stride << 3;
711 smooth_pred_4xh(pixels, &wh[2], ww, 8, dst, stride, 1);
712 }
713 #endif // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
714
715 // pixels[0]: above and below_pred interleave vector, first half
716 // pixels[1]: above and below_pred interleave vector, second half
717 // pixels[2]: left vector
718 // pixels[3]: right_pred vector
719 // pixels[4]: above and below_pred interleave vector, first half
720 // pixels[5]: above and below_pred interleave vector, second half
721 // pixels[6]: left vector + 16
722 // pixels[7]: right_pred vector
723 static inline void load_pixel_w8(const uint8_t *above, const uint8_t *left,
724 int height, __m128i *pixels) {
725 const __m128i zero = _mm_setzero_si128();
726 const __m128i bp = _mm_set1_epi16((int16_t)left[height - 1]);
727 __m128i d = _mm_loadl_epi64((const __m128i *)above);
728 d = _mm_unpacklo_epi8(d, zero);
729 pixels[0] = _mm_unpacklo_epi16(d, bp);
730 pixels[1] = _mm_unpackhi_epi16(d, bp);
731
732 pixels[3] = _mm_set1_epi16((int16_t)above[7]);
733
734 if (height == 4) {
735 pixels[2] = _mm_cvtsi32_si128(((const int *)left)[0]);
736 } else if (height == 8) {
737 pixels[2] = _mm_loadl_epi64((const __m128i *)left);
738 } else if (height == 16) {
739 pixels[2] = _mm_load_si128((const __m128i *)left);
740 } else {
741 pixels[2] = _mm_load_si128((const __m128i *)left);
742 pixels[4] = pixels[0];
743 pixels[5] = pixels[1];
744 pixels[6] = _mm_load_si128((const __m128i *)(left + 16));
745 pixels[7] = pixels[3];
746 }
747 }
748
749 // weight_h[0]: weight_h vector
750 // weight_h[1]: scale - weight_h vector
751 // weight_h[2]: same as [0], offset 8
752 // weight_h[3]: same as [1], offset 8
753 // weight_h[4]: same as [0], offset 16
754 // weight_h[5]: same as [1], offset 16
755 // weight_h[6]: same as [0], offset 24
756 // weight_h[7]: same as [1], offset 24
757 // weight_w[0]: weights_w and scale - weights_w interleave vector, first half
758 // weight_w[1]: weights_w and scale - weights_w interleave vector, second half
759 static inline void load_weight_w8(int height, __m128i *weight_h,
760 __m128i *weight_w) {
761 const __m128i zero = _mm_setzero_si128();
762 const int we_offset = height < 8 ? 0 : 4;
763 __m128i we = _mm_loadu_si128((const __m128i *)&smooth_weights[we_offset]);
764 weight_h[0] = _mm_unpacklo_epi8(we, zero);
765 const __m128i d = _mm_set1_epi16((int16_t)(1 << SMOOTH_WEIGHT_LOG2_SCALE));
766 weight_h[1] = _mm_sub_epi16(d, weight_h[0]);
767
768 if (height == 4) {
769 we = _mm_srli_si128(we, 4);
770 __m128i tmp1 = _mm_unpacklo_epi8(we, zero);
771 __m128i tmp2 = _mm_sub_epi16(d, tmp1);
772 weight_w[0] = _mm_unpacklo_epi16(tmp1, tmp2);
773 weight_w[1] = _mm_unpackhi_epi16(tmp1, tmp2);
774 } else {
775 weight_w[0] = _mm_unpacklo_epi16(weight_h[0], weight_h[1]);
776 weight_w[1] = _mm_unpackhi_epi16(weight_h[0], weight_h[1]);
777 }
778
779 if (height == 16) {
780 we = _mm_loadu_si128((const __m128i *)&smooth_weights[12]);
781 weight_h[0] = _mm_unpacklo_epi8(we, zero);
782 weight_h[1] = _mm_sub_epi16(d, weight_h[0]);
783 weight_h[2] = _mm_unpackhi_epi8(we, zero);
784 weight_h[3] = _mm_sub_epi16(d, weight_h[2]);
785 } else if (height == 32) {
786 const __m128i weight_lo =
787 _mm_loadu_si128((const __m128i *)&smooth_weights[28]);
788 weight_h[0] = _mm_unpacklo_epi8(weight_lo, zero);
789 weight_h[1] = _mm_sub_epi16(d, weight_h[0]);
790 weight_h[2] = _mm_unpackhi_epi8(weight_lo, zero);
791 weight_h[3] = _mm_sub_epi16(d, weight_h[2]);
792 const __m128i weight_hi =
793 _mm_loadu_si128((const __m128i *)&smooth_weights[28 + 16]);
794 weight_h[4] = _mm_unpacklo_epi8(weight_hi, zero);
795 weight_h[5] = _mm_sub_epi16(d, weight_h[4]);
796 weight_h[6] = _mm_unpackhi_epi8(weight_hi, zero);
797 weight_h[7] = _mm_sub_epi16(d, weight_h[6]);
798 }
799 }
800
801 static inline void smooth_pred_8xh(const __m128i *pixels, const __m128i *wh,
802 const __m128i *ww, int h, uint8_t *dst,
803 ptrdiff_t stride, int second_half) {
804 const __m128i round = _mm_set1_epi32((1 << SMOOTH_WEIGHT_LOG2_SCALE));
805 const __m128i one = _mm_set1_epi16(1);
806 const __m128i inc = _mm_set1_epi16(0x202);
807 const __m128i gat = _mm_set_epi32(0, 0, 0xe0c0a08, 0x6040200);
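  // After packus_epi16 the eight row results sit in the even byte positions;
  // |gat| compacts them into the low 8 bytes for a single 64-bit store.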
808
809 __m128i rep = second_half ? _mm_set1_epi16((short)0x8008)
810 : _mm_set1_epi16((short)0x8000);
811 __m128i d = _mm_set1_epi16(0x100);
812
813 int i;
814 for (i = 0; i < h; ++i) {
815 const __m128i wg_wg = _mm_shuffle_epi8(wh[0], d);
816 const __m128i sc_sc = _mm_shuffle_epi8(wh[1], d);
817 const __m128i wh_sc = _mm_unpacklo_epi16(wg_wg, sc_sc);
818 __m128i s0 = _mm_madd_epi16(pixels[0], wh_sc);
819 __m128i s1 = _mm_madd_epi16(pixels[1], wh_sc);
820
821 __m128i b = _mm_shuffle_epi8(pixels[2], rep);
822 b = _mm_unpacklo_epi16(b, pixels[3]);
823 __m128i sum0 = _mm_madd_epi16(b, ww[0]);
824 __m128i sum1 = _mm_madd_epi16(b, ww[1]);
825
826 s0 = _mm_add_epi32(s0, sum0);
827 s0 = _mm_add_epi32(s0, round);
828 s0 = _mm_srai_epi32(s0, 1 + SMOOTH_WEIGHT_LOG2_SCALE);
829
830 s1 = _mm_add_epi32(s1, sum1);
831 s1 = _mm_add_epi32(s1, round);
832 s1 = _mm_srai_epi32(s1, 1 + SMOOTH_WEIGHT_LOG2_SCALE);
833
834 sum0 = _mm_packus_epi16(s0, s1);
835 sum0 = _mm_shuffle_epi8(sum0, gat);
836 _mm_storel_epi64((__m128i *)dst, sum0);
837 dst += stride;
838
839 rep = _mm_add_epi16(rep, one);
840 d = _mm_add_epi16(d, inc);
841 }
842 }
843
844 void aom_smooth_predictor_8x4_ssse3(uint8_t *dst, ptrdiff_t stride,
845 const uint8_t *above, const uint8_t *left) {
846 __m128i pixels[4];
847 load_pixel_w8(above, left, 4, pixels);
848
849 __m128i wh[4], ww[2];
850 load_weight_w8(4, wh, ww);
851
852 smooth_pred_8xh(pixels, wh, ww, 4, dst, stride, 0);
853 }
854
855 void aom_smooth_predictor_8x8_ssse3(uint8_t *dst, ptrdiff_t stride,
856 const uint8_t *above, const uint8_t *left) {
857 __m128i pixels[4];
858 load_pixel_w8(above, left, 8, pixels);
859
860 __m128i wh[4], ww[2];
861 load_weight_w8(8, wh, ww);
862
863 smooth_pred_8xh(pixels, wh, ww, 8, dst, stride, 0);
864 }
865
866 void aom_smooth_predictor_8x16_ssse3(uint8_t *dst, ptrdiff_t stride,
867 const uint8_t *above,
868 const uint8_t *left) {
869 __m128i pixels[4];
870 load_pixel_w8(above, left, 16, pixels);
871
872 __m128i wh[4], ww[2];
873 load_weight_w8(16, wh, ww);
874
875 smooth_pred_8xh(pixels, wh, ww, 8, dst, stride, 0);
876 dst += stride << 3;
877 smooth_pred_8xh(pixels, &wh[2], ww, 8, dst, stride, 1);
878 }
879
880 #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
881 void aom_smooth_predictor_8x32_ssse3(uint8_t *dst, ptrdiff_t stride,
882 const uint8_t *above,
883 const uint8_t *left) {
884 __m128i pixels[8];
885 load_pixel_w8(above, left, 32, pixels);
886
887 __m128i wh[8], ww[2];
888 load_weight_w8(32, wh, ww);
889
890 smooth_pred_8xh(&pixels[0], wh, ww, 8, dst, stride, 0);
891 dst += stride << 3;
892 smooth_pred_8xh(&pixels[0], &wh[2], ww, 8, dst, stride, 1);
893 dst += stride << 3;
894 smooth_pred_8xh(&pixels[4], &wh[4], ww, 8, dst, stride, 0);
895 dst += stride << 3;
896 smooth_pred_8xh(&pixels[4], &wh[6], ww, 8, dst, stride, 1);
897 }
898 #endif // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
899
900 // TODO(slavarnway): Visual Studio only supports restrict when /std:c11
901 // (available in 2019+) or greater is specified; __restrict can be used in that
902 // case. This should be moved to rtcd and used consistently between the
903 // function declarations and definitions to avoid warnings in Visual Studio
904 // when defining LIBAOM_RESTRICT to restrict or __restrict.
905 #if defined(_MSC_VER)
906 #define LIBAOM_RESTRICT
907 #else
908 #define LIBAOM_RESTRICT restrict
909 #endif
910
911 static AOM_FORCE_INLINE __m128i Load4(const void *src) {
912 // With new compilers such as clang 8.0.0 we can use the new _mm_loadu_si32
913 // intrinsic. Both _mm_loadu_si32(src) and the code here are compiled into a
914 // movss instruction.
915 //
916 // Until compiler support of _mm_loadu_si32 is widespread, use of
917 // _mm_loadu_si32 is banned.
918 int val;
919 memcpy(&val, src, sizeof(val));
920 return _mm_cvtsi32_si128(val);
921 }
922
923 static AOM_FORCE_INLINE __m128i LoadLo8(const void *a) {
924 return _mm_loadl_epi64((const __m128i *)(a));
925 }
926
927 static AOM_FORCE_INLINE __m128i LoadUnaligned16(const void *a) {
928 return _mm_loadu_si128((const __m128i *)(a));
929 }
930
931 static AOM_FORCE_INLINE void Store4(void *dst, const __m128i x) {
932 const int val = _mm_cvtsi128_si32(x);
933 memcpy(dst, &val, sizeof(val));
934 }
935
936 static AOM_FORCE_INLINE void StoreLo8(void *a, const __m128i v) {
937 _mm_storel_epi64((__m128i *)(a), v);
938 }
939
940 static AOM_FORCE_INLINE void StoreUnaligned16(void *a, const __m128i v) {
941 _mm_storeu_si128((__m128i *)(a), v);
942 }
943
944 static AOM_FORCE_INLINE __m128i cvtepu8_epi16(__m128i x) {
945 return _mm_unpacklo_epi8((x), _mm_setzero_si128());
946 }
947
948 static AOM_FORCE_INLINE __m128i cvtepu8_epi32(__m128i x) {
949 const __m128i tmp = _mm_unpacklo_epi8((x), _mm_setzero_si128());
950 return _mm_unpacklo_epi16(tmp, _mm_setzero_si128());
951 }
952
953 static AOM_FORCE_INLINE __m128i cvtepu16_epi32(__m128i x) {
954 return _mm_unpacklo_epi16((x), _mm_setzero_si128());
955 }
956
957 static void smooth_predictor_wxh(uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
958 const uint8_t *LIBAOM_RESTRICT top_row,
959 const uint8_t *LIBAOM_RESTRICT left_column,
960 int width, int height) {
961 const uint8_t *const sm_weights_h = smooth_weights + height - 4;
962 const uint8_t *const sm_weights_w = smooth_weights + width - 4;
963 const __m128i zero = _mm_setzero_si128();
964 const __m128i scale_value = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
965 const __m128i bottom_left = _mm_cvtsi32_si128(left_column[height - 1]);
966 const __m128i top_right = _mm_set1_epi16(top_row[width - 1]);
967 const __m128i round = _mm_set1_epi32(1 << SMOOTH_WEIGHT_LOG2_SCALE);
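  // Each row pairs w[y] with left[y] for the madd in the inner loop and folds
  // the rounding bias into (256 - w[y]) * bottom_left up front.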
968 for (int y = 0; y < height; ++y) {
969 const __m128i weights_y = _mm_cvtsi32_si128(sm_weights_h[y]);
970 const __m128i left_y = _mm_cvtsi32_si128(left_column[y]);
971 const __m128i scale_m_weights_y = _mm_sub_epi16(scale_value, weights_y);
972 __m128i scaled_bottom_left =
973 _mm_mullo_epi16(scale_m_weights_y, bottom_left);
974 const __m128i weight_left_y =
975 _mm_shuffle_epi32(_mm_unpacklo_epi16(weights_y, left_y), 0);
976 scaled_bottom_left = _mm_add_epi32(scaled_bottom_left, round);
977 scaled_bottom_left = _mm_shuffle_epi32(scaled_bottom_left, 0);
978 for (int x = 0; x < width; x += 8) {
979 const __m128i top_x = LoadLo8(top_row + x);
980 const __m128i weights_x = LoadLo8(sm_weights_w + x);
981 const __m128i top_weights_x = _mm_unpacklo_epi8(top_x, weights_x);
982 const __m128i top_weights_x_lo = cvtepu8_epi16(top_weights_x);
983 const __m128i top_weights_x_hi = _mm_unpackhi_epi8(top_weights_x, zero);
984
985 // Here opposite weights and pixels are multiplied, where the order of
986 // interleaving is indicated in the names.
987 __m128i pred_lo = _mm_madd_epi16(top_weights_x_lo, weight_left_y);
988 __m128i pred_hi = _mm_madd_epi16(top_weights_x_hi, weight_left_y);
989
990 // |scaled_bottom_left| is always scaled by the same weight each row, so
991 // we only derive |scaled_top_right| values here.
992 const __m128i inverted_weights_x =
993 _mm_sub_epi16(scale_value, cvtepu8_epi16(weights_x));
994 const __m128i scaled_top_right =
995 _mm_mullo_epi16(inverted_weights_x, top_right);
996 const __m128i scaled_top_right_lo = cvtepu16_epi32(scaled_top_right);
997 const __m128i scaled_top_right_hi =
998 _mm_unpackhi_epi16(scaled_top_right, zero);
999 pred_lo = _mm_add_epi32(pred_lo, scaled_bottom_left);
1000 pred_hi = _mm_add_epi32(pred_hi, scaled_bottom_left);
1001 pred_lo = _mm_add_epi32(pred_lo, scaled_top_right_lo);
1002 pred_hi = _mm_add_epi32(pred_hi, scaled_top_right_hi);
1003
1004 // The round value for RightShiftWithRounding was added with
1005 // |scaled_bottom_left|.
1006 pred_lo = _mm_srli_epi32(pred_lo, (1 + SMOOTH_WEIGHT_LOG2_SCALE));
1007 pred_hi = _mm_srli_epi32(pred_hi, (1 + SMOOTH_WEIGHT_LOG2_SCALE));
1008 const __m128i pred = _mm_packus_epi16(pred_lo, pred_hi);
1009 StoreLo8(dst + x, _mm_packus_epi16(pred, pred));
1010 }
1011 dst += stride;
1012 }
1013 }
1014
1015 #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
1016 void aom_smooth_predictor_16x4_ssse3(uint8_t *dst, ptrdiff_t stride,
1017 const uint8_t *above,
1018 const uint8_t *left) {
1019 smooth_predictor_wxh(dst, stride, above, left, 16, 4);
1020 }
1021 #endif // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
1022
1023 void aom_smooth_predictor_16x8_ssse3(uint8_t *dst, ptrdiff_t stride,
1024 const uint8_t *above,
1025 const uint8_t *left) {
1026 smooth_predictor_wxh(dst, stride, above, left, 16, 8);
1027 }
1028
1029 void aom_smooth_predictor_16x16_ssse3(uint8_t *dst, ptrdiff_t stride,
1030 const uint8_t *above,
1031 const uint8_t *left) {
1032 smooth_predictor_wxh(dst, stride, above, left, 16, 16);
1033 }
1034
1035 void aom_smooth_predictor_16x32_ssse3(uint8_t *dst, ptrdiff_t stride,
1036 const uint8_t *above,
1037 const uint8_t *left) {
1038 smooth_predictor_wxh(dst, stride, above, left, 16, 32);
1039 }
1040
1041 #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
1042 void aom_smooth_predictor_16x64_ssse3(uint8_t *dst, ptrdiff_t stride,
1043 const uint8_t *above,
1044 const uint8_t *left) {
1045 smooth_predictor_wxh(dst, stride, above, left, 16, 64);
1046 }
1047
1048 void aom_smooth_predictor_32x8_ssse3(uint8_t *dst, ptrdiff_t stride,
1049 const uint8_t *above,
1050 const uint8_t *left) {
1051 smooth_predictor_wxh(dst, stride, above, left, 32, 8);
1052 }
1053 #endif // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
1054
1055 void aom_smooth_predictor_32x16_ssse3(uint8_t *dst, ptrdiff_t stride,
1056 const uint8_t *above,
1057 const uint8_t *left) {
1058 smooth_predictor_wxh(dst, stride, above, left, 32, 16);
1059 }
1060
1061 void aom_smooth_predictor_32x32_ssse3(uint8_t *dst, ptrdiff_t stride,
1062 const uint8_t *above,
1063 const uint8_t *left) {
1064 smooth_predictor_wxh(dst, stride, above, left, 32, 32);
1065 }
1066
1067 void aom_smooth_predictor_32x64_ssse3(uint8_t *dst, ptrdiff_t stride,
1068 const uint8_t *above,
1069 const uint8_t *left) {
1070 smooth_predictor_wxh(dst, stride, above, left, 32, 64);
1071 }
1072
1073 #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
1074 void aom_smooth_predictor_64x16_ssse3(uint8_t *dst, ptrdiff_t stride,
1075 const uint8_t *above,
1076 const uint8_t *left) {
1077 smooth_predictor_wxh(dst, stride, above, left, 64, 16);
1078 }
1079 #endif // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
1080
1081 void aom_smooth_predictor_64x32_ssse3(uint8_t *dst, ptrdiff_t stride,
1082 const uint8_t *above,
1083 const uint8_t *left) {
1084 smooth_predictor_wxh(dst, stride, above, left, 64, 32);
1085 }
1086
1087 void aom_smooth_predictor_64x64_ssse3(uint8_t *dst, ptrdiff_t stride,
1088 const uint8_t *above,
1089 const uint8_t *left) {
1090 smooth_predictor_wxh(dst, stride, above, left, 64, 64);
1091 }
1092
1093 // -----------------------------------------------------------------------------
1094 // Smooth horizontal/vertical helper functions.
1095
1096 // For Horizontal, pixels1 and pixels2 are the same repeated value. For
1097 // Vertical, weights1 and weights2 are the same, and scaled_corner1 and
1098 // scaled_corner2 are the same.
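// Each output is (pixel * weight + scaled_corner + round) >> 8. For SMOOTH_V
// the pixels are the top row, the weights are w[y] and the corner term is
// (256 - w[y]) * bottom_left; SMOOTH_H swaps in the left column and top_right.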
1099 static AOM_FORCE_INLINE void write_smooth_directional_sum16(
1100 uint8_t *LIBAOM_RESTRICT dst, const __m128i pixels1, const __m128i pixels2,
1101 const __m128i weights1, const __m128i weights2,
1102 const __m128i scaled_corner1, const __m128i scaled_corner2,
1103 const __m128i round) {
1104 const __m128i weighted_px1 = _mm_mullo_epi16(pixels1, weights1);
1105 const __m128i weighted_px2 = _mm_mullo_epi16(pixels2, weights2);
1106 const __m128i pred_sum1 = _mm_add_epi16(scaled_corner1, weighted_px1);
1107 const __m128i pred_sum2 = _mm_add_epi16(scaled_corner2, weighted_px2);
1108 // Equivalent to RightShiftWithRounding(pred[x][y], 8).
1109 const __m128i pred1 = _mm_srli_epi16(_mm_add_epi16(pred_sum1, round), 8);
1110 const __m128i pred2 = _mm_srli_epi16(_mm_add_epi16(pred_sum2, round), 8);
1111 StoreUnaligned16(dst, _mm_packus_epi16(pred1, pred2));
1112 }
1113
1114 static AOM_FORCE_INLINE __m128i smooth_directional_sum8(
1115 const __m128i pixels, const __m128i weights, const __m128i scaled_corner) {
1116 const __m128i weighted_px = _mm_mullo_epi16(pixels, weights);
1117 return _mm_add_epi16(scaled_corner, weighted_px);
1118 }
1119
1120 static AOM_FORCE_INLINE void write_smooth_directional_sum8(
1121 uint8_t *LIBAOM_RESTRICT dst, const __m128i *pixels, const __m128i *weights,
1122 const __m128i *scaled_corner, const __m128i *round) {
1123 const __m128i pred_sum =
1124 smooth_directional_sum8(*pixels, *weights, *scaled_corner);
1125 // Equivalent to RightShiftWithRounding(pred[x][y], 8).
1126 const __m128i pred = _mm_srli_epi16(_mm_add_epi16(pred_sum, *round), 8);
1127 StoreLo8(dst, _mm_packus_epi16(pred, pred));
1128 }
1129
1130 // -----------------------------------------------------------------------------
1131 // SMOOTH_V_PRED
1132
1133 static AOM_FORCE_INLINE void load_smooth_vertical_pixels4(
1134 const uint8_t *LIBAOM_RESTRICT above, const uint8_t *LIBAOM_RESTRICT left,
1135 const int height, __m128i *pixels) {
1136 __m128i top = Load4(above);
1137 const __m128i bottom_left = _mm_set1_epi16(left[height - 1]);
1138 top = cvtepu8_epi16(top);
1139 pixels[0] = _mm_unpacklo_epi16(top, bottom_left);
1140 }
1141
1142 // |weight_array| alternates weight vectors from the table with their inverted
1143 // (256-w) counterparts. This is precomputed by the compiler when the weights
1144 // table is visible to this module. Removing this visibility can cut speed by up
1145 // to half in both 4xH and 8xH transforms.
1146 static AOM_FORCE_INLINE void load_smooth_vertical_weights4(
1147 const uint8_t *LIBAOM_RESTRICT weight_array, const int height,
1148 __m128i *weights) {
1149 const __m128i inverter = _mm_set1_epi16(256);
1150
1151 if (height == 4) {
1152 const __m128i weight = Load4(weight_array);
1153 weights[0] = cvtepu8_epi16(weight);
1154 weights[1] = _mm_sub_epi16(inverter, weights[0]);
1155 } else if (height == 8) {
1156 const __m128i weight = LoadLo8(weight_array + 4);
1157 weights[0] = cvtepu8_epi16(weight);
1158 weights[1] = _mm_sub_epi16(inverter, weights[0]);
1159 } else {
1160 const __m128i weight = LoadUnaligned16(weight_array + 12);
1161 const __m128i zero = _mm_setzero_si128();
1162 weights[0] = cvtepu8_epi16(weight);
1163 weights[1] = _mm_sub_epi16(inverter, weights[0]);
1164 weights[2] = _mm_unpackhi_epi8(weight, zero);
1165 weights[3] = _mm_sub_epi16(inverter, weights[2]);
1166 }
1167 }
1168
1169 static AOM_FORCE_INLINE void write_smooth_vertical4xh(
1170 const __m128i *pixel, const __m128i *weight, const int height,
1171 uint8_t *LIBAOM_RESTRICT dst, const ptrdiff_t stride) {
1172 const __m128i pred_round = _mm_set1_epi32(128);
1173 const __m128i mask_increment = _mm_set1_epi16(0x0202);
1174 const __m128i cvtepu8_epi32 = _mm_set1_epi32(0xC080400);
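    // Despite its name this is a pshufb mask, not a conversion: it gathers the
    // low byte of each 32-bit sum (bytes 0, 4, 8, 12) into the 4 output bytes.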
1175 __m128i y_select = _mm_set1_epi16(0x0100);
1176
1177 for (int y = 0; y < height; ++y) {
1178 const __m128i weight_y = _mm_shuffle_epi8(weight[0], y_select);
1179 const __m128i inverted_weight_y = _mm_shuffle_epi8(weight[1], y_select);
1180 const __m128i alternate_weights =
1181 _mm_unpacklo_epi16(weight_y, inverted_weight_y);
1182 // Here the pixel vector is top_row[0], corner, top_row[1], corner, ...
1183 // The madd instruction yields four results of the form:
1184 // (top_row[x] * weight[y] + corner * inverted_weight[y])
1185 __m128i sum = _mm_madd_epi16(pixel[0], alternate_weights);
1186 sum = _mm_add_epi32(sum, pred_round);
1187 sum = _mm_srai_epi32(sum, 8);
1188 sum = _mm_shuffle_epi8(sum, cvtepu8_epi32);
1189 Store4(dst, sum);
1190 dst += stride;
1191 y_select = _mm_add_epi16(y_select, mask_increment);
1192 }
1193 }
1194
1195 void aom_smooth_v_predictor_4x4_ssse3(
1196 uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
1197 const uint8_t *LIBAOM_RESTRICT top_row,
1198 const uint8_t *LIBAOM_RESTRICT left_column) {
1199 __m128i pixels;
1200 load_smooth_vertical_pixels4(top_row, left_column, 4, &pixels);
1201
1202 __m128i weights[2];
1203 load_smooth_vertical_weights4(smooth_weights, 4, weights);
1204
1205 write_smooth_vertical4xh(&pixels, weights, 4, dst, stride);
1206 }
1207
1208 void aom_smooth_v_predictor_4x8_ssse3(
1209 uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
1210 const uint8_t *LIBAOM_RESTRICT top_row,
1211 const uint8_t *LIBAOM_RESTRICT left_column) {
1212 __m128i pixels;
1213 load_smooth_vertical_pixels4(top_row, left_column, 8, &pixels);
1214
1215 __m128i weights[2];
1216 load_smooth_vertical_weights4(smooth_weights, 8, weights);
1217
1218 write_smooth_vertical4xh(&pixels, weights, 8, dst, stride);
1219 }
1220
1221 #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
1222 void aom_smooth_v_predictor_4x16_ssse3(
1223 uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
1224 const uint8_t *LIBAOM_RESTRICT top_row,
1225 const uint8_t *LIBAOM_RESTRICT left_column) {
1226 __m128i pixels;
1227 load_smooth_vertical_pixels4(top_row, left_column, 16, &pixels);
1228
1229 __m128i weights[4];
1230 load_smooth_vertical_weights4(smooth_weights, 16, weights);
1231
1232 write_smooth_vertical4xh(&pixels, weights, 8, dst, stride);
1233 dst += stride << 3;
1234 write_smooth_vertical4xh(&pixels, &weights[2], 8, dst, stride);
1235 }
1236 #endif // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
1237
1238 void aom_smooth_v_predictor_8x4_ssse3(
1239 uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
1240 const uint8_t *LIBAOM_RESTRICT top_row,
1241 const uint8_t *LIBAOM_RESTRICT left_column) {
1242 const __m128i bottom_left = _mm_set1_epi16(left_column[3]);
1243 const __m128i weights = cvtepu8_epi16(Load4(smooth_weights));
1244 const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
1245 const __m128i inverted_weights = _mm_sub_epi16(scale, weights);
1246 const __m128i scaled_bottom_left =
1247 _mm_mullo_epi16(inverted_weights, bottom_left);
1248 const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1));
1249 __m128i y_select = _mm_set1_epi32(0x01000100);
1250 const __m128i top = cvtepu8_epi16(LoadLo8(top_row));
1251 __m128i weights_y = _mm_shuffle_epi8(weights, y_select);
1252 __m128i scaled_bottom_left_y = _mm_shuffle_epi8(scaled_bottom_left, y_select);
1253 write_smooth_directional_sum8(dst, &top, &weights_y, &scaled_bottom_left_y,
1254 &round);
1255 dst += stride;
1256 y_select = _mm_set1_epi32(0x03020302);
1257 weights_y = _mm_shuffle_epi8(weights, y_select);
1258 scaled_bottom_left_y = _mm_shuffle_epi8(scaled_bottom_left, y_select);
1259 write_smooth_directional_sum8(dst, &top, &weights_y, &scaled_bottom_left_y,
1260 &round);
1261 dst += stride;
1262 y_select = _mm_set1_epi32(0x05040504);
1263 weights_y = _mm_shuffle_epi8(weights, y_select);
1264 scaled_bottom_left_y = _mm_shuffle_epi8(scaled_bottom_left, y_select);
1265 write_smooth_directional_sum8(dst, &top, &weights_y, &scaled_bottom_left_y,
1266 &round);
1267 dst += stride;
1268 y_select = _mm_set1_epi32(0x07060706);
1269 weights_y = _mm_shuffle_epi8(weights, y_select);
1270 scaled_bottom_left_y = _mm_shuffle_epi8(scaled_bottom_left, y_select);
1271 write_smooth_directional_sum8(dst, &top, &weights_y, &scaled_bottom_left_y,
1272 &round);
1273 }
1274
1275 void aom_smooth_v_predictor_8x8_ssse3(
1276 uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
1277 const uint8_t *LIBAOM_RESTRICT top_row,
1278 const uint8_t *LIBAOM_RESTRICT left_column) {
1279 const __m128i bottom_left = _mm_set1_epi16(left_column[7]);
1280 const __m128i weights = cvtepu8_epi16(LoadLo8(smooth_weights + 4));
1281 const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
1282 const __m128i inverted_weights = _mm_sub_epi16(scale, weights);
1283 const __m128i scaled_bottom_left =
1284 _mm_mullo_epi16(inverted_weights, bottom_left);
1285 const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1));
1286 const __m128i top = cvtepu8_epi16(LoadLo8(top_row));
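// y_mask steps through the byte-pair indices (0,1), (2,3), ..., (14,15);
// broadcasting it with _mm_set1_epi32 and using it as a _mm_shuffle_epi8
// mask replicates the 16-bit weight (and the matching scaled bottom-left
// term) for the current row across all lanes, producing one blended 8-pixel
// row per iteration (8 rows total).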
1287 for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
1288 const __m128i y_select = _mm_set1_epi32(y_mask);
1289 const __m128i weights_y = _mm_shuffle_epi8(weights, y_select);
1290 const __m128i scaled_bottom_left_y =
1291 _mm_shuffle_epi8(scaled_bottom_left, y_select);
1292 write_smooth_directional_sum8(dst, &top, &weights_y, &scaled_bottom_left_y,
1293 &round);
1294 dst += stride;
1295 }
1296 }
1297
1298 void aom_smooth_v_predictor_8x16_ssse3(
1299 uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
1300 const uint8_t *LIBAOM_RESTRICT top_row,
1301 const uint8_t *LIBAOM_RESTRICT left_column) {
1302 const __m128i bottom_left = _mm_set1_epi16(left_column[15]);
1303 const __m128i weights = LoadUnaligned16(smooth_weights + 12);
1304
1305 const __m128i weights1 = cvtepu8_epi16(weights);
1306 const __m128i weights2 = cvtepu8_epi16(_mm_srli_si128(weights, 8));
1307 const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
1308 const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1);
1309 const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2);
1310 const __m128i scaled_bottom_left1 =
1311 _mm_mullo_epi16(inverted_weights1, bottom_left);
1312 const __m128i scaled_bottom_left2 =
1313 _mm_mullo_epi16(inverted_weights2, bottom_left);
1314 const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1));
1315 const __m128i top = cvtepu8_epi16(LoadLo8(top_row));
1316 for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
1317 const __m128i y_select = _mm_set1_epi32(y_mask);
1318 const __m128i weights_y = _mm_shuffle_epi8(weights1, y_select);
1319 const __m128i scaled_bottom_left_y =
1320 _mm_shuffle_epi8(scaled_bottom_left1, y_select);
1321 write_smooth_directional_sum8(dst, &top, &weights_y, &scaled_bottom_left_y,
1322 &round);
1323 dst += stride;
1324 }
1325 for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
1326 const __m128i y_select = _mm_set1_epi32(y_mask);
1327 const __m128i weights_y = _mm_shuffle_epi8(weights2, y_select);
1328 const __m128i scaled_bottom_left_y =
1329 _mm_shuffle_epi8(scaled_bottom_left2, y_select);
1330 write_smooth_directional_sum8(dst, &top, &weights_y, &scaled_bottom_left_y,
1331 &round);
1332 dst += stride;
1333 }
1334 }
1335
1336 #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
1337 void aom_smooth_v_predictor_8x32_ssse3(
1338 uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
1339 const uint8_t *LIBAOM_RESTRICT top_row,
1340 const uint8_t *LIBAOM_RESTRICT left_column) {
1341 const __m128i zero = _mm_setzero_si128();
1342 const __m128i bottom_left = _mm_set1_epi16(left_column[31]);
1343 const __m128i weights_lo = LoadUnaligned16(smooth_weights + 28);
1344 const __m128i weights_hi = LoadUnaligned16(smooth_weights + 44);
1345 const __m128i weights1 = cvtepu8_epi16(weights_lo);
1346 const __m128i weights2 = _mm_unpackhi_epi8(weights_lo, zero);
1347 const __m128i weights3 = cvtepu8_epi16(weights_hi);
1348 const __m128i weights4 = _mm_unpackhi_epi8(weights_hi, zero);
1349 const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
1350 const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1);
1351 const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2);
1352 const __m128i inverted_weights3 = _mm_sub_epi16(scale, weights3);
1353 const __m128i inverted_weights4 = _mm_sub_epi16(scale, weights4);
1354 const __m128i scaled_bottom_left1 =
1355 _mm_mullo_epi16(inverted_weights1, bottom_left);
1356 const __m128i scaled_bottom_left2 =
1357 _mm_mullo_epi16(inverted_weights2, bottom_left);
1358 const __m128i scaled_bottom_left3 =
1359 _mm_mullo_epi16(inverted_weights3, bottom_left);
1360 const __m128i scaled_bottom_left4 =
1361 _mm_mullo_epi16(inverted_weights4, bottom_left);
1362 const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1));
1363 const __m128i top = cvtepu8_epi16(LoadLo8(top_row));
1364 for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
1365 const __m128i y_select = _mm_set1_epi32(y_mask);
1366 const __m128i weights_y = _mm_shuffle_epi8(weights1, y_select);
1367 const __m128i scaled_bottom_left_y =
1368 _mm_shuffle_epi8(scaled_bottom_left1, y_select);
1369 write_smooth_directional_sum8(dst, &top, &weights_y, &scaled_bottom_left_y,
1370 &round);
1371 dst += stride;
1372 }
1373 for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
1374 const __m128i y_select = _mm_set1_epi32(y_mask);
1375 const __m128i weights_y = _mm_shuffle_epi8(weights2, y_select);
1376 const __m128i scaled_bottom_left_y =
1377 _mm_shuffle_epi8(scaled_bottom_left2, y_select);
1378 write_smooth_directional_sum8(dst, &top, &weights_y, &scaled_bottom_left_y,
1379 &round);
1380 dst += stride;
1381 }
1382 for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
1383 const __m128i y_select = _mm_set1_epi32(y_mask);
1384 const __m128i weights_y = _mm_shuffle_epi8(weights3, y_select);
1385 const __m128i scaled_bottom_left_y =
1386 _mm_shuffle_epi8(scaled_bottom_left3, y_select);
1387 write_smooth_directional_sum8(dst, &top, &weights_y, &scaled_bottom_left_y,
1388 &round);
1389 dst += stride;
1390 }
1391 for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
1392 const __m128i y_select = _mm_set1_epi32(y_mask);
1393 const __m128i weights_y = _mm_shuffle_epi8(weights4, y_select);
1394 const __m128i scaled_bottom_left_y =
1395 _mm_shuffle_epi8(scaled_bottom_left4, y_select);
1396 write_smooth_directional_sum8(dst, &top, &weights_y, &scaled_bottom_left_y,
1397 &round);
1398 dst += stride;
1399 }
1400 }
1401
1402 void aom_smooth_v_predictor_16x4_ssse3(
1403 uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
1404 const uint8_t *LIBAOM_RESTRICT top_row,
1405 const uint8_t *LIBAOM_RESTRICT left_column) {
1406 const __m128i bottom_left = _mm_set1_epi16(left_column[3]);
1407 const __m128i weights = cvtepu8_epi16(Load4(smooth_weights));
1408 const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
1409 const __m128i inverted_weights = _mm_sub_epi16(scale, weights);
1410 const __m128i scaled_bottom_left =
1411 _mm_mullo_epi16(inverted_weights, bottom_left);
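// 128 == 1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1), the same rounding bias written
// symbolically elsewhere in this file.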
1412 const __m128i round = _mm_set1_epi16(128);
1413 const __m128i top = LoadUnaligned16(top_row);
1414 const __m128i top_lo = cvtepu8_epi16(top);
1415 const __m128i top_hi = cvtepu8_epi16(_mm_srli_si128(top, 8));
1416
1417 __m128i y_select = _mm_set1_epi32(0x01000100);
1418 __m128i weights_y = _mm_shuffle_epi8(weights, y_select);
1419 __m128i scaled_bottom_left_y = _mm_shuffle_epi8(scaled_bottom_left, y_select);
1420 write_smooth_directional_sum16(dst, top_lo, top_hi, weights_y, weights_y,
1421 scaled_bottom_left_y, scaled_bottom_left_y,
1422 round);
1423 dst += stride;
1424 y_select = _mm_set1_epi32(0x03020302);
1425 weights_y = _mm_shuffle_epi8(weights, y_select);
1426 scaled_bottom_left_y = _mm_shuffle_epi8(scaled_bottom_left, y_select);
1427 write_smooth_directional_sum16(dst, top_lo, top_hi, weights_y, weights_y,
1428 scaled_bottom_left_y, scaled_bottom_left_y,
1429 round);
1430 dst += stride;
1431 y_select = _mm_set1_epi32(0x05040504);
1432 weights_y = _mm_shuffle_epi8(weights, y_select);
1433 scaled_bottom_left_y = _mm_shuffle_epi8(scaled_bottom_left, y_select);
1434 write_smooth_directional_sum16(dst, top_lo, top_hi, weights_y, weights_y,
1435 scaled_bottom_left_y, scaled_bottom_left_y,
1436 round);
1437 dst += stride;
1438 y_select = _mm_set1_epi32(0x07060706);
1439 weights_y = _mm_shuffle_epi8(weights, y_select);
1440 scaled_bottom_left_y = _mm_shuffle_epi8(scaled_bottom_left, y_select);
1441 write_smooth_directional_sum16(dst, top_lo, top_hi, weights_y, weights_y,
1442 scaled_bottom_left_y, scaled_bottom_left_y,
1443 round);
1444 }
1445 #endif // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
1446
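// For widths of 16 and above, the top row is expanded into 8-lane halves and
// each write_smooth_directional_sum16() call produces 16 output pixels;
// wider blocks repeat the call at dst + 16, + 32 and + 48.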
1447 void aom_smooth_v_predictor_16x8_ssse3(
1448 uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
1449 const uint8_t *LIBAOM_RESTRICT top_row,
1450 const uint8_t *LIBAOM_RESTRICT left_column) {
1451 const __m128i bottom_left = _mm_set1_epi16(left_column[7]);
1452 const __m128i weights = cvtepu8_epi16(LoadLo8(smooth_weights + 4));
1453 const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
1454 const __m128i inverted_weights = _mm_sub_epi16(scale, weights);
1455 const __m128i scaled_bottom_left =
1456 _mm_mullo_epi16(inverted_weights, bottom_left);
1457 const __m128i round = _mm_set1_epi16(128);
1458 const __m128i top = LoadUnaligned16(top_row);
1459 const __m128i top_lo = cvtepu8_epi16(top);
1460 const __m128i top_hi = cvtepu8_epi16(_mm_srli_si128(top, 8));
1461 for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
1462 const __m128i y_select = _mm_set1_epi32(y_mask);
1463 const __m128i weights_y = _mm_shuffle_epi8(weights, y_select);
1464 const __m128i scaled_bottom_left_y =
1465 _mm_shuffle_epi8(scaled_bottom_left, y_select);
1466 write_smooth_directional_sum16(dst, top_lo, top_hi, weights_y, weights_y,
1467 scaled_bottom_left_y, scaled_bottom_left_y,
1468 round);
1469 dst += stride;
1470 }
1471 }
1472
1473 void aom_smooth_v_predictor_16x16_ssse3(
1474 uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
1475 const uint8_t *LIBAOM_RESTRICT top_row,
1476 const uint8_t *LIBAOM_RESTRICT left_column) {
1477 const __m128i bottom_left = _mm_set1_epi16(left_column[15]);
1478 const __m128i zero = _mm_setzero_si128();
1479 const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
1480 const __m128i weights = LoadUnaligned16(smooth_weights + 12);
1481 const __m128i weights_lo = cvtepu8_epi16(weights);
1482 const __m128i weights_hi = _mm_unpackhi_epi8(weights, zero);
1483 const __m128i inverted_weights_lo = _mm_sub_epi16(scale, weights_lo);
1484 const __m128i inverted_weights_hi = _mm_sub_epi16(scale, weights_hi);
1485 const __m128i scaled_bottom_left_lo =
1486 _mm_mullo_epi16(inverted_weights_lo, bottom_left);
1487 const __m128i scaled_bottom_left_hi =
1488 _mm_mullo_epi16(inverted_weights_hi, bottom_left);
1489 const __m128i round = _mm_set1_epi16(128);
1490
1491 const __m128i top = LoadUnaligned16(top_row);
1492 const __m128i top_lo = cvtepu8_epi16(top);
1493 const __m128i top_hi = _mm_unpackhi_epi8(top, zero);
1494 for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
1495 const __m128i y_select = _mm_set1_epi32(y_mask);
1496 const __m128i weights_y = _mm_shuffle_epi8(weights_lo, y_select);
1497 const __m128i scaled_bottom_left_y =
1498 _mm_shuffle_epi8(scaled_bottom_left_lo, y_select);
1499 write_smooth_directional_sum16(dst, top_lo, top_hi, weights_y, weights_y,
1500 scaled_bottom_left_y, scaled_bottom_left_y,
1501 round);
1502 dst += stride;
1503 }
1504 for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
1505 const __m128i y_select = _mm_set1_epi32(y_mask);
1506 const __m128i weights_y = _mm_shuffle_epi8(weights_hi, y_select);
1507 const __m128i scaled_bottom_left_y =
1508 _mm_shuffle_epi8(scaled_bottom_left_hi, y_select);
1509 write_smooth_directional_sum16(dst, top_lo, top_hi, weights_y, weights_y,
1510 scaled_bottom_left_y, scaled_bottom_left_y,
1511 round);
1512 dst += stride;
1513 }
1514 }
1515
1516 void aom_smooth_v_predictor_16x32_ssse3(
1517 uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
1518 const uint8_t *LIBAOM_RESTRICT top_row,
1519 const uint8_t *LIBAOM_RESTRICT left_column) {
1520 const __m128i bottom_left = _mm_set1_epi16(left_column[31]);
1521 const __m128i weights_lo = LoadUnaligned16(smooth_weights + 28);
1522 const __m128i weights_hi = LoadUnaligned16(smooth_weights + 44);
1523 const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
1524 const __m128i zero = _mm_setzero_si128();
1525 const __m128i weights1 = cvtepu8_epi16(weights_lo);
1526 const __m128i weights2 = _mm_unpackhi_epi8(weights_lo, zero);
1527 const __m128i weights3 = cvtepu8_epi16(weights_hi);
1528 const __m128i weights4 = _mm_unpackhi_epi8(weights_hi, zero);
1529 const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1);
1530 const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2);
1531 const __m128i inverted_weights3 = _mm_sub_epi16(scale, weights3);
1532 const __m128i inverted_weights4 = _mm_sub_epi16(scale, weights4);
1533 const __m128i scaled_bottom_left1 =
1534 _mm_mullo_epi16(inverted_weights1, bottom_left);
1535 const __m128i scaled_bottom_left2 =
1536 _mm_mullo_epi16(inverted_weights2, bottom_left);
1537 const __m128i scaled_bottom_left3 =
1538 _mm_mullo_epi16(inverted_weights3, bottom_left);
1539 const __m128i scaled_bottom_left4 =
1540 _mm_mullo_epi16(inverted_weights4, bottom_left);
1541 const __m128i round = _mm_set1_epi16(128);
1542
1543 const __m128i top = LoadUnaligned16(top_row);
1544 const __m128i top_lo = cvtepu8_epi16(top);
1545 const __m128i top_hi = _mm_unpackhi_epi8(top, zero);
1546 for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
1547 const __m128i y_select = _mm_set1_epi32(y_mask);
1548 const __m128i weights_y = _mm_shuffle_epi8(weights1, y_select);
1549 const __m128i scaled_bottom_left_y =
1550 _mm_shuffle_epi8(scaled_bottom_left1, y_select);
1551 write_smooth_directional_sum16(dst, top_lo, top_hi, weights_y, weights_y,
1552 scaled_bottom_left_y, scaled_bottom_left_y,
1553 round);
1554 dst += stride;
1555 }
1556 for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
1557 const __m128i y_select = _mm_set1_epi32(y_mask);
1558 const __m128i weights_y = _mm_shuffle_epi8(weights2, y_select);
1559 const __m128i scaled_bottom_left_y =
1560 _mm_shuffle_epi8(scaled_bottom_left2, y_select);
1561 write_smooth_directional_sum16(dst, top_lo, top_hi, weights_y, weights_y,
1562 scaled_bottom_left_y, scaled_bottom_left_y,
1563 round);
1564 dst += stride;
1565 }
1566 for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
1567 const __m128i y_select = _mm_set1_epi32(y_mask);
1568 const __m128i weights_y = _mm_shuffle_epi8(weights3, y_select);
1569 const __m128i scaled_bottom_left_y =
1570 _mm_shuffle_epi8(scaled_bottom_left3, y_select);
1571 write_smooth_directional_sum16(dst, top_lo, top_hi, weights_y, weights_y,
1572 scaled_bottom_left_y, scaled_bottom_left_y,
1573 round);
1574 dst += stride;
1575 }
1576 for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
1577 const __m128i y_select = _mm_set1_epi32(y_mask);
1578 const __m128i weights_y = _mm_shuffle_epi8(weights4, y_select);
1579 const __m128i scaled_bottom_left_y =
1580 _mm_shuffle_epi8(scaled_bottom_left4, y_select);
1581 write_smooth_directional_sum16(dst, top_lo, top_hi, weights_y, weights_y,
1582 scaled_bottom_left_y, scaled_bottom_left_y,
1583 round);
1584 dst += stride;
1585 }
1586 }
1587
1588 #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
1589 void aom_smooth_v_predictor_16x64_ssse3(
1590 uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
1591 const uint8_t *LIBAOM_RESTRICT top_row,
1592 const uint8_t *LIBAOM_RESTRICT left_column) {
1593 const __m128i bottom_left = _mm_set1_epi16(left_column[63]);
1594 const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
1595 const __m128i round = _mm_set1_epi16(128);
1596 const __m128i zero = _mm_setzero_si128();
1597 const __m128i top = LoadUnaligned16(top_row);
1598 const __m128i top_lo = cvtepu8_epi16(top);
1599 const __m128i top_hi = _mm_unpackhi_epi8(top, zero);
1600 const uint8_t *weights_base_ptr = smooth_weights + 60;
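// Height-64 blocks consume the 64-entry weight table in 16-byte chunks; each
// chunk's low and high halves drive 8 rows apiece, so every left_offset step
// covers 16 output rows.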
1601 for (int left_offset = 0; left_offset < 64; left_offset += 16) {
1602 const __m128i weights = LoadUnaligned16(weights_base_ptr + left_offset);
1603 const __m128i weights_lo = cvtepu8_epi16(weights);
1604 const __m128i weights_hi = _mm_unpackhi_epi8(weights, zero);
1605 const __m128i inverted_weights_lo = _mm_sub_epi16(scale, weights_lo);
1606 const __m128i inverted_weights_hi = _mm_sub_epi16(scale, weights_hi);
1607 const __m128i scaled_bottom_left_lo =
1608 _mm_mullo_epi16(inverted_weights_lo, bottom_left);
1609 const __m128i scaled_bottom_left_hi =
1610 _mm_mullo_epi16(inverted_weights_hi, bottom_left);
1611
1612 for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
1613 const __m128i y_select = _mm_set1_epi32(y_mask);
1614 const __m128i weights_y = _mm_shuffle_epi8(weights_lo, y_select);
1615 const __m128i scaled_bottom_left_y =
1616 _mm_shuffle_epi8(scaled_bottom_left_lo, y_select);
1617 write_smooth_directional_sum16(dst, top_lo, top_hi, weights_y, weights_y,
1618 scaled_bottom_left_y, scaled_bottom_left_y,
1619 round);
1620 dst += stride;
1621 }
1622 for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
1623 const __m128i y_select = _mm_set1_epi32(y_mask);
1624 const __m128i weights_y = _mm_shuffle_epi8(weights_hi, y_select);
1625 const __m128i scaled_bottom_left_y =
1626 _mm_shuffle_epi8(scaled_bottom_left_hi, y_select);
1627 write_smooth_directional_sum16(dst, top_lo, top_hi, weights_y, weights_y,
1628 scaled_bottom_left_y, scaled_bottom_left_y,
1629 round);
1630 dst += stride;
1631 }
1632 }
1633 }
1634
1635 void aom_smooth_v_predictor_32x8_ssse3(
1636 uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
1637 const uint8_t *LIBAOM_RESTRICT top_row,
1638 const uint8_t *LIBAOM_RESTRICT left_column) {
1639 const __m128i zero = _mm_setzero_si128();
1640 const __m128i bottom_left = _mm_set1_epi16(left_column[7]);
1641 const __m128i top_lo = LoadUnaligned16(top_row);
1642 const __m128i top_hi = LoadUnaligned16(top_row + 16);
1643 const __m128i top1 = cvtepu8_epi16(top_lo);
1644 const __m128i top2 = _mm_unpackhi_epi8(top_lo, zero);
1645 const __m128i top3 = cvtepu8_epi16(top_hi);
1646 const __m128i top4 = _mm_unpackhi_epi8(top_hi, zero);
1647 const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
1648 const __m128i weights = cvtepu8_epi16(LoadLo8(smooth_weights + 4));
1649 const __m128i inverted_weights = _mm_sub_epi16(scale, weights);
1650 const __m128i scaled_bottom_left =
1651 _mm_mullo_epi16(inverted_weights, bottom_left);
1652 const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1));
1653 for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
1654 __m128i y_select = _mm_set1_epi32(y_mask);
1655 const __m128i weights_y = _mm_shuffle_epi8(weights, y_select);
1656 const __m128i scaled_bottom_left_y =
1657 _mm_shuffle_epi8(scaled_bottom_left, y_select);
1658 write_smooth_directional_sum16(dst, top1, top2, weights_y, weights_y,
1659 scaled_bottom_left_y, scaled_bottom_left_y,
1660 round);
1661 write_smooth_directional_sum16(dst + 16, top3, top4, weights_y, weights_y,
1662 scaled_bottom_left_y, scaled_bottom_left_y,
1663 round);
1664 dst += stride;
1665 }
1666 }
1667 #endif // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
1668
1669 void aom_smooth_v_predictor_32x16_ssse3(
1670 uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
1671 const uint8_t *LIBAOM_RESTRICT top_row,
1672 const uint8_t *LIBAOM_RESTRICT left_column) {
1673 const __m128i zero = _mm_setzero_si128();
1674 const __m128i bottom_left = _mm_set1_epi16(left_column[15]);
1675 const __m128i top_lo = LoadUnaligned16(top_row);
1676 const __m128i top_hi = LoadUnaligned16(top_row + 16);
1677 const __m128i top1 = cvtepu8_epi16(top_lo);
1678 const __m128i top2 = _mm_unpackhi_epi8(top_lo, zero);
1679 const __m128i top3 = cvtepu8_epi16(top_hi);
1680 const __m128i top4 = _mm_unpackhi_epi8(top_hi, zero);
1681 const __m128i weights = LoadUnaligned16(smooth_weights + 12);
1682 const __m128i weights1 = cvtepu8_epi16(weights);
1683 const __m128i weights2 = _mm_unpackhi_epi8(weights, zero);
1684 const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
1685 const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1);
1686 const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2);
1687 const __m128i scaled_bottom_left1 =
1688 _mm_mullo_epi16(inverted_weights1, bottom_left);
1689 const __m128i scaled_bottom_left2 =
1690 _mm_mullo_epi16(inverted_weights2, bottom_left);
1691 const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1));
1692 for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
1693 __m128i y_select = _mm_set1_epi32(y_mask);
1694 const __m128i weights_y = _mm_shuffle_epi8(weights1, y_select);
1695 const __m128i scaled_bottom_left_y =
1696 _mm_shuffle_epi8(scaled_bottom_left1, y_select);
1697 write_smooth_directional_sum16(dst, top1, top2, weights_y, weights_y,
1698 scaled_bottom_left_y, scaled_bottom_left_y,
1699 round);
1700 write_smooth_directional_sum16(dst + 16, top3, top4, weights_y, weights_y,
1701 scaled_bottom_left_y, scaled_bottom_left_y,
1702 round);
1703 dst += stride;
1704 }
1705 for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
1706 __m128i y_select = _mm_set1_epi32(y_mask);
1707 const __m128i weights_y = _mm_shuffle_epi8(weights2, y_select);
1708 const __m128i scaled_bottom_left_y =
1709 _mm_shuffle_epi8(scaled_bottom_left2, y_select);
1710 write_smooth_directional_sum16(dst, top1, top2, weights_y, weights_y,
1711 scaled_bottom_left_y, scaled_bottom_left_y,
1712 round);
1713 write_smooth_directional_sum16(dst + 16, top3, top4, weights_y, weights_y,
1714 scaled_bottom_left_y, scaled_bottom_left_y,
1715 round);
1716 dst += stride;
1717 }
1718 }
1719
1720 void aom_smooth_v_predictor_32x32_ssse3(
1721 uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
1722 const uint8_t *LIBAOM_RESTRICT top_row,
1723 const uint8_t *LIBAOM_RESTRICT left_column) {
1724 const __m128i bottom_left = _mm_set1_epi16(left_column[31]);
1725 const __m128i weights_lo = LoadUnaligned16(smooth_weights + 28);
1726 const __m128i weights_hi = LoadUnaligned16(smooth_weights + 44);
1727 const __m128i zero = _mm_setzero_si128();
1728 const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
1729 const __m128i top_lo = LoadUnaligned16(top_row);
1730 const __m128i top_hi = LoadUnaligned16(top_row + 16);
1731 const __m128i top1 = cvtepu8_epi16(top_lo);
1732 const __m128i top2 = _mm_unpackhi_epi8(top_lo, zero);
1733 const __m128i top3 = cvtepu8_epi16(top_hi);
1734 const __m128i top4 = _mm_unpackhi_epi8(top_hi, zero);
1735 const __m128i weights1 = cvtepu8_epi16(weights_lo);
1736 const __m128i weights2 = _mm_unpackhi_epi8(weights_lo, zero);
1737 const __m128i weights3 = cvtepu8_epi16(weights_hi);
1738 const __m128i weights4 = _mm_unpackhi_epi8(weights_hi, zero);
1739 const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1);
1740 const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2);
1741 const __m128i inverted_weights3 = _mm_sub_epi16(scale, weights3);
1742 const __m128i inverted_weights4 = _mm_sub_epi16(scale, weights4);
1743 const __m128i scaled_bottom_left1 =
1744 _mm_mullo_epi16(inverted_weights1, bottom_left);
1745 const __m128i scaled_bottom_left2 =
1746 _mm_mullo_epi16(inverted_weights2, bottom_left);
1747 const __m128i scaled_bottom_left3 =
1748 _mm_mullo_epi16(inverted_weights3, bottom_left);
1749 const __m128i scaled_bottom_left4 =
1750 _mm_mullo_epi16(inverted_weights4, bottom_left);
1751 const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1));
1752 for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
1753 const __m128i y_select = _mm_set1_epi32(y_mask);
1754 const __m128i weights_y = _mm_shuffle_epi8(weights1, y_select);
1755 const __m128i scaled_bottom_left_y =
1756 _mm_shuffle_epi8(scaled_bottom_left1, y_select);
1757 write_smooth_directional_sum16(dst, top1, top2, weights_y, weights_y,
1758 scaled_bottom_left_y, scaled_bottom_left_y,
1759 round);
1760 write_smooth_directional_sum16(dst + 16, top3, top4, weights_y, weights_y,
1761 scaled_bottom_left_y, scaled_bottom_left_y,
1762 round);
1763 dst += stride;
1764 }
1765 for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
1766 const __m128i y_select = _mm_set1_epi32(y_mask);
1767 const __m128i weights_y = _mm_shuffle_epi8(weights2, y_select);
1768 const __m128i scaled_bottom_left_y =
1769 _mm_shuffle_epi8(scaled_bottom_left2, y_select);
1770 write_smooth_directional_sum16(dst, top1, top2, weights_y, weights_y,
1771 scaled_bottom_left_y, scaled_bottom_left_y,
1772 round);
1773 write_smooth_directional_sum16(dst + 16, top3, top4, weights_y, weights_y,
1774 scaled_bottom_left_y, scaled_bottom_left_y,
1775 round);
1776 dst += stride;
1777 }
1778 for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
1779 const __m128i y_select = _mm_set1_epi32(y_mask);
1780 const __m128i weights_y = _mm_shuffle_epi8(weights3, y_select);
1781 const __m128i scaled_bottom_left_y =
1782 _mm_shuffle_epi8(scaled_bottom_left3, y_select);
1783 write_smooth_directional_sum16(dst, top1, top2, weights_y, weights_y,
1784 scaled_bottom_left_y, scaled_bottom_left_y,
1785 round);
1786 write_smooth_directional_sum16(dst + 16, top3, top4, weights_y, weights_y,
1787 scaled_bottom_left_y, scaled_bottom_left_y,
1788 round);
1789 dst += stride;
1790 }
1791 for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
1792 const __m128i y_select = _mm_set1_epi32(y_mask);
1793 const __m128i weights_y = _mm_shuffle_epi8(weights4, y_select);
1794 const __m128i scaled_bottom_left_y =
1795 _mm_shuffle_epi8(scaled_bottom_left4, y_select);
1796 write_smooth_directional_sum16(dst, top1, top2, weights_y, weights_y,
1797 scaled_bottom_left_y, scaled_bottom_left_y,
1798 round);
1799 write_smooth_directional_sum16(dst + 16, top3, top4, weights_y, weights_y,
1800 scaled_bottom_left_y, scaled_bottom_left_y,
1801 round);
1802 dst += stride;
1803 }
1804 }
1805
1806 void aom_smooth_v_predictor_32x64_ssse3(
1807 uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
1808 const uint8_t *LIBAOM_RESTRICT top_row,
1809 const uint8_t *LIBAOM_RESTRICT left_column) {
1810 const __m128i zero = _mm_setzero_si128();
1811 const __m128i bottom_left = _mm_set1_epi16(left_column[63]);
1812 const __m128i top_lo = LoadUnaligned16(top_row);
1813 const __m128i top_hi = LoadUnaligned16(top_row + 16);
1814 const __m128i top1 = cvtepu8_epi16(top_lo);
1815 const __m128i top2 = _mm_unpackhi_epi8(top_lo, zero);
1816 const __m128i top3 = cvtepu8_epi16(top_hi);
1817 const __m128i top4 = _mm_unpackhi_epi8(top_hi, zero);
1818 const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
1819 const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1));
1820 const uint8_t *weights_base_ptr = smooth_weights + 60;
1821 for (int left_offset = 0; left_offset < 64; left_offset += 16) {
1822 const __m128i weights = LoadUnaligned16(weights_base_ptr + left_offset);
1823 const __m128i weights_lo = cvtepu8_epi16(weights);
1824 const __m128i weights_hi = _mm_unpackhi_epi8(weights, zero);
1825 const __m128i inverted_weights_lo = _mm_sub_epi16(scale, weights_lo);
1826 const __m128i inverted_weights_hi = _mm_sub_epi16(scale, weights_hi);
1827 const __m128i scaled_bottom_left_lo =
1828 _mm_mullo_epi16(inverted_weights_lo, bottom_left);
1829 const __m128i scaled_bottom_left_hi =
1830 _mm_mullo_epi16(inverted_weights_hi, bottom_left);
1831
1832 for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
1833 const __m128i y_select = _mm_set1_epi32(y_mask);
1834 const __m128i weights_y = _mm_shuffle_epi8(weights_lo, y_select);
1835 const __m128i scaled_bottom_left_y =
1836 _mm_shuffle_epi8(scaled_bottom_left_lo, y_select);
1837 write_smooth_directional_sum16(dst, top1, top2, weights_y, weights_y,
1838 scaled_bottom_left_y, scaled_bottom_left_y,
1839 round);
1840 write_smooth_directional_sum16(dst + 16, top3, top4, weights_y, weights_y,
1841 scaled_bottom_left_y, scaled_bottom_left_y,
1842 round);
1843 dst += stride;
1844 }
1845 for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
1846 const __m128i y_select = _mm_set1_epi32(y_mask);
1847 const __m128i weights_y = _mm_shuffle_epi8(weights_hi, y_select);
1848 const __m128i scaled_bottom_left_y =
1849 _mm_shuffle_epi8(scaled_bottom_left_hi, y_select);
1850 write_smooth_directional_sum16(dst, top1, top2, weights_y, weights_y,
1851 scaled_bottom_left_y, scaled_bottom_left_y,
1852 round);
1853 write_smooth_directional_sum16(dst + 16, top3, top4, weights_y, weights_y,
1854 scaled_bottom_left_y, scaled_bottom_left_y,
1855 round);
1856 dst += stride;
1857 }
1858 }
1859 }
1860
1861 #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
1862 void aom_smooth_v_predictor_64x16_ssse3(
1863 uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
1864 const uint8_t *LIBAOM_RESTRICT top_row,
1865 const uint8_t *LIBAOM_RESTRICT left_column) {
1866 const __m128i bottom_left = _mm_set1_epi16(left_column[15]);
1867 const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
1868 const __m128i zero = _mm_setzero_si128();
1869 const __m128i top_lolo = LoadUnaligned16(top_row);
1870 const __m128i top_lohi = LoadUnaligned16(top_row + 16);
1871 const __m128i top1 = cvtepu8_epi16(top_lolo);
1872 const __m128i top2 = _mm_unpackhi_epi8(top_lolo, zero);
1873 const __m128i top3 = cvtepu8_epi16(top_lohi);
1874 const __m128i top4 = _mm_unpackhi_epi8(top_lohi, zero);
1875
1876 const __m128i weights = LoadUnaligned16(smooth_weights + 12);
1877 const __m128i weights1 = cvtepu8_epi16(weights);
1878 const __m128i weights2 = _mm_unpackhi_epi8(weights, zero);
1879 const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1);
1880 const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2);
1881 const __m128i top_hilo = LoadUnaligned16(top_row + 32);
1882 const __m128i top_hihi = LoadUnaligned16(top_row + 48);
1883 const __m128i top5 = cvtepu8_epi16(top_hilo);
1884 const __m128i top6 = _mm_unpackhi_epi8(top_hilo, zero);
1885 const __m128i top7 = cvtepu8_epi16(top_hihi);
1886 const __m128i top8 = _mm_unpackhi_epi8(top_hihi, zero);
1887 const __m128i scaled_bottom_left1 =
1888 _mm_mullo_epi16(inverted_weights1, bottom_left);
1889 const __m128i scaled_bottom_left2 =
1890 _mm_mullo_epi16(inverted_weights2, bottom_left);
1891 const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1));
1892 for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
1893 const __m128i y_select = _mm_set1_epi32(y_mask);
1894 const __m128i weights_y = _mm_shuffle_epi8(weights1, y_select);
1895 const __m128i scaled_bottom_left_y =
1896 _mm_shuffle_epi8(scaled_bottom_left1, y_select);
1897 write_smooth_directional_sum16(dst, top1, top2, weights_y, weights_y,
1898 scaled_bottom_left_y, scaled_bottom_left_y,
1899 round);
1900 write_smooth_directional_sum16(dst + 16, top3, top4, weights_y, weights_y,
1901 scaled_bottom_left_y, scaled_bottom_left_y,
1902 round);
1903 write_smooth_directional_sum16(dst + 32, top5, top6, weights_y, weights_y,
1904 scaled_bottom_left_y, scaled_bottom_left_y,
1905 round);
1906 write_smooth_directional_sum16(dst + 48, top7, top8, weights_y, weights_y,
1907 scaled_bottom_left_y, scaled_bottom_left_y,
1908 round);
1909 dst += stride;
1910 }
1911 for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
1912 const __m128i y_select = _mm_set1_epi32(y_mask);
1913 const __m128i weights_y = _mm_shuffle_epi8(weights2, y_select);
1914 const __m128i scaled_bottom_left_y =
1915 _mm_shuffle_epi8(scaled_bottom_left2, y_select);
1916 write_smooth_directional_sum16(dst, top1, top2, weights_y, weights_y,
1917 scaled_bottom_left_y, scaled_bottom_left_y,
1918 round);
1919 write_smooth_directional_sum16(dst + 16, top3, top4, weights_y, weights_y,
1920 scaled_bottom_left_y, scaled_bottom_left_y,
1921 round);
1922 write_smooth_directional_sum16(dst + 32, top5, top6, weights_y, weights_y,
1923 scaled_bottom_left_y, scaled_bottom_left_y,
1924 round);
1925 write_smooth_directional_sum16(dst + 48, top7, top8, weights_y, weights_y,
1926 scaled_bottom_left_y, scaled_bottom_left_y,
1927 round);
1928 dst += stride;
1929 }
1930 }
1931 #endif // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
1932
1933 void aom_smooth_v_predictor_64x32_ssse3(
1934 uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
1935 const uint8_t *LIBAOM_RESTRICT top_row,
1936 const uint8_t *LIBAOM_RESTRICT left_column) {
1937 const __m128i zero = _mm_setzero_si128();
1938 const __m128i bottom_left = _mm_set1_epi16(left_column[31]);
1939 const __m128i top_lolo = LoadUnaligned16(top_row);
1940 const __m128i top_lohi = LoadUnaligned16(top_row + 16);
1941 const __m128i top1 = cvtepu8_epi16(top_lolo);
1942 const __m128i top2 = _mm_unpackhi_epi8(top_lolo, zero);
1943 const __m128i top3 = cvtepu8_epi16(top_lohi);
1944 const __m128i top4 = _mm_unpackhi_epi8(top_lohi, zero);
1945 const __m128i top_hilo = LoadUnaligned16(top_row + 32);
1946 const __m128i top_hihi = LoadUnaligned16(top_row + 48);
1947 const __m128i top5 = cvtepu8_epi16(top_hilo);
1948 const __m128i top6 = _mm_unpackhi_epi8(top_hilo, zero);
1949 const __m128i top7 = cvtepu8_epi16(top_hihi);
1950 const __m128i top8 = _mm_unpackhi_epi8(top_hihi, zero);
1951 const __m128i weights_lo = LoadUnaligned16(smooth_weights + 28);
1952 const __m128i weights_hi = LoadUnaligned16(smooth_weights + 44);
1953 const __m128i weights1 = cvtepu8_epi16(weights_lo);
1954 const __m128i weights2 = _mm_unpackhi_epi8(weights_lo, zero);
1955 const __m128i weights3 = cvtepu8_epi16(weights_hi);
1956 const __m128i weights4 = _mm_unpackhi_epi8(weights_hi, zero);
1957 const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
1958 const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1);
1959 const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2);
1960 const __m128i inverted_weights3 = _mm_sub_epi16(scale, weights3);
1961 const __m128i inverted_weights4 = _mm_sub_epi16(scale, weights4);
1962 const __m128i scaled_bottom_left1 =
1963 _mm_mullo_epi16(inverted_weights1, bottom_left);
1964 const __m128i scaled_bottom_left2 =
1965 _mm_mullo_epi16(inverted_weights2, bottom_left);
1966 const __m128i scaled_bottom_left3 =
1967 _mm_mullo_epi16(inverted_weights3, bottom_left);
1968 const __m128i scaled_bottom_left4 =
1969 _mm_mullo_epi16(inverted_weights4, bottom_left);
1970 const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1));
1971
1972 for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
1973 const __m128i y_select = _mm_set1_epi32(y_mask);
1974 const __m128i weights_y = _mm_shuffle_epi8(weights1, y_select);
1975 const __m128i scaled_bottom_left_y =
1976 _mm_shuffle_epi8(scaled_bottom_left1, y_select);
1977 write_smooth_directional_sum16(dst, top1, top2, weights_y, weights_y,
1978 scaled_bottom_left_y, scaled_bottom_left_y,
1979 round);
1980 write_smooth_directional_sum16(dst + 16, top3, top4, weights_y, weights_y,
1981 scaled_bottom_left_y, scaled_bottom_left_y,
1982 round);
1983 write_smooth_directional_sum16(dst + 32, top5, top6, weights_y, weights_y,
1984 scaled_bottom_left_y, scaled_bottom_left_y,
1985 round);
1986 write_smooth_directional_sum16(dst + 48, top7, top8, weights_y, weights_y,
1987 scaled_bottom_left_y, scaled_bottom_left_y,
1988 round);
1989 dst += stride;
1990 }
1991 for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
1992 const __m128i y_select = _mm_set1_epi32(y_mask);
1993 const __m128i weights_y = _mm_shuffle_epi8(weights2, y_select);
1994 const __m128i scaled_bottom_left_y =
1995 _mm_shuffle_epi8(scaled_bottom_left2, y_select);
1996 write_smooth_directional_sum16(dst, top1, top2, weights_y, weights_y,
1997 scaled_bottom_left_y, scaled_bottom_left_y,
1998 round);
1999 write_smooth_directional_sum16(dst + 16, top3, top4, weights_y, weights_y,
2000 scaled_bottom_left_y, scaled_bottom_left_y,
2001 round);
2002 write_smooth_directional_sum16(dst + 32, top5, top6, weights_y, weights_y,
2003 scaled_bottom_left_y, scaled_bottom_left_y,
2004 round);
2005 write_smooth_directional_sum16(dst + 48, top7, top8, weights_y, weights_y,
2006 scaled_bottom_left_y, scaled_bottom_left_y,
2007 round);
2008 dst += stride;
2009 }
2010 for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
2011 const __m128i y_select = _mm_set1_epi32(y_mask);
2012 const __m128i weights_y = _mm_shuffle_epi8(weights3, y_select);
2013 const __m128i scaled_bottom_left_y =
2014 _mm_shuffle_epi8(scaled_bottom_left3, y_select);
2015 write_smooth_directional_sum16(dst, top1, top2, weights_y, weights_y,
2016 scaled_bottom_left_y, scaled_bottom_left_y,
2017 round);
2018 write_smooth_directional_sum16(dst + 16, top3, top4, weights_y, weights_y,
2019 scaled_bottom_left_y, scaled_bottom_left_y,
2020 round);
2021 write_smooth_directional_sum16(dst + 32, top5, top6, weights_y, weights_y,
2022 scaled_bottom_left_y, scaled_bottom_left_y,
2023 round);
2024 write_smooth_directional_sum16(dst + 48, top7, top8, weights_y, weights_y,
2025 scaled_bottom_left_y, scaled_bottom_left_y,
2026 round);
2027 dst += stride;
2028 }
2029 for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
2030 const __m128i y_select = _mm_set1_epi32(y_mask);
2031 const __m128i weights_y = _mm_shuffle_epi8(weights4, y_select);
2032 const __m128i scaled_bottom_left_y =
2033 _mm_shuffle_epi8(scaled_bottom_left4, y_select);
2034 write_smooth_directional_sum16(dst, top1, top2, weights_y, weights_y,
2035 scaled_bottom_left_y, scaled_bottom_left_y,
2036 round);
2037 write_smooth_directional_sum16(dst + 16, top3, top4, weights_y, weights_y,
2038 scaled_bottom_left_y, scaled_bottom_left_y,
2039 round);
2040 write_smooth_directional_sum16(dst + 32, top5, top6, weights_y, weights_y,
2041 scaled_bottom_left_y, scaled_bottom_left_y,
2042 round);
2043 write_smooth_directional_sum16(dst + 48, top7, top8, weights_y, weights_y,
2044 scaled_bottom_left_y, scaled_bottom_left_y,
2045 round);
2046 dst += stride;
2047 }
2048 }
2049
2050 void aom_smooth_v_predictor_64x64_ssse3(
2051 uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
2052 const uint8_t *LIBAOM_RESTRICT top_row,
2053 const uint8_t *LIBAOM_RESTRICT left_column) {
2054 const __m128i zero = _mm_setzero_si128();
2055 const __m128i bottom_left = _mm_set1_epi16(left_column[63]);
2056 const __m128i top_lolo = LoadUnaligned16(top_row);
2057 const __m128i top_lohi = LoadUnaligned16(top_row + 16);
2058 const __m128i top1 = cvtepu8_epi16(top_lolo);
2059 const __m128i top2 = _mm_unpackhi_epi8(top_lolo, zero);
2060 const __m128i top3 = cvtepu8_epi16(top_lohi);
2061 const __m128i top4 = _mm_unpackhi_epi8(top_lohi, zero);
2062 const __m128i top_hilo = LoadUnaligned16(top_row + 32);
2063 const __m128i top_hihi = LoadUnaligned16(top_row + 48);
2064 const __m128i top5 = cvtepu8_epi16(top_hilo);
2065 const __m128i top6 = _mm_unpackhi_epi8(top_hilo, zero);
2066 const __m128i top7 = cvtepu8_epi16(top_hihi);
2067 const __m128i top8 = _mm_unpackhi_epi8(top_hihi, zero);
2068 const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
2069 const __m128i round = _mm_set1_epi16(128);
2070 const uint8_t *weights_base_ptr = smooth_weights + 60;
2071 for (int left_offset = 0; left_offset < 64; left_offset += 16) {
2072 const __m128i weights = LoadUnaligned16(weights_base_ptr + left_offset);
2073 const __m128i weights_lo = cvtepu8_epi16(weights);
2074 const __m128i weights_hi = _mm_unpackhi_epi8(weights, zero);
2075 const __m128i inverted_weights_lo = _mm_sub_epi16(scale, weights_lo);
2076 const __m128i inverted_weights_hi = _mm_sub_epi16(scale, weights_hi);
2077 const __m128i scaled_bottom_left_lo =
2078 _mm_mullo_epi16(inverted_weights_lo, bottom_left);
2079 const __m128i scaled_bottom_left_hi =
2080 _mm_mullo_epi16(inverted_weights_hi, bottom_left);
2081 for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
2082 const __m128i y_select = _mm_set1_epi32(y_mask);
2083 const __m128i weights_y = _mm_shuffle_epi8(weights_lo, y_select);
2084 const __m128i scaled_bottom_left_y =
2085 _mm_shuffle_epi8(scaled_bottom_left_lo, y_select);
2086 write_smooth_directional_sum16(dst, top1, top2, weights_y, weights_y,
2087 scaled_bottom_left_y, scaled_bottom_left_y,
2088 round);
2089 write_smooth_directional_sum16(dst + 16, top3, top4, weights_y, weights_y,
2090 scaled_bottom_left_y, scaled_bottom_left_y,
2091 round);
2092 write_smooth_directional_sum16(dst + 32, top5, top6, weights_y, weights_y,
2093 scaled_bottom_left_y, scaled_bottom_left_y,
2094 round);
2095 write_smooth_directional_sum16(dst + 48, top7, top8, weights_y, weights_y,
2096 scaled_bottom_left_y, scaled_bottom_left_y,
2097 round);
2098 dst += stride;
2099 }
2100 for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
2101 const __m128i y_select = _mm_set1_epi32(y_mask);
2102 const __m128i weights_y = _mm_shuffle_epi8(weights_hi, y_select);
2103 const __m128i scaled_bottom_left_y =
2104 _mm_shuffle_epi8(scaled_bottom_left_hi, y_select);
2105 write_smooth_directional_sum16(dst, top1, top2, weights_y, weights_y,
2106 scaled_bottom_left_y, scaled_bottom_left_y,
2107 round);
2108 write_smooth_directional_sum16(dst + 16, top3, top4, weights_y, weights_y,
2109 scaled_bottom_left_y, scaled_bottom_left_y,
2110 round);
2111 write_smooth_directional_sum16(dst + 32, top5, top6, weights_y, weights_y,
2112 scaled_bottom_left_y, scaled_bottom_left_y,
2113 round);
2114 write_smooth_directional_sum16(dst + 48, top7, top8, weights_y, weights_y,
2115 scaled_bottom_left_y, scaled_bottom_left_y,
2116 round);
2117 dst += stride;
2118 }
2119 }
2120 }
2121
2122 // -----------------------------------------------------------------------------
2123 // SMOOTH_H_PRED
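// SMOOTH_H blends each pixel between the reference pixel directly to its
// left and the top-right reference, using a per-column weight:
//   pred[y][x] = (w[x] * left[y] + (256 - w[x]) * top[width - 1] + 128) >> 8
// A scalar sketch of the same computation, for illustration only (the SIMD
// code below is the implementation actually used; h, w, weights, left and
// top are illustrative names):
//   for (int y = 0; y < h; ++y)
//     for (int x = 0; x < w; ++x)
//       dst[y * stride + x] = (uint8_t)((weights[x] * left[y] +
//           (256 - weights[x]) * top[w - 1] + 128) >> 8);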
2124 static AOM_FORCE_INLINE void write_smooth_horizontal_sum4(
2125 uint8_t *LIBAOM_RESTRICT dst, const __m128i *left_y, const __m128i *weights,
2126 const __m128i *scaled_top_right, const __m128i *round) {
2127 const __m128i weighted_left_y = _mm_mullo_epi16(*left_y, *weights);
2128 const __m128i pred_sum = _mm_add_epi32(*scaled_top_right, weighted_left_y);
2129 // Equivalent to RightShiftWithRounding(pred[x][y], 8).
2130 const __m128i pred = _mm_srli_epi32(_mm_add_epi32(pred_sum, *round), 8);
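// The shuffle mask 0x0C080400 selects byte 0 of each 32-bit lane, narrowing
// the four 32-bit predictions to the four output bytes.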
2131 const __m128i cvtepi32_epi8 = _mm_set1_epi32(0x0C080400);
2132 Store4(dst, _mm_shuffle_epi8(pred, cvtepi32_epi8));
2133 }
2134
2135 void aom_smooth_h_predictor_4x4_ssse3(
2136 uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
2137 const uint8_t *LIBAOM_RESTRICT top_row,
2138 const uint8_t *LIBAOM_RESTRICT left_column) {
2139 const __m128i top_right = _mm_set1_epi32(top_row[3]);
2140 const __m128i left = cvtepu8_epi32(Load4(left_column));
2141 const __m128i weights = cvtepu8_epi32(Load4(smooth_weights));
2142 const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
2143 const __m128i inverted_weights = _mm_sub_epi32(scale, weights);
2144 const __m128i scaled_top_right = _mm_mullo_epi16(inverted_weights, top_right);
2145 const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1));
2146 __m128i left_y = _mm_shuffle_epi32(left, 0);
2147 write_smooth_horizontal_sum4(dst, &left_y, &weights, &scaled_top_right,
2148 &round);
2149 dst += stride;
2150 left_y = _mm_shuffle_epi32(left, 0x55);
2151 write_smooth_horizontal_sum4(dst, &left_y, &weights, &scaled_top_right,
2152 &round);
2153 dst += stride;
2154 left_y = _mm_shuffle_epi32(left, 0xaa);
2155 write_smooth_horizontal_sum4(dst, &left_y, &weights, &scaled_top_right,
2156 &round);
2157 dst += stride;
2158 left_y = _mm_shuffle_epi32(left, 0xff);
2159 write_smooth_horizontal_sum4(dst, &left_y, &weights, &scaled_top_right,
2160 &round);
2161 }
2162
2163 void aom_smooth_h_predictor_4x8_ssse3(
2164 uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
2165 const uint8_t *LIBAOM_RESTRICT top_row,
2166 const uint8_t *LIBAOM_RESTRICT left_column) {
2167 const __m128i top_right = _mm_set1_epi32(top_row[3]);
2168 const __m128i weights = cvtepu8_epi32(Load4(smooth_weights));
2169 const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
2170 const __m128i inverted_weights = _mm_sub_epi32(scale, weights);
2171 const __m128i scaled_top_right = _mm_mullo_epi16(inverted_weights, top_right);
2172 const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1));
2173 __m128i left = cvtepu8_epi32(Load4(left_column));
2174 __m128i left_y = _mm_shuffle_epi32(left, 0);
2175 write_smooth_horizontal_sum4(dst, &left_y, &weights, &scaled_top_right,
2176 &round);
2177 dst += stride;
2178 left_y = _mm_shuffle_epi32(left, 0x55);
2179 write_smooth_horizontal_sum4(dst, &left_y, &weights, &scaled_top_right,
2180 &round);
2181 dst += stride;
2182 left_y = _mm_shuffle_epi32(left, 0xaa);
2183 write_smooth_horizontal_sum4(dst, &left_y, &weights, &scaled_top_right,
2184 &round);
2185 dst += stride;
2186 left_y = _mm_shuffle_epi32(left, 0xff);
2187 write_smooth_horizontal_sum4(dst, &left_y, &weights, &scaled_top_right,
2188 &round);
2189 dst += stride;
2190
2191 left = cvtepu8_epi32(Load4(left_column + 4));
2192 left_y = _mm_shuffle_epi32(left, 0);
2193 write_smooth_horizontal_sum4(dst, &left_y, &weights, &scaled_top_right,
2194 &round);
2195 dst += stride;
2196 left_y = _mm_shuffle_epi32(left, 0x55);
2197 write_smooth_horizontal_sum4(dst, &left_y, &weights, &scaled_top_right,
2198 &round);
2199 dst += stride;
2200 left_y = _mm_shuffle_epi32(left, 0xaa);
2201 write_smooth_horizontal_sum4(dst, &left_y, &weights, &scaled_top_right,
2202 &round);
2203 dst += stride;
2204 left_y = _mm_shuffle_epi32(left, 0xff);
2205 write_smooth_horizontal_sum4(dst, &left_y, &weights, &scaled_top_right,
2206 &round);
2207 }
2208
2209 #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
2210 void aom_smooth_h_predictor_4x16_ssse3(
2211 uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
2212 const uint8_t *LIBAOM_RESTRICT top_row,
2213 const uint8_t *LIBAOM_RESTRICT left_column) {
2214 const __m128i top_right = _mm_set1_epi32(top_row[3]);
2215 const __m128i weights = cvtepu8_epi32(Load4(smooth_weights));
2216 const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
2217 const __m128i inverted_weights = _mm_sub_epi32(scale, weights);
2218 const __m128i scaled_top_right = _mm_mullo_epi16(inverted_weights, top_right);
2219 const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1));
2220 __m128i left = cvtepu8_epi32(Load4(left_column));
2221 __m128i left_y = _mm_shuffle_epi32(left, 0);
2222 write_smooth_horizontal_sum4(dst, &left_y, &weights, &scaled_top_right,
2223 &round);
2224 dst += stride;
2225 left_y = _mm_shuffle_epi32(left, 0x55);
2226 write_smooth_horizontal_sum4(dst, &left_y, &weights, &scaled_top_right,
2227 &round);
2228 dst += stride;
2229 left_y = _mm_shuffle_epi32(left, 0xaa);
2230 write_smooth_horizontal_sum4(dst, &left_y, &weights, &scaled_top_right,
2231 &round);
2232 dst += stride;
2233 left_y = _mm_shuffle_epi32(left, 0xff);
2234 write_smooth_horizontal_sum4(dst, &left_y, &weights, &scaled_top_right,
2235 &round);
2236 dst += stride;
2237
2238 left = cvtepu8_epi32(Load4(left_column + 4));
2239 left_y = _mm_shuffle_epi32(left, 0);
2240 write_smooth_horizontal_sum4(dst, &left_y, &weights, &scaled_top_right,
2241 &round);
2242 dst += stride;
2243 left_y = _mm_shuffle_epi32(left, 0x55);
2244 write_smooth_horizontal_sum4(dst, &left_y, &weights, &scaled_top_right,
2245 &round);
2246 dst += stride;
2247 left_y = _mm_shuffle_epi32(left, 0xaa);
2248 write_smooth_horizontal_sum4(dst, &left_y, &weights, &scaled_top_right,
2249 &round);
2250 dst += stride;
2251 left_y = _mm_shuffle_epi32(left, 0xff);
2252 write_smooth_horizontal_sum4(dst, &left_y, &weights, &scaled_top_right,
2253 &round);
2254 dst += stride;
2255
2256 left = cvtepu8_epi32(Load4(left_column + 8));
2257 left_y = _mm_shuffle_epi32(left, 0);
2258 write_smooth_horizontal_sum4(dst, &left_y, &weights, &scaled_top_right,
2259 &round);
2260 dst += stride;
2261 left_y = _mm_shuffle_epi32(left, 0x55);
2262 write_smooth_horizontal_sum4(dst, &left_y, &weights, &scaled_top_right,
2263 &round);
2264 dst += stride;
2265 left_y = _mm_shuffle_epi32(left, 0xaa);
2266 write_smooth_horizontal_sum4(dst, &left_y, &weights, &scaled_top_right,
2267 &round);
2268 dst += stride;
2269 left_y = _mm_shuffle_epi32(left, 0xff);
2270 write_smooth_horizontal_sum4(dst, &left_y, &weights, &scaled_top_right,
2271 &round);
2272 dst += stride;
2273
2274 left = cvtepu8_epi32(Load4(left_column + 12));
2275 left_y = _mm_shuffle_epi32(left, 0);
2276 write_smooth_horizontal_sum4(dst, &left_y, &weights, &scaled_top_right,
2277 &round);
2278 dst += stride;
2279 left_y = _mm_shuffle_epi32(left, 0x55);
2280 write_smooth_horizontal_sum4(dst, &left_y, &weights, &scaled_top_right,
2281 &round);
2282 dst += stride;
2283 left_y = _mm_shuffle_epi32(left, 0xaa);
2284 write_smooth_horizontal_sum4(dst, &left_y, &weights, &scaled_top_right,
2285 &round);
2286 dst += stride;
2287 left_y = _mm_shuffle_epi32(left, 0xff);
2288 write_smooth_horizontal_sum4(dst, &left_y, &weights, &scaled_top_right,
2289 &round);
2290 }
2291 #endif // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
2292
2293 // For SMOOTH_H, |pixels| is the repeated left value for the row. For SMOOTH_V,
2294 // |pixels| is a segment of the top row or the whole top row, and |weights| is
2295 // repeated.
2296 void aom_smooth_h_predictor_8x4_ssse3(
2297 uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
2298 const uint8_t *LIBAOM_RESTRICT top_row,
2299 const uint8_t *LIBAOM_RESTRICT left_column) {
2300 const __m128i top_right = _mm_set1_epi16(top_row[7]);
2301 const __m128i left = cvtepu8_epi16(Load4(left_column));
2302 const __m128i weights = cvtepu8_epi16(LoadLo8(smooth_weights + 4));
2303 const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
2304 const __m128i inverted_weights = _mm_sub_epi16(scale, weights);
2305 const __m128i scaled_top_right = _mm_mullo_epi16(inverted_weights, top_right);
2306 const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1));
2307 __m128i y_select = _mm_set1_epi32(0x01000100);
2308 __m128i left_y = _mm_shuffle_epi8(left, y_select);
2309 write_smooth_directional_sum8(dst, &left_y, &weights, &scaled_top_right,
2310 &round);
2311 dst += stride;
2312 y_select = _mm_set1_epi32(0x03020302);
2313 left_y = _mm_shuffle_epi8(left, y_select);
2314 write_smooth_directional_sum8(dst, &left_y, &weights, &scaled_top_right,
2315 &round);
2316 dst += stride;
2317 y_select = _mm_set1_epi32(0x05040504);
2318 left_y = _mm_shuffle_epi8(left, y_select);
2319 write_smooth_directional_sum8(dst, &left_y, &weights, &scaled_top_right,
2320 &round);
2321 dst += stride;
2322 y_select = _mm_set1_epi32(0x07060706);
2323 left_y = _mm_shuffle_epi8(left, y_select);
2324 write_smooth_directional_sum8(dst, &left_y, &weights, &scaled_top_right,
2325 &round);
2326 }
2327
2328 void aom_smooth_h_predictor_8x8_ssse3(
2329 uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
2330 const uint8_t *LIBAOM_RESTRICT top_row,
2331 const uint8_t *LIBAOM_RESTRICT left_column) {
2332 const __m128i top_right = _mm_set1_epi16(top_row[7]);
2333 const __m128i left = cvtepu8_epi16(LoadLo8(left_column));
2334 const __m128i weights = cvtepu8_epi16(LoadLo8(smooth_weights + 4));
2335 const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
2336 const __m128i inverted_weights = _mm_sub_epi16(scale, weights);
2337 const __m128i scaled_top_right = _mm_mullo_epi16(inverted_weights, top_right);
2338 const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1));
2339 for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
2340 const __m128i y_select = _mm_set1_epi32(y_mask);
2341 const __m128i left_y = _mm_shuffle_epi8(left, y_select);
2342 write_smooth_directional_sum8(dst, &left_y, &weights, &scaled_top_right,
2343 &round);
2344 dst += stride;
2345 }
2346 }
2347
2348 void aom_smooth_h_predictor_8x16_ssse3(
2349 uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
2350 const uint8_t *LIBAOM_RESTRICT top_row,
2351 const uint8_t *LIBAOM_RESTRICT left_column) {
2352 const __m128i top_right = _mm_set1_epi16(top_row[7]);
2353 const __m128i weights = cvtepu8_epi16(LoadLo8(smooth_weights + 4));
2354 const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
2355 const __m128i inverted_weights = _mm_sub_epi16(scale, weights);
2356 const __m128i scaled_top_right = _mm_mullo_epi16(inverted_weights, top_right);
2357 const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1));
2358 __m128i left = cvtepu8_epi16(LoadLo8(left_column));
2359 for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
2360 const __m128i y_select = _mm_set1_epi32(y_mask);
2361 const __m128i left_y = _mm_shuffle_epi8(left, y_select);
2362 write_smooth_directional_sum8(dst, &left_y, &weights, &scaled_top_right,
2363 &round);
2364 dst += stride;
2365 }
2366 left = cvtepu8_epi16(LoadLo8(left_column + 8));
2367 for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
2368 const __m128i y_select = _mm_set1_epi32(y_mask);
2369 const __m128i left_y = _mm_shuffle_epi8(left, y_select);
2370 write_smooth_directional_sum8(dst, &left_y, &weights, &scaled_top_right,
2371 &round);
2372 dst += stride;
2373 }
2374 }
2375
2376 #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
2377 void aom_smooth_h_predictor_8x32_ssse3(
2378 uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
2379 const uint8_t *LIBAOM_RESTRICT top_row,
2380 const uint8_t *LIBAOM_RESTRICT left_column) {
2381 const __m128i top_right = _mm_set1_epi16(top_row[7]);
2382 const __m128i weights = cvtepu8_epi16(LoadLo8(smooth_weights + 4));
2383 const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
2384 const __m128i inverted_weights = _mm_sub_epi16(scale, weights);
2385 const __m128i scaled_top_right = _mm_mullo_epi16(inverted_weights, top_right);
2386 const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1));
2387 __m128i left = cvtepu8_epi16(LoadLo8(left_column));
2388 for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
2389 const __m128i y_select = _mm_set1_epi32(y_mask);
2390 const __m128i left_y = _mm_shuffle_epi8(left, y_select);
2391 write_smooth_directional_sum8(dst, &left_y, &weights, &scaled_top_right,
2392 &round);
2393 dst += stride;
2394 }
2395 left = cvtepu8_epi16(LoadLo8(left_column + 8));
2396 for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
2397 const __m128i y_select = _mm_set1_epi32(y_mask);
2398 const __m128i left_y = _mm_shuffle_epi8(left, y_select);
2399 write_smooth_directional_sum8(dst, &left_y, &weights, &scaled_top_right,
2400 &round);
2401 dst += stride;
2402 }
2403 left = cvtepu8_epi16(LoadLo8(left_column + 16));
2404 for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
2405 const __m128i y_select = _mm_set1_epi32(y_mask);
2406 const __m128i left_y = _mm_shuffle_epi8(left, y_select);
2407 write_smooth_directional_sum8(dst, &left_y, &weights, &scaled_top_right,
2408 &round);
2409 dst += stride;
2410 }
2411 left = cvtepu8_epi16(LoadLo8(left_column + 24));
2412 for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
2413 const __m128i y_select = _mm_set1_epi32(y_mask);
2414 const __m128i left_y = _mm_shuffle_epi8(left, y_select);
2415 write_smooth_directional_sum8(dst, &left_y, &weights, &scaled_top_right,
2416 &round);
2417 dst += stride;
2418 }
2419 }
2420
2421 void aom_smooth_h_predictor_16x4_ssse3(
2422 uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
2423 const uint8_t *LIBAOM_RESTRICT top_row,
2424 const uint8_t *LIBAOM_RESTRICT left_column) {
2425 const __m128i top_right = _mm_set1_epi16(top_row[15]);
2426 const __m128i left = cvtepu8_epi16(Load4(left_column));
2427 const __m128i weights = LoadUnaligned16(smooth_weights + 12);
2428 const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
2429 const __m128i weights1 = cvtepu8_epi16(weights);
2430 const __m128i weights2 = cvtepu8_epi16(_mm_srli_si128(weights, 8));
2431 const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1);
2432 const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2);
2433 const __m128i scaled_top_right1 =
2434 _mm_mullo_epi16(inverted_weights1, top_right);
2435 const __m128i scaled_top_right2 =
2436 _mm_mullo_epi16(inverted_weights2, top_right);
2437 const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1));
2438 __m128i y_mask = _mm_set1_epi32(0x01000100);
2439 __m128i left_y = _mm_shuffle_epi8(left, y_mask);
2440 write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2,
2441 scaled_top_right1, scaled_top_right2, round);
2442 dst += stride;
2443 y_mask = _mm_set1_epi32(0x03020302);
2444 left_y = _mm_shuffle_epi8(left, y_mask);
2445 write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2,
2446 scaled_top_right1, scaled_top_right2, round);
2447 dst += stride;
2448 y_mask = _mm_set1_epi32(0x05040504);
2449 left_y = _mm_shuffle_epi8(left, y_mask);
2450 write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2,
2451 scaled_top_right1, scaled_top_right2, round);
2452 dst += stride;
2453 y_mask = _mm_set1_epi32(0x07060706);
2454 left_y = _mm_shuffle_epi8(left, y_mask);
2455 write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2,
2456 scaled_top_right1, scaled_top_right2, round);
2457 }
2458 #endif // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
2459
2460 void aom_smooth_h_predictor_16x8_ssse3(
2461 uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
2462 const uint8_t *LIBAOM_RESTRICT top_row,
2463 const uint8_t *LIBAOM_RESTRICT left_column) {
2464 const __m128i top_right = _mm_set1_epi16(top_row[15]);
2465 const __m128i left = cvtepu8_epi16(LoadLo8(left_column));
2466 const __m128i weights = LoadUnaligned16(smooth_weights + 12);
2467 const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
2468 const __m128i weights1 = cvtepu8_epi16(weights);
2469 const __m128i weights2 = cvtepu8_epi16(_mm_srli_si128(weights, 8));
2470 const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1);
2471 const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2);
2472 const __m128i scaled_top_right1 =
2473 _mm_mullo_epi16(inverted_weights1, top_right);
2474 const __m128i scaled_top_right2 =
2475 _mm_mullo_epi16(inverted_weights2, top_right);
2476 const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1));
2477 for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
2478 const __m128i y_select = _mm_set1_epi32(y_mask);
2479 const __m128i left_y = _mm_shuffle_epi8(left, y_select);
2480 write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2,
2481 scaled_top_right1, scaled_top_right2, round);
2482 dst += stride;
2483 }
2484 }
2485
2486 void aom_smooth_h_predictor_16x16_ssse3(
2487 uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
2488 const uint8_t *LIBAOM_RESTRICT top_row,
2489 const uint8_t *LIBAOM_RESTRICT left_column) {
2490 const __m128i top_right = _mm_set1_epi16(top_row[15]);
2491 const __m128i weights = LoadUnaligned16(smooth_weights + 12);
2492 const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
2493 const __m128i weights1 = cvtepu8_epi16(weights);
2494 const __m128i weights2 = cvtepu8_epi16(_mm_srli_si128(weights, 8));
2495 const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1);
2496 const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2);
2497 const __m128i scaled_top_right1 =
2498 _mm_mullo_epi16(inverted_weights1, top_right);
2499 const __m128i scaled_top_right2 =
2500 _mm_mullo_epi16(inverted_weights2, top_right);
2501 const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1));
2502 __m128i left = cvtepu8_epi16(LoadLo8(left_column));
2503 for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
2504 const __m128i y_select = _mm_set1_epi32(y_mask);
2505 const __m128i left_y = _mm_shuffle_epi8(left, y_select);
2506 write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2,
2507 scaled_top_right1, scaled_top_right2, round);
2508 dst += stride;
2509 }
2510 left = cvtepu8_epi16(LoadLo8(left_column + 8));
2511 for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
2512 const __m128i y_select = _mm_set1_epi32(y_mask);
2513 const __m128i left_y = _mm_shuffle_epi8(left, y_select);
2514 write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2,
2515 scaled_top_right1, scaled_top_right2, round);
2516 dst += stride;
2517 }
2518 }
2519
2520 void aom_smooth_h_predictor_16x32_ssse3(
2521 uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
2522 const uint8_t *LIBAOM_RESTRICT top_row,
2523 const uint8_t *LIBAOM_RESTRICT left_column) {
2524 const __m128i top_right = _mm_set1_epi16(top_row[15]);
2525 const __m128i weights = LoadUnaligned16(smooth_weights + 12);
2526 const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
2527 const __m128i weights1 = cvtepu8_epi16(weights);
2528 const __m128i weights2 = cvtepu8_epi16(_mm_srli_si128(weights, 8));
2529 const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1);
2530 const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2);
2531 const __m128i scaled_top_right1 =
2532 _mm_mullo_epi16(inverted_weights1, top_right);
2533 const __m128i scaled_top_right2 =
2534 _mm_mullo_epi16(inverted_weights2, top_right);
2535 const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1));
2536 __m128i left = cvtepu8_epi16(LoadLo8(left_column));
2537 for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
2538 const __m128i y_select = _mm_set1_epi32(y_mask);
2539 const __m128i left_y = _mm_shuffle_epi8(left, y_select);
2540 write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2,
2541 scaled_top_right1, scaled_top_right2, round);
2542 dst += stride;
2543 }
2544 left = cvtepu8_epi16(LoadLo8(left_column + 8));
2545 for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
2546 const __m128i y_select = _mm_set1_epi32(y_mask);
2547 const __m128i left_y = _mm_shuffle_epi8(left, y_select);
2548 write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2,
2549 scaled_top_right1, scaled_top_right2, round);
2550 dst += stride;
2551 }
2552 left = cvtepu8_epi16(LoadLo8(left_column + 16));
2553 for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
2554 const __m128i y_select = _mm_set1_epi32(y_mask);
2555 const __m128i left_y = _mm_shuffle_epi8(left, y_select);
2556 write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2,
2557 scaled_top_right1, scaled_top_right2, round);
2558 dst += stride;
2559 }
2560 left = cvtepu8_epi16(LoadLo8(left_column + 24));
2561 for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
2562 const __m128i y_select = _mm_set1_epi32(y_mask);
2563 const __m128i left_y = _mm_shuffle_epi8(left, y_select);
2564 write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2,
2565 scaled_top_right1, scaled_top_right2, round);
2566 dst += stride;
2567 }
2568 }
2569
2570 #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
2571 void aom_smooth_h_predictor_16x64_ssse3(
2572 uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
2573 const uint8_t *LIBAOM_RESTRICT top_row,
2574 const uint8_t *LIBAOM_RESTRICT left_column) {
2575 const __m128i top_right = _mm_set1_epi16(top_row[15]);
2576 const __m128i weights = LoadUnaligned16(smooth_weights + 12);
2577 const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
2578 const __m128i weights1 = cvtepu8_epi16(weights);
2579 const __m128i weights2 = cvtepu8_epi16(_mm_srli_si128(weights, 8));
2580 const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1);
2581 const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2);
2582 const __m128i scaled_top_right1 =
2583 _mm_mullo_epi16(inverted_weights1, top_right);
2584 const __m128i scaled_top_right2 =
2585 _mm_mullo_epi16(inverted_weights2, top_right);
2586 const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1));
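// The 64 left-column samples are handled 8 rows at a time; the weights and
// the scaled top-right terms stay constant across every row.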
2587 for (int left_offset = 0; left_offset < 64; left_offset += 8) {
2588 const __m128i left = cvtepu8_epi16(LoadLo8(left_column + left_offset));
2589 for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
2590 const __m128i y_select = _mm_set1_epi32(y_mask);
2591 const __m128i left_y = _mm_shuffle_epi8(left, y_select);
2592 write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2,
2593 scaled_top_right1, scaled_top_right2,
2594 round);
2595 dst += stride;
2596 }
2597 }
2598 }
2599
2600 void aom_smooth_h_predictor_32x8_ssse3(
2601 uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
2602 const uint8_t *LIBAOM_RESTRICT top_row,
2603 const uint8_t *LIBAOM_RESTRICT left_column) {
2604 const __m128i top_right = _mm_set1_epi16(top_row[31]);
2605 const __m128i left = cvtepu8_epi16(LoadLo8(left_column));
2606 const __m128i weights_lo = LoadUnaligned16(smooth_weights + 28);
2607 const __m128i weights_hi = LoadUnaligned16(smooth_weights + 44);
2608 const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
2609 const __m128i weights1 = cvtepu8_epi16(weights_lo);
2610 const __m128i weights2 = cvtepu8_epi16(_mm_srli_si128(weights_lo, 8));
2611 const __m128i weights3 = cvtepu8_epi16(weights_hi);
2612 const __m128i weights4 = cvtepu8_epi16(_mm_srli_si128(weights_hi, 8));
2613 const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1);
2614 const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2);
2615 const __m128i inverted_weights3 = _mm_sub_epi16(scale, weights3);
2616 const __m128i inverted_weights4 = _mm_sub_epi16(scale, weights4);
2617 const __m128i scaled_top_right1 =
2618 _mm_mullo_epi16(inverted_weights1, top_right);
2619 const __m128i scaled_top_right2 =
2620 _mm_mullo_epi16(inverted_weights2, top_right);
2621 const __m128i scaled_top_right3 =
2622 _mm_mullo_epi16(inverted_weights3, top_right);
2623 const __m128i scaled_top_right4 =
2624 _mm_mullo_epi16(inverted_weights4, top_right);
2625 const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1));
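// Each 32-wide row is emitted as two 16-pixel halves, each half using its own
// 16 entries of the width-32 weight table.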
2626 for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
2627 __m128i y_select = _mm_set1_epi32(y_mask);
2628 __m128i left_y = _mm_shuffle_epi8(left, y_select);
2629 write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2,
2630 scaled_top_right1, scaled_top_right2, round);
2631 write_smooth_directional_sum16(dst + 16, left_y, left_y, weights3, weights4,
2632 scaled_top_right3, scaled_top_right4, round);
2633 dst += stride;
2634 }
2635 }
2636 #endif // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
2637
2638 void aom_smooth_h_predictor_32x16_ssse3(
2639 uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
2640 const uint8_t *LIBAOM_RESTRICT top_row,
2641 const uint8_t *LIBAOM_RESTRICT left_column) {
2642 const __m128i top_right = _mm_set1_epi16(top_row[31]);
2643 const __m128i left1 = cvtepu8_epi16(LoadLo8(left_column));
2644 const __m128i weights_lo = LoadUnaligned16(smooth_weights + 28);
2645 const __m128i weights_hi = LoadUnaligned16(smooth_weights + 44);
2646 const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
2647 const __m128i weights1 = cvtepu8_epi16(weights_lo);
2648 const __m128i weights2 = cvtepu8_epi16(_mm_srli_si128(weights_lo, 8));
2649 const __m128i weights3 = cvtepu8_epi16(weights_hi);
2650 const __m128i weights4 = cvtepu8_epi16(_mm_srli_si128(weights_hi, 8));
2651 const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1);
2652 const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2);
2653 const __m128i inverted_weights3 = _mm_sub_epi16(scale, weights3);
2654 const __m128i inverted_weights4 = _mm_sub_epi16(scale, weights4);
2655 const __m128i scaled_top_right1 =
2656 _mm_mullo_epi16(inverted_weights1, top_right);
2657 const __m128i scaled_top_right2 =
2658 _mm_mullo_epi16(inverted_weights2, top_right);
2659 const __m128i scaled_top_right3 =
2660 _mm_mullo_epi16(inverted_weights3, top_right);
2661 const __m128i scaled_top_right4 =
2662 _mm_mullo_epi16(inverted_weights4, top_right);
2663 const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1));
2664 for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
2665 __m128i y_select = _mm_set1_epi32(y_mask);
2666 __m128i left_y = _mm_shuffle_epi8(left1, y_select);
2667 write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2,
2668 scaled_top_right1, scaled_top_right2, round);
2669 write_smooth_directional_sum16(dst + 16, left_y, left_y, weights3, weights4,
2670 scaled_top_right3, scaled_top_right4, round);
2671 dst += stride;
2672 }
2673 const __m128i left2 =
2674 cvtepu8_epi16(LoadLo8(left_column + 8));
2675 for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
2676 __m128i y_select = _mm_set1_epi32(y_mask);
2677 __m128i left_y = _mm_shuffle_epi8(left2, y_select);
2678 write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2,
2679 scaled_top_right1, scaled_top_right2, round);
2680 write_smooth_directional_sum16(dst + 16, left_y, left_y, weights3, weights4,
2681 scaled_top_right3, scaled_top_right4, round);
2682 dst += stride;
2683 }
2684 }
2685
2686 void aom_smooth_h_predictor_32x32_ssse3(
2687 uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
2688 const uint8_t *LIBAOM_RESTRICT top_row,
2689 const uint8_t *LIBAOM_RESTRICT left_column) {
2690 const __m128i top_right = _mm_set1_epi16(top_row[31]);
2691 const __m128i weights_lo = LoadUnaligned16(smooth_weights + 28);
2692 const __m128i weights_hi = LoadUnaligned16(smooth_weights + 44);
2693 const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
2694 const __m128i weights1 = cvtepu8_epi16(weights_lo);
2695 const __m128i weights2 = cvtepu8_epi16(_mm_srli_si128(weights_lo, 8));
2696 const __m128i weights3 = cvtepu8_epi16(weights_hi);
2697 const __m128i weights4 = cvtepu8_epi16(_mm_srli_si128(weights_hi, 8));
2698 const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1);
2699 const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2);
2700 const __m128i inverted_weights3 = _mm_sub_epi16(scale, weights3);
2701 const __m128i inverted_weights4 = _mm_sub_epi16(scale, weights4);
2702 const __m128i scaled_top_right1 =
2703 _mm_mullo_epi16(inverted_weights1, top_right);
2704 const __m128i scaled_top_right2 =
2705 _mm_mullo_epi16(inverted_weights2, top_right);
2706 const __m128i scaled_top_right3 =
2707 _mm_mullo_epi16(inverted_weights3, top_right);
2708 const __m128i scaled_top_right4 =
2709 _mm_mullo_epi16(inverted_weights4, top_right);
2710 const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1));
2711 __m128i left = cvtepu8_epi16(LoadLo8(left_column));
2712 for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
2713 __m128i y_select = _mm_set1_epi32(y_mask);
2714 __m128i left_y = _mm_shuffle_epi8(left, y_select);
2715 write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2,
2716 scaled_top_right1, scaled_top_right2, round);
2717 write_smooth_directional_sum16(dst + 16, left_y, left_y, weights3, weights4,
2718 scaled_top_right3, scaled_top_right4, round);
2719 dst += stride;
2720 }
2721 left = cvtepu8_epi16(LoadLo8(left_column + 8));
2722 for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
2723 __m128i y_select = _mm_set1_epi32(y_mask);
2724 __m128i left_y = _mm_shuffle_epi8(left, y_select);
2725 write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2,
2726 scaled_top_right1, scaled_top_right2, round);
2727 write_smooth_directional_sum16(dst + 16, left_y, left_y, weights3, weights4,
2728 scaled_top_right3, scaled_top_right4, round);
2729 dst += stride;
2730 }
2731 left = cvtepu8_epi16(LoadLo8(left_column + 16));
2732 for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
2733 __m128i y_select = _mm_set1_epi32(y_mask);
2734 __m128i left_y = _mm_shuffle_epi8(left, y_select);
2735 write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2,
2736 scaled_top_right1, scaled_top_right2, round);
2737 write_smooth_directional_sum16(dst + 16, left_y, left_y, weights3, weights4,
2738 scaled_top_right3, scaled_top_right4, round);
2739 dst += stride;
2740 }
2741 left = cvtepu8_epi16(LoadLo8(left_column + 24));
2742 for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
2743 __m128i y_select = _mm_set1_epi32(y_mask);
2744 __m128i left_y = _mm_shuffle_epi8(left, y_select);
2745 write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2,
2746 scaled_top_right1, scaled_top_right2, round);
2747 write_smooth_directional_sum16(dst + 16, left_y, left_y, weights3, weights4,
2748 scaled_top_right3, scaled_top_right4, round);
2749 dst += stride;
2750 }
2751 }
2752
2753 void aom_smooth_h_predictor_32x64_ssse3(
2754 uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
2755 const uint8_t *LIBAOM_RESTRICT top_row,
2756 const uint8_t *LIBAOM_RESTRICT left_column) {
2757 const __m128i top_right = _mm_set1_epi16(top_row[31]);
2758 const __m128i weights_lo = LoadUnaligned16(smooth_weights + 28);
2759 const __m128i weights_hi = LoadUnaligned16(smooth_weights + 44);
2760 const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
2761 const __m128i weights1 = cvtepu8_epi16(weights_lo);
2762 const __m128i weights2 = cvtepu8_epi16(_mm_srli_si128(weights_lo, 8));
2763 const __m128i weights3 = cvtepu8_epi16(weights_hi);
2764 const __m128i weights4 = cvtepu8_epi16(_mm_srli_si128(weights_hi, 8));
2765 const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1);
2766 const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2);
2767 const __m128i inverted_weights3 = _mm_sub_epi16(scale, weights3);
2768 const __m128i inverted_weights4 = _mm_sub_epi16(scale, weights4);
2769 const __m128i scaled_top_right1 =
2770 _mm_mullo_epi16(inverted_weights1, top_right);
2771 const __m128i scaled_top_right2 =
2772 _mm_mullo_epi16(inverted_weights2, top_right);
2773 const __m128i scaled_top_right3 =
2774 _mm_mullo_epi16(inverted_weights3, top_right);
2775 const __m128i scaled_top_right4 =
2776 _mm_mullo_epi16(inverted_weights4, top_right);
2777 const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1));
2778 for (int left_offset = 0; left_offset < 64; left_offset += 8) {
2779 const __m128i left = cvtepu8_epi16(LoadLo8(left_column + left_offset));
2780 for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
2781 const __m128i y_select = _mm_set1_epi32(y_mask);
2782 const __m128i left_y = _mm_shuffle_epi8(left, y_select);
2783 write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2,
2784 scaled_top_right1, scaled_top_right2,
2785 round);
2786 write_smooth_directional_sum16(dst + 16, left_y, left_y, weights3,
2787 weights4, scaled_top_right3,
2788 scaled_top_right4, round);
2789 dst += stride;
2790 }
2791 }
2792 }
2793
2794 #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
2795 void aom_smooth_h_predictor_64x16_ssse3(
2796 uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
2797 const uint8_t *LIBAOM_RESTRICT top_row,
2798 const uint8_t *LIBAOM_RESTRICT left_column) {
2799 const __m128i top_right = _mm_set1_epi16(top_row[63]);
2800 const __m128i left1 = cvtepu8_epi16(LoadLo8(left_column));
2801 const __m128i weights_lolo = LoadUnaligned16(smooth_weights + 60);
2802 const __m128i weights_lohi = LoadUnaligned16(smooth_weights + 76);
2803 const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
2804 const __m128i weights1 = cvtepu8_epi16(weights_lolo);
2805 const __m128i weights2 = cvtepu8_epi16(_mm_srli_si128(weights_lolo, 8));
2806 const __m128i weights3 = cvtepu8_epi16(weights_lohi);
2807 const __m128i weights4 = cvtepu8_epi16(_mm_srli_si128(weights_lohi, 8));
2808 const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1);
2809 const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2);
2810 const __m128i inverted_weights3 = _mm_sub_epi16(scale, weights3);
2811 const __m128i inverted_weights4 = _mm_sub_epi16(scale, weights4);
2812 const __m128i scaled_top_right1 =
2813 _mm_mullo_epi16(inverted_weights1, top_right);
2814 const __m128i scaled_top_right2 =
2815 _mm_mullo_epi16(inverted_weights2, top_right);
2816 const __m128i scaled_top_right3 =
2817 _mm_mullo_epi16(inverted_weights3, top_right);
2818 const __m128i scaled_top_right4 =
2819 _mm_mullo_epi16(inverted_weights4, top_right);
2820 const __m128i weights_hilo = LoadUnaligned16(smooth_weights + 92);
2821 const __m128i weights_hihi = LoadUnaligned16(smooth_weights + 108);
2822 const __m128i weights5 = cvtepu8_epi16(weights_hilo);
2823 const __m128i weights6 = cvtepu8_epi16(_mm_srli_si128(weights_hilo, 8));
2824 const __m128i weights7 = cvtepu8_epi16(weights_hihi);
2825 const __m128i weights8 = cvtepu8_epi16(_mm_srli_si128(weights_hihi, 8));
2826 const __m128i inverted_weights5 = _mm_sub_epi16(scale, weights5);
2827 const __m128i inverted_weights6 = _mm_sub_epi16(scale, weights6);
2828 const __m128i inverted_weights7 = _mm_sub_epi16(scale, weights7);
2829 const __m128i inverted_weights8 = _mm_sub_epi16(scale, weights8);
2830 const __m128i scaled_top_right5 =
2831 _mm_mullo_epi16(inverted_weights5, top_right);
2832 const __m128i scaled_top_right6 =
2833 _mm_mullo_epi16(inverted_weights6, top_right);
2834 const __m128i scaled_top_right7 =
2835 _mm_mullo_epi16(inverted_weights7, top_right);
2836 const __m128i scaled_top_right8 =
2837 _mm_mullo_epi16(inverted_weights8, top_right);
2838 const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1));
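// Each 64-wide row is emitted as four 16-pixel stores, using successive
// 16-entry slices of the width-64 weight table.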
2839 for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
2840 __m128i y_select = _mm_set1_epi32(y_mask);
2841 __m128i left_y = _mm_shuffle_epi8(left1, y_select);
2842 write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2,
2843 scaled_top_right1, scaled_top_right2, round);
2844 write_smooth_directional_sum16(dst + 16, left_y, left_y, weights3, weights4,
2845 scaled_top_right3, scaled_top_right4, round);
2846 write_smooth_directional_sum16(dst + 32, left_y, left_y, weights5, weights6,
2847 scaled_top_right5, scaled_top_right6, round);
2848 write_smooth_directional_sum16(dst + 48, left_y, left_y, weights7, weights8,
2849 scaled_top_right7, scaled_top_right8, round);
2850 dst += stride;
2851 }
2852 const __m128i left2 = cvtepu8_epi16(LoadLo8(left_column + 8));
2853 for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
2854 __m128i y_select = _mm_set1_epi32(y_mask);
2855 __m128i left_y = _mm_shuffle_epi8(left2, y_select);
2856 write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2,
2857 scaled_top_right1, scaled_top_right2, round);
2858 write_smooth_directional_sum16(dst + 16, left_y, left_y, weights3, weights4,
2859 scaled_top_right3, scaled_top_right4, round);
2860 write_smooth_directional_sum16(dst + 32, left_y, left_y, weights5, weights6,
2861 scaled_top_right5, scaled_top_right6, round);
2862 write_smooth_directional_sum16(dst + 48, left_y, left_y, weights7, weights8,
2863 scaled_top_right7, scaled_top_right8, round);
2864 dst += stride;
2865 }
2866 }
2867 #endif // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
2868
2869 void aom_smooth_h_predictor_64x32_ssse3(
2870 uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
2871 const uint8_t *LIBAOM_RESTRICT top_row,
2872 const uint8_t *LIBAOM_RESTRICT left_column) {
2873 const __m128i top_right = _mm_set1_epi16(top_row[63]);
2874 const __m128i left1 = cvtepu8_epi16(LoadLo8(left_column));
2875 const __m128i weights_lolo = LoadUnaligned16(smooth_weights + 60);
2876 const __m128i weights_lohi = LoadUnaligned16(smooth_weights + 76);
2877 const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
2878 const __m128i weights1 = cvtepu8_epi16(weights_lolo);
2879 const __m128i weights2 = cvtepu8_epi16(_mm_srli_si128(weights_lolo, 8));
2880 const __m128i weights3 = cvtepu8_epi16(weights_lohi);
2881 const __m128i weights4 = cvtepu8_epi16(_mm_srli_si128(weights_lohi, 8));
2882 const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1);
2883 const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2);
2884 const __m128i inverted_weights3 = _mm_sub_epi16(scale, weights3);
2885 const __m128i inverted_weights4 = _mm_sub_epi16(scale, weights4);
2886 const __m128i scaled_top_right1 =
2887 _mm_mullo_epi16(inverted_weights1, top_right);
2888 const __m128i scaled_top_right2 =
2889 _mm_mullo_epi16(inverted_weights2, top_right);
2890 const __m128i scaled_top_right3 =
2891 _mm_mullo_epi16(inverted_weights3, top_right);
2892 const __m128i scaled_top_right4 =
2893 _mm_mullo_epi16(inverted_weights4, top_right);
2894 const __m128i weights_hilo = LoadUnaligned16(smooth_weights + 92);
2895 const __m128i weights_hihi = LoadUnaligned16(smooth_weights + 108);
2896 const __m128i weights5 = cvtepu8_epi16(weights_hilo);
2897 const __m128i weights6 = cvtepu8_epi16(_mm_srli_si128(weights_hilo, 8));
2898 const __m128i weights7 = cvtepu8_epi16(weights_hihi);
2899 const __m128i weights8 = cvtepu8_epi16(_mm_srli_si128(weights_hihi, 8));
2900 const __m128i inverted_weights5 = _mm_sub_epi16(scale, weights5);
2901 const __m128i inverted_weights6 = _mm_sub_epi16(scale, weights6);
2902 const __m128i inverted_weights7 = _mm_sub_epi16(scale, weights7);
2903 const __m128i inverted_weights8 = _mm_sub_epi16(scale, weights8);
2904 const __m128i scaled_top_right5 =
2905 _mm_mullo_epi16(inverted_weights5, top_right);
2906 const __m128i scaled_top_right6 =
2907 _mm_mullo_epi16(inverted_weights6, top_right);
2908 const __m128i scaled_top_right7 =
2909 _mm_mullo_epi16(inverted_weights7, top_right);
2910 const __m128i scaled_top_right8 =
2911 _mm_mullo_epi16(inverted_weights8, top_right);
2912 const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1));
2913 for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
2914 const __m128i y_select = _mm_set1_epi32(y_mask);
2915 const __m128i left_y = _mm_shuffle_epi8(left1, y_select);
2916 write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2,
2917 scaled_top_right1, scaled_top_right2, round);
2918 write_smooth_directional_sum16(dst + 16, left_y, left_y, weights3, weights4,
2919 scaled_top_right3, scaled_top_right4, round);
2920 write_smooth_directional_sum16(dst + 32, left_y, left_y, weights5, weights6,
2921 scaled_top_right5, scaled_top_right6, round);
2922 write_smooth_directional_sum16(dst + 48, left_y, left_y, weights7, weights8,
2923 scaled_top_right7, scaled_top_right8, round);
2924 dst += stride;
2925 }
2926 const __m128i left2 = cvtepu8_epi16(LoadLo8(left_column + 8));
2927 for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
2928 const __m128i y_select = _mm_set1_epi32(y_mask);
2929 const __m128i left_y = _mm_shuffle_epi8(left2, y_select);
2930 write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2,
2931 scaled_top_right1, scaled_top_right2, round);
2932 write_smooth_directional_sum16(dst + 16, left_y, left_y, weights3, weights4,
2933 scaled_top_right3, scaled_top_right4, round);
2934 write_smooth_directional_sum16(dst + 32, left_y, left_y, weights5, weights6,
2935 scaled_top_right5, scaled_top_right6, round);
2936 write_smooth_directional_sum16(dst + 48, left_y, left_y, weights7, weights8,
2937 scaled_top_right7, scaled_top_right8, round);
2938 dst += stride;
2939 }
2940 const __m128i left3 = cvtepu8_epi16(LoadLo8(left_column + 16));
2941 for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
2942 const __m128i y_select = _mm_set1_epi32(y_mask);
2943 const __m128i left_y = _mm_shuffle_epi8(left3, y_select);
2944 write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2,
2945 scaled_top_right1, scaled_top_right2, round);
2946 write_smooth_directional_sum16(dst + 16, left_y, left_y, weights3, weights4,
2947 scaled_top_right3, scaled_top_right4, round);
2948 write_smooth_directional_sum16(dst + 32, left_y, left_y, weights5, weights6,
2949 scaled_top_right5, scaled_top_right6, round);
2950 write_smooth_directional_sum16(dst + 48, left_y, left_y, weights7, weights8,
2951 scaled_top_right7, scaled_top_right8, round);
2952 dst += stride;
2953 }
2954 const __m128i left4 = cvtepu8_epi16(LoadLo8(left_column + 24));
2955 for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
2956 const __m128i y_select = _mm_set1_epi32(y_mask);
2957 const __m128i left_y = _mm_shuffle_epi8(left4, y_select);
2958 write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2,
2959 scaled_top_right1, scaled_top_right2, round);
2960 write_smooth_directional_sum16(dst + 16, left_y, left_y, weights3, weights4,
2961 scaled_top_right3, scaled_top_right4, round);
2962 write_smooth_directional_sum16(dst + 32, left_y, left_y, weights5, weights6,
2963 scaled_top_right5, scaled_top_right6, round);
2964 write_smooth_directional_sum16(dst + 48, left_y, left_y, weights7, weights8,
2965 scaled_top_right7, scaled_top_right8, round);
2966 dst += stride;
2967 }
2968 }
2969
2970 void aom_smooth_h_predictor_64x64_ssse3(
2971 uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
2972 const uint8_t *LIBAOM_RESTRICT top_row,
2973 const uint8_t *LIBAOM_RESTRICT left_column) {
2974 const __m128i top_right = _mm_set1_epi16(top_row[63]);
2975 const __m128i weights_lolo = LoadUnaligned16(smooth_weights + 60);
2976 const __m128i weights_lohi = LoadUnaligned16(smooth_weights + 76);
2977 const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
2978 const __m128i weights1 = cvtepu8_epi16(weights_lolo);
2979 const __m128i weights2 = cvtepu8_epi16(_mm_srli_si128(weights_lolo, 8));
2980 const __m128i weights3 = cvtepu8_epi16(weights_lohi);
2981 const __m128i weights4 = cvtepu8_epi16(_mm_srli_si128(weights_lohi, 8));
2982 const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1);
2983 const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2);
2984 const __m128i inverted_weights3 = _mm_sub_epi16(scale, weights3);
2985 const __m128i inverted_weights4 = _mm_sub_epi16(scale, weights4);
2986 const __m128i scaled_top_right1 =
2987 _mm_mullo_epi16(inverted_weights1, top_right);
2988 const __m128i scaled_top_right2 =
2989 _mm_mullo_epi16(inverted_weights2, top_right);
2990 const __m128i scaled_top_right3 =
2991 _mm_mullo_epi16(inverted_weights3, top_right);
2992 const __m128i scaled_top_right4 =
2993 _mm_mullo_epi16(inverted_weights4, top_right);
2994 const __m128i weights_hilo = LoadUnaligned16(smooth_weights + 92);
2995 const __m128i weights_hihi = LoadUnaligned16(smooth_weights + 108);
2996 const __m128i weights5 = cvtepu8_epi16(weights_hilo);
2997 const __m128i weights6 = cvtepu8_epi16(_mm_srli_si128(weights_hilo, 8));
2998 const __m128i weights7 = cvtepu8_epi16(weights_hihi);
2999 const __m128i weights8 = cvtepu8_epi16(_mm_srli_si128(weights_hihi, 8));
3000 const __m128i inverted_weights5 = _mm_sub_epi16(scale, weights5);
3001 const __m128i inverted_weights6 = _mm_sub_epi16(scale, weights6);
3002 const __m128i inverted_weights7 = _mm_sub_epi16(scale, weights7);
3003 const __m128i inverted_weights8 = _mm_sub_epi16(scale, weights8);
3004 const __m128i scaled_top_right5 =
3005 _mm_mullo_epi16(inverted_weights5, top_right);
3006 const __m128i scaled_top_right6 =
3007 _mm_mullo_epi16(inverted_weights6, top_right);
3008 const __m128i scaled_top_right7 =
3009 _mm_mullo_epi16(inverted_weights7, top_right);
3010 const __m128i scaled_top_right8 =
3011 _mm_mullo_epi16(inverted_weights8, top_right);
3012 const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1));
3013 for (int left_offset = 0; left_offset < 64; left_offset += 8) {
3014 const __m128i left = cvtepu8_epi16(LoadLo8(left_column + left_offset));
3015 for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
3016 const __m128i y_select = _mm_set1_epi32(y_mask);
3017 const __m128i left_y = _mm_shuffle_epi8(left, y_select);
3018 write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2,
3019 scaled_top_right1, scaled_top_right2,
3020 round);
3021 write_smooth_directional_sum16(dst + 16, left_y, left_y, weights3,
3022 weights4, scaled_top_right3,
3023 scaled_top_right4, round);
3024 write_smooth_directional_sum16(dst + 32, left_y, left_y, weights5,
3025 weights6, scaled_top_right5,
3026 scaled_top_right6, round);
3027 write_smooth_directional_sum16(dst + 48, left_y, left_y, weights7,
3028 weights8, scaled_top_right7,
3029 scaled_top_right8, round);
3030 dst += stride;
3031 }
3032 }
3033 }
3034