1 /*
2 * Copyright (c) 2017, Alliance for Open Media. All rights reserved.
3 *
4 * This source code is subject to the terms of the BSD 2 Clause License and
5 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
6 * was not distributed with this source code in the LICENSE file, you can
7 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
8 * Media Patent License 1.0 was not distributed with this source code in the
9 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
10 */
11
#include <emmintrin.h>
#include <string.h>
#include "aom_dsp/x86/intrapred_x86.h"
#include "config/aom_dsp_rtcd.h"
15
// Writes the 4 bytes of `dc` to the start of each row of a 4-pixel-wide block.
//
//   dc     - four packed prediction bytes (callers pass the same byte 4 times)
//   height - number of rows; rows are written two per iteration, so an odd
//            height would write one extra row
//   dst    - first byte of the first row
//   stride - distance in bytes between consecutive rows
//
// The bytes are copied with memcpy instead of a `*(uint32_t *)dst` store:
// `dst` points at uint8_t data, so a uint32_t lvalue store would break the
// effective-type (strict aliasing) rule and would also assume 4-byte
// alignment. Compilers lower the fixed-size memcpy to a single 32-bit store.
static inline void dc_store_4xh(uint32_t dc, int height, uint8_t *dst,
                                ptrdiff_t stride) {
  for (int i = 0; i < height; i += 2) {
    memcpy(dst, &dc, sizeof(dc));
    dst += stride;
    memcpy(dst, &dc, sizeof(dc));
    dst += stride;
  }
}
25
// Stores the low 8 bytes of `*row` at the start of each of `height` rows.
// `dst` advances by `stride` bytes per row. _mm_storel_epi64 has no alignment
// requirement, so `dst` may be at any address.
static inline void dc_store_8xh(const __m128i *row, int height, uint8_t *dst,
                                ptrdiff_t stride) {
  while (height-- > 0) {
    _mm_storel_epi64((__m128i *)dst, *row);
    dst += stride;
  }
}
34
// Stores the full 16-byte vector `*row` at the start of each of `height` rows.
// `dst` advances by `stride` bytes per row. _mm_store_si128 is the aligned
// store, so every row address must be 16-byte aligned.
static inline void dc_store_16xh(const __m128i *row, int height, uint8_t *dst,
                                 ptrdiff_t stride) {
  while (height-- > 0) {
    _mm_store_si128((__m128i *)dst, *row);
    dst += stride;
  }
}
43
// Fills the first 32 bytes of each of `height` rows with two copies of the
// 16-byte vector `*row`. `dst` advances by `stride` bytes per row. Aligned
// stores are used, so every row address must be 16-byte aligned.
static inline void dc_store_32xh(const __m128i *row, int height, uint8_t *dst,
                                 ptrdiff_t stride) {
  while (height-- > 0) {
    __m128i *const out = (__m128i *)dst;
    _mm_store_si128(out, *row);
    _mm_store_si128(out + 1, *row);
    dst += stride;
  }
}
53
// Fills the first 64 bytes of each of `height` rows with four copies of the
// 16-byte vector `*row`. `dst` advances by `stride` bytes per row. Aligned
// stores are used, so every row address must be 16-byte aligned.
static inline void dc_store_64xh(const __m128i *row, int height, uint8_t *dst,
                                 ptrdiff_t stride) {
  while (height-- > 0) {
    __m128i *const out = (__m128i *)dst;
    _mm_store_si128(out, *row);
    _mm_store_si128(out + 1, *row);
    _mm_store_si128(out + 2, *row);
    _mm_store_si128(out + 3, *row);
    dst += stride;
  }
}
64
// Returns a vector whose low 64-bit lane holds ref[0] + ref[1] + ref[2] +
// ref[3].
//
// Note that the load reads 8 bytes from `ref`. Widening the bytes to 16-bit
// words before the SAD pushes ref[4..7] into the upper lane, so only the first
// four pixels contribute to the low lane that callers extract.
static inline __m128i dc_sum_4(const uint8_t *ref) {
  const __m128i zero = _mm_setzero_si128();
  const __m128i bytes = _mm_loadl_epi64((const __m128i *)ref);
  const __m128i words = _mm_unpacklo_epi8(bytes, zero);
  return _mm_sad_epu8(words, zero);
}
71
// Returns a vector whose low 64-bit lane holds the sum of ref[0..7].
// The SAD against zero adds up the eight loaded bytes.
static inline __m128i dc_sum_8(const uint8_t *ref) {
  const __m128i pixels = _mm_loadl_epi64((const __m128i *)ref);
  return _mm_sad_epu8(pixels, _mm_setzero_si128());
}
77
// Returns a vector whose low 16 bits hold the sum of ref[0..63].
// `ref` is read with aligned loads and must therefore be 16-byte aligned.
static inline __m128i dc_sum_64(const uint8_t *ref) {
  const __m128i zero = _mm_setzero_si128();
  const __m128i *const src = (const __m128i *)ref;

  // Each SAD yields two partial sums (one per 64-bit lane); accumulate the
  // four 16-byte chunks lane-wise.
  __m128i acc = _mm_sad_epu8(_mm_load_si128(src), zero);
  for (int k = 1; k < 4; ++k) {
    acc = _mm_add_epi16(acc, _mm_sad_epu8(_mm_load_si128(src + k), zero));
  }

  // Fold the upper lane onto the lower one to get the total.
  return _mm_add_epi16(acc, _mm_unpackhi_epi64(acc, acc));
}
94
// Fixed-point reciprocals with DC_SHIFT2 fractional bits:
// 0x5556 is about 2^16 / 3 and 0x3334 is about 2^16 / 5.
#define DC_MULTIPLIER_1X2 0x5556
#define DC_MULTIPLIER_1X4 0x3334

#define DC_SHIFT2 16

// Computes ((num >> shift1) * multiplier) >> DC_SHIFT2, i.e. a division by a
// power of two followed by a multiplication with a 16-bit fixed-point
// reciprocal.
static inline int divide_using_multiply_shift(int num, int shift1,
                                              int multiplier) {
  return ((num >> shift1) * multiplier) >> DC_SHIFT2;
}
105
106 // -----------------------------------------------------------------------------
107 // DC_PRED
108
aom_dc_predictor_4x8_sse2(uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left)109 void aom_dc_predictor_4x8_sse2(uint8_t *dst, ptrdiff_t stride,
110 const uint8_t *above, const uint8_t *left) {
111 const __m128i sum_left = dc_sum_8(left);
112 __m128i sum_above = dc_sum_4(above);
113 sum_above = _mm_add_epi16(sum_left, sum_above);
114
115 uint32_t sum = (uint32_t)_mm_cvtsi128_si32(sum_above);
116 sum += 6;
117 sum = divide_using_multiply_shift(sum, 2, DC_MULTIPLIER_1X2);
118
119 const __m128i row = _mm_set1_epi8((int8_t)sum);
120 const uint32_t pred = (uint32_t)_mm_cvtsi128_si32(row);
121 dc_store_4xh(pred, 8, dst, stride);
122 }
123
124 #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
aom_dc_predictor_4x16_sse2(uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left)125 void aom_dc_predictor_4x16_sse2(uint8_t *dst, ptrdiff_t stride,
126 const uint8_t *above, const uint8_t *left) {
127 const __m128i sum_left = dc_sum_16_sse2(left);
128 __m128i sum_above = dc_sum_4(above);
129 sum_above = _mm_add_epi16(sum_left, sum_above);
130
131 uint32_t sum = (uint32_t)_mm_cvtsi128_si32(sum_above);
132 sum += 10;
133 sum = divide_using_multiply_shift(sum, 2, DC_MULTIPLIER_1X4);
134
135 const __m128i row = _mm_set1_epi8((int8_t)sum);
136 const uint32_t pred = (uint32_t)_mm_cvtsi128_si32(row);
137 dc_store_4xh(pred, 16, dst, stride);
138 }
139 #endif // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
140
aom_dc_predictor_8x4_sse2(uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left)141 void aom_dc_predictor_8x4_sse2(uint8_t *dst, ptrdiff_t stride,
142 const uint8_t *above, const uint8_t *left) {
143 const __m128i sum_left = dc_sum_4(left);
144 __m128i sum_above = dc_sum_8(above);
145 sum_above = _mm_add_epi16(sum_above, sum_left);
146
147 uint32_t sum = (uint32_t)_mm_cvtsi128_si32(sum_above);
148 sum += 6;
149 sum = divide_using_multiply_shift(sum, 2, DC_MULTIPLIER_1X2);
150
151 const __m128i row = _mm_set1_epi8((int8_t)sum);
152 dc_store_8xh(&row, 4, dst, stride);
153 }
154
aom_dc_predictor_8x16_sse2(uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left)155 void aom_dc_predictor_8x16_sse2(uint8_t *dst, ptrdiff_t stride,
156 const uint8_t *above, const uint8_t *left) {
157 const __m128i sum_left = dc_sum_16_sse2(left);
158 __m128i sum_above = dc_sum_8(above);
159 sum_above = _mm_add_epi16(sum_above, sum_left);
160
161 uint32_t sum = (uint32_t)_mm_cvtsi128_si32(sum_above);
162 sum += 12;
163 sum = divide_using_multiply_shift(sum, 3, DC_MULTIPLIER_1X2);
164 const __m128i row = _mm_set1_epi8((int8_t)sum);
165 dc_store_8xh(&row, 16, dst, stride);
166 }
167
168 #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
aom_dc_predictor_8x32_sse2(uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left)169 void aom_dc_predictor_8x32_sse2(uint8_t *dst, ptrdiff_t stride,
170 const uint8_t *above, const uint8_t *left) {
171 const __m128i sum_left = dc_sum_32_sse2(left);
172 __m128i sum_above = dc_sum_8(above);
173 sum_above = _mm_add_epi16(sum_above, sum_left);
174
175 uint32_t sum = (uint32_t)_mm_cvtsi128_si32(sum_above);
176 sum += 20;
177 sum = divide_using_multiply_shift(sum, 3, DC_MULTIPLIER_1X4);
178 const __m128i row = _mm_set1_epi8((int8_t)sum);
179 dc_store_8xh(&row, 32, dst, stride);
180 }
181
aom_dc_predictor_16x4_sse2(uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left)182 void aom_dc_predictor_16x4_sse2(uint8_t *dst, ptrdiff_t stride,
183 const uint8_t *above, const uint8_t *left) {
184 const __m128i sum_left = dc_sum_4(left);
185 __m128i sum_above = dc_sum_16_sse2(above);
186 sum_above = _mm_add_epi16(sum_above, sum_left);
187
188 uint32_t sum = (uint32_t)_mm_cvtsi128_si32(sum_above);
189 sum += 10;
190 sum = divide_using_multiply_shift(sum, 2, DC_MULTIPLIER_1X4);
191 const __m128i row = _mm_set1_epi8((int8_t)sum);
192 dc_store_16xh(&row, 4, dst, stride);
193 }
194 #endif // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
195
aom_dc_predictor_16x8_sse2(uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left)196 void aom_dc_predictor_16x8_sse2(uint8_t *dst, ptrdiff_t stride,
197 const uint8_t *above, const uint8_t *left) {
198 const __m128i sum_left = dc_sum_8(left);
199 __m128i sum_above = dc_sum_16_sse2(above);
200 sum_above = _mm_add_epi16(sum_above, sum_left);
201
202 uint32_t sum = (uint32_t)_mm_cvtsi128_si32(sum_above);
203 sum += 12;
204 sum = divide_using_multiply_shift(sum, 3, DC_MULTIPLIER_1X2);
205 const __m128i row = _mm_set1_epi8((int8_t)sum);
206 dc_store_16xh(&row, 8, dst, stride);
207 }
208
aom_dc_predictor_16x32_sse2(uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left)209 void aom_dc_predictor_16x32_sse2(uint8_t *dst, ptrdiff_t stride,
210 const uint8_t *above, const uint8_t *left) {
211 const __m128i sum_left = dc_sum_32_sse2(left);
212 __m128i sum_above = dc_sum_16_sse2(above);
213 sum_above = _mm_add_epi16(sum_left, sum_above);
214
215 uint32_t sum = (uint32_t)_mm_cvtsi128_si32(sum_above);
216 sum += 24;
217 sum = divide_using_multiply_shift(sum, 4, DC_MULTIPLIER_1X2);
218 const __m128i row = _mm_set1_epi8((int8_t)sum);
219 dc_store_16xh(&row, 32, dst, stride);
220 }
221
222 #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
aom_dc_predictor_16x64_sse2(uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left)223 void aom_dc_predictor_16x64_sse2(uint8_t *dst, ptrdiff_t stride,
224 const uint8_t *above, const uint8_t *left) {
225 const __m128i sum_left = dc_sum_64(left);
226 __m128i sum_above = dc_sum_16_sse2(above);
227 sum_above = _mm_add_epi16(sum_left, sum_above);
228
229 uint32_t sum = (uint32_t)_mm_cvtsi128_si32(sum_above);
230 sum += 40;
231 sum = divide_using_multiply_shift(sum, 4, DC_MULTIPLIER_1X4);
232 const __m128i row = _mm_set1_epi8((int8_t)sum);
233 dc_store_16xh(&row, 64, dst, stride);
234 }
235
aom_dc_predictor_32x8_sse2(uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left)236 void aom_dc_predictor_32x8_sse2(uint8_t *dst, ptrdiff_t stride,
237 const uint8_t *above, const uint8_t *left) {
238 __m128i sum_above = dc_sum_32_sse2(above);
239 const __m128i sum_left = dc_sum_8(left);
240 sum_above = _mm_add_epi16(sum_above, sum_left);
241
242 uint32_t sum = (uint32_t)_mm_cvtsi128_si32(sum_above);
243 sum += 20;
244 sum = divide_using_multiply_shift(sum, 3, DC_MULTIPLIER_1X4);
245 const __m128i row = _mm_set1_epi8((int8_t)sum);
246 dc_store_32xh(&row, 8, dst, stride);
247 }
248 #endif // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
249
aom_dc_predictor_32x16_sse2(uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left)250 void aom_dc_predictor_32x16_sse2(uint8_t *dst, ptrdiff_t stride,
251 const uint8_t *above, const uint8_t *left) {
252 __m128i sum_above = dc_sum_32_sse2(above);
253 const __m128i sum_left = dc_sum_16_sse2(left);
254 sum_above = _mm_add_epi16(sum_above, sum_left);
255
256 uint32_t sum = (uint32_t)_mm_cvtsi128_si32(sum_above);
257 sum += 24;
258 sum = divide_using_multiply_shift(sum, 4, DC_MULTIPLIER_1X2);
259 const __m128i row = _mm_set1_epi8((int8_t)sum);
260 dc_store_32xh(&row, 16, dst, stride);
261 }
262
aom_dc_predictor_32x64_sse2(uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left)263 void aom_dc_predictor_32x64_sse2(uint8_t *dst, ptrdiff_t stride,
264 const uint8_t *above, const uint8_t *left) {
265 __m128i sum_above = dc_sum_32_sse2(above);
266 const __m128i sum_left = dc_sum_64(left);
267 sum_above = _mm_add_epi16(sum_above, sum_left);
268
269 uint32_t sum = (uint32_t)_mm_cvtsi128_si32(sum_above);
270 sum += 48;
271 sum = divide_using_multiply_shift(sum, 5, DC_MULTIPLIER_1X2);
272 const __m128i row = _mm_set1_epi8((int8_t)sum);
273 dc_store_32xh(&row, 64, dst, stride);
274 }
275
aom_dc_predictor_64x64_sse2(uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left)276 void aom_dc_predictor_64x64_sse2(uint8_t *dst, ptrdiff_t stride,
277 const uint8_t *above, const uint8_t *left) {
278 __m128i sum_above = dc_sum_64(above);
279 const __m128i sum_left = dc_sum_64(left);
280 sum_above = _mm_add_epi16(sum_above, sum_left);
281
282 uint32_t sum = (uint32_t)_mm_cvtsi128_si32(sum_above);
283 sum += 64;
284 sum /= 128;
285 const __m128i row = _mm_set1_epi8((int8_t)sum);
286 dc_store_64xh(&row, 64, dst, stride);
287 }
288
aom_dc_predictor_64x32_sse2(uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left)289 void aom_dc_predictor_64x32_sse2(uint8_t *dst, ptrdiff_t stride,
290 const uint8_t *above, const uint8_t *left) {
291 __m128i sum_above = dc_sum_64(above);
292 const __m128i sum_left = dc_sum_32_sse2(left);
293 sum_above = _mm_add_epi16(sum_above, sum_left);
294
295 uint32_t sum = (uint32_t)_mm_cvtsi128_si32(sum_above);
296 sum += 48;
297 sum = divide_using_multiply_shift(sum, 5, DC_MULTIPLIER_1X2);
298 const __m128i row = _mm_set1_epi8((int8_t)sum);
299 dc_store_64xh(&row, 32, dst, stride);
300 }
301
302 #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
aom_dc_predictor_64x16_sse2(uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left)303 void aom_dc_predictor_64x16_sse2(uint8_t *dst, ptrdiff_t stride,
304 const uint8_t *above, const uint8_t *left) {
305 __m128i sum_above = dc_sum_64(above);
306 const __m128i sum_left = dc_sum_16_sse2(left);
307 sum_above = _mm_add_epi16(sum_above, sum_left);
308
309 uint32_t sum = (uint32_t)_mm_cvtsi128_si32(sum_above);
310 sum += 40;
311 sum = divide_using_multiply_shift(sum, 4, DC_MULTIPLIER_1X4);
312 const __m128i row = _mm_set1_epi8((int8_t)sum);
313 dc_store_64xh(&row, 16, dst, stride);
314 }
315 #endif // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
316
317 // -----------------------------------------------------------------------------
318 // DC_TOP
319
aom_dc_top_predictor_4x8_sse2(uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left)320 void aom_dc_top_predictor_4x8_sse2(uint8_t *dst, ptrdiff_t stride,
321 const uint8_t *above, const uint8_t *left) {
322 (void)left;
323 __m128i sum_above = dc_sum_4(above);
324 const __m128i two = _mm_set1_epi16(2);
325 sum_above = _mm_add_epi16(sum_above, two);
326 sum_above = _mm_srai_epi16(sum_above, 2);
327 sum_above = _mm_shufflelo_epi16(sum_above, 0);
328 sum_above = _mm_packus_epi16(sum_above, sum_above);
329
330 const uint32_t pred = (uint32_t)_mm_cvtsi128_si32(sum_above);
331 dc_store_4xh(pred, 8, dst, stride);
332 }
333
334 #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
aom_dc_top_predictor_4x16_sse2(uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left)335 void aom_dc_top_predictor_4x16_sse2(uint8_t *dst, ptrdiff_t stride,
336 const uint8_t *above, const uint8_t *left) {
337 (void)left;
338 __m128i sum_above = dc_sum_4(above);
339 const __m128i two = _mm_set1_epi16(2);
340 sum_above = _mm_add_epi16(sum_above, two);
341 sum_above = _mm_srai_epi16(sum_above, 2);
342 sum_above = _mm_shufflelo_epi16(sum_above, 0);
343 sum_above = _mm_packus_epi16(sum_above, sum_above);
344
345 const uint32_t pred = (uint32_t)_mm_cvtsi128_si32(sum_above);
346 dc_store_4xh(pred, 16, dst, stride);
347 }
348 #endif // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
349
aom_dc_top_predictor_8x4_sse2(uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left)350 void aom_dc_top_predictor_8x4_sse2(uint8_t *dst, ptrdiff_t stride,
351 const uint8_t *above, const uint8_t *left) {
352 (void)left;
353 __m128i sum_above = dc_sum_8(above);
354 const __m128i four = _mm_set1_epi16(4);
355 sum_above = _mm_add_epi16(sum_above, four);
356 sum_above = _mm_srai_epi16(sum_above, 3);
357 sum_above = _mm_unpacklo_epi8(sum_above, sum_above);
358 const __m128i row = _mm_shufflelo_epi16(sum_above, 0);
359 dc_store_8xh(&row, 4, dst, stride);
360 }
361
aom_dc_top_predictor_8x16_sse2(uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left)362 void aom_dc_top_predictor_8x16_sse2(uint8_t *dst, ptrdiff_t stride,
363 const uint8_t *above, const uint8_t *left) {
364 (void)left;
365 __m128i sum_above = dc_sum_8(above);
366 const __m128i four = _mm_set1_epi16(4);
367 sum_above = _mm_add_epi16(sum_above, four);
368 sum_above = _mm_srai_epi16(sum_above, 3);
369 sum_above = _mm_unpacklo_epi8(sum_above, sum_above);
370 const __m128i row = _mm_shufflelo_epi16(sum_above, 0);
371 dc_store_8xh(&row, 16, dst, stride);
372 }
373
374 #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
aom_dc_top_predictor_8x32_sse2(uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left)375 void aom_dc_top_predictor_8x32_sse2(uint8_t *dst, ptrdiff_t stride,
376 const uint8_t *above, const uint8_t *left) {
377 (void)left;
378 __m128i sum_above = dc_sum_8(above);
379 const __m128i four = _mm_set1_epi16(4);
380 sum_above = _mm_add_epi16(sum_above, four);
381 sum_above = _mm_srai_epi16(sum_above, 3);
382 sum_above = _mm_unpacklo_epi8(sum_above, sum_above);
383 const __m128i row = _mm_shufflelo_epi16(sum_above, 0);
384 dc_store_8xh(&row, 32, dst, stride);
385 }
386
aom_dc_top_predictor_16x4_sse2(uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left)387 void aom_dc_top_predictor_16x4_sse2(uint8_t *dst, ptrdiff_t stride,
388 const uint8_t *above, const uint8_t *left) {
389 (void)left;
390 __m128i sum_above = dc_sum_16_sse2(above);
391 const __m128i eight = _mm_set1_epi16(8);
392 sum_above = _mm_add_epi16(sum_above, eight);
393 sum_above = _mm_srai_epi16(sum_above, 4);
394 sum_above = _mm_unpacklo_epi8(sum_above, sum_above);
395 sum_above = _mm_shufflelo_epi16(sum_above, 0);
396 const __m128i row = _mm_unpacklo_epi64(sum_above, sum_above);
397 dc_store_16xh(&row, 4, dst, stride);
398 }
399 #endif // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
400
aom_dc_top_predictor_16x8_sse2(uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left)401 void aom_dc_top_predictor_16x8_sse2(uint8_t *dst, ptrdiff_t stride,
402 const uint8_t *above, const uint8_t *left) {
403 (void)left;
404 __m128i sum_above = dc_sum_16_sse2(above);
405 const __m128i eight = _mm_set1_epi16(8);
406 sum_above = _mm_add_epi16(sum_above, eight);
407 sum_above = _mm_srai_epi16(sum_above, 4);
408 sum_above = _mm_unpacklo_epi8(sum_above, sum_above);
409 sum_above = _mm_shufflelo_epi16(sum_above, 0);
410 const __m128i row = _mm_unpacklo_epi64(sum_above, sum_above);
411 dc_store_16xh(&row, 8, dst, stride);
412 }
413
aom_dc_top_predictor_16x32_sse2(uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left)414 void aom_dc_top_predictor_16x32_sse2(uint8_t *dst, ptrdiff_t stride,
415 const uint8_t *above,
416 const uint8_t *left) {
417 (void)left;
418 __m128i sum_above = dc_sum_16_sse2(above);
419 const __m128i eight = _mm_set1_epi16(8);
420 sum_above = _mm_add_epi16(sum_above, eight);
421 sum_above = _mm_srai_epi16(sum_above, 4);
422 sum_above = _mm_unpacklo_epi8(sum_above, sum_above);
423 sum_above = _mm_shufflelo_epi16(sum_above, 0);
424 const __m128i row = _mm_unpacklo_epi64(sum_above, sum_above);
425 dc_store_16xh(&row, 32, dst, stride);
426 }
427
428 #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
aom_dc_top_predictor_16x64_sse2(uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left)429 void aom_dc_top_predictor_16x64_sse2(uint8_t *dst, ptrdiff_t stride,
430 const uint8_t *above,
431 const uint8_t *left) {
432 (void)left;
433 __m128i sum_above = dc_sum_16_sse2(above);
434 const __m128i eight = _mm_set1_epi16(8);
435 sum_above = _mm_add_epi16(sum_above, eight);
436 sum_above = _mm_srai_epi16(sum_above, 4);
437 sum_above = _mm_unpacklo_epi8(sum_above, sum_above);
438 sum_above = _mm_shufflelo_epi16(sum_above, 0);
439 const __m128i row = _mm_unpacklo_epi64(sum_above, sum_above);
440 dc_store_16xh(&row, 64, dst, stride);
441 }
442
aom_dc_top_predictor_32x8_sse2(uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left)443 void aom_dc_top_predictor_32x8_sse2(uint8_t *dst, ptrdiff_t stride,
444 const uint8_t *above, const uint8_t *left) {
445 (void)left;
446 __m128i sum_above = dc_sum_32_sse2(above);
447 const __m128i sixteen = _mm_set1_epi16(16);
448 sum_above = _mm_add_epi16(sum_above, sixteen);
449 sum_above = _mm_srai_epi16(sum_above, 5);
450 sum_above = _mm_unpacklo_epi8(sum_above, sum_above);
451 sum_above = _mm_shufflelo_epi16(sum_above, 0);
452 const __m128i row = _mm_unpacklo_epi64(sum_above, sum_above);
453 dc_store_32xh(&row, 8, dst, stride);
454 }
455 #endif // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
456
aom_dc_top_predictor_32x16_sse2(uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left)457 void aom_dc_top_predictor_32x16_sse2(uint8_t *dst, ptrdiff_t stride,
458 const uint8_t *above,
459 const uint8_t *left) {
460 (void)left;
461 __m128i sum_above = dc_sum_32_sse2(above);
462 const __m128i sixteen = _mm_set1_epi16(16);
463 sum_above = _mm_add_epi16(sum_above, sixteen);
464 sum_above = _mm_srai_epi16(sum_above, 5);
465 sum_above = _mm_unpacklo_epi8(sum_above, sum_above);
466 sum_above = _mm_shufflelo_epi16(sum_above, 0);
467 const __m128i row = _mm_unpacklo_epi64(sum_above, sum_above);
468 dc_store_32xh(&row, 16, dst, stride);
469 }
470
aom_dc_top_predictor_32x64_sse2(uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left)471 void aom_dc_top_predictor_32x64_sse2(uint8_t *dst, ptrdiff_t stride,
472 const uint8_t *above,
473 const uint8_t *left) {
474 (void)left;
475 __m128i sum_above = dc_sum_32_sse2(above);
476 const __m128i sixteen = _mm_set1_epi16(16);
477 sum_above = _mm_add_epi16(sum_above, sixteen);
478 sum_above = _mm_srai_epi16(sum_above, 5);
479 sum_above = _mm_unpacklo_epi8(sum_above, sum_above);
480 sum_above = _mm_shufflelo_epi16(sum_above, 0);
481 const __m128i row = _mm_unpacklo_epi64(sum_above, sum_above);
482 dc_store_32xh(&row, 64, dst, stride);
483 }
484
aom_dc_top_predictor_64x64_sse2(uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left)485 void aom_dc_top_predictor_64x64_sse2(uint8_t *dst, ptrdiff_t stride,
486 const uint8_t *above,
487 const uint8_t *left) {
488 (void)left;
489 __m128i sum_above = dc_sum_64(above);
490 const __m128i thirtytwo = _mm_set1_epi16(32);
491 sum_above = _mm_add_epi16(sum_above, thirtytwo);
492 sum_above = _mm_srai_epi16(sum_above, 6);
493 sum_above = _mm_unpacklo_epi8(sum_above, sum_above);
494 sum_above = _mm_shufflelo_epi16(sum_above, 0);
495 const __m128i row = _mm_unpacklo_epi64(sum_above, sum_above);
496 dc_store_64xh(&row, 64, dst, stride);
497 }
498
aom_dc_top_predictor_64x32_sse2(uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left)499 void aom_dc_top_predictor_64x32_sse2(uint8_t *dst, ptrdiff_t stride,
500 const uint8_t *above,
501 const uint8_t *left) {
502 (void)left;
503 __m128i sum_above = dc_sum_64(above);
504 const __m128i thirtytwo = _mm_set1_epi16(32);
505 sum_above = _mm_add_epi16(sum_above, thirtytwo);
506 sum_above = _mm_srai_epi16(sum_above, 6);
507 sum_above = _mm_unpacklo_epi8(sum_above, sum_above);
508 sum_above = _mm_shufflelo_epi16(sum_above, 0);
509 const __m128i row = _mm_unpacklo_epi64(sum_above, sum_above);
510 dc_store_64xh(&row, 32, dst, stride);
511 }
512
513 #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
aom_dc_top_predictor_64x16_sse2(uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left)514 void aom_dc_top_predictor_64x16_sse2(uint8_t *dst, ptrdiff_t stride,
515 const uint8_t *above,
516 const uint8_t *left) {
517 (void)left;
518 __m128i sum_above = dc_sum_64(above);
519 const __m128i thirtytwo = _mm_set1_epi16(32);
520 sum_above = _mm_add_epi16(sum_above, thirtytwo);
521 sum_above = _mm_srai_epi16(sum_above, 6);
522 sum_above = _mm_unpacklo_epi8(sum_above, sum_above);
523 sum_above = _mm_shufflelo_epi16(sum_above, 0);
524 const __m128i row = _mm_unpacklo_epi64(sum_above, sum_above);
525 dc_store_64xh(&row, 16, dst, stride);
526 }
527 #endif // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
528
529 // -----------------------------------------------------------------------------
530 // DC_LEFT
531
aom_dc_left_predictor_4x8_sse2(uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left)532 void aom_dc_left_predictor_4x8_sse2(uint8_t *dst, ptrdiff_t stride,
533 const uint8_t *above, const uint8_t *left) {
534 (void)above;
535 __m128i sum_left = dc_sum_8(left);
536 const __m128i four = _mm_set1_epi16(4);
537 sum_left = _mm_add_epi16(sum_left, four);
538 sum_left = _mm_srai_epi16(sum_left, 3);
539 sum_left = _mm_shufflelo_epi16(sum_left, 0);
540 sum_left = _mm_packus_epi16(sum_left, sum_left);
541
542 const uint32_t pred = (uint32_t)_mm_cvtsi128_si32(sum_left);
543 dc_store_4xh(pred, 8, dst, stride);
544 }
545
546 #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
aom_dc_left_predictor_4x16_sse2(uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left)547 void aom_dc_left_predictor_4x16_sse2(uint8_t *dst, ptrdiff_t stride,
548 const uint8_t *above,
549 const uint8_t *left) {
550 (void)above;
551 __m128i sum_left = dc_sum_16_sse2(left);
552 const __m128i eight = _mm_set1_epi16(8);
553 sum_left = _mm_add_epi16(sum_left, eight);
554 sum_left = _mm_srai_epi16(sum_left, 4);
555 sum_left = _mm_shufflelo_epi16(sum_left, 0);
556 sum_left = _mm_packus_epi16(sum_left, sum_left);
557
558 const uint32_t pred = (uint32_t)_mm_cvtsi128_si32(sum_left);
559 dc_store_4xh(pred, 16, dst, stride);
560 }
561 #endif // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
562
aom_dc_left_predictor_8x4_sse2(uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left)563 void aom_dc_left_predictor_8x4_sse2(uint8_t *dst, ptrdiff_t stride,
564 const uint8_t *above, const uint8_t *left) {
565 (void)above;
566 __m128i sum_left = dc_sum_4(left);
567 const __m128i two = _mm_set1_epi16(2);
568 sum_left = _mm_add_epi16(sum_left, two);
569 sum_left = _mm_srai_epi16(sum_left, 2);
570 sum_left = _mm_unpacklo_epi8(sum_left, sum_left);
571 const __m128i row = _mm_shufflelo_epi16(sum_left, 0);
572 dc_store_8xh(&row, 4, dst, stride);
573 }
574
aom_dc_left_predictor_8x16_sse2(uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left)575 void aom_dc_left_predictor_8x16_sse2(uint8_t *dst, ptrdiff_t stride,
576 const uint8_t *above,
577 const uint8_t *left) {
578 (void)above;
579 __m128i sum_left = dc_sum_16_sse2(left);
580 const __m128i eight = _mm_set1_epi16(8);
581 sum_left = _mm_add_epi16(sum_left, eight);
582 sum_left = _mm_srai_epi16(sum_left, 4);
583 sum_left = _mm_unpacklo_epi8(sum_left, sum_left);
584 const __m128i row = _mm_shufflelo_epi16(sum_left, 0);
585 dc_store_8xh(&row, 16, dst, stride);
586 }
587
588 #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
aom_dc_left_predictor_8x32_sse2(uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left)589 void aom_dc_left_predictor_8x32_sse2(uint8_t *dst, ptrdiff_t stride,
590 const uint8_t *above,
591 const uint8_t *left) {
592 (void)above;
593 __m128i sum_left = dc_sum_32_sse2(left);
594 const __m128i sixteen = _mm_set1_epi16(16);
595 sum_left = _mm_add_epi16(sum_left, sixteen);
596 sum_left = _mm_srai_epi16(sum_left, 5);
597 sum_left = _mm_unpacklo_epi8(sum_left, sum_left);
598 const __m128i row = _mm_shufflelo_epi16(sum_left, 0);
599 dc_store_8xh(&row, 32, dst, stride);
600 }
601
aom_dc_left_predictor_16x4_sse2(uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left)602 void aom_dc_left_predictor_16x4_sse2(uint8_t *dst, ptrdiff_t stride,
603 const uint8_t *above,
604 const uint8_t *left) {
605 (void)above;
606 __m128i sum_left = dc_sum_4(left);
607 const __m128i two = _mm_set1_epi16(2);
608 sum_left = _mm_add_epi16(sum_left, two);
609 sum_left = _mm_srai_epi16(sum_left, 2);
610 sum_left = _mm_unpacklo_epi8(sum_left, sum_left);
611 sum_left = _mm_shufflelo_epi16(sum_left, 0);
612 const __m128i row = _mm_unpacklo_epi64(sum_left, sum_left);
613 dc_store_16xh(&row, 4, dst, stride);
614 }
615 #endif // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
616
aom_dc_left_predictor_16x8_sse2(uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left)617 void aom_dc_left_predictor_16x8_sse2(uint8_t *dst, ptrdiff_t stride,
618 const uint8_t *above,
619 const uint8_t *left) {
620 (void)above;
621 __m128i sum_left = dc_sum_8(left);
622 const __m128i four = _mm_set1_epi16(4);
623 sum_left = _mm_add_epi16(sum_left, four);
624 sum_left = _mm_srai_epi16(sum_left, 3);
625 sum_left = _mm_unpacklo_epi8(sum_left, sum_left);
626 sum_left = _mm_shufflelo_epi16(sum_left, 0);
627 const __m128i row = _mm_unpacklo_epi64(sum_left, sum_left);
628 dc_store_16xh(&row, 8, dst, stride);
629 }
630
aom_dc_left_predictor_16x32_sse2(uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left)631 void aom_dc_left_predictor_16x32_sse2(uint8_t *dst, ptrdiff_t stride,
632 const uint8_t *above,
633 const uint8_t *left) {
634 (void)above;
635 __m128i sum_left = dc_sum_32_sse2(left);
636 const __m128i sixteen = _mm_set1_epi16(16);
637 sum_left = _mm_add_epi16(sum_left, sixteen);
638 sum_left = _mm_srai_epi16(sum_left, 5);
639 sum_left = _mm_unpacklo_epi8(sum_left, sum_left);
640 sum_left = _mm_shufflelo_epi16(sum_left, 0);
641 const __m128i row = _mm_unpacklo_epi64(sum_left, sum_left);
642 dc_store_16xh(&row, 32, dst, stride);
643 }
644
645 #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
aom_dc_left_predictor_16x64_sse2(uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left)646 void aom_dc_left_predictor_16x64_sse2(uint8_t *dst, ptrdiff_t stride,
647 const uint8_t *above,
648 const uint8_t *left) {
649 (void)above;
650 __m128i sum_left = dc_sum_64(left);
651 const __m128i thirtytwo = _mm_set1_epi16(32);
652 sum_left = _mm_add_epi16(sum_left, thirtytwo);
653 sum_left = _mm_srai_epi16(sum_left, 6);
654 sum_left = _mm_unpacklo_epi8(sum_left, sum_left);
655 sum_left = _mm_shufflelo_epi16(sum_left, 0);
656 const __m128i row = _mm_unpacklo_epi64(sum_left, sum_left);
657 dc_store_16xh(&row, 64, dst, stride);
658 }
659
aom_dc_left_predictor_32x8_sse2(uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left)660 void aom_dc_left_predictor_32x8_sse2(uint8_t *dst, ptrdiff_t stride,
661 const uint8_t *above,
662 const uint8_t *left) {
663 (void)above;
664 __m128i sum_left = dc_sum_8(left);
665 const __m128i four = _mm_set1_epi16(4);
666 sum_left = _mm_add_epi16(sum_left, four);
667 sum_left = _mm_srai_epi16(sum_left, 3);
668 sum_left = _mm_unpacklo_epi8(sum_left, sum_left);
669 sum_left = _mm_shufflelo_epi16(sum_left, 0);
670 const __m128i row = _mm_unpacklo_epi64(sum_left, sum_left);
671 dc_store_32xh(&row, 8, dst, stride);
672 }
673 #endif // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
674
aom_dc_left_predictor_32x16_sse2(uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left)675 void aom_dc_left_predictor_32x16_sse2(uint8_t *dst, ptrdiff_t stride,
676 const uint8_t *above,
677 const uint8_t *left) {
678 (void)above;
679 __m128i sum_left = dc_sum_16_sse2(left);
680 const __m128i eight = _mm_set1_epi16(8);
681 sum_left = _mm_add_epi16(sum_left, eight);
682 sum_left = _mm_srai_epi16(sum_left, 4);
683 sum_left = _mm_unpacklo_epi8(sum_left, sum_left);
684 sum_left = _mm_shufflelo_epi16(sum_left, 0);
685 const __m128i row = _mm_unpacklo_epi64(sum_left, sum_left);
686 dc_store_32xh(&row, 16, dst, stride);
687 }
688
aom_dc_left_predictor_32x64_sse2(uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left)689 void aom_dc_left_predictor_32x64_sse2(uint8_t *dst, ptrdiff_t stride,
690 const uint8_t *above,
691 const uint8_t *left) {
692 (void)above;
693 __m128i sum_left = dc_sum_64(left);
694 const __m128i thirtytwo = _mm_set1_epi16(32);
695 sum_left = _mm_add_epi16(sum_left, thirtytwo);
696 sum_left = _mm_srai_epi16(sum_left, 6);
697 sum_left = _mm_unpacklo_epi8(sum_left, sum_left);
698 sum_left = _mm_shufflelo_epi16(sum_left, 0);
699 const __m128i row = _mm_unpacklo_epi64(sum_left, sum_left);
700 dc_store_32xh(&row, 64, dst, stride);
701 }
702
aom_dc_left_predictor_64x64_sse2(uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left)703 void aom_dc_left_predictor_64x64_sse2(uint8_t *dst, ptrdiff_t stride,
704 const uint8_t *above,
705 const uint8_t *left) {
706 (void)above;
707 __m128i sum_left = dc_sum_64(left);
708 const __m128i thirtytwo = _mm_set1_epi16(32);
709 sum_left = _mm_add_epi16(sum_left, thirtytwo);
710 sum_left = _mm_srai_epi16(sum_left, 6);
711 sum_left = _mm_unpacklo_epi8(sum_left, sum_left);
712 sum_left = _mm_shufflelo_epi16(sum_left, 0);
713 const __m128i row = _mm_unpacklo_epi64(sum_left, sum_left);
714 dc_store_64xh(&row, 64, dst, stride);
715 }
716
aom_dc_left_predictor_64x32_sse2(uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left)717 void aom_dc_left_predictor_64x32_sse2(uint8_t *dst, ptrdiff_t stride,
718 const uint8_t *above,
719 const uint8_t *left) {
720 (void)above;
721 __m128i sum_left = dc_sum_32_sse2(left);
722 const __m128i sixteen = _mm_set1_epi16(16);
723 sum_left = _mm_add_epi16(sum_left, sixteen);
724 sum_left = _mm_srai_epi16(sum_left, 5);
725 sum_left = _mm_unpacklo_epi8(sum_left, sum_left);
726 sum_left = _mm_shufflelo_epi16(sum_left, 0);
727 const __m128i row = _mm_unpacklo_epi64(sum_left, sum_left);
728 dc_store_64xh(&row, 32, dst, stride);
729 }
730
731 #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
aom_dc_left_predictor_64x16_sse2(uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left)732 void aom_dc_left_predictor_64x16_sse2(uint8_t *dst, ptrdiff_t stride,
733 const uint8_t *above,
734 const uint8_t *left) {
735 (void)above;
736 __m128i sum_left = dc_sum_16_sse2(left);
737 const __m128i eight = _mm_set1_epi16(8);
738 sum_left = _mm_add_epi16(sum_left, eight);
739 sum_left = _mm_srai_epi16(sum_left, 4);
740 sum_left = _mm_unpacklo_epi8(sum_left, sum_left);
741 sum_left = _mm_shufflelo_epi16(sum_left, 0);
742 const __m128i row = _mm_unpacklo_epi64(sum_left, sum_left);
743 dc_store_64xh(&row, 16, dst, stride);
744 }
745 #endif // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
746
747 // -----------------------------------------------------------------------------
748 // DC_128
749
// DC_128 4x8: fills 8 rows of 4 bytes with the constant 0x80 (128).
// Neither edge is read.
void aom_dc_128_predictor_4x8_sse2(uint8_t *dst, ptrdiff_t stride,
                                   const uint8_t *above, const uint8_t *left) {
  (void)left;
  (void)above;
  // Four 0x80 bytes packed into the 32-bit word that dc_store_4xh writes per
  // row.
  dc_store_4xh(0x80808080, 8, dst, stride);
}
757
758 #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
// DC_128 4x16: fills 16 rows of 4 bytes with the constant 0x80 (128).
// Neither edge is read.
void aom_dc_128_predictor_4x16_sse2(uint8_t *dst, ptrdiff_t stride,
                                    const uint8_t *above, const uint8_t *left) {
  (void)left;
  (void)above;
  // Four 0x80 bytes packed into the 32-bit word that dc_store_4xh writes per
  // row.
  dc_store_4xh(0x80808080, 16, dst, stride);
}
766 #endif // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
767
// DC_128 8x4: fills 4 rows of 8 bytes with the constant 128.
// Neither edge is read.
void aom_dc_128_predictor_8x4_sse2(uint8_t *dst, ptrdiff_t stride,
                                   const uint8_t *above, const uint8_t *left) {
  const __m128i mid = _mm_set1_epi8((int8_t)128);
  (void)left;
  (void)above;
  dc_store_8xh(&mid, 4, dst, stride);
}
775
// DC_128 8x16: fills 16 rows of 8 bytes with the constant 128.
// Neither edge is read.
void aom_dc_128_predictor_8x16_sse2(uint8_t *dst, ptrdiff_t stride,
                                    const uint8_t *above, const uint8_t *left) {
  const __m128i mid = _mm_set1_epi8((int8_t)128);
  (void)left;
  (void)above;
  dc_store_8xh(&mid, 16, dst, stride);
}
783
784 #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
// DC_128 8x32: fills 32 rows of 8 bytes with the constant 128.
// Neither edge is read.
void aom_dc_128_predictor_8x32_sse2(uint8_t *dst, ptrdiff_t stride,
                                    const uint8_t *above, const uint8_t *left) {
  const __m128i mid = _mm_set1_epi8((int8_t)128);
  (void)left;
  (void)above;
  dc_store_8xh(&mid, 32, dst, stride);
}
792
// DC_128 16x4: fills 4 rows of 16 bytes with the constant 128 using the
// aligned stores in dc_store_16xh. Neither edge is read.
void aom_dc_128_predictor_16x4_sse2(uint8_t *dst, ptrdiff_t stride,
                                    const uint8_t *above, const uint8_t *left) {
  const __m128i mid = _mm_set1_epi8((int8_t)128);
  (void)left;
  (void)above;
  dc_store_16xh(&mid, 4, dst, stride);
}
800 #endif // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
801
// DC_128 16x8: fills 8 rows of 16 bytes with the constant 128 using the
// aligned stores in dc_store_16xh. Neither edge is read.
void aom_dc_128_predictor_16x8_sse2(uint8_t *dst, ptrdiff_t stride,
                                    const uint8_t *above, const uint8_t *left) {
  const __m128i mid = _mm_set1_epi8((int8_t)128);
  (void)left;
  (void)above;
  dc_store_16xh(&mid, 8, dst, stride);
}
809
// DC_128 16x32: fills 32 rows of 16 bytes with the constant 128 using the
// aligned stores in dc_store_16xh. Neither edge is read.
void aom_dc_128_predictor_16x32_sse2(uint8_t *dst, ptrdiff_t stride,
                                     const uint8_t *above,
                                     const uint8_t *left) {
  const __m128i mid = _mm_set1_epi8((int8_t)128);
  (void)left;
  (void)above;
  dc_store_16xh(&mid, 32, dst, stride);
}
818
819 #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
// DC_128 16x64: fills 64 rows of 16 bytes with the constant 128 using the
// aligned stores in dc_store_16xh. Neither edge is read.
void aom_dc_128_predictor_16x64_sse2(uint8_t *dst, ptrdiff_t stride,
                                     const uint8_t *above,
                                     const uint8_t *left) {
  const __m128i mid = _mm_set1_epi8((int8_t)128);
  (void)left;
  (void)above;
  dc_store_16xh(&mid, 64, dst, stride);
}
828
// DC_128 32x8: fills 8 rows of 32 bytes with the constant 128 using the
// aligned stores in dc_store_32xh. Neither edge is read.
void aom_dc_128_predictor_32x8_sse2(uint8_t *dst, ptrdiff_t stride,
                                    const uint8_t *above, const uint8_t *left) {
  const __m128i mid = _mm_set1_epi8((int8_t)128);
  (void)left;
  (void)above;
  dc_store_32xh(&mid, 8, dst, stride);
}
836 #endif // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
837
// DC_128 32x16: fills 16 rows of 32 bytes with the constant 128 using the
// aligned stores in dc_store_32xh. Neither edge is read.
void aom_dc_128_predictor_32x16_sse2(uint8_t *dst, ptrdiff_t stride,
                                     const uint8_t *above,
                                     const uint8_t *left) {
  const __m128i mid = _mm_set1_epi8((int8_t)128);
  (void)left;
  (void)above;
  dc_store_32xh(&mid, 16, dst, stride);
}
846
// DC_128 32x64: fills 64 rows of 32 bytes with the constant 128 using the
// aligned stores in dc_store_32xh. Neither edge is read.
void aom_dc_128_predictor_32x64_sse2(uint8_t *dst, ptrdiff_t stride,
                                     const uint8_t *above,
                                     const uint8_t *left) {
  const __m128i mid = _mm_set1_epi8((int8_t)128);
  (void)left;
  (void)above;
  dc_store_32xh(&mid, 64, dst, stride);
}
855
// DC_128 64x64: hands a register of sixteen 128-valued bytes to dc_store_64xh
// for 64 rows (dc_store_64xh is defined elsewhere; presumably it writes the
// register across 64 bytes per row). Neither edge is read.
void aom_dc_128_predictor_64x64_sse2(uint8_t *dst, ptrdiff_t stride,
                                     const uint8_t *above,
                                     const uint8_t *left) {
  const __m128i mid = _mm_set1_epi8((int8_t)128);
  (void)left;
  (void)above;
  dc_store_64xh(&mid, 64, dst, stride);
}
864
// DC_128 64x32: hands a register of sixteen 128-valued bytes to dc_store_64xh
// for 32 rows (dc_store_64xh is defined elsewhere; presumably it writes the
// register across 64 bytes per row). Neither edge is read.
void aom_dc_128_predictor_64x32_sse2(uint8_t *dst, ptrdiff_t stride,
                                     const uint8_t *above,
                                     const uint8_t *left) {
  const __m128i mid = _mm_set1_epi8((int8_t)128);
  (void)left;
  (void)above;
  dc_store_64xh(&mid, 32, dst, stride);
}
873
874 #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
// DC_128 64x16: hands a register of sixteen 128-valued bytes to dc_store_64xh
// for 16 rows (dc_store_64xh is defined elsewhere; presumably it writes the
// register across 64 bytes per row). Neither edge is read.
void aom_dc_128_predictor_64x16_sse2(uint8_t *dst, ptrdiff_t stride,
                                     const uint8_t *above,
                                     const uint8_t *left) {
  const __m128i mid = _mm_set1_epi8((int8_t)128);
  (void)left;
  (void)above;
  dc_store_64xh(&mid, 16, dst, stride);
}
883 #endif // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
884
885 // -----------------------------------------------------------------------------
886 // V_PRED
887
// V_PRED 4x8: copies the 4 pixels at `above` into each of the 8 output rows.
// The pixels are read as a single 32-bit word through a pointer cast, so this
// assumes `above` is suitably aligned for uint32_t access (TODO confirm with
// callers). `left` is ignored.
void aom_v_predictor_4x8_sse2(uint8_t *dst, ptrdiff_t stride,
                              const uint8_t *above, const uint8_t *left) {
  (void)left;
  dc_store_4xh(*(const uint32_t *)above, 8, dst, stride);
}
894
895 #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
// V_PRED 4x16: copies the 4 pixels at `above` into each of the 16 output rows.
// The pixels are read as a single 32-bit word through a pointer cast, so this
// assumes `above` is suitably aligned for uint32_t access (TODO confirm with
// callers). `left` is ignored.
void aom_v_predictor_4x16_sse2(uint8_t *dst, ptrdiff_t stride,
                               const uint8_t *above, const uint8_t *left) {
  (void)left;
  dc_store_4xh(*(const uint32_t *)above, 16, dst, stride);
}
902 #endif // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
903
// V_PRED 8x4: copies the 8 pixels at `above` into each of the 4 output rows.
// `left` is ignored.
void aom_v_predictor_8x4_sse2(uint8_t *dst, ptrdiff_t stride,
                              const uint8_t *above, const uint8_t *left) {
  (void)left;
  // 64-bit load: only the low 8 bytes are written back by dc_store_8xh.
  const __m128i top = _mm_loadl_epi64((__m128i const *)above);
  dc_store_8xh(&top, 4, dst, stride);
}
910
// V_PRED 8x16: copies the 8 pixels at `above` into each of the 16 output rows.
// `left` is ignored.
void aom_v_predictor_8x16_sse2(uint8_t *dst, ptrdiff_t stride,
                               const uint8_t *above, const uint8_t *left) {
  (void)left;
  // 64-bit load: only the low 8 bytes are written back by dc_store_8xh.
  const __m128i top = _mm_loadl_epi64((__m128i const *)above);
  dc_store_8xh(&top, 16, dst, stride);
}
917
918 #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
// V_PRED 8x32: copies the 8 pixels at `above` into each of the 32 output rows.
// `left` is ignored.
void aom_v_predictor_8x32_sse2(uint8_t *dst, ptrdiff_t stride,
                               const uint8_t *above, const uint8_t *left) {
  (void)left;
  // 64-bit load: only the low 8 bytes are written back by dc_store_8xh.
  const __m128i top = _mm_loadl_epi64((__m128i const *)above);
  dc_store_8xh(&top, 32, dst, stride);
}
925
// V_PRED 16x4: copies the 16 pixels at `above` into each of the 4 output rows.
// Uses an aligned 128-bit load, so `above` must be 16-byte aligned.
// `left` is ignored.
void aom_v_predictor_16x4_sse2(uint8_t *dst, ptrdiff_t stride,
                               const uint8_t *above, const uint8_t *left) {
  (void)left;
  const __m128i top = _mm_load_si128((__m128i const *)above);
  dc_store_16xh(&top, 4, dst, stride);
}
932 #endif // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
933
// V_PRED 16x8: copies the 16 pixels at `above` into each of the 8 output rows.
// Uses an aligned 128-bit load, so `above` must be 16-byte aligned.
// `left` is ignored.
void aom_v_predictor_16x8_sse2(uint8_t *dst, ptrdiff_t stride,
                               const uint8_t *above, const uint8_t *left) {
  (void)left;
  const __m128i top = _mm_load_si128((__m128i const *)above);
  dc_store_16xh(&top, 8, dst, stride);
}
940
// V_PRED 16x32: copies the 16 pixels at `above` into each of the 32 output
// rows. Uses an aligned 128-bit load, so `above` must be 16-byte aligned.
// `left` is ignored.
void aom_v_predictor_16x32_sse2(uint8_t *dst, ptrdiff_t stride,
                                const uint8_t *above, const uint8_t *left) {
  (void)left;
  const __m128i top = _mm_load_si128((__m128i const *)above);
  dc_store_16xh(&top, 32, dst, stride);
}
947
948 #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
// V_PRED 16x64: copies the 16 pixels at `above` into each of the 64 output
// rows. Uses an aligned 128-bit load, so `above` must be 16-byte aligned.
// `left` is ignored.
void aom_v_predictor_16x64_sse2(uint8_t *dst, ptrdiff_t stride,
                                const uint8_t *above, const uint8_t *left) {
  (void)left;
  const __m128i top = _mm_load_si128((__m128i const *)above);
  dc_store_16xh(&top, 64, dst, stride);
}
955 #endif // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
956
// Copies the 32 bytes at `above` into `height` consecutive rows of `dst`,
// `stride` bytes apart. All loads and stores are aligned 128-bit accesses, so
// `above` and every destination row must be 16-byte aligned.
static inline void v_predictor_32xh(uint8_t *dst, ptrdiff_t stride,
                                    const uint8_t *above, int height) {
  const __m128i *const src = (__m128i const *)above;
  const __m128i lo = _mm_load_si128(src);
  const __m128i hi = _mm_load_si128(src + 1);
  for (int rows_left = height; rows_left > 0; --rows_left) {
    __m128i *const out = (__m128i *)dst;
    _mm_store_si128(out, lo);
    _mm_store_si128(out + 1, hi);
    dst += stride;
  }
}
967
968 #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
// V_PRED 32x8: replicates the 32 pixels at `above` down 8 rows.
// `left` is ignored.
void aom_v_predictor_32x8_sse2(uint8_t *dst, ptrdiff_t stride,
                               const uint8_t *above, const uint8_t *left) {
  const int rows = 8;
  v_predictor_32xh(dst, stride, above, rows);
  (void)left;
}
974 #endif // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
975
// V_PRED 32x16: replicates the 32 pixels at `above` down 16 rows.
// `left` is ignored.
void aom_v_predictor_32x16_sse2(uint8_t *dst, ptrdiff_t stride,
                                const uint8_t *above, const uint8_t *left) {
  const int rows = 16;
  v_predictor_32xh(dst, stride, above, rows);
  (void)left;
}
981
// V_PRED 32x64: replicates the 32 pixels at `above` down 64 rows.
// `left` is ignored.
void aom_v_predictor_32x64_sse2(uint8_t *dst, ptrdiff_t stride,
                                const uint8_t *above, const uint8_t *left) {
  const int rows = 64;
  v_predictor_32xh(dst, stride, above, rows);
  (void)left;
}
987
// Copies the 64 bytes at `above` into `height` consecutive rows of `dst`,
// `stride` bytes apart. All loads and stores are aligned 128-bit accesses, so
// `above` and every destination row must be 16-byte aligned.
static inline void v_predictor_64xh(uint8_t *dst, ptrdiff_t stride,
                                    const uint8_t *above, int height) {
  const __m128i *const src = (__m128i const *)above;
  const __m128i q0 = _mm_load_si128(src);
  const __m128i q1 = _mm_load_si128(src + 1);
  const __m128i q2 = _mm_load_si128(src + 2);
  const __m128i q3 = _mm_load_si128(src + 3);
  for (int rows_left = height; rows_left > 0; --rows_left) {
    __m128i *const out = (__m128i *)dst;
    _mm_store_si128(out, q0);
    _mm_store_si128(out + 1, q1);
    _mm_store_si128(out + 2, q2);
    _mm_store_si128(out + 3, q3);
    dst += stride;
  }
}
1002
// V_PRED 64x64: replicates the 64 pixels at `above` down 64 rows.
// `left` is ignored.
void aom_v_predictor_64x64_sse2(uint8_t *dst, ptrdiff_t stride,
                                const uint8_t *above, const uint8_t *left) {
  const int rows = 64;
  v_predictor_64xh(dst, stride, above, rows);
  (void)left;
}
1008
// V_PRED 64x32: replicates the 64 pixels at `above` down 32 rows.
// `left` is ignored.
void aom_v_predictor_64x32_sse2(uint8_t *dst, ptrdiff_t stride,
                                const uint8_t *above, const uint8_t *left) {
  const int rows = 32;
  v_predictor_64xh(dst, stride, above, rows);
  (void)left;
}
1014
1015 #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
// V_PRED 64x16: replicates the 64 pixels at `above` down 16 rows.
// `left` is ignored.
void aom_v_predictor_64x16_sse2(uint8_t *dst, ptrdiff_t stride,
                                const uint8_t *above, const uint8_t *left) {
  const int rows = 16;
  v_predictor_64xh(dst, stride, above, rows);
  (void)left;
}
1021 #endif // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
1022
1023 // -----------------------------------------------------------------------------
1024 // H_PRED
1025
// H_PRED 4x8: row r of the 4-wide block is filled with left[r], r = 0..7.
// Each row is written as one 32-bit store through an int pointer.
// `above` is ignored.
void aom_h_predictor_4x8_sse2(uint8_t *dst, ptrdiff_t stride,
                              const uint8_t *above, const uint8_t *left) {
  (void)above;
  const __m128i left8 = _mm_loadl_epi64((__m128i const *)left);
  // Each left pixel duplicated into one 16-bit lane: l0l0 l1l1 ... l7l7.
  __m128i pairs = _mm_unpacklo_epi8(left8, left8);
  for (int half = 0; half < 2; ++half) {
    uint8_t *const out = dst + 4 * half * stride;
    // Splatting lane k over the low 64 bits makes the low dword 4 x left[k].
    *(int *)out = _mm_cvtsi128_si32(_mm_shufflelo_epi16(pairs, 0));
    *(int *)(out + stride) =
        _mm_cvtsi128_si32(_mm_shufflelo_epi16(pairs, 0x55));
    *(int *)(out + 2 * stride) =
        _mm_cvtsi128_si32(_mm_shufflelo_epi16(pairs, 0xaa));
    *(int *)(out + 3 * stride) =
        _mm_cvtsi128_si32(_mm_shufflelo_epi16(pairs, 0xff));
    // Bring lanes 4..7 down into the low half for the second pass.
    pairs = _mm_unpackhi_epi64(pairs, pairs);
  }
}
1056
1057 #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
// H_PRED 4x16: row r of the 4-wide block is filled with left[r], r = 0..15.
// All 16 left pixels are fetched with an aligned 128-bit load, so `left` must
// be 16-byte aligned. Each row is written as one 32-bit store through an int
// pointer. `above` is ignored.
void aom_h_predictor_4x16_sse2(uint8_t *dst, ptrdiff_t stride,
                               const uint8_t *above, const uint8_t *left) {
  (void)above;
  const __m128i left16 = _mm_load_si128((__m128i const *)left);
  // group[g] carries left[4g .. 4g+3] in its four low 16-bit lanes, each pixel
  // duplicated into both bytes of its lane.
  __m128i group[4];
  group[0] = _mm_unpacklo_epi8(left16, left16);
  group[1] = _mm_unpackhi_epi64(group[0], group[0]);
  group[2] = _mm_unpackhi_epi8(left16, left16);
  group[3] = _mm_unpackhi_epi64(group[2], group[2]);

  for (int g = 0; g < 4; ++g) {
    uint8_t *const out = dst + 4 * g * stride;
    // Splatting lane k over the low 64 bits makes the low dword
    // 4 x left[4g + k].
    *(int *)out = _mm_cvtsi128_si32(_mm_shufflelo_epi16(group[g], 0));
    *(int *)(out + stride) =
        _mm_cvtsi128_si32(_mm_shufflelo_epi16(group[g], 0x55));
    *(int *)(out + 2 * stride) =
        _mm_cvtsi128_si32(_mm_shufflelo_epi16(group[g], 0xaa));
    *(int *)(out + 3 * stride) =
        _mm_cvtsi128_si32(_mm_shufflelo_epi16(group[g], 0xff));
  }
}
1118 #endif // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
1119
// H_PRED 8x4: row r of the 8-wide block is filled with left[r], r = 0..3.
// Only the 4 left pixels that belong to the block are read. They are fetched
// as one 32-bit word (the same idiom h_predictor_32xh uses), which avoids
// reading 4 bytes past the end of a 4-entry left column; this assumes `left`
// is suitably aligned for int access. `above` is ignored.
void aom_h_predictor_8x4_sse2(uint8_t *dst, ptrdiff_t stride,
                              const uint8_t *above, const uint8_t *left) {
  (void)above;
  __m128i left_col = _mm_cvtsi32_si128(((const int *)left)[0]);
  // Each left pixel duplicated into one 16-bit lane: l0l0 l1l1 l2l2 l3l3.
  left_col = _mm_unpacklo_epi8(left_col, left_col);
  // Splat lane k over the low 64 bits to get 8 copies of left[k].
  const __m128i row0 = _mm_shufflelo_epi16(left_col, 0);
  const __m128i row1 = _mm_shufflelo_epi16(left_col, 0x55);
  const __m128i row2 = _mm_shufflelo_epi16(left_col, 0xaa);
  const __m128i row3 = _mm_shufflelo_epi16(left_col, 0xff);
  _mm_storel_epi64((__m128i *)dst, row0);
  dst += stride;
  _mm_storel_epi64((__m128i *)dst, row1);
  dst += stride;
  _mm_storel_epi64((__m128i *)dst, row2);
  dst += stride;
  _mm_storel_epi64((__m128i *)dst, row3);
}
1137
// Horizontal prediction for an 8-wide block processed as `count` stacked 8x16
// tiles: row r is filled with 8 copies of left[r]. Each tile consumes 16 left
// pixels through an aligned 128-bit load, so `left` must be 16-byte aligned.
// `above` is accepted only for signature symmetry and is not read.
static inline void h_predictor_8x16xc(uint8_t *dst, ptrdiff_t stride,
                                      const uint8_t *above, const uint8_t *left,
                                      int count) {
  (void)above;
  for (int tile = 0; tile < count; ++tile) {
    const __m128i left16 = _mm_load_si128((__m128i const *)left);
    // group[g] carries left[4g .. 4g+3] in its four low 16-bit lanes, each
    // pixel duplicated into both bytes of its lane.
    __m128i group[4];
    group[0] = _mm_unpacklo_epi8(left16, left16);
    group[1] = _mm_unpackhi_epi64(group[0], group[0]);
    group[2] = _mm_unpackhi_epi8(left16, left16);
    group[3] = _mm_unpackhi_epi64(group[2], group[2]);

    for (int g = 0; g < 4; ++g) {
      // Splatting lane k over the low 64 bits yields 8 copies of left[4g + k].
      _mm_storel_epi64((__m128i *)dst, _mm_shufflelo_epi16(group[g], 0));
      dst += stride;
      _mm_storel_epi64((__m128i *)dst, _mm_shufflelo_epi16(group[g], 0x55));
      dst += stride;
      _mm_storel_epi64((__m128i *)dst, _mm_shufflelo_epi16(group[g], 0xaa));
      dst += stride;
      _mm_storel_epi64((__m128i *)dst, _mm_shufflelo_epi16(group[g], 0xff));
      dst += stride;
    }
    left += 16;
  }
}
1203
// H_PRED 8x16: a single 8x16 tile of horizontal prediction.
// `above` is forwarded to the helper, which discards it.
void aom_h_predictor_8x16_sse2(uint8_t *dst, ptrdiff_t stride,
                               const uint8_t *above, const uint8_t *left) {
  const int tiles = 1;
  h_predictor_8x16xc(dst, stride, above, left, tiles);
}
1208
1209 #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
// H_PRED 8x32: two stacked 8x16 tiles of horizontal prediction.
// `above` is forwarded to the helper, which discards it.
void aom_h_predictor_8x32_sse2(uint8_t *dst, ptrdiff_t stride,
                               const uint8_t *above, const uint8_t *left) {
  const int tiles = 2;
  h_predictor_8x16xc(dst, stride, above, left, tiles);
}
1214 #endif // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
1215
// Writes row[0..h-1] to `h` consecutive 16-byte destination rows, `stride`
// bytes apart. Stores are aligned, so each destination row must be 16-byte
// aligned.
static inline void h_pred_store_16xh(const __m128i *row, int h, uint8_t *dst,
                                     ptrdiff_t stride) {
  int r = 0;
  while (r < h) {
    _mm_store_si128((__m128i *)dst, row[r]);
    dst += stride;
    ++r;
  }
}
1224
// For k = 0..3, sets row[k] to 16-bit lane k of *x replicated across all eight
// lanes. Only the low four lanes of *x are used.
static inline void repeat_low_4pixels(const __m128i *x, __m128i *row) {
  const __m128i src = *x;
  // Splat one lane over the low 64 bits, once per output row...
  const __m128i lane0 = _mm_shufflelo_epi16(src, 0);
  const __m128i lane1 = _mm_shufflelo_epi16(src, 0x55);
  const __m128i lane2 = _mm_shufflelo_epi16(src, 0xaa);
  const __m128i lane3 = _mm_shufflelo_epi16(src, 0xff);
  // ...then mirror the low 64 bits into the high 64 bits.
  row[0] = _mm_unpacklo_epi64(lane0, lane0);
  row[1] = _mm_unpacklo_epi64(lane1, lane1);
  row[2] = _mm_unpacklo_epi64(lane2, lane2);
  row[3] = _mm_unpacklo_epi64(lane3, lane3);
}
1236
// For k = 0..3, sets row[k] to 16-bit lane (4 + k) of *x replicated across all
// eight lanes. Only the high four lanes of *x are used.
static inline void repeat_high_4pixels(const __m128i *x, __m128i *row) {
  const __m128i src = *x;
  // Splat one lane over the high 64 bits, once per output row...
  const __m128i lane4 = _mm_shufflehi_epi16(src, 0);
  const __m128i lane5 = _mm_shufflehi_epi16(src, 0x55);
  const __m128i lane6 = _mm_shufflehi_epi16(src, 0xaa);
  const __m128i lane7 = _mm_shufflehi_epi16(src, 0xff);
  // ...then mirror the high 64 bits into the low 64 bits.
  row[0] = _mm_unpackhi_epi64(lane4, lane4);
  row[1] = _mm_unpackhi_epi64(lane5, lane5);
  row[2] = _mm_unpackhi_epi64(lane6, lane6);
  row[3] = _mm_unpackhi_epi64(lane7, lane7);
}
1248
// Rows 0-3 of a 16x8 tile: 16-bit lane k (k = 0..3) of *left is broadcast over
// output row k. The upper four lanes of *left are not used here.
static inline void h_prediction_16x8_1(const __m128i *left, uint8_t *dst,
                                       ptrdiff_t stride) {
  __m128i rows[4];
  repeat_low_4pixels(left, rows);
  h_pred_store_16xh(rows, (int)(sizeof(rows) / sizeof(rows[0])), dst, stride);
}
1257
// Rows 4-7 of a 16x8 tile: 16-bit lane 4 + k (k = 0..3) of *left is broadcast
// over the k-th row written at `dst`. The lower four lanes of *left are not
// used here.
static inline void h_prediction_16x8_2(const __m128i *left, uint8_t *dst,
                                       ptrdiff_t stride) {
  __m128i rows[4];
  repeat_high_4pixels(left, rows);
  h_pred_store_16xh(rows, (int)(sizeof(rows) / sizeof(rows[0])), dst, stride);
}
1266
1267 #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
// H_PRED 16x4: row r of the 16-wide block is filled with left[r], r = 0..3.
// Only the 4 left pixels that belong to the block are read. They are fetched
// as one 32-bit word (the same idiom h_predictor_32xh uses), which avoids
// reading 4 bytes past the end of a 4-entry left column; this assumes `left`
// is suitably aligned for int access. `above` is ignored.
void aom_h_predictor_16x4_sse2(uint8_t *dst, ptrdiff_t stride,
                               const uint8_t *above, const uint8_t *left) {
  (void)above;
  const __m128i left_col = _mm_cvtsi32_si128(((const int *)left)[0]);
  // Each left pixel duplicated into one 16-bit lane: l0l0 l1l1 l2l2 l3l3.
  // h_prediction_16x8_1 consumes only these four low lanes.
  const __m128i left_col_8p = _mm_unpacklo_epi8(left_col, left_col);
  h_prediction_16x8_1(&left_col_8p, dst, stride);
}
1275 #endif // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
1276
// H_PRED 16x8: row r of the 16-wide block is filled with left[r], r = 0..7.
// `above` is ignored.
void aom_h_predictor_16x8_sse2(uint8_t *dst, ptrdiff_t stride,
                               const uint8_t *above, const uint8_t *left) {
  (void)above;
  const __m128i left8 = _mm_loadl_epi64((const __m128i *)left);
  // Each left pixel duplicated into one 16-bit lane: l0l0 l1l1 ... l7l7.
  const __m128i pairs = _mm_unpacklo_epi8(left8, left8);
  h_prediction_16x8_1(&pairs, dst, stride);               // rows 0-3
  h_prediction_16x8_2(&pairs, dst + 4 * stride, stride);  // rows 4-7
}
1286
// Horizontal prediction for a 16-wide block processed as `count` stacked
// 16x16 tiles. Each tile consumes 16 left pixels through an aligned 128-bit
// load, so `left` must be 16-byte aligned. The loop is bottom-tested: one
// tile is always written, even when count <= 0.
static inline void h_predictor_16xh(uint8_t *dst, ptrdiff_t stride,
                                    const uint8_t *left, int count) {
  int done = 0;
  do {
    const __m128i left16 = _mm_load_si128((const __m128i *)left);
    // Pixels 0-7 and 8-15 of this tile, each duplicated into a 16-bit lane.
    const __m128i pairs_lo = _mm_unpacklo_epi8(left16, left16);
    const __m128i pairs_hi = _mm_unpackhi_epi8(left16, left16);

    h_prediction_16x8_1(&pairs_lo, dst, stride);                // rows 0-3
    h_prediction_16x8_2(&pairs_lo, dst + 4 * stride, stride);   // rows 4-7
    h_prediction_16x8_1(&pairs_hi, dst + 8 * stride, stride);   // rows 8-11
    h_prediction_16x8_2(&pairs_hi, dst + 12 * stride, stride);  // rows 12-15

    dst += 16 * stride;
    left += 16;
  } while (++done < count);
}
1308
// H_PRED 16x32: two stacked 16x16 tiles of horizontal prediction.
// `above` is ignored.
void aom_h_predictor_16x32_sse2(uint8_t *dst, ptrdiff_t stride,
                                const uint8_t *above, const uint8_t *left) {
  const int tiles = 2;
  h_predictor_16xh(dst, stride, left, tiles);
  (void)above;
}
1314
1315 #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
// H_PRED 16x64: four stacked 16x16 tiles of horizontal prediction.
// `above` is ignored.
void aom_h_predictor_16x64_sse2(uint8_t *dst, ptrdiff_t stride,
                                const uint8_t *above, const uint8_t *left) {
  const int tiles = 4;
  h_predictor_16xh(dst, stride, left, tiles);
  (void)above;
}
1321 #endif // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
1322
// Writes row[r] twice, side by side, into destination row r (32 bytes) for
// r = 0..h-1, with rows `stride` bytes apart. Stores are aligned, so each
// destination row must be 16-byte aligned.
static inline void h_pred_store_32xh(const __m128i *row, int h, uint8_t *dst,
                                     ptrdiff_t stride) {
  int r = 0;
  while (r < h) {
    __m128i *const out = (__m128i *)dst;
    _mm_store_si128(out, row[r]);
    _mm_store_si128(out + 1, row[r]);
    dst += stride;
    ++r;
  }
}
1332
// Rows 0-3 of a 32x8 tile: 16-bit lane k (k = 0..3) of *left is broadcast over
// output row k. The upper four lanes of *left are not used here.
static inline void h_prediction_32x8_1(const __m128i *left, uint8_t *dst,
                                       ptrdiff_t stride) {
  __m128i rows[4];
  repeat_low_4pixels(left, rows);
  h_pred_store_32xh(rows, (int)(sizeof(rows) / sizeof(rows[0])), dst, stride);
}
1341
// Rows 4-7 of a 32x8 tile: 16-bit lane 4 + k (k = 0..3) of *left is broadcast
// over the k-th row written at `dst`. The lower four lanes of *left are not
// used here.
static inline void h_prediction_32x8_2(const __m128i *left, uint8_t *dst,
                                       ptrdiff_t stride) {
  __m128i rows[4];
  repeat_high_4pixels(left, rows);
  h_pred_store_32xh(rows, (int)(sizeof(rows) / sizeof(rows[0])), dst, stride);
}
1350
1351 #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
// H_PRED 32x8: row r of the 32-wide block is filled with left[r], r = 0..7.
// Only the 8 left pixels that belong to the block are loaded (a 64-bit load,
// as in aom_h_predictor_16x8_sse2), so `left` needs neither 16 readable bytes
// nor 16-byte alignment. `above` is ignored.
void aom_h_predictor_32x8_sse2(uint8_t *dst, ptrdiff_t stride,
                               const uint8_t *above, const uint8_t *left) {
  (void)above;
  const __m128i left_col = _mm_loadl_epi64((const __m128i *)left);
  // Each left pixel duplicated into one 16-bit lane: l0l0 l1l1 ... l7l7.
  const __m128i left_col_8p = _mm_unpacklo_epi8(left_col, left_col);
  h_prediction_32x8_1(&left_col_8p, dst, stride);  // rows 0-3
  dst += stride << 2;
  h_prediction_32x8_2(&left_col_8p, dst, stride);  // rows 4-7
}
1364 #endif // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
1365
// H_PRED 32x16: row r of the 32-wide block is filled with left[r], r = 0..15.
// All 16 left pixels are fetched with an aligned 128-bit load, so `left` must
// be 16-byte aligned. `above` is ignored.
void aom_h_predictor_32x16_sse2(uint8_t *dst, ptrdiff_t stride,
                                const uint8_t *above, const uint8_t *left) {
  (void)above;
  const __m128i left16 = _mm_load_si128((const __m128i *)left);
  // Pixels 0-7 and 8-15, each duplicated into a 16-bit lane.
  const __m128i pairs_lo = _mm_unpacklo_epi8(left16, left16);
  const __m128i pairs_hi = _mm_unpackhi_epi8(left16, left16);

  h_prediction_32x8_1(&pairs_lo, dst, stride);                // rows 0-3
  h_prediction_32x8_2(&pairs_lo, dst + 4 * stride, stride);   // rows 4-7
  h_prediction_32x8_1(&pairs_hi, dst + 8 * stride, stride);   // rows 8-11
  h_prediction_32x8_2(&pairs_hi, dst + 12 * stride, stride);  // rows 12-15
}
1384
// Horizontal prediction for a 32-wide block, four rows per iteration: row r
// is filled with 32 copies of left[r]. Four left pixels are read at a time as
// one 32-bit word through an int pointer. The loop is bottom-tested and counts
// height / 4 down to zero, so `height` must be a positive multiple of 4.
// Destination rows are written with aligned 128-bit stores.
static inline void h_predictor_32xh(uint8_t *dst, ptrdiff_t stride,
                                    const uint8_t *left, int height) {
  int groups = height >> 2;
  do {
    // Two byte-doubling steps turn l0 l1 l2 l3 into four dwords, each holding
    // four copies of one pixel.
    __m128i quad = _mm_cvtsi32_si128(*(const int *)left);
    quad = _mm_unpacklo_epi8(quad, quad);
    quad = _mm_unpacklo_epi8(quad, quad);

    __m128i splat[4];
    splat[0] = _mm_shuffle_epi32(quad, 0x0);
    splat[1] = _mm_shuffle_epi32(quad, 0x55);
    splat[2] = _mm_shuffle_epi32(quad, 0xaa);
    splat[3] = _mm_shuffle_epi32(quad, 0xff);

    for (int r = 0; r < 4; ++r) {
      _mm_store_si128((__m128i *)dst, splat[r]);
      _mm_store_si128((__m128i *)(dst + 16), splat[r]);
      dst += stride;
    }
    left += 4;
  } while (--groups);
}
1408
// H_PRED 32x64: row r of the 32-wide block is filled with left[r], r = 0..63.
// `above` is ignored.
void aom_h_predictor_32x64_sse2(uint8_t *dst, ptrdiff_t stride,
                                const uint8_t *above, const uint8_t *left) {
  const int rows = 64;
  h_predictor_32xh(dst, stride, left, rows);
  (void)above;
}
1414
// Horizontal prediction for a 64-wide block, four rows per iteration: row r
// is filled with 64 copies of left[r]. Four left pixels are read at a time as
// one 32-bit word through an int pointer. The loop is bottom-tested and counts
// height / 4 down to zero, so `height` must be a positive multiple of 4.
// Destination rows are written with aligned 128-bit stores.
static inline void h_predictor_64xh(uint8_t *dst, ptrdiff_t stride,
                                    const uint8_t *left, int height) {
  int groups = height >> 2;
  do {
    // Two byte-doubling steps turn l0 l1 l2 l3 into four dwords, each holding
    // four copies of one pixel.
    __m128i quad = _mm_cvtsi32_si128(*(const int *)left);
    quad = _mm_unpacklo_epi8(quad, quad);
    quad = _mm_unpacklo_epi8(quad, quad);

    __m128i splat[4];
    splat[0] = _mm_shuffle_epi32(quad, 0x0);
    splat[1] = _mm_shuffle_epi32(quad, 0x55);
    splat[2] = _mm_shuffle_epi32(quad, 0xaa);
    splat[3] = _mm_shuffle_epi32(quad, 0xff);

    for (int r = 0; r < 4; ++r) {
      for (int col = 0; col < 64; col += 16) {
        _mm_store_si128((__m128i *)(dst + col), splat[r]);
      }
      dst += stride;
    }
    left += 4;
  } while (--groups);
}
1446
// SSE2 horizontal predictor entry point for the 64x64 size: delegates to
// h_predictor_64xh with a height of 64 rows. The `above` argument is unused.
void aom_h_predictor_64x64_sse2(uint8_t *dst, ptrdiff_t stride,
                                const uint8_t *above, const uint8_t *left) {
  const int rows = 64;
  h_predictor_64xh(dst, stride, left, rows);
  (void)above;  // Not read by the horizontal predictor.
}
1452
// SSE2 horizontal predictor entry point for the 64x32 size: delegates to
// h_predictor_64xh with a height of 32 rows. The `above` argument is unused.
void aom_h_predictor_64x32_sse2(uint8_t *dst, ptrdiff_t stride,
                                const uint8_t *above, const uint8_t *left) {
  const int rows = 32;
  h_predictor_64xh(dst, stride, left, rows);
  (void)above;  // Not read by the horizontal predictor.
}
1458
#if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
// SSE2 horizontal predictor entry point for the 64x16 size: delegates to
// h_predictor_64xh with a height of 16 rows. The `above` argument is unused.
// Compiled only when the build is not realtime-only or includes the decoder.
void aom_h_predictor_64x16_sse2(uint8_t *dst, ptrdiff_t stride,
                                const uint8_t *above, const uint8_t *left) {
  const int rows = 16;
  h_predictor_64xh(dst, stride, left, rows);
  (void)above;  // Not read by the horizontal predictor.
}
#endif  // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
1466