xref: /aosp_15_r20/external/libaom/aom_dsp/x86/intrapred_sse2.c (revision 77c1e3ccc04c968bd2bc212e87364f250e820521)
1 /*
2  * Copyright (c) 2017, Alliance for Open Media. All rights reserved.
3  *
4  * This source code is subject to the terms of the BSD 2 Clause License and
5  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
6  * was not distributed with this source code in the LICENSE file, you can
7  * obtain it at www.aomedia.org/license/software. If the Alliance for Open
8  * Media Patent License 1.0 was not distributed with this source code in the
9  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
10  */
11 
#include <emmintrin.h>
#include <string.h>

#include "aom_dsp/x86/intrapred_x86.h"
#include "config/aom_dsp_rtcd.h"
15 
// Writes the 4-byte pattern |dc| to the start of each row of a 4-pixel-wide
// block.
//
//   dc      four packed pixel bytes, stored exactly as laid out in memory.
//   height  number of rows; rows are written two per iteration, so an odd
//           height would write one extra row. Callers pass even heights.
//   dst     top-left pixel of the block; no alignment is required.
//   stride  distance in bytes between consecutive rows.
//
// memcpy() is used rather than a store through a casted uint32_t pointer: the
// cast violates the effective-type (strict aliasing) rules for a uint8_t
// buffer and is undefined when dst is not 4-byte aligned. Compilers lower the
// fixed-size memcpy to a single 32-bit store.
static inline void dc_store_4xh(uint32_t dc, int height, uint8_t *dst,
                                ptrdiff_t stride) {
  for (int i = 0; i < height; i += 2) {
    memcpy(dst, &dc, sizeof(dc));
    dst += stride;
    memcpy(dst, &dc, sizeof(dc));
    dst += stride;
  }
}
25 
// Fills |height| rows of an 8-pixel-wide block with the low 8 bytes of |*row|.
// |dst| advances by |stride| bytes after every row.
static inline void dc_store_8xh(const __m128i *row, int height, uint8_t *dst,
                                ptrdiff_t stride) {
  const __m128i fill = *row;
  for (int rows_left = height; rows_left > 0; --rows_left) {
    // 64-bit store of the low half of the vector.
    _mm_storel_epi64((__m128i *)dst, fill);
    dst += stride;
  }
}
34 
// Fills |height| rows of a 16-pixel-wide block with the 16 bytes of |*row|.
// Uses the aligned store, so every row address (dst + k * stride) must be
// 16-byte aligned.
static inline void dc_store_16xh(const __m128i *row, int height, uint8_t *dst,
                                 ptrdiff_t stride) {
  const __m128i fill = *row;
  for (int rows_left = height; rows_left > 0; --rows_left) {
    _mm_store_si128((__m128i *)dst, fill);
    dst += stride;
  }
}
43 
// Fills |height| rows of a 32-pixel-wide block, writing |*row| twice per row
// (bytes 0..15 and 16..31). Uses aligned stores, so every row address must be
// 16-byte aligned.
static inline void dc_store_32xh(const __m128i *row, int height, uint8_t *dst,
                                 ptrdiff_t stride) {
  const __m128i fill = *row;
  for (int rows_left = height; rows_left > 0; --rows_left) {
    __m128i *const out = (__m128i *)dst;
    _mm_store_si128(out, fill);
    _mm_store_si128(out + 1, fill);
    dst += stride;
  }
}
53 
// Fills |height| rows of a 64-pixel-wide block, writing |*row| four times per
// row. Uses aligned stores, so every row address must be 16-byte aligned.
static inline void dc_store_64xh(const __m128i *row, int height, uint8_t *dst,
                                 ptrdiff_t stride) {
  const __m128i fill = *row;
  for (int rows_left = height; rows_left > 0; --rows_left) {
    __m128i *const out = (__m128i *)dst;
    _mm_store_si128(out, fill);
    _mm_store_si128(out + 1, fill);
    _mm_store_si128(out + 2, fill);
    _mm_store_si128(out + 3, fill);
    dst += stride;
  }
}
64 
// Returns a vector whose low 64-bit lane holds ref[0] + ref[1] + ref[2] +
// ref[3]; the high lane is zero.
//
// Exactly four bytes are read from |ref|. (A 64-bit load would touch
// ref[4..7], which is outside a 4-pixel edge.) The bytes are fetched with
// memcpy() so that no alignment is required; the copy compiles to a single
// 32-bit move.
static inline __m128i dc_sum_4(const uint8_t *ref) {
  int32_t packed;
  memcpy(&packed, ref, sizeof(packed));
  // Bytes 4..15 of |x| are zero, so the SAD against zero over the low lane is
  // just the sum of the four pixels.
  const __m128i x = _mm_cvtsi32_si128(packed);
  return _mm_sad_epu8(x, _mm_setzero_si128());
}
71 
// Returns a vector whose low 64-bit lane holds the sum of ref[0..7].
// The load reads exactly 8 bytes and has no alignment requirement.
static inline __m128i dc_sum_8(const uint8_t *ref) {
  const __m128i pixels = _mm_loadl_epi64((__m128i const *)ref);
  // SAD against zero is a horizontal byte sum.
  return _mm_sad_epu8(pixels, _mm_setzero_si128());
}
77 
// Returns a vector whose low 16-bit word holds the sum of ref[0..63].
// Uses aligned loads, so |ref| must be 16-byte aligned.
static inline __m128i dc_sum_64(const uint8_t *ref) {
  const __m128i *const src = (__m128i const *)ref;
  const __m128i zero = _mm_setzero_si128();

  // Each SAD leaves two partial sums, one per 64-bit lane.
  const __m128i s0 = _mm_sad_epu8(_mm_load_si128(src), zero);
  const __m128i s1 = _mm_sad_epu8(_mm_load_si128(src + 1), zero);
  const __m128i s2 = _mm_sad_epu8(_mm_load_si128(src + 2), zero);
  const __m128i s3 = _mm_sad_epu8(_mm_load_si128(src + 3), zero);

  const __m128i s01 = _mm_add_epi16(s0, s1);
  const __m128i s23 = _mm_add_epi16(s2, s3);
  const __m128i lanes = _mm_add_epi16(s01, s23);

  // Fold the high lane onto the low lane.
  return _mm_add_epi16(lanes, _mm_unpackhi_epi64(lanes, lanes));
}
94 
// Q16 fixed-point reciprocals: 0x5556 == ceil(2^16 / 3) and
// 0x3334 == ceil(2^16 / 5). They finish the division for rectangular blocks
// whose edge-pixel count is 3 * 2^k or 5 * 2^k. (The "1X2" / "1X4" suffixes
// presumably refer to the 1:2 and 1:4 block aspect ratios.)
#define DC_MULTIPLIER_1X2 0x5556
#define DC_MULTIPLIER_1X4 0x3334

// Number of fractional bits carried by the multipliers above.
#define DC_SHIFT2 16

// Divides |num| by a non-power-of-two count without a division instruction:
// first drops |shift1| bits (the power-of-two factor), then multiplies by a
// Q16 reciprocal and discards the DC_SHIFT2 fractional bits.
static inline int divide_using_multiply_shift(int num, int shift1,
                                              int multiplier) {
  const int prescaled = num >> shift1;
  const int product = prescaled * multiplier;
  return product >> DC_SHIFT2;
}
105 
106 // -----------------------------------------------------------------------------
107 // DC_PRED
108 
aom_dc_predictor_4x8_sse2(uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left)109 void aom_dc_predictor_4x8_sse2(uint8_t *dst, ptrdiff_t stride,
110                                const uint8_t *above, const uint8_t *left) {
111   const __m128i sum_left = dc_sum_8(left);
112   __m128i sum_above = dc_sum_4(above);
113   sum_above = _mm_add_epi16(sum_left, sum_above);
114 
115   uint32_t sum = (uint32_t)_mm_cvtsi128_si32(sum_above);
116   sum += 6;
117   sum = divide_using_multiply_shift(sum, 2, DC_MULTIPLIER_1X2);
118 
119   const __m128i row = _mm_set1_epi8((int8_t)sum);
120   const uint32_t pred = (uint32_t)_mm_cvtsi128_si32(row);
121   dc_store_4xh(pred, 8, dst, stride);
122 }
123 
124 #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
aom_dc_predictor_4x16_sse2(uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left)125 void aom_dc_predictor_4x16_sse2(uint8_t *dst, ptrdiff_t stride,
126                                 const uint8_t *above, const uint8_t *left) {
127   const __m128i sum_left = dc_sum_16_sse2(left);
128   __m128i sum_above = dc_sum_4(above);
129   sum_above = _mm_add_epi16(sum_left, sum_above);
130 
131   uint32_t sum = (uint32_t)_mm_cvtsi128_si32(sum_above);
132   sum += 10;
133   sum = divide_using_multiply_shift(sum, 2, DC_MULTIPLIER_1X4);
134 
135   const __m128i row = _mm_set1_epi8((int8_t)sum);
136   const uint32_t pred = (uint32_t)_mm_cvtsi128_si32(row);
137   dc_store_4xh(pred, 16, dst, stride);
138 }
139 #endif  // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
140 
aom_dc_predictor_8x4_sse2(uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left)141 void aom_dc_predictor_8x4_sse2(uint8_t *dst, ptrdiff_t stride,
142                                const uint8_t *above, const uint8_t *left) {
143   const __m128i sum_left = dc_sum_4(left);
144   __m128i sum_above = dc_sum_8(above);
145   sum_above = _mm_add_epi16(sum_above, sum_left);
146 
147   uint32_t sum = (uint32_t)_mm_cvtsi128_si32(sum_above);
148   sum += 6;
149   sum = divide_using_multiply_shift(sum, 2, DC_MULTIPLIER_1X2);
150 
151   const __m128i row = _mm_set1_epi8((int8_t)sum);
152   dc_store_8xh(&row, 4, dst, stride);
153 }
154 
aom_dc_predictor_8x16_sse2(uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left)155 void aom_dc_predictor_8x16_sse2(uint8_t *dst, ptrdiff_t stride,
156                                 const uint8_t *above, const uint8_t *left) {
157   const __m128i sum_left = dc_sum_16_sse2(left);
158   __m128i sum_above = dc_sum_8(above);
159   sum_above = _mm_add_epi16(sum_above, sum_left);
160 
161   uint32_t sum = (uint32_t)_mm_cvtsi128_si32(sum_above);
162   sum += 12;
163   sum = divide_using_multiply_shift(sum, 3, DC_MULTIPLIER_1X2);
164   const __m128i row = _mm_set1_epi8((int8_t)sum);
165   dc_store_8xh(&row, 16, dst, stride);
166 }
167 
168 #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
aom_dc_predictor_8x32_sse2(uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left)169 void aom_dc_predictor_8x32_sse2(uint8_t *dst, ptrdiff_t stride,
170                                 const uint8_t *above, const uint8_t *left) {
171   const __m128i sum_left = dc_sum_32_sse2(left);
172   __m128i sum_above = dc_sum_8(above);
173   sum_above = _mm_add_epi16(sum_above, sum_left);
174 
175   uint32_t sum = (uint32_t)_mm_cvtsi128_si32(sum_above);
176   sum += 20;
177   sum = divide_using_multiply_shift(sum, 3, DC_MULTIPLIER_1X4);
178   const __m128i row = _mm_set1_epi8((int8_t)sum);
179   dc_store_8xh(&row, 32, dst, stride);
180 }
181 
aom_dc_predictor_16x4_sse2(uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left)182 void aom_dc_predictor_16x4_sse2(uint8_t *dst, ptrdiff_t stride,
183                                 const uint8_t *above, const uint8_t *left) {
184   const __m128i sum_left = dc_sum_4(left);
185   __m128i sum_above = dc_sum_16_sse2(above);
186   sum_above = _mm_add_epi16(sum_above, sum_left);
187 
188   uint32_t sum = (uint32_t)_mm_cvtsi128_si32(sum_above);
189   sum += 10;
190   sum = divide_using_multiply_shift(sum, 2, DC_MULTIPLIER_1X4);
191   const __m128i row = _mm_set1_epi8((int8_t)sum);
192   dc_store_16xh(&row, 4, dst, stride);
193 }
194 #endif  // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
195 
aom_dc_predictor_16x8_sse2(uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left)196 void aom_dc_predictor_16x8_sse2(uint8_t *dst, ptrdiff_t stride,
197                                 const uint8_t *above, const uint8_t *left) {
198   const __m128i sum_left = dc_sum_8(left);
199   __m128i sum_above = dc_sum_16_sse2(above);
200   sum_above = _mm_add_epi16(sum_above, sum_left);
201 
202   uint32_t sum = (uint32_t)_mm_cvtsi128_si32(sum_above);
203   sum += 12;
204   sum = divide_using_multiply_shift(sum, 3, DC_MULTIPLIER_1X2);
205   const __m128i row = _mm_set1_epi8((int8_t)sum);
206   dc_store_16xh(&row, 8, dst, stride);
207 }
208 
aom_dc_predictor_16x32_sse2(uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left)209 void aom_dc_predictor_16x32_sse2(uint8_t *dst, ptrdiff_t stride,
210                                  const uint8_t *above, const uint8_t *left) {
211   const __m128i sum_left = dc_sum_32_sse2(left);
212   __m128i sum_above = dc_sum_16_sse2(above);
213   sum_above = _mm_add_epi16(sum_left, sum_above);
214 
215   uint32_t sum = (uint32_t)_mm_cvtsi128_si32(sum_above);
216   sum += 24;
217   sum = divide_using_multiply_shift(sum, 4, DC_MULTIPLIER_1X2);
218   const __m128i row = _mm_set1_epi8((int8_t)sum);
219   dc_store_16xh(&row, 32, dst, stride);
220 }
221 
222 #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
aom_dc_predictor_16x64_sse2(uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left)223 void aom_dc_predictor_16x64_sse2(uint8_t *dst, ptrdiff_t stride,
224                                  const uint8_t *above, const uint8_t *left) {
225   const __m128i sum_left = dc_sum_64(left);
226   __m128i sum_above = dc_sum_16_sse2(above);
227   sum_above = _mm_add_epi16(sum_left, sum_above);
228 
229   uint32_t sum = (uint32_t)_mm_cvtsi128_si32(sum_above);
230   sum += 40;
231   sum = divide_using_multiply_shift(sum, 4, DC_MULTIPLIER_1X4);
232   const __m128i row = _mm_set1_epi8((int8_t)sum);
233   dc_store_16xh(&row, 64, dst, stride);
234 }
235 
aom_dc_predictor_32x8_sse2(uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left)236 void aom_dc_predictor_32x8_sse2(uint8_t *dst, ptrdiff_t stride,
237                                 const uint8_t *above, const uint8_t *left) {
238   __m128i sum_above = dc_sum_32_sse2(above);
239   const __m128i sum_left = dc_sum_8(left);
240   sum_above = _mm_add_epi16(sum_above, sum_left);
241 
242   uint32_t sum = (uint32_t)_mm_cvtsi128_si32(sum_above);
243   sum += 20;
244   sum = divide_using_multiply_shift(sum, 3, DC_MULTIPLIER_1X4);
245   const __m128i row = _mm_set1_epi8((int8_t)sum);
246   dc_store_32xh(&row, 8, dst, stride);
247 }
248 #endif  // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
249 
aom_dc_predictor_32x16_sse2(uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left)250 void aom_dc_predictor_32x16_sse2(uint8_t *dst, ptrdiff_t stride,
251                                  const uint8_t *above, const uint8_t *left) {
252   __m128i sum_above = dc_sum_32_sse2(above);
253   const __m128i sum_left = dc_sum_16_sse2(left);
254   sum_above = _mm_add_epi16(sum_above, sum_left);
255 
256   uint32_t sum = (uint32_t)_mm_cvtsi128_si32(sum_above);
257   sum += 24;
258   sum = divide_using_multiply_shift(sum, 4, DC_MULTIPLIER_1X2);
259   const __m128i row = _mm_set1_epi8((int8_t)sum);
260   dc_store_32xh(&row, 16, dst, stride);
261 }
262 
aom_dc_predictor_32x64_sse2(uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left)263 void aom_dc_predictor_32x64_sse2(uint8_t *dst, ptrdiff_t stride,
264                                  const uint8_t *above, const uint8_t *left) {
265   __m128i sum_above = dc_sum_32_sse2(above);
266   const __m128i sum_left = dc_sum_64(left);
267   sum_above = _mm_add_epi16(sum_above, sum_left);
268 
269   uint32_t sum = (uint32_t)_mm_cvtsi128_si32(sum_above);
270   sum += 48;
271   sum = divide_using_multiply_shift(sum, 5, DC_MULTIPLIER_1X2);
272   const __m128i row = _mm_set1_epi8((int8_t)sum);
273   dc_store_32xh(&row, 64, dst, stride);
274 }
275 
aom_dc_predictor_64x64_sse2(uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left)276 void aom_dc_predictor_64x64_sse2(uint8_t *dst, ptrdiff_t stride,
277                                  const uint8_t *above, const uint8_t *left) {
278   __m128i sum_above = dc_sum_64(above);
279   const __m128i sum_left = dc_sum_64(left);
280   sum_above = _mm_add_epi16(sum_above, sum_left);
281 
282   uint32_t sum = (uint32_t)_mm_cvtsi128_si32(sum_above);
283   sum += 64;
284   sum /= 128;
285   const __m128i row = _mm_set1_epi8((int8_t)sum);
286   dc_store_64xh(&row, 64, dst, stride);
287 }
288 
aom_dc_predictor_64x32_sse2(uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left)289 void aom_dc_predictor_64x32_sse2(uint8_t *dst, ptrdiff_t stride,
290                                  const uint8_t *above, const uint8_t *left) {
291   __m128i sum_above = dc_sum_64(above);
292   const __m128i sum_left = dc_sum_32_sse2(left);
293   sum_above = _mm_add_epi16(sum_above, sum_left);
294 
295   uint32_t sum = (uint32_t)_mm_cvtsi128_si32(sum_above);
296   sum += 48;
297   sum = divide_using_multiply_shift(sum, 5, DC_MULTIPLIER_1X2);
298   const __m128i row = _mm_set1_epi8((int8_t)sum);
299   dc_store_64xh(&row, 32, dst, stride);
300 }
301 
302 #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
aom_dc_predictor_64x16_sse2(uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left)303 void aom_dc_predictor_64x16_sse2(uint8_t *dst, ptrdiff_t stride,
304                                  const uint8_t *above, const uint8_t *left) {
305   __m128i sum_above = dc_sum_64(above);
306   const __m128i sum_left = dc_sum_16_sse2(left);
307   sum_above = _mm_add_epi16(sum_above, sum_left);
308 
309   uint32_t sum = (uint32_t)_mm_cvtsi128_si32(sum_above);
310   sum += 40;
311   sum = divide_using_multiply_shift(sum, 4, DC_MULTIPLIER_1X4);
312   const __m128i row = _mm_set1_epi8((int8_t)sum);
313   dc_store_64xh(&row, 16, dst, stride);
314 }
315 #endif  // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
316 
317 // -----------------------------------------------------------------------------
318 // DC_TOP
319 
aom_dc_top_predictor_4x8_sse2(uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left)320 void aom_dc_top_predictor_4x8_sse2(uint8_t *dst, ptrdiff_t stride,
321                                    const uint8_t *above, const uint8_t *left) {
322   (void)left;
323   __m128i sum_above = dc_sum_4(above);
324   const __m128i two = _mm_set1_epi16(2);
325   sum_above = _mm_add_epi16(sum_above, two);
326   sum_above = _mm_srai_epi16(sum_above, 2);
327   sum_above = _mm_shufflelo_epi16(sum_above, 0);
328   sum_above = _mm_packus_epi16(sum_above, sum_above);
329 
330   const uint32_t pred = (uint32_t)_mm_cvtsi128_si32(sum_above);
331   dc_store_4xh(pred, 8, dst, stride);
332 }
333 
334 #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
aom_dc_top_predictor_4x16_sse2(uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left)335 void aom_dc_top_predictor_4x16_sse2(uint8_t *dst, ptrdiff_t stride,
336                                     const uint8_t *above, const uint8_t *left) {
337   (void)left;
338   __m128i sum_above = dc_sum_4(above);
339   const __m128i two = _mm_set1_epi16(2);
340   sum_above = _mm_add_epi16(sum_above, two);
341   sum_above = _mm_srai_epi16(sum_above, 2);
342   sum_above = _mm_shufflelo_epi16(sum_above, 0);
343   sum_above = _mm_packus_epi16(sum_above, sum_above);
344 
345   const uint32_t pred = (uint32_t)_mm_cvtsi128_si32(sum_above);
346   dc_store_4xh(pred, 16, dst, stride);
347 }
348 #endif  // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
349 
aom_dc_top_predictor_8x4_sse2(uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left)350 void aom_dc_top_predictor_8x4_sse2(uint8_t *dst, ptrdiff_t stride,
351                                    const uint8_t *above, const uint8_t *left) {
352   (void)left;
353   __m128i sum_above = dc_sum_8(above);
354   const __m128i four = _mm_set1_epi16(4);
355   sum_above = _mm_add_epi16(sum_above, four);
356   sum_above = _mm_srai_epi16(sum_above, 3);
357   sum_above = _mm_unpacklo_epi8(sum_above, sum_above);
358   const __m128i row = _mm_shufflelo_epi16(sum_above, 0);
359   dc_store_8xh(&row, 4, dst, stride);
360 }
361 
aom_dc_top_predictor_8x16_sse2(uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left)362 void aom_dc_top_predictor_8x16_sse2(uint8_t *dst, ptrdiff_t stride,
363                                     const uint8_t *above, const uint8_t *left) {
364   (void)left;
365   __m128i sum_above = dc_sum_8(above);
366   const __m128i four = _mm_set1_epi16(4);
367   sum_above = _mm_add_epi16(sum_above, four);
368   sum_above = _mm_srai_epi16(sum_above, 3);
369   sum_above = _mm_unpacklo_epi8(sum_above, sum_above);
370   const __m128i row = _mm_shufflelo_epi16(sum_above, 0);
371   dc_store_8xh(&row, 16, dst, stride);
372 }
373 
374 #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
aom_dc_top_predictor_8x32_sse2(uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left)375 void aom_dc_top_predictor_8x32_sse2(uint8_t *dst, ptrdiff_t stride,
376                                     const uint8_t *above, const uint8_t *left) {
377   (void)left;
378   __m128i sum_above = dc_sum_8(above);
379   const __m128i four = _mm_set1_epi16(4);
380   sum_above = _mm_add_epi16(sum_above, four);
381   sum_above = _mm_srai_epi16(sum_above, 3);
382   sum_above = _mm_unpacklo_epi8(sum_above, sum_above);
383   const __m128i row = _mm_shufflelo_epi16(sum_above, 0);
384   dc_store_8xh(&row, 32, dst, stride);
385 }
386 
aom_dc_top_predictor_16x4_sse2(uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left)387 void aom_dc_top_predictor_16x4_sse2(uint8_t *dst, ptrdiff_t stride,
388                                     const uint8_t *above, const uint8_t *left) {
389   (void)left;
390   __m128i sum_above = dc_sum_16_sse2(above);
391   const __m128i eight = _mm_set1_epi16(8);
392   sum_above = _mm_add_epi16(sum_above, eight);
393   sum_above = _mm_srai_epi16(sum_above, 4);
394   sum_above = _mm_unpacklo_epi8(sum_above, sum_above);
395   sum_above = _mm_shufflelo_epi16(sum_above, 0);
396   const __m128i row = _mm_unpacklo_epi64(sum_above, sum_above);
397   dc_store_16xh(&row, 4, dst, stride);
398 }
399 #endif  // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
400 
aom_dc_top_predictor_16x8_sse2(uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left)401 void aom_dc_top_predictor_16x8_sse2(uint8_t *dst, ptrdiff_t stride,
402                                     const uint8_t *above, const uint8_t *left) {
403   (void)left;
404   __m128i sum_above = dc_sum_16_sse2(above);
405   const __m128i eight = _mm_set1_epi16(8);
406   sum_above = _mm_add_epi16(sum_above, eight);
407   sum_above = _mm_srai_epi16(sum_above, 4);
408   sum_above = _mm_unpacklo_epi8(sum_above, sum_above);
409   sum_above = _mm_shufflelo_epi16(sum_above, 0);
410   const __m128i row = _mm_unpacklo_epi64(sum_above, sum_above);
411   dc_store_16xh(&row, 8, dst, stride);
412 }
413 
aom_dc_top_predictor_16x32_sse2(uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left)414 void aom_dc_top_predictor_16x32_sse2(uint8_t *dst, ptrdiff_t stride,
415                                      const uint8_t *above,
416                                      const uint8_t *left) {
417   (void)left;
418   __m128i sum_above = dc_sum_16_sse2(above);
419   const __m128i eight = _mm_set1_epi16(8);
420   sum_above = _mm_add_epi16(sum_above, eight);
421   sum_above = _mm_srai_epi16(sum_above, 4);
422   sum_above = _mm_unpacklo_epi8(sum_above, sum_above);
423   sum_above = _mm_shufflelo_epi16(sum_above, 0);
424   const __m128i row = _mm_unpacklo_epi64(sum_above, sum_above);
425   dc_store_16xh(&row, 32, dst, stride);
426 }
427 
428 #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
aom_dc_top_predictor_16x64_sse2(uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left)429 void aom_dc_top_predictor_16x64_sse2(uint8_t *dst, ptrdiff_t stride,
430                                      const uint8_t *above,
431                                      const uint8_t *left) {
432   (void)left;
433   __m128i sum_above = dc_sum_16_sse2(above);
434   const __m128i eight = _mm_set1_epi16(8);
435   sum_above = _mm_add_epi16(sum_above, eight);
436   sum_above = _mm_srai_epi16(sum_above, 4);
437   sum_above = _mm_unpacklo_epi8(sum_above, sum_above);
438   sum_above = _mm_shufflelo_epi16(sum_above, 0);
439   const __m128i row = _mm_unpacklo_epi64(sum_above, sum_above);
440   dc_store_16xh(&row, 64, dst, stride);
441 }
442 
aom_dc_top_predictor_32x8_sse2(uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left)443 void aom_dc_top_predictor_32x8_sse2(uint8_t *dst, ptrdiff_t stride,
444                                     const uint8_t *above, const uint8_t *left) {
445   (void)left;
446   __m128i sum_above = dc_sum_32_sse2(above);
447   const __m128i sixteen = _mm_set1_epi16(16);
448   sum_above = _mm_add_epi16(sum_above, sixteen);
449   sum_above = _mm_srai_epi16(sum_above, 5);
450   sum_above = _mm_unpacklo_epi8(sum_above, sum_above);
451   sum_above = _mm_shufflelo_epi16(sum_above, 0);
452   const __m128i row = _mm_unpacklo_epi64(sum_above, sum_above);
453   dc_store_32xh(&row, 8, dst, stride);
454 }
455 #endif  // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
456 
aom_dc_top_predictor_32x16_sse2(uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left)457 void aom_dc_top_predictor_32x16_sse2(uint8_t *dst, ptrdiff_t stride,
458                                      const uint8_t *above,
459                                      const uint8_t *left) {
460   (void)left;
461   __m128i sum_above = dc_sum_32_sse2(above);
462   const __m128i sixteen = _mm_set1_epi16(16);
463   sum_above = _mm_add_epi16(sum_above, sixteen);
464   sum_above = _mm_srai_epi16(sum_above, 5);
465   sum_above = _mm_unpacklo_epi8(sum_above, sum_above);
466   sum_above = _mm_shufflelo_epi16(sum_above, 0);
467   const __m128i row = _mm_unpacklo_epi64(sum_above, sum_above);
468   dc_store_32xh(&row, 16, dst, stride);
469 }
470 
aom_dc_top_predictor_32x64_sse2(uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left)471 void aom_dc_top_predictor_32x64_sse2(uint8_t *dst, ptrdiff_t stride,
472                                      const uint8_t *above,
473                                      const uint8_t *left) {
474   (void)left;
475   __m128i sum_above = dc_sum_32_sse2(above);
476   const __m128i sixteen = _mm_set1_epi16(16);
477   sum_above = _mm_add_epi16(sum_above, sixteen);
478   sum_above = _mm_srai_epi16(sum_above, 5);
479   sum_above = _mm_unpacklo_epi8(sum_above, sum_above);
480   sum_above = _mm_shufflelo_epi16(sum_above, 0);
481   const __m128i row = _mm_unpacklo_epi64(sum_above, sum_above);
482   dc_store_32xh(&row, 64, dst, stride);
483 }
484 
aom_dc_top_predictor_64x64_sse2(uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left)485 void aom_dc_top_predictor_64x64_sse2(uint8_t *dst, ptrdiff_t stride,
486                                      const uint8_t *above,
487                                      const uint8_t *left) {
488   (void)left;
489   __m128i sum_above = dc_sum_64(above);
490   const __m128i thirtytwo = _mm_set1_epi16(32);
491   sum_above = _mm_add_epi16(sum_above, thirtytwo);
492   sum_above = _mm_srai_epi16(sum_above, 6);
493   sum_above = _mm_unpacklo_epi8(sum_above, sum_above);
494   sum_above = _mm_shufflelo_epi16(sum_above, 0);
495   const __m128i row = _mm_unpacklo_epi64(sum_above, sum_above);
496   dc_store_64xh(&row, 64, dst, stride);
497 }
498 
aom_dc_top_predictor_64x32_sse2(uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left)499 void aom_dc_top_predictor_64x32_sse2(uint8_t *dst, ptrdiff_t stride,
500                                      const uint8_t *above,
501                                      const uint8_t *left) {
502   (void)left;
503   __m128i sum_above = dc_sum_64(above);
504   const __m128i thirtytwo = _mm_set1_epi16(32);
505   sum_above = _mm_add_epi16(sum_above, thirtytwo);
506   sum_above = _mm_srai_epi16(sum_above, 6);
507   sum_above = _mm_unpacklo_epi8(sum_above, sum_above);
508   sum_above = _mm_shufflelo_epi16(sum_above, 0);
509   const __m128i row = _mm_unpacklo_epi64(sum_above, sum_above);
510   dc_store_64xh(&row, 32, dst, stride);
511 }
512 
513 #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
aom_dc_top_predictor_64x16_sse2(uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left)514 void aom_dc_top_predictor_64x16_sse2(uint8_t *dst, ptrdiff_t stride,
515                                      const uint8_t *above,
516                                      const uint8_t *left) {
517   (void)left;
518   __m128i sum_above = dc_sum_64(above);
519   const __m128i thirtytwo = _mm_set1_epi16(32);
520   sum_above = _mm_add_epi16(sum_above, thirtytwo);
521   sum_above = _mm_srai_epi16(sum_above, 6);
522   sum_above = _mm_unpacklo_epi8(sum_above, sum_above);
523   sum_above = _mm_shufflelo_epi16(sum_above, 0);
524   const __m128i row = _mm_unpacklo_epi64(sum_above, sum_above);
525   dc_store_64xh(&row, 16, dst, stride);
526 }
527 #endif  // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
528 
529 // -----------------------------------------------------------------------------
530 // DC_LEFT
531 
aom_dc_left_predictor_4x8_sse2(uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left)532 void aom_dc_left_predictor_4x8_sse2(uint8_t *dst, ptrdiff_t stride,
533                                     const uint8_t *above, const uint8_t *left) {
534   (void)above;
535   __m128i sum_left = dc_sum_8(left);
536   const __m128i four = _mm_set1_epi16(4);
537   sum_left = _mm_add_epi16(sum_left, four);
538   sum_left = _mm_srai_epi16(sum_left, 3);
539   sum_left = _mm_shufflelo_epi16(sum_left, 0);
540   sum_left = _mm_packus_epi16(sum_left, sum_left);
541 
542   const uint32_t pred = (uint32_t)_mm_cvtsi128_si32(sum_left);
543   dc_store_4xh(pred, 8, dst, stride);
544 }
545 
546 #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
aom_dc_left_predictor_4x16_sse2(uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left)547 void aom_dc_left_predictor_4x16_sse2(uint8_t *dst, ptrdiff_t stride,
548                                      const uint8_t *above,
549                                      const uint8_t *left) {
550   (void)above;
551   __m128i sum_left = dc_sum_16_sse2(left);
552   const __m128i eight = _mm_set1_epi16(8);
553   sum_left = _mm_add_epi16(sum_left, eight);
554   sum_left = _mm_srai_epi16(sum_left, 4);
555   sum_left = _mm_shufflelo_epi16(sum_left, 0);
556   sum_left = _mm_packus_epi16(sum_left, sum_left);
557 
558   const uint32_t pred = (uint32_t)_mm_cvtsi128_si32(sum_left);
559   dc_store_4xh(pred, 16, dst, stride);
560 }
561 #endif  // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
562 
aom_dc_left_predictor_8x4_sse2(uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left)563 void aom_dc_left_predictor_8x4_sse2(uint8_t *dst, ptrdiff_t stride,
564                                     const uint8_t *above, const uint8_t *left) {
565   (void)above;
566   __m128i sum_left = dc_sum_4(left);
567   const __m128i two = _mm_set1_epi16(2);
568   sum_left = _mm_add_epi16(sum_left, two);
569   sum_left = _mm_srai_epi16(sum_left, 2);
570   sum_left = _mm_unpacklo_epi8(sum_left, sum_left);
571   const __m128i row = _mm_shufflelo_epi16(sum_left, 0);
572   dc_store_8xh(&row, 4, dst, stride);
573 }
574 
aom_dc_left_predictor_8x16_sse2(uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left)575 void aom_dc_left_predictor_8x16_sse2(uint8_t *dst, ptrdiff_t stride,
576                                      const uint8_t *above,
577                                      const uint8_t *left) {
578   (void)above;
579   __m128i sum_left = dc_sum_16_sse2(left);
580   const __m128i eight = _mm_set1_epi16(8);
581   sum_left = _mm_add_epi16(sum_left, eight);
582   sum_left = _mm_srai_epi16(sum_left, 4);
583   sum_left = _mm_unpacklo_epi8(sum_left, sum_left);
584   const __m128i row = _mm_shufflelo_epi16(sum_left, 0);
585   dc_store_8xh(&row, 16, dst, stride);
586 }
587 
588 #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
aom_dc_left_predictor_8x32_sse2(uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left)589 void aom_dc_left_predictor_8x32_sse2(uint8_t *dst, ptrdiff_t stride,
590                                      const uint8_t *above,
591                                      const uint8_t *left) {
592   (void)above;
593   __m128i sum_left = dc_sum_32_sse2(left);
594   const __m128i sixteen = _mm_set1_epi16(16);
595   sum_left = _mm_add_epi16(sum_left, sixteen);
596   sum_left = _mm_srai_epi16(sum_left, 5);
597   sum_left = _mm_unpacklo_epi8(sum_left, sum_left);
598   const __m128i row = _mm_shufflelo_epi16(sum_left, 0);
599   dc_store_8xh(&row, 32, dst, stride);
600 }
601 
aom_dc_left_predictor_16x4_sse2(uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left)602 void aom_dc_left_predictor_16x4_sse2(uint8_t *dst, ptrdiff_t stride,
603                                      const uint8_t *above,
604                                      const uint8_t *left) {
605   (void)above;
606   __m128i sum_left = dc_sum_4(left);
607   const __m128i two = _mm_set1_epi16(2);
608   sum_left = _mm_add_epi16(sum_left, two);
609   sum_left = _mm_srai_epi16(sum_left, 2);
610   sum_left = _mm_unpacklo_epi8(sum_left, sum_left);
611   sum_left = _mm_shufflelo_epi16(sum_left, 0);
612   const __m128i row = _mm_unpacklo_epi64(sum_left, sum_left);
613   dc_store_16xh(&row, 4, dst, stride);
614 }
615 #endif  // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
616 
aom_dc_left_predictor_16x8_sse2(uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left)617 void aom_dc_left_predictor_16x8_sse2(uint8_t *dst, ptrdiff_t stride,
618                                      const uint8_t *above,
619                                      const uint8_t *left) {
620   (void)above;
621   __m128i sum_left = dc_sum_8(left);
622   const __m128i four = _mm_set1_epi16(4);
623   sum_left = _mm_add_epi16(sum_left, four);
624   sum_left = _mm_srai_epi16(sum_left, 3);
625   sum_left = _mm_unpacklo_epi8(sum_left, sum_left);
626   sum_left = _mm_shufflelo_epi16(sum_left, 0);
627   const __m128i row = _mm_unpacklo_epi64(sum_left, sum_left);
628   dc_store_16xh(&row, 8, dst, stride);
629 }
630 
aom_dc_left_predictor_16x32_sse2(uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left)631 void aom_dc_left_predictor_16x32_sse2(uint8_t *dst, ptrdiff_t stride,
632                                       const uint8_t *above,
633                                       const uint8_t *left) {
634   (void)above;
635   __m128i sum_left = dc_sum_32_sse2(left);
636   const __m128i sixteen = _mm_set1_epi16(16);
637   sum_left = _mm_add_epi16(sum_left, sixteen);
638   sum_left = _mm_srai_epi16(sum_left, 5);
639   sum_left = _mm_unpacklo_epi8(sum_left, sum_left);
640   sum_left = _mm_shufflelo_epi16(sum_left, 0);
641   const __m128i row = _mm_unpacklo_epi64(sum_left, sum_left);
642   dc_store_16xh(&row, 32, dst, stride);
643 }
644 
645 #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
aom_dc_left_predictor_16x64_sse2(uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left)646 void aom_dc_left_predictor_16x64_sse2(uint8_t *dst, ptrdiff_t stride,
647                                       const uint8_t *above,
648                                       const uint8_t *left) {
649   (void)above;
650   __m128i sum_left = dc_sum_64(left);
651   const __m128i thirtytwo = _mm_set1_epi16(32);
652   sum_left = _mm_add_epi16(sum_left, thirtytwo);
653   sum_left = _mm_srai_epi16(sum_left, 6);
654   sum_left = _mm_unpacklo_epi8(sum_left, sum_left);
655   sum_left = _mm_shufflelo_epi16(sum_left, 0);
656   const __m128i row = _mm_unpacklo_epi64(sum_left, sum_left);
657   dc_store_16xh(&row, 64, dst, stride);
658 }
659 
aom_dc_left_predictor_32x8_sse2(uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left)660 void aom_dc_left_predictor_32x8_sse2(uint8_t *dst, ptrdiff_t stride,
661                                      const uint8_t *above,
662                                      const uint8_t *left) {
663   (void)above;
664   __m128i sum_left = dc_sum_8(left);
665   const __m128i four = _mm_set1_epi16(4);
666   sum_left = _mm_add_epi16(sum_left, four);
667   sum_left = _mm_srai_epi16(sum_left, 3);
668   sum_left = _mm_unpacklo_epi8(sum_left, sum_left);
669   sum_left = _mm_shufflelo_epi16(sum_left, 0);
670   const __m128i row = _mm_unpacklo_epi64(sum_left, sum_left);
671   dc_store_32xh(&row, 8, dst, stride);
672 }
673 #endif  // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
674 
aom_dc_left_predictor_32x16_sse2(uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left)675 void aom_dc_left_predictor_32x16_sse2(uint8_t *dst, ptrdiff_t stride,
676                                       const uint8_t *above,
677                                       const uint8_t *left) {
678   (void)above;
679   __m128i sum_left = dc_sum_16_sse2(left);
680   const __m128i eight = _mm_set1_epi16(8);
681   sum_left = _mm_add_epi16(sum_left, eight);
682   sum_left = _mm_srai_epi16(sum_left, 4);
683   sum_left = _mm_unpacklo_epi8(sum_left, sum_left);
684   sum_left = _mm_shufflelo_epi16(sum_left, 0);
685   const __m128i row = _mm_unpacklo_epi64(sum_left, sum_left);
686   dc_store_32xh(&row, 16, dst, stride);
687 }
688 
aom_dc_left_predictor_32x64_sse2(uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left)689 void aom_dc_left_predictor_32x64_sse2(uint8_t *dst, ptrdiff_t stride,
690                                       const uint8_t *above,
691                                       const uint8_t *left) {
692   (void)above;
693   __m128i sum_left = dc_sum_64(left);
694   const __m128i thirtytwo = _mm_set1_epi16(32);
695   sum_left = _mm_add_epi16(sum_left, thirtytwo);
696   sum_left = _mm_srai_epi16(sum_left, 6);
697   sum_left = _mm_unpacklo_epi8(sum_left, sum_left);
698   sum_left = _mm_shufflelo_epi16(sum_left, 0);
699   const __m128i row = _mm_unpacklo_epi64(sum_left, sum_left);
700   dc_store_32xh(&row, 64, dst, stride);
701 }
702 
aom_dc_left_predictor_64x64_sse2(uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left)703 void aom_dc_left_predictor_64x64_sse2(uint8_t *dst, ptrdiff_t stride,
704                                       const uint8_t *above,
705                                       const uint8_t *left) {
706   (void)above;
707   __m128i sum_left = dc_sum_64(left);
708   const __m128i thirtytwo = _mm_set1_epi16(32);
709   sum_left = _mm_add_epi16(sum_left, thirtytwo);
710   sum_left = _mm_srai_epi16(sum_left, 6);
711   sum_left = _mm_unpacklo_epi8(sum_left, sum_left);
712   sum_left = _mm_shufflelo_epi16(sum_left, 0);
713   const __m128i row = _mm_unpacklo_epi64(sum_left, sum_left);
714   dc_store_64xh(&row, 64, dst, stride);
715 }
716 
aom_dc_left_predictor_64x32_sse2(uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left)717 void aom_dc_left_predictor_64x32_sse2(uint8_t *dst, ptrdiff_t stride,
718                                       const uint8_t *above,
719                                       const uint8_t *left) {
720   (void)above;
721   __m128i sum_left = dc_sum_32_sse2(left);
722   const __m128i sixteen = _mm_set1_epi16(16);
723   sum_left = _mm_add_epi16(sum_left, sixteen);
724   sum_left = _mm_srai_epi16(sum_left, 5);
725   sum_left = _mm_unpacklo_epi8(sum_left, sum_left);
726   sum_left = _mm_shufflelo_epi16(sum_left, 0);
727   const __m128i row = _mm_unpacklo_epi64(sum_left, sum_left);
728   dc_store_64xh(&row, 32, dst, stride);
729 }
730 
731 #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
aom_dc_left_predictor_64x16_sse2(uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left)732 void aom_dc_left_predictor_64x16_sse2(uint8_t *dst, ptrdiff_t stride,
733                                       const uint8_t *above,
734                                       const uint8_t *left) {
735   (void)above;
736   __m128i sum_left = dc_sum_16_sse2(left);
737   const __m128i eight = _mm_set1_epi16(8);
738   sum_left = _mm_add_epi16(sum_left, eight);
739   sum_left = _mm_srai_epi16(sum_left, 4);
740   sum_left = _mm_unpacklo_epi8(sum_left, sum_left);
741   sum_left = _mm_shufflelo_epi16(sum_left, 0);
742   const __m128i row = _mm_unpacklo_epi64(sum_left, sum_left);
743   dc_store_64xh(&row, 16, dst, stride);
744 }
745 #endif  // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
746 
747 // -----------------------------------------------------------------------------
748 // DC_128
749 
// DC_128 predictor, 4x8: neither edge is read; each of the 8 rows receives
// four bytes of value 0x80 (128).
void aom_dc_128_predictor_4x8_sse2(uint8_t *dst, ptrdiff_t stride,
                                   const uint8_t *above, const uint8_t *left) {
  (void)left;
  (void)above;
  // Four packed 0x80 bytes, written as one 32-bit word per row.
  const uint32_t gray4 = 0x80808080;
  dc_store_4xh(gray4, 8, dst, stride);
}
757 
758 #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
// DC_128 predictor, 4x16: neither edge is read; each of the 16 rows receives
// four bytes of value 0x80 (128).
void aom_dc_128_predictor_4x16_sse2(uint8_t *dst, ptrdiff_t stride,
                                    const uint8_t *above, const uint8_t *left) {
  (void)left;
  (void)above;
  // Four packed 0x80 bytes, written as one 32-bit word per row.
  const uint32_t gray4 = 0x80808080;
  dc_store_4xh(gray4, 16, dst, stride);
}
766 #endif  // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
767 
// DC_128 predictor, 8x4: neither edge is read; every pixel is set to 128.
void aom_dc_128_predictor_8x4_sse2(uint8_t *dst, ptrdiff_t stride,
                                   const uint8_t *above, const uint8_t *left) {
  (void)left;
  (void)above;
  // All 16 bytes hold 128; dc_store_8xh writes the low 8 of them per row.
  const __m128i gray = _mm_set1_epi8((int8_t)128);
  dc_store_8xh(&gray, 4, dst, stride);
}
775 
// DC_128 predictor, 8x16: neither edge is read; every pixel is set to 128.
void aom_dc_128_predictor_8x16_sse2(uint8_t *dst, ptrdiff_t stride,
                                    const uint8_t *above, const uint8_t *left) {
  (void)left;
  (void)above;
  // All 16 bytes hold 128; dc_store_8xh writes the low 8 of them per row.
  const __m128i gray = _mm_set1_epi8((int8_t)128);
  dc_store_8xh(&gray, 16, dst, stride);
}
783 
784 #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
// DC_128 predictor, 8x32: neither edge is read; every pixel is set to 128.
void aom_dc_128_predictor_8x32_sse2(uint8_t *dst, ptrdiff_t stride,
                                    const uint8_t *above, const uint8_t *left) {
  (void)left;
  (void)above;
  // All 16 bytes hold 128; dc_store_8xh writes the low 8 of them per row.
  const __m128i gray = _mm_set1_epi8((int8_t)128);
  dc_store_8xh(&gray, 32, dst, stride);
}
792 
// DC_128 predictor, 16x4: neither edge is read; every pixel is set to 128.
// dc_store_16xh uses aligned 16-byte stores on each row of dst.
void aom_dc_128_predictor_16x4_sse2(uint8_t *dst, ptrdiff_t stride,
                                    const uint8_t *above, const uint8_t *left) {
  (void)left;
  (void)above;
  const __m128i gray = _mm_set1_epi8((int8_t)128);
  dc_store_16xh(&gray, 4, dst, stride);
}
800 #endif  // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
801 
// DC_128 predictor, 16x8: neither edge is read; every pixel is set to 128.
// dc_store_16xh uses aligned 16-byte stores on each row of dst.
void aom_dc_128_predictor_16x8_sse2(uint8_t *dst, ptrdiff_t stride,
                                    const uint8_t *above, const uint8_t *left) {
  (void)left;
  (void)above;
  const __m128i gray = _mm_set1_epi8((int8_t)128);
  dc_store_16xh(&gray, 8, dst, stride);
}
809 
// DC_128 predictor, 16x32: neither edge is read; every pixel is set to 128.
// dc_store_16xh uses aligned 16-byte stores on each row of dst.
void aom_dc_128_predictor_16x32_sse2(uint8_t *dst, ptrdiff_t stride,
                                     const uint8_t *above,
                                     const uint8_t *left) {
  (void)left;
  (void)above;
  const __m128i gray = _mm_set1_epi8((int8_t)128);
  dc_store_16xh(&gray, 32, dst, stride);
}
818 
819 #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
// DC_128 predictor, 16x64: neither edge is read; every pixel is set to 128.
// dc_store_16xh uses aligned 16-byte stores on each row of dst.
void aom_dc_128_predictor_16x64_sse2(uint8_t *dst, ptrdiff_t stride,
                                     const uint8_t *above,
                                     const uint8_t *left) {
  (void)left;
  (void)above;
  const __m128i gray = _mm_set1_epi8((int8_t)128);
  dc_store_16xh(&gray, 64, dst, stride);
}
828 
// DC_128 predictor, 32x8: neither edge is read; a 16-byte pattern of 128s is
// handed to dc_store_32xh() with height 8.
void aom_dc_128_predictor_32x8_sse2(uint8_t *dst, ptrdiff_t stride,
                                    const uint8_t *above, const uint8_t *left) {
  (void)left;
  (void)above;
  const __m128i gray = _mm_set1_epi8((int8_t)128);
  dc_store_32xh(&gray, 8, dst, stride);
}
836 #endif  // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
837 
// DC_128 predictor, 32x16: neither edge is read; a 16-byte pattern of 128s is
// handed to dc_store_32xh() with height 16.
void aom_dc_128_predictor_32x16_sse2(uint8_t *dst, ptrdiff_t stride,
                                     const uint8_t *above,
                                     const uint8_t *left) {
  (void)left;
  (void)above;
  const __m128i gray = _mm_set1_epi8((int8_t)128);
  dc_store_32xh(&gray, 16, dst, stride);
}
846 
// DC_128 predictor, 32x64: neither edge is read; a 16-byte pattern of 128s is
// handed to dc_store_32xh() with height 64.
void aom_dc_128_predictor_32x64_sse2(uint8_t *dst, ptrdiff_t stride,
                                     const uint8_t *above,
                                     const uint8_t *left) {
  (void)left;
  (void)above;
  const __m128i gray = _mm_set1_epi8((int8_t)128);
  dc_store_32xh(&gray, 64, dst, stride);
}
855 
// DC_128 predictor, 64x64: neither edge is read; a 16-byte pattern of 128s is
// handed to dc_store_64xh() with height 64.
void aom_dc_128_predictor_64x64_sse2(uint8_t *dst, ptrdiff_t stride,
                                     const uint8_t *above,
                                     const uint8_t *left) {
  (void)left;
  (void)above;
  const __m128i gray = _mm_set1_epi8((int8_t)128);
  dc_store_64xh(&gray, 64, dst, stride);
}
864 
// DC_128 predictor, 64x32: neither edge is read; a 16-byte pattern of 128s is
// handed to dc_store_64xh() with height 32.
void aom_dc_128_predictor_64x32_sse2(uint8_t *dst, ptrdiff_t stride,
                                     const uint8_t *above,
                                     const uint8_t *left) {
  (void)left;
  (void)above;
  const __m128i gray = _mm_set1_epi8((int8_t)128);
  dc_store_64xh(&gray, 32, dst, stride);
}
873 
874 #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
// DC_128 predictor, 64x16: neither edge is read; a 16-byte pattern of 128s is
// handed to dc_store_64xh() with height 16.
void aom_dc_128_predictor_64x16_sse2(uint8_t *dst, ptrdiff_t stride,
                                     const uint8_t *above,
                                     const uint8_t *left) {
  (void)left;
  (void)above;
  const __m128i gray = _mm_set1_epi8((int8_t)128);
  dc_store_64xh(&gray, 16, dst, stride);
}
883 #endif  // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
884 
885 // -----------------------------------------------------------------------------
886 // V_PRED
887 
// Vertical predictor, 4x8: the 4 pixels above the block are copied into each
// of the 8 rows; `left` is ignored.
void aom_v_predictor_4x8_sse2(uint8_t *dst, ptrdiff_t stride,
                              const uint8_t *above, const uint8_t *left) {
  (void)left;
  // The four above pixels are read as a single 32-bit word.
  const uint32_t top4 = *(const uint32_t *)above;
  dc_store_4xh(top4, 8, dst, stride);
}
894 
895 #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
// Vertical predictor, 4x16: the 4 pixels above the block are copied into each
// of the 16 rows; `left` is ignored.
void aom_v_predictor_4x16_sse2(uint8_t *dst, ptrdiff_t stride,
                               const uint8_t *above, const uint8_t *left) {
  (void)left;
  // The four above pixels are read as a single 32-bit word.
  const uint32_t top4 = *(const uint32_t *)above;
  dc_store_4xh(top4, 16, dst, stride);
}
902 #endif  // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
903 
// Vertical predictor, 8x4: the 8 pixels above the block are copied into each
// of the 4 rows; `left` is ignored.
void aom_v_predictor_8x4_sse2(uint8_t *dst, ptrdiff_t stride,
                              const uint8_t *above, const uint8_t *left) {
  (void)left;
  // 64-bit load; the upper half of the register is zeroed and never stored.
  const __m128i top = _mm_loadl_epi64((const __m128i *)above);
  dc_store_8xh(&top, 4, dst, stride);
}
910 
// Vertical predictor, 8x16: the 8 pixels above the block are copied into each
// of the 16 rows; `left` is ignored.
void aom_v_predictor_8x16_sse2(uint8_t *dst, ptrdiff_t stride,
                               const uint8_t *above, const uint8_t *left) {
  (void)left;
  // 64-bit load; the upper half of the register is zeroed and never stored.
  const __m128i top = _mm_loadl_epi64((const __m128i *)above);
  dc_store_8xh(&top, 16, dst, stride);
}
917 
918 #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
// Vertical predictor, 8x32: the 8 pixels above the block are copied into each
// of the 32 rows; `left` is ignored.
void aom_v_predictor_8x32_sse2(uint8_t *dst, ptrdiff_t stride,
                               const uint8_t *above, const uint8_t *left) {
  (void)left;
  // 64-bit load; the upper half of the register is zeroed and never stored.
  const __m128i top = _mm_loadl_epi64((const __m128i *)above);
  dc_store_8xh(&top, 32, dst, stride);
}
925 
// Vertical predictor, 16x4: the 16 pixels above the block are copied into
// each of the 4 rows; `left` is ignored.
// Uses an aligned load, so `above` must be 16-byte aligned.
void aom_v_predictor_16x4_sse2(uint8_t *dst, ptrdiff_t stride,
                               const uint8_t *above, const uint8_t *left) {
  (void)left;
  const __m128i top = _mm_load_si128((const __m128i *)above);
  dc_store_16xh(&top, 4, dst, stride);
}
932 #endif  // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
933 
// Vertical predictor, 16x8: the 16 pixels above the block are copied into
// each of the 8 rows; `left` is ignored.
// Uses an aligned load, so `above` must be 16-byte aligned.
void aom_v_predictor_16x8_sse2(uint8_t *dst, ptrdiff_t stride,
                               const uint8_t *above, const uint8_t *left) {
  (void)left;
  const __m128i top = _mm_load_si128((const __m128i *)above);
  dc_store_16xh(&top, 8, dst, stride);
}
940 
// Vertical predictor, 16x32: the 16 pixels above the block are copied into
// each of the 32 rows; `left` is ignored.
// Uses an aligned load, so `above` must be 16-byte aligned.
void aom_v_predictor_16x32_sse2(uint8_t *dst, ptrdiff_t stride,
                                const uint8_t *above, const uint8_t *left) {
  (void)left;
  const __m128i top = _mm_load_si128((const __m128i *)above);
  dc_store_16xh(&top, 32, dst, stride);
}
947 
948 #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
// Vertical predictor, 16x64: the 16 pixels above the block are copied into
// each of the 64 rows; `left` is ignored.
// Uses an aligned load, so `above` must be 16-byte aligned.
void aom_v_predictor_16x64_sse2(uint8_t *dst, ptrdiff_t stride,
                                const uint8_t *above, const uint8_t *left) {
  (void)left;
  const __m128i top = _mm_load_si128((const __m128i *)above);
  dc_store_16xh(&top, 64, dst, stride);
}
955 #endif  // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
956 
// Copies the 32 bytes at `above` into each of `height` rows of dst.
// Aligned loads/stores are used, so `above`, `dst` and every row start
// (dst + k * stride) must be 16-byte aligned. Nothing is written when
// height <= 0.
static inline void v_predictor_32xh(uint8_t *dst, ptrdiff_t stride,
                                    const uint8_t *above, int height) {
  const __m128i *const src = (const __m128i *)above;
  const __m128i lo = _mm_load_si128(src);
  const __m128i hi = _mm_load_si128(src + 1);
  int rows_done = 0;
  while (rows_done < height) {
    __m128i *const out = (__m128i *)dst;
    _mm_store_si128(out, lo);
    _mm_store_si128(out + 1, hi);
    dst += stride;
    ++rows_done;
  }
}
967 
968 #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
// Vertical predictor, 32x8: delegates to v_predictor_32xh with height 8;
// `left` is ignored.
void aom_v_predictor_32x8_sse2(uint8_t *dst, ptrdiff_t stride,
                               const uint8_t *above, const uint8_t *left) {
  enum { kRows = 8 };
  (void)left;
  v_predictor_32xh(dst, stride, above, kRows);
}
974 #endif  // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
975 
// Vertical predictor, 32x16: delegates to v_predictor_32xh with height 16;
// `left` is ignored.
void aom_v_predictor_32x16_sse2(uint8_t *dst, ptrdiff_t stride,
                                const uint8_t *above, const uint8_t *left) {
  enum { kRows = 16 };
  (void)left;
  v_predictor_32xh(dst, stride, above, kRows);
}
981 
// Vertical predictor, 32x64: delegates to v_predictor_32xh with height 64;
// `left` is ignored.
void aom_v_predictor_32x64_sse2(uint8_t *dst, ptrdiff_t stride,
                                const uint8_t *above, const uint8_t *left) {
  enum { kRows = 64 };
  (void)left;
  v_predictor_32xh(dst, stride, above, kRows);
}
987 
// Copies the 64 bytes at `above` into each of `height` rows of dst.
// Aligned loads/stores are used, so `above`, `dst` and every row start
// (dst + k * stride) must be 16-byte aligned. Nothing is written when
// height <= 0.
static inline void v_predictor_64xh(uint8_t *dst, ptrdiff_t stride,
                                    const uint8_t *above, int height) {
  const __m128i *const src = (const __m128i *)above;
  const __m128i q0 = _mm_load_si128(src);
  const __m128i q1 = _mm_load_si128(src + 1);
  const __m128i q2 = _mm_load_si128(src + 2);
  const __m128i q3 = _mm_load_si128(src + 3);
  int rows_done = 0;
  while (rows_done < height) {
    __m128i *const out = (__m128i *)dst;
    _mm_store_si128(out, q0);
    _mm_store_si128(out + 1, q1);
    _mm_store_si128(out + 2, q2);
    _mm_store_si128(out + 3, q3);
    dst += stride;
    ++rows_done;
  }
}
1002 
// Vertical predictor, 64x64: delegates to v_predictor_64xh with height 64;
// `left` is ignored.
void aom_v_predictor_64x64_sse2(uint8_t *dst, ptrdiff_t stride,
                                const uint8_t *above, const uint8_t *left) {
  enum { kRows = 64 };
  (void)left;
  v_predictor_64xh(dst, stride, above, kRows);
}
1008 
// Vertical predictor, 64x32: delegates to v_predictor_64xh with height 32;
// `left` is ignored.
void aom_v_predictor_64x32_sse2(uint8_t *dst, ptrdiff_t stride,
                                const uint8_t *above, const uint8_t *left) {
  enum { kRows = 32 };
  (void)left;
  v_predictor_64xh(dst, stride, above, kRows);
}
1014 
1015 #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
// Vertical predictor, 64x16: delegates to v_predictor_64xh with height 16;
// `left` is ignored.
void aom_v_predictor_64x16_sse2(uint8_t *dst, ptrdiff_t stride,
                                const uint8_t *above, const uint8_t *left) {
  enum { kRows = 16 };
  (void)left;
  v_predictor_64xh(dst, stride, above, kRows);
}
1021 #endif  // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
1022 
1023 // -----------------------------------------------------------------------------
1024 // H_PRED
1025 
// Horizontal predictor, 4x8: row r of the block is filled with left[r]
// (4 bytes per row); `above` is ignored. Each row is written through an
// `int` store.
void aom_h_predictor_4x8_sse2(uint8_t *dst, ptrdiff_t stride,
                              const uint8_t *above, const uint8_t *left) {
  (void)above;
  const __m128i left8 = _mm_loadl_epi64((const __m128i *)left);
  // 16-bit word k now holds the byte pair (left[k], left[k]).
  __m128i pairs = _mm_unpacklo_epi8(left8, left8);
  for (int half = 0; half < 2; ++half) {
    // Broadcasting word k over the low 4 words puts left[k] in the low 4 bytes.
    const int px[4] = {
      _mm_cvtsi128_si32(_mm_shufflelo_epi16(pairs, 0x00)),
      _mm_cvtsi128_si32(_mm_shufflelo_epi16(pairs, 0x55)),
      _mm_cvtsi128_si32(_mm_shufflelo_epi16(pairs, 0xaa)),
      _mm_cvtsi128_si32(_mm_shufflelo_epi16(pairs, 0xff)),
    };
    for (int r = 0; r < 4; ++r) {
      *(int *)dst = px[r];
      dst += stride;
    }
    // Bring words 4..7 (left[4..7]) down into the low half for the next pass.
    pairs = _mm_unpackhi_epi64(pairs, pairs);
  }
}
1056 
1057 #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
// Horizontal predictor, 4x16: row r of the block is filled with left[r]
// (4 bytes per row); `above` is ignored. `left` is read with an aligned
// 16-byte load; each row is written through an `int` store.
void aom_h_predictor_4x16_sse2(uint8_t *dst, ptrdiff_t stride,
                               const uint8_t *above, const uint8_t *left) {
  (void)above;
  const __m128i left16 = _mm_load_si128((const __m128i *)left);
  // halves[0] pairs up left[0..7], halves[1] pairs up left[8..15]:
  // 16-bit word k of each holds (left[base + k], left[base + k]).
  const __m128i halves[2] = { _mm_unpacklo_epi8(left16, left16),
                              _mm_unpackhi_epi8(left16, left16) };
  for (int hv = 0; hv < 2; ++hv) {
    __m128i pairs = halves[hv];
    for (int quad = 0; quad < 2; ++quad) {
      const int px[4] = {
        _mm_cvtsi128_si32(_mm_shufflelo_epi16(pairs, 0x00)),
        _mm_cvtsi128_si32(_mm_shufflelo_epi16(pairs, 0x55)),
        _mm_cvtsi128_si32(_mm_shufflelo_epi16(pairs, 0xaa)),
        _mm_cvtsi128_si32(_mm_shufflelo_epi16(pairs, 0xff)),
      };
      for (int r = 0; r < 4; ++r) {
        *(int *)dst = px[r];
        dst += stride;
      }
      // Expose the upper four words for the second group of four rows.
      pairs = _mm_unpackhi_epi64(pairs, pairs);
    }
  }
}
1118 #endif  // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
1119 
// Horizontal predictor, 8x4: row r of the block is filled with left[r]
// (8 bytes per row); `above` is ignored.
// Note: the load reads 8 bytes from `left` although only left[0..3] are used.
void aom_h_predictor_8x4_sse2(uint8_t *dst, ptrdiff_t stride,
                              const uint8_t *above, const uint8_t *left) {
  (void)above;
  const __m128i left8 = _mm_loadl_epi64((const __m128i *)left);
  // 16-bit word k holds the byte pair (left[k], left[k]).
  const __m128i pairs = _mm_unpacklo_epi8(left8, left8);
  // Broadcasting word k over the low 4 words yields 8 copies of left[k].
  const __m128i rows[4] = {
    _mm_shufflelo_epi16(pairs, 0x00),
    _mm_shufflelo_epi16(pairs, 0x55),
    _mm_shufflelo_epi16(pairs, 0xaa),
    _mm_shufflelo_epi16(pairs, 0xff),
  };
  for (int r = 0; r < 4; ++r) {
    _mm_storel_epi64((__m128i *)dst, rows[r]);
    dst += stride;
  }
}
1137 
// Horizontal prediction for an 8-wide block of 16 * count rows: row r is
// filled with 8 copies of left[r]. `above` is ignored. `left` is consumed
// 16 bytes at a time with aligned loads, so it must be 16-byte aligned.
// Nothing is written when count <= 0.
static inline void h_predictor_8x16xc(uint8_t *dst, ptrdiff_t stride,
                                      const uint8_t *above, const uint8_t *left,
                                      int count) {
  (void)above;
  for (int blk = 0; blk < count; ++blk) {
    const __m128i left16 = _mm_load_si128((const __m128i *)left);
    // halves[0] pairs up left[0..7], halves[1] pairs up left[8..15].
    const __m128i halves[2] = { _mm_unpacklo_epi8(left16, left16),
                                _mm_unpackhi_epi8(left16, left16) };
    for (int hv = 0; hv < 2; ++hv) {
      __m128i pairs = halves[hv];
      for (int quad = 0; quad < 2; ++quad) {
        // Low 8 bytes of rows[k] are 8 copies of the k-th pixel of this quad.
        const __m128i rows[4] = {
          _mm_shufflelo_epi16(pairs, 0x00),
          _mm_shufflelo_epi16(pairs, 0x55),
          _mm_shufflelo_epi16(pairs, 0xaa),
          _mm_shufflelo_epi16(pairs, 0xff),
        };
        for (int r = 0; r < 4; ++r) {
          _mm_storel_epi64((__m128i *)dst, rows[r]);
          dst += stride;
        }
        // Expose the upper four words for the next group of four rows.
        pairs = _mm_unpackhi_epi64(pairs, pairs);
      }
    }
    left += 16;
  }
}
1203 
// Horizontal predictor, 8x16: one 16-row pass of h_predictor_8x16xc.
void aom_h_predictor_8x16_sse2(uint8_t *dst, ptrdiff_t stride,
                               const uint8_t *above, const uint8_t *left) {
  enum { kPasses = 1 };  // 16 rows per pass
  h_predictor_8x16xc(dst, stride, above, left, kPasses);
}
1208 
1209 #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
// Horizontal predictor, 8x32: two 16-row passes of h_predictor_8x16xc.
void aom_h_predictor_8x32_sse2(uint8_t *dst, ptrdiff_t stride,
                               const uint8_t *above, const uint8_t *left) {
  enum { kPasses = 2 };  // 16 rows per pass
  h_predictor_8x16xc(dst, stride, above, left, kPasses);
}
1214 #endif  // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
1215 
// Writes row[0..h-1] to h consecutive rows of dst, 16 bytes each, using
// aligned stores (dst and every row start must be 16-byte aligned).
// Nothing is written when h <= 0.
static inline void h_pred_store_16xh(const __m128i *row, int h, uint8_t *dst,
                                     ptrdiff_t stride) {
  int r = 0;
  while (r < h) {
    _mm_store_si128((__m128i *)dst, row[r]);
    dst += stride;
    ++r;
  }
}
1224 
// For k = 0..3, sets row[k] to eight copies of 16-bit word k of *x
// (i.e. words taken from the low 64 bits of *x).
static inline void repeat_low_4pixels(const __m128i *x, __m128i *row) {
  // Read the source once, before any output is written.
  const __m128i src = *x;
  const __m128i w0 = _mm_shufflelo_epi16(src, 0x00);
  const __m128i w1 = _mm_shufflelo_epi16(src, 0x55);
  const __m128i w2 = _mm_shufflelo_epi16(src, 0xaa);
  const __m128i w3 = _mm_shufflelo_epi16(src, 0xff);

  // The low 64 bits now hold four copies; mirror them into the high half.
  row[0] = _mm_unpacklo_epi64(w0, w0);
  row[1] = _mm_unpacklo_epi64(w1, w1);
  row[2] = _mm_unpacklo_epi64(w2, w2);
  row[3] = _mm_unpacklo_epi64(w3, w3);
}
1236 
// For k = 0..3, sets row[k] to eight copies of 16-bit word (4 + k) of *x
// (i.e. words taken from the high 64 bits of *x).
static inline void repeat_high_4pixels(const __m128i *x, __m128i *row) {
  // Read the source once, before any output is written.
  const __m128i src = *x;
  const __m128i w0 = _mm_shufflehi_epi16(src, 0x00);
  const __m128i w1 = _mm_shufflehi_epi16(src, 0x55);
  const __m128i w2 = _mm_shufflehi_epi16(src, 0xaa);
  const __m128i w3 = _mm_shufflehi_epi16(src, 0xff);

  // The high 64 bits now hold four copies; mirror them into the low half.
  row[0] = _mm_unpackhi_epi64(w0, w0);
  row[1] = _mm_unpackhi_epi64(w1, w1);
  row[2] = _mm_unpackhi_epi64(w2, w2);
  row[3] = _mm_unpackhi_epi64(w3, w3);
}
1248 
// First four rows of a 16-wide horizontal prediction.
// `left` holds duplicated left pixels as 16-bit words; words 0..3 (the low
// 8 bytes) are used here, one word per output row.
static inline void h_prediction_16x8_1(const __m128i *left, uint8_t *dst,
                                       ptrdiff_t stride) {
  __m128i expanded[4];
  repeat_low_4pixels(left, expanded);
  h_pred_store_16xh(expanded, 4, dst, stride);
}
1257 
// Second four rows of a 16-wide horizontal prediction.
// `left` holds duplicated left pixels as 16-bit words; words 4..7 (the high
// 8 bytes) are used here, one word per output row.
static inline void h_prediction_16x8_2(const __m128i *left, uint8_t *dst,
                                       ptrdiff_t stride) {
  __m128i expanded[4];
  repeat_high_4pixels(left, expanded);
  h_pred_store_16xh(expanded, 4, dst, stride);
}
1266 
1267 #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
// Horizontal predictor, 16x4: row r is filled with left[r]; `above` is
// ignored.
// Note: the load reads 8 bytes from `left` although only left[0..3] are used.
void aom_h_predictor_16x4_sse2(uint8_t *dst, ptrdiff_t stride,
                               const uint8_t *above, const uint8_t *left) {
  (void)above;
  const __m128i left8 = _mm_loadl_epi64((const __m128i *)left);
  // 16-bit word k holds the byte pair (left[k], left[k]).
  const __m128i pairs = _mm_unpacklo_epi8(left8, left8);
  h_prediction_16x8_1(&pairs, dst, stride);
}
1275 #endif  // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
1276 
// Horizontal predictor, 16x8: row r is filled with left[r]; `above` is
// ignored. Rows 0..3 come from the low half of the duplicated left column,
// rows 4..7 from the high half.
void aom_h_predictor_16x8_sse2(uint8_t *dst, ptrdiff_t stride,
                               const uint8_t *above, const uint8_t *left) {
  (void)above;
  const __m128i left_col = _mm_loadl_epi64((const __m128i *)left);
  // 16-bit word k holds the byte pair (left[k], left[k]).
  const __m128i left_col_8p = _mm_unpacklo_epi8(left_col, left_col);
  h_prediction_16x8_1(&left_col_8p, dst, stride);
  // Advance four rows. A multiply is used instead of `stride << 2` because
  // left-shifting a negative signed value is undefined behaviour in C.
  dst += 4 * stride;
  h_prediction_16x8_2(&left_col_8p, dst, stride);
}
1286 
// 16-wide horizontal prediction over `count` groups of 16 rows.
//   dst/stride: destination block and its row pitch in bytes.
//   left:       left-neighbour column; read with an aligned 128-bit load, so
//               it must be 16-byte aligned. 16 bytes are consumed per group.
//   count:      number of 16-row groups. The loop is do/while, so one group
//               is always processed even if count <= 0.
static inline void h_predictor_16xh(uint8_t *dst, ptrdiff_t stride,
                                    const uint8_t *left, int count) {
  int group = 0;
  do {
    const __m128i left16 = _mm_load_si128((const __m128i *)left);
    // Double every byte so each left pixel fills a 16-bit lane.
    const __m128i pairs_lo = _mm_unpacklo_epi8(left16, left16);  // rows 0..7
    const __m128i pairs_hi = _mm_unpackhi_epi8(left16, left16);  // rows 8..15
    const ptrdiff_t four_rows = stride << 2;

    h_prediction_16x8_1(&pairs_lo, dst, stride);
    h_prediction_16x8_2(&pairs_lo, dst + four_rows, stride);
    h_prediction_16x8_1(&pairs_hi, dst + 2 * four_rows, stride);
    h_prediction_16x8_2(&pairs_hi, dst + 3 * four_rows, stride);

    dst += 4 * four_rows;
    left += 16;
    ++group;
  } while (group < count);
}
1308 
// H intra predictor for a 16x32 block. `above` is unused.
// Delegates to h_predictor_16xh() with two 16-row groups.
void aom_h_predictor_16x32_sse2(uint8_t *dst, ptrdiff_t stride,
                                const uint8_t *above, const uint8_t *left) {
  const int num_groups = 2;  // 32 rows / 16 rows per group
  (void)above;
  h_predictor_16xh(dst, stride, left, num_groups);
}
1314 
1315 #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
// H intra predictor for a 16x64 block. `above` is unused.
// Delegates to h_predictor_16xh() with four 16-row groups.
void aom_h_predictor_16x64_sse2(uint8_t *dst, ptrdiff_t stride,
                                const uint8_t *above, const uint8_t *left) {
  const int num_groups = 4;  // 64 rows / 16 rows per group
  (void)above;
  h_predictor_16xh(dst, stride, left, num_groups);
}
1321 #endif  // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
1322 
// Writes `h` rows of 32 bytes to dst. Row k is row[k] stored twice, at byte
// offsets 0 and 16 of the row. Aligned stores are used, so dst and stride
// must keep every row 16-byte aligned. Nothing is written when h <= 0.
static inline void h_pred_store_32xh(const __m128i *row, int h, uint8_t *dst,
                                     ptrdiff_t stride) {
  for (int rows_left = h; rows_left > 0; --rows_left) {
    const __m128i pixels = *row++;
    __m128i *const out = (__m128i *)dst;
    _mm_store_si128(out, pixels);      // bytes 0..15
    _mm_store_si128(out + 1, pixels);  // bytes 16..31
    dst += stride;
  }
}
1332 
// 32-wide horizontal prediction, rows 0..3 of an 8-row group.
// Consumes the low 8 bytes of *left, laid out as xxxxxxxx33221100 (each of
// the four left pixels occupies two adjacent bytes). repeat_low_4pixels()
// expands them into four row registers, which are then written to dst.
static inline void h_prediction_32x8_1(const __m128i *left, uint8_t *dst,
                                       ptrdiff_t stride) {
  enum { kNumRows = 4 };
  __m128i expanded[kNumRows];
  repeat_low_4pixels(left, expanded);
  h_pred_store_32xh(expanded, kNumRows, dst, stride);
}
1341 
// 32-wide horizontal prediction, rows 4..7 of an 8-row group.
// Consumes the high 8 bytes of *left, laid out as 77665544xxxxxxxx (each of
// the four left pixels occupies two adjacent bytes). repeat_high_4pixels()
// expands them into four row registers, which are then written to dst.
static inline void h_prediction_32x8_2(const __m128i *left, uint8_t *dst,
                                       ptrdiff_t stride) {
  enum { kNumRows = 4 };
  __m128i expanded[kNumRows];
  repeat_high_4pixels(left, expanded);
  h_pred_store_32xh(expanded, kNumRows, dst, stride);
}
1350 
1351 #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
// H intra predictor for a 32x8 block. `above` is unused.
//
// An 8-row block only owns left[0..7], and only the low 8 bytes of the loaded
// register are consumed (via _mm_unpacklo_epi8). Load exactly those 8 bytes
// with _mm_loadl_epi64, as aom_h_predictor_16x8_sse2 does, instead of an
// aligned 16-byte load: this avoids reading 8 bytes past the left column and
// drops the 16-byte alignment requirement on `left`. The predicted pixels
// are unchanged.
void aom_h_predictor_32x8_sse2(uint8_t *dst, ptrdiff_t stride,
                               const uint8_t *above, const uint8_t *left) {
  (void)above;

  const __m128i left_col = _mm_loadl_epi64((const __m128i *)left);
  // Double every byte: l0 l0 l1 l1 ... l7 l7.
  const __m128i left_col_8p = _mm_unpacklo_epi8(left_col, left_col);

  h_prediction_32x8_1(&left_col_8p, dst, stride);  // rows 0..3
  dst += stride << 2;
  h_prediction_32x8_2(&left_col_8p, dst, stride);  // rows 4..7
}
1364 #endif  // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
1365 
// H intra predictor for a 32x16 block. `above` is unused.
// `left` is read with an aligned 128-bit load (16-byte alignment required).
// Each left byte is doubled; the low-half register drives rows 0..7 and the
// high-half register drives rows 8..15, four rows per helper call.
void aom_h_predictor_32x16_sse2(uint8_t *dst, ptrdiff_t stride,
                                const uint8_t *above, const uint8_t *left) {
  (void)above;

  const __m128i left16 = _mm_load_si128((const __m128i *)left);
  const __m128i pairs_lo = _mm_unpacklo_epi8(left16, left16);
  const __m128i pairs_hi = _mm_unpackhi_epi8(left16, left16);
  const ptrdiff_t four_rows = stride << 2;

  h_prediction_32x8_1(&pairs_lo, dst, stride);
  h_prediction_32x8_2(&pairs_lo, dst + four_rows, stride);
  h_prediction_32x8_1(&pairs_hi, dst + 2 * four_rows, stride);
  h_prediction_32x8_2(&pairs_hi, dst + 3 * four_rows, stride);
}
1384 
// Horizontal prediction for a 32-wide block: row r of dst receives 32 copies
// of left[r].
//   dst/stride: destination and row pitch in bytes. Aligned stores are used,
//               so every row start must be 16-byte aligned.
//   left:       left-neighbour column, `height` bytes, no alignment required.
//   height:     number of rows. Rows are produced four at a time by a
//               do/while loop, so height must be a positive multiple of 4.
static inline void h_predictor_32xh(uint8_t *dst, ptrdiff_t stride,
                                    const uint8_t *left, int height) {
  int i = height >> 2;
  do {
    // Gather four left pixels into one 32-bit value byte by byte. The former
    // ((int *)left)[0] read discarded the const qualifier and accessed
    // uint8_t storage through an int lvalue; this form yields the same
    // little-endian value without the pointer cast.
    const uint32_t packed = (uint32_t)left[0] | ((uint32_t)left[1] << 8) |
                            ((uint32_t)left[2] << 16) |
                            ((uint32_t)left[3] << 24);
    __m128i left4 = _mm_cvtsi32_si128((int)packed);
    // Two byte interleaves turn each pixel into four adjacent bytes, i.e.
    // one 32-bit lane per row.
    left4 = _mm_unpacklo_epi8(left4, left4);
    left4 = _mm_unpacklo_epi8(left4, left4);
    // Broadcast each 32-bit lane across the whole register.
    const __m128i r0 = _mm_shuffle_epi32(left4, 0x0);
    const __m128i r1 = _mm_shuffle_epi32(left4, 0x55);
    _mm_store_si128((__m128i *)dst, r0);
    _mm_store_si128((__m128i *)(dst + 16), r0);
    _mm_store_si128((__m128i *)(dst + stride), r1);
    _mm_store_si128((__m128i *)(dst + stride + 16), r1);
    const __m128i r2 = _mm_shuffle_epi32(left4, 0xaa);
    const __m128i r3 = _mm_shuffle_epi32(left4, 0xff);
    _mm_store_si128((__m128i *)(dst + stride * 2), r2);
    _mm_store_si128((__m128i *)(dst + stride * 2 + 16), r2);
    _mm_store_si128((__m128i *)(dst + stride * 3), r3);
    _mm_store_si128((__m128i *)(dst + stride * 3 + 16), r3);
    left += 4;
    dst += stride * 4;
  } while (--i);
}
1408 
// H intra predictor for a 32x64 block. `above` is unused.
// Delegates to h_predictor_32xh() with a height of 64 rows.
void aom_h_predictor_32x64_sse2(uint8_t *dst, ptrdiff_t stride,
                                const uint8_t *above, const uint8_t *left) {
  const int block_height = 64;
  (void)above;
  h_predictor_32xh(dst, stride, left, block_height);
}
1414 
// Horizontal prediction for a 64-wide block: row r of dst receives 64 copies
// of left[r].
//   dst/stride: destination and row pitch in bytes. Aligned stores are used,
//               so every row start must be 16-byte aligned.
//   left:       left-neighbour column, `height` bytes, no alignment required.
//   height:     number of rows. Rows are produced four at a time by a
//               do/while loop, so height must be a positive multiple of 4.
static inline void h_predictor_64xh(uint8_t *dst, ptrdiff_t stride,
                                    const uint8_t *left, int height) {
  int i = height >> 2;
  do {
    // Gather four left pixels into one 32-bit value byte by byte. The former
    // ((int *)left)[0] read discarded the const qualifier and accessed
    // uint8_t storage through an int lvalue; this form yields the same
    // little-endian value without the pointer cast.
    const uint32_t packed = (uint32_t)left[0] | ((uint32_t)left[1] << 8) |
                            ((uint32_t)left[2] << 16) |
                            ((uint32_t)left[3] << 24);
    __m128i left4 = _mm_cvtsi32_si128((int)packed);
    // Two byte interleaves turn each pixel into four adjacent bytes, i.e.
    // one 32-bit lane per row.
    left4 = _mm_unpacklo_epi8(left4, left4);
    left4 = _mm_unpacklo_epi8(left4, left4);
    // Broadcast each 32-bit lane across the whole register.
    const __m128i r0 = _mm_shuffle_epi32(left4, 0x0);
    const __m128i r1 = _mm_shuffle_epi32(left4, 0x55);
    _mm_store_si128((__m128i *)dst, r0);
    _mm_store_si128((__m128i *)(dst + 16), r0);
    _mm_store_si128((__m128i *)(dst + 32), r0);
    _mm_store_si128((__m128i *)(dst + 48), r0);
    _mm_store_si128((__m128i *)(dst + stride), r1);
    _mm_store_si128((__m128i *)(dst + stride + 16), r1);
    _mm_store_si128((__m128i *)(dst + stride + 32), r1);
    _mm_store_si128((__m128i *)(dst + stride + 48), r1);
    const __m128i r2 = _mm_shuffle_epi32(left4, 0xaa);
    const __m128i r3 = _mm_shuffle_epi32(left4, 0xff);
    _mm_store_si128((__m128i *)(dst + stride * 2), r2);
    _mm_store_si128((__m128i *)(dst + stride * 2 + 16), r2);
    _mm_store_si128((__m128i *)(dst + stride * 2 + 32), r2);
    _mm_store_si128((__m128i *)(dst + stride * 2 + 48), r2);
    _mm_store_si128((__m128i *)(dst + stride * 3), r3);
    _mm_store_si128((__m128i *)(dst + stride * 3 + 16), r3);
    _mm_store_si128((__m128i *)(dst + stride * 3 + 32), r3);
    _mm_store_si128((__m128i *)(dst + stride * 3 + 48), r3);
    left += 4;
    dst += stride * 4;
  } while (--i);
}
1446 
// H intra predictor for a 64x64 block. `above` is unused.
// Delegates to h_predictor_64xh() with a height of 64 rows.
void aom_h_predictor_64x64_sse2(uint8_t *dst, ptrdiff_t stride,
                                const uint8_t *above, const uint8_t *left) {
  const int block_height = 64;
  (void)above;
  h_predictor_64xh(dst, stride, left, block_height);
}
1452 
// H intra predictor for a 64x32 block. `above` is unused.
// Delegates to h_predictor_64xh() with a height of 32 rows.
void aom_h_predictor_64x32_sse2(uint8_t *dst, ptrdiff_t stride,
                                const uint8_t *above, const uint8_t *left) {
  const int block_height = 32;
  (void)above;
  h_predictor_64xh(dst, stride, left, block_height);
}
1458 
1459 #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
// H intra predictor for a 64x16 block. `above` is unused.
// Delegates to h_predictor_64xh() with a height of 16 rows.
void aom_h_predictor_64x16_sse2(uint8_t *dst, ptrdiff_t stride,
                                const uint8_t *above, const uint8_t *left) {
  const int block_height = 16;
  (void)above;
  h_predictor_64xh(dst, stride, left, block_height);
}
1465 #endif  // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
1466