xref: /aosp_15_r20/external/libaom/aom_dsp/x86/intrapred_sse2.c (revision 77c1e3ccc04c968bd2bc212e87364f250e820521)
1*77c1e3ccSAndroid Build Coastguard Worker /*
2*77c1e3ccSAndroid Build Coastguard Worker  * Copyright (c) 2017, Alliance for Open Media. All rights reserved.
3*77c1e3ccSAndroid Build Coastguard Worker  *
4*77c1e3ccSAndroid Build Coastguard Worker  * This source code is subject to the terms of the BSD 2 Clause License and
5*77c1e3ccSAndroid Build Coastguard Worker  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
6*77c1e3ccSAndroid Build Coastguard Worker  * was not distributed with this source code in the LICENSE file, you can
7*77c1e3ccSAndroid Build Coastguard Worker  * obtain it at www.aomedia.org/license/software. If the Alliance for Open
8*77c1e3ccSAndroid Build Coastguard Worker  * Media Patent License 1.0 was not distributed with this source code in the
9*77c1e3ccSAndroid Build Coastguard Worker  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
10*77c1e3ccSAndroid Build Coastguard Worker  */
11*77c1e3ccSAndroid Build Coastguard Worker 
#include <emmintrin.h>
#include <string.h>

#include "aom_dsp/x86/intrapred_x86.h"
#include "config/aom_dsp_rtcd.h"
15*77c1e3ccSAndroid Build Coastguard Worker 
// Fills a 4-pixel-wide block with the four bytes packed in `dc`.
//
// dc:     four prediction bytes packed into one 32-bit word; every row
//         receives these bytes in the word's in-memory order.
// height: number of rows to fill. Rows are written two at a time, so an odd
//         height writes one row more than requested.
// dst:    top-left byte of the destination block.
// stride: distance in bytes between the starts of consecutive rows.
//
// Each row is written with memcpy() rather than through a `uint32_t *` cast
// of `dst`: the cast form is undefined behaviour when `dst` is not 4-byte
// aligned and violates the strict-aliasing rule. A fixed 4-byte memcpy() is
// lowered by optimizing compilers to a single 32-bit store.
static inline void dc_store_4xh(uint32_t dc, int height, uint8_t *dst,
                                ptrdiff_t stride) {
  for (int i = 0; i < height; i += 2) {
    memcpy(dst, &dc, sizeof(dc));
    dst += stride;
    memcpy(dst, &dc, sizeof(dc));
    dst += stride;
  }
}
25*77c1e3ccSAndroid Build Coastguard Worker 
// Writes the low 8 bytes of `*row` to each of `height` rows of an
// 8-pixel-wide block. `dst` is the first row; consecutive rows start
// `stride` bytes apart. Nothing is written when `height` is <= 0.
static inline void dc_store_8xh(const __m128i *row, int height, uint8_t *dst,
                                ptrdiff_t stride) {
  uint8_t *out = dst;
  int rows_left = height;
  while (rows_left > 0) {
    _mm_storel_epi64((__m128i *)out, *row);
    out += stride;
    --rows_left;
  }
}
34*77c1e3ccSAndroid Build Coastguard Worker 
// Writes the 16 bytes of `*row` to each of `height` rows of a 16-pixel-wide
// block. Uses the aligned store, so every row address (`dst` plus a multiple
// of `stride`) has to be 16-byte aligned.
static inline void dc_store_16xh(const __m128i *row, int height, uint8_t *dst,
                                 ptrdiff_t stride) {
  uint8_t *out = dst;
  for (int rows_left = height; rows_left > 0; --rows_left) {
    _mm_store_si128((__m128i *)out, *row);
    out += stride;
  }
}
43*77c1e3ccSAndroid Build Coastguard Worker 
// Writes `*row` twice per line (bytes 0-15 and 16-31) for `height` rows of a
// 32-pixel-wide block. Uses aligned stores, so each row address has to be
// 16-byte aligned.
static inline void dc_store_32xh(const __m128i *row, int height, uint8_t *dst,
                                 ptrdiff_t stride) {
  uint8_t *out = dst;
  for (int rows_left = height; rows_left > 0; --rows_left) {
    for (int x = 0; x < 32; x += 16) {
      _mm_store_si128((__m128i *)(out + x), *row);
    }
    out += stride;
  }
}
53*77c1e3ccSAndroid Build Coastguard Worker 
// Writes `*row` four times per line (byte offsets 0, 16, 32 and 48) for
// `height` rows of a 64-pixel-wide block. Uses aligned stores, so each row
// address has to be 16-byte aligned.
static inline void dc_store_64xh(const __m128i *row, int height, uint8_t *dst,
                                 ptrdiff_t stride) {
  uint8_t *out = dst;
  for (int rows_left = height; rows_left > 0; --rows_left) {
    for (int x = 0; x < 64; x += 16) {
      _mm_store_si128((__m128i *)(out + x), *row);
    }
    out += stride;
  }
}
64*77c1e3ccSAndroid Build Coastguard Worker 
// Returns a vector whose low 64-bit lane holds ref[0] + ref[1] + ref[2] +
// ref[3].
//
// Note that the load fetches 8 bytes from `ref`, so ref[4..7] must be
// readable even though they do not contribute to the low lane (their sum
// ends up in the high 64-bit lane).
static inline __m128i dc_sum_4(const uint8_t *ref) {
  const __m128i zero = _mm_setzero_si128();
  const __m128i bytes = _mm_loadl_epi64((__m128i const *)ref);
  // Interleave with zero bytes so the low 8 bytes contain only ref[0..3].
  const __m128i widened = _mm_unpacklo_epi8(bytes, zero);
  // SAD against zero adds up the bytes of each 64-bit lane.
  return _mm_sad_epu8(widened, zero);
}
71*77c1e3ccSAndroid Build Coastguard Worker 
// Returns a vector whose low 64-bit lane holds the sum of ref[0..7]. Only 8
// bytes are loaded; the upper half of the vector is zero.
static inline __m128i dc_sum_8(const uint8_t *ref) {
  const __m128i bytes = _mm_loadl_epi64((__m128i const *)ref);
  // SAD against zero adds up the bytes of each 64-bit lane.
  return _mm_sad_epu8(bytes, _mm_setzero_si128());
}
77*77c1e3ccSAndroid Build Coastguard Worker 
// Returns a vector whose low 16 bits hold the sum of ref[0..63].
//
// `ref` is read with aligned 16-byte loads and therefore has to be 16-byte
// aligned. The largest possible total, 64 * 255 = 16320, fits in the 16-bit
// lanes used for accumulation.
static inline __m128i dc_sum_64(const uint8_t *ref) {
  const __m128i zero = _mm_setzero_si128();
  __m128i acc = zero;
  for (int off = 0; off < 64; off += 16) {
    const __m128i chunk = _mm_load_si128((__m128i const *)(ref + off));
    // SAD against zero yields one partial byte sum per 64-bit lane.
    acc = _mm_add_epi16(acc, _mm_sad_epu8(chunk, zero));
  }
  // Fold the high lane's partial sum onto the low lane.
  return _mm_add_epi16(acc, _mm_unpackhi_epi64(acc, acc));
}
94*77c1e3ccSAndroid Build Coastguard Worker 
// 16-bit fixed-point reciprocals: 0x5556 / 2^16 is just above 1/3 and
// 0x3334 / 2^16 is just above 1/5. In this file the first is used for blocks
// with a 1:2 aspect ratio and the second for blocks with a 1:4 aspect ratio.
#define DC_MULTIPLIER_1X2 0x5556
#define DC_MULTIPLIER_1X4 0x3334

// Right shift that removes the fixed-point scale of the multipliers above.
#define DC_SHIFT2 16

// Approximates a division without a divide instruction: `num` is first
// shifted right by `shift1`, then multiplied by the fixed-point reciprocal
// `multiplier` and scaled back down by DC_SHIFT2 bits.
static inline int divide_using_multiply_shift(int num, int shift1,
                                              int multiplier) {
  return ((num >> shift1) * multiplier) >> DC_SHIFT2;
}
105*77c1e3ccSAndroid Build Coastguard Worker 
106*77c1e3ccSAndroid Build Coastguard Worker // -----------------------------------------------------------------------------
107*77c1e3ccSAndroid Build Coastguard Worker // DC_PRED
108*77c1e3ccSAndroid Build Coastguard Worker 
aom_dc_predictor_4x8_sse2(uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left)109*77c1e3ccSAndroid Build Coastguard Worker void aom_dc_predictor_4x8_sse2(uint8_t *dst, ptrdiff_t stride,
110*77c1e3ccSAndroid Build Coastguard Worker                                const uint8_t *above, const uint8_t *left) {
111*77c1e3ccSAndroid Build Coastguard Worker   const __m128i sum_left = dc_sum_8(left);
112*77c1e3ccSAndroid Build Coastguard Worker   __m128i sum_above = dc_sum_4(above);
113*77c1e3ccSAndroid Build Coastguard Worker   sum_above = _mm_add_epi16(sum_left, sum_above);
114*77c1e3ccSAndroid Build Coastguard Worker 
115*77c1e3ccSAndroid Build Coastguard Worker   uint32_t sum = (uint32_t)_mm_cvtsi128_si32(sum_above);
116*77c1e3ccSAndroid Build Coastguard Worker   sum += 6;
117*77c1e3ccSAndroid Build Coastguard Worker   sum = divide_using_multiply_shift(sum, 2, DC_MULTIPLIER_1X2);
118*77c1e3ccSAndroid Build Coastguard Worker 
119*77c1e3ccSAndroid Build Coastguard Worker   const __m128i row = _mm_set1_epi8((int8_t)sum);
120*77c1e3ccSAndroid Build Coastguard Worker   const uint32_t pred = (uint32_t)_mm_cvtsi128_si32(row);
121*77c1e3ccSAndroid Build Coastguard Worker   dc_store_4xh(pred, 8, dst, stride);
122*77c1e3ccSAndroid Build Coastguard Worker }
123*77c1e3ccSAndroid Build Coastguard Worker 
124*77c1e3ccSAndroid Build Coastguard Worker #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
aom_dc_predictor_4x16_sse2(uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left)125*77c1e3ccSAndroid Build Coastguard Worker void aom_dc_predictor_4x16_sse2(uint8_t *dst, ptrdiff_t stride,
126*77c1e3ccSAndroid Build Coastguard Worker                                 const uint8_t *above, const uint8_t *left) {
127*77c1e3ccSAndroid Build Coastguard Worker   const __m128i sum_left = dc_sum_16_sse2(left);
128*77c1e3ccSAndroid Build Coastguard Worker   __m128i sum_above = dc_sum_4(above);
129*77c1e3ccSAndroid Build Coastguard Worker   sum_above = _mm_add_epi16(sum_left, sum_above);
130*77c1e3ccSAndroid Build Coastguard Worker 
131*77c1e3ccSAndroid Build Coastguard Worker   uint32_t sum = (uint32_t)_mm_cvtsi128_si32(sum_above);
132*77c1e3ccSAndroid Build Coastguard Worker   sum += 10;
133*77c1e3ccSAndroid Build Coastguard Worker   sum = divide_using_multiply_shift(sum, 2, DC_MULTIPLIER_1X4);
134*77c1e3ccSAndroid Build Coastguard Worker 
135*77c1e3ccSAndroid Build Coastguard Worker   const __m128i row = _mm_set1_epi8((int8_t)sum);
136*77c1e3ccSAndroid Build Coastguard Worker   const uint32_t pred = (uint32_t)_mm_cvtsi128_si32(row);
137*77c1e3ccSAndroid Build Coastguard Worker   dc_store_4xh(pred, 16, dst, stride);
138*77c1e3ccSAndroid Build Coastguard Worker }
139*77c1e3ccSAndroid Build Coastguard Worker #endif  // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
140*77c1e3ccSAndroid Build Coastguard Worker 
aom_dc_predictor_8x4_sse2(uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left)141*77c1e3ccSAndroid Build Coastguard Worker void aom_dc_predictor_8x4_sse2(uint8_t *dst, ptrdiff_t stride,
142*77c1e3ccSAndroid Build Coastguard Worker                                const uint8_t *above, const uint8_t *left) {
143*77c1e3ccSAndroid Build Coastguard Worker   const __m128i sum_left = dc_sum_4(left);
144*77c1e3ccSAndroid Build Coastguard Worker   __m128i sum_above = dc_sum_8(above);
145*77c1e3ccSAndroid Build Coastguard Worker   sum_above = _mm_add_epi16(sum_above, sum_left);
146*77c1e3ccSAndroid Build Coastguard Worker 
147*77c1e3ccSAndroid Build Coastguard Worker   uint32_t sum = (uint32_t)_mm_cvtsi128_si32(sum_above);
148*77c1e3ccSAndroid Build Coastguard Worker   sum += 6;
149*77c1e3ccSAndroid Build Coastguard Worker   sum = divide_using_multiply_shift(sum, 2, DC_MULTIPLIER_1X2);
150*77c1e3ccSAndroid Build Coastguard Worker 
151*77c1e3ccSAndroid Build Coastguard Worker   const __m128i row = _mm_set1_epi8((int8_t)sum);
152*77c1e3ccSAndroid Build Coastguard Worker   dc_store_8xh(&row, 4, dst, stride);
153*77c1e3ccSAndroid Build Coastguard Worker }
154*77c1e3ccSAndroid Build Coastguard Worker 
aom_dc_predictor_8x16_sse2(uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left)155*77c1e3ccSAndroid Build Coastguard Worker void aom_dc_predictor_8x16_sse2(uint8_t *dst, ptrdiff_t stride,
156*77c1e3ccSAndroid Build Coastguard Worker                                 const uint8_t *above, const uint8_t *left) {
157*77c1e3ccSAndroid Build Coastguard Worker   const __m128i sum_left = dc_sum_16_sse2(left);
158*77c1e3ccSAndroid Build Coastguard Worker   __m128i sum_above = dc_sum_8(above);
159*77c1e3ccSAndroid Build Coastguard Worker   sum_above = _mm_add_epi16(sum_above, sum_left);
160*77c1e3ccSAndroid Build Coastguard Worker 
161*77c1e3ccSAndroid Build Coastguard Worker   uint32_t sum = (uint32_t)_mm_cvtsi128_si32(sum_above);
162*77c1e3ccSAndroid Build Coastguard Worker   sum += 12;
163*77c1e3ccSAndroid Build Coastguard Worker   sum = divide_using_multiply_shift(sum, 3, DC_MULTIPLIER_1X2);
164*77c1e3ccSAndroid Build Coastguard Worker   const __m128i row = _mm_set1_epi8((int8_t)sum);
165*77c1e3ccSAndroid Build Coastguard Worker   dc_store_8xh(&row, 16, dst, stride);
166*77c1e3ccSAndroid Build Coastguard Worker }
167*77c1e3ccSAndroid Build Coastguard Worker 
168*77c1e3ccSAndroid Build Coastguard Worker #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
aom_dc_predictor_8x32_sse2(uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left)169*77c1e3ccSAndroid Build Coastguard Worker void aom_dc_predictor_8x32_sse2(uint8_t *dst, ptrdiff_t stride,
170*77c1e3ccSAndroid Build Coastguard Worker                                 const uint8_t *above, const uint8_t *left) {
171*77c1e3ccSAndroid Build Coastguard Worker   const __m128i sum_left = dc_sum_32_sse2(left);
172*77c1e3ccSAndroid Build Coastguard Worker   __m128i sum_above = dc_sum_8(above);
173*77c1e3ccSAndroid Build Coastguard Worker   sum_above = _mm_add_epi16(sum_above, sum_left);
174*77c1e3ccSAndroid Build Coastguard Worker 
175*77c1e3ccSAndroid Build Coastguard Worker   uint32_t sum = (uint32_t)_mm_cvtsi128_si32(sum_above);
176*77c1e3ccSAndroid Build Coastguard Worker   sum += 20;
177*77c1e3ccSAndroid Build Coastguard Worker   sum = divide_using_multiply_shift(sum, 3, DC_MULTIPLIER_1X4);
178*77c1e3ccSAndroid Build Coastguard Worker   const __m128i row = _mm_set1_epi8((int8_t)sum);
179*77c1e3ccSAndroid Build Coastguard Worker   dc_store_8xh(&row, 32, dst, stride);
180*77c1e3ccSAndroid Build Coastguard Worker }
181*77c1e3ccSAndroid Build Coastguard Worker 
aom_dc_predictor_16x4_sse2(uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left)182*77c1e3ccSAndroid Build Coastguard Worker void aom_dc_predictor_16x4_sse2(uint8_t *dst, ptrdiff_t stride,
183*77c1e3ccSAndroid Build Coastguard Worker                                 const uint8_t *above, const uint8_t *left) {
184*77c1e3ccSAndroid Build Coastguard Worker   const __m128i sum_left = dc_sum_4(left);
185*77c1e3ccSAndroid Build Coastguard Worker   __m128i sum_above = dc_sum_16_sse2(above);
186*77c1e3ccSAndroid Build Coastguard Worker   sum_above = _mm_add_epi16(sum_above, sum_left);
187*77c1e3ccSAndroid Build Coastguard Worker 
188*77c1e3ccSAndroid Build Coastguard Worker   uint32_t sum = (uint32_t)_mm_cvtsi128_si32(sum_above);
189*77c1e3ccSAndroid Build Coastguard Worker   sum += 10;
190*77c1e3ccSAndroid Build Coastguard Worker   sum = divide_using_multiply_shift(sum, 2, DC_MULTIPLIER_1X4);
191*77c1e3ccSAndroid Build Coastguard Worker   const __m128i row = _mm_set1_epi8((int8_t)sum);
192*77c1e3ccSAndroid Build Coastguard Worker   dc_store_16xh(&row, 4, dst, stride);
193*77c1e3ccSAndroid Build Coastguard Worker }
194*77c1e3ccSAndroid Build Coastguard Worker #endif  // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
195*77c1e3ccSAndroid Build Coastguard Worker 
aom_dc_predictor_16x8_sse2(uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left)196*77c1e3ccSAndroid Build Coastguard Worker void aom_dc_predictor_16x8_sse2(uint8_t *dst, ptrdiff_t stride,
197*77c1e3ccSAndroid Build Coastguard Worker                                 const uint8_t *above, const uint8_t *left) {
198*77c1e3ccSAndroid Build Coastguard Worker   const __m128i sum_left = dc_sum_8(left);
199*77c1e3ccSAndroid Build Coastguard Worker   __m128i sum_above = dc_sum_16_sse2(above);
200*77c1e3ccSAndroid Build Coastguard Worker   sum_above = _mm_add_epi16(sum_above, sum_left);
201*77c1e3ccSAndroid Build Coastguard Worker 
202*77c1e3ccSAndroid Build Coastguard Worker   uint32_t sum = (uint32_t)_mm_cvtsi128_si32(sum_above);
203*77c1e3ccSAndroid Build Coastguard Worker   sum += 12;
204*77c1e3ccSAndroid Build Coastguard Worker   sum = divide_using_multiply_shift(sum, 3, DC_MULTIPLIER_1X2);
205*77c1e3ccSAndroid Build Coastguard Worker   const __m128i row = _mm_set1_epi8((int8_t)sum);
206*77c1e3ccSAndroid Build Coastguard Worker   dc_store_16xh(&row, 8, dst, stride);
207*77c1e3ccSAndroid Build Coastguard Worker }
208*77c1e3ccSAndroid Build Coastguard Worker 
aom_dc_predictor_16x32_sse2(uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left)209*77c1e3ccSAndroid Build Coastguard Worker void aom_dc_predictor_16x32_sse2(uint8_t *dst, ptrdiff_t stride,
210*77c1e3ccSAndroid Build Coastguard Worker                                  const uint8_t *above, const uint8_t *left) {
211*77c1e3ccSAndroid Build Coastguard Worker   const __m128i sum_left = dc_sum_32_sse2(left);
212*77c1e3ccSAndroid Build Coastguard Worker   __m128i sum_above = dc_sum_16_sse2(above);
213*77c1e3ccSAndroid Build Coastguard Worker   sum_above = _mm_add_epi16(sum_left, sum_above);
214*77c1e3ccSAndroid Build Coastguard Worker 
215*77c1e3ccSAndroid Build Coastguard Worker   uint32_t sum = (uint32_t)_mm_cvtsi128_si32(sum_above);
216*77c1e3ccSAndroid Build Coastguard Worker   sum += 24;
217*77c1e3ccSAndroid Build Coastguard Worker   sum = divide_using_multiply_shift(sum, 4, DC_MULTIPLIER_1X2);
218*77c1e3ccSAndroid Build Coastguard Worker   const __m128i row = _mm_set1_epi8((int8_t)sum);
219*77c1e3ccSAndroid Build Coastguard Worker   dc_store_16xh(&row, 32, dst, stride);
220*77c1e3ccSAndroid Build Coastguard Worker }
221*77c1e3ccSAndroid Build Coastguard Worker 
222*77c1e3ccSAndroid Build Coastguard Worker #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
aom_dc_predictor_16x64_sse2(uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left)223*77c1e3ccSAndroid Build Coastguard Worker void aom_dc_predictor_16x64_sse2(uint8_t *dst, ptrdiff_t stride,
224*77c1e3ccSAndroid Build Coastguard Worker                                  const uint8_t *above, const uint8_t *left) {
225*77c1e3ccSAndroid Build Coastguard Worker   const __m128i sum_left = dc_sum_64(left);
226*77c1e3ccSAndroid Build Coastguard Worker   __m128i sum_above = dc_sum_16_sse2(above);
227*77c1e3ccSAndroid Build Coastguard Worker   sum_above = _mm_add_epi16(sum_left, sum_above);
228*77c1e3ccSAndroid Build Coastguard Worker 
229*77c1e3ccSAndroid Build Coastguard Worker   uint32_t sum = (uint32_t)_mm_cvtsi128_si32(sum_above);
230*77c1e3ccSAndroid Build Coastguard Worker   sum += 40;
231*77c1e3ccSAndroid Build Coastguard Worker   sum = divide_using_multiply_shift(sum, 4, DC_MULTIPLIER_1X4);
232*77c1e3ccSAndroid Build Coastguard Worker   const __m128i row = _mm_set1_epi8((int8_t)sum);
233*77c1e3ccSAndroid Build Coastguard Worker   dc_store_16xh(&row, 64, dst, stride);
234*77c1e3ccSAndroid Build Coastguard Worker }
235*77c1e3ccSAndroid Build Coastguard Worker 
aom_dc_predictor_32x8_sse2(uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left)236*77c1e3ccSAndroid Build Coastguard Worker void aom_dc_predictor_32x8_sse2(uint8_t *dst, ptrdiff_t stride,
237*77c1e3ccSAndroid Build Coastguard Worker                                 const uint8_t *above, const uint8_t *left) {
238*77c1e3ccSAndroid Build Coastguard Worker   __m128i sum_above = dc_sum_32_sse2(above);
239*77c1e3ccSAndroid Build Coastguard Worker   const __m128i sum_left = dc_sum_8(left);
240*77c1e3ccSAndroid Build Coastguard Worker   sum_above = _mm_add_epi16(sum_above, sum_left);
241*77c1e3ccSAndroid Build Coastguard Worker 
242*77c1e3ccSAndroid Build Coastguard Worker   uint32_t sum = (uint32_t)_mm_cvtsi128_si32(sum_above);
243*77c1e3ccSAndroid Build Coastguard Worker   sum += 20;
244*77c1e3ccSAndroid Build Coastguard Worker   sum = divide_using_multiply_shift(sum, 3, DC_MULTIPLIER_1X4);
245*77c1e3ccSAndroid Build Coastguard Worker   const __m128i row = _mm_set1_epi8((int8_t)sum);
246*77c1e3ccSAndroid Build Coastguard Worker   dc_store_32xh(&row, 8, dst, stride);
247*77c1e3ccSAndroid Build Coastguard Worker }
248*77c1e3ccSAndroid Build Coastguard Worker #endif  // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
249*77c1e3ccSAndroid Build Coastguard Worker 
aom_dc_predictor_32x16_sse2(uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left)250*77c1e3ccSAndroid Build Coastguard Worker void aom_dc_predictor_32x16_sse2(uint8_t *dst, ptrdiff_t stride,
251*77c1e3ccSAndroid Build Coastguard Worker                                  const uint8_t *above, const uint8_t *left) {
252*77c1e3ccSAndroid Build Coastguard Worker   __m128i sum_above = dc_sum_32_sse2(above);
253*77c1e3ccSAndroid Build Coastguard Worker   const __m128i sum_left = dc_sum_16_sse2(left);
254*77c1e3ccSAndroid Build Coastguard Worker   sum_above = _mm_add_epi16(sum_above, sum_left);
255*77c1e3ccSAndroid Build Coastguard Worker 
256*77c1e3ccSAndroid Build Coastguard Worker   uint32_t sum = (uint32_t)_mm_cvtsi128_si32(sum_above);
257*77c1e3ccSAndroid Build Coastguard Worker   sum += 24;
258*77c1e3ccSAndroid Build Coastguard Worker   sum = divide_using_multiply_shift(sum, 4, DC_MULTIPLIER_1X2);
259*77c1e3ccSAndroid Build Coastguard Worker   const __m128i row = _mm_set1_epi8((int8_t)sum);
260*77c1e3ccSAndroid Build Coastguard Worker   dc_store_32xh(&row, 16, dst, stride);
261*77c1e3ccSAndroid Build Coastguard Worker }
262*77c1e3ccSAndroid Build Coastguard Worker 
aom_dc_predictor_32x64_sse2(uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left)263*77c1e3ccSAndroid Build Coastguard Worker void aom_dc_predictor_32x64_sse2(uint8_t *dst, ptrdiff_t stride,
264*77c1e3ccSAndroid Build Coastguard Worker                                  const uint8_t *above, const uint8_t *left) {
265*77c1e3ccSAndroid Build Coastguard Worker   __m128i sum_above = dc_sum_32_sse2(above);
266*77c1e3ccSAndroid Build Coastguard Worker   const __m128i sum_left = dc_sum_64(left);
267*77c1e3ccSAndroid Build Coastguard Worker   sum_above = _mm_add_epi16(sum_above, sum_left);
268*77c1e3ccSAndroid Build Coastguard Worker 
269*77c1e3ccSAndroid Build Coastguard Worker   uint32_t sum = (uint32_t)_mm_cvtsi128_si32(sum_above);
270*77c1e3ccSAndroid Build Coastguard Worker   sum += 48;
271*77c1e3ccSAndroid Build Coastguard Worker   sum = divide_using_multiply_shift(sum, 5, DC_MULTIPLIER_1X2);
272*77c1e3ccSAndroid Build Coastguard Worker   const __m128i row = _mm_set1_epi8((int8_t)sum);
273*77c1e3ccSAndroid Build Coastguard Worker   dc_store_32xh(&row, 64, dst, stride);
274*77c1e3ccSAndroid Build Coastguard Worker }
275*77c1e3ccSAndroid Build Coastguard Worker 
aom_dc_predictor_64x64_sse2(uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left)276*77c1e3ccSAndroid Build Coastguard Worker void aom_dc_predictor_64x64_sse2(uint8_t *dst, ptrdiff_t stride,
277*77c1e3ccSAndroid Build Coastguard Worker                                  const uint8_t *above, const uint8_t *left) {
278*77c1e3ccSAndroid Build Coastguard Worker   __m128i sum_above = dc_sum_64(above);
279*77c1e3ccSAndroid Build Coastguard Worker   const __m128i sum_left = dc_sum_64(left);
280*77c1e3ccSAndroid Build Coastguard Worker   sum_above = _mm_add_epi16(sum_above, sum_left);
281*77c1e3ccSAndroid Build Coastguard Worker 
282*77c1e3ccSAndroid Build Coastguard Worker   uint32_t sum = (uint32_t)_mm_cvtsi128_si32(sum_above);
283*77c1e3ccSAndroid Build Coastguard Worker   sum += 64;
284*77c1e3ccSAndroid Build Coastguard Worker   sum /= 128;
285*77c1e3ccSAndroid Build Coastguard Worker   const __m128i row = _mm_set1_epi8((int8_t)sum);
286*77c1e3ccSAndroid Build Coastguard Worker   dc_store_64xh(&row, 64, dst, stride);
287*77c1e3ccSAndroid Build Coastguard Worker }
288*77c1e3ccSAndroid Build Coastguard Worker 
aom_dc_predictor_64x32_sse2(uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left)289*77c1e3ccSAndroid Build Coastguard Worker void aom_dc_predictor_64x32_sse2(uint8_t *dst, ptrdiff_t stride,
290*77c1e3ccSAndroid Build Coastguard Worker                                  const uint8_t *above, const uint8_t *left) {
291*77c1e3ccSAndroid Build Coastguard Worker   __m128i sum_above = dc_sum_64(above);
292*77c1e3ccSAndroid Build Coastguard Worker   const __m128i sum_left = dc_sum_32_sse2(left);
293*77c1e3ccSAndroid Build Coastguard Worker   sum_above = _mm_add_epi16(sum_above, sum_left);
294*77c1e3ccSAndroid Build Coastguard Worker 
295*77c1e3ccSAndroid Build Coastguard Worker   uint32_t sum = (uint32_t)_mm_cvtsi128_si32(sum_above);
296*77c1e3ccSAndroid Build Coastguard Worker   sum += 48;
297*77c1e3ccSAndroid Build Coastguard Worker   sum = divide_using_multiply_shift(sum, 5, DC_MULTIPLIER_1X2);
298*77c1e3ccSAndroid Build Coastguard Worker   const __m128i row = _mm_set1_epi8((int8_t)sum);
299*77c1e3ccSAndroid Build Coastguard Worker   dc_store_64xh(&row, 32, dst, stride);
300*77c1e3ccSAndroid Build Coastguard Worker }
301*77c1e3ccSAndroid Build Coastguard Worker 
302*77c1e3ccSAndroid Build Coastguard Worker #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
aom_dc_predictor_64x16_sse2(uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left)303*77c1e3ccSAndroid Build Coastguard Worker void aom_dc_predictor_64x16_sse2(uint8_t *dst, ptrdiff_t stride,
304*77c1e3ccSAndroid Build Coastguard Worker                                  const uint8_t *above, const uint8_t *left) {
305*77c1e3ccSAndroid Build Coastguard Worker   __m128i sum_above = dc_sum_64(above);
306*77c1e3ccSAndroid Build Coastguard Worker   const __m128i sum_left = dc_sum_16_sse2(left);
307*77c1e3ccSAndroid Build Coastguard Worker   sum_above = _mm_add_epi16(sum_above, sum_left);
308*77c1e3ccSAndroid Build Coastguard Worker 
309*77c1e3ccSAndroid Build Coastguard Worker   uint32_t sum = (uint32_t)_mm_cvtsi128_si32(sum_above);
310*77c1e3ccSAndroid Build Coastguard Worker   sum += 40;
311*77c1e3ccSAndroid Build Coastguard Worker   sum = divide_using_multiply_shift(sum, 4, DC_MULTIPLIER_1X4);
312*77c1e3ccSAndroid Build Coastguard Worker   const __m128i row = _mm_set1_epi8((int8_t)sum);
313*77c1e3ccSAndroid Build Coastguard Worker   dc_store_64xh(&row, 16, dst, stride);
314*77c1e3ccSAndroid Build Coastguard Worker }
315*77c1e3ccSAndroid Build Coastguard Worker #endif  // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
316*77c1e3ccSAndroid Build Coastguard Worker 
317*77c1e3ccSAndroid Build Coastguard Worker // -----------------------------------------------------------------------------
318*77c1e3ccSAndroid Build Coastguard Worker // DC_TOP
319*77c1e3ccSAndroid Build Coastguard Worker 
aom_dc_top_predictor_4x8_sse2(uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left)320*77c1e3ccSAndroid Build Coastguard Worker void aom_dc_top_predictor_4x8_sse2(uint8_t *dst, ptrdiff_t stride,
321*77c1e3ccSAndroid Build Coastguard Worker                                    const uint8_t *above, const uint8_t *left) {
322*77c1e3ccSAndroid Build Coastguard Worker   (void)left;
323*77c1e3ccSAndroid Build Coastguard Worker   __m128i sum_above = dc_sum_4(above);
324*77c1e3ccSAndroid Build Coastguard Worker   const __m128i two = _mm_set1_epi16(2);
325*77c1e3ccSAndroid Build Coastguard Worker   sum_above = _mm_add_epi16(sum_above, two);
326*77c1e3ccSAndroid Build Coastguard Worker   sum_above = _mm_srai_epi16(sum_above, 2);
327*77c1e3ccSAndroid Build Coastguard Worker   sum_above = _mm_shufflelo_epi16(sum_above, 0);
328*77c1e3ccSAndroid Build Coastguard Worker   sum_above = _mm_packus_epi16(sum_above, sum_above);
329*77c1e3ccSAndroid Build Coastguard Worker 
330*77c1e3ccSAndroid Build Coastguard Worker   const uint32_t pred = (uint32_t)_mm_cvtsi128_si32(sum_above);
331*77c1e3ccSAndroid Build Coastguard Worker   dc_store_4xh(pred, 8, dst, stride);
332*77c1e3ccSAndroid Build Coastguard Worker }
333*77c1e3ccSAndroid Build Coastguard Worker 
334*77c1e3ccSAndroid Build Coastguard Worker #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
aom_dc_top_predictor_4x16_sse2(uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left)335*77c1e3ccSAndroid Build Coastguard Worker void aom_dc_top_predictor_4x16_sse2(uint8_t *dst, ptrdiff_t stride,
336*77c1e3ccSAndroid Build Coastguard Worker                                     const uint8_t *above, const uint8_t *left) {
337*77c1e3ccSAndroid Build Coastguard Worker   (void)left;
338*77c1e3ccSAndroid Build Coastguard Worker   __m128i sum_above = dc_sum_4(above);
339*77c1e3ccSAndroid Build Coastguard Worker   const __m128i two = _mm_set1_epi16(2);
340*77c1e3ccSAndroid Build Coastguard Worker   sum_above = _mm_add_epi16(sum_above, two);
341*77c1e3ccSAndroid Build Coastguard Worker   sum_above = _mm_srai_epi16(sum_above, 2);
342*77c1e3ccSAndroid Build Coastguard Worker   sum_above = _mm_shufflelo_epi16(sum_above, 0);
343*77c1e3ccSAndroid Build Coastguard Worker   sum_above = _mm_packus_epi16(sum_above, sum_above);
344*77c1e3ccSAndroid Build Coastguard Worker 
345*77c1e3ccSAndroid Build Coastguard Worker   const uint32_t pred = (uint32_t)_mm_cvtsi128_si32(sum_above);
346*77c1e3ccSAndroid Build Coastguard Worker   dc_store_4xh(pred, 16, dst, stride);
347*77c1e3ccSAndroid Build Coastguard Worker }
348*77c1e3ccSAndroid Build Coastguard Worker #endif  // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
349*77c1e3ccSAndroid Build Coastguard Worker 
aom_dc_top_predictor_8x4_sse2(uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left)350*77c1e3ccSAndroid Build Coastguard Worker void aom_dc_top_predictor_8x4_sse2(uint8_t *dst, ptrdiff_t stride,
351*77c1e3ccSAndroid Build Coastguard Worker                                    const uint8_t *above, const uint8_t *left) {
352*77c1e3ccSAndroid Build Coastguard Worker   (void)left;
353*77c1e3ccSAndroid Build Coastguard Worker   __m128i sum_above = dc_sum_8(above);
354*77c1e3ccSAndroid Build Coastguard Worker   const __m128i four = _mm_set1_epi16(4);
355*77c1e3ccSAndroid Build Coastguard Worker   sum_above = _mm_add_epi16(sum_above, four);
356*77c1e3ccSAndroid Build Coastguard Worker   sum_above = _mm_srai_epi16(sum_above, 3);
357*77c1e3ccSAndroid Build Coastguard Worker   sum_above = _mm_unpacklo_epi8(sum_above, sum_above);
358*77c1e3ccSAndroid Build Coastguard Worker   const __m128i row = _mm_shufflelo_epi16(sum_above, 0);
359*77c1e3ccSAndroid Build Coastguard Worker   dc_store_8xh(&row, 4, dst, stride);
360*77c1e3ccSAndroid Build Coastguard Worker }
361*77c1e3ccSAndroid Build Coastguard Worker 
aom_dc_top_predictor_8x16_sse2(uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left)362*77c1e3ccSAndroid Build Coastguard Worker void aom_dc_top_predictor_8x16_sse2(uint8_t *dst, ptrdiff_t stride,
363*77c1e3ccSAndroid Build Coastguard Worker                                     const uint8_t *above, const uint8_t *left) {
364*77c1e3ccSAndroid Build Coastguard Worker   (void)left;
365*77c1e3ccSAndroid Build Coastguard Worker   __m128i sum_above = dc_sum_8(above);
366*77c1e3ccSAndroid Build Coastguard Worker   const __m128i four = _mm_set1_epi16(4);
367*77c1e3ccSAndroid Build Coastguard Worker   sum_above = _mm_add_epi16(sum_above, four);
368*77c1e3ccSAndroid Build Coastguard Worker   sum_above = _mm_srai_epi16(sum_above, 3);
369*77c1e3ccSAndroid Build Coastguard Worker   sum_above = _mm_unpacklo_epi8(sum_above, sum_above);
370*77c1e3ccSAndroid Build Coastguard Worker   const __m128i row = _mm_shufflelo_epi16(sum_above, 0);
371*77c1e3ccSAndroid Build Coastguard Worker   dc_store_8xh(&row, 16, dst, stride);
372*77c1e3ccSAndroid Build Coastguard Worker }
373*77c1e3ccSAndroid Build Coastguard Worker 
374*77c1e3ccSAndroid Build Coastguard Worker #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
aom_dc_top_predictor_8x32_sse2(uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left)375*77c1e3ccSAndroid Build Coastguard Worker void aom_dc_top_predictor_8x32_sse2(uint8_t *dst, ptrdiff_t stride,
376*77c1e3ccSAndroid Build Coastguard Worker                                     const uint8_t *above, const uint8_t *left) {
377*77c1e3ccSAndroid Build Coastguard Worker   (void)left;
378*77c1e3ccSAndroid Build Coastguard Worker   __m128i sum_above = dc_sum_8(above);
379*77c1e3ccSAndroid Build Coastguard Worker   const __m128i four = _mm_set1_epi16(4);
380*77c1e3ccSAndroid Build Coastguard Worker   sum_above = _mm_add_epi16(sum_above, four);
381*77c1e3ccSAndroid Build Coastguard Worker   sum_above = _mm_srai_epi16(sum_above, 3);
382*77c1e3ccSAndroid Build Coastguard Worker   sum_above = _mm_unpacklo_epi8(sum_above, sum_above);
383*77c1e3ccSAndroid Build Coastguard Worker   const __m128i row = _mm_shufflelo_epi16(sum_above, 0);
384*77c1e3ccSAndroid Build Coastguard Worker   dc_store_8xh(&row, 32, dst, stride);
385*77c1e3ccSAndroid Build Coastguard Worker }
386*77c1e3ccSAndroid Build Coastguard Worker 
aom_dc_top_predictor_16x4_sse2(uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left)387*77c1e3ccSAndroid Build Coastguard Worker void aom_dc_top_predictor_16x4_sse2(uint8_t *dst, ptrdiff_t stride,
388*77c1e3ccSAndroid Build Coastguard Worker                                     const uint8_t *above, const uint8_t *left) {
389*77c1e3ccSAndroid Build Coastguard Worker   (void)left;
390*77c1e3ccSAndroid Build Coastguard Worker   __m128i sum_above = dc_sum_16_sse2(above);
391*77c1e3ccSAndroid Build Coastguard Worker   const __m128i eight = _mm_set1_epi16(8);
392*77c1e3ccSAndroid Build Coastguard Worker   sum_above = _mm_add_epi16(sum_above, eight);
393*77c1e3ccSAndroid Build Coastguard Worker   sum_above = _mm_srai_epi16(sum_above, 4);
394*77c1e3ccSAndroid Build Coastguard Worker   sum_above = _mm_unpacklo_epi8(sum_above, sum_above);
395*77c1e3ccSAndroid Build Coastguard Worker   sum_above = _mm_shufflelo_epi16(sum_above, 0);
396*77c1e3ccSAndroid Build Coastguard Worker   const __m128i row = _mm_unpacklo_epi64(sum_above, sum_above);
397*77c1e3ccSAndroid Build Coastguard Worker   dc_store_16xh(&row, 4, dst, stride);
398*77c1e3ccSAndroid Build Coastguard Worker }
399*77c1e3ccSAndroid Build Coastguard Worker #endif  // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
400*77c1e3ccSAndroid Build Coastguard Worker 
aom_dc_top_predictor_16x8_sse2(uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left)401*77c1e3ccSAndroid Build Coastguard Worker void aom_dc_top_predictor_16x8_sse2(uint8_t *dst, ptrdiff_t stride,
402*77c1e3ccSAndroid Build Coastguard Worker                                     const uint8_t *above, const uint8_t *left) {
403*77c1e3ccSAndroid Build Coastguard Worker   (void)left;
404*77c1e3ccSAndroid Build Coastguard Worker   __m128i sum_above = dc_sum_16_sse2(above);
405*77c1e3ccSAndroid Build Coastguard Worker   const __m128i eight = _mm_set1_epi16(8);
406*77c1e3ccSAndroid Build Coastguard Worker   sum_above = _mm_add_epi16(sum_above, eight);
407*77c1e3ccSAndroid Build Coastguard Worker   sum_above = _mm_srai_epi16(sum_above, 4);
408*77c1e3ccSAndroid Build Coastguard Worker   sum_above = _mm_unpacklo_epi8(sum_above, sum_above);
409*77c1e3ccSAndroid Build Coastguard Worker   sum_above = _mm_shufflelo_epi16(sum_above, 0);
410*77c1e3ccSAndroid Build Coastguard Worker   const __m128i row = _mm_unpacklo_epi64(sum_above, sum_above);
411*77c1e3ccSAndroid Build Coastguard Worker   dc_store_16xh(&row, 8, dst, stride);
412*77c1e3ccSAndroid Build Coastguard Worker }
413*77c1e3ccSAndroid Build Coastguard Worker 
aom_dc_top_predictor_16x32_sse2(uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left)414*77c1e3ccSAndroid Build Coastguard Worker void aom_dc_top_predictor_16x32_sse2(uint8_t *dst, ptrdiff_t stride,
415*77c1e3ccSAndroid Build Coastguard Worker                                      const uint8_t *above,
416*77c1e3ccSAndroid Build Coastguard Worker                                      const uint8_t *left) {
417*77c1e3ccSAndroid Build Coastguard Worker   (void)left;
418*77c1e3ccSAndroid Build Coastguard Worker   __m128i sum_above = dc_sum_16_sse2(above);
419*77c1e3ccSAndroid Build Coastguard Worker   const __m128i eight = _mm_set1_epi16(8);
420*77c1e3ccSAndroid Build Coastguard Worker   sum_above = _mm_add_epi16(sum_above, eight);
421*77c1e3ccSAndroid Build Coastguard Worker   sum_above = _mm_srai_epi16(sum_above, 4);
422*77c1e3ccSAndroid Build Coastguard Worker   sum_above = _mm_unpacklo_epi8(sum_above, sum_above);
423*77c1e3ccSAndroid Build Coastguard Worker   sum_above = _mm_shufflelo_epi16(sum_above, 0);
424*77c1e3ccSAndroid Build Coastguard Worker   const __m128i row = _mm_unpacklo_epi64(sum_above, sum_above);
425*77c1e3ccSAndroid Build Coastguard Worker   dc_store_16xh(&row, 32, dst, stride);
426*77c1e3ccSAndroid Build Coastguard Worker }
427*77c1e3ccSAndroid Build Coastguard Worker 
428*77c1e3ccSAndroid Build Coastguard Worker #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
aom_dc_top_predictor_16x64_sse2(uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left)429*77c1e3ccSAndroid Build Coastguard Worker void aom_dc_top_predictor_16x64_sse2(uint8_t *dst, ptrdiff_t stride,
430*77c1e3ccSAndroid Build Coastguard Worker                                      const uint8_t *above,
431*77c1e3ccSAndroid Build Coastguard Worker                                      const uint8_t *left) {
432*77c1e3ccSAndroid Build Coastguard Worker   (void)left;
433*77c1e3ccSAndroid Build Coastguard Worker   __m128i sum_above = dc_sum_16_sse2(above);
434*77c1e3ccSAndroid Build Coastguard Worker   const __m128i eight = _mm_set1_epi16(8);
435*77c1e3ccSAndroid Build Coastguard Worker   sum_above = _mm_add_epi16(sum_above, eight);
436*77c1e3ccSAndroid Build Coastguard Worker   sum_above = _mm_srai_epi16(sum_above, 4);
437*77c1e3ccSAndroid Build Coastguard Worker   sum_above = _mm_unpacklo_epi8(sum_above, sum_above);
438*77c1e3ccSAndroid Build Coastguard Worker   sum_above = _mm_shufflelo_epi16(sum_above, 0);
439*77c1e3ccSAndroid Build Coastguard Worker   const __m128i row = _mm_unpacklo_epi64(sum_above, sum_above);
440*77c1e3ccSAndroid Build Coastguard Worker   dc_store_16xh(&row, 64, dst, stride);
441*77c1e3ccSAndroid Build Coastguard Worker }
442*77c1e3ccSAndroid Build Coastguard Worker 
aom_dc_top_predictor_32x8_sse2(uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left)443*77c1e3ccSAndroid Build Coastguard Worker void aom_dc_top_predictor_32x8_sse2(uint8_t *dst, ptrdiff_t stride,
444*77c1e3ccSAndroid Build Coastguard Worker                                     const uint8_t *above, const uint8_t *left) {
445*77c1e3ccSAndroid Build Coastguard Worker   (void)left;
446*77c1e3ccSAndroid Build Coastguard Worker   __m128i sum_above = dc_sum_32_sse2(above);
447*77c1e3ccSAndroid Build Coastguard Worker   const __m128i sixteen = _mm_set1_epi16(16);
448*77c1e3ccSAndroid Build Coastguard Worker   sum_above = _mm_add_epi16(sum_above, sixteen);
449*77c1e3ccSAndroid Build Coastguard Worker   sum_above = _mm_srai_epi16(sum_above, 5);
450*77c1e3ccSAndroid Build Coastguard Worker   sum_above = _mm_unpacklo_epi8(sum_above, sum_above);
451*77c1e3ccSAndroid Build Coastguard Worker   sum_above = _mm_shufflelo_epi16(sum_above, 0);
452*77c1e3ccSAndroid Build Coastguard Worker   const __m128i row = _mm_unpacklo_epi64(sum_above, sum_above);
453*77c1e3ccSAndroid Build Coastguard Worker   dc_store_32xh(&row, 8, dst, stride);
454*77c1e3ccSAndroid Build Coastguard Worker }
455*77c1e3ccSAndroid Build Coastguard Worker #endif  // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
456*77c1e3ccSAndroid Build Coastguard Worker 
aom_dc_top_predictor_32x16_sse2(uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left)457*77c1e3ccSAndroid Build Coastguard Worker void aom_dc_top_predictor_32x16_sse2(uint8_t *dst, ptrdiff_t stride,
458*77c1e3ccSAndroid Build Coastguard Worker                                      const uint8_t *above,
459*77c1e3ccSAndroid Build Coastguard Worker                                      const uint8_t *left) {
460*77c1e3ccSAndroid Build Coastguard Worker   (void)left;
461*77c1e3ccSAndroid Build Coastguard Worker   __m128i sum_above = dc_sum_32_sse2(above);
462*77c1e3ccSAndroid Build Coastguard Worker   const __m128i sixteen = _mm_set1_epi16(16);
463*77c1e3ccSAndroid Build Coastguard Worker   sum_above = _mm_add_epi16(sum_above, sixteen);
464*77c1e3ccSAndroid Build Coastguard Worker   sum_above = _mm_srai_epi16(sum_above, 5);
465*77c1e3ccSAndroid Build Coastguard Worker   sum_above = _mm_unpacklo_epi8(sum_above, sum_above);
466*77c1e3ccSAndroid Build Coastguard Worker   sum_above = _mm_shufflelo_epi16(sum_above, 0);
467*77c1e3ccSAndroid Build Coastguard Worker   const __m128i row = _mm_unpacklo_epi64(sum_above, sum_above);
468*77c1e3ccSAndroid Build Coastguard Worker   dc_store_32xh(&row, 16, dst, stride);
469*77c1e3ccSAndroid Build Coastguard Worker }
470*77c1e3ccSAndroid Build Coastguard Worker 
aom_dc_top_predictor_32x64_sse2(uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left)471*77c1e3ccSAndroid Build Coastguard Worker void aom_dc_top_predictor_32x64_sse2(uint8_t *dst, ptrdiff_t stride,
472*77c1e3ccSAndroid Build Coastguard Worker                                      const uint8_t *above,
473*77c1e3ccSAndroid Build Coastguard Worker                                      const uint8_t *left) {
474*77c1e3ccSAndroid Build Coastguard Worker   (void)left;
475*77c1e3ccSAndroid Build Coastguard Worker   __m128i sum_above = dc_sum_32_sse2(above);
476*77c1e3ccSAndroid Build Coastguard Worker   const __m128i sixteen = _mm_set1_epi16(16);
477*77c1e3ccSAndroid Build Coastguard Worker   sum_above = _mm_add_epi16(sum_above, sixteen);
478*77c1e3ccSAndroid Build Coastguard Worker   sum_above = _mm_srai_epi16(sum_above, 5);
479*77c1e3ccSAndroid Build Coastguard Worker   sum_above = _mm_unpacklo_epi8(sum_above, sum_above);
480*77c1e3ccSAndroid Build Coastguard Worker   sum_above = _mm_shufflelo_epi16(sum_above, 0);
481*77c1e3ccSAndroid Build Coastguard Worker   const __m128i row = _mm_unpacklo_epi64(sum_above, sum_above);
482*77c1e3ccSAndroid Build Coastguard Worker   dc_store_32xh(&row, 64, dst, stride);
483*77c1e3ccSAndroid Build Coastguard Worker }
484*77c1e3ccSAndroid Build Coastguard Worker 
aom_dc_top_predictor_64x64_sse2(uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left)485*77c1e3ccSAndroid Build Coastguard Worker void aom_dc_top_predictor_64x64_sse2(uint8_t *dst, ptrdiff_t stride,
486*77c1e3ccSAndroid Build Coastguard Worker                                      const uint8_t *above,
487*77c1e3ccSAndroid Build Coastguard Worker                                      const uint8_t *left) {
488*77c1e3ccSAndroid Build Coastguard Worker   (void)left;
489*77c1e3ccSAndroid Build Coastguard Worker   __m128i sum_above = dc_sum_64(above);
490*77c1e3ccSAndroid Build Coastguard Worker   const __m128i thirtytwo = _mm_set1_epi16(32);
491*77c1e3ccSAndroid Build Coastguard Worker   sum_above = _mm_add_epi16(sum_above, thirtytwo);
492*77c1e3ccSAndroid Build Coastguard Worker   sum_above = _mm_srai_epi16(sum_above, 6);
493*77c1e3ccSAndroid Build Coastguard Worker   sum_above = _mm_unpacklo_epi8(sum_above, sum_above);
494*77c1e3ccSAndroid Build Coastguard Worker   sum_above = _mm_shufflelo_epi16(sum_above, 0);
495*77c1e3ccSAndroid Build Coastguard Worker   const __m128i row = _mm_unpacklo_epi64(sum_above, sum_above);
496*77c1e3ccSAndroid Build Coastguard Worker   dc_store_64xh(&row, 64, dst, stride);
497*77c1e3ccSAndroid Build Coastguard Worker }
498*77c1e3ccSAndroid Build Coastguard Worker 
aom_dc_top_predictor_64x32_sse2(uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left)499*77c1e3ccSAndroid Build Coastguard Worker void aom_dc_top_predictor_64x32_sse2(uint8_t *dst, ptrdiff_t stride,
500*77c1e3ccSAndroid Build Coastguard Worker                                      const uint8_t *above,
501*77c1e3ccSAndroid Build Coastguard Worker                                      const uint8_t *left) {
502*77c1e3ccSAndroid Build Coastguard Worker   (void)left;
503*77c1e3ccSAndroid Build Coastguard Worker   __m128i sum_above = dc_sum_64(above);
504*77c1e3ccSAndroid Build Coastguard Worker   const __m128i thirtytwo = _mm_set1_epi16(32);
505*77c1e3ccSAndroid Build Coastguard Worker   sum_above = _mm_add_epi16(sum_above, thirtytwo);
506*77c1e3ccSAndroid Build Coastguard Worker   sum_above = _mm_srai_epi16(sum_above, 6);
507*77c1e3ccSAndroid Build Coastguard Worker   sum_above = _mm_unpacklo_epi8(sum_above, sum_above);
508*77c1e3ccSAndroid Build Coastguard Worker   sum_above = _mm_shufflelo_epi16(sum_above, 0);
509*77c1e3ccSAndroid Build Coastguard Worker   const __m128i row = _mm_unpacklo_epi64(sum_above, sum_above);
510*77c1e3ccSAndroid Build Coastguard Worker   dc_store_64xh(&row, 32, dst, stride);
511*77c1e3ccSAndroid Build Coastguard Worker }
512*77c1e3ccSAndroid Build Coastguard Worker 
513*77c1e3ccSAndroid Build Coastguard Worker #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
aom_dc_top_predictor_64x16_sse2(uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left)514*77c1e3ccSAndroid Build Coastguard Worker void aom_dc_top_predictor_64x16_sse2(uint8_t *dst, ptrdiff_t stride,
515*77c1e3ccSAndroid Build Coastguard Worker                                      const uint8_t *above,
516*77c1e3ccSAndroid Build Coastguard Worker                                      const uint8_t *left) {
517*77c1e3ccSAndroid Build Coastguard Worker   (void)left;
518*77c1e3ccSAndroid Build Coastguard Worker   __m128i sum_above = dc_sum_64(above);
519*77c1e3ccSAndroid Build Coastguard Worker   const __m128i thirtytwo = _mm_set1_epi16(32);
520*77c1e3ccSAndroid Build Coastguard Worker   sum_above = _mm_add_epi16(sum_above, thirtytwo);
521*77c1e3ccSAndroid Build Coastguard Worker   sum_above = _mm_srai_epi16(sum_above, 6);
522*77c1e3ccSAndroid Build Coastguard Worker   sum_above = _mm_unpacklo_epi8(sum_above, sum_above);
523*77c1e3ccSAndroid Build Coastguard Worker   sum_above = _mm_shufflelo_epi16(sum_above, 0);
524*77c1e3ccSAndroid Build Coastguard Worker   const __m128i row = _mm_unpacklo_epi64(sum_above, sum_above);
525*77c1e3ccSAndroid Build Coastguard Worker   dc_store_64xh(&row, 16, dst, stride);
526*77c1e3ccSAndroid Build Coastguard Worker }
527*77c1e3ccSAndroid Build Coastguard Worker #endif  // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
528*77c1e3ccSAndroid Build Coastguard Worker 
529*77c1e3ccSAndroid Build Coastguard Worker // -----------------------------------------------------------------------------
530*77c1e3ccSAndroid Build Coastguard Worker // DC_LEFT
531*77c1e3ccSAndroid Build Coastguard Worker 
aom_dc_left_predictor_4x8_sse2(uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left)532*77c1e3ccSAndroid Build Coastguard Worker void aom_dc_left_predictor_4x8_sse2(uint8_t *dst, ptrdiff_t stride,
533*77c1e3ccSAndroid Build Coastguard Worker                                     const uint8_t *above, const uint8_t *left) {
534*77c1e3ccSAndroid Build Coastguard Worker   (void)above;
535*77c1e3ccSAndroid Build Coastguard Worker   __m128i sum_left = dc_sum_8(left);
536*77c1e3ccSAndroid Build Coastguard Worker   const __m128i four = _mm_set1_epi16(4);
537*77c1e3ccSAndroid Build Coastguard Worker   sum_left = _mm_add_epi16(sum_left, four);
538*77c1e3ccSAndroid Build Coastguard Worker   sum_left = _mm_srai_epi16(sum_left, 3);
539*77c1e3ccSAndroid Build Coastguard Worker   sum_left = _mm_shufflelo_epi16(sum_left, 0);
540*77c1e3ccSAndroid Build Coastguard Worker   sum_left = _mm_packus_epi16(sum_left, sum_left);
541*77c1e3ccSAndroid Build Coastguard Worker 
542*77c1e3ccSAndroid Build Coastguard Worker   const uint32_t pred = (uint32_t)_mm_cvtsi128_si32(sum_left);
543*77c1e3ccSAndroid Build Coastguard Worker   dc_store_4xh(pred, 8, dst, stride);
544*77c1e3ccSAndroid Build Coastguard Worker }
545*77c1e3ccSAndroid Build Coastguard Worker 
546*77c1e3ccSAndroid Build Coastguard Worker #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
aom_dc_left_predictor_4x16_sse2(uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left)547*77c1e3ccSAndroid Build Coastguard Worker void aom_dc_left_predictor_4x16_sse2(uint8_t *dst, ptrdiff_t stride,
548*77c1e3ccSAndroid Build Coastguard Worker                                      const uint8_t *above,
549*77c1e3ccSAndroid Build Coastguard Worker                                      const uint8_t *left) {
550*77c1e3ccSAndroid Build Coastguard Worker   (void)above;
551*77c1e3ccSAndroid Build Coastguard Worker   __m128i sum_left = dc_sum_16_sse2(left);
552*77c1e3ccSAndroid Build Coastguard Worker   const __m128i eight = _mm_set1_epi16(8);
553*77c1e3ccSAndroid Build Coastguard Worker   sum_left = _mm_add_epi16(sum_left, eight);
554*77c1e3ccSAndroid Build Coastguard Worker   sum_left = _mm_srai_epi16(sum_left, 4);
555*77c1e3ccSAndroid Build Coastguard Worker   sum_left = _mm_shufflelo_epi16(sum_left, 0);
556*77c1e3ccSAndroid Build Coastguard Worker   sum_left = _mm_packus_epi16(sum_left, sum_left);
557*77c1e3ccSAndroid Build Coastguard Worker 
558*77c1e3ccSAndroid Build Coastguard Worker   const uint32_t pred = (uint32_t)_mm_cvtsi128_si32(sum_left);
559*77c1e3ccSAndroid Build Coastguard Worker   dc_store_4xh(pred, 16, dst, stride);
560*77c1e3ccSAndroid Build Coastguard Worker }
561*77c1e3ccSAndroid Build Coastguard Worker #endif  // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
562*77c1e3ccSAndroid Build Coastguard Worker 
aom_dc_left_predictor_8x4_sse2(uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left)563*77c1e3ccSAndroid Build Coastguard Worker void aom_dc_left_predictor_8x4_sse2(uint8_t *dst, ptrdiff_t stride,
564*77c1e3ccSAndroid Build Coastguard Worker                                     const uint8_t *above, const uint8_t *left) {
565*77c1e3ccSAndroid Build Coastguard Worker   (void)above;
566*77c1e3ccSAndroid Build Coastguard Worker   __m128i sum_left = dc_sum_4(left);
567*77c1e3ccSAndroid Build Coastguard Worker   const __m128i two = _mm_set1_epi16(2);
568*77c1e3ccSAndroid Build Coastguard Worker   sum_left = _mm_add_epi16(sum_left, two);
569*77c1e3ccSAndroid Build Coastguard Worker   sum_left = _mm_srai_epi16(sum_left, 2);
570*77c1e3ccSAndroid Build Coastguard Worker   sum_left = _mm_unpacklo_epi8(sum_left, sum_left);
571*77c1e3ccSAndroid Build Coastguard Worker   const __m128i row = _mm_shufflelo_epi16(sum_left, 0);
572*77c1e3ccSAndroid Build Coastguard Worker   dc_store_8xh(&row, 4, dst, stride);
573*77c1e3ccSAndroid Build Coastguard Worker }
574*77c1e3ccSAndroid Build Coastguard Worker 
aom_dc_left_predictor_8x16_sse2(uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left)575*77c1e3ccSAndroid Build Coastguard Worker void aom_dc_left_predictor_8x16_sse2(uint8_t *dst, ptrdiff_t stride,
576*77c1e3ccSAndroid Build Coastguard Worker                                      const uint8_t *above,
577*77c1e3ccSAndroid Build Coastguard Worker                                      const uint8_t *left) {
578*77c1e3ccSAndroid Build Coastguard Worker   (void)above;
579*77c1e3ccSAndroid Build Coastguard Worker   __m128i sum_left = dc_sum_16_sse2(left);
580*77c1e3ccSAndroid Build Coastguard Worker   const __m128i eight = _mm_set1_epi16(8);
581*77c1e3ccSAndroid Build Coastguard Worker   sum_left = _mm_add_epi16(sum_left, eight);
582*77c1e3ccSAndroid Build Coastguard Worker   sum_left = _mm_srai_epi16(sum_left, 4);
583*77c1e3ccSAndroid Build Coastguard Worker   sum_left = _mm_unpacklo_epi8(sum_left, sum_left);
584*77c1e3ccSAndroid Build Coastguard Worker   const __m128i row = _mm_shufflelo_epi16(sum_left, 0);
585*77c1e3ccSAndroid Build Coastguard Worker   dc_store_8xh(&row, 16, dst, stride);
586*77c1e3ccSAndroid Build Coastguard Worker }
587*77c1e3ccSAndroid Build Coastguard Worker 
588*77c1e3ccSAndroid Build Coastguard Worker #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
aom_dc_left_predictor_8x32_sse2(uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left)589*77c1e3ccSAndroid Build Coastguard Worker void aom_dc_left_predictor_8x32_sse2(uint8_t *dst, ptrdiff_t stride,
590*77c1e3ccSAndroid Build Coastguard Worker                                      const uint8_t *above,
591*77c1e3ccSAndroid Build Coastguard Worker                                      const uint8_t *left) {
592*77c1e3ccSAndroid Build Coastguard Worker   (void)above;
593*77c1e3ccSAndroid Build Coastguard Worker   __m128i sum_left = dc_sum_32_sse2(left);
594*77c1e3ccSAndroid Build Coastguard Worker   const __m128i sixteen = _mm_set1_epi16(16);
595*77c1e3ccSAndroid Build Coastguard Worker   sum_left = _mm_add_epi16(sum_left, sixteen);
596*77c1e3ccSAndroid Build Coastguard Worker   sum_left = _mm_srai_epi16(sum_left, 5);
597*77c1e3ccSAndroid Build Coastguard Worker   sum_left = _mm_unpacklo_epi8(sum_left, sum_left);
598*77c1e3ccSAndroid Build Coastguard Worker   const __m128i row = _mm_shufflelo_epi16(sum_left, 0);
599*77c1e3ccSAndroid Build Coastguard Worker   dc_store_8xh(&row, 32, dst, stride);
600*77c1e3ccSAndroid Build Coastguard Worker }
601*77c1e3ccSAndroid Build Coastguard Worker 
aom_dc_left_predictor_16x4_sse2(uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left)602*77c1e3ccSAndroid Build Coastguard Worker void aom_dc_left_predictor_16x4_sse2(uint8_t *dst, ptrdiff_t stride,
603*77c1e3ccSAndroid Build Coastguard Worker                                      const uint8_t *above,
604*77c1e3ccSAndroid Build Coastguard Worker                                      const uint8_t *left) {
605*77c1e3ccSAndroid Build Coastguard Worker   (void)above;
606*77c1e3ccSAndroid Build Coastguard Worker   __m128i sum_left = dc_sum_4(left);
607*77c1e3ccSAndroid Build Coastguard Worker   const __m128i two = _mm_set1_epi16(2);
608*77c1e3ccSAndroid Build Coastguard Worker   sum_left = _mm_add_epi16(sum_left, two);
609*77c1e3ccSAndroid Build Coastguard Worker   sum_left = _mm_srai_epi16(sum_left, 2);
610*77c1e3ccSAndroid Build Coastguard Worker   sum_left = _mm_unpacklo_epi8(sum_left, sum_left);
611*77c1e3ccSAndroid Build Coastguard Worker   sum_left = _mm_shufflelo_epi16(sum_left, 0);
612*77c1e3ccSAndroid Build Coastguard Worker   const __m128i row = _mm_unpacklo_epi64(sum_left, sum_left);
613*77c1e3ccSAndroid Build Coastguard Worker   dc_store_16xh(&row, 4, dst, stride);
614*77c1e3ccSAndroid Build Coastguard Worker }
615*77c1e3ccSAndroid Build Coastguard Worker #endif  // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
616*77c1e3ccSAndroid Build Coastguard Worker 
aom_dc_left_predictor_16x8_sse2(uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left)617*77c1e3ccSAndroid Build Coastguard Worker void aom_dc_left_predictor_16x8_sse2(uint8_t *dst, ptrdiff_t stride,
618*77c1e3ccSAndroid Build Coastguard Worker                                      const uint8_t *above,
619*77c1e3ccSAndroid Build Coastguard Worker                                      const uint8_t *left) {
620*77c1e3ccSAndroid Build Coastguard Worker   (void)above;
621*77c1e3ccSAndroid Build Coastguard Worker   __m128i sum_left = dc_sum_8(left);
622*77c1e3ccSAndroid Build Coastguard Worker   const __m128i four = _mm_set1_epi16(4);
623*77c1e3ccSAndroid Build Coastguard Worker   sum_left = _mm_add_epi16(sum_left, four);
624*77c1e3ccSAndroid Build Coastguard Worker   sum_left = _mm_srai_epi16(sum_left, 3);
625*77c1e3ccSAndroid Build Coastguard Worker   sum_left = _mm_unpacklo_epi8(sum_left, sum_left);
626*77c1e3ccSAndroid Build Coastguard Worker   sum_left = _mm_shufflelo_epi16(sum_left, 0);
627*77c1e3ccSAndroid Build Coastguard Worker   const __m128i row = _mm_unpacklo_epi64(sum_left, sum_left);
628*77c1e3ccSAndroid Build Coastguard Worker   dc_store_16xh(&row, 8, dst, stride);
629*77c1e3ccSAndroid Build Coastguard Worker }
630*77c1e3ccSAndroid Build Coastguard Worker 
aom_dc_left_predictor_16x32_sse2(uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left)631*77c1e3ccSAndroid Build Coastguard Worker void aom_dc_left_predictor_16x32_sse2(uint8_t *dst, ptrdiff_t stride,
632*77c1e3ccSAndroid Build Coastguard Worker                                       const uint8_t *above,
633*77c1e3ccSAndroid Build Coastguard Worker                                       const uint8_t *left) {
634*77c1e3ccSAndroid Build Coastguard Worker   (void)above;
635*77c1e3ccSAndroid Build Coastguard Worker   __m128i sum_left = dc_sum_32_sse2(left);
636*77c1e3ccSAndroid Build Coastguard Worker   const __m128i sixteen = _mm_set1_epi16(16);
637*77c1e3ccSAndroid Build Coastguard Worker   sum_left = _mm_add_epi16(sum_left, sixteen);
638*77c1e3ccSAndroid Build Coastguard Worker   sum_left = _mm_srai_epi16(sum_left, 5);
639*77c1e3ccSAndroid Build Coastguard Worker   sum_left = _mm_unpacklo_epi8(sum_left, sum_left);
640*77c1e3ccSAndroid Build Coastguard Worker   sum_left = _mm_shufflelo_epi16(sum_left, 0);
641*77c1e3ccSAndroid Build Coastguard Worker   const __m128i row = _mm_unpacklo_epi64(sum_left, sum_left);
642*77c1e3ccSAndroid Build Coastguard Worker   dc_store_16xh(&row, 32, dst, stride);
643*77c1e3ccSAndroid Build Coastguard Worker }
644*77c1e3ccSAndroid Build Coastguard Worker 
645*77c1e3ccSAndroid Build Coastguard Worker #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
aom_dc_left_predictor_16x64_sse2(uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left)646*77c1e3ccSAndroid Build Coastguard Worker void aom_dc_left_predictor_16x64_sse2(uint8_t *dst, ptrdiff_t stride,
647*77c1e3ccSAndroid Build Coastguard Worker                                       const uint8_t *above,
648*77c1e3ccSAndroid Build Coastguard Worker                                       const uint8_t *left) {
649*77c1e3ccSAndroid Build Coastguard Worker   (void)above;
650*77c1e3ccSAndroid Build Coastguard Worker   __m128i sum_left = dc_sum_64(left);
651*77c1e3ccSAndroid Build Coastguard Worker   const __m128i thirtytwo = _mm_set1_epi16(32);
652*77c1e3ccSAndroid Build Coastguard Worker   sum_left = _mm_add_epi16(sum_left, thirtytwo);
653*77c1e3ccSAndroid Build Coastguard Worker   sum_left = _mm_srai_epi16(sum_left, 6);
654*77c1e3ccSAndroid Build Coastguard Worker   sum_left = _mm_unpacklo_epi8(sum_left, sum_left);
655*77c1e3ccSAndroid Build Coastguard Worker   sum_left = _mm_shufflelo_epi16(sum_left, 0);
656*77c1e3ccSAndroid Build Coastguard Worker   const __m128i row = _mm_unpacklo_epi64(sum_left, sum_left);
657*77c1e3ccSAndroid Build Coastguard Worker   dc_store_16xh(&row, 64, dst, stride);
658*77c1e3ccSAndroid Build Coastguard Worker }
659*77c1e3ccSAndroid Build Coastguard Worker 
aom_dc_left_predictor_32x8_sse2(uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left)660*77c1e3ccSAndroid Build Coastguard Worker void aom_dc_left_predictor_32x8_sse2(uint8_t *dst, ptrdiff_t stride,
661*77c1e3ccSAndroid Build Coastguard Worker                                      const uint8_t *above,
662*77c1e3ccSAndroid Build Coastguard Worker                                      const uint8_t *left) {
663*77c1e3ccSAndroid Build Coastguard Worker   (void)above;
664*77c1e3ccSAndroid Build Coastguard Worker   __m128i sum_left = dc_sum_8(left);
665*77c1e3ccSAndroid Build Coastguard Worker   const __m128i four = _mm_set1_epi16(4);
666*77c1e3ccSAndroid Build Coastguard Worker   sum_left = _mm_add_epi16(sum_left, four);
667*77c1e3ccSAndroid Build Coastguard Worker   sum_left = _mm_srai_epi16(sum_left, 3);
668*77c1e3ccSAndroid Build Coastguard Worker   sum_left = _mm_unpacklo_epi8(sum_left, sum_left);
669*77c1e3ccSAndroid Build Coastguard Worker   sum_left = _mm_shufflelo_epi16(sum_left, 0);
670*77c1e3ccSAndroid Build Coastguard Worker   const __m128i row = _mm_unpacklo_epi64(sum_left, sum_left);
671*77c1e3ccSAndroid Build Coastguard Worker   dc_store_32xh(&row, 8, dst, stride);
672*77c1e3ccSAndroid Build Coastguard Worker }
673*77c1e3ccSAndroid Build Coastguard Worker #endif  // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
674*77c1e3ccSAndroid Build Coastguard Worker 
aom_dc_left_predictor_32x16_sse2(uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left)675*77c1e3ccSAndroid Build Coastguard Worker void aom_dc_left_predictor_32x16_sse2(uint8_t *dst, ptrdiff_t stride,
676*77c1e3ccSAndroid Build Coastguard Worker                                       const uint8_t *above,
677*77c1e3ccSAndroid Build Coastguard Worker                                       const uint8_t *left) {
678*77c1e3ccSAndroid Build Coastguard Worker   (void)above;
679*77c1e3ccSAndroid Build Coastguard Worker   __m128i sum_left = dc_sum_16_sse2(left);
680*77c1e3ccSAndroid Build Coastguard Worker   const __m128i eight = _mm_set1_epi16(8);
681*77c1e3ccSAndroid Build Coastguard Worker   sum_left = _mm_add_epi16(sum_left, eight);
682*77c1e3ccSAndroid Build Coastguard Worker   sum_left = _mm_srai_epi16(sum_left, 4);
683*77c1e3ccSAndroid Build Coastguard Worker   sum_left = _mm_unpacklo_epi8(sum_left, sum_left);
684*77c1e3ccSAndroid Build Coastguard Worker   sum_left = _mm_shufflelo_epi16(sum_left, 0);
685*77c1e3ccSAndroid Build Coastguard Worker   const __m128i row = _mm_unpacklo_epi64(sum_left, sum_left);
686*77c1e3ccSAndroid Build Coastguard Worker   dc_store_32xh(&row, 16, dst, stride);
687*77c1e3ccSAndroid Build Coastguard Worker }
688*77c1e3ccSAndroid Build Coastguard Worker 
aom_dc_left_predictor_32x64_sse2(uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left)689*77c1e3ccSAndroid Build Coastguard Worker void aom_dc_left_predictor_32x64_sse2(uint8_t *dst, ptrdiff_t stride,
690*77c1e3ccSAndroid Build Coastguard Worker                                       const uint8_t *above,
691*77c1e3ccSAndroid Build Coastguard Worker                                       const uint8_t *left) {
692*77c1e3ccSAndroid Build Coastguard Worker   (void)above;
693*77c1e3ccSAndroid Build Coastguard Worker   __m128i sum_left = dc_sum_64(left);
694*77c1e3ccSAndroid Build Coastguard Worker   const __m128i thirtytwo = _mm_set1_epi16(32);
695*77c1e3ccSAndroid Build Coastguard Worker   sum_left = _mm_add_epi16(sum_left, thirtytwo);
696*77c1e3ccSAndroid Build Coastguard Worker   sum_left = _mm_srai_epi16(sum_left, 6);
697*77c1e3ccSAndroid Build Coastguard Worker   sum_left = _mm_unpacklo_epi8(sum_left, sum_left);
698*77c1e3ccSAndroid Build Coastguard Worker   sum_left = _mm_shufflelo_epi16(sum_left, 0);
699*77c1e3ccSAndroid Build Coastguard Worker   const __m128i row = _mm_unpacklo_epi64(sum_left, sum_left);
700*77c1e3ccSAndroid Build Coastguard Worker   dc_store_32xh(&row, 64, dst, stride);
701*77c1e3ccSAndroid Build Coastguard Worker }
702*77c1e3ccSAndroid Build Coastguard Worker 
aom_dc_left_predictor_64x64_sse2(uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left)703*77c1e3ccSAndroid Build Coastguard Worker void aom_dc_left_predictor_64x64_sse2(uint8_t *dst, ptrdiff_t stride,
704*77c1e3ccSAndroid Build Coastguard Worker                                       const uint8_t *above,
705*77c1e3ccSAndroid Build Coastguard Worker                                       const uint8_t *left) {
706*77c1e3ccSAndroid Build Coastguard Worker   (void)above;
707*77c1e3ccSAndroid Build Coastguard Worker   __m128i sum_left = dc_sum_64(left);
708*77c1e3ccSAndroid Build Coastguard Worker   const __m128i thirtytwo = _mm_set1_epi16(32);
709*77c1e3ccSAndroid Build Coastguard Worker   sum_left = _mm_add_epi16(sum_left, thirtytwo);
710*77c1e3ccSAndroid Build Coastguard Worker   sum_left = _mm_srai_epi16(sum_left, 6);
711*77c1e3ccSAndroid Build Coastguard Worker   sum_left = _mm_unpacklo_epi8(sum_left, sum_left);
712*77c1e3ccSAndroid Build Coastguard Worker   sum_left = _mm_shufflelo_epi16(sum_left, 0);
713*77c1e3ccSAndroid Build Coastguard Worker   const __m128i row = _mm_unpacklo_epi64(sum_left, sum_left);
714*77c1e3ccSAndroid Build Coastguard Worker   dc_store_64xh(&row, 64, dst, stride);
715*77c1e3ccSAndroid Build Coastguard Worker }
716*77c1e3ccSAndroid Build Coastguard Worker 
aom_dc_left_predictor_64x32_sse2(uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left)717*77c1e3ccSAndroid Build Coastguard Worker void aom_dc_left_predictor_64x32_sse2(uint8_t *dst, ptrdiff_t stride,
718*77c1e3ccSAndroid Build Coastguard Worker                                       const uint8_t *above,
719*77c1e3ccSAndroid Build Coastguard Worker                                       const uint8_t *left) {
720*77c1e3ccSAndroid Build Coastguard Worker   (void)above;
721*77c1e3ccSAndroid Build Coastguard Worker   __m128i sum_left = dc_sum_32_sse2(left);
722*77c1e3ccSAndroid Build Coastguard Worker   const __m128i sixteen = _mm_set1_epi16(16);
723*77c1e3ccSAndroid Build Coastguard Worker   sum_left = _mm_add_epi16(sum_left, sixteen);
724*77c1e3ccSAndroid Build Coastguard Worker   sum_left = _mm_srai_epi16(sum_left, 5);
725*77c1e3ccSAndroid Build Coastguard Worker   sum_left = _mm_unpacklo_epi8(sum_left, sum_left);
726*77c1e3ccSAndroid Build Coastguard Worker   sum_left = _mm_shufflelo_epi16(sum_left, 0);
727*77c1e3ccSAndroid Build Coastguard Worker   const __m128i row = _mm_unpacklo_epi64(sum_left, sum_left);
728*77c1e3ccSAndroid Build Coastguard Worker   dc_store_64xh(&row, 32, dst, stride);
729*77c1e3ccSAndroid Build Coastguard Worker }
730*77c1e3ccSAndroid Build Coastguard Worker 
731*77c1e3ccSAndroid Build Coastguard Worker #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
aom_dc_left_predictor_64x16_sse2(uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left)732*77c1e3ccSAndroid Build Coastguard Worker void aom_dc_left_predictor_64x16_sse2(uint8_t *dst, ptrdiff_t stride,
733*77c1e3ccSAndroid Build Coastguard Worker                                       const uint8_t *above,
734*77c1e3ccSAndroid Build Coastguard Worker                                       const uint8_t *left) {
735*77c1e3ccSAndroid Build Coastguard Worker   (void)above;
736*77c1e3ccSAndroid Build Coastguard Worker   __m128i sum_left = dc_sum_16_sse2(left);
737*77c1e3ccSAndroid Build Coastguard Worker   const __m128i eight = _mm_set1_epi16(8);
738*77c1e3ccSAndroid Build Coastguard Worker   sum_left = _mm_add_epi16(sum_left, eight);
739*77c1e3ccSAndroid Build Coastguard Worker   sum_left = _mm_srai_epi16(sum_left, 4);
740*77c1e3ccSAndroid Build Coastguard Worker   sum_left = _mm_unpacklo_epi8(sum_left, sum_left);
741*77c1e3ccSAndroid Build Coastguard Worker   sum_left = _mm_shufflelo_epi16(sum_left, 0);
742*77c1e3ccSAndroid Build Coastguard Worker   const __m128i row = _mm_unpacklo_epi64(sum_left, sum_left);
743*77c1e3ccSAndroid Build Coastguard Worker   dc_store_64xh(&row, 16, dst, stride);
744*77c1e3ccSAndroid Build Coastguard Worker }
745*77c1e3ccSAndroid Build Coastguard Worker #endif  // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
746*77c1e3ccSAndroid Build Coastguard Worker 
747*77c1e3ccSAndroid Build Coastguard Worker // -----------------------------------------------------------------------------
748*77c1e3ccSAndroid Build Coastguard Worker // DC_128
749*77c1e3ccSAndroid Build Coastguard Worker 
// DC_128 predictor for a 4x8 block: hands four bytes of value 0x80 (128),
// packed into one 32-bit word, to dc_store_4xh with height 8.
// Neither `above` nor `left` is read.
void aom_dc_128_predictor_4x8_sse2(uint8_t *dst, ptrdiff_t stride,
                                   const uint8_t *above, const uint8_t *left) {
  (void)left;
  (void)above;
  const uint32_t four_mid_gray = 0x80808080u;
  dc_store_4xh(four_mid_gray, 8, dst, stride);
}
757*77c1e3ccSAndroid Build Coastguard Worker 
758*77c1e3ccSAndroid Build Coastguard Worker #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
// DC_128 predictor for a 4x16 block: hands four bytes of value 0x80 (128),
// packed into one 32-bit word, to dc_store_4xh with height 16.
// Neither `above` nor `left` is read.
void aom_dc_128_predictor_4x16_sse2(uint8_t *dst, ptrdiff_t stride,
                                    const uint8_t *above, const uint8_t *left) {
  (void)left;
  (void)above;
  const uint32_t four_mid_gray = 0x80808080u;
  dc_store_4xh(four_mid_gray, 16, dst, stride);
}
766*77c1e3ccSAndroid Build Coastguard Worker #endif  // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
767*77c1e3ccSAndroid Build Coastguard Worker 
// DC_128 predictor for an 8x4 block: builds a vector whose bytes are all 128
// and passes it to dc_store_8xh with height 4.
// Neither `above` nor `left` is read.
void aom_dc_128_predictor_8x4_sse2(uint8_t *dst, ptrdiff_t stride,
                                   const uint8_t *above, const uint8_t *left) {
  (void)left;
  (void)above;
  const __m128i mid_gray = _mm_set1_epi8((int8_t)128);
  dc_store_8xh(&mid_gray, 4, dst, stride);
}
775*77c1e3ccSAndroid Build Coastguard Worker 
// DC_128 predictor for an 8x16 block: builds a vector whose bytes are all 128
// and passes it to dc_store_8xh with height 16.
// Neither `above` nor `left` is read.
void aom_dc_128_predictor_8x16_sse2(uint8_t *dst, ptrdiff_t stride,
                                    const uint8_t *above, const uint8_t *left) {
  (void)left;
  (void)above;
  const __m128i mid_gray = _mm_set1_epi8((int8_t)128);
  dc_store_8xh(&mid_gray, 16, dst, stride);
}
783*77c1e3ccSAndroid Build Coastguard Worker 
784*77c1e3ccSAndroid Build Coastguard Worker #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
// DC_128 predictor for an 8x32 block: builds a vector whose bytes are all 128
// and passes it to dc_store_8xh with height 32.
// Neither `above` nor `left` is read.
void aom_dc_128_predictor_8x32_sse2(uint8_t *dst, ptrdiff_t stride,
                                    const uint8_t *above, const uint8_t *left) {
  (void)left;
  (void)above;
  const __m128i mid_gray = _mm_set1_epi8((int8_t)128);
  dc_store_8xh(&mid_gray, 32, dst, stride);
}
792*77c1e3ccSAndroid Build Coastguard Worker 
// DC_128 predictor for a 16x4 block: builds a vector whose bytes are all 128
// and passes it to dc_store_16xh with height 4.
// Neither `above` nor `left` is read.
void aom_dc_128_predictor_16x4_sse2(uint8_t *dst, ptrdiff_t stride,
                                    const uint8_t *above, const uint8_t *left) {
  (void)left;
  (void)above;
  const __m128i mid_gray = _mm_set1_epi8((int8_t)128);
  dc_store_16xh(&mid_gray, 4, dst, stride);
}
800*77c1e3ccSAndroid Build Coastguard Worker #endif  // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
801*77c1e3ccSAndroid Build Coastguard Worker 
// DC_128 predictor for a 16x8 block: builds a vector whose bytes are all 128
// and passes it to dc_store_16xh with height 8.
// Neither `above` nor `left` is read.
void aom_dc_128_predictor_16x8_sse2(uint8_t *dst, ptrdiff_t stride,
                                    const uint8_t *above, const uint8_t *left) {
  (void)left;
  (void)above;
  const __m128i mid_gray = _mm_set1_epi8((int8_t)128);
  dc_store_16xh(&mid_gray, 8, dst, stride);
}
809*77c1e3ccSAndroid Build Coastguard Worker 
// DC_128 predictor for a 16x32 block: builds a vector whose bytes are all 128
// and passes it to dc_store_16xh with height 32.
// Neither `above` nor `left` is read.
void aom_dc_128_predictor_16x32_sse2(uint8_t *dst, ptrdiff_t stride,
                                     const uint8_t *above,
                                     const uint8_t *left) {
  (void)left;
  (void)above;
  const __m128i mid_gray = _mm_set1_epi8((int8_t)128);
  dc_store_16xh(&mid_gray, 32, dst, stride);
}
818*77c1e3ccSAndroid Build Coastguard Worker 
819*77c1e3ccSAndroid Build Coastguard Worker #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
// DC_128 predictor for a 16x64 block: builds a vector whose bytes are all 128
// and passes it to dc_store_16xh with height 64.
// Neither `above` nor `left` is read.
void aom_dc_128_predictor_16x64_sse2(uint8_t *dst, ptrdiff_t stride,
                                     const uint8_t *above,
                                     const uint8_t *left) {
  (void)left;
  (void)above;
  const __m128i mid_gray = _mm_set1_epi8((int8_t)128);
  dc_store_16xh(&mid_gray, 64, dst, stride);
}
828*77c1e3ccSAndroid Build Coastguard Worker 
// DC_128 predictor for a 32x8 block: builds a vector whose bytes are all 128
// and passes it to dc_store_32xh with height 8.
// Neither `above` nor `left` is read.
void aom_dc_128_predictor_32x8_sse2(uint8_t *dst, ptrdiff_t stride,
                                    const uint8_t *above, const uint8_t *left) {
  (void)left;
  (void)above;
  const __m128i mid_gray = _mm_set1_epi8((int8_t)128);
  dc_store_32xh(&mid_gray, 8, dst, stride);
}
836*77c1e3ccSAndroid Build Coastguard Worker #endif  // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
837*77c1e3ccSAndroid Build Coastguard Worker 
// DC_128 predictor for a 32x16 block: builds a vector whose bytes are all 128
// and passes it to dc_store_32xh with height 16.
// Neither `above` nor `left` is read.
void aom_dc_128_predictor_32x16_sse2(uint8_t *dst, ptrdiff_t stride,
                                     const uint8_t *above,
                                     const uint8_t *left) {
  (void)left;
  (void)above;
  const __m128i mid_gray = _mm_set1_epi8((int8_t)128);
  dc_store_32xh(&mid_gray, 16, dst, stride);
}
846*77c1e3ccSAndroid Build Coastguard Worker 
// DC_128 predictor for a 32x64 block: builds a vector whose bytes are all 128
// and passes it to dc_store_32xh with height 64.
// Neither `above` nor `left` is read.
void aom_dc_128_predictor_32x64_sse2(uint8_t *dst, ptrdiff_t stride,
                                     const uint8_t *above,
                                     const uint8_t *left) {
  (void)left;
  (void)above;
  const __m128i mid_gray = _mm_set1_epi8((int8_t)128);
  dc_store_32xh(&mid_gray, 64, dst, stride);
}
855*77c1e3ccSAndroid Build Coastguard Worker 
// DC_128 predictor for a 64x64 block: builds a vector whose bytes are all 128
// and passes it to dc_store_64xh with height 64.
// Neither `above` nor `left` is read.
void aom_dc_128_predictor_64x64_sse2(uint8_t *dst, ptrdiff_t stride,
                                     const uint8_t *above,
                                     const uint8_t *left) {
  (void)left;
  (void)above;
  const __m128i mid_gray = _mm_set1_epi8((int8_t)128);
  dc_store_64xh(&mid_gray, 64, dst, stride);
}
864*77c1e3ccSAndroid Build Coastguard Worker 
// DC_128 predictor for a 64x32 block: builds a vector whose bytes are all 128
// and passes it to dc_store_64xh with height 32.
// Neither `above` nor `left` is read.
void aom_dc_128_predictor_64x32_sse2(uint8_t *dst, ptrdiff_t stride,
                                     const uint8_t *above,
                                     const uint8_t *left) {
  (void)left;
  (void)above;
  const __m128i mid_gray = _mm_set1_epi8((int8_t)128);
  dc_store_64xh(&mid_gray, 32, dst, stride);
}
873*77c1e3ccSAndroid Build Coastguard Worker 
874*77c1e3ccSAndroid Build Coastguard Worker #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
// DC_128 predictor for a 64x16 block: builds a vector whose bytes are all 128
// and passes it to dc_store_64xh with height 16.
// Neither `above` nor `left` is read.
void aom_dc_128_predictor_64x16_sse2(uint8_t *dst, ptrdiff_t stride,
                                     const uint8_t *above,
                                     const uint8_t *left) {
  (void)left;
  (void)above;
  const __m128i mid_gray = _mm_set1_epi8((int8_t)128);
  dc_store_64xh(&mid_gray, 16, dst, stride);
}
883*77c1e3ccSAndroid Build Coastguard Worker #endif  // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
884*77c1e3ccSAndroid Build Coastguard Worker 
885*77c1e3ccSAndroid Build Coastguard Worker // -----------------------------------------------------------------------------
886*77c1e3ccSAndroid Build Coastguard Worker // V_PRED
887*77c1e3ccSAndroid Build Coastguard Worker 
// V_PRED for a 4x8 block: the 4 bytes at `above` are read as one 32-bit word
// and passed to dc_store_4xh with height 8. `left` is not used.
void aom_v_predictor_4x8_sse2(uint8_t *dst, ptrdiff_t stride,
                              const uint8_t *above, const uint8_t *left) {
  (void)left;
  // Keep the const qualifier in the cast: `above` is read-only input and the
  // previous `(uint32_t *)` cast silently discarded it.
  const uint32_t pred = *(const uint32_t *)above;
  dc_store_4xh(pred, 8, dst, stride);
}
894*77c1e3ccSAndroid Build Coastguard Worker 
895*77c1e3ccSAndroid Build Coastguard Worker #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
// V_PRED for a 4x16 block: the 4 bytes at `above` are read as one 32-bit word
// and passed to dc_store_4xh with height 16. `left` is not used.
void aom_v_predictor_4x16_sse2(uint8_t *dst, ptrdiff_t stride,
                               const uint8_t *above, const uint8_t *left) {
  (void)left;
  // Keep the const qualifier in the cast: `above` is read-only input and the
  // previous `(uint32_t *)` cast silently discarded it.
  const uint32_t pred = *(const uint32_t *)above;
  dc_store_4xh(pred, 16, dst, stride);
}
902*77c1e3ccSAndroid Build Coastguard Worker #endif  // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
903*77c1e3ccSAndroid Build Coastguard Worker 
// V_PRED for an 8x4 block: loads the 8 bytes at `above` into the low half of
// a vector and passes it to dc_store_8xh with height 4. `left` is not used.
void aom_v_predictor_8x4_sse2(uint8_t *dst, ptrdiff_t stride,
                              const uint8_t *above, const uint8_t *left) {
  (void)left;
  const __m128i top_row = _mm_loadl_epi64((__m128i const *)above);
  dc_store_8xh(&top_row, 4, dst, stride);
}
910*77c1e3ccSAndroid Build Coastguard Worker 
// V_PRED for an 8x16 block: loads the 8 bytes at `above` into the low half of
// a vector and passes it to dc_store_8xh with height 16. `left` is not used.
void aom_v_predictor_8x16_sse2(uint8_t *dst, ptrdiff_t stride,
                               const uint8_t *above, const uint8_t *left) {
  (void)left;
  const __m128i top_row = _mm_loadl_epi64((__m128i const *)above);
  dc_store_8xh(&top_row, 16, dst, stride);
}
917*77c1e3ccSAndroid Build Coastguard Worker 
918*77c1e3ccSAndroid Build Coastguard Worker #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
// V_PRED for an 8x32 block: loads the 8 bytes at `above` into the low half of
// a vector and passes it to dc_store_8xh with height 32. `left` is not used.
void aom_v_predictor_8x32_sse2(uint8_t *dst, ptrdiff_t stride,
                               const uint8_t *above, const uint8_t *left) {
  (void)left;
  const __m128i top_row = _mm_loadl_epi64((__m128i const *)above);
  dc_store_8xh(&top_row, 32, dst, stride);
}
925*77c1e3ccSAndroid Build Coastguard Worker 
// V_PRED for a 16x4 block: loads the 16 bytes at `above` and passes them to
// dc_store_16xh with height 4. `left` is not used.
// The load is the aligned variant, so `above` must be 16-byte aligned.
void aom_v_predictor_16x4_sse2(uint8_t *dst, ptrdiff_t stride,
                               const uint8_t *above, const uint8_t *left) {
  (void)left;
  const __m128i top_row = _mm_load_si128((__m128i const *)above);
  dc_store_16xh(&top_row, 4, dst, stride);
}
932*77c1e3ccSAndroid Build Coastguard Worker #endif  // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
933*77c1e3ccSAndroid Build Coastguard Worker 
// V_PRED for a 16x8 block: loads the 16 bytes at `above` and passes them to
// dc_store_16xh with height 8. `left` is not used.
// The load is the aligned variant, so `above` must be 16-byte aligned.
void aom_v_predictor_16x8_sse2(uint8_t *dst, ptrdiff_t stride,
                               const uint8_t *above, const uint8_t *left) {
  (void)left;
  const __m128i top_row = _mm_load_si128((__m128i const *)above);
  dc_store_16xh(&top_row, 8, dst, stride);
}
940*77c1e3ccSAndroid Build Coastguard Worker 
// V_PRED for a 16x32 block: loads the 16 bytes at `above` and passes them to
// dc_store_16xh with height 32. `left` is not used.
// The load is the aligned variant, so `above` must be 16-byte aligned.
void aom_v_predictor_16x32_sse2(uint8_t *dst, ptrdiff_t stride,
                                const uint8_t *above, const uint8_t *left) {
  (void)left;
  const __m128i top_row = _mm_load_si128((__m128i const *)above);
  dc_store_16xh(&top_row, 32, dst, stride);
}
947*77c1e3ccSAndroid Build Coastguard Worker 
948*77c1e3ccSAndroid Build Coastguard Worker #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
// V_PRED for a 16x64 block: loads the 16 bytes at `above` and passes them to
// dc_store_16xh with height 64. `left` is not used.
// The load is the aligned variant, so `above` must be 16-byte aligned.
void aom_v_predictor_16x64_sse2(uint8_t *dst, ptrdiff_t stride,
                                const uint8_t *above, const uint8_t *left) {
  (void)left;
  const __m128i top_row = _mm_load_si128((__m128i const *)above);
  dc_store_16xh(&top_row, 64, dst, stride);
}
955*77c1e3ccSAndroid Build Coastguard Worker #endif  // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
956*77c1e3ccSAndroid Build Coastguard Worker 
// Copies the 32 bytes at `above` into each of `height` rows of `dst`, with
// consecutive rows `stride` bytes apart. Does nothing when height <= 0.
// Both loads and stores are the aligned variants, so `above` and every row
// start in `dst` must be 16-byte aligned.
static inline void v_predictor_32xh(uint8_t *dst, ptrdiff_t stride,
                                    const uint8_t *above, int height) {
  const __m128i *const src = (__m128i const *)above;
  const __m128i first16 = _mm_load_si128(src);
  const __m128i second16 = _mm_load_si128(src + 1);
  for (int rows_left = height; rows_left > 0; --rows_left) {
    __m128i *const out = (__m128i *)dst;
    _mm_store_si128(out, first16);
    _mm_store_si128(out + 1, second16);
    dst += stride;
  }
}
967*77c1e3ccSAndroid Build Coastguard Worker 
968*77c1e3ccSAndroid Build Coastguard Worker #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
// V_PRED for a 32x8 block: forwards to v_predictor_32xh with height 8.
// `left` is not used.
void aom_v_predictor_32x8_sse2(uint8_t *dst, ptrdiff_t stride,
                               const uint8_t *above, const uint8_t *left) {
  const int block_height = 8;
  (void)left;
  v_predictor_32xh(dst, stride, above, block_height);
}
974*77c1e3ccSAndroid Build Coastguard Worker #endif  // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
975*77c1e3ccSAndroid Build Coastguard Worker 
// V_PRED for a 32x16 block: forwards to v_predictor_32xh with height 16.
// `left` is not used.
void aom_v_predictor_32x16_sse2(uint8_t *dst, ptrdiff_t stride,
                                const uint8_t *above, const uint8_t *left) {
  const int block_height = 16;
  (void)left;
  v_predictor_32xh(dst, stride, above, block_height);
}
981*77c1e3ccSAndroid Build Coastguard Worker 
// V_PRED for a 32x64 block: forwards to v_predictor_32xh with height 64.
// `left` is not used.
void aom_v_predictor_32x64_sse2(uint8_t *dst, ptrdiff_t stride,
                                const uint8_t *above, const uint8_t *left) {
  const int block_height = 64;
  (void)left;
  v_predictor_32xh(dst, stride, above, block_height);
}
987*77c1e3ccSAndroid Build Coastguard Worker 
// Copies the 64 bytes at `above` into each of `height` rows of `dst`, with
// consecutive rows `stride` bytes apart. Does nothing when height <= 0.
// Both loads and stores are the aligned variants, so `above` and every row
// start in `dst` must be 16-byte aligned.
static inline void v_predictor_64xh(uint8_t *dst, ptrdiff_t stride,
                                    const uint8_t *above, int height) {
  const __m128i *const src = (__m128i const *)above;
  const __m128i q0 = _mm_load_si128(src);
  const __m128i q1 = _mm_load_si128(src + 1);
  const __m128i q2 = _mm_load_si128(src + 2);
  const __m128i q3 = _mm_load_si128(src + 3);
  for (int rows_left = height; rows_left > 0; --rows_left) {
    __m128i *const out = (__m128i *)dst;
    _mm_store_si128(out, q0);
    _mm_store_si128(out + 1, q1);
    _mm_store_si128(out + 2, q2);
    _mm_store_si128(out + 3, q3);
    dst += stride;
  }
}
1002*77c1e3ccSAndroid Build Coastguard Worker 
// V_PRED for a 64x64 block: forwards to v_predictor_64xh with height 64.
// `left` is not used.
void aom_v_predictor_64x64_sse2(uint8_t *dst, ptrdiff_t stride,
                                const uint8_t *above, const uint8_t *left) {
  const int block_height = 64;
  (void)left;
  v_predictor_64xh(dst, stride, above, block_height);
}
1008*77c1e3ccSAndroid Build Coastguard Worker 
// V_PRED for a 64x32 block: forwards to v_predictor_64xh with height 32.
// `left` is not used.
void aom_v_predictor_64x32_sse2(uint8_t *dst, ptrdiff_t stride,
                                const uint8_t *above, const uint8_t *left) {
  const int block_height = 32;
  (void)left;
  v_predictor_64xh(dst, stride, above, block_height);
}
1014*77c1e3ccSAndroid Build Coastguard Worker 
1015*77c1e3ccSAndroid Build Coastguard Worker #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
// V_PRED for a 64x16 block: forwards to v_predictor_64xh with height 16.
// `left` is not used.
void aom_v_predictor_64x16_sse2(uint8_t *dst, ptrdiff_t stride,
                                const uint8_t *above, const uint8_t *left) {
  const int block_height = 16;
  (void)left;
  v_predictor_64xh(dst, stride, above, block_height);
}
1021*77c1e3ccSAndroid Build Coastguard Worker #endif  // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
1022*77c1e3ccSAndroid Build Coastguard Worker 
1023*77c1e3ccSAndroid Build Coastguard Worker // -----------------------------------------------------------------------------
1024*77c1e3ccSAndroid Build Coastguard Worker // H_PRED
1025*77c1e3ccSAndroid Build Coastguard Worker 
// H_PRED for a 4x8 block: output row r (r = 0..7) is left[r] repeated across
// that row's 4 bytes. Rows are `stride` bytes apart. `above` is not used.
// Each row is written with a single 32-bit store through an int pointer.
void aom_h_predictor_4x8_sse2(uint8_t *dst, ptrdiff_t stride,
                              const uint8_t *above, const uint8_t *left) {
  (void)above;
  // Load left[0..7] and double every byte, so 16-bit lane r holds two copies
  // of left[r].
  const __m128i left8 = _mm_loadl_epi64((__m128i const *)left);
  const __m128i rows_0_3 = _mm_unpacklo_epi8(left8, left8);
  // Move lanes 4..7 down to lanes 0..3 so the same low-half shuffles apply.
  const __m128i rows_4_7 = _mm_unpackhi_epi64(rows_0_3, rows_0_3);

  // Each shuffle broadcasts one 16-bit lane across the low 64 bits; the low
  // 32 bits taken from it are therefore four copies of one left pixel.
  const int row_word[8] = {
    _mm_cvtsi128_si32(_mm_shufflelo_epi16(rows_0_3, 0x00)),
    _mm_cvtsi128_si32(_mm_shufflelo_epi16(rows_0_3, 0x55)),
    _mm_cvtsi128_si32(_mm_shufflelo_epi16(rows_0_3, 0xaa)),
    _mm_cvtsi128_si32(_mm_shufflelo_epi16(rows_0_3, 0xff)),
    _mm_cvtsi128_si32(_mm_shufflelo_epi16(rows_4_7, 0x00)),
    _mm_cvtsi128_si32(_mm_shufflelo_epi16(rows_4_7, 0x55)),
    _mm_cvtsi128_si32(_mm_shufflelo_epi16(rows_4_7, 0xaa)),
    _mm_cvtsi128_si32(_mm_shufflelo_epi16(rows_4_7, 0xff)),
  };
  for (int r = 0; r < 8; ++r) {
    *(int *)(dst + r * stride) = row_word[r];
  }
}
1056*77c1e3ccSAndroid Build Coastguard Worker 
#if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
// Horizontal intra predictor for a 4-wide, 16-tall block: all four pixels of
// output row r are set to left[r]. `above` is not used. `left` is read with
// an aligned 128-bit load, so it must be 16-byte aligned.
void aom_h_predictor_4x16_sse2(uint8_t *dst, ptrdiff_t stride,
                               const uint8_t *above, const uint8_t *left) {
  (void)above;
  const __m128i left16 = _mm_load_si128((const __m128i *)left);
  // Double every pixel: `lo` covers pixels 0..7, `hi` covers pixels 8..15.
  const __m128i lo = _mm_unpacklo_epi8(left16, left16);
  const __m128i hi = _mm_unpackhi_epi8(left16, left16);
  // Each entry exposes four consecutive doubled pixels in its low 64 bits.
  const __m128i quad[4] = { lo, _mm_unpackhi_epi64(lo, lo), hi,
                            _mm_unpackhi_epi64(hi, hi) };

  for (int q = 0; q < 4; ++q) {
    const __m128i src = quad[q];
    uint8_t *const out = dst + q * 4 * stride;
    // The low 32 bits of each shuffle result are one pixel repeated 4 times.
    *(int *)out = _mm_cvtsi128_si32(_mm_shufflelo_epi16(src, 0x00));
    *(int *)(out + stride) = _mm_cvtsi128_si32(_mm_shufflelo_epi16(src, 0x55));
    *(int *)(out + 2 * stride) =
        _mm_cvtsi128_si32(_mm_shufflelo_epi16(src, 0xaa));
    *(int *)(out + 3 * stride) =
        _mm_cvtsi128_si32(_mm_shufflelo_epi16(src, 0xff));
  }
}
#endif  // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
1119*77c1e3ccSAndroid Build Coastguard Worker 
// Horizontal intra predictor for an 8-wide, 4-tall block: all eight pixels of
// output row r are set to left[r]. `above` is not used. Note that 8 bytes are
// loaded from `left` even though only the first four contribute to the output.
void aom_h_predictor_8x4_sse2(uint8_t *dst, ptrdiff_t stride,
                              const uint8_t *above, const uint8_t *left) {
  (void)above;
  const __m128i left8 = _mm_loadl_epi64((const __m128i *)left);
  // 16-bit lane k now holds left[k] twice.
  const __m128i pairs = _mm_unpacklo_epi8(left8, left8);

  // Broadcast lane k over the low four lanes, then write those low 8 bytes.
  _mm_storel_epi64((__m128i *)dst, _mm_shufflelo_epi16(pairs, 0x00));
  _mm_storel_epi64((__m128i *)(dst + stride),
                   _mm_shufflelo_epi16(pairs, 0x55));
  _mm_storel_epi64((__m128i *)(dst + 2 * stride),
                   _mm_shufflelo_epi16(pairs, 0xaa));
  _mm_storel_epi64((__m128i *)(dst + 3 * stride),
                   _mm_shufflelo_epi16(pairs, 0xff));
}
1137*77c1e3ccSAndroid Build Coastguard Worker 
// Horizontal prediction for an 8-wide block that is 16 * count rows tall:
// all eight pixels of output row r are set to left[r]. `above` is not used.
// Each pass consumes 16 bytes of `left` with an aligned load, so `left` must
// be 16-byte aligned.
static inline void h_predictor_8x16xc(uint8_t *dst, ptrdiff_t stride,
                                      const uint8_t *above, const uint8_t *left,
                                      int count) {
  (void)above;
  for (int pass = 0; pass < count; ++pass, left += 16) {
    const __m128i left16 = _mm_load_si128((const __m128i *)left);
    // Double every pixel: `lo` covers pixels 0..7, `hi` covers pixels 8..15.
    const __m128i lo = _mm_unpacklo_epi8(left16, left16);
    const __m128i hi = _mm_unpackhi_epi8(left16, left16);
    // Each entry exposes four consecutive doubled pixels in its low 64 bits.
    const __m128i quad[4] = { lo, _mm_unpackhi_epi64(lo, lo), hi,
                              _mm_unpackhi_epi64(hi, hi) };

    for (int q = 0; q < 4; ++q) {
      const __m128i src = quad[q];
      // The low 8 bytes of each shuffle result are one pixel repeated.
      _mm_storel_epi64((__m128i *)dst, _mm_shufflelo_epi16(src, 0x00));
      dst += stride;
      _mm_storel_epi64((__m128i *)dst, _mm_shufflelo_epi16(src, 0x55));
      dst += stride;
      _mm_storel_epi64((__m128i *)dst, _mm_shufflelo_epi16(src, 0xaa));
      dst += stride;
      _mm_storel_epi64((__m128i *)dst, _mm_shufflelo_epi16(src, 0xff));
      dst += stride;
    }
  }
}
1203*77c1e3ccSAndroid Build Coastguard Worker 
// 8x16 horizontal intra predictor: a single 16-row pass of
// h_predictor_8x16xc().
void aom_h_predictor_8x16_sse2(uint8_t *dst, ptrdiff_t stride,
                               const uint8_t *above, const uint8_t *left) {
  const int passes_of_16_rows = 16 / 16;
  h_predictor_8x16xc(dst, stride, above, left, passes_of_16_rows);
}
1208*77c1e3ccSAndroid Build Coastguard Worker 
#if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
// 8x32 horizontal intra predictor: two consecutive 16-row passes of
// h_predictor_8x16xc().
void aom_h_predictor_8x32_sse2(uint8_t *dst, ptrdiff_t stride,
                               const uint8_t *above, const uint8_t *left) {
  const int passes_of_16_rows = 32 / 16;
  h_predictor_8x16xc(dst, stride, above, left, passes_of_16_rows);
}
#endif  // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
1215*77c1e3ccSAndroid Build Coastguard Worker 
// Writes row[0] .. row[h - 1] to h consecutive 16-byte rows starting at `dst`,
// advancing by `stride` bytes per row. The stores are aligned, so every row
// address must be 16-byte aligned. Nothing is written when h <= 0.
static inline void h_pred_store_16xh(const __m128i *row, int h, uint8_t *dst,
                                     ptrdiff_t stride) {
  for (int remaining = h; remaining > 0; --remaining) {
    _mm_store_si128((__m128i *)dst, *row);
    ++row;
    dst += stride;
  }
}
1224*77c1e3ccSAndroid Build Coastguard Worker 
// For k = 0..3, fills row[k] with eight copies of 16-bit lane k of *x (the
// four lanes in the low 64 bits). When each lane holds one pixel twice, row[k]
// becomes 16 copies of that pixel.
static inline void repeat_low_4pixels(const __m128i *x, __m128i *row) {
  const __m128i src = *x;
  // Broadcast the selected lane across the low 64 bits first ...
  const __m128i lane0 = _mm_shufflelo_epi16(src, 0x00);
  const __m128i lane1 = _mm_shufflelo_epi16(src, 0x55);
  const __m128i lane2 = _mm_shufflelo_epi16(src, 0xaa);
  const __m128i lane3 = _mm_shufflelo_epi16(src, 0xff);

  // ... then mirror the low 64 bits into the high 64 bits.
  row[0] = _mm_unpacklo_epi64(lane0, lane0);
  row[1] = _mm_unpacklo_epi64(lane1, lane1);
  row[2] = _mm_unpacklo_epi64(lane2, lane2);
  row[3] = _mm_unpacklo_epi64(lane3, lane3);
}
1236*77c1e3ccSAndroid Build Coastguard Worker 
// For k = 0..3, fills row[k] with eight copies of 16-bit lane 4 + k of *x (the
// four lanes in the high 64 bits). When each lane holds one pixel twice,
// row[k] becomes 16 copies of that pixel.
static inline void repeat_high_4pixels(const __m128i *x, __m128i *row) {
  const __m128i src = *x;
  // Broadcast the selected lane across the high 64 bits first ...
  const __m128i lane4 = _mm_shufflehi_epi16(src, 0x00);
  const __m128i lane5 = _mm_shufflehi_epi16(src, 0x55);
  const __m128i lane6 = _mm_shufflehi_epi16(src, 0xaa);
  const __m128i lane7 = _mm_shufflehi_epi16(src, 0xff);

  // ... then mirror the high 64 bits into the low 64 bits.
  row[0] = _mm_unpackhi_epi64(lane4, lane4);
  row[1] = _mm_unpackhi_epi64(lane5, lane5);
  row[2] = _mm_unpackhi_epi64(lane6, lane6);
  row[3] = _mm_unpackhi_epi64(lane7, lane7);
}
1248*77c1e3ccSAndroid Build Coastguard Worker 
// First half of a 16x8 block: writes rows 0..3, each 16 bytes wide.
// Only the low 8 bytes of *left are consumed; shown from the most significant
// byte down, the register is laid out as xxxxxxxx33221100.
static inline void h_prediction_16x8_1(const __m128i *left, uint8_t *dst,
                                       ptrdiff_t stride) {
  __m128i splat[4];  // splat[k] = pixel k replicated across 16 bytes
  repeat_low_4pixels(left, splat);
  h_pred_store_16xh(splat, 4, dst, stride);
}
1257*77c1e3ccSAndroid Build Coastguard Worker 
// Second half of a 16x8 block: writes four rows, each 16 bytes wide, starting
// at `dst`. Only the high 8 bytes of *left are consumed; shown from the most
// significant byte down, the register is laid out as 77665544xxxxxxxx.
static inline void h_prediction_16x8_2(const __m128i *left, uint8_t *dst,
                                       ptrdiff_t stride) {
  __m128i splat[4];  // splat[k] = pixel 4 + k replicated across 16 bytes
  repeat_high_4pixels(left, splat);
  h_pred_store_16xh(splat, 4, dst, stride);
}
1266*77c1e3ccSAndroid Build Coastguard Worker 
#if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
// Horizontal intra predictor for a 16-wide, 4-tall block. `above` is not
// used. 8 bytes are loaded from `left`; only the first four feed the output
// because just the low-half helper is invoked.
void aom_h_predictor_16x4_sse2(uint8_t *dst, ptrdiff_t stride,
                               const uint8_t *above, const uint8_t *left) {
  (void)above;
  const __m128i left8 = _mm_loadl_epi64((const __m128i *)left);
  // 16-bit lane k holds left[k] twice.
  const __m128i pairs = _mm_unpacklo_epi8(left8, left8);
  h_prediction_16x8_1(&pairs, dst, stride);
}
#endif  // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
1276*77c1e3ccSAndroid Build Coastguard Worker 
// Horizontal intra predictor for a 16-wide, 8-tall block: rows 0..3 come from
// the low half of the doubled left column, rows 4..7 from the high half.
// `above` is not used.
void aom_h_predictor_16x8_sse2(uint8_t *dst, ptrdiff_t stride,
                               const uint8_t *above, const uint8_t *left) {
  (void)above;
  const __m128i left8 = _mm_loadl_epi64((const __m128i *)left);
  // 16-bit lane k holds left[k] twice.
  const __m128i pairs = _mm_unpacklo_epi8(left8, left8);
  uint8_t *const lower_half = dst + (stride << 2);

  h_prediction_16x8_1(&pairs, dst, stride);
  h_prediction_16x8_2(&pairs, lower_half, stride);
}
1286*77c1e3ccSAndroid Build Coastguard Worker 
// Horizontal prediction for a 16-wide block, produced in passes of 16 rows.
// Each pass does an aligned 16-byte load from `left` (so `left` must be
// 16-byte aligned) and writes 16 rows. The loop is bottom-tested: one pass
// always runs, and passes continue while fewer than `count` have completed.
static inline void h_predictor_16xh(uint8_t *dst, ptrdiff_t stride,
                                    const uint8_t *left, int count) {
  const ptrdiff_t four_rows = stride << 2;
  int passes_done = 0;
  do {
    const __m128i left16 = _mm_load_si128((const __m128i *)left);
    // Doubled pixels 0..7 and 8..15 of this pass.
    const __m128i pairs_lo = _mm_unpacklo_epi8(left16, left16);
    const __m128i pairs_hi = _mm_unpackhi_epi8(left16, left16);

    h_prediction_16x8_1(&pairs_lo, dst, stride);
    h_prediction_16x8_2(&pairs_lo, dst + four_rows, stride);
    h_prediction_16x8_1(&pairs_hi, dst + 2 * four_rows, stride);
    h_prediction_16x8_2(&pairs_hi, dst + 3 * four_rows, stride);

    dst += 4 * four_rows;
    left += 16;
  } while (++passes_done < count);
}
1308*77c1e3ccSAndroid Build Coastguard Worker 
// 16x32 horizontal intra predictor: two 16-row passes of h_predictor_16xh().
// `above` is not used.
void aom_h_predictor_16x32_sse2(uint8_t *dst, ptrdiff_t stride,
                                const uint8_t *above, const uint8_t *left) {
  const int passes_of_16_rows = 32 / 16;
  (void)above;
  h_predictor_16xh(dst, stride, left, passes_of_16_rows);
}
1314*77c1e3ccSAndroid Build Coastguard Worker 
#if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
// 16x64 horizontal intra predictor: four 16-row passes of h_predictor_16xh().
// `above` is not used.
void aom_h_predictor_16x64_sse2(uint8_t *dst, ptrdiff_t stride,
                                const uint8_t *above, const uint8_t *left) {
  const int passes_of_16_rows = 64 / 16;
  (void)above;
  h_predictor_16xh(dst, stride, left, passes_of_16_rows);
}
#endif  // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
1322*77c1e3ccSAndroid Build Coastguard Worker 
// Writes h rows of 32 bytes starting at `dst`: row k receives row[k] twice,
// at byte offsets 0 and 16. The stores are aligned, so every row address must
// be 16-byte aligned. Nothing is written when h <= 0.
static inline void h_pred_store_32xh(const __m128i *row, int h, uint8_t *dst,
                                     ptrdiff_t stride) {
  for (int remaining = h; remaining > 0; --remaining) {
    const __m128i value = *row++;
    _mm_store_si128((__m128i *)dst, value);
    _mm_store_si128((__m128i *)(dst + 16), value);
    dst += stride;
  }
}
1332*77c1e3ccSAndroid Build Coastguard Worker 
// First half of a 32x8 block: writes rows 0..3, each 32 bytes wide.
// Only the low 8 bytes of *left are consumed; shown from the most significant
// byte down, the register is laid out as xxxxxxxx33221100.
static inline void h_prediction_32x8_1(const __m128i *left, uint8_t *dst,
                                       ptrdiff_t stride) {
  __m128i splat[4];  // splat[k] = pixel k replicated across 16 bytes
  repeat_low_4pixels(left, splat);
  h_pred_store_32xh(splat, 4, dst, stride);
}
1341*77c1e3ccSAndroid Build Coastguard Worker 
// Second half of a 32x8 block: writes four rows, each 32 bytes wide, starting
// at `dst`. Only the high 8 bytes of *left are consumed; shown from the most
// significant byte down, the register is laid out as 77665544xxxxxxxx.
static inline void h_prediction_32x8_2(const __m128i *left, uint8_t *dst,
                                       ptrdiff_t stride) {
  __m128i splat[4];  // splat[k] = pixel 4 + k replicated across 16 bytes
  repeat_high_4pixels(left, splat);
  h_pred_store_32xh(splat, 4, dst, stride);
}
1350*77c1e3ccSAndroid Build Coastguard Worker 
#if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
// Horizontal intra predictor for a 32-wide, 8-tall block: rows 0..3 come from
// the low half of the doubled left column, rows 4..7 from the high half.
// `above` is not used.
void aom_h_predictor_32x8_sse2(uint8_t *dst, ptrdiff_t stride,
                               const uint8_t *above, const uint8_t *left) {
  (void)above;

  // Only the low eight bytes are consumed below, so load exactly 64 bits.
  // This matches aom_h_predictor_16x8_sse2(), avoids reading 8 bytes beyond
  // the pixels that are used, and drops the 16-byte alignment requirement
  // that a full _mm_load_si128() would place on `left`.
  const __m128i left_col = _mm_loadl_epi64((const __m128i *)left);

  // 16-bit lane k holds left[k] twice.
  const __m128i left_col_8p = _mm_unpacklo_epi8(left_col, left_col);
  h_prediction_32x8_1(&left_col_8p, dst, stride);
  dst += stride << 2;
  h_prediction_32x8_2(&left_col_8p, dst, stride);
}
#endif  // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
1365*77c1e3ccSAndroid Build Coastguard Worker 
// Horizontal intra predictor for a 32-wide, 16-tall block, emitted as four
// groups of four rows. `above` is not used. `left` is read with an aligned
// 128-bit load, so it must be 16-byte aligned.
void aom_h_predictor_32x16_sse2(uint8_t *dst, ptrdiff_t stride,
                                const uint8_t *above, const uint8_t *left) {
  (void)above;
  const ptrdiff_t four_rows = stride << 2;
  const __m128i left16 = _mm_load_si128((const __m128i *)left);
  // Doubled pixels 0..7 and 8..15.
  const __m128i pairs_lo = _mm_unpacklo_epi8(left16, left16);
  const __m128i pairs_hi = _mm_unpackhi_epi8(left16, left16);

  h_prediction_32x8_1(&pairs_lo, dst, stride);
  h_prediction_32x8_2(&pairs_lo, dst + four_rows, stride);
  h_prediction_32x8_1(&pairs_hi, dst + 2 * four_rows, stride);
  h_prediction_32x8_2(&pairs_hi, dst + 3 * four_rows, stride);
}
1384*77c1e3ccSAndroid Build Coastguard Worker 
// Horizontal prediction for a 32-wide block: all 32 pixels of output row r
// are set to left[r]. Rows are produced in groups of four, so height >> 2
// groups are written; a height below 4 writes nothing. The row stores are
// aligned, so `dst` and `stride` must keep every row 16-byte aligned. `left`
// is read four bytes at a time through an int pointer.
static inline void h_predictor_32xh(uint8_t *dst, ptrdiff_t stride,
                                    const uint8_t *left, int height) {
  // A top-tested loop: a bottom-tested `do ... while (--i)` would start at
  // i == 0 for height < 4 and keep decrementing while writing out of bounds.
  for (int i = height >> 2; i > 0; --i) {
    // Fetch four left pixels; the cast keeps the const qualifier of `left`.
    __m128i left4 = _mm_cvtsi32_si128(((const int *)left)[0]);
    // Two byte-unpacks quadruple every pixel: 32-bit lane k = left[k] x 4.
    left4 = _mm_unpacklo_epi8(left4, left4);
    left4 = _mm_unpacklo_epi8(left4, left4);
    const __m128i r0 = _mm_shuffle_epi32(left4, 0x0);
    const __m128i r1 = _mm_shuffle_epi32(left4, 0x55);
    _mm_store_si128((__m128i *)dst, r0);
    _mm_store_si128((__m128i *)(dst + 16), r0);
    _mm_store_si128((__m128i *)(dst + stride), r1);
    _mm_store_si128((__m128i *)(dst + stride + 16), r1);
    const __m128i r2 = _mm_shuffle_epi32(left4, 0xaa);
    const __m128i r3 = _mm_shuffle_epi32(left4, 0xff);
    _mm_store_si128((__m128i *)(dst + stride * 2), r2);
    _mm_store_si128((__m128i *)(dst + stride * 2 + 16), r2);
    _mm_store_si128((__m128i *)(dst + stride * 3), r3);
    _mm_store_si128((__m128i *)(dst + stride * 3 + 16), r3);
    left += 4;
    dst += stride * 4;
  }
}
1408*77c1e3ccSAndroid Build Coastguard Worker 
// 32x64 horizontal intra predictor: delegates to h_predictor_32xh() with a
// height of 64 rows. `above` is not used.
void aom_h_predictor_32x64_sse2(uint8_t *dst, ptrdiff_t stride,
                                const uint8_t *above, const uint8_t *left) {
  const int block_height = 64;
  (void)above;
  h_predictor_32xh(dst, stride, left, block_height);
}
1414*77c1e3ccSAndroid Build Coastguard Worker 
// Horizontal intra prediction for 64-pixel-wide blocks: every byte of output
// row r is set to left[r].
//
//   dst    - top-left byte of the destination block. Rows are written with
//            aligned 16-byte stores (_mm_store_si128), so dst and stride must
//            keep the start of every row 16-byte aligned.
//   stride - distance in bytes between consecutive destination rows.
//   left   - column of `height` pixels to replicate; read byte by byte, so no
//            particular alignment is required.
//   height - number of rows to produce. Must be a positive multiple of 4:
//            four rows are emitted per iteration and the do/while loop always
//            runs at least once.
static inline void h_predictor_64xh(uint8_t *dst, ptrdiff_t stride,
                                    const uint8_t *left, int height) {
  int i = height >> 2;
  do {
    // Pack four left pixels into one 32-bit value, lowest byte first. The
    // value is assembled from individual bytes instead of dereferencing an
    // int pointer so the read is valid for any alignment of `left`, does not
    // access uint8_t storage through an incompatible type, and keeps the
    // const qualifier intact.
    const uint32_t packed = (uint32_t)left[0] | ((uint32_t)left[1] << 8) |
                            ((uint32_t)left[2] << 16) |
                            ((uint32_t)left[3] << 24);
    __m128i left4 = _mm_cvtsi32_si128((int)packed);
    // Two self-interleaves replicate each pixel across a full 32-bit lane:
    // lane k now holds four copies of left[k].
    left4 = _mm_unpacklo_epi8(left4, left4);
    left4 = _mm_unpacklo_epi8(left4, left4);
    // Broadcasting lane k gives 16 copies of left[k]; four such stores cover
    // one 64-byte row.
    const __m128i r0 = _mm_shuffle_epi32(left4, 0x0);
    const __m128i r1 = _mm_shuffle_epi32(left4, 0x55);
    _mm_store_si128((__m128i *)dst, r0);
    _mm_store_si128((__m128i *)(dst + 16), r0);
    _mm_store_si128((__m128i *)(dst + 32), r0);
    _mm_store_si128((__m128i *)(dst + 48), r0);
    _mm_store_si128((__m128i *)(dst + stride), r1);
    _mm_store_si128((__m128i *)(dst + stride + 16), r1);
    _mm_store_si128((__m128i *)(dst + stride + 32), r1);
    _mm_store_si128((__m128i *)(dst + stride + 48), r1);
    const __m128i r2 = _mm_shuffle_epi32(left4, 0xaa);
    const __m128i r3 = _mm_shuffle_epi32(left4, 0xff);
    _mm_store_si128((__m128i *)(dst + stride * 2), r2);
    _mm_store_si128((__m128i *)(dst + stride * 2 + 16), r2);
    _mm_store_si128((__m128i *)(dst + stride * 2 + 32), r2);
    _mm_store_si128((__m128i *)(dst + stride * 2 + 48), r2);
    _mm_store_si128((__m128i *)(dst + stride * 3), r3);
    _mm_store_si128((__m128i *)(dst + stride * 3 + 16), r3);
    _mm_store_si128((__m128i *)(dst + stride * 3 + 32), r3);
    _mm_store_si128((__m128i *)(dst + stride * 3 + 48), r3);
    // Advance to the next group of four rows.
    left += 4;
    dst += stride * 4;
  } while (--i);
}
1446*77c1e3ccSAndroid Build Coastguard Worker 
// 64x64 horizontal intra predictor entry point. Forwards to h_predictor_64xh
// with a fixed height of 64 rows. `above` is accepted for signature
// compatibility with the other predictors in this file but is never read.
void aom_h_predictor_64x64_sse2(uint8_t *dst, ptrdiff_t stride,
                                const uint8_t *above, const uint8_t *left) {
  const int rows = 64;
  h_predictor_64xh(dst, stride, left, rows);
  (void)above;  // Intentionally unused.
}
1452*77c1e3ccSAndroid Build Coastguard Worker 
// 64x32 horizontal intra predictor entry point. Forwards to h_predictor_64xh
// with a fixed height of 32 rows. `above` is accepted for signature
// compatibility with the other predictors in this file but is never read.
void aom_h_predictor_64x32_sse2(uint8_t *dst, ptrdiff_t stride,
                                const uint8_t *above, const uint8_t *left) {
  const int rows = 32;
  h_predictor_64xh(dst, stride, left, rows);
  (void)above;  // Intentionally unused.
}
1458*77c1e3ccSAndroid Build Coastguard Worker 
#if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
// 64x16 horizontal intra predictor entry point. Forwards to h_predictor_64xh
// with a fixed height of 16 rows. `above` is accepted for signature
// compatibility with the other predictors in this file but is never read.
// Only compiled when the build is not realtime-only or includes the decoder.
void aom_h_predictor_64x16_sse2(uint8_t *dst, ptrdiff_t stride,
                                const uint8_t *above, const uint8_t *left) {
  const int rows = 16;
  h_predictor_64xh(dst, stride, left, rows);
  (void)above;  // Intentionally unused.
}
#endif  // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
1466