// Copyright 2014 Google Inc. All Rights Reserved.
//
// Use of this source code is governed by a BSD-style license
// that can be found in the COPYING file in the root of the source
// tree. An additional intellectual property rights grant can be found
// in the file PATENTS. All contributing project authors may
// be found in the AUTHORS file in the root of the source tree.
// -----------------------------------------------------------------------------
//
// SSE2 variant of methods for lossless decoder
//
// Author: Skal ([email protected])

#include "src/dsp/dsp.h"

#if defined(WEBP_USE_SSE2)

#include <emmintrin.h>

#include "src/dsp/common_sse2.h"
#include "src/dsp/lossless.h"
#include "src/dsp/lossless_common.h"

//------------------------------------------------------------------------------
// Predictor Transform

static WEBP_INLINE uint32_t ClampedAddSubtractFull_SSE2(uint32_t c0,
                                                        uint32_t c1,
                                                        uint32_t c2) {
  const __m128i zero = _mm_setzero_si128();
  const __m128i C0 = _mm_unpacklo_epi8(_mm_cvtsi32_si128((int)c0), zero);
  const __m128i C1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128((int)c1), zero);
  const __m128i C2 = _mm_unpacklo_epi8(_mm_cvtsi32_si128((int)c2), zero);
  const __m128i V1 = _mm_add_epi16(C0, C1);
  const __m128i V2 = _mm_sub_epi16(V1, C2);
  const __m128i b = _mm_packus_epi16(V2, V2);
  return (uint32_t)_mm_cvtsi128_si32(b);
}
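
// Reference scalar sketch of the above (per 8-bit channel):
//   out = clamp(c0 + c1 - c2, 0, 255)
// The unsigned saturation of _mm_packus_epi16 provides the clamping.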

static WEBP_INLINE uint32_t ClampedAddSubtractHalf_SSE2(uint32_t c0,
                                                        uint32_t c1,
                                                        uint32_t c2) {
  const __m128i zero = _mm_setzero_si128();
  const __m128i C0 = _mm_unpacklo_epi8(_mm_cvtsi32_si128((int)c0), zero);
  const __m128i C1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128((int)c1), zero);
  const __m128i B0 = _mm_unpacklo_epi8(_mm_cvtsi32_si128((int)c2), zero);
  const __m128i avg = _mm_add_epi16(C1, C0);
  const __m128i A0 = _mm_srli_epi16(avg, 1);
  const __m128i A1 = _mm_sub_epi16(A0, B0);
  const __m128i BgtA = _mm_cmpgt_epi16(B0, A0);
  const __m128i A2 = _mm_sub_epi16(A1, BgtA);
  const __m128i A3 = _mm_srai_epi16(A2, 1);
  const __m128i A4 = _mm_add_epi16(A0, A3);
  const __m128i A5 = _mm_packus_epi16(A4, A4);
  return (uint32_t)_mm_cvtsi128_si32(A5);
}
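
// Reference scalar sketch (per 8-bit channel):
//   ave = (c0 + c1) >> 1;
//   out = clamp(ave + (ave - c2) / 2, 0, 255)
// Subtracting BgtA (0 or -1) adds 1 exactly when c2 > ave, so that the
// arithmetic shift A2 >> 1 matches C's truncating division for a negative
// (ave - c2).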

static WEBP_INLINE uint32_t Select_SSE2(uint32_t a, uint32_t b, uint32_t c) {
  int pa_minus_pb;
  const __m128i zero = _mm_setzero_si128();
  const __m128i A0 = _mm_cvtsi32_si128((int)a);
  const __m128i B0 = _mm_cvtsi32_si128((int)b);
  const __m128i C0 = _mm_cvtsi32_si128((int)c);
  const __m128i AC0 = _mm_subs_epu8(A0, C0);
  const __m128i CA0 = _mm_subs_epu8(C0, A0);
  const __m128i BC0 = _mm_subs_epu8(B0, C0);
  const __m128i CB0 = _mm_subs_epu8(C0, B0);
  const __m128i AC = _mm_or_si128(AC0, CA0);
  const __m128i BC = _mm_or_si128(BC0, CB0);
  const __m128i pa = _mm_unpacklo_epi8(AC, zero);  // |a - c|
  const __m128i pb = _mm_unpacklo_epi8(BC, zero);  // |b - c|
  const __m128i diff = _mm_sub_epi16(pb, pa);
  {
    int16_t out[8];
    _mm_storeu_si128((__m128i*)out, diff);
    pa_minus_pb = out[0] + out[1] + out[2] + out[3];
  }
  return (pa_minus_pb <= 0) ? a : b;
}
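
// Note: with a = top, b = left and c = top-left (see Predictor11 below), this
// returns top when sum|left - top-left| <= sum|top - top-left|, i.e. it
// predicts along the direction with the smaller measured gradient.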

static WEBP_INLINE void Average2_m128i(const __m128i* const a0,
                                       const __m128i* const a1,
                                       __m128i* const avg) {
  // (a + b) >> 1 = ((a + b + 1) >> 1) - ((a ^ b) & 1)
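  // e.g. for a = 1, b = 2: _mm_avg_epu8 rounds up to 2, (1 ^ 2) & 1 == 1,
  // and 2 - 1 == 1 == (1 + 2) >> 1.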
  const __m128i ones = _mm_set1_epi8(1);
  const __m128i avg1 = _mm_avg_epu8(*a0, *a1);
  const __m128i one = _mm_and_si128(_mm_xor_si128(*a0, *a1), ones);
  *avg = _mm_sub_epi8(avg1, one);
}

static WEBP_INLINE void Average2_uint32_SSE2(const uint32_t a0,
                                             const uint32_t a1,
                                             __m128i* const avg) {
  // (a + b) >> 1 = ((a + b + 1) >> 1) - ((a ^ b) & 1)
  const __m128i ones = _mm_set1_epi8(1);
  const __m128i A0 = _mm_cvtsi32_si128((int)a0);
  const __m128i A1 = _mm_cvtsi32_si128((int)a1);
  const __m128i avg1 = _mm_avg_epu8(A0, A1);
  const __m128i one = _mm_and_si128(_mm_xor_si128(A0, A1), ones);
  *avg = _mm_sub_epi8(avg1, one);
}

static WEBP_INLINE __m128i Average2_uint32_16_SSE2(uint32_t a0, uint32_t a1) {
  const __m128i zero = _mm_setzero_si128();
  const __m128i A0 = _mm_unpacklo_epi8(_mm_cvtsi32_si128((int)a0), zero);
  const __m128i A1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128((int)a1), zero);
  const __m128i sum = _mm_add_epi16(A1, A0);
  return _mm_srli_epi16(sum, 1);
}

static WEBP_INLINE uint32_t Average2_SSE2(uint32_t a0, uint32_t a1) {
  __m128i output;
  Average2_uint32_SSE2(a0, a1, &output);
  return (uint32_t)_mm_cvtsi128_si32(output);
}

static WEBP_INLINE uint32_t Average3_SSE2(uint32_t a0, uint32_t a1,
                                          uint32_t a2) {
  const __m128i zero = _mm_setzero_si128();
  const __m128i avg1 = Average2_uint32_16_SSE2(a0, a2);
  const __m128i A1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128((int)a1), zero);
  const __m128i sum = _mm_add_epi16(avg1, A1);
  const __m128i avg2 = _mm_srli_epi16(sum, 1);
  const __m128i A2 = _mm_packus_epi16(avg2, avg2);
  return (uint32_t)_mm_cvtsi128_si32(A2);
}

static WEBP_INLINE uint32_t Average4_SSE2(uint32_t a0, uint32_t a1,
                                          uint32_t a2, uint32_t a3) {
  const __m128i avg1 = Average2_uint32_16_SSE2(a0, a1);
  const __m128i avg2 = Average2_uint32_16_SSE2(a2, a3);
  const __m128i sum = _mm_add_epi16(avg2, avg1);
  const __m128i avg3 = _mm_srli_epi16(sum, 1);
  const __m128i A0 = _mm_packus_epi16(avg3, avg3);
  return (uint32_t)_mm_cvtsi128_si32(A0);
}

static uint32_t Predictor5_SSE2(const uint32_t* const left,
                                const uint32_t* const top) {
  const uint32_t pred = Average3_SSE2(*left, top[0], top[1]);
  return pred;
}
static uint32_t Predictor6_SSE2(const uint32_t* const left,
                                const uint32_t* const top) {
  const uint32_t pred = Average2_SSE2(*left, top[-1]);
  return pred;
}
static uint32_t Predictor7_SSE2(const uint32_t* const left,
                                const uint32_t* const top) {
  const uint32_t pred = Average2_SSE2(*left, top[0]);
  return pred;
}
static uint32_t Predictor8_SSE2(const uint32_t* const left,
                                const uint32_t* const top) {
  const uint32_t pred = Average2_SSE2(top[-1], top[0]);
  (void)left;
  return pred;
}
static uint32_t Predictor9_SSE2(const uint32_t* const left,
                                const uint32_t* const top) {
  const uint32_t pred = Average2_SSE2(top[0], top[1]);
  (void)left;
  return pred;
}
static uint32_t Predictor10_SSE2(const uint32_t* const left,
                                 const uint32_t* const top) {
  const uint32_t pred = Average4_SSE2(*left, top[-1], top[0], top[1]);
  return pred;
}
static uint32_t Predictor11_SSE2(const uint32_t* const left,
                                 const uint32_t* const top) {
  const uint32_t pred = Select_SSE2(top[0], *left, top[-1]);
  return pred;
}
static uint32_t Predictor12_SSE2(const uint32_t* const left,
                                 const uint32_t* const top) {
  const uint32_t pred = ClampedAddSubtractFull_SSE2(*left, top[0], top[-1]);
  return pred;
}
static uint32_t Predictor13_SSE2(const uint32_t* const left,
                                 const uint32_t* const top) {
  const uint32_t pred = ClampedAddSubtractHalf_SSE2(*left, top[0], top[-1]);
  return pred;
}

// Batch versions of those functions.

// Predictor0: ARGB_BLACK.
static void PredictorAdd0_SSE2(const uint32_t* in, const uint32_t* upper,
                               int num_pixels, uint32_t* out) {
  int i;
  const __m128i black = _mm_set1_epi32((int)ARGB_BLACK);
  for (i = 0; i + 4 <= num_pixels; i += 4) {
    const __m128i src = _mm_loadu_si128((const __m128i*)&in[i]);
    const __m128i res = _mm_add_epi8(src, black);
    _mm_storeu_si128((__m128i*)&out[i], res);
  }
  if (i != num_pixels) {
    VP8LPredictorsAdd_C[0](in + i, NULL, num_pixels - i, out + i);
  }
  (void)upper;
}

// Predictor1: left.
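// The loop below computes a within-register prefix sum of the four input
// deltas, then adds the previous output pixel (broadcast in 'prev') to all
// four lanes at once.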
static void PredictorAdd1_SSE2(const uint32_t* in, const uint32_t* upper,
                               int num_pixels, uint32_t* out) {
  int i;
  __m128i prev = _mm_set1_epi32((int)out[-1]);
  for (i = 0; i + 4 <= num_pixels; i += 4) {
    // a | b | c | d
    const __m128i src = _mm_loadu_si128((const __m128i*)&in[i]);
    // 0 | a | b | c
    const __m128i shift0 = _mm_slli_si128(src, 4);
    // a | a + b | b + c | c + d
    const __m128i sum0 = _mm_add_epi8(src, shift0);
    // 0 | 0 | a | a + b
    const __m128i shift1 = _mm_slli_si128(sum0, 8);
    // a | a + b | a + b + c | a + b + c + d
    const __m128i sum1 = _mm_add_epi8(sum0, shift1);
    const __m128i res = _mm_add_epi8(sum1, prev);
    _mm_storeu_si128((__m128i*)&out[i], res);
    // replicate prev output on the four lanes
    prev = _mm_shuffle_epi32(res, (3 << 0) | (3 << 2) | (3 << 4) | (3 << 6));
  }
  if (i != num_pixels) {
    VP8LPredictorsAdd_C[1](in + i, upper + i, num_pixels - i, out + i);
  }
}

// Macro that adds 32-bit integers from IN using mod-256 arithmetic
// per 8-bit channel.
#define GENERATE_PREDICTOR_1(X, IN) \
static void PredictorAdd##X##_SSE2(const uint32_t* in, const uint32_t* upper, \
                                   int num_pixels, uint32_t* out) { \
  int i; \
  for (i = 0; i + 4 <= num_pixels; i += 4) { \
    const __m128i src = _mm_loadu_si128((const __m128i*)&in[i]); \
    const __m128i other = _mm_loadu_si128((const __m128i*)&(IN)); \
    const __m128i res = _mm_add_epi8(src, other); \
    _mm_storeu_si128((__m128i*)&out[i], res); \
  } \
  if (i != num_pixels) { \
    VP8LPredictorsAdd_C[(X)](in + i, upper + i, num_pixels - i, out + i); \
  } \
}

// Predictor2: Top.
GENERATE_PREDICTOR_1(2, upper[i])
// Predictor3: Top-right.
GENERATE_PREDICTOR_1(3, upper[i + 1])
// Predictor4: Top-left.
GENERATE_PREDICTOR_1(4, upper[i - 1])
#undef GENERATE_PREDICTOR_1

// Due to integer averaging, the values cannot be accumulated in parallel for
// predictors 5 to 7.
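// (They all read *left, i.e. the pixel just produced, so each output depends
// serially on the previous one; GENERATE_PREDICTOR_ADD, from
// lossless_common.h, wraps the scalar predictor in a per-pixel loop.)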
GENERATE_PREDICTOR_ADD(Predictor5_SSE2, PredictorAdd5_SSE2)
GENERATE_PREDICTOR_ADD(Predictor6_SSE2, PredictorAdd6_SSE2)
GENERATE_PREDICTOR_ADD(Predictor7_SSE2, PredictorAdd7_SSE2)

#define GENERATE_PREDICTOR_2(X, IN) \
static void PredictorAdd##X##_SSE2(const uint32_t* in, const uint32_t* upper, \
                                   int num_pixels, uint32_t* out) { \
  int i; \
  for (i = 0; i + 4 <= num_pixels; i += 4) { \
    const __m128i Tother = _mm_loadu_si128((const __m128i*)&(IN)); \
    const __m128i T = _mm_loadu_si128((const __m128i*)&upper[i]); \
    const __m128i src = _mm_loadu_si128((const __m128i*)&in[i]); \
    __m128i avg, res; \
    Average2_m128i(&T, &Tother, &avg); \
    res = _mm_add_epi8(avg, src); \
    _mm_storeu_si128((__m128i*)&out[i], res); \
  } \
  if (i != num_pixels) { \
    VP8LPredictorsAdd_C[(X)](in + i, upper + i, num_pixels - i, out + i); \
  } \
}
// Predictor8: average TL T.
GENERATE_PREDICTOR_2(8, upper[i - 1])
// Predictor9: average T TR.
GENERATE_PREDICTOR_2(9, upper[i + 1])
#undef GENERATE_PREDICTOR_2

// Predictor10: average of (average of (L,TL), average of (T, TR)).
#define DO_PRED10(OUT) do {                         \
  __m128i avgLTL, avg;                              \
  Average2_m128i(&L, &TL, &avgLTL);                 \
  Average2_m128i(&avgTTR, &avgLTL, &avg);           \
  L = _mm_add_epi8(avg, src);                       \
  out[i + (OUT)] = (uint32_t)_mm_cvtsi128_si32(L);  \
} while (0)

#define DO_PRED10_SHIFT do {                                   \
  /* Rotate the pre-computed values for the next iteration. */ \
  avgTTR = _mm_srli_si128(avgTTR, 4);                          \
  TL = _mm_srli_si128(TL, 4);                                  \
  src = _mm_srli_si128(src, 4);                                \
} while (0)
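// TL, src and avgTTR are batched for four pixels, but L is the serially
// updated previous output; DO_PRED10 therefore emits one pixel at a time
// while DO_PRED10_SHIFT rotates the batched registers into place.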

static void PredictorAdd10_SSE2(const uint32_t* in, const uint32_t* upper,
                                int num_pixels, uint32_t* out) {
  int i;
  __m128i L = _mm_cvtsi32_si128((int)out[-1]);
  for (i = 0; i + 4 <= num_pixels; i += 4) {
    __m128i src = _mm_loadu_si128((const __m128i*)&in[i]);
    __m128i TL = _mm_loadu_si128((const __m128i*)&upper[i - 1]);
    const __m128i T = _mm_loadu_si128((const __m128i*)&upper[i]);
    const __m128i TR = _mm_loadu_si128((const __m128i*)&upper[i + 1]);
    __m128i avgTTR;
    Average2_m128i(&T, &TR, &avgTTR);
    DO_PRED10(0);
    DO_PRED10_SHIFT;
    DO_PRED10(1);
    DO_PRED10_SHIFT;
    DO_PRED10(2);
    DO_PRED10_SHIFT;
    DO_PRED10(3);
  }
  if (i != num_pixels) {
    VP8LPredictorsAdd_C[10](in + i, upper + i, num_pixels - i, out + i);
  }
}
#undef DO_PRED10
#undef DO_PRED10_SHIFT

// Predictor11: select.
#define DO_PRED11(OUT) do {                                             \
  const __m128i L_lo = _mm_unpacklo_epi32(L, T);                        \
  const __m128i TL_lo = _mm_unpacklo_epi32(TL, T);                      \
  const __m128i pb = _mm_sad_epu8(L_lo, TL_lo);  /* pb = sum |L-TL| */  \
  const __m128i mask = _mm_cmpgt_epi32(pb, pa);                         \
  const __m128i A = _mm_and_si128(mask, L);                             \
  const __m128i B = _mm_andnot_si128(mask, T);                          \
  const __m128i pred = _mm_or_si128(A, B);  /* pred = (pb > pa) ? L : T */ \
  L = _mm_add_epi8(src, pred);                                          \
  out[i + (OUT)] = (uint32_t)_mm_cvtsi128_si32(L);                      \
} while (0)

#define DO_PRED11_SHIFT do {                                  \
  /* Shift the pre-computed value for the next iteration. */  \
  T = _mm_srli_si128(T, 4);                                   \
  TL = _mm_srli_si128(TL, 4);                                 \
  src = _mm_srli_si128(src, 4);                               \
  pa = _mm_srli_si128(pa, 4);                                 \
} while (0)
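// pa = sum |T-TL| is batch-computed once for four pixels; only the
// L-dependent half of the Select() computation runs per pixel.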

static void PredictorAdd11_SSE2(const uint32_t* in, const uint32_t* upper,
                                int num_pixels, uint32_t* out) {
  int i;
  __m128i pa;
  __m128i L = _mm_cvtsi32_si128((int)out[-1]);
  for (i = 0; i + 4 <= num_pixels; i += 4) {
    __m128i T = _mm_loadu_si128((const __m128i*)&upper[i]);
    __m128i TL = _mm_loadu_si128((const __m128i*)&upper[i - 1]);
    __m128i src = _mm_loadu_si128((const __m128i*)&in[i]);
    {
      // We can unpack with any value on the upper 32 bits, provided it's the
      // same on both operands (so that their sum of abs diff is zero). Here we
      // use T.
      const __m128i T_lo = _mm_unpacklo_epi32(T, T);
      const __m128i TL_lo = _mm_unpacklo_epi32(TL, T);
      const __m128i T_hi = _mm_unpackhi_epi32(T, T);
      const __m128i TL_hi = _mm_unpackhi_epi32(TL, T);
      const __m128i s_lo = _mm_sad_epu8(T_lo, TL_lo);
      const __m128i s_hi = _mm_sad_epu8(T_hi, TL_hi);
      pa = _mm_packs_epi32(s_lo, s_hi);  // pa = sum |T-TL|
    }
    DO_PRED11(0);
    DO_PRED11_SHIFT;
    DO_PRED11(1);
    DO_PRED11_SHIFT;
    DO_PRED11(2);
    DO_PRED11_SHIFT;
    DO_PRED11(3);
  }
  if (i != num_pixels) {
    VP8LPredictorsAdd_C[11](in + i, upper + i, num_pixels - i, out + i);
  }
}
#undef DO_PRED11
#undef DO_PRED11_SHIFT

// Predictor12: ClampedAddSubtractFull.
#define DO_PRED12(DIFF, LANE, OUT) do {               \
  const __m128i all = _mm_add_epi16(L, (DIFF));       \
  const __m128i alls = _mm_packus_epi16(all, all);    \
  const __m128i res = _mm_add_epi8(src, alls);        \
  out[i + (OUT)] = (uint32_t)_mm_cvtsi128_si32(res);  \
  L = _mm_unpacklo_epi8(res, zero);                   \
} while (0)

#define DO_PRED12_SHIFT(DIFF, LANE) do {                      \
  /* Shift the pre-computed value for the next iteration. */  \
  if ((LANE) == 0) (DIFF) = _mm_srli_si128((DIFF), 8);        \
  src = _mm_srli_si128(src, 4);                               \
} while (0)
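// diff_lo/diff_hi hold the 16-bit per-channel (T - TL) for pixels 0-1 and
// 2-3; DO_PRED12_SHIFT moves the next pixel's channels into the low lanes.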

static void PredictorAdd12_SSE2(const uint32_t* in, const uint32_t* upper,
                                int num_pixels, uint32_t* out) {
  int i;
  const __m128i zero = _mm_setzero_si128();
  const __m128i L8 = _mm_cvtsi32_si128((int)out[-1]);
  __m128i L = _mm_unpacklo_epi8(L8, zero);
  for (i = 0; i + 4 <= num_pixels; i += 4) {
    // Load 4 pixels at a time.
    __m128i src = _mm_loadu_si128((const __m128i*)&in[i]);
    const __m128i T = _mm_loadu_si128((const __m128i*)&upper[i]);
    const __m128i T_lo = _mm_unpacklo_epi8(T, zero);
    const __m128i T_hi = _mm_unpackhi_epi8(T, zero);
    const __m128i TL = _mm_loadu_si128((const __m128i*)&upper[i - 1]);
    const __m128i TL_lo = _mm_unpacklo_epi8(TL, zero);
    const __m128i TL_hi = _mm_unpackhi_epi8(TL, zero);
    __m128i diff_lo = _mm_sub_epi16(T_lo, TL_lo);
    __m128i diff_hi = _mm_sub_epi16(T_hi, TL_hi);
    DO_PRED12(diff_lo, 0, 0);
    DO_PRED12_SHIFT(diff_lo, 0);
    DO_PRED12(diff_lo, 1, 1);
    DO_PRED12_SHIFT(diff_lo, 1);
    DO_PRED12(diff_hi, 0, 2);
    DO_PRED12_SHIFT(diff_hi, 0);
    DO_PRED12(diff_hi, 1, 3);
  }
  if (i != num_pixels) {
    VP8LPredictorsAdd_C[12](in + i, upper + i, num_pixels - i, out + i);
  }
}
#undef DO_PRED12
#undef DO_PRED12_SHIFT

// Due to integer averaging, the values cannot be accumulated in parallel for
// predictor 13.
GENERATE_PREDICTOR_ADD(Predictor13_SSE2, PredictorAdd13_SSE2)

//------------------------------------------------------------------------------
// Subtract-Green Transform

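// The forward subtract-green transform stored (r - g) and (b - g); the
// inverse below adds the green channel back to red and blue, mod 256.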
static void AddGreenToBlueAndRed_SSE2(const uint32_t* const src, int num_pixels,
                                      uint32_t* dst) {
  int i;
  for (i = 0; i + 4 <= num_pixels; i += 4) {
    const __m128i in = _mm_loadu_si128((const __m128i*)&src[i]);  // argb
    const __m128i A = _mm_srli_epi16(in, 8);  // 0 a 0 g
    const __m128i B = _mm_shufflelo_epi16(A, _MM_SHUFFLE(2, 2, 0, 0));
    const __m128i C = _mm_shufflehi_epi16(B, _MM_SHUFFLE(2, 2, 0, 0));  // 0g0g
    const __m128i out = _mm_add_epi8(in, C);
    _mm_storeu_si128((__m128i*)&dst[i], out);
  }
  // fallthrough and finish off with plain-C
  if (i != num_pixels) {
    VP8LAddGreenToBlueAndRed_C(src + i, num_pixels - i, dst + i);
  }
}

//------------------------------------------------------------------------------
// Color Transform

static void TransformColorInverse_SSE2(const VP8LMultipliers* const m,
                                       const uint32_t* const src,
                                       int num_pixels, uint32_t* dst) {
// sign-extended multiplying constants, pre-shifted by 5.
#define CST(X) (((int16_t)(m->X << 8)) >> 5)  // sign-extend
#define MK_CST_16(HI, LO) \
  _mm_set1_epi32((int)(((uint32_t)(HI) << 16) | ((LO) & 0xffff)))
  const __m128i mults_rb = MK_CST_16(CST(green_to_red_), CST(green_to_blue_));
  const __m128i mults_b2 = MK_CST_16(CST(red_to_blue_), 0);
#undef MK_CST_16
#undef CST
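  // With g in the high byte of each 16-bit lane (value g * 256) and the
  // constants pre-scaled by 8 as above, _mm_mulhi_epi16 gives
  // (g * 256 * cst * 8) >> 16 == (g * cst) >> 5, the color-transform delta.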
  const __m128i mask_ag = _mm_set1_epi32((int)0xff00ff00);  // alpha-green masks
  int i;
  for (i = 0; i + 4 <= num_pixels; i += 4) {
    const __m128i in = _mm_loadu_si128((const __m128i*)&src[i]);  // argb
    const __m128i A = _mm_and_si128(in, mask_ag);     // a 0 g 0
    const __m128i B = _mm_shufflelo_epi16(A, _MM_SHUFFLE(2, 2, 0, 0));
    const __m128i C = _mm_shufflehi_epi16(B, _MM_SHUFFLE(2, 2, 0, 0));  // g0g0
    const __m128i D = _mm_mulhi_epi16(C, mults_rb);   // x dr x db1
    const __m128i E = _mm_add_epi8(in, D);            // x r' x b'
    const __m128i F = _mm_slli_epi16(E, 8);           // r' 0 b' 0
    const __m128i G = _mm_mulhi_epi16(F, mults_b2);   // x db2 0 0
    const __m128i H = _mm_srli_epi32(G, 8);           // 0 x db2 0
    const __m128i I = _mm_add_epi8(H, F);             // r' x b'' 0
    const __m128i J = _mm_srli_epi16(I, 8);           // 0 r' 0 b''
    const __m128i out = _mm_or_si128(J, A);
    _mm_storeu_si128((__m128i*)&dst[i], out);
  }
  // Fall-back to C-version for left-overs.
  if (i != num_pixels) {
    VP8LTransformColorInverse_C(m, src + i, num_pixels - i, dst + i);
  }
}

//------------------------------------------------------------------------------
// Color-space conversion functions

static void ConvertBGRAToRGB_SSE2(const uint32_t* src, int num_pixels,
                                  uint8_t* dst) {
  const __m128i* in = (const __m128i*)src;
  __m128i* out = (__m128i*)dst;

  while (num_pixels >= 32) {
    // Load the BGRA buffers.
    __m128i in0 = _mm_loadu_si128(in + 0);
    __m128i in1 = _mm_loadu_si128(in + 1);
    __m128i in2 = _mm_loadu_si128(in + 2);
    __m128i in3 = _mm_loadu_si128(in + 3);
    __m128i in4 = _mm_loadu_si128(in + 4);
    __m128i in5 = _mm_loadu_si128(in + 5);
    __m128i in6 = _mm_loadu_si128(in + 6);
    __m128i in7 = _mm_loadu_si128(in + 7);
    VP8L32bToPlanar_SSE2(&in0, &in1, &in2, &in3);
    VP8L32bToPlanar_SSE2(&in4, &in5, &in6, &in7);
    // At this point, in1/in5 contain red only, in2/in6 green only ...
    // Pack the colors in 24b RGB.
    VP8PlanarTo24b_SSE2(&in1, &in5, &in2, &in6, &in3, &in7);
    _mm_storeu_si128(out + 0, in1);
    _mm_storeu_si128(out + 1, in5);
    _mm_storeu_si128(out + 2, in2);
    _mm_storeu_si128(out + 3, in6);
    _mm_storeu_si128(out + 4, in3);
    _mm_storeu_si128(out + 5, in7);
    in += 8;
    out += 6;
    num_pixels -= 32;
  }
  // left-overs
  if (num_pixels > 0) {
    VP8LConvertBGRAToRGB_C((const uint32_t*)in, num_pixels, (uint8_t*)out);
  }
}

static void ConvertBGRAToRGBA_SSE2(const uint32_t* src,
                                   int num_pixels, uint8_t* dst) {
  const __m128i red_blue_mask = _mm_set1_epi32(0x00ff00ff);
  const __m128i* in = (const __m128i*)src;
  __m128i* out = (__m128i*)dst;
  while (num_pixels >= 8) {
    const __m128i A1 = _mm_loadu_si128(in++);
    const __m128i A2 = _mm_loadu_si128(in++);
    const __m128i B1 = _mm_and_si128(A1, red_blue_mask);     // R 0 B 0
    const __m128i B2 = _mm_and_si128(A2, red_blue_mask);     // R 0 B 0
    const __m128i C1 = _mm_andnot_si128(red_blue_mask, A1);  // 0 G 0 A
    const __m128i C2 = _mm_andnot_si128(red_blue_mask, A2);  // 0 G 0 A
    const __m128i D1 = _mm_shufflelo_epi16(B1, _MM_SHUFFLE(2, 3, 0, 1));
    const __m128i D2 = _mm_shufflelo_epi16(B2, _MM_SHUFFLE(2, 3, 0, 1));
    const __m128i E1 = _mm_shufflehi_epi16(D1, _MM_SHUFFLE(2, 3, 0, 1));
    const __m128i E2 = _mm_shufflehi_epi16(D2, _MM_SHUFFLE(2, 3, 0, 1));
    const __m128i F1 = _mm_or_si128(E1, C1);
    const __m128i F2 = _mm_or_si128(E2, C2);
    _mm_storeu_si128(out++, F1);
    _mm_storeu_si128(out++, F2);
    num_pixels -= 8;
  }
  // left-overs
  if (num_pixels > 0) {
    VP8LConvertBGRAToRGBA_C((const uint32_t*)in, num_pixels, (uint8_t*)out);
  }
}

static void ConvertBGRAToRGBA4444_SSE2(const uint32_t* src,
                                       int num_pixels, uint8_t* dst) {
  const __m128i mask_0x0f = _mm_set1_epi8(0x0f);
  const __m128i mask_0xf0 = _mm_set1_epi8((char)0xf0);
  const __m128i* in = (const __m128i*)src;
  __m128i* out = (__m128i*)dst;
  while (num_pixels >= 8) {
    const __m128i bgra0 = _mm_loadu_si128(in++);  // bgra0|bgra1|bgra2|bgra3
    const __m128i bgra4 = _mm_loadu_si128(in++);  // bgra4|bgra5|bgra6|bgra7
    const __m128i v0l = _mm_unpacklo_epi8(bgra0, bgra4);  // b0b4g0g4r0r4a0a4...
    const __m128i v0h = _mm_unpackhi_epi8(bgra0, bgra4);  // b2b6g2g6r2r6a2a6...
    const __m128i v1l = _mm_unpacklo_epi8(v0l, v0h);      // b0b2b4b6g0g2g4g6...
    const __m128i v1h = _mm_unpackhi_epi8(v0l, v0h);      // b1b3b5b7g1g3g5g7...
    const __m128i v2l = _mm_unpacklo_epi8(v1l, v1h);      // b0...b7 | g0...g7
    const __m128i v2h = _mm_unpackhi_epi8(v1l, v1h);      // r0...r7 | a0...a7
    const __m128i ga0 = _mm_unpackhi_epi64(v2l, v2h);     // g0...g7 | a0...a7
    const __m128i rb0 = _mm_unpacklo_epi64(v2h, v2l);     // r0...r7 | b0...b7
    const __m128i ga1 = _mm_srli_epi16(ga0, 4);           // g0-|g1-|...|a6-|a7-
    const __m128i rb1 = _mm_and_si128(rb0, mask_0xf0);    // -r0|-r1|...|-b6|-a7
    const __m128i ga2 = _mm_and_si128(ga1, mask_0x0f);    // g0-|g1-|...|a6-|a7-
    const __m128i rgba0 = _mm_or_si128(ga2, rb1);         // rg0..rg7 | ba0..ba7
    const __m128i rgba1 = _mm_srli_si128(rgba0, 8);       // ba0..ba7 | 0
#if (WEBP_SWAP_16BIT_CSP == 1)
    const __m128i rgba = _mm_unpacklo_epi8(rgba1, rgba0);  // barg0...barg7
#else
    const __m128i rgba = _mm_unpacklo_epi8(rgba0, rgba1);  // rgba0...rgba7
#endif
    _mm_storeu_si128(out++, rgba);
    num_pixels -= 8;
  }
  // left-overs
  if (num_pixels > 0) {
    VP8LConvertBGRAToRGBA4444_C((const uint32_t*)in, num_pixels, (uint8_t*)out);
  }
}

static void ConvertBGRAToRGB565_SSE2(const uint32_t* src,
                                     int num_pixels, uint8_t* dst) {
  const __m128i mask_0xe0 = _mm_set1_epi8((char)0xe0);
  const __m128i mask_0xf8 = _mm_set1_epi8((char)0xf8);
  const __m128i mask_0x07 = _mm_set1_epi8(0x07);
  const __m128i* in = (const __m128i*)src;
  __m128i* out = (__m128i*)dst;
  while (num_pixels >= 8) {
    const __m128i bgra0 = _mm_loadu_si128(in++);  // bgra0|bgra1|bgra2|bgra3
    const __m128i bgra4 = _mm_loadu_si128(in++);  // bgra4|bgra5|bgra6|bgra7
    const __m128i v0l = _mm_unpacklo_epi8(bgra0, bgra4);  // b0b4g0g4r0r4a0a4...
    const __m128i v0h = _mm_unpackhi_epi8(bgra0, bgra4);  // b2b6g2g6r2r6a2a6...
    const __m128i v1l = _mm_unpacklo_epi8(v0l, v0h);      // b0b2b4b6g0g2g4g6...
    const __m128i v1h = _mm_unpackhi_epi8(v0l, v0h);      // b1b3b5b7g1g3g5g7...
    const __m128i v2l = _mm_unpacklo_epi8(v1l, v1h);      // b0...b7 | g0...g7
    const __m128i v2h = _mm_unpackhi_epi8(v1l, v1h);      // r0...r7 | a0...a7
    const __m128i ga0 = _mm_unpackhi_epi64(v2l, v2h);     // g0...g7 | a0...a7
    const __m128i rb0 = _mm_unpacklo_epi64(v2h, v2l);     // r0...r7 | b0...b7
    const __m128i rb1 = _mm_and_si128(rb0, mask_0xf8);    // -r0..-r7|-b0..-b7
    const __m128i g_lo1 = _mm_srli_epi16(ga0, 5);
    const __m128i g_lo2 = _mm_and_si128(g_lo1, mask_0x07);  // g0-...g7-|xx (3b)
    const __m128i g_hi1 = _mm_slli_epi16(ga0, 3);
    const __m128i g_hi2 = _mm_and_si128(g_hi1, mask_0xe0);  // -g0...-g7|xx (3b)
    const __m128i b0 = _mm_srli_si128(rb1, 8);            // -b0...-b7|0
    const __m128i rg1 = _mm_or_si128(rb1, g_lo2);         // gr0...gr7|xx
    const __m128i b1 = _mm_srli_epi16(b0, 3);
    const __m128i gb1 = _mm_or_si128(b1, g_hi2);          // bg0...bg7|xx
#if (WEBP_SWAP_16BIT_CSP == 1)
    const __m128i rgba = _mm_unpacklo_epi8(gb1, rg1);     // rggb0...rggb7
#else
    const __m128i rgba = _mm_unpacklo_epi8(rg1, gb1);     // bgrb0...bgrb7
#endif
    _mm_storeu_si128(out++, rgba);
    num_pixels -= 8;
  }
  // left-overs
  if (num_pixels > 0) {
    VP8LConvertBGRAToRGB565_C((const uint32_t*)in, num_pixels, (uint8_t*)out);
  }
}

static void ConvertBGRAToBGR_SSE2(const uint32_t* src,
                                  int num_pixels, uint8_t* dst) {
  const __m128i mask_l = _mm_set_epi32(0, 0x00ffffff, 0, 0x00ffffff);
  const __m128i mask_h = _mm_set_epi32(0x00ffffff, 0, 0x00ffffff, 0);
  const __m128i* in = (const __m128i*)src;
  const uint8_t* const end = dst + num_pixels * 3;
  // the last storel_epi64 below writes 8 bytes starting at offset 18
  while (dst + 26 <= end) {
    const __m128i bgra0 = _mm_loadu_si128(in++);  // bgra0|bgra1|bgra2|bgra3
    const __m128i bgra4 = _mm_loadu_si128(in++);  // bgra4|bgra5|bgra6|bgra7
    const __m128i a0l = _mm_and_si128(bgra0, mask_l);  // bgr0|0|bgr0|0
    const __m128i a4l = _mm_and_si128(bgra4, mask_l);  // bgr0|0|bgr0|0
    const __m128i a0h = _mm_and_si128(bgra0, mask_h);  // 0|bgr0|0|bgr0
    const __m128i a4h = _mm_and_si128(bgra4, mask_h);  // 0|bgr0|0|bgr0
    const __m128i b0h = _mm_srli_epi64(a0h, 8);        // 000b|gr00|000b|gr00
    const __m128i b4h = _mm_srli_epi64(a4h, 8);        // 000b|gr00|000b|gr00
    const __m128i c0 = _mm_or_si128(a0l, b0h);         // rgbrgb00|rgbrgb00
    const __m128i c4 = _mm_or_si128(a4l, b4h);         // rgbrgb00|rgbrgb00
    const __m128i c2 = _mm_srli_si128(c0, 8);
    const __m128i c6 = _mm_srli_si128(c4, 8);
    _mm_storel_epi64((__m128i*)(dst + 0), c0);
    _mm_storel_epi64((__m128i*)(dst + 6), c2);
    _mm_storel_epi64((__m128i*)(dst + 12), c4);
    _mm_storel_epi64((__m128i*)(dst + 18), c6);
    dst += 24;
    num_pixels -= 8;
  }
  // left-overs
  if (num_pixels > 0) {
    VP8LConvertBGRAToBGR_C((const uint32_t*)in, num_pixels, dst);
  }
}

//------------------------------------------------------------------------------
// Entry point

extern void VP8LDspInitSSE2(void);

WEBP_TSAN_IGNORE_FUNCTION void VP8LDspInitSSE2(void) {
  VP8LPredictors[5] = Predictor5_SSE2;
  VP8LPredictors[6] = Predictor6_SSE2;
  VP8LPredictors[7] = Predictor7_SSE2;
  VP8LPredictors[8] = Predictor8_SSE2;
  VP8LPredictors[9] = Predictor9_SSE2;
  VP8LPredictors[10] = Predictor10_SSE2;
  VP8LPredictors[11] = Predictor11_SSE2;
  VP8LPredictors[12] = Predictor12_SSE2;
  VP8LPredictors[13] = Predictor13_SSE2;

  VP8LPredictorsAdd[0] = PredictorAdd0_SSE2;
  VP8LPredictorsAdd[1] = PredictorAdd1_SSE2;
  VP8LPredictorsAdd[2] = PredictorAdd2_SSE2;
  VP8LPredictorsAdd[3] = PredictorAdd3_SSE2;
  VP8LPredictorsAdd[4] = PredictorAdd4_SSE2;
  VP8LPredictorsAdd[5] = PredictorAdd5_SSE2;
  VP8LPredictorsAdd[6] = PredictorAdd6_SSE2;
  VP8LPredictorsAdd[7] = PredictorAdd7_SSE2;
  VP8LPredictorsAdd[8] = PredictorAdd8_SSE2;
  VP8LPredictorsAdd[9] = PredictorAdd9_SSE2;
  VP8LPredictorsAdd[10] = PredictorAdd10_SSE2;
  VP8LPredictorsAdd[11] = PredictorAdd11_SSE2;
  VP8LPredictorsAdd[12] = PredictorAdd12_SSE2;
  VP8LPredictorsAdd[13] = PredictorAdd13_SSE2;

  VP8LAddGreenToBlueAndRed = AddGreenToBlueAndRed_SSE2;
  VP8LTransformColorInverse = TransformColorInverse_SSE2;

  VP8LConvertBGRAToRGB = ConvertBGRAToRGB_SSE2;
  VP8LConvertBGRAToRGBA = ConvertBGRAToRGBA_SSE2;
  VP8LConvertBGRAToRGBA4444 = ConvertBGRAToRGBA4444_SSE2;
  VP8LConvertBGRAToRGB565 = ConvertBGRAToRGB565_SSE2;
  VP8LConvertBGRAToBGR = ConvertBGRAToBGR_SSE2;
}

#else  // !WEBP_USE_SSE2

WEBP_DSP_INIT_STUB(VP8LDspInitSSE2)

#endif  // WEBP_USE_SSE2