1*b2055c35SXin Li // Copyright 2021 Google Inc. All Rights Reserved.
2*b2055c35SXin Li //
3*b2055c35SXin Li // Use of this source code is governed by a BSD-style license
4*b2055c35SXin Li // that can be found in the COPYING file in the root of the source
5*b2055c35SXin Li // tree. An additional intellectual property rights grant can be found
6*b2055c35SXin Li // in the file PATENTS. All contributing project authors may
7*b2055c35SXin Li // be found in the AUTHORS file in the root of the source tree.
8*b2055c35SXin Li // -----------------------------------------------------------------------------
9*b2055c35SXin Li //
10*b2055c35SXin Li // SSE41 variant of methods for lossless decoder
11*b2055c35SXin Li
12*b2055c35SXin Li #include "src/dsp/dsp.h"
13*b2055c35SXin Li
14*b2055c35SXin Li #if defined(WEBP_USE_SSE41)
15*b2055c35SXin Li
16*b2055c35SXin Li #include "src/dsp/common_sse41.h"
17*b2055c35SXin Li #include "src/dsp/lossless.h"
18*b2055c35SXin Li #include "src/dsp/lossless_common.h"
19*b2055c35SXin Li
20*b2055c35SXin Li //------------------------------------------------------------------------------
21*b2055c35SXin Li // Color-space conversion functions
22*b2055c35SXin Li
TransformColorInverse_SSE41(const VP8LMultipliers * const m,const uint32_t * const src,int num_pixels,uint32_t * dst)23*b2055c35SXin Li static void TransformColorInverse_SSE41(const VP8LMultipliers* const m,
24*b2055c35SXin Li const uint32_t* const src,
25*b2055c35SXin Li int num_pixels, uint32_t* dst) {
26*b2055c35SXin Li // sign-extended multiplying constants, pre-shifted by 5.
27*b2055c35SXin Li #define CST(X) (((int16_t)(m->X << 8)) >> 5) // sign-extend
28*b2055c35SXin Li const __m128i mults_rb =
29*b2055c35SXin Li _mm_set1_epi32((int)((uint32_t)CST(green_to_red_) << 16 |
30*b2055c35SXin Li (CST(green_to_blue_) & 0xffff)));
31*b2055c35SXin Li const __m128i mults_b2 = _mm_set1_epi32(CST(red_to_blue_));
32*b2055c35SXin Li #undef CST
33*b2055c35SXin Li const __m128i mask_ag = _mm_set1_epi32((int)0xff00ff00);
34*b2055c35SXin Li const __m128i perm1 = _mm_setr_epi8(-1, 1, -1, 1, -1, 5, -1, 5,
35*b2055c35SXin Li -1, 9, -1, 9, -1, 13, -1, 13);
36*b2055c35SXin Li const __m128i perm2 = _mm_setr_epi8(-1, 2, -1, -1, -1, 6, -1, -1,
37*b2055c35SXin Li -1, 10, -1, -1, -1, 14, -1, -1);
38*b2055c35SXin Li int i;
39*b2055c35SXin Li for (i = 0; i + 4 <= num_pixels; i += 4) {
40*b2055c35SXin Li const __m128i A = _mm_loadu_si128((const __m128i*)(src + i));
41*b2055c35SXin Li const __m128i B = _mm_shuffle_epi8(A, perm1); // argb -> g0g0
42*b2055c35SXin Li const __m128i C = _mm_mulhi_epi16(B, mults_rb);
43*b2055c35SXin Li const __m128i D = _mm_add_epi8(A, C);
44*b2055c35SXin Li const __m128i E = _mm_shuffle_epi8(D, perm2);
45*b2055c35SXin Li const __m128i F = _mm_mulhi_epi16(E, mults_b2);
46*b2055c35SXin Li const __m128i G = _mm_add_epi8(D, F);
47*b2055c35SXin Li const __m128i out = _mm_blendv_epi8(G, A, mask_ag);
48*b2055c35SXin Li _mm_storeu_si128((__m128i*)&dst[i], out);
49*b2055c35SXin Li }
50*b2055c35SXin Li // Fall-back to C-version for left-overs.
51*b2055c35SXin Li if (i != num_pixels) {
52*b2055c35SXin Li VP8LTransformColorInverse_C(m, src + i, num_pixels - i, dst + i);
53*b2055c35SXin Li }
54*b2055c35SXin Li }
55*b2055c35SXin Li
56*b2055c35SXin Li //------------------------------------------------------------------------------
57*b2055c35SXin Li
// Packs 16 pixels per iteration from 4 bytes per pixel down to 3 bytes per
// pixel. The macro relies on names supplied by the function it is expanded in:
//   in         - read cursor, advanced by 4 vectors per iteration
//   out        - write cursor, advanced by 3 vectors per iteration
//   num_pixels - remaining pixel count, decremented by 16 per iteration
//   perm0..3   - byte-shuffle masks for the 1st..4th input vector
// On exit fewer than 16 pixels remain, and 'in' / 'out' point at them.
#define ARGB_TO_RGB_SSE41 do {                                               \
  for (; num_pixels >= 16; num_pixels -= 16, in += 4, out += 3) {            \
    /* Load all 64 input bytes first, shuffling each quarter on the way. */  \
    const __m128i quad0 = _mm_shuffle_epi8(_mm_loadu_si128(in + 0), perm0);  \
    const __m128i quad1 = _mm_shuffle_epi8(_mm_loadu_si128(in + 1), perm1);  \
    const __m128i quad2 = _mm_shuffle_epi8(_mm_loadu_si128(in + 2), perm2);  \
    const __m128i quad3 = _mm_shuffle_epi8(_mm_loadu_si128(in + 3), perm3);  \
    /* Blend masks pick 16-bit lanes from the second operand: */             \
    /* 0xc0 -> lanes 6-7, 0xf0 -> lanes 4-7, 0xfc -> lanes 2-7. */           \
    const __m128i packed0 = _mm_blend_epi16(quad0, quad1, 0xc0);             \
    const __m128i packed1 = _mm_blend_epi16(quad1, quad2, 0xf0);             \
    const __m128i packed2 = _mm_blend_epi16(quad2, quad3, 0xfc);             \
    _mm_storeu_si128(out + 0, packed0);                                      \
    _mm_storeu_si128(out + 1, packed1);                                      \
    _mm_storeu_si128(out + 2, packed2);                                      \
  }                                                                          \
} while (0)
79*b2055c35SXin Li
ConvertBGRAToRGB_SSE41(const uint32_t * src,int num_pixels,uint8_t * dst)80*b2055c35SXin Li static void ConvertBGRAToRGB_SSE41(const uint32_t* src, int num_pixels,
81*b2055c35SXin Li uint8_t* dst) {
82*b2055c35SXin Li const __m128i* in = (const __m128i*)src;
83*b2055c35SXin Li __m128i* out = (__m128i*)dst;
84*b2055c35SXin Li const __m128i perm0 = _mm_setr_epi8(2, 1, 0, 6, 5, 4, 10, 9,
85*b2055c35SXin Li 8, 14, 13, 12, -1, -1, -1, -1);
86*b2055c35SXin Li const __m128i perm1 = _mm_shuffle_epi32(perm0, 0x39);
87*b2055c35SXin Li const __m128i perm2 = _mm_shuffle_epi32(perm0, 0x4e);
88*b2055c35SXin Li const __m128i perm3 = _mm_shuffle_epi32(perm0, 0x93);
89*b2055c35SXin Li
90*b2055c35SXin Li ARGB_TO_RGB_SSE41;
91*b2055c35SXin Li
92*b2055c35SXin Li // left-overs
93*b2055c35SXin Li if (num_pixels > 0) {
94*b2055c35SXin Li VP8LConvertBGRAToRGB_C((const uint32_t*)in, num_pixels, (uint8_t*)out);
95*b2055c35SXin Li }
96*b2055c35SXin Li }
97*b2055c35SXin Li
ConvertBGRAToBGR_SSE41(const uint32_t * src,int num_pixels,uint8_t * dst)98*b2055c35SXin Li static void ConvertBGRAToBGR_SSE41(const uint32_t* src,
99*b2055c35SXin Li int num_pixels, uint8_t* dst) {
100*b2055c35SXin Li const __m128i* in = (const __m128i*)src;
101*b2055c35SXin Li __m128i* out = (__m128i*)dst;
102*b2055c35SXin Li const __m128i perm0 = _mm_setr_epi8(0, 1, 2, 4, 5, 6, 8, 9, 10,
103*b2055c35SXin Li 12, 13, 14, -1, -1, -1, -1);
104*b2055c35SXin Li const __m128i perm1 = _mm_shuffle_epi32(perm0, 0x39);
105*b2055c35SXin Li const __m128i perm2 = _mm_shuffle_epi32(perm0, 0x4e);
106*b2055c35SXin Li const __m128i perm3 = _mm_shuffle_epi32(perm0, 0x93);
107*b2055c35SXin Li
108*b2055c35SXin Li ARGB_TO_RGB_SSE41;
109*b2055c35SXin Li
110*b2055c35SXin Li // left-overs
111*b2055c35SXin Li if (num_pixels > 0) {
112*b2055c35SXin Li VP8LConvertBGRAToBGR_C((const uint32_t*)in, num_pixels, (uint8_t*)out);
113*b2055c35SXin Li }
114*b2055c35SXin Li }
115*b2055c35SXin Li
116*b2055c35SXin Li #undef ARGB_TO_RGB_SSE41
117*b2055c35SXin Li
118*b2055c35SXin Li //------------------------------------------------------------------------------
119*b2055c35SXin Li // Entry point
120*b2055c35SXin Li
121*b2055c35SXin Li extern void VP8LDspInitSSE41(void);
122*b2055c35SXin Li
VP8LDspInitSSE41(void)123*b2055c35SXin Li WEBP_TSAN_IGNORE_FUNCTION void VP8LDspInitSSE41(void) {
124*b2055c35SXin Li VP8LTransformColorInverse = TransformColorInverse_SSE41;
125*b2055c35SXin Li VP8LConvertBGRAToRGB = ConvertBGRAToRGB_SSE41;
126*b2055c35SXin Li VP8LConvertBGRAToBGR = ConvertBGRAToBGR_SSE41;
127*b2055c35SXin Li }
128*b2055c35SXin Li
129*b2055c35SXin Li #else // !WEBP_USE_SSE41
130*b2055c35SXin Li
// WEBP_USE_SSE41 is not defined in this branch, so none of the SSE4.1 code
// above is compiled. NOTE(review): WEBP_DSP_INIT_STUB presumably expands to an
// empty VP8LDspInitSSE41() definition so the symbol still exists -- confirm
// against its definition in src/dsp/dsp.h.
131*b2055c35SXin Li WEBP_DSP_INIT_STUB(VP8LDspInitSSE41)
132*b2055c35SXin Li
133*b2055c35SXin Li #endif // WEBP_USE_SSE41
134