xref: /aosp_15_r20/external/webp/src/dsp/ssim_sse2.c (revision b2055c353e87c8814eb2b6b1b11112a1562253bd)
1*b2055c35SXin Li // Copyright 2017 Google Inc. All Rights Reserved.
2*b2055c35SXin Li //
3*b2055c35SXin Li // Use of this source code is governed by a BSD-style license
4*b2055c35SXin Li // that can be found in the COPYING file in the root of the source
5*b2055c35SXin Li // tree. An additional intellectual property rights grant can be found
6*b2055c35SXin Li // in the file PATENTS. All contributing project authors may
7*b2055c35SXin Li // be found in the AUTHORS file in the root of the source tree.
8*b2055c35SXin Li // -----------------------------------------------------------------------------
9*b2055c35SXin Li //
10*b2055c35SXin Li // SSE2 version of distortion calculation
11*b2055c35SXin Li //
12*b2055c35SXin Li // Author: Skal ([email protected])
13*b2055c35SXin Li 
14*b2055c35SXin Li #include "src/dsp/dsp.h"
15*b2055c35SXin Li 
16*b2055c35SXin Li #if defined(WEBP_USE_SSE2)
17*b2055c35SXin Li 
18*b2055c35SXin Li #include <assert.h>
19*b2055c35SXin Li #include <emmintrin.h>
20*b2055c35SXin Li 
21*b2055c35SXin Li #include "src/dsp/common_sse2.h"
22*b2055c35SXin Li 
23*b2055c35SXin Li #if !defined(WEBP_DISABLE_STATS)
24*b2055c35SXin Li 
25*b2055c35SXin Li // Helper function
SubtractAndSquare_SSE2(const __m128i a,const __m128i b,__m128i * const sum)26*b2055c35SXin Li static WEBP_INLINE void SubtractAndSquare_SSE2(const __m128i a, const __m128i b,
27*b2055c35SXin Li                                                __m128i* const sum) {
28*b2055c35SXin Li   // take abs(a-b) in 8b
29*b2055c35SXin Li   const __m128i a_b = _mm_subs_epu8(a, b);
30*b2055c35SXin Li   const __m128i b_a = _mm_subs_epu8(b, a);
31*b2055c35SXin Li   const __m128i abs_a_b = _mm_or_si128(a_b, b_a);
32*b2055c35SXin Li   // zero-extend to 16b
33*b2055c35SXin Li   const __m128i zero = _mm_setzero_si128();
34*b2055c35SXin Li   const __m128i C0 = _mm_unpacklo_epi8(abs_a_b, zero);
35*b2055c35SXin Li   const __m128i C1 = _mm_unpackhi_epi8(abs_a_b, zero);
36*b2055c35SXin Li   // multiply with self
37*b2055c35SXin Li   const __m128i sum1 = _mm_madd_epi16(C0, C0);
38*b2055c35SXin Li   const __m128i sum2 = _mm_madd_epi16(C1, C1);
39*b2055c35SXin Li   *sum = _mm_add_epi32(sum1, sum2);
40*b2055c35SXin Li }
41*b2055c35SXin Li 
42*b2055c35SXin Li //------------------------------------------------------------------------------
43*b2055c35SXin Li // SSIM / PSNR entry point
44*b2055c35SXin Li 
// Returns sum((src1[i] - src2[i])^2) for i in [0, len).
// NOTE(review): the result is accumulated in 32 bits with no overflow check;
// presumably callers bound 'len' so that len * 255^2 fits — confirm at call
// sites.
static uint32_t AccumulateSSE_SSE2(const uint8_t* src1,
                                   const uint8_t* src2, int len) {
  int i = 0;
  uint32_t sse2 = 0;
  if (len >= 16) {
    const int limit = len - 32;
    int32_t tmp[4];
    __m128i sum1;
    __m128i sum = _mm_setzero_si128();
    // Software-pipelined loop: the (a0, b0) chunk for the next round is
    // always loaded before the previous chunk is squared, hiding load
    // latency. Do not reorder these statements.
    __m128i a0 = _mm_loadu_si128((const __m128i*)&src1[i]);
    __m128i b0 = _mm_loadu_si128((const __m128i*)&src2[i]);
    i += 16;
    while (i <= limit) {  // two 16-byte chunks per iteration
      const __m128i a1 = _mm_loadu_si128((const __m128i*)&src1[i]);
      const __m128i b1 = _mm_loadu_si128((const __m128i*)&src2[i]);
      __m128i sum2;
      i += 16;
      SubtractAndSquare_SSE2(a0, b0, &sum1);
      sum = _mm_add_epi32(sum, sum1);
      a0 = _mm_loadu_si128((const __m128i*)&src1[i]);
      b0 = _mm_loadu_si128((const __m128i*)&src2[i]);
      i += 16;
      SubtractAndSquare_SSE2(a1, b1, &sum2);
      sum = _mm_add_epi32(sum, sum2);
    }
    // Flush the chunk that was pre-loaded but not yet consumed.
    SubtractAndSquare_SSE2(a0, b0, &sum1);
    sum = _mm_add_epi32(sum, sum1);
    // Horizontal reduction of the four 32b partial sums.
    _mm_storeu_si128((__m128i*)tmp, sum);
    sse2 += (tmp[3] + tmp[2] + tmp[1] + tmp[0]);
  }

  // Scalar tail: up to 31 remaining bytes (the SIMD loop leaves between
  // 0 and 31 bytes unprocessed).
  for (; i < len; ++i) {
    const int32_t diff = src1[i] - src2[i];
    sse2 += diff * diff;
  }
  return sse2;
}
82*b2055c35SXin Li #endif  // !defined(WEBP_DISABLE_STATS)
83*b2055c35SXin Li 
84*b2055c35SXin Li #if !defined(WEBP_REDUCE_SIZE)
85*b2055c35SXin Li 
// Returns the sum of the eight 16b lanes of *m.
// (Intermediate adds are 16b and wrap; callers keep lane values small
// enough that pairwise sums stay below 2^16.)
static uint32_t HorizontalAdd16b_SSE2(const __m128i* const m) {
  uint16_t lanes[8];
  const __m128i upper = _mm_srli_si128(*m, 8);       // lanes 4..7 -> 0..3
  const __m128i folded = _mm_add_epi16(*m, upper);   // pairwise fold
  _mm_storeu_si128((__m128i*)lanes, folded);
  return (uint32_t)lanes[0] + lanes[1] + lanes[2] + lanes[3];
}
93*b2055c35SXin Li 
// Returns the sum of the four 32b lanes of *m (modulo 2^32).
static uint32_t HorizontalAdd32b_SSE2(const __m128i* const m) {
  __m128i v = _mm_add_epi32(*m, _mm_srli_si128(*m, 8));  // fold upper half
  v = _mm_add_epi32(v, _mm_srli_si128(v, 4));            // fold lane 1
  return (uint32_t)_mm_cvtsi128_si32(v);                 // lane 0 holds total
}
100*b2055c35SXin Li 
// Triangular weights for the 7-tap SSIM window (2 * VP8_SSIM_KERNEL + 1 == 7).
// The trailing 0 pads the table to 8 lanes so the 8th loaded byte does not
// contribute.
static const uint16_t kWeight[] = { 1, 2, 3, 4, 3, 2, 1, 0 };

// Accumulates one row of the window into the SSIM moments:
//   xm, ym   : weighted sums of src1/src2 pixels (16b lanes)
//   xxm, yym : weighted sums of squared pixels   (32b lanes)
//   xym      : weighted sum of cross-products    (32b lanes)
// Captures xm, ym, xxm, yym, xym, Wx, zero, src1, src2, stride1, stride2
// from the enclosing scope, and advances src1/src2 to the next row.
#define ACCUMULATE_ROW(WEIGHT) do {                         \
  /* compute row weight (Wx * Wy) */                        \
  const __m128i Wy = _mm_set1_epi16((WEIGHT));              \
  const __m128i W = _mm_mullo_epi16(Wx, Wy);                \
  /* process 8 bytes at a time (7 bytes, actually) */       \
  const __m128i a0 = _mm_loadl_epi64((const __m128i*)src1); \
  const __m128i b0 = _mm_loadl_epi64((const __m128i*)src2); \
  /* convert to 16b and multiply by weight */               \
  const __m128i a1 = _mm_unpacklo_epi8(a0, zero);           \
  const __m128i b1 = _mm_unpacklo_epi8(b0, zero);           \
  const __m128i wa1 = _mm_mullo_epi16(a1, W);               \
  const __m128i wb1 = _mm_mullo_epi16(b1, W);               \
  /* accumulate */                                          \
  xm  = _mm_add_epi16(xm, wa1);                             \
  ym  = _mm_add_epi16(ym, wb1);                             \
  xxm = _mm_add_epi32(xxm, _mm_madd_epi16(a1, wa1));        \
  xym = _mm_add_epi32(xym, _mm_madd_epi16(a1, wb1));        \
  yym = _mm_add_epi32(yym, _mm_madd_epi16(b1, wb1));        \
  src1 += stride1;                                          \
  src2 += stride2;                                          \
} while (0)

// Computes the SSIM score of the 7x7 window starting at src1/src2.
// NOTE(review): each row issues an 8-byte load although only 7 pixels
// contribute (kWeight[7] == 0) — the byte past the window must be readable;
// presumably guaranteed by the callers' border handling — confirm.
static double SSIMGet_SSE2(const uint8_t* src1, int stride1,
                           const uint8_t* src2, int stride2) {
  VP8DistoStats stats;
  const __m128i zero = _mm_setzero_si128();
  __m128i xm = zero, ym = zero;                // 16b accums
  __m128i xxm = zero, yym = zero, xym = zero;  // 32b accum
  const __m128i Wx = _mm_loadu_si128((const __m128i*)kWeight);
  assert(2 * VP8_SSIM_KERNEL + 1 == 7);
  // Walk the 7 rows; the per-row weights mirror the per-column ones.
  ACCUMULATE_ROW(1);
  ACCUMULATE_ROW(2);
  ACCUMULATE_ROW(3);
  ACCUMULATE_ROW(4);
  ACCUMULATE_ROW(3);
  ACCUMULATE_ROW(2);
  ACCUMULATE_ROW(1);
  // Reduce the vector accumulators to scalars and hand off to the shared
  // SSIM evaluation.
  stats.xm  = HorizontalAdd16b_SSE2(&xm);
  stats.ym  = HorizontalAdd16b_SSE2(&ym);
  stats.xxm = HorizontalAdd32b_SSE2(&xxm);
  stats.xym = HorizontalAdd32b_SSE2(&xym);
  stats.yym = HorizontalAdd32b_SSE2(&yym);
  return VP8SSIMFromStats(&stats);
}
147*b2055c35SXin Li 
148*b2055c35SXin Li #endif  // !defined(WEBP_REDUCE_SIZE)
149*b2055c35SXin Li 
extern void VP8SSIMDspInitSSE2(void);

// Entry point: installs the SSE2 implementations into the dispatch
// pointers (declared in dsp.h), honoring the same build-time feature
// gates that guard the implementations above.
WEBP_TSAN_IGNORE_FUNCTION void VP8SSIMDspInitSSE2(void) {
#if !defined(WEBP_DISABLE_STATS)
  VP8AccumulateSSE = AccumulateSSE_SSE2;
#endif
#if !defined(WEBP_REDUCE_SIZE)
  VP8SSIMGet = SSIMGet_SSE2;
#endif
}
160*b2055c35SXin Li 
161*b2055c35SXin Li #else  // !WEBP_USE_SSE2
162*b2055c35SXin Li 
163*b2055c35SXin Li WEBP_DSP_INIT_STUB(VP8SSIMDspInitSSE2)
164*b2055c35SXin Li 
165*b2055c35SXin Li #endif  // WEBP_USE_SSE2
166