xref: /aosp_15_r20/external/libvpx/vpx_dsp/x86/sad_avx2.c (revision fb1b10ab9aebc7c7068eedab379b749d7e3900be)
/*
 *  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */
10*fb1b10abSAndroid Build Coastguard Worker #include <immintrin.h>
11*fb1b10abSAndroid Build Coastguard Worker #include "./vpx_dsp_rtcd.h"
12*fb1b10abSAndroid Build Coastguard Worker #include "vpx_ports/mem.h"
13*fb1b10abSAndroid Build Coastguard Worker 
sad64xh_avx2(const uint8_t * src_ptr,int src_stride,const uint8_t * ref_ptr,int ref_stride,int h)14*fb1b10abSAndroid Build Coastguard Worker static INLINE unsigned int sad64xh_avx2(const uint8_t *src_ptr, int src_stride,
15*fb1b10abSAndroid Build Coastguard Worker                                         const uint8_t *ref_ptr, int ref_stride,
16*fb1b10abSAndroid Build Coastguard Worker                                         int h) {
17*fb1b10abSAndroid Build Coastguard Worker   int i, res;
18*fb1b10abSAndroid Build Coastguard Worker   __m256i sad1_reg, sad2_reg, ref1_reg, ref2_reg;
19*fb1b10abSAndroid Build Coastguard Worker   __m256i sum_sad = _mm256_setzero_si256();
20*fb1b10abSAndroid Build Coastguard Worker   __m256i sum_sad_h;
21*fb1b10abSAndroid Build Coastguard Worker   __m128i sum_sad128;
22*fb1b10abSAndroid Build Coastguard Worker   for (i = 0; i < h; i++) {
23*fb1b10abSAndroid Build Coastguard Worker     ref1_reg = _mm256_loadu_si256((__m256i const *)ref_ptr);
24*fb1b10abSAndroid Build Coastguard Worker     ref2_reg = _mm256_loadu_si256((__m256i const *)(ref_ptr + 32));
25*fb1b10abSAndroid Build Coastguard Worker     sad1_reg =
26*fb1b10abSAndroid Build Coastguard Worker         _mm256_sad_epu8(ref1_reg, _mm256_loadu_si256((__m256i const *)src_ptr));
27*fb1b10abSAndroid Build Coastguard Worker     sad2_reg = _mm256_sad_epu8(
28*fb1b10abSAndroid Build Coastguard Worker         ref2_reg, _mm256_loadu_si256((__m256i const *)(src_ptr + 32)));
29*fb1b10abSAndroid Build Coastguard Worker     sum_sad = _mm256_add_epi32(sum_sad, _mm256_add_epi32(sad1_reg, sad2_reg));
30*fb1b10abSAndroid Build Coastguard Worker     ref_ptr += ref_stride;
31*fb1b10abSAndroid Build Coastguard Worker     src_ptr += src_stride;
32*fb1b10abSAndroid Build Coastguard Worker   }
33*fb1b10abSAndroid Build Coastguard Worker   sum_sad_h = _mm256_srli_si256(sum_sad, 8);
34*fb1b10abSAndroid Build Coastguard Worker   sum_sad = _mm256_add_epi32(sum_sad, sum_sad_h);
35*fb1b10abSAndroid Build Coastguard Worker   sum_sad128 = _mm256_extracti128_si256(sum_sad, 1);
36*fb1b10abSAndroid Build Coastguard Worker   sum_sad128 = _mm_add_epi32(_mm256_castsi256_si128(sum_sad), sum_sad128);
37*fb1b10abSAndroid Build Coastguard Worker   res = _mm_cvtsi128_si32(sum_sad128);
38*fb1b10abSAndroid Build Coastguard Worker   return res;
39*fb1b10abSAndroid Build Coastguard Worker }
40*fb1b10abSAndroid Build Coastguard Worker 
sad32xh_avx2(const uint8_t * src_ptr,int src_stride,const uint8_t * ref_ptr,int ref_stride,int h)41*fb1b10abSAndroid Build Coastguard Worker static INLINE unsigned int sad32xh_avx2(const uint8_t *src_ptr, int src_stride,
42*fb1b10abSAndroid Build Coastguard Worker                                         const uint8_t *ref_ptr, int ref_stride,
43*fb1b10abSAndroid Build Coastguard Worker                                         int h) {
44*fb1b10abSAndroid Build Coastguard Worker   int i, res;
45*fb1b10abSAndroid Build Coastguard Worker   __m256i sad1_reg, sad2_reg, ref1_reg, ref2_reg;
46*fb1b10abSAndroid Build Coastguard Worker   __m256i sum_sad = _mm256_setzero_si256();
47*fb1b10abSAndroid Build Coastguard Worker   __m256i sum_sad_h;
48*fb1b10abSAndroid Build Coastguard Worker   __m128i sum_sad128;
49*fb1b10abSAndroid Build Coastguard Worker   const int ref2_stride = ref_stride << 1;
50*fb1b10abSAndroid Build Coastguard Worker   const int src2_stride = src_stride << 1;
51*fb1b10abSAndroid Build Coastguard Worker   const int max = h >> 1;
52*fb1b10abSAndroid Build Coastguard Worker   for (i = 0; i < max; i++) {
53*fb1b10abSAndroid Build Coastguard Worker     ref1_reg = _mm256_loadu_si256((__m256i const *)ref_ptr);
54*fb1b10abSAndroid Build Coastguard Worker     ref2_reg = _mm256_loadu_si256((__m256i const *)(ref_ptr + ref_stride));
55*fb1b10abSAndroid Build Coastguard Worker     sad1_reg =
56*fb1b10abSAndroid Build Coastguard Worker         _mm256_sad_epu8(ref1_reg, _mm256_loadu_si256((__m256i const *)src_ptr));
57*fb1b10abSAndroid Build Coastguard Worker     sad2_reg = _mm256_sad_epu8(
58*fb1b10abSAndroid Build Coastguard Worker         ref2_reg, _mm256_loadu_si256((__m256i const *)(src_ptr + src_stride)));
59*fb1b10abSAndroid Build Coastguard Worker     sum_sad = _mm256_add_epi32(sum_sad, _mm256_add_epi32(sad1_reg, sad2_reg));
60*fb1b10abSAndroid Build Coastguard Worker     ref_ptr += ref2_stride;
61*fb1b10abSAndroid Build Coastguard Worker     src_ptr += src2_stride;
62*fb1b10abSAndroid Build Coastguard Worker   }
63*fb1b10abSAndroid Build Coastguard Worker   sum_sad_h = _mm256_srli_si256(sum_sad, 8);
64*fb1b10abSAndroid Build Coastguard Worker   sum_sad = _mm256_add_epi32(sum_sad, sum_sad_h);
65*fb1b10abSAndroid Build Coastguard Worker   sum_sad128 = _mm256_extracti128_si256(sum_sad, 1);
66*fb1b10abSAndroid Build Coastguard Worker   sum_sad128 = _mm_add_epi32(_mm256_castsi256_si128(sum_sad), sum_sad128);
67*fb1b10abSAndroid Build Coastguard Worker   res = _mm_cvtsi128_si32(sum_sad128);
68*fb1b10abSAndroid Build Coastguard Worker   return res;
69*fb1b10abSAndroid Build Coastguard Worker }
70*fb1b10abSAndroid Build Coastguard Worker 
// Generates vpx_sad64x<h>_avx2(), which forwards its arguments to
// sad64xh_avx2() with h as the row count and returns that result.
#define FSAD64_H(h)                                                           \
  unsigned int vpx_sad64x##h##_avx2(const uint8_t *src_ptr, int src_stride,   \
                                    const uint8_t *ref_ptr, int ref_stride) { \
    const unsigned int sad =                                                  \
        sad64xh_avx2(src_ptr, src_stride, ref_ptr, ref_stride, h);            \
    return sad;                                                               \
  }
76*fb1b10abSAndroid Build Coastguard Worker 
// Generates vpx_sad_skip_64x<h>_avx2(). Both strides are doubled so that
// sad64xh_avx2() visits every other row, h / 2 rows are summed, and the
// function returns twice that partial sum.
#define FSADS64_H(h)                                                          \
  unsigned int vpx_sad_skip_64x##h##_avx2(                                    \
      const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr,         \
      int ref_stride) {                                                       \
    const unsigned int half_sad = sad64xh_avx2(                               \
        src_ptr, 2 * src_stride, ref_ptr, 2 * ref_stride, h / 2);             \
    return 2 * half_sad;                                                      \
  }
84*fb1b10abSAndroid Build Coastguard Worker 
// Generates vpx_sad32x<h>_avx2(), which forwards its arguments to
// sad32xh_avx2() with h as the row count and returns that result.
#define FSAD32_H(h)                                                           \
  unsigned int vpx_sad32x##h##_avx2(const uint8_t *src_ptr, int src_stride,   \
                                    const uint8_t *ref_ptr, int ref_stride) { \
    const unsigned int sad =                                                  \
        sad32xh_avx2(src_ptr, src_stride, ref_ptr, ref_stride, h);            \
    return sad;                                                               \
  }
90*fb1b10abSAndroid Build Coastguard Worker 
// Generates vpx_sad_skip_32x<h>_avx2(). Both strides are doubled so that
// sad32xh_avx2() visits every other row, h / 2 rows are summed, and the
// function returns twice that partial sum.
#define FSADS32_H(h)                                                          \
  unsigned int vpx_sad_skip_32x##h##_avx2(                                    \
      const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr,         \
      int ref_stride) {                                                       \
    const unsigned int half_sad = sad32xh_avx2(                               \
        src_ptr, 2 * src_stride, ref_ptr, 2 * ref_stride, h / 2);             \
    return 2 * half_sad;                                                      \
  }
98*fb1b10abSAndroid Build Coastguard Worker 
99*fb1b10abSAndroid Build Coastguard Worker #define FSAD64  \
100*fb1b10abSAndroid Build Coastguard Worker   FSAD64_H(64)  \
101*fb1b10abSAndroid Build Coastguard Worker   FSAD64_H(32)  \
102*fb1b10abSAndroid Build Coastguard Worker   FSADS64_H(64) \
103*fb1b10abSAndroid Build Coastguard Worker   FSADS64_H(32)
104*fb1b10abSAndroid Build Coastguard Worker 
105*fb1b10abSAndroid Build Coastguard Worker #define FSAD32  \
106*fb1b10abSAndroid Build Coastguard Worker   FSAD32_H(64)  \
107*fb1b10abSAndroid Build Coastguard Worker   FSAD32_H(32)  \
108*fb1b10abSAndroid Build Coastguard Worker   FSAD32_H(16)  \
109*fb1b10abSAndroid Build Coastguard Worker   FSADS32_H(64) \
110*fb1b10abSAndroid Build Coastguard Worker   FSADS32_H(32) \
111*fb1b10abSAndroid Build Coastguard Worker   FSADS32_H(16)
112*fb1b10abSAndroid Build Coastguard Worker 
113*fb1b10abSAndroid Build Coastguard Worker FSAD64
114*fb1b10abSAndroid Build Coastguard Worker FSAD32
115*fb1b10abSAndroid Build Coastguard Worker 
116*fb1b10abSAndroid Build Coastguard Worker #undef FSAD64
117*fb1b10abSAndroid Build Coastguard Worker #undef FSAD32
118*fb1b10abSAndroid Build Coastguard Worker #undef FSAD64_H
119*fb1b10abSAndroid Build Coastguard Worker #undef FSAD32_H
120*fb1b10abSAndroid Build Coastguard Worker #undef FSADS64_H
121*fb1b10abSAndroid Build Coastguard Worker #undef FSADS32_H
122*fb1b10abSAndroid Build Coastguard Worker 
// Generates vpx_sad64x<h>_avg_avx2(). For each of the h rows, the 64 reference
// bytes are averaged byte-wise with 64 bytes of second_pred, and the SAD of
// that average against the source row is accumulated. second_pred advances by
// 64 bytes per row, independent of either stride. Returns the low 32 bits of
// the accumulated total.
#define FSADAVG64_H(h)                                                        \
  unsigned int vpx_sad64x##h##_avg_avx2(                                      \
      const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr,         \
      int ref_stride, const uint8_t *second_pred) {                           \
    __m256i acc = _mm256_setzero_si256();                                     \
    __m128i total;                                                            \
    int row;                                                                  \
    for (row = 0; row < h; ++row) {                                           \
      const __m256i ref_lo = _mm256_loadu_si256((const __m256i *)ref_ptr);    \
      const __m256i ref_hi =                                                  \
          _mm256_loadu_si256((const __m256i *)(ref_ptr + 32));                \
      const __m256i pred_lo =                                                 \
          _mm256_loadu_si256((const __m256i *)second_pred);                   \
      const __m256i pred_hi =                                                 \
          _mm256_loadu_si256((const __m256i *)(second_pred + 32));            \
      const __m256i src_lo = _mm256_loadu_si256((const __m256i *)src_ptr);    \
      const __m256i src_hi =                                                  \
          _mm256_loadu_si256((const __m256i *)(src_ptr + 32));                \
      const __m256i sad_lo =                                                  \
          _mm256_sad_epu8(_mm256_avg_epu8(ref_lo, pred_lo), src_lo);          \
      const __m256i sad_hi =                                                  \
          _mm256_sad_epu8(_mm256_avg_epu8(ref_hi, pred_hi), src_hi);          \
      acc = _mm256_add_epi32(acc, _mm256_add_epi32(sad_lo, sad_hi));          \
      ref_ptr += ref_stride;                                                  \
      src_ptr += src_stride;                                                  \
      second_pred += 64;                                                      \
    }                                                                         \
    acc = _mm256_add_epi32(acc, _mm256_srli_si256(acc, 8));                   \
    total = _mm_add_epi32(_mm256_castsi256_si128(acc),                        \
                          _mm256_extracti128_si256(acc, 1));                  \
    return (unsigned int)_mm_cvtsi128_si32(total);                            \
  }
155*fb1b10abSAndroid Build Coastguard Worker 
// Generates vpx_sad32x<h>_avg_avx2(). Rows are handled two at a time: each
// 32-byte reference row is averaged byte-wise with the matching 32 bytes of
// second_pred, and the SAD of that average against the source row is
// accumulated. second_pred advances by 64 bytes per row pair, independent of
// either stride. Only h >> 1 pairs are visited, so h is expected to be even.
// Returns the low 32 bits of the accumulated total.
#define FSADAVG32_H(h)                                                        \
  unsigned int vpx_sad32x##h##_avg_avx2(                                      \
      const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr,         \
      int ref_stride, const uint8_t *second_pred) {                           \
    /* Multiply, not shift: left-shifting a negative stride is undefined. */  \
    const int ref_pair_stride = ref_stride * 2;                               \
    const int src_pair_stride = src_stride * 2;                               \
    const int pairs = h >> 1;                                                 \
    __m256i acc = _mm256_setzero_si256();                                     \
    __m128i total;                                                            \
    int pair;                                                                 \
    for (pair = 0; pair < pairs; ++pair) {                                    \
      const __m256i ref_a = _mm256_loadu_si256((const __m256i *)ref_ptr);     \
      const __m256i ref_b =                                                   \
          _mm256_loadu_si256((const __m256i *)(ref_ptr + ref_stride));        \
      const __m256i pred_a =                                                  \
          _mm256_loadu_si256((const __m256i *)second_pred);                   \
      const __m256i pred_b =                                                  \
          _mm256_loadu_si256((const __m256i *)(second_pred + 32));            \
      const __m256i src_a = _mm256_loadu_si256((const __m256i *)src_ptr);     \
      const __m256i src_b =                                                   \
          _mm256_loadu_si256((const __m256i *)(src_ptr + src_stride));        \
      const __m256i sad_a =                                                   \
          _mm256_sad_epu8(_mm256_avg_epu8(ref_a, pred_a), src_a);             \
      const __m256i sad_b =                                                   \
          _mm256_sad_epu8(_mm256_avg_epu8(ref_b, pred_b), src_b);             \
      acc = _mm256_add_epi32(acc, _mm256_add_epi32(sad_a, sad_b));            \
      ref_ptr += ref_pair_stride;                                             \
      src_ptr += src_pair_stride;                                             \
      second_pred += 64;                                                      \
    }                                                                         \
    acc = _mm256_add_epi32(acc, _mm256_srli_si256(acc, 8));                   \
    total = _mm_add_epi32(_mm256_castsi256_si128(acc),                        \
                          _mm256_extracti128_si256(acc, 1));                  \
    return (unsigned int)_mm_cvtsi128_si32(total);                            \
  }
192*fb1b10abSAndroid Build Coastguard Worker 
193*fb1b10abSAndroid Build Coastguard Worker #define FSADAVG64 \
194*fb1b10abSAndroid Build Coastguard Worker   FSADAVG64_H(64) \
195*fb1b10abSAndroid Build Coastguard Worker   FSADAVG64_H(32)
196*fb1b10abSAndroid Build Coastguard Worker 
197*fb1b10abSAndroid Build Coastguard Worker #define FSADAVG32 \
198*fb1b10abSAndroid Build Coastguard Worker   FSADAVG32_H(64) \
199*fb1b10abSAndroid Build Coastguard Worker   FSADAVG32_H(32) \
200*fb1b10abSAndroid Build Coastguard Worker   FSADAVG32_H(16)
201*fb1b10abSAndroid Build Coastguard Worker 
202*fb1b10abSAndroid Build Coastguard Worker FSADAVG64
203*fb1b10abSAndroid Build Coastguard Worker FSADAVG32
204*fb1b10abSAndroid Build Coastguard Worker 
205*fb1b10abSAndroid Build Coastguard Worker #undef FSADAVG64
206*fb1b10abSAndroid Build Coastguard Worker #undef FSADAVG32
207*fb1b10abSAndroid Build Coastguard Worker #undef FSADAVG64_H
208*fb1b10abSAndroid Build Coastguard Worker #undef FSADAVG32_H
209