xref: /aosp_15_r20/external/libaom/aom_dsp/x86/sad_avx2.c (revision 77c1e3ccc04c968bd2bc212e87364f250e820521)
1*77c1e3ccSAndroid Build Coastguard Worker /*
2*77c1e3ccSAndroid Build Coastguard Worker  * Copyright (c) 2016, Alliance for Open Media. All rights reserved.
3*77c1e3ccSAndroid Build Coastguard Worker  *
4*77c1e3ccSAndroid Build Coastguard Worker  * This source code is subject to the terms of the BSD 2 Clause License and
5*77c1e3ccSAndroid Build Coastguard Worker  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
6*77c1e3ccSAndroid Build Coastguard Worker  * was not distributed with this source code in the LICENSE file, you can
7*77c1e3ccSAndroid Build Coastguard Worker  * obtain it at www.aomedia.org/license/software. If the Alliance for Open
8*77c1e3ccSAndroid Build Coastguard Worker  * Media Patent License 1.0 was not distributed with this source code in the
9*77c1e3ccSAndroid Build Coastguard Worker  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
10*77c1e3ccSAndroid Build Coastguard Worker  */
11*77c1e3ccSAndroid Build Coastguard Worker #include <immintrin.h>
12*77c1e3ccSAndroid Build Coastguard Worker 
13*77c1e3ccSAndroid Build Coastguard Worker #include "config/aom_dsp_rtcd.h"
14*77c1e3ccSAndroid Build Coastguard Worker 
15*77c1e3ccSAndroid Build Coastguard Worker #include "aom_ports/mem.h"
16*77c1e3ccSAndroid Build Coastguard Worker 
/* Sum of absolute differences between two 64-byte-wide blocks of `h` rows.
 *
 * Every row is consumed as two unaligned 32-byte loads from each of src_ptr
 * and ref_ptr; after a row is processed the pointers advance by src_stride
 * and ref_stride respectively. The four partial sums produced by the SAD
 * instruction are folded together at the end and the low 32 bits of the total
 * are returned. A non-positive `h` yields 0.
 */
static inline unsigned int sad64xh_avx2(const uint8_t *src_ptr, int src_stride,
                                        const uint8_t *ref_ptr, int ref_stride,
                                        int h) {
  __m256i acc = _mm256_setzero_si256();
  int rows_left = h;

  while (rows_left > 0) {
    const __m256i ref_lo = _mm256_loadu_si256((const __m256i *)ref_ptr);
    const __m256i ref_hi = _mm256_loadu_si256((const __m256i *)(ref_ptr + 32));
    const __m256i src_lo = _mm256_loadu_si256((const __m256i *)src_ptr);
    const __m256i src_hi = _mm256_loadu_si256((const __m256i *)(src_ptr + 32));

    const __m256i row_sad = _mm256_add_epi32(_mm256_sad_epu8(ref_lo, src_lo),
                                             _mm256_sad_epu8(ref_hi, src_hi));
    acc = _mm256_add_epi32(acc, row_sad);

    ref_ptr += ref_stride;
    src_ptr += src_stride;
    --rows_left;
  }

  /* Fold the upper 128-bit lane onto the lower one, then the upper 64 bits
   * of that onto the lower 64 bits; the total ends up in the low dword. */
  __m128i folded = _mm_add_epi32(_mm256_castsi256_si128(acc),
                                 _mm256_extracti128_si256(acc, 1));
  folded = _mm_add_epi32(folded, _mm_srli_si128(folded, 8));

  const unsigned int total = (unsigned int)_mm_cvtsi128_si32(folded);
  _mm256_zeroupper();
  return total;
}
44*77c1e3ccSAndroid Build Coastguard Worker 
/* Sum of absolute differences between two 32-byte-wide blocks of `h` rows.
 *
 * Rows are processed two at a time (one unaligned 32-byte load per row from
 * each of src_ptr and ref_ptr) so that each loop iteration accumulates two
 * SAD results. If `h` is odd, the final unpaired row is accumulated
 * separately instead of being silently ignored. A non-positive `h` yields 0.
 *
 * Returns the low 32 bits of the accumulated total.
 */
static inline unsigned int sad32xh_avx2(const uint8_t *src_ptr, int src_stride,
                                        const uint8_t *ref_ptr, int ref_stride,
                                        int h) {
  __m256i sum_sad = _mm256_setzero_si256();
  /* Multiply instead of shifting: left-shifting a negative stride is
   * undefined behaviour in C. */
  const int ref2_stride = ref_stride * 2;
  const int src2_stride = src_stride * 2;
  const int row_pairs = h / 2;

  for (int i = 0; i < row_pairs; i++) {
    const __m256i ref1_reg = _mm256_loadu_si256((const __m256i *)ref_ptr);
    const __m256i ref2_reg =
        _mm256_loadu_si256((const __m256i *)(ref_ptr + ref_stride));
    const __m256i sad1_reg = _mm256_sad_epu8(
        ref1_reg, _mm256_loadu_si256((const __m256i *)src_ptr));
    const __m256i sad2_reg = _mm256_sad_epu8(
        ref2_reg, _mm256_loadu_si256((const __m256i *)(src_ptr + src_stride)));
    sum_sad = _mm256_add_epi32(sum_sad, _mm256_add_epi32(sad1_reg, sad2_reg));
    ref_ptr += ref2_stride;
    src_ptr += src2_stride;
  }

  /* Trailing row when the height is odd. */
  if (h > 0 && (h & 1)) {
    const __m256i ref_reg = _mm256_loadu_si256((const __m256i *)ref_ptr);
    const __m256i src_reg = _mm256_loadu_si256((const __m256i *)src_ptr);
    sum_sad = _mm256_add_epi32(sum_sad, _mm256_sad_epu8(ref_reg, src_reg));
  }

  /* Horizontal reduction: add the upper 64 bits of each 128-bit lane to the
   * lower 64 bits, then add the two lanes together. */
  const __m256i sum_sad_h = _mm256_srli_si256(sum_sad, 8);
  sum_sad = _mm256_add_epi32(sum_sad, sum_sad_h);
  __m128i sum_sad128 = _mm256_extracti128_si256(sum_sad, 1);
  sum_sad128 = _mm_add_epi32(_mm256_castsi256_si128(sum_sad), sum_sad128);

  const unsigned int res = (unsigned int)_mm_cvtsi128_si32(sum_sad128);
  _mm256_zeroupper();
  return res;
}
75*77c1e3ccSAndroid Build Coastguard Worker 
/* Generates aom_sad64x<height>_avx2(): forwards its arguments unchanged to
 * sad64xh_avx2() with the row count fixed to `height`. */
#define FSAD64_H(height)                                                     \
  unsigned int aom_sad64x##height##_avx2(                                    \
      const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr,        \
      int ref_stride) {                                                      \
    const unsigned int sad =                                                 \
        sad64xh_avx2(src_ptr, src_stride, ref_ptr, ref_stride, height);      \
    return sad;                                                              \
  }
81*77c1e3ccSAndroid Build Coastguard Worker 
/* Generates aom_sad_skip_64x<height>_avx2(): calls sad64xh_avx2() with both
 * strides doubled and half the row count, so only every other row is
 * visited, then doubles the returned sum. */
#define FSADS64_H(height)                                                    \
  unsigned int aom_sad_skip_64x##height##_avx2(                              \
      const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr,        \
      int ref_stride) {                                                      \
    const unsigned int half_rows_sad = sad64xh_avx2(                         \
        src_ptr, src_stride * 2, ref_ptr, ref_stride * 2, (height) / 2);     \
    return 2 * half_rows_sad;                                                \
  }
89*77c1e3ccSAndroid Build Coastguard Worker 
/* Generates aom_sad32x<height>_avx2(): forwards its arguments unchanged to
 * sad32xh_avx2() with the row count fixed to `height`. */
#define FSAD32_H(height)                                                     \
  unsigned int aom_sad32x##height##_avx2(                                    \
      const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr,        \
      int ref_stride) {                                                      \
    const unsigned int sad =                                                 \
        sad32xh_avx2(src_ptr, src_stride, ref_ptr, ref_stride, height);      \
    return sad;                                                              \
  }
95*77c1e3ccSAndroid Build Coastguard Worker 
/* Generates aom_sad_skip_32x<height>_avx2(): calls sad32xh_avx2() with both
 * strides doubled and half the row count, so only every other row is
 * visited, then doubles the returned sum. */
#define FSADS32_H(height)                                                    \
  unsigned int aom_sad_skip_32x##height##_avx2(                              \
      const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr,        \
      int ref_stride) {                                                      \
    const unsigned int half_rows_sad = sad32xh_avx2(                         \
        src_ptr, src_stride * 2, ref_ptr, ref_stride * 2, (height) / 2);     \
    return 2 * half_rows_sad;                                                \
  }
103*77c1e3ccSAndroid Build Coastguard Worker 
104*77c1e3ccSAndroid Build Coastguard Worker #define FSAD64  \
105*77c1e3ccSAndroid Build Coastguard Worker   FSAD64_H(64)  \
106*77c1e3ccSAndroid Build Coastguard Worker   FSAD64_H(32)  \
107*77c1e3ccSAndroid Build Coastguard Worker   FSADS64_H(64) \
108*77c1e3ccSAndroid Build Coastguard Worker   FSADS64_H(32)
109*77c1e3ccSAndroid Build Coastguard Worker 
110*77c1e3ccSAndroid Build Coastguard Worker #define FSAD32  \
111*77c1e3ccSAndroid Build Coastguard Worker   FSAD32_H(64)  \
112*77c1e3ccSAndroid Build Coastguard Worker   FSAD32_H(32)  \
113*77c1e3ccSAndroid Build Coastguard Worker   FSAD32_H(16)  \
114*77c1e3ccSAndroid Build Coastguard Worker   FSADS32_H(64) \
115*77c1e3ccSAndroid Build Coastguard Worker   FSADS32_H(32) \
116*77c1e3ccSAndroid Build Coastguard Worker   FSADS32_H(16)
117*77c1e3ccSAndroid Build Coastguard Worker 
118*77c1e3ccSAndroid Build Coastguard Worker /* clang-format off */
119*77c1e3ccSAndroid Build Coastguard Worker FSAD64
120*77c1e3ccSAndroid Build Coastguard Worker FSAD32
121*77c1e3ccSAndroid Build Coastguard Worker /* clang-format on */
122*77c1e3ccSAndroid Build Coastguard Worker 
123*77c1e3ccSAndroid Build Coastguard Worker #undef FSAD64
124*77c1e3ccSAndroid Build Coastguard Worker #undef FSAD32
125*77c1e3ccSAndroid Build Coastguard Worker #undef FSAD64_H
126*77c1e3ccSAndroid Build Coastguard Worker #undef FSAD32_H
127*77c1e3ccSAndroid Build Coastguard Worker 
/* SAD between a 64-byte-wide source block and the rounded average of a
 * reference block with a second predictor, over `h` rows.
 *
 * Per row: two unaligned 32-byte loads are taken from each of ref_ptr,
 * second_pred and src_ptr; the reference halves are averaged with the
 * predictor halves (_mm256_avg_epu8) and the result is compared against the
 * source (_mm256_sad_epu8). src_ptr and ref_ptr advance by their strides,
 * second_pred advances by 64 bytes per row.
 *
 * Returns the low 32 bits of the accumulated total; a non-positive `h`
 * yields 0.
 */
static inline unsigned int sad64xh_avg_avx2(const uint8_t *src_ptr,
                                            int src_stride,
                                            const uint8_t *ref_ptr,
                                            int ref_stride,
                                            const uint8_t *second_pred,
                                            int h) {
  __m256i sum_sad = _mm256_setzero_si256();

  for (int i = 0; i < h; i++) {
    __m256i ref1_reg = _mm256_loadu_si256((const __m256i *)ref_ptr);
    __m256i ref2_reg = _mm256_loadu_si256((const __m256i *)(ref_ptr + 32));
    ref1_reg = _mm256_avg_epu8(
        ref1_reg, _mm256_loadu_si256((const __m256i *)second_pred));
    ref2_reg = _mm256_avg_epu8(
        ref2_reg, _mm256_loadu_si256((const __m256i *)(second_pred + 32)));
    const __m256i sad1_reg = _mm256_sad_epu8(
        ref1_reg, _mm256_loadu_si256((const __m256i *)src_ptr));
    const __m256i sad2_reg = _mm256_sad_epu8(
        ref2_reg, _mm256_loadu_si256((const __m256i *)(src_ptr + 32)));
    sum_sad = _mm256_add_epi32(sum_sad, _mm256_add_epi32(sad1_reg, sad2_reg));
    ref_ptr += ref_stride;
    src_ptr += src_stride;
    second_pred += 64;
  }

  /* Horizontal reduction: add the upper 64 bits of each 128-bit lane to the
   * lower 64 bits, then add the two lanes together. */
  const __m256i sum_sad_h = _mm256_srli_si256(sum_sad, 8);
  sum_sad = _mm256_add_epi32(sum_sad, sum_sad_h);
  __m128i sum_sad128 = _mm256_extracti128_si256(sum_sad, 1);
  sum_sad128 = _mm_add_epi32(_mm256_castsi256_si128(sum_sad), sum_sad128);

  const unsigned int res = (unsigned int)_mm_cvtsi128_si32(sum_sad128);
  _mm256_zeroupper();
  return res;
}

/* Generates aom_sad64x<h>_avg_avx2() as a thin wrapper around
 * sad64xh_avg_avx2() with the row count fixed to `h`. */
#define FSADAVG64_H(h)                                                        \
  unsigned int aom_sad64x##h##_avg_avx2(                                      \
      const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr,         \
      int ref_stride, const uint8_t *second_pred) {                           \
    return sad64xh_avg_avx2(src_ptr, src_stride, ref_ptr, ref_stride,         \
                            second_pred, h);                                  \
  }
162*77c1e3ccSAndroid Build Coastguard Worker 
/* SAD between a 32-byte-wide source block and the rounded average of a
 * reference block with a second predictor, over `h` rows.
 *
 * Rows are processed two at a time. For each pair, one unaligned 32-byte
 * load per row is taken from ref_ptr and src_ptr (rows separated by the
 * respective stride), while the matching predictor rows are read from
 * second_pred and second_pred + 32; second_pred then advances by 64 bytes.
 * The reference rows are averaged with the predictor rows
 * (_mm256_avg_epu8) before the SAD against the source is taken. If `h` is
 * odd, the final unpaired row is accumulated separately.
 *
 * Returns the low 32 bits of the accumulated total; a non-positive `h`
 * yields 0.
 */
static inline unsigned int sad32xh_avg_avx2(const uint8_t *src_ptr,
                                            int src_stride,
                                            const uint8_t *ref_ptr,
                                            int ref_stride,
                                            const uint8_t *second_pred,
                                            int h) {
  __m256i sum_sad = _mm256_setzero_si256();
  /* Multiply instead of shifting: left-shifting a negative stride is
   * undefined behaviour in C. */
  const int ref2_stride = ref_stride * 2;
  const int src2_stride = src_stride * 2;
  const int row_pairs = h / 2;

  for (int i = 0; i < row_pairs; i++) {
    __m256i ref1_reg = _mm256_loadu_si256((const __m256i *)ref_ptr);
    __m256i ref2_reg =
        _mm256_loadu_si256((const __m256i *)(ref_ptr + ref_stride));
    ref1_reg = _mm256_avg_epu8(
        ref1_reg, _mm256_loadu_si256((const __m256i *)second_pred));
    ref2_reg = _mm256_avg_epu8(
        ref2_reg, _mm256_loadu_si256((const __m256i *)(second_pred + 32)));
    const __m256i sad1_reg = _mm256_sad_epu8(
        ref1_reg, _mm256_loadu_si256((const __m256i *)src_ptr));
    const __m256i sad2_reg = _mm256_sad_epu8(
        ref2_reg, _mm256_loadu_si256((const __m256i *)(src_ptr + src_stride)));
    sum_sad = _mm256_add_epi32(sum_sad, _mm256_add_epi32(sad1_reg, sad2_reg));
    ref_ptr += ref2_stride;
    src_ptr += src2_stride;
    second_pred += 64;
  }

  /* Trailing row when the height is odd. */
  if (h > 0 && (h & 1)) {
    const __m256i avg_reg = _mm256_avg_epu8(
        _mm256_loadu_si256((const __m256i *)ref_ptr),
        _mm256_loadu_si256((const __m256i *)second_pred));
    sum_sad = _mm256_add_epi32(
        sum_sad, _mm256_sad_epu8(
                     avg_reg, _mm256_loadu_si256((const __m256i *)src_ptr)));
  }

  /* Horizontal reduction: add the upper 64 bits of each 128-bit lane to the
   * lower 64 bits, then add the two lanes together. */
  const __m256i sum_sad_h = _mm256_srli_si256(sum_sad, 8);
  sum_sad = _mm256_add_epi32(sum_sad, sum_sad_h);
  __m128i sum_sad128 = _mm256_extracti128_si256(sum_sad, 1);
  sum_sad128 = _mm_add_epi32(_mm256_castsi256_si128(sum_sad), sum_sad128);

  const unsigned int res = (unsigned int)_mm_cvtsi128_si32(sum_sad128);
  _mm256_zeroupper();
  return res;
}

/* Generates aom_sad32x<h>_avg_avx2() as a thin wrapper around
 * sad32xh_avg_avx2() with the row count fixed to `h`. */
#define FSADAVG32_H(h)                                                        \
  unsigned int aom_sad32x##h##_avg_avx2(                                      \
      const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr,         \
      int ref_stride, const uint8_t *second_pred) {                           \
    return sad32xh_avg_avx2(src_ptr, src_stride, ref_ptr, ref_stride,         \
                            second_pred, h);                                  \
  }
201*77c1e3ccSAndroid Build Coastguard Worker 
202*77c1e3ccSAndroid Build Coastguard Worker #define FSADAVG64 \
203*77c1e3ccSAndroid Build Coastguard Worker   FSADAVG64_H(64) \
204*77c1e3ccSAndroid Build Coastguard Worker   FSADAVG64_H(32)
205*77c1e3ccSAndroid Build Coastguard Worker 
206*77c1e3ccSAndroid Build Coastguard Worker #define FSADAVG32 \
207*77c1e3ccSAndroid Build Coastguard Worker   FSADAVG32_H(64) \
208*77c1e3ccSAndroid Build Coastguard Worker   FSADAVG32_H(32) \
209*77c1e3ccSAndroid Build Coastguard Worker   FSADAVG32_H(16)
210*77c1e3ccSAndroid Build Coastguard Worker 
211*77c1e3ccSAndroid Build Coastguard Worker /* clang-format off */
212*77c1e3ccSAndroid Build Coastguard Worker FSADAVG64
213*77c1e3ccSAndroid Build Coastguard Worker FSADAVG32
214*77c1e3ccSAndroid Build Coastguard Worker /* clang-format on */
215*77c1e3ccSAndroid Build Coastguard Worker 
216*77c1e3ccSAndroid Build Coastguard Worker #undef FSADAVG64
217*77c1e3ccSAndroid Build Coastguard Worker #undef FSADAVG32
218*77c1e3ccSAndroid Build Coastguard Worker #undef FSADAVG64_H
219*77c1e3ccSAndroid Build Coastguard Worker #undef FSADAVG32_H
220