xref: /aosp_15_r20/external/libaom/aom_dsp/x86/sad4d_avx2.c (revision 77c1e3ccc04c968bd2bc212e87364f250e820521)
/*
 * Copyright (c) 2016, Alliance for Open Media. All rights reserved.
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */
#include <assert.h>
#include <immintrin.h>  // AVX2

#include "config/aom_dsp_rtcd.h"

#include "aom/aom_integer.h"
#include "aom_dsp/x86/synonyms_avx2.h"
17*77c1e3ccSAndroid Build Coastguard Worker 
aggregate_and_store_sum(uint32_t res[4],const __m256i * sum_ref0,const __m256i * sum_ref1,const __m256i * sum_ref2,const __m256i * sum_ref3)18*77c1e3ccSAndroid Build Coastguard Worker static AOM_FORCE_INLINE void aggregate_and_store_sum(uint32_t res[4],
19*77c1e3ccSAndroid Build Coastguard Worker                                                      const __m256i *sum_ref0,
20*77c1e3ccSAndroid Build Coastguard Worker                                                      const __m256i *sum_ref1,
21*77c1e3ccSAndroid Build Coastguard Worker                                                      const __m256i *sum_ref2,
22*77c1e3ccSAndroid Build Coastguard Worker                                                      const __m256i *sum_ref3) {
23*77c1e3ccSAndroid Build Coastguard Worker   // In sum_ref-i the result is saved in the first 4 bytes and the other 4
24*77c1e3ccSAndroid Build Coastguard Worker   // bytes are zeroed.
25*77c1e3ccSAndroid Build Coastguard Worker   // merge sum_ref0 and sum_ref1 also sum_ref2 and sum_ref3
26*77c1e3ccSAndroid Build Coastguard Worker   // 0, 0, 1, 1
27*77c1e3ccSAndroid Build Coastguard Worker   __m256i sum_ref01 = _mm256_castps_si256(_mm256_shuffle_ps(
28*77c1e3ccSAndroid Build Coastguard Worker       _mm256_castsi256_ps(*sum_ref0), _mm256_castsi256_ps(*sum_ref1),
29*77c1e3ccSAndroid Build Coastguard Worker       _MM_SHUFFLE(2, 0, 2, 0)));
30*77c1e3ccSAndroid Build Coastguard Worker   // 2, 2, 3, 3
31*77c1e3ccSAndroid Build Coastguard Worker   __m256i sum_ref23 = _mm256_castps_si256(_mm256_shuffle_ps(
32*77c1e3ccSAndroid Build Coastguard Worker       _mm256_castsi256_ps(*sum_ref2), _mm256_castsi256_ps(*sum_ref3),
33*77c1e3ccSAndroid Build Coastguard Worker       _MM_SHUFFLE(2, 0, 2, 0)));
34*77c1e3ccSAndroid Build Coastguard Worker 
35*77c1e3ccSAndroid Build Coastguard Worker   // sum adjacent 32 bit integers
36*77c1e3ccSAndroid Build Coastguard Worker   __m256i sum_ref0123 = _mm256_hadd_epi32(sum_ref01, sum_ref23);
37*77c1e3ccSAndroid Build Coastguard Worker 
38*77c1e3ccSAndroid Build Coastguard Worker   // add the low 128 bit to the high 128 bit
39*77c1e3ccSAndroid Build Coastguard Worker   __m128i sum = _mm_add_epi32(_mm256_castsi256_si128(sum_ref0123),
40*77c1e3ccSAndroid Build Coastguard Worker                               _mm256_extractf128_si256(sum_ref0123, 1));
41*77c1e3ccSAndroid Build Coastguard Worker 
42*77c1e3ccSAndroid Build Coastguard Worker   _mm_storeu_si128((__m128i *)(res), sum);
43*77c1e3ccSAndroid Build Coastguard Worker }
44*77c1e3ccSAndroid Build Coastguard Worker 
aom_sadMxNx4d_avx2(int M,int N,const uint8_t * src,int src_stride,const uint8_t * const ref[4],int ref_stride,uint32_t res[4])45*77c1e3ccSAndroid Build Coastguard Worker static AOM_FORCE_INLINE void aom_sadMxNx4d_avx2(
46*77c1e3ccSAndroid Build Coastguard Worker     int M, int N, const uint8_t *src, int src_stride,
47*77c1e3ccSAndroid Build Coastguard Worker     const uint8_t *const ref[4], int ref_stride, uint32_t res[4]) {
48*77c1e3ccSAndroid Build Coastguard Worker   __m256i src_reg, ref0_reg, ref1_reg, ref2_reg, ref3_reg;
49*77c1e3ccSAndroid Build Coastguard Worker   __m256i sum_ref0, sum_ref1, sum_ref2, sum_ref3;
50*77c1e3ccSAndroid Build Coastguard Worker   int i, j;
51*77c1e3ccSAndroid Build Coastguard Worker   const uint8_t *ref0, *ref1, *ref2, *ref3;
52*77c1e3ccSAndroid Build Coastguard Worker 
53*77c1e3ccSAndroid Build Coastguard Worker   ref0 = ref[0];
54*77c1e3ccSAndroid Build Coastguard Worker   ref1 = ref[1];
55*77c1e3ccSAndroid Build Coastguard Worker   ref2 = ref[2];
56*77c1e3ccSAndroid Build Coastguard Worker   ref3 = ref[3];
57*77c1e3ccSAndroid Build Coastguard Worker   sum_ref0 = _mm256_setzero_si256();
58*77c1e3ccSAndroid Build Coastguard Worker   sum_ref2 = _mm256_setzero_si256();
59*77c1e3ccSAndroid Build Coastguard Worker   sum_ref1 = _mm256_setzero_si256();
60*77c1e3ccSAndroid Build Coastguard Worker   sum_ref3 = _mm256_setzero_si256();
61*77c1e3ccSAndroid Build Coastguard Worker 
62*77c1e3ccSAndroid Build Coastguard Worker   for (i = 0; i < N; i++) {
63*77c1e3ccSAndroid Build Coastguard Worker     for (j = 0; j < M; j += 32) {
64*77c1e3ccSAndroid Build Coastguard Worker       // load src and all refs
65*77c1e3ccSAndroid Build Coastguard Worker       src_reg = _mm256_loadu_si256((const __m256i *)(src + j));
66*77c1e3ccSAndroid Build Coastguard Worker       ref0_reg = _mm256_loadu_si256((const __m256i *)(ref0 + j));
67*77c1e3ccSAndroid Build Coastguard Worker       ref1_reg = _mm256_loadu_si256((const __m256i *)(ref1 + j));
68*77c1e3ccSAndroid Build Coastguard Worker       ref2_reg = _mm256_loadu_si256((const __m256i *)(ref2 + j));
69*77c1e3ccSAndroid Build Coastguard Worker       ref3_reg = _mm256_loadu_si256((const __m256i *)(ref3 + j));
70*77c1e3ccSAndroid Build Coastguard Worker 
71*77c1e3ccSAndroid Build Coastguard Worker       // sum of the absolute differences between every ref-i to src
72*77c1e3ccSAndroid Build Coastguard Worker       ref0_reg = _mm256_sad_epu8(ref0_reg, src_reg);
73*77c1e3ccSAndroid Build Coastguard Worker       ref1_reg = _mm256_sad_epu8(ref1_reg, src_reg);
74*77c1e3ccSAndroid Build Coastguard Worker       ref2_reg = _mm256_sad_epu8(ref2_reg, src_reg);
75*77c1e3ccSAndroid Build Coastguard Worker       ref3_reg = _mm256_sad_epu8(ref3_reg, src_reg);
76*77c1e3ccSAndroid Build Coastguard Worker       // sum every ref-i
77*77c1e3ccSAndroid Build Coastguard Worker       sum_ref0 = _mm256_add_epi32(sum_ref0, ref0_reg);
78*77c1e3ccSAndroid Build Coastguard Worker       sum_ref1 = _mm256_add_epi32(sum_ref1, ref1_reg);
79*77c1e3ccSAndroid Build Coastguard Worker       sum_ref2 = _mm256_add_epi32(sum_ref2, ref2_reg);
80*77c1e3ccSAndroid Build Coastguard Worker       sum_ref3 = _mm256_add_epi32(sum_ref3, ref3_reg);
81*77c1e3ccSAndroid Build Coastguard Worker     }
82*77c1e3ccSAndroid Build Coastguard Worker     src += src_stride;
83*77c1e3ccSAndroid Build Coastguard Worker     ref0 += ref_stride;
84*77c1e3ccSAndroid Build Coastguard Worker     ref1 += ref_stride;
85*77c1e3ccSAndroid Build Coastguard Worker     ref2 += ref_stride;
86*77c1e3ccSAndroid Build Coastguard Worker     ref3 += ref_stride;
87*77c1e3ccSAndroid Build Coastguard Worker   }
88*77c1e3ccSAndroid Build Coastguard Worker 
89*77c1e3ccSAndroid Build Coastguard Worker   aggregate_and_store_sum(res, &sum_ref0, &sum_ref1, &sum_ref2, &sum_ref3);
90*77c1e3ccSAndroid Build Coastguard Worker }
91*77c1e3ccSAndroid Build Coastguard Worker 
aom_sadMxNx3d_avx2(int M,int N,const uint8_t * src,int src_stride,const uint8_t * const ref[4],int ref_stride,uint32_t res[4])92*77c1e3ccSAndroid Build Coastguard Worker static AOM_FORCE_INLINE void aom_sadMxNx3d_avx2(
93*77c1e3ccSAndroid Build Coastguard Worker     int M, int N, const uint8_t *src, int src_stride,
94*77c1e3ccSAndroid Build Coastguard Worker     const uint8_t *const ref[4], int ref_stride, uint32_t res[4]) {
95*77c1e3ccSAndroid Build Coastguard Worker   __m256i src_reg, ref0_reg, ref1_reg, ref2_reg;
96*77c1e3ccSAndroid Build Coastguard Worker   __m256i sum_ref0, sum_ref1, sum_ref2;
97*77c1e3ccSAndroid Build Coastguard Worker   int i, j;
98*77c1e3ccSAndroid Build Coastguard Worker   const uint8_t *ref0, *ref1, *ref2;
99*77c1e3ccSAndroid Build Coastguard Worker   const __m256i zero = _mm256_setzero_si256();
100*77c1e3ccSAndroid Build Coastguard Worker 
101*77c1e3ccSAndroid Build Coastguard Worker   ref0 = ref[0];
102*77c1e3ccSAndroid Build Coastguard Worker   ref1 = ref[1];
103*77c1e3ccSAndroid Build Coastguard Worker   ref2 = ref[2];
104*77c1e3ccSAndroid Build Coastguard Worker   sum_ref0 = _mm256_setzero_si256();
105*77c1e3ccSAndroid Build Coastguard Worker   sum_ref2 = _mm256_setzero_si256();
106*77c1e3ccSAndroid Build Coastguard Worker   sum_ref1 = _mm256_setzero_si256();
107*77c1e3ccSAndroid Build Coastguard Worker 
108*77c1e3ccSAndroid Build Coastguard Worker   for (i = 0; i < N; i++) {
109*77c1e3ccSAndroid Build Coastguard Worker     for (j = 0; j < M; j += 32) {
110*77c1e3ccSAndroid Build Coastguard Worker       // load src and all refs
111*77c1e3ccSAndroid Build Coastguard Worker       src_reg = _mm256_loadu_si256((const __m256i *)(src + j));
112*77c1e3ccSAndroid Build Coastguard Worker       ref0_reg = _mm256_loadu_si256((const __m256i *)(ref0 + j));
113*77c1e3ccSAndroid Build Coastguard Worker       ref1_reg = _mm256_loadu_si256((const __m256i *)(ref1 + j));
114*77c1e3ccSAndroid Build Coastguard Worker       ref2_reg = _mm256_loadu_si256((const __m256i *)(ref2 + j));
115*77c1e3ccSAndroid Build Coastguard Worker 
116*77c1e3ccSAndroid Build Coastguard Worker       // sum of the absolute differences between every ref-i to src
117*77c1e3ccSAndroid Build Coastguard Worker       ref0_reg = _mm256_sad_epu8(ref0_reg, src_reg);
118*77c1e3ccSAndroid Build Coastguard Worker       ref1_reg = _mm256_sad_epu8(ref1_reg, src_reg);
119*77c1e3ccSAndroid Build Coastguard Worker       ref2_reg = _mm256_sad_epu8(ref2_reg, src_reg);
120*77c1e3ccSAndroid Build Coastguard Worker       // sum every ref-i
121*77c1e3ccSAndroid Build Coastguard Worker       sum_ref0 = _mm256_add_epi32(sum_ref0, ref0_reg);
122*77c1e3ccSAndroid Build Coastguard Worker       sum_ref1 = _mm256_add_epi32(sum_ref1, ref1_reg);
123*77c1e3ccSAndroid Build Coastguard Worker       sum_ref2 = _mm256_add_epi32(sum_ref2, ref2_reg);
124*77c1e3ccSAndroid Build Coastguard Worker     }
125*77c1e3ccSAndroid Build Coastguard Worker     src += src_stride;
126*77c1e3ccSAndroid Build Coastguard Worker     ref0 += ref_stride;
127*77c1e3ccSAndroid Build Coastguard Worker     ref1 += ref_stride;
128*77c1e3ccSAndroid Build Coastguard Worker     ref2 += ref_stride;
129*77c1e3ccSAndroid Build Coastguard Worker   }
130*77c1e3ccSAndroid Build Coastguard Worker   aggregate_and_store_sum(res, &sum_ref0, &sum_ref1, &sum_ref2, &zero);
131*77c1e3ccSAndroid Build Coastguard Worker }
132*77c1e3ccSAndroid Build Coastguard Worker 
// Defines the externally visible aom_sad<m>x<n>x4d_avx2 and
// aom_sad<m>x<n>x3d_avx2 functions for one block size. Both simply forward
// to the force-inlined M x N kernels with m and n as compile-time constants.
#define SADMXN_AVX2(m, n)                                                      \
  void aom_sad##m##x##n##x4d_avx2(const uint8_t *src, int src_stride,          \
                                  const uint8_t *const ref[4], int ref_stride, \
                                  uint32_t res[4]) {                           \
    aom_sadMxNx4d_avx2(m, n, src, src_stride, ref, ref_stride, res);           \
  }                                                                            \
  void aom_sad##m##x##n##x3d_avx2(const uint8_t *src, int src_stride,          \
                                  const uint8_t *const ref[4], int ref_stride, \
                                  uint32_t res[4]) {                           \
    aom_sadMxNx3d_avx2(m, n, src, src_stride, ref, ref_stride, res);           \
  }

// Width 32.
SADMXN_AVX2(32, 16)
SADMXN_AVX2(32, 32)
SADMXN_AVX2(32, 64)

// Width 64.
SADMXN_AVX2(64, 32)
SADMXN_AVX2(64, 64)
SADMXN_AVX2(64, 128)

// Width 128.
SADMXN_AVX2(128, 64)
SADMXN_AVX2(128, 128)

// Block sizes that are only built when realtime-only mode is disabled.
#if !CONFIG_REALTIME_ONLY
SADMXN_AVX2(32, 8)
SADMXN_AVX2(64, 16)
#endif  // !CONFIG_REALTIME_ONLY
160*77c1e3ccSAndroid Build Coastguard Worker 
// Defines aom_sad_skip_<m>x<n>x4d_avx2 for one block size. The function runs
// the four-reference M x N kernel over n / 2 rows with both strides doubled,
// so only every other row (0, 2, 4, ...) is visited, and then doubles each
// of the four totals to scale them back to the full block height.
#define SAD_SKIP_MXN_AVX2(m, n)                                             \
  void aom_sad_skip_##m##x##n##x4d_avx2(const uint8_t *src, int src_stride, \
                                        const uint8_t *const ref[4],        \
                                        int ref_stride, uint32_t res[4]) {  \
    aom_sadMxNx4d_avx2(m, ((n) >> 1), src, 2 * src_stride, ref,             \
                       2 * ref_stride, res);                                \
    res[0] <<= 1;                                                           \
    res[1] <<= 1;                                                           \
    res[2] <<= 1;                                                           \
    res[3] <<= 1;                                                           \
  }

// Width 32.
SAD_SKIP_MXN_AVX2(32, 16)
SAD_SKIP_MXN_AVX2(32, 32)
SAD_SKIP_MXN_AVX2(32, 64)

// Width 64.
SAD_SKIP_MXN_AVX2(64, 32)
SAD_SKIP_MXN_AVX2(64, 64)
SAD_SKIP_MXN_AVX2(64, 128)

// Width 128.
SAD_SKIP_MXN_AVX2(128, 64)
SAD_SKIP_MXN_AVX2(128, 128)

// Block sizes that are only built when realtime-only mode is disabled.
#if !CONFIG_REALTIME_ONLY
SAD_SKIP_MXN_AVX2(32, 8)
SAD_SKIP_MXN_AVX2(64, 16)
#endif  // !CONFIG_REALTIME_ONLY
188*77c1e3ccSAndroid Build Coastguard Worker 
aom_sad16xNx3d_avx2(int N,const uint8_t * src,int src_stride,const uint8_t * const ref[4],int ref_stride,uint32_t res[4])189*77c1e3ccSAndroid Build Coastguard Worker static AOM_FORCE_INLINE void aom_sad16xNx3d_avx2(int N, const uint8_t *src,
190*77c1e3ccSAndroid Build Coastguard Worker                                                  int src_stride,
191*77c1e3ccSAndroid Build Coastguard Worker                                                  const uint8_t *const ref[4],
192*77c1e3ccSAndroid Build Coastguard Worker                                                  int ref_stride,
193*77c1e3ccSAndroid Build Coastguard Worker                                                  uint32_t res[4]) {
194*77c1e3ccSAndroid Build Coastguard Worker   __m256i src_reg, ref0_reg, ref1_reg, ref2_reg;
195*77c1e3ccSAndroid Build Coastguard Worker   __m256i sum_ref0, sum_ref1, sum_ref2;
196*77c1e3ccSAndroid Build Coastguard Worker   const uint8_t *ref0, *ref1, *ref2;
197*77c1e3ccSAndroid Build Coastguard Worker   const __m256i zero = _mm256_setzero_si256();
198*77c1e3ccSAndroid Build Coastguard Worker   assert(N % 2 == 0);
199*77c1e3ccSAndroid Build Coastguard Worker 
200*77c1e3ccSAndroid Build Coastguard Worker   ref0 = ref[0];
201*77c1e3ccSAndroid Build Coastguard Worker   ref1 = ref[1];
202*77c1e3ccSAndroid Build Coastguard Worker   ref2 = ref[2];
203*77c1e3ccSAndroid Build Coastguard Worker   sum_ref0 = _mm256_setzero_si256();
204*77c1e3ccSAndroid Build Coastguard Worker   sum_ref2 = _mm256_setzero_si256();
205*77c1e3ccSAndroid Build Coastguard Worker   sum_ref1 = _mm256_setzero_si256();
206*77c1e3ccSAndroid Build Coastguard Worker 
207*77c1e3ccSAndroid Build Coastguard Worker   for (int i = 0; i < N; i += 2) {
208*77c1e3ccSAndroid Build Coastguard Worker     // load src and all refs
209*77c1e3ccSAndroid Build Coastguard Worker     src_reg = yy_loadu2_128(src + src_stride, src);
210*77c1e3ccSAndroid Build Coastguard Worker     ref0_reg = yy_loadu2_128(ref0 + ref_stride, ref0);
211*77c1e3ccSAndroid Build Coastguard Worker     ref1_reg = yy_loadu2_128(ref1 + ref_stride, ref1);
212*77c1e3ccSAndroid Build Coastguard Worker     ref2_reg = yy_loadu2_128(ref2 + ref_stride, ref2);
213*77c1e3ccSAndroid Build Coastguard Worker 
214*77c1e3ccSAndroid Build Coastguard Worker     // sum of the absolute differences between every ref-i to src
215*77c1e3ccSAndroid Build Coastguard Worker     ref0_reg = _mm256_sad_epu8(ref0_reg, src_reg);
216*77c1e3ccSAndroid Build Coastguard Worker     ref1_reg = _mm256_sad_epu8(ref1_reg, src_reg);
217*77c1e3ccSAndroid Build Coastguard Worker     ref2_reg = _mm256_sad_epu8(ref2_reg, src_reg);
218*77c1e3ccSAndroid Build Coastguard Worker 
219*77c1e3ccSAndroid Build Coastguard Worker     // sum every ref-i
220*77c1e3ccSAndroid Build Coastguard Worker     sum_ref0 = _mm256_add_epi32(sum_ref0, ref0_reg);
221*77c1e3ccSAndroid Build Coastguard Worker     sum_ref1 = _mm256_add_epi32(sum_ref1, ref1_reg);
222*77c1e3ccSAndroid Build Coastguard Worker     sum_ref2 = _mm256_add_epi32(sum_ref2, ref2_reg);
223*77c1e3ccSAndroid Build Coastguard Worker 
224*77c1e3ccSAndroid Build Coastguard Worker     src += 2 * src_stride;
225*77c1e3ccSAndroid Build Coastguard Worker     ref0 += 2 * ref_stride;
226*77c1e3ccSAndroid Build Coastguard Worker     ref1 += 2 * ref_stride;
227*77c1e3ccSAndroid Build Coastguard Worker     ref2 += 2 * ref_stride;
228*77c1e3ccSAndroid Build Coastguard Worker   }
229*77c1e3ccSAndroid Build Coastguard Worker 
230*77c1e3ccSAndroid Build Coastguard Worker   aggregate_and_store_sum(res, &sum_ref0, &sum_ref1, &sum_ref2, &zero);
231*77c1e3ccSAndroid Build Coastguard Worker }
232*77c1e3ccSAndroid Build Coastguard Worker 
aom_sad16xNx4d_avx2(int N,const uint8_t * src,int src_stride,const uint8_t * const ref[4],int ref_stride,uint32_t res[4])233*77c1e3ccSAndroid Build Coastguard Worker static AOM_FORCE_INLINE void aom_sad16xNx4d_avx2(int N, const uint8_t *src,
234*77c1e3ccSAndroid Build Coastguard Worker                                                  int src_stride,
235*77c1e3ccSAndroid Build Coastguard Worker                                                  const uint8_t *const ref[4],
236*77c1e3ccSAndroid Build Coastguard Worker                                                  int ref_stride,
237*77c1e3ccSAndroid Build Coastguard Worker                                                  uint32_t res[4]) {
238*77c1e3ccSAndroid Build Coastguard Worker   __m256i src_reg, ref0_reg, ref1_reg, ref2_reg, ref3_reg;
239*77c1e3ccSAndroid Build Coastguard Worker   __m256i sum_ref0, sum_ref1, sum_ref2, sum_ref3;
240*77c1e3ccSAndroid Build Coastguard Worker   const uint8_t *ref0, *ref1, *ref2, *ref3;
241*77c1e3ccSAndroid Build Coastguard Worker   assert(N % 2 == 0);
242*77c1e3ccSAndroid Build Coastguard Worker 
243*77c1e3ccSAndroid Build Coastguard Worker   ref0 = ref[0];
244*77c1e3ccSAndroid Build Coastguard Worker   ref1 = ref[1];
245*77c1e3ccSAndroid Build Coastguard Worker   ref2 = ref[2];
246*77c1e3ccSAndroid Build Coastguard Worker   ref3 = ref[3];
247*77c1e3ccSAndroid Build Coastguard Worker 
248*77c1e3ccSAndroid Build Coastguard Worker   sum_ref0 = _mm256_setzero_si256();
249*77c1e3ccSAndroid Build Coastguard Worker   sum_ref2 = _mm256_setzero_si256();
250*77c1e3ccSAndroid Build Coastguard Worker   sum_ref1 = _mm256_setzero_si256();
251*77c1e3ccSAndroid Build Coastguard Worker   sum_ref3 = _mm256_setzero_si256();
252*77c1e3ccSAndroid Build Coastguard Worker 
253*77c1e3ccSAndroid Build Coastguard Worker   for (int i = 0; i < N; i += 2) {
254*77c1e3ccSAndroid Build Coastguard Worker     // load src and all refs
255*77c1e3ccSAndroid Build Coastguard Worker     src_reg = yy_loadu2_128(src + src_stride, src);
256*77c1e3ccSAndroid Build Coastguard Worker     ref0_reg = yy_loadu2_128(ref0 + ref_stride, ref0);
257*77c1e3ccSAndroid Build Coastguard Worker     ref1_reg = yy_loadu2_128(ref1 + ref_stride, ref1);
258*77c1e3ccSAndroid Build Coastguard Worker     ref2_reg = yy_loadu2_128(ref2 + ref_stride, ref2);
259*77c1e3ccSAndroid Build Coastguard Worker     ref3_reg = yy_loadu2_128(ref3 + ref_stride, ref3);
260*77c1e3ccSAndroid Build Coastguard Worker 
261*77c1e3ccSAndroid Build Coastguard Worker     // sum of the absolute differences between every ref-i to src
262*77c1e3ccSAndroid Build Coastguard Worker     ref0_reg = _mm256_sad_epu8(ref0_reg, src_reg);
263*77c1e3ccSAndroid Build Coastguard Worker     ref1_reg = _mm256_sad_epu8(ref1_reg, src_reg);
264*77c1e3ccSAndroid Build Coastguard Worker     ref2_reg = _mm256_sad_epu8(ref2_reg, src_reg);
265*77c1e3ccSAndroid Build Coastguard Worker     ref3_reg = _mm256_sad_epu8(ref3_reg, src_reg);
266*77c1e3ccSAndroid Build Coastguard Worker 
267*77c1e3ccSAndroid Build Coastguard Worker     // sum every ref-i
268*77c1e3ccSAndroid Build Coastguard Worker     sum_ref0 = _mm256_add_epi32(sum_ref0, ref0_reg);
269*77c1e3ccSAndroid Build Coastguard Worker     sum_ref1 = _mm256_add_epi32(sum_ref1, ref1_reg);
270*77c1e3ccSAndroid Build Coastguard Worker     sum_ref2 = _mm256_add_epi32(sum_ref2, ref2_reg);
271*77c1e3ccSAndroid Build Coastguard Worker     sum_ref3 = _mm256_add_epi32(sum_ref3, ref3_reg);
272*77c1e3ccSAndroid Build Coastguard Worker 
273*77c1e3ccSAndroid Build Coastguard Worker     src += 2 * src_stride;
274*77c1e3ccSAndroid Build Coastguard Worker     ref0 += 2 * ref_stride;
275*77c1e3ccSAndroid Build Coastguard Worker     ref1 += 2 * ref_stride;
276*77c1e3ccSAndroid Build Coastguard Worker     ref2 += 2 * ref_stride;
277*77c1e3ccSAndroid Build Coastguard Worker     ref3 += 2 * ref_stride;
278*77c1e3ccSAndroid Build Coastguard Worker   }
279*77c1e3ccSAndroid Build Coastguard Worker 
280*77c1e3ccSAndroid Build Coastguard Worker   aggregate_and_store_sum(res, &sum_ref0, &sum_ref1, &sum_ref2, &sum_ref3);
281*77c1e3ccSAndroid Build Coastguard Worker }
282*77c1e3ccSAndroid Build Coastguard Worker 
// Defines the externally visible aom_sad16x<n>x3d_avx2 function for one
// block height by forwarding to the force-inlined three-reference 16xN
// kernel with n as a compile-time constant.
#define SAD16XNX3_AVX2(n)                                                   \
  void aom_sad16x##n##x3d_avx2(const uint8_t *src, int src_stride,          \
                               const uint8_t *const ref[4], int ref_stride, \
                               uint32_t res[4]) {                           \
    aom_sad16xNx3d_avx2(n, src, src_stride, ref, ref_stride, res);          \
  }
// Same as SAD16XNX3_AVX2, but defines aom_sad16x<n>x4d_avx2 on top of the
// four-reference 16xN kernel.
#define SAD16XNX4_AVX2(n)                                                   \
  void aom_sad16x##n##x4d_avx2(const uint8_t *src, int src_stride,          \
                               const uint8_t *const ref[4], int ref_stride, \
                               uint32_t res[4]) {                           \
    aom_sad16xNx4d_avx2(n, src, src_stride, ref, ref_stride, res);          \
  }

// Four-reference variants.
SAD16XNX4_AVX2(32)
SAD16XNX4_AVX2(16)
SAD16XNX4_AVX2(8)

// Three-reference variants.
SAD16XNX3_AVX2(32)
SAD16XNX3_AVX2(16)
SAD16XNX3_AVX2(8)

// Block heights that are only built when realtime-only mode is disabled.
#if !CONFIG_REALTIME_ONLY
SAD16XNX3_AVX2(64)
SAD16XNX3_AVX2(4)

SAD16XNX4_AVX2(64)
SAD16XNX4_AVX2(4)

#endif  // !CONFIG_REALTIME_ONLY
312*77c1e3ccSAndroid Build Coastguard Worker 
// Defines aom_sad_skip_16x<n>x4d_avx2 for one block height. The function runs
// the four-reference 16xN kernel over n / 2 rows with both strides doubled,
// so only every other row (0, 2, 4, ...) is visited, and then doubles each
// of the four totals to scale them back to the full block height.
// The kernel requires an even row count, so n / 2 has to be even as well.
#define SAD_SKIP_16XN_AVX2(n)                                                 \
  void aom_sad_skip_16x##n##x4d_avx2(const uint8_t *src, int src_stride,      \
                                     const uint8_t *const ref[4],             \
                                     int ref_stride, uint32_t res[4]) {       \
    aom_sad16xNx4d_avx2(((n) >> 1), src, 2 * src_stride, ref, 2 * ref_stride, \
                        res);                                                 \
    res[0] <<= 1;                                                             \
    res[1] <<= 1;                                                             \
    res[2] <<= 1;                                                             \
    res[3] <<= 1;                                                             \
  }

SAD_SKIP_16XN_AVX2(32)
SAD_SKIP_16XN_AVX2(16)
SAD_SKIP_16XN_AVX2(8)

// Block heights that are only built when realtime-only mode is disabled.
#if !CONFIG_REALTIME_ONLY
SAD_SKIP_16XN_AVX2(64)
SAD_SKIP_16XN_AVX2(4)
#endif  // !CONFIG_REALTIME_ONLY
333