/*
 * Copyright (c) 2016, Alliance for Open Media. All rights reserved.
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */
11*77c1e3ccSAndroid Build Coastguard Worker #include <immintrin.h> // AVX2
12*77c1e3ccSAndroid Build Coastguard Worker
13*77c1e3ccSAndroid Build Coastguard Worker #include "config/aom_dsp_rtcd.h"
14*77c1e3ccSAndroid Build Coastguard Worker
15*77c1e3ccSAndroid Build Coastguard Worker #include "aom/aom_integer.h"
16*77c1e3ccSAndroid Build Coastguard Worker #include "aom_dsp/x86/synonyms_avx2.h"
17*77c1e3ccSAndroid Build Coastguard Worker
aggregate_and_store_sum(uint32_t res[4],const __m256i * sum_ref0,const __m256i * sum_ref1,const __m256i * sum_ref2,const __m256i * sum_ref3)18*77c1e3ccSAndroid Build Coastguard Worker static AOM_FORCE_INLINE void aggregate_and_store_sum(uint32_t res[4],
19*77c1e3ccSAndroid Build Coastguard Worker const __m256i *sum_ref0,
20*77c1e3ccSAndroid Build Coastguard Worker const __m256i *sum_ref1,
21*77c1e3ccSAndroid Build Coastguard Worker const __m256i *sum_ref2,
22*77c1e3ccSAndroid Build Coastguard Worker const __m256i *sum_ref3) {
23*77c1e3ccSAndroid Build Coastguard Worker // In sum_ref-i the result is saved in the first 4 bytes and the other 4
24*77c1e3ccSAndroid Build Coastguard Worker // bytes are zeroed.
25*77c1e3ccSAndroid Build Coastguard Worker // merge sum_ref0 and sum_ref1 also sum_ref2 and sum_ref3
26*77c1e3ccSAndroid Build Coastguard Worker // 0, 0, 1, 1
27*77c1e3ccSAndroid Build Coastguard Worker __m256i sum_ref01 = _mm256_castps_si256(_mm256_shuffle_ps(
28*77c1e3ccSAndroid Build Coastguard Worker _mm256_castsi256_ps(*sum_ref0), _mm256_castsi256_ps(*sum_ref1),
29*77c1e3ccSAndroid Build Coastguard Worker _MM_SHUFFLE(2, 0, 2, 0)));
30*77c1e3ccSAndroid Build Coastguard Worker // 2, 2, 3, 3
31*77c1e3ccSAndroid Build Coastguard Worker __m256i sum_ref23 = _mm256_castps_si256(_mm256_shuffle_ps(
32*77c1e3ccSAndroid Build Coastguard Worker _mm256_castsi256_ps(*sum_ref2), _mm256_castsi256_ps(*sum_ref3),
33*77c1e3ccSAndroid Build Coastguard Worker _MM_SHUFFLE(2, 0, 2, 0)));
34*77c1e3ccSAndroid Build Coastguard Worker
35*77c1e3ccSAndroid Build Coastguard Worker // sum adjacent 32 bit integers
36*77c1e3ccSAndroid Build Coastguard Worker __m256i sum_ref0123 = _mm256_hadd_epi32(sum_ref01, sum_ref23);
37*77c1e3ccSAndroid Build Coastguard Worker
38*77c1e3ccSAndroid Build Coastguard Worker // add the low 128 bit to the high 128 bit
39*77c1e3ccSAndroid Build Coastguard Worker __m128i sum = _mm_add_epi32(_mm256_castsi256_si128(sum_ref0123),
40*77c1e3ccSAndroid Build Coastguard Worker _mm256_extractf128_si256(sum_ref0123, 1));
41*77c1e3ccSAndroid Build Coastguard Worker
42*77c1e3ccSAndroid Build Coastguard Worker _mm_storeu_si128((__m128i *)(res), sum);
43*77c1e3ccSAndroid Build Coastguard Worker }
44*77c1e3ccSAndroid Build Coastguard Worker
aom_sadMxNx4d_avx2(int M,int N,const uint8_t * src,int src_stride,const uint8_t * const ref[4],int ref_stride,uint32_t res[4])45*77c1e3ccSAndroid Build Coastguard Worker static AOM_FORCE_INLINE void aom_sadMxNx4d_avx2(
46*77c1e3ccSAndroid Build Coastguard Worker int M, int N, const uint8_t *src, int src_stride,
47*77c1e3ccSAndroid Build Coastguard Worker const uint8_t *const ref[4], int ref_stride, uint32_t res[4]) {
48*77c1e3ccSAndroid Build Coastguard Worker __m256i src_reg, ref0_reg, ref1_reg, ref2_reg, ref3_reg;
49*77c1e3ccSAndroid Build Coastguard Worker __m256i sum_ref0, sum_ref1, sum_ref2, sum_ref3;
50*77c1e3ccSAndroid Build Coastguard Worker int i, j;
51*77c1e3ccSAndroid Build Coastguard Worker const uint8_t *ref0, *ref1, *ref2, *ref3;
52*77c1e3ccSAndroid Build Coastguard Worker
53*77c1e3ccSAndroid Build Coastguard Worker ref0 = ref[0];
54*77c1e3ccSAndroid Build Coastguard Worker ref1 = ref[1];
55*77c1e3ccSAndroid Build Coastguard Worker ref2 = ref[2];
56*77c1e3ccSAndroid Build Coastguard Worker ref3 = ref[3];
57*77c1e3ccSAndroid Build Coastguard Worker sum_ref0 = _mm256_setzero_si256();
58*77c1e3ccSAndroid Build Coastguard Worker sum_ref2 = _mm256_setzero_si256();
59*77c1e3ccSAndroid Build Coastguard Worker sum_ref1 = _mm256_setzero_si256();
60*77c1e3ccSAndroid Build Coastguard Worker sum_ref3 = _mm256_setzero_si256();
61*77c1e3ccSAndroid Build Coastguard Worker
62*77c1e3ccSAndroid Build Coastguard Worker for (i = 0; i < N; i++) {
63*77c1e3ccSAndroid Build Coastguard Worker for (j = 0; j < M; j += 32) {
64*77c1e3ccSAndroid Build Coastguard Worker // load src and all refs
65*77c1e3ccSAndroid Build Coastguard Worker src_reg = _mm256_loadu_si256((const __m256i *)(src + j));
66*77c1e3ccSAndroid Build Coastguard Worker ref0_reg = _mm256_loadu_si256((const __m256i *)(ref0 + j));
67*77c1e3ccSAndroid Build Coastguard Worker ref1_reg = _mm256_loadu_si256((const __m256i *)(ref1 + j));
68*77c1e3ccSAndroid Build Coastguard Worker ref2_reg = _mm256_loadu_si256((const __m256i *)(ref2 + j));
69*77c1e3ccSAndroid Build Coastguard Worker ref3_reg = _mm256_loadu_si256((const __m256i *)(ref3 + j));
70*77c1e3ccSAndroid Build Coastguard Worker
71*77c1e3ccSAndroid Build Coastguard Worker // sum of the absolute differences between every ref-i to src
72*77c1e3ccSAndroid Build Coastguard Worker ref0_reg = _mm256_sad_epu8(ref0_reg, src_reg);
73*77c1e3ccSAndroid Build Coastguard Worker ref1_reg = _mm256_sad_epu8(ref1_reg, src_reg);
74*77c1e3ccSAndroid Build Coastguard Worker ref2_reg = _mm256_sad_epu8(ref2_reg, src_reg);
75*77c1e3ccSAndroid Build Coastguard Worker ref3_reg = _mm256_sad_epu8(ref3_reg, src_reg);
76*77c1e3ccSAndroid Build Coastguard Worker // sum every ref-i
77*77c1e3ccSAndroid Build Coastguard Worker sum_ref0 = _mm256_add_epi32(sum_ref0, ref0_reg);
78*77c1e3ccSAndroid Build Coastguard Worker sum_ref1 = _mm256_add_epi32(sum_ref1, ref1_reg);
79*77c1e3ccSAndroid Build Coastguard Worker sum_ref2 = _mm256_add_epi32(sum_ref2, ref2_reg);
80*77c1e3ccSAndroid Build Coastguard Worker sum_ref3 = _mm256_add_epi32(sum_ref3, ref3_reg);
81*77c1e3ccSAndroid Build Coastguard Worker }
82*77c1e3ccSAndroid Build Coastguard Worker src += src_stride;
83*77c1e3ccSAndroid Build Coastguard Worker ref0 += ref_stride;
84*77c1e3ccSAndroid Build Coastguard Worker ref1 += ref_stride;
85*77c1e3ccSAndroid Build Coastguard Worker ref2 += ref_stride;
86*77c1e3ccSAndroid Build Coastguard Worker ref3 += ref_stride;
87*77c1e3ccSAndroid Build Coastguard Worker }
88*77c1e3ccSAndroid Build Coastguard Worker
89*77c1e3ccSAndroid Build Coastguard Worker aggregate_and_store_sum(res, &sum_ref0, &sum_ref1, &sum_ref2, &sum_ref3);
90*77c1e3ccSAndroid Build Coastguard Worker }
91*77c1e3ccSAndroid Build Coastguard Worker
aom_sadMxNx3d_avx2(int M,int N,const uint8_t * src,int src_stride,const uint8_t * const ref[4],int ref_stride,uint32_t res[4])92*77c1e3ccSAndroid Build Coastguard Worker static AOM_FORCE_INLINE void aom_sadMxNx3d_avx2(
93*77c1e3ccSAndroid Build Coastguard Worker int M, int N, const uint8_t *src, int src_stride,
94*77c1e3ccSAndroid Build Coastguard Worker const uint8_t *const ref[4], int ref_stride, uint32_t res[4]) {
95*77c1e3ccSAndroid Build Coastguard Worker __m256i src_reg, ref0_reg, ref1_reg, ref2_reg;
96*77c1e3ccSAndroid Build Coastguard Worker __m256i sum_ref0, sum_ref1, sum_ref2;
97*77c1e3ccSAndroid Build Coastguard Worker int i, j;
98*77c1e3ccSAndroid Build Coastguard Worker const uint8_t *ref0, *ref1, *ref2;
99*77c1e3ccSAndroid Build Coastguard Worker const __m256i zero = _mm256_setzero_si256();
100*77c1e3ccSAndroid Build Coastguard Worker
101*77c1e3ccSAndroid Build Coastguard Worker ref0 = ref[0];
102*77c1e3ccSAndroid Build Coastguard Worker ref1 = ref[1];
103*77c1e3ccSAndroid Build Coastguard Worker ref2 = ref[2];
104*77c1e3ccSAndroid Build Coastguard Worker sum_ref0 = _mm256_setzero_si256();
105*77c1e3ccSAndroid Build Coastguard Worker sum_ref2 = _mm256_setzero_si256();
106*77c1e3ccSAndroid Build Coastguard Worker sum_ref1 = _mm256_setzero_si256();
107*77c1e3ccSAndroid Build Coastguard Worker
108*77c1e3ccSAndroid Build Coastguard Worker for (i = 0; i < N; i++) {
109*77c1e3ccSAndroid Build Coastguard Worker for (j = 0; j < M; j += 32) {
110*77c1e3ccSAndroid Build Coastguard Worker // load src and all refs
111*77c1e3ccSAndroid Build Coastguard Worker src_reg = _mm256_loadu_si256((const __m256i *)(src + j));
112*77c1e3ccSAndroid Build Coastguard Worker ref0_reg = _mm256_loadu_si256((const __m256i *)(ref0 + j));
113*77c1e3ccSAndroid Build Coastguard Worker ref1_reg = _mm256_loadu_si256((const __m256i *)(ref1 + j));
114*77c1e3ccSAndroid Build Coastguard Worker ref2_reg = _mm256_loadu_si256((const __m256i *)(ref2 + j));
115*77c1e3ccSAndroid Build Coastguard Worker
116*77c1e3ccSAndroid Build Coastguard Worker // sum of the absolute differences between every ref-i to src
117*77c1e3ccSAndroid Build Coastguard Worker ref0_reg = _mm256_sad_epu8(ref0_reg, src_reg);
118*77c1e3ccSAndroid Build Coastguard Worker ref1_reg = _mm256_sad_epu8(ref1_reg, src_reg);
119*77c1e3ccSAndroid Build Coastguard Worker ref2_reg = _mm256_sad_epu8(ref2_reg, src_reg);
120*77c1e3ccSAndroid Build Coastguard Worker // sum every ref-i
121*77c1e3ccSAndroid Build Coastguard Worker sum_ref0 = _mm256_add_epi32(sum_ref0, ref0_reg);
122*77c1e3ccSAndroid Build Coastguard Worker sum_ref1 = _mm256_add_epi32(sum_ref1, ref1_reg);
123*77c1e3ccSAndroid Build Coastguard Worker sum_ref2 = _mm256_add_epi32(sum_ref2, ref2_reg);
124*77c1e3ccSAndroid Build Coastguard Worker }
125*77c1e3ccSAndroid Build Coastguard Worker src += src_stride;
126*77c1e3ccSAndroid Build Coastguard Worker ref0 += ref_stride;
127*77c1e3ccSAndroid Build Coastguard Worker ref1 += ref_stride;
128*77c1e3ccSAndroid Build Coastguard Worker ref2 += ref_stride;
129*77c1e3ccSAndroid Build Coastguard Worker }
130*77c1e3ccSAndroid Build Coastguard Worker aggregate_and_store_sum(res, &sum_ref0, &sum_ref1, &sum_ref2, &zero);
131*77c1e3ccSAndroid Build Coastguard Worker }
132*77c1e3ccSAndroid Build Coastguard Worker
// Emits the externally visible aom_sad<w>x<h>x4d_avx2() and
// aom_sad<w>x<h>x3d_avx2() entry points. Each one forwards straight to the
// generic MxN kernel with the block width and height passed as constants.
#define SADMXN_AVX2(w, h)                                                  \
  void aom_sad##w##x##h##x4d_avx2(const uint8_t *src, int src_stride,      \
                                  const uint8_t *const ref[4],             \
                                  int ref_stride, uint32_t res[4]) {       \
    aom_sadMxNx4d_avx2(w, h, src, src_stride, ref, ref_stride, res);       \
  }                                                                        \
  void aom_sad##w##x##h##x3d_avx2(const uint8_t *src, int src_stride,      \
                                  const uint8_t *const ref[4],             \
                                  int ref_stride, uint32_t res[4]) {       \
    aom_sadMxNx3d_avx2(w, h, src, src_stride, ref, ref_stride, res);       \
  }

// 32-wide blocks.
SADMXN_AVX2(32, 16)
SADMXN_AVX2(32, 32)
SADMXN_AVX2(32, 64)

// 64-wide blocks.
SADMXN_AVX2(64, 32)
SADMXN_AVX2(64, 64)
SADMXN_AVX2(64, 128)

// 128-wide blocks.
SADMXN_AVX2(128, 64)
SADMXN_AVX2(128, 128)

// Sizes that are only built when CONFIG_REALTIME_ONLY is off.
#if !CONFIG_REALTIME_ONLY
SADMXN_AVX2(32, 8)
SADMXN_AVX2(64, 16)
#endif  // !CONFIG_REALTIME_ONLY
160*77c1e3ccSAndroid Build Coastguard Worker
// Emits aom_sad_skip_<w>x<h>x4d_avx2(): a row-subsampled 4-way SAD. Only
// every other row is visited (h / 2 rows with both strides doubled) and each
// of the four totals is then doubled to compensate.
#define SAD_SKIP_MXN_AVX2(w, h)                                             \
  void aom_sad_skip_##w##x##h##x4d_avx2(const uint8_t *src, int src_stride, \
                                        const uint8_t *const ref[4],        \
                                        int ref_stride, uint32_t res[4]) {  \
    aom_sadMxNx4d_avx2(w, ((h) >> 1), src, 2 * src_stride, ref,             \
                       2 * ref_stride, res);                                \
    for (int k = 0; k < 4; ++k) res[k] <<= 1;                               \
  }

// 32-wide blocks.
SAD_SKIP_MXN_AVX2(32, 16)
SAD_SKIP_MXN_AVX2(32, 32)
SAD_SKIP_MXN_AVX2(32, 64)

// 64-wide blocks.
SAD_SKIP_MXN_AVX2(64, 32)
SAD_SKIP_MXN_AVX2(64, 64)
SAD_SKIP_MXN_AVX2(64, 128)

// 128-wide blocks.
SAD_SKIP_MXN_AVX2(128, 64)
SAD_SKIP_MXN_AVX2(128, 128)

// Sizes that are only built when CONFIG_REALTIME_ONLY is off.
#if !CONFIG_REALTIME_ONLY
SAD_SKIP_MXN_AVX2(32, 8)
SAD_SKIP_MXN_AVX2(64, 16)
#endif  // !CONFIG_REALTIME_ONLY
188*77c1e3ccSAndroid Build Coastguard Worker
aom_sad16xNx3d_avx2(int N,const uint8_t * src,int src_stride,const uint8_t * const ref[4],int ref_stride,uint32_t res[4])189*77c1e3ccSAndroid Build Coastguard Worker static AOM_FORCE_INLINE void aom_sad16xNx3d_avx2(int N, const uint8_t *src,
190*77c1e3ccSAndroid Build Coastguard Worker int src_stride,
191*77c1e3ccSAndroid Build Coastguard Worker const uint8_t *const ref[4],
192*77c1e3ccSAndroid Build Coastguard Worker int ref_stride,
193*77c1e3ccSAndroid Build Coastguard Worker uint32_t res[4]) {
194*77c1e3ccSAndroid Build Coastguard Worker __m256i src_reg, ref0_reg, ref1_reg, ref2_reg;
195*77c1e3ccSAndroid Build Coastguard Worker __m256i sum_ref0, sum_ref1, sum_ref2;
196*77c1e3ccSAndroid Build Coastguard Worker const uint8_t *ref0, *ref1, *ref2;
197*77c1e3ccSAndroid Build Coastguard Worker const __m256i zero = _mm256_setzero_si256();
198*77c1e3ccSAndroid Build Coastguard Worker assert(N % 2 == 0);
199*77c1e3ccSAndroid Build Coastguard Worker
200*77c1e3ccSAndroid Build Coastguard Worker ref0 = ref[0];
201*77c1e3ccSAndroid Build Coastguard Worker ref1 = ref[1];
202*77c1e3ccSAndroid Build Coastguard Worker ref2 = ref[2];
203*77c1e3ccSAndroid Build Coastguard Worker sum_ref0 = _mm256_setzero_si256();
204*77c1e3ccSAndroid Build Coastguard Worker sum_ref2 = _mm256_setzero_si256();
205*77c1e3ccSAndroid Build Coastguard Worker sum_ref1 = _mm256_setzero_si256();
206*77c1e3ccSAndroid Build Coastguard Worker
207*77c1e3ccSAndroid Build Coastguard Worker for (int i = 0; i < N; i += 2) {
208*77c1e3ccSAndroid Build Coastguard Worker // load src and all refs
209*77c1e3ccSAndroid Build Coastguard Worker src_reg = yy_loadu2_128(src + src_stride, src);
210*77c1e3ccSAndroid Build Coastguard Worker ref0_reg = yy_loadu2_128(ref0 + ref_stride, ref0);
211*77c1e3ccSAndroid Build Coastguard Worker ref1_reg = yy_loadu2_128(ref1 + ref_stride, ref1);
212*77c1e3ccSAndroid Build Coastguard Worker ref2_reg = yy_loadu2_128(ref2 + ref_stride, ref2);
213*77c1e3ccSAndroid Build Coastguard Worker
214*77c1e3ccSAndroid Build Coastguard Worker // sum of the absolute differences between every ref-i to src
215*77c1e3ccSAndroid Build Coastguard Worker ref0_reg = _mm256_sad_epu8(ref0_reg, src_reg);
216*77c1e3ccSAndroid Build Coastguard Worker ref1_reg = _mm256_sad_epu8(ref1_reg, src_reg);
217*77c1e3ccSAndroid Build Coastguard Worker ref2_reg = _mm256_sad_epu8(ref2_reg, src_reg);
218*77c1e3ccSAndroid Build Coastguard Worker
219*77c1e3ccSAndroid Build Coastguard Worker // sum every ref-i
220*77c1e3ccSAndroid Build Coastguard Worker sum_ref0 = _mm256_add_epi32(sum_ref0, ref0_reg);
221*77c1e3ccSAndroid Build Coastguard Worker sum_ref1 = _mm256_add_epi32(sum_ref1, ref1_reg);
222*77c1e3ccSAndroid Build Coastguard Worker sum_ref2 = _mm256_add_epi32(sum_ref2, ref2_reg);
223*77c1e3ccSAndroid Build Coastguard Worker
224*77c1e3ccSAndroid Build Coastguard Worker src += 2 * src_stride;
225*77c1e3ccSAndroid Build Coastguard Worker ref0 += 2 * ref_stride;
226*77c1e3ccSAndroid Build Coastguard Worker ref1 += 2 * ref_stride;
227*77c1e3ccSAndroid Build Coastguard Worker ref2 += 2 * ref_stride;
228*77c1e3ccSAndroid Build Coastguard Worker }
229*77c1e3ccSAndroid Build Coastguard Worker
230*77c1e3ccSAndroid Build Coastguard Worker aggregate_and_store_sum(res, &sum_ref0, &sum_ref1, &sum_ref2, &zero);
231*77c1e3ccSAndroid Build Coastguard Worker }
232*77c1e3ccSAndroid Build Coastguard Worker
aom_sad16xNx4d_avx2(int N,const uint8_t * src,int src_stride,const uint8_t * const ref[4],int ref_stride,uint32_t res[4])233*77c1e3ccSAndroid Build Coastguard Worker static AOM_FORCE_INLINE void aom_sad16xNx4d_avx2(int N, const uint8_t *src,
234*77c1e3ccSAndroid Build Coastguard Worker int src_stride,
235*77c1e3ccSAndroid Build Coastguard Worker const uint8_t *const ref[4],
236*77c1e3ccSAndroid Build Coastguard Worker int ref_stride,
237*77c1e3ccSAndroid Build Coastguard Worker uint32_t res[4]) {
238*77c1e3ccSAndroid Build Coastguard Worker __m256i src_reg, ref0_reg, ref1_reg, ref2_reg, ref3_reg;
239*77c1e3ccSAndroid Build Coastguard Worker __m256i sum_ref0, sum_ref1, sum_ref2, sum_ref3;
240*77c1e3ccSAndroid Build Coastguard Worker const uint8_t *ref0, *ref1, *ref2, *ref3;
241*77c1e3ccSAndroid Build Coastguard Worker assert(N % 2 == 0);
242*77c1e3ccSAndroid Build Coastguard Worker
243*77c1e3ccSAndroid Build Coastguard Worker ref0 = ref[0];
244*77c1e3ccSAndroid Build Coastguard Worker ref1 = ref[1];
245*77c1e3ccSAndroid Build Coastguard Worker ref2 = ref[2];
246*77c1e3ccSAndroid Build Coastguard Worker ref3 = ref[3];
247*77c1e3ccSAndroid Build Coastguard Worker
248*77c1e3ccSAndroid Build Coastguard Worker sum_ref0 = _mm256_setzero_si256();
249*77c1e3ccSAndroid Build Coastguard Worker sum_ref2 = _mm256_setzero_si256();
250*77c1e3ccSAndroid Build Coastguard Worker sum_ref1 = _mm256_setzero_si256();
251*77c1e3ccSAndroid Build Coastguard Worker sum_ref3 = _mm256_setzero_si256();
252*77c1e3ccSAndroid Build Coastguard Worker
253*77c1e3ccSAndroid Build Coastguard Worker for (int i = 0; i < N; i += 2) {
254*77c1e3ccSAndroid Build Coastguard Worker // load src and all refs
255*77c1e3ccSAndroid Build Coastguard Worker src_reg = yy_loadu2_128(src + src_stride, src);
256*77c1e3ccSAndroid Build Coastguard Worker ref0_reg = yy_loadu2_128(ref0 + ref_stride, ref0);
257*77c1e3ccSAndroid Build Coastguard Worker ref1_reg = yy_loadu2_128(ref1 + ref_stride, ref1);
258*77c1e3ccSAndroid Build Coastguard Worker ref2_reg = yy_loadu2_128(ref2 + ref_stride, ref2);
259*77c1e3ccSAndroid Build Coastguard Worker ref3_reg = yy_loadu2_128(ref3 + ref_stride, ref3);
260*77c1e3ccSAndroid Build Coastguard Worker
261*77c1e3ccSAndroid Build Coastguard Worker // sum of the absolute differences between every ref-i to src
262*77c1e3ccSAndroid Build Coastguard Worker ref0_reg = _mm256_sad_epu8(ref0_reg, src_reg);
263*77c1e3ccSAndroid Build Coastguard Worker ref1_reg = _mm256_sad_epu8(ref1_reg, src_reg);
264*77c1e3ccSAndroid Build Coastguard Worker ref2_reg = _mm256_sad_epu8(ref2_reg, src_reg);
265*77c1e3ccSAndroid Build Coastguard Worker ref3_reg = _mm256_sad_epu8(ref3_reg, src_reg);
266*77c1e3ccSAndroid Build Coastguard Worker
267*77c1e3ccSAndroid Build Coastguard Worker // sum every ref-i
268*77c1e3ccSAndroid Build Coastguard Worker sum_ref0 = _mm256_add_epi32(sum_ref0, ref0_reg);
269*77c1e3ccSAndroid Build Coastguard Worker sum_ref1 = _mm256_add_epi32(sum_ref1, ref1_reg);
270*77c1e3ccSAndroid Build Coastguard Worker sum_ref2 = _mm256_add_epi32(sum_ref2, ref2_reg);
271*77c1e3ccSAndroid Build Coastguard Worker sum_ref3 = _mm256_add_epi32(sum_ref3, ref3_reg);
272*77c1e3ccSAndroid Build Coastguard Worker
273*77c1e3ccSAndroid Build Coastguard Worker src += 2 * src_stride;
274*77c1e3ccSAndroid Build Coastguard Worker ref0 += 2 * ref_stride;
275*77c1e3ccSAndroid Build Coastguard Worker ref1 += 2 * ref_stride;
276*77c1e3ccSAndroid Build Coastguard Worker ref2 += 2 * ref_stride;
277*77c1e3ccSAndroid Build Coastguard Worker ref3 += 2 * ref_stride;
278*77c1e3ccSAndroid Build Coastguard Worker }
279*77c1e3ccSAndroid Build Coastguard Worker
280*77c1e3ccSAndroid Build Coastguard Worker aggregate_and_store_sum(res, &sum_ref0, &sum_ref1, &sum_ref2, &sum_ref3);
281*77c1e3ccSAndroid Build Coastguard Worker }
282*77c1e3ccSAndroid Build Coastguard Worker
// Emits aom_sad16x<rows>x3d_avx2(), forwarding to the 16-wide three-reference
// kernel with the block height passed as a constant.
#define SAD16XNX3_AVX2(rows)                                                \
  void aom_sad16x##rows##x3d_avx2(const uint8_t *src, int src_stride,       \
                                  const uint8_t *const ref[4],              \
                                  int ref_stride, uint32_t res[4]) {        \
    aom_sad16xNx3d_avx2(rows, src, src_stride, ref, ref_stride, res);       \
  }

// Emits aom_sad16x<rows>x4d_avx2(), forwarding to the 16-wide four-reference
// kernel with the block height passed as a constant.
#define SAD16XNX4_AVX2(rows)                                                \
  void aom_sad16x##rows##x4d_avx2(const uint8_t *src, int src_stride,       \
                                  const uint8_t *const ref[4],              \
                                  int ref_stride, uint32_t res[4]) {        \
    aom_sad16xNx4d_avx2(rows, src, src_stride, ref, ref_stride, res);       \
  }

// Four-reference entry points.
SAD16XNX4_AVX2(32)
SAD16XNX4_AVX2(16)
SAD16XNX4_AVX2(8)

// Three-reference entry points.
SAD16XNX3_AVX2(32)
SAD16XNX3_AVX2(16)
SAD16XNX3_AVX2(8)

// Sizes that are only built when CONFIG_REALTIME_ONLY is off.
#if !CONFIG_REALTIME_ONLY
SAD16XNX3_AVX2(64)
SAD16XNX3_AVX2(4)

SAD16XNX4_AVX2(64)
SAD16XNX4_AVX2(4)

#endif  // !CONFIG_REALTIME_ONLY
312*77c1e3ccSAndroid Build Coastguard Worker
// Emits aom_sad_skip_16x<rows>x4d_avx2(): a row-subsampled 4-way SAD for
// 16-wide blocks. Only every other row is visited (rows / 2 rows with both
// strides doubled) and each of the four totals is then doubled to compensate.
#define SAD_SKIP_16XN_AVX2(rows)                                             \
  void aom_sad_skip_16x##rows##x4d_avx2(const uint8_t *src, int src_stride,  \
                                        const uint8_t *const ref[4],         \
                                        int ref_stride, uint32_t res[4]) {   \
    aom_sad16xNx4d_avx2(((rows) >> 1), src, 2 * src_stride, ref,             \
                        2 * ref_stride, res);                                \
    for (int k = 0; k < 4; ++k) res[k] <<= 1;                                \
  }

SAD_SKIP_16XN_AVX2(32)
SAD_SKIP_16XN_AVX2(16)
SAD_SKIP_16XN_AVX2(8)

// Sizes that are only built when CONFIG_REALTIME_ONLY is off.
#if !CONFIG_REALTIME_ONLY
SAD_SKIP_16XN_AVX2(64)
SAD_SKIP_16XN_AVX2(4)
#endif  // !CONFIG_REALTIME_ONLY
333