1*77c1e3ccSAndroid Build Coastguard Worker /*
2*77c1e3ccSAndroid Build Coastguard Worker * Copyright (c) 2016, Alliance for Open Media. All rights reserved.
3*77c1e3ccSAndroid Build Coastguard Worker *
4*77c1e3ccSAndroid Build Coastguard Worker * This source code is subject to the terms of the BSD 2 Clause License and
5*77c1e3ccSAndroid Build Coastguard Worker * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
6*77c1e3ccSAndroid Build Coastguard Worker * was not distributed with this source code in the LICENSE file, you can
7*77c1e3ccSAndroid Build Coastguard Worker * obtain it at www.aomedia.org/license/software. If the Alliance for Open
8*77c1e3ccSAndroid Build Coastguard Worker * Media Patent License 1.0 was not distributed with this source code in the
9*77c1e3ccSAndroid Build Coastguard Worker * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
10*77c1e3ccSAndroid Build Coastguard Worker */
11*77c1e3ccSAndroid Build Coastguard Worker #include <immintrin.h>
12*77c1e3ccSAndroid Build Coastguard Worker
13*77c1e3ccSAndroid Build Coastguard Worker #include "config/aom_dsp_rtcd.h"
14*77c1e3ccSAndroid Build Coastguard Worker
15*77c1e3ccSAndroid Build Coastguard Worker #include "aom_ports/mem.h"
16*77c1e3ccSAndroid Build Coastguard Worker
/*
 * Sum of absolute differences for a block that is 64 pixels wide and `h`
 * rows tall.
 *
 * Every row is fetched as two unaligned 32-byte vectors from each of src_ptr
 * and ref_ptr; both pointers then step forward by their own stride.
 * Returns the low 32 bits of the accumulated total.
 */
static inline unsigned int sad64xh_avx2(const uint8_t *src_ptr, int src_stride,
                                        const uint8_t *ref_ptr, int ref_stride,
                                        int h) {
  __m256i acc = _mm256_setzero_si256();

  for (int row = 0; row < h; ++row) {
    const __m256i ref_lo = _mm256_loadu_si256((const __m256i *)ref_ptr);
    const __m256i ref_hi = _mm256_loadu_si256((const __m256i *)(ref_ptr + 32));
    const __m256i src_lo = _mm256_loadu_si256((const __m256i *)src_ptr);
    const __m256i src_hi = _mm256_loadu_si256((const __m256i *)(src_ptr + 32));

    /* Partial sums of the left and right 32-pixel halves of this row. */
    const __m256i row_sad = _mm256_add_epi32(_mm256_sad_epu8(ref_lo, src_lo),
                                             _mm256_sad_epu8(ref_hi, src_hi));
    acc = _mm256_add_epi32(acc, row_sad);

    ref_ptr += ref_stride;
    src_ptr += src_stride;
  }

  /* Fold the upper 8 bytes of each 128-bit half onto its lower 8 bytes. */
  acc = _mm256_add_epi32(acc, _mm256_srli_si256(acc, 8));

  /* Combine the two 128-bit halves and take the low 32-bit element. */
  const __m128i total = _mm_add_epi32(_mm256_castsi256_si128(acc),
                                      _mm256_extracti128_si256(acc, 1));
  const unsigned int sad = (unsigned int)_mm_cvtsi128_si32(total);

  _mm256_zeroupper();
  return sad;
}
44*77c1e3ccSAndroid Build Coastguard Worker
/*
 * Sum of absolute differences for a block that is 32 pixels wide and `h`
 * rows tall.
 *
 * Rows are consumed two at a time (one unaligned 32-byte vector per row), so
 * the loop runs h >> 1 times; a trailing odd row would not be visited.
 * Returns the low 32 bits of the accumulated total.
 */
static inline unsigned int sad32xh_avx2(const uint8_t *src_ptr, int src_stride,
                                        const uint8_t *ref_ptr, int ref_stride,
                                        int h) {
  const int ref_step = ref_stride << 1;
  const int src_step = src_stride << 1;
  const int row_pairs = h >> 1;
  __m256i acc = _mm256_setzero_si256();

  for (int pair = 0; pair < row_pairs; ++pair) {
    const __m256i ref_row0 = _mm256_loadu_si256((const __m256i *)ref_ptr);
    const __m256i ref_row1 =
        _mm256_loadu_si256((const __m256i *)(ref_ptr + ref_stride));
    const __m256i src_row0 = _mm256_loadu_si256((const __m256i *)src_ptr);
    const __m256i src_row1 =
        _mm256_loadu_si256((const __m256i *)(src_ptr + src_stride));

    /* Partial sums of the two rows handled in this iteration. */
    const __m256i pair_sad =
        _mm256_add_epi32(_mm256_sad_epu8(ref_row0, src_row0),
                         _mm256_sad_epu8(ref_row1, src_row1));
    acc = _mm256_add_epi32(acc, pair_sad);

    ref_ptr += ref_step;
    src_ptr += src_step;
  }

  /* Fold the upper 8 bytes of each 128-bit half onto its lower 8 bytes. */
  acc = _mm256_add_epi32(acc, _mm256_srli_si256(acc, 8));

  /* Combine the two 128-bit halves and take the low 32-bit element. */
  const __m128i total = _mm_add_epi32(_mm256_castsi256_si128(acc),
                                      _mm256_extracti128_si256(acc, 1));
  const unsigned int sad = (unsigned int)_mm_cvtsi128_si32(total);

  _mm256_zeroupper();
  return sad;
}
75*77c1e3ccSAndroid Build Coastguard Worker
/* Emits aom_sad64x<rows>_avx2(), which forwards to sad64xh_avx2() with the
 * row count supplied as the literal <rows>. */
#define FSAD64_H(rows)                                                    \
  unsigned int aom_sad64x##rows##_avx2(                                   \
      const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr,     \
      int ref_stride) {                                                   \
    return sad64xh_avx2(src_ptr, src_stride, ref_ptr, ref_stride, rows);  \
  }
81*77c1e3ccSAndroid Build Coastguard Worker
/* Emits aom_sad_skip_64x<rows>_avx2(): calls sad64xh_avx2() with both strides
 * doubled and half the row count, then returns twice that result. */
#define FSADS64_H(rows)                                                   \
  unsigned int aom_sad_skip_64x##rows##_avx2(                             \
      const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr,     \
      int ref_stride) {                                                   \
    const unsigned int half_sad = sad64xh_avx2(                           \
        src_ptr, src_stride * 2, ref_ptr, ref_stride * 2, rows / 2);      \
    return 2 * half_sad;                                                  \
  }
89*77c1e3ccSAndroid Build Coastguard Worker
/* Emits aom_sad32x<rows>_avx2(), which forwards to sad32xh_avx2() with the
 * row count supplied as the literal <rows>. */
#define FSAD32_H(rows)                                                    \
  unsigned int aom_sad32x##rows##_avx2(                                   \
      const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr,     \
      int ref_stride) {                                                   \
    return sad32xh_avx2(src_ptr, src_stride, ref_ptr, ref_stride, rows);  \
  }
95*77c1e3ccSAndroid Build Coastguard Worker
/* Emits aom_sad_skip_32x<rows>_avx2(): calls sad32xh_avx2() with both strides
 * doubled and half the row count, then returns twice that result. */
#define FSADS32_H(rows)                                                   \
  unsigned int aom_sad_skip_32x##rows##_avx2(                             \
      const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr,     \
      int ref_stride) {                                                   \
    const unsigned int half_sad = sad32xh_avx2(                           \
        src_ptr, src_stride * 2, ref_ptr, ref_stride * 2, rows / 2);      \
    return 2 * half_sad;                                                  \
  }
103*77c1e3ccSAndroid Build Coastguard Worker
104*77c1e3ccSAndroid Build Coastguard Worker #define FSAD64 \
105*77c1e3ccSAndroid Build Coastguard Worker FSAD64_H(64) \
106*77c1e3ccSAndroid Build Coastguard Worker FSAD64_H(32) \
107*77c1e3ccSAndroid Build Coastguard Worker FSADS64_H(64) \
108*77c1e3ccSAndroid Build Coastguard Worker FSADS64_H(32)
109*77c1e3ccSAndroid Build Coastguard Worker
110*77c1e3ccSAndroid Build Coastguard Worker #define FSAD32 \
111*77c1e3ccSAndroid Build Coastguard Worker FSAD32_H(64) \
112*77c1e3ccSAndroid Build Coastguard Worker FSAD32_H(32) \
113*77c1e3ccSAndroid Build Coastguard Worker FSAD32_H(16) \
114*77c1e3ccSAndroid Build Coastguard Worker FSADS32_H(64) \
115*77c1e3ccSAndroid Build Coastguard Worker FSADS32_H(32) \
116*77c1e3ccSAndroid Build Coastguard Worker FSADS32_H(16)
117*77c1e3ccSAndroid Build Coastguard Worker
118*77c1e3ccSAndroid Build Coastguard Worker /* clang-format off */
119*77c1e3ccSAndroid Build Coastguard Worker FSAD64
120*77c1e3ccSAndroid Build Coastguard Worker FSAD32
121*77c1e3ccSAndroid Build Coastguard Worker /* clang-format on */
122*77c1e3ccSAndroid Build Coastguard Worker
123*77c1e3ccSAndroid Build Coastguard Worker #undef FSAD64
124*77c1e3ccSAndroid Build Coastguard Worker #undef FSAD32
125*77c1e3ccSAndroid Build Coastguard Worker #undef FSAD64_H
126*77c1e3ccSAndroid Build Coastguard Worker #undef FSAD32_H
127*77c1e3ccSAndroid Build Coastguard Worker
/*
 * SAD between a 64-wide source block and the per-byte average
 * (_mm256_avg_epu8) of the reference block with second_pred, over `h` rows.
 *
 * src_ptr and ref_ptr advance by their own strides after each row, while
 * second_pred is read contiguously, 64 bytes per row.
 * Returns the low 32 bits of the accumulated total.
 *
 * Kept as a function rather than a macro body so that it is type-checked and
 * matches the structure used by the non-averaging sad64xh_avx2() path.
 */
static inline unsigned int sad64xh_avg_avx2(const uint8_t *src_ptr,
                                            int src_stride,
                                            const uint8_t *ref_ptr,
                                            int ref_stride,
                                            const uint8_t *second_pred,
                                            int h) {
  __m256i sum_sad = _mm256_setzero_si256();

  for (int i = 0; i < h; i++) {
    __m256i ref1_reg = _mm256_loadu_si256((__m256i const *)ref_ptr);
    __m256i ref2_reg = _mm256_loadu_si256((__m256i const *)(ref_ptr + 32));

    /* Blend the reference with the second predictor before comparing. */
    ref1_reg = _mm256_avg_epu8(
        ref1_reg, _mm256_loadu_si256((__m256i const *)second_pred));
    ref2_reg = _mm256_avg_epu8(
        ref2_reg, _mm256_loadu_si256((__m256i const *)(second_pred + 32)));

    const __m256i sad1_reg = _mm256_sad_epu8(
        ref1_reg, _mm256_loadu_si256((__m256i const *)src_ptr));
    const __m256i sad2_reg = _mm256_sad_epu8(
        ref2_reg, _mm256_loadu_si256((__m256i const *)(src_ptr + 32)));
    sum_sad = _mm256_add_epi32(sum_sad, _mm256_add_epi32(sad1_reg, sad2_reg));

    ref_ptr += ref_stride;
    src_ptr += src_stride;
    second_pred += 64;
  }

  /* Fold the upper 8 bytes of each 128-bit half onto its lower 8 bytes, then
   * combine the two halves. */
  sum_sad = _mm256_add_epi32(sum_sad, _mm256_srli_si256(sum_sad, 8));
  const __m128i sum_sad128 =
      _mm_add_epi32(_mm256_castsi256_si128(sum_sad),
                    _mm256_extracti128_si256(sum_sad, 1));
  const unsigned int res = (unsigned int)_mm_cvtsi128_si32(sum_sad128);
  _mm256_zeroupper();
  return res;
}

/* Emits aom_sad64x<h>_avg_avx2(), a thin wrapper that forwards to
 * sad64xh_avg_avx2() with the row count fixed to <h>. */
#define FSADAVG64_H(h)                                                    \
  unsigned int aom_sad64x##h##_avg_avx2(                                  \
      const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr,     \
      int ref_stride, const uint8_t *second_pred) {                       \
    return sad64xh_avg_avx2(src_ptr, src_stride, ref_ptr, ref_stride,     \
                            second_pred, h);                              \
  }
162*77c1e3ccSAndroid Build Coastguard Worker
/*
 * SAD between a 32-wide source block and the per-byte average
 * (_mm256_avg_epu8) of the reference block with second_pred, over `h` rows.
 *
 * Rows are consumed two at a time, so the loop runs h >> 1 times and a
 * trailing odd row would not be visited. second_pred is read contiguously,
 * 32 bytes per row (64 bytes per iteration).
 * Returns the low 32 bits of the accumulated total.
 *
 * Kept as a function rather than a macro body so that it is type-checked and
 * matches the structure used by the non-averaging sad32xh_avx2() path.
 */
static inline unsigned int sad32xh_avg_avx2(const uint8_t *src_ptr,
                                            int src_stride,
                                            const uint8_t *ref_ptr,
                                            int ref_stride,
                                            const uint8_t *second_pred,
                                            int h) {
  const int ref2_stride = ref_stride << 1;
  const int src2_stride = src_stride << 1;
  const int max = h >> 1;
  __m256i sum_sad = _mm256_setzero_si256();

  for (int i = 0; i < max; i++) {
    __m256i ref1_reg = _mm256_loadu_si256((__m256i const *)ref_ptr);
    __m256i ref2_reg =
        _mm256_loadu_si256((__m256i const *)(ref_ptr + ref_stride));

    /* Blend each of the two reference rows with its second-predictor row. */
    ref1_reg = _mm256_avg_epu8(
        ref1_reg, _mm256_loadu_si256((__m256i const *)second_pred));
    ref2_reg = _mm256_avg_epu8(
        ref2_reg, _mm256_loadu_si256((__m256i const *)(second_pred + 32)));

    const __m256i sad1_reg = _mm256_sad_epu8(
        ref1_reg, _mm256_loadu_si256((__m256i const *)src_ptr));
    const __m256i sad2_reg = _mm256_sad_epu8(
        ref2_reg, _mm256_loadu_si256((__m256i const *)(src_ptr + src_stride)));
    sum_sad = _mm256_add_epi32(sum_sad, _mm256_add_epi32(sad1_reg, sad2_reg));

    ref_ptr += ref2_stride;
    src_ptr += src2_stride;
    second_pred += 64;
  }

  /* Fold the upper 8 bytes of each 128-bit half onto its lower 8 bytes, then
   * combine the two halves. */
  sum_sad = _mm256_add_epi32(sum_sad, _mm256_srli_si256(sum_sad, 8));
  const __m128i sum_sad128 =
      _mm_add_epi32(_mm256_castsi256_si128(sum_sad),
                    _mm256_extracti128_si256(sum_sad, 1));
  const unsigned int res = (unsigned int)_mm_cvtsi128_si32(sum_sad128);
  _mm256_zeroupper();
  return res;
}

/* Emits aom_sad32x<h>_avg_avx2(), a thin wrapper that forwards to
 * sad32xh_avg_avx2() with the row count fixed to <h>. */
#define FSADAVG32_H(h)                                                    \
  unsigned int aom_sad32x##h##_avg_avx2(                                  \
      const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr,     \
      int ref_stride, const uint8_t *second_pred) {                       \
    return sad32xh_avg_avx2(src_ptr, src_stride, ref_ptr, ref_stride,     \
                            second_pred, h);                              \
  }
201*77c1e3ccSAndroid Build Coastguard Worker
202*77c1e3ccSAndroid Build Coastguard Worker #define FSADAVG64 \
203*77c1e3ccSAndroid Build Coastguard Worker FSADAVG64_H(64) \
204*77c1e3ccSAndroid Build Coastguard Worker FSADAVG64_H(32)
205*77c1e3ccSAndroid Build Coastguard Worker
206*77c1e3ccSAndroid Build Coastguard Worker #define FSADAVG32 \
207*77c1e3ccSAndroid Build Coastguard Worker FSADAVG32_H(64) \
208*77c1e3ccSAndroid Build Coastguard Worker FSADAVG32_H(32) \
209*77c1e3ccSAndroid Build Coastguard Worker FSADAVG32_H(16)
210*77c1e3ccSAndroid Build Coastguard Worker
211*77c1e3ccSAndroid Build Coastguard Worker /* clang-format off */
212*77c1e3ccSAndroid Build Coastguard Worker FSADAVG64
213*77c1e3ccSAndroid Build Coastguard Worker FSADAVG32
214*77c1e3ccSAndroid Build Coastguard Worker /* clang-format on */
215*77c1e3ccSAndroid Build Coastguard Worker
216*77c1e3ccSAndroid Build Coastguard Worker #undef FSADAVG64
217*77c1e3ccSAndroid Build Coastguard Worker #undef FSADAVG32
218*77c1e3ccSAndroid Build Coastguard Worker #undef FSADAVG64_H
219*77c1e3ccSAndroid Build Coastguard Worker #undef FSADAVG32_H
220