/*
 *  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */
10*fb1b10abSAndroid Build Coastguard Worker #include <immintrin.h>
11*fb1b10abSAndroid Build Coastguard Worker #include "./vpx_dsp_rtcd.h"
12*fb1b10abSAndroid Build Coastguard Worker #include "vpx_ports/mem.h"
13*fb1b10abSAndroid Build Coastguard Worker
sad64xh_avx2(const uint8_t * src_ptr,int src_stride,const uint8_t * ref_ptr,int ref_stride,int h)14*fb1b10abSAndroid Build Coastguard Worker static INLINE unsigned int sad64xh_avx2(const uint8_t *src_ptr, int src_stride,
15*fb1b10abSAndroid Build Coastguard Worker const uint8_t *ref_ptr, int ref_stride,
16*fb1b10abSAndroid Build Coastguard Worker int h) {
17*fb1b10abSAndroid Build Coastguard Worker int i, res;
18*fb1b10abSAndroid Build Coastguard Worker __m256i sad1_reg, sad2_reg, ref1_reg, ref2_reg;
19*fb1b10abSAndroid Build Coastguard Worker __m256i sum_sad = _mm256_setzero_si256();
20*fb1b10abSAndroid Build Coastguard Worker __m256i sum_sad_h;
21*fb1b10abSAndroid Build Coastguard Worker __m128i sum_sad128;
22*fb1b10abSAndroid Build Coastguard Worker for (i = 0; i < h; i++) {
23*fb1b10abSAndroid Build Coastguard Worker ref1_reg = _mm256_loadu_si256((__m256i const *)ref_ptr);
24*fb1b10abSAndroid Build Coastguard Worker ref2_reg = _mm256_loadu_si256((__m256i const *)(ref_ptr + 32));
25*fb1b10abSAndroid Build Coastguard Worker sad1_reg =
26*fb1b10abSAndroid Build Coastguard Worker _mm256_sad_epu8(ref1_reg, _mm256_loadu_si256((__m256i const *)src_ptr));
27*fb1b10abSAndroid Build Coastguard Worker sad2_reg = _mm256_sad_epu8(
28*fb1b10abSAndroid Build Coastguard Worker ref2_reg, _mm256_loadu_si256((__m256i const *)(src_ptr + 32)));
29*fb1b10abSAndroid Build Coastguard Worker sum_sad = _mm256_add_epi32(sum_sad, _mm256_add_epi32(sad1_reg, sad2_reg));
30*fb1b10abSAndroid Build Coastguard Worker ref_ptr += ref_stride;
31*fb1b10abSAndroid Build Coastguard Worker src_ptr += src_stride;
32*fb1b10abSAndroid Build Coastguard Worker }
33*fb1b10abSAndroid Build Coastguard Worker sum_sad_h = _mm256_srli_si256(sum_sad, 8);
34*fb1b10abSAndroid Build Coastguard Worker sum_sad = _mm256_add_epi32(sum_sad, sum_sad_h);
35*fb1b10abSAndroid Build Coastguard Worker sum_sad128 = _mm256_extracti128_si256(sum_sad, 1);
36*fb1b10abSAndroid Build Coastguard Worker sum_sad128 = _mm_add_epi32(_mm256_castsi256_si128(sum_sad), sum_sad128);
37*fb1b10abSAndroid Build Coastguard Worker res = _mm_cvtsi128_si32(sum_sad128);
38*fb1b10abSAndroid Build Coastguard Worker return res;
39*fb1b10abSAndroid Build Coastguard Worker }
40*fb1b10abSAndroid Build Coastguard Worker
sad32xh_avx2(const uint8_t * src_ptr,int src_stride,const uint8_t * ref_ptr,int ref_stride,int h)41*fb1b10abSAndroid Build Coastguard Worker static INLINE unsigned int sad32xh_avx2(const uint8_t *src_ptr, int src_stride,
42*fb1b10abSAndroid Build Coastguard Worker const uint8_t *ref_ptr, int ref_stride,
43*fb1b10abSAndroid Build Coastguard Worker int h) {
44*fb1b10abSAndroid Build Coastguard Worker int i, res;
45*fb1b10abSAndroid Build Coastguard Worker __m256i sad1_reg, sad2_reg, ref1_reg, ref2_reg;
46*fb1b10abSAndroid Build Coastguard Worker __m256i sum_sad = _mm256_setzero_si256();
47*fb1b10abSAndroid Build Coastguard Worker __m256i sum_sad_h;
48*fb1b10abSAndroid Build Coastguard Worker __m128i sum_sad128;
49*fb1b10abSAndroid Build Coastguard Worker const int ref2_stride = ref_stride << 1;
50*fb1b10abSAndroid Build Coastguard Worker const int src2_stride = src_stride << 1;
51*fb1b10abSAndroid Build Coastguard Worker const int max = h >> 1;
52*fb1b10abSAndroid Build Coastguard Worker for (i = 0; i < max; i++) {
53*fb1b10abSAndroid Build Coastguard Worker ref1_reg = _mm256_loadu_si256((__m256i const *)ref_ptr);
54*fb1b10abSAndroid Build Coastguard Worker ref2_reg = _mm256_loadu_si256((__m256i const *)(ref_ptr + ref_stride));
55*fb1b10abSAndroid Build Coastguard Worker sad1_reg =
56*fb1b10abSAndroid Build Coastguard Worker _mm256_sad_epu8(ref1_reg, _mm256_loadu_si256((__m256i const *)src_ptr));
57*fb1b10abSAndroid Build Coastguard Worker sad2_reg = _mm256_sad_epu8(
58*fb1b10abSAndroid Build Coastguard Worker ref2_reg, _mm256_loadu_si256((__m256i const *)(src_ptr + src_stride)));
59*fb1b10abSAndroid Build Coastguard Worker sum_sad = _mm256_add_epi32(sum_sad, _mm256_add_epi32(sad1_reg, sad2_reg));
60*fb1b10abSAndroid Build Coastguard Worker ref_ptr += ref2_stride;
61*fb1b10abSAndroid Build Coastguard Worker src_ptr += src2_stride;
62*fb1b10abSAndroid Build Coastguard Worker }
63*fb1b10abSAndroid Build Coastguard Worker sum_sad_h = _mm256_srli_si256(sum_sad, 8);
64*fb1b10abSAndroid Build Coastguard Worker sum_sad = _mm256_add_epi32(sum_sad, sum_sad_h);
65*fb1b10abSAndroid Build Coastguard Worker sum_sad128 = _mm256_extracti128_si256(sum_sad, 1);
66*fb1b10abSAndroid Build Coastguard Worker sum_sad128 = _mm_add_epi32(_mm256_castsi256_si128(sum_sad), sum_sad128);
67*fb1b10abSAndroid Build Coastguard Worker res = _mm_cvtsi128_si32(sum_sad128);
68*fb1b10abSAndroid Build Coastguard Worker return res;
69*fb1b10abSAndroid Build Coastguard Worker }
70*fb1b10abSAndroid Build Coastguard Worker
/*
 * Generates vpx_sad64x<rows>_avx2(): a fixed-height entry point that forwards
 * to sad64xh_avx2() with the row count set to <rows>.
 */
#define FSAD64_H(rows)                                                      \
  unsigned int vpx_sad64x##rows##_avx2(                                     \
      const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr,       \
      int ref_stride) {                                                     \
    return sad64xh_avx2(src_ptr, src_stride, ref_ptr, ref_stride, (rows));  \
  }
76*fb1b10abSAndroid Build Coastguard Worker
/*
 * Generates vpx_sad_skip_64x<rows>_avx2(): calls sad64xh_avx2() with both
 * strides doubled and half the row count (i.e. every other row is visited),
 * then doubles the returned sum.
 */
#define FSADS64_H(rows)                                                \
  unsigned int vpx_sad_skip_64x##rows##_avx2(                          \
      const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr,  \
      int ref_stride) {                                                \
    const unsigned int half_sad = sad64xh_avx2(                        \
        src_ptr, src_stride * 2, ref_ptr, ref_stride * 2, (rows) / 2); \
    return 2 * half_sad;                                               \
  }
84*fb1b10abSAndroid Build Coastguard Worker
/*
 * Generates vpx_sad32x<rows>_avx2(): a fixed-height entry point that forwards
 * to sad32xh_avx2() with the row count set to <rows>.
 */
#define FSAD32_H(rows)                                                      \
  unsigned int vpx_sad32x##rows##_avx2(                                     \
      const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr,       \
      int ref_stride) {                                                     \
    return sad32xh_avx2(src_ptr, src_stride, ref_ptr, ref_stride, (rows));  \
  }
90*fb1b10abSAndroid Build Coastguard Worker
/*
 * Generates vpx_sad_skip_32x<rows>_avx2(): calls sad32xh_avx2() with both
 * strides doubled and half the row count (i.e. every other row is visited),
 * then doubles the returned sum.
 */
#define FSADS32_H(rows)                                                \
  unsigned int vpx_sad_skip_32x##rows##_avx2(                          \
      const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr,  \
      int ref_stride) {                                                \
    const unsigned int half_sad = sad32xh_avx2(                        \
        src_ptr, src_stride * 2, ref_ptr, ref_stride * 2, (rows) / 2); \
    return 2 * half_sad;                                               \
  }
98*fb1b10abSAndroid Build Coastguard Worker
99*fb1b10abSAndroid Build Coastguard Worker #define FSAD64 \
100*fb1b10abSAndroid Build Coastguard Worker FSAD64_H(64) \
101*fb1b10abSAndroid Build Coastguard Worker FSAD64_H(32) \
102*fb1b10abSAndroid Build Coastguard Worker FSADS64_H(64) \
103*fb1b10abSAndroid Build Coastguard Worker FSADS64_H(32)
104*fb1b10abSAndroid Build Coastguard Worker
105*fb1b10abSAndroid Build Coastguard Worker #define FSAD32 \
106*fb1b10abSAndroid Build Coastguard Worker FSAD32_H(64) \
107*fb1b10abSAndroid Build Coastguard Worker FSAD32_H(32) \
108*fb1b10abSAndroid Build Coastguard Worker FSAD32_H(16) \
109*fb1b10abSAndroid Build Coastguard Worker FSADS32_H(64) \
110*fb1b10abSAndroid Build Coastguard Worker FSADS32_H(32) \
111*fb1b10abSAndroid Build Coastguard Worker FSADS32_H(16)
112*fb1b10abSAndroid Build Coastguard Worker
113*fb1b10abSAndroid Build Coastguard Worker FSAD64
114*fb1b10abSAndroid Build Coastguard Worker FSAD32
115*fb1b10abSAndroid Build Coastguard Worker
116*fb1b10abSAndroid Build Coastguard Worker #undef FSAD64
117*fb1b10abSAndroid Build Coastguard Worker #undef FSAD32
118*fb1b10abSAndroid Build Coastguard Worker #undef FSAD64_H
119*fb1b10abSAndroid Build Coastguard Worker #undef FSAD32_H
120*fb1b10abSAndroid Build Coastguard Worker #undef FSADS64_H
121*fb1b10abSAndroid Build Coastguard Worker #undef FSADS32_H
122*fb1b10abSAndroid Build Coastguard Worker
123*fb1b10abSAndroid Build Coastguard Worker #define FSADAVG64_H(h) \
124*fb1b10abSAndroid Build Coastguard Worker unsigned int vpx_sad64x##h##_avg_avx2( \
125*fb1b10abSAndroid Build Coastguard Worker const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \
126*fb1b10abSAndroid Build Coastguard Worker int ref_stride, const uint8_t *second_pred) { \
127*fb1b10abSAndroid Build Coastguard Worker int i; \
128*fb1b10abSAndroid Build Coastguard Worker __m256i sad1_reg, sad2_reg, ref1_reg, ref2_reg; \
129*fb1b10abSAndroid Build Coastguard Worker __m256i sum_sad = _mm256_setzero_si256(); \
130*fb1b10abSAndroid Build Coastguard Worker __m256i sum_sad_h; \
131*fb1b10abSAndroid Build Coastguard Worker __m128i sum_sad128; \
132*fb1b10abSAndroid Build Coastguard Worker for (i = 0; i < h; i++) { \
133*fb1b10abSAndroid Build Coastguard Worker ref1_reg = _mm256_loadu_si256((__m256i const *)ref_ptr); \
134*fb1b10abSAndroid Build Coastguard Worker ref2_reg = _mm256_loadu_si256((__m256i const *)(ref_ptr + 32)); \
135*fb1b10abSAndroid Build Coastguard Worker ref1_reg = _mm256_avg_epu8( \
136*fb1b10abSAndroid Build Coastguard Worker ref1_reg, _mm256_loadu_si256((__m256i const *)second_pred)); \
137*fb1b10abSAndroid Build Coastguard Worker ref2_reg = _mm256_avg_epu8( \
138*fb1b10abSAndroid Build Coastguard Worker ref2_reg, _mm256_loadu_si256((__m256i const *)(second_pred + 32))); \
139*fb1b10abSAndroid Build Coastguard Worker sad1_reg = _mm256_sad_epu8( \
140*fb1b10abSAndroid Build Coastguard Worker ref1_reg, _mm256_loadu_si256((__m256i const *)src_ptr)); \
141*fb1b10abSAndroid Build Coastguard Worker sad2_reg = _mm256_sad_epu8( \
142*fb1b10abSAndroid Build Coastguard Worker ref2_reg, _mm256_loadu_si256((__m256i const *)(src_ptr + 32))); \
143*fb1b10abSAndroid Build Coastguard Worker sum_sad = \
144*fb1b10abSAndroid Build Coastguard Worker _mm256_add_epi32(sum_sad, _mm256_add_epi32(sad1_reg, sad2_reg)); \
145*fb1b10abSAndroid Build Coastguard Worker ref_ptr += ref_stride; \
146*fb1b10abSAndroid Build Coastguard Worker src_ptr += src_stride; \
147*fb1b10abSAndroid Build Coastguard Worker second_pred += 64; \
148*fb1b10abSAndroid Build Coastguard Worker } \
149*fb1b10abSAndroid Build Coastguard Worker sum_sad_h = _mm256_srli_si256(sum_sad, 8); \
150*fb1b10abSAndroid Build Coastguard Worker sum_sad = _mm256_add_epi32(sum_sad, sum_sad_h); \
151*fb1b10abSAndroid Build Coastguard Worker sum_sad128 = _mm256_extracti128_si256(sum_sad, 1); \
152*fb1b10abSAndroid Build Coastguard Worker sum_sad128 = _mm_add_epi32(_mm256_castsi256_si128(sum_sad), sum_sad128); \
153*fb1b10abSAndroid Build Coastguard Worker return (unsigned int)_mm_cvtsi128_si32(sum_sad128); \
154*fb1b10abSAndroid Build Coastguard Worker }
155*fb1b10abSAndroid Build Coastguard Worker
156*fb1b10abSAndroid Build Coastguard Worker #define FSADAVG32_H(h) \
157*fb1b10abSAndroid Build Coastguard Worker unsigned int vpx_sad32x##h##_avg_avx2( \
158*fb1b10abSAndroid Build Coastguard Worker const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \
159*fb1b10abSAndroid Build Coastguard Worker int ref_stride, const uint8_t *second_pred) { \
160*fb1b10abSAndroid Build Coastguard Worker int i; \
161*fb1b10abSAndroid Build Coastguard Worker __m256i sad1_reg, sad2_reg, ref1_reg, ref2_reg; \
162*fb1b10abSAndroid Build Coastguard Worker __m256i sum_sad = _mm256_setzero_si256(); \
163*fb1b10abSAndroid Build Coastguard Worker __m256i sum_sad_h; \
164*fb1b10abSAndroid Build Coastguard Worker __m128i sum_sad128; \
165*fb1b10abSAndroid Build Coastguard Worker int ref2_stride = ref_stride << 1; \
166*fb1b10abSAndroid Build Coastguard Worker int src2_stride = src_stride << 1; \
167*fb1b10abSAndroid Build Coastguard Worker int max = h >> 1; \
168*fb1b10abSAndroid Build Coastguard Worker for (i = 0; i < max; i++) { \
169*fb1b10abSAndroid Build Coastguard Worker ref1_reg = _mm256_loadu_si256((__m256i const *)ref_ptr); \
170*fb1b10abSAndroid Build Coastguard Worker ref2_reg = _mm256_loadu_si256((__m256i const *)(ref_ptr + ref_stride)); \
171*fb1b10abSAndroid Build Coastguard Worker ref1_reg = _mm256_avg_epu8( \
172*fb1b10abSAndroid Build Coastguard Worker ref1_reg, _mm256_loadu_si256((__m256i const *)second_pred)); \
173*fb1b10abSAndroid Build Coastguard Worker ref2_reg = _mm256_avg_epu8( \
174*fb1b10abSAndroid Build Coastguard Worker ref2_reg, _mm256_loadu_si256((__m256i const *)(second_pred + 32))); \
175*fb1b10abSAndroid Build Coastguard Worker sad1_reg = _mm256_sad_epu8( \
176*fb1b10abSAndroid Build Coastguard Worker ref1_reg, _mm256_loadu_si256((__m256i const *)src_ptr)); \
177*fb1b10abSAndroid Build Coastguard Worker sad2_reg = _mm256_sad_epu8( \
178*fb1b10abSAndroid Build Coastguard Worker ref2_reg, \
179*fb1b10abSAndroid Build Coastguard Worker _mm256_loadu_si256((__m256i const *)(src_ptr + src_stride))); \
180*fb1b10abSAndroid Build Coastguard Worker sum_sad = \
181*fb1b10abSAndroid Build Coastguard Worker _mm256_add_epi32(sum_sad, _mm256_add_epi32(sad1_reg, sad2_reg)); \
182*fb1b10abSAndroid Build Coastguard Worker ref_ptr += ref2_stride; \
183*fb1b10abSAndroid Build Coastguard Worker src_ptr += src2_stride; \
184*fb1b10abSAndroid Build Coastguard Worker second_pred += 64; \
185*fb1b10abSAndroid Build Coastguard Worker } \
186*fb1b10abSAndroid Build Coastguard Worker sum_sad_h = _mm256_srli_si256(sum_sad, 8); \
187*fb1b10abSAndroid Build Coastguard Worker sum_sad = _mm256_add_epi32(sum_sad, sum_sad_h); \
188*fb1b10abSAndroid Build Coastguard Worker sum_sad128 = _mm256_extracti128_si256(sum_sad, 1); \
189*fb1b10abSAndroid Build Coastguard Worker sum_sad128 = _mm_add_epi32(_mm256_castsi256_si128(sum_sad), sum_sad128); \
190*fb1b10abSAndroid Build Coastguard Worker return (unsigned int)_mm_cvtsi128_si32(sum_sad128); \
191*fb1b10abSAndroid Build Coastguard Worker }
192*fb1b10abSAndroid Build Coastguard Worker
193*fb1b10abSAndroid Build Coastguard Worker #define FSADAVG64 \
194*fb1b10abSAndroid Build Coastguard Worker FSADAVG64_H(64) \
195*fb1b10abSAndroid Build Coastguard Worker FSADAVG64_H(32)
196*fb1b10abSAndroid Build Coastguard Worker
197*fb1b10abSAndroid Build Coastguard Worker #define FSADAVG32 \
198*fb1b10abSAndroid Build Coastguard Worker FSADAVG32_H(64) \
199*fb1b10abSAndroid Build Coastguard Worker FSADAVG32_H(32) \
200*fb1b10abSAndroid Build Coastguard Worker FSADAVG32_H(16)
201*fb1b10abSAndroid Build Coastguard Worker
202*fb1b10abSAndroid Build Coastguard Worker FSADAVG64
203*fb1b10abSAndroid Build Coastguard Worker FSADAVG32
204*fb1b10abSAndroid Build Coastguard Worker
205*fb1b10abSAndroid Build Coastguard Worker #undef FSADAVG64
206*fb1b10abSAndroid Build Coastguard Worker #undef FSADAVG32
207*fb1b10abSAndroid Build Coastguard Worker #undef FSADAVG64_H
208*fb1b10abSAndroid Build Coastguard Worker #undef FSADAVG32_H
209