1*77c1e3ccSAndroid Build Coastguard Worker /*
2*77c1e3ccSAndroid Build Coastguard Worker * Copyright (c) 2016, Alliance for Open Media. All rights reserved.
3*77c1e3ccSAndroid Build Coastguard Worker *
4*77c1e3ccSAndroid Build Coastguard Worker * This source code is subject to the terms of the BSD 2 Clause License and
5*77c1e3ccSAndroid Build Coastguard Worker * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
6*77c1e3ccSAndroid Build Coastguard Worker * was not distributed with this source code in the LICENSE file, you can
7*77c1e3ccSAndroid Build Coastguard Worker * obtain it at www.aomedia.org/license/software. If the Alliance for Open
8*77c1e3ccSAndroid Build Coastguard Worker * Media Patent License 1.0 was not distributed with this source code in the
9*77c1e3ccSAndroid Build Coastguard Worker * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
10*77c1e3ccSAndroid Build Coastguard Worker */
11*77c1e3ccSAndroid Build Coastguard Worker
12*77c1e3ccSAndroid Build Coastguard Worker #include <immintrin.h>
13*77c1e3ccSAndroid Build Coastguard Worker
14*77c1e3ccSAndroid Build Coastguard Worker #include "config/aom_dsp_rtcd.h"
15*77c1e3ccSAndroid Build Coastguard Worker
16*77c1e3ccSAndroid Build Coastguard Worker #include "aom_dsp/x86/masked_variance_intrin_ssse3.h"
17*77c1e3ccSAndroid Build Coastguard Worker #include "aom_dsp/x86/synonyms.h"
18*77c1e3ccSAndroid Build Coastguard Worker
// Folds a 256-bit vector of 16-bit lanes down to 128 bits by adding the upper
// 128-bit half onto the lower half, lane by lane (wrapping 16-bit adds).
static inline __m128i mm256_add_hi_lo_epi16(const __m256i val) {
  const __m128i lower_half = _mm256_castsi256_si128(val);
  const __m128i upper_half = _mm256_extractf128_si256(val, 1);
  return _mm_add_epi16(lower_half, upper_half);
}
23*77c1e3ccSAndroid Build Coastguard Worker
// Folds a 256-bit vector of 32-bit lanes down to 128 bits by adding the upper
// 128-bit half onto the lower half, lane by lane (wrapping 32-bit adds).
static inline __m128i mm256_add_hi_lo_epi32(const __m256i val) {
  const __m128i lower_half = _mm256_castsi256_si128(val);
  const __m128i upper_half = _mm256_extractf128_si256(val, 1);
  return _mm_add_epi32(lower_half, upper_half);
}
28*77c1e3ccSAndroid Build Coastguard Worker
// Accumulates differences and squared differences for 32 pixel pairs.
//   src, ref: 32 unsigned 8-bit pixels each.
//   sse: eight running 32-bit sums of squared differences, updated in place.
//   sum: sixteen running 16-bit sums of differences, updated in place with
//        wrapping adds; each lane gains two differences per call.
static inline void variance_kernel_avx2(const __m256i src, const __m256i ref,
                                        __m256i *const sse,
                                        __m256i *const sum) {
  // Every 16-bit lane holds the signed byte pair (+1, -1): 0x01 in the low
  // byte and 0xff in the high byte.
  const __m256i plus_minus_one = _mm256_set1_epi16((short)0xff01);

  // Interleave so each 16-bit lane is a (src, ref) byte pair.
  const __m256i pairs_lo = _mm256_unpacklo_epi8(src, ref);
  const __m256i pairs_hi = _mm256_unpackhi_epi8(src, ref);

  // maddubs evaluates src * 1 + ref * -1 per pair, i.e. the signed 16-bit
  // difference src - ref.
  const __m256i delta_lo = _mm256_maddubs_epi16(pairs_lo, plus_minus_one);
  const __m256i delta_hi = _mm256_maddubs_epi16(pairs_hi, plus_minus_one);

  // madd squares each difference and adds neighbouring squares into 32-bit
  // lanes.
  const __m256i squares_lo = _mm256_madd_epi16(delta_lo, delta_lo);
  const __m256i squares_hi = _mm256_madd_epi16(delta_hi, delta_hi);

  // Merge this call's contribution into the running totals.
  const __m256i delta_both = _mm256_add_epi16(delta_lo, delta_hi);
  const __m256i squares_both = _mm256_add_epi32(squares_lo, squares_hi);
  *sum = _mm256_add_epi16(*sum, delta_both);
  *sse = _mm256_add_epi32(*sse, squares_both);
}
48*77c1e3ccSAndroid Build Coastguard Worker
variance_final_from_32bit_sum_avx2(__m256i vsse,__m128i vsum,unsigned int * const sse)49*77c1e3ccSAndroid Build Coastguard Worker static inline int variance_final_from_32bit_sum_avx2(__m256i vsse, __m128i vsum,
50*77c1e3ccSAndroid Build Coastguard Worker unsigned int *const sse) {
51*77c1e3ccSAndroid Build Coastguard Worker // extract the low lane and add it to the high lane
52*77c1e3ccSAndroid Build Coastguard Worker const __m128i sse_reg_128 = mm256_add_hi_lo_epi32(vsse);
53*77c1e3ccSAndroid Build Coastguard Worker
54*77c1e3ccSAndroid Build Coastguard Worker // unpack sse and sum registers and add
55*77c1e3ccSAndroid Build Coastguard Worker const __m128i sse_sum_lo = _mm_unpacklo_epi32(sse_reg_128, vsum);
56*77c1e3ccSAndroid Build Coastguard Worker const __m128i sse_sum_hi = _mm_unpackhi_epi32(sse_reg_128, vsum);
57*77c1e3ccSAndroid Build Coastguard Worker const __m128i sse_sum = _mm_add_epi32(sse_sum_lo, sse_sum_hi);
58*77c1e3ccSAndroid Build Coastguard Worker
59*77c1e3ccSAndroid Build Coastguard Worker // perform the final summation and extract the results
60*77c1e3ccSAndroid Build Coastguard Worker const __m128i res = _mm_add_epi32(sse_sum, _mm_srli_si128(sse_sum, 8));
61*77c1e3ccSAndroid Build Coastguard Worker *((int *)sse) = _mm_cvtsi128_si32(res);
62*77c1e3ccSAndroid Build Coastguard Worker return _mm_extract_epi32(res, 1);
63*77c1e3ccSAndroid Build Coastguard Worker }
64*77c1e3ccSAndroid Build Coastguard Worker
65*77c1e3ccSAndroid Build Coastguard Worker // handle pixels (<= 512)
variance_final_512_avx2(__m256i vsse,__m256i vsum,unsigned int * const sse)66*77c1e3ccSAndroid Build Coastguard Worker static inline int variance_final_512_avx2(__m256i vsse, __m256i vsum,
67*77c1e3ccSAndroid Build Coastguard Worker unsigned int *const sse) {
68*77c1e3ccSAndroid Build Coastguard Worker // extract the low lane and add it to the high lane
69*77c1e3ccSAndroid Build Coastguard Worker const __m128i vsum_128 = mm256_add_hi_lo_epi16(vsum);
70*77c1e3ccSAndroid Build Coastguard Worker const __m128i vsum_64 = _mm_add_epi16(vsum_128, _mm_srli_si128(vsum_128, 8));
71*77c1e3ccSAndroid Build Coastguard Worker const __m128i sum_int32 = _mm_cvtepi16_epi32(vsum_64);
72*77c1e3ccSAndroid Build Coastguard Worker return variance_final_from_32bit_sum_avx2(vsse, sum_int32, sse);
73*77c1e3ccSAndroid Build Coastguard Worker }
74*77c1e3ccSAndroid Build Coastguard Worker
75*77c1e3ccSAndroid Build Coastguard Worker // handle 1024 pixels (32x32, 16x64, 64x16)
variance_final_1024_avx2(__m256i vsse,__m256i vsum,unsigned int * const sse)76*77c1e3ccSAndroid Build Coastguard Worker static inline int variance_final_1024_avx2(__m256i vsse, __m256i vsum,
77*77c1e3ccSAndroid Build Coastguard Worker unsigned int *const sse) {
78*77c1e3ccSAndroid Build Coastguard Worker // extract the low lane and add it to the high lane
79*77c1e3ccSAndroid Build Coastguard Worker const __m128i vsum_128 = mm256_add_hi_lo_epi16(vsum);
80*77c1e3ccSAndroid Build Coastguard Worker const __m128i vsum_64 =
81*77c1e3ccSAndroid Build Coastguard Worker _mm_add_epi32(_mm_cvtepi16_epi32(vsum_128),
82*77c1e3ccSAndroid Build Coastguard Worker _mm_cvtepi16_epi32(_mm_srli_si128(vsum_128, 8)));
83*77c1e3ccSAndroid Build Coastguard Worker return variance_final_from_32bit_sum_avx2(vsse, vsum_64, sse);
84*77c1e3ccSAndroid Build Coastguard Worker }
85*77c1e3ccSAndroid Build Coastguard Worker
// Converts sixteen signed 16-bit partial sums into eight 32-bit partial sums:
// both 128-bit halves are sign-extended to 32-bit lanes and added together,
// so output lane i equals input lane i plus input lane i + 8.
static inline __m256i sum_to_32bit_avx2(const __m256i sum) {
  const __m128i lower_8x16 = _mm256_castsi256_si128(sum);
  const __m128i upper_8x16 = _mm256_extractf128_si256(sum, 1);
  const __m256i lower_8x32 = _mm256_cvtepi16_epi32(lower_8x16);
  const __m256i upper_8x32 = _mm256_cvtepi16_epi32(upper_8x16);
  return _mm256_add_epi32(lower_8x32, upper_8x32);
}
92*77c1e3ccSAndroid Build Coastguard Worker
93*77c1e3ccSAndroid Build Coastguard Worker // handle 2048 pixels (32x64, 64x32)
variance_final_2048_avx2(__m256i vsse,__m256i vsum,unsigned int * const sse)94*77c1e3ccSAndroid Build Coastguard Worker static inline int variance_final_2048_avx2(__m256i vsse, __m256i vsum,
95*77c1e3ccSAndroid Build Coastguard Worker unsigned int *const sse) {
96*77c1e3ccSAndroid Build Coastguard Worker vsum = sum_to_32bit_avx2(vsum);
97*77c1e3ccSAndroid Build Coastguard Worker const __m128i vsum_128 = mm256_add_hi_lo_epi32(vsum);
98*77c1e3ccSAndroid Build Coastguard Worker return variance_final_from_32bit_sum_avx2(vsse, vsum_128, sse);
99*77c1e3ccSAndroid Build Coastguard Worker }
100*77c1e3ccSAndroid Build Coastguard Worker
// Processes two consecutive rows of a 16-pixel-wide block in one step.
// Row 0 occupies the lower 128 bits and row 1 the upper 128 bits of each
// 256-bit operand; all loads are unaligned.
//   src, ref:               first pixel of the row pair in each buffer.
//   src_stride, ref_stride: distance in bytes between consecutive rows.
//   sse, sum:               running accumulators updated by
//                           variance_kernel_avx2().
static inline void variance16_kernel_avx2(
    const uint8_t *const src, const int src_stride, const uint8_t *const ref,
    const int ref_stride, __m256i *const sse, __m256i *const sum) {
  const __m128i src_row0 = _mm_loadu_si128((const __m128i *)src);
  const __m128i src_row1 = _mm_loadu_si128((const __m128i *)(src + src_stride));
  const __m128i ref_row0 = _mm_loadu_si128((const __m128i *)ref);
  const __m128i ref_row1 = _mm_loadu_si128((const __m128i *)(ref + ref_stride));

  const __m256i src_rows =
      _mm256_inserti128_si256(_mm256_castsi128_si256(src_row0), src_row1, 1);
  const __m256i ref_rows =
      _mm256_inserti128_si256(_mm256_castsi128_si256(ref_row0), ref_row1, 1);

  variance_kernel_avx2(src_rows, ref_rows, sse, sum);
}
112*77c1e3ccSAndroid Build Coastguard Worker
// Processes 32 consecutive pixels: loads 32 bytes from src and from ref
// (unaligned loads) and feeds them to variance_kernel_avx2(), which updates
// the sse and sum accumulators.
static inline void variance32_kernel_avx2(const uint8_t *const src,
                                          const uint8_t *const ref,
                                          __m256i *const sse,
                                          __m256i *const sum) {
  const __m256i src_px = _mm256_loadu_si256((const __m256i *)src);
  const __m256i ref_px = _mm256_loadu_si256((const __m256i *)ref);
  variance_kernel_avx2(src_px, ref_px, sse, sum);
}
121*77c1e3ccSAndroid Build Coastguard Worker
// Accumulates difference statistics for a block 16 pixels wide and h rows
// tall, consuming two rows per iteration.
//   vsse: accumulated into; it is NOT cleared here, the caller initialises it.
//   vsum: reset to zero here, then accumulated into (16-bit lanes).
static inline void variance16_avx2(const uint8_t *src, const int src_stride,
                                   const uint8_t *ref, const int ref_stride,
                                   const int h, __m256i *const vsse,
                                   __m256i *const vsum) {
  const int src_step = 2 * src_stride;
  const int ref_step = 2 * ref_stride;

  *vsum = _mm256_setzero_si256();

  for (int rows_left = h; rows_left > 0; rows_left -= 2) {
    variance16_kernel_avx2(src, src_stride, ref, ref_stride, vsse, vsum);
    src += src_step;
    ref += ref_step;
  }
}
134*77c1e3ccSAndroid Build Coastguard Worker
// Accumulates difference statistics for a block 32 pixels wide and h rows
// tall, one row per iteration.
//   vsse: accumulated into; it is NOT cleared here, the caller initialises it.
//   vsum: reset to zero here, then accumulated into (16-bit lanes).
static inline void variance32_avx2(const uint8_t *src, const int src_stride,
                                   const uint8_t *ref, const int ref_stride,
                                   const int h, __m256i *const vsse,
                                   __m256i *const vsum) {
  *vsum = _mm256_setzero_si256();

  for (int rows_left = h; rows_left > 0; --rows_left) {
    variance32_kernel_avx2(src, ref, vsse, vsum);
    src += src_stride;
    ref += ref_stride;
  }
}
147*77c1e3ccSAndroid Build Coastguard Worker
// Accumulates difference statistics for a block 64 pixels wide and h rows
// tall; every row is handled as two 32-pixel segments, left to right.
//   vsse: accumulated into; it is NOT cleared here, the caller initialises it.
//   vsum: reset to zero here, then accumulated into (16-bit lanes).
static inline void variance64_avx2(const uint8_t *src, const int src_stride,
                                   const uint8_t *ref, const int ref_stride,
                                   const int h, __m256i *const vsse,
                                   __m256i *const vsum) {
  *vsum = _mm256_setzero_si256();

  for (int rows_left = h; rows_left > 0; --rows_left) {
    for (int x = 0; x < 64; x += 32) {
      variance32_kernel_avx2(src + x, ref + x, vsse, vsum);
    }
    src += src_stride;
    ref += ref_stride;
  }
}
161*77c1e3ccSAndroid Build Coastguard Worker
// Accumulates difference statistics for a block 128 pixels wide and h rows
// tall; every row is handled as four 32-pixel segments, left to right.
//   vsse: accumulated into; it is NOT cleared here, the caller initialises it.
//   vsum: reset to zero here, then accumulated into (16-bit lanes).
static inline void variance128_avx2(const uint8_t *src, const int src_stride,
                                    const uint8_t *ref, const int ref_stride,
                                    const int h, __m256i *const vsse,
                                    __m256i *const vsum) {
  *vsum = _mm256_setzero_si256();

  for (int rows_left = h; rows_left > 0; --rows_left) {
    for (int x = 0; x < 128; x += 32) {
      variance32_kernel_avx2(src + x, ref + x, vsse, vsum);
    }
    src += src_stride;
    ref += ref_stride;
  }
}
177*77c1e3ccSAndroid Build Coastguard Worker
// Defines aom_variance<bw>x<bh>_avx2(), which accumulates the whole block with
// a single call to variance<bw>_avx2().
//   bw, bh:    block width and height in pixels (each must be a single token,
//              since both are pasted into the function name).
//   bits:      log2(bw * bh); shifting sum^2 right by it divides by the pixel
//              count.
//   max_pixel: selects variance_final_<max_pixel>_avx2() for the final
//              reduction (512, 1024 or 2048).
// The generated function stores the sum of squared differences in *sse and
// returns *sse - ((sum * sum) >> bits).
// Macro arguments used inside expressions are parenthesized so that any
// constant expression expands with the intended precedence.
#define AOM_VAR_NO_LOOP_AVX2(bw, bh, bits, max_pixel)                         \
  unsigned int aom_variance##bw##x##bh##_avx2(                                \
      const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \
      unsigned int *sse) {                                                    \
    __m256i vsse = _mm256_setzero_si256();                                    \
    __m256i vsum;                                                             \
    variance##bw##_avx2(src, src_stride, ref, ref_stride, (bh), &vsse,        \
                        &vsum);                                               \
    const int sum = variance_final_##max_pixel##_avx2(vsse, vsum, sse);       \
    return *sse - (unsigned int)(((int64_t)sum * sum) >> (bits));             \
  }

AOM_VAR_NO_LOOP_AVX2(16, 8, 7, 512)
AOM_VAR_NO_LOOP_AVX2(16, 16, 8, 512)
AOM_VAR_NO_LOOP_AVX2(16, 32, 9, 512)

AOM_VAR_NO_LOOP_AVX2(32, 16, 9, 512)
AOM_VAR_NO_LOOP_AVX2(32, 32, 10, 1024)
AOM_VAR_NO_LOOP_AVX2(32, 64, 11, 2048)

AOM_VAR_NO_LOOP_AVX2(64, 32, 11, 2048)

#if !CONFIG_REALTIME_ONLY
AOM_VAR_NO_LOOP_AVX2(64, 16, 10, 1024)
AOM_VAR_NO_LOOP_AVX2(32, 8, 8, 512)
AOM_VAR_NO_LOOP_AVX2(16, 64, 10, 1024)
AOM_VAR_NO_LOOP_AVX2(16, 4, 6, 512)
#endif
205*77c1e3ccSAndroid Build Coastguard Worker
206*77c1e3ccSAndroid Build Coastguard Worker #define AOM_VAR_LOOP_AVX2(bw, bh, bits, uh) \
207*77c1e3ccSAndroid Build Coastguard Worker unsigned int aom_variance##bw##x##bh##_avx2( \
208*77c1e3ccSAndroid Build Coastguard Worker const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \
209*77c1e3ccSAndroid Build Coastguard Worker unsigned int *sse) { \
210*77c1e3ccSAndroid Build Coastguard Worker __m256i vsse = _mm256_setzero_si256(); \
211*77c1e3ccSAndroid Build Coastguard Worker __m256i vsum = _mm256_setzero_si256(); \
212*77c1e3ccSAndroid Build Coastguard Worker for (int i = 0; i < (bh / uh); i++) { \
213*77c1e3ccSAndroid Build Coastguard Worker __m256i vsum16; \
214*77c1e3ccSAndroid Build Coastguard Worker variance##bw##_avx2(src, src_stride, ref, ref_stride, uh, &vsse, \
215*77c1e3ccSAndroid Build Coastguard Worker &vsum16); \
216*77c1e3ccSAndroid Build Coastguard Worker vsum = _mm256_add_epi32(vsum, sum_to_32bit_avx2(vsum16)); \
217*77c1e3ccSAndroid Build Coastguard Worker src += uh * src_stride; \
218*77c1e3ccSAndroid Build Coastguard Worker ref += uh * ref_stride; \
219*77c1e3ccSAndroid Build Coastguard Worker } \
220*77c1e3ccSAndroid Build Coastguard Worker const __m128i vsum_128 = mm256_add_hi_lo_epi32(vsum); \
221*77c1e3ccSAndroid Build Coastguard Worker const int sum = variance_final_from_32bit_sum_avx2(vsse, vsum_128, sse); \
222*77c1e3ccSAndroid Build Coastguard Worker return *sse - (unsigned int)(((int64_t)sum * sum) >> bits); \
223*77c1e3ccSAndroid Build Coastguard Worker }
224*77c1e3ccSAndroid Build Coastguard Worker
225*77c1e3ccSAndroid Build Coastguard Worker AOM_VAR_LOOP_AVX2(64, 64, 12, 32) // 64x32 * ( 64/32)
226*77c1e3ccSAndroid Build Coastguard Worker AOM_VAR_LOOP_AVX2(64, 128, 13, 32) // 64x32 * (128/32)
227*77c1e3ccSAndroid Build Coastguard Worker AOM_VAR_LOOP_AVX2(128, 64, 13, 16) // 128x16 * ( 64/16)
228*77c1e3ccSAndroid Build Coastguard Worker AOM_VAR_LOOP_AVX2(128, 128, 14, 16) // 128x16 * (128/16)
229*77c1e3ccSAndroid Build Coastguard Worker
// Sum of squared differences between a 16x16 source block and a 16x16
// reference block. The work is delegated to aom_variance16x16_avx2(), whose
// variance return value is discarded; the SSE it writes to *sse is both left
// in *sse and returned.
unsigned int aom_mse16x16_avx2(const uint8_t *src, int src_stride,
                               const uint8_t *ref, int ref_stride,
                               unsigned int *sse) {
  (void)aom_variance16x16_avx2(src, src_stride, ref, ref_stride, sse);
  return *sse;
}
236*77c1e3ccSAndroid Build Coastguard Worker
// Builds a 256-bit vector from two unaligned 16-byte loads: the bytes at p1
// fill the lower 128 bits and the bytes at p0 fill the upper 128 bits.
static inline __m256i mm256_loadu2(const uint8_t *p0, const uint8_t *p1) {
  const __m128i lower = _mm_loadu_si128((const __m128i *)p1);
  const __m128i upper = _mm_loadu_si128((const __m128i *)p0);
  return _mm256_insertf128_si256(_mm256_castsi128_si256(lower), upper, 1);
}
242*77c1e3ccSAndroid Build Coastguard Worker
243*77c1e3ccSAndroid Build Coastguard Worker #if CONFIG_AV1_HIGHBITDEPTH
// 16-bit-pixel counterpart of mm256_loadu2(): builds a 256-bit vector from two
// unaligned 16-byte loads (eight uint16_t values each). The values at p1 fill
// the lower 128 bits and the values at p0 fill the upper 128 bits.
static inline __m256i mm256_loadu2_16(const uint16_t *p0, const uint16_t *p1) {
  const __m128i lower = _mm_loadu_si128((const __m128i *)p1);
  const __m128i upper = _mm_loadu_si128((const __m128i *)p0);
  return _mm256_insertf128_si256(_mm256_castsi128_si256(lower), upper, 1);
}
249*77c1e3ccSAndroid Build Coastguard Worker #endif // CONFIG_AV1_HIGHBITDEPTH
250*77c1e3ccSAndroid Build Coastguard Worker
comp_mask_pred_line_avx2(const __m256i s0,const __m256i s1,const __m256i a,uint8_t * comp_pred)251*77c1e3ccSAndroid Build Coastguard Worker static inline void comp_mask_pred_line_avx2(const __m256i s0, const __m256i s1,
252*77c1e3ccSAndroid Build Coastguard Worker const __m256i a,
253*77c1e3ccSAndroid Build Coastguard Worker uint8_t *comp_pred) {
254*77c1e3ccSAndroid Build Coastguard Worker const __m256i alpha_max = _mm256_set1_epi8(AOM_BLEND_A64_MAX_ALPHA);
255*77c1e3ccSAndroid Build Coastguard Worker const int16_t round_bits = 15 - AOM_BLEND_A64_ROUND_BITS;
256*77c1e3ccSAndroid Build Coastguard Worker const __m256i round_offset = _mm256_set1_epi16(1 << (round_bits));
257*77c1e3ccSAndroid Build Coastguard Worker
258*77c1e3ccSAndroid Build Coastguard Worker const __m256i ma = _mm256_sub_epi8(alpha_max, a);
259*77c1e3ccSAndroid Build Coastguard Worker
260*77c1e3ccSAndroid Build Coastguard Worker const __m256i ssAL = _mm256_unpacklo_epi8(s0, s1);
261*77c1e3ccSAndroid Build Coastguard Worker const __m256i aaAL = _mm256_unpacklo_epi8(a, ma);
262*77c1e3ccSAndroid Build Coastguard Worker const __m256i ssAH = _mm256_unpackhi_epi8(s0, s1);
263*77c1e3ccSAndroid Build Coastguard Worker const __m256i aaAH = _mm256_unpackhi_epi8(a, ma);
264*77c1e3ccSAndroid Build Coastguard Worker
265*77c1e3ccSAndroid Build Coastguard Worker const __m256i blendAL = _mm256_maddubs_epi16(ssAL, aaAL);
266*77c1e3ccSAndroid Build Coastguard Worker const __m256i blendAH = _mm256_maddubs_epi16(ssAH, aaAH);
267*77c1e3ccSAndroid Build Coastguard Worker const __m256i roundAL = _mm256_mulhrs_epi16(blendAL, round_offset);
268*77c1e3ccSAndroid Build Coastguard Worker const __m256i roundAH = _mm256_mulhrs_epi16(blendAH, round_offset);
269*77c1e3ccSAndroid Build Coastguard Worker
270*77c1e3ccSAndroid Build Coastguard Worker const __m256i roundA = _mm256_packus_epi16(roundAL, roundAH);
271*77c1e3ccSAndroid Build Coastguard Worker _mm256_storeu_si256((__m256i *)(comp_pred), roundA);
272*77c1e3ccSAndroid Build Coastguard Worker }
273*77c1e3ccSAndroid Build Coastguard Worker
// Writes the rounded per-pixel average (_mm256_avg_epu8) of pred and ref to
// comp_pred.
//   comp_pred, pred: accessed as contiguous buffers whose row stride equals
//                    width.
//   ref:             rows are ref_stride bytes apart.
// Widths 8, 16, 32 and multiples of 64 have dedicated AVX2 paths; every other
// width is forwarded to aom_comp_avg_pred_c(). The AVX2 paths are do/while
// loops, so they always run at least once, and they consume 4 rows per
// iteration for widths 8 and 16, 2 rows for width 32 and 1 row otherwise.
void aom_comp_avg_pred_avx2(uint8_t *comp_pred, const uint8_t *pred, int width,
                            int height, const uint8_t *ref, int ref_stride) {
  int rows_done = 0;

  if (width == 8) {
    // Four 8-pixel rows fill one 256-bit register.
    do {
      const __m128i r0 = _mm_loadl_epi64((const __m128i *)(ref));
      const __m128i r1 = _mm_loadl_epi64((const __m128i *)(ref + ref_stride));
      const __m128i r2 =
          _mm_loadl_epi64((const __m128i *)(ref + 2 * ref_stride));
      const __m128i r3 =
          _mm_loadl_epi64((const __m128i *)(ref + 3 * ref_stride));
      const __m128i r01 = _mm_unpacklo_epi64(r0, r1);
      const __m128i r23 = _mm_unpacklo_epi64(r2, r3);
      const __m256i ref_rows =
          _mm256_inserti128_si256(_mm256_castsi128_si256(r01), r23, 1);
      const __m256i pred_rows = _mm256_loadu_si256((const __m256i *)(pred));

      _mm256_storeu_si256((__m256i *)(comp_pred),
                          _mm256_avg_epu8(pred_rows, ref_rows));

      rows_done += 4;
      pred += 32;
      comp_pred += 32;
      ref += 4 * ref_stride;
    } while (rows_done < height);
    return;
  }

  if (width == 16) {
    // Two 16-pixel rows per register, two registers per iteration.
    do {
      const __m128i r0 = _mm_loadu_si128((const __m128i *)(ref));
      const __m128i r1 = _mm_loadu_si128((const __m128i *)(ref + ref_stride));
      const __m128i r2 =
          _mm_loadu_si128((const __m128i *)(ref + 2 * ref_stride));
      const __m128i r3 =
          _mm_loadu_si128((const __m128i *)(ref + 3 * ref_stride));
      const __m256i ref_rows01 =
          _mm256_inserti128_si256(_mm256_castsi128_si256(r0), r1, 1);
      const __m256i ref_rows23 =
          _mm256_inserti128_si256(_mm256_castsi128_si256(r2), r3, 1);
      const __m256i pred_rows01 = _mm256_loadu_si256((const __m256i *)(pred));
      const __m256i pred_rows23 =
          _mm256_loadu_si256((const __m256i *)(pred + 32));

      _mm256_storeu_si256((__m256i *)(comp_pred),
                          _mm256_avg_epu8(pred_rows01, ref_rows01));
      _mm256_storeu_si256((__m256i *)(comp_pred + 32),
                          _mm256_avg_epu8(pred_rows23, ref_rows23));

      rows_done += 4;
      pred += 64;
      comp_pred += 64;
      ref += 4 * ref_stride;
    } while (rows_done < height);
    return;
  }

  if (width == 32) {
    // One 32-pixel row per register, two rows per iteration.
    do {
      const __m256i ref_row0 = _mm256_loadu_si256((const __m256i *)(ref));
      const __m256i ref_row1 =
          _mm256_loadu_si256((const __m256i *)(ref + ref_stride));
      const __m256i pred_row0 = _mm256_loadu_si256((const __m256i *)(pred));
      const __m256i pred_row1 =
          _mm256_loadu_si256((const __m256i *)(pred + 32));

      _mm256_storeu_si256((__m256i *)(comp_pred),
                          _mm256_avg_epu8(pred_row0, ref_row0));
      _mm256_storeu_si256((__m256i *)(comp_pred + 32),
                          _mm256_avg_epu8(pred_row1, ref_row1));

      rows_done += 2;
      pred += 64;
      comp_pred += 64;
      ref += 2 * ref_stride;
    } while (rows_done < height);
    return;
  }

  if (width % 64 == 0) {
    // One row per iteration, walked in 64-pixel steps.
    do {
      for (int col = 0; col < width; col += 64) {
        const __m256i ref_lo = _mm256_loadu_si256((const __m256i *)(ref + col));
        const __m256i ref_hi =
            _mm256_loadu_si256((const __m256i *)(ref + col + 32));
        const __m256i pred_lo =
            _mm256_loadu_si256((const __m256i *)(pred + col));
        const __m256i pred_hi =
            _mm256_loadu_si256((const __m256i *)(pred + col + 32));

        _mm256_storeu_si256((__m256i *)(comp_pred + col),
                            _mm256_avg_epu8(pred_lo, ref_lo));
        _mm256_storeu_si256((__m256i *)(comp_pred + col + 32),
                            _mm256_avg_epu8(pred_hi, ref_hi));
      }
      rows_done++;
      pred += width;
      comp_pred += width;
      ref += ref_stride;
    } while (rows_done < height);
    return;
  }

  // No AVX2 path for this width.
  aom_comp_avg_pred_c(comp_pred, pred, width, height, ref, ref_stride);
}
362*77c1e3ccSAndroid Build Coastguard Worker
aom_comp_mask_pred_avx2(uint8_t * comp_pred,const uint8_t * pred,int width,int height,const uint8_t * ref,int ref_stride,const uint8_t * mask,int mask_stride,int invert_mask)363*77c1e3ccSAndroid Build Coastguard Worker void aom_comp_mask_pred_avx2(uint8_t *comp_pred, const uint8_t *pred, int width,
364*77c1e3ccSAndroid Build Coastguard Worker int height, const uint8_t *ref, int ref_stride,
365*77c1e3ccSAndroid Build Coastguard Worker const uint8_t *mask, int mask_stride,
366*77c1e3ccSAndroid Build Coastguard Worker int invert_mask) {
367*77c1e3ccSAndroid Build Coastguard Worker int i = 0;
368*77c1e3ccSAndroid Build Coastguard Worker const uint8_t *src0 = invert_mask ? pred : ref;
369*77c1e3ccSAndroid Build Coastguard Worker const uint8_t *src1 = invert_mask ? ref : pred;
370*77c1e3ccSAndroid Build Coastguard Worker const int stride0 = invert_mask ? width : ref_stride;
371*77c1e3ccSAndroid Build Coastguard Worker const int stride1 = invert_mask ? ref_stride : width;
372*77c1e3ccSAndroid Build Coastguard Worker if (width == 8) {
373*77c1e3ccSAndroid Build Coastguard Worker comp_mask_pred_8_ssse3(comp_pred, height, src0, stride0, src1, stride1,
374*77c1e3ccSAndroid Build Coastguard Worker mask, mask_stride);
375*77c1e3ccSAndroid Build Coastguard Worker } else if (width == 16) {
376*77c1e3ccSAndroid Build Coastguard Worker do {
377*77c1e3ccSAndroid Build Coastguard Worker const __m256i sA0 = mm256_loadu2(src0 + stride0, src0);
378*77c1e3ccSAndroid Build Coastguard Worker const __m256i sA1 = mm256_loadu2(src1 + stride1, src1);
379*77c1e3ccSAndroid Build Coastguard Worker const __m256i aA = mm256_loadu2(mask + mask_stride, mask);
380*77c1e3ccSAndroid Build Coastguard Worker src0 += (stride0 << 1);
381*77c1e3ccSAndroid Build Coastguard Worker src1 += (stride1 << 1);
382*77c1e3ccSAndroid Build Coastguard Worker mask += (mask_stride << 1);
383*77c1e3ccSAndroid Build Coastguard Worker const __m256i sB0 = mm256_loadu2(src0 + stride0, src0);
384*77c1e3ccSAndroid Build Coastguard Worker const __m256i sB1 = mm256_loadu2(src1 + stride1, src1);
385*77c1e3ccSAndroid Build Coastguard Worker const __m256i aB = mm256_loadu2(mask + mask_stride, mask);
386*77c1e3ccSAndroid Build Coastguard Worker src0 += (stride0 << 1);
387*77c1e3ccSAndroid Build Coastguard Worker src1 += (stride1 << 1);
388*77c1e3ccSAndroid Build Coastguard Worker mask += (mask_stride << 1);
389*77c1e3ccSAndroid Build Coastguard Worker // comp_pred's stride == width == 16
390*77c1e3ccSAndroid Build Coastguard Worker comp_mask_pred_line_avx2(sA0, sA1, aA, comp_pred);
391*77c1e3ccSAndroid Build Coastguard Worker comp_mask_pred_line_avx2(sB0, sB1, aB, comp_pred + 32);
392*77c1e3ccSAndroid Build Coastguard Worker comp_pred += (16 << 2);
393*77c1e3ccSAndroid Build Coastguard Worker i += 4;
394*77c1e3ccSAndroid Build Coastguard Worker } while (i < height);
395*77c1e3ccSAndroid Build Coastguard Worker } else {
396*77c1e3ccSAndroid Build Coastguard Worker do {
397*77c1e3ccSAndroid Build Coastguard Worker for (int x = 0; x < width; x += 32) {
398*77c1e3ccSAndroid Build Coastguard Worker const __m256i sA0 = _mm256_lddqu_si256((const __m256i *)(src0 + x));
399*77c1e3ccSAndroid Build Coastguard Worker const __m256i sA1 = _mm256_lddqu_si256((const __m256i *)(src1 + x));
400*77c1e3ccSAndroid Build Coastguard Worker const __m256i aA = _mm256_lddqu_si256((const __m256i *)(mask + x));
401*77c1e3ccSAndroid Build Coastguard Worker
402*77c1e3ccSAndroid Build Coastguard Worker comp_mask_pred_line_avx2(sA0, sA1, aA, comp_pred);
403*77c1e3ccSAndroid Build Coastguard Worker comp_pred += 32;
404*77c1e3ccSAndroid Build Coastguard Worker }
405*77c1e3ccSAndroid Build Coastguard Worker src0 += stride0;
406*77c1e3ccSAndroid Build Coastguard Worker src1 += stride1;
407*77c1e3ccSAndroid Build Coastguard Worker mask += mask_stride;
408*77c1e3ccSAndroid Build Coastguard Worker i++;
409*77c1e3ccSAndroid Build Coastguard Worker } while (i < height);
410*77c1e3ccSAndroid Build Coastguard Worker }
411*77c1e3ccSAndroid Build Coastguard Worker }
412*77c1e3ccSAndroid Build Coastguard Worker
413*77c1e3ccSAndroid Build Coastguard Worker #if CONFIG_AV1_HIGHBITDEPTH
highbd_comp_mask_pred_line_avx2(const __m256i s0,const __m256i s1,const __m256i a)414*77c1e3ccSAndroid Build Coastguard Worker static inline __m256i highbd_comp_mask_pred_line_avx2(const __m256i s0,
415*77c1e3ccSAndroid Build Coastguard Worker const __m256i s1,
416*77c1e3ccSAndroid Build Coastguard Worker const __m256i a) {
417*77c1e3ccSAndroid Build Coastguard Worker const __m256i alpha_max = _mm256_set1_epi16((1 << AOM_BLEND_A64_ROUND_BITS));
418*77c1e3ccSAndroid Build Coastguard Worker const __m256i round_const =
419*77c1e3ccSAndroid Build Coastguard Worker _mm256_set1_epi32((1 << AOM_BLEND_A64_ROUND_BITS) >> 1);
420*77c1e3ccSAndroid Build Coastguard Worker const __m256i a_inv = _mm256_sub_epi16(alpha_max, a);
421*77c1e3ccSAndroid Build Coastguard Worker
422*77c1e3ccSAndroid Build Coastguard Worker const __m256i s_lo = _mm256_unpacklo_epi16(s0, s1);
423*77c1e3ccSAndroid Build Coastguard Worker const __m256i a_lo = _mm256_unpacklo_epi16(a, a_inv);
424*77c1e3ccSAndroid Build Coastguard Worker const __m256i pred_lo = _mm256_madd_epi16(s_lo, a_lo);
425*77c1e3ccSAndroid Build Coastguard Worker const __m256i pred_l = _mm256_srai_epi32(
426*77c1e3ccSAndroid Build Coastguard Worker _mm256_add_epi32(pred_lo, round_const), AOM_BLEND_A64_ROUND_BITS);
427*77c1e3ccSAndroid Build Coastguard Worker
428*77c1e3ccSAndroid Build Coastguard Worker const __m256i s_hi = _mm256_unpackhi_epi16(s0, s1);
429*77c1e3ccSAndroid Build Coastguard Worker const __m256i a_hi = _mm256_unpackhi_epi16(a, a_inv);
430*77c1e3ccSAndroid Build Coastguard Worker const __m256i pred_hi = _mm256_madd_epi16(s_hi, a_hi);
431*77c1e3ccSAndroid Build Coastguard Worker const __m256i pred_h = _mm256_srai_epi32(
432*77c1e3ccSAndroid Build Coastguard Worker _mm256_add_epi32(pred_hi, round_const), AOM_BLEND_A64_ROUND_BITS);
433*77c1e3ccSAndroid Build Coastguard Worker
434*77c1e3ccSAndroid Build Coastguard Worker const __m256i comp = _mm256_packs_epi32(pred_l, pred_h);
435*77c1e3ccSAndroid Build Coastguard Worker
436*77c1e3ccSAndroid Build Coastguard Worker return comp;
437*77c1e3ccSAndroid Build Coastguard Worker }
438*77c1e3ccSAndroid Build Coastguard Worker
aom_highbd_comp_mask_pred_avx2(uint8_t * comp_pred8,const uint8_t * pred8,int width,int height,const uint8_t * ref8,int ref_stride,const uint8_t * mask,int mask_stride,int invert_mask)439*77c1e3ccSAndroid Build Coastguard Worker void aom_highbd_comp_mask_pred_avx2(uint8_t *comp_pred8, const uint8_t *pred8,
440*77c1e3ccSAndroid Build Coastguard Worker int width, int height, const uint8_t *ref8,
441*77c1e3ccSAndroid Build Coastguard Worker int ref_stride, const uint8_t *mask,
442*77c1e3ccSAndroid Build Coastguard Worker int mask_stride, int invert_mask) {
443*77c1e3ccSAndroid Build Coastguard Worker int i = 0;
444*77c1e3ccSAndroid Build Coastguard Worker uint16_t *pred = CONVERT_TO_SHORTPTR(pred8);
445*77c1e3ccSAndroid Build Coastguard Worker uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
446*77c1e3ccSAndroid Build Coastguard Worker uint16_t *comp_pred = CONVERT_TO_SHORTPTR(comp_pred8);
447*77c1e3ccSAndroid Build Coastguard Worker const uint16_t *src0 = invert_mask ? pred : ref;
448*77c1e3ccSAndroid Build Coastguard Worker const uint16_t *src1 = invert_mask ? ref : pred;
449*77c1e3ccSAndroid Build Coastguard Worker const int stride0 = invert_mask ? width : ref_stride;
450*77c1e3ccSAndroid Build Coastguard Worker const int stride1 = invert_mask ? ref_stride : width;
451*77c1e3ccSAndroid Build Coastguard Worker const __m256i zero = _mm256_setzero_si256();
452*77c1e3ccSAndroid Build Coastguard Worker
453*77c1e3ccSAndroid Build Coastguard Worker if (width == 8) {
454*77c1e3ccSAndroid Build Coastguard Worker do {
455*77c1e3ccSAndroid Build Coastguard Worker const __m256i s0 = mm256_loadu2_16(src0 + stride0, src0);
456*77c1e3ccSAndroid Build Coastguard Worker const __m256i s1 = mm256_loadu2_16(src1 + stride1, src1);
457*77c1e3ccSAndroid Build Coastguard Worker
458*77c1e3ccSAndroid Build Coastguard Worker const __m128i m_l = _mm_loadl_epi64((const __m128i *)mask);
459*77c1e3ccSAndroid Build Coastguard Worker const __m128i m_h = _mm_loadl_epi64((const __m128i *)(mask + 8));
460*77c1e3ccSAndroid Build Coastguard Worker
461*77c1e3ccSAndroid Build Coastguard Worker __m256i m = _mm256_castsi128_si256(m_l);
462*77c1e3ccSAndroid Build Coastguard Worker m = _mm256_insertf128_si256(m, m_h, 1);
463*77c1e3ccSAndroid Build Coastguard Worker const __m256i m_16 = _mm256_unpacklo_epi8(m, zero);
464*77c1e3ccSAndroid Build Coastguard Worker
465*77c1e3ccSAndroid Build Coastguard Worker const __m256i comp = highbd_comp_mask_pred_line_avx2(s0, s1, m_16);
466*77c1e3ccSAndroid Build Coastguard Worker
467*77c1e3ccSAndroid Build Coastguard Worker _mm_storeu_si128((__m128i *)(comp_pred), _mm256_castsi256_si128(comp));
468*77c1e3ccSAndroid Build Coastguard Worker
469*77c1e3ccSAndroid Build Coastguard Worker _mm_storeu_si128((__m128i *)(comp_pred + width),
470*77c1e3ccSAndroid Build Coastguard Worker _mm256_extractf128_si256(comp, 1));
471*77c1e3ccSAndroid Build Coastguard Worker
472*77c1e3ccSAndroid Build Coastguard Worker src0 += (stride0 << 1);
473*77c1e3ccSAndroid Build Coastguard Worker src1 += (stride1 << 1);
474*77c1e3ccSAndroid Build Coastguard Worker mask += (mask_stride << 1);
475*77c1e3ccSAndroid Build Coastguard Worker comp_pred += (width << 1);
476*77c1e3ccSAndroid Build Coastguard Worker i += 2;
477*77c1e3ccSAndroid Build Coastguard Worker } while (i < height);
478*77c1e3ccSAndroid Build Coastguard Worker } else if (width == 16) {
479*77c1e3ccSAndroid Build Coastguard Worker do {
480*77c1e3ccSAndroid Build Coastguard Worker const __m256i s0 = _mm256_loadu_si256((const __m256i *)(src0));
481*77c1e3ccSAndroid Build Coastguard Worker const __m256i s1 = _mm256_loadu_si256((const __m256i *)(src1));
482*77c1e3ccSAndroid Build Coastguard Worker const __m256i m_16 =
483*77c1e3ccSAndroid Build Coastguard Worker _mm256_cvtepu8_epi16(_mm_loadu_si128((const __m128i *)mask));
484*77c1e3ccSAndroid Build Coastguard Worker
485*77c1e3ccSAndroid Build Coastguard Worker const __m256i comp = highbd_comp_mask_pred_line_avx2(s0, s1, m_16);
486*77c1e3ccSAndroid Build Coastguard Worker
487*77c1e3ccSAndroid Build Coastguard Worker _mm256_storeu_si256((__m256i *)comp_pred, comp);
488*77c1e3ccSAndroid Build Coastguard Worker
489*77c1e3ccSAndroid Build Coastguard Worker src0 += stride0;
490*77c1e3ccSAndroid Build Coastguard Worker src1 += stride1;
491*77c1e3ccSAndroid Build Coastguard Worker mask += mask_stride;
492*77c1e3ccSAndroid Build Coastguard Worker comp_pred += width;
493*77c1e3ccSAndroid Build Coastguard Worker i += 1;
494*77c1e3ccSAndroid Build Coastguard Worker } while (i < height);
495*77c1e3ccSAndroid Build Coastguard Worker } else {
496*77c1e3ccSAndroid Build Coastguard Worker do {
497*77c1e3ccSAndroid Build Coastguard Worker for (int x = 0; x < width; x += 32) {
498*77c1e3ccSAndroid Build Coastguard Worker const __m256i s0 = _mm256_loadu_si256((const __m256i *)(src0 + x));
499*77c1e3ccSAndroid Build Coastguard Worker const __m256i s2 = _mm256_loadu_si256((const __m256i *)(src0 + x + 16));
500*77c1e3ccSAndroid Build Coastguard Worker const __m256i s1 = _mm256_loadu_si256((const __m256i *)(src1 + x));
501*77c1e3ccSAndroid Build Coastguard Worker const __m256i s3 = _mm256_loadu_si256((const __m256i *)(src1 + x + 16));
502*77c1e3ccSAndroid Build Coastguard Worker
503*77c1e3ccSAndroid Build Coastguard Worker const __m256i m01_16 =
504*77c1e3ccSAndroid Build Coastguard Worker _mm256_cvtepu8_epi16(_mm_loadu_si128((const __m128i *)(mask + x)));
505*77c1e3ccSAndroid Build Coastguard Worker const __m256i m23_16 = _mm256_cvtepu8_epi16(
506*77c1e3ccSAndroid Build Coastguard Worker _mm_loadu_si128((const __m128i *)(mask + x + 16)));
507*77c1e3ccSAndroid Build Coastguard Worker
508*77c1e3ccSAndroid Build Coastguard Worker const __m256i comp = highbd_comp_mask_pred_line_avx2(s0, s1, m01_16);
509*77c1e3ccSAndroid Build Coastguard Worker const __m256i comp1 = highbd_comp_mask_pred_line_avx2(s2, s3, m23_16);
510*77c1e3ccSAndroid Build Coastguard Worker
511*77c1e3ccSAndroid Build Coastguard Worker _mm256_storeu_si256((__m256i *)comp_pred, comp);
512*77c1e3ccSAndroid Build Coastguard Worker _mm256_storeu_si256((__m256i *)(comp_pred + 16), comp1);
513*77c1e3ccSAndroid Build Coastguard Worker
514*77c1e3ccSAndroid Build Coastguard Worker comp_pred += 32;
515*77c1e3ccSAndroid Build Coastguard Worker }
516*77c1e3ccSAndroid Build Coastguard Worker src0 += stride0;
517*77c1e3ccSAndroid Build Coastguard Worker src1 += stride1;
518*77c1e3ccSAndroid Build Coastguard Worker mask += mask_stride;
519*77c1e3ccSAndroid Build Coastguard Worker i += 1;
520*77c1e3ccSAndroid Build Coastguard Worker } while (i < height);
521*77c1e3ccSAndroid Build Coastguard Worker }
522*77c1e3ccSAndroid Build Coastguard Worker }
523*77c1e3ccSAndroid Build Coastguard Worker #endif // CONFIG_AV1_HIGHBITDEPTH
524*77c1e3ccSAndroid Build Coastguard Worker
// Loads four consecutive bytes starting at p into the low 32 bits of an XMM
// register (upper bits zero). The bytes are assembled individually so that no
// alignment or effective-type requirement is placed on p; on little-endian
// x86 the resulting value equals a plain 32-bit load from p.
static inline __m128i mse_load_4x8bit_avx2(const uint8_t *p) {
  const uint32_t packed = (uint32_t)p[0] | ((uint32_t)p[1] << 8) |
                          ((uint32_t)p[2] << 16) | ((uint32_t)p[3] << 24);
  return _mm_cvtsi32_si128((int)packed);
}

// Returns the sum of squared differences between a 4-wide, h-tall block of
// 8-bit samples in dst (row stride dstride) and 16-bit samples in src (row
// stride sstride).
//
// Rows are consumed four at a time, so h is expected to be a multiple of 4.
// Squares are accumulated in 32-bit lanes and widened to 64 bits only in the
// final reduction.
static uint64_t mse_4xh_16bit_avx2(uint8_t *dst, int dstride, uint16_t *src,
                                   int sstride, int h) {
  uint64_t sum = 0;
  const __m256i zeros = _mm256_broadcastsi128_si256(_mm_setzero_si128());
  __m256i square_result = zeros;

  for (int i = 0; i < h; i += 4) {
    // Gather four 4-byte dst rows into one 16-byte vector and widen to 16 bit.
    const __m128i dst0_4x8 = mse_load_4x8bit_avx2(&dst[(i + 0) * dstride]);
    const __m128i dst1_4x8 = mse_load_4x8bit_avx2(&dst[(i + 1) * dstride]);
    const __m128i dst2_4x8 = mse_load_4x8bit_avx2(&dst[(i + 2) * dstride]);
    const __m128i dst3_4x8 = mse_load_4x8bit_avx2(&dst[(i + 3) * dstride]);
    const __m128i dst_16x8 =
        _mm_unpacklo_epi64(_mm_unpacklo_epi32(dst0_4x8, dst1_4x8),
                           _mm_unpacklo_epi32(dst2_4x8, dst3_4x8));
    const __m256i dst_16x16 = _mm256_cvtepu8_epi16(dst_16x8);

    // Gather four 4-sample src rows (8 bytes each) in the same order.
    const __m128i src0_4x16 =
        _mm_loadl_epi64((__m128i const *)(&src[(i + 0) * sstride]));
    const __m128i src1_4x16 =
        _mm_loadl_epi64((__m128i const *)(&src[(i + 1) * sstride]));
    const __m128i src2_4x16 =
        _mm_loadl_epi64((__m128i const *)(&src[(i + 2) * sstride]));
    const __m128i src3_4x16 =
        _mm_loadl_epi64((__m128i const *)(&src[(i + 3) * sstride]));
    const __m256i src0_8x16 =
        _mm256_castsi128_si256(_mm_unpacklo_epi64(src0_4x16, src1_4x16));
    const __m256i src1_8x16 =
        _mm256_castsi128_si256(_mm_unpacklo_epi64(src2_4x16, src3_4x16));
    // 0x20: low lane from src0_8x16, high lane from the low lane of src1_8x16.
    const __m256i src_16x16 =
        _mm256_permute2x128_si256(src0_8x16, src1_8x16, 0x20);

    // r15 r14 r13------------r1 r0 - 16 bit
    const __m256i sub_result =
        _mm256_abs_epi16(_mm256_sub_epi16(src_16x16, dst_16x16));

    // s7 s6 s5 s4 s3 s2 s1 s0 - 32bit (each lane is a sum of two squares)
    const __m256i squares = _mm256_madd_epi16(sub_result, sub_result);

    // accumulation of result
    square_result = _mm256_add_epi32(square_result, squares);
  }

  // Zero-extend the eight 32-bit partial sums to 64 bits and add them up.
  // s5 s4 s1 s0 - 64bit
  const __m256i res_lo_4x64 = _mm256_unpacklo_epi32(square_result, zeros);
  // s7 s6 s3 s2 - 64bit
  const __m256i res_hi_4x64 = _mm256_unpackhi_epi32(square_result, zeros);
  // r3 r2 r1 r0 - 64bit
  const __m256i res_4x64 = _mm256_add_epi64(res_lo_4x64, res_hi_4x64);
  // r1+r3 r2+r0 - 64bit
  const __m128i sum_1x64 = _mm_add_epi64(_mm256_castsi256_si128(res_4x64),
                                         _mm256_extracti128_si256(res_4x64, 1));
  xx_storel_64(&sum, _mm_add_epi64(sum_1x64, _mm_srli_si128(sum_1x64, 8)));
  return sum;
}
577*77c1e3ccSAndroid Build Coastguard Worker
// Sum of squared differences over four horizontally adjacent 4x4 blocks.
//
// src stores each 4x4 block of a 32x32 filter block as 16 consecutive 16-bit
// values, with consecutive blocks src_blk_stride elements apart (so
// src_blk_stride equals the block width). dst is a frame buffer of 8-bit
// samples, hence dstride is a frame-level stride. Every group of four dst
// rows consumes one set of four src blocks, after which src advances by 16
// elements.
static uint64_t mse_4xh_quad_16bit_avx2(uint8_t *dst, int dstride,
                                        uint16_t *src, int src_blk_stride,
                                        int h) {
  const __m256i zero = _mm256_broadcastsi128_si256(_mm_setzero_si128());
  __m256i acc32 = zero;
  const uint16_t *blk = src;

  for (int row = 0; row < h; row += 4, blk += 16) {
    // dst_rows[r] holds row r of all four blocks (16 pixels) widened to
    // 16 bit: d0r d1r d2r d3r.
    __m256i dst_rows[4];
    for (int r = 0; r < 4; ++r) {
      const __m128i px8 =
          _mm_loadu_si128((const __m128i *)(&dst[(row + r) * dstride]));
      dst_rows[r] = _mm256_cvtepu8_epi16(px8);
    }

    // Each load brings in all four rows of one 4x4 block: rN0 rN1 rN2 rN3.
    const __m256i blk0 = _mm256_loadu_si256((const __m256i *)blk);
    const __m256i blk1 =
        _mm256_loadu_si256((const __m256i *)(blk + src_blk_stride));
    const __m256i blk2 =
        _mm256_loadu_si256((const __m256i *)(blk + 2 * src_blk_stride));
    const __m256i blk3 =
        _mm256_loadu_si256((const __m256i *)(blk + 3 * src_blk_stride));

    // Transpose from block-major to row-major order.
    const __m256i even01 = _mm256_unpacklo_epi64(blk0, blk1);  // r00 r10 r02 r12
    const __m256i odd01 = _mm256_unpackhi_epi64(blk0, blk1);   // r01 r11 r03 r13
    const __m256i even23 = _mm256_unpacklo_epi64(blk2, blk3);  // r20 r30 r22 r32
    const __m256i odd23 = _mm256_unpackhi_epi64(blk2, blk3);   // r21 r31 r23 r33

    __m256i src_rows[4];
    src_rows[0] = _mm256_permute2f128_si256(even01, even23, 0x20);  // row 0
    src_rows[1] = _mm256_permute2f128_si256(odd01, odd23, 0x20);    // row 1
    src_rows[2] = _mm256_permute2f128_si256(even01, even23, 0x31);  // row 2
    src_rows[3] = _mm256_permute2f128_si256(odd01, odd23, 0x31);    // row 3

    // |src - dst| per 16-bit lane, then madd squares and pairs them into
    // 32-bit lanes, which are added to the running total.
    for (int r = 0; r < 4; ++r) {
      const __m256i diff =
          _mm256_abs_epi16(_mm256_sub_epi16(src_rows[r], dst_rows[r]));
      acc32 = _mm256_add_epi32(acc32, _mm256_madd_epi16(diff, diff));
    }
  }

  // Widen the eight 32-bit partial sums to 64 bit and reduce to one value.
  const __m256i wide_lo = _mm256_unpacklo_epi32(acc32, zero);
  const __m256i wide_hi = _mm256_unpackhi_epi32(acc32, zero);
  const __m256i wide = _mm256_add_epi64(wide_lo, wide_hi);
  const __m128i halves = _mm_add_epi64(_mm256_castsi256_si128(wide),
                                       _mm256_extracti128_si256(wide, 1));

  uint64_t sum = 0;
  xx_storel_64(&sum, _mm_add_epi64(halves, _mm_srli_si128(halves, 8)));
  return sum;
}
672*77c1e3ccSAndroid Build Coastguard Worker
// Returns the sum of squared differences between an 8-wide, h-tall block of
// 8-bit samples in dst (row stride dstride) and 16-bit samples in src (row
// stride sstride).
//
// Two rows are consumed per iteration, so h is expected to be even. Squares
// are accumulated in 32-bit lanes and widened to 64 bits in the final
// reduction.
static uint64_t mse_8xh_16bit_avx2(uint8_t *dst, int dstride, uint16_t *src,
                                   int sstride, int h) {
  const __m256i zero = _mm256_broadcastsi128_si256(_mm_setzero_si128());
  __m256i acc32 = zero;

  for (int row = 0; row < h; row += 2) {
    // Two 8-pixel dst rows side by side, widened to sixteen 16-bit lanes.
    const __m128i dst_top =
        _mm_loadl_epi64((__m128i const *)(&dst[(row + 0) * dstride]));
    const __m128i dst_bot =
        _mm_loadl_epi64((__m128i const *)(&dst[(row + 1) * dstride]));
    const __m256i dst_px =
        _mm256_cvtepu8_epi16(_mm_unpacklo_epi64(dst_top, dst_bot));

    // The matching two src rows: first row in the low lane, second row in
    // the high lane (selector 0x20 takes the low lane of each operand).
    const __m256i src_top =
        _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)&src[row * sstride]));
    const __m256i src_bot = _mm256_castsi128_si256(
        _mm_loadu_si128((__m128i *)&src[(row + 1) * sstride]));
    const __m256i src_px = _mm256_permute2x128_si256(src_top, src_bot, 0x20);

    // |src - dst| in 16 bit, then madd squares and pairs into 32-bit lanes.
    const __m256i diff = _mm256_abs_epi16(_mm256_sub_epi16(src_px, dst_px));
    acc32 = _mm256_add_epi32(acc32, _mm256_madd_epi16(diff, diff));
  }

  // Widen the eight 32-bit partial sums to 64 bit and reduce to one value.
  const __m256i wide_lo = _mm256_unpacklo_epi32(acc32, zero);
  const __m256i wide_hi = _mm256_unpackhi_epi32(acc32, zero);
  const __m256i wide = _mm256_add_epi64(wide_lo, wide_hi);
  const __m128i halves = _mm_add_epi64(_mm256_castsi256_si128(wide),
                                       _mm256_extracti128_si256(wide, 1));

  uint64_t sum = 0;
  xx_storel_64(&sum, _mm_add_epi64(halves, _mm_srli_si128(halves, 8)));
  return sum;
}
718*77c1e3ccSAndroid Build Coastguard Worker
719*77c1e3ccSAndroid Build Coastguard Worker // Compute mse of two consecutive 8x8 blocks.
720*77c1e3ccSAndroid Build Coastguard Worker // In src buffer, each 8x8 block in a 64x64 filter block is stored sequentially.
721*77c1e3ccSAndroid Build Coastguard Worker // Hence src_blk_stride is same as block width. Whereas dst buffer is a frame
722*77c1e3ccSAndroid Build Coastguard Worker // buffer, thus dstride is a frame level stride.
mse_8xh_dual_16bit_avx2(uint8_t * dst,int dstride,uint16_t * src,int src_blk_stride,int h)723*77c1e3ccSAndroid Build Coastguard Worker static uint64_t mse_8xh_dual_16bit_avx2(uint8_t *dst, int dstride,
724*77c1e3ccSAndroid Build Coastguard Worker uint16_t *src, int src_blk_stride,
725*77c1e3ccSAndroid Build Coastguard Worker int h) {
726*77c1e3ccSAndroid Build Coastguard Worker uint64_t sum = 0;
727*77c1e3ccSAndroid Build Coastguard Worker __m128i dst0_16x8, dst1_16x8;
728*77c1e3ccSAndroid Build Coastguard Worker __m256i dst0_16x16, dst1_16x16;
729*77c1e3ccSAndroid Build Coastguard Worker __m256i res0_4x64, res1_4x64;
730*77c1e3ccSAndroid Build Coastguard Worker __m256i sub_result_0, sub_result_1;
731*77c1e3ccSAndroid Build Coastguard Worker const __m256i zeros = _mm256_broadcastsi128_si256(_mm_setzero_si128());
732*77c1e3ccSAndroid Build Coastguard Worker __m256i square_result = zeros;
733*77c1e3ccSAndroid Build Coastguard Worker uint16_t *src_temp = src;
734*77c1e3ccSAndroid Build Coastguard Worker
735*77c1e3ccSAndroid Build Coastguard Worker for (int i = 0; i < h; i += 2) {
736*77c1e3ccSAndroid Build Coastguard Worker dst0_16x8 = _mm_loadu_si128((__m128i *)(&dst[(i + 0) * dstride]));
737*77c1e3ccSAndroid Build Coastguard Worker dst1_16x8 = _mm_loadu_si128((__m128i *)(&dst[(i + 1) * dstride]));
738*77c1e3ccSAndroid Build Coastguard Worker
739*77c1e3ccSAndroid Build Coastguard Worker // row0 of 1st and 2nd 8x8 block - d00 d10
740*77c1e3ccSAndroid Build Coastguard Worker dst0_16x16 = _mm256_cvtepu8_epi16(dst0_16x8);
741*77c1e3ccSAndroid Build Coastguard Worker // row1 of 1st and 2nd 8x8 block - d01 d11
742*77c1e3ccSAndroid Build Coastguard Worker dst1_16x16 = _mm256_cvtepu8_epi16(dst1_16x8);
743*77c1e3ccSAndroid Build Coastguard Worker
744*77c1e3ccSAndroid Build Coastguard Worker // 2 rows of 1st 8x8 block - r00 r01
745*77c1e3ccSAndroid Build Coastguard Worker __m256i src0_16x16 = _mm256_loadu_si256((__m256i const *)(&src_temp[0]));
746*77c1e3ccSAndroid Build Coastguard Worker // 2 rows of 2nd 8x8 block - r10 r11
747*77c1e3ccSAndroid Build Coastguard Worker __m256i src1_16x16 =
748*77c1e3ccSAndroid Build Coastguard Worker _mm256_loadu_si256((__m256i const *)(&src_temp[src_blk_stride]));
749*77c1e3ccSAndroid Build Coastguard Worker // r00 r10 - 128bit
750*77c1e3ccSAndroid Build Coastguard Worker __m256i tmp0_16x16 =
751*77c1e3ccSAndroid Build Coastguard Worker _mm256_permute2f128_si256(src0_16x16, src1_16x16, 0x20);
752*77c1e3ccSAndroid Build Coastguard Worker // r01 r11 - 128bit
753*77c1e3ccSAndroid Build Coastguard Worker __m256i tmp1_16x16 =
754*77c1e3ccSAndroid Build Coastguard Worker _mm256_permute2f128_si256(src0_16x16, src1_16x16, 0x31);
755*77c1e3ccSAndroid Build Coastguard Worker
756*77c1e3ccSAndroid Build Coastguard Worker // r15 r14 r13------------r1 r0 - 16 bit
757*77c1e3ccSAndroid Build Coastguard Worker sub_result_0 = _mm256_abs_epi16(_mm256_sub_epi16(tmp0_16x16, dst0_16x16));
758*77c1e3ccSAndroid Build Coastguard Worker sub_result_1 = _mm256_abs_epi16(_mm256_sub_epi16(tmp1_16x16, dst1_16x16));
759*77c1e3ccSAndroid Build Coastguard Worker
760*77c1e3ccSAndroid Build Coastguard Worker // s7 s6 s5 s4 s3 s2 s1 s0 - 32bit each
761*77c1e3ccSAndroid Build Coastguard Worker src0_16x16 = _mm256_madd_epi16(sub_result_0, sub_result_0);
762*77c1e3ccSAndroid Build Coastguard Worker src1_16x16 = _mm256_madd_epi16(sub_result_1, sub_result_1);
763*77c1e3ccSAndroid Build Coastguard Worker
764*77c1e3ccSAndroid Build Coastguard Worker // accumulation of result
765*77c1e3ccSAndroid Build Coastguard Worker src0_16x16 = _mm256_add_epi32(src0_16x16, src1_16x16);
766*77c1e3ccSAndroid Build Coastguard Worker square_result = _mm256_add_epi32(square_result, src0_16x16);
767*77c1e3ccSAndroid Build Coastguard Worker src_temp += 16;
768*77c1e3ccSAndroid Build Coastguard Worker }
769*77c1e3ccSAndroid Build Coastguard Worker
770*77c1e3ccSAndroid Build Coastguard Worker // s5 s4 s1 s0 - 64bit
771*77c1e3ccSAndroid Build Coastguard Worker res0_4x64 = _mm256_unpacklo_epi32(square_result, zeros);
772*77c1e3ccSAndroid Build Coastguard Worker // s7 s6 s3 s2 - 64bit
773*77c1e3ccSAndroid Build Coastguard Worker res1_4x64 = _mm256_unpackhi_epi32(square_result, zeros);
774*77c1e3ccSAndroid Build Coastguard Worker // r3 r2 r1 r0 - 64bit
775*77c1e3ccSAndroid Build Coastguard Worker res0_4x64 = _mm256_add_epi64(res0_4x64, res1_4x64);
776*77c1e3ccSAndroid Build Coastguard Worker // r1+r3 r2+r0 - 64bit
777*77c1e3ccSAndroid Build Coastguard Worker const __m128i sum_1x64 =
778*77c1e3ccSAndroid Build Coastguard Worker _mm_add_epi64(_mm256_castsi256_si128(res0_4x64),
779*77c1e3ccSAndroid Build Coastguard Worker _mm256_extracti128_si256(res0_4x64, 1));
780*77c1e3ccSAndroid Build Coastguard Worker xx_storel_64(&sum, _mm_add_epi64(sum_1x64, _mm_srli_si128(sum_1x64, 8)));
781*77c1e3ccSAndroid Build Coastguard Worker return sum;
782*77c1e3ccSAndroid Build Coastguard Worker }
783*77c1e3ccSAndroid Build Coastguard Worker
// Dispatches on the block width to the matching single-block kernel and
// returns that kernel's result.
//   dst, dstride - 8-bit buffer and its row stride.
//   src, sstride - 16-bit buffer and its row stride.
//   w, h         - block dimensions; each must be 4 or 8 (checked by assert).
// When asserts are compiled out and w is neither 4 nor 8, the function returns
// -1 converted to uint64_t (all bits set).
uint64_t aom_mse_wxh_16bit_avx2(uint8_t *dst, int dstride, uint16_t *src,
                                int sstride, int w, int h) {
  assert((w == 8 || w == 4) && (h == 8 || h == 4) &&
         "w=8/4 and h=8/4 must be satisfied");
  if (w == 4) {
    return mse_4xh_16bit_avx2(dst, dstride, src, sstride, h);
  }
  if (w == 8) {
    return mse_8xh_16bit_avx2(dst, dstride, src, sstride, h);
  }
  assert(0 && "unsupported width");
  return -1;
}
794*77c1e3ccSAndroid Build Coastguard Worker
// Computes mse of two 8x8 or four 4x4 consecutive blocks. Luma plane uses 8x8
// block and Chroma uses 4x4 block. In the src buffer the blocks of a filter
// block are stored one after another, so the distance between consecutive
// blocks that is handed to the kernels is the block size, w * h samples. The
// dst buffer is a frame buffer, thus dstride is a frame level stride.
// w and h must each be 4 or 8 (checked by assert). When asserts are compiled
// out and w is neither 4 nor 8, the function returns -1 converted to uint64_t.
uint64_t aom_mse_16xh_16bit_avx2(uint8_t *dst, int dstride, uint16_t *src,
                                 int w, int h) {
  assert((w == 8 || w == 4) && (h == 8 || h == 4) &&
         "w=8/4 and h=8/4 must be satisfied");
  if (w == 4) {
    return mse_4xh_quad_16bit_avx2(dst, dstride, src, w * h, h);
  }
  if (w == 8) {
    return mse_8xh_dual_16bit_avx2(dst, dstride, src, w * h, h);
  }
  assert(0 && "unsupported width");
  return -1;
}
809*77c1e3ccSAndroid Build Coastguard Worker
// Folds one row of 32 pixels into the running sum / sum-of-squares
// accumulators.
//   src, ref         - 32 readable bytes each (unaligned loads are used).
//   set_one_minusone - byte weights applied to each interleaved (src, ref)
//                      pair; the callers in this file pass (+1, -1), which
//                      makes every 16-bit product lane equal to src - ref.
//   sse_8x16         - 32-bit accumulators of squared differences.
//   sum_8x16         - 16-bit accumulators of differences.
// Index [0] of both arrays receives pixels 0-7 and 16-23 of the row, index [1]
// receives pixels 8-15 and 24-31, because the byte unpacks operate on each
// 128-bit half independently.
static inline void calc_sum_sse_wd32_avx2(const uint8_t *src,
                                          const uint8_t *ref,
                                          __m256i set_one_minusone,
                                          __m256i sse_8x16[2],
                                          __m256i sum_8x16[2]) {
  const __m256i src_row = _mm256_loadu_si256((__m256i const *)(src));
  const __m256i ref_row = _mm256_loadu_si256((__m256i const *)(ref));

  // Interleave src and ref bytes, then multiply-add each byte pair with the
  // weights to obtain one signed 16-bit value per pixel.
  const __m256i diff_lo = _mm256_maddubs_epi16(
      _mm256_unpacklo_epi8(src_row, ref_row), set_one_minusone);
  const __m256i diff_hi = _mm256_maddubs_epi16(
      _mm256_unpackhi_epi8(src_row, ref_row), set_one_minusone);

  // Running sum of differences (16-bit lanes).
  sum_8x16[0] = _mm256_add_epi16(sum_8x16[0], diff_lo);
  sum_8x16[1] = _mm256_add_epi16(sum_8x16[1], diff_hi);

  // Running sum of squared differences (adjacent squares paired into 32-bit
  // lanes by madd).
  sse_8x16[0] =
      _mm256_add_epi32(sse_8x16[0], _mm256_madd_epi16(diff_lo, diff_lo));
  sse_8x16[1] =
      _mm256_add_epi32(sse_8x16[1], _mm256_madd_epi16(diff_hi, diff_hi));
}
829*77c1e3ccSAndroid Build Coastguard Worker
// Reduces the two sse / sum accumulator registers into per-group totals.
// Notation: sN / dN are the squared-difference and difference totals of the
// N-th group of 8 consecutive pixel columns (N = 0..3); sNa, sNb (dNa, dNb)
// are two partial totals of group N.
//   sse_hx16 - two registers of 32-bit squared-difference accumulators.
//   sum_hx16 - two registers of 16-bit difference accumulators.
//   tot_sse  - incremented by s0 + s1 + s2 + s3.
//   tot_sum  - incremented by d0 + d1 + d2 + d3.
// Returns a register laid out as s0 s1 s2 s3 | d0 d1 d2 d3 (32-bit lanes).
// NOTE: the two 16-bit horizontal adds wrap on overflow, so the sum of any
// four adjacent 16-bit accumulator lanes must still fit in a signed 16-bit
// value.
static inline __m256i calc_sum_sse_order(__m256i *sse_hx16, __m256i *sum_hx16,
                                         unsigned int *tot_sse, int *tot_sum) {
  // s0a s0b s1a s1b | s2a s2b s3a s3b  (32-bit)
  const __m256i sse_partials = _mm256_hadd_epi32(sse_hx16[0], sse_hx16[1]);

  // Four 16-bit partials for each of groups 0,1 | groups 2,3.
  const __m256i sum_quarters = _mm256_hadd_epi16(sum_hx16[0], sum_hx16[1]);
  // d0a d0b d1a d1b (repeated) | d2a d2b d3a d3b (repeated)  (16-bit)
  const __m256i sum_halves = _mm256_hadd_epi16(sum_quarters, sum_quarters);
  // Gather the two useful 64-bit chunks into the low 128 bits:
  // d0a d0b d1a d1b d2a d2b d3a d3b | don't care
  const __m256i sum_gathered = _mm256_permute4x64_epi64(sum_halves, 0x08);
  // Sign-extend to 32 bits: d0a d0b d1a d1b | d2a d2b d3a d3b
  const __m256i sum_partials =
      _mm256_cvtepi16_epi32(_mm256_castsi256_si128(sum_gathered));

  // Finish the per-group totals for sse and sum in a single horizontal add:
  // s0 s1 d0 d1 | s2 s3 d2 d3
  const __m256i mixed = _mm256_hadd_epi32(sse_partials, sum_partials);
  // Reorder the 64-bit chunks to s0 s1 s2 s3 | d0 d1 d2 d3
  const __m256i per_group = _mm256_permute4x64_epi64(mixed, 0xd8);

  // s0+s1 s2+s3 s0+s1 s2+s3 | d0+d1 d2+d3 d0+d1 d2+d3
  const __m256i pair_totals = _mm256_hadd_epi32(per_group, per_group);
  // Lane 0 holds the total sse, lane 4 the total sum.
  const __m256i grand_totals = _mm256_hadd_epi32(pair_totals, pair_totals);

  *tot_sse += (uint32_t)_mm256_extract_epi32(grand_totals, 0);
  *tot_sum += _mm256_extract_epi32(grand_totals, 4);
  return per_group;
}
864*77c1e3ccSAndroid Build Coastguard Worker
get_var_sse_sum_8x8_quad_avx2(const uint8_t * src,int src_stride,const uint8_t * ref,const int ref_stride,const int h,uint32_t * sse8x8,int * sum8x8,unsigned int * tot_sse,int * tot_sum,uint32_t * var8x8)865*77c1e3ccSAndroid Build Coastguard Worker static inline void get_var_sse_sum_8x8_quad_avx2(
866*77c1e3ccSAndroid Build Coastguard Worker const uint8_t *src, int src_stride, const uint8_t *ref,
867*77c1e3ccSAndroid Build Coastguard Worker const int ref_stride, const int h, uint32_t *sse8x8, int *sum8x8,
868*77c1e3ccSAndroid Build Coastguard Worker unsigned int *tot_sse, int *tot_sum, uint32_t *var8x8) {
869*77c1e3ccSAndroid Build Coastguard Worker assert(h <= 128); // May overflow for larger height.
870*77c1e3ccSAndroid Build Coastguard Worker __m256i sse_8x16[2], sum_8x16[2];
871*77c1e3ccSAndroid Build Coastguard Worker sum_8x16[0] = _mm256_setzero_si256();
872*77c1e3ccSAndroid Build Coastguard Worker sse_8x16[0] = _mm256_setzero_si256();
873*77c1e3ccSAndroid Build Coastguard Worker sum_8x16[1] = sum_8x16[0];
874*77c1e3ccSAndroid Build Coastguard Worker sse_8x16[1] = sse_8x16[0];
875*77c1e3ccSAndroid Build Coastguard Worker const __m256i set_one_minusone = _mm256_set1_epi16((short)0xff01);
876*77c1e3ccSAndroid Build Coastguard Worker
877*77c1e3ccSAndroid Build Coastguard Worker for (int i = 0; i < h; i++) {
878*77c1e3ccSAndroid Build Coastguard Worker // Process 8x32 block of one row.
879*77c1e3ccSAndroid Build Coastguard Worker calc_sum_sse_wd32_avx2(src, ref, set_one_minusone, sse_8x16, sum_8x16);
880*77c1e3ccSAndroid Build Coastguard Worker src += src_stride;
881*77c1e3ccSAndroid Build Coastguard Worker ref += ref_stride;
882*77c1e3ccSAndroid Build Coastguard Worker }
883*77c1e3ccSAndroid Build Coastguard Worker
884*77c1e3ccSAndroid Build Coastguard Worker const __m256i sum_sse_order_add =
885*77c1e3ccSAndroid Build Coastguard Worker calc_sum_sse_order(sse_8x16, sum_8x16, tot_sse, tot_sum);
886*77c1e3ccSAndroid Build Coastguard Worker
887*77c1e3ccSAndroid Build Coastguard Worker // s0 s1 s2 s3
888*77c1e3ccSAndroid Build Coastguard Worker _mm_storeu_si128((__m128i *)sse8x8,
889*77c1e3ccSAndroid Build Coastguard Worker _mm256_castsi256_si128(sum_sse_order_add));
890*77c1e3ccSAndroid Build Coastguard Worker // d0 d1 d2 d3
891*77c1e3ccSAndroid Build Coastguard Worker const __m128i sum_temp8x8 = _mm256_extractf128_si256(sum_sse_order_add, 1);
892*77c1e3ccSAndroid Build Coastguard Worker _mm_storeu_si128((__m128i *)sum8x8, sum_temp8x8);
893*77c1e3ccSAndroid Build Coastguard Worker
894*77c1e3ccSAndroid Build Coastguard Worker // (d0xd0 >> 6)=f0 (d1xd1 >> 6)=f1 (d2xd2 >> 6)=f2 (d3xd3 >> 6)=f3
895*77c1e3ccSAndroid Build Coastguard Worker const __m128i mull_results =
896*77c1e3ccSAndroid Build Coastguard Worker _mm_srli_epi32(_mm_mullo_epi32(sum_temp8x8, sum_temp8x8), 6);
897*77c1e3ccSAndroid Build Coastguard Worker // s0-f0=v0 s1-f1=v1 s2-f2=v2 s3-f3=v3
898*77c1e3ccSAndroid Build Coastguard Worker const __m128i variance_8x8 =
899*77c1e3ccSAndroid Build Coastguard Worker _mm_sub_epi32(_mm256_castsi256_si128(sum_sse_order_add), mull_results);
900*77c1e3ccSAndroid Build Coastguard Worker // v0 v1 v2 v3
901*77c1e3ccSAndroid Build Coastguard Worker _mm_storeu_si128((__m128i *)var8x8, variance_8x8);
902*77c1e3ccSAndroid Build Coastguard Worker }
903*77c1e3ccSAndroid Build Coastguard Worker
get_var_sse_sum_16x16_dual_avx2(const uint8_t * src,int src_stride,const uint8_t * ref,const int ref_stride,const int h,uint32_t * sse16x16,unsigned int * tot_sse,int * tot_sum,uint32_t * var16x16)904*77c1e3ccSAndroid Build Coastguard Worker static inline void get_var_sse_sum_16x16_dual_avx2(
905*77c1e3ccSAndroid Build Coastguard Worker const uint8_t *src, int src_stride, const uint8_t *ref,
906*77c1e3ccSAndroid Build Coastguard Worker const int ref_stride, const int h, uint32_t *sse16x16,
907*77c1e3ccSAndroid Build Coastguard Worker unsigned int *tot_sse, int *tot_sum, uint32_t *var16x16) {
908*77c1e3ccSAndroid Build Coastguard Worker assert(h <= 128); // May overflow for larger height.
909*77c1e3ccSAndroid Build Coastguard Worker __m256i sse_16x16[2], sum_16x16[2];
910*77c1e3ccSAndroid Build Coastguard Worker sum_16x16[0] = _mm256_setzero_si256();
911*77c1e3ccSAndroid Build Coastguard Worker sse_16x16[0] = _mm256_setzero_si256();
912*77c1e3ccSAndroid Build Coastguard Worker sum_16x16[1] = sum_16x16[0];
913*77c1e3ccSAndroid Build Coastguard Worker sse_16x16[1] = sse_16x16[0];
914*77c1e3ccSAndroid Build Coastguard Worker const __m256i set_one_minusone = _mm256_set1_epi16((short)0xff01);
915*77c1e3ccSAndroid Build Coastguard Worker
916*77c1e3ccSAndroid Build Coastguard Worker for (int i = 0; i < h; i++) {
917*77c1e3ccSAndroid Build Coastguard Worker // Process 16x32 block of one row.
918*77c1e3ccSAndroid Build Coastguard Worker calc_sum_sse_wd32_avx2(src, ref, set_one_minusone, sse_16x16, sum_16x16);
919*77c1e3ccSAndroid Build Coastguard Worker src += src_stride;
920*77c1e3ccSAndroid Build Coastguard Worker ref += ref_stride;
921*77c1e3ccSAndroid Build Coastguard Worker }
922*77c1e3ccSAndroid Build Coastguard Worker
923*77c1e3ccSAndroid Build Coastguard Worker const __m256i sum_sse_order_add =
924*77c1e3ccSAndroid Build Coastguard Worker calc_sum_sse_order(sse_16x16, sum_16x16, tot_sse, tot_sum);
925*77c1e3ccSAndroid Build Coastguard Worker
926*77c1e3ccSAndroid Build Coastguard Worker const __m256i sum_sse_order_add_1 =
927*77c1e3ccSAndroid Build Coastguard Worker _mm256_hadd_epi32(sum_sse_order_add, sum_sse_order_add);
928*77c1e3ccSAndroid Build Coastguard Worker
929*77c1e3ccSAndroid Build Coastguard Worker // s0+s1 s2+s3 x x
930*77c1e3ccSAndroid Build Coastguard Worker _mm_storel_epi64((__m128i *)sse16x16,
931*77c1e3ccSAndroid Build Coastguard Worker _mm256_castsi256_si128(sum_sse_order_add_1));
932*77c1e3ccSAndroid Build Coastguard Worker
933*77c1e3ccSAndroid Build Coastguard Worker // d0+d1 d2+d3 x x
934*77c1e3ccSAndroid Build Coastguard Worker const __m128i sum_temp16x16 =
935*77c1e3ccSAndroid Build Coastguard Worker _mm256_extractf128_si256(sum_sse_order_add_1, 1);
936*77c1e3ccSAndroid Build Coastguard Worker
937*77c1e3ccSAndroid Build Coastguard Worker // (d0xd0 >> 6)=f0 (d1xd1 >> 6)=f1 (d2xd2 >> 6)=f2 (d3xd3 >> 6)=f3
938*77c1e3ccSAndroid Build Coastguard Worker const __m128i mull_results =
939*77c1e3ccSAndroid Build Coastguard Worker _mm_srli_epi32(_mm_mullo_epi32(sum_temp16x16, sum_temp16x16), 8);
940*77c1e3ccSAndroid Build Coastguard Worker
941*77c1e3ccSAndroid Build Coastguard Worker // s0-f0=v0 s1-f1=v1 s2-f2=v2 s3-f3=v3
942*77c1e3ccSAndroid Build Coastguard Worker const __m128i variance_16x16 =
943*77c1e3ccSAndroid Build Coastguard Worker _mm_sub_epi32(_mm256_castsi256_si128(sum_sse_order_add_1), mull_results);
944*77c1e3ccSAndroid Build Coastguard Worker
945*77c1e3ccSAndroid Build Coastguard Worker // v0 v1 v2 v3
946*77c1e3ccSAndroid Build Coastguard Worker _mm_storel_epi64((__m128i *)var16x16, variance_16x16);
947*77c1e3ccSAndroid Build Coastguard Worker }
948*77c1e3ccSAndroid Build Coastguard Worker
// Public entry point for four adjacent 8x8 blocks: forwards every argument to
// get_var_sse_sum_8x8_quad_avx2() with the height fixed at 8 rows.
// sse8x8, sum8x8 and var8x8 each receive four values; *tot_sse and *tot_sum
// are incremented rather than overwritten.
void aom_get_var_sse_sum_8x8_quad_avx2(const uint8_t *src_ptr,
                                       int source_stride,
                                       const uint8_t *ref_ptr, int ref_stride,
                                       uint32_t *sse8x8, int *sum8x8,
                                       unsigned int *tot_sse, int *tot_sum,
                                       uint32_t *var8x8) {
  const int block_rows = 8;
  get_var_sse_sum_8x8_quad_avx2(src_ptr, source_stride, ref_ptr, ref_stride,
                                block_rows, sse8x8, sum8x8, tot_sse, tot_sum,
                                var8x8);
}
958*77c1e3ccSAndroid Build Coastguard Worker
// Public entry point for two adjacent 16x16 blocks: forwards every argument to
// get_var_sse_sum_16x16_dual_avx2() with the height fixed at 16 rows.
// sse16x16 and var16x16 each receive two values; *tot_sse and *tot_sum are
// incremented rather than overwritten.
void aom_get_var_sse_sum_16x16_dual_avx2(const uint8_t *src_ptr,
                                         int source_stride,
                                         const uint8_t *ref_ptr, int ref_stride,
                                         uint32_t *sse16x16,
                                         unsigned int *tot_sse, int *tot_sum,
                                         uint32_t *var16x16) {
  const int block_rows = 16;
  get_var_sse_sum_16x16_dual_avx2(src_ptr, source_stride, ref_ptr, ref_stride,
                                  block_rows, sse16x16, tot_sse, tot_sum,
                                  var16x16);
}
968