xref: /aosp_15_r20/external/libaom/aom_dsp/x86/sse_avx2.c (revision 77c1e3ccc04c968bd2bc212e87364f250e820521)
1*77c1e3ccSAndroid Build Coastguard Worker /*
2*77c1e3ccSAndroid Build Coastguard Worker  * Copyright (c) 2018, Alliance for Open Media. All rights reserved.
3*77c1e3ccSAndroid Build Coastguard Worker  *
4*77c1e3ccSAndroid Build Coastguard Worker  * This source code is subject to the terms of the BSD 2 Clause License and
5*77c1e3ccSAndroid Build Coastguard Worker  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
6*77c1e3ccSAndroid Build Coastguard Worker  * was not distributed with this source code in the LICENSE file, you can
7*77c1e3ccSAndroid Build Coastguard Worker  * obtain it at www.aomedia.org/license/software. If the Alliance for Open
8*77c1e3ccSAndroid Build Coastguard Worker  * Media Patent License 1.0 was not distributed with this source code in the
9*77c1e3ccSAndroid Build Coastguard Worker  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
10*77c1e3ccSAndroid Build Coastguard Worker  */
11*77c1e3ccSAndroid Build Coastguard Worker 
12*77c1e3ccSAndroid Build Coastguard Worker #include <smmintrin.h>
13*77c1e3ccSAndroid Build Coastguard Worker #include <immintrin.h>
14*77c1e3ccSAndroid Build Coastguard Worker 
15*77c1e3ccSAndroid Build Coastguard Worker #include "config/aom_config.h"
16*77c1e3ccSAndroid Build Coastguard Worker #include "config/aom_dsp_rtcd.h"
17*77c1e3ccSAndroid Build Coastguard Worker 
18*77c1e3ccSAndroid Build Coastguard Worker #include "aom_ports/mem.h"
19*77c1e3ccSAndroid Build Coastguard Worker #include "aom_dsp/x86/synonyms.h"
20*77c1e3ccSAndroid Build Coastguard Worker #include "aom_dsp/x86/synonyms_avx2.h"
21*77c1e3ccSAndroid Build Coastguard Worker 
sse_w32_avx2(__m256i * sum,const uint8_t * a,const uint8_t * b)22*77c1e3ccSAndroid Build Coastguard Worker static inline void sse_w32_avx2(__m256i *sum, const uint8_t *a,
23*77c1e3ccSAndroid Build Coastguard Worker                                 const uint8_t *b) {
24*77c1e3ccSAndroid Build Coastguard Worker   const __m256i v_a0 = yy_loadu_256(a);
25*77c1e3ccSAndroid Build Coastguard Worker   const __m256i v_b0 = yy_loadu_256(b);
26*77c1e3ccSAndroid Build Coastguard Worker   const __m256i zero = _mm256_setzero_si256();
27*77c1e3ccSAndroid Build Coastguard Worker   const __m256i v_a00_w = _mm256_unpacklo_epi8(v_a0, zero);
28*77c1e3ccSAndroid Build Coastguard Worker   const __m256i v_a01_w = _mm256_unpackhi_epi8(v_a0, zero);
29*77c1e3ccSAndroid Build Coastguard Worker   const __m256i v_b00_w = _mm256_unpacklo_epi8(v_b0, zero);
30*77c1e3ccSAndroid Build Coastguard Worker   const __m256i v_b01_w = _mm256_unpackhi_epi8(v_b0, zero);
31*77c1e3ccSAndroid Build Coastguard Worker   const __m256i v_d00_w = _mm256_sub_epi16(v_a00_w, v_b00_w);
32*77c1e3ccSAndroid Build Coastguard Worker   const __m256i v_d01_w = _mm256_sub_epi16(v_a01_w, v_b01_w);
33*77c1e3ccSAndroid Build Coastguard Worker   *sum = _mm256_add_epi32(*sum, _mm256_madd_epi16(v_d00_w, v_d00_w));
34*77c1e3ccSAndroid Build Coastguard Worker   *sum = _mm256_add_epi32(*sum, _mm256_madd_epi16(v_d01_w, v_d01_w));
35*77c1e3ccSAndroid Build Coastguard Worker }
36*77c1e3ccSAndroid Build Coastguard Worker 
// Reduces the eight 32-bit lanes of *sum_all to a single 64-bit total.
//
// Each lane is zero-extended (treated as unsigned) before the additions, so
// the reduction itself is carried out in 64-bit arithmetic.
static inline int64_t summary_all_avx2(const __m256i *sum_all) {
  const __m256i zeros = _mm256_setzero_si256();
  const __m256i lanes32 = *sum_all;

  // Interleaving with zero turns every 32-bit lane into a 64-bit lane.
  const __m256i even_4x64 = _mm256_unpacklo_epi32(lanes32, zeros);
  const __m256i odd_4x64 = _mm256_unpackhi_epi32(lanes32, zeros);
  const __m256i partial_4x64 = _mm256_add_epi64(even_4x64, odd_4x64);

  // Fold the upper 128-bit half onto the lower one, then the upper 64 bits
  // onto the lower 64 bits.
  const __m128i low_half = _mm256_castsi256_si128(partial_4x64);
  const __m128i high_half = _mm256_extracti128_si256(partial_4x64, 1);
  const __m128i partial_2x64 = _mm_add_epi64(low_half, high_half);
  const __m128i total_1x64 =
      _mm_add_epi64(partial_2x64, _mm_srli_si128(partial_2x64, 8));

  // xx_storel_64 comes from synonyms.h; presumably it stores the low 64 bits.
  int64_t total;
  xx_storel_64(&total, total_1x64);
  return total;
}
49*77c1e3ccSAndroid Build Coastguard Worker 
50*77c1e3ccSAndroid Build Coastguard Worker #if CONFIG_AV1_HIGHBITDEPTH
// Widens the eight unsigned 32-bit lanes of *sum32 to 64 bits and adds them
// to the four 64-bit lanes of *sum. *sum32 itself is not modified.
static inline void summary_32_avx2(const __m256i *sum32, __m256i *sum) {
  const __m128i low_4x32 = _mm256_castsi256_si128(*sum32);
  const __m128i high_4x32 = _mm256_extracti128_si256(*sum32, 1);

  // cvtepu32_epi64 zero-extends four 32-bit values to four 64-bit values.
  const __m256i widened_4x64 = _mm256_add_epi64(
      _mm256_cvtepu32_epi64(low_4x32), _mm256_cvtepu32_epi64(high_4x32));

  *sum = _mm256_add_epi64(*sum, widened_4x64);
}
59*77c1e3ccSAndroid Build Coastguard Worker 
// Adds the four 64-bit lanes of sum_4x64 together and returns the result.
static inline int64_t summary_4x64_avx2(const __m256i sum_4x64) {
  // Fold the upper 128-bit half onto the lower one.
  const __m128i low_half = _mm256_castsi256_si128(sum_4x64);
  const __m128i high_half = _mm256_extracti128_si256(sum_4x64, 1);
  const __m128i partial_2x64 = _mm_add_epi64(low_half, high_half);

  // Fold the upper 64 bits onto the lower 64 bits.
  const __m128i total_1x64 =
      _mm_add_epi64(partial_2x64, _mm_srli_si128(partial_2x64, 8));

  // xx_storel_64 comes from synonyms.h; presumably it stores the low 64 bits.
  int64_t total;
  xx_storel_64(&total, total_1x64);
  return total;
}
69*77c1e3ccSAndroid Build Coastguard Worker #endif
70*77c1e3ccSAndroid Build Coastguard Worker 
sse_w4x4_avx2(const uint8_t * a,int a_stride,const uint8_t * b,int b_stride,__m256i * sum)71*77c1e3ccSAndroid Build Coastguard Worker static inline void sse_w4x4_avx2(const uint8_t *a, int a_stride,
72*77c1e3ccSAndroid Build Coastguard Worker                                  const uint8_t *b, int b_stride, __m256i *sum) {
73*77c1e3ccSAndroid Build Coastguard Worker   const __m128i v_a0 = xx_loadl_32(a);
74*77c1e3ccSAndroid Build Coastguard Worker   const __m128i v_a1 = xx_loadl_32(a + a_stride);
75*77c1e3ccSAndroid Build Coastguard Worker   const __m128i v_a2 = xx_loadl_32(a + a_stride * 2);
76*77c1e3ccSAndroid Build Coastguard Worker   const __m128i v_a3 = xx_loadl_32(a + a_stride * 3);
77*77c1e3ccSAndroid Build Coastguard Worker   const __m128i v_b0 = xx_loadl_32(b);
78*77c1e3ccSAndroid Build Coastguard Worker   const __m128i v_b1 = xx_loadl_32(b + b_stride);
79*77c1e3ccSAndroid Build Coastguard Worker   const __m128i v_b2 = xx_loadl_32(b + b_stride * 2);
80*77c1e3ccSAndroid Build Coastguard Worker   const __m128i v_b3 = xx_loadl_32(b + b_stride * 3);
81*77c1e3ccSAndroid Build Coastguard Worker   const __m128i v_a0123 = _mm_unpacklo_epi64(_mm_unpacklo_epi32(v_a0, v_a1),
82*77c1e3ccSAndroid Build Coastguard Worker                                              _mm_unpacklo_epi32(v_a2, v_a3));
83*77c1e3ccSAndroid Build Coastguard Worker   const __m128i v_b0123 = _mm_unpacklo_epi64(_mm_unpacklo_epi32(v_b0, v_b1),
84*77c1e3ccSAndroid Build Coastguard Worker                                              _mm_unpacklo_epi32(v_b2, v_b3));
85*77c1e3ccSAndroid Build Coastguard Worker   const __m256i v_a_w = _mm256_cvtepu8_epi16(v_a0123);
86*77c1e3ccSAndroid Build Coastguard Worker   const __m256i v_b_w = _mm256_cvtepu8_epi16(v_b0123);
87*77c1e3ccSAndroid Build Coastguard Worker   const __m256i v_d_w = _mm256_sub_epi16(v_a_w, v_b_w);
88*77c1e3ccSAndroid Build Coastguard Worker   *sum = _mm256_add_epi32(*sum, _mm256_madd_epi16(v_d_w, v_d_w));
89*77c1e3ccSAndroid Build Coastguard Worker }
90*77c1e3ccSAndroid Build Coastguard Worker 
sse_w8x2_avx2(const uint8_t * a,int a_stride,const uint8_t * b,int b_stride,__m256i * sum)91*77c1e3ccSAndroid Build Coastguard Worker static inline void sse_w8x2_avx2(const uint8_t *a, int a_stride,
92*77c1e3ccSAndroid Build Coastguard Worker                                  const uint8_t *b, int b_stride, __m256i *sum) {
93*77c1e3ccSAndroid Build Coastguard Worker   const __m128i v_a0 = xx_loadl_64(a);
94*77c1e3ccSAndroid Build Coastguard Worker   const __m128i v_a1 = xx_loadl_64(a + a_stride);
95*77c1e3ccSAndroid Build Coastguard Worker   const __m128i v_b0 = xx_loadl_64(b);
96*77c1e3ccSAndroid Build Coastguard Worker   const __m128i v_b1 = xx_loadl_64(b + b_stride);
97*77c1e3ccSAndroid Build Coastguard Worker   const __m256i v_a_w = _mm256_cvtepu8_epi16(_mm_unpacklo_epi64(v_a0, v_a1));
98*77c1e3ccSAndroid Build Coastguard Worker   const __m256i v_b_w = _mm256_cvtepu8_epi16(_mm_unpacklo_epi64(v_b0, v_b1));
99*77c1e3ccSAndroid Build Coastguard Worker   const __m256i v_d_w = _mm256_sub_epi16(v_a_w, v_b_w);
100*77c1e3ccSAndroid Build Coastguard Worker   *sum = _mm256_add_epi32(*sum, _mm256_madd_epi16(v_d_w, v_d_w));
101*77c1e3ccSAndroid Build Coastguard Worker }
102*77c1e3ccSAndroid Build Coastguard Worker 
aom_sse_avx2(const uint8_t * a,int a_stride,const uint8_t * b,int b_stride,int width,int height)103*77c1e3ccSAndroid Build Coastguard Worker int64_t aom_sse_avx2(const uint8_t *a, int a_stride, const uint8_t *b,
104*77c1e3ccSAndroid Build Coastguard Worker                      int b_stride, int width, int height) {
105*77c1e3ccSAndroid Build Coastguard Worker   int32_t y = 0;
106*77c1e3ccSAndroid Build Coastguard Worker   int64_t sse = 0;
107*77c1e3ccSAndroid Build Coastguard Worker   __m256i sum = _mm256_setzero_si256();
108*77c1e3ccSAndroid Build Coastguard Worker   __m256i zero = _mm256_setzero_si256();
109*77c1e3ccSAndroid Build Coastguard Worker   switch (width) {
110*77c1e3ccSAndroid Build Coastguard Worker     case 4:
111*77c1e3ccSAndroid Build Coastguard Worker       do {
112*77c1e3ccSAndroid Build Coastguard Worker         sse_w4x4_avx2(a, a_stride, b, b_stride, &sum);
113*77c1e3ccSAndroid Build Coastguard Worker         a += a_stride << 2;
114*77c1e3ccSAndroid Build Coastguard Worker         b += b_stride << 2;
115*77c1e3ccSAndroid Build Coastguard Worker         y += 4;
116*77c1e3ccSAndroid Build Coastguard Worker       } while (y < height);
117*77c1e3ccSAndroid Build Coastguard Worker       sse = summary_all_avx2(&sum);
118*77c1e3ccSAndroid Build Coastguard Worker       break;
119*77c1e3ccSAndroid Build Coastguard Worker     case 8:
120*77c1e3ccSAndroid Build Coastguard Worker       do {
121*77c1e3ccSAndroid Build Coastguard Worker         sse_w8x2_avx2(a, a_stride, b, b_stride, &sum);
122*77c1e3ccSAndroid Build Coastguard Worker         a += a_stride << 1;
123*77c1e3ccSAndroid Build Coastguard Worker         b += b_stride << 1;
124*77c1e3ccSAndroid Build Coastguard Worker         y += 2;
125*77c1e3ccSAndroid Build Coastguard Worker       } while (y < height);
126*77c1e3ccSAndroid Build Coastguard Worker       sse = summary_all_avx2(&sum);
127*77c1e3ccSAndroid Build Coastguard Worker       break;
128*77c1e3ccSAndroid Build Coastguard Worker     case 16:
129*77c1e3ccSAndroid Build Coastguard Worker       do {
130*77c1e3ccSAndroid Build Coastguard Worker         const __m128i v_a0 = xx_loadu_128(a);
131*77c1e3ccSAndroid Build Coastguard Worker         const __m128i v_a1 = xx_loadu_128(a + a_stride);
132*77c1e3ccSAndroid Build Coastguard Worker         const __m128i v_b0 = xx_loadu_128(b);
133*77c1e3ccSAndroid Build Coastguard Worker         const __m128i v_b1 = xx_loadu_128(b + b_stride);
134*77c1e3ccSAndroid Build Coastguard Worker         const __m256i v_a =
135*77c1e3ccSAndroid Build Coastguard Worker             _mm256_insertf128_si256(_mm256_castsi128_si256(v_a0), v_a1, 0x01);
136*77c1e3ccSAndroid Build Coastguard Worker         const __m256i v_b =
137*77c1e3ccSAndroid Build Coastguard Worker             _mm256_insertf128_si256(_mm256_castsi128_si256(v_b0), v_b1, 0x01);
138*77c1e3ccSAndroid Build Coastguard Worker         const __m256i v_al = _mm256_unpacklo_epi8(v_a, zero);
139*77c1e3ccSAndroid Build Coastguard Worker         const __m256i v_au = _mm256_unpackhi_epi8(v_a, zero);
140*77c1e3ccSAndroid Build Coastguard Worker         const __m256i v_bl = _mm256_unpacklo_epi8(v_b, zero);
141*77c1e3ccSAndroid Build Coastguard Worker         const __m256i v_bu = _mm256_unpackhi_epi8(v_b, zero);
142*77c1e3ccSAndroid Build Coastguard Worker         const __m256i v_asub = _mm256_sub_epi16(v_al, v_bl);
143*77c1e3ccSAndroid Build Coastguard Worker         const __m256i v_bsub = _mm256_sub_epi16(v_au, v_bu);
144*77c1e3ccSAndroid Build Coastguard Worker         const __m256i temp =
145*77c1e3ccSAndroid Build Coastguard Worker             _mm256_add_epi32(_mm256_madd_epi16(v_asub, v_asub),
146*77c1e3ccSAndroid Build Coastguard Worker                              _mm256_madd_epi16(v_bsub, v_bsub));
147*77c1e3ccSAndroid Build Coastguard Worker         sum = _mm256_add_epi32(sum, temp);
148*77c1e3ccSAndroid Build Coastguard Worker         a += a_stride << 1;
149*77c1e3ccSAndroid Build Coastguard Worker         b += b_stride << 1;
150*77c1e3ccSAndroid Build Coastguard Worker         y += 2;
151*77c1e3ccSAndroid Build Coastguard Worker       } while (y < height);
152*77c1e3ccSAndroid Build Coastguard Worker       sse = summary_all_avx2(&sum);
153*77c1e3ccSAndroid Build Coastguard Worker       break;
154*77c1e3ccSAndroid Build Coastguard Worker     case 32:
155*77c1e3ccSAndroid Build Coastguard Worker       do {
156*77c1e3ccSAndroid Build Coastguard Worker         sse_w32_avx2(&sum, a, b);
157*77c1e3ccSAndroid Build Coastguard Worker         a += a_stride;
158*77c1e3ccSAndroid Build Coastguard Worker         b += b_stride;
159*77c1e3ccSAndroid Build Coastguard Worker         y += 1;
160*77c1e3ccSAndroid Build Coastguard Worker       } while (y < height);
161*77c1e3ccSAndroid Build Coastguard Worker       sse = summary_all_avx2(&sum);
162*77c1e3ccSAndroid Build Coastguard Worker       break;
163*77c1e3ccSAndroid Build Coastguard Worker     case 64:
164*77c1e3ccSAndroid Build Coastguard Worker       do {
165*77c1e3ccSAndroid Build Coastguard Worker         sse_w32_avx2(&sum, a, b);
166*77c1e3ccSAndroid Build Coastguard Worker         sse_w32_avx2(&sum, a + 32, b + 32);
167*77c1e3ccSAndroid Build Coastguard Worker         a += a_stride;
168*77c1e3ccSAndroid Build Coastguard Worker         b += b_stride;
169*77c1e3ccSAndroid Build Coastguard Worker         y += 1;
170*77c1e3ccSAndroid Build Coastguard Worker       } while (y < height);
171*77c1e3ccSAndroid Build Coastguard Worker       sse = summary_all_avx2(&sum);
172*77c1e3ccSAndroid Build Coastguard Worker       break;
173*77c1e3ccSAndroid Build Coastguard Worker     case 128:
174*77c1e3ccSAndroid Build Coastguard Worker       do {
175*77c1e3ccSAndroid Build Coastguard Worker         sse_w32_avx2(&sum, a, b);
176*77c1e3ccSAndroid Build Coastguard Worker         sse_w32_avx2(&sum, a + 32, b + 32);
177*77c1e3ccSAndroid Build Coastguard Worker         sse_w32_avx2(&sum, a + 64, b + 64);
178*77c1e3ccSAndroid Build Coastguard Worker         sse_w32_avx2(&sum, a + 96, b + 96);
179*77c1e3ccSAndroid Build Coastguard Worker         a += a_stride;
180*77c1e3ccSAndroid Build Coastguard Worker         b += b_stride;
181*77c1e3ccSAndroid Build Coastguard Worker         y += 1;
182*77c1e3ccSAndroid Build Coastguard Worker       } while (y < height);
183*77c1e3ccSAndroid Build Coastguard Worker       sse = summary_all_avx2(&sum);
184*77c1e3ccSAndroid Build Coastguard Worker       break;
185*77c1e3ccSAndroid Build Coastguard Worker     default:
186*77c1e3ccSAndroid Build Coastguard Worker       if ((width & 0x07) == 0) {
187*77c1e3ccSAndroid Build Coastguard Worker         do {
188*77c1e3ccSAndroid Build Coastguard Worker           int i = 0;
189*77c1e3ccSAndroid Build Coastguard Worker           do {
190*77c1e3ccSAndroid Build Coastguard Worker             sse_w8x2_avx2(a + i, a_stride, b + i, b_stride, &sum);
191*77c1e3ccSAndroid Build Coastguard Worker             i += 8;
192*77c1e3ccSAndroid Build Coastguard Worker           } while (i < width);
193*77c1e3ccSAndroid Build Coastguard Worker           a += a_stride << 1;
194*77c1e3ccSAndroid Build Coastguard Worker           b += b_stride << 1;
195*77c1e3ccSAndroid Build Coastguard Worker           y += 2;
196*77c1e3ccSAndroid Build Coastguard Worker         } while (y < height);
197*77c1e3ccSAndroid Build Coastguard Worker       } else {
198*77c1e3ccSAndroid Build Coastguard Worker         do {
199*77c1e3ccSAndroid Build Coastguard Worker           int i = 0;
200*77c1e3ccSAndroid Build Coastguard Worker           do {
201*77c1e3ccSAndroid Build Coastguard Worker             sse_w8x2_avx2(a + i, a_stride, b + i, b_stride, &sum);
202*77c1e3ccSAndroid Build Coastguard Worker             const uint8_t *a2 = a + i + (a_stride << 1);
203*77c1e3ccSAndroid Build Coastguard Worker             const uint8_t *b2 = b + i + (b_stride << 1);
204*77c1e3ccSAndroid Build Coastguard Worker             sse_w8x2_avx2(a2, a_stride, b2, b_stride, &sum);
205*77c1e3ccSAndroid Build Coastguard Worker             i += 8;
206*77c1e3ccSAndroid Build Coastguard Worker           } while (i + 4 < width);
207*77c1e3ccSAndroid Build Coastguard Worker           sse_w4x4_avx2(a + i, a_stride, b + i, b_stride, &sum);
208*77c1e3ccSAndroid Build Coastguard Worker           a += a_stride << 2;
209*77c1e3ccSAndroid Build Coastguard Worker           b += b_stride << 2;
210*77c1e3ccSAndroid Build Coastguard Worker           y += 4;
211*77c1e3ccSAndroid Build Coastguard Worker         } while (y < height);
212*77c1e3ccSAndroid Build Coastguard Worker       }
213*77c1e3ccSAndroid Build Coastguard Worker       sse = summary_all_avx2(&sum);
214*77c1e3ccSAndroid Build Coastguard Worker       break;
215*77c1e3ccSAndroid Build Coastguard Worker   }
216*77c1e3ccSAndroid Build Coastguard Worker 
217*77c1e3ccSAndroid Build Coastguard Worker   return sse;
218*77c1e3ccSAndroid Build Coastguard Worker }
219*77c1e3ccSAndroid Build Coastguard Worker 
220*77c1e3ccSAndroid Build Coastguard Worker #if CONFIG_AV1_HIGHBITDEPTH
highbd_sse_w16_avx2(__m256i * sum,const uint16_t * a,const uint16_t * b)221*77c1e3ccSAndroid Build Coastguard Worker static inline void highbd_sse_w16_avx2(__m256i *sum, const uint16_t *a,
222*77c1e3ccSAndroid Build Coastguard Worker                                        const uint16_t *b) {
223*77c1e3ccSAndroid Build Coastguard Worker   const __m256i v_a_w = yy_loadu_256(a);
224*77c1e3ccSAndroid Build Coastguard Worker   const __m256i v_b_w = yy_loadu_256(b);
225*77c1e3ccSAndroid Build Coastguard Worker   const __m256i v_d_w = _mm256_sub_epi16(v_a_w, v_b_w);
226*77c1e3ccSAndroid Build Coastguard Worker   *sum = _mm256_add_epi32(*sum, _mm256_madd_epi16(v_d_w, v_d_w));
227*77c1e3ccSAndroid Build Coastguard Worker }
228*77c1e3ccSAndroid Build Coastguard Worker 
highbd_sse_w4x4_avx2(__m256i * sum,const uint16_t * a,int a_stride,const uint16_t * b,int b_stride)229*77c1e3ccSAndroid Build Coastguard Worker static inline void highbd_sse_w4x4_avx2(__m256i *sum, const uint16_t *a,
230*77c1e3ccSAndroid Build Coastguard Worker                                         int a_stride, const uint16_t *b,
231*77c1e3ccSAndroid Build Coastguard Worker                                         int b_stride) {
232*77c1e3ccSAndroid Build Coastguard Worker   const __m128i v_a0 = xx_loadl_64(a);
233*77c1e3ccSAndroid Build Coastguard Worker   const __m128i v_a1 = xx_loadl_64(a + a_stride);
234*77c1e3ccSAndroid Build Coastguard Worker   const __m128i v_a2 = xx_loadl_64(a + a_stride * 2);
235*77c1e3ccSAndroid Build Coastguard Worker   const __m128i v_a3 = xx_loadl_64(a + a_stride * 3);
236*77c1e3ccSAndroid Build Coastguard Worker   const __m128i v_b0 = xx_loadl_64(b);
237*77c1e3ccSAndroid Build Coastguard Worker   const __m128i v_b1 = xx_loadl_64(b + b_stride);
238*77c1e3ccSAndroid Build Coastguard Worker   const __m128i v_b2 = xx_loadl_64(b + b_stride * 2);
239*77c1e3ccSAndroid Build Coastguard Worker   const __m128i v_b3 = xx_loadl_64(b + b_stride * 3);
240*77c1e3ccSAndroid Build Coastguard Worker   const __m256i v_a_w = yy_set_m128i(_mm_unpacklo_epi64(v_a0, v_a1),
241*77c1e3ccSAndroid Build Coastguard Worker                                      _mm_unpacklo_epi64(v_a2, v_a3));
242*77c1e3ccSAndroid Build Coastguard Worker   const __m256i v_b_w = yy_set_m128i(_mm_unpacklo_epi64(v_b0, v_b1),
243*77c1e3ccSAndroid Build Coastguard Worker                                      _mm_unpacklo_epi64(v_b2, v_b3));
244*77c1e3ccSAndroid Build Coastguard Worker   const __m256i v_d_w = _mm256_sub_epi16(v_a_w, v_b_w);
245*77c1e3ccSAndroid Build Coastguard Worker   *sum = _mm256_add_epi32(*sum, _mm256_madd_epi16(v_d_w, v_d_w));
246*77c1e3ccSAndroid Build Coastguard Worker }
247*77c1e3ccSAndroid Build Coastguard Worker 
highbd_sse_w8x2_avx2(__m256i * sum,const uint16_t * a,int a_stride,const uint16_t * b,int b_stride)248*77c1e3ccSAndroid Build Coastguard Worker static inline void highbd_sse_w8x2_avx2(__m256i *sum, const uint16_t *a,
249*77c1e3ccSAndroid Build Coastguard Worker                                         int a_stride, const uint16_t *b,
250*77c1e3ccSAndroid Build Coastguard Worker                                         int b_stride) {
251*77c1e3ccSAndroid Build Coastguard Worker   const __m256i v_a_w = yy_loadu2_128(a + a_stride, a);
252*77c1e3ccSAndroid Build Coastguard Worker   const __m256i v_b_w = yy_loadu2_128(b + b_stride, b);
253*77c1e3ccSAndroid Build Coastguard Worker   const __m256i v_d_w = _mm256_sub_epi16(v_a_w, v_b_w);
254*77c1e3ccSAndroid Build Coastguard Worker   *sum = _mm256_add_epi32(*sum, _mm256_madd_epi16(v_d_w, v_d_w));
255*77c1e3ccSAndroid Build Coastguard Worker }
256*77c1e3ccSAndroid Build Coastguard Worker 
// Returns the sum of squared differences between two blocks of 16-bit samples.
//
//   a8, a_stride  first block, passed as a uint8_t pointer and converted with
//                 CONVERT_TO_SHORTPTR, and the offset (in 16-bit samples)
//                 between consecutive rows
//   b8, b_stride  second block, same conventions
//   width, height block dimensions in samples
//
// Widths 4, 8, 16, 32, 64 and 128 have dedicated loops. Any other width is
// walked in 8-column strips; when the width is not a multiple of 8 the row is
// finished with one 4-column strip, so that path expects width to be 8 * k + 4
// with k >= 1.
//
// Accumulation scheme:
//   - widths 4, 8 and 16 keep the total in 8 x 32-bit lanes for the whole
//     block and widen once through summary_all_avx2();
//   - all other paths gather a bounded batch of rows in a temporary
//     8 x 32-bit accumulator (sum32), fold that into 4 x 64-bit lanes with
//     summary_32_avx2(), and finish with summary_4x64_avx2(). The batch is
//     64 / 32 / 16 rows for widths 32 / 64 / 128, 8 rows for other multiples
//     of 8 and 4 rows otherwise -- presumably sized so the 32-bit lanes cannot
//     overflow before they are widened.
//
// Every loop is a do/while, so at least one row group is always processed.
//
// Row advances use multiplication rather than a left shift of the stride:
// shifting a negative signed value is undefined behaviour in C, and the
// helpers in this file already address rows as `stride * n`.
int64_t aom_highbd_sse_avx2(const uint8_t *a8, int a_stride, const uint8_t *b8,
                            int b_stride, int width, int height) {
  int32_t y = 0;
  int64_t sse = 0;
  uint16_t *a = CONVERT_TO_SHORTPTR(a8);
  uint16_t *b = CONVERT_TO_SHORTPTR(b8);
  __m256i sum = _mm256_setzero_si256();
  switch (width) {
    case 4:
      do {
        highbd_sse_w4x4_avx2(&sum, a, a_stride, b, b_stride);
        a += a_stride * 4;
        b += b_stride * 4;
        y += 4;
      } while (y < height);
      sse = summary_all_avx2(&sum);
      break;
    case 8:
      do {
        highbd_sse_w8x2_avx2(&sum, a, a_stride, b, b_stride);
        a += a_stride * 2;
        b += b_stride * 2;
        y += 2;
      } while (y < height);
      sse = summary_all_avx2(&sum);
      break;
    case 16:
      do {
        highbd_sse_w16_avx2(&sum, a, b);
        a += a_stride;
        b += b_stride;
        y += 1;
      } while (y < height);
      sse = summary_all_avx2(&sum);
      break;
    case 32:
      do {
        // Batch of up to 64 rows in 32-bit lanes, then widen.
        int l = 0;
        __m256i sum32 = _mm256_setzero_si256();
        do {
          highbd_sse_w16_avx2(&sum32, a, b);
          highbd_sse_w16_avx2(&sum32, a + 16, b + 16);
          a += a_stride;
          b += b_stride;
          l += 1;
        } while (l < 64 && l < (height - y));
        summary_32_avx2(&sum32, &sum);
        y += 64;
      } while (y < height);
      sse = summary_4x64_avx2(sum);
      break;
    case 64:
      do {
        // Batch of up to 32 rows in 32-bit lanes, then widen.
        int l = 0;
        __m256i sum32 = _mm256_setzero_si256();
        do {
          highbd_sse_w16_avx2(&sum32, a, b);
          highbd_sse_w16_avx2(&sum32, a + 16 * 1, b + 16 * 1);
          highbd_sse_w16_avx2(&sum32, a + 16 * 2, b + 16 * 2);
          highbd_sse_w16_avx2(&sum32, a + 16 * 3, b + 16 * 3);
          a += a_stride;
          b += b_stride;
          l += 1;
        } while (l < 32 && l < (height - y));
        summary_32_avx2(&sum32, &sum);
        y += 32;
      } while (y < height);
      sse = summary_4x64_avx2(sum);
      break;
    case 128:
      do {
        // Batch of up to 16 rows in 32-bit lanes, then widen.
        int l = 0;
        __m256i sum32 = _mm256_setzero_si256();
        do {
          highbd_sse_w16_avx2(&sum32, a, b);
          highbd_sse_w16_avx2(&sum32, a + 16 * 1, b + 16 * 1);
          highbd_sse_w16_avx2(&sum32, a + 16 * 2, b + 16 * 2);
          highbd_sse_w16_avx2(&sum32, a + 16 * 3, b + 16 * 3);
          highbd_sse_w16_avx2(&sum32, a + 16 * 4, b + 16 * 4);
          highbd_sse_w16_avx2(&sum32, a + 16 * 5, b + 16 * 5);
          highbd_sse_w16_avx2(&sum32, a + 16 * 6, b + 16 * 6);
          highbd_sse_w16_avx2(&sum32, a + 16 * 7, b + 16 * 7);
          a += a_stride;
          b += b_stride;
          l += 1;
        } while (l < 16 && l < (height - y));
        summary_32_avx2(&sum32, &sum);
        y += 16;
      } while (y < height);
      sse = summary_4x64_avx2(sum);
      break;
    default:
      if (width & 0x7) {
        // Four rows at a time: each 8-column strip is covered by two 8x2
        // tiles, and the final 4 columns by one 4x4 tile. The 32-bit lanes are
        // widened after every four-row group.
        do {
          int i = 0;
          __m256i sum32 = _mm256_setzero_si256();
          do {
            highbd_sse_w8x2_avx2(&sum32, a + i, a_stride, b + i, b_stride);
            const uint16_t *a2 = a + i + a_stride * 2;
            const uint16_t *b2 = b + i + b_stride * 2;
            highbd_sse_w8x2_avx2(&sum32, a2, a_stride, b2, b_stride);
            i += 8;
          } while (i + 4 < width);
          highbd_sse_w4x4_avx2(&sum32, a + i, a_stride, b + i, b_stride);
          summary_32_avx2(&sum32, &sum);
          a += a_stride * 4;
          b += b_stride * 4;
          y += 4;
        } while (y < height);
      } else {
        // Width is a multiple of 8: 8-column strips, two rows at a time, with
        // the 32-bit lanes widened after every batch of up to 8 rows.
        do {
          int l = 0;
          __m256i sum32 = _mm256_setzero_si256();
          do {
            int i = 0;
            do {
              highbd_sse_w8x2_avx2(&sum32, a + i, a_stride, b + i, b_stride);
              i += 8;
            } while (i < width);
            a += a_stride * 2;
            b += b_stride * 2;
            l += 2;
          } while (l < 8 && l < (height - y));
          summary_32_avx2(&sum32, &sum);
          y += 8;
        } while (y < height);
      }
      sse = summary_4x64_avx2(sum);
      break;
  }
  return sse;
}
389*77c1e3ccSAndroid Build Coastguard Worker #endif  // CONFIG_AV1_HIGHBITDEPTH
390