xref: /aosp_15_r20/external/libaom/aom_dsp/x86/sse_sse4.c (revision 77c1e3ccc04c968bd2bc212e87364f250e820521)
/*
 * Copyright (c) 2018, Alliance for Open Media. All rights reserved.
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */
11*77c1e3ccSAndroid Build Coastguard Worker 
12*77c1e3ccSAndroid Build Coastguard Worker #include <assert.h>
13*77c1e3ccSAndroid Build Coastguard Worker #include <smmintrin.h>
14*77c1e3ccSAndroid Build Coastguard Worker 
15*77c1e3ccSAndroid Build Coastguard Worker #include "config/aom_config.h"
16*77c1e3ccSAndroid Build Coastguard Worker #include "config/aom_dsp_rtcd.h"
17*77c1e3ccSAndroid Build Coastguard Worker 
18*77c1e3ccSAndroid Build Coastguard Worker #include "aom_ports/mem.h"
19*77c1e3ccSAndroid Build Coastguard Worker #include "aom/aom_integer.h"
20*77c1e3ccSAndroid Build Coastguard Worker #include "aom_dsp/x86/synonyms.h"
21*77c1e3ccSAndroid Build Coastguard Worker 
// Horizontally adds the four 32-bit lanes of *sum_all and returns the total
// as a 64-bit value. Each lane is zero-extended (treated as unsigned) before
// the addition, so the lanes may use their full 32-bit range.
static inline int64_t summary_all_sse4(const __m128i *sum_all) {
  const __m128i lanes = *sum_all;
  // Widen lanes {0, 1} and lanes {2, 3} to two 64-bit values each.
  const __m128i lo_2x64 = _mm_cvtepu32_epi64(lanes);
  const __m128i hi_2x64 = _mm_cvtepu32_epi64(_mm_srli_si128(lanes, 8));
  const __m128i pair_2x64 = _mm_add_epi64(lo_2x64, hi_2x64);
  // Fold the upper 64-bit half onto the lower one.
  const __m128i total_1x64 =
      _mm_add_epi64(pair_2x64, _mm_srli_si128(pair_2x64, 8));
  int64_t total;
  // xx_storel_64 is declared in synonyms.h; presumably it stores the low
  // 64 bits of the vector to the destination.
  xx_storel_64(&total, total_1x64);
  return total;
}
31*77c1e3ccSAndroid Build Coastguard Worker 
32*77c1e3ccSAndroid Build Coastguard Worker #if CONFIG_AV1_HIGHBITDEPTH
// Adds the four 32-bit lanes of *sum32 (zero-extended to 64 bits) into the
// two 64-bit lanes of *sum64: lanes 0 and 2 go to the low 64-bit lane, lanes
// 1 and 3 go to the high one. *sum32 itself is not modified.
static inline void summary_32_sse4(const __m128i *sum32, __m128i *sum64) {
  const __m128i lanes = *sum32;
  const __m128i lo_2x64 = _mm_cvtepu32_epi64(lanes);
  const __m128i hi_2x64 = _mm_cvtepu32_epi64(_mm_srli_si128(lanes, 8));
  *sum64 = _mm_add_epi64(*sum64, _mm_add_epi64(lo_2x64, hi_2x64));
}
39*77c1e3ccSAndroid Build Coastguard Worker #endif
40*77c1e3ccSAndroid Build Coastguard Worker 
sse_w16_sse4_1(__m128i * sum,const uint8_t * a,const uint8_t * b)41*77c1e3ccSAndroid Build Coastguard Worker static inline void sse_w16_sse4_1(__m128i *sum, const uint8_t *a,
42*77c1e3ccSAndroid Build Coastguard Worker                                   const uint8_t *b) {
43*77c1e3ccSAndroid Build Coastguard Worker   const __m128i v_a0 = xx_loadu_128(a);
44*77c1e3ccSAndroid Build Coastguard Worker   const __m128i v_b0 = xx_loadu_128(b);
45*77c1e3ccSAndroid Build Coastguard Worker   const __m128i v_a00_w = _mm_cvtepu8_epi16(v_a0);
46*77c1e3ccSAndroid Build Coastguard Worker   const __m128i v_a01_w = _mm_cvtepu8_epi16(_mm_srli_si128(v_a0, 8));
47*77c1e3ccSAndroid Build Coastguard Worker   const __m128i v_b00_w = _mm_cvtepu8_epi16(v_b0);
48*77c1e3ccSAndroid Build Coastguard Worker   const __m128i v_b01_w = _mm_cvtepu8_epi16(_mm_srli_si128(v_b0, 8));
49*77c1e3ccSAndroid Build Coastguard Worker   const __m128i v_d00_w = _mm_sub_epi16(v_a00_w, v_b00_w);
50*77c1e3ccSAndroid Build Coastguard Worker   const __m128i v_d01_w = _mm_sub_epi16(v_a01_w, v_b01_w);
51*77c1e3ccSAndroid Build Coastguard Worker   *sum = _mm_add_epi32(*sum, _mm_madd_epi16(v_d00_w, v_d00_w));
52*77c1e3ccSAndroid Build Coastguard Worker   *sum = _mm_add_epi32(*sum, _mm_madd_epi16(v_d01_w, v_d01_w));
53*77c1e3ccSAndroid Build Coastguard Worker }
54*77c1e3ccSAndroid Build Coastguard Worker 
sse4x2_sse4_1(const uint8_t * a,int a_stride,const uint8_t * b,int b_stride,__m128i * sum)55*77c1e3ccSAndroid Build Coastguard Worker static inline void sse4x2_sse4_1(const uint8_t *a, int a_stride,
56*77c1e3ccSAndroid Build Coastguard Worker                                  const uint8_t *b, int b_stride, __m128i *sum) {
57*77c1e3ccSAndroid Build Coastguard Worker   const __m128i v_a0 = xx_loadl_32(a);
58*77c1e3ccSAndroid Build Coastguard Worker   const __m128i v_a1 = xx_loadl_32(a + a_stride);
59*77c1e3ccSAndroid Build Coastguard Worker   const __m128i v_b0 = xx_loadl_32(b);
60*77c1e3ccSAndroid Build Coastguard Worker   const __m128i v_b1 = xx_loadl_32(b + b_stride);
61*77c1e3ccSAndroid Build Coastguard Worker   const __m128i v_a_w = _mm_cvtepu8_epi16(_mm_unpacklo_epi32(v_a0, v_a1));
62*77c1e3ccSAndroid Build Coastguard Worker   const __m128i v_b_w = _mm_cvtepu8_epi16(_mm_unpacklo_epi32(v_b0, v_b1));
63*77c1e3ccSAndroid Build Coastguard Worker   const __m128i v_d_w = _mm_sub_epi16(v_a_w, v_b_w);
64*77c1e3ccSAndroid Build Coastguard Worker   *sum = _mm_add_epi32(*sum, _mm_madd_epi16(v_d_w, v_d_w));
65*77c1e3ccSAndroid Build Coastguard Worker }
66*77c1e3ccSAndroid Build Coastguard Worker 
sse8_sse4_1(const uint8_t * a,const uint8_t * b,__m128i * sum)67*77c1e3ccSAndroid Build Coastguard Worker static inline void sse8_sse4_1(const uint8_t *a, const uint8_t *b,
68*77c1e3ccSAndroid Build Coastguard Worker                                __m128i *sum) {
69*77c1e3ccSAndroid Build Coastguard Worker   const __m128i v_a0 = xx_loadl_64(a);
70*77c1e3ccSAndroid Build Coastguard Worker   const __m128i v_b0 = xx_loadl_64(b);
71*77c1e3ccSAndroid Build Coastguard Worker   const __m128i v_a_w = _mm_cvtepu8_epi16(v_a0);
72*77c1e3ccSAndroid Build Coastguard Worker   const __m128i v_b_w = _mm_cvtepu8_epi16(v_b0);
73*77c1e3ccSAndroid Build Coastguard Worker   const __m128i v_d_w = _mm_sub_epi16(v_a_w, v_b_w);
74*77c1e3ccSAndroid Build Coastguard Worker   *sum = _mm_add_epi32(*sum, _mm_madd_epi16(v_d_w, v_d_w));
75*77c1e3ccSAndroid Build Coastguard Worker }
76*77c1e3ccSAndroid Build Coastguard Worker 
// Returns the sum of squared differences between two 8-bit blocks.
//
//   a, a_stride: first block and the byte offset between its rows.
//   b, b_stride: second block and the byte offset between its rows.
//   width, height: block dimensions in pixels.
//
// Widths 4, 8, 16, 32, 64 and 128 have dedicated paths. Any other width is
// handled in 8-pixel chunks; when the width is not a multiple of 8 the rows
// are processed in pairs and finished with one 4-pixel-wide tail, so that
// path appears to assume width % 8 == 4 (TODO confirm against callers).
// The width-4 path and the non-multiple-of-8 path consume two rows per
// iteration. Every row loop runs at least once.
//
// All partial sums are kept in four 32-bit lanes and only widened to 64 bits
// at the very end.
int64_t aom_sse_sse4_1(const uint8_t *a, int a_stride, const uint8_t *b,
                       int b_stride, int width, int height) {
  __m128i acc = _mm_setzero_si128();
  int row = 0;

  switch (width) {
    case 4:
      do {
        sse4x2_sse4_1(a, a_stride, b, b_stride, &acc);
        a += a_stride << 1;
        b += b_stride << 1;
        row += 2;
      } while (row < height);
      break;
    case 8:
      do {
        sse8_sse4_1(a, b, &acc);
        a += a_stride;
        b += b_stride;
      } while (++row < height);
      break;
    case 16:
      do {
        sse_w16_sse4_1(&acc, a, b);
        a += a_stride;
        b += b_stride;
      } while (++row < height);
      break;
    case 32:
      do {
        for (int col = 0; col < 32; col += 16) {
          sse_w16_sse4_1(&acc, a + col, b + col);
        }
        a += a_stride;
        b += b_stride;
      } while (++row < height);
      break;
    case 64:
      do {
        for (int col = 0; col < 64; col += 16) {
          sse_w16_sse4_1(&acc, a + col, b + col);
        }
        a += a_stride;
        b += b_stride;
      } while (++row < height);
      break;
    case 128:
      do {
        for (int col = 0; col < 128; col += 16) {
          sse_w16_sse4_1(&acc, a + col, b + col);
        }
        a += a_stride;
        b += b_stride;
      } while (++row < height);
      break;
    default:
      if ((width & 0x07) != 0) {
        // Two rows at a time: 8-wide chunks first, then a 4x2 tail.
        do {
          const uint8_t *const a_next = a + a_stride;
          const uint8_t *const b_next = b + b_stride;
          int col = 0;
          do {
            sse8_sse4_1(a + col, b + col, &acc);
            sse8_sse4_1(a_next + col, b_next + col, &acc);
            col += 8;
          } while (col + 4 < width);
          sse4x2_sse4_1(a + col, a_stride, b + col, b_stride, &acc);
          a += (a_stride << 1);
          b += (b_stride << 1);
          row += 2;
        } while (row < height);
      } else {
        // Width is a multiple of 8: one row at a time in 8-wide chunks.
        do {
          int col = 0;
          do {
            sse8_sse4_1(a + col, b + col, &acc);
            col += 8;
          } while (col < width);
          a += a_stride;
          b += b_stride;
        } while (++row < height);
      }
      break;
  }

  // Every path above leaves its result in the 32-bit lanes of `acc`.
  return summary_all_sse4(&acc);
}
180*77c1e3ccSAndroid Build Coastguard Worker 
181*77c1e3ccSAndroid Build Coastguard Worker #if CONFIG_AV1_HIGHBITDEPTH
highbd_sse_w4x2_sse4_1(__m128i * sum,const uint16_t * a,int a_stride,const uint16_t * b,int b_stride)182*77c1e3ccSAndroid Build Coastguard Worker static inline void highbd_sse_w4x2_sse4_1(__m128i *sum, const uint16_t *a,
183*77c1e3ccSAndroid Build Coastguard Worker                                           int a_stride, const uint16_t *b,
184*77c1e3ccSAndroid Build Coastguard Worker                                           int b_stride) {
185*77c1e3ccSAndroid Build Coastguard Worker   const __m128i v_a0 = xx_loadl_64(a);
186*77c1e3ccSAndroid Build Coastguard Worker   const __m128i v_a1 = xx_loadl_64(a + a_stride);
187*77c1e3ccSAndroid Build Coastguard Worker   const __m128i v_b0 = xx_loadl_64(b);
188*77c1e3ccSAndroid Build Coastguard Worker   const __m128i v_b1 = xx_loadl_64(b + b_stride);
189*77c1e3ccSAndroid Build Coastguard Worker   const __m128i v_a_w = _mm_unpacklo_epi64(v_a0, v_a1);
190*77c1e3ccSAndroid Build Coastguard Worker   const __m128i v_b_w = _mm_unpacklo_epi64(v_b0, v_b1);
191*77c1e3ccSAndroid Build Coastguard Worker   const __m128i v_d_w = _mm_sub_epi16(v_a_w, v_b_w);
192*77c1e3ccSAndroid Build Coastguard Worker   *sum = _mm_add_epi32(*sum, _mm_madd_epi16(v_d_w, v_d_w));
193*77c1e3ccSAndroid Build Coastguard Worker }
194*77c1e3ccSAndroid Build Coastguard Worker 
highbd_sse_w8_sse4_1(__m128i * sum,const uint16_t * a,const uint16_t * b)195*77c1e3ccSAndroid Build Coastguard Worker static inline void highbd_sse_w8_sse4_1(__m128i *sum, const uint16_t *a,
196*77c1e3ccSAndroid Build Coastguard Worker                                         const uint16_t *b) {
197*77c1e3ccSAndroid Build Coastguard Worker   const __m128i v_a_w = xx_loadu_128(a);
198*77c1e3ccSAndroid Build Coastguard Worker   const __m128i v_b_w = xx_loadu_128(b);
199*77c1e3ccSAndroid Build Coastguard Worker   const __m128i v_d_w = _mm_sub_epi16(v_a_w, v_b_w);
200*77c1e3ccSAndroid Build Coastguard Worker   *sum = _mm_add_epi32(*sum, _mm_madd_epi16(v_d_w, v_d_w));
201*77c1e3ccSAndroid Build Coastguard Worker }
202*77c1e3ccSAndroid Build Coastguard Worker 
// Returns the sum of squared differences between two high-bit-depth blocks.
//
//   a8, b8: block pointers, converted to uint16_t sample pointers with
//           CONVERT_TO_SHORTPTR.
//   a_stride, b_stride: distance between rows, in uint16_t samples.
//   width, height: block dimensions in samples.
//
// Widths 4 and 8 accumulate straight into 32-bit lanes and reduce once at the
// end. Every other width accumulates a bounded batch of rows into a temporary
// 32-bit accumulator and then folds that batch into a 64-bit accumulator,
// which limits how much can pile up in a 32-bit lane. Batch sizes are 64, 32,
// 16 and 8 rows for widths 16, 32, 64 and 128; 8 rows for other multiples of
// 8; and 2 rows otherwise.
//
// When the width is not a multiple of 8, rows are processed in pairs as
// 8-wide chunks followed by one 4-wide tail, so that path appears to assume
// width % 8 == 4 (TODO confirm against callers). Every row loop runs at least
// once.
int64_t aom_highbd_sse_sse4_1(const uint8_t *a8, int a_stride,
                              const uint8_t *b8, int b_stride, int width,
                              int height) {
  uint16_t *a = CONVERT_TO_SHORTPTR(a8);
  uint16_t *b = CONVERT_TO_SHORTPTR(b8);
  // Holds 32-bit lanes for widths 4 and 8, and 64-bit lanes otherwise.
  __m128i acc = _mm_setzero_si128();
  int32_t row = 0;

  switch (width) {
    case 4:
      do {
        highbd_sse_w4x2_sse4_1(&acc, a, a_stride, b, b_stride);
        a += a_stride << 1;
        b += b_stride << 1;
        row += 2;
      } while (row < height);
      return summary_all_sse4(&acc);
    case 8:
      do {
        highbd_sse_w8_sse4_1(&acc, a, b);
        a += a_stride;
        b += b_stride;
      } while (++row < height);
      return summary_all_sse4(&acc);
    case 16:
      do {
        const int rows_left = height - row;
        __m128i batch = _mm_setzero_si128();
        int n = 0;
        do {
          for (int col = 0; col < 16; col += 8) {
            highbd_sse_w8_sse4_1(&batch, a + col, b + col);
          }
          a += a_stride;
          b += b_stride;
          ++n;
        } while (n < 64 && n < rows_left);
        summary_32_sse4(&batch, &acc);
        row += 64;
      } while (row < height);
      break;
    case 32:
      do {
        const int rows_left = height - row;
        __m128i batch = _mm_setzero_si128();
        int n = 0;
        do {
          for (int col = 0; col < 32; col += 8) {
            highbd_sse_w8_sse4_1(&batch, a + col, b + col);
          }
          a += a_stride;
          b += b_stride;
          ++n;
        } while (n < 32 && n < rows_left);
        summary_32_sse4(&batch, &acc);
        row += 32;
      } while (row < height);
      break;
    case 64:
      do {
        const int rows_left = height - row;
        __m128i batch = _mm_setzero_si128();
        int n = 0;
        do {
          for (int col = 0; col < 64; col += 8) {
            highbd_sse_w8_sse4_1(&batch, a + col, b + col);
          }
          a += a_stride;
          b += b_stride;
          ++n;
        } while (n < 16 && n < rows_left);
        summary_32_sse4(&batch, &acc);
        row += 16;
      } while (row < height);
      break;
    case 128:
      do {
        const int rows_left = height - row;
        __m128i batch = _mm_setzero_si128();
        int n = 0;
        do {
          for (int col = 0; col < 128; col += 8) {
            highbd_sse_w8_sse4_1(&batch, a + col, b + col);
          }
          a += a_stride;
          b += b_stride;
          ++n;
        } while (n < 8 && n < rows_left);
        summary_32_sse4(&batch, &acc);
        row += 8;
      } while (row < height);
      break;
    default:
      if ((width & 0x7) != 0) {
        // Two rows per batch: 8-wide chunks first, then a 4x2 tail.
        do {
          const uint16_t *const a_next = a + a_stride;
          const uint16_t *const b_next = b + b_stride;
          __m128i batch = _mm_setzero_si128();
          int col = 0;
          do {
            highbd_sse_w8_sse4_1(&batch, a + col, b + col);
            highbd_sse_w8_sse4_1(&batch, a_next + col, b_next + col);
            col += 8;
          } while (col + 4 < width);
          highbd_sse_w4x2_sse4_1(&batch, a + col, a_stride, b + col, b_stride);
          summary_32_sse4(&batch, &acc);
          a += (a_stride << 1);
          b += (b_stride << 1);
          row += 2;
        } while (row < height);
      } else {
        // Width is a multiple of 8: batches of up to 8 full rows.
        do {
          const int rows_left = height - row;
          __m128i batch = _mm_setzero_si128();
          int n = 0;
          do {
            int col = 0;
            do {
              highbd_sse_w8_sse4_1(&batch, a + col, b + col);
              col += 8;
            } while (col < width);
            a += a_stride;
            b += b_stride;
            ++n;
          } while (n < 8 && n < rows_left);
          summary_32_sse4(&batch, &acc);
          row += 8;
        } while (row < height);
      }
      break;
  }

  // Batched paths: add the two 64-bit lanes of `acc` to get the final total.
  int64_t sse = 0;
  xx_storel_64(&sse, _mm_add_epi64(acc, _mm_srli_si128(acc, 8)));
  return sse;
}
355*77c1e3ccSAndroid Build Coastguard Worker #endif  // CONFIG_AV1_HIGHBITDEPTH
356