xref: /aosp_15_r20/external/libvpx/vpx_dsp/x86/sse_sse4.c (revision fb1b10ab9aebc7c7068eedab379b749d7e3900be)
/*
 *  Copyright (c) 2023 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */
10*fb1b10abSAndroid Build Coastguard Worker 
11*fb1b10abSAndroid Build Coastguard Worker #include <assert.h>
12*fb1b10abSAndroid Build Coastguard Worker #include <smmintrin.h>
13*fb1b10abSAndroid Build Coastguard Worker 
14*fb1b10abSAndroid Build Coastguard Worker #include "./vpx_config.h"
15*fb1b10abSAndroid Build Coastguard Worker #include "./vpx_dsp_rtcd.h"
16*fb1b10abSAndroid Build Coastguard Worker 
17*fb1b10abSAndroid Build Coastguard Worker #include "vpx_ports/mem.h"
18*fb1b10abSAndroid Build Coastguard Worker #include "vpx/vpx_integer.h"
19*fb1b10abSAndroid Build Coastguard Worker #include "vpx_dsp/x86/mem_sse2.h"
20*fb1b10abSAndroid Build Coastguard Worker 
summary_all_sse4(const __m128i * sum_all)21*fb1b10abSAndroid Build Coastguard Worker static INLINE int64_t summary_all_sse4(const __m128i *sum_all) {
22*fb1b10abSAndroid Build Coastguard Worker   int64_t sum;
23*fb1b10abSAndroid Build Coastguard Worker   const __m128i sum0 = _mm_cvtepu32_epi64(*sum_all);
24*fb1b10abSAndroid Build Coastguard Worker   const __m128i sum1 = _mm_cvtepu32_epi64(_mm_srli_si128(*sum_all, 8));
25*fb1b10abSAndroid Build Coastguard Worker   const __m128i sum_2x64 = _mm_add_epi64(sum0, sum1);
26*fb1b10abSAndroid Build Coastguard Worker   const __m128i sum_1x64 = _mm_add_epi64(sum_2x64, _mm_srli_si128(sum_2x64, 8));
27*fb1b10abSAndroid Build Coastguard Worker   _mm_storel_epi64((__m128i *)&sum, sum_1x64);
28*fb1b10abSAndroid Build Coastguard Worker   return sum;
29*fb1b10abSAndroid Build Coastguard Worker }
30*fb1b10abSAndroid Build Coastguard Worker 
31*fb1b10abSAndroid Build Coastguard Worker #if CONFIG_VP9_HIGHBITDEPTH
summary_32_sse4(const __m128i * sum32,__m128i * sum64)32*fb1b10abSAndroid Build Coastguard Worker static INLINE void summary_32_sse4(const __m128i *sum32, __m128i *sum64) {
33*fb1b10abSAndroid Build Coastguard Worker   const __m128i sum0 = _mm_cvtepu32_epi64(*sum32);
34*fb1b10abSAndroid Build Coastguard Worker   const __m128i sum1 = _mm_cvtepu32_epi64(_mm_srli_si128(*sum32, 8));
35*fb1b10abSAndroid Build Coastguard Worker   *sum64 = _mm_add_epi64(sum0, *sum64);
36*fb1b10abSAndroid Build Coastguard Worker   *sum64 = _mm_add_epi64(sum1, *sum64);
37*fb1b10abSAndroid Build Coastguard Worker }
38*fb1b10abSAndroid Build Coastguard Worker #endif
39*fb1b10abSAndroid Build Coastguard Worker 
sse_w16_sse4_1(__m128i * sum,const uint8_t * a,const uint8_t * b)40*fb1b10abSAndroid Build Coastguard Worker static INLINE void sse_w16_sse4_1(__m128i *sum, const uint8_t *a,
41*fb1b10abSAndroid Build Coastguard Worker                                   const uint8_t *b) {
42*fb1b10abSAndroid Build Coastguard Worker   const __m128i v_a0 = _mm_loadu_si128((const __m128i *)a);
43*fb1b10abSAndroid Build Coastguard Worker   const __m128i v_b0 = _mm_loadu_si128((const __m128i *)b);
44*fb1b10abSAndroid Build Coastguard Worker   const __m128i v_a00_w = _mm_cvtepu8_epi16(v_a0);
45*fb1b10abSAndroid Build Coastguard Worker   const __m128i v_a01_w = _mm_cvtepu8_epi16(_mm_srli_si128(v_a0, 8));
46*fb1b10abSAndroid Build Coastguard Worker   const __m128i v_b00_w = _mm_cvtepu8_epi16(v_b0);
47*fb1b10abSAndroid Build Coastguard Worker   const __m128i v_b01_w = _mm_cvtepu8_epi16(_mm_srli_si128(v_b0, 8));
48*fb1b10abSAndroid Build Coastguard Worker   const __m128i v_d00_w = _mm_sub_epi16(v_a00_w, v_b00_w);
49*fb1b10abSAndroid Build Coastguard Worker   const __m128i v_d01_w = _mm_sub_epi16(v_a01_w, v_b01_w);
50*fb1b10abSAndroid Build Coastguard Worker   *sum = _mm_add_epi32(*sum, _mm_madd_epi16(v_d00_w, v_d00_w));
51*fb1b10abSAndroid Build Coastguard Worker   *sum = _mm_add_epi32(*sum, _mm_madd_epi16(v_d01_w, v_d01_w));
52*fb1b10abSAndroid Build Coastguard Worker }
53*fb1b10abSAndroid Build Coastguard Worker 
sse4x2_sse4_1(const uint8_t * a,int a_stride,const uint8_t * b,int b_stride,__m128i * sum)54*fb1b10abSAndroid Build Coastguard Worker static INLINE void sse4x2_sse4_1(const uint8_t *a, int a_stride,
55*fb1b10abSAndroid Build Coastguard Worker                                  const uint8_t *b, int b_stride, __m128i *sum) {
56*fb1b10abSAndroid Build Coastguard Worker   const __m128i v_a0 = load_unaligned_u32(a);
57*fb1b10abSAndroid Build Coastguard Worker   const __m128i v_a1 = load_unaligned_u32(a + a_stride);
58*fb1b10abSAndroid Build Coastguard Worker   const __m128i v_b0 = load_unaligned_u32(b);
59*fb1b10abSAndroid Build Coastguard Worker   const __m128i v_b1 = load_unaligned_u32(b + b_stride);
60*fb1b10abSAndroid Build Coastguard Worker   const __m128i v_a_w = _mm_cvtepu8_epi16(_mm_unpacklo_epi32(v_a0, v_a1));
61*fb1b10abSAndroid Build Coastguard Worker   const __m128i v_b_w = _mm_cvtepu8_epi16(_mm_unpacklo_epi32(v_b0, v_b1));
62*fb1b10abSAndroid Build Coastguard Worker   const __m128i v_d_w = _mm_sub_epi16(v_a_w, v_b_w);
63*fb1b10abSAndroid Build Coastguard Worker   *sum = _mm_add_epi32(*sum, _mm_madd_epi16(v_d_w, v_d_w));
64*fb1b10abSAndroid Build Coastguard Worker }
65*fb1b10abSAndroid Build Coastguard Worker 
sse8_sse4_1(const uint8_t * a,const uint8_t * b,__m128i * sum)66*fb1b10abSAndroid Build Coastguard Worker static INLINE void sse8_sse4_1(const uint8_t *a, const uint8_t *b,
67*fb1b10abSAndroid Build Coastguard Worker                                __m128i *sum) {
68*fb1b10abSAndroid Build Coastguard Worker   const __m128i v_a0 = _mm_loadl_epi64((const __m128i *)a);
69*fb1b10abSAndroid Build Coastguard Worker   const __m128i v_b0 = _mm_loadl_epi64((const __m128i *)b);
70*fb1b10abSAndroid Build Coastguard Worker   const __m128i v_a_w = _mm_cvtepu8_epi16(v_a0);
71*fb1b10abSAndroid Build Coastguard Worker   const __m128i v_b_w = _mm_cvtepu8_epi16(v_b0);
72*fb1b10abSAndroid Build Coastguard Worker   const __m128i v_d_w = _mm_sub_epi16(v_a_w, v_b_w);
73*fb1b10abSAndroid Build Coastguard Worker   *sum = _mm_add_epi32(*sum, _mm_madd_epi16(v_d_w, v_d_w));
74*fb1b10abSAndroid Build Coastguard Worker }
75*fb1b10abSAndroid Build Coastguard Worker 
// Returns the sum of squared differences between two 8-bit blocks of
// width x height samples whose rows are a_stride / b_stride bytes apart.
//
// Squared differences are accumulated in four 32-bit lanes for the whole
// block and widened to 64 bits once, at the end. Every row loop is a
// do-while, so at least one step is processed regardless of height.
int64_t vpx_sse_sse4_1(const uint8_t *a, int a_stride, const uint8_t *b,
                       int b_stride, int width, int height) {
  __m128i acc = _mm_setzero_si128();
  int row = 0;

  switch (width) {
    case 4:
      // Two rows per iteration.
      do {
        sse4x2_sse4_1(a, a_stride, b, b_stride, &acc);
        a += a_stride << 1;
        b += b_stride << 1;
        row += 2;
      } while (row < height);
      break;
    case 8:
      do {
        sse8_sse4_1(a, b, &acc);
        a += a_stride;
        b += b_stride;
        ++row;
      } while (row < height);
      break;
    case 16:
      do {
        sse_w16_sse4_1(&acc, a, b);
        a += a_stride;
        b += b_stride;
        ++row;
      } while (row < height);
      break;
    case 32:
      do {
        sse_w16_sse4_1(&acc, a, b);
        sse_w16_sse4_1(&acc, a + 16, b + 16);
        a += a_stride;
        b += b_stride;
        ++row;
      } while (row < height);
      break;
    case 64:
      do {
        sse_w16_sse4_1(&acc, a, b);
        sse_w16_sse4_1(&acc, a + 16 * 1, b + 16 * 1);
        sse_w16_sse4_1(&acc, a + 16 * 2, b + 16 * 2);
        sse_w16_sse4_1(&acc, a + 16 * 3, b + 16 * 3);
        a += a_stride;
        b += b_stride;
        ++row;
      } while (row < height);
      break;
    default:
      if ((width & 0x07) != 0) {
        // Width is not a multiple of 8: handle two rows at a time with 8-wide
        // steps (always at least one) followed by a single 4x2 step for the
        // remaining columns.
        do {
          int col = 0;
          do {
            sse8_sse4_1(a + col, b + col, &acc);
            sse8_sse4_1(a + col + a_stride, b + col + b_stride, &acc);
            col += 8;
          } while (col + 4 < width);
          sse4x2_sse4_1(a + col, a_stride, b + col, b_stride, &acc);
          a += (a_stride << 1);
          b += (b_stride << 1);
          row += 2;
        } while (row < height);
      } else {
        // Width is a multiple of 8: one row at a time in 8-wide steps.
        do {
          int col = 0;
          do {
            sse8_sse4_1(a + col, b + col, &acc);
            col += 8;
          } while (col < width);
          a += a_stride;
          b += b_stride;
          ++row;
        } while (row < height);
      }
      break;
  }

  return summary_all_sse4(&acc);
}
163*fb1b10abSAndroid Build Coastguard Worker 
164*fb1b10abSAndroid Build Coastguard Worker #if CONFIG_VP9_HIGHBITDEPTH
highbd_sse_w4x2_sse4_1(__m128i * sum,const uint16_t * a,int a_stride,const uint16_t * b,int b_stride)165*fb1b10abSAndroid Build Coastguard Worker static INLINE void highbd_sse_w4x2_sse4_1(__m128i *sum, const uint16_t *a,
166*fb1b10abSAndroid Build Coastguard Worker                                           int a_stride, const uint16_t *b,
167*fb1b10abSAndroid Build Coastguard Worker                                           int b_stride) {
168*fb1b10abSAndroid Build Coastguard Worker   const __m128i v_a0 = _mm_loadl_epi64((const __m128i *)a);
169*fb1b10abSAndroid Build Coastguard Worker   const __m128i v_a1 = _mm_loadl_epi64((const __m128i *)(a + a_stride));
170*fb1b10abSAndroid Build Coastguard Worker   const __m128i v_b0 = _mm_loadl_epi64((const __m128i *)b);
171*fb1b10abSAndroid Build Coastguard Worker   const __m128i v_b1 = _mm_loadl_epi64((const __m128i *)(b + b_stride));
172*fb1b10abSAndroid Build Coastguard Worker   const __m128i v_a_w = _mm_unpacklo_epi64(v_a0, v_a1);
173*fb1b10abSAndroid Build Coastguard Worker   const __m128i v_b_w = _mm_unpacklo_epi64(v_b0, v_b1);
174*fb1b10abSAndroid Build Coastguard Worker   const __m128i v_d_w = _mm_sub_epi16(v_a_w, v_b_w);
175*fb1b10abSAndroid Build Coastguard Worker   *sum = _mm_add_epi32(*sum, _mm_madd_epi16(v_d_w, v_d_w));
176*fb1b10abSAndroid Build Coastguard Worker }
177*fb1b10abSAndroid Build Coastguard Worker 
highbd_sse_w8_sse4_1(__m128i * sum,const uint16_t * a,const uint16_t * b)178*fb1b10abSAndroid Build Coastguard Worker static INLINE void highbd_sse_w8_sse4_1(__m128i *sum, const uint16_t *a,
179*fb1b10abSAndroid Build Coastguard Worker                                         const uint16_t *b) {
180*fb1b10abSAndroid Build Coastguard Worker   const __m128i v_a_w = _mm_loadu_si128((const __m128i *)a);
181*fb1b10abSAndroid Build Coastguard Worker   const __m128i v_b_w = _mm_loadu_si128((const __m128i *)b);
182*fb1b10abSAndroid Build Coastguard Worker   const __m128i v_d_w = _mm_sub_epi16(v_a_w, v_b_w);
183*fb1b10abSAndroid Build Coastguard Worker   *sum = _mm_add_epi32(*sum, _mm_madd_epi16(v_d_w, v_d_w));
184*fb1b10abSAndroid Build Coastguard Worker }
185*fb1b10abSAndroid Build Coastguard Worker 
// Returns the sum of squared differences between two blocks of 16-bit
// samples, width x height in size. a8 and b8 are converted to uint16_t
// pointers with CONVERT_TO_SHORTPTR; strides are applied to those 16-bit
// pointers.
//
// Widths 4 and 8 accumulate in 32-bit lanes for the whole block. All other
// widths accumulate a bounded number of rows in 32-bit lanes and then flush
// into a 64-bit accumulator -- presumably to keep the 32-bit lanes from
// overflowing on high-bit-depth input (TODO confirm the intended bound).
int64_t vpx_highbd_sse_sse4_1(const uint8_t *a8, int a_stride,
                              const uint8_t *b8, int b_stride, int width,
                              int height) {
  const uint16_t *a = CONVERT_TO_SHORTPTR(a8);
  const uint16_t *b = CONVERT_TO_SHORTPTR(b8);
  __m128i acc = _mm_setzero_si128();
  int64_t result = 0;
  int32_t row = 0;

  switch (width) {
    case 4:
      // Two rows per iteration; acc holds 32-bit lanes here.
      do {
        highbd_sse_w4x2_sse4_1(&acc, a, a_stride, b, b_stride);
        a += a_stride << 1;
        b += b_stride << 1;
        row += 2;
      } while (row < height);
      return summary_all_sse4(&acc);
    case 8:
      // acc holds 32-bit lanes here.
      do {
        highbd_sse_w8_sse4_1(&acc, a, b);
        a += a_stride;
        b += b_stride;
        ++row;
      } while (row < height);
      return summary_all_sse4(&acc);
    case 16:
      // Flush to the 64-bit accumulator after at most 64 rows.
      do {
        const int rows_left = height - row;
        const int chunk = rows_left < 64 ? rows_left : 64;
        __m128i partial = _mm_setzero_si128();
        int done = 0;
        do {
          highbd_sse_w8_sse4_1(&partial, a, b);
          highbd_sse_w8_sse4_1(&partial, a + 8, b + 8);
          a += a_stride;
          b += b_stride;
          ++done;
        } while (done < chunk);
        summary_32_sse4(&partial, &acc);
        row += 64;
      } while (row < height);
      break;
    case 32:
      // Flush to the 64-bit accumulator after at most 32 rows.
      do {
        const int rows_left = height - row;
        const int chunk = rows_left < 32 ? rows_left : 32;
        __m128i partial = _mm_setzero_si128();
        int done = 0;
        do {
          highbd_sse_w8_sse4_1(&partial, a, b);
          highbd_sse_w8_sse4_1(&partial, a + 8 * 1, b + 8 * 1);
          highbd_sse_w8_sse4_1(&partial, a + 8 * 2, b + 8 * 2);
          highbd_sse_w8_sse4_1(&partial, a + 8 * 3, b + 8 * 3);
          a += a_stride;
          b += b_stride;
          ++done;
        } while (done < chunk);
        summary_32_sse4(&partial, &acc);
        row += 32;
      } while (row < height);
      break;
    case 64:
      // Flush to the 64-bit accumulator after at most 16 rows.
      do {
        const int rows_left = height - row;
        const int chunk = rows_left < 16 ? rows_left : 16;
        __m128i partial = _mm_setzero_si128();
        int done = 0;
        do {
          highbd_sse_w8_sse4_1(&partial, a, b);
          highbd_sse_w8_sse4_1(&partial, a + 8 * 1, b + 8 * 1);
          highbd_sse_w8_sse4_1(&partial, a + 8 * 2, b + 8 * 2);
          highbd_sse_w8_sse4_1(&partial, a + 8 * 3, b + 8 * 3);
          highbd_sse_w8_sse4_1(&partial, a + 8 * 4, b + 8 * 4);
          highbd_sse_w8_sse4_1(&partial, a + 8 * 5, b + 8 * 5);
          highbd_sse_w8_sse4_1(&partial, a + 8 * 6, b + 8 * 6);
          highbd_sse_w8_sse4_1(&partial, a + 8 * 7, b + 8 * 7);
          a += a_stride;
          b += b_stride;
          ++done;
        } while (done < chunk);
        summary_32_sse4(&partial, &acc);
        row += 16;
      } while (row < height);
      break;
    default:
      if ((width & 0x7) != 0) {
        // Width is not a multiple of 8: handle two rows at a time with 8-wide
        // steps (always at least one) followed by a single 4x2 step, and
        // flush to the 64-bit accumulator after every row pair.
        do {
          __m128i partial = _mm_setzero_si128();
          int col = 0;
          do {
            highbd_sse_w8_sse4_1(&partial, a + col, b + col);
            highbd_sse_w8_sse4_1(&partial, a + col + a_stride,
                                 b + col + b_stride);
            col += 8;
          } while (col + 4 < width);
          highbd_sse_w4x2_sse4_1(&partial, a + col, a_stride, b + col,
                                 b_stride);
          summary_32_sse4(&partial, &acc);
          a += (a_stride << 1);
          b += (b_stride << 1);
          row += 2;
        } while (row < height);
      } else {
        // Width is a multiple of 8: 8-wide steps per row, flushing to the
        // 64-bit accumulator after at most 8 rows.
        do {
          const int rows_left = height - row;
          const int chunk = rows_left < 8 ? rows_left : 8;
          __m128i partial = _mm_setzero_si128();
          int done = 0;
          do {
            int col = 0;
            do {
              highbd_sse_w8_sse4_1(&partial, a + col, b + col);
              col += 8;
            } while (col < width);
            a += a_stride;
            b += b_stride;
            ++done;
          } while (done < chunk);
          summary_32_sse4(&partial, &acc);
          row += 8;
        } while (row < height);
      }
      break;
  }

  // acc holds two 64-bit partial sums; fold them and store the low lane.
  _mm_storel_epi64((__m128i *)&result,
                   _mm_add_epi64(acc, _mm_srli_si128(acc, 8)));
  return result;
}
312*fb1b10abSAndroid Build Coastguard Worker #endif  // CONFIG_VP9_HIGHBITDEPTH
313