/*
 * Copyright (c) 2022 Samsung Electronics Co., Ltd.
 * All Rights Reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * - Redistributions of source code must retain the above copyright notice,
 *   this list of conditions and the following disclaimer.
 *
 * - Redistributions in binary form must reproduce the above copyright notice,
 *   this list of conditions and the following disclaimer in the documentation
 *   and/or other materials provided with the distribution.
 *
 * - Neither the name of the copyright owner, nor the names of its contributors
 *   may be used to endorse or promote products derived from this software
 *   without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */
31*abb65b4bSAndroid Build Coastguard Worker
32*abb65b4bSAndroid Build Coastguard Worker #include "oapv_sad_sse.h"
33*abb65b4bSAndroid Build Coastguard Worker
34*abb65b4bSAndroid Build Coastguard Worker #if X86_SSE
35*abb65b4bSAndroid Build Coastguard Worker
36*abb65b4bSAndroid Build Coastguard Worker /* SSD ***********************************************************************/
/*
 * SSE_SSD_16B_8PEL - accumulate the squared differences of one row of
 * eight 16-bit samples into a 4 x 32-bit accumulator.
 *
 *   src1, src2    : addresses of eight consecutive 16-bit samples each;
 *                   unaligned loads are used, so no alignment is required
 *   shift         : logical right shift applied to every squared difference
 *                   before it is accumulated
 *   s00, s01, s02 : caller-supplied __m128i scratch variables (clobbered)
 *   s00a          : caller-supplied __m128i accumulator; 32-bit lane i
 *                   receives the contributions of samples i and i + 4
 *
 * The difference is formed with wrapping 16-bit arithmetic
 * (_mm_sub_epi16), so it is exact only while src1[i] - src2[i] fits in a
 * signed 16-bit value.
 *
 * The body is wrapped in do { } while (0) so that the expansion is a single
 * statement and stays correct under an unbraced if/else or loop. Invoke it
 * with a trailing semicolon.
 */
#define SSE_SSD_16B_8PEL(src1, src2, shift, s00, s01, s02, s00a) \
    do { \
        s00 = _mm_loadu_si128((__m128i *)(src1)); \
        s01 = _mm_loadu_si128((__m128i *)(src2)); \
        s02 = _mm_sub_epi16(s00, s01); \
        \
        /* samples 0..3: sign-extend to 32 bit and square */ \
        s00 = _mm_cvtepi16_epi32(s02); \
        s00 = _mm_mullo_epi32(s00, s00); \
        \
        /* samples 4..7: move the upper 8 bytes down, then the same */ \
        s01 = _mm_srli_si128(s02, 8); \
        s01 = _mm_cvtepi16_epi32(s01); \
        s01 = _mm_mullo_epi32(s01, s01); \
        \
        s00 = _mm_srli_epi32(s00, (shift)); \
        s01 = _mm_srli_epi32(s01, (shift)); \
        s00a = _mm_add_epi32(s00a, s00); \
        s00a = _mm_add_epi32(s00a, s01); \
    } while (0)
53*abb65b4bSAndroid Build Coastguard Worker
ssd_16b_sse_8x8(int w,int h,void * src1,void * src2,int s_src1,int s_src2,int bit_depth)54*abb65b4bSAndroid Build Coastguard Worker static s64 ssd_16b_sse_8x8(int w, int h, void * src1, void * src2, int s_src1, int s_src2, int bit_depth)
55*abb65b4bSAndroid Build Coastguard Worker {
56*abb65b4bSAndroid Build Coastguard Worker s64 ssd;
57*abb65b4bSAndroid Build Coastguard Worker s16 * s1;
58*abb65b4bSAndroid Build Coastguard Worker s16 * s2;
59*abb65b4bSAndroid Build Coastguard Worker const int shift = 0;
60*abb65b4bSAndroid Build Coastguard Worker __m128i s00, s01, s02, s00a;
61*abb65b4bSAndroid Build Coastguard Worker
62*abb65b4bSAndroid Build Coastguard Worker s1 = (s16 *)src1;
63*abb65b4bSAndroid Build Coastguard Worker s2 = (s16 *)src2;
64*abb65b4bSAndroid Build Coastguard Worker
65*abb65b4bSAndroid Build Coastguard Worker s00a = _mm_setzero_si128();
66*abb65b4bSAndroid Build Coastguard Worker
67*abb65b4bSAndroid Build Coastguard Worker SSE_SSD_16B_8PEL(s1, s2, shift, s00, s01, s02, s00a);
68*abb65b4bSAndroid Build Coastguard Worker SSE_SSD_16B_8PEL(s1 + s_src1, s2 + s_src2, shift, s00, s01, s02, s00a);
69*abb65b4bSAndroid Build Coastguard Worker SSE_SSD_16B_8PEL(s1 + s_src1*2, s2 + s_src2*2, shift, s00, s01, s02, s00a);
70*abb65b4bSAndroid Build Coastguard Worker SSE_SSD_16B_8PEL(s1 + s_src1*3, s2 + s_src2*3, shift, s00, s01, s02, s00a);
71*abb65b4bSAndroid Build Coastguard Worker SSE_SSD_16B_8PEL(s1 + s_src1*4, s2 + s_src2*4, shift, s00, s01, s02, s00a);
72*abb65b4bSAndroid Build Coastguard Worker SSE_SSD_16B_8PEL(s1 + s_src1*5, s2 + s_src2*5, shift, s00, s01, s02, s00a);
73*abb65b4bSAndroid Build Coastguard Worker SSE_SSD_16B_8PEL(s1 + s_src1*6, s2 + s_src2*6, shift, s00, s01, s02, s00a);
74*abb65b4bSAndroid Build Coastguard Worker SSE_SSD_16B_8PEL(s1 + s_src1*7, s2 + s_src2*7, shift, s00, s01, s02, s00a);
75*abb65b4bSAndroid Build Coastguard Worker
76*abb65b4bSAndroid Build Coastguard Worker ssd = _mm_extract_epi32(s00a, 0);
77*abb65b4bSAndroid Build Coastguard Worker ssd += _mm_extract_epi32(s00a, 1);
78*abb65b4bSAndroid Build Coastguard Worker ssd += _mm_extract_epi32(s00a, 2);
79*abb65b4bSAndroid Build Coastguard Worker ssd += _mm_extract_epi32(s00a, 3);
80*abb65b4bSAndroid Build Coastguard Worker
81*abb65b4bSAndroid Build Coastguard Worker return ssd;
82*abb65b4bSAndroid Build Coastguard Worker }
83*abb65b4bSAndroid Build Coastguard Worker
84*abb65b4bSAndroid Build Coastguard Worker const oapv_fn_ssd_t oapv_tbl_fn_ssd_16b_sse[2] =
85*abb65b4bSAndroid Build Coastguard Worker {
86*abb65b4bSAndroid Build Coastguard Worker ssd_16b_sse_8x8,
87*abb65b4bSAndroid Build Coastguard Worker NULL
88*abb65b4bSAndroid Build Coastguard Worker };
89*abb65b4bSAndroid Build Coastguard Worker
oapv_dc_removed_had8x8_sse(pel * org,int s_org)90*abb65b4bSAndroid Build Coastguard Worker int oapv_dc_removed_had8x8_sse(pel* org, int s_org)
91*abb65b4bSAndroid Build Coastguard Worker {
92*abb65b4bSAndroid Build Coastguard Worker int sad = 0;
93*abb65b4bSAndroid Build Coastguard Worker /* all 128 bit registers are named with a suffix mxnb, where m is the */
94*abb65b4bSAndroid Build Coastguard Worker /* number of n bits packed in the register */
95*abb65b4bSAndroid Build Coastguard Worker __m128i src0_8x16b, src1_8x16b, src2_8x16b, src3_8x16b;
96*abb65b4bSAndroid Build Coastguard Worker __m128i src4_8x16b, src5_8x16b, src6_8x16b, src7_8x16b;
97*abb65b4bSAndroid Build Coastguard Worker __m128i pred0_8x16b, pred1_8x16b, pred2_8x16b, pred3_8x16b;
98*abb65b4bSAndroid Build Coastguard Worker __m128i pred4_8x16b, pred5_8x16b, pred6_8x16b, pred7_8x16b;
99*abb65b4bSAndroid Build Coastguard Worker __m128i out0_8x16b, out1_8x16b, out2_8x16b, out3_8x16b;
100*abb65b4bSAndroid Build Coastguard Worker __m128i out4_8x16b, out5_8x16b, out6_8x16b, out7_8x16b;
101*abb65b4bSAndroid Build Coastguard Worker
102*abb65b4bSAndroid Build Coastguard Worker src0_8x16b = _mm_loadu_si128((__m128i*) org);
103*abb65b4bSAndroid Build Coastguard Worker org = org + s_org;
104*abb65b4bSAndroid Build Coastguard Worker src1_8x16b = _mm_loadu_si128((__m128i*) org);
105*abb65b4bSAndroid Build Coastguard Worker org = org + s_org;
106*abb65b4bSAndroid Build Coastguard Worker src2_8x16b = _mm_loadu_si128((__m128i*) org);
107*abb65b4bSAndroid Build Coastguard Worker org = org + s_org;
108*abb65b4bSAndroid Build Coastguard Worker src3_8x16b = _mm_loadu_si128((__m128i*) org);
109*abb65b4bSAndroid Build Coastguard Worker org = org + s_org;
110*abb65b4bSAndroid Build Coastguard Worker src4_8x16b = _mm_loadu_si128((__m128i*) org);
111*abb65b4bSAndroid Build Coastguard Worker org = org + s_org;
112*abb65b4bSAndroid Build Coastguard Worker src5_8x16b = _mm_loadu_si128((__m128i*) org);
113*abb65b4bSAndroid Build Coastguard Worker org = org + s_org;
114*abb65b4bSAndroid Build Coastguard Worker src6_8x16b = _mm_loadu_si128((__m128i*) org);
115*abb65b4bSAndroid Build Coastguard Worker org = org + s_org;
116*abb65b4bSAndroid Build Coastguard Worker src7_8x16b = _mm_loadu_si128((__m128i*) org);
117*abb65b4bSAndroid Build Coastguard Worker org = org + s_org;
118*abb65b4bSAndroid Build Coastguard Worker
119*abb65b4bSAndroid Build Coastguard Worker /**************** 8x8 horizontal transform *******************************/
120*abb65b4bSAndroid Build Coastguard Worker /*********************** 8x8 16 bit Transpose ************************/
121*abb65b4bSAndroid Build Coastguard Worker out3_8x16b = _mm_unpacklo_epi16(src0_8x16b, src1_8x16b);
122*abb65b4bSAndroid Build Coastguard Worker pred0_8x16b = _mm_unpacklo_epi16(src2_8x16b, src3_8x16b);
123*abb65b4bSAndroid Build Coastguard Worker out2_8x16b = _mm_unpacklo_epi16(src4_8x16b, src5_8x16b);
124*abb65b4bSAndroid Build Coastguard Worker pred3_8x16b = _mm_unpacklo_epi16(src6_8x16b, src7_8x16b);
125*abb65b4bSAndroid Build Coastguard Worker out7_8x16b = _mm_unpackhi_epi16(src0_8x16b, src1_8x16b);
126*abb65b4bSAndroid Build Coastguard Worker src2_8x16b = _mm_unpackhi_epi16(src2_8x16b, src3_8x16b);
127*abb65b4bSAndroid Build Coastguard Worker pred7_8x16b = _mm_unpackhi_epi16(src4_8x16b, src5_8x16b);
128*abb65b4bSAndroid Build Coastguard Worker src6_8x16b = _mm_unpackhi_epi16(src6_8x16b, src7_8x16b);
129*abb65b4bSAndroid Build Coastguard Worker
130*abb65b4bSAndroid Build Coastguard Worker out1_8x16b = _mm_unpacklo_epi32(out3_8x16b, pred0_8x16b);
131*abb65b4bSAndroid Build Coastguard Worker out3_8x16b = _mm_unpackhi_epi32(out3_8x16b, pred0_8x16b);
132*abb65b4bSAndroid Build Coastguard Worker pred1_8x16b = _mm_unpacklo_epi32(out2_8x16b, pred3_8x16b);
133*abb65b4bSAndroid Build Coastguard Worker pred3_8x16b = _mm_unpackhi_epi32(out2_8x16b, pred3_8x16b);
134*abb65b4bSAndroid Build Coastguard Worker out5_8x16b = _mm_unpacklo_epi32(out7_8x16b, src2_8x16b);
135*abb65b4bSAndroid Build Coastguard Worker out7_8x16b = _mm_unpackhi_epi32(out7_8x16b, src2_8x16b);
136*abb65b4bSAndroid Build Coastguard Worker pred5_8x16b = _mm_unpacklo_epi32(pred7_8x16b, src6_8x16b);
137*abb65b4bSAndroid Build Coastguard Worker pred7_8x16b = _mm_unpackhi_epi32(pred7_8x16b, src6_8x16b);
138*abb65b4bSAndroid Build Coastguard Worker
139*abb65b4bSAndroid Build Coastguard Worker out0_8x16b = _mm_unpacklo_epi64(out1_8x16b, pred1_8x16b);
140*abb65b4bSAndroid Build Coastguard Worker out1_8x16b = _mm_unpackhi_epi64(out1_8x16b, pred1_8x16b);
141*abb65b4bSAndroid Build Coastguard Worker out2_8x16b = _mm_unpacklo_epi64(out3_8x16b, pred3_8x16b);
142*abb65b4bSAndroid Build Coastguard Worker out3_8x16b = _mm_unpackhi_epi64(out3_8x16b, pred3_8x16b);
143*abb65b4bSAndroid Build Coastguard Worker out4_8x16b = _mm_unpacklo_epi64(out5_8x16b, pred5_8x16b);
144*abb65b4bSAndroid Build Coastguard Worker out5_8x16b = _mm_unpackhi_epi64(out5_8x16b, pred5_8x16b);
145*abb65b4bSAndroid Build Coastguard Worker out6_8x16b = _mm_unpacklo_epi64(out7_8x16b, pred7_8x16b);
146*abb65b4bSAndroid Build Coastguard Worker out7_8x16b = _mm_unpackhi_epi64(out7_8x16b, pred7_8x16b);
147*abb65b4bSAndroid Build Coastguard Worker /********************** 8x8 16 bit Transpose End *********************/
148*abb65b4bSAndroid Build Coastguard Worker
149*abb65b4bSAndroid Build Coastguard Worker /* r0 + r1 */
150*abb65b4bSAndroid Build Coastguard Worker pred0_8x16b = _mm_add_epi16(out0_8x16b, out1_8x16b);
151*abb65b4bSAndroid Build Coastguard Worker /* r2 + r3 */
152*abb65b4bSAndroid Build Coastguard Worker pred2_8x16b = _mm_add_epi16(out2_8x16b, out3_8x16b);
153*abb65b4bSAndroid Build Coastguard Worker /* r4 + r5 */
154*abb65b4bSAndroid Build Coastguard Worker pred4_8x16b = _mm_add_epi16(out4_8x16b, out5_8x16b);
155*abb65b4bSAndroid Build Coastguard Worker /* r6 + r7 */
156*abb65b4bSAndroid Build Coastguard Worker pred6_8x16b = _mm_add_epi16(out6_8x16b, out7_8x16b);
157*abb65b4bSAndroid Build Coastguard Worker
158*abb65b4bSAndroid Build Coastguard Worker /* r0 + r1 + r2 + r3 */
159*abb65b4bSAndroid Build Coastguard Worker pred1_8x16b = _mm_add_epi16(pred0_8x16b, pred2_8x16b);
160*abb65b4bSAndroid Build Coastguard Worker /* r4 + r5 + r6 + r7 */
161*abb65b4bSAndroid Build Coastguard Worker pred5_8x16b = _mm_add_epi16(pred4_8x16b, pred6_8x16b);
162*abb65b4bSAndroid Build Coastguard Worker /* r0 + r1 + r2 + r3 + r4 + r5 + r6 + r7 */
163*abb65b4bSAndroid Build Coastguard Worker src0_8x16b = _mm_add_epi16(pred1_8x16b, pred5_8x16b);
164*abb65b4bSAndroid Build Coastguard Worker /* r0 + r1 + r2 + r3 - r4 - r5 - r6 - r7 */
165*abb65b4bSAndroid Build Coastguard Worker src4_8x16b = _mm_sub_epi16(pred1_8x16b, pred5_8x16b);
166*abb65b4bSAndroid Build Coastguard Worker
167*abb65b4bSAndroid Build Coastguard Worker /* r0 + r1 - r2 - r3 */
168*abb65b4bSAndroid Build Coastguard Worker pred1_8x16b = _mm_sub_epi16(pred0_8x16b, pred2_8x16b);
169*abb65b4bSAndroid Build Coastguard Worker /* r4 + r5 - r6 - r7 */
170*abb65b4bSAndroid Build Coastguard Worker pred5_8x16b = _mm_sub_epi16(pred4_8x16b, pred6_8x16b);
171*abb65b4bSAndroid Build Coastguard Worker /* r0 + r1 - r2 - r3 + r4 + r5 - r6 - r7 */
172*abb65b4bSAndroid Build Coastguard Worker src2_8x16b = _mm_add_epi16(pred1_8x16b, pred5_8x16b);
173*abb65b4bSAndroid Build Coastguard Worker /* r0 + r1 - r2 - r3 - r4 - r5 + r6 + r7 */
174*abb65b4bSAndroid Build Coastguard Worker src6_8x16b = _mm_sub_epi16(pred1_8x16b, pred5_8x16b);
175*abb65b4bSAndroid Build Coastguard Worker
176*abb65b4bSAndroid Build Coastguard Worker /* r0 - r1 */
177*abb65b4bSAndroid Build Coastguard Worker pred0_8x16b = _mm_sub_epi16(out0_8x16b, out1_8x16b);
178*abb65b4bSAndroid Build Coastguard Worker /* r2 - r3 */
179*abb65b4bSAndroid Build Coastguard Worker pred2_8x16b = _mm_sub_epi16(out2_8x16b, out3_8x16b);
180*abb65b4bSAndroid Build Coastguard Worker /* r4 - r5 */
181*abb65b4bSAndroid Build Coastguard Worker pred4_8x16b = _mm_sub_epi16(out4_8x16b, out5_8x16b);
182*abb65b4bSAndroid Build Coastguard Worker /* r6 - r7 */
183*abb65b4bSAndroid Build Coastguard Worker pred6_8x16b = _mm_sub_epi16(out6_8x16b, out7_8x16b);
184*abb65b4bSAndroid Build Coastguard Worker
185*abb65b4bSAndroid Build Coastguard Worker /* r0 - r1 + r2 - r3 */
186*abb65b4bSAndroid Build Coastguard Worker pred1_8x16b = _mm_add_epi16(pred0_8x16b, pred2_8x16b);
187*abb65b4bSAndroid Build Coastguard Worker /* r4 - r5 + r6 - r7 */
188*abb65b4bSAndroid Build Coastguard Worker pred5_8x16b = _mm_add_epi16(pred4_8x16b, pred6_8x16b);
189*abb65b4bSAndroid Build Coastguard Worker /* r0 - r1 + r2 - r3 + r4 - r5 + r6 - r7 */
190*abb65b4bSAndroid Build Coastguard Worker src1_8x16b = _mm_add_epi16(pred1_8x16b, pred5_8x16b);
191*abb65b4bSAndroid Build Coastguard Worker /* r0 - r1 + r2 - r3 - r4 + r5 - r6 + r7 */
192*abb65b4bSAndroid Build Coastguard Worker src5_8x16b = _mm_sub_epi16(pred1_8x16b, pred5_8x16b);
193*abb65b4bSAndroid Build Coastguard Worker
194*abb65b4bSAndroid Build Coastguard Worker /* r0 - r1 - r2 + r3 */
195*abb65b4bSAndroid Build Coastguard Worker pred1_8x16b = _mm_sub_epi16(pred0_8x16b, pred2_8x16b);
196*abb65b4bSAndroid Build Coastguard Worker /* r4 - r5 - r6 + r7 */
197*abb65b4bSAndroid Build Coastguard Worker pred5_8x16b = _mm_sub_epi16(pred4_8x16b, pred6_8x16b);
198*abb65b4bSAndroid Build Coastguard Worker /* r0 - r1 - r2 + r3 + r4 - r5 - r6 + r7 */
199*abb65b4bSAndroid Build Coastguard Worker src3_8x16b = _mm_add_epi16(pred1_8x16b, pred5_8x16b);
200*abb65b4bSAndroid Build Coastguard Worker /* r0 - r1 - r2 + r3 - r4 + r5 + r6 - r7 */
201*abb65b4bSAndroid Build Coastguard Worker src7_8x16b = _mm_sub_epi16(pred1_8x16b, pred5_8x16b);
202*abb65b4bSAndroid Build Coastguard Worker
203*abb65b4bSAndroid Build Coastguard Worker /*********************** 8x8 16 bit Transpose ************************/
204*abb65b4bSAndroid Build Coastguard Worker out3_8x16b = _mm_unpacklo_epi16(src0_8x16b, src1_8x16b);
205*abb65b4bSAndroid Build Coastguard Worker pred0_8x16b = _mm_unpacklo_epi16(src2_8x16b, src3_8x16b);
206*abb65b4bSAndroid Build Coastguard Worker out2_8x16b = _mm_unpacklo_epi16(src4_8x16b, src5_8x16b);
207*abb65b4bSAndroid Build Coastguard Worker pred3_8x16b = _mm_unpacklo_epi16(src6_8x16b, src7_8x16b);
208*abb65b4bSAndroid Build Coastguard Worker out7_8x16b = _mm_unpackhi_epi16(src0_8x16b, src1_8x16b);
209*abb65b4bSAndroid Build Coastguard Worker src2_8x16b = _mm_unpackhi_epi16(src2_8x16b, src3_8x16b);
210*abb65b4bSAndroid Build Coastguard Worker pred7_8x16b = _mm_unpackhi_epi16(src4_8x16b, src5_8x16b);
211*abb65b4bSAndroid Build Coastguard Worker src6_8x16b = _mm_unpackhi_epi16(src6_8x16b, src7_8x16b);
212*abb65b4bSAndroid Build Coastguard Worker
213*abb65b4bSAndroid Build Coastguard Worker out1_8x16b = _mm_unpacklo_epi32(out3_8x16b, pred0_8x16b);
214*abb65b4bSAndroid Build Coastguard Worker out3_8x16b = _mm_unpackhi_epi32(out3_8x16b, pred0_8x16b);
215*abb65b4bSAndroid Build Coastguard Worker pred1_8x16b = _mm_unpacklo_epi32(out2_8x16b, pred3_8x16b);
216*abb65b4bSAndroid Build Coastguard Worker pred3_8x16b = _mm_unpackhi_epi32(out2_8x16b, pred3_8x16b);
217*abb65b4bSAndroid Build Coastguard Worker out5_8x16b = _mm_unpacklo_epi32(out7_8x16b, src2_8x16b);
218*abb65b4bSAndroid Build Coastguard Worker out7_8x16b = _mm_unpackhi_epi32(out7_8x16b, src2_8x16b);
219*abb65b4bSAndroid Build Coastguard Worker pred5_8x16b = _mm_unpacklo_epi32(pred7_8x16b, src6_8x16b);
220*abb65b4bSAndroid Build Coastguard Worker pred7_8x16b = _mm_unpackhi_epi32(pred7_8x16b, src6_8x16b);
221*abb65b4bSAndroid Build Coastguard Worker
222*abb65b4bSAndroid Build Coastguard Worker src0_8x16b = _mm_unpacklo_epi64(out1_8x16b, pred1_8x16b);
223*abb65b4bSAndroid Build Coastguard Worker src1_8x16b = _mm_unpackhi_epi64(out1_8x16b, pred1_8x16b);
224*abb65b4bSAndroid Build Coastguard Worker src2_8x16b = _mm_unpacklo_epi64(out3_8x16b, pred3_8x16b);
225*abb65b4bSAndroid Build Coastguard Worker src3_8x16b = _mm_unpackhi_epi64(out3_8x16b, pred3_8x16b);
226*abb65b4bSAndroid Build Coastguard Worker src4_8x16b = _mm_unpacklo_epi64(out5_8x16b, pred5_8x16b);
227*abb65b4bSAndroid Build Coastguard Worker src5_8x16b = _mm_unpackhi_epi64(out5_8x16b, pred5_8x16b);
228*abb65b4bSAndroid Build Coastguard Worker src6_8x16b = _mm_unpacklo_epi64(out7_8x16b, pred7_8x16b);
229*abb65b4bSAndroid Build Coastguard Worker src7_8x16b = _mm_unpackhi_epi64(out7_8x16b, pred7_8x16b);
230*abb65b4bSAndroid Build Coastguard Worker /********************** 8x8 16 bit Transpose End *********************/
231*abb65b4bSAndroid Build Coastguard Worker /**************** 8x8 horizontal transform *******************************/
232*abb65b4bSAndroid Build Coastguard Worker
233*abb65b4bSAndroid Build Coastguard Worker {
234*abb65b4bSAndroid Build Coastguard Worker __m128i out0a_8x16b, out1a_8x16b, out2a_8x16b, out3a_8x16b;
235*abb65b4bSAndroid Build Coastguard Worker __m128i out4a_8x16b, out5a_8x16b, out6a_8x16b, out7a_8x16b;
236*abb65b4bSAndroid Build Coastguard Worker __m128i tmp0_8x16b, tmp1_8x16b, tmp2_8x16b, tmp3_8x16b;
237*abb65b4bSAndroid Build Coastguard Worker __m128i tmp4_8x16b, tmp5_8x16b, tmp6_8x16b, tmp7_8x16b;
238*abb65b4bSAndroid Build Coastguard Worker
239*abb65b4bSAndroid Build Coastguard Worker /************************* 8x8 Vertical Transform*************************/
240*abb65b4bSAndroid Build Coastguard Worker tmp0_8x16b = _mm_srli_si128(src0_8x16b, 8);
241*abb65b4bSAndroid Build Coastguard Worker tmp1_8x16b = _mm_srli_si128(src1_8x16b, 8);
242*abb65b4bSAndroid Build Coastguard Worker tmp2_8x16b = _mm_srli_si128(src2_8x16b, 8);
243*abb65b4bSAndroid Build Coastguard Worker tmp3_8x16b = _mm_srli_si128(src3_8x16b, 8);
244*abb65b4bSAndroid Build Coastguard Worker tmp4_8x16b = _mm_srli_si128(src4_8x16b, 8);
245*abb65b4bSAndroid Build Coastguard Worker tmp5_8x16b = _mm_srli_si128(src5_8x16b, 8);
246*abb65b4bSAndroid Build Coastguard Worker tmp6_8x16b = _mm_srli_si128(src6_8x16b, 8);
247*abb65b4bSAndroid Build Coastguard Worker tmp7_8x16b = _mm_srli_si128(src7_8x16b, 8);
248*abb65b4bSAndroid Build Coastguard Worker
249*abb65b4bSAndroid Build Coastguard Worker /*************************First 4 pixels ********************************/
250*abb65b4bSAndroid Build Coastguard Worker src0_8x16b = _mm_cvtepi16_epi32(src0_8x16b);
251*abb65b4bSAndroid Build Coastguard Worker src1_8x16b = _mm_cvtepi16_epi32(src1_8x16b);
252*abb65b4bSAndroid Build Coastguard Worker src2_8x16b = _mm_cvtepi16_epi32(src2_8x16b);
253*abb65b4bSAndroid Build Coastguard Worker src3_8x16b = _mm_cvtepi16_epi32(src3_8x16b);
254*abb65b4bSAndroid Build Coastguard Worker src4_8x16b = _mm_cvtepi16_epi32(src4_8x16b);
255*abb65b4bSAndroid Build Coastguard Worker src5_8x16b = _mm_cvtepi16_epi32(src5_8x16b);
256*abb65b4bSAndroid Build Coastguard Worker src6_8x16b = _mm_cvtepi16_epi32(src6_8x16b);
257*abb65b4bSAndroid Build Coastguard Worker src7_8x16b = _mm_cvtepi16_epi32(src7_8x16b);
258*abb65b4bSAndroid Build Coastguard Worker
259*abb65b4bSAndroid Build Coastguard Worker /* r0 + r1 */
260*abb65b4bSAndroid Build Coastguard Worker pred0_8x16b = _mm_add_epi32(src0_8x16b, src1_8x16b);
261*abb65b4bSAndroid Build Coastguard Worker /* r2 + r3 */
262*abb65b4bSAndroid Build Coastguard Worker pred2_8x16b = _mm_add_epi32(src2_8x16b, src3_8x16b);
263*abb65b4bSAndroid Build Coastguard Worker /* r4 + r5 */
264*abb65b4bSAndroid Build Coastguard Worker pred4_8x16b = _mm_add_epi32(src4_8x16b, src5_8x16b);
265*abb65b4bSAndroid Build Coastguard Worker /* r6 + r7 */
266*abb65b4bSAndroid Build Coastguard Worker pred6_8x16b = _mm_add_epi32(src6_8x16b, src7_8x16b);
267*abb65b4bSAndroid Build Coastguard Worker
268*abb65b4bSAndroid Build Coastguard Worker /* r0 + r1 + r2 + r3 */
269*abb65b4bSAndroid Build Coastguard Worker pred1_8x16b = _mm_add_epi32(pred0_8x16b, pred2_8x16b);
270*abb65b4bSAndroid Build Coastguard Worker /* r4 + r5 + r6 + r7 */
271*abb65b4bSAndroid Build Coastguard Worker pred5_8x16b = _mm_add_epi32(pred4_8x16b, pred6_8x16b);
272*abb65b4bSAndroid Build Coastguard Worker /* r0 + r1 + r2 + r3 + r4 + r5 + r6 + r7 */
273*abb65b4bSAndroid Build Coastguard Worker out0_8x16b = _mm_add_epi32(pred1_8x16b, pred5_8x16b);
274*abb65b4bSAndroid Build Coastguard Worker /* r0 + r1 + r2 + r3 - r4 - r5 - r6 - r7 */
275*abb65b4bSAndroid Build Coastguard Worker out4_8x16b = _mm_sub_epi32(pred1_8x16b, pred5_8x16b);
276*abb65b4bSAndroid Build Coastguard Worker
277*abb65b4bSAndroid Build Coastguard Worker /* r0 + r1 - r2 - r3 */
278*abb65b4bSAndroid Build Coastguard Worker pred1_8x16b = _mm_sub_epi32(pred0_8x16b, pred2_8x16b);
279*abb65b4bSAndroid Build Coastguard Worker /* r4 + r5 - r6 - r7 */
280*abb65b4bSAndroid Build Coastguard Worker pred5_8x16b = _mm_sub_epi32(pred4_8x16b, pred6_8x16b);
281*abb65b4bSAndroid Build Coastguard Worker /* r0 + r1 - r2 - r3 + r4 + r5 - r6 - r7 */
282*abb65b4bSAndroid Build Coastguard Worker out2_8x16b = _mm_add_epi32(pred1_8x16b, pred5_8x16b);
283*abb65b4bSAndroid Build Coastguard Worker /* r0 + r1 - r2 - r3 - r4 - r5 + r6 + r7 */
284*abb65b4bSAndroid Build Coastguard Worker out6_8x16b = _mm_sub_epi32(pred1_8x16b, pred5_8x16b);
285*abb65b4bSAndroid Build Coastguard Worker
286*abb65b4bSAndroid Build Coastguard Worker /* r0 - r1 */
287*abb65b4bSAndroid Build Coastguard Worker pred0_8x16b = _mm_sub_epi32(src0_8x16b, src1_8x16b);
288*abb65b4bSAndroid Build Coastguard Worker /* r2 - r3 */
289*abb65b4bSAndroid Build Coastguard Worker pred2_8x16b = _mm_sub_epi32(src2_8x16b, src3_8x16b);
290*abb65b4bSAndroid Build Coastguard Worker /* r4 - r5 */
291*abb65b4bSAndroid Build Coastguard Worker pred4_8x16b = _mm_sub_epi32(src4_8x16b, src5_8x16b);
292*abb65b4bSAndroid Build Coastguard Worker /* r6 - r7 */
293*abb65b4bSAndroid Build Coastguard Worker pred6_8x16b = _mm_sub_epi32(src6_8x16b, src7_8x16b);
294*abb65b4bSAndroid Build Coastguard Worker
295*abb65b4bSAndroid Build Coastguard Worker /* r0 - r1 + r2 - r3 */
296*abb65b4bSAndroid Build Coastguard Worker pred1_8x16b = _mm_add_epi32(pred0_8x16b, pred2_8x16b);
297*abb65b4bSAndroid Build Coastguard Worker /* r4 - r5 + r6 - r7 */
298*abb65b4bSAndroid Build Coastguard Worker pred5_8x16b = _mm_add_epi32(pred4_8x16b, pred6_8x16b);
299*abb65b4bSAndroid Build Coastguard Worker /* r0 - r1 + r2 - r3 + r4 - r5 + r6 - r7 */
300*abb65b4bSAndroid Build Coastguard Worker out1_8x16b = _mm_add_epi32(pred1_8x16b, pred5_8x16b);
301*abb65b4bSAndroid Build Coastguard Worker /* r0 - r1 + r2 - r3 - r4 + r5 - r6 + r7 */
302*abb65b4bSAndroid Build Coastguard Worker out5_8x16b = _mm_sub_epi32(pred1_8x16b, pred5_8x16b);
303*abb65b4bSAndroid Build Coastguard Worker
304*abb65b4bSAndroid Build Coastguard Worker /* r0 - r1 - r2 + r3 */
305*abb65b4bSAndroid Build Coastguard Worker pred1_8x16b = _mm_sub_epi32(pred0_8x16b, pred2_8x16b);
306*abb65b4bSAndroid Build Coastguard Worker /* r4 - r5 - r6 + r7 */
307*abb65b4bSAndroid Build Coastguard Worker pred5_8x16b = _mm_sub_epi32(pred4_8x16b, pred6_8x16b);
308*abb65b4bSAndroid Build Coastguard Worker /* r0 - r1 - r2 + r3 + r4 - r5 - r6 + r7 */
309*abb65b4bSAndroid Build Coastguard Worker out3_8x16b = _mm_add_epi32(pred1_8x16b, pred5_8x16b);
310*abb65b4bSAndroid Build Coastguard Worker /* r0 - r1 - r2 + r3 - r4 + r5 + r6 - r7 */
311*abb65b4bSAndroid Build Coastguard Worker out7_8x16b = _mm_sub_epi32(pred1_8x16b, pred5_8x16b);
312*abb65b4bSAndroid Build Coastguard Worker /*************************First 4 pixels ********************************/
313*abb65b4bSAndroid Build Coastguard Worker
314*abb65b4bSAndroid Build Coastguard Worker /**************************Next 4 pixels *******************************/
315*abb65b4bSAndroid Build Coastguard Worker src0_8x16b = _mm_cvtepi16_epi32(tmp0_8x16b);
316*abb65b4bSAndroid Build Coastguard Worker src1_8x16b = _mm_cvtepi16_epi32(tmp1_8x16b);
317*abb65b4bSAndroid Build Coastguard Worker src2_8x16b = _mm_cvtepi16_epi32(tmp2_8x16b);
318*abb65b4bSAndroid Build Coastguard Worker src3_8x16b = _mm_cvtepi16_epi32(tmp3_8x16b);
319*abb65b4bSAndroid Build Coastguard Worker src4_8x16b = _mm_cvtepi16_epi32(tmp4_8x16b);
320*abb65b4bSAndroid Build Coastguard Worker src5_8x16b = _mm_cvtepi16_epi32(tmp5_8x16b);
321*abb65b4bSAndroid Build Coastguard Worker src6_8x16b = _mm_cvtepi16_epi32(tmp6_8x16b);
322*abb65b4bSAndroid Build Coastguard Worker src7_8x16b = _mm_cvtepi16_epi32(tmp7_8x16b);
323*abb65b4bSAndroid Build Coastguard Worker
324*abb65b4bSAndroid Build Coastguard Worker /* r0 + r1 */
325*abb65b4bSAndroid Build Coastguard Worker pred0_8x16b = _mm_add_epi32(src0_8x16b, src1_8x16b);
326*abb65b4bSAndroid Build Coastguard Worker /* r2 + r3 */
327*abb65b4bSAndroid Build Coastguard Worker pred2_8x16b = _mm_add_epi32(src2_8x16b, src3_8x16b);
328*abb65b4bSAndroid Build Coastguard Worker /* r4 + r5 */
329*abb65b4bSAndroid Build Coastguard Worker pred4_8x16b = _mm_add_epi32(src4_8x16b, src5_8x16b);
330*abb65b4bSAndroid Build Coastguard Worker /* r6 + r7 */
331*abb65b4bSAndroid Build Coastguard Worker pred6_8x16b = _mm_add_epi32(src6_8x16b, src7_8x16b);
332*abb65b4bSAndroid Build Coastguard Worker
333*abb65b4bSAndroid Build Coastguard Worker /* r0 + r1 + r2 + r3 */
334*abb65b4bSAndroid Build Coastguard Worker pred1_8x16b = _mm_add_epi32(pred0_8x16b, pred2_8x16b);
335*abb65b4bSAndroid Build Coastguard Worker /* r4 + r5 + r6 + r7 */
336*abb65b4bSAndroid Build Coastguard Worker pred5_8x16b = _mm_add_epi32(pred4_8x16b, pred6_8x16b);
337*abb65b4bSAndroid Build Coastguard Worker /* r0 + r1 + r2 + r3 + r4 + r5 + r6 + r7 */
338*abb65b4bSAndroid Build Coastguard Worker out0a_8x16b = _mm_add_epi32(pred1_8x16b, pred5_8x16b);
339*abb65b4bSAndroid Build Coastguard Worker /* r0 + r1 + r2 + r3 - r4 - r5 - r6 - r7 */
340*abb65b4bSAndroid Build Coastguard Worker out4a_8x16b = _mm_sub_epi32(pred1_8x16b, pred5_8x16b);
341*abb65b4bSAndroid Build Coastguard Worker
342*abb65b4bSAndroid Build Coastguard Worker /* r0 + r1 - r2 - r3 */
343*abb65b4bSAndroid Build Coastguard Worker pred1_8x16b = _mm_sub_epi32(pred0_8x16b, pred2_8x16b);
344*abb65b4bSAndroid Build Coastguard Worker /* r4 + r5 - r6 - r7 */
345*abb65b4bSAndroid Build Coastguard Worker pred5_8x16b = _mm_sub_epi32(pred4_8x16b, pred6_8x16b);
346*abb65b4bSAndroid Build Coastguard Worker /* r0 + r1 - r2 - r3 + r4 + r5 - r6 - r7 */
347*abb65b4bSAndroid Build Coastguard Worker out2a_8x16b = _mm_add_epi32(pred1_8x16b, pred5_8x16b);
348*abb65b4bSAndroid Build Coastguard Worker /* r0 + r1 - r2 - r3 - r4 - r5 + r6 + r7 */
349*abb65b4bSAndroid Build Coastguard Worker out6a_8x16b = _mm_sub_epi32(pred1_8x16b, pred5_8x16b);
350*abb65b4bSAndroid Build Coastguard Worker
351*abb65b4bSAndroid Build Coastguard Worker /* r0 - r1 */
352*abb65b4bSAndroid Build Coastguard Worker pred0_8x16b = _mm_sub_epi32(src0_8x16b, src1_8x16b);
353*abb65b4bSAndroid Build Coastguard Worker /* r2 - r3 */
354*abb65b4bSAndroid Build Coastguard Worker pred2_8x16b = _mm_sub_epi32(src2_8x16b, src3_8x16b);
355*abb65b4bSAndroid Build Coastguard Worker /* r4 - r5 */
356*abb65b4bSAndroid Build Coastguard Worker pred4_8x16b = _mm_sub_epi32(src4_8x16b, src5_8x16b);
357*abb65b4bSAndroid Build Coastguard Worker /* r6 - r7 */
358*abb65b4bSAndroid Build Coastguard Worker pred6_8x16b = _mm_sub_epi32(src6_8x16b, src7_8x16b);
359*abb65b4bSAndroid Build Coastguard Worker
360*abb65b4bSAndroid Build Coastguard Worker /* r0 - r1 + r2 - r3 */
361*abb65b4bSAndroid Build Coastguard Worker pred1_8x16b = _mm_add_epi32(pred0_8x16b, pred2_8x16b);
362*abb65b4bSAndroid Build Coastguard Worker /* r4 - r5 + r6 - r7 */
363*abb65b4bSAndroid Build Coastguard Worker pred5_8x16b = _mm_add_epi32(pred4_8x16b, pred6_8x16b);
364*abb65b4bSAndroid Build Coastguard Worker /* r0 - r1 + r2 - r3 + r4 - r5 + r6 - r7 */
365*abb65b4bSAndroid Build Coastguard Worker out1a_8x16b = _mm_add_epi32(pred1_8x16b, pred5_8x16b);
366*abb65b4bSAndroid Build Coastguard Worker /* r0 - r1 + r2 - r3 - r4 + r5 - r6 + r7 */
367*abb65b4bSAndroid Build Coastguard Worker out5a_8x16b = _mm_sub_epi32(pred1_8x16b, pred5_8x16b);
368*abb65b4bSAndroid Build Coastguard Worker
369*abb65b4bSAndroid Build Coastguard Worker /* r0 - r1 - r2 + r3 */
370*abb65b4bSAndroid Build Coastguard Worker pred1_8x16b = _mm_sub_epi32(pred0_8x16b, pred2_8x16b);
371*abb65b4bSAndroid Build Coastguard Worker /* r4 - r5 - r6 + r7 */
372*abb65b4bSAndroid Build Coastguard Worker pred5_8x16b = _mm_sub_epi32(pred4_8x16b, pred6_8x16b);
373*abb65b4bSAndroid Build Coastguard Worker /* r0 - r1 - r2 + r3 + r4 - r5 - r6 + r7 */
374*abb65b4bSAndroid Build Coastguard Worker out3a_8x16b = _mm_add_epi32(pred1_8x16b, pred5_8x16b);
375*abb65b4bSAndroid Build Coastguard Worker /* r0 - r1 - r2 + r3 - r4 + r5 + r6 - r7 */
376*abb65b4bSAndroid Build Coastguard Worker out7a_8x16b = _mm_sub_epi32(pred1_8x16b, pred5_8x16b);
377*abb65b4bSAndroid Build Coastguard Worker /**************************Next 4 pixels *******************************/
378*abb65b4bSAndroid Build Coastguard Worker /************************* 8x8 Vertical Transform*************************/
379*abb65b4bSAndroid Build Coastguard Worker
380*abb65b4bSAndroid Build Coastguard Worker /****************************SATD calculation ****************************/
381*abb65b4bSAndroid Build Coastguard Worker src0_8x16b = _mm_abs_epi32(out0_8x16b);
382*abb65b4bSAndroid Build Coastguard Worker src1_8x16b = _mm_abs_epi32(out1_8x16b);
383*abb65b4bSAndroid Build Coastguard Worker src2_8x16b = _mm_abs_epi32(out2_8x16b);
384*abb65b4bSAndroid Build Coastguard Worker src3_8x16b = _mm_abs_epi32(out3_8x16b);
385*abb65b4bSAndroid Build Coastguard Worker src4_8x16b = _mm_abs_epi32(out4_8x16b);
386*abb65b4bSAndroid Build Coastguard Worker src5_8x16b = _mm_abs_epi32(out5_8x16b);
387*abb65b4bSAndroid Build Coastguard Worker src6_8x16b = _mm_abs_epi32(out6_8x16b);
388*abb65b4bSAndroid Build Coastguard Worker src7_8x16b = _mm_abs_epi32(out7_8x16b);
389*abb65b4bSAndroid Build Coastguard Worker
390*abb65b4bSAndroid Build Coastguard Worker s32* p = (s32*)&src0_8x16b;
391*abb65b4bSAndroid Build Coastguard Worker p[0] = 0;
392*abb65b4bSAndroid Build Coastguard Worker
393*abb65b4bSAndroid Build Coastguard Worker src0_8x16b = _mm_add_epi32(src0_8x16b, src1_8x16b);
394*abb65b4bSAndroid Build Coastguard Worker src2_8x16b = _mm_add_epi32(src2_8x16b, src3_8x16b);
395*abb65b4bSAndroid Build Coastguard Worker src4_8x16b = _mm_add_epi32(src4_8x16b, src5_8x16b);
396*abb65b4bSAndroid Build Coastguard Worker src6_8x16b = _mm_add_epi32(src6_8x16b, src7_8x16b);
397*abb65b4bSAndroid Build Coastguard Worker
398*abb65b4bSAndroid Build Coastguard Worker src0_8x16b = _mm_add_epi32(src0_8x16b, src2_8x16b);
399*abb65b4bSAndroid Build Coastguard Worker src4_8x16b = _mm_add_epi32(src4_8x16b, src6_8x16b);
400*abb65b4bSAndroid Build Coastguard Worker
401*abb65b4bSAndroid Build Coastguard Worker src0_8x16b = _mm_add_epi32(src0_8x16b, src4_8x16b);
402*abb65b4bSAndroid Build Coastguard Worker
403*abb65b4bSAndroid Build Coastguard Worker src0_8x16b = _mm_hadd_epi32(src0_8x16b, src0_8x16b);
404*abb65b4bSAndroid Build Coastguard Worker src0_8x16b = _mm_hadd_epi32(src0_8x16b, src0_8x16b);
405*abb65b4bSAndroid Build Coastguard Worker
406*abb65b4bSAndroid Build Coastguard Worker sad += _mm_cvtsi128_si32(src0_8x16b);
407*abb65b4bSAndroid Build Coastguard Worker
408*abb65b4bSAndroid Build Coastguard Worker src0_8x16b = _mm_abs_epi32(out0a_8x16b);
409*abb65b4bSAndroid Build Coastguard Worker src1_8x16b = _mm_abs_epi32(out1a_8x16b);
410*abb65b4bSAndroid Build Coastguard Worker src2_8x16b = _mm_abs_epi32(out2a_8x16b);
411*abb65b4bSAndroid Build Coastguard Worker src3_8x16b = _mm_abs_epi32(out3a_8x16b);
412*abb65b4bSAndroid Build Coastguard Worker src4_8x16b = _mm_abs_epi32(out4a_8x16b);
413*abb65b4bSAndroid Build Coastguard Worker src5_8x16b = _mm_abs_epi32(out5a_8x16b);
414*abb65b4bSAndroid Build Coastguard Worker src6_8x16b = _mm_abs_epi32(out6a_8x16b);
415*abb65b4bSAndroid Build Coastguard Worker src7_8x16b = _mm_abs_epi32(out7a_8x16b);
416*abb65b4bSAndroid Build Coastguard Worker
417*abb65b4bSAndroid Build Coastguard Worker src0_8x16b = _mm_add_epi32(src0_8x16b, src1_8x16b);
418*abb65b4bSAndroid Build Coastguard Worker src2_8x16b = _mm_add_epi32(src2_8x16b, src3_8x16b);
419*abb65b4bSAndroid Build Coastguard Worker src4_8x16b = _mm_add_epi32(src4_8x16b, src5_8x16b);
420*abb65b4bSAndroid Build Coastguard Worker src6_8x16b = _mm_add_epi32(src6_8x16b, src7_8x16b);
421*abb65b4bSAndroid Build Coastguard Worker
422*abb65b4bSAndroid Build Coastguard Worker src0_8x16b = _mm_add_epi32(src0_8x16b, src2_8x16b);
423*abb65b4bSAndroid Build Coastguard Worker src4_8x16b = _mm_add_epi32(src4_8x16b, src6_8x16b);
424*abb65b4bSAndroid Build Coastguard Worker
425*abb65b4bSAndroid Build Coastguard Worker src0_8x16b = _mm_add_epi32(src0_8x16b, src4_8x16b);
426*abb65b4bSAndroid Build Coastguard Worker
427*abb65b4bSAndroid Build Coastguard Worker src0_8x16b = _mm_hadd_epi32(src0_8x16b, src0_8x16b);
428*abb65b4bSAndroid Build Coastguard Worker src0_8x16b = _mm_hadd_epi32(src0_8x16b, src0_8x16b);
429*abb65b4bSAndroid Build Coastguard Worker
430*abb65b4bSAndroid Build Coastguard Worker sad += _mm_cvtsi128_si32(src0_8x16b);
431*abb65b4bSAndroid Build Coastguard Worker
432*abb65b4bSAndroid Build Coastguard Worker sad = (sad + 2) >> 2;
433*abb65b4bSAndroid Build Coastguard Worker
434*abb65b4bSAndroid Build Coastguard Worker return sad;
435*abb65b4bSAndroid Build Coastguard Worker }
436*abb65b4bSAndroid Build Coastguard Worker }
437*abb65b4bSAndroid Build Coastguard Worker #endif /* X86_SSE */