; xref: /aosp_15_r20/external/libvpx/vpx_dsp/x86/sad_sse2.asm (revision fb1b10ab9aebc7c7068eedab379b749d7e3900be)
;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;

; x86inc.asm is expected to provide the cglobal / INIT_XMM / RET /
; movsxdifnidn helpers and the mN / rN register aliases used below.
%include "third_party/x86inc/x86inc.asm"

SECTION .text

; SAD_FN: emit the function prologue shared by every SAD kernel in this file.
;
; Macro Arguments
; Arg 1: Width
; Arg 2: Height
; Arg 3: Number of general purpose registers (5 or 7)
; Arg 4: Type of function: if 0, normal sad; if 1, avg; if 2, skip rows
;
; State left for the invoking macro:
;   - srcq / refq (and second_predq for avg) name the pointer arguments.
;   - src_strideq / ref_strideq hold the strides widened by movsxdifnidn;
;     for skip functions both strides have been doubled first.
;   - With 7 registers, src_stride3q / ref_stride3q hold 3 * stride.
;   - n_rowsd names the row counter. It is NOT initialised here; the
;     invoking macro must load it before entering its loop.
%macro SAD_FN 4
  ; Reject argument values the branches below would otherwise silently
  ; treat as "7 registers" or "avg".
%if %3 != 5 && %3 != 7
%error "SAD_FN: arg 3 (general purpose register count) must be 5 or 7"
%endif
%if %4 < 0 || %4 > 2
%error "SAD_FN: arg 4 (function type) must be 0 (sad), 1 (avg) or 2 (skip)"
%endif
%if %4 == 0 ; normal sad
%if %3 == 5
cglobal sad%1x%2, 4, %3, 5, src, src_stride, ref, ref_stride, n_rows
%else ; %3 == 7
cglobal sad%1x%2, 4, %3, 6, src, src_stride, ref, ref_stride, \
                            src_stride3, ref_stride3, n_rows
%endif ; %3 == 5/7

%elif %4 == 2 ; skip
%if %3 == 5
cglobal sad_skip_%1x%2, 4, %3, 5, src, src_stride, ref, ref_stride, n_rows
%else ; %3 == 7
cglobal sad_skip_%1x%2, 4, %3, 6, src, src_stride, ref, ref_stride, \
                            src_stride3, ref_stride3, n_rows
%endif ; %3 == 5/7

%else ; %4 == 1, avg: one extra argument (second_pred)
%if %3 == 5
cglobal sad%1x%2_avg, 5, 1 + %3, 5, src, src_stride, ref, ref_stride, \
                                    second_pred, n_rows
%else ; %3 == 7
cglobal sad%1x%2_avg, 5, VPX_ARCH_X86_64 + %3, 6, src, src_stride, \
                                              ref, ref_stride, \
                                              second_pred, \
                                              src_stride3, ref_stride3
  ; All seven named registers are taken, so the row counter is defined by
  ; hand: an eighth register on x86-64, otherwise the r0m operand.
  ; NOTE(review): r0m is presumably x86inc's stack slot for argument 0,
  ; reusable once src is in a register - confirm against x86inc.asm.
%if VPX_ARCH_X86_64
%define n_rowsd r7d
%else ; x86-32
%define n_rowsd dword r0m
%endif ; x86-32/64
%endif ; %3 == 5/7
%endif ; sad/avg/skip
%if %4 == 2; skip rows so double the stride
  lea          src_strided, [src_strided*2]
  lea          ref_strided, [ref_strided*2]
%endif ; %4 skip
  movsxdifnidn src_strideq, src_strided
  movsxdifnidn ref_strideq, ref_strided
%if %3 == 7
  lea         src_stride3q, [src_strideq*3]
  lea         ref_stride3q, [ref_strideq*3]
%endif ; %3 == 7
%endmacro

; unsigned int vpx_sad64x64_sse2(uint8_t *src, int src_stride,
;                                uint8_t *ref, int ref_stride);
;
; SAD64XN height [, type]
;   Arg 1: block height.
;   Arg 2: function type as for SAD_FN arg 4 (0 sad, 1 avg, 2 skip);
;          defaults to 0.
; Each loop pass handles one 64-byte row as four 16-byte vectors.
%macro SAD64XN 1-2 0
  SAD_FN 64, %1, 5, %2
%if %2 == 2
  mov              n_rowsd, %1/2       ; strides were doubled: half the rows
%else
  mov              n_rowsd, %1
%endif
  pxor                  m0, m0         ; m0 = running sum
.loop:
  movu                  m1, [refq]
  movu                  m2, [refq+16]
  movu                  m3, [refq+32]
  movu                  m4, [refq+48]
%if %2 == 1
  ; avg: average ref with second_pred, then step past the mmsize*4 bytes
  ; just consumed.
  pavgb                 m1, [second_predq+mmsize*0]
  pavgb                 m2, [second_predq+mmsize*1]
  pavgb                 m3, [second_predq+mmsize*2]
  pavgb                 m4, [second_predq+mmsize*3]
  lea         second_predq, [second_predq+mmsize*4]
%endif
  psadbw                m1, [srcq]
  psadbw                m2, [srcq+16]
  psadbw                m3, [srcq+32]
  psadbw                m4, [srcq+48]
  paddd                 m1, m2
  paddd                 m3, m4
  add                 refq, ref_strideq
  paddd                 m0, m1
  add                 srcq, src_strideq
  paddd                 m0, m3
  dec              n_rowsd
  jg .loop

  ; psadbw leaves one partial sum per 64-bit half; fold high into low.
  movhlps               m1, m0
  paddd                 m0, m1
%if %2 == 2 ; we skipped rows, so now we need to double the sad
  pslld                 m0, 1
%endif
  movd                 eax, m0
  RET
%endmacro

; 64-wide kernels. Skip variants are emitted as sad_skip_<W>x<H> (see the
; cglobal lines in SAD_FN).
INIT_XMM sse2
SAD64XN 64 ; sad64x64_sse2
SAD64XN 32 ; sad64x32_sse2
SAD64XN 64, 1 ; sad64x64_avg_sse2
SAD64XN 32, 1 ; sad64x32_avg_sse2
SAD64XN 64, 2 ; sad_skip_64x64_sse2
SAD64XN 32, 2 ; sad_skip_64x32_sse2

; unsigned int vpx_sad32x32_sse2(uint8_t *src, int src_stride,
;                                uint8_t *ref, int ref_stride);
;
; SAD32XN height [, type]
;   Arg 1: block height.
;   Arg 2: function type as for SAD_FN arg 4 (0 sad, 1 avg, 2 skip);
;          defaults to 0.
; Each loop pass handles two 32-byte rows (two 16-byte vectors per row).
%macro SAD32XN 1-2 0
  SAD_FN 32, %1, 5, %2
%if %2 == 2
  mov              n_rowsd, %1/4       ; doubled strides, two rows per pass
%else
  mov              n_rowsd, %1/2       ; two rows per pass
%endif
  pxor                  m0, m0         ; m0 = running sum
.loop:
  movu                  m1, [refq]
  movu                  m2, [refq+16]
  movu                  m3, [refq+ref_strideq]
  movu                  m4, [refq+ref_strideq+16]
%if %2 == 1
  ; avg: average ref with second_pred, then step past the mmsize*4 bytes
  ; just consumed.
  pavgb                 m1, [second_predq+mmsize*0]
  pavgb                 m2, [second_predq+mmsize*1]
  pavgb                 m3, [second_predq+mmsize*2]
  pavgb                 m4, [second_predq+mmsize*3]
  lea         second_predq, [second_predq+mmsize*4]
%endif
  psadbw                m1, [srcq]
  psadbw                m2, [srcq+16]
  psadbw                m3, [srcq+src_strideq]
  psadbw                m4, [srcq+src_strideq+16]
  paddd                 m1, m2
  paddd                 m3, m4
  lea                 refq, [refq+ref_strideq*2]
  paddd                 m0, m1
  lea                 srcq, [srcq+src_strideq*2]
  paddd                 m0, m3
  dec              n_rowsd
  jg .loop

  ; psadbw leaves one partial sum per 64-bit half; fold high into low.
  movhlps               m1, m0
  paddd                 m0, m1
%if %2 == 2 ; we skipped rows, so now we need to double the sad
  pslld                 m0, 1
%endif
  movd                 eax, m0
  RET
%endmacro

; 32-wide kernels. Skip variants are emitted as sad_skip_<W>x<H> (see the
; cglobal lines in SAD_FN).
INIT_XMM sse2
SAD32XN 64 ; sad32x64_sse2
SAD32XN 32 ; sad32x32_sse2
SAD32XN 16 ; sad32x16_sse2
SAD32XN 64, 1 ; sad32x64_avg_sse2
SAD32XN 32, 1 ; sad32x32_avg_sse2
SAD32XN 16, 1 ; sad32x16_avg_sse2
SAD32XN 64, 2 ; sad_skip_32x64_sse2
SAD32XN 32, 2 ; sad_skip_32x32_sse2
SAD32XN 16, 2 ; sad_skip_32x16_sse2

; unsigned int vpx_sad16x{8,16,32}_sse2(uint8_t *src, int src_stride,
;                                       uint8_t *ref, int ref_stride);
;
; SAD16XN height [, type]
;   Arg 1: block height.
;   Arg 2: function type as for SAD_FN arg 4 (0 sad, 1 avg, 2 skip);
;          defaults to 0.
; Each loop pass handles four 16-byte rows, using the stride*3 registers
; that SAD_FN sets up for 7-register functions.
%macro SAD16XN 1-2 0
  SAD_FN 16, %1, 7, %2
%if %2 == 2
  mov              n_rowsd, %1/8       ; doubled strides, four rows per pass
%else
  mov              n_rowsd, %1/4       ; four rows per pass
%endif
  pxor                  m0, m0         ; m0 = running sum

.loop:
  movu                  m1, [refq]
  movu                  m2, [refq+ref_strideq]
  movu                  m3, [refq+ref_strideq*2]
  movu                  m4, [refq+ref_stride3q]
%if %2 == 1
  ; avg: average ref with second_pred, then step past the mmsize*4 bytes
  ; just consumed.
  pavgb                 m1, [second_predq+mmsize*0]
  pavgb                 m2, [second_predq+mmsize*1]
  pavgb                 m3, [second_predq+mmsize*2]
  pavgb                 m4, [second_predq+mmsize*3]
  lea         second_predq, [second_predq+mmsize*4]
%endif
  psadbw                m1, [srcq]
  psadbw                m2, [srcq+src_strideq]
  psadbw                m3, [srcq+src_strideq*2]
  psadbw                m4, [srcq+src_stride3q]
  paddd                 m1, m2
  paddd                 m3, m4
  lea                 refq, [refq+ref_strideq*4]
  paddd                 m0, m1
  lea                 srcq, [srcq+src_strideq*4]
  paddd                 m0, m3
  dec              n_rowsd
  jg .loop

  ; psadbw leaves one partial sum per 64-bit half; fold high into low.
  movhlps               m1, m0
  paddd                 m0, m1
%if %2 == 2 ; we skipped rows, so now we need to double the sad
  pslld                 m0, 1
%endif
  movd                 eax, m0
  RET
%endmacro

; 16-wide kernels. Skip variants are emitted as sad_skip_<W>x<H> (see the
; cglobal lines in SAD_FN).
INIT_XMM sse2
SAD16XN 32 ; sad16x32_sse2
SAD16XN 16 ; sad16x16_sse2
SAD16XN  8 ; sad16x8_sse2
SAD16XN 32, 1 ; sad16x32_avg_sse2
SAD16XN 16, 1 ; sad16x16_avg_sse2
SAD16XN  8, 1 ; sad16x8_avg_sse2
SAD16XN 32, 2 ; sad_skip_16x32_sse2
SAD16XN 16, 2 ; sad_skip_16x16_sse2
SAD16XN  8, 2 ; sad_skip_16x8_sse2

; unsigned int vpx_sad8x{4,8,16}_sse2(uint8_t *src, int src_stride,
;                                     uint8_t *ref, int ref_stride);
;
; SAD8XN height [, type]
;   Arg 1: block height.
;   Arg 2: function type as for SAD_FN arg 4 (0 sad, 1 avg, 2 skip);
;          defaults to 0.
; Each loop pass handles four 8-byte rows, packed two rows per vector
; (movh fills the low half, movhps the high half).
%macro SAD8XN 1-2 0
  ; With skip, %1/8 is 0 for heights below 8; the loop would still run once
  ; and read rows 0, 2, 4 and 6 of a block that has fewer rows than that.
%if %2 == 2 && %1 < 8
%error "SAD8XN: the skip variant requires a height of at least 8"
%endif
  SAD_FN 8, %1, 7, %2
%if %2 == 2
  mov              n_rowsd, %1/8       ; doubled strides, four rows per pass
%else
  mov              n_rowsd, %1/4       ; four rows per pass
%endif
  pxor                  m0, m0         ; m0 = running sum

.loop:
  movh                  m1, [refq]
  movhps                m1, [refq+ref_strideq]
  movh                  m2, [refq+ref_strideq*2]
  movhps                m2, [refq+ref_stride3q]
%if %2 == 1
  ; avg: average ref with second_pred, then step past the mmsize*2 bytes
  ; just consumed.
  pavgb                 m1, [second_predq+mmsize*0]
  pavgb                 m2, [second_predq+mmsize*1]
  lea         second_predq, [second_predq+mmsize*2]
%endif
  movh                  m3, [srcq]
  movhps                m3, [srcq+src_strideq]
  movh                  m4, [srcq+src_strideq*2]
  movhps                m4, [srcq+src_stride3q]
  psadbw                m1, m3
  psadbw                m2, m4
  lea                 refq, [refq+ref_strideq*4]
  paddd                 m0, m1
  lea                 srcq, [srcq+src_strideq*4]
  paddd                 m0, m2
  dec              n_rowsd
  jg .loop

  ; psadbw leaves one partial sum per 64-bit half; fold high into low.
  movhlps               m1, m0
  paddd                 m0, m1
%if %2 == 2 ; we skipped rows, so now we need to double the sad
  pslld                 m0, 1
%endif
  movd                 eax, m0
  RET
%endmacro

; 8-wide kernels. Skip variants are emitted as sad_skip_<W>x<H> (see the
; cglobal lines in SAD_FN). There is no 8x4 skip kernel: SAD8XN computes
; the skip row counter as height/8, which is 0 for height 4.
INIT_XMM sse2
SAD8XN 16 ; sad8x16_sse2
SAD8XN  8 ; sad8x8_sse2
SAD8XN  4 ; sad8x4_sse2
SAD8XN 16, 1 ; sad8x16_avg_sse2
SAD8XN  8, 1 ; sad8x8_avg_sse2
SAD8XN  4, 1 ; sad8x4_avg_sse2
SAD8XN 16, 2 ; sad_skip_8x16_sse2
SAD8XN  8, 2 ; sad_skip_8x8_sse2

; unsigned int vpx_sad4x{4, 8}_sse2(uint8_t *src, int src_stride,
;                                   uint8_t *ref, int ref_stride);
;
; SAD4XN height [, type]
;   Arg 1: block height.
;   Arg 2: function type as for SAD_FN arg 4 (0 sad, 1 avg, 2 skip);
;          defaults to 0.
; Each loop pass gathers four 4-byte rows into one 16-byte vector per
; operand and issues a single psadbw.
%macro SAD4XN 1-2 0
  ; With skip, %1/8 is 0 for heights below 8; the loop would still run once
  ; and read rows 0, 2, 4 and 6 of a block that has fewer rows than that.
%if %2 == 2 && %1 < 8
%error "SAD4XN: the skip variant requires a height of at least 8"
%endif
  SAD_FN 4, %1, 7, %2
%if %2 == 2
  mov              n_rowsd, %1/8       ; doubled strides, four rows per pass
%else
  mov              n_rowsd, %1/4       ; four rows per pass
%endif
  pxor                  m0, m0         ; m0 = running sum

.loop:
  ; Pack ref rows 0..3 into m1 in row order.
  movd                  m1, [refq]
  movd                  m2, [refq+ref_strideq]
  movd                  m3, [refq+ref_strideq*2]
  movd                  m4, [refq+ref_stride3q]
  punpckldq             m1, m2
  punpckldq             m3, m4
  movlhps               m1, m3
%if %2 == 1
  ; avg: average ref with second_pred, then step past the mmsize bytes
  ; just consumed.
  pavgb                 m1, [second_predq+mmsize*0]
  lea         second_predq, [second_predq+mmsize*1]
%endif
  ; Pack src rows 0..3 into m2 in the same order.
  movd                  m2, [srcq]
  movd                  m5, [srcq+src_strideq]
  movd                  m4, [srcq+src_strideq*2]
  movd                  m3, [srcq+src_stride3q]
  punpckldq             m2, m5
  punpckldq             m4, m3
  movlhps               m2, m4
  psadbw                m1, m2
  lea                 refq, [refq+ref_strideq*4]
  paddd                 m0, m1
  lea                 srcq, [srcq+src_strideq*4]
  dec              n_rowsd
  jg .loop

  ; psadbw leaves one partial sum per 64-bit half; fold high into low.
  movhlps               m1, m0
  paddd                 m0, m1
%if %2 == 2 ; we skipped rows, so now we need to double the sad
  pslld                 m0, 1
%endif
  movd                 eax, m0
  RET
%endmacro

; 4-wide kernels. Skip variants are emitted as sad_skip_<W>x<H> (see the
; cglobal lines in SAD_FN). There is no 4x4 skip kernel: SAD4XN computes
; the skip row counter as height/8, which is 0 for height 4.
INIT_XMM sse2
SAD4XN  8 ; sad4x8_sse2
SAD4XN  4 ; sad4x4_sse2
SAD4XN  8, 1 ; sad4x8_avg_sse2
SAD4XN  4, 1 ; sad4x4_avg_sse2
SAD4XN  8, 2 ; sad_skip_4x8_sse2