; xref: /aosp_15_r20/external/libaom/aom_dsp/x86/sad_sse2.asm (revision 77c1e3ccc04c968bd2bc212e87364f250e820521)
;
; Copyright (c) 2016, Alliance for Open Media. All rights reserved.
;
; This source code is subject to the terms of the BSD 2 Clause License and
; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
; was not distributed with this source code in the LICENSE file, you can
; obtain it at www.aomedia.org/license/software. If the Alliance for Open
; Media Patent License 1.0 was not distributed with this source code in the
; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
;

;

; x86inc.asm supplies the cglobal / INIT_XMM / RET / movsxdifnidn macros and
; the mN / rN register aliases used throughout this file.
%include "third_party/x86inc/x86inc.asm"

SECTION .text

; SAD_FN: common prologue for every SAD kernel in this file.  It declares the
; function with cglobal, names its arguments and prepares the strides.
;
; Macro Arguments
; Arg 1: Width
; Arg 2: Height
; Arg 3: General purpose register count requested by the caller: 5, or 7 when
;        the kernel also wants src_stride3/ref_stride3 (three times the stride)
; Arg 4: Type of function: if 0, normal sad; if 1, avg; if 2, skip rows
;
; Names made available to the calling macro: src, src_stride, ref,
; ref_stride, n_rows (declared but NOT initialized here), second_pred for the
; avg type, and src_stride3/ref_stride3 when Arg 3 is 7.
%macro SAD_FN 4
%if %4 == 0 ; normal sad
%if %3 == 5
cglobal sad%1x%2, 4, %3, 5, src, src_stride, ref, ref_stride, n_rows
%else ; %3 == 7
cglobal sad%1x%2, 4, %3, 6, src, src_stride, ref, ref_stride, \
                            src_stride3, ref_stride3, n_rows
%endif ; %3 == 5/7

%elif %4 == 2 ; skip
%if %3 == 5
cglobal sad_skip_%1x%2, 4, %3, 5, src, src_stride, ref, ref_stride, n_rows
%else ; %3 == 7
cglobal sad_skip_%1x%2, 4, %3, 6, src, src_stride, ref, ref_stride, \
                            src_stride3, ref_stride3, n_rows
%endif ; %3 == 5/7

%else ; avg
%if %3 == 5
; second_pred occupies a fifth argument, so one more register is requested
; to keep a slot for n_rows.
cglobal sad%1x%2_avg, 5, 1 + %3, 5, src, src_stride, ref, ref_stride, \
                                    second_pred, n_rows
%else ; %3 == 7
cglobal sad%1x%2_avg, 5, AOM_ARCH_X86_64 + %3, 6, src, src_stride, \
                                              ref, ref_stride, \
                                              second_pred, \
                                              src_stride3, ref_stride3
; n_rows is not part of the argument list above, so the row counter is bound
; by hand: register r7 on x86-64, the stack slot r0m on x86-32.
; NOTE(review): reusing r0m presumably relies on cglobal having already
; loaded src from that slot into a register -- confirm against x86inc.asm.
%if AOM_ARCH_X86_64
%define n_rowsd r7d
%else ; x86-32
%define n_rowsd dword r0m
%endif ; x86-32/64
%endif ; %3 == 5/7
%endif ; sad/avg/skip
%if %4 == 2; skip rows so double the stride
  lea          src_strided, [src_strided*2]
  lea          ref_strided, [ref_strided*2]
%endif ; %4 skip
  ; Widen the 32-bit strides so they can be used in address arithmetic
  ; (movsxdifnidn is provided by x86inc.asm).
  movsxdifnidn src_strideq, src_strided
  movsxdifnidn ref_strideq, ref_strided
%if %3 == 7
  ; stride * 3, used to address the fourth row of each group of four.
  lea         src_stride3q, [src_strideq*3]
  lea         ref_stride3q, [ref_strideq*3]
%endif ; %3 == 7
%endmacro

; unsigned int aom_sad128x128_sse2(uint8_t *src, int src_stride,
;                                  uint8_t *ref, int ref_stride);
;
; SAD128XN height [, type]
;   height: number of rows in the block
;   type:   0 = plain sad (default), 1 = avg with second_pred, 2 = skip rows
;
; One 128-byte row is processed per loop iteration, as two 64-byte halves.
; ref is loaded with movu; src and second_pred are used directly as memory
; operands of psadbw/pavgb, which in non-VEX SSE2 code requires them to be
; 16-byte aligned.  second_pred is read as a packed buffer, 128 bytes per row.
; The 32-bit sum is returned in eax.
%macro SAD128XN 1-2 0
  SAD_FN 128, %1, 5, %2
%if %2 == 2
  mov              n_rowsd, %1/2          ; strides are doubled: visit every other row
%else
  mov              n_rowsd, %1
%endif
  pxor                  m0, m0            ; m0 = running sum

.loop:
  ; Bytes 0..63 of the current row.
  movu                  m1, [refq]
  movu                  m2, [refq+16]
  movu                  m3, [refq+32]
  movu                  m4, [refq+48]
%if %2 == 1
  pavgb                 m1, [second_predq+mmsize*0]
  pavgb                 m2, [second_predq+mmsize*1]
  pavgb                 m3, [second_predq+mmsize*2]
  pavgb                 m4, [second_predq+mmsize*3]
%endif
  psadbw                m1, [srcq]
  psadbw                m2, [srcq+16]
  psadbw                m3, [srcq+32]
  psadbw                m4, [srcq+48]

  paddd                 m1, m2
  paddd                 m3, m4
  paddd                 m0, m1
  paddd                 m0, m3

  ; Bytes 64..127 of the current row.
  movu                  m1, [refq+64]
  movu                  m2, [refq+80]
  movu                  m3, [refq+96]
  movu                  m4, [refq+112]
%if %2 == 1
  pavgb                 m1, [second_predq+mmsize*4]
  pavgb                 m2, [second_predq+mmsize*5]
  pavgb                 m3, [second_predq+mmsize*6]
  pavgb                 m4, [second_predq+mmsize*7]
  lea         second_predq, [second_predq+mmsize*8]
%endif
  psadbw                m1, [srcq+64]
  psadbw                m2, [srcq+80]
  psadbw                m3, [srcq+96]
  psadbw                m4, [srcq+112]

  add                 refq, ref_strideq
  add                 srcq, src_strideq

  paddd                 m1, m2
  paddd                 m3, m4
  paddd                 m0, m1
  paddd                 m0, m3

  sub              n_rowsd, 1
  jg .loop

  ; Add the upper 64-bit half of the accumulator to the lower half.
  movhlps               m1, m0
  paddd                 m0, m1
%if %2 == 2 ; we skipped rows, so now we need to double the sad
  pslld                 m0, 1
%endif
  movd                 eax, m0
  RET
%endmacro

; 128-wide kernels.  The trailing comments give the cglobal name plus the
; sse2 suffix selected by INIT_XMM.
INIT_XMM sse2
SAD128XN 128     ; sad128x128_sse2
SAD128XN 128, 1  ; sad128x128_avg_sse2
SAD128XN 128, 2  ; sad_skip_128x128_sse2
SAD128XN 64      ; sad128x64_sse2
SAD128XN 64, 1   ; sad128x64_avg_sse2
SAD128XN 64, 2   ; sad_skip_128x64_sse2


; unsigned int aom_sad64x64_sse2(uint8_t *src, int src_stride,
;                                uint8_t *ref, int ref_stride);
;
; SAD64XN height [, type]
;   height: number of rows in the block
;   type:   0 = plain sad (default), 1 = avg with second_pred, 2 = skip rows
;
; One 64-byte row is processed per loop iteration.  ref is loaded with movu;
; src and second_pred are used directly as psadbw/pavgb memory operands,
; which in non-VEX SSE2 code requires them to be 16-byte aligned.
; second_pred is read as a packed buffer, 64 bytes per row.
; The 32-bit sum is returned in eax.
%macro SAD64XN 1-2 0
  SAD_FN 64, %1, 5, %2
%if %2 == 2
  mov              n_rowsd, %1/2          ; strides are doubled: visit every other row
%else
  mov              n_rowsd, %1
%endif
  pxor                  m0, m0            ; m0 = running sum
.loop:
  movu                  m1, [refq]
  movu                  m2, [refq+16]
  movu                  m3, [refq+32]
  movu                  m4, [refq+48]
%if %2 == 1
  pavgb                 m1, [second_predq+mmsize*0]
  pavgb                 m2, [second_predq+mmsize*1]
  pavgb                 m3, [second_predq+mmsize*2]
  pavgb                 m4, [second_predq+mmsize*3]
  lea         second_predq, [second_predq+mmsize*4]
%endif
  psadbw                m1, [srcq]
  psadbw                m2, [srcq+16]
  psadbw                m3, [srcq+32]
  psadbw                m4, [srcq+48]
  paddd                 m1, m2
  paddd                 m3, m4
  add                 refq, ref_strideq
  paddd                 m0, m1
  add                 srcq, src_strideq
  paddd                 m0, m3
  dec              n_rowsd
  jg .loop

  ; Add the upper 64-bit half of the accumulator to the lower half.
  movhlps               m1, m0
  paddd                 m0, m1
%if %2 == 2 ; we skipped rows, so now we need to double the sad
  pslld                 m0, 1
%endif
  movd                 eax, m0
  RET
%endmacro

; 64-wide kernels.  The trailing comments give the cglobal name plus the
; sse2 suffix selected by INIT_XMM.
INIT_XMM sse2
SAD64XN 128     ; sad64x128_sse2
SAD64XN  64     ; sad64x64_sse2
SAD64XN  32     ; sad64x32_sse2
SAD64XN 128, 1  ; sad64x128_avg_sse2
SAD64XN  64, 1  ; sad64x64_avg_sse2
SAD64XN  32, 1  ; sad64x32_avg_sse2
SAD64XN 128, 2  ; sad_skip_64x128_sse2
SAD64XN  64, 2  ; sad_skip_64x64_sse2
SAD64XN  32, 2  ; sad_skip_64x32_sse2
%if CONFIG_REALTIME_ONLY==0
SAD64XN  16     ; sad64x16_sse2
SAD64XN  16, 1  ; sad64x16_avg_sse2
SAD64XN  16, 2  ; sad_skip_64x16_sse2
%endif

; unsigned int aom_sad32x32_sse2(uint8_t *src, int src_stride,
;                                uint8_t *ref, int ref_stride);
;
; SAD32XN height [, type]
;   height: number of rows in the block
;   type:   0 = plain sad (default), 1 = avg with second_pred, 2 = skip rows
;
; Two 32-byte rows are processed per loop iteration, so the loop count is
; height/2 (height/4 for the skip type, whose strides are doubled).
; ref is loaded with movu; src and second_pred are used directly as
; psadbw/pavgb memory operands, which in non-VEX SSE2 code requires them to
; be 16-byte aligned.  second_pred is read as a packed buffer, 32 bytes per
; row.  The 32-bit sum is returned in eax.
%macro SAD32XN 1-2 0
  SAD_FN 32, %1, 5, %2
%if %2 == 2
  mov              n_rowsd, %1/4
%else
  mov              n_rowsd, %1/2
%endif
  pxor                  m0, m0            ; m0 = running sum
.loop:
  movu                  m1, [refq]                  ; row 0, bytes 0..15
  movu                  m2, [refq+16]               ; row 0, bytes 16..31
  movu                  m3, [refq+ref_strideq]      ; row 1, bytes 0..15
  movu                  m4, [refq+ref_strideq+16]   ; row 1, bytes 16..31
%if %2 == 1
  pavgb                 m1, [second_predq+mmsize*0]
  pavgb                 m2, [second_predq+mmsize*1]
  pavgb                 m3, [second_predq+mmsize*2]
  pavgb                 m4, [second_predq+mmsize*3]
  lea         second_predq, [second_predq+mmsize*4]
%endif
  psadbw                m1, [srcq]
  psadbw                m2, [srcq+16]
  psadbw                m3, [srcq+src_strideq]
  psadbw                m4, [srcq+src_strideq+16]
  paddd                 m1, m2
  paddd                 m3, m4
  lea                 refq, [refq+ref_strideq*2]
  paddd                 m0, m1
  lea                 srcq, [srcq+src_strideq*2]
  paddd                 m0, m3
  dec              n_rowsd
  jg .loop

  ; Add the upper 64-bit half of the accumulator to the lower half.
  movhlps               m1, m0
  paddd                 m0, m1
%if %2 == 2 ; we skipped rows, so now we need to double the sad
  pslld                 m0, 1
%endif
  movd                 eax, m0
  RET
%endmacro

; 32-wide kernels.  The trailing comments give the cglobal name plus the
; sse2 suffix selected by INIT_XMM.
INIT_XMM sse2
SAD32XN 64    ; sad32x64_sse2
SAD32XN 32    ; sad32x32_sse2
SAD32XN 16    ; sad32x16_sse2
SAD32XN 64, 1 ; sad32x64_avg_sse2
SAD32XN 32, 1 ; sad32x32_avg_sse2
SAD32XN 16, 1 ; sad32x16_avg_sse2
SAD32XN 64, 2 ; sad_skip_32x64_sse2
SAD32XN 32, 2 ; sad_skip_32x32_sse2
SAD32XN 16, 2 ; sad_skip_32x16_sse2
%if CONFIG_REALTIME_ONLY==0
SAD32XN  8    ; sad32x8_sse2
SAD32XN  8, 1 ; sad32x8_avg_sse2
SAD32XN  8, 2 ; sad_skip_32x8_sse2
%endif

; unsigned int aom_sad16x{8,16}_sse2(uint8_t *src, int src_stride,
;                                    uint8_t *ref, int ref_stride);
;
; SAD16XN height [, type]
;   height: number of rows in the block
;   type:   0 = plain sad (default), 1 = avg with second_pred, 2 = skip rows
;
; Four 16-byte rows are processed per loop iteration, so the loop count is
; height/4 (height/8 for the skip type, whose strides are doubled).
; ref is loaded with movu; src and second_pred are used directly as
; psadbw/pavgb memory operands, which in non-VEX SSE2 code requires them to
; be 16-byte aligned.  second_pred is read as a packed buffer, 16 bytes per
; row.  The 32-bit sum is returned in eax.
%macro SAD16XN 1-2 0
  SAD_FN 16, %1, 7, %2
%if %2 == 2
  mov              n_rowsd, %1/8
%else
  mov              n_rowsd, %1/4
%endif
  pxor                  m0, m0            ; m0 = running sum

.loop:
  movu                  m1, [refq]                  ; row 0
  movu                  m2, [refq+ref_strideq]      ; row 1
  movu                  m3, [refq+ref_strideq*2]    ; row 2
  movu                  m4, [refq+ref_stride3q]     ; row 3
%if %2 == 1
  pavgb                 m1, [second_predq+mmsize*0]
  pavgb                 m2, [second_predq+mmsize*1]
  pavgb                 m3, [second_predq+mmsize*2]
  pavgb                 m4, [second_predq+mmsize*3]
  lea         second_predq, [second_predq+mmsize*4]
%endif
  psadbw                m1, [srcq]
  psadbw                m2, [srcq+src_strideq]
  psadbw                m3, [srcq+src_strideq*2]
  psadbw                m4, [srcq+src_stride3q]
  paddd                 m1, m2
  paddd                 m3, m4
  lea                 refq, [refq+ref_strideq*4]
  paddd                 m0, m1
  lea                 srcq, [srcq+src_strideq*4]
  paddd                 m0, m3
  dec              n_rowsd
  jg .loop

  ; Add the upper 64-bit half of the accumulator to the lower half.
  movhlps               m1, m0
  paddd                 m0, m1
%if %2 == 2 ; we skipped rows, so now we need to double the sad
  pslld                 m0, 1
%endif
  movd                 eax, m0
  RET
%endmacro

; 16-wide kernels.  The trailing comments give the cglobal name plus the
; sse2 suffix selected by INIT_XMM.
; No skip variant is emitted for height 4: SAD16XN would compute a loop
; count of 4/8 = 0 for it.
INIT_XMM sse2
SAD16XN 32    ; sad16x32_sse2
SAD16XN 16    ; sad16x16_sse2
SAD16XN  8    ; sad16x8_sse2
SAD16XN 32, 1 ; sad16x32_avg_sse2
SAD16XN 16, 1 ; sad16x16_avg_sse2
SAD16XN  8, 1 ; sad16x8_avg_sse2
SAD16XN 32, 2 ; sad_skip_16x32_sse2
SAD16XN 16, 2 ; sad_skip_16x16_sse2
SAD16XN  8, 2 ; sad_skip_16x8_sse2
%if CONFIG_REALTIME_ONLY==0
SAD16XN 64    ; sad16x64_sse2
SAD16XN  4    ; sad16x4_sse2
SAD16XN 64, 1 ; sad16x64_avg_sse2
SAD16XN  4, 1 ; sad16x4_avg_sse2
SAD16XN 64, 2 ; sad_skip_16x64_sse2
%endif

; unsigned int aom_sad8x{8,16}_sse2(uint8_t *src, int src_stride,
;                                   uint8_t *ref, int ref_stride);
;
; SAD8XN height [, type]
;   height: number of rows in the block
;   type:   0 = plain sad (default), 1 = avg with second_pred, 2 = skip rows
;
; Four 8-byte rows are processed per loop iteration, two rows packed into
; each xmm register, so the loop count is height/4 (height/8 for the skip
; type, whose strides are doubled).  src and ref are both loaded with
; movh/movhps; only second_pred is used directly as a pavgb memory operand,
; which in non-VEX SSE2 code requires it to be 16-byte aligned.  second_pred
; is read as a packed buffer, 8 bytes per row.
; The 32-bit sum is returned in eax.
%macro SAD8XN 1-2 0
  SAD_FN 8, %1, 7, %2
%if %2 == 2
  mov              n_rowsd, %1/8
%else
  mov              n_rowsd, %1/4
%endif
  pxor                  m0, m0            ; m0 = running sum

.loop:
  movh                  m1, [refq]                  ; m1 low  = ref row 0
  movhps                m1, [refq+ref_strideq]      ; m1 high = ref row 1
  movh                  m2, [refq+ref_strideq*2]    ; m2 low  = ref row 2
  movhps                m2, [refq+ref_stride3q]     ; m2 high = ref row 3
%if %2 == 1
  pavgb                 m1, [second_predq+mmsize*0]
  pavgb                 m2, [second_predq+mmsize*1]
  lea         second_predq, [second_predq+mmsize*2]
%endif
  movh                  m3, [srcq]                  ; m3 = src rows 0,1
  movhps                m3, [srcq+src_strideq]
  movh                  m4, [srcq+src_strideq*2]    ; m4 = src rows 2,3
  movhps                m4, [srcq+src_stride3q]
  psadbw                m1, m3
  psadbw                m2, m4
  lea                 refq, [refq+ref_strideq*4]
  paddd                 m0, m1
  lea                 srcq, [srcq+src_strideq*4]
  paddd                 m0, m2
  dec              n_rowsd
  jg .loop

  ; Add the upper 64-bit half of the accumulator to the lower half.
  movhlps               m1, m0
  paddd                 m0, m1
%if %2 == 2 ; we skipped rows, so now we need to double the sad
  pslld                 m0, 1
%endif
  movd                 eax, m0
  RET
%endmacro

; 8-wide kernels.  The trailing comments give the cglobal name plus the
; sse2 suffix selected by INIT_XMM.
; No skip variant is emitted for height 4: SAD8XN would compute a loop count
; of 4/8 = 0 for it.
INIT_XMM sse2
SAD8XN 16    ; sad8x16_sse2
SAD8XN  8    ; sad8x8_sse2
SAD8XN  4    ; sad8x4_sse2
SAD8XN 16, 1 ; sad8x16_avg_sse2
SAD8XN  8, 1 ; sad8x8_avg_sse2
SAD8XN  4, 1 ; sad8x4_avg_sse2
SAD8XN 16, 2 ; sad_skip_8x16_sse2
SAD8XN  8, 2 ; sad_skip_8x8_sse2
%if CONFIG_REALTIME_ONLY==0
SAD8XN 32    ; sad8x32_sse2
SAD8XN 32, 1 ; sad8x32_avg_sse2
SAD8XN 32, 2 ; sad_skip_8x32_sse2
%endif

; unsigned int aom_sad4x{4, 8}_sse2(uint8_t *src, int src_stride,
;                                   uint8_t *ref, int ref_stride);
;
; SAD4XN height [, type]
;   height: number of rows in the block
;   type:   0 = plain sad (default), 1 = avg with second_pred, 2 = skip rows
;
; Four 4-byte rows are gathered into a single xmm register per loop
; iteration, so the loop count is height/4 (height/8 for the skip type,
; whose strides are doubled).  src and ref are both loaded with movd; only
; second_pred is used directly as a pavgb memory operand, which in non-VEX
; SSE2 code requires it to be 16-byte aligned.  second_pred is read as a
; packed buffer, 4 bytes per row.  The 32-bit sum is returned in eax.
%macro SAD4XN 1-2 0
  SAD_FN 4, %1, 7, %2
%if %2 == 2
  mov              n_rowsd, %1/8
%else
  mov              n_rowsd, %1/4
%endif
  pxor                  m0, m0            ; m0 = running sum

.loop:
  ; Pack ref rows 0..3 into m1 (row 0 in the lowest dword).
  movd                  m1, [refq]
  movd                  m2, [refq+ref_strideq]
  movd                  m3, [refq+ref_strideq*2]
  movd                  m4, [refq+ref_stride3q]
  punpckldq             m1, m2
  punpckldq             m3, m4
  movlhps               m1, m3
%if %2 == 1
  pavgb                 m1, [second_predq+mmsize*0]
  lea         second_predq, [second_predq+mmsize*1]
%endif
  ; Pack src rows 0..3 into m2 in the same order.
  movd                  m2, [srcq]
  movd                  m5, [srcq+src_strideq]
  movd                  m4, [srcq+src_strideq*2]
  movd                  m3, [srcq+src_stride3q]
  punpckldq             m2, m5
  punpckldq             m4, m3
  movlhps               m2, m4
  psadbw                m1, m2
  lea                 refq, [refq+ref_strideq*4]
  paddd                 m0, m1
  lea                 srcq, [srcq+src_strideq*4]
  dec              n_rowsd
  jg .loop

  ; Add the upper 64-bit half of the accumulator to the lower half.
  movhlps               m1, m0
  paddd                 m0, m1
%if %2 == 2 ; we skipped rows, so now we need to double the sad
  pslld                 m0, 1
%endif
  movd                 eax, m0
  RET
%endmacro

; 4-wide kernels.  The trailing comments give the cglobal name plus the
; sse2 suffix selected by INIT_XMM.
; No skip variant is emitted for height 4: SAD4XN would compute a loop count
; of 4/8 = 0 for it.
INIT_XMM sse2
SAD4XN  8    ; sad4x8_sse2
SAD4XN  4    ; sad4x4_sse2
SAD4XN  8, 1 ; sad4x8_avg_sse2
SAD4XN  4, 1 ; sad4x4_avg_sse2
SAD4XN  8, 2 ; sad_skip_4x8_sse2
%if CONFIG_REALTIME_ONLY==0
SAD4XN 16    ; sad4x16_sse2
SAD4XN 16, 1 ; sad4x16_avg_sse2
SAD4XN 16, 2 ; sad_skip_4x16_sse2
%endif