xref: /aosp_15_r20/external/libaom/aom_dsp/x86/sad4d_sse2.asm (revision 77c1e3ccc04c968bd2bc212e87364f250e820521)
1*77c1e3ccSAndroid Build Coastguard Worker;
2*77c1e3ccSAndroid Build Coastguard Worker; Copyright (c) 2016, Alliance for Open Media. All rights reserved.
3*77c1e3ccSAndroid Build Coastguard Worker;
4*77c1e3ccSAndroid Build Coastguard Worker; This source code is subject to the terms of the BSD 2 Clause License and
5*77c1e3ccSAndroid Build Coastguard Worker; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
6*77c1e3ccSAndroid Build Coastguard Worker; was not distributed with this source code in the LICENSE file, you can
7*77c1e3ccSAndroid Build Coastguard Worker; obtain it at www.aomedia.org/license/software. If the Alliance for Open
8*77c1e3ccSAndroid Build Coastguard Worker; Media Patent License 1.0 was not distributed with this source code in the
9*77c1e3ccSAndroid Build Coastguard Worker; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
10*77c1e3ccSAndroid Build Coastguard Worker;
11*77c1e3ccSAndroid Build Coastguard Worker
12*77c1e3ccSAndroid Build Coastguard Worker;
13*77c1e3ccSAndroid Build Coastguard Worker
14*77c1e3ccSAndroid Build Coastguard Worker%include "third_party/x86inc/x86inc.asm"
15*77c1e3ccSAndroid Build Coastguard Worker
16*77c1e3ccSAndroid Build Coastguard WorkerSECTION .text
17*77c1e3ccSAndroid Build Coastguard Worker
18*77c1e3ccSAndroid Build Coastguard Worker; 'spill_src_stride' greatly affects how the code works.
19*77c1e3ccSAndroid Build Coastguard Worker;
20*77c1e3ccSAndroid Build Coastguard Worker; When 'spill_src_stride' is false, the 'src_strideq' resides in
21*77c1e3ccSAndroid Build Coastguard Worker; register, [srcq + src_strideq + offset] is allowed, so we can simply
22*77c1e3ccSAndroid Build Coastguard Worker; use such form to access src memory and don't bother to update 'srcq'
23*77c1e3ccSAndroid Build Coastguard Worker; at each line. We only update 'srcq' each two-lines using a compact
24*77c1e3ccSAndroid Build Coastguard Worker; LEA instruction like [srcq+src_strideq*2].
25*77c1e3ccSAndroid Build Coastguard Worker;
26*77c1e3ccSAndroid Build Coastguard Worker; When 'spill_src_stride' is true, the 'src_strideq' resides in memory.
27*77c1e3ccSAndroid Build Coastguard Worker; We cannot use the above form to access memory, so we have to update
28*77c1e3ccSAndroid Build Coastguard Worker; 'srcq' at each line break. As we process two parts (first,second)
29*77c1e3ccSAndroid Build Coastguard Worker; together in each macro function, the second part may also sit
30*77c1e3ccSAndroid Build Coastguard Worker; in the next line, which means we also need to possibly add
31*77c1e3ccSAndroid Build Coastguard Worker; one 'src_strideq' to 'srcq' before processing second part.
32*77c1e3ccSAndroid Build Coastguard Worker
; HANDLE_SECOND_OFFSET (no arguments)
; Prepares access to the second source row of a two-row step and (re)defines
; 'second_offset', which the PROCESS_* macros use as [srcq + second_offset].
;   spill_src_stride != 0: emits "add srcq, src_strideq", so srcq itself moves
;                          to the second row; second_offset becomes 0.
;   spill_src_stride == 0: emits no instruction; second_offset becomes
;                          (src_strideq) and srcq is left unchanged.
33*77c1e3ccSAndroid Build Coastguard Worker%macro HANDLE_SECOND_OFFSET 0
34*77c1e3ccSAndroid Build Coastguard Worker  %if spill_src_stride
35*77c1e3ccSAndroid Build Coastguard Worker    %define second_offset 0
    ; The stride lives in memory here, so step srcq to the second row directly.
36*77c1e3ccSAndroid Build Coastguard Worker    add srcq, src_strideq
37*77c1e3ccSAndroid Build Coastguard Worker  %else
    ; The stride is in a register: address the second row as [srcq + stride].
38*77c1e3ccSAndroid Build Coastguard Worker    %define second_offset (src_strideq)
39*77c1e3ccSAndroid Build Coastguard Worker  %endif
40*77c1e3ccSAndroid Build Coastguard Worker%endmacro
41*77c1e3ccSAndroid Build Coastguard Worker
42*77c1e3ccSAndroid Build Coastguard Worker; This is specifically designed to handle the case where src_strideq is a
43*77c1e3ccSAndroid Build Coastguard Worker; memory operand. In that case we cannot perform the
44*77c1e3ccSAndroid Build Coastguard Worker; complex address calculation using LEA, so we fall back to
45*77c1e3ccSAndroid Build Coastguard Worker; using a simple ADD instruction at each line ending.
; ADVANCE_END_OF_TWO_LINES (no arguments)
; Moves srcq and ref1q..ref4q past the two rows that were just processed.
; Intended to follow a PROCESS_4x2x4 / PROCESS_8x2x4 invocation.
46*77c1e3ccSAndroid Build Coastguard Worker%macro ADVANCE_END_OF_TWO_LINES 0
47*77c1e3ccSAndroid Build Coastguard Worker  %if spill_src_stride
    ; HANDLE_SECOND_OFFSET already added one stride to srcq in this
    ; configuration, so only one more stride is needed to skip two rows.
48*77c1e3ccSAndroid Build Coastguard Worker    add srcq, src_strideq
49*77c1e3ccSAndroid Build Coastguard Worker  %else
    ; srcq was not touched while processing; skip both rows at once.
50*77c1e3ccSAndroid Build Coastguard Worker    lea                 srcq, [srcq+src_strideq*2]
51*77c1e3ccSAndroid Build Coastguard Worker  %endif
52*77c1e3ccSAndroid Build Coastguard Worker
53*77c1e3ccSAndroid Build Coastguard Worker; note: ref_stride is never spilled when processing two lines
  ; Each reference pointer advances by two reference rows.
54*77c1e3ccSAndroid Build Coastguard Worker  lea                ref1q, [ref1q+ref_strideq*2]
55*77c1e3ccSAndroid Build Coastguard Worker  lea                ref2q, [ref2q+ref_strideq*2]
56*77c1e3ccSAndroid Build Coastguard Worker  lea                ref3q, [ref3q+ref_strideq*2]
57*77c1e3ccSAndroid Build Coastguard Worker  lea                ref4q, [ref4q+ref_strideq*2]
58*77c1e3ccSAndroid Build Coastguard Worker%endmacro
59*77c1e3ccSAndroid Build Coastguard Worker
60*77c1e3ccSAndroid Build Coastguard Worker; PROCESS_4x2x4 first
; Processes two rows of a 4-byte-wide block against all four references.
;   %1 == 1  : first call; m6/m7 are initialised with the psadbw results.
;   otherwise: the psadbw results are added to the running sums in m6/m7.
; Register layout on exit (one psadbw sum per 64-bit half):
;   m6 = ref1 sum (low half) | ref2 sum (high half)
;   m7 = ref3 sum (low half) | ref4 sum (high half)
; Scratch registers: m0-m5. srcq may be advanced by HANDLE_SECOND_OFFSET;
; the ref pointers are not modified here.
61*77c1e3ccSAndroid Build Coastguard Worker%macro PROCESS_4x2x4 1
  ; m0 = source row 0 (4 bytes); must be loaded before srcq may be advanced.
62*77c1e3ccSAndroid Build Coastguard Worker  movd                  m0, [srcq]
63*77c1e3ccSAndroid Build Coastguard Worker  HANDLE_SECOND_OFFSET
64*77c1e3ccSAndroid Build Coastguard Worker%if %1 == 1
  ; Row 0 of each reference goes straight into its final register.
65*77c1e3ccSAndroid Build Coastguard Worker  movd                  m6, [ref1q]
66*77c1e3ccSAndroid Build Coastguard Worker  movd                  m4, [ref2q]
67*77c1e3ccSAndroid Build Coastguard Worker  movd                  m7, [ref3q]
68*77c1e3ccSAndroid Build Coastguard Worker  movd                  m5, [ref4q]
69*77c1e3ccSAndroid Build Coastguard Worker
  ; Append row 1 to row 0 so each low 64-bit half holds row0|row1.
70*77c1e3ccSAndroid Build Coastguard Worker  movd                  m1, [srcq + second_offset]
71*77c1e3ccSAndroid Build Coastguard Worker  movd                  m2, [ref1q+ref_strideq]
72*77c1e3ccSAndroid Build Coastguard Worker  punpckldq             m0, m1
73*77c1e3ccSAndroid Build Coastguard Worker  punpckldq             m6, m2
74*77c1e3ccSAndroid Build Coastguard Worker  movd                  m1, [ref2q+ref_strideq]
75*77c1e3ccSAndroid Build Coastguard Worker  movd                  m2, [ref3q+ref_strideq]
76*77c1e3ccSAndroid Build Coastguard Worker  movd                  m3, [ref4q+ref_strideq]
77*77c1e3ccSAndroid Build Coastguard Worker  punpckldq             m4, m1
78*77c1e3ccSAndroid Build Coastguard Worker  punpckldq             m7, m2
79*77c1e3ccSAndroid Build Coastguard Worker  punpckldq             m5, m3
  ; Duplicate the source rows into both halves and pair the references:
  ; m6 = ref1 | ref2, m7 = ref3 | ref4.
80*77c1e3ccSAndroid Build Coastguard Worker  movlhps               m0, m0
81*77c1e3ccSAndroid Build Coastguard Worker  movlhps               m6, m4
82*77c1e3ccSAndroid Build Coastguard Worker  movlhps               m7, m5
  ; psadbw writes directly into m6/m7, which initialises the accumulators.
83*77c1e3ccSAndroid Build Coastguard Worker  psadbw                m6, m0
84*77c1e3ccSAndroid Build Coastguard Worker  psadbw                m7, m0
85*77c1e3ccSAndroid Build Coastguard Worker%else
  ; Build row0|row1 for each reference in m1..m4, using m5 as a temporary.
86*77c1e3ccSAndroid Build Coastguard Worker  movd                  m1, [ref1q]
87*77c1e3ccSAndroid Build Coastguard Worker  movd                  m5, [ref1q+ref_strideq]
88*77c1e3ccSAndroid Build Coastguard Worker  movd                  m2, [ref2q]
89*77c1e3ccSAndroid Build Coastguard Worker  movd                  m4, [ref2q+ref_strideq]
90*77c1e3ccSAndroid Build Coastguard Worker  punpckldq             m1, m5
91*77c1e3ccSAndroid Build Coastguard Worker  punpckldq             m2, m4
92*77c1e3ccSAndroid Build Coastguard Worker  movd                  m3, [ref3q]
93*77c1e3ccSAndroid Build Coastguard Worker  movd                  m5, [ref3q+ref_strideq]
94*77c1e3ccSAndroid Build Coastguard Worker  punpckldq             m3, m5
95*77c1e3ccSAndroid Build Coastguard Worker  movd                  m4, [ref4q]
96*77c1e3ccSAndroid Build Coastguard Worker  movd                  m5, [ref4q+ref_strideq]
97*77c1e3ccSAndroid Build Coastguard Worker  punpckldq             m4, m5
98*77c1e3ccSAndroid Build Coastguard Worker  movd                  m5, [srcq + second_offset]
99*77c1e3ccSAndroid Build Coastguard Worker  punpckldq             m0, m5
  ; m0 = src rows in both halves; m1 = ref1 | ref2; m3 = ref3 | ref4.
100*77c1e3ccSAndroid Build Coastguard Worker  movlhps               m0, m0
101*77c1e3ccSAndroid Build Coastguard Worker  movlhps               m1, m2
102*77c1e3ccSAndroid Build Coastguard Worker  movlhps               m3, m4
103*77c1e3ccSAndroid Build Coastguard Worker  psadbw                m1, m0
104*77c1e3ccSAndroid Build Coastguard Worker  psadbw                m3, m0
  ; Accumulate into the running sums set up by the "%1 == 1" call.
105*77c1e3ccSAndroid Build Coastguard Worker  paddd                 m6, m1
106*77c1e3ccSAndroid Build Coastguard Worker  paddd                 m7, m3
107*77c1e3ccSAndroid Build Coastguard Worker%endif
108*77c1e3ccSAndroid Build Coastguard Worker%endmacro
109*77c1e3ccSAndroid Build Coastguard Worker
110*77c1e3ccSAndroid Build Coastguard Worker; PROCESS_8x2x4 first
; Processes two rows of an 8-byte-wide block against all four references.
; Row 0 is loaded with movh and row 1 with movhps (high 64 bits), so each
; psadbw produces one partial sum per row. NOTE(review): movh is an x86inc
; alias, presumably the 64-bit low-half load here -- confirm in x86inc.asm.
;   %1 == 1  : first call; m4..m7 are initialised for ref1..ref4.
;   otherwise: the results are added to the running sums in m4..m7.
; Scratch registers: m0-m2. srcq may be advanced by HANDLE_SECOND_OFFSET;
; the ref pointers are not modified here.
111*77c1e3ccSAndroid Build Coastguard Worker%macro PROCESS_8x2x4 1
  ; Source row 0 must be loaded before srcq may be advanced.
112*77c1e3ccSAndroid Build Coastguard Worker  movh                  m0, [srcq]
113*77c1e3ccSAndroid Build Coastguard Worker  HANDLE_SECOND_OFFSET
114*77c1e3ccSAndroid Build Coastguard Worker%if %1 == 1
  ; Load row 0, then row 1, of every reference straight into m4..m7.
115*77c1e3ccSAndroid Build Coastguard Worker  movh                  m4, [ref1q]
116*77c1e3ccSAndroid Build Coastguard Worker  movh                  m5, [ref2q]
117*77c1e3ccSAndroid Build Coastguard Worker  movh                  m6, [ref3q]
118*77c1e3ccSAndroid Build Coastguard Worker  movh                  m7, [ref4q]
119*77c1e3ccSAndroid Build Coastguard Worker  movhps                m0, [srcq + second_offset]
120*77c1e3ccSAndroid Build Coastguard Worker  movhps                m4, [ref1q+ref_strideq]
121*77c1e3ccSAndroid Build Coastguard Worker  movhps                m5, [ref2q+ref_strideq]
122*77c1e3ccSAndroid Build Coastguard Worker  movhps                m6, [ref3q+ref_strideq]
123*77c1e3ccSAndroid Build Coastguard Worker  movhps                m7, [ref4q+ref_strideq]
  ; psadbw overwrites m4..m7, which initialises the four accumulators.
124*77c1e3ccSAndroid Build Coastguard Worker  psadbw                m4, m0
125*77c1e3ccSAndroid Build Coastguard Worker  psadbw                m5, m0
126*77c1e3ccSAndroid Build Coastguard Worker  psadbw                m6, m0
127*77c1e3ccSAndroid Build Coastguard Worker  psadbw                m7, m0
128*77c1e3ccSAndroid Build Coastguard Worker%else
  ; ref1 / ref2: build in m1 / m2, then add to m4 / m5.
129*77c1e3ccSAndroid Build Coastguard Worker  movh                  m1, [ref1q]
130*77c1e3ccSAndroid Build Coastguard Worker  movh                  m2, [ref2q]
131*77c1e3ccSAndroid Build Coastguard Worker  movhps                m0, [srcq + second_offset]
132*77c1e3ccSAndroid Build Coastguard Worker  movhps                m1, [ref1q+ref_strideq]
133*77c1e3ccSAndroid Build Coastguard Worker  movhps                m2, [ref2q+ref_strideq]
134*77c1e3ccSAndroid Build Coastguard Worker  psadbw                m1, m0
135*77c1e3ccSAndroid Build Coastguard Worker  psadbw                m2, m0
136*77c1e3ccSAndroid Build Coastguard Worker  paddd                 m4, m1
137*77c1e3ccSAndroid Build Coastguard Worker  paddd                 m5, m2
138*77c1e3ccSAndroid Build Coastguard Worker
  ; ref3 / ref4: reuse m1 / m2 as temporaries, then add to m6 / m7.
139*77c1e3ccSAndroid Build Coastguard Worker  movh                  m1, [ref3q]
140*77c1e3ccSAndroid Build Coastguard Worker  movhps                m1, [ref3q+ref_strideq]
141*77c1e3ccSAndroid Build Coastguard Worker  movh                  m2, [ref4q]
142*77c1e3ccSAndroid Build Coastguard Worker  movhps                m2, [ref4q+ref_strideq]
143*77c1e3ccSAndroid Build Coastguard Worker  psadbw                m1, m0
144*77c1e3ccSAndroid Build Coastguard Worker  psadbw                m2, m0
145*77c1e3ccSAndroid Build Coastguard Worker  paddd                 m6, m1
146*77c1e3ccSAndroid Build Coastguard Worker  paddd                 m7, m2
147*77c1e3ccSAndroid Build Coastguard Worker%endif
148*77c1e3ccSAndroid Build Coastguard Worker%endmacro
149*77c1e3ccSAndroid Build Coastguard Worker
150*77c1e3ccSAndroid Build Coastguard Worker; PROCESS_FIRST_MMSIZE
; Handles the first register-wide chunk at [srcq] / [refNq] (no offsets) and
; initialises the accumulators: on exit m4..m7 hold the psadbw results for
; ref1..ref4. Clobbers m0; no pointer is modified.
; NOTE(review): the source is read with mova while the references use movu,
; so srcq is presumably required to be aligned -- confirm against callers.
151*77c1e3ccSAndroid Build Coastguard Worker%macro PROCESS_FIRST_MMSIZE 0
152*77c1e3ccSAndroid Build Coastguard Worker  mova                  m0, [srcq]
153*77c1e3ccSAndroid Build Coastguard Worker  movu                  m4, [ref1q]
154*77c1e3ccSAndroid Build Coastguard Worker  movu                  m5, [ref2q]
155*77c1e3ccSAndroid Build Coastguard Worker  movu                  m6, [ref3q]
156*77c1e3ccSAndroid Build Coastguard Worker  movu                  m7, [ref4q]
  ; Overwrite (not accumulate): these are the first sums of the block.
157*77c1e3ccSAndroid Build Coastguard Worker  psadbw                m4, m0
158*77c1e3ccSAndroid Build Coastguard Worker  psadbw                m5, m0
159*77c1e3ccSAndroid Build Coastguard Worker  psadbw                m6, m0
160*77c1e3ccSAndroid Build Coastguard Worker  psadbw                m7, m0
161*77c1e3ccSAndroid Build Coastguard Worker%endmacro
162*77c1e3ccSAndroid Build Coastguard Worker
163*77c1e3ccSAndroid Build Coastguard Worker; PROCESS_16x1x4 offset
; Processes one register-wide chunk of a single row against all four
; references and adds the psadbw results to the running sums in m4..m7.
;   %1: byte offset of the chunk within the row.
; The source is addressed as [srcq + %1]; every reference is addressed as
; [refNq + ref_offsetq + %1], i.e. the row position is carried by the shared
; ref_offsetq rather than by the ref pointers themselves.
; Scratch registers: m0-m2. No pointer is modified.
164*77c1e3ccSAndroid Build Coastguard Worker%macro PROCESS_16x1x4 1
165*77c1e3ccSAndroid Build Coastguard Worker  mova                  m0, [srcq + %1]
  ; ref1 / ref2 -> m4 / m5
166*77c1e3ccSAndroid Build Coastguard Worker  movu                  m1, [ref1q + ref_offsetq + %1]
167*77c1e3ccSAndroid Build Coastguard Worker  movu                  m2, [ref2q + ref_offsetq + %1]
168*77c1e3ccSAndroid Build Coastguard Worker  psadbw                m1, m0
169*77c1e3ccSAndroid Build Coastguard Worker  psadbw                m2, m0
170*77c1e3ccSAndroid Build Coastguard Worker  paddd                 m4, m1
171*77c1e3ccSAndroid Build Coastguard Worker  paddd                 m5, m2
172*77c1e3ccSAndroid Build Coastguard Worker
  ; ref3 / ref4 -> m6 / m7 (m1 / m2 reused as temporaries)
173*77c1e3ccSAndroid Build Coastguard Worker  movu                  m1, [ref3q + ref_offsetq + %1]
174*77c1e3ccSAndroid Build Coastguard Worker  movu                  m2, [ref4q + ref_offsetq + %1]
175*77c1e3ccSAndroid Build Coastguard Worker  psadbw                m1, m0
176*77c1e3ccSAndroid Build Coastguard Worker  psadbw                m2, m0
177*77c1e3ccSAndroid Build Coastguard Worker  paddd                 m6, m1
178*77c1e3ccSAndroid Build Coastguard Worker  paddd                 m7, m2
179*77c1e3ccSAndroid Build Coastguard Worker%endmacro
180*77c1e3ccSAndroid Build Coastguard Worker
181*77c1e3ccSAndroid Build Coastguard Worker; void aom_sadNxNx4d_sse2(uint8_t *src,    int src_stride,
182*77c1e3ccSAndroid Build Coastguard Worker;                         uint8_t *ref[4], int ref_stride,
183*77c1e3ccSAndroid Build Coastguard Worker;                         uint32_t res[4]);
184*77c1e3ccSAndroid Build Coastguard Worker; Macro Arguments:
185*77c1e3ccSAndroid Build Coastguard Worker;   1: Width
186*77c1e3ccSAndroid Build Coastguard Worker;   2: Height
187*77c1e3ccSAndroid Build Coastguard Worker;   3: If 0, then normal sad, else skip rows
; SADNXN4D width, height [, skip_rows = 0]
; Emits one function that computes the SAD of a width x height source block
; against four reference blocks and stores four 32-bit results.
; Overall structure of the emitted function:
;   1. cglobal declaration (register / stack layout differs per architecture)
;   2. stride setup and loading of the four reference pointers
;   3. row loop: "wide" path (width >= mmsize, one row per iteration, shared
;      ref_offsetq) or "narrow" path (width 4 / 8, two rows per iteration)
;   4. reduction of m4..m7 (or m6/m7 for width 4) and store to the result array
; Functions for which external_loop is non-zero contain only steps 1-2 and the
; first chunk; they jump into the square function of the same width for the rest.
188*77c1e3ccSAndroid Build Coastguard Worker%macro SADNXN4D 2-3 0
189*77c1e3ccSAndroid Build Coastguard Worker
; Defaults; the x86-32 branches below override the stride spill flags.
; spill_cnt is never set to 1 in this macro, so the "%if spill_cnt" sections
; further down are not assembled.
190*77c1e3ccSAndroid Build Coastguard Worker%define spill_src_stride 0
191*77c1e3ccSAndroid Build Coastguard Worker%define spill_ref_stride 0
192*77c1e3ccSAndroid Build Coastguard Worker%define spill_cnt 0
193*77c1e3ccSAndroid Build Coastguard Worker
194*77c1e3ccSAndroid Build Coastguard Worker; Whether a shared offset should be used instead of adding strides to
195*77c1e3ccSAndroid Build Coastguard Worker; each reference array. With this option, only one line will be processed
196*77c1e3ccSAndroid Build Coastguard Worker; per loop iteration.
197*77c1e3ccSAndroid Build Coastguard Worker%define use_ref_offset (%1 >= mmsize)
198*77c1e3ccSAndroid Build Coastguard Worker
199*77c1e3ccSAndroid Build Coastguard Worker; Remove loops in the 4x4 and 8x4 case
200*77c1e3ccSAndroid Build Coastguard Worker%define use_loop (use_ref_offset || %2 > 4)
201*77c1e3ccSAndroid Build Coastguard Worker
; Function declaration (x86inc cglobal; see x86inc.asm for the exact meaning
; of the numeric arguments). The x86-64 variants name every value, including
; 'res', 'cnt' and 'ref_offset'. The x86-32 variants declare only seven
; registers: the slots that would hold the strides are handed to
; 'ref_offset' / 'cnt' and the corresponding spill_* flag is set, so the
; stride is read from its stack argument instead.
202*77c1e3ccSAndroid Build Coastguard Worker%if %3 == 1  ; skip rows
203*77c1e3ccSAndroid Build Coastguard Worker%if AOM_ARCH_X86_64
204*77c1e3ccSAndroid Build Coastguard Worker%if use_ref_offset
205*77c1e3ccSAndroid Build Coastguard Workercglobal sad_skip_%1x%2x4d, 5, 10, 8, src, src_stride, ref1, ref_stride, res, \
206*77c1e3ccSAndroid Build Coastguard Worker                                     ref2, ref3, ref4, cnt, ref_offset
207*77c1e3ccSAndroid Build Coastguard Worker%elif use_loop
208*77c1e3ccSAndroid Build Coastguard Workercglobal sad_skip_%1x%2x4d, 5, 9, 8, src, src_stride, ref1, ref_stride, res, \
209*77c1e3ccSAndroid Build Coastguard Worker                                    ref2, ref3, ref4, cnt
210*77c1e3ccSAndroid Build Coastguard Worker%else
211*77c1e3ccSAndroid Build Coastguard Workercglobal sad_skip_%1x%2x4d, 5, 8, 8, src, src_stride, ref1, ref_stride, res, \
212*77c1e3ccSAndroid Build Coastguard Worker                                    ref2, ref3, ref4
213*77c1e3ccSAndroid Build Coastguard Worker%endif
214*77c1e3ccSAndroid Build Coastguard Worker%else
215*77c1e3ccSAndroid Build Coastguard Worker%if use_ref_offset
; Wide path on x86-32: both strides are spilled.
216*77c1e3ccSAndroid Build Coastguard Workercglobal sad_skip_%1x%2x4d, 4, 7, 8, src, ref_offset, ref1, cnt, ref2, ref3, \
217*77c1e3ccSAndroid Build Coastguard Worker                                    ref4
218*77c1e3ccSAndroid Build Coastguard Worker%define spill_src_stride 1
219*77c1e3ccSAndroid Build Coastguard Worker%define spill_ref_stride 1
220*77c1e3ccSAndroid Build Coastguard Worker%elif use_loop
; Narrow looping path on x86-32: only the source stride is spilled.
221*77c1e3ccSAndroid Build Coastguard Workercglobal sad_skip_%1x%2x4d, 4, 7, 8, src, cnt, ref1, ref_stride, ref2, \
222*77c1e3ccSAndroid Build Coastguard Worker                                    ref3, ref4
223*77c1e3ccSAndroid Build Coastguard Worker%define spill_src_stride 1
224*77c1e3ccSAndroid Build Coastguard Worker%else
225*77c1e3ccSAndroid Build Coastguard Workercglobal sad_skip_%1x%2x4d, 4, 7, 8, src, src_stride, ref1, ref_stride, ref2, \
226*77c1e3ccSAndroid Build Coastguard Worker                                    ref3, ref4
227*77c1e3ccSAndroid Build Coastguard Worker%endif
228*77c1e3ccSAndroid Build Coastguard Worker%endif
229*77c1e3ccSAndroid Build Coastguard Worker%else ; normal sad
230*77c1e3ccSAndroid Build Coastguard Worker%if AOM_ARCH_X86_64
231*77c1e3ccSAndroid Build Coastguard Worker%if use_ref_offset
232*77c1e3ccSAndroid Build Coastguard Workercglobal sad%1x%2x4d, 5, 10, 8, src, src_stride, ref1, ref_stride, res, ref2, \
233*77c1e3ccSAndroid Build Coastguard Worker                               ref3, ref4, cnt, ref_offset
234*77c1e3ccSAndroid Build Coastguard Worker%elif use_loop
235*77c1e3ccSAndroid Build Coastguard Workercglobal sad%1x%2x4d, 5, 9, 8, src, src_stride, ref1, ref_stride, res, ref2, \
236*77c1e3ccSAndroid Build Coastguard Worker                              ref3, ref4, cnt
237*77c1e3ccSAndroid Build Coastguard Worker%else
238*77c1e3ccSAndroid Build Coastguard Workercglobal sad%1x%2x4d, 5, 8, 8, src, src_stride, ref1, ref_stride, res, ref2, \
239*77c1e3ccSAndroid Build Coastguard Worker                              ref3, ref4
240*77c1e3ccSAndroid Build Coastguard Worker%endif
241*77c1e3ccSAndroid Build Coastguard Worker%else
242*77c1e3ccSAndroid Build Coastguard Worker%if use_ref_offset
; Wide path on x86-32: both strides are spilled.
243*77c1e3ccSAndroid Build Coastguard Workercglobal sad%1x%2x4d, 4, 7, 8, src, ref_offset, ref1, cnt, ref2, ref3, ref4
244*77c1e3ccSAndroid Build Coastguard Worker  %define spill_src_stride 1
245*77c1e3ccSAndroid Build Coastguard Worker  %define spill_ref_stride 1
246*77c1e3ccSAndroid Build Coastguard Worker%elif use_loop
; Narrow looping path on x86-32: only the source stride is spilled.
247*77c1e3ccSAndroid Build Coastguard Workercglobal sad%1x%2x4d, 4, 7, 8, src, cnt, ref1, ref_stride, ref2, ref3, ref4
248*77c1e3ccSAndroid Build Coastguard Worker  %define spill_src_stride 1
249*77c1e3ccSAndroid Build Coastguard Worker%else
250*77c1e3ccSAndroid Build Coastguard Workercglobal sad%1x%2x4d, 4, 7, 8, src, src_stride, ref1, ref_stride, ref2, ref3, \
251*77c1e3ccSAndroid Build Coastguard Worker                              ref4
252*77c1e3ccSAndroid Build Coastguard Worker%endif
253*77c1e3ccSAndroid Build Coastguard Worker%endif
254*77c1e3ccSAndroid Build Coastguard Worker%endif
255*77c1e3ccSAndroid Build Coastguard Worker
; When a stride is spilled, point its q/d names at the second / fourth
; argument's memory location (r1mp / r3mp) so later code can keep using the
; same names.
256*77c1e3ccSAndroid Build Coastguard Worker%if spill_src_stride
257*77c1e3ccSAndroid Build Coastguard Worker  %define src_strideq r1mp
258*77c1e3ccSAndroid Build Coastguard Worker  %define src_strided r1mp
259*77c1e3ccSAndroid Build Coastguard Worker%endif
260*77c1e3ccSAndroid Build Coastguard Worker%if spill_ref_stride
261*77c1e3ccSAndroid Build Coastguard Worker  %define ref_strideq r3mp
262*77c1e3ccSAndroid Build Coastguard Worker  %define ref_strided r3mp
263*77c1e3ccSAndroid Build Coastguard Worker%endif
264*77c1e3ccSAndroid Build Coastguard Worker
; Not assembled: spill_cnt is always 0 in this macro.
265*77c1e3ccSAndroid Build Coastguard Worker%if spill_cnt
266*77c1e3ccSAndroid Build Coastguard Worker  SUB                  rsp, 4
267*77c1e3ccSAndroid Build Coastguard Worker  %define cntd word [rsp]
268*77c1e3ccSAndroid Build Coastguard Worker%endif
269*77c1e3ccSAndroid Build Coastguard Worker
; Skip-rows variant: double both strides so that every step moves two rows.
; The row counts below are halved accordingly (>> %3) and the final sums are
; doubled in the epilogue. A spilled stride is doubled in place in memory and
; restored before returning.
270*77c1e3ccSAndroid Build Coastguard Worker%if %3 == 1
271*77c1e3ccSAndroid Build Coastguard Worker  sal          src_strided, 1
272*77c1e3ccSAndroid Build Coastguard Worker  sal          ref_strided, 1
273*77c1e3ccSAndroid Build Coastguard Worker%endif
; movsxdifnidn is an x86inc helper; presumably it sign-extends the 32-bit
; stride to pointer width when the two operands differ -- see x86inc.asm.
274*77c1e3ccSAndroid Build Coastguard Worker  movsxdifnidn src_strideq, src_strided
275*77c1e3ccSAndroid Build Coastguard Worker  movsxdifnidn ref_strideq, ref_strided
276*77c1e3ccSAndroid Build Coastguard Worker
; On entry ref1q holds the address of the array of four reference pointers
; (ref[4] in the prototype comment above this macro). ref1q is overwritten
; last because it is the base address of the other three loads.
277*77c1e3ccSAndroid Build Coastguard Worker  mov                ref2q, [ref1q+gprsize*1]
278*77c1e3ccSAndroid Build Coastguard Worker  mov                ref3q, [ref1q+gprsize*2]
279*77c1e3ccSAndroid Build Coastguard Worker  mov                ref4q, [ref1q+gprsize*3]
280*77c1e3ccSAndroid Build Coastguard Worker  mov                ref1q, [ref1q+gprsize*0]
281*77c1e3ccSAndroid Build Coastguard Worker
282*77c1e3ccSAndroid Build Coastguard Worker; Is the loop for this wxh in another function?
283*77c1e3ccSAndroid Build Coastguard Worker; If so, we jump into that function, which runs the loop and returns.
284*77c1e3ccSAndroid Build Coastguard Worker%define external_loop (use_ref_offset && %1 > mmsize && %1 != %2)
285*77c1e3ccSAndroid Build Coastguard Worker
; ---- Wide path: one row per iteration, rows addressed via ref_offsetq ----
286*77c1e3ccSAndroid Build Coastguard Worker%if use_ref_offset
; First chunk of the first row; this also initialises m4..m7.
287*77c1e3ccSAndroid Build Coastguard Worker  PROCESS_FIRST_MMSIZE
288*77c1e3ccSAndroid Build Coastguard Worker%if %1 > mmsize
; More than one chunk per row: the first row is only partly done, so start
; with ref_offset 0 and the full (sub-sampled) row count, then enter the loop
; just after its first chunk. The square function itself takes this jump too,
; landing on its own .midloop label.
289*77c1e3ccSAndroid Build Coastguard Worker  mov          ref_offsetq, 0
290*77c1e3ccSAndroid Build Coastguard Worker  mov                 cntd, %2 >> %3
291*77c1e3ccSAndroid Build Coastguard Worker; Jump part way into the loop for the square version of this width
292*77c1e3ccSAndroid Build Coastguard Worker%if %3 == 1
293*77c1e3ccSAndroid Build Coastguard Worker  jmp mangle(private_prefix %+ _sad_skip_%1x%1x4d %+ SUFFIX).midloop
294*77c1e3ccSAndroid Build Coastguard Worker%else
295*77c1e3ccSAndroid Build Coastguard Worker  jmp mangle(private_prefix %+ _sad%1x%1x4d %+ SUFFIX).midloop
296*77c1e3ccSAndroid Build Coastguard Worker%endif
297*77c1e3ccSAndroid Build Coastguard Worker%else
; Exactly one chunk per row: the first row is complete, so begin the loop at
; the second row with one row fewer to go.
298*77c1e3ccSAndroid Build Coastguard Worker  mov          ref_offsetq, ref_strideq
299*77c1e3ccSAndroid Build Coastguard Worker  add                 srcq, src_strideq
300*77c1e3ccSAndroid Build Coastguard Worker  mov                 cntd, (%2 >> %3) - 1
301*77c1e3ccSAndroid Build Coastguard Worker%endif
; The loop body is only emitted in functions that own their loop.
302*77c1e3ccSAndroid Build Coastguard Worker%if external_loop == 0
303*77c1e3ccSAndroid Build Coastguard Worker.loop:
304*77c1e3ccSAndroid Build Coastguard Worker; Unrolled horizontal loop
305*77c1e3ccSAndroid Build Coastguard Worker%assign h_offset 0
306*77c1e3ccSAndroid Build Coastguard Worker%rep %1/mmsize
307*77c1e3ccSAndroid Build Coastguard Worker  PROCESS_16x1x4 h_offset
308*77c1e3ccSAndroid Build Coastguard Worker%if h_offset == 0
309*77c1e3ccSAndroid Build Coastguard Worker; The first row of the first column is done outside the loop and jumps here
310*77c1e3ccSAndroid Build Coastguard Worker.midloop:
311*77c1e3ccSAndroid Build Coastguard Worker%endif
312*77c1e3ccSAndroid Build Coastguard Worker%assign h_offset h_offset+mmsize
313*77c1e3ccSAndroid Build Coastguard Worker%endrep
314*77c1e3ccSAndroid Build Coastguard Worker
; Next row: advance the source pointer and the shared reference offset.
315*77c1e3ccSAndroid Build Coastguard Worker  add                 srcq, src_strideq
316*77c1e3ccSAndroid Build Coastguard Worker  add          ref_offsetq, ref_strideq
317*77c1e3ccSAndroid Build Coastguard Worker  sub                 cntd, 1
318*77c1e3ccSAndroid Build Coastguard Worker  jnz .loop
319*77c1e3ccSAndroid Build Coastguard Worker%endif
; ---- Narrow path (PROCESS_4x2x4 / PROCESS_8x2x4): two rows per step ----
320*77c1e3ccSAndroid Build Coastguard Worker%else
; First pair of rows initialises the accumulators.
321*77c1e3ccSAndroid Build Coastguard Worker  PROCESS_%1x2x4 1
322*77c1e3ccSAndroid Build Coastguard Worker  ADVANCE_END_OF_TWO_LINES
323*77c1e3ccSAndroid Build Coastguard Worker%if use_loop
; Remaining row pairs; one pair has already been handled above.
324*77c1e3ccSAndroid Build Coastguard Worker  mov                 cntd, (%2/2 >> %3) - 1
325*77c1e3ccSAndroid Build Coastguard Worker.loop:
326*77c1e3ccSAndroid Build Coastguard Worker%endif
; Without use_loop this is the single remaining pair and no advance follows.
327*77c1e3ccSAndroid Build Coastguard Worker  PROCESS_%1x2x4 0
328*77c1e3ccSAndroid Build Coastguard Worker%if use_loop
329*77c1e3ccSAndroid Build Coastguard Worker  ADVANCE_END_OF_TWO_LINES
330*77c1e3ccSAndroid Build Coastguard Worker  sub                 cntd, 1
331*77c1e3ccSAndroid Build Coastguard Worker  jnz .loop
332*77c1e3ccSAndroid Build Coastguard Worker%endif
333*77c1e3ccSAndroid Build Coastguard Worker%endif
334*77c1e3ccSAndroid Build Coastguard Worker
; Not assembled: spill_cnt is always 0 in this macro.
335*77c1e3ccSAndroid Build Coastguard Worker%if spill_cnt
336*77c1e3ccSAndroid Build Coastguard Worker; Undo stack allocation for cnt
337*77c1e3ccSAndroid Build Coastguard Worker  ADD                  rsp, 4
338*77c1e3ccSAndroid Build Coastguard Worker%endif
339*77c1e3ccSAndroid Build Coastguard Worker
; ---- Epilogue: only emitted in functions that own their loop ----
340*77c1e3ccSAndroid Build Coastguard Worker%if external_loop == 0
; NOTE(review): resultq / resultmp are only (re)defined here when %3 == 0.
; The skip-rows instantiations also use them below, so they appear to rely on
; the defines left behind by an earlier normal instantiation -- confirm
; before reordering the SADNXN4D invocations at the end of the file.
341*77c1e3ccSAndroid Build Coastguard Worker%if %3 == 0
342*77c1e3ccSAndroid Build Coastguard Worker  %define resultq r4
343*77c1e3ccSAndroid Build Coastguard Worker  %define resultmp r4mp
344*77c1e3ccSAndroid Build Coastguard Worker%endif
345*77c1e3ccSAndroid Build Coastguard Worker
346*77c1e3ccSAndroid Build Coastguard Worker; Undo modifications on parameters on the stack
347*77c1e3ccSAndroid Build Coastguard Worker%if %3 == 1
348*77c1e3ccSAndroid Build Coastguard Worker%if spill_src_stride
349*77c1e3ccSAndroid Build Coastguard Worker  shr          src_strided, 1
350*77c1e3ccSAndroid Build Coastguard Worker%endif
351*77c1e3ccSAndroid Build Coastguard Worker%if spill_ref_stride
352*77c1e3ccSAndroid Build Coastguard Worker  shr          ref_strided, 1
353*77c1e3ccSAndroid Build Coastguard Worker%endif
354*77c1e3ccSAndroid Build Coastguard Worker%endif
355*77c1e3ccSAndroid Build Coastguard Worker
356*77c1e3ccSAndroid Build Coastguard Worker%if %1 > 4
; m4..m7 each hold two partial sums for ref1..ref4, one in the low dword of
; each 64-bit half. Shifting m5 / m7 left by 4 bytes moves their sums into
; the odd dwords so they can be merged with m4 / m6:
;   m4 = { ref1_lo, ref2_lo, ref1_hi, ref2_hi }
;   m6 = { ref3_lo, ref4_lo, ref3_hi, ref4_hi }
357*77c1e3ccSAndroid Build Coastguard Worker  pslldq                m5, 4
358*77c1e3ccSAndroid Build Coastguard Worker  pslldq                m7, 4
359*77c1e3ccSAndroid Build Coastguard Worker  por                   m4, m5
360*77c1e3ccSAndroid Build Coastguard Worker  por                   m6, m7
; Gather all "lo" sums in m4 and all "hi" sums in m5, then add them to get
; the four totals in ref1..ref4 order.
361*77c1e3ccSAndroid Build Coastguard Worker  mova                  m5, m4
362*77c1e3ccSAndroid Build Coastguard Worker  mova                  m7, m6
363*77c1e3ccSAndroid Build Coastguard Worker  punpcklqdq            m4, m6
364*77c1e3ccSAndroid Build Coastguard Worker  punpckhqdq            m5, m7
365*77c1e3ccSAndroid Build Coastguard Worker  paddd                 m4, m5
366*77c1e3ccSAndroid Build Coastguard Worker%if %3 == 1
; Only every other row was summed; double the totals.
367*77c1e3ccSAndroid Build Coastguard Worker  pslld                 m4, 1
368*77c1e3ccSAndroid Build Coastguard Worker%endif
369*77c1e3ccSAndroid Build Coastguard Worker  movifnidn             resultq, resultmp
370*77c1e3ccSAndroid Build Coastguard Worker  movu                [resultq], m4
371*77c1e3ccSAndroid Build Coastguard Worker  RET
372*77c1e3ccSAndroid Build Coastguard Worker%else
; Width 4: m6 = ref1 | ref2 and m7 = ref3 | ref4, one sum per 64-bit half.
; pshufd 0x08 places dwords 0 and 2 in the low 64 bits, i.e. packs each pair
; of sums next to each other for the two 8-byte stores below.
373*77c1e3ccSAndroid Build Coastguard Worker  pshufd            m6, m6, 0x08
374*77c1e3ccSAndroid Build Coastguard Worker  pshufd            m7, m7, 0x08
375*77c1e3ccSAndroid Build Coastguard Worker%if %3 == 1
; Only every other row was summed; double the totals.
376*77c1e3ccSAndroid Build Coastguard Worker  pslld                 m6, 1
377*77c1e3ccSAndroid Build Coastguard Worker  pslld                 m7, 1
378*77c1e3ccSAndroid Build Coastguard Worker%endif
379*77c1e3ccSAndroid Build Coastguard Worker  movifnidn             resultq, resultmp
380*77c1e3ccSAndroid Build Coastguard Worker  movq              [resultq+0], m6
381*77c1e3ccSAndroid Build Coastguard Worker  movq              [resultq+8], m7
382*77c1e3ccSAndroid Build Coastguard Worker  RET
383*77c1e3ccSAndroid Build Coastguard Worker%endif
384*77c1e3ccSAndroid Build Coastguard Worker%endif ; external_loop == 0
385*77c1e3ccSAndroid Build Coastguard Worker%endmacro
386*77c1e3ccSAndroid Build Coastguard Worker
; Instantiate SADNXN4D for every supported block size.
; Ordering notes:
;   * Non-square sizes wider than one register jump into the .midloop label
;     of the square function of the same width (128x128, 64x64, 32x32), so
;     those square sizes must be instantiated in both lists.
;   * NOTE(review): the skip-rows list appears to depend on %defines left by
;     the normal list (resultq / resultmp) -- confirm before reordering.
387*77c1e3ccSAndroid Build Coastguard WorkerINIT_XMM sse2
; Normal variants (all rows).
388*77c1e3ccSAndroid Build Coastguard WorkerSADNXN4D 128, 128
389*77c1e3ccSAndroid Build Coastguard WorkerSADNXN4D 128,  64
390*77c1e3ccSAndroid Build Coastguard WorkerSADNXN4D  64, 128
391*77c1e3ccSAndroid Build Coastguard WorkerSADNXN4D  64,  64
392*77c1e3ccSAndroid Build Coastguard WorkerSADNXN4D  64,  32
393*77c1e3ccSAndroid Build Coastguard WorkerSADNXN4D  32,  64
394*77c1e3ccSAndroid Build Coastguard WorkerSADNXN4D  32,  32
395*77c1e3ccSAndroid Build Coastguard WorkerSADNXN4D  32,  16
396*77c1e3ccSAndroid Build Coastguard WorkerSADNXN4D  16,  32
397*77c1e3ccSAndroid Build Coastguard WorkerSADNXN4D  16,  16
398*77c1e3ccSAndroid Build Coastguard WorkerSADNXN4D  16,   8
399*77c1e3ccSAndroid Build Coastguard WorkerSADNXN4D   8,  16
400*77c1e3ccSAndroid Build Coastguard WorkerSADNXN4D   8,   8
401*77c1e3ccSAndroid Build Coastguard WorkerSADNXN4D   8,   4
402*77c1e3ccSAndroid Build Coastguard WorkerSADNXN4D   4,   8
403*77c1e3ccSAndroid Build Coastguard WorkerSADNXN4D   4,   4
; The 1:4 and 4:1 block sizes are only built when CONFIG_REALTIME_ONLY is 0.
404*77c1e3ccSAndroid Build Coastguard Worker%if CONFIG_REALTIME_ONLY==0
405*77c1e3ccSAndroid Build Coastguard WorkerSADNXN4D   4,  16
406*77c1e3ccSAndroid Build Coastguard WorkerSADNXN4D  16,   4
407*77c1e3ccSAndroid Build Coastguard WorkerSADNXN4D   8,  32
408*77c1e3ccSAndroid Build Coastguard WorkerSADNXN4D  32,   8
409*77c1e3ccSAndroid Build Coastguard WorkerSADNXN4D  16,  64
410*77c1e3ccSAndroid Build Coastguard WorkerSADNXN4D  64,  16
411*77c1e3ccSAndroid Build Coastguard Worker%endif
; Skip-rows variants (third argument 1): emitted as sad_skip_WxHx4d.
412*77c1e3ccSAndroid Build Coastguard WorkerSADNXN4D 128, 128, 1
413*77c1e3ccSAndroid Build Coastguard WorkerSADNXN4D 128,  64, 1
414*77c1e3ccSAndroid Build Coastguard WorkerSADNXN4D  64, 128, 1
415*77c1e3ccSAndroid Build Coastguard WorkerSADNXN4D  64,  64, 1
416*77c1e3ccSAndroid Build Coastguard WorkerSADNXN4D  64,  32, 1
417*77c1e3ccSAndroid Build Coastguard WorkerSADNXN4D  32,  64, 1
418*77c1e3ccSAndroid Build Coastguard WorkerSADNXN4D  32,  32, 1
419*77c1e3ccSAndroid Build Coastguard WorkerSADNXN4D  32,  16, 1
420*77c1e3ccSAndroid Build Coastguard WorkerSADNXN4D  16,  32, 1
421*77c1e3ccSAndroid Build Coastguard WorkerSADNXN4D  16,  16, 1
422*77c1e3ccSAndroid Build Coastguard WorkerSADNXN4D  16,   8, 1
423*77c1e3ccSAndroid Build Coastguard WorkerSADNXN4D   8,  16, 1
424*77c1e3ccSAndroid Build Coastguard WorkerSADNXN4D   8,   8, 1
425*77c1e3ccSAndroid Build Coastguard WorkerSADNXN4D   4,   8, 1
; Skip-rows 1:4 / 4:1 sizes; 16x4 is omitted here (see the note that follows
; this list about heights that sub-sample to 2).
426*77c1e3ccSAndroid Build Coastguard Worker%if CONFIG_REALTIME_ONLY==0
427*77c1e3ccSAndroid Build Coastguard WorkerSADNXN4D   4,  16, 1
428*77c1e3ccSAndroid Build Coastguard WorkerSADNXN4D   8,  32, 1
429*77c1e3ccSAndroid Build Coastguard WorkerSADNXN4D  32,   8, 1
430*77c1e3ccSAndroid Build Coastguard WorkerSADNXN4D  16,  64, 1
431*77c1e3ccSAndroid Build Coastguard WorkerSADNXN4D  64,  16, 1
432*77c1e3ccSAndroid Build Coastguard Worker%endif
433*77c1e3ccSAndroid Build Coastguard Worker
434*77c1e3ccSAndroid Build Coastguard Worker; Different assembly is needed when the height gets subsampled to 2
435*77c1e3ccSAndroid Build Coastguard Worker; SADNXN4D 16,  4, 1
436*77c1e3ccSAndroid Build Coastguard Worker; SADNXN4D  8,  4, 1
437*77c1e3ccSAndroid Build Coastguard Worker; SADNXN4D  4,  4, 1
438