;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;

%include "third_party/x86inc/x86inc.asm"

SECTION .text

; Macro Arguments
; Arg 1: Width
; Arg 2: Height
; Arg 3: Number of general purpose registers
; Arg 4: Type of function: if 0, normal sad; if 1, avg; if 2, skip rows
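;
; For reference, the function types correspond roughly to the C sketch
; below. This is only an illustrative outline (the identifiers here are
; not taken from this file; the C reference implementations live in
; vpx_dsp/sad.c):
;
;   unsigned int sad_ref(const uint8_t *src, int src_stride,
;                        const uint8_t *ref, int ref_stride,
;                        int width, int height) {
;     unsigned int sad = 0;
;     for (int y = 0; y < height; ++y) {
;       for (int x = 0; x < width; ++x)
;         sad += abs(src[x] - ref[x]);
;       src += src_stride;
;       ref += ref_stride;
;     }
;     return sad;
;   }
;
; avg  (Arg 4 == 1): ref is first averaged with second_pred via pavgb,
;                    then the SAD against src is taken.
; skip (Arg 4 == 2): only every other row is visited and the result is
;                    doubled to approximate the full-height SAD.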
%macro SAD_FN 4
%if %4 == 0 ; normal sad
%if %3 == 5
cglobal sad%1x%2, 4, %3, 5, src, src_stride, ref, ref_stride, n_rows
%else ; %3 == 7
cglobal sad%1x%2, 4, %3, 6, src, src_stride, ref, ref_stride, \
                            src_stride3, ref_stride3, n_rows
%endif ; %3 == 5/7

%elif %4 == 2 ; skip
%if %3 == 5
cglobal sad_skip_%1x%2, 4, %3, 5, src, src_stride, ref, ref_stride, n_rows
%else ; %3 == 7
cglobal sad_skip_%1x%2, 4, %3, 6, src, src_stride, ref, ref_stride, \
                            src_stride3, ref_stride3, n_rows
%endif ; %3 == 5/7

%else
%if %3 == 5
cglobal sad%1x%2_avg, 5, 1 + %3, 5, src, src_stride, ref, ref_stride, \
                                    second_pred, n_rows
%else ; %3 == 7
cglobal sad%1x%2_avg, 5, VPX_ARCH_X86_64 + %3, 6, src, src_stride, \
                                              ref, ref_stride, \
                                              second_pred, \
                                              src_stride3, ref_stride3
%if VPX_ARCH_X86_64
%define n_rowsd r7d
%else ; x86-32
%define n_rowsd dword r0m
%endif ; x86-32/64
%endif ; %3 == 5/7
%endif ; sad/avg/skip
%if %4 == 2 ; skip rows so double the stride
  lea          src_strided, [src_strided*2]
  lea          ref_strided, [ref_strided*2]
%endif ; %4 skip
  movsxdifnidn src_strideq, src_strided
  movsxdifnidn ref_strideq, ref_strided
%if %3 == 7
  lea         src_stride3q, [src_strideq*3]
  lea         ref_stride3q, [ref_strideq*3]
%endif ; %3 == 7
%endmacro

; unsigned int vpx_sad64x{32,64}_sse2(uint8_t *src, int src_stride,
;                                     uint8_t *ref, int ref_stride);
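;
; Each .loop iteration below covers one 64-pixel row: four unaligned
; 16-byte loads from ref, an optional pavgb against second_pred for the
; avg variant, then four psadbw against src accumulated into m0.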
%macro SAD64XN 1-2 0
  SAD_FN 64, %1, 5, %2
%if %2 == 2
  mov              n_rowsd, %1/2
%else
  mov              n_rowsd, %1
%endif
  pxor                  m0, m0
.loop:
  movu                  m1, [refq]
  movu                  m2, [refq+16]
  movu                  m3, [refq+32]
  movu                  m4, [refq+48]
%if %2 == 1
  pavgb                 m1, [second_predq+mmsize*0]
  pavgb                 m2, [second_predq+mmsize*1]
  pavgb                 m3, [second_predq+mmsize*2]
  pavgb                 m4, [second_predq+mmsize*3]
  lea         second_predq, [second_predq+mmsize*4]
%endif
  psadbw                m1, [srcq]
  psadbw                m2, [srcq+16]
  psadbw                m3, [srcq+32]
  psadbw                m4, [srcq+48]
  paddd                 m1, m2
  paddd                 m3, m4
  add                 refq, ref_strideq
  paddd                 m0, m1
  add                 srcq, src_strideq
  paddd                 m0, m3
  dec              n_rowsd
  jg .loop

  movhlps               m1, m0
  paddd                 m0, m1
%if %2 == 2 ; we skipped rows, so now we need to double the sad
  pslld                 m0, 1
%endif
  movd                 eax, m0
  RET
%endmacro

INIT_XMM sse2
SAD64XN 64 ; sad64x64_sse2
SAD64XN 32 ; sad64x32_sse2
SAD64XN 64, 1 ; sad64x64_avg_sse2
SAD64XN 32, 1 ; sad64x32_avg_sse2
SAD64XN 64, 2 ; sad64x64_skip_sse2
SAD64XN 32, 2 ; sad64x32_skip_sse2

; unsigned int vpx_sad32x{16,32,64}_sse2(uint8_t *src, int src_stride,
;                                        uint8_t *ref, int ref_stride);
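;
; Each .loop iteration below covers two 32-pixel rows (two 16-byte loads
; per row), so n_rows is programmed as height/2 (height/4 when skipping
; rows).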
%macro SAD32XN 1-2 0
  SAD_FN 32, %1, 5, %2
%if %2 == 2
  mov              n_rowsd, %1/4
%else
  mov              n_rowsd, %1/2
%endif
  pxor                  m0, m0
.loop:
  movu                  m1, [refq]
  movu                  m2, [refq+16]
  movu                  m3, [refq+ref_strideq]
  movu                  m4, [refq+ref_strideq+16]
%if %2 == 1
  pavgb                 m1, [second_predq+mmsize*0]
  pavgb                 m2, [second_predq+mmsize*1]
  pavgb                 m3, [second_predq+mmsize*2]
  pavgb                 m4, [second_predq+mmsize*3]
  lea         second_predq, [second_predq+mmsize*4]
%endif
  psadbw                m1, [srcq]
  psadbw                m2, [srcq+16]
  psadbw                m3, [srcq+src_strideq]
  psadbw                m4, [srcq+src_strideq+16]
  paddd                 m1, m2
  paddd                 m3, m4
  lea                 refq, [refq+ref_strideq*2]
  paddd                 m0, m1
  lea                 srcq, [srcq+src_strideq*2]
  paddd                 m0, m3
  dec              n_rowsd
  jg .loop

  movhlps               m1, m0
  paddd                 m0, m1
%if %2 == 2 ; we skipped rows, so now we need to double the sad
  pslld                 m0, 1
%endif
  movd                 eax, m0
  RET
%endmacro

INIT_XMM sse2
SAD32XN 64 ; sad32x64_sse2
SAD32XN 32 ; sad32x32_sse2
SAD32XN 16 ; sad32x16_sse2
SAD32XN 64, 1 ; sad32x64_avg_sse2
SAD32XN 32, 1 ; sad32x32_avg_sse2
SAD32XN 16, 1 ; sad32x16_avg_sse2
SAD32XN 64, 2 ; sad32x64_skip_sse2
SAD32XN 32, 2 ; sad32x32_skip_sse2
SAD32XN 16, 2 ; sad32x16_skip_sse2

; unsigned int vpx_sad16x{8,16,32}_sse2(uint8_t *src, int src_stride,
;                                       uint8_t *ref, int ref_stride);
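;
; Each .loop iteration below covers four 16-pixel rows, addressed through
; ref_strideq, ref_strideq*2 and ref_stride3q, so n_rows is height/4
; (height/8 when skipping rows).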
%macro SAD16XN 1-2 0
  SAD_FN 16, %1, 7, %2
%if %2 == 2
  mov              n_rowsd, %1/8
%else
  mov              n_rowsd, %1/4
%endif
  pxor                  m0, m0

.loop:
  movu                  m1, [refq]
  movu                  m2, [refq+ref_strideq]
  movu                  m3, [refq+ref_strideq*2]
  movu                  m4, [refq+ref_stride3q]
%if %2 == 1
  pavgb                 m1, [second_predq+mmsize*0]
  pavgb                 m2, [second_predq+mmsize*1]
  pavgb                 m3, [second_predq+mmsize*2]
  pavgb                 m4, [second_predq+mmsize*3]
  lea         second_predq, [second_predq+mmsize*4]
%endif
  psadbw                m1, [srcq]
  psadbw                m2, [srcq+src_strideq]
  psadbw                m3, [srcq+src_strideq*2]
  psadbw                m4, [srcq+src_stride3q]
  paddd                 m1, m2
  paddd                 m3, m4
  lea                 refq, [refq+ref_strideq*4]
  paddd                 m0, m1
  lea                 srcq, [srcq+src_strideq*4]
  paddd                 m0, m3
  dec              n_rowsd
  jg .loop

  movhlps               m1, m0
  paddd                 m0, m1
%if %2 == 2 ; we skipped rows, so now we need to double the sad
  pslld                 m0, 1
%endif
  movd                 eax, m0
  RET
%endmacro

INIT_XMM sse2
SAD16XN 32 ; sad16x32_sse2
SAD16XN 16 ; sad16x16_sse2
SAD16XN  8 ; sad16x8_sse2
SAD16XN 32, 1 ; sad16x32_avg_sse2
SAD16XN 16, 1 ; sad16x16_avg_sse2
SAD16XN  8, 1 ; sad16x8_avg_sse2
SAD16XN 32, 2 ; sad16x32_skip_sse2
SAD16XN 16, 2 ; sad16x16_skip_sse2
SAD16XN  8, 2 ; sad16x8_skip_sse2

; unsigned int vpx_sad8x{4,8,16}_sse2(uint8_t *src, int src_stride,
;                                     uint8_t *ref, int ref_stride);
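;
; Each .loop iteration below covers four 8-pixel rows: movh/movhps pack
; two rows into a single xmm register so one psadbw handles a row pair.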
%macro SAD8XN 1-2 0
  SAD_FN 8, %1, 7, %2
%if %2 == 2
  mov              n_rowsd, %1/8
%else
  mov              n_rowsd, %1/4
%endif
  pxor                  m0, m0

.loop:
  movh                  m1, [refq]
  movhps                m1, [refq+ref_strideq]
  movh                  m2, [refq+ref_strideq*2]
  movhps                m2, [refq+ref_stride3q]
%if %2 == 1
  pavgb                 m1, [second_predq+mmsize*0]
  pavgb                 m2, [second_predq+mmsize*1]
  lea         second_predq, [second_predq+mmsize*2]
%endif
  movh                  m3, [srcq]
  movhps                m3, [srcq+src_strideq]
  movh                  m4, [srcq+src_strideq*2]
  movhps                m4, [srcq+src_stride3q]
  psadbw                m1, m3
  psadbw                m2, m4
  lea                 refq, [refq+ref_strideq*4]
  paddd                 m0, m1
  lea                 srcq, [srcq+src_strideq*4]
  paddd                 m0, m2
  dec              n_rowsd
  jg .loop

  movhlps               m1, m0
  paddd                 m0, m1
%if %2 == 2 ; we skipped rows, so now we need to double the sad
  pslld                 m0, 1
%endif
  movd                 eax, m0
  RET
%endmacro

INIT_XMM sse2
SAD8XN 16 ; sad8x16_sse2
SAD8XN  8 ; sad8x8_sse2
SAD8XN  4 ; sad8x4_sse2
SAD8XN 16, 1 ; sad8x16_avg_sse2
SAD8XN  8, 1 ; sad8x8_avg_sse2
SAD8XN  4, 1 ; sad8x4_avg_sse2
SAD8XN 16, 2 ; sad8x16_skip_sse2
SAD8XN  8, 2 ; sad8x8_skip_sse2

; unsigned int vpx_sad4x{4,8}_sse2(uint8_t *src, int src_stride,
;                                  uint8_t *ref, int ref_stride);
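;
; Each .loop iteration below covers four 4-pixel rows: the movd loads are
; gathered with punpckldq/movlhps so a single psadbw covers all sixteen
; bytes.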
%macro SAD4XN 1-2 0
  SAD_FN 4, %1, 7, %2
%if %2 == 2
  mov              n_rowsd, %1/8
%else
  mov              n_rowsd, %1/4
%endif
  pxor                  m0, m0

.loop:
  movd                  m1, [refq]
  movd                  m2, [refq+ref_strideq]
  movd                  m3, [refq+ref_strideq*2]
  movd                  m4, [refq+ref_stride3q]
  punpckldq             m1, m2
  punpckldq             m3, m4
  movlhps               m1, m3
%if %2 == 1
  pavgb                 m1, [second_predq+mmsize*0]
  lea         second_predq, [second_predq+mmsize*1]
%endif
  movd                  m2, [srcq]
  movd                  m5, [srcq+src_strideq]
  movd                  m4, [srcq+src_strideq*2]
  movd                  m3, [srcq+src_stride3q]
  punpckldq             m2, m5
  punpckldq             m4, m3
  movlhps               m2, m4
  psadbw                m1, m2
  lea                 refq, [refq+ref_strideq*4]
  paddd                 m0, m1
  lea                 srcq, [srcq+src_strideq*4]
  dec              n_rowsd
  jg .loop

  movhlps               m1, m0
  paddd                 m0, m1
%if %2 == 2 ; we skipped rows, so now we need to double the sad
  pslld                 m0, 1
%endif
  movd                 eax, m0
  RET
%endmacro

INIT_XMM sse2
SAD4XN  8 ; sad4x8_sse2
SAD4XN  4 ; sad4x4_sse2
SAD4XN  8, 1 ; sad4x8_avg_sse2
SAD4XN  4, 1 ; sad4x4_avg_sse2
SAD4XN  8, 2 ; sad4x8_skip_sse2