; xref: /aosp_15_r20/external/libvpx/vpx_dsp/x86/highbd_sad_sse2.asm
; (revision fb1b10ab9aebc7c7068eedab379b749d7e3900be)
;
;  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;
11%include "third_party/x86inc/x86inc.asm"
12
13SECTION .text
14
15; Macro Arguments
16; Arg 1: Width
17; Arg 2: Height
18; Arg 3: Number of general purpose registers
19; Arg 4: Type of function: if 0, normal sad; if 1, avg; if 2, skip rows
20%macro HIGH_SAD_FN 4
21%if %4 == 0
22%if %3 == 5
23cglobal highbd_sad%1x%2, 4, %3, 7, src, src_stride, ref, ref_stride, n_rows
24%else ; %3 == 7
25cglobal highbd_sad%1x%2, 4, %3, 7, src, src_stride, ref, ref_stride, \
26                            src_stride3, ref_stride3, n_rows
27%endif ; %3 == 5/7
28%elif %4 == 1 ; avg
29%if %3 == 5
30cglobal highbd_sad%1x%2_avg, 5, 1 + %3, 7, src, src_stride, ref, ref_stride, \
31                                    second_pred, n_rows
32%else ; %3 == 7
33cglobal highbd_sad%1x%2_avg, 5, VPX_ARCH_X86_64 + %3, 7, src, src_stride, \
34                                              ref, ref_stride, \
35                                              second_pred, \
36                                              src_stride3, ref_stride3
37%if VPX_ARCH_X86_64
38%define n_rowsd r7d
39%else ; x86-32
40%define n_rowsd dword r0m
41%endif ; x86-32/64
42%endif ; %3 == 5/7
43%else  ; %4 == 2, skip rows
44%if %3 == 5
45cglobal highbd_sad_skip_%1x%2, 4, %3, 7, src, src_stride, ref, ref_stride, n_rows
46%else ; %3 == 7
47cglobal highbd_sad_skip_%1x%2, 4, %3, 7, src, src_stride, ref, ref_stride, \
48                            src_stride3, ref_stride3, n_rows
49%endif ; %3 == 5/7
50%endif ; sad/avg/skip
51%if %4 == 2  ; double the stride if we are skipping rows
52  lea          src_strided, [src_strided*2]
53  lea          ref_strided, [ref_strided*2]
54%endif
55  movsxdifnidn src_strideq, src_strided
56  movsxdifnidn ref_strideq, ref_strided
57%if %3 == 7
58  lea         src_stride3q, [src_strideq*3]
59  lea         ref_stride3q, [ref_strideq*3]
60%endif ; %3 == 7
61; convert src, ref & second_pred to short ptrs (from byte ptrs)
62  shl                 srcq, 1
63  shl                 refq, 1
64%if %4 == 1
65  shl         second_predq, 1
66%endif
67%endmacro
68
69; unsigned int vpx_highbd_sad64x{16,32,64}_sse2(uint8_t *src, int src_stride,
70;                                    uint8_t *ref, int ref_stride);
71%macro HIGH_SAD64XN 1-2 0
72  HIGH_SAD_FN 64, %1, 5, %2
73%if %2 == 2  ; skip rows, so divide number of rows by 2
74  mov              n_rowsd, %1/2
75%else
76  mov              n_rowsd, %1
77%endif
78  pxor                  m0, m0
79  pxor                  m6, m6
80
81.loop:
82  ; first half of each row
83  movu                  m1, [refq]
84  movu                  m2, [refq+16]
85  movu                  m3, [refq+32]
86  movu                  m4, [refq+48]
87%if %2 == 1
88  pavgw                 m1, [second_predq+mmsize*0]
89  pavgw                 m2, [second_predq+mmsize*1]
90  pavgw                 m3, [second_predq+mmsize*2]
91  pavgw                 m4, [second_predq+mmsize*3]
92  lea         second_predq, [second_predq+mmsize*4]
93%endif
94  mova                  m5, [srcq]
95  psubusw               m5, m1
96  psubusw               m1, [srcq]
97  por                   m1, m5
98  mova                  m5, [srcq+16]
99  psubusw               m5, m2
100  psubusw               m2, [srcq+16]
101  por                   m2, m5
102  mova                  m5, [srcq+32]
103  psubusw               m5, m3
104  psubusw               m3, [srcq+32]
105  por                   m3, m5
106  mova                  m5, [srcq+48]
107  psubusw               m5, m4
108  psubusw               m4, [srcq+48]
109  por                   m4, m5
110  paddw                 m1, m2
111  paddw                 m3, m4
112  movhlps               m2, m1
113  movhlps               m4, m3
114  paddw                 m1, m2
115  paddw                 m3, m4
116  punpcklwd             m1, m6
117  punpcklwd             m3, m6
118  paddd                 m0, m1
119  paddd                 m0, m3
120  ; second half of each row
121  movu                  m1, [refq+64]
122  movu                  m2, [refq+80]
123  movu                  m3, [refq+96]
124  movu                  m4, [refq+112]
125%if %2 == 1
126  pavgw                 m1, [second_predq+mmsize*0]
127  pavgw                 m2, [second_predq+mmsize*1]
128  pavgw                 m3, [second_predq+mmsize*2]
129  pavgw                 m4, [second_predq+mmsize*3]
130  lea         second_predq, [second_predq+mmsize*4]
131%endif
132  mova                  m5, [srcq+64]
133  psubusw               m5, m1
134  psubusw               m1, [srcq+64]
135  por                   m1, m5
136  mova                  m5, [srcq+80]
137  psubusw               m5, m2
138  psubusw               m2, [srcq+80]
139  por                   m2, m5
140  mova                  m5, [srcq+96]
141  psubusw               m5, m3
142  psubusw               m3, [srcq+96]
143  por                   m3, m5
144  mova                  m5, [srcq+112]
145  psubusw               m5, m4
146  psubusw               m4, [srcq+112]
147  por                   m4, m5
148  paddw                 m1, m2
149  paddw                 m3, m4
150  movhlps               m2, m1
151  movhlps               m4, m3
152  paddw                 m1, m2
153  paddw                 m3, m4
154  punpcklwd             m1, m6
155  punpcklwd             m3, m6
156  lea                 refq, [refq+ref_strideq*2]
157  paddd                 m0, m1
158  lea                 srcq, [srcq+src_strideq*2]
159  paddd                 m0, m3
160
161  dec              n_rowsd
162  jg .loop
163
164  movhlps               m1, m0
165  paddd                 m0, m1
166  punpckldq             m0, m6
167  movhlps               m1, m0
168  paddd                 m0, m1
169%if %2 == 2  ; we skipped rows, so we need to double the sad
170  pslld                 m0, 1
171%endif
172  movd                 eax, m0
173  RET
174%endmacro
175
176INIT_XMM sse2
177HIGH_SAD64XN 64 ; highbd_sad64x64_sse2
178HIGH_SAD64XN 32 ; highbd_sad64x32_sse2
179HIGH_SAD64XN 64, 1 ; highbd_sad64x64_avg_sse2
180HIGH_SAD64XN 32, 1 ; highbd_sad64x32_avg_sse2
181HIGH_SAD64XN 64, 2 ; highbd_sad_skip_64x64_sse2
182HIGH_SAD64XN 32, 2 ; highbd_sad_skip_64x32_sse2
183
184
185; unsigned int vpx_highbd_sad32x{16,32,64}_sse2(uint8_t *src, int src_stride,
186;                                    uint8_t *ref, int ref_stride);
187%macro HIGH_SAD32XN 1-2 0
188  HIGH_SAD_FN 32, %1, 5, %2
189%if %2 == 2  ; skip rows, so divide number of rows by 2
190  mov              n_rowsd, %1/2
191%else
192  mov              n_rowsd, %1
193%endif
194  pxor                  m0, m0
195  pxor                  m6, m6
196
197.loop:
198  movu                  m1, [refq]
199  movu                  m2, [refq+16]
200  movu                  m3, [refq+32]
201  movu                  m4, [refq+48]
202%if %2 == 1
203  pavgw                 m1, [second_predq+mmsize*0]
204  pavgw                 m2, [second_predq+mmsize*1]
205  pavgw                 m3, [second_predq+mmsize*2]
206  pavgw                 m4, [second_predq+mmsize*3]
207  lea         second_predq, [second_predq+mmsize*4]
208%endif
209  mova                  m5, [srcq]
210  psubusw               m5, m1
211  psubusw               m1, [srcq]
212  por                   m1, m5
213  mova                  m5, [srcq+16]
214  psubusw               m5, m2
215  psubusw               m2, [srcq+16]
216  por                   m2, m5
217  mova                  m5, [srcq+32]
218  psubusw               m5, m3
219  psubusw               m3, [srcq+32]
220  por                   m3, m5
221  mova                  m5, [srcq+48]
222  psubusw               m5, m4
223  psubusw               m4, [srcq+48]
224  por                   m4, m5
225  paddw                 m1, m2
226  paddw                 m3, m4
227  movhlps               m2, m1
228  movhlps               m4, m3
229  paddw                 m1, m2
230  paddw                 m3, m4
231  punpcklwd             m1, m6
232  punpcklwd             m3, m6
233  lea                 refq, [refq+ref_strideq*2]
234  paddd                 m0, m1
235  lea                 srcq, [srcq+src_strideq*2]
236  paddd                 m0, m3
237  dec              n_rowsd
238  jg .loop
239
240  movhlps               m1, m0
241  paddd                 m0, m1
242  punpckldq             m0, m6
243  movhlps               m1, m0
244  paddd                 m0, m1
245%if %2 == 2  ; we skipped rows, so we need to double the sad
246  pslld                 m0, 1
247%endif
248  movd                 eax, m0
249  RET
250%endmacro
251
252INIT_XMM sse2
253HIGH_SAD32XN 64 ; highbd_sad32x64_sse2
254HIGH_SAD32XN 32 ; highbd_sad32x32_sse2
255HIGH_SAD32XN 16 ; highbd_sad32x16_sse2
256HIGH_SAD32XN 64, 1 ; highbd_sad32x64_avg_sse2
257HIGH_SAD32XN 32, 1 ; highbd_sad32x32_avg_sse2
258HIGH_SAD32XN 16, 1 ; highbd_sad32x16_avg_sse2
259HIGH_SAD32XN 64, 2 ; highbd_sad_skip_32x64_sse2
260HIGH_SAD32XN 32, 2 ; highbd_sad_skip_32x32_sse2
261HIGH_SAD32XN 16, 2 ; highbd_sad_skip_32x16_sse2
262
263; unsigned int vpx_highbd_sad16x{8,16,32}_sse2(uint8_t *src, int src_stride,
264;                                    uint8_t *ref, int ref_stride);
265%macro HIGH_SAD16XN 1-2 0
266  HIGH_SAD_FN 16, %1, 5, %2
267%if %2 == 2  ; skip rows, so divide number of rows by 2
268  mov              n_rowsd, %1/4
269%else
270  mov              n_rowsd, %1/2
271%endif
272  pxor                  m0, m0
273  pxor                  m6, m6
274
275.loop:
276  movu                  m1, [refq]
277  movu                  m2, [refq+16]
278  movu                  m3, [refq+ref_strideq*2]
279  movu                  m4, [refq+ref_strideq*2+16]
280%if %2 == 1
281  pavgw                 m1, [second_predq+mmsize*0]
282  pavgw                 m2, [second_predq+16]
283  pavgw                 m3, [second_predq+mmsize*2]
284  pavgw                 m4, [second_predq+mmsize*2+16]
285  lea         second_predq, [second_predq+mmsize*4]
286%endif
287  mova                  m5, [srcq]
288  psubusw               m5, m1
289  psubusw               m1, [srcq]
290  por                   m1, m5
291  mova                  m5, [srcq+16]
292  psubusw               m5, m2
293  psubusw               m2, [srcq+16]
294  por                   m2, m5
295  mova                  m5, [srcq+src_strideq*2]
296  psubusw               m5, m3
297  psubusw               m3, [srcq+src_strideq*2]
298  por                   m3, m5
299  mova                  m5, [srcq+src_strideq*2+16]
300  psubusw               m5, m4
301  psubusw               m4, [srcq+src_strideq*2+16]
302  por                   m4, m5
303  paddw                 m1, m2
304  paddw                 m3, m4
305  movhlps               m2, m1
306  movhlps               m4, m3
307  paddw                 m1, m2
308  paddw                 m3, m4
309  punpcklwd             m1, m6
310  punpcklwd             m3, m6
311  lea                 refq, [refq+ref_strideq*4]
312  paddd                 m0, m1
313  lea                 srcq, [srcq+src_strideq*4]
314  paddd                 m0, m3
315  dec              n_rowsd
316  jg .loop
317
318  movhlps               m1, m0
319  paddd                 m0, m1
320  punpckldq             m0, m6
321  movhlps               m1, m0
322  paddd                 m0, m1
323%if %2 == 2  ; we skipped rows, so we need to double the sad
324  pslld                 m0, 1
325%endif
326  movd                 eax, m0
327  RET
328%endmacro
329
330INIT_XMM sse2
331HIGH_SAD16XN 32 ; highbd_sad16x32_sse2
332HIGH_SAD16XN 16 ; highbd_sad16x16_sse2
333HIGH_SAD16XN  8 ; highbd_sad16x8_sse2
334HIGH_SAD16XN 32, 1 ; highbd_sad16x32_avg_sse2
335HIGH_SAD16XN 16, 1 ; highbd_sad16x16_avg_sse2
336HIGH_SAD16XN  8, 1 ; highbd_sad16x8_avg_sse2
337HIGH_SAD16XN 32, 2 ; highbd_sad_skip_16x32_sse2
338HIGH_SAD16XN 16, 2 ; highbd_sad_skip_16x16_sse2
339HIGH_SAD16XN  8, 2 ; highbd_sad_skip_16x8_sse2
340
341; unsigned int vpx_highbd_sad8x{4,8,16}_sse2(uint8_t *src, int src_stride,
342;                                    uint8_t *ref, int ref_stride);
343%macro HIGH_SAD8XN 1-2 0
344  HIGH_SAD_FN 8, %1, 7, %2
345%if %2 == 2  ; skip rows, so divide number of rows by 2
346  mov              n_rowsd, %1/8
347%else
348  mov              n_rowsd, %1/4
349%endif
350  pxor                  m0, m0
351  pxor                  m6, m6
352
353.loop:
354  movu                  m1, [refq]
355  movu                  m2, [refq+ref_strideq*2]
356  movu                  m3, [refq+ref_strideq*4]
357  movu                  m4, [refq+ref_stride3q*2]
358%if %2 == 1
359  pavgw                 m1, [second_predq+mmsize*0]
360  pavgw                 m2, [second_predq+mmsize*1]
361  pavgw                 m3, [second_predq+mmsize*2]
362  pavgw                 m4, [second_predq+mmsize*3]
363  lea         second_predq, [second_predq+mmsize*4]
364%endif
365  mova                  m5, [srcq]
366  psubusw               m5, m1
367  psubusw               m1, [srcq]
368  por                   m1, m5
369  mova                  m5, [srcq+src_strideq*2]
370  psubusw               m5, m2
371  psubusw               m2, [srcq+src_strideq*2]
372  por                   m2, m5
373  mova                  m5, [srcq+src_strideq*4]
374  psubusw               m5, m3
375  psubusw               m3, [srcq+src_strideq*4]
376  por                   m3, m5
377  mova                  m5, [srcq+src_stride3q*2]
378  psubusw               m5, m4
379  psubusw               m4, [srcq+src_stride3q*2]
380  por                   m4, m5
381  paddw                 m1, m2
382  paddw                 m3, m4
383  movhlps               m2, m1
384  movhlps               m4, m3
385  paddw                 m1, m2
386  paddw                 m3, m4
387  punpcklwd             m1, m6
388  punpcklwd             m3, m6
389  lea                 refq, [refq+ref_strideq*8]
390  paddd                 m0, m1
391  lea                 srcq, [srcq+src_strideq*8]
392  paddd                 m0, m3
393  dec              n_rowsd
394  jg .loop
395
396  movhlps               m1, m0
397  paddd                 m0, m1
398  punpckldq             m0, m6
399  movhlps               m1, m0
400  paddd                 m0, m1
401%if %2 == 2  ; we skipped rows, so we need to double the sad
402  pslld                 m0, 1
403%endif
404  movd                 eax, m0
405  RET
406%endmacro
407
408INIT_XMM sse2
409HIGH_SAD8XN 16 ; highbd_sad8x16_sse2
410HIGH_SAD8XN  8 ; highbd_sad8x8_sse2
411HIGH_SAD8XN  4 ; highbd_sad8x4_sse2
412HIGH_SAD8XN 16, 1 ; highbd_sad8x16_avg_sse2
413HIGH_SAD8XN  8, 1 ; highbd_sad8x8_avg_sse2
414HIGH_SAD8XN  4, 1 ; highbd_sad8x4_avg_sse2
415HIGH_SAD8XN 16, 2 ; highbd_sad_skip_8x16_sse2
416HIGH_SAD8XN  8, 2 ; highbd_sad_skip_8x8_sse2
417