; xref: /aosp_15_r20/external/libaom/aom_dsp/x86/highbd_sad_sse2.asm (revision 77c1e3ccc04c968bd2bc212e87364f250e820521)
;
; Copyright (c) 2016, Alliance for Open Media. All rights reserved.
;
; This source code is subject to the terms of the BSD 2 Clause License and
; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
; was not distributed with this source code in the LICENSE file, you can
; obtain it at www.aomedia.org/license/software. If the Alliance for Open
; Media Patent License 1.0 was not distributed with this source code in the
; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
;

;

; x86inc.asm supplies the cglobal/INIT_XMM/RET abstractions and the mN, rN
; register aliases used by every macro in this file.
%include "third_party/x86inc/x86inc.asm"

SECTION .text

; HIGH_SAD_FN: shared prologue for the high-bitdepth SAD kernels below.
; Emits the cglobal declaration for the requested variant, doubles both
; strides for the skip-rows variant, sign-extends the strides to pointer
; width, optionally precomputes 3 * stride, and shifts the incoming pointers
; left by one bit.
;
; Macro Arguments
; Arg 1: Width
; Arg 2: Height
; Arg 3: Register layout, 5 or 7. 5 declares src, src_stride, ref, ref_stride
;        (plus second_pred for avg) and n_rows. 7 additionally declares and
;        initializes src_stride3 and ref_stride3 (3 * stride).
; Arg 4: Type of function: if 0, normal sad; if 1, avg; if 2, skip rows
; Arg 5: Number of xmm registers (defaults to 7). 8xh needs 8, others only
;        need 7
%macro HIGH_SAD_FN 4-5 7
%if %4 == 0
%if %3 == 5
cglobal highbd_sad%1x%2, 4, %3, %5, src, src_stride, ref, ref_stride, n_rows
%else ; %3 == 7
cglobal highbd_sad%1x%2, 4, %3, %5, src, src_stride, ref, ref_stride, \
                            src_stride3, ref_stride3, n_rows
%endif ; %3 == 5/7
%elif %4 == 1 ; avg
%if %3 == 5
cglobal highbd_sad%1x%2_avg, 5, 1 + %3, %5, src, src_stride, ref, ref_stride, \
                                    second_pred, n_rows
%else ; %3 == 7
cglobal highbd_sad%1x%2_avg, 5, AOM_ARCH_X86_64 + %3, %5, src, src_stride, \
                                              ref, ref_stride, \
                                              second_pred, \
                                              src_stride3, ref_stride3
; No register name is declared for n_rows in this layout, so it is mapped by
; hand: r7d on x86-64, otherwise the r0m memory operand.
; NOTE(review): r0m is presumably the stack home of argument 0 (src), which
; is already held in a register at this point; confirm against x86inc.asm.
%if AOM_ARCH_X86_64
%define n_rowsd r7d
%else ; x86-32
%define n_rowsd dword r0m
%endif ; x86-32/64
%endif ; %3 == 5/7
%else  ; %4 == 2, skip rows
%if %3 == 5
cglobal highbd_sad_skip_%1x%2, 4, %3, %5, src, src_stride, ref, ref_stride, n_rows
%else ; %3 == 7
cglobal highbd_sad_skip_%1x%2, 4, %3, %5, src, src_stride, ref, ref_stride, \
                            src_stride3, ref_stride3, n_rows
%endif ; %3 == 5/7
%endif ; sad/avg/skip
%if %4 == 2  ; double the stride if we are skipping rows
  lea          src_strided, [src_strided*2]
  lea          ref_strided, [ref_strided*2]
%endif
  ; strides arrive as 32-bit ints; widen them before using them in addresses
  movsxdifnidn src_strideq, src_strided
  movsxdifnidn ref_strideq, ref_strided
%if %3 == 7
  lea         src_stride3q, [src_strideq*3]
  lea         ref_stride3q, [ref_strideq*3]
%endif ; %3 == 7
; convert src, ref & second_pred to short ptrs (from byte ptrs)
  shl                 srcq, 1
  shl                 refq, 1
%if %4 == 1
  shl         second_predq, 1
%endif
%endmacro

; unsigned int aom_highbd_sad64x{16,32,64}_sse2(uint8_t *src, int src_stride,
;                                    uint8_t *ref, int ref_stride);
;
; HIGH_SAD64XN height [, type]
;   Arg 1: Height in rows
;   Arg 2: Type, as for HIGH_SAD_FN: 0 = sad (default), 1 = avg, 2 = skip rows
;
; One loop iteration handles one 64-sample row (128 bytes) in two 64-byte
; halves. Register roles:
;   m0      four 32-bit partial sums
;   m6      zero, used to widen 16-bit sums to 32 bits
;   m1-m4   ref samples (averaged with second_pred for avg), then |src - ref|
;   m5      src scratch
; |a - b| is formed as (a -sat b) | (b -sat a) with unsigned saturating
; subtracts, since one of the two differences is always zero.
%macro HIGH_SAD64XN 1-2 0
  HIGH_SAD_FN 64, %1, 5, %2
%if %2 == 2  ; skip rows, so divide number of rows by 2
  mov              n_rowsd, %1/2
%else
  mov              n_rowsd, %1
%endif
  pxor                  m0, m0
  pxor                  m6, m6

.loop:
  ; first half of each row
  movu                  m1, [refq]
  movu                  m2, [refq+16]
  movu                  m3, [refq+32]
  movu                  m4, [refq+48]
%if %2 == 1
  pavgw                 m1, [second_predq+mmsize*0]
  pavgw                 m2, [second_predq+mmsize*1]
  pavgw                 m3, [second_predq+mmsize*2]
  pavgw                 m4, [second_predq+mmsize*3]
  lea         second_predq, [second_predq+mmsize*4]
%endif
  mova                  m5, [srcq]
  psubusw               m5, m1
  psubusw               m1, [srcq]
  por                   m1, m5
  mova                  m5, [srcq+16]
  psubusw               m5, m2
  psubusw               m2, [srcq+16]
  por                   m2, m5
  mova                  m5, [srcq+32]
  psubusw               m5, m3
  psubusw               m3, [srcq+32]
  por                   m3, m5
  mova                  m5, [srcq+48]
  psubusw               m5, m4
  psubusw               m4, [srcq+48]
  por                   m4, m5
  ; reduce in 16 bits first: pairwise add, then fold the high qword onto the
  ; low one, and only then widen the four low words to dwords
  paddw                 m1, m2
  paddw                 m3, m4
  movhlps               m2, m1
  movhlps               m4, m3
  paddw                 m1, m2
  paddw                 m3, m4
  punpcklwd             m1, m6
  punpcklwd             m3, m6
  paddd                 m0, m1
  paddd                 m0, m3
  ; second half of each row
  movu                  m1, [refq+64]
  movu                  m2, [refq+80]
  movu                  m3, [refq+96]
  movu                  m4, [refq+112]
%if %2 == 1
  pavgw                 m1, [second_predq+mmsize*0]
  pavgw                 m2, [second_predq+mmsize*1]
  pavgw                 m3, [second_predq+mmsize*2]
  pavgw                 m4, [second_predq+mmsize*3]
  lea         second_predq, [second_predq+mmsize*4]
%endif
  mova                  m5, [srcq+64]
  psubusw               m5, m1
  psubusw               m1, [srcq+64]
  por                   m1, m5
  mova                  m5, [srcq+80]
  psubusw               m5, m2
  psubusw               m2, [srcq+80]
  por                   m2, m5
  mova                  m5, [srcq+96]
  psubusw               m5, m3
  psubusw               m3, [srcq+96]
  por                   m3, m5
  mova                  m5, [srcq+112]
  psubusw               m5, m4
  psubusw               m4, [srcq+112]
  por                   m4, m5
  paddw                 m1, m2
  paddw                 m3, m4
  movhlps               m2, m1
  movhlps               m4, m3
  paddw                 m1, m2
  paddw                 m3, m4
  punpcklwd             m1, m6
  punpcklwd             m3, m6
  ; strides count 16-bit samples, hence the *2 when stepping byte addresses
  lea                 refq, [refq+ref_strideq*2]
  paddd                 m0, m1
  lea                 srcq, [srcq+src_strideq*2]
  paddd                 m0, m3

  dec              n_rowsd
  jg .loop

  ; horizontal add of the four dword partial sums into the low dword of m0
  movhlps               m1, m0
  paddd                 m0, m1
  punpckldq             m0, m6
  movhlps               m1, m0
  paddd                 m0, m1
%if %2 == 2  ; we skipped rows, so we need to double the sad
  pslld                 m0, 1
%endif
  movd                 eax, m0
  RET
%endmacro

; Instantiate the 64-wide kernels. The trailing comment on each line is the
; function name produced by cglobal, without the project symbol prefix.
INIT_XMM sse2
HIGH_SAD64XN 64 ; highbd_sad64x64_sse2
HIGH_SAD64XN 32 ; highbd_sad64x32_sse2
HIGH_SAD64XN 64, 1 ; highbd_sad64x64_avg_sse2
HIGH_SAD64XN 32, 1 ; highbd_sad64x32_avg_sse2
HIGH_SAD64XN 64, 2 ; highbd_sad_skip_64x64_sse2
HIGH_SAD64XN 32, 2 ; highbd_sad_skip_64x32_sse2
%if CONFIG_REALTIME_ONLY==0
HIGH_SAD64XN 16 ; highbd_sad64x16_sse2
HIGH_SAD64XN 16, 1 ; highbd_sad64x16_avg_sse2
HIGH_SAD64XN 16, 2 ; highbd_sad_skip_64x16_sse2
%endif

; unsigned int aom_highbd_sad32x{8,16,32,64}_sse2(uint8_t *src, int src_stride,
;                                    uint8_t *ref, int ref_stride);
;
; HIGH_SAD32XN height [, type]
;   Arg 1: Height in rows
;   Arg 2: Type, as for HIGH_SAD_FN: 0 = sad (default), 1 = avg, 2 = skip rows
;
; One loop iteration handles one 32-sample row (64 bytes).
; m0 holds four 32-bit partial sums, m6 stays zero for widening, m1-m4 carry
; ref samples and then the absolute differences, m5 is src scratch.
%macro HIGH_SAD32XN 1-2 0
  HIGH_SAD_FN 32, %1, 5, %2
%if %2 == 2  ; skip rows, so divide number of rows by 2
  mov              n_rowsd, %1/2
%else
  mov              n_rowsd, %1
%endif
  pxor                  m0, m0
  pxor                  m6, m6

.loop:
  movu                  m1, [refq]
  movu                  m2, [refq+16]
  movu                  m3, [refq+32]
  movu                  m4, [refq+48]
%if %2 == 1
  pavgw                 m1, [second_predq+mmsize*0]
  pavgw                 m2, [second_predq+mmsize*1]
  pavgw                 m3, [second_predq+mmsize*2]
  pavgw                 m4, [second_predq+mmsize*3]
  lea         second_predq, [second_predq+mmsize*4]
%endif
  ; |src - ref| = (src -sat ref) | (ref -sat src), one term is always zero
  mova                  m5, [srcq]
  psubusw               m5, m1
  psubusw               m1, [srcq]
  por                   m1, m5
  mova                  m5, [srcq+16]
  psubusw               m5, m2
  psubusw               m2, [srcq+16]
  por                   m2, m5
  mova                  m5, [srcq+32]
  psubusw               m5, m3
  psubusw               m3, [srcq+32]
  por                   m3, m5
  mova                  m5, [srcq+48]
  psubusw               m5, m4
  psubusw               m4, [srcq+48]
  por                   m4, m5
  ; 16-bit pairwise reduction, then widen the low four words to dwords
  paddw                 m1, m2
  paddw                 m3, m4
  movhlps               m2, m1
  movhlps               m4, m3
  paddw                 m1, m2
  paddw                 m3, m4
  punpcklwd             m1, m6
  punpcklwd             m3, m6
  ; strides count 16-bit samples, hence the *2 when stepping byte addresses
  lea                 refq, [refq+ref_strideq*2]
  paddd                 m0, m1
  lea                 srcq, [srcq+src_strideq*2]
  paddd                 m0, m3
  dec              n_rowsd
  jg .loop

  ; horizontal add of the four dword partial sums into the low dword of m0
  movhlps               m1, m0
  paddd                 m0, m1
  punpckldq             m0, m6
  movhlps               m1, m0
  paddd                 m0, m1
%if %2 == 2  ; we skipped rows, so we need to double the sad
  pslld                 m0, 1
%endif
  movd                 eax, m0
  RET
%endmacro

; Instantiate the 32-wide kernels. The trailing comment on each line is the
; function name produced by cglobal, without the project symbol prefix.
INIT_XMM sse2
HIGH_SAD32XN 64 ; highbd_sad32x64_sse2
HIGH_SAD32XN 32 ; highbd_sad32x32_sse2
HIGH_SAD32XN 16 ; highbd_sad32x16_sse2
HIGH_SAD32XN 64, 1 ; highbd_sad32x64_avg_sse2
HIGH_SAD32XN 32, 1 ; highbd_sad32x32_avg_sse2
HIGH_SAD32XN 16, 1 ; highbd_sad32x16_avg_sse2
HIGH_SAD32XN 64, 2 ; highbd_sad_skip_32x64_sse2
HIGH_SAD32XN 32, 2 ; highbd_sad_skip_32x32_sse2
HIGH_SAD32XN 16, 2 ; highbd_sad_skip_32x16_sse2
%if CONFIG_REALTIME_ONLY==0
HIGH_SAD32XN  8 ; highbd_sad32x8_sse2
HIGH_SAD32XN  8, 1 ; highbd_sad32x8_avg_sse2
HIGH_SAD32XN  8, 2 ; highbd_sad_skip_32x8_sse2
%endif

; unsigned int aom_highbd_sad16x{4,8,16,32,64}_sse2(uint8_t *src, int src_stride,
;                                    uint8_t *ref, int ref_stride);
;
; HIGH_SAD16XN height [, type]
;   Arg 1: Height in rows
;   Arg 2: Type, as for HIGH_SAD_FN: 0 = sad (default), 1 = avg, 2 = skip rows
;
; One loop iteration handles two 16-sample rows (2 x 32 bytes).
; m0 holds four 32-bit partial sums, m6 stays zero for widening, m1/m2 carry
; the first row and m3/m4 the second, m5 is src scratch.
%macro HIGH_SAD16XN 1-2 0
  HIGH_SAD_FN 16, %1, 5, %2
  ; two rows per iteration, so the trip count is height/2; the skip variant
  ; visits only every other row and therefore runs height/4 iterations
%if %2 == 2
  mov              n_rowsd, %1/4
%else
  mov              n_rowsd, %1/2
%endif
  pxor                  m0, m0
  pxor                  m6, m6

.loop:
  movu                  m1, [refq]
  movu                  m2, [refq+16]
  movu                  m3, [refq+ref_strideq*2]
  movu                  m4, [refq+ref_strideq*2+16]
%if %2 == 1
  ; second_pred is read as four consecutive vectors covering the two rows
  pavgw                 m1, [second_predq+mmsize*0]
  pavgw                 m2, [second_predq+mmsize*1]
  pavgw                 m3, [second_predq+mmsize*2]
  pavgw                 m4, [second_predq+mmsize*3]
  lea         second_predq, [second_predq+mmsize*4]
%endif
  ; |src - ref| = (src -sat ref) | (ref -sat src), one term is always zero
  mova                  m5, [srcq]
  psubusw               m5, m1
  psubusw               m1, [srcq]
  por                   m1, m5
  mova                  m5, [srcq+16]
  psubusw               m5, m2
  psubusw               m2, [srcq+16]
  por                   m2, m5
  mova                  m5, [srcq+src_strideq*2]
  psubusw               m5, m3
  psubusw               m3, [srcq+src_strideq*2]
  por                   m3, m5
  mova                  m5, [srcq+src_strideq*2+16]
  psubusw               m5, m4
  psubusw               m4, [srcq+src_strideq*2+16]
  por                   m4, m5
  ; 16-bit pairwise reduction, then widen the low four words to dwords
  paddw                 m1, m2
  paddw                 m3, m4
  movhlps               m2, m1
  movhlps               m4, m3
  paddw                 m1, m2
  paddw                 m3, m4
  punpcklwd             m1, m6
  punpcklwd             m3, m6
  ; advance two rows: 2 rows * stride * 2 bytes per sample
  lea                 refq, [refq+ref_strideq*4]
  paddd                 m0, m1
  lea                 srcq, [srcq+src_strideq*4]
  paddd                 m0, m3
  dec              n_rowsd
  jg .loop

  ; horizontal add of the four dword partial sums into the low dword of m0
  movhlps               m1, m0
  paddd                 m0, m1
  punpckldq             m0, m6
  movhlps               m1, m0
  paddd                 m0, m1
%if %2 == 2  ; we skipped rows, so we need to double the sad
  pslld                 m0, 1
%endif
  movd                 eax, m0
  RET
%endmacro

; Instantiate the 16-wide kernels. The trailing comment on each line is the
; function name produced by cglobal, without the project symbol prefix.
INIT_XMM sse2
HIGH_SAD16XN 32 ; highbd_sad16x32_sse2
HIGH_SAD16XN 16 ; highbd_sad16x16_sse2
HIGH_SAD16XN  8 ; highbd_sad16x8_sse2
HIGH_SAD16XN 32, 1 ; highbd_sad16x32_avg_sse2
HIGH_SAD16XN 16, 1 ; highbd_sad16x16_avg_sse2
HIGH_SAD16XN  8, 1 ; highbd_sad16x8_avg_sse2
HIGH_SAD16XN 32, 2 ; highbd_sad_skip_16x32_sse2
HIGH_SAD16XN 16, 2 ; highbd_sad_skip_16x16_sse2
HIGH_SAD16XN  8, 2 ; highbd_sad_skip_16x8_sse2
%if CONFIG_REALTIME_ONLY==0
HIGH_SAD16XN 64 ; highbd_sad16x64_sse2
HIGH_SAD16XN  4 ; highbd_sad16x4_sse2
HIGH_SAD16XN 64, 1 ; highbd_sad16x64_avg_sse2
HIGH_SAD16XN  4, 1 ; highbd_sad16x4_avg_sse2
HIGH_SAD16XN 64, 2 ; highbd_sad_skip_16x64_sse2
; The current code fails when there are only 2 rows left after skipping, so
; the skip variant is not instantiated for height 4.
; HIGH_SAD16XN  4, 2 ; highbd_sad_skip_16x4_sse2
%endif

; unsigned int aom_highbd_sad8x{4,8,16,32}_sse2(uint8_t *src, int src_stride,
;                                    uint8_t *ref, int ref_stride);
;
; HIGH_SAD8XN height [, type]
;   Arg 1: Height in rows
;   Arg 2: Type, as for HIGH_SAD_FN: 0 = sad (default), 1 = avg, 2 = skip rows
;
; One loop iteration handles four 8-sample rows (4 x 16 bytes), addressed as
; stride*0, stride*1, stride*2 and stride*3 from the current row.
; m0 holds four 32-bit partial sums, m6 stays zero for widening, m1-m4 carry
; one row each, m5 is src scratch and m7 keeps a copy of the ref row (this is
; why this kernel requests 8 xmm registers). src is read with unaligned loads
; here.
%macro HIGH_SAD8XN 1-2 0
  HIGH_SAD_FN 8, %1, 7, %2, 8
  ; four rows per iteration, so the trip count is height/4; the skip variant
  ; visits only every other row and therefore runs height/8 iterations
%if %2 == 2
%if %1 < 8
; height/8 would be 0, yet the loop body still executes once and would read
; four rows at doubled stride, i.e. past the end of the block
%error HIGH_SAD8XN skip-rows variant requires a height of at least 8
%endif
  mov              n_rowsd, %1/8
%else
  mov              n_rowsd, %1/4
%endif
  pxor                  m0, m0
  pxor                  m6, m6

.loop:
  movu                  m1, [refq]
  movu                  m2, [refq+ref_strideq*2]
  movu                  m3, [refq+ref_strideq*4]
  movu                  m4, [refq+ref_stride3q*2]
%if %2 == 1
  pavgw                 m1, [second_predq+mmsize*0]
  pavgw                 m2, [second_predq+mmsize*1]
  pavgw                 m3, [second_predq+mmsize*2]
  pavgw                 m4, [second_predq+mmsize*3]
  lea         second_predq, [second_predq+mmsize*4]
%endif
  ; row 0: |src - ref| = (ref -sat src) | (src -sat ref)
  mova                  m7, m1
  movu                  m5, [srcq]
  psubusw               m1, m5
  psubusw               m5, m7
  por                   m1, m5

  ; row 1
  mova                  m7, m2
  movu                  m5, [srcq+src_strideq*2]
  psubusw               m2, m5
  psubusw               m5, m7
  por                   m2, m5

  ; row 2
  mova                  m7, m3
  movu                  m5, [srcq+src_strideq*4]
  psubusw               m3, m5
  psubusw               m5, m7
  por                   m3, m5

  ; row 3
  mova                  m7, m4
  movu                  m5, [srcq+src_stride3q*2]
  psubusw               m4, m5
  psubusw               m5, m7
  por                   m4, m5

  ; 16-bit pairwise reduction, then widen the low four words to dwords
  paddw                 m1, m2
  paddw                 m3, m4
  movhlps               m2, m1
  movhlps               m4, m3
  paddw                 m1, m2
  paddw                 m3, m4
  punpcklwd             m1, m6
  punpcklwd             m3, m6
  ; advance four rows: 4 rows * stride * 2 bytes per sample
  lea                 refq, [refq+ref_strideq*8]
  paddd                 m0, m1
  lea                 srcq, [srcq+src_strideq*8]
  paddd                 m0, m3
  dec              n_rowsd
  jg .loop

  ; horizontal add of the four dword partial sums into the low dword of m0
  movhlps               m1, m0
  paddd                 m0, m1
  punpckldq             m0, m6
  movhlps               m1, m0
  paddd                 m0, m1
%if %2 == 2  ; we skipped rows, so we need to double the sad
  pslld                 m0, 1
%endif
  movd                 eax, m0
  RET
%endmacro

; Instantiate the 8-wide kernels. The trailing comment on each line is the
; function name produced by cglobal, without the project symbol prefix.
INIT_XMM sse2
HIGH_SAD8XN 16 ; highbd_sad8x16_sse2
HIGH_SAD8XN  8 ; highbd_sad8x8_sse2
HIGH_SAD8XN  4 ; highbd_sad8x4_sse2
HIGH_SAD8XN 16, 1 ; highbd_sad8x16_avg_sse2
HIGH_SAD8XN  8, 1 ; highbd_sad8x8_avg_sse2
HIGH_SAD8XN  4, 1 ; highbd_sad8x4_avg_sse2
HIGH_SAD8XN 16, 2 ; highbd_sad_skip_8x16_sse2
HIGH_SAD8XN  8, 2 ; highbd_sad_skip_8x8_sse2
; The current code fails when there are only 2 rows left after skipping, so
; the skip variant is not instantiated for height 4.
; HIGH_SAD8XN  4, 2 ; highbd_sad_skip_8x4_sse2
%if CONFIG_REALTIME_ONLY==0
HIGH_SAD8XN 32 ; highbd_sad8x32_sse2
HIGH_SAD8XN 32, 1 ; highbd_sad8x32_avg_sse2
HIGH_SAD8XN 32, 2 ; highbd_sad_skip_8x32_sse2
%endif

; unsigned int aom_highbd_sad4x{4,8,16}_sse2(uint8_t *src, int src_stride,
;                                    uint8_t *ref, int ref_stride);
;
; HIGH_SAD4XN height [, type]
;   Arg 1: Height in rows
;   Arg 2: Type, as for HIGH_SAD_FN: 0 = sad (default), 1 = avg, 2 = skip rows
;
; One loop iteration handles four 4-sample rows (4 x 8 bytes). Two rows are
; packed per register by word-interleaving: rows 0 and 2 share one register,
; rows 1 and 3 the other. ref, second_pred and src are all interleaved the
; same way, so matching samples end up in matching word lanes.
; m0 holds four 32-bit partial sums and m6 stays zero for widening.
%macro HIGH_SAD4XN 1-2 0
  HIGH_SAD_FN 4, %1, 7, %2
  ; four rows per iteration, so the trip count is height/4; the skip variant
  ; visits only every other row and therefore runs height/8 iterations
%if %2 == 2
%if %1 < 8
; height/8 would be 0, yet the loop body still executes once and would read
; four rows at doubled stride, i.e. past the end of the block
%error HIGH_SAD4XN skip-rows variant requires a height of at least 8
%endif
  mov              n_rowsd, %1/8
%else
  mov              n_rowsd, %1/4
%endif
  pxor                  m0, m0
  pxor                  m6, m6

.loop:
  movq                  m1, [refq]
  movq                  m2, [refq+ref_strideq*2]
  movq                  m3, [refq+ref_strideq*4]
  movq                  m4, [refq+ref_stride3q*2]
  punpcklwd             m1, m3   ; m1 = ref rows 0 and 2, interleaved
  punpcklwd             m2, m4   ; m2 = ref rows 1 and 3, interleaved
%if %2 == 1
  movq                  m3, [second_predq+8*0]
  movq                  m5, [second_predq+8*2]
  punpcklwd             m3, m5   ; pred rows 0 and 2, same layout as m1
  movq                  m4, [second_predq+8*1]
  movq                  m5, [second_predq+8*3]
  punpcklwd             m4, m5   ; pred rows 1 and 3, same layout as m2
  lea         second_predq, [second_predq+8*4]
  pavgw                 m1, m3
  pavgw                 m2, m4
%endif
  ; src rows 0 and 2 against m1
  movq                  m5, [srcq]
  movq                  m3, [srcq+src_strideq*4]
  punpcklwd             m5, m3
  movdqa                m3, m1
  psubusw               m1, m5
  psubusw               m5, m3
  por                   m1, m5
  ; src rows 1 and 3 against m2
  movq                  m5, [srcq+src_strideq*2]
  movq                  m4, [srcq+src_stride3q*2]
  punpcklwd             m5, m4
  movdqa                m4, m2
  psubusw               m2, m5
  psubusw               m5, m4
  por                   m2, m5
  ; all eight word lanes are live here, so widen both halves to dwords
  paddw                 m1, m2
  movdqa                m2, m1
  punpcklwd             m1, m6
  punpckhwd             m2, m6
  ; advance four rows: 4 rows * stride * 2 bytes per sample
  lea                 refq, [refq+ref_strideq*8]
  paddd                 m0, m1
  lea                 srcq, [srcq+src_strideq*8]
  paddd                 m0, m2
  dec              n_rowsd
  jg .loop

  ; horizontal add of the four dword partial sums into the low dword of m0
  movhlps               m1, m0
  paddd                 m0, m1
  punpckldq             m0, m6
  movhlps               m1, m0
  paddd                 m0, m1
%if %2 == 2  ; we skipped rows, so we need to double the sad
  pslld                 m0, 1
%endif
  movd                 eax, m0
  RET
%endmacro

; Instantiate the 4-wide kernels. The trailing comment on each line is the
; function name produced by cglobal, without the project symbol prefix.
INIT_XMM sse2
HIGH_SAD4XN  8 ; highbd_sad4x8_sse2
HIGH_SAD4XN  4 ; highbd_sad4x4_sse2
HIGH_SAD4XN  8, 1 ; highbd_sad4x8_avg_sse2
HIGH_SAD4XN  4, 1 ; highbd_sad4x4_avg_sse2
HIGH_SAD4XN  8, 2 ; highbd_sad_skip_4x8_sse2
; The current code fails when there are only 2 rows left after skipping, so
; the skip variant is not instantiated for height 4.
; HIGH_SAD4XN  4, 2 ; highbd_sad_skip_4x4_sse2
%if CONFIG_REALTIME_ONLY==0
HIGH_SAD4XN 16 ; highbd_sad4x16_sse2
HIGH_SAD4XN 16, 1 ; highbd_sad4x16_avg_sse2
HIGH_SAD4XN 16, 2 ; highbd_sad_skip_4x16_sse2
%endif
