;
; Copyright (c) 2016, Alliance for Open Media. All rights reserved.
;
; This source code is subject to the terms of the BSD 2 Clause License and
; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
; was not distributed with this source code in the LICENSE file, you can
; obtain it at www.aomedia.org/license/software. If the Alliance for Open
; Media Patent License 1.0 was not distributed with this source code in the
; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
;

;

%include "third_party/x86inc/x86inc.asm"

SECTION .text

; 'spill_src_stride' has a large effect on how the code works.
;
; When 'spill_src_stride' is false, 'src_strideq' resides in a
; register, so the form [srcq + src_strideq + offset] is allowed and we
; can use it to access src memory without updating 'srcq' at each
; line; 'srcq' is only advanced once per two lines, with a compact
; LEA instruction like [srcq+src_strideq*2].
;
; When 'spill_src_stride' is true, 'src_strideq' resides in memory and
; cannot appear in such an addressing expression, so 'srcq' has to be
; updated at each line break instead. Since each macro processes two
; parts (first, second) together and the second part may sit on the
; next line, one 'src_strideq' may also need to be added to 'srcq'
; before the second part is processed.
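;
; A rough sketch of the two access patterns (illustrative only; the
; exact sequences are in the macros below):
;
;   ; spill_src_stride == 0: stride lives in a register
;   movd m1, [srcq + src_strideq]      ; address the second line directly
;   lea  srcq, [srcq+src_strideq*2]    ; advance two lines in one step
;
;   ; spill_src_stride == 1: stride lives in memory
;   add  srcq, src_strideq             ; step to the second line first
;   movd m1, [srcq]                    ; plain address, no stride term
;   add  srcq, src_strideq             ; step again at the line end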

%macro HANDLE_SECOND_OFFSET 0
  %if spill_src_stride
    %define second_offset 0
    add srcq, src_strideq
  %else
    %define second_offset (src_strideq)
  %endif
%endmacro

; This is specifically designed to handle the case where src_strideq
; is a memory location: complex address calculation with LEA is not
; possible then, so we fall back to a simple ADD instruction at the
; end of each line.
%macro ADVANCE_END_OF_TWO_LINES 0
  %if spill_src_stride
    add srcq, src_strideq
  %else
    lea                 srcq, [srcq+src_strideq*2]
  %endif

; note: ref_stride is never spilled when processing two lines
  lea                ref1q, [ref1q+ref_strideq*2]
  lea                ref2q, [ref2q+ref_strideq*2]
  lea                ref3q, [ref3q+ref_strideq*2]
  lea                ref4q, [ref4q+ref_strideq*2]
%endmacro

; PROCESS_4x2x4 first
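;
; Each PROCESS_Wx2x4 macro computes SADs for two source rows against
; all four references; with its 'first' argument nonzero it seeds the
; accumulator registers, otherwise it adds into the running sums.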
%macro PROCESS_4x2x4 1
  movd                  m0, [srcq]
  HANDLE_SECOND_OFFSET
%if %1 == 1
  movd                  m6, [ref1q]
  movd                  m4, [ref2q]
  movd                  m7, [ref3q]
  movd                  m5, [ref4q]

  movd                  m1, [srcq + second_offset]
  movd                  m2, [ref1q+ref_strideq]
  punpckldq             m0, m1
  punpckldq             m6, m2
  movd                  m1, [ref2q+ref_strideq]
  movd                  m2, [ref3q+ref_strideq]
  movd                  m3, [ref4q+ref_strideq]
  punpckldq             m4, m1
  punpckldq             m7, m2
  punpckldq             m5, m3
  movlhps               m0, m0
  movlhps               m6, m4
  movlhps               m7, m5
  psadbw                m6, m0
  psadbw                m7, m0
%else
  movd                  m1, [ref1q]
  movd                  m5, [ref1q+ref_strideq]
  movd                  m2, [ref2q]
  movd                  m4, [ref2q+ref_strideq]
  punpckldq             m1, m5
  punpckldq             m2, m4
  movd                  m3, [ref3q]
  movd                  m5, [ref3q+ref_strideq]
  punpckldq             m3, m5
  movd                  m4, [ref4q]
  movd                  m5, [ref4q+ref_strideq]
  punpckldq             m4, m5
  movd                  m5, [srcq + second_offset]
  punpckldq             m0, m5
  movlhps               m0, m0
  movlhps               m1, m2
  movlhps               m3, m4
  psadbw                m1, m0
  psadbw                m3, m0
  paddd                 m6, m1
  paddd                 m7, m3
%endif
%endmacro

; PROCESS_8x2x4 first
%macro PROCESS_8x2x4 1
  movh                  m0, [srcq]
  HANDLE_SECOND_OFFSET
%if %1 == 1
  movh                  m4, [ref1q]
  movh                  m5, [ref2q]
  movh                  m6, [ref3q]
  movh                  m7, [ref4q]
  movhps                m0, [srcq + second_offset]
  movhps                m4, [ref1q+ref_strideq]
  movhps                m5, [ref2q+ref_strideq]
  movhps                m6, [ref3q+ref_strideq]
  movhps                m7, [ref4q+ref_strideq]
  psadbw                m4, m0
  psadbw                m5, m0
  psadbw                m6, m0
  psadbw                m7, m0
%else
  movh                  m1, [ref1q]
  movh                  m2, [ref2q]
  movhps                m0, [srcq + second_offset]
  movhps                m1, [ref1q+ref_strideq]
  movhps                m2, [ref2q+ref_strideq]
  psadbw                m1, m0
  psadbw                m2, m0
  paddd                 m4, m1
  paddd                 m5, m2

  movh                  m1, [ref3q]
  movhps                m1, [ref3q+ref_strideq]
  movh                  m2, [ref4q]
  movhps                m2, [ref4q+ref_strideq]
  psadbw                m1, m0
  psadbw                m2, m0
  paddd                 m6, m1
  paddd                 m7, m2
%endif
%endmacro

; PROCESS_FIRST_MMSIZE
%macro PROCESS_FIRST_MMSIZE 0
  mova                  m0, [srcq]
  movu                  m4, [ref1q]
  movu                  m5, [ref2q]
  movu                  m6, [ref3q]
  movu                  m7, [ref4q]
  psadbw                m4, m0
  psadbw                m5, m0
  psadbw                m6, m0
  psadbw                m7, m0
%endmacro

; PROCESS_16x1x4 offset
%macro PROCESS_16x1x4 1
  mova                  m0, [srcq + %1]
  movu                  m1, [ref1q + ref_offsetq + %1]
  movu                  m2, [ref2q + ref_offsetq + %1]
  psadbw                m1, m0
  psadbw                m2, m0
  paddd                 m4, m1
  paddd                 m5, m2

  movu                  m1, [ref3q + ref_offsetq + %1]
  movu                  m2, [ref4q + ref_offsetq + %1]
  psadbw                m1, m0
  psadbw                m2, m0
  paddd                 m6, m1
  paddd                 m7, m2
%endmacro

; void aom_sadNxNx4d_sse2(uint8_t *src,    int src_stride,
;                         uint8_t *ref[4], int ref_stride,
;                         uint32_t res[4]);
; Macro Arguments:
;   1: Width
;   2: Height
;   3: If 0, then normal sad, else skip rows
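;
; In scalar terms, each function computes, for i in [0, 3]:
;
;   res[i] = sum over y < H, x < W of
;            |src[y*src_stride + x] - ref[i][y*ref_stride + x]|
;
; The skip-rows variants sum only the even rows (both strides are
; doubled in the prologue) and then double the result to approximate
; the full SAD.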
%macro SADNXN4D 2-3 0

%define spill_src_stride 0
%define spill_ref_stride 0
%define spill_cnt 0

; Whether a shared offset should be used instead of adding strides to
; each reference array. With this option, only one line will be processed
; per loop iteration.
%define use_ref_offset (%1 >= mmsize)
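; (With INIT_XMM sse2 at the bottom of this file, mmsize == 16, so any
; width of 16 or more takes the ref_offset path here.)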

; Remove loops in the 4x4 and 8x4 case
%define use_loop (use_ref_offset || %2 > 4)

%if %3 == 1  ; skip rows
%if AOM_ARCH_X86_64
%if use_ref_offset
cglobal sad_skip_%1x%2x4d, 5, 10, 8, src, src_stride, ref1, ref_stride, res, \
                                     ref2, ref3, ref4, cnt, ref_offset
%elif use_loop
cglobal sad_skip_%1x%2x4d, 5, 9, 8, src, src_stride, ref1, ref_stride, res, \
                                    ref2, ref3, ref4, cnt
%else
cglobal sad_skip_%1x%2x4d, 5, 8, 8, src, src_stride, ref1, ref_stride, res, \
                                    ref2, ref3, ref4
%endif
%else
%if use_ref_offset
cglobal sad_skip_%1x%2x4d, 4, 7, 8, src, ref_offset, ref1, cnt, ref2, ref3, \
                                    ref4
  %define spill_src_stride 1
  %define spill_ref_stride 1
%elif use_loop
cglobal sad_skip_%1x%2x4d, 4, 7, 8, src, cnt, ref1, ref_stride, ref2, \
                                    ref3, ref4
  %define spill_src_stride 1
%else
cglobal sad_skip_%1x%2x4d, 4, 7, 8, src, src_stride, ref1, ref_stride, ref2, \
                                    ref3, ref4
%endif
%endif
%else ; normal sad
%if AOM_ARCH_X86_64
%if use_ref_offset
cglobal sad%1x%2x4d, 5, 10, 8, src, src_stride, ref1, ref_stride, res, ref2, \
                               ref3, ref4, cnt, ref_offset
%elif use_loop
cglobal sad%1x%2x4d, 5, 9, 8, src, src_stride, ref1, ref_stride, res, ref2, \
                              ref3, ref4, cnt
%else
cglobal sad%1x%2x4d, 5, 8, 8, src, src_stride, ref1, ref_stride, res, ref2, \
                              ref3, ref4
%endif
%else
%if use_ref_offset
cglobal sad%1x%2x4d, 4, 7, 8, src, ref_offset, ref1, cnt, ref2, ref3, ref4
  %define spill_src_stride 1
  %define spill_ref_stride 1
%elif use_loop
cglobal sad%1x%2x4d, 4, 7, 8, src, cnt, ref1, ref_stride, ref2, ref3, ref4
  %define spill_src_stride 1
%else
cglobal sad%1x%2x4d, 4, 7, 8, src, src_stride, ref1, ref_stride, ref2, ref3, \
                              ref4
%endif
%endif
%endif

%if spill_src_stride
  %define src_strideq r1mp
  %define src_strided r1mp
%endif
%if spill_ref_stride
  %define ref_strideq r3mp
  %define ref_strided r3mp
%endif

%if spill_cnt
  SUB                  rsp, 4
  %define cntd word [rsp]
%endif

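; For the skip-rows variants, double both strides so each iteration
; steps over the odd rows; the sums are doubled again before the final
; store to compensate.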
%if %3 == 1
  sal          src_strided, 1
  sal          ref_strided, 1
%endif
  movsxdifnidn src_strideq, src_strided
  movsxdifnidn ref_strideq, ref_strided

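; Unpack the four reference pointers; on entry, ref1q holds the address
; of the ref[4] array itself.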
  mov                ref2q, [ref1q+gprsize*1]
  mov                ref3q, [ref1q+gprsize*2]
  mov                ref4q, [ref1q+gprsize*3]
  mov                ref1q, [ref1q+gprsize*0]

; Is the loop for this WxH in another function?
; If so, we jump into that function for the loop and return from there.
%define external_loop (use_ref_offset && %1 > mmsize && %1 != %2)
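;
; For example, with mmsize == 16, sad64x32x4d sets external_loop and
; emits no .loop of its own: after the first 16-column chunk of row 0
; it jumps into sad64x64x4d's .midloop and returns through that
; function's epilogue.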

%if use_ref_offset
  PROCESS_FIRST_MMSIZE
%if %1 > mmsize
  mov          ref_offsetq, 0
  mov                 cntd, %2 >> %3
; Jump part way into the loop for the square version of this width
%if %3 == 1
  jmp mangle(private_prefix %+ _sad_skip_%1x%1x4d %+ SUFFIX).midloop
%else
  jmp mangle(private_prefix %+ _sad%1x%1x4d %+ SUFFIX).midloop
%endif
%else
  mov          ref_offsetq, ref_strideq
  add                 srcq, src_strideq
  mov                 cntd, (%2 >> %3) - 1
%endif
%if external_loop == 0
.loop:
; Unrolled horizontal loop
%assign h_offset 0
%rep %1/mmsize
  PROCESS_16x1x4 h_offset
%if h_offset == 0
; The first row of the first column is done outside the loop and jumps here
.midloop:
%endif
%assign h_offset h_offset+mmsize
%endrep

  add                 srcq, src_strideq
  add          ref_offsetq, ref_strideq
  sub                 cntd, 1
  jnz .loop
%endif
%else
  PROCESS_%1x2x4 1
  ADVANCE_END_OF_TWO_LINES
%if use_loop
  mov                 cntd, (%2/2 >> %3) - 1
.loop:
%endif
  PROCESS_%1x2x4 0
%if use_loop
  ADVANCE_END_OF_TWO_LINES
  sub                 cntd, 1
  jnz .loop
%endif
%endif

%if spill_cnt
; Undo stack allocation for cnt
  ADD                  rsp, 4
%endif

%if external_loop == 0
%if %3 == 0
  %define resultq r4
  %define resultmp r4mp
%endif

; Undo modifications on parameters on the stack
%if %3 == 1
%if spill_src_stride
  shr          src_strided, 1
%endif
%if spill_ref_stride
  shr          ref_strided, 1
%endif
%endif

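; Reduce the accumulators to four 32-bit SADs (one per ref) and store
; them to res[4]; the skip-rows variants first double the sums to
; account for the halved row count.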
%if %1 > 4
  pslldq                m5, 4
  pslldq                m7, 4
  por                   m4, m5
  por                   m6, m7
  mova                  m5, m4
  mova                  m7, m6
  punpcklqdq            m4, m6
  punpckhqdq            m5, m7
  paddd                 m4, m5
%if %3 == 1
  pslld                 m4, 1
%endif
  movifnidn             resultq, resultmp
  movu                [resultq], m4
  RET
%else
  pshufd            m6, m6, 0x08
  pshufd            m7, m7, 0x08
%if %3 == 1
  pslld                 m6, 1
  pslld                 m7, 1
%endif
  movifnidn             resultq, resultmp
  movq              [resultq+0], m6
  movq              [resultq+8], m7
  RET
%endif
%endif ; external_loop == 0
%endmacro

INIT_XMM sse2
SADNXN4D 128, 128
SADNXN4D 128,  64
SADNXN4D  64, 128
SADNXN4D  64,  64
SADNXN4D  64,  32
SADNXN4D  32,  64
SADNXN4D  32,  32
SADNXN4D  32,  16
SADNXN4D  16,  32
SADNXN4D  16,  16
SADNXN4D  16,   8
SADNXN4D   8,  16
SADNXN4D   8,   8
SADNXN4D   8,   4
SADNXN4D   4,   8
SADNXN4D   4,   4
%if CONFIG_REALTIME_ONLY==0
SADNXN4D   4,  16
SADNXN4D  16,   4
SADNXN4D   8,  32
SADNXN4D  32,   8
SADNXN4D  16,  64
SADNXN4D  64,  16
%endif
SADNXN4D 128, 128, 1
SADNXN4D 128,  64, 1
SADNXN4D  64, 128, 1
SADNXN4D  64,  64, 1
SADNXN4D  64,  32, 1
SADNXN4D  32,  64, 1
SADNXN4D  32,  32, 1
SADNXN4D  32,  16, 1
SADNXN4D  16,  32, 1
SADNXN4D  16,  16, 1
SADNXN4D  16,   8, 1
SADNXN4D   8,  16, 1
SADNXN4D   8,   8, 1
SADNXN4D   4,   8, 1
%if CONFIG_REALTIME_ONLY==0
SADNXN4D   4,  16, 1
SADNXN4D   8,  32, 1
SADNXN4D  32,   8, 1
SADNXN4D  16,  64, 1
SADNXN4D  64,  16, 1
%endif

; Different assembly is needed when the height gets subsampled to 2
; SADNXN4D 16,  4, 1
; SADNXN4D  8,  4, 1
; SADNXN4D  4,  4, 1