; xref: /aosp_15_r20/external/libaom/aom_dsp/x86/highbd_sad4d_sse2.asm (revision 77c1e3ccc04c968bd2bc212e87364f250e820521)
;
; Copyright (c) 2016, Alliance for Open Media. All rights reserved.
;
; This source code is subject to the terms of the BSD 2 Clause License and
; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
; was not distributed with this source code in the LICENSE file, you can
; obtain it at www.aomedia.org/license/software. If the Alliance for Open
; Media Patent License 1.0 was not distributed with this source code in the
; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
;

;

; x86inc supplies the abstraction layer used throughout this file: cglobal,
; INIT_XMM, the mN / <name>q / <name>d register aliases, mova/movu/movh, and
; the movifnidn / movsxdifnidn helpers.
%include "third_party/x86inc/x86inc.asm"

SECTION .text

; HIGH_PROCESS_4x2x4 first, off_{first,second}_{src,ref}, advance_at_end
;
; Accumulates the sum of absolute differences of two 4-sample rows of 16-bit
; pixels between src and each of the four references.
;
; Arguments:
;   %1: 1 => first invocation: m4..m7 are overwritten with this call's sums;
;       anything else => this call's sums are added to m4..m7
;   %2: offset of the first row in src   (in 16-bit samples)
;   %3: offset of the first row in refN  (in 16-bit samples)
;   %4: offset of the second row in src  (in 16-bit samples)
;   %5: offset of the second row in refN (in 16-bit samples)
;   %6: optional, default 0; 1 => advance srcq and ref1q..ref4q by
;       stride*4 bytes at the end
;
; Register contract:
;   m0      source: first row in the low qword, second row in the high qword
;   m1      in: must hold 0x0001 in every word, so that pmaddwd adds adjacent
;           16-bit differences into 32-bit lanes
;   m2, m3  scratch
;   m4..m7  dword accumulators for ref1..ref4
;
; |a - b| is formed without a compare: psubusw saturates negative results to
; zero, so (a -sat b) | (b -sat a) leaves exactly the absolute difference.
;
; Rows are only 4 samples (8 bytes) wide, so every row is fetched with an
; 8-byte load (movh for the low half, movhps for the high half); nothing past
; the 4 samples of a row is read.
%macro HIGH_PROCESS_4x2x4 5-6 0
  movh                  m0, [srcq +(%2)*2]
%if %1 == 1
  movh                  m4, [ref1q+(%3)*2]
  movh                  m5, [ref2q+(%3)*2]
  movh                  m6, [ref3q+(%3)*2]
  movh                  m7, [ref4q+(%3)*2]
  movhps                m0, [srcq +(%4)*2]
  movhps                m4, [ref1q+(%5)*2]
  movhps                m5, [ref2q+(%5)*2]
  movhps                m6, [ref3q+(%5)*2]
  movhps                m7, [ref4q+(%5)*2]
  ; ref1 / ref2
  mova                  m3, m0
  mova                  m2, m0
  psubusw               m3, m4
  psubusw               m2, m5
  psubusw               m4, m0
  psubusw               m5, m0
  por                   m4, m3
  por                   m5, m2
  pmaddwd               m4, m1
  pmaddwd               m5, m1
  ; ref3 / ref4
  mova                  m3, m0
  mova                  m2, m0
  psubusw               m3, m6
  psubusw               m2, m7
  psubusw               m6, m0
  psubusw               m7, m0
  por                   m6, m3
  por                   m7, m2
  pmaddwd               m6, m1
  pmaddwd               m7, m1
%else
  ; ref1
  movh                  m2, [ref1q+(%3)*2]
  movhps                m0, [srcq +(%4)*2]
  movhps                m2, [ref1q+(%5)*2]
  mova                  m3, m0
  psubusw               m3, m2
  psubusw               m2, m0
  por                   m2, m3
  pmaddwd               m2, m1
  paddd                 m4, m2

  ; ref2
  movh                  m2, [ref2q+(%3)*2]
  mova                  m3, m0
  movhps                m2, [ref2q+(%5)*2]
  psubusw               m3, m2
  psubusw               m2, m0
  por                   m2, m3
  pmaddwd               m2, m1
  paddd                 m5, m2

  ; ref3
  movh                  m2, [ref3q+(%3)*2]
  mova                  m3, m0
  movhps                m2, [ref3q+(%5)*2]
  psubusw               m3, m2
  psubusw               m2, m0
  por                   m2, m3
  pmaddwd               m2, m1
  paddd                 m6, m2

  ; ref4
  movh                  m2, [ref4q+(%3)*2]
  mova                  m3, m0
  movhps                m2, [ref4q+(%5)*2]
  psubusw               m3, m2
  psubusw               m2, m0
  por                   m2, m3
  pmaddwd               m2, m1
  paddd                 m7, m2
%endif
%if %6 == 1
  ; stride (in samples) * 2 bytes * 2 rows
  lea                 srcq, [srcq +src_strideq*4]
  lea                ref1q, [ref1q+ref_strideq*4]
  lea                ref2q, [ref2q+ref_strideq*4]
  lea                ref3q, [ref3q+ref_strideq*4]
  lea                ref4q, [ref4q+ref_strideq*4]
%endif
%endmacro

; HIGH_PROCESS_8x2x4 first, off_{first,second}_{src,ref}, advance_at_end
;
; Accumulates the sum of absolute differences of two groups of 8 16-bit
; samples between src and each of the four references. The two groups are
; located by the (first, second) offset pairs; the wider macros reuse this
; one with the "second" offsets pointing at the next 8 samples of the same
; row instead of at the next row.
;
; Arguments:
;   %1: 1 => first invocation: m4..m7 are overwritten by the first group's
;       sums; anything else => the first group's sums are added to m4..m7.
;       The second group is always added.
;   %2: offset of the first group in src   (in 16-bit samples)
;   %3: offset of the first group in refN  (in 16-bit samples)
;   %4: offset of the second group in src  (in 16-bit samples)
;   %5: offset of the second group in refN (in 16-bit samples)
;   %6: optional, default 0; 1 => advance srcq and ref1q..ref4q by
;       stride*4 bytes
;
; Register contract:
;   m0      current 8 source samples
;   m1      in: must hold 0x0001 in every word (pmaddwd then sums adjacent
;           16-bit differences into 32-bit lanes)
;   m2, m3  scratch
;   m4..m7  dword accumulators for ref1..ref4
;
; src is fetched with mova (aligned load), so src addresses must be 16-byte
; aligned at both offsets; the references are fetched with movu.
%macro HIGH_PROCESS_8x2x4 5-6 0
  ; 1st 8 px
  mova                  m0, [srcq +(%2)*2]
%if %1 == 1
  movu                  m4, [ref1q+(%3)*2]
  movu                  m5, [ref2q+(%3)*2]
  movu                  m6, [ref3q+(%3)*2]
  movu                  m7, [ref4q+(%3)*2]
  ; |src - ref| = (src -sat ref) | (ref -sat src)
  mova                  m3, m0
  mova                  m2, m0
  psubusw               m3, m4
  psubusw               m2, m5
  psubusw               m4, m0
  psubusw               m5, m0
  por                   m4, m3
  por                   m5, m2
  pmaddwd               m4, m1
  pmaddwd               m5, m1
  mova                  m3, m0
  mova                  m2, m0
  psubusw               m3, m6
  psubusw               m2, m7
  psubusw               m6, m0
  psubusw               m7, m0
  por                   m6, m3
  por                   m7, m2
  pmaddwd               m6, m1
  pmaddwd               m7, m1
%else
  mova                  m3, m0
  movu                  m2, [ref1q+(%3)*2]
  psubusw               m3, m2
  psubusw               m2, m0
  por                   m2, m3
  mova                  m3, m0
  pmaddwd               m2, m1
  paddd                 m4, m2
  movu                  m2, [ref2q+(%3)*2]
  psubusw               m3, m2
  psubusw               m2, m0
  por                   m2, m3
  mova                  m3, m0
  pmaddwd               m2, m1
  paddd                 m5, m2
  movu                  m2, [ref3q+(%3)*2]
  psubusw               m3, m2
  psubusw               m2, m0
  por                   m2, m3
  mova                  m3, m0
  pmaddwd               m2, m1
  paddd                 m6, m2
  movu                  m2, [ref4q+(%3)*2]
  psubusw               m3, m2
  psubusw               m2, m0
  por                   m2, m3
  pmaddwd               m2, m1
  paddd                 m7, m2
%endif

  ; 2nd 8 px
  mova                  m0, [srcq +(%4)*2]
  mova                  m3, m0
  movu                  m2, [ref1q+(%5)*2]
  psubusw               m3, m2
  psubusw               m2, m0
  por                   m2, m3
  mova                  m3, m0
  pmaddwd               m2, m1
  paddd                 m4, m2
  movu                  m2, [ref2q+(%5)*2]
  psubusw               m3, m2
  psubusw               m2, m0
  por                   m2, m3
  mova                  m3, m0
  pmaddwd               m2, m1
  paddd                 m5, m2
  movu                  m2, [ref3q+(%5)*2]
  psubusw               m3, m2
  psubusw               m2, m0
  por                   m2, m3
  mova                  m3, m0
  pmaddwd               m2, m1
  paddd                 m6, m2
  movu                  m2, [ref4q+(%5)*2]
  psubusw               m3, m2
  psubusw               m2, m0
%if %6 == 1
  ; All loads for this invocation are done, so the pointers can move on
  ; before the last accumulate; lea changes neither flags nor SIMD state.
  lea                 srcq, [srcq +src_strideq*4]
  lea                ref1q, [ref1q+ref_strideq*4]
  lea                ref2q, [ref2q+ref_strideq*4]
  lea                ref3q, [ref3q+ref_strideq*4]
  lea                ref4q, [ref4q+ref_strideq*4]
%endif
  por                   m2, m3
  pmaddwd               m2, m1
  paddd                 m7, m2
%endmacro

; HIGH_PROCESS_16x2x4 first, off_{first,second}_{src,ref}, advance_at_end
;
; Two 16-sample rows, handled as one HIGH_PROCESS_8x2x4 per row: each
; invocation covers samples [off, off+8) and [off+8, off+16) of its row.
; Only the first invocation can be the accumulator-initialising one (%1) and
; only the second one forwards the pointer-advance flag (%6).
%macro HIGH_PROCESS_16x2x4 5-6 0
  HIGH_PROCESS_8x2x4 %1, %2, %3, (%2 + 8), (%3 + 8)
  HIGH_PROCESS_8x2x4  0, %4, %5, (%4 + 8), (%5 + 8), %6
%endmacro

; HIGH_PROCESS_32x2x4 first, off_{first,second}_{src,ref}, advance_at_end
;
; Two 32-sample rows, handled as one HIGH_PROCESS_16x2x4 per row: each
; invocation covers samples [off, off+16) and [off+16, off+32) of its row.
; Only the first invocation can be the accumulator-initialising one (%1) and
; only the second one forwards the pointer-advance flag (%6).
%macro HIGH_PROCESS_32x2x4 5-6 0
  HIGH_PROCESS_16x2x4 %1, %2, %3, (%2 + 16), (%3 + 16)
  HIGH_PROCESS_16x2x4  0, %4, %5, (%4 + 16), (%5 + 16), %6
%endmacro

; HIGH_PROCESS_64x2x4 first, off_{first,second}_{src,ref}, advance_at_end
;
; Two 64-sample rows, handled as one HIGH_PROCESS_32x2x4 per row: each
; invocation covers samples [off, off+32) and [off+32, off+64) of its row.
; Only the first invocation can be the accumulator-initialising one (%1) and
; only the second one forwards the pointer-advance flag (%6).
%macro HIGH_PROCESS_64x2x4 5-6 0
  HIGH_PROCESS_32x2x4 %1, %2, %3, (%2 + 32), (%3 + 32)
  HIGH_PROCESS_32x2x4  0, %4, %5, (%4 + 32), (%5 + 32), %6
%endmacro

; void aom_highbd_sadNxNx4d_sse2(uint8_t *src,    int src_stride,
;                         uint8_t *ref[4], int ref_stride,
;                         uint32_t res[4]);
;
; Emits one function that computes the SAD of a WxH block of 16-bit samples
; against four reference blocks at once and stores the four 32-bit sums to
; res[0..3].
;
; Macro Arguments:
;   1: Width  (selects HIGH_PROCESS_<Width>x2x4)
;   2: Height
;   3: If 0, then normal sad, if 2, then skip every other row
;      (optional, default 0)
;
; In skip mode both strides are doubled, half as many rows are summed and the
; final sums are doubled. Height must be at least 4 (normal) or 8 (skip),
; otherwise the repeat count below becomes negative.
%macro HIGH_SADNXN4D 2-3 0
%if %3 == 0  ; normal sad
%if AOM_ARCH_X86_64
cglobal highbd_sad%1x%2x4d, 5, 8, 8, src, src_stride, ref1, ref_stride, \
                              res, ref2, ref3, ref4
%else
cglobal highbd_sad%1x%2x4d, 4, 7, 8, src, src_stride, ref1, ref_stride, \
                              ref2, ref3, ref4
%endif  ; AOM_ARCH_X86_64
%else  ; %3 == 2, downsample
%if AOM_ARCH_X86_64
cglobal highbd_sad_skip_%1x%2x4d, 5, 8, 8, src, src_stride, ref1, ref_stride, \
                              res, ref2, ref3, ref4
%else
cglobal highbd_sad_skip_%1x%2x4d, 4, 7, 8, src, src_stride, ref1, ref_stride, \
                              ref2, ref3, ref4
%endif  ; AOM_ARCH_X86_64
%endif  ; sad/skip

; set m1 = 0x0001 in every word: all-ones, then keep only bit 15 of each word
; shifted down to bit 0. pmaddwd against this adds adjacent words into dwords.
  pcmpeqw               m1, m1
  psrlw                 m1, 15

%if %3 == 2  ; skip rows
  ; double the 32-bit strides before they are sign-extended
  lea          src_strided, [2*src_strided]
  lea          ref_strided, [2*ref_strided]
%endif  ; skip rows
  movsxdifnidn src_strideq, src_strided
  movsxdifnidn ref_strideq, ref_strided
  ; ref1q arrives holding the ref[4] pointer array; it is the base for the
  ; other three loads, so it is overwritten last.
  mov                ref2q, [ref1q+gprsize*1]
  mov                ref3q, [ref1q+gprsize*2]
  mov                ref4q, [ref1q+gprsize*3]
  mov                ref1q, [ref1q+gprsize*0]

; convert byte pointers to short pointers
; NOTE(review): presumably the callers pass uint16_t addresses shifted right
; by one (libaom's byte-pointer convention) - confirm against the C callers.
  shl                 srcq, 1
  shl                ref2q, 1
  shl                ref3q, 1
  shl                ref4q, 1
  shl                ref1q, 1

  ; Each HIGH_PROCESS call consumes two rows: the first call initialises the
  ; accumulators, the last one does not advance the pointers, and num_rep
  ; calls in between cover the remaining rows.
  HIGH_PROCESS_%1x2x4 1, 0, 0, src_strideq, ref_strideq, 1
%if %3 == 2  ;  Downsampling by two
%define num_rep (%2-8)/4
%else
%define num_rep (%2-4)/2
%endif
%rep num_rep
  HIGH_PROCESS_%1x2x4 0, 0, 0, src_strideq, ref_strideq, 1
%endrep
%undef num_rep
  HIGH_PROCESS_%1x2x4 0, 0, 0, src_strideq, ref_strideq, 0
  ; N.B. HIGH_PROCESS outputs dwords (32 bits)
  ; so in high bit depth even the smallest width (4) needs 128bits i.e. XMM
  ;
  ; Horizontal reduction. m4..m7 each hold four dword partial sums
  ; {d0,d1,d2,d3} for ref1..ref4.
  ; Step 1: fold the high qword onto the low one -> low qword = {d0+d2, d1+d3}
  movhlps               m0, m4
  movhlps               m1, m5
  movhlps               m2, m6
  movhlps               m3, m7
  paddd                 m4, m0
  paddd                 m5, m1
  paddd                 m6, m2
  paddd                 m7, m3
  ; Step 2: interleave pairs -> m4 = {r1a, r2a, r1b, r2b}, m6 likewise for
  ; ref3/ref4, then fold again so the low qwords are {sad1, sad2} and
  ; {sad3, sad4}.
  punpckldq             m4, m5
  punpckldq             m6, m7
  movhlps               m0, m4
  movhlps               m1, m6
  paddd                 m4, m0
  paddd                 m6, m1
  ; Step 3: m4 = {sad1, sad2, sad3, sad4}
  punpcklqdq            m4, m6
%if %3 == 2  ; skip rows
  ; only every other row was summed, so double the totals
  pslld                 m4, 1
%endif
  ; res is the 5th argument: already in r4 when five arguments were loaded
  ; into registers, otherwise fetched from its argument slot here.
  movifnidn             r4, r4mp
  movu                [r4], m4
  RET
%endmacro


; Instantiate every supported block size for SSE2 (128-bit XMM registers).
INIT_XMM sse2
; Normal SAD: every row is summed.
HIGH_SADNXN4D 64, 64
HIGH_SADNXN4D 64, 32
HIGH_SADNXN4D 32, 64
HIGH_SADNXN4D 32, 32
HIGH_SADNXN4D 32, 16
HIGH_SADNXN4D 16, 32
HIGH_SADNXN4D 16, 16
HIGH_SADNXN4D 16,  8
HIGH_SADNXN4D  8, 16
HIGH_SADNXN4D  8,  8
HIGH_SADNXN4D  8,  4
HIGH_SADNXN4D  4,  8
HIGH_SADNXN4D  4,  4
HIGH_SADNXN4D  4, 16
HIGH_SADNXN4D 16,  4
HIGH_SADNXN4D  8, 32
HIGH_SADNXN4D 32,  8
HIGH_SADNXN4D 16, 64
HIGH_SADNXN4D 64, 16

; Skip variants: every other row is summed and the result is doubled.
HIGH_SADNXN4D 64, 64, 2
HIGH_SADNXN4D 64, 32, 2
HIGH_SADNXN4D 32, 64, 2
HIGH_SADNXN4D 32, 32, 2
HIGH_SADNXN4D 32, 16, 2
HIGH_SADNXN4D 16, 32, 2
HIGH_SADNXN4D 16, 16, 2
HIGH_SADNXN4D 16,  8, 2
HIGH_SADNXN4D  8, 16, 2
HIGH_SADNXN4D  8,  8, 2
HIGH_SADNXN4D  4,  8, 2
HIGH_SADNXN4D  4, 16, 2
HIGH_SADNXN4D  8, 32, 2
HIGH_SADNXN4D 32,  8, 2
HIGH_SADNXN4D 16, 64, 2
HIGH_SADNXN4D 64, 16, 2

; Current code cannot handle the case when the height is downsampled to 2:
; the macro always issues a first and a last two-row step (four rows), and
; its repeat count (Height-8)/4 would be negative for Height == 4.
; HIGH_SADNXN4D 16,  4, 2
; HIGH_SADNXN4D  8,  4, 2
; HIGH_SADNXN4D  4,  4, 2
