; xref: /aosp_15_r20/external/libvpx/vpx_dsp/x86/highbd_sad4d_sse2.asm (revision fb1b10ab9aebc7c7068eedab379b749d7e3900be)
;
;  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;

%include "third_party/x86inc/x86inc.asm"

SECTION .text

; HIGH_PROCESS_4x2x4 first, off_{first,second}_{src,ref}, advance_at_end
;
; Accumulate the SAD of two 4-pixel (16-bit sample) rows of src against the
; same rows of four reference blocks (ref1q..ref4q).
;   %1 (first):  1 = initialize accumulators m4..m7, 0 = add into them
;   %2/%4:       sample offsets of the first/second src row
;   %3/%5:       sample offsets of the first/second ref row
;   %6:          1 = advance all five pointers by two rows at the end
; Offsets are in samples; the *2 scaling converts to bytes (16-bit samples).
; Requires m1 = packed words of 1 (set up by HIGH_SADNXN4D): pmaddwd by m1
; horizontally adds each pair of 16-bit absolute differences into 32-bit
; lanes, so overflow of the word-sized |diff| sums is avoided.
; |a-b| is computed branchlessly as psubusw(a,b) | psubusw(b,a): unsigned
; saturating subtraction zeroes the "wrong-direction" term.
%macro HIGH_PROCESS_4x2x4 5-6 0
  movh                  m0, [srcq +%2*2]      ; low 64 bits = 4 src samples, row 0
%if %1 == 1
  movu                  m4, [ref1q+%3*2]
  movu                  m5, [ref2q+%3*2]
  movu                  m6, [ref3q+%3*2]
  movu                  m7, [ref4q+%3*2]
  movhps                m0, [srcq +%4*2]      ; high 64 bits = 4 src samples, row 1
  movhps                m4, [ref1q+%5*2]
  movhps                m5, [ref2q+%5*2]
  movhps                m6, [ref3q+%5*2]
  movhps                m7, [ref4q+%5*2]
  ; |src - ref1| -> m4, |src - ref2| -> m5
  mova                  m3, m0
  mova                  m2, m0
  psubusw               m3, m4
  psubusw               m2, m5
  psubusw               m4, m0
  psubusw               m5, m0
  por                   m4, m3
  por                   m5, m2
  pmaddwd               m4, m1                ; pairwise-sum diffs into dwords
  pmaddwd               m5, m1
  ; |src - ref3| -> m6, |src - ref4| -> m7
  mova                  m3, m0
  mova                  m2, m0
  psubusw               m3, m6
  psubusw               m2, m7
  psubusw               m6, m0
  psubusw               m7, m0
  por                   m6, m3
  por                   m7, m2
  pmaddwd               m6, m1
  pmaddwd               m7, m1
%else
  ; Accumulating pass: one ref at a time through scratch m2/m3.
  movu                  m2, [ref1q+%3*2]
  movhps                m0, [srcq +%4*2]      ; complete src pair (row 1 in high half)
  movhps                m2, [ref1q+%5*2]
  mova                  m3, m0
  psubusw               m3, m2
  psubusw               m2, m0
  por                   m2, m3                ; m2 = |src - ref1|
  pmaddwd               m2, m1
  paddd                 m4, m2

  movu                  m2, [ref2q+%3*2]
  mova                  m3, m0
  movhps                m2, [ref2q+%5*2]
  psubusw               m3, m2
  psubusw               m2, m0
  por                   m2, m3                ; m2 = |src - ref2|
  pmaddwd               m2, m1
  paddd                 m5, m2

  movu                  m2, [ref3q+%3*2]
  mova                  m3, m0
  movhps                m2, [ref3q+%5*2]
  psubusw               m3, m2
  psubusw               m2, m0
  por                   m2, m3                ; m2 = |src - ref3|
  pmaddwd               m2, m1
  paddd                 m6, m2

  movu                  m2, [ref4q+%3*2]
  mova                  m3, m0
  movhps                m2, [ref4q+%5*2]
  psubusw               m3, m2
  psubusw               m2, m0
  por                   m2, m3                ; m2 = |src - ref4|
  pmaddwd               m2, m1
  paddd                 m7, m2
%endif
%if %6 == 1
  ; Advance two rows: stride is in samples, *2 for bytes, *2 for two rows.
  lea                 srcq, [srcq +src_strideq*4]
  lea                ref1q, [ref1q+ref_strideq*4]
  lea                ref2q, [ref2q+ref_strideq*4]
  lea                ref3q, [ref3q+ref_strideq*4]
  lea                ref4q, [ref4q+ref_strideq*4]
%endif
%endmacro
94
; PROCESS_8x2x4 first, off_{first,second}_{src,ref}, advance_at_end
;
; Accumulate the SAD of two 8-pixel (16-bit sample) groups of src against the
; same groups of four reference blocks (ref1q..ref4q). For the plain 8-wide
; kernels the two groups are two consecutive rows; the 16/32/64-wide wrappers
; reuse this macro with column offsets instead.
;   %1 (first):  1 = initialize accumulators m4..m7, 0 = add into them
;   %2/%4:       sample offsets of the first/second src group
;   %3/%5:       sample offsets of the first/second ref group
;   %6:          1 = advance all five pointers by two rows at the end
; Requires m1 = packed words of 1 (set up by HIGH_SADNXN4D); see
; HIGH_PROCESS_4x2x4 for the psubusw/por absolute-difference idiom.
%macro HIGH_PROCESS_8x2x4 5-6 0
  ; 1st 8 px
  mova                  m0, [srcq +%2*2]
%if %1 == 1
  ; Initializing pass: compute |src - refN| directly into m4..m7.
  movu                  m4, [ref1q+%3*2]
  movu                  m5, [ref2q+%3*2]
  movu                  m6, [ref3q+%3*2]
  movu                  m7, [ref4q+%3*2]
  mova                  m3, m0
  mova                  m2, m0
  psubusw               m3, m4
  psubusw               m2, m5
  psubusw               m4, m0
  psubusw               m5, m0
  por                   m4, m3
  por                   m5, m2
  pmaddwd               m4, m1                ; pairwise-sum diffs into dwords
  pmaddwd               m5, m1
  mova                  m3, m0
  mova                  m2, m0
  psubusw               m3, m6
  psubusw               m2, m7
  psubusw               m6, m0
  psubusw               m7, m0
  por                   m6, m3
  por                   m7, m2
  pmaddwd               m6, m1
  pmaddwd               m7, m1
%else
  ; Accumulating pass: one ref at a time through scratch m2/m3.
  mova                  m3, m0
  movu                  m2, [ref1q+%3*2]
  psubusw               m3, m2
  psubusw               m2, m0
  por                   m2, m3                ; m2 = |src - ref1|
  mova                  m3, m0
  pmaddwd               m2, m1
  paddd                 m4, m2
  movu                  m2, [ref2q+%3*2]
  psubusw               m3, m2
  psubusw               m2, m0
  por                   m2, m3                ; m2 = |src - ref2|
  mova                  m3, m0
  pmaddwd               m2, m1
  paddd                 m5, m2
  movu                  m2, [ref3q+%3*2]
  psubusw               m3, m2
  psubusw               m2, m0
  por                   m2, m3                ; m2 = |src - ref3|
  mova                  m3, m0
  pmaddwd               m2, m1
  paddd                 m6, m2
  movu                  m2, [ref4q+%3*2]
  psubusw               m3, m2
  psubusw               m2, m0
  por                   m2, m3                ; m2 = |src - ref4|
  pmaddwd               m2, m1
  paddd                 m7, m2
%endif

  ; 2nd 8 px
  mova                  m0, [srcq +(%4)*2]
  mova                  m3, m0
  movu                  m2, [ref1q+(%5)*2]
  psubusw               m3, m2
  psubusw               m2, m0
  por                   m2, m3
  mova                  m3, m0
  pmaddwd               m2, m1
  paddd                 m4, m2
  movu                  m2, [ref2q+(%5)*2]
  psubusw               m3, m2
  psubusw               m2, m0
  por                   m2, m3
  mova                  m3, m0
  pmaddwd               m2, m1
  paddd                 m5, m2
  movu                  m2, [ref3q+(%5)*2]
  psubusw               m3, m2
  psubusw               m2, m0
  por                   m2, m3
  mova                  m3, m0
  pmaddwd               m2, m1
  paddd                 m6, m2
  movu                  m2, [ref4q+(%5)*2]
  psubusw               m3, m2
  psubusw               m2, m0
  ; The pointer advance is deliberately interleaved here so the lea
  ; address updates overlap with the final ALU ops on ref4.
%if %6 == 1
  lea                 srcq, [srcq +src_strideq*4]
  lea                ref1q, [ref1q+ref_strideq*4]
  lea                ref2q, [ref2q+ref_strideq*4]
  lea                ref3q, [ref3q+ref_strideq*4]
  lea                ref4q, [ref4q+ref_strideq*4]
%endif
  por                   m2, m3
  pmaddwd               m2, m1
  paddd                 m7, m2
%endmacro
193
; HIGH_PROCESS_16x2x4 first, off_{first,second}_{src,ref}, advance_at_end
; Two 8-wide passes build one 16-wide pass: the first call covers the two
; 8-px halves of the first row, the second call the second row. Only the
; second call may advance the pointers (%6).
%macro HIGH_PROCESS_16x2x4 5-6 0
  HIGH_PROCESS_8x2x4 %1, %2, %3, (%2 + 8), (%3 + 8)
  HIGH_PROCESS_8x2x4  0, %4, %5, (%4 + 8), (%5 + 8), %6
%endmacro
199
; HIGH_PROCESS_32x2x4 first, off_{first,second}_{src,ref}, advance_at_end
; Two 16-wide passes build one 32-wide pass (left half, then right half of
; each row); only the second call may advance the pointers (%6).
%macro HIGH_PROCESS_32x2x4 5-6 0
  HIGH_PROCESS_16x2x4 %1, %2, %3, (%2 + 16), (%3 + 16)
  HIGH_PROCESS_16x2x4  0, %4, %5, (%4 + 16), (%5 + 16), %6
%endmacro
205
; HIGH_PROCESS_64x2x4 first, off_{first,second}_{src,ref}, advance_at_end
; Two 32-wide passes build one 64-wide pass (left half, then right half of
; each row); only the second call may advance the pointers (%6).
%macro HIGH_PROCESS_64x2x4 5-6 0
  HIGH_PROCESS_32x2x4 %1, %2, %3, (%2 + 32), (%3 + 32)
  HIGH_PROCESS_32x2x4  0, %4, %5, (%4 + 32), (%5 + 32), %6
%endmacro
211
; void vpx_highbd_sadNxNx4d_sse2(uint8_t *src,    int src_stride,
;                         uint8_t *ref[4], int ref_stride,
;                         uint32_t res[4]);
; where NxN = 64x64, 64x32, 32x64, 32x32, 32x16, 16x32, 16x16, 16x8,
;             8x16, 8x8, 8x4, 4x8 or 4x4
; NOTE(review): src/ref appear to be byte aliases of 16-bit sample buffers —
; the shl-by-1 below rescales them to short addresses; confirm against the
; C-side CONVERT_TO_BYTEPTR convention.
; Macro Arguments:
;   1: Width
;   2: Height
;   3: If 0, then normal sad, if 2, then skip every other row
%macro HIGH_SADNXN4D 2-3 0
%if %3 == 0  ; normal sad
%if UNIX64
cglobal highbd_sad%1x%2x4d, 5, 8, 8, src, src_stride, ref1, ref_stride, \
                              res, ref2, ref3, ref4
%else
cglobal highbd_sad%1x%2x4d, 4, 7, 8, src, src_stride, ref1, ref_stride, \
                              ref2, ref3, ref4
%endif
%else  ; %3 == 2, downsample
%if UNIX64
cglobal highbd_sad_skip_%1x%2x4d, 5, 8, 8, src, src_stride, ref1, ref_stride, \
                              res, ref2, ref3, ref4
%else
cglobal highbd_sad_skip_%1x%2x4d, 4, 7, 8, src, src_stride, ref1, ref_stride, \
                              ref2, ref3, ref4
%endif  ;
%endif  ; sad/avg/skip

; set m1 = packed words of 1: the pmaddwd multiplicand that pairwise-sums
; 16-bit absolute differences into 32-bit lanes in the PROCESS macros.
; srcq is borrowed as scratch and restored.
  push                srcq
  mov                 srcd, 0x00010001
  movd                  m1, srcd
  pshufd                m1, m1, 0x0
  pop                 srcq

%if %3 == 2  ; skip rows: double the strides so every other row is visited
  lea          src_strided, [2*src_strided]
  lea          ref_strided, [2*ref_strided]
%endif  ; skip rows
  movsxdifnidn src_strideq, src_strided
  movsxdifnidn ref_strideq, ref_strided
; unpack the four reference pointers from the ref[4] array
  mov                ref2q, [ref1q+gprsize*1]
  mov                ref3q, [ref1q+gprsize*2]
  mov                ref4q, [ref1q+gprsize*3]
  mov                ref1q, [ref1q+gprsize*0]

; convert byte pointers to short pointers
  shl                 srcq, 1
  shl                ref2q, 1
  shl                ref3q, 1
  shl                ref4q, 1
  shl                ref1q, 1

; First iteration initializes accumulators m4..m7 (first=1); the remaining
; iterations accumulate. Each PROCESS call handles 2 (possibly strided) rows.
  HIGH_PROCESS_%1x2x4 1, 0, 0, src_strideq, ref_strideq, 1
%if %3 == 2  ;  Downsampling by two
%define num_rep (%2-8)/4
%else
%define num_rep (%2-4)/2
%endif
%rep num_rep
  HIGH_PROCESS_%1x2x4 0, 0, 0, src_strideq, ref_strideq, 1
%endrep
%undef num_rep
  HIGH_PROCESS_%1x2x4 0, 0, 0, src_strideq, ref_strideq, 0
  ; N.B. HIGH_PROCESS outputs dwords (32 bits)
  ; so in high bit depth even the smallest width (4) needs 128bits i.e. XMM
  ; Horizontal reduction: fold each of m4..m7 to a single dword, then pack
  ; the four results into one XMM in ref1/ref2/ref3/ref4 lane order.
  movhlps               m0, m4
  movhlps               m1, m5
  movhlps               m2, m6
  movhlps               m3, m7
  paddd                 m4, m0
  paddd                 m5, m1
  paddd                 m6, m2
  paddd                 m7, m3
  punpckldq             m4, m5
  punpckldq             m6, m7
  movhlps               m0, m4
  movhlps               m1, m6
  paddd                 m4, m0
  paddd                 m6, m1
  punpcklqdq            m4, m6
%if %3 == 2  ; skip rows: only half the rows were summed, so double the SADs
  pslld                 m4, 1
%endif
  movifnidn             r4, r4mp
  movu                [r4], m4               ; store res[0..3]
  RET
%endmacro
299
300
; Emit the SSE2 (xmm) variants of every block size.
INIT_XMM sse2
; Full-height versions: vpx_highbd_sadNxNx4d_sse2
HIGH_SADNXN4D 64, 64
HIGH_SADNXN4D 64, 32
HIGH_SADNXN4D 32, 64
HIGH_SADNXN4D 32, 32
HIGH_SADNXN4D 32, 16
HIGH_SADNXN4D 16, 32
HIGH_SADNXN4D 16, 16
HIGH_SADNXN4D 16,  8
HIGH_SADNXN4D  8, 16
HIGH_SADNXN4D  8,  8
HIGH_SADNXN4D  8,  4
HIGH_SADNXN4D  4,  8
HIGH_SADNXN4D  4,  4

; Row-skipping versions: vpx_highbd_sad_skip_NxNx4d_sse2.
; No 4x4 variant: with every other row skipped, num_rep = (4-8)/4 would be
; negative (a 4-row block leaves too few rows to downsample).
HIGH_SADNXN4D 64, 64, 2
HIGH_SADNXN4D 64, 32, 2
HIGH_SADNXN4D 32, 64, 2
HIGH_SADNXN4D 32, 32, 2
HIGH_SADNXN4D 32, 16, 2
HIGH_SADNXN4D 16, 32, 2
HIGH_SADNXN4D 16, 16, 2
HIGH_SADNXN4D 16,  8, 2
HIGH_SADNXN4D  8, 16, 2
HIGH_SADNXN4D  8,  8, 2
HIGH_SADNXN4D  4,  8, 2
327