;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;

%include "third_party/x86inc/x86inc.asm"

SECTION .text

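; Each kernel below compares one source block against four reference blocks
; at once.  psadbw sums absolute byte differences within each 64-bit lane, so
; the macros pack source and reference rows such that every psadbw updates the
; running SAD for one reference block (or two at once in the 4-wide case).
; The accumulators live in m4..m7 for the 8/16/32/64-wide kernels; the 4-wide
; kernel keeps two packed accumulators in m6 (ref1|ref2) and m7 (ref3|ref4).
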
; PROCESS_4x2x4 first, off_{first,second}_{src,ref}, advance_at_end
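; Loads two 4-pixel rows of the source and of each reference, duplicates the
; source pair into both halves of m0, and packs the ref1/ref2 and ref3/ref4
; row pairs into m6 and m7 respectively, so two psadbw instructions advance
; all four running SADs (ref1/ref3 in the low qwords, ref2/ref4 in the high
; qwords).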
%macro PROCESS_4x2x4 5-6 0
  movd                  m0, [srcq +%2]
%if %1 == 1
  movd                  m6, [ref1q+%3]
  movd                  m4, [ref2q+%3]
  movd                  m7, [ref3q+%3]
  movd                  m5, [ref4q+%3]
  movd                  m1, [srcq +%4]
  movd                  m2, [ref1q+%5]
  punpckldq             m0, m1
  punpckldq             m6, m2
  movd                  m1, [ref2q+%5]
  movd                  m2, [ref3q+%5]
  movd                  m3, [ref4q+%5]
  punpckldq             m4, m1
  punpckldq             m7, m2
  punpckldq             m5, m3
  movlhps               m0, m0
  movlhps               m6, m4
  movlhps               m7, m5
  psadbw                m6, m0
  psadbw                m7, m0
%else
  movd                  m1, [ref1q+%3]
  movd                  m5, [ref1q+%5]
  movd                  m2, [ref2q+%3]
  movd                  m4, [ref2q+%5]
  punpckldq             m1, m5
  punpckldq             m2, m4
  movd                  m3, [ref3q+%3]
  movd                  m5, [ref3q+%5]
  punpckldq             m3, m5
  movd                  m4, [ref4q+%3]
  movd                  m5, [ref4q+%5]
  punpckldq             m4, m5
  movd                  m5, [srcq +%4]
  punpckldq             m0, m5
  movlhps               m0, m0
  movlhps               m1, m2
  movlhps               m3, m4
  psadbw                m1, m0
  psadbw                m3, m0
  paddd                 m6, m1
  paddd                 m7, m3
%endif
%if %6 == 1
  lea                 srcq, [srcq +src_strideq*2]
  lea                ref1q, [ref1q+ref_strideq*2]
  lea                ref2q, [ref2q+ref_strideq*2]
  lea                ref3q, [ref3q+ref_strideq*2]
  lea                ref4q, [ref4q+ref_strideq*2]
%endif
%endmacro

; PROCESS_8x2x4 first, off_{first,second}_{src,ref}, advance_at_end
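; Loads two 8-pixel rows into the low/high halves of one register per block
; (source in m0; references in m4..m7 when initializing, otherwise in scratch
; registers whose psadbw results are then added into the m4..m7 accumulators).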
%macro PROCESS_8x2x4 5-6 0
  movh                  m0, [srcq +%2]
%if %1 == 1
  movh                  m4, [ref1q+%3]
  movh                  m5, [ref2q+%3]
  movh                  m6, [ref3q+%3]
  movh                  m7, [ref4q+%3]
  movhps                m0, [srcq +%4]
  movhps                m4, [ref1q+%5]
  movhps                m5, [ref2q+%5]
  movhps                m6, [ref3q+%5]
  movhps                m7, [ref4q+%5]
  psadbw                m4, m0
  psadbw                m5, m0
  psadbw                m6, m0
  psadbw                m7, m0
%else
  movh                  m1, [ref1q+%3]
  movh                  m2, [ref2q+%3]
  movh                  m3, [ref3q+%3]
  movhps                m0, [srcq +%4]
  movhps                m1, [ref1q+%5]
  movhps                m2, [ref2q+%5]
  movhps                m3, [ref3q+%5]
  psadbw                m1, m0
  psadbw                m2, m0
  psadbw                m3, m0
  paddd                 m4, m1
  movh                  m1, [ref4q+%3]
  movhps                m1, [ref4q+%5]
  paddd                 m5, m2
  paddd                 m6, m3
  psadbw                m1, m0
  paddd                 m7, m1
%endif
%if %6 == 1
  lea                 srcq, [srcq +src_strideq*2]
  lea                ref1q, [ref1q+ref_strideq*2]
  lea                ref2q, [ref2q+ref_strideq*2]
  lea                ref3q, [ref3q+ref_strideq*2]
  lea                ref4q, [ref4q+ref_strideq*2]
%endif
%endmacro

; PROCESS_16x2x4 first, off_{first,second}_{src,ref}, advance_at_end
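; Processes two full 16-pixel rows.  The source rows use aligned loads (mova)
; and the reference rows unaligned loads (movu); each psadbw leaves two
; partial sums per register (one per 8-byte half) which stay split until the
; final combine in SADNXN4D.  The optional pointer advance is interleaved with
; the tail of the second row's accumulation.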
%macro PROCESS_16x2x4 5-6 0
  ; 1st 16 px
  mova                  m0, [srcq +%2]
%if %1 == 1
  movu                  m4, [ref1q+%3]
  movu                  m5, [ref2q+%3]
  movu                  m6, [ref3q+%3]
  movu                  m7, [ref4q+%3]
  psadbw                m4, m0
  psadbw                m5, m0
  psadbw                m6, m0
  psadbw                m7, m0
%else
  movu                  m1, [ref1q+%3]
  movu                  m2, [ref2q+%3]
  movu                  m3, [ref3q+%3]
  psadbw                m1, m0
  psadbw                m2, m0
  psadbw                m3, m0
  paddd                 m4, m1
  movu                  m1, [ref4q+%3]
  paddd                 m5, m2
  paddd                 m6, m3
  psadbw                m1, m0
  paddd                 m7, m1
%endif

  ; 2nd 16 px
  mova                  m0, [srcq +%4]
  movu                  m1, [ref1q+%5]
  movu                  m2, [ref2q+%5]
  movu                  m3, [ref3q+%5]
  psadbw                m1, m0
  psadbw                m2, m0
  psadbw                m3, m0
  paddd                 m4, m1
  movu                  m1, [ref4q+%5]
  paddd                 m5, m2
  paddd                 m6, m3
%if %6 == 1
  lea                 srcq, [srcq +src_strideq*2]
  lea                ref1q, [ref1q+ref_strideq*2]
  lea                ref2q, [ref2q+ref_strideq*2]
  lea                ref3q, [ref3q+ref_strideq*2]
  lea                ref4q, [ref4q+ref_strideq*2]
%endif
  psadbw                m1, m0
  paddd                 m7, m1
%endmacro

; PROCESS_32x2x4 first, off_{first,second}_{src,ref}, advance_at_end
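; The 32- and 64-wide kernels are compositions: the first half-width call
; covers both column halves of the first row, the second covers the second
; row, and only the second call advances the row pointers.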
%macro PROCESS_32x2x4 5-6 0
  PROCESS_16x2x4 %1, %2, %3, %2 + 16, %3 + 16
  PROCESS_16x2x4  0, %4, %5, %4 + 16, %5 + 16, %6
%endmacro

; PROCESS_64x2x4 first, off_{first,second}_{src,ref}, advance_at_end
%macro PROCESS_64x2x4 5-6 0
  PROCESS_32x2x4 %1, %2, %3, %2 + 32, %3 + 32
  PROCESS_32x2x4  0, %4, %5, %4 + 32, %5 + 32, %6
%endmacro

; void vpx_sadNxNx4d_sse2(uint8_t *src,    int src_stride,
;                         uint8_t *ref[4], int ref_stride,
;                         uint32_t res[4]);
; where NxN = 64x64, 64x32, 32x64, 32x32, 32x16, 16x32, 16x16, 16x8,
;             8x16, 8x8, 8x4, 4x8 and 4x4
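;
; Illustrative call from C (a sketch only, not part of this file; callers
; normally reach these kernels through the function pointers set up by
; run-time CPU detection rather than by naming the _sse2 symbol directly,
; and ref0..ref3/src/strides below are placeholder names):
;
;   uint32_t sads[4];
;   uint8_t *refs[4] = { ref0, ref1, ref2, ref3 };
;   vpx_sad16x16x4d_sse2(src, src_stride, refs, ref_stride, sads);
;   // sads[i] now holds the 16x16 SAD between src and refs[i]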
%macro SADNXN4D 2-3 0
%if %3 == 1  ; skip rows
%if UNIX64
cglobal sad_skip_%1x%2x4d, 5, 8, 8, src, src_stride, ref1, ref_stride, \
                              res, ref2, ref3, ref4
%else
cglobal sad_skip_%1x%2x4d, 4, 7, 8, src, src_stride, ref1, ref_stride, \
                              ref2, ref3, ref4
%endif
%else  ; normal sad
%if UNIX64
cglobal sad%1x%2x4d, 5, 8, 8, src, src_stride, ref1, ref_stride, \
                              res, ref2, ref3, ref4
%else
cglobal sad%1x%2x4d, 4, 7, 8, src, src_stride, ref1, ref_stride, \
                              ref2, ref3, ref4
%endif
%endif
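; The sad_skip_* variants estimate the SAD from every other row: both strides
; are doubled, only half as many row pairs are iterated, and the final sums
; are doubled (pslld by 1) to keep the result on the same scale as the full
; SAD.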
%if %3 == 1
  lea          src_strided, [2*src_strided]
  lea          ref_strided, [2*ref_strided]
%endif
  movsxdifnidn src_strideq, src_strided
  movsxdifnidn ref_strideq, ref_strided
  mov                ref2q, [ref1q+gprsize*1]
  mov                ref3q, [ref1q+gprsize*2]
  mov                ref4q, [ref1q+gprsize*3]
  mov                ref1q, [ref1q+gprsize*0]

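  ; The first call initializes the accumulators (first = 1); the %rep block
  ; accumulates the remaining row pairs, and the final call omits the pointer
  ; advance since no further rows follow.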
  PROCESS_%1x2x4 1, 0, 0, src_strideq, ref_strideq, 1
%if %3 == 1  ; downsample number of rows by 2
%define num_rep (%2-8)/4
%else
%define num_rep (%2-4)/2
%endif
%rep num_rep
  PROCESS_%1x2x4 0, 0, 0, src_strideq, ref_strideq, 1
%endrep
%undef num_rep
  PROCESS_%1x2x4 0, 0, 0, src_strideq, ref_strideq, 0

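; Combine the per-reference partial sums into res[4].  For widths > 4 each
; accumulator holds two qword partial sums: ref2/ref4 are shifted into the odd
; dwords of ref1/ref3, the low/high qword pairs are gathered with
; punpcklqdq/punpckhqdq, and one paddd yields [sad1 sad2 sad3 sad4].  For
; width 4 each SAD already fits in the low dword of its qword, so a pshufd
; packs ref1/ref2 and ref3/ref4 for two movq stores.  The result pointer is
; the fifth argument (already in a register on UNIX64, loaded via r4mp
; elsewhere).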
%if %1 > 4
  pslldq                m5, 4
  pslldq                m7, 4
  por                   m4, m5
  por                   m6, m7
  mova                  m5, m4
  mova                  m7, m6
  punpcklqdq            m4, m6
  punpckhqdq            m5, m7
  movifnidn             r4, r4mp
  paddd                 m4, m5
%if %3 == 1
  pslld                 m4, 1
%endif
  movu                [r4], m4
  RET
%else
  movifnidn             r4, r4mp
  pshufd            m6, m6, 0x08
  pshufd            m7, m7, 0x08
%if %3 == 1
  pslld                 m6, 1
  pslld                 m7, 1
%endif
  movq              [r4+0], m6
  movq              [r4+8], m7
  RET
%endif
%endmacro

INIT_XMM sse2
SADNXN4D 64, 64
SADNXN4D 64, 32
SADNXN4D 32, 64
SADNXN4D 32, 32
SADNXN4D 32, 16
SADNXN4D 16, 32
SADNXN4D 16, 16
SADNXN4D 16,  8
SADNXN4D  8, 16
SADNXN4D  8,  8
SADNXN4D  8,  4
SADNXN4D  4,  8
SADNXN4D  4,  4

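; sad_skip_* variants: SAD over every other row, result doubled (see above).
; Note there is no 4x4 skip kernel.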
SADNXN4D 64, 64, 1
SADNXN4D 64, 32, 1
SADNXN4D 32, 64, 1
SADNXN4D 32, 32, 1
SADNXN4D 32, 16, 1
SADNXN4D 16, 32, 1
SADNXN4D 16, 16, 1
SADNXN4D 16,  8, 1
SADNXN4D  8, 16, 1
SADNXN4D  8,  8, 1
SADNXN4D  4,  8, 1