; xref: /aosp_15_r20/external/libvpx/vpx_dsp/x86/deblock_sse2.asm (revision fb1b10ab9aebc7c7068eedab379b749d7e3900be)
;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;


%include "vpx_ports/x86_abi_support.asm"

;macro in deblock functions
;
; FIRST_2_ROWS
;   First half of the five-pixel filter: compares the centre pixels with the
;   first two neighbours and starts the running average.
;
;   In:       xmm0   = 16 centre pixels (bytes)
;             xmm1   = 16 pixels of the first neighbour
;             xmm3   = 16 pixels of the second neighbour
;             flimit = 16 threshold bytes; must be %define'd by the code that
;                      expands this macro. It is read with movdqa, so a memory
;                      operand has to be 16-byte aligned.
;   Out:      xmm5   = pavgb(xmm1, xmm3), the rounded average of both neighbours
;             xmm7   = byte mask, 0xFF where |xmm0 - xmm1| >= flimit
;                      or |xmm0 - xmm3| >= flimit, 0x00 elsewhere
;   Keeps:    xmm0
;   Clobbers: xmm1 (left as zero), xmm2, xmm3, xmm4, xmm6
%macro FIRST_2_ROWS 0
        movdqa      xmm4,       xmm0
        movdqa      xmm6,       xmm0
        movdqa      xmm5,       xmm1
        pavgb       xmm5,       xmm3

        ;calculate absolute value
        ; |a - b| = sat(a - b) + sat(b - a): one of the two terms is always 0
        psubusb     xmm4,       xmm1
        psubusb     xmm1,       xmm0
        psubusb     xmm6,       xmm3
        psubusb     xmm3,       xmm0
        paddusb     xmm4,       xmm1             ; xmm4 = |centre - first neighbour|
        paddusb     xmm6,       xmm3             ; xmm6 = |centre - second neighbour|

        ;get threshold
        movdqa      xmm2,       flimit
        pxor        xmm1,       xmm1             ; zero, used as compare operand
        movdqa      xmm7,       xmm2

        ;get mask
        ; sat(flimit - diff) == 0  <=>  diff >= flimit
        psubusb     xmm2,       xmm4
        psubusb     xmm7,       xmm6
        pcmpeqb     xmm2,       xmm1
        pcmpeqb     xmm7,       xmm1
        por         xmm7,       xmm2
%endmacro

; SECOND_2_ROWS
;   Second half of the five-pixel filter. Must follow FIRST_2_ROWS, whose
;   xmm5/xmm7 results it consumes.
;
;   In:       xmm0   = 16 centre pixels (bytes)
;             xmm1   = 16 pixels of the third neighbour
;             xmm3   = 16 pixels of the fourth neighbour
;             xmm5   = neighbour average produced by FIRST_2_ROWS
;             xmm7   = "difference too large" mask produced by FIRST_2_ROWS
;             flimit = 16 threshold bytes (read with movdqa)
;   Out:      xmm0   = per byte: the original centre pixel where any of the
;                      four neighbours differs by >= flimit, otherwise
;                      pavgb(pavgb(avg(first pair), avg(second pair)), centre)
;   Clobbers: xmm1 (left as zero), xmm2, xmm3, xmm4, xmm5, xmm6, xmm7
%macro SECOND_2_ROWS 0
        movdqa      xmm6,       xmm0
        movdqa      xmm4,       xmm0
        movdqa      xmm2,       xmm1             ; keep a copy; xmm1 becomes the average
        pavgb       xmm1,       xmm3

        ;calculate absolute value
        psubusb     xmm6,       xmm2
        psubusb     xmm2,       xmm0
        psubusb     xmm4,       xmm3
        psubusb     xmm3,       xmm0
        paddusb     xmm6,       xmm2             ; xmm6 = |centre - third neighbour|
        paddusb     xmm4,       xmm3             ; xmm4 = |centre - fourth neighbour|

        pavgb       xmm5,       xmm1             ; average of the two pair averages

        ;get threshold
        movdqa      xmm2,       flimit
        pxor        xmm1,       xmm1
        movdqa      xmm3,       xmm2

        ;get mask
        psubusb     xmm2,       xmm6
        psubusb     xmm3,       xmm4
        pcmpeqb     xmm2,       xmm1
        pcmpeqb     xmm3,       xmm1

        por         xmm7,       xmm2             ; merge with the mask from FIRST_2_ROWS
        por         xmm7,       xmm3

        pavgb       xmm5,       xmm0             ; blend neighbour average with the centre

        ;decide if or not to use filtered value
        pand        xmm0,       xmm7             ; original pixel where the mask is set
        pandn       xmm7,       xmm5             ; filtered pixel where the mask is clear
        paddusb     xmm0,       xmm7             ; operands are disjoint, so this acts as OR
%endmacro

; UPDATE_FLIMIT
;   Copies the next 16 threshold bytes from [rbx] into the scratch slot at
;   [rsp] and advances rbx by 16.
;
;   In:       rbx = pointer to the next 16 threshold bytes (unaligned load)
;             rsp = address of a writable 16-byte scratch slot
;   Out:      [rsp] = the 16 bytes just read, rbx += 16
;   Clobbers: xmm2, flags
%macro UPDATE_FLIMIT 0
        movdqu      xmm2,       XMMWORD PTR [rbx]
        movdqu      [rsp],      xmm2
        add         rbx,        16
%endmacro

SECTION .text

;void vpx_post_proc_down_and_across_mb_row_sse2
;(
;    unsigned char *src_ptr,
;    unsigned char *dst_ptr,
;    int src_pixels_per_line,
;    int dst_pixels_per_line,
;    int cols,
;    int *flimits,
;    int size
;)
;
; For each of `size` rows:
;   1. "down" pass:   filters src vertically using rows -2 .. +2 and writes the
;                     result to dst, 16 columns per iteration.
;   2. "across" pass: filters the dst row horizontally in place using pixels
;                     -2 .. +2.
; Both passes use FIRST_2_ROWS / SECOND_2_ROWS, so a pixel is only replaced
; when all four neighbours differ from it by less than the matching flimits
; byte.
;
; Register roles inside the row loop:
;   rsi = src row cursor         rdi = dst row cursor
;   rax = src stride             rcx = rows remaining
;   rdx = column offset          rbx = flimits cursor (advanced by UPDATE_FLIMIT)
;   [rsp] = current 16 threshold bytes (`flimit`)
;
; Notes:
;   * Columns are processed in groups of 16, so the last group may extend past
;     `cols` when cols is not a multiple of 16.
;   * flimits is consumed 16 bytes per 16-column group and compared bytewise.
;     NOTE(review): the `int *` in the prototype comment above does not match
;     that usage - confirm against the C declaration.
;   * The across pass writes 8 bytes before the dst row start and 8 bytes
;     starting at dst + cols (border replication), and rewrites dst[-16..-1].
;   * mm0/mm1 are used and no emms is issued here.
;     NOTE(review): presumably the caller clears the MMX state - verify.
;   * arg(), SHADOW_ARGS_TO_STACK, SAVE_XMM, ALIGN_STACK, RESTORE_XMM and
;     UNSHADOW_ARGS come from vpx_ports/x86_abi_support.asm.
globalsym(vpx_post_proc_down_and_across_mb_row_sse2)
sym(vpx_post_proc_down_and_across_mb_row_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 7
    SAVE_XMM 7
    push        rbx
    push        rsi
    push        rdi
    ; end prolog
    ALIGN_STACK 16, rax
    sub         rsp, 16                          ; 16-byte scratch slot for flimit

        ; put flimit on stack
        mov         rbx,        arg(5)           ;flimits ptr
        UPDATE_FLIMIT

%define flimit [rsp]

        mov         rsi,        arg(0)           ;src_ptr
        mov         rdi,        arg(1)           ;dst_ptr

        movsxd      rax,        DWORD PTR arg(2) ;src_pixels_per_line
        movsxd      rcx,        DWORD PTR arg(6) ;rows in a macroblock
.nextrow:
        xor         rdx,        rdx              ;col
.nextcol:
        ;load current and next 2 rows
        movdqu      xmm0,       XMMWORD PTR [rsi]
        movdqu      xmm1,       XMMWORD PTR [rsi + rax]
        movdqu      xmm3,       XMMWORD PTR [rsi + 2*rax]

        FIRST_2_ROWS

        ;load above 2 rows
        neg         rax                          ; negative stride to reach the rows above
        movdqu      xmm1,       XMMWORD PTR [rsi + 2*rax]
        movdqu      xmm3,       XMMWORD PTR [rsi + rax]

        SECOND_2_ROWS

        movdqu      XMMWORD PTR [rdi], xmm0

        neg         rax                          ; positive stride
        add         rsi,        16
        add         rdi,        16

        add         rdx,        16
        cmp         edx,        dword arg(4)     ;cols
        jge         .downdone
        UPDATE_FLIMIT
        jmp         .nextcol

.downdone:
        ; done with the all cols, start the across filtering in place
        sub         rsi,        rdx              ; rewind both cursors to the row start
        sub         rdi,        rdx

        mov         rbx,        arg(5) ; flimits
        UPDATE_FLIMIT

        ; dup the first byte into the left border 8 times
        movq        mm1,   [rdi]
        punpcklbw   mm1,   mm1
        punpcklwd   mm1,   mm1
        punpckldq   mm1,   mm1
        mov         rdx,    -8
        movq        [rdi+rdx], mm1

        ; dup the last byte into the right border
        movsxd      rdx,    dword arg(4)
        movq        mm1,   [rdi + rdx + -1]
        punpcklbw   mm1,   mm1
        punpcklwd   mm1,   mm1
        punpckldq   mm1,   mm1
        movq        [rdi+rdx], mm1

        ; Prime the delayed-store registers with the 16 bytes left of the row
        ; so that the first "store previous" below rewrites them unchanged.
        xor         rdx,        rdx
        movq        mm0,        QWORD PTR [rdi-16];
        movq        mm1,        QWORD PTR [rdi-8];

.acrossnextcol:
        movdqu      xmm0,       XMMWORD PTR [rdi + rdx]
        movdqu      xmm1,       XMMWORD PTR [rdi + rdx -2]
        movdqu      xmm3,       XMMWORD PTR [rdi + rdx -1]

        FIRST_2_ROWS

        movdqu      xmm1,       XMMWORD PTR [rdi + rdx +1]
        movdqu      xmm3,       XMMWORD PTR [rdi + rdx +2]

        SECOND_2_ROWS

        ; The result of this group is held back in mm0/mm1 and written one
        ; iteration later, so the next group's -2/-1 loads still read
        ; unfiltered pixels.
        movq        QWORD PTR [rdi+rdx-16], mm0  ; store previous 8 bytes
        movq        QWORD PTR [rdi+rdx-8], mm1   ; store previous 8 bytes
        movdq2q     mm0,        xmm0
        psrldq      xmm0,       8
        movdq2q     mm1,        xmm0

        add         rdx,        16
        cmp         edx,        dword arg(4)     ;cols
        jge         .acrossdone
        UPDATE_FLIMIT
        jmp         .acrossnextcol

.acrossdone:
        ; last 16 pixels
        movq        QWORD PTR [rdi+rdx-16], mm0

        ; the upper 8 bytes are only written when rdx landed exactly on cols
        cmp         edx,        dword arg(4)
        jne         .throw_last_8
        movq        QWORD PTR [rdi+rdx-8], mm1
.throw_last_8:
        ; done with this row
        add         rsi,rax                      ;next src line
        ; Sign-extend both strides, matching the initial movsxd load of
        ; arg(2); a plain 32-bit mov would zero-extend a negative stride.
        movsxd      rax, dword arg(3)            ;dst_pixels_per_line
        add         rdi,rax                      ;next destination
        movsxd      rax, dword arg(2)            ;src_pixels_per_line

        mov         rbx,        arg(5)           ;flimits
        UPDATE_FLIMIT

        dec         rcx                          ;decrement count
        jnz         .nextrow                     ;next row

    add rsp, 16
    pop rsp                                      ; NOTE(review): presumably the rsp saved by ALIGN_STACK - confirm
    ; begin epilog
    pop rdi
    pop rsi
    pop rbx
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
%undef flimit


;void vpx_mbpost_proc_across_ip_sse2(unsigned char *src,
;                                    int pitch, int rows, int cols,int flimit)
;
; In-place horizontal filter. For every row a 15-pixel sliding window
; (s[c-7] .. s[c+7]) keeps a running `sum` and `sumsq`; pixel c is replaced by
; (sum + s[c] + 8) >> 4 when  sumsq*15 - sum*sum < flimit, otherwise it is
; left unchanged. Four pixels are handled per inner iteration.
;
; Register roles:
;   rsi  = start of the current row        rcx = column index c (inner loop)
;   rdx  = cols + 8 (inner loop bound)     rdi = index i of the set-up loop
;   xmm6 = running sum (dwords)            xmm7 = running sumsq (dwords)
;   xmm0 = zero                            mm0/mm1 = results awaiting store
;   [rsp] = flimit replicated into four dwords (`flimit4`)
;
; Notes:
;   * Results are written 8 pixels late (through mm1 -> mm0 -> memory) so the
;     window can still read the unfiltered s[c-8]; that is why the inner loop
;     runs to cols + 8.
;   * The row buffer is written at s[-8 .. -1] and s[cols .. cols+7] and read
;     up to s[cols+14].
;   * arg(0) (src) and arg(2) (rows) are updated in their argument slots and
;     used as the row cursor / row counter.
;   * rdx is zeroed as "sumsq" and then reloaded with cols for the border
;     replication before the set-up loop runs, so sumsq actually starts at
;     `cols`, not 0. Left as is to keep the output unchanged.
;     NOTE(review): confirm against the C reference before changing this.
;   * mm0/mm1 are used and no emms is issued here.
;     NOTE(review): presumably the caller clears the MMX state - verify.
;   * arg(), GLOBAL(), GET_GOT/RESTORE_GOT, SHADOW_ARGS_TO_STACK, SAVE_XMM and
;     ALIGN_STACK come from vpx_ports/x86_abi_support.asm.
globalsym(vpx_mbpost_proc_across_ip_sse2)
sym(vpx_mbpost_proc_across_ip_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 5
    SAVE_XMM 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    ALIGN_STACK 16, rax
    sub         rsp, 16

    ; create flimit4 at [rsp]
    mov         eax, dword ptr arg(4) ;flimit
    mov         [rsp], eax
    mov         [rsp+4], eax
    mov         [rsp+8], eax
    mov         [rsp+12], eax
%define flimit4 [rsp]


    ;for(r=0;r<rows;r++)
.ip_row_loop:

        xor         rdx,    rdx ;sumsq=0;  (overwritten with cols below)
        xor         rcx,    rcx ;sum=0;
        mov         rsi,    arg(0); s


        ; dup the first byte into the left border 8 times
        movq        mm1,   [rsi]
        punpcklbw   mm1,   mm1
        punpcklwd   mm1,   mm1
        punpckldq   mm1,   mm1

        mov         rdi,    -8                  ; also the start index of .ip_var_loop
        movq        [rsi+rdi], mm1

        ; dup the last byte into the right border
        movsxd      rdx,    dword arg(3)        ; rdx = cols; edx is the sumsq accumulator below
        movq        mm1,   [rsi + rdx + -1]
        punpcklbw   mm1,   mm1
        punpcklwd   mm1,   mm1
        punpckldq   mm1,   mm1
        movq        [rsi+rdx], mm1

.ip_var_loop:
        ;for(i=-8;i<=6;i++)
        ;{
        ;    sumsq += s[i]*s[i];
        ;    sum   += s[i];
        ;}
        movzx       eax, byte [rsi+rdi]
        add         ecx, eax
        mul         al                          ; ax = al*al; upper half of eax is already 0
        add         edx, eax
        add         rdi, 1
        cmp         rdi, 6
        jle         .ip_var_loop


            ;mov         rax,    sumsq
            ;movd        xmm7,   rax
            movd        xmm7,   edx

            ;mov         rax,    sum
            ;movd        xmm6,   rax
            movd        xmm6,   ecx

            mov         rsi,    arg(0) ;s
            xor         rcx,    rcx

            movsxd      rdx,    dword arg(3) ;cols
            add         rdx,    8               ; 8 extra columns flush the delayed stores
            pxor        mm0,    mm0
            pxor        mm1,    mm1

            pxor        xmm0,   xmm0
.nextcol4:

            movd        xmm1,   DWORD PTR [rsi+rcx-8]   ; -8 -7 -6 -5  (pixels leaving the window)
            movd        xmm2,   DWORD PTR [rsi+rcx+7]   ; +7 +8 +9 +10 (pixels entering the window)

            punpcklbw   xmm1,   xmm0                    ; expanding
            punpcklbw   xmm2,   xmm0                    ; expanding

            punpcklwd   xmm1,   xmm0                    ; expanding to dwords
            punpcklwd   xmm2,   xmm0                    ; expanding to dwords

            psubd       xmm2,   xmm1                    ; new - old: per-step change of sum
            paddd       xmm1,   xmm1                    ; 2*old

            paddd       xmm1,   xmm2                    ; new + old
            pmaddwd     xmm1,   xmm2                    ; (new+old)*(new-old) = new^2 - old^2: change of sumsq

            paddd       xmm6,   xmm2
            paddd       xmm7,   xmm1

            pshufd      xmm6,   xmm6,   0               ; broadcast lane 0 (totals after step 0)
            pshufd      xmm7,   xmm7,   0               ; broadcast lane 0 (totals after step 0)

            psrldq      xmm1,       4                   ; lanes: step1, step2, step3, 0
            psrldq      xmm2,       4                   ; lanes: step1, step2, step3, 0

            pshufd      xmm3,   xmm1,   3               ; lanes: 0, step1, step1, step1
            pshufd      xmm4,   xmm2,   3               ; lanes: 0, step1, step1, step1

            paddd       xmm6,   xmm4
            paddd       xmm7,   xmm3

            pshufd      xmm3,   xmm1,   01011111b       ; lanes: 0, 0, step2, step2
            pshufd      xmm4,   xmm2,   01011111b       ; lanes: 0, 0, step2, step2

            paddd       xmm7,   xmm3
            paddd       xmm6,   xmm4

            pshufd      xmm3,   xmm1,   10111111b       ; lanes: 0, 0, 0, step3
            pshufd      xmm4,   xmm2,   10111111b       ; lanes: 0, 0, 0, step3

            paddd       xmm7,   xmm3
            paddd       xmm6,   xmm4
            ; lane k of xmm6/xmm7 now holds sum/sumsq for pixel c+k

            movdqa      xmm3,   xmm6
            pmaddwd     xmm3,   xmm3                    ; sum*sum

            movdqa      xmm5,   xmm7
            pslld       xmm5,   4

            psubd       xmm5,   xmm7                    ; sumsq*16 - sumsq = sumsq*15
            psubd       xmm5,   xmm3                    ; sumsq*15 - sum*sum

            psubd       xmm5,   flimit4
            psrad       xmm5,   31                      ; all ones where the value above is < flimit

            packssdw    xmm5,   xmm0
            packsswb    xmm5,   xmm0                    ; mask narrowed to one byte per pixel

            movd        xmm1,   DWORD PTR [rsi+rcx]
            movq        xmm2,   xmm1                    ; keep the unfiltered pixels

            punpcklbw   xmm1,   xmm0
            punpcklwd   xmm1,   xmm0

            paddd       xmm1,   xmm6
            paddd       xmm1,   [GLOBAL(four8s)]        ; +8 rounds the >>4 below

            psrad       xmm1,   4
            packssdw    xmm1,   xmm0

            packuswb    xmm1,   xmm0
            pand        xmm1,   xmm5                    ; filtered value where the mask is set

            pandn       xmm5,   xmm2                    ; original value elsewhere
            por         xmm5,   xmm1

            movd        [rsi+rcx-8],  mm0               ; write the result computed two iterations ago
            movq        mm0,    mm1

            movdq2q     mm1,    xmm5
            psrldq      xmm7,   12                      ; carry the lane-3 totals into lane 0

            psrldq      xmm6,   12
            add         rcx,    4

            cmp         rcx,    rdx
            jl          .nextcol4

        ;s+=pitch;
        movsxd rax, dword arg(1)
        add    arg(0), rax

        sub dword arg(2), 1 ;rows-=1
        cmp dword arg(2), 0
        jg .ip_row_loop

    add         rsp, 16
    pop         rsp                                     ; NOTE(review): presumably the rsp saved by ALIGN_STACK - confirm

    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
%undef flimit4


SECTION_RODATA
align 16
; Four dwords of 8: the rounding term added before the arithmetic shift right
; by 4 in vpx_mbpost_proc_across_ip_sse2. 16-byte aligned because it is used
; as a memory operand of paddd.
four8s:
    times 4 dd 8
433