; xref: /aosp_15_r20/external/libvpx/vpx_dsp/x86/deblock_sse2.asm (revision fb1b10ab9aebc7c7068eedab379b749d7e3900be)
;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;


%include "vpx_ports/x86_abi_support.asm"

;-----------------------------------------------------------------------
; Macros shared by the deblock functions
;-----------------------------------------------------------------------

; FIRST_2_ROWS
;   First half of the filter: processes the first pair of neighbours.
;   In:    xmm0   = 16 centre pixels
;          xmm1   = 16 pixels of one neighbour
;          xmm3   = 16 pixels of the other neighbour
;          flimit = memory operand holding 16 byte thresholds; it is read
;                   with movdqa, so it must be 16-byte aligned. The caller
;                   %defines it before expanding this macro.
;   Out:   xmm5   = pavgb(xmm1, xmm3), rounded average of the two neighbours
;          xmm7   = byte mask, 0xff where |centre - neighbour| >= flimit
;                   for either neighbour, 0x00 elsewhere
;          xmm0   is left unchanged
;   Clobb: xmm1, xmm2, xmm3, xmm4, xmm6
%macro FIRST_2_ROWS 0
        movdqa      xmm4,       xmm0
        movdqa      xmm6,       xmm0
        movdqa      xmm5,       xmm1
        pavgb       xmm5,       xmm3             ; xmm5 = avg of the two neighbours

        ; absolute differences: the two saturating subtractions of a pair
        ; cannot both be non-zero, so their sum is |a - b|
        psubusb     xmm4,       xmm1
        psubusb     xmm1,       xmm0
        psubusb     xmm6,       xmm3
        psubusb     xmm3,       xmm0
        paddusb     xmm4,       xmm1             ; xmm4 = |centre - first neighbour|
        paddusb     xmm6,       xmm3             ; xmm6 = |centre - second neighbour|

        ; get threshold
        movdqa      xmm2,       flimit
        pxor        xmm1,       xmm1             ; xmm1 = 0, compared against below
        movdqa      xmm7,       xmm2

        ; get mask: saturate(flimit - diff) == 0  <=>  diff >= flimit
        psubusb     xmm2,       xmm4
        psubusb     xmm7,       xmm6
        pcmpeqb     xmm2,       xmm1
        pcmpeqb     xmm7,       xmm1
        por         xmm7,       xmm2             ; either neighbour over the limit
%endmacro

; SECOND_2_ROWS
;   Second half of the filter: processes the remaining pair of neighbours
;   and selects between the original and the filtered pixel.
;   In:    xmm0   = 16 centre pixels
;          xmm1   = 16 pixels of the third neighbour
;          xmm3   = 16 pixels of the fourth neighbour
;          xmm5   = neighbour average left by FIRST_2_ROWS
;          xmm7   = over-limit mask left by FIRST_2_ROWS
;          flimit = same aligned 16-byte threshold operand as FIRST_2_ROWS
;   Out:   xmm0   = per byte: the original pixel where any of the four
;                   |centre - neighbour| differences is >= flimit, otherwise
;                   pavgb(pavgb(first pair avg, second pair avg), centre)
;   Clobb: xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7
%macro SECOND_2_ROWS 0
        movdqa      xmm6,       xmm0
        movdqa      xmm4,       xmm0
        movdqa      xmm2,       xmm1             ; keep a copy, xmm1 becomes the avg
        pavgb       xmm1,       xmm3             ; xmm1 = avg of this pair

        ; absolute differences, same trick as in FIRST_2_ROWS
        psubusb     xmm6,       xmm2
        psubusb     xmm2,       xmm0
        psubusb     xmm4,       xmm3
        psubusb     xmm3,       xmm0
        paddusb     xmm6,       xmm2             ; xmm6 = |centre - third neighbour|
        paddusb     xmm4,       xmm3             ; xmm4 = |centre - fourth neighbour|

        pavgb       xmm5,       xmm1             ; avg of the two pair averages

        ; get threshold
        movdqa      xmm2,       flimit
        pxor        xmm1,       xmm1             ; xmm1 = 0, compared against below
        movdqa      xmm3,       xmm2

        ; get mask: saturate(flimit - diff) == 0  <=>  diff >= flimit
        psubusb     xmm2,       xmm6
        psubusb     xmm3,       xmm4
        pcmpeqb     xmm2,       xmm1
        pcmpeqb     xmm3,       xmm1

        ; accumulate into the mask started by FIRST_2_ROWS
        por         xmm7,       xmm2
        por         xmm7,       xmm3

        pavgb       xmm5,       xmm0             ; blend neighbourhood with centre

        ; decide whether or not to use the filtered value
        pand        xmm0,       xmm7             ; original where mask is set
        pandn       xmm7,       xmm5             ; filtered where mask is clear
        paddusb     xmm0,       xmm7             ; disjoint bytes, so this merges them
%endmacro

; UPDATE_FLIMIT
;   Copies the next 16 threshold bytes to the stack slot at [rsp] and
;   advances the source pointer.
;   In:    rbx   = address of the 16 threshold bytes to load (any alignment)
;   Out:   [rsp] = those 16 bytes
;          rbx   = rbx + 16
;   Clobb: xmm2
;   Note:  writes [rsp] directly, so it is only valid while the 16-byte
;          scratch slot is the top of the stack.
%macro UPDATE_FLIMIT 0
        movdqu      xmm2,       XMMWORD PTR [rbx]
        movdqu      [rsp],      xmm2
        add         rbx,        16
%endmacro

SECTION .text

;-----------------------------------------------------------------------
;void vpx_post_proc_down_and_across_mb_row_sse2
;(
;    unsigned char *src_ptr,
;    unsigned char *dst_ptr,
;    int src_pixels_per_line,
;    int dst_pixels_per_line,
;    int cols,
;    int *flimits,
;    int size
;)
;
; For each of `size` rows:
;   1. "down" pass: filters every group of 16 columns vertically, using the
;      rows at -2, -1, +1 and +2 strides around the current source row, and
;      writes the result to the destination row.
;   2. "across" pass: filters the destination row horizontally in place,
;      using the pixels at -2, -1, +1 and +2 around each column.
;
; Memory touched besides the rows themselves (all visible in the code):
;   - src rows -2 .. +2 relative to every processed row are read.
;   - 16 bytes to the left of each dst row are read and written back; the
;     8 bytes immediately left of the row are first filled with dst[0].
;   - 8 bytes are read starting at dst[cols - 1], and 8 bytes starting at
;     dst[cols] are overwritten with copies of dst[cols - 1].
;   - both column loops test their exit condition after the body, so one
;     16-byte group is always processed, even for cols <= 0.
;
; flimits is consumed 16 bytes per group of 16 columns and compared byte-wise
; against the pixels, i.e. one threshold byte per column.
; NOTE(review): the `int *` in the prototype above therefore looks stale -
; confirm against the C declaration.
;
; The across pass uses mm0/mm1 and no emms is executed before returning.
; NOTE(review): presumably the caller is expected to clear the MMX state -
; confirm.
;
; Register roles:
;   rsi = current source position      rdi = current destination position
;   rax = source stride (negated temporarily while loading the rows above)
;   rbx = flimits cursor               rcx = rows remaining
;   rdx = column offset
;-----------------------------------------------------------------------
globalsym(vpx_post_proc_down_and_across_mb_row_sse2)
sym(vpx_post_proc_down_and_across_mb_row_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 7
    SAVE_XMM 7
    push        rbx
    push        rsi
    push        rdi
    ; end prolog
    ALIGN_STACK 16, rax
    sub         rsp, 16                          ; 16-byte scratch slot for flimit

        ; put flimit on stack
        mov         rbx,        arg(5)           ;flimits ptr
        UPDATE_FLIMIT

%define flimit [rsp]

        mov         rsi,        arg(0)           ;src_ptr
        mov         rdi,        arg(1)           ;dst_ptr

        movsxd      rax,        DWORD PTR arg(2) ;src_pixels_per_line
        movsxd      rcx,        DWORD PTR arg(6) ;rows in a macroblock
.nextrow:
        xor         rdx,        rdx              ;col
.nextcol:
        ;load current and next 2 rows
        movdqu      xmm0,       XMMWORD PTR [rsi]
        movdqu      xmm1,       XMMWORD PTR [rsi + rax]
        movdqu      xmm3,       XMMWORD PTR [rsi + 2*rax]

        FIRST_2_ROWS

        ;load above 2 rows
        neg         rax                          ; negative stride
        movdqu      xmm1,       XMMWORD PTR [rsi + 2*rax]
        movdqu      xmm3,       XMMWORD PTR [rsi + rax]

        SECOND_2_ROWS

        movdqu      XMMWORD PTR [rdi], xmm0

        neg         rax                          ; positive stride
        add         rsi,        16
        add         rdi,        16

        add         rdx,        16
        cmp         edx,        dword arg(4)     ;cols
        jge         .downdone
        UPDATE_FLIMIT                            ; thresholds for the next 16 columns
        jmp         .nextcol

.downdone:
        ; done with all the cols, start the across filtering in place
        sub         rsi,        rdx              ; rewind both pointers to the
        sub         rdi,        rdx              ; start of the row

        mov         rbx,        arg(5) ; flimits
        UPDATE_FLIMIT

        ; dup the first byte into the left border 8 times
        movq        mm1,   [rdi]
        punpcklbw   mm1,   mm1
        punpcklwd   mm1,   mm1
        punpckldq   mm1,   mm1
        mov         rdx,    -8
        movq        [rdi+rdx], mm1

        ; dup the last byte into the right border
        movsxd      rdx,    dword arg(4)
        movq        mm1,   [rdi + rdx + -1]
        punpcklbw   mm1,   mm1
        punpcklwd   mm1,   mm1
        punpckldq   mm1,   mm1
        movq        [rdi+rdx], mm1

        ; Filtered output is held back in mm0/mm1 and stored one iteration
        ; late, after the next group has been loaded, so the filter reads
        ; unfiltered neighbours. Prime mm0/mm1 with the 16 bytes left of the
        ; row so that the first deferred store rewrites what is already there.
        xor         rdx,        rdx
        movq        mm0,        QWORD PTR [rdi-16];
        movq        mm1,        QWORD PTR [rdi-8];

.acrossnextcol:
        movdqu      xmm0,       XMMWORD PTR [rdi + rdx]
        movdqu      xmm1,       XMMWORD PTR [rdi + rdx -2]
        movdqu      xmm3,       XMMWORD PTR [rdi + rdx -1]

        FIRST_2_ROWS

        movdqu      xmm1,       XMMWORD PTR [rdi + rdx +1]
        movdqu      xmm3,       XMMWORD PTR [rdi + rdx +2]

        SECOND_2_ROWS

        movq        QWORD PTR [rdi+rdx-16], mm0  ; store previous 8 bytes
        movq        QWORD PTR [rdi+rdx-8], mm1   ; store previous 8 bytes
        movdq2q     mm0,        xmm0             ; low half of this group's result
        psrldq      xmm0,       8
        movdq2q     mm1,        xmm0             ; high half of this group's result

        add         rdx,        16
        cmp         edx,        dword arg(4)     ;cols
        jge         .acrossdone
        UPDATE_FLIMIT                            ; thresholds for the next 16 columns
        jmp         .acrossnextcol

.acrossdone:
        ; last 16 pixels
        movq        QWORD PTR [rdi+rdx-16], mm0

        ; the upper 8 bytes are stored only when the loop ended exactly at
        ; cols; otherwise they are dropped
        cmp         edx,        dword arg(4)
        jne         .throw_last_8
        movq        QWORD PTR [rdi+rdx-8], mm1
.throw_last_8:
        ; done with this row
        ; Strides are sign-extended here, matching the initial movsxd load of
        ; src_pixels_per_line, so every row uses the same 64-bit stride value.
        add         rsi,rax                      ;next src line
        movsxd      rax, dword arg(3)            ;dst_pixels_per_line
        add         rdi,rax                      ;next destination
        movsxd      rax, dword arg(2)            ;src_pixels_per_line

        mov         rbx,        arg(5)           ;flimits
        UPDATE_FLIMIT

        dec         rcx                          ;decrement count
        jnz         .nextrow                     ;next row

    add rsp, 16                                  ; drop the flimit scratch slot
    pop rsp                                      ; undo ALIGN_STACK
    ; begin epilog
    pop rdi
    pop rsi
    pop rbx
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
%undef flimit


;-----------------------------------------------------------------------
;void vpx_mbpost_proc_across_ip_sse2(unsigned char *src,
;                                    int pitch, int rows, int cols,int flimit)
;
; In-place horizontal filter. For every row a running sum and sum of squares
; over a 15-pixel window (s[c-7] .. s[c+7]) is maintained, four columns per
; iteration. Where 15*sumsq - sum*sum < flimit the pixel is replaced by
; (pixel + sum + 8) >> 4; elsewhere it is left unchanged.
;
; Memory touched besides the row itself (all visible in the code):
;   - 8 bytes are read at s[0] and 8 bytes at s[cols - 1] to build the borders.
;   - s[-8 .. -1] is first filled with s[0]; because output is written 8
;     bytes late from zero-initialised mm0/mm1, those bytes end up as zero.
;   - s[cols .. cols+7] is overwritten with copies of s[cols - 1].
;   - the column loop reads 4 bytes at s[c+7] for c up to cols+4, i.e. up to
;     s[cols+14].
;
; The shadowed argument slots arg(0) (src) and arg(2) (rows) are used as the
; row pointer and row counter and are modified. The row loop tests its
; condition at the bottom, so one row is processed even when rows <= 0.
;
; mm0/mm1 are used and no emms is executed before returning.
; NOTE(review): presumably the caller is expected to clear the MMX state -
; confirm.
;-----------------------------------------------------------------------
globalsym(vpx_mbpost_proc_across_ip_sse2)
sym(vpx_mbpost_proc_across_ip_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 5
    SAVE_XMM 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    ALIGN_STACK 16, rax
    sub         rsp, 16                          ; 16-byte scratch slot for flimit4

    ; create flimit4 at [rsp]: flimit replicated into four dwords
    mov         eax, dword ptr arg(4) ;flimit
    mov         [rsp], eax
    mov         [rsp+4], eax
    mov         [rsp+8], eax
    mov         [rsp+12], eax
%define flimit4 [rsp]


    ;for(r=0;r<rows;r++)
.ip_row_loop:

        xor         rdx,    rdx ;sumsq=0; (overwritten with cols below)
        xor         rcx,    rcx ;sum=0;
        mov         rsi,    arg(0); s


        ; dup the first byte into the left border 8 times
        movq        mm1,   [rsi]
        punpcklbw   mm1,   mm1
        punpcklwd   mm1,   mm1
        punpckldq   mm1,   mm1

        mov         rdi,    -8                   ; also the start index of .ip_var_loop
        movq        [rsi+rdi], mm1

        ; dup the last byte into the right border
        ; NOTE(review): this reloads rdx with cols and nothing clears it
        ; before .ip_var_loop adds into edx, so sumsq effectively starts at
        ; cols rather than 0. Confirm against the C reference whether that
        ; is intended before changing it.
        movsxd      rdx,    dword arg(3)
        movq        mm1,   [rsi + rdx + -1]
        punpcklbw   mm1,   mm1
        punpcklwd   mm1,   mm1
        punpckldq   mm1,   mm1
        movq        [rsi+rdx], mm1

.ip_var_loop:
        ;for(i=-8;i<=6;i++)
        ;{
        ;    sumsq += s[i]*s[i];
        ;    sum   += s[i];
        ;}
        movzx       eax, byte [rsi+rdi]
        add         ecx, eax
        mul         al                           ; ax = al*al; edx is not written
        add         edx, eax
        add         rdi, 1
        cmp         rdi, 6
        jle         .ip_var_loop


            ;mov         rax,    sumsq
            ;movd        xmm7,   rax
            movd        xmm7,   edx              ; xmm7 lane 0 = sumsq, other lanes 0

            ;mov         rax,    sum
            ;movd        xmm6,   rax
            movd        xmm6,   ecx              ; xmm6 lane 0 = sum, other lanes 0

            mov         rsi,    arg(0) ;s
            xor         rcx,    rcx              ; rcx = column c

            movsxd      rdx,    dword arg(3) ;cols
            add         rdx,    8                ; loop bound: output lags by 8 bytes
            pxor        mm0,    mm0              ; delayed-output queue starts empty
            pxor        mm1,    mm1

            pxor        xmm0,   xmm0             ; zero, used for unpacking/packing
.nextcol4:

            movd        xmm1,   DWORD PTR [rsi+rcx-8]   ; -8 -7 -6 -5 (leaving the window)
            movd        xmm2,   DWORD PTR [rsi+rcx+7]   ; +7 +8 +9 +10 (entering the window)

            punpcklbw   xmm1,   xmm0                    ; expanding
            punpcklbw   xmm2,   xmm0                    ; expanding

            punpcklwd   xmm1,   xmm0                    ; expanding to dwords
            punpcklwd   xmm2,   xmm0                    ; expanding to dwords

            psubd       xmm2,   xmm1                    ; d = in - out   (7--8   8--7   9--6 10--5)
            paddd       xmm1,   xmm1                    ; 2*out

            paddd       xmm1,   xmm2                    ; in + out
            pmaddwd     xmm1,   xmm2                    ; (in+out)*(in-out) = in^2 - out^2

            paddd       xmm6,   xmm2                    ; lane 0: sum   for column c
            paddd       xmm7,   xmm1                    ; lane 0: sumsq for column c

            pshufd      xmm6,   xmm6,   0               ; broadcast lane 0 to all lanes
            pshufd      xmm7,   xmm7,   0               ; broadcast lane 0 to all lanes

            psrldq      xmm1,       4                   ; sumsq deltas: d1 d2 d3 0
            psrldq      xmm2,       4                   ; sum   deltas: d1 d2 d3 0

            ; turn the broadcast value into per-column prefix sums:
            ; lanes 1..3 get d1, lanes 2..3 get d2, lane 3 gets d3
            pshufd      xmm3,   xmm1,   3               ; 0  d1  d1  d1 (squared terms)
            pshufd      xmm4,   xmm2,   3               ; 0  d1  d1  d1

            paddd       xmm6,   xmm4
            paddd       xmm7,   xmm3

            pshufd      xmm3,   xmm1,   01011111b       ; 0  0  d2  d2 (squared terms)
            pshufd      xmm4,   xmm2,   01011111b       ; 0  0  d2  d2

            paddd       xmm7,   xmm3
            paddd       xmm6,   xmm4

            pshufd      xmm3,   xmm1,   10111111b       ; 0  0  0  d3 (squared terms)
            pshufd      xmm4,   xmm2,   10111111b       ; 0  0  0  d3

            paddd       xmm7,   xmm3
            paddd       xmm6,   xmm4

            movdqa      xmm3,   xmm6
            pmaddwd     xmm3,   xmm3                    ; sum*sum

            movdqa      xmm5,   xmm7
            pslld       xmm5,   4                       ; 16*sumsq

            psubd       xmm5,   xmm7                    ; 15*sumsq
            psubd       xmm5,   xmm3                    ; 15*sumsq - sum*sum

            psubd       xmm5,   flimit4
            psrad       xmm5,   31                      ; all ones where the result is < flimit

            packssdw    xmm5,   xmm0                    ; narrow the mask to one byte
            packsswb    xmm5,   xmm0                    ; per pixel

            movd        xmm1,   DWORD PTR [rsi+rcx]     ; the 4 pixels being filtered
            movq        xmm2,   xmm1                    ; keep the originals

            punpcklbw   xmm1,   xmm0
            punpcklwd   xmm1,   xmm0

            paddd       xmm1,   xmm6                    ; pixel + sum
            paddd       xmm1,   [GLOBAL(four8s)]        ; + 8 for rounding

            psrad       xmm1,   4
            packssdw    xmm1,   xmm0

            packuswb    xmm1,   xmm0
            pand        xmm1,   xmm5                    ; filtered where mask is set

            pandn       xmm5,   xmm2                    ; original where mask is clear
            por         xmm5,   xmm1

            ; output is queued through mm1 -> mm0 and stored two iterations
            ; (8 bytes) behind the read position
            movd        [rsi+rcx-8],  mm0
            movq        mm0,    mm1

            movdq2q     mm1,    xmm5
            psrldq      xmm7,   12                      ; carry lane 3 into lane 0

            psrldq      xmm6,   12                      ; for the next 4 columns
            add         rcx,    4

            cmp         rcx,    rdx
            jl          .nextcol4

        ;s+=pitch;
        movsxd rax, dword arg(1)
        add    arg(0), rax

        sub dword arg(2), 1 ;rows-=1
        cmp dword arg(2), 0
        jg .ip_row_loop

    add         rsp, 16                          ; drop the flimit4 scratch slot
    pop         rsp                              ; undo ALIGN_STACK

    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
%undef flimit4


SECTION_RODATA
align 16
; Four dwords of 8: the rounding term added before the >> 4 in
; vpx_mbpost_proc_across_ip_sse2. Aligned to 16 because it is used as a
; memory operand of paddd.
four8s:
    times 4 dd 8