; xref: /aosp_15_r20/external/libvpx/vp8/common/x86/subpixel_sse2.asm (revision fb1b10ab9aebc7c7068eedab379b749d7e3900be)
;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;


; ABI helper macros used by every routine in this file (arg(), sym(),
; globalsym(), GLOBAL(), SHADOW_ARGS_TO_STACK, SAVE_XMM, GET_GOT, ...).
%include "vpx_ports/x86_abi_support.asm"

; Filter constants. VP8_FILTER_WEIGHT equals 1 shifted left by
; VP8_FILTER_SHIFT; the kernels in this file shift their accumulated sums
; right by 7 bits, i.e. by VP8_FILTER_SHIFT.
%define BLOCK_HEIGHT_WIDTH 4
%define VP8_FILTER_WEIGHT 128
%define VP8_FILTER_SHIFT  7

SECTION .text

;/************************************************************************************
; vp8_filter_block1d8_h6_sse2: horizontal 6-tap filter, 8 output pixels per row.
;
; For every row, each of the 8 outputs is the sum of six products
;     src[x - 2 + k] * tap[k],  k = 0..5
; plus the rounding vector rd, shifted right arithmetically by 7 bits
; (VP8_FILTER_SHIFT), clamped to 0..255 and stored zero-extended as a 16-bit
; word. Products are 16-bit (pmullw) and are accumulated with signed saturating
; adds (paddsw). ONE row is produced per loop iteration; 16 bytes are written
; per row.
;
; Preconditions visible in the code:
;   - output_height must be non-zero: the row loop is dec/jnz with no entry check.
;   - output_ptr must be 16-byte aligned on every row (rows are stored with
;     movdqa), so for more than one row output_width must be a multiple of 16.
;   - vp8_filter is read as 128-bit memory operands at byte offsets 0, 16, 32,
;     48, 64 and 80, so it must be 16-byte aligned. Each tap occupies 8 words;
;     presumably the same tap value replicated 8 times - confirm with the caller.
;   - Each row loads src_ptr[-2 .. 13]; only src_ptr[-2 .. 10] affect the result.
;   - output_width is added to the output pointer as a byte count.
;   - pixel_step (arg 3) is not read by this routine.
;
; rd is a 128-bit rounding constant defined elsewhere in this file.
;
; Register use inside the loop:
;   rsi  = current source row          rdi = current output row
;   rax  = source pitch                rcx = rows remaining
;   rdx  = filter table                r8  = output stride (64-bit ABIs only)
;   xmm0 = zero, used for byte/word unpacking and packing
;*************************************************************************************/
;void vp8_filter_block1d8_h6_sse2
;(
;    unsigned char  *src_ptr,
;    unsigned short *output_ptr,
;    unsigned int    src_pixels_per_line,
;    unsigned int    pixel_step,
;    unsigned int    output_height,
;    unsigned int    output_width,
;    short           *vp8_filter
;)
globalsym(vp8_filter_block1d8_h6_sse2)
sym(vp8_filter_block1d8_h6_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 7
    SAVE_XMM 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

        mov         rdx,        arg(6) ;vp8_filter
        mov         rsi,        arg(0) ;src_ptr

        mov         rdi,        arg(1) ;output_ptr

        movsxd      rcx,        dword ptr arg(4) ;output_height
        movsxd      rax,        dword ptr arg(2) ;src_pixels_per_line            ; Pitch for Source
%if ABI_IS_32BIT=0
        movsxd      r8,         dword ptr arg(5) ;output_width
%endif
        pxor        xmm0,       xmm0                        ; clear xmm0 for unpack

.filter_block1d8_h6_rowloop:
        ; Gather source bytes -2..13 of this row into xmm1 (byte 0 = pixel -2).
        movq        xmm3,       MMWORD PTR [rsi - 2]
        movq        xmm1,       MMWORD PTR [rsi + 6]

        prefetcht2  [rsi+rax-2]                             ; hint: next source row

        pslldq      xmm1,       8
        por         xmm1,       xmm3

        ; One working copy per tap; each copy is shifted right by its tap index
        ; and its low 8 bytes are widened to words. Lane comments list words
        ; from high to low.
        movdqa      xmm4,       xmm1
        movdqa      xmm5,       xmm1

        movdqa      xmm6,       xmm1
        movdqa      xmm7,       xmm1

        punpcklbw   xmm3,       xmm0                        ; xx05 xx04 xx03 xx02 xx01 xx00 xx-1 xx-2
        psrldq      xmm4,       1                           ; xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00 -1

        pmullw      xmm3,       XMMWORD PTR [rdx]           ; x[-2] * H[-2]; Tap 1
        punpcklbw   xmm4,       xmm0                        ; xx06 xx05 xx04 xx03 xx02 xx01 xx00 xx-1

        psrldq      xmm5,       2                           ; xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00
        pmullw      xmm4,       XMMWORD PTR [rdx+16]        ; x[-1] * H[-1]; Tap 2


        punpcklbw   xmm5,       xmm0                        ; xx07 xx06 xx05 xx04 xx03 xx02 xx01 xx00
        psrldq      xmm6,       3                           ; xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01

        pmullw      xmm5,       [rdx+32]                    ; x[ 0] * H[ 0]; Tap 3

        punpcklbw   xmm6,       xmm0                        ; xx08 xx07 xx06 xx05 xx04 xx03 xx02 xx01
        psrldq      xmm7,       4                           ; xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02

        pmullw      xmm6,       [rdx+48]                    ; x[ 1] * h[ 1] ; Tap 4

        punpcklbw   xmm7,       xmm0                        ; xx09 xx08 xx07 xx06 xx05 xx04 xx03 xx02
        psrldq      xmm1,       5                           ; xx xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03


        pmullw      xmm7,       [rdx+64]                    ; x[ 2] * h[ 2] ; Tap 5

        punpcklbw   xmm1,       xmm0                        ; xx0a xx09 xx08 xx07 xx06 xx05 xx04 xx03
        pmullw      xmm1,       [rdx+80]                    ; x[ 3] * h[ 3] ; Tap 6


        ; Sum the six products. The adds saturate, so the order of the
        ; additions is part of the result and must not be rearranged.
        paddsw      xmm4,       xmm7
        paddsw      xmm4,       xmm5

        paddsw      xmm4,       xmm3
        paddsw      xmm4,       xmm6

        paddsw      xmm4,       xmm1
        paddsw      xmm4,       [GLOBAL(rd)]                ; add rounding vector

        psraw       xmm4,       7                           ; shift by VP8_FILTER_SHIFT

        packuswb    xmm4,       xmm0                        ; clamp each word to 0..255
        punpcklbw   xmm4,       xmm0                        ; widen back to 8 words

        movdqa      XMMWORD Ptr [rdi],         xmm4         ; aligned 16-byte store
        lea         rsi,        [rsi + rax]                 ; next source row

%if ABI_IS_32BIT
        add         rdi,        DWORD Ptr arg(5) ;[output_width]
%else
        add         rdi,        r8
%endif
        dec         rcx

        jnz         .filter_block1d8_h6_rowloop                ; next row

    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret


;void vp8_filter_block1d16_h6_sse2
;(
;    unsigned char  *src_ptr,
;    unsigned short *output_ptr,
;    unsigned int    src_pixels_per_line,
;    unsigned int    pixel_step,
;    unsigned int    output_height,
;    unsigned int    output_width,
;    short           *vp8_filter
;)
;/************************************************************************************
; vp8_filter_block1d16_h6_sse2: horizontal 6-tap filter, 16 output pixels per row.
;
; Each row is processed as two groups of 8 outputs (pixels 0..7, then 8..15).
; Every output is the sum of six products
;     src[x - 2 + k] * tap[k],  k = 0..5
; plus the rounding vector rd, shifted right arithmetically by 7 bits
; (VP8_FILTER_SHIFT), clamped to 0..255 and stored zero-extended as a 16-bit
; word. Products are 16-bit (pmullw) and are accumulated with signed saturating
; adds (paddsw). ONE row is produced per loop iteration; 32 bytes are written
; per row.
;
; Preconditions visible in the code:
;   - output_height must be non-zero: the row loop is dec/jnz with no entry check.
;   - output_ptr must be 16-byte aligned on every row (both halves are stored
;     with movdqa), so for more than one row output_width must be a multiple
;     of 16.
;   - vp8_filter is read as 128-bit memory operands at byte offsets 0, 16, 32,
;     48, 64 and 80, so it must be 16-byte aligned. Each tap occupies 8 words;
;     presumably the same tap value replicated 8 times - confirm with the caller.
;   - Each row loads src_ptr[-2 .. 18] and nothing beyond it.
;   - output_width is added to the output pointer as a byte count.
;   - pixel_step (arg 3) is not read by this routine.
;
; rd is a 128-bit rounding constant defined elsewhere in this file.
;
; Register use inside the loop:
;   rsi  = current source row          rdi = current output row
;   rax  = source pitch                rcx = rows remaining
;   rdx  = filter table                r8  = output stride (64-bit ABIs only)
;   xmm0 = zero, used for byte/word unpacking and packing
;   xmm2 = source bytes 6..18, kept for the second group of 8 outputs
;*************************************************************************************/
globalsym(vp8_filter_block1d16_h6_sse2)
sym(vp8_filter_block1d16_h6_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 7
    SAVE_XMM 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

        mov         rdx,        arg(6) ;vp8_filter
        mov         rsi,        arg(0) ;src_ptr

        mov         rdi,        arg(1) ;output_ptr

        movsxd      rcx,        dword ptr arg(4) ;output_height
        movsxd      rax,        dword ptr arg(2) ;src_pixels_per_line            ; Pitch for Source
%if ABI_IS_32BIT=0
        movsxd      r8,         dword ptr arg(5) ;output_width
%endif

        pxor        xmm0,       xmm0                        ; clear xmm0 for unpack

.filter_block1d16_h6_sse2_rowloop:
        movq        xmm3,       MMWORD PTR [rsi - 2]        ; source bytes -2..5
        movq        xmm1,       MMWORD PTR [rsi + 6]        ; source bytes  6..13

        ; Load from 11 to avoid reading out of bounds: this 8-byte load ends at
        ; source byte 18, the last byte the filter needs for output pixel 15.
        movq        xmm2,       MMWORD PTR [rsi +11]        ; source bytes 11..18
        ; After the shift, bytes 5..7 of xmm2 overlap the top three bytes loaded
        ; into xmm1. Both registers hold the same source bytes (11..13) there,
        ; so OR-ing them together is lossless.
        pslldq      xmm2,       5

        por         xmm2,       xmm1                        ; xmm2 = source bytes 6..18
        prefetcht2  [rsi+rax-2]                             ; hint: next source row

        pslldq      xmm1,       8
        por         xmm1,       xmm3                        ; xmm1 = source bytes -2..13

        ; ---- outputs 0..7 -------------------------------------------------
        ; One working copy per tap; each copy is shifted right by its tap index
        ; and its low 8 bytes are widened to words. Lane comments list words
        ; from high to low.
        movdqa      xmm4,       xmm1
        movdqa      xmm5,       xmm1

        movdqa      xmm6,       xmm1
        movdqa      xmm7,       xmm1

        punpcklbw   xmm3,       xmm0                        ; xx05 xx04 xx03 xx02 xx01 xx00 xx-1 xx-2
        psrldq      xmm4,       1                           ; xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00 -1

        pmullw      xmm3,       XMMWORD PTR [rdx]           ; x[-2] * H[-2]; Tap 1
        punpcklbw   xmm4,       xmm0                        ; xx06 xx05 xx04 xx03 xx02 xx01 xx00 xx-1

        psrldq      xmm5,       2                           ; xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00
        pmullw      xmm4,       XMMWORD PTR [rdx+16]        ; x[-1] * H[-1]; Tap 2


        punpcklbw   xmm5,       xmm0                        ; xx07 xx06 xx05 xx04 xx03 xx02 xx01 xx00
        psrldq      xmm6,       3                           ; xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01

        pmullw      xmm5,       [rdx+32]                    ; x[ 0] * H[ 0]; Tap 3

        punpcklbw   xmm6,       xmm0                        ; xx08 xx07 xx06 xx05 xx04 xx03 xx02 xx01
        psrldq      xmm7,       4                           ; xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02

        pmullw      xmm6,       [rdx+48]                    ; x[ 1] * h[ 1] ; Tap 4

        punpcklbw   xmm7,       xmm0                        ; xx09 xx08 xx07 xx06 xx05 xx04 xx03 xx02
        psrldq      xmm1,       5                           ; xx xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03


        pmullw      xmm7,       [rdx+64]                    ; x[ 2] * h[ 2] ; Tap 5

        punpcklbw   xmm1,       xmm0                        ; xx0a xx09 xx08 xx07 xx06 xx05 xx04 xx03
        pmullw      xmm1,       [rdx+80]                    ; x[ 3] * h[ 3] ; Tap 6

        ; Sum the six products. The adds saturate, so the order of the
        ; additions is part of the result and must not be rearranged.
        paddsw      xmm4,       xmm7
        paddsw      xmm4,       xmm5

        paddsw      xmm4,       xmm3
        paddsw      xmm4,       xmm6

        paddsw      xmm4,       xmm1
        paddsw      xmm4,       [GLOBAL(rd)]                ; add rounding vector

        psraw       xmm4,       7                           ; shift by VP8_FILTER_SHIFT

        packuswb    xmm4,       xmm0                        ; clamp each word to 0..255
        punpcklbw   xmm4,       xmm0                        ; widen back to 8 words

        movdqa      XMMWORD Ptr [rdi],         xmm4         ; outputs 0..7

        ; ---- outputs 8..15 ------------------------------------------------
        ; Same scheme, starting from xmm2 whose byte 0 is source byte 06
        ; (that is x[-2] for output pixel 8).
        movdqa      xmm3,       xmm2
        movdqa      xmm4,       xmm2

        movdqa      xmm5,       xmm2
        movdqa      xmm6,       xmm2

        movdqa      xmm7,       xmm2

        punpcklbw   xmm3,       xmm0                        ; xx0d xx0c xx0b xx0a xx09 xx08 xx07 xx06
        psrldq      xmm4,       1                           ; xx xx xx xx 12 11 10 0f 0e 0d 0c 0b 0a 09 08 07

        pmullw      xmm3,       XMMWORD PTR [rdx]           ; x[-2] * H[-2]; Tap 1
        punpcklbw   xmm4,       xmm0                        ; xx0e xx0d xx0c xx0b xx0a xx09 xx08 xx07

        psrldq      xmm5,       2                           ; xx xx xx xx xx 12 11 10 0f 0e 0d 0c 0b 0a 09 08
        pmullw      xmm4,       XMMWORD PTR [rdx+16]        ; x[-1] * H[-1]; Tap 2


        punpcklbw   xmm5,       xmm0                        ; xx0f xx0e xx0d xx0c xx0b xx0a xx09 xx08
        psrldq      xmm6,       3                           ; xx xx xx xx xx xx 12 11 10 0f 0e 0d 0c 0b 0a 09

        pmullw      xmm5,       [rdx+32]                    ; x[ 0] * H[ 0]; Tap 3

        punpcklbw   xmm6,       xmm0                        ; xx10 xx0f xx0e xx0d xx0c xx0b xx0a xx09
        psrldq      xmm7,       4                           ; xx xx xx xx xx xx xx 12 11 10 0f 0e 0d 0c 0b 0a

        pmullw      xmm6,       [rdx+48]                    ; x[ 1] * h[ 1] ; Tap 4

        punpcklbw   xmm7,       xmm0                        ; xx11 xx10 xx0f xx0e xx0d xx0c xx0b xx0a
        psrldq      xmm2,       5                           ; xx xx xx xx xx xx xx xx 12 11 10 0f 0e 0d 0c 0b

        pmullw      xmm7,       [rdx+64]                    ; x[ 2] * h[ 2] ; Tap 5

        punpcklbw   xmm2,       xmm0                        ; xx12 xx11 xx10 xx0f xx0e xx0d xx0c xx0b
        pmullw      xmm2,       [rdx+80]                    ; x[ 3] * h[ 3] ; Tap 6


        ; Same saturating accumulation order as for outputs 0..7.
        paddsw      xmm4,       xmm7
        paddsw      xmm4,       xmm5

        paddsw      xmm4,       xmm3
        paddsw      xmm4,       xmm6

        paddsw      xmm4,       xmm2
        paddsw      xmm4,       [GLOBAL(rd)]                ; add rounding vector

        psraw       xmm4,       7                           ; shift by VP8_FILTER_SHIFT

        packuswb    xmm4,       xmm0                        ; clamp each word to 0..255
        punpcklbw   xmm4,       xmm0                        ; widen back to 8 words

        movdqa      XMMWORD Ptr [rdi+16],      xmm4         ; outputs 8..15

        lea         rsi,        [rsi + rax]                 ; next source row
%if ABI_IS_32BIT
        add         rdi,        DWORD Ptr arg(5) ;[output_width]
%else
        add         rdi,        r8
%endif

        dec         rcx
        jnz         .filter_block1d16_h6_sse2_rowloop                ; next row

    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret


;void vp8_filter_block1d8_v6_sse2
;(
;    short *src_ptr,
;    unsigned char *output_ptr,
;    int dst_pitch,
;    unsigned int pixels_per_line,
;    unsigned int pixel_step,
;    unsigned int output_height,
;    unsigned int output_width,
;    short * vp8_filter
;)
;/************************************************************************************
; vp8_filter_block1d8_v6_sse2: vertical 6-tap filter, 8 output pixels per row.
;
; The source is an array of 16-bit words, 8 words (16 bytes) being read per
; row. For each output row, six consecutive source rows starting two rows
; above src_ptr are multiplied by taps 1..6 (pmullw), summed with signed
; saturating adds (paddsw) together with the rounding vector rd, shifted right
; arithmetically by 7 bits (VP8_FILTER_SHIFT), clamped to 0..255 and stored as
; 8 bytes at output_ptr.
;
; Preconditions visible in the code:
;   - output_height must be non-zero: the row loop is dec/jnz with no entry check.
;   - pixels_per_line is added to the source pointer as a byte count, i.e. it
;     is the distance in bytes between consecutive source rows.
;   - Source rows are loaded with movdqa, so every row address touched must be
;     16-byte aligned.
;   - vp8_filter is read as 128-bit memory operands at byte offsets 0, 16, 32,
;     48, 64 and 80, so it must be 16-byte aligned. Each tap occupies 8 words;
;     presumably the same tap value replicated 8 times - confirm with the caller.
;   - dst_pitch is added to the output pointer as a byte count; the 8-byte
;     store (movq) has no alignment requirement.
;   - pixel_step (arg 4) and output_width (arg 6) are not read by this routine.
;
; rd is a 128-bit rounding constant defined elsewhere in this file.
;
; Register use inside the loop:
;   rsi  = topmost source row of the 6-row window    rdi = current output row
;   rdx  = source row stride                         rcx = rows remaining
;   rax  = filter table                              r8  = dst pitch (64-bit ABIs only)
;   xmm0 = zero                                      xmm7 = rounding vector rd
;*************************************************************************************/
globalsym(vp8_filter_block1d8_v6_sse2)
sym(vp8_filter_block1d8_v6_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 8
    SAVE_XMM 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

        mov         rax,        arg(7) ;vp8_filter
        movsxd      rdx,        dword ptr arg(3) ;pixels_per_line

        mov         rdi,        arg(1) ;output_ptr
        mov         rsi,        arg(0) ;src_ptr

        ; Step back two rows so that rsi addresses the row used by tap 1.
        sub         rsi,        rdx
        sub         rsi,        rdx

        movsxd      rcx,        DWORD PTR arg(5) ;[output_height]
        pxor        xmm0,       xmm0                        ; clear xmm0

        movdqa      xmm7,       XMMWORD PTR [GLOBAL(rd)]    ; rounding vector, loaded once
%if ABI_IS_32BIT=0
        movsxd      r8,         dword ptr arg(2) ; dst_pitch
%endif

.vp8_filter_block1d8_v6_sse2_loop:
        ; Window rows 0, 1, 2 and 4 are addressable directly from rsi.
        movdqa      xmm1,       XMMWORD PTR [rsi]
        pmullw      xmm1,       [rax]                       ; row 0 * Tap 1

        movdqa      xmm2,       XMMWORD PTR [rsi + rdx]
        pmullw      xmm2,       [rax + 16]                  ; row 1 * Tap 2

        movdqa      xmm3,       XMMWORD PTR [rsi + rdx * 2]
        pmullw      xmm3,       [rax + 32]                  ; row 2 * Tap 3

        movdqa      xmm5,       XMMWORD PTR [rsi + rdx * 4]
        pmullw      xmm5,       [rax + 64]                  ; row 4 * Tap 5

        ; Advancing rsi by one row makes rows 3 and 5 reachable with the 2x and
        ; 4x scale factors, and also serves as the per-iteration row advance.
        add         rsi,        rdx
        movdqa      xmm4,       XMMWORD PTR [rsi + rdx * 2]

        pmullw      xmm4,       [rax + 48]                  ; row 3 * Tap 4
        movdqa      xmm6,       XMMWORD PTR [rsi + rdx * 4]

        pmullw      xmm6,       [rax + 80]                  ; row 5 * Tap 6

        ; Sum the six products. The adds saturate, so the order of the
        ; additions is part of the result and must not be rearranged.
        paddsw      xmm2,       xmm5
        paddsw      xmm2,       xmm3

        paddsw      xmm2,       xmm1
        paddsw      xmm2,       xmm4

        paddsw      xmm2,       xmm6
        paddsw      xmm2,       xmm7                        ; add rounding vector

        psraw       xmm2,       7                           ; shift by VP8_FILTER_SHIFT
        packuswb    xmm2,       xmm0              ; pack and saturate

        movq        QWORD PTR [rdi], xmm2         ; store the results in the destination
%if ABI_IS_32BIT
        add         rdi,        DWORD PTR arg(2) ;[dst_pitch]
%else
        add         rdi,        r8
%endif
        dec         rcx         ; decrement count
        jnz         .vp8_filter_block1d8_v6_sse2_loop               ; next row

    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret


416*fb1b10abSAndroid Build Coastguard Worker;void vp8_filter_block1d16_v6_sse2
417*fb1b10abSAndroid Build Coastguard Worker;(
418*fb1b10abSAndroid Build Coastguard Worker;    unsigned short *src_ptr,
419*fb1b10abSAndroid Build Coastguard Worker;    unsigned char *output_ptr,
420*fb1b10abSAndroid Build Coastguard Worker;    int dst_pitch,
421*fb1b10abSAndroid Build Coastguard Worker;    unsigned int pixels_per_line,
422*fb1b10abSAndroid Build Coastguard Worker;    unsigned int pixel_step,
423*fb1b10abSAndroid Build Coastguard Worker;    unsigned int output_height,
424*fb1b10abSAndroid Build Coastguard Worker;    unsigned int output_width,
425*fb1b10abSAndroid Build Coastguard Worker;    const short    *vp8_filter
426*fb1b10abSAndroid Build Coastguard Worker;)
427*fb1b10abSAndroid Build Coastguard Worker;/************************************************************************************
428*fb1b10abSAndroid Build Coastguard Worker; Notes: filter_block1d16_v6 applies a 6 tap filter vertically to the input pixels. Each
429*fb1b10abSAndroid Build Coastguard Worker; pass of the loop below writes one output row of 16 bytes; it runs output_height times.
;
; - Source rows hold 16-bit words, 16 per row (two 16-byte loads per row at +0 and +16).
; - pixels_per_line is added to the source pointer unscaled, i.e. it is used as the
;   distance between rows in BYTES.
; - rsi starts two rows above src_ptr, so output row r is built from source rows
;   r-2 .. r+3 ("line 1" .. "line 6" in the comments below).
; - vp8_filter is read as six 16-byte vectors at byte offsets 0, 16, 32, 48, 64, 80.
;   Presumably each vector holds one tap replicated in all 8 words -- confirm with
;   the table the caller passes.
; - Per lane: (saturating sum of the six products + rd) >> 7, then packed to unsigned
;   bytes with saturation. rd is defined outside this view.
; - movdqa is used for every source load and for the destination store, and the taps
;   are pmullw memory operands: source rows, every output row and the tap table all
;   have to be 16-byte aligned.
; - pixel_step (arg 4) and output_width (arg 6) are not read by this routine.
; - The loop is bottom-tested (dec/jnz); output_height == 0 is not handled.
430*fb1b10abSAndroid Build Coastguard Worker;*************************************************************************************/
431*fb1b10abSAndroid Build Coastguard Workerglobalsym(vp8_filter_block1d16_v6_sse2)
432*fb1b10abSAndroid Build Coastguard Workersym(vp8_filter_block1d16_v6_sse2):
433*fb1b10abSAndroid Build Coastguard Worker    push        rbp
434*fb1b10abSAndroid Build Coastguard Worker    mov         rbp, rsp
435*fb1b10abSAndroid Build Coastguard Worker    SHADOW_ARGS_TO_STACK 8                        ; ABI helper macros (defined in the included x86_abi_support.asm)
436*fb1b10abSAndroid Build Coastguard Worker    SAVE_XMM 7
437*fb1b10abSAndroid Build Coastguard Worker    GET_GOT     rbx
438*fb1b10abSAndroid Build Coastguard Worker    push        rsi
439*fb1b10abSAndroid Build Coastguard Worker    push        rdi
440*fb1b10abSAndroid Build Coastguard Worker    ; end prolog
441*fb1b10abSAndroid Build Coastguard Worker
442*fb1b10abSAndroid Build Coastguard Worker        mov         rax,        arg(7) ;vp8_filter: tap vectors at [rax] .. [rax + 80]
443*fb1b10abSAndroid Build Coastguard Worker        movsxd      rdx,        dword ptr arg(3) ;pixels_per_line, sign-extended; byte offset between rows
444*fb1b10abSAndroid Build Coastguard Worker
445*fb1b10abSAndroid Build Coastguard Worker        mov         rdi,        arg(1) ;output_ptr
446*fb1b10abSAndroid Build Coastguard Worker        mov         rsi,        arg(0) ;src_ptr
447*fb1b10abSAndroid Build Coastguard Worker
448*fb1b10abSAndroid Build Coastguard Worker        sub         rsi,        rdx               ; back up two rows so that rsi points at
449*fb1b10abSAndroid Build Coastguard Worker        sub         rsi,        rdx               ; "line 1" of the 6-row window
450*fb1b10abSAndroid Build Coastguard Worker
451*fb1b10abSAndroid Build Coastguard Worker        movsxd      rcx,        DWORD PTR arg(5) ;[output_height] = row counter
452*fb1b10abSAndroid Build Coastguard Worker%if ABI_IS_32BIT=0
453*fb1b10abSAndroid Build Coastguard Worker        movsxd      r8,         dword ptr arg(2) ; dst_pitch, kept in r8 on non-32-bit builds
454*fb1b10abSAndroid Build Coastguard Worker%endif
455*fb1b10abSAndroid Build Coastguard Worker
456*fb1b10abSAndroid Build Coastguard Worker.vp8_filter_block1d16_v6_sse2_loop:
457*fb1b10abSAndroid Build Coastguard Worker; The order for adding 6-tap is 2 5 3 1 4 6. Read in data in that order.
; xmm1 accumulates words 0-7 of the row, xmm2 accumulates words 8-15.
458*fb1b10abSAndroid Build Coastguard Worker        movdqa      xmm1,       XMMWORD PTR [rsi + rdx]       ; line 2, words 0-7
459*fb1b10abSAndroid Build Coastguard Worker        movdqa      xmm2,       XMMWORD PTR [rsi + rdx + 16]  ; line 2, words 8-15
460*fb1b10abSAndroid Build Coastguard Worker        pmullw      xmm1,       [rax + 16]        ; * tap 2
461*fb1b10abSAndroid Build Coastguard Worker        pmullw      xmm2,       [rax + 16]
462*fb1b10abSAndroid Build Coastguard Worker
463*fb1b10abSAndroid Build Coastguard Worker        movdqa      xmm3,       XMMWORD PTR [rsi + rdx * 4]       ; line 5
464*fb1b10abSAndroid Build Coastguard Worker        movdqa      xmm4,       XMMWORD PTR [rsi + rdx * 4 + 16]
465*fb1b10abSAndroid Build Coastguard Worker        pmullw      xmm3,       [rax + 64]        ; * tap 5
466*fb1b10abSAndroid Build Coastguard Worker        pmullw      xmm4,       [rax + 64]
467*fb1b10abSAndroid Build Coastguard Worker
468*fb1b10abSAndroid Build Coastguard Worker        movdqa      xmm5,       XMMWORD PTR [rsi + rdx * 2]       ; line 3 (the row src_ptr currently corresponds to)
469*fb1b10abSAndroid Build Coastguard Worker        movdqa      xmm6,       XMMWORD PTR [rsi + rdx * 2 + 16]
470*fb1b10abSAndroid Build Coastguard Worker        pmullw      xmm5,       [rax + 32]        ; * tap 3
471*fb1b10abSAndroid Build Coastguard Worker        pmullw      xmm6,       [rax + 32]
472*fb1b10abSAndroid Build Coastguard Worker
473*fb1b10abSAndroid Build Coastguard Worker        movdqa      xmm7,       XMMWORD PTR [rsi]       ; line 1
474*fb1b10abSAndroid Build Coastguard Worker        movdqa      xmm0,       XMMWORD PTR [rsi + 16]
475*fb1b10abSAndroid Build Coastguard Worker        pmullw      xmm7,       [rax]             ; * tap 1
476*fb1b10abSAndroid Build Coastguard Worker        pmullw      xmm0,       [rax]
477*fb1b10abSAndroid Build Coastguard Worker
478*fb1b10abSAndroid Build Coastguard Worker        paddsw      xmm1,       xmm3              ; tap 2 + tap 5 (signed saturating adds throughout)
479*fb1b10abSAndroid Build Coastguard Worker        paddsw      xmm2,       xmm4
480*fb1b10abSAndroid Build Coastguard Worker        paddsw      xmm1,       xmm5              ; + tap 3
481*fb1b10abSAndroid Build Coastguard Worker        paddsw      xmm2,       xmm6
482*fb1b10abSAndroid Build Coastguard Worker        paddsw      xmm1,       xmm7              ; + tap 1
483*fb1b10abSAndroid Build Coastguard Worker        paddsw      xmm2,       xmm0
484*fb1b10abSAndroid Build Coastguard Worker
485*fb1b10abSAndroid Build Coastguard Worker        add         rsi,        rdx               ; advance one row; *2 and *4 below now reach lines 4 and 6
486*fb1b10abSAndroid Build Coastguard Worker
487*fb1b10abSAndroid Build Coastguard Worker        movdqa      xmm3,       XMMWORD PTR [rsi + rdx * 2]       ; line 4
488*fb1b10abSAndroid Build Coastguard Worker        movdqa      xmm4,       XMMWORD PTR [rsi + rdx * 2 + 16]
489*fb1b10abSAndroid Build Coastguard Worker        pmullw      xmm3,       [rax + 48]        ; * tap 4
490*fb1b10abSAndroid Build Coastguard Worker        pmullw      xmm4,       [rax + 48]
491*fb1b10abSAndroid Build Coastguard Worker
492*fb1b10abSAndroid Build Coastguard Worker        movdqa      xmm5,       XMMWORD PTR [rsi + rdx * 4]       ; line 6
493*fb1b10abSAndroid Build Coastguard Worker        movdqa      xmm6,       XMMWORD PTR [rsi + rdx * 4 + 16]
494*fb1b10abSAndroid Build Coastguard Worker        pmullw      xmm5,       [rax + 80]        ; * tap 6
495*fb1b10abSAndroid Build Coastguard Worker        pmullw      xmm6,       [rax + 80]
496*fb1b10abSAndroid Build Coastguard Worker
497*fb1b10abSAndroid Build Coastguard Worker        movdqa      xmm7,       XMMWORD PTR [GLOBAL(rd)]    ; xmm7 = rd (defined outside this view; presumably the rounding term)
498*fb1b10abSAndroid Build Coastguard Worker        pxor        xmm0,       xmm0                        ; clear xmm0 -- NOTE(review): xmm0 is not read again in this loop before the "line 1" reload
499*fb1b10abSAndroid Build Coastguard Worker
500*fb1b10abSAndroid Build Coastguard Worker        paddsw      xmm1,       xmm3              ; + tap 4
501*fb1b10abSAndroid Build Coastguard Worker        paddsw      xmm2,       xmm4
502*fb1b10abSAndroid Build Coastguard Worker        paddsw      xmm1,       xmm5              ; + tap 6
503*fb1b10abSAndroid Build Coastguard Worker        paddsw      xmm2,       xmm6
504*fb1b10abSAndroid Build Coastguard Worker
505*fb1b10abSAndroid Build Coastguard Worker        paddsw      xmm1,       xmm7              ; + rd
506*fb1b10abSAndroid Build Coastguard Worker        paddsw      xmm2,       xmm7
507*fb1b10abSAndroid Build Coastguard Worker
508*fb1b10abSAndroid Build Coastguard Worker        psraw       xmm1,       7                 ; arithmetic >> 7
509*fb1b10abSAndroid Build Coastguard Worker        psraw       xmm2,       7
510*fb1b10abSAndroid Build Coastguard Worker
511*fb1b10abSAndroid Build Coastguard Worker        packuswb    xmm1,       xmm2              ; pack both halves into 16 unsigned bytes, saturating
512*fb1b10abSAndroid Build Coastguard Worker        movdqa      XMMWORD PTR [rdi], xmm1       ; aligned 16-byte store of the output row
513*fb1b10abSAndroid Build Coastguard Worker%if ABI_IS_32BIT
514*fb1b10abSAndroid Build Coastguard Worker        add         rdi,        DWORD PTR arg(2) ;[dst_pitch]
515*fb1b10abSAndroid Build Coastguard Worker%else
516*fb1b10abSAndroid Build Coastguard Worker        add         rdi,        r8
517*fb1b10abSAndroid Build Coastguard Worker%endif
518*fb1b10abSAndroid Build Coastguard Worker        dec         rcx         ; decrement count
519*fb1b10abSAndroid Build Coastguard Worker        jnz         .vp8_filter_block1d16_v6_sse2_loop              ; next row
520*fb1b10abSAndroid Build Coastguard Worker
521*fb1b10abSAndroid Build Coastguard Worker    ; begin epilog
522*fb1b10abSAndroid Build Coastguard Worker    pop rdi
523*fb1b10abSAndroid Build Coastguard Worker    pop rsi
524*fb1b10abSAndroid Build Coastguard Worker    RESTORE_GOT
525*fb1b10abSAndroid Build Coastguard Worker    RESTORE_XMM
526*fb1b10abSAndroid Build Coastguard Worker    UNSHADOW_ARGS
527*fb1b10abSAndroid Build Coastguard Worker    pop         rbp
528*fb1b10abSAndroid Build Coastguard Worker    ret
529*fb1b10abSAndroid Build Coastguard Worker
530*fb1b10abSAndroid Build Coastguard Worker
531*fb1b10abSAndroid Build Coastguard Worker;void vp8_filter_block1d8_h6_only_sse2
532*fb1b10abSAndroid Build Coastguard Worker;(
533*fb1b10abSAndroid Build Coastguard Worker;    unsigned char  *src_ptr,
534*fb1b10abSAndroid Build Coastguard Worker;    unsigned int    src_pixels_per_line,
535*fb1b10abSAndroid Build Coastguard Worker;    unsigned char  *output_ptr,
536*fb1b10abSAndroid Build Coastguard Worker;    int dst_pitch,
537*fb1b10abSAndroid Build Coastguard Worker;    unsigned int    output_height,
538*fb1b10abSAndroid Build Coastguard Worker;    const short    *vp8_filter
539*fb1b10abSAndroid Build Coastguard Worker;)
540*fb1b10abSAndroid Build Coastguard Worker; First-pass filter only when yoffset==0
;
; Applies the 6-tap filter horizontally and writes 8 output bytes per row directly to
; output_ptr, for output_height rows.
; - Each row reads 16 source bytes, src_ptr[-2] .. src_ptr[13], as two 8-byte loads.
; - vp8_filter is read as six 16-byte vectors at byte offsets 0 .. 80; they are pmullw
;   memory operands, so the table has to be 16-byte aligned. Presumably each vector
;   holds one tap replicated in all 8 words -- confirm with the caller's table.
; - Per lane: (saturating sum of the six products + rd) >> 7, packed to unsigned bytes
;   with saturation. rd is defined outside this view.
; - Source and destination are accessed with 8-byte movq, which has no alignment
;   requirement.
; - The loop is bottom-tested (dec/jnz); output_height == 0 is not handled.
541*fb1b10abSAndroid Build Coastguard Workerglobalsym(vp8_filter_block1d8_h6_only_sse2)
542*fb1b10abSAndroid Build Coastguard Workersym(vp8_filter_block1d8_h6_only_sse2):
543*fb1b10abSAndroid Build Coastguard Worker    push        rbp
544*fb1b10abSAndroid Build Coastguard Worker    mov         rbp, rsp
545*fb1b10abSAndroid Build Coastguard Worker    SHADOW_ARGS_TO_STACK 6                        ; ABI helper macros (defined in the included x86_abi_support.asm)
546*fb1b10abSAndroid Build Coastguard Worker    SAVE_XMM 7
547*fb1b10abSAndroid Build Coastguard Worker    GET_GOT     rbx
548*fb1b10abSAndroid Build Coastguard Worker    push        rsi
549*fb1b10abSAndroid Build Coastguard Worker    push        rdi
550*fb1b10abSAndroid Build Coastguard Worker    ; end prolog
551*fb1b10abSAndroid Build Coastguard Worker
552*fb1b10abSAndroid Build Coastguard Worker        mov         rdx,        arg(5) ;vp8_filter: tap vectors at [rdx] .. [rdx+80]
553*fb1b10abSAndroid Build Coastguard Worker        mov         rsi,        arg(0) ;src_ptr
554*fb1b10abSAndroid Build Coastguard Worker
555*fb1b10abSAndroid Build Coastguard Worker        mov         rdi,        arg(2) ;output_ptr
556*fb1b10abSAndroid Build Coastguard Worker
557*fb1b10abSAndroid Build Coastguard Worker        movsxd      rcx,        dword ptr arg(4) ;output_height = row counter
558*fb1b10abSAndroid Build Coastguard Worker        movsxd      rax,        dword ptr arg(1) ;src_pixels_per_line            ; Pitch for Source
559*fb1b10abSAndroid Build Coastguard Worker%if ABI_IS_32BIT=0
560*fb1b10abSAndroid Build Coastguard Worker        movsxd      r8,         dword ptr arg(3) ;dst_pitch, kept in r8 on non-32-bit builds
561*fb1b10abSAndroid Build Coastguard Worker%endif
562*fb1b10abSAndroid Build Coastguard Worker        pxor        xmm0,       xmm0                        ; clear xmm0 for unpack
563*fb1b10abSAndroid Build Coastguard Worker
564*fb1b10abSAndroid Build Coastguard Worker.filter_block1d8_h6_only_rowloop:
565*fb1b10abSAndroid Build Coastguard Worker        movq        xmm3,       MMWORD PTR [rsi - 2]        ; xmm3 = src[-2 .. 5]
566*fb1b10abSAndroid Build Coastguard Worker        movq        xmm1,       MMWORD PTR [rsi + 6]        ; xmm1 = src[6 .. 13]
567*fb1b10abSAndroid Build Coastguard Worker
568*fb1b10abSAndroid Build Coastguard Worker        prefetcht2  [rsi+rax-2]                             ; prefetch hint for the next source row
569*fb1b10abSAndroid Build Coastguard Worker
570*fb1b10abSAndroid Build Coastguard Worker        pslldq      xmm1,       8                           ; move src[6 .. 13] into the upper 8 bytes
571*fb1b10abSAndroid Build Coastguard Worker        por         xmm1,       xmm3                        ; xmm1 = src[-2 .. 13]
572*fb1b10abSAndroid Build Coastguard Worker
573*fb1b10abSAndroid Build Coastguard Worker        movdqa      xmm4,       xmm1                        ; copies to be byte-shifted for taps 2 .. 5
574*fb1b10abSAndroid Build Coastguard Worker        movdqa      xmm5,       xmm1
575*fb1b10abSAndroid Build Coastguard Worker
576*fb1b10abSAndroid Build Coastguard Worker        movdqa      xmm6,       xmm1
577*fb1b10abSAndroid Build Coastguard Worker        movdqa      xmm7,       xmm1
578*fb1b10abSAndroid Build Coastguard Worker
579*fb1b10abSAndroid Build Coastguard Worker        punpcklbw   xmm3,       xmm0                        ; xx05 xx04 xx03 xx02 xx01 xx00 xx-1 xx-2
580*fb1b10abSAndroid Build Coastguard Worker        psrldq      xmm4,       1                           ; xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00 -1
581*fb1b10abSAndroid Build Coastguard Worker
582*fb1b10abSAndroid Build Coastguard Worker        pmullw      xmm3,       XMMWORD PTR [rdx]           ; x[-2] * H[-2]; Tap 1
583*fb1b10abSAndroid Build Coastguard Worker        punpcklbw   xmm4,       xmm0                        ; xx06 xx05 xx04 xx03 xx02 xx01 xx00 xx-1
584*fb1b10abSAndroid Build Coastguard Worker
585*fb1b10abSAndroid Build Coastguard Worker        psrldq      xmm5,       2                           ; xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00
586*fb1b10abSAndroid Build Coastguard Worker        pmullw      xmm4,       XMMWORD PTR [rdx+16]        ; x[-1] * H[-1]; Tap 2
587*fb1b10abSAndroid Build Coastguard Worker
588*fb1b10abSAndroid Build Coastguard Worker
589*fb1b10abSAndroid Build Coastguard Worker        punpcklbw   xmm5,       xmm0                        ; xx07 xx06 xx05 xx04 xx03 xx02 xx01 xx00
590*fb1b10abSAndroid Build Coastguard Worker        psrldq      xmm6,       3                           ; xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01
591*fb1b10abSAndroid Build Coastguard Worker
592*fb1b10abSAndroid Build Coastguard Worker        pmullw      xmm5,       [rdx+32]                    ; x[ 0] * H[ 0]; Tap 3
593*fb1b10abSAndroid Build Coastguard Worker
594*fb1b10abSAndroid Build Coastguard Worker        punpcklbw   xmm6,       xmm0                        ; xx08 xx07 xx06 xx05 xx04 xx03 xx02 xx01
595*fb1b10abSAndroid Build Coastguard Worker        psrldq      xmm7,       4                           ; xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02
596*fb1b10abSAndroid Build Coastguard Worker
597*fb1b10abSAndroid Build Coastguard Worker        pmullw      xmm6,       [rdx+48]                    ; x[ 1] * h[ 1] ; Tap 4
598*fb1b10abSAndroid Build Coastguard Worker
599*fb1b10abSAndroid Build Coastguard Worker        punpcklbw   xmm7,       xmm0                        ; xx09 xx08 xx07 xx06 xx05 xx04 xx03 xx02
600*fb1b10abSAndroid Build Coastguard Worker        psrldq      xmm1,       5                           ; xx xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03
601*fb1b10abSAndroid Build Coastguard Worker
602*fb1b10abSAndroid Build Coastguard Worker
603*fb1b10abSAndroid Build Coastguard Worker        pmullw      xmm7,       [rdx+64]                    ; x[ 2] * h[ 2] ; Tap 5
604*fb1b10abSAndroid Build Coastguard Worker
605*fb1b10abSAndroid Build Coastguard Worker        punpcklbw   xmm1,       xmm0                        ; xx0a xx09 xx08 xx07 xx06 xx05 xx04 xx03
606*fb1b10abSAndroid Build Coastguard Worker        pmullw      xmm1,       [rdx+80]                    ; x[ 3] * h[ 3] ; Tap 6
607*fb1b10abSAndroid Build Coastguard Worker
608*fb1b10abSAndroid Build Coastguard Worker
609*fb1b10abSAndroid Build Coastguard Worker        paddsw      xmm4,       xmm7                        ; tap 2 + tap 5 (signed saturating adds throughout)
610*fb1b10abSAndroid Build Coastguard Worker        paddsw      xmm4,       xmm5                        ; + tap 3
611*fb1b10abSAndroid Build Coastguard Worker
612*fb1b10abSAndroid Build Coastguard Worker        paddsw      xmm4,       xmm3                        ; + tap 1
613*fb1b10abSAndroid Build Coastguard Worker        paddsw      xmm4,       xmm6                        ; + tap 4
614*fb1b10abSAndroid Build Coastguard Worker
615*fb1b10abSAndroid Build Coastguard Worker        paddsw      xmm4,       xmm1                        ; + tap 6
616*fb1b10abSAndroid Build Coastguard Worker        paddsw      xmm4,       [GLOBAL(rd)]                ; + rd (defined outside this view; presumably the rounding term)
617*fb1b10abSAndroid Build Coastguard Worker
618*fb1b10abSAndroid Build Coastguard Worker        psraw       xmm4,       7                           ; arithmetic >> 7
619*fb1b10abSAndroid Build Coastguard Worker
620*fb1b10abSAndroid Build Coastguard Worker        packuswb    xmm4,       xmm0                        ; 8 words -> 8 unsigned bytes, saturating (upper half from zeroed xmm0)
621*fb1b10abSAndroid Build Coastguard Worker
622*fb1b10abSAndroid Build Coastguard Worker        movq        QWORD PTR [rdi],   xmm4       ; store the results in the destination
623*fb1b10abSAndroid Build Coastguard Worker        lea         rsi,        [rsi + rax]       ; next source row
624*fb1b10abSAndroid Build Coastguard Worker
625*fb1b10abSAndroid Build Coastguard Worker%if ABI_IS_32BIT
626*fb1b10abSAndroid Build Coastguard Worker        add         rdi,        DWORD Ptr arg(3) ;dst_pitch
627*fb1b10abSAndroid Build Coastguard Worker%else
628*fb1b10abSAndroid Build Coastguard Worker        add         rdi,        r8
629*fb1b10abSAndroid Build Coastguard Worker%endif
630*fb1b10abSAndroid Build Coastguard Worker        dec         rcx
631*fb1b10abSAndroid Build Coastguard Worker
632*fb1b10abSAndroid Build Coastguard Worker        jnz         .filter_block1d8_h6_only_rowloop               ; next row
633*fb1b10abSAndroid Build Coastguard Worker
634*fb1b10abSAndroid Build Coastguard Worker    ; begin epilog
635*fb1b10abSAndroid Build Coastguard Worker    pop rdi
636*fb1b10abSAndroid Build Coastguard Worker    pop rsi
637*fb1b10abSAndroid Build Coastguard Worker    RESTORE_GOT
638*fb1b10abSAndroid Build Coastguard Worker    RESTORE_XMM
639*fb1b10abSAndroid Build Coastguard Worker    UNSHADOW_ARGS
640*fb1b10abSAndroid Build Coastguard Worker    pop         rbp
641*fb1b10abSAndroid Build Coastguard Worker    ret
642*fb1b10abSAndroid Build Coastguard Worker
643*fb1b10abSAndroid Build Coastguard Worker
644*fb1b10abSAndroid Build Coastguard Worker;void vp8_filter_block1d16_h6_only_sse2
645*fb1b10abSAndroid Build Coastguard Worker;(
646*fb1b10abSAndroid Build Coastguard Worker;    unsigned char  *src_ptr,
647*fb1b10abSAndroid Build Coastguard Worker;    unsigned int    src_pixels_per_line,
648*fb1b10abSAndroid Build Coastguard Worker;    unsigned char  *output_ptr,
649*fb1b10abSAndroid Build Coastguard Worker;    int dst_pitch,
650*fb1b10abSAndroid Build Coastguard Worker;    unsigned int    output_height,
651*fb1b10abSAndroid Build Coastguard Worker;    const short    *vp8_filter
652*fb1b10abSAndroid Build Coastguard Worker;)
653*fb1b10abSAndroid Build Coastguard Worker; First-pass filter only when yoffset==0
;
; Applies the 6-tap filter horizontally and writes 16 output bytes per row directly to
; output_ptr, for output_height rows. Each row is produced as two independent 8-pixel
; halves: outputs 0-7 are stored at [rdi], outputs 8-15 at [rdi+8].
; - Each row reads 24 source bytes, src_ptr[-2] .. src_ptr[21], as three 8-byte loads.
; - vp8_filter is read as six 16-byte vectors at byte offsets 0 .. 80; they are pmullw
;   memory operands, so the table has to be 16-byte aligned. Presumably each vector
;   holds one tap replicated in all 8 words -- confirm with the caller's table.
; - Per lane: (saturating sum of the six products + rd) >> 7, packed to unsigned bytes
;   with saturation. rd is defined outside this view.
; - Source and destination are accessed with 8-byte movq, which has no alignment
;   requirement.
; - The loop is bottom-tested (dec/jnz); output_height == 0 is not handled.
654*fb1b10abSAndroid Build Coastguard Workerglobalsym(vp8_filter_block1d16_h6_only_sse2)
655*fb1b10abSAndroid Build Coastguard Workersym(vp8_filter_block1d16_h6_only_sse2):
656*fb1b10abSAndroid Build Coastguard Worker    push        rbp
657*fb1b10abSAndroid Build Coastguard Worker    mov         rbp, rsp
658*fb1b10abSAndroid Build Coastguard Worker    SHADOW_ARGS_TO_STACK 6                        ; ABI helper macros (defined in the included x86_abi_support.asm)
659*fb1b10abSAndroid Build Coastguard Worker    SAVE_XMM 7
660*fb1b10abSAndroid Build Coastguard Worker    GET_GOT     rbx
661*fb1b10abSAndroid Build Coastguard Worker    push        rsi
662*fb1b10abSAndroid Build Coastguard Worker    push        rdi
663*fb1b10abSAndroid Build Coastguard Worker    ; end prolog
664*fb1b10abSAndroid Build Coastguard Worker
665*fb1b10abSAndroid Build Coastguard Worker        mov         rdx,        arg(5) ;vp8_filter: tap vectors at [rdx] .. [rdx+80]
666*fb1b10abSAndroid Build Coastguard Worker        mov         rsi,        arg(0) ;src_ptr
667*fb1b10abSAndroid Build Coastguard Worker
668*fb1b10abSAndroid Build Coastguard Worker        mov         rdi,        arg(2) ;output_ptr
669*fb1b10abSAndroid Build Coastguard Worker
670*fb1b10abSAndroid Build Coastguard Worker        movsxd      rcx,        dword ptr arg(4) ;output_height = row counter
671*fb1b10abSAndroid Build Coastguard Worker        movsxd      rax,        dword ptr arg(1) ;src_pixels_per_line            ; Pitch for Source
672*fb1b10abSAndroid Build Coastguard Worker%if ABI_IS_32BIT=0
673*fb1b10abSAndroid Build Coastguard Worker        movsxd      r8,         dword ptr arg(3) ;dst_pitch, kept in r8 on non-32-bit builds
674*fb1b10abSAndroid Build Coastguard Worker%endif
675*fb1b10abSAndroid Build Coastguard Worker
676*fb1b10abSAndroid Build Coastguard Worker        pxor        xmm0,       xmm0                        ; clear xmm0 for unpack
677*fb1b10abSAndroid Build Coastguard Worker
678*fb1b10abSAndroid Build Coastguard Worker.filter_block1d16_h6_only_sse2_rowloop:
679*fb1b10abSAndroid Build Coastguard Worker        movq        xmm3,       MMWORD PTR [rsi - 2]        ; xmm3 = src[-2 .. 5]
680*fb1b10abSAndroid Build Coastguard Worker        movq        xmm1,       MMWORD PTR [rsi + 6]        ; xmm1 = src[6 .. 13]
681*fb1b10abSAndroid Build Coastguard Worker
682*fb1b10abSAndroid Build Coastguard Worker        movq        xmm2,       MMWORD PTR [rsi +14]        ; xmm2 = src[14 .. 21]
683*fb1b10abSAndroid Build Coastguard Worker        pslldq      xmm2,       8                           ; move src[14 .. 21] into the upper 8 bytes
684*fb1b10abSAndroid Build Coastguard Worker
685*fb1b10abSAndroid Build Coastguard Worker        por         xmm2,       xmm1                        ; xmm2 = src[6 .. 21], input for outputs 8-15
686*fb1b10abSAndroid Build Coastguard Worker        prefetcht2  [rsi+rax-2]                             ; prefetch hint for the next source row
687*fb1b10abSAndroid Build Coastguard Worker
688*fb1b10abSAndroid Build Coastguard Worker        pslldq      xmm1,       8                           ; move src[6 .. 13] into the upper 8 bytes
689*fb1b10abSAndroid Build Coastguard Worker        por         xmm1,       xmm3                        ; xmm1 = src[-2 .. 13], input for outputs 0-7
690*fb1b10abSAndroid Build Coastguard Worker
691*fb1b10abSAndroid Build Coastguard Worker        movdqa      xmm4,       xmm1                        ; copies to be byte-shifted for taps 2 .. 5
692*fb1b10abSAndroid Build Coastguard Worker        movdqa      xmm5,       xmm1
693*fb1b10abSAndroid Build Coastguard Worker
694*fb1b10abSAndroid Build Coastguard Worker        movdqa      xmm6,       xmm1
695*fb1b10abSAndroid Build Coastguard Worker        movdqa      xmm7,       xmm1
696*fb1b10abSAndroid Build Coastguard Worker
697*fb1b10abSAndroid Build Coastguard Worker        punpcklbw   xmm3,       xmm0                        ; xx05 xx04 xx03 xx02 xx01 xx00 xx-1 xx-2
698*fb1b10abSAndroid Build Coastguard Worker        psrldq      xmm4,       1                           ; xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00 -1
699*fb1b10abSAndroid Build Coastguard Worker
700*fb1b10abSAndroid Build Coastguard Worker        pmullw      xmm3,       XMMWORD PTR [rdx]           ; x[-2] * H[-2]; Tap 1
701*fb1b10abSAndroid Build Coastguard Worker        punpcklbw   xmm4,       xmm0                        ; xx06 xx05 xx04 xx03 xx02 xx01 xx00 xx-1
702*fb1b10abSAndroid Build Coastguard Worker
703*fb1b10abSAndroid Build Coastguard Worker        psrldq      xmm5,       2                           ; xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00
704*fb1b10abSAndroid Build Coastguard Worker        pmullw      xmm4,       XMMWORD PTR [rdx+16]        ; x[-1] * H[-1]; Tap 2
705*fb1b10abSAndroid Build Coastguard Worker
706*fb1b10abSAndroid Build Coastguard Worker        punpcklbw   xmm5,       xmm0                        ; xx07 xx06 xx05 xx04 xx03 xx02 xx01 xx00
707*fb1b10abSAndroid Build Coastguard Worker        psrldq      xmm6,       3                           ; xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01
708*fb1b10abSAndroid Build Coastguard Worker
709*fb1b10abSAndroid Build Coastguard Worker        pmullw      xmm5,       [rdx+32]                    ; x[ 0] * H[ 0]; Tap 3
710*fb1b10abSAndroid Build Coastguard Worker
711*fb1b10abSAndroid Build Coastguard Worker        punpcklbw   xmm6,       xmm0                        ; xx08 xx07 xx06 xx05 xx04 xx03 xx02 xx01
712*fb1b10abSAndroid Build Coastguard Worker        psrldq      xmm7,       4                           ; xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02
713*fb1b10abSAndroid Build Coastguard Worker
714*fb1b10abSAndroid Build Coastguard Worker        pmullw      xmm6,       [rdx+48]                    ; x[ 1] * h[ 1] ; Tap 4
715*fb1b10abSAndroid Build Coastguard Worker
716*fb1b10abSAndroid Build Coastguard Worker        punpcklbw   xmm7,       xmm0                        ; xx09 xx08 xx07 xx06 xx05 xx04 xx03 xx02
717*fb1b10abSAndroid Build Coastguard Worker        psrldq      xmm1,       5                           ; xx xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03
718*fb1b10abSAndroid Build Coastguard Worker
719*fb1b10abSAndroid Build Coastguard Worker        pmullw      xmm7,       [rdx+64]                    ; x[ 2] * h[ 2] ; Tap 5
720*fb1b10abSAndroid Build Coastguard Worker
721*fb1b10abSAndroid Build Coastguard Worker        punpcklbw   xmm1,       xmm0                        ; xx0a xx09 xx08 xx07 xx06 xx05 xx04 xx03
722*fb1b10abSAndroid Build Coastguard Worker        pmullw      xmm1,       [rdx+80]                    ; x[ 3] * h[ 3] ; Tap 6
723*fb1b10abSAndroid Build Coastguard Worker
724*fb1b10abSAndroid Build Coastguard Worker        paddsw      xmm4,       xmm7                        ; tap 2 + tap 5 (signed saturating adds throughout)
725*fb1b10abSAndroid Build Coastguard Worker        paddsw      xmm4,       xmm5                        ; + tap 3
726*fb1b10abSAndroid Build Coastguard Worker
727*fb1b10abSAndroid Build Coastguard Worker        paddsw      xmm4,       xmm3                        ; + tap 1
728*fb1b10abSAndroid Build Coastguard Worker        paddsw      xmm4,       xmm6                        ; + tap 4
729*fb1b10abSAndroid Build Coastguard Worker
730*fb1b10abSAndroid Build Coastguard Worker        paddsw      xmm4,       xmm1                        ; + tap 6
731*fb1b10abSAndroid Build Coastguard Worker        paddsw      xmm4,       [GLOBAL(rd)]                ; + rd (defined outside this view; presumably the rounding term)
732*fb1b10abSAndroid Build Coastguard Worker
733*fb1b10abSAndroid Build Coastguard Worker        psraw       xmm4,       7                           ; arithmetic >> 7
734*fb1b10abSAndroid Build Coastguard Worker
735*fb1b10abSAndroid Build Coastguard Worker        packuswb    xmm4,       xmm0                        ; lower 8 bytes
736*fb1b10abSAndroid Build Coastguard Worker
737*fb1b10abSAndroid Build Coastguard Worker        movq        QWORD Ptr [rdi],         xmm4           ; store the results in the destination
738*fb1b10abSAndroid Build Coastguard Worker
; Second half: same tap sequence applied to xmm2 = src[6 .. 21] to produce outputs 8-15.
739*fb1b10abSAndroid Build Coastguard Worker        movdqa      xmm3,       xmm2
740*fb1b10abSAndroid Build Coastguard Worker        movdqa      xmm4,       xmm2
741*fb1b10abSAndroid Build Coastguard Worker
742*fb1b10abSAndroid Build Coastguard Worker        movdqa      xmm5,       xmm2
743*fb1b10abSAndroid Build Coastguard Worker        movdqa      xmm6,       xmm2
744*fb1b10abSAndroid Build Coastguard Worker
745*fb1b10abSAndroid Build Coastguard Worker        movdqa      xmm7,       xmm2
746*fb1b10abSAndroid Build Coastguard Worker
747*fb1b10abSAndroid Build Coastguard Worker        punpcklbw   xmm3,       xmm0                        ; xx0d xx0c xx0b xx0a xx09 xx08 xx07 xx06
748*fb1b10abSAndroid Build Coastguard Worker        psrldq      xmm4,       1                           ; xx 15 14 13 12 11 10 0f 0e 0d 0c 0b 0a 09 08 07
749*fb1b10abSAndroid Build Coastguard Worker
750*fb1b10abSAndroid Build Coastguard Worker        pmullw      xmm3,       XMMWORD PTR [rdx]           ; x[-2] * H[-2]; Tap 1
751*fb1b10abSAndroid Build Coastguard Worker        punpcklbw   xmm4,       xmm0                        ; xx0e xx0d xx0c xx0b xx0a xx09 xx08 xx07
752*fb1b10abSAndroid Build Coastguard Worker
753*fb1b10abSAndroid Build Coastguard Worker        psrldq      xmm5,       2                           ; xx xx 15 14 13 12 11 10 0f 0e 0d 0c 0b 0a 09 08
754*fb1b10abSAndroid Build Coastguard Worker        pmullw      xmm4,       XMMWORD PTR [rdx+16]        ; x[-1] * H[-1]; Tap 2
755*fb1b10abSAndroid Build Coastguard Worker
756*fb1b10abSAndroid Build Coastguard Worker        punpcklbw   xmm5,       xmm0                        ; xx0f xx0e xx0d xx0c xx0b xx0a xx09 xx08
757*fb1b10abSAndroid Build Coastguard Worker        psrldq      xmm6,       3                           ; xx xx xx 15 14 13 12 11 10 0f 0e 0d 0c 0b 0a 09
758*fb1b10abSAndroid Build Coastguard Worker
759*fb1b10abSAndroid Build Coastguard Worker        pmullw      xmm5,       [rdx+32]                    ; x[ 0] * H[ 0]; Tap 3
760*fb1b10abSAndroid Build Coastguard Worker
761*fb1b10abSAndroid Build Coastguard Worker        punpcklbw   xmm6,       xmm0                        ; xx10 xx0f xx0e xx0d xx0c xx0b xx0a xx09
762*fb1b10abSAndroid Build Coastguard Worker        psrldq      xmm7,       4                           ; xx xx xx xx 15 14 13 12 11 10 0f 0e 0d 0c 0b 0a
763*fb1b10abSAndroid Build Coastguard Worker
764*fb1b10abSAndroid Build Coastguard Worker        pmullw      xmm6,       [rdx+48]                    ; x[ 1] * h[ 1] ; Tap 4
765*fb1b10abSAndroid Build Coastguard Worker
766*fb1b10abSAndroid Build Coastguard Worker        punpcklbw   xmm7,       xmm0                        ; xx11 xx10 xx0f xx0e xx0d xx0c xx0b xx0a
767*fb1b10abSAndroid Build Coastguard Worker        psrldq      xmm2,       5                           ; xx xx xx xx xx 15 14 13 12 11 10 0f 0e 0d 0c 0b
768*fb1b10abSAndroid Build Coastguard Worker
769*fb1b10abSAndroid Build Coastguard Worker        pmullw      xmm7,       [rdx+64]                    ; x[ 2] * h[ 2] ; Tap 5
770*fb1b10abSAndroid Build Coastguard Worker
771*fb1b10abSAndroid Build Coastguard Worker        punpcklbw   xmm2,       xmm0                        ; xx12 xx11 xx10 xx0f xx0e xx0d xx0c xx0b
772*fb1b10abSAndroid Build Coastguard Worker        pmullw      xmm2,       [rdx+80]                    ; x[ 3] * h[ 3] ; Tap 6
773*fb1b10abSAndroid Build Coastguard Worker
774*fb1b10abSAndroid Build Coastguard Worker        paddsw      xmm4,       xmm7                        ; tap 2 + tap 5
775*fb1b10abSAndroid Build Coastguard Worker        paddsw      xmm4,       xmm5                        ; + tap 3
776*fb1b10abSAndroid Build Coastguard Worker
777*fb1b10abSAndroid Build Coastguard Worker        paddsw      xmm4,       xmm3                        ; + tap 1
778*fb1b10abSAndroid Build Coastguard Worker        paddsw      xmm4,       xmm6                        ; + tap 4
779*fb1b10abSAndroid Build Coastguard Worker
780*fb1b10abSAndroid Build Coastguard Worker        paddsw      xmm4,       xmm2                        ; + tap 6
781*fb1b10abSAndroid Build Coastguard Worker        paddsw      xmm4,       [GLOBAL(rd)]                ; + rd
782*fb1b10abSAndroid Build Coastguard Worker
783*fb1b10abSAndroid Build Coastguard Worker        psraw       xmm4,       7                           ; arithmetic >> 7
784*fb1b10abSAndroid Build Coastguard Worker
785*fb1b10abSAndroid Build Coastguard Worker        packuswb    xmm4,       xmm0                        ; higher 8 bytes
786*fb1b10abSAndroid Build Coastguard Worker
787*fb1b10abSAndroid Build Coastguard Worker        movq        QWORD Ptr [rdi+8],      xmm4            ; store the results in the destination
788*fb1b10abSAndroid Build Coastguard Worker
789*fb1b10abSAndroid Build Coastguard Worker        lea         rsi,        [rsi + rax]                 ; next source row
790*fb1b10abSAndroid Build Coastguard Worker%if ABI_IS_32BIT
791*fb1b10abSAndroid Build Coastguard Worker        add         rdi,        DWORD Ptr arg(3) ;dst_pitch
792*fb1b10abSAndroid Build Coastguard Worker%else
793*fb1b10abSAndroid Build Coastguard Worker        add         rdi,        r8
794*fb1b10abSAndroid Build Coastguard Worker%endif
795*fb1b10abSAndroid Build Coastguard Worker
796*fb1b10abSAndroid Build Coastguard Worker        dec         rcx
797*fb1b10abSAndroid Build Coastguard Worker        jnz         .filter_block1d16_h6_only_sse2_rowloop               ; next row
798*fb1b10abSAndroid Build Coastguard Worker
799*fb1b10abSAndroid Build Coastguard Worker    ; begin epilog
800*fb1b10abSAndroid Build Coastguard Worker    pop rdi
801*fb1b10abSAndroid Build Coastguard Worker    pop rsi
802*fb1b10abSAndroid Build Coastguard Worker    RESTORE_GOT
803*fb1b10abSAndroid Build Coastguard Worker    RESTORE_XMM
804*fb1b10abSAndroid Build Coastguard Worker    UNSHADOW_ARGS
805*fb1b10abSAndroid Build Coastguard Worker    pop         rbp
806*fb1b10abSAndroid Build Coastguard Worker    ret
807*fb1b10abSAndroid Build Coastguard Worker
808*fb1b10abSAndroid Build Coastguard Worker
;void vp8_filter_block1d8_v6_only_sse2
;(
;    unsigned char *src_ptr,
;    unsigned int    src_pixels_per_line,
;    unsigned char *output_ptr,
;    int dst_ptich,
;    unsigned int output_height,
;    const short    *vp8_filter
;)
; Second-pass filter only when xoffset==0
;
; Vertical 6-tap filter over a column 8 pixels wide. For every output row
; the routine reads 8 bytes from each of six consecutive source rows
; (src_ptr + k * src_pixels_per_line, k = 0..5), widens them to 16-bit
; words, multiplies row k by the 8-word vector at vp8_filter + 16 * k,
; sums the six products with signed saturation, adds the rounding vector
; rd, shifts right arithmetically by VP8_FILTER_SHIFT and stores the 8
; results as unsigned-saturated bytes at output_ptr. The source then
; advances by one row and the destination by dst_ptich bytes.
;
; vp8_filter is used directly as a 16-byte memory operand of pmullw, so
; it has to be 16-byte aligned. Each 16-byte vector presumably holds one
; tap replicated across all 8 lanes -- confirm against the caller.
; The row counter is tested only after the loop body has run, so
; output_height is expected to be at least 1.
;
; Register roles inside the loop:
;   rsi  = first of the six source rows for the current output row
;   rdi  = current destination row
;   rdx  = source pitch            rcx = rows left to produce
;   rax  = filter tap vectors      r8  = destination pitch (64-bit ABIs)
;   xmm0 = zero (unpack / pack)    xmm7 = rounding vector rd
globalsym(vp8_filter_block1d8_v6_only_sse2)
sym(vp8_filter_block1d8_v6_only_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

        mov         rax,        arg(5) ;vp8_filter

        mov         rdi,        arg(2) ;output_ptr
        mov         rsi,        arg(0) ;src_ptr

        movsxd      rdx,        dword ptr arg(1) ;src_pixels_per_line
        movsxd      rcx,        dword ptr arg(4) ;output_height
%if ABI_IS_32BIT=0
        movsxd      r8,         dword ptr arg(3) ; dst_ptich
%endif

        movdqa      xmm7,       XMMWORD PTR [GLOBAL(rd)]    ; rounding term, loaded once
        pxor        xmm0,       xmm0                        ; zero for byte->word unpack

.vp8_filter_block1d8_v6_only_sse2_rowloop:
        ; Fetch 8 pixels from each of the six rows feeding this output row.
        ; Address scaling only offers *1/*2/*4, so rows 3 and 5 are read
        ; after rsi has already been stepped down by one row.
        movq        xmm1,       MMWORD PTR [rsi]            ; row 0
        movq        xmm2,       MMWORD PTR [rsi + rdx]      ; row 1
        movq        xmm3,       MMWORD PTR [rsi + rdx * 2]  ; row 2
        movq        xmm5,       MMWORD PTR [rsi + rdx * 4]  ; row 4
        add         rsi,        rdx                         ; rsi -> row 1 (next iteration's row 0)
        movq        xmm4,       MMWORD PTR [rsi + rdx * 2]  ; row 3
        movq        xmm6,       MMWORD PTR [rsi + rdx * 4]  ; row 5

        ; Widen every row from 8 bytes to 8 words.
        punpcklbw   xmm1,       xmm0
        punpcklbw   xmm2,       xmm0
        punpcklbw   xmm3,       xmm0
        punpcklbw   xmm4,       xmm0
        punpcklbw   xmm5,       xmm0
        punpcklbw   xmm6,       xmm0

        ; Row k times the tap vector at vp8_filter + 16 * k.
        pmullw      xmm1,       [rax]                       ; tap 0
        pmullw      xmm2,       [rax + 16]                  ; tap 1
        pmullw      xmm3,       [rax + 32]                  ; tap 2
        pmullw      xmm4,       [rax + 48]                  ; tap 3
        pmullw      xmm5,       [rax + 64]                  ; tap 4
        pmullw      xmm6,       [rax + 80]                  ; tap 5

        ; Saturating adds are not associative, so the accumulation order
        ; (tap 1, 4, 2, 0, 3, 5, rounding) must stay exactly as written.
        paddsw      xmm2,       xmm5
        paddsw      xmm2,       xmm3
        paddsw      xmm2,       xmm1
        paddsw      xmm2,       xmm4
        paddsw      xmm2,       xmm6
        paddsw      xmm2,       xmm7

        psraw       xmm2,       VP8_FILTER_SHIFT
        packuswb    xmm2,       xmm0                        ; words -> bytes, clamped to 0..255

        movq        QWORD PTR [rdi], xmm2                   ; write 8 output pixels
%if ABI_IS_32BIT
        add         rdi,        DWORD PTR arg(3) ;[dst_ptich]
%else
        add         rdi,        r8
%endif
        dec         rcx                                     ; one row done
        jnz         .vp8_filter_block1d8_v6_only_sse2_rowloop

    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
901*fb1b10abSAndroid Build Coastguard Worker
902*fb1b10abSAndroid Build Coastguard Worker
;void vp8_unpack_block1d16_h6_sse2
;(
;    unsigned char  *src_ptr,
;    unsigned short *output_ptr,
;    unsigned int    src_pixels_per_line,
;    unsigned int    output_height,
;    unsigned int    output_width
;)
;
; Widens a block 16 pixels wide from bytes to 16-bit words without any
; filtering. Each iteration reads 16 bytes at src_ptr, zero-extends them
; and writes 16 words (32 bytes) at output_ptr, then advances the source
; by src_pixels_per_line bytes and the destination by output_width.
; output_width is added to the destination pointer as-is, i.e. it is a
; byte count. The stores use movdqa, so output_ptr and output_width have
; to keep every destination row 16-byte aligned.
; The row counter is tested only after the loop body has run, so
; output_height is expected to be at least 1.
;
; No GLOBAL() data is referenced here; the GET_GOT / RESTORE_GOT pair is
; kept so the prolog and epilog stay matched.
globalsym(vp8_unpack_block1d16_h6_sse2)
sym(vp8_unpack_block1d16_h6_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 5
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

        mov         rdi,        arg(1) ;output_ptr
        mov         rsi,        arg(0) ;src_ptr

        movsxd      rax,        dword ptr arg(2) ;src_pixels_per_line   ; source pitch
        movsxd      rcx,        dword ptr arg(3) ;output_height         ; rows to convert
%if ABI_IS_32BIT=0
        movsxd      r8,         dword ptr arg(4) ;output_width          ; destination pitch
%endif

        pxor        xmm0,       xmm0                        ; zero source for the unpacks

.unpack_block1d16_h6_sse2_nextrow:
        movq        xmm1,       MMWORD PTR [rsi]            ; source bytes 0..7
        movq        xmm3,       MMWORD PTR [rsi+8]          ; source bytes 8..15

        punpcklbw   xmm1,       xmm0                        ; bytes 0..7  -> 8 words
        punpcklbw   xmm3,       xmm0                        ; bytes 8..15 -> 8 words

        movdqa      XMMWORD PTR [rdi],         xmm1         ; words 0..7
        movdqa      XMMWORD PTR [rdi + 16],    xmm3         ; words 8..15

        add         rsi,        rax                         ; next source row
%if ABI_IS_32BIT
        add         rdi,        DWORD PTR arg(4) ;[output_width]
%else
        add         rdi,        r8
%endif
        dec         rcx
        jnz         .unpack_block1d16_h6_sse2_nextrow       ; more rows to go

    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret
958*fb1b10abSAndroid Build Coastguard Worker
959*fb1b10abSAndroid Build Coastguard Worker
SECTION_RODATA
align 16
; rd: rounding vector of eight 16-bit words, each equal to half of
; (1 << VP8_FILTER_SHIFT), i.e. 0x40. The filter routines add it to the
; tap sums right before the arithmetic shift so the result is rounded to
; nearest instead of truncated. Kept 16-byte aligned because it is read
; with movdqa and as a direct paddsw memory operand.
rd:
    times 8 dw (1 << (VP8_FILTER_SHIFT - 1))
964