xref: /aosp_15_r20/external/libvpx/vp8/common/x86/subpixel_ssse3.asm (revision fb1b10ab9aebc7c7068eedab379b749d7e3900be)
1*fb1b10abSAndroid Build Coastguard Worker;
2*fb1b10abSAndroid Build Coastguard Worker;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
3*fb1b10abSAndroid Build Coastguard Worker;
4*fb1b10abSAndroid Build Coastguard Worker;  Use of this source code is governed by a BSD-style license
5*fb1b10abSAndroid Build Coastguard Worker;  that can be found in the LICENSE file in the root of the source
6*fb1b10abSAndroid Build Coastguard Worker;  tree. An additional intellectual property rights grant can be found
7*fb1b10abSAndroid Build Coastguard Worker;  in the file PATENTS.  All contributing project authors may
8*fb1b10abSAndroid Build Coastguard Worker;  be found in the AUTHORS file in the root of the source tree.
9*fb1b10abSAndroid Build Coastguard Worker;
10*fb1b10abSAndroid Build Coastguard Worker
11*fb1b10abSAndroid Build Coastguard Worker
12*fb1b10abSAndroid Build Coastguard Worker%include "vpx_ports/x86_abi_support.asm"
13*fb1b10abSAndroid Build Coastguard Worker
; File-wide constants for the VP8 sub-pixel filters, followed by the switch to
; the code section.
;   BLOCK_HEIGHT_WIDTH - NOTE(review): presumably the edge length of the
;                        smallest block handled; confirm against its users.
;   VP8_FILTER_WEIGHT  - 128, i.e. 1 << VP8_FILTER_SHIFT.
;   VP8_FILTER_SHIFT   - 7, the same count the routines below use for the
;                        arithmetic right shift of the filtered sums.
14*fb1b10abSAndroid Build Coastguard Worker%define BLOCK_HEIGHT_WIDTH 4
15*fb1b10abSAndroid Build Coastguard Worker%define VP8_FILTER_WEIGHT 128
16*fb1b10abSAndroid Build Coastguard Worker%define VP8_FILTER_SHIFT  7
17*fb1b10abSAndroid Build Coastguard Worker
18*fb1b10abSAndroid Build Coastguard WorkerSECTION .text
19*fb1b10abSAndroid Build Coastguard Worker
20*fb1b10abSAndroid Build Coastguard Worker;/************************************************************************************
21*fb1b10abSAndroid Build Coastguard Worker; Notes: filter_block1d_h6 applies a 6 tap filter horizontally to the input pixels. The
22*fb1b10abSAndroid Build Coastguard Worker; input pixel array has output_height rows. This routine assumes that output_height is an
23*fb1b10abSAndroid Build Coastguard Worker; even number. This function handles 8 pixels in horizontal direction, calculating ONE
24*fb1b10abSAndroid Build Coastguard Worker; row each iteration to take advantage of the 128-bit operations.
25*fb1b10abSAndroid Build Coastguard Worker;
26*fb1b10abSAndroid Build Coastguard Worker; This is an implementation of some of the SSE optimizations first seen in ffvp8
27*fb1b10abSAndroid Build Coastguard Worker;
28*fb1b10abSAndroid Build Coastguard Worker;*************************************************************************************/
;void vp8_filter_block1d8_h6_ssse3
;(
;    unsigned char  *src_ptr,
;    unsigned int    src_pixels_per_line,
;    unsigned char *output_ptr,
;    unsigned int    output_pitch,
;    unsigned int    output_height,
;    unsigned int    vp8_filter_index
;)
;
; Horizontal filter producing 8 output pixels per row, one row per iteration.
;
;   src_ptr             - first source row; each row reads the 13 bytes
;                         [src - 2, src + 10]
;   src_pixels_per_line - byte step between source rows
;   output_ptr          - first destination row; 8 bytes are stored per row
;   output_pitch        - byte step between destination rows
;   output_height       - number of rows; 0 returns without touching memory
;   vp8_filter_index    - selects a 16-byte entry in the k0_k5 table and the
;                         entries 128 and 256 bytes after it (k1_k3, k2_k4)
;
; When the first dword of the selected k0_k5 entry is zero, control moves to
; vp8_filter_block1d8_h4_ssse3, which skips the k0/k5 multiply.
;
; Per row: sums are accumulated with signed saturation (paddsw), [rd] is
; added, the result is shifted right arithmetically by VP8_FILTER_SHIFT and
; packed to bytes with unsigned saturation.
;
; Register roles inside the loops:
;   rsi = source row      rax = source stride
;   rdi = destination row rdx = destination stride
;   rcx = rows remaining (always > 0 at the loop head)
;   xmm7 = [rd]           xmm5 = k2_k4          xmm6 = k1_k3
;   6-tap: xmm4 = k0_k5,       xmm3 = shuf2bfrom1
;   4-tap: xmm3 = shuf2bfrom1, xmm4 = shuf3bfrom1
globalsym(vp8_filter_block1d8_h6_ssse3)
sym(vp8_filter_block1d8_h6_ssse3):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    movsxd      rdx, DWORD PTR arg(5)           ;table index
    shl         rdx, 4                          ;16 bytes per table entry

    movdqa      xmm7, [GLOBAL(rd)]              ;added to every sum before the shift

    lea         rax, [GLOBAL(k0_k5)]
    add         rax, rdx                        ;rax = &k0_k5[index]
    mov         rdi, arg(2)                     ;output_ptr

    ; An all-zero first dword means the k0/k5 products contribute nothing,
    ; so the cheaper 4-tap path is taken.
    cmp         DWORD PTR [rax], 0
    je          vp8_filter_block1d8_h4_ssse3

    movdqa      xmm4, XMMWORD PTR [rax]         ;k0_k5
    movdqa      xmm5, XMMWORD PTR [rax+256]     ;k2_k4
    movdqa      xmm6, XMMWORD PTR [rax+128]     ;k1_k3

    ; Loop-invariant shuffle mask kept in the otherwise unused xmm3.
    movdqa      xmm3, XMMWORD PTR [GLOBAL(shuf2bfrom1)]

    mov         rsi, arg(0)                     ;src_ptr
    movsxd      rax, dword ptr arg(1)           ;src_pixels_per_line
    movsxd      rcx, dword ptr arg(4)           ;output_height
    movsxd      rdx, dword ptr arg(3)           ;output_pitch

    ; The loop is bottom-tested; without this guard a height of 0 would
    ; wrap rcx and keep iterating.
    test        rcx, rcx
    jz          .filter_block1d8_h6_done_ssse3

.filter_block1d8_h6_rowloop_ssse3:
    movq        xmm0,   MMWORD PTR [rsi - 2]    ; -2 -1  0  1  2  3  4  5
    movq        xmm2,   MMWORD PTR [rsi + 3]    ;  3  4  5  6  7  8  9 10
    punpcklbw   xmm0,   xmm2                    ; -2  3 -1  4  0  5  1  6  2  7  3  8  4  9  5 10

    movdqa      xmm1,   xmm0                    ; two more copies of the interleaved row
    movdqa      xmm2,   xmm0

    pmaddubsw   xmm0,   xmm4                    ; k0_k5 products
    pshufb      xmm1,   xmm3                    ; shuf2bfrom1
    pshufb      xmm2,   [GLOBAL(shuf3bfrom1)]
    pmaddubsw   xmm1,   xmm5                    ; k2_k4 products
    pmaddubsw   xmm2,   xmm6                    ; k1_k3 products

    ; Saturating adds are not associative: keep this grouping.
    paddsw      xmm0,   xmm1
    paddsw      xmm2,   xmm7
    paddsw      xmm0,   xmm2

    psraw       xmm0,   VP8_FILTER_SHIFT
    packuswb    xmm0,   xmm0

    movq        MMWORD PTR [rdi], xmm0          ; store 8 output pixels

    lea         rsi,    [rsi + rax]             ; next source row
    lea         rdi,    [rdi + rdx]             ; next destination row
    dec         rcx
    jnz         .filter_block1d8_h6_rowloop_ssse3

.filter_block1d8_h6_done_ssse3:
    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret

; 4-tap tail of vp8_filter_block1d8_h6_ssse3. Entered only by the branch
; above, with the prolog already executed and:
;   rax = &k0_k5[index], rdi = output_ptr, xmm7 = [rd]
vp8_filter_block1d8_h4_ssse3:
    movdqa      xmm5, XMMWORD PTR [rax+256]     ;k2_k4
    movdqa      xmm6, XMMWORD PTR [rax+128]     ;k1_k3

    movdqa      xmm3, XMMWORD PTR [GLOBAL(shuf2bfrom1)]
    movdqa      xmm4, XMMWORD PTR [GLOBAL(shuf3bfrom1)]

    mov         rsi, arg(0)                     ;src_ptr
    movsxd      rax, dword ptr arg(1)           ;src_pixels_per_line
    movsxd      rcx, dword ptr arg(4)           ;output_height
    movsxd      rdx, dword ptr arg(3)           ;output_pitch

    test        rcx, rcx                        ; nothing to do for height 0
    jz          .filter_block1d8_h4_done_ssse3

.filter_block1d8_h4_rowloop_ssse3:
    movq        xmm0,   MMWORD PTR [rsi - 2]    ; -2 -1  0  1  2  3  4  5
    movq        xmm1,   MMWORD PTR [rsi + 3]    ;  3  4  5  6  7  8  9 10
    punpcklbw   xmm0,   xmm1                    ; -2  3 -1  4  0  5  1  6  2  7  3  8  4  9  5 10

    movdqa      xmm2,   xmm0
    pshufb      xmm0,   xmm3                    ; shuf2bfrom1
    pshufb      xmm2,   xmm4                    ; shuf3bfrom1
    pmaddubsw   xmm0,   xmm5                    ; k2_k4 products
    pmaddubsw   xmm2,   xmm6                    ; k1_k3 products

    paddsw      xmm0,   xmm7
    paddsw      xmm0,   xmm2

    psraw       xmm0,   VP8_FILTER_SHIFT
    packuswb    xmm0,   xmm0

    movq        MMWORD PTR [rdi], xmm0          ; store 8 output pixels

    lea         rsi,    [rsi + rax]             ; next source row
    lea         rdi,    [rdi + rdx]             ; next destination row
    dec         rcx
    jnz         .filter_block1d8_h4_rowloop_ssse3

.filter_block1d8_h4_done_ssse3:
    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
;void vp8_filter_block1d16_h6_ssse3
;(
;    unsigned char  *src_ptr,
;    unsigned int    src_pixels_per_line,
;    unsigned char  *output_ptr,
;    unsigned int    output_pitch,
;    unsigned int    output_height,
;    unsigned int    vp8_filter_index
;)
;
; Horizontal 6-tap filter producing 16 output pixels per row, one row per
; iteration, computed as two independent 8-pixel halves.
;
;   src_ptr             - first source row; each row reads [src - 2, src + 18]
;   src_pixels_per_line - byte step between source rows
;   output_ptr          - first destination row; 16 bytes are stored per row
;                         with movdqa, so every row address must be 16-byte
;                         aligned
;   output_pitch        - byte step between destination rows
;   output_height       - number of rows; 0 returns without touching memory
;   vp8_filter_index    - selects a 16-byte entry in the k0_k5 table and the
;                         entries 128 and 256 bytes after it (k1_k3, k2_k4)
;
; Register roles inside the loop:
;   rsi = source row      rax = source stride
;   rdi = destination row rdx = destination stride
;   rcx = rows remaining (always > 0 at the loop head)
;   xmm4 = k0_k5, xmm5 = k2_k4, xmm6 = k1_k3, xmm7 = [rd]
;   xmm0 = low-half accumulator, xmm3 = high-half accumulator,
;   xmm1/xmm2 = scratch
globalsym(vp8_filter_block1d16_h6_ssse3)
sym(vp8_filter_block1d16_h6_ssse3):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    movsxd      rdx, DWORD PTR arg(5)           ;table index
    shl         rdx, 4                          ;16 bytes per table entry

    lea         rax, [GLOBAL(k0_k5)]
    add         rax, rdx                        ;rax = &k0_k5[index]

    mov         rdi, arg(2)                     ;output_ptr
    mov         rsi, arg(0)                     ;src_ptr

    movdqa      xmm4, XMMWORD PTR [rax]         ;k0_k5
    movdqa      xmm5, XMMWORD PTR [rax+256]     ;k2_k4
    movdqa      xmm6, XMMWORD PTR [rax+128]     ;k1_k3
    movdqa      xmm7, [GLOBAL(rd)]              ;added to every sum before the shift

    movsxd      rax, dword ptr arg(1)           ;src_pixels_per_line
    movsxd      rcx, dword ptr arg(4)           ;output_height
    movsxd      rdx, dword ptr arg(3)           ;output_pitch

    ; The loop is bottom-tested; without this guard a height of 0 would
    ; wrap rcx and keep iterating.
    test        rcx, rcx
    jz          .filter_block1d16_h6_done_ssse3

.filter_block1d16_h6_rowloop_ssse3:
    ; ---- output pixels 0..7 -> low qword of xmm0 ----
    movq        xmm0,   MMWORD PTR [rsi - 2]    ; -2 -1  0  1  2  3  4  5
    movq        xmm3,   MMWORD PTR [rsi + 3]    ;  3  4  5  6  7  8  9 10
    punpcklbw   xmm0,   xmm3                    ; -2  3 -1  4  0  5  1  6  2  7  3  8  4  9  5 10

    movdqa      xmm1,   xmm0
    movdqa      xmm2,   xmm0

    pmaddubsw   xmm0,   xmm4                    ; k0_k5 products
    pshufb      xmm1,   [GLOBAL(shuf2bfrom1)]
    pshufb      xmm2,   [GLOBAL(shuf3bfrom1)]
    pmaddubsw   xmm1,   xmm5                    ; k2_k4 products
    pmaddubsw   xmm2,   xmm6                    ; k1_k3 products

    ; Saturating adds are not associative: keep this order.
    paddsw      xmm0,   xmm1
    paddsw      xmm0,   xmm2
    paddsw      xmm0,   xmm7

    psraw       xmm0,   VP8_FILTER_SHIFT
    packuswb    xmm0,   xmm0

    ; ---- output pixels 8..15 -> low qword of xmm3 ----
    movq        xmm3,   MMWORD PTR [rsi +  6]   ;  6  7  8  9 10 11 12 13
    movq        xmm1,   MMWORD PTR [rsi + 11]   ; 11 12 13 14 15 16 17 18
    punpcklbw   xmm3,   xmm1

    movdqa      xmm1,   xmm3
    movdqa      xmm2,   xmm3

    pmaddubsw   xmm3,   xmm4                    ; k0_k5 products
    pshufb      xmm1,   [GLOBAL(shuf2bfrom1)]
    pshufb      xmm2,   [GLOBAL(shuf3bfrom1)]
    pmaddubsw   xmm1,   xmm5                    ; k2_k4 products
    pmaddubsw   xmm2,   xmm6                    ; k1_k3 products

    paddsw      xmm3,   xmm1
    paddsw      xmm3,   xmm2
    paddsw      xmm3,   xmm7

    psraw       xmm3,   VP8_FILTER_SHIFT
    packuswb    xmm3,   xmm3

    ; ---- combine halves and store 16 pixels ----
    punpcklqdq  xmm0,   xmm3
    movdqa      XMMWORD PTR [rdi], xmm0

    lea         rsi,    [rsi + rax]             ; next source row
    lea         rdi,    [rdi + rdx]             ; next destination row
    dec         rcx
    jnz         .filter_block1d16_h6_rowloop_ssse3

.filter_block1d16_h6_done_ssse3:
    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
278*fb1b10abSAndroid Build Coastguard Worker
;void vp8_filter_block1d4_h6_ssse3
;(
;    unsigned char  *src_ptr,
;    unsigned int    src_pixels_per_line,
;    unsigned char  *output_ptr,
;    unsigned int    output_pitch,
;    unsigned int    output_height,
;    unsigned int    vp8_filter_index
;)
;
; Horizontal filter producing 4 output pixels per row, one row per iteration.
;
;   src_ptr             - first source row; each row loads 16 bytes
;                         (unaligned) starting at src - 2
;   src_pixels_per_line - byte step between source rows
;   output_ptr          - first destination row; 4 bytes are stored per row
;   output_pitch        - byte step between destination rows
;   output_height       - number of rows; 0 returns without touching memory
;   vp8_filter_index    - selects a 16-byte entry in the k0_k5 table and the
;                         entries 128 and 256 bytes after it (k1_k3, k2_k4)
;
; When the first dword of the selected k0_k5 entry is zero, the 4-tap path
; at .vp8_filter_block1d4_h4_ssse3 is used, which skips the k0/k5 multiply.
;
; Register roles inside the loops:
;   rsi = source row      rax = source stride
;   rdi = destination row rdx = destination stride
;   rcx = rows remaining (always > 0 at the loop head)
;   xmm7 = [rd]           xmm5 = k2_k4          xmm6 = k1_k3
;   6-tap: xmm4 = k0_k5, xmm3 = shuf1b
;   4-tap: xmm0 = shuf2b, xmm3 = shuf3b
globalsym(vp8_filter_block1d4_h6_ssse3)
sym(vp8_filter_block1d4_h6_ssse3):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    movsxd      rdx, DWORD PTR arg(5)           ;table index
    shl         rdx, 4                          ;16 bytes per table entry

    lea         rax, [GLOBAL(k0_k5)]
    add         rax, rdx                        ;rax = &k0_k5[index]
    movdqa      xmm7, [GLOBAL(rd)]              ;added to every sum before the shift

    ; An all-zero first dword means the k0/k5 products contribute nothing,
    ; so the cheaper 4-tap path is taken.
    cmp         DWORD PTR [rax], 0
    je          .vp8_filter_block1d4_h4_ssse3

    movdqa      xmm4, XMMWORD PTR [rax]         ;k0_k5
    movdqa      xmm5, XMMWORD PTR [rax+256]     ;k2_k4
    movdqa      xmm6, XMMWORD PTR [rax+128]     ;k1_k3

    ; Loop-invariant shuffle mask kept in the otherwise unused xmm3.
    movdqa      xmm3, XMMWORD PTR [GLOBAL(shuf1b)]

    mov         rsi, arg(0)                     ;src_ptr
    mov         rdi, arg(2)                     ;output_ptr
    movsxd      rax, dword ptr arg(1)           ;src_pixels_per_line
    movsxd      rcx, dword ptr arg(4)           ;output_height
    movsxd      rdx, dword ptr arg(3)           ;output_pitch

    ; The loop is bottom-tested; without this guard a height of 0 would
    ; wrap rcx and keep iterating.
    test        rcx, rcx
    jz          .filter_block1d4_h6_done_ssse3

.filter_block1d4_h6_rowloop_ssse3:
    movdqu      xmm0,   XMMWORD PTR [rsi - 2]   ; 16 source bytes from src - 2

    movdqa      xmm1,   xmm0
    movdqa      xmm2,   xmm0

    pshufb      xmm0,   xmm3                    ; shuf1b
    pshufb      xmm1,   [GLOBAL(shuf2b)]
    pshufb      xmm2,   [GLOBAL(shuf3b)]

    pmaddubsw   xmm0,   xmm4                    ; k0_k5 products
    pmaddubsw   xmm1,   xmm5                    ; k2_k4 products
    pmaddubsw   xmm2,   xmm6                    ; k1_k3 products

    ; Saturating adds are not associative: keep this order.
    paddsw      xmm0,   xmm1
    paddsw      xmm0,   xmm7
    paddsw      xmm0,   xmm2

    psraw       xmm0,   VP8_FILTER_SHIFT
    packuswb    xmm0,   xmm0

    movd        DWORD PTR [rdi], xmm0           ; store 4 output pixels

    lea         rsi,    [rsi + rax]             ; next source row
    add         rdi,    rdx                     ; next destination row
    dec         rcx
    jnz         .filter_block1d4_h6_rowloop_ssse3

.filter_block1d4_h6_done_ssse3:
    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret

; 4-tap path. On entry: rax = &k0_k5[index], xmm7 = [rd].
.vp8_filter_block1d4_h4_ssse3:
    movdqa      xmm5, XMMWORD PTR [rax+256]     ;k2_k4
    movdqa      xmm6, XMMWORD PTR [rax+128]     ;k1_k3
    movdqa      xmm0, XMMWORD PTR [GLOBAL(shuf2b)]
    movdqa      xmm3, XMMWORD PTR [GLOBAL(shuf3b)]

    mov         rsi, arg(0)                     ;src_ptr
    mov         rdi, arg(2)                     ;output_ptr
    movsxd      rax, dword ptr arg(1)           ;src_pixels_per_line
    movsxd      rcx, dword ptr arg(4)           ;output_height
    movsxd      rdx, dword ptr arg(3)           ;output_pitch

    test        rcx, rcx                        ; nothing to do for height 0
    jz          .filter_block1d4_h4_done_ssse3

.filter_block1d4_h4_rowloop_ssse3:
    movdqu      xmm1,   XMMWORD PTR [rsi - 2]   ; 16 source bytes from src - 2

    movdqa      xmm2,   xmm1
    pshufb      xmm1,   xmm0                    ; shuf2b
    pshufb      xmm2,   xmm3                    ; shuf3b
    pmaddubsw   xmm1,   xmm5                    ; k2_k4 products
    pmaddubsw   xmm2,   xmm6                    ; k1_k3 products

    paddsw      xmm1,   xmm7
    paddsw      xmm1,   xmm2

    psraw       xmm1,   VP8_FILTER_SHIFT
    packuswb    xmm1,   xmm1

    movd        DWORD PTR [rdi], xmm1           ; store 4 output pixels

    lea         rsi,    [rsi + rax]             ; next source row
    add         rdi,    rdx                     ; next destination row
    dec         rcx
    jnz         .filter_block1d4_h4_rowloop_ssse3

.filter_block1d4_h4_done_ssse3:
    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
406*fb1b10abSAndroid Build Coastguard Worker
407*fb1b10abSAndroid Build Coastguard Worker
408*fb1b10abSAndroid Build Coastguard Worker
409*fb1b10abSAndroid Build Coastguard Worker;void vp8_filter_block1d16_v6_ssse3
410*fb1b10abSAndroid Build Coastguard Worker;(
411*fb1b10abSAndroid Build Coastguard Worker;    unsigned char *src_ptr,
412*fb1b10abSAndroid Build Coastguard Worker;    unsigned int   src_pitch,
413*fb1b10abSAndroid Build Coastguard Worker;    unsigned char *output_ptr,
414*fb1b10abSAndroid Build Coastguard Worker;    unsigned int   out_pitch,
415*fb1b10abSAndroid Build Coastguard Worker;    unsigned int   output_height,
416*fb1b10abSAndroid Build Coastguard Worker;    unsigned int   vp8_filter_index
417*fb1b10abSAndroid Build Coastguard Worker;)
418*fb1b10abSAndroid Build Coastguard Workerglobalsym(vp8_filter_block1d16_v6_ssse3)
419*fb1b10abSAndroid Build Coastguard Workersym(vp8_filter_block1d16_v6_ssse3):
420*fb1b10abSAndroid Build Coastguard Worker    push        rbp
421*fb1b10abSAndroid Build Coastguard Worker    mov         rbp, rsp
422*fb1b10abSAndroid Build Coastguard Worker    SHADOW_ARGS_TO_STACK 6
423*fb1b10abSAndroid Build Coastguard Worker    SAVE_XMM 7
424*fb1b10abSAndroid Build Coastguard Worker    GET_GOT     rbx
425*fb1b10abSAndroid Build Coastguard Worker    push        rsi
426*fb1b10abSAndroid Build Coastguard Worker    push        rdi
427*fb1b10abSAndroid Build Coastguard Worker    ; end prolog
428*fb1b10abSAndroid Build Coastguard Worker
429*fb1b10abSAndroid Build Coastguard Worker    movsxd      rdx, DWORD PTR arg(5)   ;table index
430*fb1b10abSAndroid Build Coastguard Worker    xor         rsi, rsi
431*fb1b10abSAndroid Build Coastguard Worker    shl         rdx, 4      ;
432*fb1b10abSAndroid Build Coastguard Worker
433*fb1b10abSAndroid Build Coastguard Worker    lea         rax, [GLOBAL(k0_k5)]
434*fb1b10abSAndroid Build Coastguard Worker    add         rax, rdx
435*fb1b10abSAndroid Build Coastguard Worker
436*fb1b10abSAndroid Build Coastguard Worker    cmp         esi, DWORD PTR [rax]
437*fb1b10abSAndroid Build Coastguard Worker    je          .vp8_filter_block1d16_v4_ssse3
438*fb1b10abSAndroid Build Coastguard Worker
439*fb1b10abSAndroid Build Coastguard Worker    movdqa      xmm5, XMMWORD PTR [rax]         ;k0_k5
440*fb1b10abSAndroid Build Coastguard Worker    movdqa      xmm6, XMMWORD PTR [rax+256]     ;k2_k4
441*fb1b10abSAndroid Build Coastguard Worker    movdqa      xmm7, XMMWORD PTR [rax+128]     ;k1_k3
442*fb1b10abSAndroid Build Coastguard Worker
443*fb1b10abSAndroid Build Coastguard Worker    mov         rsi, arg(0)             ;src_ptr
444*fb1b10abSAndroid Build Coastguard Worker    movsxd      rdx, DWORD PTR arg(1)   ;pixels_per_line
445*fb1b10abSAndroid Build Coastguard Worker    mov         rdi, arg(2)             ;output_ptr
446*fb1b10abSAndroid Build Coastguard Worker
447*fb1b10abSAndroid Build Coastguard Worker%if ABI_IS_32BIT=0
448*fb1b10abSAndroid Build Coastguard Worker    movsxd      r8, DWORD PTR arg(3)    ;out_pitch
449*fb1b10abSAndroid Build Coastguard Worker%endif
450*fb1b10abSAndroid Build Coastguard Worker    mov         rax, rsi
451*fb1b10abSAndroid Build Coastguard Worker    movsxd      rcx, DWORD PTR arg(4)   ;output_height
452*fb1b10abSAndroid Build Coastguard Worker    add         rax, rdx
453*fb1b10abSAndroid Build Coastguard Worker
454*fb1b10abSAndroid Build Coastguard Worker
455*fb1b10abSAndroid Build Coastguard Worker.vp8_filter_block1d16_v6_ssse3_loop:
456*fb1b10abSAndroid Build Coastguard Worker    movq        xmm1, MMWORD PTR [rsi]                  ;A
457*fb1b10abSAndroid Build Coastguard Worker    movq        xmm2, MMWORD PTR [rsi + rdx]            ;B
458*fb1b10abSAndroid Build Coastguard Worker    movq        xmm3, MMWORD PTR [rsi + rdx * 2]        ;C
459*fb1b10abSAndroid Build Coastguard Worker    movq        xmm4, MMWORD PTR [rax + rdx * 2]        ;D
460*fb1b10abSAndroid Build Coastguard Worker    movq        xmm0, MMWORD PTR [rsi + rdx * 4]        ;E
461*fb1b10abSAndroid Build Coastguard Worker
462*fb1b10abSAndroid Build Coastguard Worker    punpcklbw   xmm2, xmm4                  ;B D
463*fb1b10abSAndroid Build Coastguard Worker    punpcklbw   xmm3, xmm0                  ;C E
464*fb1b10abSAndroid Build Coastguard Worker
465*fb1b10abSAndroid Build Coastguard Worker    movq        xmm0, MMWORD PTR [rax + rdx * 4]        ;F
466*fb1b10abSAndroid Build Coastguard Worker
467*fb1b10abSAndroid Build Coastguard Worker    pmaddubsw   xmm3, xmm6
468*fb1b10abSAndroid Build Coastguard Worker    punpcklbw   xmm1, xmm0                  ;A F
469*fb1b10abSAndroid Build Coastguard Worker    pmaddubsw   xmm2, xmm7
470*fb1b10abSAndroid Build Coastguard Worker    pmaddubsw   xmm1, xmm5
471*fb1b10abSAndroid Build Coastguard Worker
472*fb1b10abSAndroid Build Coastguard Worker    paddsw      xmm2, xmm3
473*fb1b10abSAndroid Build Coastguard Worker    paddsw      xmm2, xmm1
474*fb1b10abSAndroid Build Coastguard Worker    paddsw      xmm2, [GLOBAL(rd)]
475*fb1b10abSAndroid Build Coastguard Worker    psraw       xmm2, 7
476*fb1b10abSAndroid Build Coastguard Worker    packuswb    xmm2, xmm2
477*fb1b10abSAndroid Build Coastguard Worker
478*fb1b10abSAndroid Build Coastguard Worker    movq        MMWORD PTR [rdi], xmm2          ;store the results
479*fb1b10abSAndroid Build Coastguard Worker
480*fb1b10abSAndroid Build Coastguard Worker    movq        xmm1, MMWORD PTR [rsi + 8]                  ;A
481*fb1b10abSAndroid Build Coastguard Worker    movq        xmm2, MMWORD PTR [rsi + rdx + 8]            ;B
482*fb1b10abSAndroid Build Coastguard Worker    movq        xmm3, MMWORD PTR [rsi + rdx * 2 + 8]        ;C
483*fb1b10abSAndroid Build Coastguard Worker    movq        xmm4, MMWORD PTR [rax + rdx * 2 + 8]        ;D
484*fb1b10abSAndroid Build Coastguard Worker    movq        xmm0, MMWORD PTR [rsi + rdx * 4 + 8]        ;E
485*fb1b10abSAndroid Build Coastguard Worker
486*fb1b10abSAndroid Build Coastguard Worker    punpcklbw   xmm2, xmm4                  ;B D
487*fb1b10abSAndroid Build Coastguard Worker    punpcklbw   xmm3, xmm0                  ;C E
488*fb1b10abSAndroid Build Coastguard Worker
489*fb1b10abSAndroid Build Coastguard Worker    movq        xmm0, MMWORD PTR [rax + rdx * 4 + 8]        ;F
490*fb1b10abSAndroid Build Coastguard Worker    pmaddubsw   xmm3, xmm6
491*fb1b10abSAndroid Build Coastguard Worker    punpcklbw   xmm1, xmm0                  ;A F
492*fb1b10abSAndroid Build Coastguard Worker    pmaddubsw   xmm2, xmm7
493*fb1b10abSAndroid Build Coastguard Worker    pmaddubsw   xmm1, xmm5
494*fb1b10abSAndroid Build Coastguard Worker
495*fb1b10abSAndroid Build Coastguard Worker    add         rsi,  rdx
496*fb1b10abSAndroid Build Coastguard Worker    add         rax,  rdx
497*fb1b10abSAndroid Build Coastguard Worker;--
498*fb1b10abSAndroid Build Coastguard Worker;--
499*fb1b10abSAndroid Build Coastguard Worker    paddsw      xmm2, xmm3
500*fb1b10abSAndroid Build Coastguard Worker    paddsw      xmm2, xmm1
501*fb1b10abSAndroid Build Coastguard Worker    paddsw      xmm2, [GLOBAL(rd)]
502*fb1b10abSAndroid Build Coastguard Worker    psraw       xmm2, 7
503*fb1b10abSAndroid Build Coastguard Worker    packuswb    xmm2, xmm2
504*fb1b10abSAndroid Build Coastguard Worker
505*fb1b10abSAndroid Build Coastguard Worker    movq        MMWORD PTR [rdi+8], xmm2
506*fb1b10abSAndroid Build Coastguard Worker
507*fb1b10abSAndroid Build Coastguard Worker%if ABI_IS_32BIT
508*fb1b10abSAndroid Build Coastguard Worker    add         rdi,        DWORD PTR arg(3) ;out_pitch
509*fb1b10abSAndroid Build Coastguard Worker%else
510*fb1b10abSAndroid Build Coastguard Worker    add         rdi,        r8
511*fb1b10abSAndroid Build Coastguard Worker%endif
512*fb1b10abSAndroid Build Coastguard Worker    dec         rcx
513*fb1b10abSAndroid Build Coastguard Worker    jnz         .vp8_filter_block1d16_v6_ssse3_loop
514*fb1b10abSAndroid Build Coastguard Worker
515*fb1b10abSAndroid Build Coastguard Worker    ; begin epilog
516*fb1b10abSAndroid Build Coastguard Worker    pop rdi
517*fb1b10abSAndroid Build Coastguard Worker    pop rsi
518*fb1b10abSAndroid Build Coastguard Worker    RESTORE_GOT
519*fb1b10abSAndroid Build Coastguard Worker    RESTORE_XMM
520*fb1b10abSAndroid Build Coastguard Worker    UNSHADOW_ARGS
521*fb1b10abSAndroid Build Coastguard Worker    pop         rbp
522*fb1b10abSAndroid Build Coastguard Worker    ret
523*fb1b10abSAndroid Build Coastguard Worker
524*fb1b10abSAndroid Build Coastguard Worker.vp8_filter_block1d16_v4_ssse3:
525*fb1b10abSAndroid Build Coastguard Worker    movdqa      xmm6, XMMWORD PTR [rax+256]     ;k2_k4
526*fb1b10abSAndroid Build Coastguard Worker    movdqa      xmm7, XMMWORD PTR [rax+128]     ;k1_k3
527*fb1b10abSAndroid Build Coastguard Worker
528*fb1b10abSAndroid Build Coastguard Worker    mov         rsi, arg(0)             ;src_ptr
529*fb1b10abSAndroid Build Coastguard Worker    movsxd      rdx, DWORD PTR arg(1)   ;pixels_per_line
530*fb1b10abSAndroid Build Coastguard Worker    mov         rdi, arg(2)             ;output_ptr
531*fb1b10abSAndroid Build Coastguard Worker
532*fb1b10abSAndroid Build Coastguard Worker%if ABI_IS_32BIT=0
533*fb1b10abSAndroid Build Coastguard Worker    movsxd      r8, DWORD PTR arg(3)    ;out_pitch
534*fb1b10abSAndroid Build Coastguard Worker%endif
535*fb1b10abSAndroid Build Coastguard Worker    mov         rax, rsi
536*fb1b10abSAndroid Build Coastguard Worker    movsxd      rcx, DWORD PTR arg(4)   ;output_height
537*fb1b10abSAndroid Build Coastguard Worker    add         rax, rdx
538*fb1b10abSAndroid Build Coastguard Worker
539*fb1b10abSAndroid Build Coastguard Worker.vp8_filter_block1d16_v4_ssse3_loop:
540*fb1b10abSAndroid Build Coastguard Worker    movq        xmm2, MMWORD PTR [rsi + rdx]            ;B
541*fb1b10abSAndroid Build Coastguard Worker    movq        xmm3, MMWORD PTR [rsi + rdx * 2]        ;C
542*fb1b10abSAndroid Build Coastguard Worker    movq        xmm4, MMWORD PTR [rax + rdx * 2]        ;D
543*fb1b10abSAndroid Build Coastguard Worker    movq        xmm0, MMWORD PTR [rsi + rdx * 4]        ;E
544*fb1b10abSAndroid Build Coastguard Worker
545*fb1b10abSAndroid Build Coastguard Worker    punpcklbw   xmm2, xmm4                  ;B D
546*fb1b10abSAndroid Build Coastguard Worker    punpcklbw   xmm3, xmm0                  ;C E
547*fb1b10abSAndroid Build Coastguard Worker
548*fb1b10abSAndroid Build Coastguard Worker    pmaddubsw   xmm3, xmm6
549*fb1b10abSAndroid Build Coastguard Worker    pmaddubsw   xmm2, xmm7
550*fb1b10abSAndroid Build Coastguard Worker    movq        xmm5, MMWORD PTR [rsi + rdx + 8]            ;B
551*fb1b10abSAndroid Build Coastguard Worker    movq        xmm1, MMWORD PTR [rsi + rdx * 2 + 8]        ;C
552*fb1b10abSAndroid Build Coastguard Worker    movq        xmm4, MMWORD PTR [rax + rdx * 2 + 8]        ;D
553*fb1b10abSAndroid Build Coastguard Worker    movq        xmm0, MMWORD PTR [rsi + rdx * 4 + 8]        ;E
554*fb1b10abSAndroid Build Coastguard Worker
555*fb1b10abSAndroid Build Coastguard Worker    paddsw      xmm2, [GLOBAL(rd)]
556*fb1b10abSAndroid Build Coastguard Worker    paddsw      xmm2, xmm3
557*fb1b10abSAndroid Build Coastguard Worker    psraw       xmm2, 7
558*fb1b10abSAndroid Build Coastguard Worker    packuswb    xmm2, xmm2
559*fb1b10abSAndroid Build Coastguard Worker
560*fb1b10abSAndroid Build Coastguard Worker    punpcklbw   xmm5, xmm4                  ;B D
561*fb1b10abSAndroid Build Coastguard Worker    punpcklbw   xmm1, xmm0                  ;C E
562*fb1b10abSAndroid Build Coastguard Worker
563*fb1b10abSAndroid Build Coastguard Worker    pmaddubsw   xmm1, xmm6
564*fb1b10abSAndroid Build Coastguard Worker    pmaddubsw   xmm5, xmm7
565*fb1b10abSAndroid Build Coastguard Worker
566*fb1b10abSAndroid Build Coastguard Worker    movdqa      xmm4, [GLOBAL(rd)]
567*fb1b10abSAndroid Build Coastguard Worker    add         rsi,  rdx
568*fb1b10abSAndroid Build Coastguard Worker    add         rax,  rdx
569*fb1b10abSAndroid Build Coastguard Worker;--
570*fb1b10abSAndroid Build Coastguard Worker;--
571*fb1b10abSAndroid Build Coastguard Worker    paddsw      xmm5, xmm1
572*fb1b10abSAndroid Build Coastguard Worker    paddsw      xmm5, xmm4
573*fb1b10abSAndroid Build Coastguard Worker    psraw       xmm5, 7
574*fb1b10abSAndroid Build Coastguard Worker    packuswb    xmm5, xmm5
575*fb1b10abSAndroid Build Coastguard Worker
576*fb1b10abSAndroid Build Coastguard Worker    punpcklqdq  xmm2, xmm5
577*fb1b10abSAndroid Build Coastguard Worker
578*fb1b10abSAndroid Build Coastguard Worker    movdqa       XMMWORD PTR [rdi], xmm2
579*fb1b10abSAndroid Build Coastguard Worker
580*fb1b10abSAndroid Build Coastguard Worker%if ABI_IS_32BIT
581*fb1b10abSAndroid Build Coastguard Worker    add         rdi,        DWORD PTR arg(3) ;out_pitch
582*fb1b10abSAndroid Build Coastguard Worker%else
583*fb1b10abSAndroid Build Coastguard Worker    add         rdi,        r8
584*fb1b10abSAndroid Build Coastguard Worker%endif
585*fb1b10abSAndroid Build Coastguard Worker    dec         rcx
586*fb1b10abSAndroid Build Coastguard Worker    jnz         .vp8_filter_block1d16_v4_ssse3_loop
587*fb1b10abSAndroid Build Coastguard Worker
588*fb1b10abSAndroid Build Coastguard Worker    ; begin epilog
589*fb1b10abSAndroid Build Coastguard Worker    pop rdi
590*fb1b10abSAndroid Build Coastguard Worker    pop rsi
591*fb1b10abSAndroid Build Coastguard Worker    RESTORE_GOT
592*fb1b10abSAndroid Build Coastguard Worker    RESTORE_XMM
593*fb1b10abSAndroid Build Coastguard Worker    UNSHADOW_ARGS
594*fb1b10abSAndroid Build Coastguard Worker    pop         rbp
595*fb1b10abSAndroid Build Coastguard Worker    ret
596*fb1b10abSAndroid Build Coastguard Worker
597*fb1b10abSAndroid Build Coastguard Worker;void vp8_filter_block1d8_v6_ssse3
598*fb1b10abSAndroid Build Coastguard Worker;(
599*fb1b10abSAndroid Build Coastguard Worker;    unsigned char *src_ptr,
600*fb1b10abSAndroid Build Coastguard Worker;    unsigned int   src_pitch,
601*fb1b10abSAndroid Build Coastguard Worker;    unsigned char *output_ptr,
602*fb1b10abSAndroid Build Coastguard Worker;    unsigned int   out_pitch,
603*fb1b10abSAndroid Build Coastguard Worker;    unsigned int   output_height,
604*fb1b10abSAndroid Build Coastguard Worker;    unsigned int   vp8_filter_index
605*fb1b10abSAndroid Build Coastguard Worker;)
;
; Vertical filter over a column 8 pixels wide. Each loop iteration reads
; 8 bytes from consecutive source rows (src_pitch bytes apart), multiplies
; them by the coefficient pairs selected by vp8_filter_index, adds the rd
; constant, shifts right by 7, saturates to unsigned bytes and stores
; 8 bytes at output_ptr.
;
;   src_ptr          - address of the first row read (row A); rows A..F are
;                      src_ptr + 0..5 * src_pitch
;   src_pitch        - byte distance between source rows
;   output_ptr       - destination; 8 bytes are written per output row
;   out_pitch        - byte distance between destination rows
;   output_height    - number of rows to produce; the counter is tested
;                      after the loop body, so it must be non-zero
;   vp8_filter_index - selects a 16-byte row (index << 4) of k0_k5; the
;                      k1_k3 and k2_k4 coefficients are read 128 and 256
;                      bytes past that row
;
; When the first dword of the selected k0_k5 row is zero, the 4-tap path
; is taken: it reads rows B..E only and omits the A/F term.
;
; Register roles inside the loops:
;   rsi = row A pointer, rax = rsi + src_pitch, rdx = src_pitch,
;   rdi = destination, rcx = rows remaining,
;   r8  = out_pitch (64-bit builds only; 32-bit builds add arg(3) directly)
;
; NOTE(review): the pitches, height and index are loaded with movsxd
; although the prototype declares them unsigned; presumably the values are
; always below 2^31 - confirm against callers.
606*fb1b10abSAndroid Build Coastguard Workerglobalsym(vp8_filter_block1d8_v6_ssse3)
607*fb1b10abSAndroid Build Coastguard Workersym(vp8_filter_block1d8_v6_ssse3):
608*fb1b10abSAndroid Build Coastguard Worker    push        rbp
609*fb1b10abSAndroid Build Coastguard Worker    mov         rbp, rsp
610*fb1b10abSAndroid Build Coastguard Worker    SHADOW_ARGS_TO_STACK 6
611*fb1b10abSAndroid Build Coastguard Worker    SAVE_XMM 7
612*fb1b10abSAndroid Build Coastguard Worker    GET_GOT     rbx
613*fb1b10abSAndroid Build Coastguard Worker    push        rsi
614*fb1b10abSAndroid Build Coastguard Worker    push        rdi
615*fb1b10abSAndroid Build Coastguard Worker    ; end prolog
616*fb1b10abSAndroid Build Coastguard Worker
617*fb1b10abSAndroid Build Coastguard Worker    movsxd      rdx, DWORD PTR arg(5)   ;table index
    ; rsi is zeroed here only so that esi can serve as the zero operand of
    ; the 6-tap/4-tap test below; it is loaded with src_ptr afterwards.
618*fb1b10abSAndroid Build Coastguard Worker    xor         rsi, rsi
619*fb1b10abSAndroid Build Coastguard Worker    shl         rdx, 4      ; index * 16 = byte offset of the filter row
620*fb1b10abSAndroid Build Coastguard Worker
621*fb1b10abSAndroid Build Coastguard Worker    lea         rax, [GLOBAL(k0_k5)]
622*fb1b10abSAndroid Build Coastguard Worker    add         rax, rdx
623*fb1b10abSAndroid Build Coastguard Worker
    ; rdx is reused from here on as the source pitch.
624*fb1b10abSAndroid Build Coastguard Worker    movsxd      rdx, DWORD PTR arg(1)   ;pixels_per_line
625*fb1b10abSAndroid Build Coastguard Worker    mov         rdi, arg(2)             ;output_ptr
626*fb1b10abSAndroid Build Coastguard Worker%if ABI_IS_32BIT=0
627*fb1b10abSAndroid Build Coastguard Worker    movsxd      r8, DWORD PTR arg(3)    ; out_pitch
628*fb1b10abSAndroid Build Coastguard Worker%endif
629*fb1b10abSAndroid Build Coastguard Worker    movsxd      rcx, DWORD PTR arg(4)   ;[output_height]
630*fb1b10abSAndroid Build Coastguard Worker
    ; First four bytes of the k0_k5 row all zero -> take the 4-tap path.
631*fb1b10abSAndroid Build Coastguard Worker    cmp         esi, DWORD PTR [rax]
632*fb1b10abSAndroid Build Coastguard Worker    je          .vp8_filter_block1d8_v4_ssse3
633*fb1b10abSAndroid Build Coastguard Worker
634*fb1b10abSAndroid Build Coastguard Worker    movdqa      xmm5, XMMWORD PTR [rax]         ;k0_k5
635*fb1b10abSAndroid Build Coastguard Worker    movdqa      xmm6, XMMWORD PTR [rax+256]     ;k2_k4
636*fb1b10abSAndroid Build Coastguard Worker    movdqa      xmm7, XMMWORD PTR [rax+128]     ;k1_k3
637*fb1b10abSAndroid Build Coastguard Worker
638*fb1b10abSAndroid Build Coastguard Worker    mov         rsi, arg(0)             ;src_ptr
639*fb1b10abSAndroid Build Coastguard Worker
    ; rax = src_ptr + pitch, so [rax + rdx*2] is row D and [rax + rdx*4]
    ; is row F without needing a *3 or *5 scale.
640*fb1b10abSAndroid Build Coastguard Worker    mov         rax, rsi
641*fb1b10abSAndroid Build Coastguard Worker    add         rax, rdx
642*fb1b10abSAndroid Build Coastguard Worker
643*fb1b10abSAndroid Build Coastguard Worker.vp8_filter_block1d8_v6_ssse3_loop:
644*fb1b10abSAndroid Build Coastguard Worker    movq        xmm1, MMWORD PTR [rsi]                  ;A
645*fb1b10abSAndroid Build Coastguard Worker    movq        xmm2, MMWORD PTR [rsi + rdx]            ;B
646*fb1b10abSAndroid Build Coastguard Worker    movq        xmm3, MMWORD PTR [rsi + rdx * 2]        ;C
647*fb1b10abSAndroid Build Coastguard Worker    movq        xmm4, MMWORD PTR [rax + rdx * 2]        ;D
648*fb1b10abSAndroid Build Coastguard Worker    movq        xmm0, MMWORD PTR [rsi + rdx * 4]        ;E
649*fb1b10abSAndroid Build Coastguard Worker
    ; Interleave two rows byte-wise so that pmaddubsw multiplies each
    ; (unsigned) pixel pair by a (signed) coefficient pair and sums the two
    ; products into one 16-bit lane.
650*fb1b10abSAndroid Build Coastguard Worker    punpcklbw   xmm2, xmm4                  ;B D
651*fb1b10abSAndroid Build Coastguard Worker    punpcklbw   xmm3, xmm0                  ;C E
652*fb1b10abSAndroid Build Coastguard Worker
653*fb1b10abSAndroid Build Coastguard Worker    movq        xmm0, MMWORD PTR [rax + rdx * 4]        ;F
    ; xmm4 held row D above (already merged into xmm2); it is reused for rd.
654*fb1b10abSAndroid Build Coastguard Worker    movdqa      xmm4, [GLOBAL(rd)]
655*fb1b10abSAndroid Build Coastguard Worker
656*fb1b10abSAndroid Build Coastguard Worker    pmaddubsw   xmm3, xmm6
657*fb1b10abSAndroid Build Coastguard Worker    punpcklbw   xmm1, xmm0                  ;A F
658*fb1b10abSAndroid Build Coastguard Worker    pmaddubsw   xmm2, xmm7
659*fb1b10abSAndroid Build Coastguard Worker    pmaddubsw   xmm1, xmm5
    ; Step both row pointers down one source row for the next iteration.
660*fb1b10abSAndroid Build Coastguard Worker    add         rsi,  rdx
661*fb1b10abSAndroid Build Coastguard Worker    add         rax,  rdx
662*fb1b10abSAndroid Build Coastguard Worker;--
663*fb1b10abSAndroid Build Coastguard Worker;--
    ; Saturating sum of the three partial products plus rd (presumably the
    ; rounding term for the 7-bit shift - defined outside this block), then
    ; arithmetic shift and pack to unsigned bytes with saturation.
664*fb1b10abSAndroid Build Coastguard Worker    paddsw      xmm2, xmm3
665*fb1b10abSAndroid Build Coastguard Worker    paddsw      xmm2, xmm1
666*fb1b10abSAndroid Build Coastguard Worker    paddsw      xmm2, xmm4
667*fb1b10abSAndroid Build Coastguard Worker    psraw       xmm2, 7
668*fb1b10abSAndroid Build Coastguard Worker    packuswb    xmm2, xmm2
669*fb1b10abSAndroid Build Coastguard Worker
    ; Store the 8 result pixels of this row.
670*fb1b10abSAndroid Build Coastguard Worker    movq        MMWORD PTR [rdi], xmm2
671*fb1b10abSAndroid Build Coastguard Worker
672*fb1b10abSAndroid Build Coastguard Worker%if ABI_IS_32BIT
673*fb1b10abSAndroid Build Coastguard Worker    add         rdi,        DWORD PTR arg(3) ;[out_pitch]
674*fb1b10abSAndroid Build Coastguard Worker%else
675*fb1b10abSAndroid Build Coastguard Worker    add         rdi,        r8
676*fb1b10abSAndroid Build Coastguard Worker%endif
677*fb1b10abSAndroid Build Coastguard Worker    dec         rcx
678*fb1b10abSAndroid Build Coastguard Worker    jnz         .vp8_filter_block1d8_v6_ssse3_loop
679*fb1b10abSAndroid Build Coastguard Worker
680*fb1b10abSAndroid Build Coastguard Worker    ; begin epilog
681*fb1b10abSAndroid Build Coastguard Worker    pop rdi
682*fb1b10abSAndroid Build Coastguard Worker    pop rsi
683*fb1b10abSAndroid Build Coastguard Worker    RESTORE_GOT
684*fb1b10abSAndroid Build Coastguard Worker    RESTORE_XMM
685*fb1b10abSAndroid Build Coastguard Worker    UNSHADOW_ARGS
686*fb1b10abSAndroid Build Coastguard Worker    pop         rbp
687*fb1b10abSAndroid Build Coastguard Worker    ret
688*fb1b10abSAndroid Build Coastguard Worker
    ; 4-tap path: entered with rax = selected filter row, rdx = src pitch,
    ; rdi = output_ptr, rcx = output_height (and r8 = out_pitch on 64-bit),
    ; all set up before the branch above.
689*fb1b10abSAndroid Build Coastguard Worker.vp8_filter_block1d8_v4_ssse3:
690*fb1b10abSAndroid Build Coastguard Worker    movdqa      xmm6, XMMWORD PTR [rax+256]     ;k2_k4
691*fb1b10abSAndroid Build Coastguard Worker    movdqa      xmm7, XMMWORD PTR [rax+128]     ;k1_k3
    ; xmm5 is free on this path (no k0_k5 term), so rd is loaded once here
    ; instead of on every iteration.
692*fb1b10abSAndroid Build Coastguard Worker    movdqa      xmm5, [GLOBAL(rd)]
693*fb1b10abSAndroid Build Coastguard Worker
694*fb1b10abSAndroid Build Coastguard Worker    mov         rsi, arg(0)             ;src_ptr
695*fb1b10abSAndroid Build Coastguard Worker
696*fb1b10abSAndroid Build Coastguard Worker    mov         rax, rsi
697*fb1b10abSAndroid Build Coastguard Worker    add         rax, rdx
698*fb1b10abSAndroid Build Coastguard Worker
    ; Same row addressing as the 6-tap loop, but rows A ([rsi]) and
    ; F ([rax + rdx*4]) are not read.
699*fb1b10abSAndroid Build Coastguard Worker.vp8_filter_block1d8_v4_ssse3_loop:
700*fb1b10abSAndroid Build Coastguard Worker    movq        xmm2, MMWORD PTR [rsi + rdx]            ;B
701*fb1b10abSAndroid Build Coastguard Worker    movq        xmm3, MMWORD PTR [rsi + rdx * 2]        ;C
702*fb1b10abSAndroid Build Coastguard Worker    movq        xmm4, MMWORD PTR [rax + rdx * 2]        ;D
703*fb1b10abSAndroid Build Coastguard Worker    movq        xmm0, MMWORD PTR [rsi + rdx * 4]        ;E
704*fb1b10abSAndroid Build Coastguard Worker
705*fb1b10abSAndroid Build Coastguard Worker    punpcklbw   xmm2, xmm4                  ;B D
706*fb1b10abSAndroid Build Coastguard Worker    punpcklbw   xmm3, xmm0                  ;C E
707*fb1b10abSAndroid Build Coastguard Worker
708*fb1b10abSAndroid Build Coastguard Worker    pmaddubsw   xmm3, xmm6
709*fb1b10abSAndroid Build Coastguard Worker    pmaddubsw   xmm2, xmm7
710*fb1b10abSAndroid Build Coastguard Worker    add         rsi,  rdx
711*fb1b10abSAndroid Build Coastguard Worker    add         rax,  rdx
712*fb1b10abSAndroid Build Coastguard Worker;--
713*fb1b10abSAndroid Build Coastguard Worker;--
714*fb1b10abSAndroid Build Coastguard Worker    paddsw      xmm2, xmm3
715*fb1b10abSAndroid Build Coastguard Worker    paddsw      xmm2, xmm5
716*fb1b10abSAndroid Build Coastguard Worker    psraw       xmm2, 7
717*fb1b10abSAndroid Build Coastguard Worker    packuswb    xmm2, xmm2
718*fb1b10abSAndroid Build Coastguard Worker
719*fb1b10abSAndroid Build Coastguard Worker    movq        MMWORD PTR [rdi], xmm2
720*fb1b10abSAndroid Build Coastguard Worker
721*fb1b10abSAndroid Build Coastguard Worker%if ABI_IS_32BIT
722*fb1b10abSAndroid Build Coastguard Worker    add         rdi,        DWORD PTR arg(3) ;[out_pitch]
723*fb1b10abSAndroid Build Coastguard Worker%else
724*fb1b10abSAndroid Build Coastguard Worker    add         rdi,        r8
725*fb1b10abSAndroid Build Coastguard Worker%endif
726*fb1b10abSAndroid Build Coastguard Worker    dec         rcx
727*fb1b10abSAndroid Build Coastguard Worker    jnz         .vp8_filter_block1d8_v4_ssse3_loop
728*fb1b10abSAndroid Build Coastguard Worker
729*fb1b10abSAndroid Build Coastguard Worker    ; begin epilog
730*fb1b10abSAndroid Build Coastguard Worker    pop rdi
731*fb1b10abSAndroid Build Coastguard Worker    pop rsi
732*fb1b10abSAndroid Build Coastguard Worker    RESTORE_GOT
733*fb1b10abSAndroid Build Coastguard Worker    RESTORE_XMM
734*fb1b10abSAndroid Build Coastguard Worker    UNSHADOW_ARGS
735*fb1b10abSAndroid Build Coastguard Worker    pop         rbp
736*fb1b10abSAndroid Build Coastguard Worker    ret
737*fb1b10abSAndroid Build Coastguard Worker;void vp8_filter_block1d4_v6_ssse3
738*fb1b10abSAndroid Build Coastguard Worker;(
739*fb1b10abSAndroid Build Coastguard Worker;    unsigned char *src_ptr,
740*fb1b10abSAndroid Build Coastguard Worker;    unsigned int   src_pitch,
741*fb1b10abSAndroid Build Coastguard Worker;    unsigned char *output_ptr,
742*fb1b10abSAndroid Build Coastguard Worker;    unsigned int   out_pitch,
743*fb1b10abSAndroid Build Coastguard Worker;    unsigned int   output_height,
744*fb1b10abSAndroid Build Coastguard Worker;    unsigned int   vp8_filter_index
745*fb1b10abSAndroid Build Coastguard Worker;)
;
; Vertical filter over a column 4 pixels wide, using the 64-bit MMX
; registers mm0-mm7. Each loop iteration reads 4 bytes from consecutive
; source rows (src_pitch bytes apart), multiplies them by the coefficient
; pairs selected by vp8_filter_index, adds the rd constant, shifts right
; by 7, saturates to unsigned bytes and stores 4 bytes at output_ptr.
;
;   src_ptr          - address of the first row read (row A); rows A..F are
;                      src_ptr + 0..5 * src_pitch
;   src_pitch        - byte distance between source rows
;   output_ptr       - destination; 4 bytes are written per output row
;   out_pitch        - byte distance between destination rows
;   output_height    - number of rows to produce; the counter is tested
;                      after the loop body, so it must be non-zero
;   vp8_filter_index - selects a 16-byte row (index << 4) of k0_k5; the
;                      k1_k3 and k2_k4 coefficients are read 128 and 256
;                      bytes past that row (only the low 8 bytes of each
;                      are loaded here)
;
; When the first dword of the selected k0_k5 row is zero, the 4-tap path
; is taken: it reads rows B..E only and omits the A/F term.
;
; Register roles inside the loops:
;   rsi = row A pointer, rax = rsi + src_pitch, rdx = src_pitch,
;   rdi = destination, rcx = rows remaining,
;   r8  = out_pitch (64-bit builds only; 32-bit builds add arg(3) directly)
;
; NOTE(review): no emms is executed before either ret in this function;
; presumably callers are expected to clear the MMX state before any x87
; floating-point use - confirm against the calling code.
746*fb1b10abSAndroid Build Coastguard Workerglobalsym(vp8_filter_block1d4_v6_ssse3)
747*fb1b10abSAndroid Build Coastguard Workersym(vp8_filter_block1d4_v6_ssse3):
748*fb1b10abSAndroid Build Coastguard Worker    push        rbp
749*fb1b10abSAndroid Build Coastguard Worker    mov         rbp, rsp
750*fb1b10abSAndroid Build Coastguard Worker    SHADOW_ARGS_TO_STACK 6
751*fb1b10abSAndroid Build Coastguard Worker    GET_GOT     rbx
752*fb1b10abSAndroid Build Coastguard Worker    push        rsi
753*fb1b10abSAndroid Build Coastguard Worker    push        rdi
754*fb1b10abSAndroid Build Coastguard Worker    ; end prolog
755*fb1b10abSAndroid Build Coastguard Worker
756*fb1b10abSAndroid Build Coastguard Worker    movsxd      rdx, DWORD PTR arg(5)   ;table index
    ; rsi is zeroed here only so that esi can serve as the zero operand of
    ; the 6-tap/4-tap test below; it is loaded with src_ptr afterwards.
757*fb1b10abSAndroid Build Coastguard Worker    xor         rsi, rsi
758*fb1b10abSAndroid Build Coastguard Worker    shl         rdx, 4      ; index * 16 = byte offset of the filter row
759*fb1b10abSAndroid Build Coastguard Worker
760*fb1b10abSAndroid Build Coastguard Worker    lea         rax, [GLOBAL(k0_k5)]
761*fb1b10abSAndroid Build Coastguard Worker    add         rax, rdx
762*fb1b10abSAndroid Build Coastguard Worker
    ; rdx is reused from here on as the source pitch.
763*fb1b10abSAndroid Build Coastguard Worker    movsxd      rdx, DWORD PTR arg(1)   ;pixels_per_line
764*fb1b10abSAndroid Build Coastguard Worker    mov         rdi, arg(2)             ;output_ptr
765*fb1b10abSAndroid Build Coastguard Worker%if ABI_IS_32BIT=0
766*fb1b10abSAndroid Build Coastguard Worker    movsxd      r8, DWORD PTR arg(3)    ; out_pitch
767*fb1b10abSAndroid Build Coastguard Worker%endif
768*fb1b10abSAndroid Build Coastguard Worker    movsxd      rcx, DWORD PTR arg(4)   ;[output_height]
769*fb1b10abSAndroid Build Coastguard Worker
    ; First four bytes of the k0_k5 row all zero -> take the 4-tap path.
770*fb1b10abSAndroid Build Coastguard Worker    cmp         esi, DWORD PTR [rax]
771*fb1b10abSAndroid Build Coastguard Worker    je          .vp8_filter_block1d4_v4_ssse3
772*fb1b10abSAndroid Build Coastguard Worker
773*fb1b10abSAndroid Build Coastguard Worker    movq        mm5, MMWORD PTR [rax]         ;k0_k5
774*fb1b10abSAndroid Build Coastguard Worker    movq        mm6, MMWORD PTR [rax+256]     ;k2_k4
775*fb1b10abSAndroid Build Coastguard Worker    movq        mm7, MMWORD PTR [rax+128]     ;k1_k3
776*fb1b10abSAndroid Build Coastguard Worker
777*fb1b10abSAndroid Build Coastguard Worker    mov         rsi, arg(0)             ;src_ptr
778*fb1b10abSAndroid Build Coastguard Worker
    ; rax = src_ptr + pitch, so [rax + rdx*2] is row D and [rax + rdx*4]
    ; is row F without needing a *3 or *5 scale.
779*fb1b10abSAndroid Build Coastguard Worker    mov         rax, rsi
780*fb1b10abSAndroid Build Coastguard Worker    add         rax, rdx
781*fb1b10abSAndroid Build Coastguard Worker
782*fb1b10abSAndroid Build Coastguard Worker.vp8_filter_block1d4_v6_ssse3_loop:
783*fb1b10abSAndroid Build Coastguard Worker    movd        mm1, DWORD PTR [rsi]                  ;A
784*fb1b10abSAndroid Build Coastguard Worker    movd        mm2, DWORD PTR [rsi + rdx]            ;B
785*fb1b10abSAndroid Build Coastguard Worker    movd        mm3, DWORD PTR [rsi + rdx * 2]        ;C
786*fb1b10abSAndroid Build Coastguard Worker    movd        mm4, DWORD PTR [rax + rdx * 2]        ;D
787*fb1b10abSAndroid Build Coastguard Worker    movd        mm0, DWORD PTR [rsi + rdx * 4]        ;E
788*fb1b10abSAndroid Build Coastguard Worker
    ; Interleave two rows byte-wise so that pmaddubsw multiplies each
    ; (unsigned) pixel pair by a (signed) coefficient pair and sums the two
    ; products into one 16-bit lane.
789*fb1b10abSAndroid Build Coastguard Worker    punpcklbw   mm2, mm4                  ;B D
790*fb1b10abSAndroid Build Coastguard Worker    punpcklbw   mm3, mm0                  ;C E
791*fb1b10abSAndroid Build Coastguard Worker
792*fb1b10abSAndroid Build Coastguard Worker    movd        mm0, DWORD PTR [rax + rdx * 4]        ;F
793*fb1b10abSAndroid Build Coastguard Worker
    ; mm4 held row D above (already merged into mm2); it is reused for the
    ; low 8 bytes of rd. All of mm0-mm7 are occupied on this path.
794*fb1b10abSAndroid Build Coastguard Worker    movq        mm4, [GLOBAL(rd)]
795*fb1b10abSAndroid Build Coastguard Worker
796*fb1b10abSAndroid Build Coastguard Worker    pmaddubsw   mm3, mm6
797*fb1b10abSAndroid Build Coastguard Worker    punpcklbw   mm1, mm0                  ;A F
798*fb1b10abSAndroid Build Coastguard Worker    pmaddubsw   mm2, mm7
799*fb1b10abSAndroid Build Coastguard Worker    pmaddubsw   mm1, mm5
    ; Step both row pointers down one source row for the next iteration.
800*fb1b10abSAndroid Build Coastguard Worker    add         rsi,  rdx
801*fb1b10abSAndroid Build Coastguard Worker    add         rax,  rdx
802*fb1b10abSAndroid Build Coastguard Worker;--
803*fb1b10abSAndroid Build Coastguard Worker;--
    ; Saturating sum of the three partial products plus rd (presumably the
    ; rounding term for the 7-bit shift - defined outside this block), then
    ; arithmetic shift and pack to unsigned bytes with saturation.
804*fb1b10abSAndroid Build Coastguard Worker    paddsw      mm2, mm3
805*fb1b10abSAndroid Build Coastguard Worker    paddsw      mm2, mm1
806*fb1b10abSAndroid Build Coastguard Worker    paddsw      mm2, mm4
807*fb1b10abSAndroid Build Coastguard Worker    psraw       mm2, 7
808*fb1b10abSAndroid Build Coastguard Worker    packuswb    mm2, mm2
809*fb1b10abSAndroid Build Coastguard Worker
    ; Store the 4 result pixels of this row.
810*fb1b10abSAndroid Build Coastguard Worker    movd        DWORD PTR [rdi], mm2
811*fb1b10abSAndroid Build Coastguard Worker
812*fb1b10abSAndroid Build Coastguard Worker%if ABI_IS_32BIT
813*fb1b10abSAndroid Build Coastguard Worker    add         rdi,        DWORD PTR arg(3) ;[out_pitch]
814*fb1b10abSAndroid Build Coastguard Worker%else
815*fb1b10abSAndroid Build Coastguard Worker    add         rdi,        r8
816*fb1b10abSAndroid Build Coastguard Worker%endif
817*fb1b10abSAndroid Build Coastguard Worker    dec         rcx
818*fb1b10abSAndroid Build Coastguard Worker    jnz         .vp8_filter_block1d4_v6_ssse3_loop
819*fb1b10abSAndroid Build Coastguard Worker
820*fb1b10abSAndroid Build Coastguard Worker    ; begin epilog
821*fb1b10abSAndroid Build Coastguard Worker    pop rdi
822*fb1b10abSAndroid Build Coastguard Worker    pop rsi
823*fb1b10abSAndroid Build Coastguard Worker    RESTORE_GOT
824*fb1b10abSAndroid Build Coastguard Worker    UNSHADOW_ARGS
825*fb1b10abSAndroid Build Coastguard Worker    pop         rbp
826*fb1b10abSAndroid Build Coastguard Worker    ret
827*fb1b10abSAndroid Build Coastguard Worker
    ; 4-tap path: entered with rax = selected filter row, rdx = src pitch,
    ; rdi = output_ptr, rcx = output_height (and r8 = out_pitch on 64-bit),
    ; all set up before the branch above.
828*fb1b10abSAndroid Build Coastguard Worker.vp8_filter_block1d4_v4_ssse3:
829*fb1b10abSAndroid Build Coastguard Worker    movq        mm6, MMWORD PTR [rax+256]     ;k2_k4
830*fb1b10abSAndroid Build Coastguard Worker    movq        mm7, MMWORD PTR [rax+128]     ;k1_k3
    ; mm5 is free on this path (no k0_k5 term), so rd is loaded once here
    ; instead of on every iteration.
831*fb1b10abSAndroid Build Coastguard Worker    movq        mm5, MMWORD PTR [GLOBAL(rd)]
832*fb1b10abSAndroid Build Coastguard Worker
833*fb1b10abSAndroid Build Coastguard Worker    mov         rsi, arg(0)             ;src_ptr
834*fb1b10abSAndroid Build Coastguard Worker
835*fb1b10abSAndroid Build Coastguard Worker    mov         rax, rsi
836*fb1b10abSAndroid Build Coastguard Worker    add         rax, rdx
837*fb1b10abSAndroid Build Coastguard Worker
    ; Same row addressing as the 6-tap loop, but rows A ([rsi]) and
    ; F ([rax + rdx*4]) are not read.
838*fb1b10abSAndroid Build Coastguard Worker.vp8_filter_block1d4_v4_ssse3_loop:
839*fb1b10abSAndroid Build Coastguard Worker    movd        mm2, DWORD PTR [rsi + rdx]            ;B
840*fb1b10abSAndroid Build Coastguard Worker    movd        mm3, DWORD PTR [rsi + rdx * 2]        ;C
841*fb1b10abSAndroid Build Coastguard Worker    movd        mm4, DWORD PTR [rax + rdx * 2]        ;D
842*fb1b10abSAndroid Build Coastguard Worker    movd        mm0, DWORD PTR [rsi + rdx * 4]        ;E
843*fb1b10abSAndroid Build Coastguard Worker
844*fb1b10abSAndroid Build Coastguard Worker    punpcklbw   mm2, mm4                  ;B D
845*fb1b10abSAndroid Build Coastguard Worker    punpcklbw   mm3, mm0                  ;C E
846*fb1b10abSAndroid Build Coastguard Worker
847*fb1b10abSAndroid Build Coastguard Worker    pmaddubsw   mm3, mm6
848*fb1b10abSAndroid Build Coastguard Worker    pmaddubsw   mm2, mm7
849*fb1b10abSAndroid Build Coastguard Worker    add         rsi,  rdx
850*fb1b10abSAndroid Build Coastguard Worker    add         rax,  rdx
851*fb1b10abSAndroid Build Coastguard Worker;--
852*fb1b10abSAndroid Build Coastguard Worker;--
853*fb1b10abSAndroid Build Coastguard Worker    paddsw      mm2, mm3
854*fb1b10abSAndroid Build Coastguard Worker    paddsw      mm2, mm5
855*fb1b10abSAndroid Build Coastguard Worker    psraw       mm2, 7
856*fb1b10abSAndroid Build Coastguard Worker    packuswb    mm2, mm2
857*fb1b10abSAndroid Build Coastguard Worker
858*fb1b10abSAndroid Build Coastguard Worker    movd        DWORD PTR [rdi], mm2
859*fb1b10abSAndroid Build Coastguard Worker
860*fb1b10abSAndroid Build Coastguard Worker%if ABI_IS_32BIT
861*fb1b10abSAndroid Build Coastguard Worker    add         rdi,        DWORD PTR arg(3) ;[out_pitch]
862*fb1b10abSAndroid Build Coastguard Worker%else
863*fb1b10abSAndroid Build Coastguard Worker    add         rdi,        r8
864*fb1b10abSAndroid Build Coastguard Worker%endif
865*fb1b10abSAndroid Build Coastguard Worker    dec         rcx
866*fb1b10abSAndroid Build Coastguard Worker    jnz         .vp8_filter_block1d4_v4_ssse3_loop
867*fb1b10abSAndroid Build Coastguard Worker
868*fb1b10abSAndroid Build Coastguard Worker    ; begin epilog
869*fb1b10abSAndroid Build Coastguard Worker    pop rdi
870*fb1b10abSAndroid Build Coastguard Worker    pop rsi
871*fb1b10abSAndroid Build Coastguard Worker    RESTORE_GOT
872*fb1b10abSAndroid Build Coastguard Worker    UNSHADOW_ARGS
873*fb1b10abSAndroid Build Coastguard Worker    pop         rbp
874*fb1b10abSAndroid Build Coastguard Worker    ret
875*fb1b10abSAndroid Build Coastguard Worker
876*fb1b10abSAndroid Build Coastguard Worker;void vp8_bilinear_predict16x16_ssse3
877*fb1b10abSAndroid Build Coastguard Worker;(
878*fb1b10abSAndroid Build Coastguard Worker;    unsigned char  *src_ptr,
879*fb1b10abSAndroid Build Coastguard Worker;    int   src_pixels_per_line,
880*fb1b10abSAndroid Build Coastguard Worker;    int  xoffset,
881*fb1b10abSAndroid Build Coastguard Worker;    int  yoffset,
882*fb1b10abSAndroid Build Coastguard Worker;    unsigned char *dst_ptr,
883*fb1b10abSAndroid Build Coastguard Worker;    int dst_pitch
884*fb1b10abSAndroid Build Coastguard Worker;)
;-----------------------------------------------------------------------
; vp8_bilinear_predict16x16_ssse3
;
; Bilinear prediction of one 16x16 block: an optional horizontal 2-tap
; pass followed by an optional vertical 2-tap pass.
;
; Arguments (read through arg(n)):
;   arg(0) src_ptr               arg(3) yoffset
;   arg(1) src_pixels_per_line   arg(4) dst_ptr
;   arg(2) xoffset               arg(5) dst_pitch
;
; xoffset / yoffset are shifted left by 4 and added to
; vp8_bilinear_filters_ssse3, i.e. each offset selects one 16-byte
; table entry that is loaded with movdqa.
;
; Code paths:
;   xoffset != 0, yoffset != 0 : horizontal then vertical  (.next_row)
;   xoffset == 0               : vertical only             (.b16x16_sp_only)
;   xoffset != 0, yoffset == 0 : horizontal only           (.b16x16_fp_only)
;
; Every loop ends when rdi reaches rcx = dst_ptr + 16 * dst_pitch.
; All destination rows are written with movdqa, so each destination row
; address must be 16-byte aligned.
; The horizontal passes read 17 bytes per source row (offsets 0..16);
; the two paths that filter vertically read 17 source rows.
;
; NOTE(review): the xoffset == 0 path never tests yoffset. A comment in
; vp8_bilinear_predict8x8_ssse3 states that xoffset == 0 && yoffset == 0
; is handled incorrectly there because pmaddubsw treats the tap value
; 128 as -128 and packuswb then saturates to 0. This routine uses the
; same table and the same instruction sequence, so presumably the same
; limitation applies here -- confirm that callers never pass both
; offsets as zero.
;-----------------------------------------------------------------------
885*fb1b10abSAndroid Build Coastguard Workerglobalsym(vp8_bilinear_predict16x16_ssse3)
886*fb1b10abSAndroid Build Coastguard Workersym(vp8_bilinear_predict16x16_ssse3):
887*fb1b10abSAndroid Build Coastguard Worker    push        rbp
888*fb1b10abSAndroid Build Coastguard Worker    mov         rbp, rsp
889*fb1b10abSAndroid Build Coastguard Worker    SHADOW_ARGS_TO_STACK 6
890*fb1b10abSAndroid Build Coastguard Worker    SAVE_XMM 7
891*fb1b10abSAndroid Build Coastguard Worker    GET_GOT     rbx
892*fb1b10abSAndroid Build Coastguard Worker    push        rsi
893*fb1b10abSAndroid Build Coastguard Worker    push        rdi
894*fb1b10abSAndroid Build Coastguard Worker    ; end prolog
895*fb1b10abSAndroid Build Coastguard Worker
896*fb1b10abSAndroid Build Coastguard Worker        lea         rcx,        [GLOBAL(vp8_bilinear_filters_ssse3)]
897*fb1b10abSAndroid Build Coastguard Worker        movsxd      rax,        dword ptr arg(2)    ; xoffset
898*fb1b10abSAndroid Build Coastguard Worker
899*fb1b10abSAndroid Build Coastguard Worker        cmp         rax,        0                   ; skip first_pass filter if xoffset=0
900*fb1b10abSAndroid Build Coastguard Worker        je          .b16x16_sp_only
901*fb1b10abSAndroid Build Coastguard Worker
        ; HFilter = table base + xoffset * 16 (one 16-byte entry per offset)
902*fb1b10abSAndroid Build Coastguard Worker        shl         rax,        4
903*fb1b10abSAndroid Build Coastguard Worker        lea         rax,        [rax + rcx]         ; HFilter
904*fb1b10abSAndroid Build Coastguard Worker
905*fb1b10abSAndroid Build Coastguard Worker        mov         rdi,        arg(4)              ; dst_ptr
906*fb1b10abSAndroid Build Coastguard Worker        mov         rsi,        arg(0)              ; src_ptr
907*fb1b10abSAndroid Build Coastguard Worker        movsxd      rdx,        dword ptr arg(5)    ; dst_pitch
908*fb1b10abSAndroid Build Coastguard Worker
909*fb1b10abSAndroid Build Coastguard Worker        movdqa      xmm1,       [rax]               ; xmm1 = HFilter taps, live in every path below
910*fb1b10abSAndroid Build Coastguard Worker
911*fb1b10abSAndroid Build Coastguard Worker        movsxd      rax,        dword ptr arg(3)    ; yoffset
912*fb1b10abSAndroid Build Coastguard Worker
913*fb1b10abSAndroid Build Coastguard Worker        cmp         rax,        0                   ; skip second_pass filter if yoffset=0
914*fb1b10abSAndroid Build Coastguard Worker        je          .b16x16_fp_only
915*fb1b10abSAndroid Build Coastguard Worker
916*fb1b10abSAndroid Build Coastguard Worker        shl         rax,        4
917*fb1b10abSAndroid Build Coastguard Worker        lea         rax,        [rax + rcx]         ; VFilter
918*fb1b10abSAndroid Build Coastguard Worker
        ; rcx = dst_ptr + 16 * dst_pitch: end-of-block sentinel for the row
        ; loop (the filter table base in rcx is no longer needed).
919*fb1b10abSAndroid Build Coastguard Worker        lea         rcx,        [rdi+rdx*8]
920*fb1b10abSAndroid Build Coastguard Worker        lea         rcx,        [rcx+rdx*8]
921*fb1b10abSAndroid Build Coastguard Worker        movsxd      rdx,        dword ptr arg(1)    ; src_pixels_per_line
922*fb1b10abSAndroid Build Coastguard Worker
923*fb1b10abSAndroid Build Coastguard Worker        movdqa      xmm2,       [rax]               ; xmm2 = VFilter taps
924*fb1b10abSAndroid Build Coastguard Worker
        ; rdx now holds the source stride, so dst_pitch is kept in r8 on
        ; 64-bit builds; 32-bit builds re-read it from the stack in the loop.
925*fb1b10abSAndroid Build Coastguard Worker%if ABI_IS_32BIT=0
926*fb1b10abSAndroid Build Coastguard Worker        movsxd      r8,         dword ptr arg(5)    ; dst_pitch
927*fb1b10abSAndroid Build Coastguard Worker%endif
        ; Horizontally filter source row 0 and keep the packed result in
        ; xmm7; it is the "previous row" input of the first vertical blend.
928*fb1b10abSAndroid Build Coastguard Worker        movq        xmm3,       [rsi]               ; 00 01 02 03 04 05 06 07
929*fb1b10abSAndroid Build Coastguard Worker        movq        xmm5,       [rsi+1]             ; 01 02 03 04 05 06 07 08
930*fb1b10abSAndroid Build Coastguard Worker
931*fb1b10abSAndroid Build Coastguard Worker        punpcklbw   xmm3,       xmm5                ; 00 01 01 02 02 03 03 04 04 05 05 06 06 07 07 08
932*fb1b10abSAndroid Build Coastguard Worker        movq        xmm4,       [rsi+8]             ; 08 09 10 11 12 13 14 15
933*fb1b10abSAndroid Build Coastguard Worker
934*fb1b10abSAndroid Build Coastguard Worker        movq        xmm5,       [rsi+9]             ; 09 10 11 12 13 14 15 16
935*fb1b10abSAndroid Build Coastguard Worker
936*fb1b10abSAndroid Build Coastguard Worker        lea         rsi,        [rsi + rdx]         ; next line
937*fb1b10abSAndroid Build Coastguard Worker
938*fb1b10abSAndroid Build Coastguard Worker        pmaddubsw   xmm3,       xmm1                ; 00 02 04 06 08 10 12 14
939*fb1b10abSAndroid Build Coastguard Worker
940*fb1b10abSAndroid Build Coastguard Worker        punpcklbw   xmm4,       xmm5                ; 08 09 09 10 10 11 11 12 12 13 13 14 14 15 15 16
941*fb1b10abSAndroid Build Coastguard Worker        pmaddubsw   xmm4,       xmm1                ; 01 03 05 07 09 11 13 15
942*fb1b10abSAndroid Build Coastguard Worker
943*fb1b10abSAndroid Build Coastguard Worker        paddw       xmm3,       [GLOBAL(rd)]        ; xmm3 += round value
944*fb1b10abSAndroid Build Coastguard Worker        psraw       xmm3,       VP8_FILTER_SHIFT    ; xmm3 /= 128
945*fb1b10abSAndroid Build Coastguard Worker
946*fb1b10abSAndroid Build Coastguard Worker        paddw       xmm4,       [GLOBAL(rd)]        ; xmm4 += round value
947*fb1b10abSAndroid Build Coastguard Worker        psraw       xmm4,       VP8_FILTER_SHIFT    ; xmm4 /= 128
948*fb1b10abSAndroid Build Coastguard Worker
949*fb1b10abSAndroid Build Coastguard Worker        movdqa      xmm7,       xmm3
950*fb1b10abSAndroid Build Coastguard Worker        packuswb    xmm7,       xmm4                ; 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 15
951*fb1b10abSAndroid Build Coastguard Worker
        ; Two-pass loop, one output row per iteration:
        ;   xmm6 = horizontally filtered next source row (16 packed bytes)
        ;   xmm7 = horizontally filtered previous row
        ; The two rows are byte-interleaved and combined with VFilter (xmm2).
952*fb1b10abSAndroid Build Coastguard Worker.next_row:
953*fb1b10abSAndroid Build Coastguard Worker        movq        xmm6,       [rsi]               ; 00 01 02 03 04 05 06 07
954*fb1b10abSAndroid Build Coastguard Worker        movq        xmm5,       [rsi+1]             ; 01 02 03 04 05 06 07 08
955*fb1b10abSAndroid Build Coastguard Worker
956*fb1b10abSAndroid Build Coastguard Worker        punpcklbw   xmm6,       xmm5
957*fb1b10abSAndroid Build Coastguard Worker        movq        xmm4,       [rsi+8]             ; 08 09 10 11 12 13 14 15
958*fb1b10abSAndroid Build Coastguard Worker
959*fb1b10abSAndroid Build Coastguard Worker        movq        xmm5,       [rsi+9]             ; 09 10 11 12 13 14 15 16
960*fb1b10abSAndroid Build Coastguard Worker        lea         rsi,        [rsi + rdx]         ; next line
961*fb1b10abSAndroid Build Coastguard Worker
962*fb1b10abSAndroid Build Coastguard Worker        pmaddubsw   xmm6,       xmm1
963*fb1b10abSAndroid Build Coastguard Worker
964*fb1b10abSAndroid Build Coastguard Worker        punpcklbw   xmm4,       xmm5
965*fb1b10abSAndroid Build Coastguard Worker        pmaddubsw   xmm4,       xmm1
966*fb1b10abSAndroid Build Coastguard Worker
967*fb1b10abSAndroid Build Coastguard Worker        paddw       xmm6,       [GLOBAL(rd)]        ; xmm6 += round value
968*fb1b10abSAndroid Build Coastguard Worker        psraw       xmm6,       VP8_FILTER_SHIFT    ; xmm6 /= 128
969*fb1b10abSAndroid Build Coastguard Worker
970*fb1b10abSAndroid Build Coastguard Worker        paddw       xmm4,       [GLOBAL(rd)]        ; xmm4 += round value
971*fb1b10abSAndroid Build Coastguard Worker        psraw       xmm4,       VP8_FILTER_SHIFT    ; xmm4 /= 128
972*fb1b10abSAndroid Build Coastguard Worker
973*fb1b10abSAndroid Build Coastguard Worker        packuswb    xmm6,       xmm4
974*fb1b10abSAndroid Build Coastguard Worker        movdqa      xmm5,       xmm7
975*fb1b10abSAndroid Build Coastguard Worker
        ; Vertical pass: low 8 pixels via punpcklbw, high 8 via punpckhbw.
976*fb1b10abSAndroid Build Coastguard Worker        punpcklbw   xmm5,       xmm6
977*fb1b10abSAndroid Build Coastguard Worker        pmaddubsw   xmm5,       xmm2
978*fb1b10abSAndroid Build Coastguard Worker
979*fb1b10abSAndroid Build Coastguard Worker        punpckhbw   xmm7,       xmm6
980*fb1b10abSAndroid Build Coastguard Worker        pmaddubsw   xmm7,       xmm2
981*fb1b10abSAndroid Build Coastguard Worker
982*fb1b10abSAndroid Build Coastguard Worker        paddw       xmm5,       [GLOBAL(rd)]        ; xmm5 += round value
983*fb1b10abSAndroid Build Coastguard Worker        psraw       xmm5,       VP8_FILTER_SHIFT    ; xmm5 /= 128
984*fb1b10abSAndroid Build Coastguard Worker
985*fb1b10abSAndroid Build Coastguard Worker        paddw       xmm7,       [GLOBAL(rd)]        ; xmm7 += round value
986*fb1b10abSAndroid Build Coastguard Worker        psraw       xmm7,       VP8_FILTER_SHIFT    ; xmm7 /= 128
987*fb1b10abSAndroid Build Coastguard Worker
988*fb1b10abSAndroid Build Coastguard Worker        packuswb    xmm5,       xmm7
989*fb1b10abSAndroid Build Coastguard Worker        movdqa      xmm7,       xmm6                ; current filtered row becomes the previous row
990*fb1b10abSAndroid Build Coastguard Worker
991*fb1b10abSAndroid Build Coastguard Worker        movdqa      [rdi],      xmm5                ; store the results in the destination
992*fb1b10abSAndroid Build Coastguard Worker%if ABI_IS_32BIT
993*fb1b10abSAndroid Build Coastguard Worker        add         rdi,        DWORD PTR arg(5)    ; dst_pitch
994*fb1b10abSAndroid Build Coastguard Worker%else
995*fb1b10abSAndroid Build Coastguard Worker        add         rdi,        r8
996*fb1b10abSAndroid Build Coastguard Worker%endif
997*fb1b10abSAndroid Build Coastguard Worker
998*fb1b10abSAndroid Build Coastguard Worker        cmp         rdi,        rcx
999*fb1b10abSAndroid Build Coastguard Worker        jne         .next_row
1000*fb1b10abSAndroid Build Coastguard Worker
1001*fb1b10abSAndroid Build Coastguard Worker        jmp         .done
1002*fb1b10abSAndroid Build Coastguard Worker
        ; Vertical-only path (xoffset == 0). Reached before rdi/rsi/rdx are
        ; set up, so all of them are loaded here. yoffset is used as a table
        ; index without being tested against 0 (see the review note in the
        ; routine header).
1003*fb1b10abSAndroid Build Coastguard Worker.b16x16_sp_only:
1004*fb1b10abSAndroid Build Coastguard Worker        movsxd      rax,        dword ptr arg(3)    ; yoffset
1005*fb1b10abSAndroid Build Coastguard Worker        shl         rax,        4
1006*fb1b10abSAndroid Build Coastguard Worker        lea         rax,        [rax + rcx]         ; VFilter
1007*fb1b10abSAndroid Build Coastguard Worker
1008*fb1b10abSAndroid Build Coastguard Worker        mov         rdi,        arg(4)              ; dst_ptr
1009*fb1b10abSAndroid Build Coastguard Worker        mov         rsi,        arg(0)              ; src_ptr
1010*fb1b10abSAndroid Build Coastguard Worker        movsxd      rdx,        dword ptr arg(5)    ; dst_pitch
1011*fb1b10abSAndroid Build Coastguard Worker
1012*fb1b10abSAndroid Build Coastguard Worker        movdqa      xmm1,       [rax]               ; VFilter
1013*fb1b10abSAndroid Build Coastguard Worker
        ; rcx = dst_ptr + 16 * dst_pitch (loop end); rax = source stride
1014*fb1b10abSAndroid Build Coastguard Worker        lea         rcx,        [rdi+rdx*8]
1015*fb1b10abSAndroid Build Coastguard Worker        lea         rcx,        [rcx+rdx*8]
1016*fb1b10abSAndroid Build Coastguard Worker        movsxd      rax,        dword ptr arg(1)    ; src_pixels_per_line
1017*fb1b10abSAndroid Build Coastguard Worker
1018*fb1b10abSAndroid Build Coastguard Worker        ; get the first horizontal line done
1019*fb1b10abSAndroid Build Coastguard Worker        movq        xmm4,       [rsi]               ; load row 0, bytes 0-7
1020*fb1b10abSAndroid Build Coastguard Worker        movq        xmm2,       [rsi + 8]           ; load row 0, bytes 8-15
1021*fb1b10abSAndroid Build Coastguard Worker
1022*fb1b10abSAndroid Build Coastguard Worker        lea         rsi,        [rsi + rax]         ; next line
        ; Two output rows per iteration. On entry xmm4/xmm2 hold the low/high
        ; 8 bytes of the current source row; rows +1 and +2 are loaded here and
        ; row +2 is carried into the next iteration through xmm7/xmm6.
1023*fb1b10abSAndroid Build Coastguard Worker.next_row_sp:
1024*fb1b10abSAndroid Build Coastguard Worker        movq        xmm3,       [rsi]               ; load row + 1
1025*fb1b10abSAndroid Build Coastguard Worker        movq        xmm5,       [rsi + 8]           ; load row + 1
1026*fb1b10abSAndroid Build Coastguard Worker
        ; interleave row N with row N+1 so each pmaddubsw word combines one
        ; pixel from each row
1027*fb1b10abSAndroid Build Coastguard Worker        punpcklbw   xmm4,       xmm3
1028*fb1b10abSAndroid Build Coastguard Worker        punpcklbw   xmm2,       xmm5
1029*fb1b10abSAndroid Build Coastguard Worker
1030*fb1b10abSAndroid Build Coastguard Worker        pmaddubsw   xmm4,       xmm1
1031*fb1b10abSAndroid Build Coastguard Worker        movq        xmm7,       [rsi + rax]         ; load row + 2
1032*fb1b10abSAndroid Build Coastguard Worker
1033*fb1b10abSAndroid Build Coastguard Worker        pmaddubsw   xmm2,       xmm1
1034*fb1b10abSAndroid Build Coastguard Worker        movq        xmm6,       [rsi + rax + 8]     ; load row + 2
1035*fb1b10abSAndroid Build Coastguard Worker
1036*fb1b10abSAndroid Build Coastguard Worker        punpcklbw   xmm3,       xmm7
1037*fb1b10abSAndroid Build Coastguard Worker        punpcklbw   xmm5,       xmm6
1038*fb1b10abSAndroid Build Coastguard Worker
1039*fb1b10abSAndroid Build Coastguard Worker        pmaddubsw   xmm3,       xmm1
1040*fb1b10abSAndroid Build Coastguard Worker        paddw       xmm4,       [GLOBAL(rd)]
1041*fb1b10abSAndroid Build Coastguard Worker
1042*fb1b10abSAndroid Build Coastguard Worker        pmaddubsw   xmm5,       xmm1
1043*fb1b10abSAndroid Build Coastguard Worker        paddw       xmm2,       [GLOBAL(rd)]
1044*fb1b10abSAndroid Build Coastguard Worker
1045*fb1b10abSAndroid Build Coastguard Worker        psraw       xmm4,       VP8_FILTER_SHIFT
1046*fb1b10abSAndroid Build Coastguard Worker        psraw       xmm2,       VP8_FILTER_SHIFT
1047*fb1b10abSAndroid Build Coastguard Worker
1048*fb1b10abSAndroid Build Coastguard Worker        packuswb    xmm4,       xmm2
1049*fb1b10abSAndroid Build Coastguard Worker        paddw       xmm3,       [GLOBAL(rd)]
1050*fb1b10abSAndroid Build Coastguard Worker
1051*fb1b10abSAndroid Build Coastguard Worker        movdqa      [rdi],      xmm4                ; store row 0
1052*fb1b10abSAndroid Build Coastguard Worker        paddw       xmm5,       [GLOBAL(rd)]
1053*fb1b10abSAndroid Build Coastguard Worker
1054*fb1b10abSAndroid Build Coastguard Worker        psraw       xmm3,       VP8_FILTER_SHIFT
1055*fb1b10abSAndroid Build Coastguard Worker        psraw       xmm5,       VP8_FILTER_SHIFT
1056*fb1b10abSAndroid Build Coastguard Worker
1057*fb1b10abSAndroid Build Coastguard Worker        packuswb    xmm3,       xmm5
1058*fb1b10abSAndroid Build Coastguard Worker        movdqa      xmm4,       xmm7                ; row + 2 (low half) becomes the next current row
1059*fb1b10abSAndroid Build Coastguard Worker
1060*fb1b10abSAndroid Build Coastguard Worker        movdqa      [rdi + rdx],xmm3                ; store row 1
1061*fb1b10abSAndroid Build Coastguard Worker        lea         rsi,        [rsi + 2*rax]
1062*fb1b10abSAndroid Build Coastguard Worker
1063*fb1b10abSAndroid Build Coastguard Worker        movdqa      xmm2,       xmm6                ; row + 2 (high half) becomes the next current row
1064*fb1b10abSAndroid Build Coastguard Worker        lea         rdi,        [rdi + 2*rdx]
1065*fb1b10abSAndroid Build Coastguard Worker
1066*fb1b10abSAndroid Build Coastguard Worker        cmp         rdi,        rcx
1067*fb1b10abSAndroid Build Coastguard Worker        jne         .next_row_sp
1068*fb1b10abSAndroid Build Coastguard Worker
1069*fb1b10abSAndroid Build Coastguard Worker        jmp         .done
1070*fb1b10abSAndroid Build Coastguard Worker
        ; Horizontal-only path (yoffset == 0). Entered with rsi = src_ptr,
        ; rdi = dst_ptr, rdx = dst_pitch and xmm1 = HFilter already loaded by
        ; the common setup code above.
1071*fb1b10abSAndroid Build Coastguard Worker.b16x16_fp_only:
1072*fb1b10abSAndroid Build Coastguard Worker        lea         rcx,        [rdi+rdx*8]
1073*fb1b10abSAndroid Build Coastguard Worker        lea         rcx,        [rcx+rdx*8]
1074*fb1b10abSAndroid Build Coastguard Worker        movsxd      rax,        dword ptr arg(1)    ; src_pixels_per_line
1075*fb1b10abSAndroid Build Coastguard Worker
        ; Two source rows are filtered and stored per iteration; the loads of
        ; the second row are interleaved with the arithmetic of the first.
1076*fb1b10abSAndroid Build Coastguard Worker.next_row_fp:
1077*fb1b10abSAndroid Build Coastguard Worker        movq        xmm2,       [rsi]               ; 00 01 02 03 04 05 06 07
1078*fb1b10abSAndroid Build Coastguard Worker        movq        xmm4,       [rsi+1]             ; 01 02 03 04 05 06 07 08
1079*fb1b10abSAndroid Build Coastguard Worker
1080*fb1b10abSAndroid Build Coastguard Worker        punpcklbw   xmm2,       xmm4
1081*fb1b10abSAndroid Build Coastguard Worker        movq        xmm3,       [rsi+8]             ; 08 09 10 11 12 13 14 15
1082*fb1b10abSAndroid Build Coastguard Worker
1083*fb1b10abSAndroid Build Coastguard Worker        pmaddubsw   xmm2,       xmm1
1084*fb1b10abSAndroid Build Coastguard Worker        movq        xmm4,       [rsi+9]             ; 09 10 11 12 13 14 15 16
1085*fb1b10abSAndroid Build Coastguard Worker
1086*fb1b10abSAndroid Build Coastguard Worker        lea         rsi,        [rsi + rax]         ; next line
1087*fb1b10abSAndroid Build Coastguard Worker        punpcklbw   xmm3,       xmm4
1088*fb1b10abSAndroid Build Coastguard Worker
1089*fb1b10abSAndroid Build Coastguard Worker        pmaddubsw   xmm3,       xmm1
1090*fb1b10abSAndroid Build Coastguard Worker        movq        xmm5,       [rsi]               ; second row, bytes 0-7
1091*fb1b10abSAndroid Build Coastguard Worker
1092*fb1b10abSAndroid Build Coastguard Worker        paddw       xmm2,       [GLOBAL(rd)]
1093*fb1b10abSAndroid Build Coastguard Worker        movq        xmm7,       [rsi+1]             ; second row, bytes 1-8
1094*fb1b10abSAndroid Build Coastguard Worker
1095*fb1b10abSAndroid Build Coastguard Worker        movq        xmm6,       [rsi+8]             ; second row, bytes 8-15
1096*fb1b10abSAndroid Build Coastguard Worker        psraw       xmm2,       VP8_FILTER_SHIFT
1097*fb1b10abSAndroid Build Coastguard Worker
1098*fb1b10abSAndroid Build Coastguard Worker        punpcklbw   xmm5,       xmm7
1099*fb1b10abSAndroid Build Coastguard Worker        movq        xmm7,       [rsi+9]             ; second row, bytes 9-16
1100*fb1b10abSAndroid Build Coastguard Worker
1101*fb1b10abSAndroid Build Coastguard Worker        paddw       xmm3,       [GLOBAL(rd)]
1102*fb1b10abSAndroid Build Coastguard Worker        pmaddubsw   xmm5,       xmm1
1103*fb1b10abSAndroid Build Coastguard Worker
1104*fb1b10abSAndroid Build Coastguard Worker        psraw       xmm3,       VP8_FILTER_SHIFT
1105*fb1b10abSAndroid Build Coastguard Worker        punpcklbw   xmm6,       xmm7
1106*fb1b10abSAndroid Build Coastguard Worker
1107*fb1b10abSAndroid Build Coastguard Worker        packuswb    xmm2,       xmm3
1108*fb1b10abSAndroid Build Coastguard Worker        pmaddubsw   xmm6,       xmm1
1109*fb1b10abSAndroid Build Coastguard Worker
1110*fb1b10abSAndroid Build Coastguard Worker        movdqa      [rdi],      xmm2                ; store the results in the destination
1111*fb1b10abSAndroid Build Coastguard Worker        paddw       xmm5,       [GLOBAL(rd)]
1112*fb1b10abSAndroid Build Coastguard Worker
1113*fb1b10abSAndroid Build Coastguard Worker        lea         rdi,        [rdi + rdx]         ; dst_pitch
1114*fb1b10abSAndroid Build Coastguard Worker        psraw       xmm5,       VP8_FILTER_SHIFT
1115*fb1b10abSAndroid Build Coastguard Worker
1116*fb1b10abSAndroid Build Coastguard Worker        paddw       xmm6,       [GLOBAL(rd)]
1117*fb1b10abSAndroid Build Coastguard Worker        psraw       xmm6,       VP8_FILTER_SHIFT
1118*fb1b10abSAndroid Build Coastguard Worker
1119*fb1b10abSAndroid Build Coastguard Worker        packuswb    xmm5,       xmm6
1120*fb1b10abSAndroid Build Coastguard Worker        lea         rsi,        [rsi + rax]         ; next line
1121*fb1b10abSAndroid Build Coastguard Worker
1122*fb1b10abSAndroid Build Coastguard Worker        movdqa      [rdi],      xmm5                ; store the results in the destination
1123*fb1b10abSAndroid Build Coastguard Worker        lea         rdi,        [rdi + rdx]         ; dst_pitch
1124*fb1b10abSAndroid Build Coastguard Worker
1125*fb1b10abSAndroid Build Coastguard Worker        cmp         rdi,        rcx
1126*fb1b10abSAndroid Build Coastguard Worker
1127*fb1b10abSAndroid Build Coastguard Worker        jne         .next_row_fp
1128*fb1b10abSAndroid Build Coastguard Worker
        ; Common exit: the epilog undoes the prolog in reverse order.
1129*fb1b10abSAndroid Build Coastguard Worker.done:
1130*fb1b10abSAndroid Build Coastguard Worker    ; begin epilog
1131*fb1b10abSAndroid Build Coastguard Worker    pop         rdi
1132*fb1b10abSAndroid Build Coastguard Worker    pop         rsi
1133*fb1b10abSAndroid Build Coastguard Worker    RESTORE_GOT
1134*fb1b10abSAndroid Build Coastguard Worker    RESTORE_XMM
1135*fb1b10abSAndroid Build Coastguard Worker    UNSHADOW_ARGS
1136*fb1b10abSAndroid Build Coastguard Worker    pop         rbp
1137*fb1b10abSAndroid Build Coastguard Worker    ret
1138*fb1b10abSAndroid Build Coastguard Worker
1139*fb1b10abSAndroid Build Coastguard Worker;void vp8_bilinear_predict8x8_ssse3
1140*fb1b10abSAndroid Build Coastguard Worker;(
1141*fb1b10abSAndroid Build Coastguard Worker;    unsigned char  *src_ptr,
1142*fb1b10abSAndroid Build Coastguard Worker;    int   src_pixels_per_line,
1143*fb1b10abSAndroid Build Coastguard Worker;    int  xoffset,
1144*fb1b10abSAndroid Build Coastguard Worker;    int  yoffset,
1145*fb1b10abSAndroid Build Coastguard Worker;    unsigned char *dst_ptr,
1146*fb1b10abSAndroid Build Coastguard Worker;    int dst_pitch
1147*fb1b10abSAndroid Build Coastguard Worker;)
1148*fb1b10abSAndroid Build Coastguard Workerglobalsym(vp8_bilinear_predict8x8_ssse3)
1149*fb1b10abSAndroid Build Coastguard Workersym(vp8_bilinear_predict8x8_ssse3):
1150*fb1b10abSAndroid Build Coastguard Worker    push        rbp
1151*fb1b10abSAndroid Build Coastguard Worker    mov         rbp, rsp
1152*fb1b10abSAndroid Build Coastguard Worker    SHADOW_ARGS_TO_STACK 6
1153*fb1b10abSAndroid Build Coastguard Worker    SAVE_XMM 7
1154*fb1b10abSAndroid Build Coastguard Worker    GET_GOT     rbx
1155*fb1b10abSAndroid Build Coastguard Worker    push        rsi
1156*fb1b10abSAndroid Build Coastguard Worker    push        rdi
1157*fb1b10abSAndroid Build Coastguard Worker    ; end prolog
1158*fb1b10abSAndroid Build Coastguard Worker
1159*fb1b10abSAndroid Build Coastguard Worker    ALIGN_STACK 16, rax
1160*fb1b10abSAndroid Build Coastguard Worker    sub         rsp, 144                         ; reserve 144 bytes
1161*fb1b10abSAndroid Build Coastguard Worker
1162*fb1b10abSAndroid Build Coastguard Worker        lea         rcx,        [GLOBAL(vp8_bilinear_filters_ssse3)]
1163*fb1b10abSAndroid Build Coastguard Worker
1164*fb1b10abSAndroid Build Coastguard Worker        mov         rsi,        arg(0) ;src_ptr
1165*fb1b10abSAndroid Build Coastguard Worker        movsxd      rdx,        dword ptr arg(1) ;src_pixels_per_line
1166*fb1b10abSAndroid Build Coastguard Worker
1167*fb1b10abSAndroid Build Coastguard Worker    ;Read 9-line unaligned data in and put them on stack. This gives a big
1168*fb1b10abSAndroid Build Coastguard Worker    ;performance boost.
1169*fb1b10abSAndroid Build Coastguard Worker        movdqu      xmm0,       [rsi]
1170*fb1b10abSAndroid Build Coastguard Worker        lea         rax,        [rdx + rdx*2]
1171*fb1b10abSAndroid Build Coastguard Worker        movdqu      xmm1,       [rsi+rdx]
1172*fb1b10abSAndroid Build Coastguard Worker        movdqu      xmm2,       [rsi+rdx*2]
1173*fb1b10abSAndroid Build Coastguard Worker        add         rsi,        rax
1174*fb1b10abSAndroid Build Coastguard Worker        movdqu      xmm3,       [rsi]
1175*fb1b10abSAndroid Build Coastguard Worker        movdqu      xmm4,       [rsi+rdx]
1176*fb1b10abSAndroid Build Coastguard Worker        movdqu      xmm5,       [rsi+rdx*2]
1177*fb1b10abSAndroid Build Coastguard Worker        add         rsi,        rax
1178*fb1b10abSAndroid Build Coastguard Worker        movdqu      xmm6,       [rsi]
1179*fb1b10abSAndroid Build Coastguard Worker        movdqu      xmm7,       [rsi+rdx]
1180*fb1b10abSAndroid Build Coastguard Worker
1181*fb1b10abSAndroid Build Coastguard Worker        movdqa      XMMWORD PTR [rsp],            xmm0
1182*fb1b10abSAndroid Build Coastguard Worker
1183*fb1b10abSAndroid Build Coastguard Worker        movdqu      xmm0,       [rsi+rdx*2]
1184*fb1b10abSAndroid Build Coastguard Worker
1185*fb1b10abSAndroid Build Coastguard Worker        movdqa      XMMWORD PTR [rsp+16],         xmm1
1186*fb1b10abSAndroid Build Coastguard Worker        movdqa      XMMWORD PTR [rsp+32],         xmm2
1187*fb1b10abSAndroid Build Coastguard Worker        movdqa      XMMWORD PTR [rsp+48],         xmm3
1188*fb1b10abSAndroid Build Coastguard Worker        movdqa      XMMWORD PTR [rsp+64],         xmm4
1189*fb1b10abSAndroid Build Coastguard Worker        movdqa      XMMWORD PTR [rsp+80],         xmm5
1190*fb1b10abSAndroid Build Coastguard Worker        movdqa      XMMWORD PTR [rsp+96],         xmm6
1191*fb1b10abSAndroid Build Coastguard Worker        movdqa      XMMWORD PTR [rsp+112],        xmm7
1192*fb1b10abSAndroid Build Coastguard Worker        movdqa      XMMWORD PTR [rsp+128],        xmm0
1193*fb1b10abSAndroid Build Coastguard Worker
1194*fb1b10abSAndroid Build Coastguard Worker        movsxd      rax,        dword ptr arg(2)    ; xoffset
1195*fb1b10abSAndroid Build Coastguard Worker        cmp         rax,        0                   ; skip first_pass filter if xoffset=0
1196*fb1b10abSAndroid Build Coastguard Worker        je          .b8x8_sp_only
1197*fb1b10abSAndroid Build Coastguard Worker
1198*fb1b10abSAndroid Build Coastguard Worker        shl         rax,        4
1199*fb1b10abSAndroid Build Coastguard Worker        add         rax,        rcx                 ; HFilter
1200*fb1b10abSAndroid Build Coastguard Worker
1201*fb1b10abSAndroid Build Coastguard Worker        mov         rdi,        arg(4)              ; dst_ptr
1202*fb1b10abSAndroid Build Coastguard Worker        movsxd      rdx,        dword ptr arg(5)    ; dst_pitch
1203*fb1b10abSAndroid Build Coastguard Worker
1204*fb1b10abSAndroid Build Coastguard Worker        movdqa      xmm0,       [rax]
1205*fb1b10abSAndroid Build Coastguard Worker
1206*fb1b10abSAndroid Build Coastguard Worker        movsxd      rax,        dword ptr arg(3)    ; yoffset
1207*fb1b10abSAndroid Build Coastguard Worker        cmp         rax,        0                   ; skip second_pass filter if yoffset=0
1208*fb1b10abSAndroid Build Coastguard Worker        je          .b8x8_fp_only
1209*fb1b10abSAndroid Build Coastguard Worker
1210*fb1b10abSAndroid Build Coastguard Worker        shl         rax,        4
1211*fb1b10abSAndroid Build Coastguard Worker        lea         rax,        [rax + rcx]         ; VFilter
1212*fb1b10abSAndroid Build Coastguard Worker
1213*fb1b10abSAndroid Build Coastguard Worker        lea         rcx,        [rdi+rdx*8]
1214*fb1b10abSAndroid Build Coastguard Worker
1215*fb1b10abSAndroid Build Coastguard Worker        movdqa      xmm1,       [rax]
1216*fb1b10abSAndroid Build Coastguard Worker
1217*fb1b10abSAndroid Build Coastguard Worker        ; get the first horizontal line done
1218*fb1b10abSAndroid Build Coastguard Worker        movdqa      xmm3,       [rsp]               ; 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 15
1219*fb1b10abSAndroid Build Coastguard Worker        movdqa      xmm5,       xmm3                ; 01 02 03 04 05 06 07 08 09 10 11 12 13 14 15 xx
1220*fb1b10abSAndroid Build Coastguard Worker
1221*fb1b10abSAndroid Build Coastguard Worker        psrldq      xmm5,       1
1222*fb1b10abSAndroid Build Coastguard Worker        lea         rsp,        [rsp + 16]          ; next line
1223*fb1b10abSAndroid Build Coastguard Worker
1224*fb1b10abSAndroid Build Coastguard Worker        punpcklbw   xmm3,       xmm5                ; 00 01 01 02 02 03 03 04 04 05 05 06 06 07 07 08
1225*fb1b10abSAndroid Build Coastguard Worker        pmaddubsw   xmm3,       xmm0                ; 00 02 04 06 08 10 12 14
1226*fb1b10abSAndroid Build Coastguard Worker
1227*fb1b10abSAndroid Build Coastguard Worker        paddw       xmm3,       [GLOBAL(rd)]        ; xmm3 += round value
1228*fb1b10abSAndroid Build Coastguard Worker        psraw       xmm3,       VP8_FILTER_SHIFT    ; xmm3 /= 128
1229*fb1b10abSAndroid Build Coastguard Worker
1230*fb1b10abSAndroid Build Coastguard Worker        movdqa      xmm7,       xmm3
1231*fb1b10abSAndroid Build Coastguard Worker        packuswb    xmm7,       xmm7                ; 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 15
1232*fb1b10abSAndroid Build Coastguard Worker
1233*fb1b10abSAndroid Build Coastguard Worker.next_row:
1234*fb1b10abSAndroid Build Coastguard Worker        movdqa      xmm6,       [rsp]               ; 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 15
1235*fb1b10abSAndroid Build Coastguard Worker        lea         rsp,        [rsp + 16]          ; next line
1236*fb1b10abSAndroid Build Coastguard Worker
1237*fb1b10abSAndroid Build Coastguard Worker        movdqa      xmm5,       xmm6
1238*fb1b10abSAndroid Build Coastguard Worker
1239*fb1b10abSAndroid Build Coastguard Worker        psrldq      xmm5,       1
1240*fb1b10abSAndroid Build Coastguard Worker
1241*fb1b10abSAndroid Build Coastguard Worker        punpcklbw   xmm6,       xmm5
1242*fb1b10abSAndroid Build Coastguard Worker        pmaddubsw   xmm6,       xmm0
1243*fb1b10abSAndroid Build Coastguard Worker
1244*fb1b10abSAndroid Build Coastguard Worker        paddw       xmm6,       [GLOBAL(rd)]        ; xmm6 += round value
1245*fb1b10abSAndroid Build Coastguard Worker        psraw       xmm6,       VP8_FILTER_SHIFT    ; xmm6 /= 128
1246*fb1b10abSAndroid Build Coastguard Worker
1247*fb1b10abSAndroid Build Coastguard Worker        packuswb    xmm6,       xmm6
1248*fb1b10abSAndroid Build Coastguard Worker
1249*fb1b10abSAndroid Build Coastguard Worker        punpcklbw   xmm7,       xmm6
1250*fb1b10abSAndroid Build Coastguard Worker        pmaddubsw   xmm7,       xmm1
1251*fb1b10abSAndroid Build Coastguard Worker
1252*fb1b10abSAndroid Build Coastguard Worker        paddw       xmm7,       [GLOBAL(rd)]        ; xmm7 += round value
1253*fb1b10abSAndroid Build Coastguard Worker        psraw       xmm7,       VP8_FILTER_SHIFT    ; xmm7 /= 128
1254*fb1b10abSAndroid Build Coastguard Worker
1255*fb1b10abSAndroid Build Coastguard Worker        packuswb    xmm7,       xmm7
1256*fb1b10abSAndroid Build Coastguard Worker
1257*fb1b10abSAndroid Build Coastguard Worker        movq        [rdi],      xmm7                ; store the results in the destination
1258*fb1b10abSAndroid Build Coastguard Worker        lea         rdi,        [rdi + rdx]
1259*fb1b10abSAndroid Build Coastguard Worker
1260*fb1b10abSAndroid Build Coastguard Worker        movdqa      xmm7,       xmm6
1261*fb1b10abSAndroid Build Coastguard Worker
1262*fb1b10abSAndroid Build Coastguard Worker        cmp         rdi,        rcx
1263*fb1b10abSAndroid Build Coastguard Worker        jne         .next_row
1264*fb1b10abSAndroid Build Coastguard Worker
1265*fb1b10abSAndroid Build Coastguard Worker        jmp         .done8x8
1266*fb1b10abSAndroid Build Coastguard Worker
1267*fb1b10abSAndroid Build Coastguard Worker.b8x8_sp_only:
1268*fb1b10abSAndroid Build Coastguard Worker        movsxd      rax,        dword ptr arg(3)    ; yoffset
1269*fb1b10abSAndroid Build Coastguard Worker        shl         rax,        4
1270*fb1b10abSAndroid Build Coastguard Worker        lea         rax,        [rax + rcx]         ; VFilter
1271*fb1b10abSAndroid Build Coastguard Worker
1272*fb1b10abSAndroid Build Coastguard Worker        mov         rdi,        arg(4) ;dst_ptr
1273*fb1b10abSAndroid Build Coastguard Worker        movsxd      rdx,        dword ptr arg(5)    ; dst_pitch
1274*fb1b10abSAndroid Build Coastguard Worker
1275*fb1b10abSAndroid Build Coastguard Worker        movdqa      xmm0,       [rax]               ; VFilter
1276*fb1b10abSAndroid Build Coastguard Worker
1277*fb1b10abSAndroid Build Coastguard Worker        movq        xmm1,       XMMWORD PTR [rsp]
1278*fb1b10abSAndroid Build Coastguard Worker        movq        xmm2,       XMMWORD PTR [rsp+16]
1279*fb1b10abSAndroid Build Coastguard Worker
1280*fb1b10abSAndroid Build Coastguard Worker        movq        xmm3,       XMMWORD PTR [rsp+32]
1281*fb1b10abSAndroid Build Coastguard Worker        punpcklbw   xmm1,       xmm2
1282*fb1b10abSAndroid Build Coastguard Worker
1283*fb1b10abSAndroid Build Coastguard Worker        movq        xmm4,       XMMWORD PTR [rsp+48]
1284*fb1b10abSAndroid Build Coastguard Worker        punpcklbw   xmm2,       xmm3
1285*fb1b10abSAndroid Build Coastguard Worker
1286*fb1b10abSAndroid Build Coastguard Worker        movq        xmm5,       XMMWORD PTR [rsp+64]
1287*fb1b10abSAndroid Build Coastguard Worker        punpcklbw   xmm3,       xmm4
1288*fb1b10abSAndroid Build Coastguard Worker
1289*fb1b10abSAndroid Build Coastguard Worker        movq        xmm6,       XMMWORD PTR [rsp+80]
1290*fb1b10abSAndroid Build Coastguard Worker        punpcklbw   xmm4,       xmm5
1291*fb1b10abSAndroid Build Coastguard Worker
1292*fb1b10abSAndroid Build Coastguard Worker        movq        xmm7,       XMMWORD PTR [rsp+96]
1293*fb1b10abSAndroid Build Coastguard Worker        punpcklbw   xmm5,       xmm6
1294*fb1b10abSAndroid Build Coastguard Worker
1295*fb1b10abSAndroid Build Coastguard Worker        ; Because the source register (xmm0) is always treated as signed by
1296*fb1b10abSAndroid Build Coastguard Worker        ; pmaddubsw, the constant '128' is treated as '-128'.
1297*fb1b10abSAndroid Build Coastguard Worker        pmaddubsw   xmm1,       xmm0
1298*fb1b10abSAndroid Build Coastguard Worker        pmaddubsw   xmm2,       xmm0
1299*fb1b10abSAndroid Build Coastguard Worker
1300*fb1b10abSAndroid Build Coastguard Worker        pmaddubsw   xmm3,       xmm0
1301*fb1b10abSAndroid Build Coastguard Worker        pmaddubsw   xmm4,       xmm0
1302*fb1b10abSAndroid Build Coastguard Worker
1303*fb1b10abSAndroid Build Coastguard Worker        pmaddubsw   xmm5,       xmm0
1304*fb1b10abSAndroid Build Coastguard Worker        punpcklbw   xmm6,       xmm7
1305*fb1b10abSAndroid Build Coastguard Worker
1306*fb1b10abSAndroid Build Coastguard Worker        pmaddubsw   xmm6,       xmm0
1307*fb1b10abSAndroid Build Coastguard Worker        paddw       xmm1,       [GLOBAL(rd)]
1308*fb1b10abSAndroid Build Coastguard Worker
1309*fb1b10abSAndroid Build Coastguard Worker        paddw       xmm2,       [GLOBAL(rd)]
1310*fb1b10abSAndroid Build Coastguard Worker        psraw       xmm1,       VP8_FILTER_SHIFT
1311*fb1b10abSAndroid Build Coastguard Worker
1312*fb1b10abSAndroid Build Coastguard Worker        paddw       xmm3,       [GLOBAL(rd)]
1313*fb1b10abSAndroid Build Coastguard Worker        psraw       xmm2,       VP8_FILTER_SHIFT
1314*fb1b10abSAndroid Build Coastguard Worker
1315*fb1b10abSAndroid Build Coastguard Worker        paddw       xmm4,       [GLOBAL(rd)]
1316*fb1b10abSAndroid Build Coastguard Worker        psraw       xmm3,       VP8_FILTER_SHIFT
1317*fb1b10abSAndroid Build Coastguard Worker
1318*fb1b10abSAndroid Build Coastguard Worker        paddw       xmm5,       [GLOBAL(rd)]
1319*fb1b10abSAndroid Build Coastguard Worker        psraw       xmm4,       VP8_FILTER_SHIFT
1320*fb1b10abSAndroid Build Coastguard Worker
1321*fb1b10abSAndroid Build Coastguard Worker        paddw       xmm6,       [GLOBAL(rd)]
1322*fb1b10abSAndroid Build Coastguard Worker        psraw       xmm5,       VP8_FILTER_SHIFT
1323*fb1b10abSAndroid Build Coastguard Worker
1324*fb1b10abSAndroid Build Coastguard Worker        psraw       xmm6,       VP8_FILTER_SHIFT
1325*fb1b10abSAndroid Build Coastguard Worker
1326*fb1b10abSAndroid Build Coastguard Worker        ; Having multiplied everything by '-128' and obtained negative
1327*fb1b10abSAndroid Build Coastguard Worker        ; numbers, the unsigned saturation truncates those values to 0,
1328*fb1b10abSAndroid Build Coastguard Worker        ; resulting in incorrect handling of xoffset == 0 && yoffset == 0
1329*fb1b10abSAndroid Build Coastguard Worker        packuswb    xmm1,       xmm1
1330*fb1b10abSAndroid Build Coastguard Worker
1331*fb1b10abSAndroid Build Coastguard Worker        packuswb    xmm2,       xmm2
1332*fb1b10abSAndroid Build Coastguard Worker        movq        [rdi],      xmm1
1333*fb1b10abSAndroid Build Coastguard Worker
1334*fb1b10abSAndroid Build Coastguard Worker        packuswb    xmm3,       xmm3
1335*fb1b10abSAndroid Build Coastguard Worker        movq        [rdi+rdx],  xmm2
1336*fb1b10abSAndroid Build Coastguard Worker
1337*fb1b10abSAndroid Build Coastguard Worker        packuswb    xmm4,       xmm4
1338*fb1b10abSAndroid Build Coastguard Worker        movq        xmm1,       XMMWORD PTR [rsp+112]
1339*fb1b10abSAndroid Build Coastguard Worker
1340*fb1b10abSAndroid Build Coastguard Worker        lea         rdi,        [rdi + 2*rdx]
1341*fb1b10abSAndroid Build Coastguard Worker        movq        xmm2,       XMMWORD PTR [rsp+128]
1342*fb1b10abSAndroid Build Coastguard Worker
1343*fb1b10abSAndroid Build Coastguard Worker        packuswb    xmm5,       xmm5
1344*fb1b10abSAndroid Build Coastguard Worker        movq        [rdi],      xmm3
1345*fb1b10abSAndroid Build Coastguard Worker
1346*fb1b10abSAndroid Build Coastguard Worker        packuswb    xmm6,       xmm6
1347*fb1b10abSAndroid Build Coastguard Worker        movq        [rdi+rdx],  xmm4
1348*fb1b10abSAndroid Build Coastguard Worker
1349*fb1b10abSAndroid Build Coastguard Worker        lea         rdi,        [rdi + 2*rdx]
1350*fb1b10abSAndroid Build Coastguard Worker        punpcklbw   xmm7,       xmm1
1351*fb1b10abSAndroid Build Coastguard Worker
1352*fb1b10abSAndroid Build Coastguard Worker        movq        [rdi],      xmm5
1353*fb1b10abSAndroid Build Coastguard Worker        pmaddubsw   xmm7,       xmm0
1354*fb1b10abSAndroid Build Coastguard Worker
1355*fb1b10abSAndroid Build Coastguard Worker        movq        [rdi+rdx],  xmm6
1356*fb1b10abSAndroid Build Coastguard Worker        punpcklbw   xmm1,       xmm2
1357*fb1b10abSAndroid Build Coastguard Worker
1358*fb1b10abSAndroid Build Coastguard Worker        pmaddubsw   xmm1,       xmm0
1359*fb1b10abSAndroid Build Coastguard Worker        paddw       xmm7,       [GLOBAL(rd)]
1360*fb1b10abSAndroid Build Coastguard Worker
1361*fb1b10abSAndroid Build Coastguard Worker        psraw       xmm7,       VP8_FILTER_SHIFT
1362*fb1b10abSAndroid Build Coastguard Worker        paddw       xmm1,       [GLOBAL(rd)]
1363*fb1b10abSAndroid Build Coastguard Worker
1364*fb1b10abSAndroid Build Coastguard Worker        psraw       xmm1,       VP8_FILTER_SHIFT
1365*fb1b10abSAndroid Build Coastguard Worker        packuswb    xmm7,       xmm7
1366*fb1b10abSAndroid Build Coastguard Worker
1367*fb1b10abSAndroid Build Coastguard Worker        packuswb    xmm1,       xmm1
1368*fb1b10abSAndroid Build Coastguard Worker        lea         rdi,        [rdi + 2*rdx]
1369*fb1b10abSAndroid Build Coastguard Worker
1370*fb1b10abSAndroid Build Coastguard Worker        movq        [rdi],      xmm7
1371*fb1b10abSAndroid Build Coastguard Worker
1372*fb1b10abSAndroid Build Coastguard Worker        movq        [rdi+rdx],  xmm1
1373*fb1b10abSAndroid Build Coastguard Worker        lea         rsp,        [rsp + 144]
1374*fb1b10abSAndroid Build Coastguard Worker
1375*fb1b10abSAndroid Build Coastguard Worker        jmp         .done8x8
1376*fb1b10abSAndroid Build Coastguard Worker
1377*fb1b10abSAndroid Build Coastguard Worker.b8x8_fp_only:
1378*fb1b10abSAndroid Build Coastguard Worker        lea         rcx,        [rdi+rdx*8]
1379*fb1b10abSAndroid Build Coastguard Worker
1380*fb1b10abSAndroid Build Coastguard Worker.next_row_fp:
1381*fb1b10abSAndroid Build Coastguard Worker        movdqa      xmm1,       XMMWORD PTR [rsp]
1382*fb1b10abSAndroid Build Coastguard Worker        movdqa      xmm3,       XMMWORD PTR [rsp+16]
1383*fb1b10abSAndroid Build Coastguard Worker
1384*fb1b10abSAndroid Build Coastguard Worker        movdqa      xmm2,       xmm1
1385*fb1b10abSAndroid Build Coastguard Worker        movdqa      xmm5,       XMMWORD PTR [rsp+32]
1386*fb1b10abSAndroid Build Coastguard Worker
1387*fb1b10abSAndroid Build Coastguard Worker        psrldq      xmm2,       1
1388*fb1b10abSAndroid Build Coastguard Worker        movdqa      xmm7,       XMMWORD PTR [rsp+48]
1389*fb1b10abSAndroid Build Coastguard Worker
1390*fb1b10abSAndroid Build Coastguard Worker        movdqa      xmm4,       xmm3
1391*fb1b10abSAndroid Build Coastguard Worker        psrldq      xmm4,       1
1392*fb1b10abSAndroid Build Coastguard Worker
1393*fb1b10abSAndroid Build Coastguard Worker        movdqa      xmm6,       xmm5
1394*fb1b10abSAndroid Build Coastguard Worker        psrldq      xmm6,       1
1395*fb1b10abSAndroid Build Coastguard Worker
1396*fb1b10abSAndroid Build Coastguard Worker        punpcklbw   xmm1,       xmm2
1397*fb1b10abSAndroid Build Coastguard Worker        pmaddubsw   xmm1,       xmm0
1398*fb1b10abSAndroid Build Coastguard Worker
1399*fb1b10abSAndroid Build Coastguard Worker        punpcklbw   xmm3,       xmm4
1400*fb1b10abSAndroid Build Coastguard Worker        pmaddubsw   xmm3,       xmm0
1401*fb1b10abSAndroid Build Coastguard Worker
1402*fb1b10abSAndroid Build Coastguard Worker        punpcklbw   xmm5,       xmm6
1403*fb1b10abSAndroid Build Coastguard Worker        pmaddubsw   xmm5,       xmm0
1404*fb1b10abSAndroid Build Coastguard Worker
1405*fb1b10abSAndroid Build Coastguard Worker        movdqa      xmm2,       xmm7
1406*fb1b10abSAndroid Build Coastguard Worker        psrldq      xmm2,       1
1407*fb1b10abSAndroid Build Coastguard Worker
1408*fb1b10abSAndroid Build Coastguard Worker        punpcklbw   xmm7,       xmm2
1409*fb1b10abSAndroid Build Coastguard Worker        pmaddubsw   xmm7,       xmm0
1410*fb1b10abSAndroid Build Coastguard Worker
1411*fb1b10abSAndroid Build Coastguard Worker        paddw       xmm1,       [GLOBAL(rd)]
1412*fb1b10abSAndroid Build Coastguard Worker        psraw       xmm1,       VP8_FILTER_SHIFT
1413*fb1b10abSAndroid Build Coastguard Worker
1414*fb1b10abSAndroid Build Coastguard Worker        paddw       xmm3,       [GLOBAL(rd)]
1415*fb1b10abSAndroid Build Coastguard Worker        psraw       xmm3,       VP8_FILTER_SHIFT
1416*fb1b10abSAndroid Build Coastguard Worker
1417*fb1b10abSAndroid Build Coastguard Worker        paddw       xmm5,       [GLOBAL(rd)]
1418*fb1b10abSAndroid Build Coastguard Worker        psraw       xmm5,       VP8_FILTER_SHIFT
1419*fb1b10abSAndroid Build Coastguard Worker
1420*fb1b10abSAndroid Build Coastguard Worker        paddw       xmm7,       [GLOBAL(rd)]
1421*fb1b10abSAndroid Build Coastguard Worker        psraw       xmm7,       VP8_FILTER_SHIFT
1422*fb1b10abSAndroid Build Coastguard Worker
1423*fb1b10abSAndroid Build Coastguard Worker        packuswb    xmm1,       xmm1
1424*fb1b10abSAndroid Build Coastguard Worker        packuswb    xmm3,       xmm3
1425*fb1b10abSAndroid Build Coastguard Worker
1426*fb1b10abSAndroid Build Coastguard Worker        packuswb    xmm5,       xmm5
1427*fb1b10abSAndroid Build Coastguard Worker        movq        [rdi],      xmm1
1428*fb1b10abSAndroid Build Coastguard Worker
1429*fb1b10abSAndroid Build Coastguard Worker        packuswb    xmm7,       xmm7
1430*fb1b10abSAndroid Build Coastguard Worker        movq        [rdi+rdx],  xmm3
1431*fb1b10abSAndroid Build Coastguard Worker
1432*fb1b10abSAndroid Build Coastguard Worker        lea         rdi,        [rdi + 2*rdx]
1433*fb1b10abSAndroid Build Coastguard Worker        movq        [rdi],      xmm5
1434*fb1b10abSAndroid Build Coastguard Worker
1435*fb1b10abSAndroid Build Coastguard Worker        lea         rsp,        [rsp + 4*16]
1436*fb1b10abSAndroid Build Coastguard Worker        movq        [rdi+rdx],  xmm7
1437*fb1b10abSAndroid Build Coastguard Worker
1438*fb1b10abSAndroid Build Coastguard Worker        lea         rdi,        [rdi + 2*rdx]
1439*fb1b10abSAndroid Build Coastguard Worker        cmp         rdi,        rcx
1440*fb1b10abSAndroid Build Coastguard Worker
1441*fb1b10abSAndroid Build Coastguard Worker        jne         .next_row_fp
1442*fb1b10abSAndroid Build Coastguard Worker
1443*fb1b10abSAndroid Build Coastguard Worker        lea         rsp,        [rsp + 16]
1444*fb1b10abSAndroid Build Coastguard Worker
1445*fb1b10abSAndroid Build Coastguard Worker.done8x8:
1446*fb1b10abSAndroid Build Coastguard Worker    ;add rsp, 144
1447*fb1b10abSAndroid Build Coastguard Worker    pop         rsp
1448*fb1b10abSAndroid Build Coastguard Worker    ; begin epilog
1449*fb1b10abSAndroid Build Coastguard Worker    pop         rdi
1450*fb1b10abSAndroid Build Coastguard Worker    pop         rsi
1451*fb1b10abSAndroid Build Coastguard Worker    RESTORE_GOT
1452*fb1b10abSAndroid Build Coastguard Worker    RESTORE_XMM
1453*fb1b10abSAndroid Build Coastguard Worker    UNSHADOW_ARGS
1454*fb1b10abSAndroid Build Coastguard Worker    pop         rbp
1455*fb1b10abSAndroid Build Coastguard Worker    ret
1456*fb1b10abSAndroid Build Coastguard Worker
; Constant tables start here. SECTION_RODATA is not defined in the visible
; part of this file (presumably a macro from the included
; vpx_ports/x86_abi_support.asm).
;
; shuf1b: 16 byte indices laid out as the pairs (i, i+5) for i = 0..7.
; NOTE(review): the label and layout suggest a pshufb control mask that puts
; source bytes i and i+5 side by side so that one pmaddubsw can apply the
; k0_k5 coefficient pair; the consumer is outside this chunk - confirm.
1457*fb1b10abSAndroid Build Coastguard WorkerSECTION_RODATA
1458*fb1b10abSAndroid Build Coastguard Workeralign 16
1459*fb1b10abSAndroid Build Coastguard Workershuf1b:
1460*fb1b10abSAndroid Build Coastguard Worker    db 0, 5, 1, 6, 2, 7, 3, 8, 4, 9, 5, 10, 6, 11, 7, 12
; shuf2b: 16 byte indices laid out as the pairs (i+2, i+4) for i = 0..7.
; There is no align directive of its own: the table is 16-byte aligned only
; because shuf1b directly above is aligned and exactly 16 bytes long.
; NOTE(review): presumably the pshufb mask that matches the k2_k4 coefficient
; pair; the consumer is outside this chunk - confirm.
1461*fb1b10abSAndroid Build Coastguard Workershuf2b:
1462*fb1b10abSAndroid Build Coastguard Worker    db 2, 4, 3, 5, 4, 6, 5, 7, 6, 8, 7, 9, 8, 10, 9, 11
; shuf3b: 16 byte indices laid out as the pairs (i+1, i+3) for i = 0..7.
; Like shuf2b it has no align directive; its 16-byte alignment depends on the
; two 16-byte tables (shuf1b, shuf2b) that precede it after the last align.
; NOTE(review): presumably the pshufb mask that matches the k1_k3 coefficient
; pair; the consumer is outside this chunk - confirm.
1463*fb1b10abSAndroid Build Coastguard Workershuf3b:
1464*fb1b10abSAndroid Build Coastguard Worker    db 1, 3, 2, 4, 3, 5, 4, 6, 5, 7, 6, 8, 7, 9, 8, 10
1465*fb1b10abSAndroid Build Coastguard Worker
; shuf2bfrom1: 16 byte indices chosen so that
;     shuf1b[shuf2bfrom1[i]] == shuf2b[i]    for i = 0..15.
; Selecting bytes with this table from data that was already permuted by
; shuf1b therefore yields the same ordering as permuting the original data
; by shuf2b.
; NOTE(review): presumably used as a pshufb mask on a register that already
; holds the shuf1b arrangement; the consumer is outside this chunk - confirm.
1466*fb1b10abSAndroid Build Coastguard Workeralign 16
1467*fb1b10abSAndroid Build Coastguard Workershuf2bfrom1:
1468*fb1b10abSAndroid Build Coastguard Worker    db  4, 8, 6, 1, 8, 3, 1, 5, 3, 7, 5, 9, 7,11, 9,13
; shuf3bfrom1: 16 byte indices chosen so that
;     shuf1b[shuf3bfrom1[i]] == shuf3b[i]    for i = 0..15.
; Selecting bytes with this table from data that was already permuted by
; shuf1b therefore yields the same ordering as permuting the original data
; by shuf3b.
; NOTE(review): presumably used as a pshufb mask on a register that already
; holds the shuf1b arrangement; the consumer is outside this chunk - confirm.
1469*fb1b10abSAndroid Build Coastguard Workeralign 16
1470*fb1b10abSAndroid Build Coastguard Workershuf3bfrom1:
1471*fb1b10abSAndroid Build Coastguard Worker    db  2, 6, 4, 8, 6, 1, 8, 3, 1, 5, 3, 7, 5, 9, 7,11
1472*fb1b10abSAndroid Build Coastguard Worker
; rd: rounding constant, eight 16-bit words of 0x40 (64).
; Code earlier in this file adds it to each pmaddubsw result with
; "paddw xmmN, [GLOBAL(rd)]" right before "psraw xmmN, VP8_FILTER_SHIFT".
; VP8_FILTER_SHIFT is 7, so 0x40 is half of the divisor (1 << 6) and the
; shift rounds to nearest instead of always rounding down.
1473*fb1b10abSAndroid Build Coastguard Workeralign 16
1474*fb1b10abSAndroid Build Coastguard Workerrd:
1475*fb1b10abSAndroid Build Coastguard Worker    times 8 dw 0x40
1476*fb1b10abSAndroid Build Coastguard Worker
; k0_k5: eight rows of 16 bytes; each row repeats one byte pair eight times.
; NOTE(review): going by the label, row n holds (tap 0, tap 5) of the six-tap
; filter for sub-pixel offset n - confirm against the code that indexes it.
; Taking the bytes at their written values, rows with the same index in
; k0_k5, k1_k3 and k2_k4 add up to 128 (VP8_FILTER_WEIGHT).
; k1_k3 and k2_k4 follow without an align directive, 128 and 256 bytes after
; this label; keep the three tables contiguous, in this order and at this
; size unless every user has been checked.
1477*fb1b10abSAndroid Build Coastguard Workeralign 16
1478*fb1b10abSAndroid Build Coastguard Workerk0_k5:
1479*fb1b10abSAndroid Build Coastguard Worker    times 8 db 0, 0             ; row 0 (marked as placeholder by the author)
1480*fb1b10abSAndroid Build Coastguard Worker    times 8 db 0, 0             ; row 1
1481*fb1b10abSAndroid Build Coastguard Worker    times 8 db 2, 1             ; row 2
1482*fb1b10abSAndroid Build Coastguard Worker    times 8 db 0, 0             ; row 3
1483*fb1b10abSAndroid Build Coastguard Worker    times 8 db 3, 3             ; row 4
1484*fb1b10abSAndroid Build Coastguard Worker    times 8 db 0, 0             ; row 5
1485*fb1b10abSAndroid Build Coastguard Worker    times 8 db 1, 2             ; row 6
1486*fb1b10abSAndroid Build Coastguard Worker    times 8 db 0, 0             ; row 7
; k1_k3: eight rows of 16 bytes; each row repeats one byte pair eight times.
; NOTE(review): going by the label, row n holds (tap 1, tap 3) of the six-tap
; filter for sub-pixel offset n - confirm against the code that indexes it.
; Negative entries are stored as two's-complement bytes.
; The table has no align directive; it sits 128 bytes after k0_k5 and is
; 16-byte aligned only because k0_k5 is aligned and exactly 128 bytes long.
1487*fb1b10abSAndroid Build Coastguard Workerk1_k3:
1488*fb1b10abSAndroid Build Coastguard Worker    times 8 db  0,    0         ; row 0 (marked as placeholder by the author)
1489*fb1b10abSAndroid Build Coastguard Worker    times 8 db  -6,  12         ; row 1
1490*fb1b10abSAndroid Build Coastguard Worker    times 8 db -11,  36         ; row 2
1491*fb1b10abSAndroid Build Coastguard Worker    times 8 db  -9,  50         ; row 3
1492*fb1b10abSAndroid Build Coastguard Worker    times 8 db -16,  77         ; row 4
1493*fb1b10abSAndroid Build Coastguard Worker    times 8 db  -6,  93         ; row 5
1494*fb1b10abSAndroid Build Coastguard Worker    times 8 db  -8, 108         ; row 6
1495*fb1b10abSAndroid Build Coastguard Worker    times 8 db  -1, 123         ; row 7
; k2_k4: eight rows of 16 bytes; each row repeats one byte pair eight times.
; NOTE(review): going by the label, row n holds (tap 2, tap 4) of the six-tap
; filter for sub-pixel offset n - confirm against the code that indexes it.
; Row 0 stores 128, which is the byte 0x80 and reads as -128 wherever the
; byte is interpreted as signed (as pmaddubsw does for its source operand);
; the author marks that row as a placeholder.
; NOTE(review): presumably row 0 is never used for filtering - confirm.
; The table has no align directive; it sits 256 bytes after k0_k5 and is
; 16-byte aligned only because k0_k5 and k1_k3 are each exactly 128 bytes.
1496*fb1b10abSAndroid Build Coastguard Workerk2_k4:
1497*fb1b10abSAndroid Build Coastguard Worker    times 8 db 128,    0        ; row 0 (marked as placeholder by the author)
1498*fb1b10abSAndroid Build Coastguard Worker    times 8 db 123,   -1        ; row 1
1499*fb1b10abSAndroid Build Coastguard Worker    times 8 db 108,   -8        ; row 2
1500*fb1b10abSAndroid Build Coastguard Worker    times 8 db  93,   -6        ; row 3
1501*fb1b10abSAndroid Build Coastguard Worker    times 8 db  77,  -16        ; row 4
1502*fb1b10abSAndroid Build Coastguard Worker    times 8 db  50,   -9        ; row 5
1503*fb1b10abSAndroid Build Coastguard Worker    times 8 db  36,  -11        ; row 6
1504*fb1b10abSAndroid Build Coastguard Worker    times 8 db  12,   -6        ; row 7
; vp8_bilinear_filters_ssse3: eight rows of 16 bytes. Row n repeats the byte
; pair (128 - 16*n, 16*n) eight times, so every pair sums to 128
; (VP8_FILTER_WEIGHT).
; A comment in the code earlier in this file points out that pmaddubsw treats
; its source operand as signed bytes, so the weight 128 in row 0 acts as -128
; there and offset 0 is not handled correctly by a plain multiply.
; NOTE(review): presumably rows are selected as offset * 16 bytes from this
; label and callers avoid row 0; the indexing code is outside this chunk -
; confirm.
1505*fb1b10abSAndroid Build Coastguard Workeralign 16
1506*fb1b10abSAndroid Build Coastguard Workervp8_bilinear_filters_ssse3:
1507*fb1b10abSAndroid Build Coastguard Worker    times 8 db 128, 0           ; row 0: weights 128 / 0
1508*fb1b10abSAndroid Build Coastguard Worker    times 8 db 112, 16          ; row 1
1509*fb1b10abSAndroid Build Coastguard Worker    times 8 db 96,  32          ; row 2
1510*fb1b10abSAndroid Build Coastguard Worker    times 8 db 80,  48          ; row 3
1511*fb1b10abSAndroid Build Coastguard Worker    times 8 db 64,  64          ; row 4: equal weights
1512*fb1b10abSAndroid Build Coastguard Worker    times 8 db 48,  80          ; row 5
1513*fb1b10abSAndroid Build Coastguard Worker    times 8 db 32,  96          ; row 6
1514*fb1b10abSAndroid Build Coastguard Worker    times 8 db 16,  112         ; row 7
1515*fb1b10abSAndroid Build Coastguard Worker
1516