xref: /aosp_15_r20/external/libvpx/vp8/common/x86/idctllm_sse2.asm (revision fb1b10ab9aebc7c7068eedab379b749d7e3900be)
;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;
10*fb1b10abSAndroid Build Coastguard Worker
11*fb1b10abSAndroid Build Coastguard Worker
12*fb1b10abSAndroid Build Coastguard Worker%include "vpx_ports/x86_abi_support.asm"
13*fb1b10abSAndroid Build Coastguard Worker
;void vp8_idct_dequant_0_2x_sse2
; (
;   short *qcoeff       - 0
;   short *dequant      - 1
;   unsigned char *dst  - 2
;   int dst_stride      - 3
; )
;
; DC-only dequantise-and-add for two 4x4 blocks handled side by side
; (an 8-pixel-wide, 4-row region of dst).
;
;   * Block 0's DC is qcoeff[0]; block 1's DC is the word at byte offset 32,
;     i.e. qcoeff[16].  Both are multiplied by dequant[0].
;   * The 32-bit stores used to clear the coefficients zero qcoeff[0..1] and
;     qcoeff[16..17].
;   * Each DC has `fours` added and is arithmetically shifted right by 3, then
;     added to the existing dst pixels; packuswb saturates to unsigned bytes.
;     `fours` is defined outside this view -- presumably eight words of 4,
;     giving (dc + 4) >> 3; TODO confirm.
;
; Arguments are read through arg(n) after SHADOW_ARGS_TO_STACK; those macros
; (and GET_GOT / GLOBAL / sym) are not defined in this view, presumably they
; come from the included x86_abi_support.asm.
;
; Registers written: rax, rcx, rdx, xmm0-xmm5 (plus whatever the prolog
; macros touch).

SECTION .text

globalsym(vp8_idct_dequant_0_2x_sse2)
sym(vp8_idct_dequant_0_2x_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 4
    GET_GOT     rbx
    ; end prolog

        mov         rdx,            arg(1) ; dequant
        mov         rax,            arg(0) ; qcoeff

    ; 32-bit loads: lane 0 = qcoeff[0] / dequant[0]; lane 1 is loaded as
    ; well but is discarded by the broadcast further down
        movd        xmm4,           [rax]
        movd        xmm5,           [rdx]

    ; lane 4 (first word of the high quadword) = qcoeff[16] / dequant[0]
        pinsrw      xmm4,           [rax+32],   4
        pinsrw      xmm5,           [rdx],      4

    ; lane 0 = block 0 dequantised DC, lane 4 = block 1 dequantised DC
        pmullw      xmm4,           xmm5

    ; Zero out xmm5, for use unpacking
        pxor        xmm5,           xmm5

    ; clear coeffs (4 bytes at each location)
        movd        [rax],          xmm5
        movd        [rax+32],       xmm5

    ; rax and rdx are re-used from here on: rax = dst, rdx = dst_stride
        mov         rax,            arg(2) ; dst
        movsxd      rdx,            dword ptr arg(3) ; dst_stride

    ; broadcast lane 0 over lanes 0-3 and lane 4 over lanes 4-7
        pshuflw     xmm4,           xmm4,       00000000b
        pshufhw     xmm4,           xmm4,       00000000b

        lea         rcx,            [rdx + rdx*2]   ; rcx = dst_stride * 3
        paddw       xmm4,           [GLOBAL(fours)] ; rounding term

        psraw       xmm4,           3

    ; load 8 prediction bytes from each of the 4 rows
        movq        xmm0,           [rax]
        movq        xmm1,           [rax+rdx]
        movq        xmm2,           [rax+2*rdx]
        movq        xmm3,           [rax+rcx]

    ; widen bytes to words (xmm5 is zero)
        punpcklbw   xmm0,           xmm5
        punpcklbw   xmm1,           xmm5
        punpcklbw   xmm2,           xmm5
        punpcklbw   xmm3,           xmm5


    ; Add to predict buffer
        paddw       xmm0,           xmm4
        paddw       xmm1,           xmm4
        paddw       xmm2,           xmm4
        paddw       xmm3,           xmm4

    ; pack up before storing (words -> unsigned bytes with saturation)
        packuswb    xmm0,           xmm5
        packuswb    xmm1,           xmm5
        packuswb    xmm2,           xmm5
        packuswb    xmm3,           xmm5

    ; store blocks back out (low 8 bytes of each register)
        movq        [rax],          xmm0
        movq        [rax + rdx],    xmm1

        lea         rax,            [rax + 2*rdx]   ; advance to row 2

        movq        [rax],          xmm2
        movq        [rax + rdx],    xmm3

    ; begin epilog
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret
98*fb1b10abSAndroid Build Coastguard Worker
;void vp8_idct_dequant_full_2x_sse2
; (
;   short *qcoeff       - 0
;   short *dequant      - 1
;   unsigned char *dst  - 2
;   int dst_stride      - 3
; )
;
; Dequantise, inverse-transform and add two 4x4 blocks in one go.
;
;   * 64 bytes are read from qcoeff (block 0 at +0, block 1 at +32) and then
;     overwritten with zeros.
;   * Both blocks are multiplied by the same 16 dequant words.
;   * qcoeff is accessed with movdqa and dequant is a direct pmullw memory
;     operand, so both must be 16-byte aligned.
;   * The transform is two identical butterfly passes with a 4x4 word
;     transpose after each; the second pass adds `fours` and shifts right
;     by 3 before the result is added to dst and saturated to bytes.
;   * Every register holds block 0 in its low four words and block 1 in its
;     high four words, so the two blocks land next to each other in dst
;     (8 bytes per row, 4 rows).
;
; x_s1sqr2, x_c1sqr2less1 and fours are tables defined outside this view; the
; comments below follow their names (sin(pi/8)*sqrt(2), cos(pi/8)*sqrt(2)-1,
; rounding constant) -- TODO confirm against their definitions.
;
; Registers written: rax, rcx, rdx, rdi, xmm0-xmm7.  rsi is pushed and popped
; but not otherwise referenced in this routine.
globalsym(vp8_idct_dequant_full_2x_sse2)
sym(vp8_idct_dequant_full_2x_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 4
    SAVE_XMM 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

        mov         rax,            arg(0) ; qcoeff
        mov         rdx,            arg(1)  ; dequant
        mov         rdi,            arg(2) ; dst


    ; Zero out xmm7, used below to clear the coefficient buffer
        pxor        xmm7,           xmm7


    ; note the swap of xmm1 and xmm2 in the load order, necessary for the
    ;   shuffles below to produce sensible data:
    ;   xmm0 = block 0 rows 0-1, xmm2 = block 0 rows 2-3,
    ;   xmm1 = block 1 rows 0-1, xmm3 = block 1 rows 2-3
        movdqa      xmm0,           [rax]
        movdqa      xmm2,           [rax+16]
        movdqa      xmm1,           [rax+32]
        movdqa      xmm3,           [rax+48]

    ; Clear out coeffs
        movdqa      [rax],          xmm7
        movdqa      [rax+16],       xmm7
        movdqa      [rax+32],       xmm7
        movdqa      [rax+48],       xmm7

    ; dequantize qcoeff buffer (same factors for both blocks)
        pmullw      xmm0,           [rdx]
        pmullw      xmm2,           [rdx+16]
        pmullw      xmm1,           [rdx]
        pmullw      xmm3,           [rdx+16]
    ; rdx is re-used from here on as the sign-extended stride
        movsxd      rdx,            dword ptr arg(3) ; dst_stride

    ; repack so block 0 row x and block 1 row x are together:
    ;   afterwards xmmN = row N, block 0 in the low half, block 1 in the high
    ;   half.  pshufd 11011000b swaps the two middle dwords.
        movdqa      xmm4,           xmm0
        punpckldq   xmm0,           xmm1
        punpckhdq   xmm4,           xmm1

        pshufd      xmm0,           xmm0,       11011000b
        pshufd      xmm1,           xmm4,       11011000b

        movdqa      xmm4,           xmm2
        punpckldq   xmm2,           xmm3
        punpckhdq   xmm4,           xmm3

        pshufd      xmm2,           xmm2,       11011000b
        pshufd      xmm3,           xmm4,       11011000b

    ; first pass: butterflies are lane-wise across xmm0..xmm3, i.e. they
    ; combine the four rows of each column
        psubw       xmm0,           xmm2        ; b1 = 0-2
        paddw       xmm2,           xmm2        ; 2 * row 2

        movdqa      xmm5,           xmm1
        paddw       xmm2,           xmm0        ; a1 = 0+2  (2*row2 + b1)

    ; multiply-high followed by adding the input back
        pmulhw      xmm5,           [GLOBAL(x_s1sqr2)]
        lea         rcx,            [rdx + rdx*2]   ;dst_stride * 3
        paddw       xmm5,           xmm1        ; ip1 * sin(pi/8) * sqrt(2)

        movdqa      xmm7,           xmm3
        pmulhw      xmm7,           [GLOBAL(x_c1sqr2less1)]

        paddw       xmm7,           xmm3        ; ip3 * cos(pi/8) * sqrt(2)
        psubw       xmm7,           xmm5        ; c1 = (ip3 term) - (ip1 term)

        movdqa      xmm5,           xmm1
        movdqa      xmm4,           xmm3

        pmulhw      xmm5,           [GLOBAL(x_c1sqr2less1)]
        paddw       xmm5,           xmm1        ; ip1 * cos(pi/8) * sqrt(2)

        pmulhw      xmm3,           [GLOBAL(x_s1sqr2)]
        paddw       xmm3,           xmm4        ; ip3 * sin(pi/8) * sqrt(2)

        paddw       xmm3,           xmm5        ; d1
        movdqa      xmm6,           xmm2        ; a1

    ; NOTE(review): the transpose below pairs xmm2 with xmm0 and xmm4 with
    ; xmm6, so with c1 as computed above xmm0 acts as row 1 and xmm4 as row 2.
        movdqa      xmm4,           xmm0        ; b1
        paddw       xmm2,           xmm3        ; a1 + d1  (row 0)

        paddw       xmm4,           xmm7        ; b1 + c1  (row 2)
        psubw       xmm0,           xmm7        ; b1 - c1  (row 1)

        psubw       xmm6,           xmm3        ; a1 - d1  (row 3)

    ; transpose for the second pass
    ; lane comments list words high-to-low; 0xx = block 0, 1xx = block 1,
    ; xx = 4*row + column of the first-pass output
        movdqa      xmm7,           xmm2        ; 103 102 101 100 003 002 001 000
        punpcklwd   xmm2,           xmm0        ; 007 003 006 002 005 001 004 000
        punpckhwd   xmm7,           xmm0        ; 107 103 106 102 105 101 104 100

        movdqa      xmm5,           xmm4        ; 111 110 109 108 011 010 009 008
        punpcklwd   xmm4,           xmm6        ; 015 011 014 010 013 009 012 008
        punpckhwd   xmm5,           xmm6        ; 115 111 114 110 113 109 112 108


        movdqa      xmm1,           xmm2        ; 007 003 006 002 005 001 004 000
        punpckldq   xmm2,           xmm4        ; 013 009 005 001 012 008 004 000
        punpckhdq   xmm1,           xmm4        ; 015 011 007 003 014 010 006 002

        movdqa      xmm6,           xmm7        ; 107 103 106 102 105 101 104 100
        punpckldq   xmm7,           xmm5        ; 113 109 105 101 112 108 104 100
        punpckhdq   xmm6,           xmm5        ; 115 111 107 103 114 110 106 102


        movdqa      xmm5,           xmm2        ; 013 009 005 001 012 008 004 000
        punpckldq   xmm2,           xmm7        ; 112 108 012 008 104 100 004 000
        punpckhdq   xmm5,           xmm7        ; 113 109 013 009 105 101 005 001

        movdqa      xmm7,           xmm1        ; 015 011 007 003 014 010 006 002
        punpckldq   xmm1,           xmm6        ; 114 110 014 010 106 102 006 002
        punpckhdq   xmm7,           xmm6        ; 115 111 015 011 107 103 007 003

    ; swap the middle dwords so each register is again block 0 | block 1
        pshufd      xmm0,           xmm2,       11011000b
        pshufd      xmm2,           xmm1,       11011000b

        pshufd      xmm1,           xmm5,       11011000b
        pshufd      xmm3,           xmm7,       11011000b

    ; second pass: same butterflies on the transposed data
        psubw       xmm0,           xmm2            ; b1 = 0-2
        paddw       xmm2,           xmm2            ; 2 * input 2

        movdqa      xmm5,           xmm1
        paddw       xmm2,           xmm0            ; a1 = 0+2

        pmulhw      xmm5,           [GLOBAL(x_s1sqr2)]
        paddw       xmm5,           xmm1            ; ip1 * sin(pi/8) * sqrt(2)

        movdqa      xmm7,           xmm3
        pmulhw      xmm7,           [GLOBAL(x_c1sqr2less1)]

        paddw       xmm7,           xmm3            ; ip3 * cos(pi/8) * sqrt(2)
        psubw       xmm7,           xmm5            ; c1 = (ip3 term) - (ip1 term)

        movdqa      xmm5,           xmm1
        movdqa      xmm4,           xmm3

        pmulhw      xmm5,           [GLOBAL(x_c1sqr2less1)]
        paddw       xmm5,           xmm1            ; ip1 * cos(pi/8) * sqrt(2)

        pmulhw      xmm3,           [GLOBAL(x_s1sqr2)]
        paddw       xmm3,           xmm4            ; ip3 * sin(pi/8) * sqrt(2)

        paddw       xmm3,           xmm5            ; d1

    ; fold the rounding term into a1 and b1 so all four outputs carry it
        paddw       xmm0,           [GLOBAL(fours)]

        paddw       xmm2,           [GLOBAL(fours)]
        movdqa      xmm6,           xmm2            ; a1

        movdqa      xmm4,           xmm0            ; b1
        paddw       xmm2,           xmm3            ; a1 + d1

        paddw       xmm4,           xmm7            ; b1 + c1
        psubw       xmm0,           xmm7            ; b1 - c1

        psubw       xmm6,           xmm3            ; a1 - d1
        psraw       xmm2,           3

        psraw       xmm0,           3
        psraw       xmm4,           3

        psraw       xmm6,           3

    ; transpose to save
    ; same permutation as the first transpose; the lane labels are positional
    ; (index within the second-pass output), not original coefficient indices
        movdqa      xmm7,           xmm2        ; 103 102 101 100 003 002 001 000
        punpcklwd   xmm2,           xmm0        ; 007 003 006 002 005 001 004 000
        punpckhwd   xmm7,           xmm0        ; 107 103 106 102 105 101 104 100

        movdqa      xmm5,           xmm4        ; 111 110 109 108 011 010 009 008
        punpcklwd   xmm4,           xmm6        ; 015 011 014 010 013 009 012 008
        punpckhwd   xmm5,           xmm6        ; 115 111 114 110 113 109 112 108


        movdqa      xmm1,           xmm2        ; 007 003 006 002 005 001 004 000
        punpckldq   xmm2,           xmm4        ; 013 009 005 001 012 008 004 000
        punpckhdq   xmm1,           xmm4        ; 015 011 007 003 014 010 006 002

        movdqa      xmm6,           xmm7        ; 107 103 106 102 105 101 104 100
        punpckldq   xmm7,           xmm5        ; 113 109 105 101 112 108 104 100
        punpckhdq   xmm6,           xmm5        ; 115 111 107 103 114 110 106 102


        movdqa      xmm5,           xmm2        ; 013 009 005 001 012 008 004 000
        punpckldq   xmm2,           xmm7        ; 112 108 012 008 104 100 004 000
        punpckhdq   xmm5,           xmm7        ; 113 109 013 009 105 101 005 001

        movdqa      xmm7,           xmm1        ; 015 011 007 003 014 010 006 002
        punpckldq   xmm1,           xmm6        ; 114 110 014 010 106 102 006 002
        punpckhdq   xmm7,           xmm6        ; 115 111 015 011 107 103 007 003

    ; xmm0..xmm3 = residual rows 0..3, block 0 | block 1
        pshufd      xmm0,           xmm2,       11011000b
        pshufd      xmm2,           xmm1,       11011000b

        pshufd      xmm1,           xmm5,       11011000b
        pshufd      xmm3,           xmm7,       11011000b

    ; xmm7 was used as scratch above; zero it again for unpack/pack
        pxor        xmm7,           xmm7

    ; Load up predict blocks (8 bytes per row), widen to words and add
        movq        xmm4,           [rdi]
        movq        xmm5,           [rdi+rdx]

        punpcklbw   xmm4,           xmm7
        punpcklbw   xmm5,           xmm7

        paddw       xmm0,           xmm4
        paddw       xmm1,           xmm5

        movq        xmm4,           [rdi+2*rdx]
        movq        xmm5,           [rdi+rcx]

        punpcklbw   xmm4,           xmm7
        punpcklbw   xmm5,           xmm7

        paddw       xmm2,           xmm4
        paddw       xmm3,           xmm5

; local label; nothing in this routine jumps to it
.finish:

    ; pack up before storing (words -> unsigned bytes with saturation)
        packuswb    xmm0,           xmm7
        packuswb    xmm1,           xmm7
        packuswb    xmm2,           xmm7
        packuswb    xmm3,           xmm7

    ; store blocks back out (low 8 bytes of each register)
        movq        [rdi],          xmm0
        movq        [rdi + rdx],    xmm1
        movq        [rdi + rdx*2],  xmm2
        movq        [rdi + rcx],    xmm3

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
354*fb1b10abSAndroid Build Coastguard Worker
;void vp8_idct_dequant_dc_0_2x_sse2
; (
;   short *qcoeff       - 0
;   short *dequant      - 1
;   unsigned char *dst  - 2
;   int dst_stride      - 3
;   short *dc           - 4
; )
;
; DC-only add for two 4x4 blocks handled side by side, with the two DC values
; supplied through `dc` (dc[0] for the left block, dc[1] for the right one).
;
;   * Each DC has `fours` added and is arithmetically shifted right by 3, then
;     added to the existing dst pixels (8 bytes per row, 4 rows); packuswb
;     saturates to unsigned bytes.  `fours` is defined outside this view --
;     presumably eight words of 4; TODO confirm.
;   * No multiplication is performed here and `dequant` is never read.
;   * The qcoeff pointer is loaded into rax but never dereferenced, so the
;     coefficient buffer is neither read nor cleared by this routine.
;
; Registers written: rax, rcx, rdx, rdi, xmm0-xmm5.
globalsym(vp8_idct_dequant_dc_0_2x_sse2)
sym(vp8_idct_dequant_dc_0_2x_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 5
    GET_GOT     rbx
    push        rdi
    ; end prolog

    ; special case when 2 blocks have 0 or 1 coeffs
    ; dc is set as first coeff, so no need to load qcoeff
    ; (rax is not used again below)
        mov         rax,            arg(0) ; qcoeff

        mov         rdi,            arg(2) ; dst
        mov         rdx,            arg(4) ; dc

    ; Zero out xmm5, for use unpacking
        pxor        xmm5,           xmm5

    ; load up 2 dc words here == 2*16 = doubleword
        movd        xmm4,           [rdx]

    ; rdx is re-used from here on as the sign-extended stride
        movsxd      rdx,            dword ptr arg(3) ; dst_stride
        lea         rcx, [rdx + rdx*2]              ; rcx = dst_stride * 3
    ; Load up predict blocks
        movq        xmm0,           [rdi]
        movq        xmm1,           [rdi+rdx*1]
        movq        xmm2,           [rdi+rdx*2]
        movq        xmm3,           [rdi+rcx]

    ; Duplicate and expand dc across:
    ;   punpcklwd -> dc0 dc0 dc1 dc1 in the low four words
    ;   punpckldq -> dc0 x4 in the low half, dc1 x4 in the high half
        punpcklwd   xmm4,           xmm4
        punpckldq   xmm4,           xmm4

    ; Add the rounding term and downshift
        paddw       xmm4,           [GLOBAL(fours)]
        psraw       xmm4,           3

    ; Predict buffer needs to be expanded from bytes to words
        punpcklbw   xmm0,           xmm5
        punpcklbw   xmm1,           xmm5
        punpcklbw   xmm2,           xmm5
        punpcklbw   xmm3,           xmm5

    ; Add to predict buffer
        paddw       xmm0,           xmm4
        paddw       xmm1,           xmm4
        paddw       xmm2,           xmm4
        paddw       xmm3,           xmm4

    ; pack up before storing (words -> unsigned bytes with saturation)
        packuswb    xmm0,           xmm5
        packuswb    xmm1,           xmm5
        packuswb    xmm2,           xmm5
        packuswb    xmm3,           xmm5

    ; store blocks back out (low 8 bytes of each register)
        movq        [rdi],          xmm0
        movq        [rdi + rdx],    xmm1
        movq        [rdi + rdx*2],  xmm2
        movq        [rdi + rcx],    xmm3

    ; begin epilog
    pop         rdi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret
;void vp8_idct_dequant_dc_full_2x_sse2
; (
;   short *qcoeff       - 0
;   short *dequant      - 1
;   unsigned char *dst  - 2
;   int dst_stride      - 3
;   short *dc           - 4
; )
;
; Dequantizes two 4x4 coefficient blocks, overrides each block's first
; coefficient with a separately supplied DC value, runs both passes of the
; inverse transform on the two blocks in parallel and adds the result to an
; 8x4 pixel area at dst.
;
;   qcoeff     - 64 bytes (two consecutive blocks of 16 shorts) are read and
;                then overwritten with zeros. Accessed with movdqa, so the
;                pointer must be 16-byte aligned.
;   dequant    - 16 shorts, applied to both blocks. Used as a pmullw memory
;                operand, so it must be 16-byte aligned.
;   dst        - 8 bytes are read and rewritten on each of 4 rows that are
;                dst_stride bytes apart. Bytes 0-3 of a row belong to the
;                block at qcoeff[0..15], bytes 4-7 to the one at
;                qcoeff[16..31].
;   dst_stride - sign-extended from 32 bits before use.
;   dc         - dc[0] / dc[1] replace coefficient 0 of the first / second
;                block after dequantization.
;
; Register use: rax = qcoeff, rdi = dst (saved and restored here),
; rdx = dequant, then dc, then dst_stride, rcx = 3 * dst_stride,
; xmm0-xmm7 = work registers. rbx is handed to GET_GOT.
globalsym(vp8_idct_dequant_dc_full_2x_sse2)
sym(vp8_idct_dequant_dc_full_2x_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 5
    SAVE_XMM 7
    GET_GOT     rbx
    push        rdi
    ; end prolog

    ; Full transform: every coefficient of both blocks is loaded below.
    ; Only the two DC terms come from the separate dc array.
        mov         rax,            arg(0) ; qcoeff
        mov         rdx,            arg(1)  ; dequant

        mov         rdi,            arg(2) ; dst

    ; Zero out xmm7, used here to clear the coefficient buffer
        pxor        xmm7,           xmm7


    ; note the swapped use of xmm1 and xmm2: xmm0/xmm1 receive the first
    ;   16 bytes of each block and xmm2/xmm3 the second 16 bytes, which is
    ;   what the repacking shuffles below need to produce sensible data
        movdqa      xmm0,           [rax]
        movdqa      xmm2,           [rax+16]
        movdqa      xmm1,           [rax+32]
        movdqa      xmm3,           [rax+48]

    ; Clear out coeffs
        movdqa      [rax],          xmm7
        movdqa      [rax+16],       xmm7
        movdqa      [rax+32],       xmm7
        movdqa      [rax+48],       xmm7

    ; dequantize qcoeff buffer (both blocks use the same dequant table)
        pmullw      xmm0,           [rdx]
        pmullw      xmm2,           [rdx+16]
        pmullw      xmm1,           [rdx]
        pmullw      xmm3,           [rdx+16]

    ; DC component (rdx is free again, dequant is no longer needed)
        mov         rdx,            arg(4)

    ; repack so block 0 row x and block 1 row x are together:
    ;   afterwards xmmN holds row N of block 0 in its low four words and
    ;   row N of block 1 in its high four words
        movdqa      xmm4,           xmm0
        punpckldq   xmm0,           xmm1
        punpckhdq   xmm4,           xmm1

        pshufd      xmm0,           xmm0,       11011000b
        pshufd      xmm1,           xmm4,       11011000b

        movdqa      xmm4,           xmm2
        punpckldq   xmm2,           xmm3
        punpckhdq   xmm4,           xmm3

        pshufd      xmm2,           xmm2,       11011000b
        pshufd      xmm3,           xmm4,       11011000b

    ; insert DC component: word 0 is coefficient 0 of block 0,
    ;   word 4 is coefficient 0 of block 1
        pinsrw      xmm0,           [rdx],      0
        pinsrw      xmm0,           [rdx+2],    4

    ; first pass: one butterfly per 16-bit lane over xmm0..xmm3 (ip0..ip3)
        psubw       xmm0,           xmm2        ; b1 = ip0 - ip2
        paddw       xmm2,           xmm2        ; 2 * ip2

        movdqa      xmm5,           xmm1
        paddw       xmm2,           xmm0        ; a1 = ip0 + ip2

    ; x_s1sqr2 has its top bit set, so the signed pmulhw result is short by
    ;   exactly one multiplicand; the paddw adds it back
        pmulhw      xmm5,           [GLOBAL(x_s1sqr2)]
        paddw       xmm5,           xmm1        ; ip1 * sin(pi/8) * sqrt(2)

    ; x_c1sqr2less1 is the factor minus one; the paddw supplies the "+ 1"
        movdqa      xmm7,           xmm3
        pmulhw      xmm7,           [GLOBAL(x_c1sqr2less1)]

        paddw       xmm7,           xmm3        ; ip3 * cos(pi/8) * sqrt(2)
        psubw       xmm7,           xmm5        ; c1 = ip3 term - ip1 term

        movdqa      xmm5,           xmm1
        movdqa      xmm4,           xmm3

        pmulhw      xmm5,           [GLOBAL(x_c1sqr2less1)]
        paddw       xmm5,           xmm1        ; ip1 * cos(pi/8) * sqrt(2)

        pmulhw      xmm3,           [GLOBAL(x_s1sqr2)]
        paddw       xmm3,           xmm4        ; ip3 * sin(pi/8) * sqrt(2)

        paddw       xmm3,           xmm5        ; d1
        movdqa      xmm6,           xmm2        ; a1

        movdqa      xmm4,           xmm0        ; b1
        paddw       xmm2,           xmm3        ; a1 + d1

        paddw       xmm4,           xmm7        ; b1 + c1
        psubw       xmm0,           xmm7        ; b1 - c1

        psubw       xmm6,           xmm3        ; a1 - d1

    ; transpose for the second pass
    ;   the element numbers below treat xmm2, xmm0, xmm4, xmm6 (in that
    ;   order) as rows 0..3; 0xx = block 0, 1xx = block 1
        movdqa      xmm7,           xmm2        ; 103 102 101 100 003 002 001 000
        punpcklwd   xmm2,           xmm0        ; 007 003 006 002 005 001 004 000
        punpckhwd   xmm7,           xmm0        ; 107 103 106 102 105 101 104 100

        movdqa      xmm5,           xmm4        ; 111 110 109 108 011 010 009 008
        punpcklwd   xmm4,           xmm6        ; 015 011 014 010 013 009 012 008
        punpckhwd   xmm5,           xmm6        ; 115 111 114 110 113 109 112 108


        movdqa      xmm1,           xmm2        ; 007 003 006 002 005 001 004 000
        punpckldq   xmm2,           xmm4        ; 013 009 005 001 012 008 004 000
        punpckhdq   xmm1,           xmm4        ; 015 011 007 003 014 010 006 002

        movdqa      xmm6,           xmm7        ; 107 103 106 102 105 101 104 100
        punpckldq   xmm7,           xmm5        ; 113 109 105 101 112 108 104 100
        punpckhdq   xmm6,           xmm5        ; 115 111 107 103 114 110 106 102


        movdqa      xmm5,           xmm2        ; 013 009 005 001 012 008 004 000
        punpckldq   xmm2,           xmm7        ; 112 108 012 008 104 100 004 000
        punpckhdq   xmm5,           xmm7        ; 113 109 013 009 105 101 005 001

        movdqa      xmm7,           xmm1        ; 015 011 007 003 014 010 006 002
        punpckldq   xmm1,           xmm6        ; 114 110 014 010 106 102 006 002
        punpckhdq   xmm7,           xmm6        ; 115 111 015 011 107 103 007 003

    ; swap the two middle dwords so that each register holds one column of
    ;   block 0 in its low half and the same column of block 1 in its high half
        pshufd      xmm0,           xmm2,       11011000b
        pshufd      xmm2,           xmm1,       11011000b

        pshufd      xmm1,           xmm5,       11011000b
        pshufd      xmm3,           xmm7,       11011000b

    ; second pass: same butterfly, plus rounding and a shift right by 3
        psubw       xmm0,           xmm2            ; b1 = ip0 - ip2
        paddw       xmm2,           xmm2            ; 2 * ip2

        movdqa      xmm5,           xmm1
        paddw       xmm2,           xmm0            ; a1 = ip0 + ip2

        pmulhw      xmm5,           [GLOBAL(x_s1sqr2)]
        paddw       xmm5,           xmm1            ; ip1 * sin(pi/8) * sqrt(2)

        movdqa      xmm7,           xmm3
        pmulhw      xmm7,           [GLOBAL(x_c1sqr2less1)]

        paddw       xmm7,           xmm3            ; ip3 * cos(pi/8) * sqrt(2)
        psubw       xmm7,           xmm5            ; c1 = ip3 term - ip1 term

        movdqa      xmm5,           xmm1
        movdqa      xmm4,           xmm3

        pmulhw      xmm5,           [GLOBAL(x_c1sqr2less1)]
        paddw       xmm5,           xmm1            ; ip1 * cos(pi/8) * sqrt(2)

        pmulhw      xmm3,           [GLOBAL(x_s1sqr2)]
        paddw       xmm3,           xmm4            ; ip3 * sin(pi/8) * sqrt(2)

        paddw       xmm3,           xmm5            ; d1

    ; add the rounding term to a1 and b1 only: each of the four outputs
    ;   contains exactly one of them, so each gets +4 before the shift
        paddw       xmm0,           [GLOBAL(fours)]

        paddw       xmm2,           [GLOBAL(fours)]
        movdqa      xmm6,           xmm2            ; a1

        movdqa      xmm4,           xmm0            ; b1
        paddw       xmm2,           xmm3            ; a1 + d1

        paddw       xmm4,           xmm7            ; b1 + c1
        psubw       xmm0,           xmm7            ; b1 - c1

        psubw       xmm6,           xmm3            ; a1 - d1
        psraw       xmm2,           3

        psraw       xmm0,           3
        psraw       xmm4,           3

        psraw       xmm6,           3

    ; transpose to save (same shuffle sequence and numbering as above)
        movdqa      xmm7,           xmm2        ; 103 102 101 100 003 002 001 000
        punpcklwd   xmm2,           xmm0        ; 007 003 006 002 005 001 004 000
        punpckhwd   xmm7,           xmm0        ; 107 103 106 102 105 101 104 100

        movdqa      xmm5,           xmm4        ; 111 110 109 108 011 010 009 008
        punpcklwd   xmm4,           xmm6        ; 015 011 014 010 013 009 012 008
        punpckhwd   xmm5,           xmm6        ; 115 111 114 110 113 109 112 108


        movdqa      xmm1,           xmm2        ; 007 003 006 002 005 001 004 000
        punpckldq   xmm2,           xmm4        ; 013 009 005 001 012 008 004 000
        punpckhdq   xmm1,           xmm4        ; 015 011 007 003 014 010 006 002

        movdqa      xmm6,           xmm7        ; 107 103 106 102 105 101 104 100
        punpckldq   xmm7,           xmm5        ; 113 109 105 101 112 108 104 100
        punpckhdq   xmm6,           xmm5        ; 115 111 107 103 114 110 106 102


        movdqa      xmm5,           xmm2        ; 013 009 005 001 012 008 004 000
        punpckldq   xmm2,           xmm7        ; 112 108 012 008 104 100 004 000
        punpckhdq   xmm5,           xmm7        ; 113 109 013 009 105 101 005 001

        movdqa      xmm7,           xmm1        ; 015 011 007 003 014 010 006 002
        punpckldq   xmm1,           xmm6        ; 114 110 014 010 106 102 006 002
        punpckhdq   xmm7,           xmm6        ; 115 111 015 011 107 103 007 003

        pshufd      xmm0,           xmm2,       11011000b
        pshufd      xmm2,           xmm1,       11011000b

        pshufd      xmm1,           xmm5,       11011000b
        pshufd      xmm3,           xmm7,       11011000b

    ; xmm7 was used as scratch above; zero it again for the byte<->word steps
        pxor        xmm7,           xmm7

    ; Load up predict blocks: 8 pixels from each of the 4 dst rows, widened
    ;   to words and added to the residual rows in xmm0..xmm3
        movsxd      rdx,            dword ptr arg(3) ; dst_stride
        movq        xmm4,           [rdi]
        movq        xmm5,           [rdi+rdx]
        lea         rcx,            [rdx + rdx*2]   ; 3 * dst_stride

        punpcklbw   xmm4,           xmm7
        punpcklbw   xmm5,           xmm7

        paddw       xmm0,           xmm4
        paddw       xmm1,           xmm5

        movq        xmm4,           [rdi+rdx*2]
        movq        xmm5,           [rdi+rcx]

        punpcklbw   xmm4,           xmm7
        punpcklbw   xmm5,           xmm7

        paddw       xmm2,           xmm4
        paddw       xmm3,           xmm5

    ; pack up before storing (unsigned saturation clamps to 0..255)
        packuswb    xmm0,           xmm7
        packuswb    xmm1,           xmm7
        packuswb    xmm2,           xmm7
        packuswb    xmm3,           xmm7

    ; store blocks back out; rdx (dst_stride) and rcx (3 * dst_stride) are
    ;   still live from the predictor loads, so no reload is needed
        movq        [rdi],          xmm0
        movq        [rdi + rdx],    xmm1
        movq        [rdi + rdx*2],  xmm2
        movq        [rdi + rcx],    xmm3


    ; begin epilog
    pop         rdi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
700*fb1b10abSAndroid Build Coastguard Worker
SECTION_RODATA

; Rounding term: eight words of 4, added before an arithmetic shift right
; by 3 in the routines of this file.
align 16
fours:
    dw 0x0004, 0x0004, 0x0004, 0x0004, 0x0004, 0x0004, 0x0004, 0x0004

; sin(pi/8) * sqrt(2) in 16-bit fixed point (0x8A8C / 65536 ~= 0.5412).
; The value has its top bit set, so users follow pmulhw with a paddw of
; the multiplicand.
align 16
x_s1sqr2:
    dw 0x8A8C, 0x8A8C, 0x8A8C, 0x8A8C, 0x8A8C, 0x8A8C, 0x8A8C, 0x8A8C

; cos(pi/8) * sqrt(2) - 1 in 16-bit fixed point (0x4E7B / 65536 ~= 0.3066).
; Users add the multiplicand after pmulhw to restore the "+ 1".
align 16
x_c1sqr2less1:
    dw 0x4E7B, 0x4E7B, 0x4E7B, 0x4E7B, 0x4E7B, 0x4E7B, 0x4E7B, 0x4E7B
711