xref: /aosp_15_r20/external/libvpx/vp8/encoder/x86/dct_sse2.asm (revision fb1b10ab9aebc7c7068eedab379b749d7e3900be)
1*fb1b10abSAndroid Build Coastguard Worker;
2*fb1b10abSAndroid Build Coastguard Worker;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
3*fb1b10abSAndroid Build Coastguard Worker;
4*fb1b10abSAndroid Build Coastguard Worker;  Use of this source code is governed by a BSD-style license
5*fb1b10abSAndroid Build Coastguard Worker;  that can be found in the LICENSE file in the root of the source
6*fb1b10abSAndroid Build Coastguard Worker;  tree. An additional intellectual property rights grant can be found
7*fb1b10abSAndroid Build Coastguard Worker;  in the file PATENTS.  All contributing project authors may
8*fb1b10abSAndroid Build Coastguard Worker;  be found in the AUTHORS file in the root of the source tree.
9*fb1b10abSAndroid Build Coastguard Worker;
10*fb1b10abSAndroid Build Coastguard Worker
11*fb1b10abSAndroid Build Coastguard Worker
12*fb1b10abSAndroid Build Coastguard Worker%include "vpx_ports/x86_abi_support.asm"
13*fb1b10abSAndroid Build Coastguard Worker
14*fb1b10abSAndroid Build Coastguard Worker%macro STACK_FRAME_CREATE 0
; STACK_FRAME_CREATE
;   Binds the symbolic names `input`, `output` and `pitch` to the registers
;   that hold the three function arguments for the ABI being assembled, and
;   emits whatever prologue that ABI path needs.
;     32-bit        : input=rsi, output=rdi, pitch=rax (loaded from arg(0..2))
;     64-bit, Win64 : input=rcx, output=rdx, pitch=r8
;     64-bit, other : input=rdi, output=rsi, pitch=rdx
;   Must be paired with STACK_FRAME_DESTROY.
;   NOTE(review): only the 32-bit path sign-extends pitch (movsxd). On the
;   64-bit paths the full 64-bit register is used in address arithmetic
;   as-is -- confirm callers guarantee the upper 32 bits of the int argument.
15*fb1b10abSAndroid Build Coastguard Worker%if ABI_IS_32BIT
16*fb1b10abSAndroid Build Coastguard Worker  %define       input       rsi
17*fb1b10abSAndroid Build Coastguard Worker  %define       output      rdi
18*fb1b10abSAndroid Build Coastguard Worker  %define       pitch       rax
19*fb1b10abSAndroid Build Coastguard Worker    push        rbp
20*fb1b10abSAndroid Build Coastguard Worker    mov         rbp, rsp
    ; GET_GOT comes from x86_abi_support.asm (not visible here); it is undone
    ; by RESTORE_GOT in STACK_FRAME_DESTROY.
21*fb1b10abSAndroid Build Coastguard Worker    GET_GOT     rbx
    ; rsi/rdi are overwritten below with the argument pointers, so save them.
22*fb1b10abSAndroid Build Coastguard Worker    push        rsi
23*fb1b10abSAndroid Build Coastguard Worker    push        rdi
24*fb1b10abSAndroid Build Coastguard Worker    ; end prolog
25*fb1b10abSAndroid Build Coastguard Worker
26*fb1b10abSAndroid Build Coastguard Worker    mov         rsi, arg(0)
27*fb1b10abSAndroid Build Coastguard Worker    mov         rdi, arg(1)
28*fb1b10abSAndroid Build Coastguard Worker
29*fb1b10abSAndroid Build Coastguard Worker    movsxd      rax, dword ptr arg(2)
    ; NOTE(review): rcx = input + 2*pitch is computed here, but neither
    ; function in this file reads rcx on the 32-bit path -- looks like dead code.
30*fb1b10abSAndroid Build Coastguard Worker    lea         rcx, [rsi + rax*2]
31*fb1b10abSAndroid Build Coastguard Worker%else
32*fb1b10abSAndroid Build Coastguard Worker  %if LIBVPX_YASM_WIN64
33*fb1b10abSAndroid Build Coastguard Worker    %define     input       rcx
34*fb1b10abSAndroid Build Coastguard Worker    %define     output      rdx
35*fb1b10abSAndroid Build Coastguard Worker    %define     pitch       r8
    ; SAVE_XMM is defined in x86_abi_support.asm (not visible here);
    ; presumably it preserves xmm6..xmm7 for the Win64 ABI -- confirm there.
    ; It is undone by RESTORE_XMM in STACK_FRAME_DESTROY.
36*fb1b10abSAndroid Build Coastguard Worker    SAVE_XMM 7, u
37*fb1b10abSAndroid Build Coastguard Worker  %else
    ; No prologue code on this path: arguments are used in place.
38*fb1b10abSAndroid Build Coastguard Worker    %define     input       rdi
39*fb1b10abSAndroid Build Coastguard Worker    %define     output      rsi
40*fb1b10abSAndroid Build Coastguard Worker    %define     pitch       rdx
41*fb1b10abSAndroid Build Coastguard Worker  %endif
42*fb1b10abSAndroid Build Coastguard Worker%endif
43*fb1b10abSAndroid Build Coastguard Worker%endmacro
44*fb1b10abSAndroid Build Coastguard Worker
45*fb1b10abSAndroid Build Coastguard Worker%macro STACK_FRAME_DESTROY 0
; STACK_FRAME_DESTROY
;   Counterpart of STACK_FRAME_CREATE: redefines `input`, `output` and `pitch`
;   to empty so they no longer expand to registers, restores whatever the
;   matching prologue saved (in reverse order), and returns to the caller.
46*fb1b10abSAndroid Build Coastguard Worker  %define     input
47*fb1b10abSAndroid Build Coastguard Worker  %define     output
48*fb1b10abSAndroid Build Coastguard Worker  %define     pitch
49*fb1b10abSAndroid Build Coastguard Worker
50*fb1b10abSAndroid Build Coastguard Worker%if ABI_IS_32BIT
    ; Reverse of the 32-bit prologue: rdi, rsi, GOT register, then rbp.
51*fb1b10abSAndroid Build Coastguard Worker    pop         rdi
52*fb1b10abSAndroid Build Coastguard Worker    pop         rsi
53*fb1b10abSAndroid Build Coastguard Worker    RESTORE_GOT
54*fb1b10abSAndroid Build Coastguard Worker    pop         rbp
55*fb1b10abSAndroid Build Coastguard Worker%else
56*fb1b10abSAndroid Build Coastguard Worker  %if LIBVPX_YASM_WIN64
    ; Pairs with SAVE_XMM in STACK_FRAME_CREATE.
57*fb1b10abSAndroid Build Coastguard Worker    RESTORE_XMM
58*fb1b10abSAndroid Build Coastguard Worker  %endif
59*fb1b10abSAndroid Build Coastguard Worker%endif
60*fb1b10abSAndroid Build Coastguard Worker    ret
61*fb1b10abSAndroid Build Coastguard Worker%endmacro
62*fb1b10abSAndroid Build Coastguard Worker
63*fb1b10abSAndroid Build Coastguard WorkerSECTION .text
64*fb1b10abSAndroid Build Coastguard Worker
65*fb1b10abSAndroid Build Coastguard Worker;void vp8_short_fdct4x4_sse2(short *input, short *output, int pitch)
;
; Two-pass 4x4 forward transform on 16-bit samples.
;   input  : four rows of four shorts; consecutive rows are `pitch` BYTES
;            apart (pitch is added to the pointer unscaled).
;   output : 16 shorts written with two movdqa stores, so `output` must be
;            16-byte aligned.
;   Pass 1 : a1/b1/c1/d1 butterflies, inputs scaled by <<3, odd outputs
;            rounded with 14500 / 7500 and shifted >>12.
;   Pass 2 : same butterflies on the transposed result, even outputs
;            (x + 7) >> 4, odd outputs rounded with 12000 / 51000 and
;            shifted >>16, plus a +1 correction on op[4] when d1 != 0.
;   Uses xmm0-xmm5 only.
; Lane comments list 16-bit words from most significant to least significant.
66*fb1b10abSAndroid Build Coastguard Workerglobalsym(vp8_short_fdct4x4_sse2)
67*fb1b10abSAndroid Build Coastguard Workersym(vp8_short_fdct4x4_sse2):
68*fb1b10abSAndroid Build Coastguard Worker
69*fb1b10abSAndroid Build Coastguard Worker    STACK_FRAME_CREATE
70*fb1b10abSAndroid Build Coastguard Worker
71*fb1b10abSAndroid Build Coastguard Worker    movq        xmm0, MMWORD PTR[input        ] ;03 02 01 00
72*fb1b10abSAndroid Build Coastguard Worker    movq        xmm2, MMWORD PTR[input+  pitch] ;13 12 11 10
73*fb1b10abSAndroid Build Coastguard Worker    lea         input,          [input+2*pitch]
74*fb1b10abSAndroid Build Coastguard Worker    movq        xmm1, MMWORD PTR[input        ] ;23 22 21 20
75*fb1b10abSAndroid Build Coastguard Worker    movq        xmm3, MMWORD PTR[input+  pitch] ;33 32 31 30
76*fb1b10abSAndroid Build Coastguard Worker
77*fb1b10abSAndroid Build Coastguard Worker    punpcklqdq  xmm0, xmm2                      ;13 12 11 10 03 02 01 00
78*fb1b10abSAndroid Build Coastguard Worker    punpcklqdq  xmm1, xmm3                      ;33 32 31 30 23 22 21 20
79*fb1b10abSAndroid Build Coastguard Worker
    ; Rearrange so that each 32-bit lane of xmm0 holds columns (1,0) of one
    ; row and the matching lane of xmm1 holds columns (2,3) of the same row.
80*fb1b10abSAndroid Build Coastguard Worker    movdqa      xmm2, xmm0
81*fb1b10abSAndroid Build Coastguard Worker    punpckldq   xmm0, xmm1                      ;23 22 03 02 21 20 01 00
82*fb1b10abSAndroid Build Coastguard Worker    punpckhdq   xmm2, xmm1                      ;33 32 13 12 31 30 11 10
83*fb1b10abSAndroid Build Coastguard Worker    movdqa      xmm1, xmm0
84*fb1b10abSAndroid Build Coastguard Worker    punpckldq   xmm0, xmm2                      ;31 30 21 20 11 10 01 00
85*fb1b10abSAndroid Build Coastguard Worker    pshufhw     xmm1, xmm1, 0b1h                ;22 23 02 03 xx xx xx xx
86*fb1b10abSAndroid Build Coastguard Worker    pshufhw     xmm2, xmm2, 0b1h                ;32 33 12 13 xx xx xx xx
87*fb1b10abSAndroid Build Coastguard Worker
88*fb1b10abSAndroid Build Coastguard Worker    punpckhdq   xmm1, xmm2                      ;32 33 22 23 12 13 02 03
89*fb1b10abSAndroid Build Coastguard Worker    movdqa      xmm3, xmm0
    ; a1 = x0 + x3, b1 = x1 + x2, c1 = x1 - x2, d1 = x0 - x3 (per row)
90*fb1b10abSAndroid Build Coastguard Worker    paddw       xmm0, xmm1                      ;b1 a1 b1 a1 b1 a1 b1 a1
91*fb1b10abSAndroid Build Coastguard Worker    psubw       xmm3, xmm1                      ;c1 d1 c1 d1 c1 d1 c1 d1
92*fb1b10abSAndroid Build Coastguard Worker    psllw       xmm0, 3                         ;b1 <<= 3 a1 <<= 3
93*fb1b10abSAndroid Build Coastguard Worker    psllw       xmm3, 3                         ;c1 <<= 3 d1 <<= 3
94*fb1b10abSAndroid Build Coastguard Worker
    ; pmaddwd multiplies adjacent word pairs by the table and sums each pair
    ; into a 32-bit lane, giving one dword result per row.
95*fb1b10abSAndroid Build Coastguard Worker    movdqa      xmm1, xmm0
96*fb1b10abSAndroid Build Coastguard Worker    pmaddwd     xmm0, XMMWORD PTR[GLOBAL(_mult_add)]    ;a1 + b1
97*fb1b10abSAndroid Build Coastguard Worker    pmaddwd     xmm1, XMMWORD PTR[GLOBAL(_mult_sub)]    ;a1 - b1
98*fb1b10abSAndroid Build Coastguard Worker    movdqa      xmm4, xmm3
99*fb1b10abSAndroid Build Coastguard Worker    pmaddwd     xmm3, XMMWORD PTR[GLOBAL(_5352_2217)]   ;c1*2217 + d1*5352
100*fb1b10abSAndroid Build Coastguard Worker    pmaddwd     xmm4, XMMWORD PTR[GLOBAL(_2217_neg5352)];d1*2217 - c1*5352
101*fb1b10abSAndroid Build Coastguard Worker
102*fb1b10abSAndroid Build Coastguard Worker    paddd       xmm3, XMMWORD PTR[GLOBAL(_14500)]
103*fb1b10abSAndroid Build Coastguard Worker    paddd       xmm4, XMMWORD PTR[GLOBAL(_7500)]
104*fb1b10abSAndroid Build Coastguard Worker    psrad       xmm3, 12            ;(c1 * 2217 + d1 * 5352 +  14500)>>12
105*fb1b10abSAndroid Build Coastguard Worker    psrad       xmm4, 12            ;(d1 * 2217 - c1 * 5352 +   7500)>>12
106*fb1b10abSAndroid Build Coastguard Worker
107*fb1b10abSAndroid Build Coastguard Worker    packssdw    xmm0, xmm1                      ;op[2] op[0]
108*fb1b10abSAndroid Build Coastguard Worker    packssdw    xmm3, xmm4                      ;op[3] op[1]
109*fb1b10abSAndroid Build Coastguard Worker    ; 23 22 21 20 03 02 01 00
110*fb1b10abSAndroid Build Coastguard Worker    ;
111*fb1b10abSAndroid Build Coastguard Worker    ; 33 32 31 30 13 12 11 10
112*fb1b10abSAndroid Build Coastguard Worker    ;
113*fb1b10abSAndroid Build Coastguard Worker    movdqa      xmm2, xmm0
114*fb1b10abSAndroid Build Coastguard Worker    punpcklqdq  xmm0, xmm3                      ;13 12 11 10 03 02 01 00
115*fb1b10abSAndroid Build Coastguard Worker    punpckhqdq  xmm2, xmm3                      ;33 32 31 30 23 22 21 20
116*fb1b10abSAndroid Build Coastguard Worker
    ; Transpose for the second pass. From here on the lane labels have their
    ; two digits swapped relative to the labels used before this point.
117*fb1b10abSAndroid Build Coastguard Worker    movdqa      xmm3, xmm0
118*fb1b10abSAndroid Build Coastguard Worker    punpcklwd   xmm0, xmm2                      ;32 30 22 20 12 10 02 00
119*fb1b10abSAndroid Build Coastguard Worker    punpckhwd   xmm3, xmm2                      ;33 31 23 21 13 11 03 01
120*fb1b10abSAndroid Build Coastguard Worker    movdqa      xmm2, xmm0
121*fb1b10abSAndroid Build Coastguard Worker    punpcklwd   xmm0, xmm3                      ;13 12 11 10 03 02 01 00
122*fb1b10abSAndroid Build Coastguard Worker    punpckhwd   xmm2, xmm3                      ;33 32 31 30 23 22 21 20
123*fb1b10abSAndroid Build Coastguard Worker
124*fb1b10abSAndroid Build Coastguard Worker    movdqa      xmm5, XMMWORD PTR[GLOBAL(_7)]
    ; 04eh swaps the two 64-bit halves: xmm2 becomes 23 22 21 20 33 32 31 30
125*fb1b10abSAndroid Build Coastguard Worker    pshufd      xmm2, xmm2, 04eh
126*fb1b10abSAndroid Build Coastguard Worker    movdqa      xmm3, xmm0
127*fb1b10abSAndroid Build Coastguard Worker    paddw       xmm0, xmm2                      ;b1 b1 b1 b1 a1 a1 a1 a1
128*fb1b10abSAndroid Build Coastguard Worker    psubw       xmm3, xmm2                      ;c1 c1 c1 c1 d1 d1 d1 d1
129*fb1b10abSAndroid Build Coastguard Worker
    ; Interleave the a1/b1 (and d1/c1) words into adjacent pairs so that
    ; pmaddwd can combine them.
130*fb1b10abSAndroid Build Coastguard Worker    pshufd      xmm0, xmm0, 0d8h                ;b1 b1 a1 a1 b1 b1 a1 a1
131*fb1b10abSAndroid Build Coastguard Worker    movdqa      xmm2, xmm3                      ;save d1 for compare
132*fb1b10abSAndroid Build Coastguard Worker    pshufd      xmm3, xmm3, 0d8h                ;c1 c1 d1 d1 c1 c1 d1 d1
133*fb1b10abSAndroid Build Coastguard Worker    pshuflw     xmm0, xmm0, 0d8h                ;b1 b1 a1 a1 b1 a1 b1 a1
134*fb1b10abSAndroid Build Coastguard Worker    pshuflw     xmm3, xmm3, 0d8h                ;c1 c1 d1 d1 c1 d1 c1 d1
135*fb1b10abSAndroid Build Coastguard Worker    pshufhw     xmm0, xmm0, 0d8h                ;b1 a1 b1 a1 b1 a1 b1 a1
136*fb1b10abSAndroid Build Coastguard Worker    pshufhw     xmm3, xmm3, 0d8h                ;c1 d1 c1 d1 c1 d1 c1 d1
137*fb1b10abSAndroid Build Coastguard Worker    movdqa      xmm1, xmm0
138*fb1b10abSAndroid Build Coastguard Worker    pmaddwd     xmm0, XMMWORD PTR[GLOBAL(_mult_add)] ;a1 + b1
139*fb1b10abSAndroid Build Coastguard Worker    pmaddwd     xmm1, XMMWORD PTR[GLOBAL(_mult_sub)] ;a1 - b1
140*fb1b10abSAndroid Build Coastguard Worker
141*fb1b10abSAndroid Build Coastguard Worker    pxor        xmm4, xmm4                      ;zero out for compare
142*fb1b10abSAndroid Build Coastguard Worker    paddd       xmm0, xmm5
143*fb1b10abSAndroid Build Coastguard Worker    paddd       xmm1, xmm5
    ; xmm2 = 0xFFFF in every word that is zero; pandn below inverts this so
    ; the low four words (the d1 lanes) become 1 where d1 != 0, else 0.
144*fb1b10abSAndroid Build Coastguard Worker    pcmpeqw     xmm2, xmm4
145*fb1b10abSAndroid Build Coastguard Worker    psrad       xmm0, 4                         ;(a1 + b1 + 7)>>4
146*fb1b10abSAndroid Build Coastguard Worker    psrad       xmm1, 4                         ;(a1 - b1 + 7)>>4
147*fb1b10abSAndroid Build Coastguard Worker    pandn       xmm2, XMMWORD PTR[GLOBAL(_cmp_mask)] ;clear upper,
148*fb1b10abSAndroid Build Coastguard Worker                                                     ;and keep bit 0 of lower
149*fb1b10abSAndroid Build Coastguard Worker
150*fb1b10abSAndroid Build Coastguard Worker    movdqa      xmm4, xmm3
151*fb1b10abSAndroid Build Coastguard Worker    pmaddwd     xmm3, XMMWORD PTR[GLOBAL(_5352_2217)]    ;c1*2217 + d1*5352
152*fb1b10abSAndroid Build Coastguard Worker    pmaddwd     xmm4, XMMWORD PTR[GLOBAL(_2217_neg5352)] ;d1*2217 - c1*5352
153*fb1b10abSAndroid Build Coastguard Worker    paddd       xmm3, XMMWORD PTR[GLOBAL(_12000)]
154*fb1b10abSAndroid Build Coastguard Worker    paddd       xmm4, XMMWORD PTR[GLOBAL(_51000)]
155*fb1b10abSAndroid Build Coastguard Worker    packssdw    xmm0, xmm1                      ;op[8] op[0]
156*fb1b10abSAndroid Build Coastguard Worker    psrad       xmm3, 16                ;(c1 * 2217 + d1 * 5352 +  12000)>>16
157*fb1b10abSAndroid Build Coastguard Worker    psrad       xmm4, 16                ;(d1 * 2217 - c1 * 5352 +  51000)>>16
158*fb1b10abSAndroid Build Coastguard Worker
159*fb1b10abSAndroid Build Coastguard Worker    packssdw    xmm3, xmm4                      ;op[12] op[4]
160*fb1b10abSAndroid Build Coastguard Worker    movdqa      xmm1, xmm0
161*fb1b10abSAndroid Build Coastguard Worker    paddw       xmm3, xmm2                      ;op[4] += (d1!=0)
162*fb1b10abSAndroid Build Coastguard Worker    punpcklqdq  xmm0, xmm3                      ;op[4] op[0]
163*fb1b10abSAndroid Build Coastguard Worker    punpckhqdq  xmm1, xmm3                      ;op[12] op[8]
164*fb1b10abSAndroid Build Coastguard Worker
    ; movdqa stores fault on addresses that are not 16-byte aligned.
165*fb1b10abSAndroid Build Coastguard Worker    movdqa      XMMWORD PTR[output +  0], xmm0
166*fb1b10abSAndroid Build Coastguard Worker    movdqa      XMMWORD PTR[output + 16], xmm1
167*fb1b10abSAndroid Build Coastguard Worker
168*fb1b10abSAndroid Build Coastguard Worker    STACK_FRAME_DESTROY
169*fb1b10abSAndroid Build Coastguard Worker
170*fb1b10abSAndroid Build Coastguard Worker;void vp8_short_fdct8x4_sse2(short *input, short *output, int pitch)
;
; Two-pass forward transform over an 8-wide, 4-row region of 16-bit samples,
; using the same butterflies and constants as vp8_short_fdct4x4_sse2 with all
; eight word lanes of each register in use.
;   input  : four rows of eight shorts; consecutive rows are `pitch` BYTES
;            apart. Rows are read with movdqa, so every row address must be
;            16-byte aligned.
;   output : 32 shorts written with four movdqa stores; `output` must be
;            16-byte aligned.
;   Uses xmm0-xmm6.
; Lane comments in this function list 16-bit words from least significant to
; most significant.
171*fb1b10abSAndroid Build Coastguard Workerglobalsym(vp8_short_fdct8x4_sse2)
172*fb1b10abSAndroid Build Coastguard Workersym(vp8_short_fdct8x4_sse2):
173*fb1b10abSAndroid Build Coastguard Worker
174*fb1b10abSAndroid Build Coastguard Worker    STACK_FRAME_CREATE
175*fb1b10abSAndroid Build Coastguard Worker
176*fb1b10abSAndroid Build Coastguard Worker        ; read the input data
177*fb1b10abSAndroid Build Coastguard Worker        movdqa      xmm0,       [input        ]
178*fb1b10abSAndroid Build Coastguard Worker        movdqa      xmm2,       [input+  pitch]
179*fb1b10abSAndroid Build Coastguard Worker        lea         input,      [input+2*pitch]
180*fb1b10abSAndroid Build Coastguard Worker        movdqa      xmm4,       [input        ]
181*fb1b10abSAndroid Build Coastguard Worker        movdqa      xmm3,       [input+  pitch]
182*fb1b10abSAndroid Build Coastguard Worker
183*fb1b10abSAndroid Build Coastguard Worker        ; transpose for the first stage
184*fb1b10abSAndroid Build Coastguard Worker        movdqa      xmm1,       xmm0        ; 00 01 02 03 04 05 06 07
185*fb1b10abSAndroid Build Coastguard Worker        movdqa      xmm5,       xmm4        ; 20 21 22 23 24 25 26 27
186*fb1b10abSAndroid Build Coastguard Worker
187*fb1b10abSAndroid Build Coastguard Worker        punpcklwd   xmm0,       xmm2        ; 00 10 01 11 02 12 03 13
188*fb1b10abSAndroid Build Coastguard Worker        punpckhwd   xmm1,       xmm2        ; 04 14 05 15 06 16 07 17
189*fb1b10abSAndroid Build Coastguard Worker
190*fb1b10abSAndroid Build Coastguard Worker        punpcklwd   xmm4,       xmm3        ; 20 30 21 31 22 32 23 33
191*fb1b10abSAndroid Build Coastguard Worker        punpckhwd   xmm5,       xmm3        ; 24 34 25 35 26 36 27 37
192*fb1b10abSAndroid Build Coastguard Worker
193*fb1b10abSAndroid Build Coastguard Worker        movdqa      xmm2,       xmm0        ; 00 10 01 11 02 12 03 13
194*fb1b10abSAndroid Build Coastguard Worker        punpckldq   xmm0,       xmm4        ; 00 10 20 30 01 11 21 31
195*fb1b10abSAndroid Build Coastguard Worker
196*fb1b10abSAndroid Build Coastguard Worker        punpckhdq   xmm2,       xmm4        ; 02 12 22 32 03 13 23 33
197*fb1b10abSAndroid Build Coastguard Worker
198*fb1b10abSAndroid Build Coastguard Worker        movdqa      xmm4,       xmm1        ; 04 14 05 15 06 16 07 17
199*fb1b10abSAndroid Build Coastguard Worker        punpckldq   xmm4,       xmm5        ; 04 14 24 34 05 15 25 35
200*fb1b10abSAndroid Build Coastguard Worker
201*fb1b10abSAndroid Build Coastguard Worker        punpckhdq   xmm1,       xmm5        ; 06 16 26 36 07 17 27 37
202*fb1b10abSAndroid Build Coastguard Worker        movdqa      xmm3,       xmm2        ; 02 12 22 32 03 13 23 33
203*fb1b10abSAndroid Build Coastguard Worker
204*fb1b10abSAndroid Build Coastguard Worker        punpckhqdq  xmm3,       xmm1        ; 03 13 23 33 07 17 27 37
205*fb1b10abSAndroid Build Coastguard Worker        punpcklqdq  xmm2,       xmm1        ; 02 12 22 32 06 16 26 36
206*fb1b10abSAndroid Build Coastguard Worker
207*fb1b10abSAndroid Build Coastguard Worker        movdqa      xmm1,       xmm0        ; 00 10 20 30 01 11 21 31
208*fb1b10abSAndroid Build Coastguard Worker        punpcklqdq  xmm0,       xmm4        ; 00 10 20 30 04 14 24 34
209*fb1b10abSAndroid Build Coastguard Worker
210*fb1b10abSAndroid Build Coastguard Worker        punpckhqdq  xmm1,       xmm4        ; 01 11 21 31 05 15 25 35
211*fb1b10abSAndroid Build Coastguard Worker
        ; register -> column index (within each 4-wide half) it now holds
212*fb1b10abSAndroid Build Coastguard Worker        ; xmm0 0
213*fb1b10abSAndroid Build Coastguard Worker        ; xmm1 1
214*fb1b10abSAndroid Build Coastguard Worker        ; xmm2 2
215*fb1b10abSAndroid Build Coastguard Worker        ; xmm3 3
216*fb1b10abSAndroid Build Coastguard Worker
217*fb1b10abSAndroid Build Coastguard Worker        ; first stage
218*fb1b10abSAndroid Build Coastguard Worker        movdqa      xmm5,       xmm0
219*fb1b10abSAndroid Build Coastguard Worker        movdqa      xmm4,       xmm1
220*fb1b10abSAndroid Build Coastguard Worker
221*fb1b10abSAndroid Build Coastguard Worker        paddw       xmm0,       xmm3        ; a1 = 0 + 3
222*fb1b10abSAndroid Build Coastguard Worker        paddw       xmm1,       xmm2        ; b1 = 1 + 2
223*fb1b10abSAndroid Build Coastguard Worker
224*fb1b10abSAndroid Build Coastguard Worker        psubw       xmm4,       xmm2        ; c1 = 1 - 2
225*fb1b10abSAndroid Build Coastguard Worker        psubw       xmm5,       xmm3        ; d1 = 0 - 3
226*fb1b10abSAndroid Build Coastguard Worker
        ; scale a1, b1, c1, d1 by 8
227*fb1b10abSAndroid Build Coastguard Worker        psllw       xmm5,        3
228*fb1b10abSAndroid Build Coastguard Worker        psllw       xmm4,        3
229*fb1b10abSAndroid Build Coastguard Worker
230*fb1b10abSAndroid Build Coastguard Worker        psllw       xmm0,        3
231*fb1b10abSAndroid Build Coastguard Worker        psllw       xmm1,        3
232*fb1b10abSAndroid Build Coastguard Worker
233*fb1b10abSAndroid Build Coastguard Worker        ; output 0 and 2
234*fb1b10abSAndroid Build Coastguard Worker        movdqa      xmm2,       xmm0        ; a1
235*fb1b10abSAndroid Build Coastguard Worker
236*fb1b10abSAndroid Build Coastguard Worker        paddw       xmm0,       xmm1        ; op[0] = a1 + b1
237*fb1b10abSAndroid Build Coastguard Worker        psubw       xmm2,       xmm1        ; op[2] = a1 - b1
238*fb1b10abSAndroid Build Coastguard Worker
239*fb1b10abSAndroid Build Coastguard Worker        ; output 1 and 3
240*fb1b10abSAndroid Build Coastguard Worker        ; interleave c1, d1
        ; (d1 lands in the low word of each pair, c1 in the high word, matching
        ; the word order of the _5352_2217 / _2217_neg5352 tables)
241*fb1b10abSAndroid Build Coastguard Worker        movdqa      xmm1,       xmm5        ; d1
242*fb1b10abSAndroid Build Coastguard Worker        punpcklwd   xmm1,       xmm4        ; c1 d1
243*fb1b10abSAndroid Build Coastguard Worker        punpckhwd   xmm5,       xmm4        ; c1 d1
244*fb1b10abSAndroid Build Coastguard Worker
245*fb1b10abSAndroid Build Coastguard Worker        movdqa      xmm3,       xmm1
246*fb1b10abSAndroid Build Coastguard Worker        movdqa      xmm4,       xmm5
247*fb1b10abSAndroid Build Coastguard Worker
248*fb1b10abSAndroid Build Coastguard Worker        pmaddwd     xmm1,       XMMWORD PTR[GLOBAL (_5352_2217)]    ; c1*2217 + d1*5352
249*fb1b10abSAndroid Build Coastguard Worker        pmaddwd     xmm4,       XMMWORD PTR[GLOBAL (_5352_2217)]    ; c1*2217 + d1*5352
250*fb1b10abSAndroid Build Coastguard Worker
251*fb1b10abSAndroid Build Coastguard Worker        pmaddwd     xmm3,       XMMWORD PTR[GLOBAL(_2217_neg5352)]  ; d1*2217 - c1*5352
252*fb1b10abSAndroid Build Coastguard Worker        pmaddwd     xmm5,       XMMWORD PTR[GLOBAL(_2217_neg5352)]  ; d1*2217 - c1*5352
253*fb1b10abSAndroid Build Coastguard Worker
254*fb1b10abSAndroid Build Coastguard Worker        paddd       xmm1,       XMMWORD PTR[GLOBAL(_14500)]
255*fb1b10abSAndroid Build Coastguard Worker        paddd       xmm4,       XMMWORD PTR[GLOBAL(_14500)]
256*fb1b10abSAndroid Build Coastguard Worker        paddd       xmm3,       XMMWORD PTR[GLOBAL(_7500)]
257*fb1b10abSAndroid Build Coastguard Worker        paddd       xmm5,       XMMWORD PTR[GLOBAL(_7500)]
258*fb1b10abSAndroid Build Coastguard Worker
259*fb1b10abSAndroid Build Coastguard Worker        psrad       xmm1,       12          ; (c1 * 2217 + d1 * 5352 +  14500)>>12
260*fb1b10abSAndroid Build Coastguard Worker        psrad       xmm4,       12          ; (c1 * 2217 + d1 * 5352 +  14500)>>12
261*fb1b10abSAndroid Build Coastguard Worker        psrad       xmm3,       12          ; (d1 * 2217 - c1 * 5352 +   7500)>>12
262*fb1b10abSAndroid Build Coastguard Worker        psrad       xmm5,       12          ; (d1 * 2217 - c1 * 5352 +   7500)>>12
263*fb1b10abSAndroid Build Coastguard Worker
264*fb1b10abSAndroid Build Coastguard Worker        packssdw    xmm1,       xmm4        ; op[1]
265*fb1b10abSAndroid Build Coastguard Worker        packssdw    xmm3,       xmm5        ; op[3]
266*fb1b10abSAndroid Build Coastguard Worker
267*fb1b10abSAndroid Build Coastguard Worker        ; done with vertical
268*fb1b10abSAndroid Build Coastguard Worker        ; transpose for the second stage
269*fb1b10abSAndroid Build Coastguard Worker        movdqa      xmm4,       xmm0         ; 00 10 20 30 04 14 24 34
270*fb1b10abSAndroid Build Coastguard Worker        movdqa      xmm5,       xmm2         ; 02 12 22 32 06 16 26 36
271*fb1b10abSAndroid Build Coastguard Worker
272*fb1b10abSAndroid Build Coastguard Worker        punpcklwd   xmm0,       xmm1         ; 00 01 10 11 20 21 30 31
273*fb1b10abSAndroid Build Coastguard Worker        punpckhwd   xmm4,       xmm1         ; 04 05 14 15 24 25 34 35
274*fb1b10abSAndroid Build Coastguard Worker
275*fb1b10abSAndroid Build Coastguard Worker        punpcklwd   xmm2,       xmm3         ; 02 03 12 13 22 23 32 33
276*fb1b10abSAndroid Build Coastguard Worker        punpckhwd   xmm5,       xmm3         ; 06 07 16 17 26 27 36 37
277*fb1b10abSAndroid Build Coastguard Worker
278*fb1b10abSAndroid Build Coastguard Worker        movdqa      xmm1,       xmm0         ; 00 01 10 11 20 21 30 31
279*fb1b10abSAndroid Build Coastguard Worker        punpckldq   xmm0,       xmm2         ; 00 01 02 03 10 11 12 13
280*fb1b10abSAndroid Build Coastguard Worker
281*fb1b10abSAndroid Build Coastguard Worker        punpckhdq   xmm1,       xmm2         ; 20 21 22 23 30 31 32 33
282*fb1b10abSAndroid Build Coastguard Worker
283*fb1b10abSAndroid Build Coastguard Worker        movdqa      xmm2,       xmm4         ; 04 05 14 15 24 25 34 35
284*fb1b10abSAndroid Build Coastguard Worker        punpckldq   xmm2,       xmm5         ; 04 05 06 07 14 15 16 17
285*fb1b10abSAndroid Build Coastguard Worker
286*fb1b10abSAndroid Build Coastguard Worker        punpckhdq   xmm4,       xmm5         ; 24 25 26 27 34 35 36 37
287*fb1b10abSAndroid Build Coastguard Worker        movdqa      xmm3,       xmm1         ; 20 21 22 23 30 31 32 33
288*fb1b10abSAndroid Build Coastguard Worker
289*fb1b10abSAndroid Build Coastguard Worker        punpckhqdq  xmm3,       xmm4         ; 30 31 32 33 34 35 36 37
290*fb1b10abSAndroid Build Coastguard Worker        punpcklqdq  xmm1,       xmm4         ; 20 21 22 23 24 25 26 27
291*fb1b10abSAndroid Build Coastguard Worker
292*fb1b10abSAndroid Build Coastguard Worker        movdqa      xmm4,       xmm0         ; 00 01 02 03 10 11 12 13
293*fb1b10abSAndroid Build Coastguard Worker        punpcklqdq  xmm0,       xmm2         ; 00 01 02 03 04 05 06 07
294*fb1b10abSAndroid Build Coastguard Worker
295*fb1b10abSAndroid Build Coastguard Worker        punpckhqdq  xmm4,       xmm2         ; 10 11 12 13 14 15 16 17
296*fb1b10abSAndroid Build Coastguard Worker
297*fb1b10abSAndroid Build Coastguard Worker        ; xmm0 row 0
298*fb1b10abSAndroid Build Coastguard Worker        ; xmm4 row 1
299*fb1b10abSAndroid Build Coastguard Worker        ; xmm1 row 2
300*fb1b10abSAndroid Build Coastguard Worker        ; xmm3 row 3
301*fb1b10abSAndroid Build Coastguard Worker
        ; second stage
302*fb1b10abSAndroid Build Coastguard Worker        movdqa      xmm5,       xmm0
303*fb1b10abSAndroid Build Coastguard Worker        movdqa      xmm2,       xmm1
304*fb1b10abSAndroid Build Coastguard Worker
305*fb1b10abSAndroid Build Coastguard Worker        paddw       xmm0,       xmm3        ; a1 = 0 + 3
306*fb1b10abSAndroid Build Coastguard Worker        paddw       xmm1,       xmm4        ; b1 = 1 + 2
307*fb1b10abSAndroid Build Coastguard Worker
308*fb1b10abSAndroid Build Coastguard Worker        psubw       xmm4,       xmm2        ; c1 = 1 - 2
309*fb1b10abSAndroid Build Coastguard Worker        psubw       xmm5,       xmm3        ; d1 = 0 - 3
310*fb1b10abSAndroid Build Coastguard Worker
311*fb1b10abSAndroid Build Coastguard Worker        pxor        xmm6,       xmm6        ; zero out for compare
312*fb1b10abSAndroid Build Coastguard Worker
313*fb1b10abSAndroid Build Coastguard Worker        pcmpeqw     xmm6,       xmm5        ; 0xFFFF where d1 == 0
314*fb1b10abSAndroid Build Coastguard Worker
315*fb1b10abSAndroid Build Coastguard Worker        pandn       xmm6,       XMMWORD PTR[GLOBAL(_cmp_mask8x4)]   ; invert the mask and keep bit 0:
316*fb1b10abSAndroid Build Coastguard Worker                                                                    ; each word = (d1 != 0) ? 1 : 0
317*fb1b10abSAndroid Build Coastguard Worker
318*fb1b10abSAndroid Build Coastguard Worker        ; output 0 and 2
319*fb1b10abSAndroid Build Coastguard Worker        movdqa      xmm2,       xmm0        ; a1
320*fb1b10abSAndroid Build Coastguard Worker
321*fb1b10abSAndroid Build Coastguard Worker        paddw       xmm0,       xmm1        ; a1 + b1
322*fb1b10abSAndroid Build Coastguard Worker        psubw       xmm2,       xmm1        ; a1 - b1
323*fb1b10abSAndroid Build Coastguard Worker
324*fb1b10abSAndroid Build Coastguard Worker        paddw       xmm0,       XMMWORD PTR[GLOBAL(_7w)]
325*fb1b10abSAndroid Build Coastguard Worker        paddw       xmm2,       XMMWORD PTR[GLOBAL(_7w)]
326*fb1b10abSAndroid Build Coastguard Worker
327*fb1b10abSAndroid Build Coastguard Worker        psraw       xmm0,       4           ; op[0] = (a1 + b1 + 7)>>4
328*fb1b10abSAndroid Build Coastguard Worker        psraw       xmm2,       4           ; op[8] = (a1 - b1 + 7)>>4
329*fb1b10abSAndroid Build Coastguard Worker
330*fb1b10abSAndroid Build Coastguard Worker        ; output 1 and 3
331*fb1b10abSAndroid Build Coastguard Worker        ; interleave c1, d1
332*fb1b10abSAndroid Build Coastguard Worker        movdqa      xmm1,       xmm5        ; d1
333*fb1b10abSAndroid Build Coastguard Worker        punpcklwd   xmm1,       xmm4        ; c1 d1
334*fb1b10abSAndroid Build Coastguard Worker        punpckhwd   xmm5,       xmm4        ; c1 d1
335*fb1b10abSAndroid Build Coastguard Worker
336*fb1b10abSAndroid Build Coastguard Worker        movdqa      xmm3,       xmm1
337*fb1b10abSAndroid Build Coastguard Worker        movdqa      xmm4,       xmm5
338*fb1b10abSAndroid Build Coastguard Worker
339*fb1b10abSAndroid Build Coastguard Worker        pmaddwd     xmm1,       XMMWORD PTR[GLOBAL (_5352_2217)]    ; c1*2217 + d1*5352
340*fb1b10abSAndroid Build Coastguard Worker        pmaddwd     xmm4,       XMMWORD PTR[GLOBAL (_5352_2217)]    ; c1*2217 + d1*5352
341*fb1b10abSAndroid Build Coastguard Worker
342*fb1b10abSAndroid Build Coastguard Worker        pmaddwd     xmm3,       XMMWORD PTR[GLOBAL(_2217_neg5352)]  ; d1*2217 - c1*5352
343*fb1b10abSAndroid Build Coastguard Worker        pmaddwd     xmm5,       XMMWORD PTR[GLOBAL(_2217_neg5352)]  ; d1*2217 - c1*5352
344*fb1b10abSAndroid Build Coastguard Worker
345*fb1b10abSAndroid Build Coastguard Worker        paddd       xmm1,       XMMWORD PTR[GLOBAL(_12000)]
346*fb1b10abSAndroid Build Coastguard Worker        paddd       xmm4,       XMMWORD PTR[GLOBAL(_12000)]
347*fb1b10abSAndroid Build Coastguard Worker        paddd       xmm3,       XMMWORD PTR[GLOBAL(_51000)]
348*fb1b10abSAndroid Build Coastguard Worker        paddd       xmm5,       XMMWORD PTR[GLOBAL(_51000)]
349*fb1b10abSAndroid Build Coastguard Worker
350*fb1b10abSAndroid Build Coastguard Worker        psrad       xmm1,       16          ; (c1 * 2217 + d1 * 5352 +  12000)>>16
351*fb1b10abSAndroid Build Coastguard Worker        psrad       xmm4,       16          ; (c1 * 2217 + d1 * 5352 +  12000)>>16
352*fb1b10abSAndroid Build Coastguard Worker        psrad       xmm3,       16          ; (d1 * 2217 - c1 * 5352 +  51000)>>16
353*fb1b10abSAndroid Build Coastguard Worker        psrad       xmm5,       16          ; (d1 * 2217 - c1 * 5352 +  51000)>>16
354*fb1b10abSAndroid Build Coastguard Worker
355*fb1b10abSAndroid Build Coastguard Worker        packssdw    xmm1,       xmm4        ; op[4]
356*fb1b10abSAndroid Build Coastguard Worker        packssdw    xmm3,       xmm5        ; op[12]
357*fb1b10abSAndroid Build Coastguard Worker
358*fb1b10abSAndroid Build Coastguard Worker        paddw       xmm1,       xmm6        ; op[4] += (d1!=0)
359*fb1b10abSAndroid Build Coastguard Worker
        ; Regroup the four result registers by 64-bit halves before storing:
        ; low halves of (xmm0, xmm1) and (xmm2, xmm3) go to output+0 / +16,
        ; high halves go to output+32 / +48.
360*fb1b10abSAndroid Build Coastguard Worker        movdqa      xmm4,       xmm0
361*fb1b10abSAndroid Build Coastguard Worker        movdqa      xmm5,       xmm2
362*fb1b10abSAndroid Build Coastguard Worker
363*fb1b10abSAndroid Build Coastguard Worker        punpcklqdq  xmm0,       xmm1
364*fb1b10abSAndroid Build Coastguard Worker        punpckhqdq  xmm4,       xmm1
365*fb1b10abSAndroid Build Coastguard Worker
366*fb1b10abSAndroid Build Coastguard Worker        punpcklqdq  xmm2,       xmm3
367*fb1b10abSAndroid Build Coastguard Worker        punpckhqdq  xmm5,       xmm3
368*fb1b10abSAndroid Build Coastguard Worker
369*fb1b10abSAndroid Build Coastguard Worker        movdqa      XMMWORD PTR[output + 0 ],  xmm0
370*fb1b10abSAndroid Build Coastguard Worker        movdqa      XMMWORD PTR[output + 16],  xmm2
371*fb1b10abSAndroid Build Coastguard Worker        movdqa      XMMWORD PTR[output + 32],  xmm4
372*fb1b10abSAndroid Build Coastguard Worker        movdqa      XMMWORD PTR[output + 48],  xmm5
373*fb1b10abSAndroid Build Coastguard Worker
374*fb1b10abSAndroid Build Coastguard Worker    STACK_FRAME_DESTROY
375*fb1b10abSAndroid Build Coastguard Worker
376*fb1b10abSAndroid Build Coastguard WorkerSECTION_RODATA
; Read-only constant tables for the two transforms in this file. Each table is
; 16 bytes and 16-byte aligned because it is used as a 128-bit memory operand.
; Word-pair multiplier table for pmaddwd: low word 5352, high word 2217.
377*fb1b10abSAndroid Build Coastguard Workeralign 16
378*fb1b10abSAndroid Build Coastguard Worker_5352_2217:
379*fb1b10abSAndroid Build Coastguard Worker    dw 5352
380*fb1b10abSAndroid Build Coastguard Worker    dw 2217
381*fb1b10abSAndroid Build Coastguard Worker    dw 5352
382*fb1b10abSAndroid Build Coastguard Worker    dw 2217
383*fb1b10abSAndroid Build Coastguard Worker    dw 5352
384*fb1b10abSAndroid Build Coastguard Worker    dw 2217
385*fb1b10abSAndroid Build Coastguard Worker    dw 5352
386*fb1b10abSAndroid Build Coastguard Worker    dw 2217
; Word-pair multiplier table for pmaddwd: low word 2217, high word -5352.
387*fb1b10abSAndroid Build Coastguard Workeralign 16
388*fb1b10abSAndroid Build Coastguard Worker_2217_neg5352:
389*fb1b10abSAndroid Build Coastguard Worker    dw 2217
390*fb1b10abSAndroid Build Coastguard Worker    dw -5352
391*fb1b10abSAndroid Build Coastguard Worker    dw 2217
392*fb1b10abSAndroid Build Coastguard Worker    dw -5352
393*fb1b10abSAndroid Build Coastguard Worker    dw 2217
394*fb1b10abSAndroid Build Coastguard Worker    dw -5352
395*fb1b10abSAndroid Build Coastguard Worker    dw 2217
396*fb1b10abSAndroid Build Coastguard Worker    dw -5352
; All-ones multipliers: pmaddwd with this table sums each word pair (a1 + b1).
397*fb1b10abSAndroid Build Coastguard Workeralign 16
398*fb1b10abSAndroid Build Coastguard Worker_mult_add:
399*fb1b10abSAndroid Build Coastguard Worker    times 8 dw 1
; 4x4 only: 1 in the low four words (the d1 lanes), 0 in the high four words.
400*fb1b10abSAndroid Build Coastguard Workeralign 16
401*fb1b10abSAndroid Build Coastguard Worker_cmp_mask:
402*fb1b10abSAndroid Build Coastguard Worker    times 4 dw 1
403*fb1b10abSAndroid Build Coastguard Worker    times 4 dw 0
; 8x4 only: 1 in every word, since all eight lanes hold d1 there.
404*fb1b10abSAndroid Build Coastguard Workeralign 16
405*fb1b10abSAndroid Build Coastguard Worker_cmp_mask8x4:
406*fb1b10abSAndroid Build Coastguard Worker    times 8 dw 1
; (+1, -1) multipliers: pmaddwd with this table gives low word minus high word
; of each pair (a1 - b1).
407*fb1b10abSAndroid Build Coastguard Workeralign 16
408*fb1b10abSAndroid Build Coastguard Worker_mult_sub:
409*fb1b10abSAndroid Build Coastguard Worker    dw 1
410*fb1b10abSAndroid Build Coastguard Worker    dw -1
411*fb1b10abSAndroid Build Coastguard Worker    dw 1
412*fb1b10abSAndroid Build Coastguard Worker    dw -1
413*fb1b10abSAndroid Build Coastguard Worker    dw 1
414*fb1b10abSAndroid Build Coastguard Worker    dw -1
415*fb1b10abSAndroid Build Coastguard Worker    dw 1
416*fb1b10abSAndroid Build Coastguard Worker    dw -1
; Rounding term 7 as four dwords (4x4 second pass, added before psrad 4).
417*fb1b10abSAndroid Build Coastguard Workeralign 16
418*fb1b10abSAndroid Build Coastguard Worker_7:
419*fb1b10abSAndroid Build Coastguard Worker    times 4 dd 7
; Rounding term 7 as eight words (8x4 second pass, added before psraw 4).
420*fb1b10abSAndroid Build Coastguard Workeralign 16
421*fb1b10abSAndroid Build Coastguard Worker_7w:
422*fb1b10abSAndroid Build Coastguard Worker    times 8 dw 7
; First-pass rounding terms, added before the >>12 shifts.
423*fb1b10abSAndroid Build Coastguard Workeralign 16
424*fb1b10abSAndroid Build Coastguard Worker_14500:
425*fb1b10abSAndroid Build Coastguard Worker    times 4 dd 14500
426*fb1b10abSAndroid Build Coastguard Workeralign 16
427*fb1b10abSAndroid Build Coastguard Worker_7500:
428*fb1b10abSAndroid Build Coastguard Worker    times 4 dd 7500
; Second-pass rounding terms, added before the >>16 shifts.
429*fb1b10abSAndroid Build Coastguard Workeralign 16
430*fb1b10abSAndroid Build Coastguard Worker_12000:
431*fb1b10abSAndroid Build Coastguard Worker    times 4 dd 12000
432*fb1b10abSAndroid Build Coastguard Workeralign 16
433*fb1b10abSAndroid Build Coastguard Worker_51000:
434*fb1b10abSAndroid Build Coastguard Worker    times 4 dd 51000
435