xref: /aosp_15_r20/external/libjpeg-turbo/simd/i386/jdmrgext-avx2.asm (revision dfc6aa5c1cfd4bc4e2018dc74aa96e29ee49c6da)
1*dfc6aa5cSAndroid Build Coastguard Worker;
2*dfc6aa5cSAndroid Build Coastguard Worker; jdmrgext.asm - merged upsampling/color conversion (AVX2)
3*dfc6aa5cSAndroid Build Coastguard Worker;
4*dfc6aa5cSAndroid Build Coastguard Worker; Copyright 2009, 2012 Pierre Ossman <ossman@cendio.se> for Cendio AB
5*dfc6aa5cSAndroid Build Coastguard Worker; Copyright (C) 2012, 2016, D. R. Commander.
6*dfc6aa5cSAndroid Build Coastguard Worker; Copyright (C) 2015, Intel Corporation.
7*dfc6aa5cSAndroid Build Coastguard Worker;
8*dfc6aa5cSAndroid Build Coastguard Worker; Based on the x86 SIMD extension for IJG JPEG library
9*dfc6aa5cSAndroid Build Coastguard Worker; Copyright (C) 1999-2006, MIYASAKA Masaru.
10*dfc6aa5cSAndroid Build Coastguard Worker; For conditions of distribution and use, see copyright notice in jsimdext.inc
11*dfc6aa5cSAndroid Build Coastguard Worker;
12*dfc6aa5cSAndroid Build Coastguard Worker; This file should be assembled with NASM (Netwide Assembler); it
13*dfc6aa5cSAndroid Build Coastguard Worker; can *not* be assembled with Microsoft's MASM or any compatible
14*dfc6aa5cSAndroid Build Coastguard Worker; assembler (including Borland's Turbo Assembler).
15*dfc6aa5cSAndroid Build Coastguard Worker; NASM is available from http://nasm.sourceforge.net/ or
16*dfc6aa5cSAndroid Build Coastguard Worker; http://sourceforge.net/project/showfiles.php?group_id=6208
17*dfc6aa5cSAndroid Build Coastguard Worker
18*dfc6aa5cSAndroid Build Coastguard Worker%include "jcolsamp.inc"
19*dfc6aa5cSAndroid Build Coastguard Worker
20*dfc6aa5cSAndroid Build Coastguard Worker; --------------------------------------------------------------------------
21*dfc6aa5cSAndroid Build Coastguard Worker;
22*dfc6aa5cSAndroid Build Coastguard Worker; Upsample and color convert for the case of 2:1 horizontal and 1:1 vertical.
23*dfc6aa5cSAndroid Build Coastguard Worker;
24*dfc6aa5cSAndroid Build Coastguard Worker; GLOBAL(void)
25*dfc6aa5cSAndroid Build Coastguard Worker; jsimd_h2v1_merged_upsample_avx2(JDIMENSION output_width,
26*dfc6aa5cSAndroid Build Coastguard Worker;                                 JSAMPIMAGE input_buf,
27*dfc6aa5cSAndroid Build Coastguard Worker;                                 JDIMENSION in_row_group_ctr,
28*dfc6aa5cSAndroid Build Coastguard Worker;                                 JSAMPARRAY output_buf);
29*dfc6aa5cSAndroid Build Coastguard Worker;
30*dfc6aa5cSAndroid Build Coastguard Worker
31*dfc6aa5cSAndroid Build Coastguard Worker%define output_width(b)      (b) + 8    ; JDIMENSION output_width: first stack argument; (b) is the register holding the stack pointer captured right after `push ebp` (eax in this function)
32*dfc6aa5cSAndroid Build Coastguard Worker%define input_buf(b)         (b) + 12   ; JSAMPIMAGE input_buf: entries 0/1/2 are loaded as the Y/Cb/Cr row arrays
33*dfc6aa5cSAndroid Build Coastguard Worker%define in_row_group_ctr(b)  (b) + 16   ; JDIMENSION in_row_group_ctr: row index applied to each of the three input row arrays
34*dfc6aa5cSAndroid Build Coastguard Worker%define output_buf(b)        (b) + 20   ; JSAMPARRAY output_buf: its first row pointer is loaded as outptr
35*dfc6aa5cSAndroid Build Coastguard Worker
36*dfc6aa5cSAndroid Build Coastguard Worker%define original_ebp  ebp + 0           ; slot at [ebp] where the prologue stores the pre-alignment stack pointer (the address of the caller's saved ebp)
37*dfc6aa5cSAndroid Build Coastguard Worker%define wk(i)         ebp - (WK_NUM - (i)) * SIZEOF_YMMWORD
38*dfc6aa5cSAndroid Build Coastguard Worker                                        ; ymmword wk[WK_NUM]: scratch slots directly below the YMMWORD-aligned ebp; wk(0) is the lowest address, wk(WK_NUM-1) ends at ebp
39*dfc6aa5cSAndroid Build Coastguard Worker%define WK_NUM        3                 ; number of scratch slots; the column loop uses wk(0)=(B-Y)H, wk(1)=(R-Y)H, wk(2)=(G-Y)H
40*dfc6aa5cSAndroid Build Coastguard Worker%define gotptr        wk(0) - SIZEOF_POINTER  ; void * gotptr: pointer-sized slot immediately below wk(0) in which the GOT address is saved and reloaded per column iteration
41*dfc6aa5cSAndroid Build Coastguard Worker
42*dfc6aa5cSAndroid Build Coastguard Worker    align       32
43*dfc6aa5cSAndroid Build Coastguard Worker    GLOBAL_FUNCTION(jsimd_h2v1_merged_upsample_avx2)
44*dfc6aa5cSAndroid Build Coastguard Worker
45*dfc6aa5cSAndroid Build Coastguard WorkerEXTN(jsimd_h2v1_merged_upsample_avx2):
46*dfc6aa5cSAndroid Build Coastguard Worker    push        ebp
47*dfc6aa5cSAndroid Build Coastguard Worker    mov         eax, esp                     ; eax = original ebp
48*dfc6aa5cSAndroid Build Coastguard Worker    sub         esp, byte 4
49*dfc6aa5cSAndroid Build Coastguard Worker    and         esp, byte (-SIZEOF_YMMWORD)  ; align to 256 bits
50*dfc6aa5cSAndroid Build Coastguard Worker    mov         [esp], eax
51*dfc6aa5cSAndroid Build Coastguard Worker    mov         ebp, esp                     ; ebp = aligned ebp
52*dfc6aa5cSAndroid Build Coastguard Worker    lea         esp, [wk(0)]
53*dfc6aa5cSAndroid Build Coastguard Worker    pushpic     eax                     ; make a room for GOT address
54*dfc6aa5cSAndroid Build Coastguard Worker    push        ebx
55*dfc6aa5cSAndroid Build Coastguard Worker;   push        ecx                     ; need not be preserved
56*dfc6aa5cSAndroid Build Coastguard Worker;   push        edx                     ; need not be preserved
57*dfc6aa5cSAndroid Build Coastguard Worker    push        esi
58*dfc6aa5cSAndroid Build Coastguard Worker    push        edi
59*dfc6aa5cSAndroid Build Coastguard Worker
60*dfc6aa5cSAndroid Build Coastguard Worker    get_GOT     ebx                     ; get GOT address
61*dfc6aa5cSAndroid Build Coastguard Worker    movpic      POINTER [gotptr], ebx   ; save GOT address
62*dfc6aa5cSAndroid Build Coastguard Worker
63*dfc6aa5cSAndroid Build Coastguard Worker    mov         ecx, JDIMENSION [output_width(eax)]  ; col
64*dfc6aa5cSAndroid Build Coastguard Worker    test        ecx, ecx
65*dfc6aa5cSAndroid Build Coastguard Worker    jz          near .return
66*dfc6aa5cSAndroid Build Coastguard Worker
67*dfc6aa5cSAndroid Build Coastguard Worker    push        ecx
68*dfc6aa5cSAndroid Build Coastguard Worker
69*dfc6aa5cSAndroid Build Coastguard Worker    mov         edi, JSAMPIMAGE [input_buf(eax)]
70*dfc6aa5cSAndroid Build Coastguard Worker    mov         ecx, JDIMENSION [in_row_group_ctr(eax)]
71*dfc6aa5cSAndroid Build Coastguard Worker    mov         esi, JSAMPARRAY [edi+0*SIZEOF_JSAMPARRAY]
72*dfc6aa5cSAndroid Build Coastguard Worker    mov         ebx, JSAMPARRAY [edi+1*SIZEOF_JSAMPARRAY]
73*dfc6aa5cSAndroid Build Coastguard Worker    mov         edx, JSAMPARRAY [edi+2*SIZEOF_JSAMPARRAY]
74*dfc6aa5cSAndroid Build Coastguard Worker    mov         edi, JSAMPARRAY [output_buf(eax)]
75*dfc6aa5cSAndroid Build Coastguard Worker    mov         esi, JSAMPROW [esi+ecx*SIZEOF_JSAMPROW]  ; inptr0
76*dfc6aa5cSAndroid Build Coastguard Worker    mov         ebx, JSAMPROW [ebx+ecx*SIZEOF_JSAMPROW]  ; inptr1
77*dfc6aa5cSAndroid Build Coastguard Worker    mov         edx, JSAMPROW [edx+ecx*SIZEOF_JSAMPROW]  ; inptr2
78*dfc6aa5cSAndroid Build Coastguard Worker    mov         edi, JSAMPROW [edi]                      ; outptr
79*dfc6aa5cSAndroid Build Coastguard Worker
80*dfc6aa5cSAndroid Build Coastguard Worker    pop         ecx                     ; col
81*dfc6aa5cSAndroid Build Coastguard Worker
82*dfc6aa5cSAndroid Build Coastguard Worker    alignx      16, 7
83*dfc6aa5cSAndroid Build Coastguard Worker.columnloop:
84*dfc6aa5cSAndroid Build Coastguard Worker    movpic      eax, POINTER [gotptr]   ; load GOT address (eax)
85*dfc6aa5cSAndroid Build Coastguard Worker
86*dfc6aa5cSAndroid Build Coastguard Worker    vmovdqu     ymm6, YMMWORD [ebx]     ; ymm6=Cb(0123456789ABCDEFGHIJKLMNOPQRSTUV)
87*dfc6aa5cSAndroid Build Coastguard Worker    vmovdqu     ymm7, YMMWORD [edx]     ; ymm7=Cr(0123456789ABCDEFGHIJKLMNOPQRSTUV)
88*dfc6aa5cSAndroid Build Coastguard Worker
89*dfc6aa5cSAndroid Build Coastguard Worker    vpxor       ymm1, ymm1, ymm1        ; ymm1=(all 0's)
90*dfc6aa5cSAndroid Build Coastguard Worker    vpcmpeqw    ymm3, ymm3, ymm3
91*dfc6aa5cSAndroid Build Coastguard Worker    vpsllw      ymm3, ymm3, 7           ; ymm3={0xFF80 0xFF80 0xFF80 0xFF80 ..}
92*dfc6aa5cSAndroid Build Coastguard Worker
93*dfc6aa5cSAndroid Build Coastguard Worker    vpermq      ymm6, ymm6, 0xd8        ; ymm6=Cb(01234567GHIJKLMN89ABCDEFOPQRSTUV)
94*dfc6aa5cSAndroid Build Coastguard Worker    vpermq      ymm7, ymm7, 0xd8        ; ymm7=Cr(01234567GHIJKLMN89ABCDEFOPQRSTUV)
95*dfc6aa5cSAndroid Build Coastguard Worker    vpunpcklbw  ymm4, ymm6, ymm1        ; ymm4=Cb(0123456789ABCDEF)=CbL
96*dfc6aa5cSAndroid Build Coastguard Worker    vpunpckhbw  ymm6, ymm6, ymm1        ; ymm6=Cb(GHIJKLMNOPQRSTUV)=CbH
97*dfc6aa5cSAndroid Build Coastguard Worker    vpunpcklbw  ymm0, ymm7, ymm1        ; ymm0=Cr(0123456789ABCDEF)=CrL
98*dfc6aa5cSAndroid Build Coastguard Worker    vpunpckhbw  ymm7, ymm7, ymm1        ; ymm7=Cr(GHIJKLMNOPQRSTUV)=CrH
99*dfc6aa5cSAndroid Build Coastguard Worker
100*dfc6aa5cSAndroid Build Coastguard Worker    vpaddw      ymm5, ymm6, ymm3
101*dfc6aa5cSAndroid Build Coastguard Worker    vpaddw      ymm2, ymm4, ymm3
102*dfc6aa5cSAndroid Build Coastguard Worker    vpaddw      ymm1, ymm7, ymm3
103*dfc6aa5cSAndroid Build Coastguard Worker    vpaddw      ymm3, ymm0, ymm3
104*dfc6aa5cSAndroid Build Coastguard Worker
105*dfc6aa5cSAndroid Build Coastguard Worker    ; (Original)
106*dfc6aa5cSAndroid Build Coastguard Worker    ; R = Y                + 1.40200 * Cr
107*dfc6aa5cSAndroid Build Coastguard Worker    ; G = Y - 0.34414 * Cb - 0.71414 * Cr
108*dfc6aa5cSAndroid Build Coastguard Worker    ; B = Y + 1.77200 * Cb
109*dfc6aa5cSAndroid Build Coastguard Worker    ;
110*dfc6aa5cSAndroid Build Coastguard Worker    ; (This implementation)
111*dfc6aa5cSAndroid Build Coastguard Worker    ; R = Y                + 0.40200 * Cr + Cr
112*dfc6aa5cSAndroid Build Coastguard Worker    ; G = Y - 0.34414 * Cb + 0.28586 * Cr - Cr
113*dfc6aa5cSAndroid Build Coastguard Worker    ; B = Y - 0.22800 * Cb + Cb + Cb
114*dfc6aa5cSAndroid Build Coastguard Worker
115*dfc6aa5cSAndroid Build Coastguard Worker    vpaddw      ymm6, ymm5, ymm5             ; ymm6=2*CbH
116*dfc6aa5cSAndroid Build Coastguard Worker    vpaddw      ymm4, ymm2, ymm2             ; ymm4=2*CbL
117*dfc6aa5cSAndroid Build Coastguard Worker    vpaddw      ymm7, ymm1, ymm1             ; ymm7=2*CrH
118*dfc6aa5cSAndroid Build Coastguard Worker    vpaddw      ymm0, ymm3, ymm3             ; ymm0=2*CrL
119*dfc6aa5cSAndroid Build Coastguard Worker
120*dfc6aa5cSAndroid Build Coastguard Worker    vpmulhw     ymm6, ymm6, [GOTOFF(eax,PW_MF0228)]  ; ymm6=(2*CbH * -FIX(0.22800))
121*dfc6aa5cSAndroid Build Coastguard Worker    vpmulhw     ymm4, ymm4, [GOTOFF(eax,PW_MF0228)]  ; ymm4=(2*CbL * -FIX(0.22800))
122*dfc6aa5cSAndroid Build Coastguard Worker    vpmulhw     ymm7, ymm7, [GOTOFF(eax,PW_F0402)]   ; ymm7=(2*CrH * FIX(0.40200))
123*dfc6aa5cSAndroid Build Coastguard Worker    vpmulhw     ymm0, ymm0, [GOTOFF(eax,PW_F0402)]   ; ymm0=(2*CrL * FIX(0.40200))
124*dfc6aa5cSAndroid Build Coastguard Worker
125*dfc6aa5cSAndroid Build Coastguard Worker    vpaddw      ymm6, ymm6, [GOTOFF(eax,PW_ONE)]
126*dfc6aa5cSAndroid Build Coastguard Worker    vpaddw      ymm4, ymm4, [GOTOFF(eax,PW_ONE)]
127*dfc6aa5cSAndroid Build Coastguard Worker    vpsraw      ymm6, ymm6, 1                     ; ymm6=(CbH * -FIX(0.22800))
128*dfc6aa5cSAndroid Build Coastguard Worker    vpsraw      ymm4, ymm4, 1                     ; ymm4=(CbL * -FIX(0.22800))
129*dfc6aa5cSAndroid Build Coastguard Worker    vpaddw      ymm7, ymm7, [GOTOFF(eax,PW_ONE)]
130*dfc6aa5cSAndroid Build Coastguard Worker    vpaddw      ymm0, ymm0, [GOTOFF(eax,PW_ONE)]
131*dfc6aa5cSAndroid Build Coastguard Worker    vpsraw      ymm7, ymm7, 1                     ; ymm7=(CrH * FIX(0.40200))
132*dfc6aa5cSAndroid Build Coastguard Worker    vpsraw      ymm0, ymm0, 1                     ; ymm0=(CrL * FIX(0.40200))
133*dfc6aa5cSAndroid Build Coastguard Worker
134*dfc6aa5cSAndroid Build Coastguard Worker    vpaddw      ymm6, ymm6, ymm5
135*dfc6aa5cSAndroid Build Coastguard Worker    vpaddw      ymm4, ymm4, ymm2
136*dfc6aa5cSAndroid Build Coastguard Worker    vpaddw      ymm6, ymm6, ymm5                  ; ymm6=(CbH * FIX(1.77200))=(B-Y)H
137*dfc6aa5cSAndroid Build Coastguard Worker    vpaddw      ymm4, ymm4, ymm2                  ; ymm4=(CbL * FIX(1.77200))=(B-Y)L
138*dfc6aa5cSAndroid Build Coastguard Worker    vpaddw      ymm7, ymm7, ymm1                  ; ymm7=(CrH * FIX(1.40200))=(R-Y)H
139*dfc6aa5cSAndroid Build Coastguard Worker    vpaddw      ymm0, ymm0, ymm3                  ; ymm0=(CrL * FIX(1.40200))=(R-Y)L
140*dfc6aa5cSAndroid Build Coastguard Worker
141*dfc6aa5cSAndroid Build Coastguard Worker    vmovdqa     YMMWORD [wk(0)], ymm6             ; wk(0)=(B-Y)H
142*dfc6aa5cSAndroid Build Coastguard Worker    vmovdqa     YMMWORD [wk(1)], ymm7             ; wk(1)=(R-Y)H
143*dfc6aa5cSAndroid Build Coastguard Worker
144*dfc6aa5cSAndroid Build Coastguard Worker    vpunpckhwd  ymm6, ymm5, ymm1
145*dfc6aa5cSAndroid Build Coastguard Worker    vpunpcklwd  ymm5, ymm5, ymm1
146*dfc6aa5cSAndroid Build Coastguard Worker    vpmaddwd    ymm5, ymm5, [GOTOFF(eax,PW_MF0344_F0285)]
147*dfc6aa5cSAndroid Build Coastguard Worker    vpmaddwd    ymm6, ymm6, [GOTOFF(eax,PW_MF0344_F0285)]
148*dfc6aa5cSAndroid Build Coastguard Worker    vpunpckhwd  ymm7, ymm2, ymm3
149*dfc6aa5cSAndroid Build Coastguard Worker    vpunpcklwd  ymm2, ymm2, ymm3
150*dfc6aa5cSAndroid Build Coastguard Worker    vpmaddwd    ymm2, ymm2, [GOTOFF(eax,PW_MF0344_F0285)]
151*dfc6aa5cSAndroid Build Coastguard Worker    vpmaddwd    ymm7, ymm7, [GOTOFF(eax,PW_MF0344_F0285)]
152*dfc6aa5cSAndroid Build Coastguard Worker
153*dfc6aa5cSAndroid Build Coastguard Worker    vpaddd      ymm5, ymm5, [GOTOFF(eax,PD_ONEHALF)]
154*dfc6aa5cSAndroid Build Coastguard Worker    vpaddd      ymm6, ymm6, [GOTOFF(eax,PD_ONEHALF)]
155*dfc6aa5cSAndroid Build Coastguard Worker    vpsrad      ymm5, ymm5, SCALEBITS
156*dfc6aa5cSAndroid Build Coastguard Worker    vpsrad      ymm6, ymm6, SCALEBITS
157*dfc6aa5cSAndroid Build Coastguard Worker    vpaddd      ymm2, ymm2, [GOTOFF(eax,PD_ONEHALF)]
158*dfc6aa5cSAndroid Build Coastguard Worker    vpaddd      ymm7, ymm7, [GOTOFF(eax,PD_ONEHALF)]
159*dfc6aa5cSAndroid Build Coastguard Worker    vpsrad      ymm2, ymm2, SCALEBITS
160*dfc6aa5cSAndroid Build Coastguard Worker    vpsrad      ymm7, ymm7, SCALEBITS
161*dfc6aa5cSAndroid Build Coastguard Worker
162*dfc6aa5cSAndroid Build Coastguard Worker    vpackssdw   ymm5, ymm5, ymm6        ; ymm5=CbH*-FIX(0.344)+CrH*FIX(0.285)
163*dfc6aa5cSAndroid Build Coastguard Worker    vpackssdw   ymm2, ymm2, ymm7        ; ymm2=CbL*-FIX(0.344)+CrL*FIX(0.285)
164*dfc6aa5cSAndroid Build Coastguard Worker    vpsubw      ymm5, ymm5, ymm1        ; ymm5=CbH*-FIX(0.344)+CrH*-FIX(0.714)=(G-Y)H
165*dfc6aa5cSAndroid Build Coastguard Worker    vpsubw      ymm2, ymm2, ymm3        ; ymm2=CbL*-FIX(0.344)+CrL*-FIX(0.714)=(G-Y)L
166*dfc6aa5cSAndroid Build Coastguard Worker
167*dfc6aa5cSAndroid Build Coastguard Worker    vmovdqa     YMMWORD [wk(2)], ymm5   ; wk(2)=(G-Y)H
168*dfc6aa5cSAndroid Build Coastguard Worker
169*dfc6aa5cSAndroid Build Coastguard Worker    mov         al, 2                   ; Yctr
170*dfc6aa5cSAndroid Build Coastguard Worker    jmp         short .Yloop_1st
171*dfc6aa5cSAndroid Build Coastguard Worker    alignx      16, 7
172*dfc6aa5cSAndroid Build Coastguard Worker
173*dfc6aa5cSAndroid Build Coastguard Worker.Yloop_2nd:
174*dfc6aa5cSAndroid Build Coastguard Worker    vmovdqa     ymm0, YMMWORD [wk(1)]   ; ymm0=(R-Y)H
175*dfc6aa5cSAndroid Build Coastguard Worker    vmovdqa     ymm2, YMMWORD [wk(2)]   ; ymm2=(G-Y)H
176*dfc6aa5cSAndroid Build Coastguard Worker    vmovdqa     ymm4, YMMWORD [wk(0)]   ; ymm4=(B-Y)H
177*dfc6aa5cSAndroid Build Coastguard Worker    alignx      16, 7
178*dfc6aa5cSAndroid Build Coastguard Worker
179*dfc6aa5cSAndroid Build Coastguard Worker.Yloop_1st:
180*dfc6aa5cSAndroid Build Coastguard Worker    vmovdqu     ymm7, YMMWORD [esi]     ; ymm7=Y(0123456789ABCDEFGHIJKLMNOPQRSTUV)
181*dfc6aa5cSAndroid Build Coastguard Worker
182*dfc6aa5cSAndroid Build Coastguard Worker    vpcmpeqw    ymm6, ymm6, ymm6
183*dfc6aa5cSAndroid Build Coastguard Worker    vpsrlw      ymm6, ymm6, BYTE_BIT    ; ymm6={0xFF 0x00 0xFF 0x00 ..}
184*dfc6aa5cSAndroid Build Coastguard Worker    vpand       ymm6, ymm6, ymm7        ; ymm6=Y(02468ACEGIKMOQSU)=YE
185*dfc6aa5cSAndroid Build Coastguard Worker    vpsrlw      ymm7, ymm7, BYTE_BIT    ; ymm7=Y(13579BDFHJLNPRTV)=YO
186*dfc6aa5cSAndroid Build Coastguard Worker
187*dfc6aa5cSAndroid Build Coastguard Worker    vmovdqa     ymm1, ymm0              ; ymm1=ymm0=(R-Y)(L/H)
188*dfc6aa5cSAndroid Build Coastguard Worker    vmovdqa     ymm3, ymm2              ; ymm3=ymm2=(G-Y)(L/H)
189*dfc6aa5cSAndroid Build Coastguard Worker    vmovdqa     ymm5, ymm4              ; ymm5=ymm4=(B-Y)(L/H)
190*dfc6aa5cSAndroid Build Coastguard Worker
191*dfc6aa5cSAndroid Build Coastguard Worker    vpaddw      ymm0, ymm0, ymm6        ; ymm0=((R-Y)+YE)=RE=R(02468ACEGIKMOQSU)
192*dfc6aa5cSAndroid Build Coastguard Worker    vpaddw      ymm1, ymm1, ymm7        ; ymm1=((R-Y)+YO)=RO=R(13579BDFHJLNPRTV)
193*dfc6aa5cSAndroid Build Coastguard Worker    vpackuswb   ymm0, ymm0, ymm0        ; ymm0=R(02468ACE********GIKMOQSU********)
194*dfc6aa5cSAndroid Build Coastguard Worker    vpackuswb   ymm1, ymm1, ymm1        ; ymm1=R(13579BDF********HJLNPRTV********)
195*dfc6aa5cSAndroid Build Coastguard Worker
196*dfc6aa5cSAndroid Build Coastguard Worker    vpaddw      ymm2, ymm2, ymm6        ; ymm2=((G-Y)+YE)=GE=G(02468ACEGIKMOQSU)
197*dfc6aa5cSAndroid Build Coastguard Worker    vpaddw      ymm3, ymm3, ymm7        ; ymm3=((G-Y)+YO)=GO=G(13579BDFHJLNPRTV)
198*dfc6aa5cSAndroid Build Coastguard Worker    vpackuswb   ymm2, ymm2, ymm2        ; ymm2=G(02468ACE********GIKMOQSU********)
199*dfc6aa5cSAndroid Build Coastguard Worker    vpackuswb   ymm3, ymm3, ymm3        ; ymm3=G(13579BDF********HJLNPRTV********)
200*dfc6aa5cSAndroid Build Coastguard Worker
201*dfc6aa5cSAndroid Build Coastguard Worker    vpaddw      ymm4, ymm4, ymm6        ; ymm4=((B-Y)+YE)=BE=B(02468ACEGIKMOQSU)
202*dfc6aa5cSAndroid Build Coastguard Worker    vpaddw      ymm5, ymm5, ymm7        ; ymm5=((B-Y)+YO)=BO=B(13579BDFHJLNPRTV)
203*dfc6aa5cSAndroid Build Coastguard Worker    vpackuswb   ymm4, ymm4, ymm4        ; ymm4=B(02468ACE********GIKMOQSU********)
204*dfc6aa5cSAndroid Build Coastguard Worker    vpackuswb   ymm5, ymm5, ymm5        ; ymm5=B(13579BDF********HJLNPRTV********)
205*dfc6aa5cSAndroid Build Coastguard Worker
206*dfc6aa5cSAndroid Build Coastguard Worker%if RGB_PIXELSIZE == 3  ; ---------------
207*dfc6aa5cSAndroid Build Coastguard Worker
208*dfc6aa5cSAndroid Build Coastguard Worker    ; ymmA=(00 02 04 06 08 0A 0C 0E ** 0G 0I 0K 0M 0O 0Q 0S 0U **)
209*dfc6aa5cSAndroid Build Coastguard Worker    ; ymmB=(01 03 05 07 09 0B 0D 0F ** 0H 0J 0L 0N 0P 0R 0T 0V **)
210*dfc6aa5cSAndroid Build Coastguard Worker    ; ymmC=(10 12 14 16 18 1A 1C 1E ** 1G 1I 1K 1M 1O 1Q 1S 1U **)
211*dfc6aa5cSAndroid Build Coastguard Worker    ; ymmD=(11 13 15 17 19 1B 1D 1F ** 1H 1J 1L 1N 1P 1R 1T 1V **)
212*dfc6aa5cSAndroid Build Coastguard Worker    ; ymmE=(20 22 24 26 28 2A 2C 2E ** 2G 2I 2K 2M 2O 2Q 2S 2U **)
213*dfc6aa5cSAndroid Build Coastguard Worker    ; ymmF=(21 23 25 27 29 2B 2D 2F ** 2H 2J 2L 2N 2P 2R 2T 2V **)
214*dfc6aa5cSAndroid Build Coastguard Worker    ; ymmG=(** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** **)
215*dfc6aa5cSAndroid Build Coastguard Worker    ; ymmH=(** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** **)
216*dfc6aa5cSAndroid Build Coastguard Worker
217*dfc6aa5cSAndroid Build Coastguard Worker    vpunpcklbw  ymmA, ymmA, ymmC        ; ymmA=(00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E
218*dfc6aa5cSAndroid Build Coastguard Worker                                        ;       0G 1G 0I 1I 0K 1K 0M 1M 0O 1O 0Q 1Q 0S 1S 0U 1U)
219*dfc6aa5cSAndroid Build Coastguard Worker    vpunpcklbw  ymmE, ymmE, ymmB        ; ymmE=(20 01 22 03 24 05 26 07 28 09 2A 0B 2C 0D 2E 0F
220*dfc6aa5cSAndroid Build Coastguard Worker                                        ;       2G 0H 2I 0J 2K 0L 2M 0N 2O 0P 2Q 0R 2S 0T 2U 0V)
221*dfc6aa5cSAndroid Build Coastguard Worker    vpunpcklbw  ymmD, ymmD, ymmF        ; ymmD=(11 21 13 23 15 25 17 27 19 29 1B 2B 1D 2D 1F 2F
222*dfc6aa5cSAndroid Build Coastguard Worker                                        ;       1H 2H 1J 2J 1L 2L 1N 2N 1P 2P 1R 2R 1T 2T 1V 2V)
223*dfc6aa5cSAndroid Build Coastguard Worker
224*dfc6aa5cSAndroid Build Coastguard Worker    vpsrldq     ymmH, ymmA, 2           ; ymmH=(02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E 0G 1G
225*dfc6aa5cSAndroid Build Coastguard Worker                                        ;       0I 1I 0K 1K 0M 1M 0O 1O 0Q 1Q 0S 1S 0U 1U -- --)
226*dfc6aa5cSAndroid Build Coastguard Worker    vpunpckhwd  ymmG, ymmA, ymmE        ; ymmG=(08 18 28 09 0A 1A 2A 0B 0C 1C 2C 0D 0E 1E 2E 0F
227*dfc6aa5cSAndroid Build Coastguard Worker                                        ;       0O 1O 2O 0P 0Q 1Q 2Q 0R 0S 1S 2S 0T 0U 1U 2U 0V)
228*dfc6aa5cSAndroid Build Coastguard Worker    vpunpcklwd  ymmA, ymmA, ymmE        ; ymmA=(00 10 20 01 02 12 22 03 04 14 24 05 06 16 26 07
229*dfc6aa5cSAndroid Build Coastguard Worker                                        ;       0G 1G 2G 0H 0I 1I 2I 0J 0K 1K 2K 0L 0M 1M 2M 0N)
230*dfc6aa5cSAndroid Build Coastguard Worker
231*dfc6aa5cSAndroid Build Coastguard Worker    vpsrldq     ymmE, ymmE, 2           ; ymmE=(22 03 24 05 26 07 28 09 2A 0B 2C 0D 2E 0F 2G 0H
232*dfc6aa5cSAndroid Build Coastguard Worker                                        ;       2I 0J 2K 0L 2M 0N 2O 0P 2Q 0R 2S 0T 2U 0V -- --)
233*dfc6aa5cSAndroid Build Coastguard Worker
234*dfc6aa5cSAndroid Build Coastguard Worker    vpsrldq     ymmB, ymmD, 2           ; ymmB=(13 23 15 25 17 27 19 29 1B 2B 1D 2D 1F 2F 1H 2H
235*dfc6aa5cSAndroid Build Coastguard Worker                                        ;       1J 2J 1L 2L 1N 2N 1P 2P 1R 2R 1T 2T 1V 2V -- --)
236*dfc6aa5cSAndroid Build Coastguard Worker    vpunpckhwd  ymmC, ymmD, ymmH        ; ymmC=(19 29 0A 1A 1B 2B 0C 1C 1D 2D 0E 1E 1F 2F 0G 1G
237*dfc6aa5cSAndroid Build Coastguard Worker                                        ;       1P 2P 0Q 1Q 1R 2R 0S 1S 1T 2T 0U 1U 1V 2V -- --)
238*dfc6aa5cSAndroid Build Coastguard Worker    vpunpcklwd  ymmD, ymmD, ymmH        ; ymmD=(11 21 02 12 13 23 04 14 15 25 06 16 17 27 08 18
239*dfc6aa5cSAndroid Build Coastguard Worker                                        ;       1H 2H 0I 1I 1J 2J 0K 1K 1L 2L 0M 1M 1N 2N 0O 1O)
240*dfc6aa5cSAndroid Build Coastguard Worker
241*dfc6aa5cSAndroid Build Coastguard Worker    vpunpckhwd  ymmF, ymmE, ymmB        ; ymmF=(2A 0B 1B 2B 2C 0D 1D 2D 2E 0F 1F 2F 2G 0H 1H 2H
242*dfc6aa5cSAndroid Build Coastguard Worker                                        ;       2Q 0R 1R 2R 2S 0T 1T 2T 2U 0V 1V 2V -- -- -- --)
243*dfc6aa5cSAndroid Build Coastguard Worker    vpunpcklwd  ymmE, ymmE, ymmB        ; ymmE=(22 03 13 23 24 05 15 25 26 07 17 27 28 09 19 29
244*dfc6aa5cSAndroid Build Coastguard Worker                                        ;       2I 0J 1J 2J 2K 0L 1L 2L 2M 0N 1N 2N 2O 0P 1P 2P)
245*dfc6aa5cSAndroid Build Coastguard Worker
246*dfc6aa5cSAndroid Build Coastguard Worker    vpshufd     ymmH, ymmA, 0x4E        ; ymmH=(04 14 24 05 06 16 26 07 00 10 20 01 02 12 22 03
247*dfc6aa5cSAndroid Build Coastguard Worker                                        ;       0K 1K 2K 0L 0M 1M 2M 0N 0G 1G 2G 0H 0I 1I 2I 0J)
248*dfc6aa5cSAndroid Build Coastguard Worker    vpunpckldq  ymmA, ymmA, ymmD        ; ymmA=(00 10 20 01 11 21 02 12 02 12 22 03 13 23 04 14
249*dfc6aa5cSAndroid Build Coastguard Worker                                        ;       0G 1G 2G 0H 1H 2H 0I 1I 0I 1I 2I 0J 1J 2J 0K 1K)
250*dfc6aa5cSAndroid Build Coastguard Worker    vpunpckhdq  ymmD, ymmD, ymmE        ; ymmD=(15 25 06 16 26 07 17 27 17 27 08 18 28 09 19 29
251*dfc6aa5cSAndroid Build Coastguard Worker                                        ;       1L 2L 0M 1M 2M 0N 1N 2N 1N 2N 0O 1O 2O 0P 1P 2P)
252*dfc6aa5cSAndroid Build Coastguard Worker    vpunpckldq  ymmE, ymmE, ymmH        ; ymmE=(22 03 13 23 04 14 24 05 24 05 15 25 06 16 26 07
253*dfc6aa5cSAndroid Build Coastguard Worker                                        ;       2I 0J 1J 2J 0K 1K 2K 0L 2K 0L 1L 2L 0M 1M 2M 0N)
254*dfc6aa5cSAndroid Build Coastguard Worker
255*dfc6aa5cSAndroid Build Coastguard Worker    vpshufd     ymmH, ymmG, 0x4E        ; ymmH=(0C 1C 2C 0D 0E 1E 2E 0F 08 18 28 09 0A 1A 2A 0B
256*dfc6aa5cSAndroid Build Coastguard Worker                                        ;       0S 1S 2S 0T 0U 1U 2U 0V 0O 1O 2O 0P 0Q 1Q 2Q 0R)
257*dfc6aa5cSAndroid Build Coastguard Worker    vpunpckldq  ymmG, ymmG, ymmC        ; ymmG=(08 18 28 09 19 29 0A 1A 0A 1A 2A 0B 1B 2B 0C 1C
258*dfc6aa5cSAndroid Build Coastguard Worker                                        ;       0O 1O 2O 0P 1P 2P 0Q 1Q 0Q 1Q 2Q 0R 1R 2R 0S 1S)
259*dfc6aa5cSAndroid Build Coastguard Worker    vpunpckhdq  ymmC, ymmC, ymmF        ; ymmC=(1D 2D 0E 1E 2E 0F 1F 2F 1F 2F 0G 1G 2G 0H 1H 2H
260*dfc6aa5cSAndroid Build Coastguard Worker                                        ;       1T 2T 0U 1U 2U 0V 1V 2V 1V 2V -- -- -- -- -- --)
261*dfc6aa5cSAndroid Build Coastguard Worker    vpunpckldq  ymmF, ymmF, ymmH        ; ymmF=(2A 0B 1B 2B 0C 1C 2C 0D 2C 0D 1D 2D 0E 1E 2E 0F
262*dfc6aa5cSAndroid Build Coastguard Worker                                        ;       2Q 0R 1R 2R 0S 1S 2S 0T 2S 0T 1T 2T 0U 1U 2U 0V)
263*dfc6aa5cSAndroid Build Coastguard Worker
264*dfc6aa5cSAndroid Build Coastguard Worker    vpunpcklqdq ymmH, ymmA, ymmE        ; ymmH=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05
265*dfc6aa5cSAndroid Build Coastguard Worker                                        ;       0G 1G 2G 0H 1H 2H 0I 1I 2I 0J 1J 2J 0K 1K 2K 0L)
266*dfc6aa5cSAndroid Build Coastguard Worker    vpunpcklqdq ymmG, ymmD, ymmG        ; ymmG=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A
267*dfc6aa5cSAndroid Build Coastguard Worker                                        ;       1L 2L 0M 1M 2M 0N 1N 2N 0O 1O 2O 0P 1P 2P 0Q 1Q)
268*dfc6aa5cSAndroid Build Coastguard Worker    vpunpcklqdq ymmC, ymmF, ymmC        ; ymmC=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F
269*dfc6aa5cSAndroid Build Coastguard Worker                                        ;       2Q 0R 1R 2R 0S 1S 2S 0T 1T 2T 0U 1U 2U 0V 1V 2V)
270*dfc6aa5cSAndroid Build Coastguard Worker
271*dfc6aa5cSAndroid Build Coastguard Worker    vperm2i128  ymmA, ymmH, ymmG, 0x20  ; ymmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05
272*dfc6aa5cSAndroid Build Coastguard Worker                                        ;       15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A)
273*dfc6aa5cSAndroid Build Coastguard Worker    vperm2i128  ymmD, ymmC, ymmH, 0x30  ; ymmD=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F
274*dfc6aa5cSAndroid Build Coastguard Worker                                        ;       0G 1G 2G 0H 1H 2H 0I 1I 2I 0J 1J 2J 0K 1K 2K 0L)
275*dfc6aa5cSAndroid Build Coastguard Worker    vperm2i128  ymmF, ymmG, ymmC, 0x31  ; ymmF=(1L 2L 0M 1M 2M 0N 1N 2N 0O 1O 2O 0P 1P 2P 0Q 1Q
276*dfc6aa5cSAndroid Build Coastguard Worker                                        ;       2Q 0R 1R 2R 0S 1S 2S 0T 1T 2T 0U 1U 2U 0V 1V 2V)
277*dfc6aa5cSAndroid Build Coastguard Worker
278*dfc6aa5cSAndroid Build Coastguard Worker    cmp         ecx, byte SIZEOF_YMMWORD
279*dfc6aa5cSAndroid Build Coastguard Worker    jb          short .column_st64
280*dfc6aa5cSAndroid Build Coastguard Worker
281*dfc6aa5cSAndroid Build Coastguard Worker    test        edi, SIZEOF_YMMWORD-1
282*dfc6aa5cSAndroid Build Coastguard Worker    jnz         short .out1
283*dfc6aa5cSAndroid Build Coastguard Worker    ; --(aligned)-------------------
284*dfc6aa5cSAndroid Build Coastguard Worker    vmovntdq    YMMWORD [edi+0*SIZEOF_YMMWORD], ymmA
285*dfc6aa5cSAndroid Build Coastguard Worker    vmovntdq    YMMWORD [edi+1*SIZEOF_YMMWORD], ymmD
286*dfc6aa5cSAndroid Build Coastguard Worker    vmovntdq    YMMWORD [edi+2*SIZEOF_YMMWORD], ymmF
287*dfc6aa5cSAndroid Build Coastguard Worker    jmp         short .out0
288*dfc6aa5cSAndroid Build Coastguard Worker.out1:  ; --(unaligned)-----------------
289*dfc6aa5cSAndroid Build Coastguard Worker    vmovdqu     YMMWORD [edi+0*SIZEOF_YMMWORD], ymmA
290*dfc6aa5cSAndroid Build Coastguard Worker    vmovdqu     YMMWORD [edi+1*SIZEOF_YMMWORD], ymmD
291*dfc6aa5cSAndroid Build Coastguard Worker    vmovdqu     YMMWORD [edi+2*SIZEOF_YMMWORD], ymmF
292*dfc6aa5cSAndroid Build Coastguard Worker.out0:
293*dfc6aa5cSAndroid Build Coastguard Worker    add         edi, byte RGB_PIXELSIZE*SIZEOF_YMMWORD  ; outptr
294*dfc6aa5cSAndroid Build Coastguard Worker    sub         ecx, byte SIZEOF_YMMWORD
295*dfc6aa5cSAndroid Build Coastguard Worker    jz          near .endcolumn
296*dfc6aa5cSAndroid Build Coastguard Worker
297*dfc6aa5cSAndroid Build Coastguard Worker    add         esi, byte SIZEOF_YMMWORD  ; inptr0
298*dfc6aa5cSAndroid Build Coastguard Worker    dec         al                        ; Yctr
299*dfc6aa5cSAndroid Build Coastguard Worker    jnz         near .Yloop_2nd
300*dfc6aa5cSAndroid Build Coastguard Worker
301*dfc6aa5cSAndroid Build Coastguard Worker    add         ebx, byte SIZEOF_YMMWORD  ; inptr1
302*dfc6aa5cSAndroid Build Coastguard Worker    add         edx, byte SIZEOF_YMMWORD  ; inptr2
303*dfc6aa5cSAndroid Build Coastguard Worker    jmp         near .columnloop
304*dfc6aa5cSAndroid Build Coastguard Worker    alignx      16, 7
305*dfc6aa5cSAndroid Build Coastguard Worker
306*dfc6aa5cSAndroid Build Coastguard Worker.column_st64:
307*dfc6aa5cSAndroid Build Coastguard Worker    lea         ecx, [ecx+ecx*2]            ; imul ecx, RGB_PIXELSIZE
308*dfc6aa5cSAndroid Build Coastguard Worker    cmp         ecx, byte 2*SIZEOF_YMMWORD
309*dfc6aa5cSAndroid Build Coastguard Worker    jb          short .column_st32
310*dfc6aa5cSAndroid Build Coastguard Worker    vmovdqu     YMMWORD [edi+0*SIZEOF_YMMWORD], ymmA
311*dfc6aa5cSAndroid Build Coastguard Worker    vmovdqu     YMMWORD [edi+1*SIZEOF_YMMWORD], ymmD
312*dfc6aa5cSAndroid Build Coastguard Worker    add         edi, byte 2*SIZEOF_YMMWORD  ; outptr
313*dfc6aa5cSAndroid Build Coastguard Worker    vmovdqa     ymmA, ymmF
314*dfc6aa5cSAndroid Build Coastguard Worker    sub         ecx, byte 2*SIZEOF_YMMWORD
315*dfc6aa5cSAndroid Build Coastguard Worker    jmp         short .column_st31
316*dfc6aa5cSAndroid Build Coastguard Worker.column_st32:
317*dfc6aa5cSAndroid Build Coastguard Worker    cmp         ecx, byte SIZEOF_YMMWORD
318*dfc6aa5cSAndroid Build Coastguard Worker    jb          short .column_st31
319*dfc6aa5cSAndroid Build Coastguard Worker    vmovdqu     YMMWORD [edi+0*SIZEOF_YMMWORD], ymmA
320*dfc6aa5cSAndroid Build Coastguard Worker    add         edi, byte SIZEOF_YMMWORD    ; outptr
321*dfc6aa5cSAndroid Build Coastguard Worker    vmovdqa     ymmA, ymmD
322*dfc6aa5cSAndroid Build Coastguard Worker    sub         ecx, byte SIZEOF_YMMWORD
323*dfc6aa5cSAndroid Build Coastguard Worker    jmp         short .column_st31
324*dfc6aa5cSAndroid Build Coastguard Worker.column_st31:
325*dfc6aa5cSAndroid Build Coastguard Worker    cmp         ecx, byte SIZEOF_XMMWORD
326*dfc6aa5cSAndroid Build Coastguard Worker    jb          short .column_st15
327*dfc6aa5cSAndroid Build Coastguard Worker    vmovdqu     XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
328*dfc6aa5cSAndroid Build Coastguard Worker    add         edi, byte SIZEOF_XMMWORD    ; outptr
329*dfc6aa5cSAndroid Build Coastguard Worker    vperm2i128  ymmA, ymmA, ymmA, 1
330*dfc6aa5cSAndroid Build Coastguard Worker    sub         ecx, byte SIZEOF_XMMWORD
331*dfc6aa5cSAndroid Build Coastguard Worker.column_st15:
332*dfc6aa5cSAndroid Build Coastguard Worker    ; Store the lower 8 bytes of xmmA to the output when it has enough
333*dfc6aa5cSAndroid Build Coastguard Worker    ; space.
334*dfc6aa5cSAndroid Build Coastguard Worker    cmp         ecx, byte SIZEOF_MMWORD
335*dfc6aa5cSAndroid Build Coastguard Worker    jb          short .column_st7
336*dfc6aa5cSAndroid Build Coastguard Worker    vmovq       XMM_MMWORD [edi], xmmA
337*dfc6aa5cSAndroid Build Coastguard Worker    add         edi, byte SIZEOF_MMWORD
338*dfc6aa5cSAndroid Build Coastguard Worker    sub         ecx, byte SIZEOF_MMWORD
339*dfc6aa5cSAndroid Build Coastguard Worker    vpsrldq     xmmA, xmmA, SIZEOF_MMWORD
340*dfc6aa5cSAndroid Build Coastguard Worker.column_st7:
341*dfc6aa5cSAndroid Build Coastguard Worker    ; Store the lower 4 bytes of xmmA to the output when it has enough
342*dfc6aa5cSAndroid Build Coastguard Worker    ; space.
343*dfc6aa5cSAndroid Build Coastguard Worker    cmp         ecx, byte SIZEOF_DWORD
344*dfc6aa5cSAndroid Build Coastguard Worker    jb          short .column_st3
345*dfc6aa5cSAndroid Build Coastguard Worker    vmovd       XMM_DWORD [edi], xmmA
346*dfc6aa5cSAndroid Build Coastguard Worker    add         edi, byte SIZEOF_DWORD
347*dfc6aa5cSAndroid Build Coastguard Worker    sub         ecx, byte SIZEOF_DWORD
348*dfc6aa5cSAndroid Build Coastguard Worker    vpsrldq     xmmA, xmmA, SIZEOF_DWORD
349*dfc6aa5cSAndroid Build Coastguard Worker.column_st3:
350*dfc6aa5cSAndroid Build Coastguard Worker    ; Store the lower 2 bytes of eax to the output when it has enough
351*dfc6aa5cSAndroid Build Coastguard Worker    ; space.
352*dfc6aa5cSAndroid Build Coastguard Worker    vmovd       eax, xmmA
353*dfc6aa5cSAndroid Build Coastguard Worker    cmp         ecx, byte SIZEOF_WORD
354*dfc6aa5cSAndroid Build Coastguard Worker    jb          short .column_st1
355*dfc6aa5cSAndroid Build Coastguard Worker    mov         word [edi], ax
356*dfc6aa5cSAndroid Build Coastguard Worker    add         edi, byte SIZEOF_WORD
357*dfc6aa5cSAndroid Build Coastguard Worker    sub         ecx, byte SIZEOF_WORD
358*dfc6aa5cSAndroid Build Coastguard Worker    shr         eax, 16
359*dfc6aa5cSAndroid Build Coastguard Worker.column_st1:
360*dfc6aa5cSAndroid Build Coastguard Worker    ; Store the lower 1 byte of eax to the output when it has enough
361*dfc6aa5cSAndroid Build Coastguard Worker    ; space.
362*dfc6aa5cSAndroid Build Coastguard Worker    test        ecx, ecx
363*dfc6aa5cSAndroid Build Coastguard Worker    jz          short .endcolumn
364*dfc6aa5cSAndroid Build Coastguard Worker    mov         byte [edi], al
365*dfc6aa5cSAndroid Build Coastguard Worker
366*dfc6aa5cSAndroid Build Coastguard Worker%else  ; RGB_PIXELSIZE == 4 ; -----------
367*dfc6aa5cSAndroid Build Coastguard Worker
368*dfc6aa5cSAndroid Build Coastguard Worker%ifdef RGBX_FILLER_0XFF
369*dfc6aa5cSAndroid Build Coastguard Worker    vpcmpeqb    ymm6, ymm6, ymm6        ; ymm6=XE=X(02468ACE********GIKMOQSU********)
370*dfc6aa5cSAndroid Build Coastguard Worker    vpcmpeqb    ymm7, ymm7, ymm7        ; ymm7=XO=X(13579BDF********HJLNPRTV********)
371*dfc6aa5cSAndroid Build Coastguard Worker%else
372*dfc6aa5cSAndroid Build Coastguard Worker    vpxor       ymm6, ymm6, ymm6        ; ymm6=XE=X(02468ACE********GIKMOQSU********)
373*dfc6aa5cSAndroid Build Coastguard Worker    vpxor       ymm7, ymm7, ymm7        ; ymm7=XO=X(13579BDF********HJLNPRTV********)
374*dfc6aa5cSAndroid Build Coastguard Worker%endif
375*dfc6aa5cSAndroid Build Coastguard Worker    ; ymmA=(00 02 04 06 08 0A 0C 0E ** 0G 0I 0K 0M 0O 0Q 0S 0U **)
376*dfc6aa5cSAndroid Build Coastguard Worker    ; ymmB=(01 03 05 07 09 0B 0D 0F ** 0H 0J 0L 0N 0P 0R 0T 0V **)
377*dfc6aa5cSAndroid Build Coastguard Worker    ; ymmC=(10 12 14 16 18 1A 1C 1E ** 1G 1I 1K 1M 1O 1Q 1S 1U **)
378*dfc6aa5cSAndroid Build Coastguard Worker    ; ymmD=(11 13 15 17 19 1B 1D 1F ** 1H 1J 1L 1N 1P 1R 1T 1V **)
379*dfc6aa5cSAndroid Build Coastguard Worker    ; ymmE=(20 22 24 26 28 2A 2C 2E ** 2G 2I 2K 2M 2O 2Q 2S 2U **)
380*dfc6aa5cSAndroid Build Coastguard Worker    ; ymmF=(21 23 25 27 29 2B 2D 2F ** 2H 2J 2L 2N 2P 2R 2T 2V **)
381*dfc6aa5cSAndroid Build Coastguard Worker    ; ymmG=(30 32 34 36 38 3A 3C 3E ** 3G 3I 3K 3M 3O 3Q 3S 3U **)
382*dfc6aa5cSAndroid Build Coastguard Worker    ; ymmH=(31 33 35 37 39 3B 3D 3F ** 3H 3J 3L 3N 3P 3R 3T 3V **)
383*dfc6aa5cSAndroid Build Coastguard Worker
384*dfc6aa5cSAndroid Build Coastguard Worker    vpunpcklbw  ymmA, ymmA, ymmC        ; ymmA=(00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E
385*dfc6aa5cSAndroid Build Coastguard Worker                                        ;       0G 1G 0I 1I 0K 1K 0M 1M 0O 1O 0Q 1Q 0S 1S 0U 1U)
386*dfc6aa5cSAndroid Build Coastguard Worker    vpunpcklbw  ymmE, ymmE, ymmG        ; ymmE=(20 30 22 32 24 34 26 36 28 38 2A 3A 2C 3C 2E 3E
387*dfc6aa5cSAndroid Build Coastguard Worker                                        ;       2G 3G 2I 3I 2K 3K 2M 3M 2O 3O 2Q 3Q 2S 3S 2U 3U)
388*dfc6aa5cSAndroid Build Coastguard Worker    vpunpcklbw  ymmB, ymmB, ymmD        ; ymmB=(01 11 03 13 05 15 07 17 09 19 0B 1B 0D 1D 0F 1F
389*dfc6aa5cSAndroid Build Coastguard Worker                                        ;       0H 1H 0J 1J 0L 1L 0N 1N 0P 1P 0R 1R 0T 1T 0V 1V)
390*dfc6aa5cSAndroid Build Coastguard Worker    vpunpcklbw  ymmF, ymmF, ymmH        ; ymmF=(21 31 23 33 25 35 27 37 29 39 2B 3B 2D 3D 2F 3F
391*dfc6aa5cSAndroid Build Coastguard Worker                                        ;       2H 3H 2J 3J 2L 3L 2N 3N 2P 3P 2R 3R 2T 3T 2V 3V)
392*dfc6aa5cSAndroid Build Coastguard Worker
393*dfc6aa5cSAndroid Build Coastguard Worker    vpunpckhwd  ymmC, ymmA, ymmE        ; ymmC=(08 18 28 38 0A 1A 2A 3A 0C 1C 2C 3C 0E 1E 2E 3E
394*dfc6aa5cSAndroid Build Coastguard Worker                                        ;       0O 1O 2O 3O 0Q 1Q 2Q 3Q 0S 1S 2S 3S 0U 1U 2U 3U)
395*dfc6aa5cSAndroid Build Coastguard Worker    vpunpcklwd  ymmA, ymmA, ymmE        ; ymmA=(00 10 20 30 02 12 22 32 04 14 24 34 06 16 26 36
396*dfc6aa5cSAndroid Build Coastguard Worker                                        ;       0G 1G 2G 3G 0I 1I 2I 3I 0K 1K 2K 3K 0M 1M 2M 3M)
397*dfc6aa5cSAndroid Build Coastguard Worker    vpunpckhwd  ymmG, ymmB, ymmF        ; ymmG=(09 19 29 39 0B 1B 2B 3B 0D 1D 2D 3D 0F 1F 2F 3F
398*dfc6aa5cSAndroid Build Coastguard Worker                                        ;       0P 1P 2P 3P 0R 1R 2R 3R 0T 1T 2T 3T 0V 1V 2V 3V)
399*dfc6aa5cSAndroid Build Coastguard Worker    vpunpcklwd  ymmB, ymmB, ymmF        ; ymmB=(01 11 21 31 03 13 23 33 05 15 25 35 07 17 27 37
400*dfc6aa5cSAndroid Build Coastguard Worker                                        ;       0H 1H 2H 3H 0J 1J 2J 3J 0L 1L 2L 3L 0N 1N 2N 3N)
401*dfc6aa5cSAndroid Build Coastguard Worker
402*dfc6aa5cSAndroid Build Coastguard Worker    vpunpckhdq  ymmE, ymmA, ymmB        ; ymmE=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37
403*dfc6aa5cSAndroid Build Coastguard Worker                                        ;       0K 1K 2K 3K 0L 1L 2L 3L 0M 1M 2M 3M 0N 1N 2N 3N)
404*dfc6aa5cSAndroid Build Coastguard Worker    vpunpckldq  ymmB, ymmA, ymmB        ; ymmB=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
405*dfc6aa5cSAndroid Build Coastguard Worker                                        ;       0G 1G 2G 3G 0H 1H 2H 3H 0I 1I 2I 3I 0J 1J 2J 3J)
406*dfc6aa5cSAndroid Build Coastguard Worker    vpunpckhdq  ymmF, ymmC, ymmG        ; ymmF=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F
407*dfc6aa5cSAndroid Build Coastguard Worker                                        ;       0S 1S 2S 3S 0T 1T 2T 3T 0U 1U 2U 3U 0V 1V 2V 3V)
408*dfc6aa5cSAndroid Build Coastguard Worker    vpunpckldq  ymmG, ymmC, ymmG        ; ymmG=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B
409*dfc6aa5cSAndroid Build Coastguard Worker                                        ;       0O 1O 2O 3O 0P 1P 2P 3P 0Q 1Q 2Q 3Q 0R 1R 2R 3R)
410*dfc6aa5cSAndroid Build Coastguard Worker
411*dfc6aa5cSAndroid Build Coastguard Worker    vperm2i128  ymmA, ymmB, ymmE, 0x20  ; ymmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
412*dfc6aa5cSAndroid Build Coastguard Worker                                        ;       04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37)
413*dfc6aa5cSAndroid Build Coastguard Worker    vperm2i128  ymmD, ymmG, ymmF, 0x20  ; ymmD=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B
414*dfc6aa5cSAndroid Build Coastguard Worker                                        ;       0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F)
415*dfc6aa5cSAndroid Build Coastguard Worker    vperm2i128  ymmC, ymmB, ymmE, 0x31  ; ymmC=(0G 1G 2G 3G 0H 1H 2H 3H 0I 1I 2I 3I 0J 1J 2J 3J
416*dfc6aa5cSAndroid Build Coastguard Worker                                        ;       0K 1K 2K 3K 0L 1L 2L 3L 0M 1M 2M 3M 0N 1N 2N 3N)
417*dfc6aa5cSAndroid Build Coastguard Worker    vperm2i128  ymmH, ymmG, ymmF, 0x31  ; ymmH=(0O 1O 2O 3O 0P 1P 2P 3P 0Q 1Q 2Q 3Q 0R 1R 2R 3R
418*dfc6aa5cSAndroid Build Coastguard Worker                                        ;       0S 1S 2S 3S 0T 1T 2T 3T 0U 1U 2U 3U 0V 1V 2V 3V)
419*dfc6aa5cSAndroid Build Coastguard Worker
420*dfc6aa5cSAndroid Build Coastguard Worker    cmp         ecx, byte SIZEOF_YMMWORD
421*dfc6aa5cSAndroid Build Coastguard Worker    jb          short .column_st64
422*dfc6aa5cSAndroid Build Coastguard Worker
423*dfc6aa5cSAndroid Build Coastguard Worker    test        edi, SIZEOF_YMMWORD-1
424*dfc6aa5cSAndroid Build Coastguard Worker    jnz         short .out1
425*dfc6aa5cSAndroid Build Coastguard Worker    ; --(aligned)-------------------
426*dfc6aa5cSAndroid Build Coastguard Worker    vmovntdq    YMMWORD [edi+0*SIZEOF_YMMWORD], ymmA
427*dfc6aa5cSAndroid Build Coastguard Worker    vmovntdq    YMMWORD [edi+1*SIZEOF_YMMWORD], ymmD
428*dfc6aa5cSAndroid Build Coastguard Worker    vmovntdq    YMMWORD [edi+2*SIZEOF_YMMWORD], ymmC
429*dfc6aa5cSAndroid Build Coastguard Worker    vmovntdq    YMMWORD [edi+3*SIZEOF_YMMWORD], ymmH
430*dfc6aa5cSAndroid Build Coastguard Worker    jmp         short .out0
431*dfc6aa5cSAndroid Build Coastguard Worker.out1:  ; --(unaligned)-----------------
432*dfc6aa5cSAndroid Build Coastguard Worker    vmovdqu     YMMWORD [edi+0*SIZEOF_YMMWORD], ymmA
433*dfc6aa5cSAndroid Build Coastguard Worker    vmovdqu     YMMWORD [edi+1*SIZEOF_YMMWORD], ymmD
434*dfc6aa5cSAndroid Build Coastguard Worker    vmovdqu     YMMWORD [edi+2*SIZEOF_YMMWORD], ymmC
435*dfc6aa5cSAndroid Build Coastguard Worker    vmovdqu     YMMWORD [edi+3*SIZEOF_YMMWORD], ymmH
436*dfc6aa5cSAndroid Build Coastguard Worker.out0:
437*dfc6aa5cSAndroid Build Coastguard Worker    add         edi, RGB_PIXELSIZE*SIZEOF_YMMWORD  ; outptr
438*dfc6aa5cSAndroid Build Coastguard Worker    sub         ecx, byte SIZEOF_YMMWORD
439*dfc6aa5cSAndroid Build Coastguard Worker    jz          near .endcolumn
440*dfc6aa5cSAndroid Build Coastguard Worker
441*dfc6aa5cSAndroid Build Coastguard Worker    add         esi, byte SIZEOF_YMMWORD  ; inptr0
442*dfc6aa5cSAndroid Build Coastguard Worker    dec         al
443*dfc6aa5cSAndroid Build Coastguard Worker    jnz         near .Yloop_2nd
444*dfc6aa5cSAndroid Build Coastguard Worker
445*dfc6aa5cSAndroid Build Coastguard Worker    add         ebx, byte SIZEOF_YMMWORD  ; inptr1
446*dfc6aa5cSAndroid Build Coastguard Worker    add         edx, byte SIZEOF_YMMWORD  ; inptr2
447*dfc6aa5cSAndroid Build Coastguard Worker    jmp         near .columnloop
448*dfc6aa5cSAndroid Build Coastguard Worker    alignx      16, 7
449*dfc6aa5cSAndroid Build Coastguard Worker
450*dfc6aa5cSAndroid Build Coastguard Worker.column_st64:
451*dfc6aa5cSAndroid Build Coastguard Worker    cmp         ecx, byte SIZEOF_YMMWORD/2
452*dfc6aa5cSAndroid Build Coastguard Worker    jb          short .column_st32
453*dfc6aa5cSAndroid Build Coastguard Worker    vmovdqu     YMMWORD [edi+0*SIZEOF_YMMWORD], ymmA
454*dfc6aa5cSAndroid Build Coastguard Worker    vmovdqu     YMMWORD [edi+1*SIZEOF_YMMWORD], ymmD
455*dfc6aa5cSAndroid Build Coastguard Worker    add         edi, byte 2*SIZEOF_YMMWORD  ; outptr
456*dfc6aa5cSAndroid Build Coastguard Worker    vmovdqa     ymmA, ymmC
457*dfc6aa5cSAndroid Build Coastguard Worker    vmovdqa     ymmD, ymmH
458*dfc6aa5cSAndroid Build Coastguard Worker    sub         ecx, byte SIZEOF_YMMWORD/2
459*dfc6aa5cSAndroid Build Coastguard Worker.column_st32:
460*dfc6aa5cSAndroid Build Coastguard Worker    cmp         ecx, byte SIZEOF_YMMWORD/4
461*dfc6aa5cSAndroid Build Coastguard Worker    jb          short .column_st16
462*dfc6aa5cSAndroid Build Coastguard Worker    vmovdqu     YMMWORD [edi+0*SIZEOF_YMMWORD], ymmA
463*dfc6aa5cSAndroid Build Coastguard Worker    add         edi, byte SIZEOF_YMMWORD    ; outptr
464*dfc6aa5cSAndroid Build Coastguard Worker    vmovdqa     ymmA, ymmD
465*dfc6aa5cSAndroid Build Coastguard Worker    sub         ecx, byte SIZEOF_YMMWORD/4
466*dfc6aa5cSAndroid Build Coastguard Worker.column_st16:
467*dfc6aa5cSAndroid Build Coastguard Worker    cmp         ecx, byte SIZEOF_YMMWORD/8
468*dfc6aa5cSAndroid Build Coastguard Worker    jb          short .column_st15
469*dfc6aa5cSAndroid Build Coastguard Worker    vmovdqu     XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
470*dfc6aa5cSAndroid Build Coastguard Worker    add         edi, byte SIZEOF_XMMWORD    ; outptr
471*dfc6aa5cSAndroid Build Coastguard Worker    vperm2i128  ymmA, ymmA, ymmA, 1
472*dfc6aa5cSAndroid Build Coastguard Worker    sub         ecx, byte SIZEOF_YMMWORD/8
473*dfc6aa5cSAndroid Build Coastguard Worker.column_st15:
474*dfc6aa5cSAndroid Build Coastguard Worker    ; Store two pixels (8 bytes) of ymmA to the output when it has enough
475*dfc6aa5cSAndroid Build Coastguard Worker    ; space.
476*dfc6aa5cSAndroid Build Coastguard Worker    cmp         ecx, byte SIZEOF_YMMWORD/16
477*dfc6aa5cSAndroid Build Coastguard Worker    jb          short .column_st7
478*dfc6aa5cSAndroid Build Coastguard Worker    vmovq       MMWORD [edi], xmmA
479*dfc6aa5cSAndroid Build Coastguard Worker    add         edi, byte SIZEOF_YMMWORD/16*4
480*dfc6aa5cSAndroid Build Coastguard Worker    sub         ecx, byte SIZEOF_YMMWORD/16
481*dfc6aa5cSAndroid Build Coastguard Worker    vpsrldq     xmmA, SIZEOF_YMMWORD/16*4
482*dfc6aa5cSAndroid Build Coastguard Worker.column_st7:
483*dfc6aa5cSAndroid Build Coastguard Worker    ; Store one pixel (4 bytes) of ymmA to the output when it has enough
484*dfc6aa5cSAndroid Build Coastguard Worker    ; space.
485*dfc6aa5cSAndroid Build Coastguard Worker    test        ecx, ecx
486*dfc6aa5cSAndroid Build Coastguard Worker    jz          short .endcolumn
487*dfc6aa5cSAndroid Build Coastguard Worker    vmovd       XMM_DWORD [edi], xmmA
488*dfc6aa5cSAndroid Build Coastguard Worker
489*dfc6aa5cSAndroid Build Coastguard Worker%endif  ; RGB_PIXELSIZE ; ---------------
490*dfc6aa5cSAndroid Build Coastguard Worker
491*dfc6aa5cSAndroid Build Coastguard Worker.endcolumn:
492*dfc6aa5cSAndroid Build Coastguard Worker    sfence                              ; flush the write buffer
493*dfc6aa5cSAndroid Build Coastguard Worker
494*dfc6aa5cSAndroid Build Coastguard Worker.return:
495*dfc6aa5cSAndroid Build Coastguard Worker    vzeroupper
496*dfc6aa5cSAndroid Build Coastguard Worker    pop         edi
497*dfc6aa5cSAndroid Build Coastguard Worker    pop         esi
498*dfc6aa5cSAndroid Build Coastguard Worker;   pop         edx                     ; need not be preserved
499*dfc6aa5cSAndroid Build Coastguard Worker;   pop         ecx                     ; need not be preserved
500*dfc6aa5cSAndroid Build Coastguard Worker    pop         ebx
501*dfc6aa5cSAndroid Build Coastguard Worker    mov         esp, ebp                ; esp <- aligned ebp
502*dfc6aa5cSAndroid Build Coastguard Worker    pop         esp                     ; esp <- original ebp
503*dfc6aa5cSAndroid Build Coastguard Worker    pop         ebp
504*dfc6aa5cSAndroid Build Coastguard Worker    ret
505*dfc6aa5cSAndroid Build Coastguard Worker
506*dfc6aa5cSAndroid Build Coastguard Worker; --------------------------------------------------------------------------
507*dfc6aa5cSAndroid Build Coastguard Worker;
508*dfc6aa5cSAndroid Build Coastguard Worker; Upsample and color convert for the case of 2:1 horizontal and 2:1 vertical.
509*dfc6aa5cSAndroid Build Coastguard Worker;
510*dfc6aa5cSAndroid Build Coastguard Worker; GLOBAL(void)
511*dfc6aa5cSAndroid Build Coastguard Worker; jsimd_h2v2_merged_upsample_avx2(JDIMENSION output_width,
512*dfc6aa5cSAndroid Build Coastguard Worker;                                 JSAMPIMAGE input_buf,
513*dfc6aa5cSAndroid Build Coastguard Worker;                                 JDIMENSION in_row_group_ctr,
514*dfc6aa5cSAndroid Build Coastguard Worker;                                 JSAMPARRAY output_buf);
515*dfc6aa5cSAndroid Build Coastguard Worker;
516*dfc6aa5cSAndroid Build Coastguard Worker
; Stack-argument offsets relative to ebp.  After the "push ebp / mov ebp, esp"
; prologue below, [ebp+0] holds the saved ebp and [ebp+4] the return address,
; so the first argument is found at [ebp+8].
517*dfc6aa5cSAndroid Build Coastguard Worker%define output_width(b)      (b) + 8    ; JDIMENSION output_width
518*dfc6aa5cSAndroid Build Coastguard Worker%define input_buf(b)         (b) + 12   ; JSAMPIMAGE input_buf
519*dfc6aa5cSAndroid Build Coastguard Worker%define in_row_group_ctr(b)  (b) + 16   ; JDIMENSION in_row_group_ctr
520*dfc6aa5cSAndroid Build Coastguard Worker%define output_buf(b)        (b) + 20   ; JSAMPARRAY output_buf
521*dfc6aa5cSAndroid Build Coastguard Worker
522*dfc6aa5cSAndroid Build Coastguard Worker    align       32
523*dfc6aa5cSAndroid Build Coastguard Worker    GLOBAL_FUNCTION(jsimd_h2v2_merged_upsample_avx2)
524*dfc6aa5cSAndroid Build Coastguard Worker
; jsimd_h2v2_merged_upsample_avx2
;
; Wrapper that calls jsimd_h2v1_merged_upsample_avx2 twice, using a temporary
; 3-entry input_buf array built on the stack:
;   call 1: { input_buf[0] + in_row_group_ctr*SIZEOF_JSAMPROW,
;             input_buf[1], input_buf[2] }, output_buf
;   call 2: same array, with entry 0 and the output_buf argument each advanced
;           by SIZEOF_JSAMPROW
; in_row_group_ctr and output_width are passed through unchanged to both calls.
;
; Registers: ebx, esi and edi are saved on entry and restored on exit; eax,
; ecx and edx are used as scratch and are not restored.
;
; The code between the two calls reads ebx, esi and edi without reloading
; them, so it depends on the callee preserving those three registers.
; NOTE(review): the second call reuses the in_row_group_ctr, input_buf and
; output_width stack slots left over from the first call, which assumes the
; callee does not overwrite its stack arguments -- confirm against the h2v1
; routine.
525*dfc6aa5cSAndroid Build Coastguard WorkerEXTN(jsimd_h2v2_merged_upsample_avx2):
526*dfc6aa5cSAndroid Build Coastguard Worker    push        ebp
527*dfc6aa5cSAndroid Build Coastguard Worker    mov         ebp, esp
528*dfc6aa5cSAndroid Build Coastguard Worker    push        ebx
529*dfc6aa5cSAndroid Build Coastguard Worker;   push        ecx                     ; need not be preserved
530*dfc6aa5cSAndroid Build Coastguard Worker;   push        edx                     ; need not be preserved
531*dfc6aa5cSAndroid Build Coastguard Worker    push        esi
532*dfc6aa5cSAndroid Build Coastguard Worker    push        edi
533*dfc6aa5cSAndroid Build Coastguard Worker
    ; eax = output_width.  NOTE(review): the load uses the POINTER size
    ; specifier although the argument is documented as JDIMENSION above;
    ; presumably both are dword-sized on this target -- confirm in the include
    ; file that defines them.
534*dfc6aa5cSAndroid Build Coastguard Worker    mov         eax, POINTER [output_width(ebp)]
535*dfc6aa5cSAndroid Build Coastguard Worker
    ; Fetch the three entries of input_buf and the remaining arguments.
    ; edi is used first as the input_buf pointer and then reloaded with
    ; output_buf once the three entries have been read.
536*dfc6aa5cSAndroid Build Coastguard Worker    mov         edi, JSAMPIMAGE [input_buf(ebp)]
537*dfc6aa5cSAndroid Build Coastguard Worker    mov         ecx, JDIMENSION [in_row_group_ctr(ebp)]
538*dfc6aa5cSAndroid Build Coastguard Worker    mov         esi, JSAMPARRAY [edi+0*SIZEOF_JSAMPARRAY]
539*dfc6aa5cSAndroid Build Coastguard Worker    mov         ebx, JSAMPARRAY [edi+1*SIZEOF_JSAMPARRAY]
540*dfc6aa5cSAndroid Build Coastguard Worker    mov         edx, JSAMPARRAY [edi+2*SIZEOF_JSAMPARRAY]
541*dfc6aa5cSAndroid Build Coastguard Worker    mov         edi, JSAMPARRAY [output_buf(ebp)]
    ; Advance entry 0 by in_row_group_ctr row pointers.  The callee is handed
    ; the same in_row_group_ctr; presumably it indexes entry 0 by that value
    ; again, which would make the effective row index 2*in_row_group_ctr for
    ; the first call -- TODO confirm against the h2v1 routine.
542*dfc6aa5cSAndroid Build Coastguard Worker    lea         esi, [esi+ecx*SIZEOF_JSAMPROW]
543*dfc6aa5cSAndroid Build Coastguard Worker
    ; Build the temporary input_buf array on the stack.  Entries are pushed in
    ; reverse order so that entry 0 ends up at the lowest address; ebx then
    ; records the address of entry 0.
544*dfc6aa5cSAndroid Build Coastguard Worker    push        edx                     ; inptr2
545*dfc6aa5cSAndroid Build Coastguard Worker    push        ebx                     ; inptr1
546*dfc6aa5cSAndroid Build Coastguard Worker    push        esi                     ; inptr00
547*dfc6aa5cSAndroid Build Coastguard Worker    mov         ebx, esp
548*dfc6aa5cSAndroid Build Coastguard Worker
    ; Push the four callee arguments, last argument first.  The output_buf
    ; slot lands directly below the temporary array, i.e. one push below the
    ; address held in ebx.
549*dfc6aa5cSAndroid Build Coastguard Worker    push        edi                     ; output_buf (outptr0)
550*dfc6aa5cSAndroid Build Coastguard Worker    push        ecx                     ; in_row_group_ctr
551*dfc6aa5cSAndroid Build Coastguard Worker    push        ebx                     ; input_buf
552*dfc6aa5cSAndroid Build Coastguard Worker    push        eax                     ; output_width
553*dfc6aa5cSAndroid Build Coastguard Worker
554*dfc6aa5cSAndroid Build Coastguard Worker    call        near EXTN(jsimd_h2v1_merged_upsample_avx2)
555*dfc6aa5cSAndroid Build Coastguard Worker
    ; Step entry 0 and output_buf forward by one row pointer each and patch
    ; them into the stack in place: [ebx] is entry 0 of the temporary array,
    ; and the slot just below ebx is the one written by "push edi" above.
556*dfc6aa5cSAndroid Build Coastguard Worker    add         esi, byte SIZEOF_JSAMPROW  ; inptr01
557*dfc6aa5cSAndroid Build Coastguard Worker    add         edi, byte SIZEOF_JSAMPROW  ; outptr1
558*dfc6aa5cSAndroid Build Coastguard Worker    mov         POINTER [ebx+0*SIZEOF_POINTER], esi
559*dfc6aa5cSAndroid Build Coastguard Worker    mov         POINTER [ebx-1*SIZEOF_POINTER], edi
560*dfc6aa5cSAndroid Build Coastguard Worker
561*dfc6aa5cSAndroid Build Coastguard Worker    call        near EXTN(jsimd_h2v1_merged_upsample_avx2)
562*dfc6aa5cSAndroid Build Coastguard Worker
    ; Discard everything pushed since the prologue: 4 call arguments plus the
    ; 3 temporary array entries = 7 dwords.
563*dfc6aa5cSAndroid Build Coastguard Worker    add         esp, byte 7*SIZEOF_DWORD
564*dfc6aa5cSAndroid Build Coastguard Worker
    ; Restore the saved registers in reverse push order.
565*dfc6aa5cSAndroid Build Coastguard Worker    pop         edi
566*dfc6aa5cSAndroid Build Coastguard Worker    pop         esi
567*dfc6aa5cSAndroid Build Coastguard Worker;   pop         edx                     ; need not be preserved
568*dfc6aa5cSAndroid Build Coastguard Worker;   pop         ecx                     ; need not be preserved
569*dfc6aa5cSAndroid Build Coastguard Worker    pop         ebx
570*dfc6aa5cSAndroid Build Coastguard Worker    pop         ebp
571*dfc6aa5cSAndroid Build Coastguard Worker    ret
572*dfc6aa5cSAndroid Build Coastguard Worker
573*dfc6aa5cSAndroid Build Coastguard Worker; For some reason, the OS X linker does not honor the request to align the
574*dfc6aa5cSAndroid Build Coastguard Worker; segment unless we do this.
; This directive follows the final "ret" above, so it only pads the end of
; the assembled code out to a 32-byte boundary.
575*dfc6aa5cSAndroid Build Coastguard Worker    align       32
576