xref: /aosp_15_r20/external/libjpeg-turbo/simd/i386/jdsample-avx2.asm (revision dfc6aa5c1cfd4bc4e2018dc74aa96e29ee49c6da)
1*dfc6aa5cSAndroid Build Coastguard Worker;
2*dfc6aa5cSAndroid Build Coastguard Worker; jdsample.asm - upsampling (AVX2)
3*dfc6aa5cSAndroid Build Coastguard Worker;
4*dfc6aa5cSAndroid Build Coastguard Worker; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
5*dfc6aa5cSAndroid Build Coastguard Worker; Copyright (C) 2015, Intel Corporation.
6*dfc6aa5cSAndroid Build Coastguard Worker; Copyright (C) 2016, D. R. Commander.
7*dfc6aa5cSAndroid Build Coastguard Worker;
8*dfc6aa5cSAndroid Build Coastguard Worker; Based on the x86 SIMD extension for IJG JPEG library
9*dfc6aa5cSAndroid Build Coastguard Worker; Copyright (C) 1999-2006, MIYASAKA Masaru.
10*dfc6aa5cSAndroid Build Coastguard Worker; For conditions of distribution and use, see copyright notice in jsimdext.inc
11*dfc6aa5cSAndroid Build Coastguard Worker;
12*dfc6aa5cSAndroid Build Coastguard Worker; This file should be assembled with NASM (Netwide Assembler).  It
13*dfc6aa5cSAndroid Build Coastguard Worker; can *not* be assembled with Microsoft's MASM or any compatible
14*dfc6aa5cSAndroid Build Coastguard Worker; assembler (including Borland's Turbo Assembler).
15*dfc6aa5cSAndroid Build Coastguard Worker; NASM is available from http://nasm.sourceforge.net/ or
16*dfc6aa5cSAndroid Build Coastguard Worker; http://sourceforge.net/project/showfiles.php?group_id=6208
17*dfc6aa5cSAndroid Build Coastguard Worker
18*dfc6aa5cSAndroid Build Coastguard Worker%include "jsimdext.inc"
19*dfc6aa5cSAndroid Build Coastguard Worker
20*dfc6aa5cSAndroid Build Coastguard Worker; --------------------------------------------------------------------------
21*dfc6aa5cSAndroid Build Coastguard Worker    SECTION     SEG_CONST
22*dfc6aa5cSAndroid Build Coastguard Worker
23*dfc6aa5cSAndroid Build Coastguard Worker    alignz      32
24*dfc6aa5cSAndroid Build Coastguard Worker    GLOBAL_DATA(jconst_fancy_upsample_avx2)
25*dfc6aa5cSAndroid Build Coastguard Worker
26*dfc6aa5cSAndroid Build Coastguard WorkerEXTN(jconst_fancy_upsample_avx2):
27*dfc6aa5cSAndroid Build Coastguard Worker
; Word constants for the fancy-upsampling routines.  Every entry is the same
; 16-bit value repeated 16 times, i.e. one full 256-bit (YMM) operand, so it
; can be used directly as the memory operand of vpmullw / vpaddw.
;   PW_THREE : weight applied to the centre sample (vpmullw)
;   PW_ONE   : rounding term added to the even output samples in h2v1
;   PW_TWO   : rounding term added to the odd output samples in h2v1
;   PW_SEVEN, PW_EIGHT : not referenced by the h2v1 routine; presumably the
;              rounding terms of the h2v2 routine -- TODO confirm there.
; NOTE(review): alignz / GLOBAL_DATA / EXTN are macros from jsimdext.inc
; (not visible here); alignz 32 presumably pads to a 32-byte boundary.
28*dfc6aa5cSAndroid Build Coastguard WorkerPW_ONE   times 16 dw 1
29*dfc6aa5cSAndroid Build Coastguard WorkerPW_TWO   times 16 dw 2
30*dfc6aa5cSAndroid Build Coastguard WorkerPW_THREE times 16 dw 3
31*dfc6aa5cSAndroid Build Coastguard WorkerPW_SEVEN times 16 dw 7
32*dfc6aa5cSAndroid Build Coastguard WorkerPW_EIGHT times 16 dw 8
33*dfc6aa5cSAndroid Build Coastguard Worker
34*dfc6aa5cSAndroid Build Coastguard Worker    alignz      32
35*dfc6aa5cSAndroid Build Coastguard Worker
36*dfc6aa5cSAndroid Build Coastguard Worker; --------------------------------------------------------------------------
37*dfc6aa5cSAndroid Build Coastguard Worker    SECTION     SEG_TEXT
38*dfc6aa5cSAndroid Build Coastguard Worker    BITS        32
39*dfc6aa5cSAndroid Build Coastguard Worker;
40*dfc6aa5cSAndroid Build Coastguard Worker; Fancy processing for the common case of 2:1 horizontal and 1:1 vertical.
41*dfc6aa5cSAndroid Build Coastguard Worker;
42*dfc6aa5cSAndroid Build Coastguard Worker; The upsampling algorithm is linear interpolation between pixel centers,
43*dfc6aa5cSAndroid Build Coastguard Worker; also known as a "triangle filter".  This is a good compromise between
44*dfc6aa5cSAndroid Build Coastguard Worker; speed and visual quality.  The centers of the output pixels are 1/4 and 3/4
45*dfc6aa5cSAndroid Build Coastguard Worker; of the way between input pixel centers.
46*dfc6aa5cSAndroid Build Coastguard Worker;
47*dfc6aa5cSAndroid Build Coastguard Worker; GLOBAL(void)
48*dfc6aa5cSAndroid Build Coastguard Worker; jsimd_h2v1_fancy_upsample_avx2(int max_v_samp_factor,
49*dfc6aa5cSAndroid Build Coastguard Worker;                                JDIMENSION downsampled_width,
50*dfc6aa5cSAndroid Build Coastguard Worker;                                JSAMPARRAY input_data,
51*dfc6aa5cSAndroid Build Coastguard Worker;                                JSAMPARRAY *output_data_ptr);
52*dfc6aa5cSAndroid Build Coastguard Worker;
53*dfc6aa5cSAndroid Build Coastguard Worker
; Stack-argument offsets relative to the frame base (b).  The routine below
; executes `push ebp` / `mov ebp, esp` on entry, so [ebp+0] holds the saved
; ebp, [ebp+4] the return address, and the first argument lives at [ebp+8].
54*dfc6aa5cSAndroid Build Coastguard Worker%define max_v_samp(b)       (b) + 8     ; int max_v_samp_factor
55*dfc6aa5cSAndroid Build Coastguard Worker%define downsamp_width(b)   (b) + 12    ; JDIMENSION downsampled_width
56*dfc6aa5cSAndroid Build Coastguard Worker%define input_data(b)       (b) + 16    ; JSAMPARRAY input_data
57*dfc6aa5cSAndroid Build Coastguard Worker%define output_data_ptr(b)  (b) + 20    ; JSAMPARRAY *output_data_ptr
58*dfc6aa5cSAndroid Build Coastguard Worker
59*dfc6aa5cSAndroid Build Coastguard Worker    align       32
60*dfc6aa5cSAndroid Build Coastguard Worker    GLOBAL_FUNCTION(jsimd_h2v1_fancy_upsample_avx2)
61*dfc6aa5cSAndroid Build Coastguard Worker
; --------------------------------------------------------------------------
; Register usage (as established by the code below):
;   eax  = colctr: input columns still to process in the current row; it is
;          rounded up to a multiple of SIZEOF_YMMWORD once the row is entered
;   ecx  = rowctr: rows still to process (starts at max_v_samp_factor)
;   esi  = input_data  (row-pointer array), then inptr  inside the row loop
;   edi  = output_data (row-pointer array), then outptr inside the row loop
;   ebx  = base register for GOTOFF() constant addressing, loaded by get_GOT
;          (pushpic / get_GOT / poppic come from jsimdext.inc, not visible
;          here -- presumably they only emit code in PIC builds)
;   dl   = scratch byte for the right-edge sample replication
;   ymm0 = all zeros (re-zeroed every block because it is used as scratch)
;   ymm7 = left neighbour of the block: sample in byte 0, other bytes zero;
;          carried from one column block to the next
;   ymm6 = right neighbour of the block: sample in byte 31, other bytes zero
;   ymm1-ymm5 = working registers
;   ebx, esi, edi, ebp are restored before returning; eax, ecx, edx are not.
;
; For every input sample s[i] two output samples are produced:
;   out[2*i]   = (3 * s[i] + s[i-1] + 1) >> 2
;   out[2*i+1] = (3 * s[i] + s[i+1] + 2) >> 2
; At the row edges s[-1] is taken as s[0] and s[width] as s[width-1].
;
; NOTE(review): loads and stores are always done in whole YMM units (32 input
; bytes -> 64 output bytes), and one dummy byte may be written just past the
; end of each input row.  The row buffers must therefore be padded beyond
; downsampled_width; nothing in this routine checks that.
; --------------------------------------------------------------------------
62*dfc6aa5cSAndroid Build Coastguard WorkerEXTN(jsimd_h2v1_fancy_upsample_avx2):
63*dfc6aa5cSAndroid Build Coastguard Worker    push        ebp
64*dfc6aa5cSAndroid Build Coastguard Worker    mov         ebp, esp
65*dfc6aa5cSAndroid Build Coastguard Worker    pushpic     ebx
66*dfc6aa5cSAndroid Build Coastguard Worker;   push        ecx                     ; need not be preserved
67*dfc6aa5cSAndroid Build Coastguard Worker;   push        edx                     ; need not be preserved
68*dfc6aa5cSAndroid Build Coastguard Worker    push        esi
69*dfc6aa5cSAndroid Build Coastguard Worker    push        edi
70*dfc6aa5cSAndroid Build Coastguard Worker
71*dfc6aa5cSAndroid Build Coastguard Worker    get_GOT     ebx                     ; get GOT address
72*dfc6aa5cSAndroid Build Coastguard Worker
; Nothing to do when either the width or the row count is zero.
73*dfc6aa5cSAndroid Build Coastguard Worker    mov         eax, JDIMENSION [downsamp_width(ebp)]  ; colctr
74*dfc6aa5cSAndroid Build Coastguard Worker    test        eax, eax
75*dfc6aa5cSAndroid Build Coastguard Worker    jz          near .return
76*dfc6aa5cSAndroid Build Coastguard Worker
77*dfc6aa5cSAndroid Build Coastguard Worker    mov         ecx, INT [max_v_samp(ebp)]  ; rowctr
78*dfc6aa5cSAndroid Build Coastguard Worker    test        ecx, ecx
79*dfc6aa5cSAndroid Build Coastguard Worker    jz          near .return
80*dfc6aa5cSAndroid Build Coastguard Worker
; output_data_ptr is a pointer to the row-pointer array, hence the extra
; dereference for edi.
81*dfc6aa5cSAndroid Build Coastguard Worker    mov         esi, JSAMPARRAY [input_data(ebp)]    ; input_data
82*dfc6aa5cSAndroid Build Coastguard Worker    mov         edi, POINTER [output_data_ptr(ebp)]
83*dfc6aa5cSAndroid Build Coastguard Worker    mov         edi, JSAMPARRAY [edi]                ; output_data
84*dfc6aa5cSAndroid Build Coastguard Worker    alignx      16, 7
; ---- one iteration per row ------------------------------------------------
; eax / edi / esi are saved because they are reused as per-row working
; values (rounded column count, outptr, inptr) and restored at the row end.
85*dfc6aa5cSAndroid Build Coastguard Worker.rowloop:
86*dfc6aa5cSAndroid Build Coastguard Worker    push        eax                     ; colctr
87*dfc6aa5cSAndroid Build Coastguard Worker    push        edi
88*dfc6aa5cSAndroid Build Coastguard Worker    push        esi
89*dfc6aa5cSAndroid Build Coastguard Worker
90*dfc6aa5cSAndroid Build Coastguard Worker    mov         esi, JSAMPROW [esi]     ; inptr
91*dfc6aa5cSAndroid Build Coastguard Worker    mov         edi, JSAMPROW [edi]     ; outptr
92*dfc6aa5cSAndroid Build Coastguard Worker
; If the width is not a multiple of SIZEOF_YMMWORD, copy the last real sample
; into the slot right after it (inptr[width] = inptr[width-1]) so that the
; final sample sees a replicated right-hand neighbour.  This writes one byte
; past the downsampled_width samples of the input row.
93*dfc6aa5cSAndroid Build Coastguard Worker    test        eax, SIZEOF_YMMWORD-1
94*dfc6aa5cSAndroid Build Coastguard Worker    jz          short .skip
95*dfc6aa5cSAndroid Build Coastguard Worker    mov         dl, JSAMPLE [esi+(eax-1)*SIZEOF_JSAMPLE]
96*dfc6aa5cSAndroid Build Coastguard Worker    mov         JSAMPLE [esi+eax*SIZEOF_JSAMPLE], dl    ; insert a dummy sample
; Seed the left neighbour for the first block: a mask with only byte 0 set,
; ANDed with the first 32 input bytes, leaves s[0] in byte 0 of ymm7, i.e.
; s[-1] := s[0].  (The VEX-encoded xmm instructions clear the upper 128 bits
; of ymm7, so the mask is zero everywhere else.)
97*dfc6aa5cSAndroid Build Coastguard Worker.skip:
98*dfc6aa5cSAndroid Build Coastguard Worker    vpxor       ymm0, ymm0, ymm0                ; ymm0=(all 0's)
99*dfc6aa5cSAndroid Build Coastguard Worker    vpcmpeqb    xmm7, xmm7, xmm7
100*dfc6aa5cSAndroid Build Coastguard Worker    vpsrldq     xmm7, xmm7, (SIZEOF_XMMWORD-1)  ; (ff -- -- -- ... -- --) LSB is ff
101*dfc6aa5cSAndroid Build Coastguard Worker    vpand       ymm7, ymm7, YMMWORD [esi+0*SIZEOF_YMMWORD]
102*dfc6aa5cSAndroid Build Coastguard Worker
; Round colctr up to a whole number of SIZEOF_YMMWORD-sample blocks.  If more
; than one block remains go to .columnloop, otherwise fall through to the
; last-block setup.
103*dfc6aa5cSAndroid Build Coastguard Worker    add         eax, byte SIZEOF_YMMWORD-1
104*dfc6aa5cSAndroid Build Coastguard Worker    and         eax, byte -SIZEOF_YMMWORD
105*dfc6aa5cSAndroid Build Coastguard Worker    cmp         eax, byte SIZEOF_YMMWORD
106*dfc6aa5cSAndroid Build Coastguard Worker    ja          short .columnloop
107*dfc6aa5cSAndroid Build Coastguard Worker    alignx      16, 7
108*dfc6aa5cSAndroid Build Coastguard Worker
; Last block of the row: there is no following block, so the right neighbour
; of sample 31 is sample 31 itself.  Build a mask with only byte 31 set
; (byte 15 of the low lane, then lanes swapped) and keep that byte of the
; current block in ymm6.
109*dfc6aa5cSAndroid Build Coastguard Worker.columnloop_last:
110*dfc6aa5cSAndroid Build Coastguard Worker    vpcmpeqb    xmm6, xmm6, xmm6
111*dfc6aa5cSAndroid Build Coastguard Worker    vpslldq     xmm6, xmm6, (SIZEOF_XMMWORD-1)
112*dfc6aa5cSAndroid Build Coastguard Worker    vperm2i128  ymm6, ymm6, ymm6, 1             ; (---- ---- ... ---- ---- ff) MSB is ff
113*dfc6aa5cSAndroid Build Coastguard Worker    vpand       ymm6, ymm6, YMMWORD [esi+0*SIZEOF_YMMWORD]
114*dfc6aa5cSAndroid Build Coastguard Worker    jmp         short .upsample
115*dfc6aa5cSAndroid Build Coastguard Worker    alignx      16, 7
116*dfc6aa5cSAndroid Build Coastguard Worker
; Not the last block: the right neighbour of sample 31 is the first sample of
; the NEXT block.  Load the next block, move its low lane into the high lane
; (low lane zeroed from ymm0), then shift each lane left by 15 bytes so that
; only byte 31 is non-zero and holds that sample.
117*dfc6aa5cSAndroid Build Coastguard Worker.columnloop:
118*dfc6aa5cSAndroid Build Coastguard Worker    vmovdqu     ymm6, YMMWORD [esi+1*SIZEOF_YMMWORD]
119*dfc6aa5cSAndroid Build Coastguard Worker    vperm2i128  ymm6, ymm0, ymm6, 0x20
120*dfc6aa5cSAndroid Build Coastguard Worker    vpslldq     ymm6, ymm6, 15
121*dfc6aa5cSAndroid Build Coastguard Worker
; ---- process one block of 32 input samples -> 64 output samples -----------
; Entry state: ymm0 = 0, ymm7 = left neighbour (byte 0), ymm6 = right
; neighbour (byte 31).  The numbers in the comments are sample indices
; relative to the start of the block.
122*dfc6aa5cSAndroid Build Coastguard Worker.upsample:
123*dfc6aa5cSAndroid Build Coastguard Worker    vmovdqu     ymm1, YMMWORD [esi+0*SIZEOF_YMMWORD]  ; ymm1=( 0  1  2 ... 29 30 31)
124*dfc6aa5cSAndroid Build Coastguard Worker
; Build the two neighbour vectors by shifting the block one byte in each
; direction across the 128-bit lane boundary (vperm2i128 supplies the bytes
; that vpalignr pulls in from the other lane).
125*dfc6aa5cSAndroid Build Coastguard Worker    vperm2i128  ymm2, ymm0, ymm1, 0x20
126*dfc6aa5cSAndroid Build Coastguard Worker    vpalignr    ymm2, ymm1, ymm2, 15            ; ymm2=(--  0  1 ... 28 29 30)
127*dfc6aa5cSAndroid Build Coastguard Worker    vperm2i128  ymm4, ymm0, ymm1, 0x03
128*dfc6aa5cSAndroid Build Coastguard Worker    vpalignr    ymm3, ymm4, ymm1, 1             ; ymm3=( 1  2  3 ... 30 31 --)
129*dfc6aa5cSAndroid Build Coastguard Worker
; Fill the vacated end bytes with the neighbours prepared earlier.
130*dfc6aa5cSAndroid Build Coastguard Worker    vpor        ymm2, ymm2, ymm7                ; ymm2=(-1  0  1 ... 28 29 30)
131*dfc6aa5cSAndroid Build Coastguard Worker    vpor        ymm3, ymm3, ymm6                ; ymm3=( 1  2  3 ... 30 31 32)
132*dfc6aa5cSAndroid Build Coastguard Worker
; ymm4 still holds samples 16..31 in its low lane; move sample 31 down to
; byte 0 so it becomes the left neighbour of the next block.
133*dfc6aa5cSAndroid Build Coastguard Worker    vpsrldq     ymm7, ymm4, (SIZEOF_XMMWORD-1)  ; ymm7=(31 -- -- ... -- -- --)
134*dfc6aa5cSAndroid Build Coastguard Worker
; Zero-extend bytes to words.  The unpack instructions work per 128-bit lane,
; so each pair is followed by two vperm2i128 that put the words back into
; sequential order (low 16 samples / high 16 samples).
135*dfc6aa5cSAndroid Build Coastguard Worker    vpunpckhbw  ymm4, ymm1, ymm0                ; ymm4=( 8  9 10 11 12 13 14 15 24 25 26 27 28 29 30 31)
136*dfc6aa5cSAndroid Build Coastguard Worker    vpunpcklbw  ymm5, ymm1, ymm0                ; ymm5=( 0  1  2  3  4  5  6  7 16 17 18 19 20 21 22 23)
137*dfc6aa5cSAndroid Build Coastguard Worker    vperm2i128  ymm1, ymm5, ymm4, 0x20          ; ymm1=( 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15)
138*dfc6aa5cSAndroid Build Coastguard Worker    vperm2i128  ymm4, ymm5, ymm4, 0x31          ; ymm4=(16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31)
139*dfc6aa5cSAndroid Build Coastguard Worker
140*dfc6aa5cSAndroid Build Coastguard Worker    vpunpckhbw  ymm5, ymm2, ymm0                ; ymm5=( 7  8  9 10 11 12 13 14 23 24 25 26 27 28 29 30)
141*dfc6aa5cSAndroid Build Coastguard Worker    vpunpcklbw  ymm6, ymm2, ymm0                ; ymm6=(-1  0  1  2  3  4  5  6 15 16 17 18 19 20 21 22)
142*dfc6aa5cSAndroid Build Coastguard Worker    vperm2i128  ymm2, ymm6, ymm5, 0x20          ; ymm2=(-1  0  1  2  3  4  5  6  7  8  9 10 11 12 13 14)
143*dfc6aa5cSAndroid Build Coastguard Worker    vperm2i128  ymm5, ymm6, ymm5, 0x31          ; ymm5=(15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30)
144*dfc6aa5cSAndroid Build Coastguard Worker
; The last unpack overwrites ymm0 (the zero register); it is re-zeroed below.
145*dfc6aa5cSAndroid Build Coastguard Worker    vpunpckhbw  ymm6, ymm3, ymm0                ; ymm6=( 9 10 11 12 13 14 15 16 25 26 27 28 29 30 31 32)
146*dfc6aa5cSAndroid Build Coastguard Worker    vpunpcklbw  ymm0, ymm3, ymm0                ; ymm0=( 1  2  3  4  5  6  7  8 17 18 19 20 21 22 23 24)
147*dfc6aa5cSAndroid Build Coastguard Worker    vperm2i128  ymm3, ymm0, ymm6, 0x20          ; ymm3=( 1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16)
148*dfc6aa5cSAndroid Build Coastguard Worker    vperm2i128  ymm6, ymm0, ymm6, 0x31          ; ymm6=(17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32)
149*dfc6aa5cSAndroid Build Coastguard Worker
150*dfc6aa5cSAndroid Build Coastguard Worker    vpxor       ymm0, ymm0, ymm0                ; ymm0=(all 0's)
151*dfc6aa5cSAndroid Build Coastguard Worker
; Weighted sums in 16-bit words:
;   ymm1/ymm4 = 3 * centre
;   ymm2/ymm5 = left neighbour  + 1   (rounding for the even outputs)
;   ymm3/ymm6 = right neighbour + 2   (rounding for the odd outputs)
152*dfc6aa5cSAndroid Build Coastguard Worker    vpmullw     ymm1, ymm1, [GOTOFF(ebx,PW_THREE)]
153*dfc6aa5cSAndroid Build Coastguard Worker    vpmullw     ymm4, ymm4, [GOTOFF(ebx,PW_THREE)]
154*dfc6aa5cSAndroid Build Coastguard Worker    vpaddw      ymm2, ymm2, [GOTOFF(ebx,PW_ONE)]
155*dfc6aa5cSAndroid Build Coastguard Worker    vpaddw      ymm5, ymm5, [GOTOFF(ebx,PW_ONE)]
156*dfc6aa5cSAndroid Build Coastguard Worker    vpaddw      ymm3, ymm3, [GOTOFF(ebx,PW_TWO)]
157*dfc6aa5cSAndroid Build Coastguard Worker    vpaddw      ymm6, ymm6, [GOTOFF(ebx,PW_TWO)]
158*dfc6aa5cSAndroid Build Coastguard Worker
; Add the centre term and divide by 4.  From here on the numbers in the
; comments are OUTPUT sample indices.
159*dfc6aa5cSAndroid Build Coastguard Worker    vpaddw      ymm2, ymm2, ymm1
160*dfc6aa5cSAndroid Build Coastguard Worker    vpaddw      ymm5, ymm5, ymm4
161*dfc6aa5cSAndroid Build Coastguard Worker    vpsrlw      ymm2, ymm2, 2                   ; ymm2=OutLE=( 0  2  4  6  8 10 12 14 16 18 20 22 24 26 28 30)
162*dfc6aa5cSAndroid Build Coastguard Worker    vpsrlw      ymm5, ymm5, 2                   ; ymm5=OutHE=(32 34 36 38 40 42 44 46 48 50 52 54 56 58 60 62)
163*dfc6aa5cSAndroid Build Coastguard Worker    vpaddw      ymm3, ymm3, ymm1
164*dfc6aa5cSAndroid Build Coastguard Worker    vpaddw      ymm6, ymm6, ymm4
165*dfc6aa5cSAndroid Build Coastguard Worker    vpsrlw      ymm3, ymm3, 2                   ; ymm3=OutLO=( 1  3  5  7  9 11 13 15 17 19 21 23 25 27 29 31)
166*dfc6aa5cSAndroid Build Coastguard Worker    vpsrlw      ymm6, ymm6, 2                   ; ymm6=OutHO=(33 35 37 39 41 43 45 47 49 51 53 55 57 59 61 63)
167*dfc6aa5cSAndroid Build Coastguard Worker
; Interleave even and odd results: shifting the odd words left by BYTE_BIT
; moves each odd sample into the high byte of its word, and the OR merges it
; with the even sample in the low byte.
168*dfc6aa5cSAndroid Build Coastguard Worker    vpsllw      ymm3, ymm3, BYTE_BIT
169*dfc6aa5cSAndroid Build Coastguard Worker    vpsllw      ymm6, ymm6, BYTE_BIT
170*dfc6aa5cSAndroid Build Coastguard Worker    vpor        ymm2, ymm2, ymm3                ; ymm2=OutL=( 0  1  2 ... 29 30 31)
171*dfc6aa5cSAndroid Build Coastguard Worker    vpor        ymm5, ymm5, ymm6                ; ymm5=OutH=(32 33 34 ... 61 62 63)
172*dfc6aa5cSAndroid Build Coastguard Worker
173*dfc6aa5cSAndroid Build Coastguard Worker    vmovdqu     YMMWORD [edi+0*SIZEOF_YMMWORD], ymm2
174*dfc6aa5cSAndroid Build Coastguard Worker    vmovdqu     YMMWORD [edi+1*SIZEOF_YMMWORD], ymm5
175*dfc6aa5cSAndroid Build Coastguard Worker
; Advance by one input block / two output blocks.  More than one block left
; -> .columnloop; exactly one left -> .columnloop_last; none -> row is done.
176*dfc6aa5cSAndroid Build Coastguard Worker    sub         eax, byte SIZEOF_YMMWORD
177*dfc6aa5cSAndroid Build Coastguard Worker    add         esi, byte 1*SIZEOF_YMMWORD  ; inptr
178*dfc6aa5cSAndroid Build Coastguard Worker    add         edi, byte 2*SIZEOF_YMMWORD  ; outptr
179*dfc6aa5cSAndroid Build Coastguard Worker    cmp         eax, byte SIZEOF_YMMWORD
180*dfc6aa5cSAndroid Build Coastguard Worker    ja          near .columnloop
181*dfc6aa5cSAndroid Build Coastguard Worker    test        eax, eax
182*dfc6aa5cSAndroid Build Coastguard Worker    jnz         near .columnloop_last
183*dfc6aa5cSAndroid Build Coastguard Worker
; Row finished: restore the row-pointer array positions and the original
; (unrounded) column count, then step both arrays to the next row.
184*dfc6aa5cSAndroid Build Coastguard Worker    pop         esi
185*dfc6aa5cSAndroid Build Coastguard Worker    pop         edi
186*dfc6aa5cSAndroid Build Coastguard Worker    pop         eax
187*dfc6aa5cSAndroid Build Coastguard Worker
188*dfc6aa5cSAndroid Build Coastguard Worker    add         esi, byte SIZEOF_JSAMPROW  ; input_data
189*dfc6aa5cSAndroid Build Coastguard Worker    add         edi, byte SIZEOF_JSAMPROW  ; output_data
190*dfc6aa5cSAndroid Build Coastguard Worker    dec         ecx                        ; rowctr
191*dfc6aa5cSAndroid Build Coastguard Worker    jg          near .rowloop
192*dfc6aa5cSAndroid Build Coastguard Worker
; Common exit.  vzeroupper clears the upper halves of the YMM registers
; before control returns to the caller; the pops mirror the prologue pushes.
193*dfc6aa5cSAndroid Build Coastguard Worker.return:
194*dfc6aa5cSAndroid Build Coastguard Worker    vzeroupper
195*dfc6aa5cSAndroid Build Coastguard Worker    pop         edi
196*dfc6aa5cSAndroid Build Coastguard Worker    pop         esi
197*dfc6aa5cSAndroid Build Coastguard Worker;   pop         edx                     ; need not be preserved
198*dfc6aa5cSAndroid Build Coastguard Worker;   pop         ecx                     ; need not be preserved
199*dfc6aa5cSAndroid Build Coastguard Worker    poppic      ebx
200*dfc6aa5cSAndroid Build Coastguard Worker    pop         ebp
201*dfc6aa5cSAndroid Build Coastguard Worker    ret
202*dfc6aa5cSAndroid Build Coastguard Worker
203*dfc6aa5cSAndroid Build Coastguard Worker; --------------------------------------------------------------------------
204*dfc6aa5cSAndroid Build Coastguard Worker;
205*dfc6aa5cSAndroid Build Coastguard Worker; Fancy processing for the common case of 2:1 horizontal and 2:1 vertical.
206*dfc6aa5cSAndroid Build Coastguard Worker; Again a triangle filter; see comments for h2v1 case, above.
207*dfc6aa5cSAndroid Build Coastguard Worker;
208*dfc6aa5cSAndroid Build Coastguard Worker; GLOBAL(void)
209*dfc6aa5cSAndroid Build Coastguard Worker; jsimd_h2v2_fancy_upsample_avx2(int max_v_samp_factor,
210*dfc6aa5cSAndroid Build Coastguard Worker;                                JDIMENSION downsampled_width,
211*dfc6aa5cSAndroid Build Coastguard Worker;                                JSAMPARRAY input_data,
212*dfc6aa5cSAndroid Build Coastguard Worker;                                JSAMPARRAY *output_data_ptr);
213*dfc6aa5cSAndroid Build Coastguard Worker;
214*dfc6aa5cSAndroid Build Coastguard Worker
215*dfc6aa5cSAndroid Build Coastguard Worker%define max_v_samp(b)       (b) + 8     ; int max_v_samp_factor
216*dfc6aa5cSAndroid Build Coastguard Worker%define downsamp_width(b)   (b) + 12    ; JDIMENSION downsampled_width
217*dfc6aa5cSAndroid Build Coastguard Worker%define input_data(b)       (b) + 16    ; JSAMPARRAY input_data
218*dfc6aa5cSAndroid Build Coastguard Worker%define output_data_ptr(b)  (b) + 20    ; JSAMPARRAY *output_data_ptr
219*dfc6aa5cSAndroid Build Coastguard Worker
220*dfc6aa5cSAndroid Build Coastguard Worker%define original_ebp  ebp + 0
221*dfc6aa5cSAndroid Build Coastguard Worker%define wk(i)         ebp - (WK_NUM - (i)) * SIZEOF_YMMWORD
222*dfc6aa5cSAndroid Build Coastguard Worker                                        ; ymmword wk[WK_NUM]
223*dfc6aa5cSAndroid Build Coastguard Worker%define WK_NUM        4
224*dfc6aa5cSAndroid Build Coastguard Worker%define gotptr        wk(0) - SIZEOF_POINTER  ; void *gotptr
225*dfc6aa5cSAndroid Build Coastguard Worker
226*dfc6aa5cSAndroid Build Coastguard Worker    align       32
227*dfc6aa5cSAndroid Build Coastguard Worker    GLOBAL_FUNCTION(jsimd_h2v2_fancy_upsample_avx2)
228*dfc6aa5cSAndroid Build Coastguard Worker
229*dfc6aa5cSAndroid Build Coastguard WorkerEXTN(jsimd_h2v2_fancy_upsample_avx2):
230*dfc6aa5cSAndroid Build Coastguard Worker    push        ebp
231*dfc6aa5cSAndroid Build Coastguard Worker    mov         eax, esp                     ; eax = original ebp
232*dfc6aa5cSAndroid Build Coastguard Worker    sub         esp, byte 4
233*dfc6aa5cSAndroid Build Coastguard Worker    and         esp, byte (-SIZEOF_YMMWORD)  ; align to 256 bits
234*dfc6aa5cSAndroid Build Coastguard Worker    mov         [esp], eax
235*dfc6aa5cSAndroid Build Coastguard Worker    mov         ebp, esp                     ; ebp = aligned ebp
236*dfc6aa5cSAndroid Build Coastguard Worker    lea         esp, [wk(0)]
237*dfc6aa5cSAndroid Build Coastguard Worker    pushpic     eax                     ; make a room for GOT address
238*dfc6aa5cSAndroid Build Coastguard Worker    push        ebx
239*dfc6aa5cSAndroid Build Coastguard Worker;   push        ecx                     ; need not be preserved
240*dfc6aa5cSAndroid Build Coastguard Worker;   push        edx                     ; need not be preserved
241*dfc6aa5cSAndroid Build Coastguard Worker    push        esi
242*dfc6aa5cSAndroid Build Coastguard Worker    push        edi
243*dfc6aa5cSAndroid Build Coastguard Worker
244*dfc6aa5cSAndroid Build Coastguard Worker    get_GOT     ebx                     ; get GOT address
245*dfc6aa5cSAndroid Build Coastguard Worker    movpic      POINTER [gotptr], ebx   ; save GOT address
246*dfc6aa5cSAndroid Build Coastguard Worker
247*dfc6aa5cSAndroid Build Coastguard Worker    mov         edx, eax                ; edx = original ebp
248*dfc6aa5cSAndroid Build Coastguard Worker    mov         eax, JDIMENSION [downsamp_width(edx)]  ; colctr
249*dfc6aa5cSAndroid Build Coastguard Worker    test        eax, eax
250*dfc6aa5cSAndroid Build Coastguard Worker    jz          near .return
251*dfc6aa5cSAndroid Build Coastguard Worker
252*dfc6aa5cSAndroid Build Coastguard Worker    mov         ecx, INT [max_v_samp(edx)]  ; rowctr
253*dfc6aa5cSAndroid Build Coastguard Worker    test        ecx, ecx
254*dfc6aa5cSAndroid Build Coastguard Worker    jz          near .return
255*dfc6aa5cSAndroid Build Coastguard Worker
256*dfc6aa5cSAndroid Build Coastguard Worker    mov         esi, JSAMPARRAY [input_data(edx)]    ; input_data
257*dfc6aa5cSAndroid Build Coastguard Worker    mov         edi, POINTER [output_data_ptr(edx)]
258*dfc6aa5cSAndroid Build Coastguard Worker    mov         edi, JSAMPARRAY [edi]                ; output_data
259*dfc6aa5cSAndroid Build Coastguard Worker    alignx      16, 7
260*dfc6aa5cSAndroid Build Coastguard Worker.rowloop:
261*dfc6aa5cSAndroid Build Coastguard Worker    push        eax                     ; colctr
262*dfc6aa5cSAndroid Build Coastguard Worker    push        ecx
263*dfc6aa5cSAndroid Build Coastguard Worker    push        edi
264*dfc6aa5cSAndroid Build Coastguard Worker    push        esi
265*dfc6aa5cSAndroid Build Coastguard Worker
266*dfc6aa5cSAndroid Build Coastguard Worker    mov         ecx, JSAMPROW [esi-1*SIZEOF_JSAMPROW]  ; inptr1(above)
267*dfc6aa5cSAndroid Build Coastguard Worker    mov         ebx, JSAMPROW [esi+0*SIZEOF_JSAMPROW]  ; inptr0
268*dfc6aa5cSAndroid Build Coastguard Worker    mov         esi, JSAMPROW [esi+1*SIZEOF_JSAMPROW]  ; inptr1(below)
269*dfc6aa5cSAndroid Build Coastguard Worker    mov         edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]  ; outptr0
270*dfc6aa5cSAndroid Build Coastguard Worker    mov         edi, JSAMPROW [edi+1*SIZEOF_JSAMPROW]  ; outptr1
271*dfc6aa5cSAndroid Build Coastguard Worker
272*dfc6aa5cSAndroid Build Coastguard Worker    test        eax, SIZEOF_YMMWORD-1
273*dfc6aa5cSAndroid Build Coastguard Worker    jz          short .skip
274*dfc6aa5cSAndroid Build Coastguard Worker    push        edx
275*dfc6aa5cSAndroid Build Coastguard Worker    mov         dl, JSAMPLE [ecx+(eax-1)*SIZEOF_JSAMPLE]
276*dfc6aa5cSAndroid Build Coastguard Worker    mov         JSAMPLE [ecx+eax*SIZEOF_JSAMPLE], dl
277*dfc6aa5cSAndroid Build Coastguard Worker    mov         dl, JSAMPLE [ebx+(eax-1)*SIZEOF_JSAMPLE]
278*dfc6aa5cSAndroid Build Coastguard Worker    mov         JSAMPLE [ebx+eax*SIZEOF_JSAMPLE], dl
279*dfc6aa5cSAndroid Build Coastguard Worker    mov         dl, JSAMPLE [esi+(eax-1)*SIZEOF_JSAMPLE]
280*dfc6aa5cSAndroid Build Coastguard Worker    mov         JSAMPLE [esi+eax*SIZEOF_JSAMPLE], dl    ; insert a dummy sample
281*dfc6aa5cSAndroid Build Coastguard Worker    pop         edx
282*dfc6aa5cSAndroid Build Coastguard Worker.skip:
283*dfc6aa5cSAndroid Build Coastguard Worker    ; -- process the first column block
284*dfc6aa5cSAndroid Build Coastguard Worker
285*dfc6aa5cSAndroid Build Coastguard Worker    vmovdqu     ymm0, YMMWORD [ebx+0*SIZEOF_YMMWORD]  ; ymm0=row[ 0][0]
286*dfc6aa5cSAndroid Build Coastguard Worker    vmovdqu     ymm1, YMMWORD [ecx+0*SIZEOF_YMMWORD]  ; ymm1=row[-1][0]
287*dfc6aa5cSAndroid Build Coastguard Worker    vmovdqu     ymm2, YMMWORD [esi+0*SIZEOF_YMMWORD]  ; ymm2=row[+1][0]
288*dfc6aa5cSAndroid Build Coastguard Worker
289*dfc6aa5cSAndroid Build Coastguard Worker    pushpic     ebx
290*dfc6aa5cSAndroid Build Coastguard Worker    movpic      ebx, POINTER [gotptr]   ; load GOT address
291*dfc6aa5cSAndroid Build Coastguard Worker
292*dfc6aa5cSAndroid Build Coastguard Worker    vpxor       ymm3, ymm3, ymm3        ; ymm3=(all 0's)
293*dfc6aa5cSAndroid Build Coastguard Worker
294*dfc6aa5cSAndroid Build Coastguard Worker    vpunpckhbw  ymm4, ymm0, ymm3        ; ymm4=row[ 0]( 8  9 10 11 12 13 14 15 24 25 26 27 28 29 30 31)
295*dfc6aa5cSAndroid Build Coastguard Worker    vpunpcklbw  ymm5, ymm0, ymm3        ; ymm5=row[ 0]( 0  1  2  3  4  5  6  7 16 17 18 19 20 21 22 23)
296*dfc6aa5cSAndroid Build Coastguard Worker    vperm2i128  ymm0, ymm5, ymm4, 0x20  ; ymm0=row[ 0]( 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15)
297*dfc6aa5cSAndroid Build Coastguard Worker    vperm2i128  ymm4, ymm5, ymm4, 0x31  ; ymm4=row[ 0](16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31)
298*dfc6aa5cSAndroid Build Coastguard Worker
299*dfc6aa5cSAndroid Build Coastguard Worker    vpunpckhbw  ymm5, ymm1, ymm3        ; ymm5=row[-1]( 8  9 10 11 12 13 14 15 24 25 26 27 28 29 30 31)
300*dfc6aa5cSAndroid Build Coastguard Worker    vpunpcklbw  ymm6, ymm1, ymm3        ; ymm6=row[-1]( 0  1  2  3  4  5  6  7 16 17 18 19 20 21 22 23)
301*dfc6aa5cSAndroid Build Coastguard Worker    vperm2i128  ymm1, ymm6, ymm5, 0x20  ; ymm1=row[-1]( 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15)
302*dfc6aa5cSAndroid Build Coastguard Worker    vperm2i128  ymm5, ymm6, ymm5, 0x31  ; ymm5=row[-1](16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31)
303*dfc6aa5cSAndroid Build Coastguard Worker
304*dfc6aa5cSAndroid Build Coastguard Worker    vpunpckhbw  ymm6, ymm2, ymm3        ; ymm6=row[+1]( 8  9 10 11 12 13 14 15 24 25 26 27 28 29 30 31)
305*dfc6aa5cSAndroid Build Coastguard Worker    vpunpcklbw  ymm3, ymm2, ymm3        ; ymm3=row[+1]( 0  1  2  3  4  5  6  7 16 17 18 19 20 21 22 23)
306*dfc6aa5cSAndroid Build Coastguard Worker    vperm2i128  ymm2, ymm3, ymm6, 0x20  ; ymm2=row[+1]( 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15)
307*dfc6aa5cSAndroid Build Coastguard Worker    vperm2i128  ymm6, ymm3, ymm6, 0x31  ; ymm6=row[+1](16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31)
308*dfc6aa5cSAndroid Build Coastguard Worker
309*dfc6aa5cSAndroid Build Coastguard Worker    vpmullw     ymm0, ymm0, [GOTOFF(ebx,PW_THREE)]
310*dfc6aa5cSAndroid Build Coastguard Worker    vpmullw     ymm4, ymm4, [GOTOFF(ebx,PW_THREE)]
311*dfc6aa5cSAndroid Build Coastguard Worker
312*dfc6aa5cSAndroid Build Coastguard Worker    vpcmpeqb    xmm7, xmm7, xmm7
313*dfc6aa5cSAndroid Build Coastguard Worker    vpsrldq     xmm7, xmm7, (SIZEOF_XMMWORD-2)  ; (ffff ---- ---- ... ---- ----) LSB is ffff
314*dfc6aa5cSAndroid Build Coastguard Worker
315*dfc6aa5cSAndroid Build Coastguard Worker    vpaddw      ymm1, ymm1, ymm0        ; ymm1=Int0L=( 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15)
316*dfc6aa5cSAndroid Build Coastguard Worker    vpaddw      ymm5, ymm5, ymm4        ; ymm5=Int0H=(16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31)
317*dfc6aa5cSAndroid Build Coastguard Worker    vpaddw      ymm2, ymm2, ymm0        ; ymm2=Int1L=( 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15)
318*dfc6aa5cSAndroid Build Coastguard Worker    vpaddw      ymm6, ymm6, ymm4        ; ymm6=Int1H=(16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31)
319*dfc6aa5cSAndroid Build Coastguard Worker
320*dfc6aa5cSAndroid Build Coastguard Worker    vmovdqu     YMMWORD [edx+0*SIZEOF_YMMWORD], ymm1  ; temporarily save
321*dfc6aa5cSAndroid Build Coastguard Worker    vmovdqu     YMMWORD [edx+1*SIZEOF_YMMWORD], ymm5  ; the intermediate data
322*dfc6aa5cSAndroid Build Coastguard Worker    vmovdqu     YMMWORD [edi+0*SIZEOF_YMMWORD], ymm2
323*dfc6aa5cSAndroid Build Coastguard Worker    vmovdqu     YMMWORD [edi+1*SIZEOF_YMMWORD], ymm6
324*dfc6aa5cSAndroid Build Coastguard Worker
325*dfc6aa5cSAndroid Build Coastguard Worker    vpand       ymm1, ymm1, ymm7        ; ymm1=( 0 -- -- -- -- -- -- -- -- -- -- -- -- -- -- --)
326*dfc6aa5cSAndroid Build Coastguard Worker    vpand       ymm2, ymm2, ymm7        ; ymm2=( 0 -- -- -- -- -- -- -- -- -- -- -- -- -- -- --)
327*dfc6aa5cSAndroid Build Coastguard Worker
328*dfc6aa5cSAndroid Build Coastguard Worker    vmovdqa     YMMWORD [wk(0)], ymm1
329*dfc6aa5cSAndroid Build Coastguard Worker    vmovdqa     YMMWORD [wk(1)], ymm2
330*dfc6aa5cSAndroid Build Coastguard Worker
331*dfc6aa5cSAndroid Build Coastguard Worker    poppic      ebx
332*dfc6aa5cSAndroid Build Coastguard Worker
333*dfc6aa5cSAndroid Build Coastguard Worker    add         eax, byte SIZEOF_YMMWORD-1
334*dfc6aa5cSAndroid Build Coastguard Worker    and         eax, byte -SIZEOF_YMMWORD
335*dfc6aa5cSAndroid Build Coastguard Worker    cmp         eax, byte SIZEOF_YMMWORD
336*dfc6aa5cSAndroid Build Coastguard Worker    ja          short .columnloop
337*dfc6aa5cSAndroid Build Coastguard Worker    alignx      16, 7
338*dfc6aa5cSAndroid Build Coastguard Worker
339*dfc6aa5cSAndroid Build Coastguard Worker.columnloop_last:
340*dfc6aa5cSAndroid Build Coastguard Worker    ; -- process the last column block
341*dfc6aa5cSAndroid Build Coastguard Worker
342*dfc6aa5cSAndroid Build Coastguard Worker    pushpic     ebx
343*dfc6aa5cSAndroid Build Coastguard Worker    movpic      ebx, POINTER [gotptr]   ; load GOT address
344*dfc6aa5cSAndroid Build Coastguard Worker
345*dfc6aa5cSAndroid Build Coastguard Worker    vpcmpeqb    xmm1, xmm1, xmm1
346*dfc6aa5cSAndroid Build Coastguard Worker    vpslldq     xmm1, xmm1, (SIZEOF_XMMWORD-2)
347*dfc6aa5cSAndroid Build Coastguard Worker    vperm2i128  ymm1, ymm1, ymm1, 1             ; (---- ---- ... ---- ---- ffff) MSB is ffff
348*dfc6aa5cSAndroid Build Coastguard Worker
349*dfc6aa5cSAndroid Build Coastguard Worker    vpand       ymm2, ymm1, YMMWORD [edi+1*SIZEOF_YMMWORD]
350*dfc6aa5cSAndroid Build Coastguard Worker    vpand       ymm1, ymm1, YMMWORD [edx+1*SIZEOF_YMMWORD]
351*dfc6aa5cSAndroid Build Coastguard Worker
352*dfc6aa5cSAndroid Build Coastguard Worker    vmovdqa     YMMWORD [wk(2)], ymm1          ; ymm1=(-- -- -- -- -- -- -- -- -- -- -- -- -- -- -- 31)
353*dfc6aa5cSAndroid Build Coastguard Worker    vmovdqa     YMMWORD [wk(3)], ymm2          ; ymm2=(-- -- -- -- -- -- -- -- -- -- -- -- -- -- -- 31)
354*dfc6aa5cSAndroid Build Coastguard Worker
355*dfc6aa5cSAndroid Build Coastguard Worker    jmp         near .upsample
356*dfc6aa5cSAndroid Build Coastguard Worker    alignx      16, 7
357*dfc6aa5cSAndroid Build Coastguard Worker
358*dfc6aa5cSAndroid Build Coastguard Worker.columnloop:
359*dfc6aa5cSAndroid Build Coastguard Worker    ; -- process the next column block
360*dfc6aa5cSAndroid Build Coastguard Worker
361*dfc6aa5cSAndroid Build Coastguard Worker    vmovdqu     ymm0, YMMWORD [ebx+1*SIZEOF_YMMWORD]  ; ymm0=row[ 0][1]
362*dfc6aa5cSAndroid Build Coastguard Worker    vmovdqu     ymm1, YMMWORD [ecx+1*SIZEOF_YMMWORD]  ; ymm1=row[-1][1]
363*dfc6aa5cSAndroid Build Coastguard Worker    vmovdqu     ymm2, YMMWORD [esi+1*SIZEOF_YMMWORD]  ; ymm2=row[+1][1]
364*dfc6aa5cSAndroid Build Coastguard Worker
365*dfc6aa5cSAndroid Build Coastguard Worker    pushpic     ebx
366*dfc6aa5cSAndroid Build Coastguard Worker    movpic      ebx, POINTER [gotptr]   ; load GOT address
367*dfc6aa5cSAndroid Build Coastguard Worker
368*dfc6aa5cSAndroid Build Coastguard Worker    vpxor       ymm3, ymm3, ymm3        ; ymm3=(all 0's)
369*dfc6aa5cSAndroid Build Coastguard Worker
370*dfc6aa5cSAndroid Build Coastguard Worker    vpunpckhbw  ymm4, ymm0, ymm3        ; ymm4=row[ 0]( 8  9 10 11 12 13 14 15 24 25 26 27 28 29 30 31)
371*dfc6aa5cSAndroid Build Coastguard Worker    vpunpcklbw  ymm5, ymm0, ymm3        ; ymm5=row[ 0]( 0  1  2  3  4  5  6  7 16 17 18 19 20 21 22 23)
372*dfc6aa5cSAndroid Build Coastguard Worker    vperm2i128  ymm0, ymm5, ymm4, 0x20  ; ymm0=row[ 0]( 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15)
373*dfc6aa5cSAndroid Build Coastguard Worker    vperm2i128  ymm4, ymm5, ymm4, 0x31  ; ymm4=row[ 0](16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31)
374*dfc6aa5cSAndroid Build Coastguard Worker
375*dfc6aa5cSAndroid Build Coastguard Worker    vpunpckhbw  ymm5, ymm1, ymm3        ; ymm5=row[-1]( 8  9 10 11 12 13 14 15 24 25 26 27 28 29 30 31)
376*dfc6aa5cSAndroid Build Coastguard Worker    vpunpcklbw  ymm6, ymm1, ymm3        ; ymm6=row[-1]( 0  1  2  3  4  5  6  7 16 17 18 19 20 21 22 23)
377*dfc6aa5cSAndroid Build Coastguard Worker    vperm2i128  ymm1, ymm6, ymm5, 0x20  ; ymm1=row[-1]( 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15)
378*dfc6aa5cSAndroid Build Coastguard Worker    vperm2i128  ymm5, ymm6, ymm5, 0x31  ; ymm5=row[-1](16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31)
379*dfc6aa5cSAndroid Build Coastguard Worker
380*dfc6aa5cSAndroid Build Coastguard Worker    vpunpckhbw  ymm6, ymm2, ymm3        ; ymm6=row[+1]( 8  9 10 11 12 13 14 15 24 25 26 27 28 29 30 31)
381*dfc6aa5cSAndroid Build Coastguard Worker    vpunpcklbw  ymm7, ymm2, ymm3        ; ymm7=row[+1]( 0  1  2  3  4  5  6  7 16 17 18 19 20 21 22 23)
382*dfc6aa5cSAndroid Build Coastguard Worker    vperm2i128  ymm2, ymm7, ymm6, 0x20  ; ymm2=row[+1]( 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15)
383*dfc6aa5cSAndroid Build Coastguard Worker    vperm2i128  ymm6, ymm7, ymm6, 0x31  ; ymm6=row[+1](16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31)
384*dfc6aa5cSAndroid Build Coastguard Worker
385*dfc6aa5cSAndroid Build Coastguard Worker    vpmullw     ymm0, ymm0, [GOTOFF(ebx,PW_THREE)]
386*dfc6aa5cSAndroid Build Coastguard Worker    vpmullw     ymm4, ymm4, [GOTOFF(ebx,PW_THREE)]
387*dfc6aa5cSAndroid Build Coastguard Worker
388*dfc6aa5cSAndroid Build Coastguard Worker    vpaddw      ymm1, ymm1, ymm0        ; ymm1=Int0L=( 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15)
389*dfc6aa5cSAndroid Build Coastguard Worker    vpaddw      ymm5, ymm5, ymm4        ; ymm5=Int0H=(16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31)
390*dfc6aa5cSAndroid Build Coastguard Worker    vpaddw      ymm2, ymm2, ymm0        ; ymm2=Int1L=( 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15)
391*dfc6aa5cSAndroid Build Coastguard Worker    vpaddw      ymm6, ymm6, ymm4        ; ymm6=Int1H=(16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31)
392*dfc6aa5cSAndroid Build Coastguard Worker
393*dfc6aa5cSAndroid Build Coastguard Worker    vmovdqu     YMMWORD [edx+2*SIZEOF_YMMWORD], ymm1  ; temporarily save
394*dfc6aa5cSAndroid Build Coastguard Worker    vmovdqu     YMMWORD [edx+3*SIZEOF_YMMWORD], ymm5  ; the intermediate data
395*dfc6aa5cSAndroid Build Coastguard Worker    vmovdqu     YMMWORD [edi+2*SIZEOF_YMMWORD], ymm2
396*dfc6aa5cSAndroid Build Coastguard Worker    vmovdqu     YMMWORD [edi+3*SIZEOF_YMMWORD], ymm6
397*dfc6aa5cSAndroid Build Coastguard Worker
398*dfc6aa5cSAndroid Build Coastguard Worker    vperm2i128  ymm1, ymm3, ymm1, 0x20
399*dfc6aa5cSAndroid Build Coastguard Worker    vpslldq     ymm1, ymm1, 14          ; ymm1=(-- -- -- -- -- -- -- -- -- -- -- -- -- -- --  0)
400*dfc6aa5cSAndroid Build Coastguard Worker    vperm2i128  ymm2, ymm3, ymm2, 0x20
401*dfc6aa5cSAndroid Build Coastguard Worker    vpslldq     ymm2, ymm2, 14          ; ymm2=(-- -- -- -- -- -- -- -- -- -- -- -- -- -- --  0)
402*dfc6aa5cSAndroid Build Coastguard Worker
403*dfc6aa5cSAndroid Build Coastguard Worker    vmovdqa     YMMWORD [wk(2)], ymm1
404*dfc6aa5cSAndroid Build Coastguard Worker    vmovdqa     YMMWORD [wk(3)], ymm2
405*dfc6aa5cSAndroid Build Coastguard Worker
406*dfc6aa5cSAndroid Build Coastguard Worker.upsample:
407*dfc6aa5cSAndroid Build Coastguard Worker    ; -- process the upper row
408*dfc6aa5cSAndroid Build Coastguard Worker
409*dfc6aa5cSAndroid Build Coastguard Worker    vmovdqu     ymm7, YMMWORD [edx+0*SIZEOF_YMMWORD]  ; ymm7=Int0L=( 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15)
410*dfc6aa5cSAndroid Build Coastguard Worker    vmovdqu     ymm3, YMMWORD [edx+1*SIZEOF_YMMWORD]  ; ymm3=Int0H=(16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31)
411*dfc6aa5cSAndroid Build Coastguard Worker
412*dfc6aa5cSAndroid Build Coastguard Worker    vpxor       ymm1, ymm1, ymm1        ; ymm1=(all 0's)
413*dfc6aa5cSAndroid Build Coastguard Worker
414*dfc6aa5cSAndroid Build Coastguard Worker    vperm2i128  ymm0, ymm1, ymm7, 0x03
415*dfc6aa5cSAndroid Build Coastguard Worker    vpalignr    ymm0, ymm0, ymm7, 2     ; ymm0=( 1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 --)
416*dfc6aa5cSAndroid Build Coastguard Worker    vperm2i128  ymm4, ymm1, ymm3, 0x20
417*dfc6aa5cSAndroid Build Coastguard Worker    vpslldq     ymm4, ymm4, 14          ; ymm4=(-- -- -- -- -- -- -- -- -- -- -- -- -- -- -- 16)
418*dfc6aa5cSAndroid Build Coastguard Worker
419*dfc6aa5cSAndroid Build Coastguard Worker    vperm2i128  ymm5, ymm1, ymm7, 0x03
420*dfc6aa5cSAndroid Build Coastguard Worker    vpsrldq     ymm5, ymm5, 14          ; ymm5=(15 -- -- -- -- -- -- -- -- -- -- -- -- -- -- --)
421*dfc6aa5cSAndroid Build Coastguard Worker    vperm2i128  ymm6, ymm1, ymm3, 0x20
422*dfc6aa5cSAndroid Build Coastguard Worker    vpalignr    ymm6, ymm3, ymm6, 14    ; ymm6=(-- 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30)
423*dfc6aa5cSAndroid Build Coastguard Worker
424*dfc6aa5cSAndroid Build Coastguard Worker    vpor        ymm0, ymm0, ymm4        ; ymm0=( 1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16)
425*dfc6aa5cSAndroid Build Coastguard Worker    vpor        ymm5, ymm5, ymm6        ; ymm5=(15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30)
426*dfc6aa5cSAndroid Build Coastguard Worker
427*dfc6aa5cSAndroid Build Coastguard Worker    vperm2i128  ymm2, ymm1, ymm3, 0x03
428*dfc6aa5cSAndroid Build Coastguard Worker    vpalignr    ymm2, ymm2, ymm3, 2     ; ymm2=(17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 --)
429*dfc6aa5cSAndroid Build Coastguard Worker    vperm2i128  ymm4, ymm1, ymm3, 0x03
430*dfc6aa5cSAndroid Build Coastguard Worker    vpsrldq     ymm4, ymm4, 14          ; ymm4=(31 -- -- -- -- -- -- -- -- -- -- -- -- -- -- --)
431*dfc6aa5cSAndroid Build Coastguard Worker    vperm2i128  ymm1, ymm1, ymm7, 0x20
432*dfc6aa5cSAndroid Build Coastguard Worker    vpalignr    ymm1, ymm7, ymm1, 14    ; ymm1=(--  0  1  2  3  4  5  6  7  8  9 10 11 12 13 14)
433*dfc6aa5cSAndroid Build Coastguard Worker
434*dfc6aa5cSAndroid Build Coastguard Worker    vpor        ymm1, ymm1, YMMWORD [wk(0)]  ; ymm1=(-1  0  1  2  3  4  5  6  7  8  9 10 11 12 13 14)
435*dfc6aa5cSAndroid Build Coastguard Worker    vpor        ymm2, ymm2, YMMWORD [wk(2)]  ; ymm2=(17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32)
436*dfc6aa5cSAndroid Build Coastguard Worker
437*dfc6aa5cSAndroid Build Coastguard Worker    vmovdqa     YMMWORD [wk(0)], ymm4
438*dfc6aa5cSAndroid Build Coastguard Worker
439*dfc6aa5cSAndroid Build Coastguard Worker    vpmullw     ymm7, ymm7, [GOTOFF(ebx,PW_THREE)]
440*dfc6aa5cSAndroid Build Coastguard Worker    vpmullw     ymm3, ymm3, [GOTOFF(ebx,PW_THREE)]
441*dfc6aa5cSAndroid Build Coastguard Worker    vpaddw      ymm1, ymm1, [GOTOFF(ebx,PW_EIGHT)]
442*dfc6aa5cSAndroid Build Coastguard Worker    vpaddw      ymm5, ymm5, [GOTOFF(ebx,PW_EIGHT)]
443*dfc6aa5cSAndroid Build Coastguard Worker    vpaddw      ymm0, ymm0, [GOTOFF(ebx,PW_SEVEN)]
444*dfc6aa5cSAndroid Build Coastguard Worker    vpaddw      ymm2, [GOTOFF(ebx,PW_SEVEN)]
445*dfc6aa5cSAndroid Build Coastguard Worker
446*dfc6aa5cSAndroid Build Coastguard Worker    vpaddw      ymm1, ymm1, ymm7
447*dfc6aa5cSAndroid Build Coastguard Worker    vpaddw      ymm5, ymm5, ymm3
448*dfc6aa5cSAndroid Build Coastguard Worker    vpsrlw      ymm1, ymm1, 4           ; ymm1=Out0LE=( 0  2  4  6  8 10 12 14 16 18 20 22 24 26 28 30)
449*dfc6aa5cSAndroid Build Coastguard Worker    vpsrlw      ymm5, ymm5, 4           ; ymm5=Out0HE=(32 34 36 38 40 42 44 46 48 50 52 54 56 58 60 62)
450*dfc6aa5cSAndroid Build Coastguard Worker    vpaddw      ymm0, ymm0, ymm7
451*dfc6aa5cSAndroid Build Coastguard Worker    vpaddw      ymm2, ymm2, ymm3
452*dfc6aa5cSAndroid Build Coastguard Worker    vpsrlw      ymm0, ymm0, 4           ; ymm0=Out0LO=( 1  3  5  7  9 11 13 15 17 19 21 23 25 27 29 31)
453*dfc6aa5cSAndroid Build Coastguard Worker    vpsrlw      ymm2, ymm2, 4           ; ymm2=Out0HO=(33 35 37 39 41 43 45 47 49 51 53 55 57 59 61 63)
454*dfc6aa5cSAndroid Build Coastguard Worker
455*dfc6aa5cSAndroid Build Coastguard Worker    vpsllw      ymm0, ymm0, BYTE_BIT
456*dfc6aa5cSAndroid Build Coastguard Worker    vpsllw      ymm2, ymm2, BYTE_BIT
457*dfc6aa5cSAndroid Build Coastguard Worker    vpor        ymm1, ymm1, ymm0        ; ymm1=Out0L=( 0  1  2 ... 29 30 31)
458*dfc6aa5cSAndroid Build Coastguard Worker    vpor        ymm5, ymm5, ymm2        ; ymm5=Out0H=(32 33 34 ... 61 62 63)
459*dfc6aa5cSAndroid Build Coastguard Worker
460*dfc6aa5cSAndroid Build Coastguard Worker    vmovdqu     YMMWORD [edx+0*SIZEOF_YMMWORD], ymm1
461*dfc6aa5cSAndroid Build Coastguard Worker    vmovdqu     YMMWORD [edx+1*SIZEOF_YMMWORD], ymm5
462*dfc6aa5cSAndroid Build Coastguard Worker
463*dfc6aa5cSAndroid Build Coastguard Worker    ; -- process the lower row
464*dfc6aa5cSAndroid Build Coastguard Worker
465*dfc6aa5cSAndroid Build Coastguard Worker    vmovdqu     ymm6, YMMWORD [edi+0*SIZEOF_YMMWORD]  ; ymm6=Int1L=( 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15)
466*dfc6aa5cSAndroid Build Coastguard Worker    vmovdqu     ymm4, YMMWORD [edi+1*SIZEOF_YMMWORD]  ; ymm4=Int1H=(16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31)
467*dfc6aa5cSAndroid Build Coastguard Worker
468*dfc6aa5cSAndroid Build Coastguard Worker    vpxor       ymm1, ymm1, ymm1        ; ymm1=(all 0's)
469*dfc6aa5cSAndroid Build Coastguard Worker
470*dfc6aa5cSAndroid Build Coastguard Worker    vperm2i128  ymm7, ymm1, ymm6, 0x03
471*dfc6aa5cSAndroid Build Coastguard Worker    vpalignr    ymm7, ymm7, ymm6, 2     ; ymm7=( 1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 --)
472*dfc6aa5cSAndroid Build Coastguard Worker    vperm2i128  ymm3, ymm1, ymm4, 0x20
473*dfc6aa5cSAndroid Build Coastguard Worker    vpslldq     ymm3, ymm3, 14          ; ymm3=(-- -- -- -- -- -- -- -- -- -- -- -- -- -- -- 16)
474*dfc6aa5cSAndroid Build Coastguard Worker
475*dfc6aa5cSAndroid Build Coastguard Worker    vperm2i128  ymm0, ymm1, ymm6, 0x03
476*dfc6aa5cSAndroid Build Coastguard Worker    vpsrldq     ymm0, ymm0, 14          ; ymm0=(15 -- -- -- -- -- -- -- -- -- -- -- -- -- -- --)
477*dfc6aa5cSAndroid Build Coastguard Worker    vperm2i128  ymm2, ymm1, ymm4, 0x20
478*dfc6aa5cSAndroid Build Coastguard Worker    vpalignr    ymm2, ymm4, ymm2, 14    ; ymm2=(-- 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30)
479*dfc6aa5cSAndroid Build Coastguard Worker
480*dfc6aa5cSAndroid Build Coastguard Worker    vpor        ymm7, ymm7, ymm3        ; ymm7=( 1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16)
481*dfc6aa5cSAndroid Build Coastguard Worker    vpor        ymm0, ymm0, ymm2        ; ymm0=(15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30)
482*dfc6aa5cSAndroid Build Coastguard Worker
483*dfc6aa5cSAndroid Build Coastguard Worker    vperm2i128  ymm5, ymm1, ymm4, 0x03
484*dfc6aa5cSAndroid Build Coastguard Worker    vpalignr    ymm5, ymm5, ymm4, 2     ; ymm5=(17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 --)
485*dfc6aa5cSAndroid Build Coastguard Worker    vperm2i128  ymm3, ymm1, ymm4, 0x03
486*dfc6aa5cSAndroid Build Coastguard Worker    vpsrldq     ymm3, ymm3, 14          ; ymm3=(31 -- -- -- -- -- -- -- -- -- -- -- -- -- -- --)
487*dfc6aa5cSAndroid Build Coastguard Worker    vperm2i128  ymm1, ymm1, ymm6, 0x20
488*dfc6aa5cSAndroid Build Coastguard Worker    vpalignr    ymm1, ymm6, ymm1, 14    ; ymm1=(--  0  1  2  3  4  5  6  7  8  9 10 11 12 13 14)
489*dfc6aa5cSAndroid Build Coastguard Worker
490*dfc6aa5cSAndroid Build Coastguard Worker    vpor        ymm1, ymm1, YMMWORD [wk(1)]  ; ymm1=(-1  0  1  2  3  4  5  6  7  8  9 10 11 12 13 14)
491*dfc6aa5cSAndroid Build Coastguard Worker    vpor        ymm5, ymm5, YMMWORD [wk(3)]  ; ymm5=(17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32)
492*dfc6aa5cSAndroid Build Coastguard Worker
493*dfc6aa5cSAndroid Build Coastguard Worker    vmovdqa     YMMWORD [wk(1)], ymm3
494*dfc6aa5cSAndroid Build Coastguard Worker
495*dfc6aa5cSAndroid Build Coastguard Worker    vpmullw     ymm6, ymm6, [GOTOFF(ebx,PW_THREE)]
496*dfc6aa5cSAndroid Build Coastguard Worker    vpmullw     ymm4, ymm4, [GOTOFF(ebx,PW_THREE)]
497*dfc6aa5cSAndroid Build Coastguard Worker    vpaddw      ymm1, ymm1, [GOTOFF(ebx,PW_EIGHT)]
498*dfc6aa5cSAndroid Build Coastguard Worker    vpaddw      ymm0, ymm0, [GOTOFF(ebx,PW_EIGHT)]
499*dfc6aa5cSAndroid Build Coastguard Worker    vpaddw      ymm7, ymm7, [GOTOFF(ebx,PW_SEVEN)]
500*dfc6aa5cSAndroid Build Coastguard Worker    vpaddw      ymm5, ymm5, [GOTOFF(ebx,PW_SEVEN)]
501*dfc6aa5cSAndroid Build Coastguard Worker
502*dfc6aa5cSAndroid Build Coastguard Worker    vpaddw      ymm1, ymm1, ymm6
503*dfc6aa5cSAndroid Build Coastguard Worker    vpaddw      ymm0, ymm0, ymm4
504*dfc6aa5cSAndroid Build Coastguard Worker    vpsrlw      ymm1, ymm1, 4           ; ymm1=Out1LE=( 0  2  4  6  8 10 12 14 16 18 20 22 24 26 28 30)
505*dfc6aa5cSAndroid Build Coastguard Worker    vpsrlw      ymm0, ymm0, 4           ; ymm0=Out1HE=(32 34 36 38 40 42 44 46 48 50 52 54 56 58 60 62)
506*dfc6aa5cSAndroid Build Coastguard Worker    vpaddw      ymm7, ymm7, ymm6
507*dfc6aa5cSAndroid Build Coastguard Worker    vpaddw      ymm5, ymm5, ymm4
508*dfc6aa5cSAndroid Build Coastguard Worker    vpsrlw      ymm7, ymm7, 4           ; ymm7=Out1LO=( 1  3  5  7  9 11 13 15 17 19 21 23 25 27 29 31)
509*dfc6aa5cSAndroid Build Coastguard Worker    vpsrlw      ymm5, ymm5, 4           ; ymm5=Out1HO=(33 35 37 39 41 43 45 47 49 51 53 55 57 59 61 63)
510*dfc6aa5cSAndroid Build Coastguard Worker
511*dfc6aa5cSAndroid Build Coastguard Worker    vpsllw      ymm7, ymm7, BYTE_BIT
512*dfc6aa5cSAndroid Build Coastguard Worker    vpsllw      ymm5, ymm5, BYTE_BIT
513*dfc6aa5cSAndroid Build Coastguard Worker    vpor        ymm1, ymm1, ymm7        ; ymm1=Out1L=( 0  1  2 ... 29 30 31)
514*dfc6aa5cSAndroid Build Coastguard Worker    vpor        ymm0, ymm0, ymm5        ; ymm0=Out1H=(32 33 34 ... 61 62 63)
515*dfc6aa5cSAndroid Build Coastguard Worker
516*dfc6aa5cSAndroid Build Coastguard Worker    vmovdqu     YMMWORD [edi+0*SIZEOF_YMMWORD], ymm1
517*dfc6aa5cSAndroid Build Coastguard Worker    vmovdqu     YMMWORD [edi+1*SIZEOF_YMMWORD], ymm0
518*dfc6aa5cSAndroid Build Coastguard Worker
519*dfc6aa5cSAndroid Build Coastguard Worker    poppic      ebx
520*dfc6aa5cSAndroid Build Coastguard Worker
521*dfc6aa5cSAndroid Build Coastguard Worker    sub         eax, byte SIZEOF_YMMWORD
522*dfc6aa5cSAndroid Build Coastguard Worker    add         ecx, byte 1*SIZEOF_YMMWORD  ; inptr1(above)
523*dfc6aa5cSAndroid Build Coastguard Worker    add         ebx, byte 1*SIZEOF_YMMWORD  ; inptr0
524*dfc6aa5cSAndroid Build Coastguard Worker    add         esi, byte 1*SIZEOF_YMMWORD  ; inptr1(below)
525*dfc6aa5cSAndroid Build Coastguard Worker    add         edx, byte 2*SIZEOF_YMMWORD  ; outptr0
526*dfc6aa5cSAndroid Build Coastguard Worker    add         edi, byte 2*SIZEOF_YMMWORD  ; outptr1
527*dfc6aa5cSAndroid Build Coastguard Worker    cmp         eax, byte SIZEOF_YMMWORD
528*dfc6aa5cSAndroid Build Coastguard Worker    ja          near .columnloop
529*dfc6aa5cSAndroid Build Coastguard Worker    test        eax, eax
530*dfc6aa5cSAndroid Build Coastguard Worker    jnz         near .columnloop_last
531*dfc6aa5cSAndroid Build Coastguard Worker
532*dfc6aa5cSAndroid Build Coastguard Worker    pop         esi
533*dfc6aa5cSAndroid Build Coastguard Worker    pop         edi
534*dfc6aa5cSAndroid Build Coastguard Worker    pop         ecx
535*dfc6aa5cSAndroid Build Coastguard Worker    pop         eax
536*dfc6aa5cSAndroid Build Coastguard Worker
537*dfc6aa5cSAndroid Build Coastguard Worker    add         esi, byte 1*SIZEOF_JSAMPROW  ; input_data
538*dfc6aa5cSAndroid Build Coastguard Worker    add         edi, byte 2*SIZEOF_JSAMPROW  ; output_data
539*dfc6aa5cSAndroid Build Coastguard Worker    sub         ecx, byte 2                  ; rowctr
540*dfc6aa5cSAndroid Build Coastguard Worker    jg          near .rowloop
541*dfc6aa5cSAndroid Build Coastguard Worker
542*dfc6aa5cSAndroid Build Coastguard Worker.return:
543*dfc6aa5cSAndroid Build Coastguard Worker    vzeroupper
544*dfc6aa5cSAndroid Build Coastguard Worker    pop         edi
545*dfc6aa5cSAndroid Build Coastguard Worker    pop         esi
546*dfc6aa5cSAndroid Build Coastguard Worker;   pop         edx                     ; need not be preserved
547*dfc6aa5cSAndroid Build Coastguard Worker;   pop         ecx                     ; need not be preserved
548*dfc6aa5cSAndroid Build Coastguard Worker    pop         ebx
549*dfc6aa5cSAndroid Build Coastguard Worker    mov         esp, ebp                ; esp <- aligned ebp
550*dfc6aa5cSAndroid Build Coastguard Worker    pop         esp                     ; esp <- original ebp
551*dfc6aa5cSAndroid Build Coastguard Worker    pop         ebp
552*dfc6aa5cSAndroid Build Coastguard Worker    ret
553*dfc6aa5cSAndroid Build Coastguard Worker
; --------------------------------------------------------------------------
;
; Plain (non-"fancy") upsampling, 2:1 horizontally and 1:1 vertically.
; Every input sample is written out twice in a row; no interpolation
; between neighbouring samples takes place (box filter).
;
; GLOBAL(void)
; jsimd_h2v1_upsample_avx2(int max_v_samp_factor, JDIMENSION output_width,
;                          JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr);
;
; Register usage:
;   edx = output_width rounded up to a multiple of SIZEOF_YMMWORD
;   ecx = row counter (one input row -> one output row per pass)
;   eax = output samples still to be produced in the current row
;   esi = input row-pointer array  (inside the column loop: input row)
;   edi = output row-pointer array (inside the column loop: output row)
;
; Because the width is rounded up, loads and stores may run past
; output_width up to the next SIZEOF_YMMWORD boundary of the output row.
;

%define max_v_samp(b)       (b) + 8     ; int max_v_samp_factor
%define output_width(b)     (b) + 12    ; JDIMENSION output_width
%define input_data(b)       (b) + 16    ; JSAMPARRAY input_data
%define output_data_ptr(b)  (b) + 20    ; JSAMPARRAY *output_data_ptr

    align       32
    GLOBAL_FUNCTION(jsimd_h2v1_upsample_avx2)

EXTN(jsimd_h2v1_upsample_avx2):
    push        ebp
    mov         ebp, esp
    ; ebx is not used here; ecx and edx need not be preserved
    push        esi
    push        edi

    ; edx = output_width rounded up to a whole number of YMMWORDs
    mov         edx, JDIMENSION [output_width(ebp)]
    add         edx, byte (SIZEOF_YMMWORD-1)
    and         edx, -SIZEOF_YMMWORD
    jz          short .done                 ; zero width: nothing to do

    mov         ecx, INT [max_v_samp(ebp)]  ; ecx = row counter
    test        ecx, ecx
    jz          short .done                 ; zero rows: nothing to do

    mov         edi, POINTER [output_data_ptr(ebp)]
    mov         edi, JSAMPARRAY [edi]                ; edi = output_data
    mov         esi, JSAMPARRAY [input_data(ebp)]    ; esi = input_data
    alignx      16, 7
.row_loop:
    ; keep the row-pointer array cursors on the stack while esi/edi
    ; are reused as sample pointers
    push        edi
    push        esi

    mov         eax, edx                ; eax = samples left in this row
    mov         edi, JSAMPROW [edi]     ; edi = current output row
    mov         esi, JSAMPROW [esi]     ; esi = current input row
    alignx      16, 7
.col_loop:

    cmp         eax, byte SIZEOF_YMMWORD
    ja          near .full_ymm

    ; Final SIZEOF_YMMWORD output samples of the row: one XMMWORD of
    ; input is enough.  The high half must be unpacked first because the
    ; low-half unpack overwrites xmm0.
    vmovdqu     xmm0, XMMWORD [esi]
    vpunpckhbw  xmm1, xmm0, xmm0        ; xmm1 = upper 8 samples, doubled
    vpunpcklbw  xmm0, xmm0, xmm0        ; xmm0 = lower 8 samples, doubled

    vmovdqu     XMMWORD [edi+1*SIZEOF_XMMWORD], xmm1
    vmovdqu     XMMWORD [edi+0*SIZEOF_XMMWORD], xmm0

    jmp         short .row_done

.full_ymm:
    ; One YMMWORD of input -> two YMMWORDs of output.
    vmovdqu     ymm0, YMMWORD [esi+0*SIZEOF_YMMWORD]

    ; The byte unpacks work within each 128-bit lane, so swap the two
    ; middle qwords first (order 0,2,1,3); the unpacked results then come
    ; out in consecutive sample order.
    vpermq      ymm0, ymm0, 0xd8
    vpunpckhbw  ymm1, ymm0, ymm0        ; ymm1 = second half, doubled
    vpunpcklbw  ymm0, ymm0, ymm0        ; ymm0 = first half, doubled

    vmovdqu     YMMWORD [edi+1*SIZEOF_YMMWORD], ymm1
    vmovdqu     YMMWORD [edi+0*SIZEOF_YMMWORD], ymm0

    sub         eax, byte 2*SIZEOF_YMMWORD
    jz          short .row_done         ; row finished exactly

    add         edi, byte 2*SIZEOF_YMMWORD  ; advance output pointer
    add         esi, byte SIZEOF_YMMWORD    ; advance input pointer
    jmp         short .col_loop
    alignx      16, 7

.row_done:
    pop         esi                     ; back to the row-pointer arrays
    pop         edi

    add         edi, byte SIZEOF_JSAMPROW  ; next output_data entry
    add         esi, byte SIZEOF_JSAMPROW  ; next input_data entry
    dec         ecx                        ; one row done
    jg          short .row_loop

.done:
    vzeroupper                          ; clear upper YMM halves before returning
    pop         edi
    pop         esi
    ; edx/ecx were not saved (need not be preserved); ebx was never touched
    pop         ebp
    ret
652*dfc6aa5cSAndroid Build Coastguard Worker
; --------------------------------------------------------------------------
;
; Plain (non-"fancy") upsampling, 2:1 horizontally and 2:1 vertically.
; Every input sample is written out twice in a row, and the resulting row
; is stored into two consecutive output rows; no interpolation takes place
; (box filter).
;
; GLOBAL(void)
; jsimd_h2v2_upsample_avx2(int max_v_samp_factor, JDIMENSION output_width,
;                          JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr);
;
; Register usage:
;   edx = output_width rounded up to a multiple of SIZEOF_YMMWORD
;   ecx = row counter (decremented by 2 per pass: one input row yields
;         two output rows)
;   eax = output samples still to be produced in the current row pair
;   esi = input row-pointer array  (inside the column loop: input row)
;   edi = output row-pointer array (inside the column loop: 2nd output row)
;   ebx = 1st output row (column loop only)
;
; Because the width is rounded up, loads and stores may run past
; output_width up to the next SIZEOF_YMMWORD boundary of the output rows.
;

%define max_v_samp(b)       (b) + 8     ; int max_v_samp_factor
%define output_width(b)     (b) + 12    ; JDIMENSION output_width
%define input_data(b)       (b) + 16    ; JSAMPARRAY input_data
%define output_data_ptr(b)  (b) + 20    ; JSAMPARRAY *output_data_ptr

    align       32
    GLOBAL_FUNCTION(jsimd_h2v2_upsample_avx2)

EXTN(jsimd_h2v2_upsample_avx2):
    push        ebp
    mov         ebp, esp
    push        ebx
    ; ecx and edx need not be preserved
    push        esi
    push        edi

    ; edx = output_width rounded up to a whole number of YMMWORDs
    mov         edx, JDIMENSION [output_width(ebp)]
    add         edx, byte (SIZEOF_YMMWORD-1)
    and         edx, -SIZEOF_YMMWORD
    jz          near .done                  ; zero width: nothing to do

    mov         ecx, INT [max_v_samp(ebp)]  ; ecx = row counter
    test        ecx, ecx
    jz          near .done                  ; zero rows: nothing to do

    mov         edi, POINTER [output_data_ptr(ebp)]
    mov         edi, JSAMPARRAY [edi]                ; edi = output_data
    mov         esi, JSAMPARRAY [input_data(ebp)]    ; esi = input_data
    alignx      16, 7
.row_loop:
    ; keep the row-pointer array cursors on the stack while esi/edi
    ; are reused as sample pointers
    push        edi
    push        esi

    mov         eax, edx                               ; eax = samples left
    mov         ebx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]  ; ebx = 1st output row
    mov         edi, JSAMPROW [edi+1*SIZEOF_JSAMPROW]  ; edi = 2nd output row
    mov         esi, JSAMPROW [esi]                    ; esi = input row
    alignx      16, 7
.col_loop:

    cmp         eax, byte SIZEOF_YMMWORD
    ja          short .full_ymm

    ; Final SIZEOF_YMMWORD output samples of the row: one XMMWORD of
    ; input is enough.  The high half must be unpacked first because the
    ; low-half unpack overwrites xmm0.
    vmovdqu     xmm0, XMMWORD [esi+0*SIZEOF_XMMWORD]
    vpunpckhbw  xmm1, xmm0, xmm0        ; xmm1 = upper 8 samples, doubled
    vpunpcklbw  xmm0, xmm0, xmm0        ; xmm0 = lower 8 samples, doubled

    ; identical data goes to both output rows
    vmovdqu     XMMWORD [edi+0*SIZEOF_XMMWORD], xmm0
    vmovdqu     XMMWORD [edi+1*SIZEOF_XMMWORD], xmm1
    vmovdqu     XMMWORD [ebx+0*SIZEOF_XMMWORD], xmm0
    vmovdqu     XMMWORD [ebx+1*SIZEOF_XMMWORD], xmm1

    jmp         near .row_done

.full_ymm:
    ; One YMMWORD of input -> two YMMWORDs of output per output row.
    vmovdqu     ymm0, YMMWORD [esi+0*SIZEOF_YMMWORD]

    ; The byte unpacks work within each 128-bit lane, so swap the two
    ; middle qwords first (order 0,2,1,3); the unpacked results then come
    ; out in consecutive sample order.
    vpermq      ymm0, ymm0, 0xd8
    vpunpckhbw  ymm1, ymm0, ymm0        ; ymm1 = second half, doubled
    vpunpcklbw  ymm0, ymm0, ymm0        ; ymm0 = first half, doubled

    ; identical data goes to both output rows
    vmovdqu     YMMWORD [edi+0*SIZEOF_YMMWORD], ymm0
    vmovdqu     YMMWORD [edi+1*SIZEOF_YMMWORD], ymm1
    vmovdqu     YMMWORD [ebx+0*SIZEOF_YMMWORD], ymm0
    vmovdqu     YMMWORD [ebx+1*SIZEOF_YMMWORD], ymm1

    sub         eax, byte 2*SIZEOF_YMMWORD
    jz          short .row_done         ; row pair finished exactly

    add         edi, 2*SIZEOF_YMMWORD     ; advance 2nd output row pointer
    add         ebx, 2*SIZEOF_YMMWORD     ; advance 1st output row pointer
    add         esi, byte SIZEOF_YMMWORD  ; advance input pointer
    jmp         short .col_loop
    alignx      16, 7

.row_done:
    pop         esi                     ; back to the row-pointer arrays
    pop         edi

    add         edi, byte 2*SIZEOF_JSAMPROW  ; output_data: skip two rows
    add         esi, byte 1*SIZEOF_JSAMPROW  ; input_data: next row
    sub         ecx, byte 2                  ; two output rows done
    jg          near .row_loop

.done:
    vzeroupper                          ; clear upper YMM halves before returning
    pop         edi
    pop         esi
    ; edx/ecx were not saved (need not be preserved)
    pop         ebx
    pop         ebp
    ret
757*dfc6aa5cSAndroid Build Coastguard Worker
; Trailing alignment directive: the OS X linker has been observed to ignore
; the requested segment alignment unless the file ends with this (the
; reason is not known).
    align       32