; xref: /aosp_15_r20/external/libjpeg-turbo/simd/x86_64/jdsample-avx2.asm (revision dfc6aa5c1cfd4bc4e2018dc74aa96e29ee49c6da)
;
; jdsample.asm - upsampling (64-bit AVX2)
;
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
; Copyright (C) 2009, 2016, D. R. Commander.
; Copyright (C) 2015, Intel Corporation.
; Copyright (C) 2018, Matthias Räncker.
;
; Based on the x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru.
; For conditions of distribution and use, see copyright notice in jsimdext.inc
;
; This file should be assembled with NASM (Netwide Assembler),
; can *not* be assembled with Microsoft's MASM or any compatible
; assembler (including Borland's Turbo Assembler).
; NASM is available from http://nasm.sourceforge.net/ or
; http://sourceforge.net/project/showfiles.php?group_id=6208

%include "jsimdext.inc"

; --------------------------------------------------------------------------
; Constant table shared by the fancy-upsampling routines in this file.
;
; Every entry is 16 words (one full YMM vector) holding the same value in
; each 16-bit lane, so it can be used directly as a memory operand of the
; word-wise AVX2 instructions (vpmullw / vpaddw).
;
    SECTION     SEG_CONST

    alignz      32
    GLOBAL_DATA(jconst_fancy_upsample_avx2)

EXTN(jconst_fancy_upsample_avx2):

PW_ONE   times 16 dw 1                  ; h2v1: rounding bias for even outputs
PW_TWO   times 16 dw 2                  ; h2v1: rounding bias for odd outputs
PW_THREE times 16 dw 3                  ; weight of the nearer input sample
PW_SEVEN times 16 dw 7                  ; NOTE(review): not referenced by h2v1;
PW_EIGHT times 16 dw 8                  ; presumably h2v2 rounding biases - confirm

    alignz      32

; --------------------------------------------------------------------------
    SECTION     SEG_TEXT
    BITS        64
;
; Fancy processing for the common case of 2:1 horizontal and 1:1 vertical.
;
; The upsampling algorithm is linear interpolation between pixel centers,
; also known as a "triangle filter".  This is a good compromise between
; speed and visual quality.  The centers of the output pixels are 1/4 and 3/4
; of the way between input pixel centers.
;
; For every input sample s[i] two output samples are produced:
;
;   out[2*i]   = (3 * s[i] + s[i-1] + 1) >> 2
;   out[2*i+1] = (3 * s[i] + s[i+1] + 2) >> 2
;
; At both ends of a row the missing neighbor is replaced by the edge sample
; itself.
;
; Each pass of the column loop reads one YMM word of input samples and
; stores two YMM words of output samples.  The column count is rounded up to
; a multiple of SIZEOF_YMMWORD, so whole blocks are always read and written.
; NOTE(review): this relies on the row buffers being padded to whole blocks,
; and on one writable sample past downsampled_width in every input row (see
; the dummy sample below) -- confirm against the caller's buffer allocation.
;
; GLOBAL(void)
; jsimd_h2v1_fancy_upsample_avx2(int max_v_samp_factor,
;                                JDIMENSION downsampled_width,
;                                JSAMPARRAY input_data,
;                                JSAMPARRAY *output_data_ptr);
;
; Returns immediately when downsampled_width or max_v_samp_factor is zero.
;

; r10 = int max_v_samp_factor
; r11d = JDIMENSION downsampled_width
; r12 = JSAMPARRAY input_data
; r13 = JSAMPARRAY *output_data_ptr
;
; Register roles inside the loops:
;   rax   = colctr: columns left in the current row (whole blocks)
;   rcx   = rowctr: rows left
;   rsi   = cursor into input_data[] / inptr while inside a row
;   rdi   = cursor into output_data[] / outptr while inside a row
;   dl    = scratch byte for the dummy sample
;   ymm0  = all zeros (unpack partner and vperm2i128 filler)
;   ymm9  = mask keeping only the most significant byte of a YMM word
;   ymm10 = mask keeping only the least significant byte of a YMM word
;   ymm7  = left neighbor of the block's first sample, in the lowest byte
;   ymm6  = right neighbor of the block's last sample, in the highest byte
;   ymm1-ymm5, ymm8 = temporaries

    align       32
    GLOBAL_FUNCTION(jsimd_h2v1_fancy_upsample_avx2)

EXTN(jsimd_h2v1_fancy_upsample_avx2):
    push        rbp
    mov         rax, rsp                ; NOTE(review): presumably consumed by
                                        ; collect_args -- confirm in jsimdext.inc
    mov         rbp, rsp
    push_xmm    3
    collect_args 4

    mov         eax, r11d               ; colctr
    test        rax, rax
    jz          near .return            ; nothing to do for zero-width rows

    mov         rcx, r10                ; rowctr
    test        rcx, rcx
    jz          near .return            ; nothing to do for zero rows

    mov         rsi, r12                ; input_data
    mov         rdi, r13
    mov         rdip, JSAMPARRAY [rdi]  ; output_data

    ; Loop-invariant vectors: zero and the two single-byte edge masks.
    vpxor       ymm0, ymm0, ymm0                 ; ymm0=(all 0's)
    vpcmpeqb    xmm9, xmm9, xmm9
    vpsrldq     xmm10, xmm9, (SIZEOF_XMMWORD-1)  ; (ff -- -- -- ... -- --) LSB is ff

    vpslldq     xmm9, xmm9, (SIZEOF_XMMWORD-1)
    vperm2i128  ymm9, ymm9, ymm9, 1              ; (---- ---- ... ---- ---- ff) MSB is ff

.rowloop:
    ; Preserve the per-row state; the column loop advances rax/rsi/rdi.
    push        rax                     ; colctr
    push        rdi
    push        rsi

    mov         rsip, JSAMPROW [rsi]    ; inptr
    mov         rdip, JSAMPROW [rdi]    ; outptr

    ; When the width is not a whole number of blocks, replicate the last
    ; real sample one position to the right so that it serves as its own
    ; right-hand neighbor.
    test        rax, SIZEOF_YMMWORD-1
    jz          short .skip
    mov         dl, JSAMPLE [rsi+(rax-1)*SIZEOF_JSAMPLE]
    mov         JSAMPLE [rsi+rax*SIZEOF_JSAMPLE], dl    ; insert a dummy sample
.skip:
    ; Left edge: sample 0 acts as its own left-hand neighbor.
    vpand       ymm7, ymm10, YMMWORD [rsi+0*SIZEOF_YMMWORD]

    ; Round colctr up to a whole number of blocks.
    add         rax, byte SIZEOF_YMMWORD-1
    and         rax, byte -SIZEOF_YMMWORD
    cmp         rax, byte SIZEOF_YMMWORD
    ja          short .columnloop

.columnloop_last:
    ; Last block of the row: its own top byte is the right-hand neighbor.
    vpand       ymm6, ymm9, YMMWORD [rsi+0*SIZEOF_YMMWORD]
    jmp         short .upsample

.columnloop:
    ; Inner block: take the first sample of the next block and move it into
    ; the most significant byte, leaving every other byte zero.
    vmovdqu     ymm6, YMMWORD [rsi+1*SIZEOF_YMMWORD]
    vperm2i128  ymm6, ymm0, ymm6, 0x20
    vpslldq     ymm6, ymm6, 15

.upsample:
    vmovdqu     ymm1, YMMWORD [rsi+0*SIZEOF_YMMWORD]  ; ymm1=( 0  1  2 ... 29 30 31)

    ; Build the block shifted by one sample in each direction.
    vperm2i128  ymm2, ymm0, ymm1, 0x20
    vpalignr    ymm2, ymm1, ymm2, 15            ; ymm2=(--  0  1 ... 28 29 30)
    vperm2i128  ymm4, ymm0, ymm1, 0x03
    vpalignr    ymm3, ymm4, ymm1, 1             ; ymm3=( 1  2  3 ... 30 31 --)

    ; Fill the vacated end bytes with the neighbors from adjacent blocks.
    vpor        ymm2, ymm2, ymm7                ; ymm2=(-1  0  1 ... 28 29 30)
    vpor        ymm3, ymm3, ymm6                ; ymm3=( 1  2  3 ... 30 31 32)

    ; Carry sample 31 forward as the left-hand neighbor of the next block.
    vpsrldq     ymm7, ymm4, (SIZEOF_XMMWORD-1)  ; ymm7=(31 -- -- ... -- -- --)

    ; Widen bytes to words.  The unpacks work per 128-bit lane, so the two
    ; vperm2i128 that follow each pair put the words back in sample order.
    vpunpckhbw  ymm4, ymm1, ymm0                ; ymm4=( 8  9 10 11 12 13 14 15 24 25 26 27 28 29 30 31)
    vpunpcklbw  ymm5, ymm1, ymm0                ; ymm5=( 0  1  2  3  4  5  6  7 16 17 18 19 20 21 22 23)
    vperm2i128  ymm1, ymm5, ymm4, 0x20          ; ymm1=( 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15)
    vperm2i128  ymm4, ymm5, ymm4, 0x31          ; ymm4=(16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31)

    vpunpckhbw  ymm5, ymm2, ymm0                ; ymm5=( 7  8  9 10 11 12 13 14 23 24 25 26 27 28 29 30)
    vpunpcklbw  ymm6, ymm2, ymm0                ; ymm6=(-1  0  1  2  3  4  5  6 15 16 17 18 19 20 21 22)
    vperm2i128  ymm2, ymm6, ymm5, 0x20          ; ymm2=(-1  0  1  2  3  4  5  6  7  8  9 10 11 12 13 14)
    vperm2i128  ymm5, ymm6, ymm5, 0x31          ; ymm5=(15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30)

    vpunpckhbw  ymm6, ymm3, ymm0                ; ymm6=( 9 10 11 12 13 14 15 16 25 26 27 28 29 30 31 32)
    vpunpcklbw  ymm8, ymm3, ymm0                ; ymm8=( 1  2  3  4  5  6  7  8 17 18 19 20 21 22 23 24)
    vperm2i128  ymm3, ymm8, ymm6, 0x20          ; ymm3=( 1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16)
    vperm2i128  ymm6, ymm8, ymm6, 0x31          ; ymm6=(17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32)

    ; Weighted terms: 3*s[i], s[i-1]+1 (even bias), s[i+1]+2 (odd bias).
    vpmullw     ymm1, ymm1, [rel PW_THREE]
    vpmullw     ymm4, ymm4, [rel PW_THREE]
    vpaddw      ymm2, ymm2, [rel PW_ONE]
    vpaddw      ymm5, ymm5, [rel PW_ONE]
    vpaddw      ymm3, ymm3, [rel PW_TWO]
    vpaddw      ymm6, ymm6, [rel PW_TWO]

    vpaddw      ymm2, ymm2, ymm1
    vpaddw      ymm5, ymm5, ymm4
    vpsrlw      ymm2, ymm2, 2                   ; ymm2=OutLE=( 0  2  4  6  8 10 12 14 16 18 20 22 24 26 28 30)
    vpsrlw      ymm5, ymm5, 2                   ; ymm5=OutHE=(32 34 36 38 40 42 44 46 48 50 52 54 56 58 60 62)
    vpaddw      ymm3, ymm3, ymm1
    vpaddw      ymm6, ymm6, ymm4
    vpsrlw      ymm3, ymm3, 2                   ; ymm3=OutLO=( 1  3  5  7  9 11 13 15 17 19 21 23 25 27 29 31)
    vpsrlw      ymm6, ymm6, 2                   ; ymm6=OutHO=(33 35 37 39 41 43 45 47 49 51 53 55 57 59 61 63)

    ; Interleave: odd outputs go to the high byte of each word, even outputs
    ; stay in the low byte.
    vpsllw      ymm3, ymm3, BYTE_BIT
    vpsllw      ymm6, ymm6, BYTE_BIT
    vpor        ymm2, ymm2, ymm3                ; ymm2=OutL=( 0  1  2 ... 29 30 31)
    vpor        ymm5, ymm5, ymm6                ; ymm5=OutH=(32 33 34 ... 61 62 63)

    vmovdqu     YMMWORD [rdi+0*SIZEOF_YMMWORD], ymm2
    vmovdqu     YMMWORD [rdi+1*SIZEOF_YMMWORD], ymm5

    ; Advance one input block / two output blocks.  More than one block left
    ; -> inner block; exactly one left -> last block; none left -> row done.
    sub         rax, byte SIZEOF_YMMWORD
    add         rsi, byte 1*SIZEOF_YMMWORD  ; inptr
    add         rdi, byte 2*SIZEOF_YMMWORD  ; outptr
    cmp         rax, byte SIZEOF_YMMWORD
    ja          near .columnloop
    test        eax, eax
    jnz         near .columnloop_last

    pop         rsi
    pop         rdi
    pop         rax

    ; Step to the next row pointer in both arrays.
    add         rsi, byte SIZEOF_JSAMPROW  ; input_data
    add         rdi, byte SIZEOF_JSAMPROW  ; output_data
    dec         rcx                        ; rowctr
    jg          near .rowloop

.return:
    vzeroupper                          ; clear upper YMM halves before returning
    uncollect_args 4
    pop_xmm     3
    pop         rbp
    ret

194*dfc6aa5cSAndroid Build Coastguard Worker; --------------------------------------------------------------------------
195*dfc6aa5cSAndroid Build Coastguard Worker;
196*dfc6aa5cSAndroid Build Coastguard Worker; Fancy processing for the common case of 2:1 horizontal and 2:1 vertical.
197*dfc6aa5cSAndroid Build Coastguard Worker; Again a triangle filter; see comments for h2v1 case, above.
198*dfc6aa5cSAndroid Build Coastguard Worker;
199*dfc6aa5cSAndroid Build Coastguard Worker; GLOBAL(void)
200*dfc6aa5cSAndroid Build Coastguard Worker; jsimd_h2v2_fancy_upsample_avx2(int max_v_samp_factor,
201*dfc6aa5cSAndroid Build Coastguard Worker;                                JDIMENSION downsampled_width,
202*dfc6aa5cSAndroid Build Coastguard Worker;                                JSAMPARRAY input_data,
203*dfc6aa5cSAndroid Build Coastguard Worker;                                JSAMPARRAY *output_data_ptr);
204*dfc6aa5cSAndroid Build Coastguard Worker;
205*dfc6aa5cSAndroid Build Coastguard Worker
206*dfc6aa5cSAndroid Build Coastguard Worker; r10 = int max_v_samp_factor
207*dfc6aa5cSAndroid Build Coastguard Worker; r11d = JDIMENSION downsampled_width
208*dfc6aa5cSAndroid Build Coastguard Worker; r12 = JSAMPARRAY input_data
209*dfc6aa5cSAndroid Build Coastguard Worker; r13 = JSAMPARRAY *output_data_ptr
210*dfc6aa5cSAndroid Build Coastguard Worker
211*dfc6aa5cSAndroid Build Coastguard Worker%define wk(i)   rbp - (WK_NUM - (i)) * SIZEOF_YMMWORD  ; ymmword wk[WK_NUM]
212*dfc6aa5cSAndroid Build Coastguard Worker%define WK_NUM  4
213*dfc6aa5cSAndroid Build Coastguard Worker
214*dfc6aa5cSAndroid Build Coastguard Worker    align       32
215*dfc6aa5cSAndroid Build Coastguard Worker    GLOBAL_FUNCTION(jsimd_h2v2_fancy_upsample_avx2)
216*dfc6aa5cSAndroid Build Coastguard Worker
217*dfc6aa5cSAndroid Build Coastguard WorkerEXTN(jsimd_h2v2_fancy_upsample_avx2):
218*dfc6aa5cSAndroid Build Coastguard Worker    push        rbp
219*dfc6aa5cSAndroid Build Coastguard Worker    mov         rax, rsp                     ; rax = original rbp
220*dfc6aa5cSAndroid Build Coastguard Worker    sub         rsp, byte 4
221*dfc6aa5cSAndroid Build Coastguard Worker    and         rsp, byte (-SIZEOF_YMMWORD)  ; align to 256 bits
222*dfc6aa5cSAndroid Build Coastguard Worker    mov         [rsp], rax
223*dfc6aa5cSAndroid Build Coastguard Worker    mov         rbp, rsp                     ; rbp = aligned rbp
224*dfc6aa5cSAndroid Build Coastguard Worker    lea         rsp, [wk(0)]
225*dfc6aa5cSAndroid Build Coastguard Worker    push_xmm    3
226*dfc6aa5cSAndroid Build Coastguard Worker    collect_args 4
227*dfc6aa5cSAndroid Build Coastguard Worker    push        rbx
228*dfc6aa5cSAndroid Build Coastguard Worker
229*dfc6aa5cSAndroid Build Coastguard Worker    mov         eax, r11d               ; colctr
230*dfc6aa5cSAndroid Build Coastguard Worker    test        rax, rax
231*dfc6aa5cSAndroid Build Coastguard Worker    jz          near .return
232*dfc6aa5cSAndroid Build Coastguard Worker
233*dfc6aa5cSAndroid Build Coastguard Worker    mov         rcx, r10                ; rowctr
234*dfc6aa5cSAndroid Build Coastguard Worker    test        rcx, rcx
235*dfc6aa5cSAndroid Build Coastguard Worker    jz          near .return
236*dfc6aa5cSAndroid Build Coastguard Worker
237*dfc6aa5cSAndroid Build Coastguard Worker    mov         rsi, r12                ; input_data
238*dfc6aa5cSAndroid Build Coastguard Worker    mov         rdi, r13
239*dfc6aa5cSAndroid Build Coastguard Worker    mov         rdip, JSAMPARRAY [rdi]  ; output_data
240*dfc6aa5cSAndroid Build Coastguard Worker.rowloop:
241*dfc6aa5cSAndroid Build Coastguard Worker    push        rax                     ; colctr
242*dfc6aa5cSAndroid Build Coastguard Worker    push        rcx
243*dfc6aa5cSAndroid Build Coastguard Worker    push        rdi
244*dfc6aa5cSAndroid Build Coastguard Worker    push        rsi
245*dfc6aa5cSAndroid Build Coastguard Worker
246*dfc6aa5cSAndroid Build Coastguard Worker    mov         rcxp, JSAMPROW [rsi-1*SIZEOF_JSAMPROW]  ; inptr1(above)
247*dfc6aa5cSAndroid Build Coastguard Worker    mov         rbxp, JSAMPROW [rsi+0*SIZEOF_JSAMPROW]  ; inptr0
248*dfc6aa5cSAndroid Build Coastguard Worker    mov         rsip, JSAMPROW [rsi+1*SIZEOF_JSAMPROW]  ; inptr1(below)
249*dfc6aa5cSAndroid Build Coastguard Worker    mov         rdxp, JSAMPROW [rdi+0*SIZEOF_JSAMPROW]  ; outptr0
250*dfc6aa5cSAndroid Build Coastguard Worker    mov         rdip, JSAMPROW [rdi+1*SIZEOF_JSAMPROW]  ; outptr1
251*dfc6aa5cSAndroid Build Coastguard Worker
252*dfc6aa5cSAndroid Build Coastguard Worker    vpxor       ymm8, ymm8, ymm8                 ; ymm8=(all 0's)
253*dfc6aa5cSAndroid Build Coastguard Worker    vpcmpeqb    xmm9, xmm9, xmm9
254*dfc6aa5cSAndroid Build Coastguard Worker    vpsrldq     xmm10, xmm9, (SIZEOF_XMMWORD-2)  ; (ffff ---- ---- ... ---- ----) LSB is ffff
255*dfc6aa5cSAndroid Build Coastguard Worker    vpslldq     xmm9, xmm9, (SIZEOF_XMMWORD-2)
256*dfc6aa5cSAndroid Build Coastguard Worker    vperm2i128  ymm9, ymm9, ymm9, 1              ; (---- ---- ... ---- ---- ffff) MSB is ffff
257*dfc6aa5cSAndroid Build Coastguard Worker
258*dfc6aa5cSAndroid Build Coastguard Worker    test        rax, SIZEOF_YMMWORD-1
259*dfc6aa5cSAndroid Build Coastguard Worker    jz          short .skip
260*dfc6aa5cSAndroid Build Coastguard Worker    push        rdx
261*dfc6aa5cSAndroid Build Coastguard Worker    mov         dl, JSAMPLE [rcx+(rax-1)*SIZEOF_JSAMPLE]
262*dfc6aa5cSAndroid Build Coastguard Worker    mov         JSAMPLE [rcx+rax*SIZEOF_JSAMPLE], dl
263*dfc6aa5cSAndroid Build Coastguard Worker    mov         dl, JSAMPLE [rbx+(rax-1)*SIZEOF_JSAMPLE]
264*dfc6aa5cSAndroid Build Coastguard Worker    mov         JSAMPLE [rbx+rax*SIZEOF_JSAMPLE], dl
265*dfc6aa5cSAndroid Build Coastguard Worker    mov         dl, JSAMPLE [rsi+(rax-1)*SIZEOF_JSAMPLE]
266*dfc6aa5cSAndroid Build Coastguard Worker    mov         JSAMPLE [rsi+rax*SIZEOF_JSAMPLE], dl    ; insert a dummy sample
267*dfc6aa5cSAndroid Build Coastguard Worker    pop         rdx
268*dfc6aa5cSAndroid Build Coastguard Worker.skip:
269*dfc6aa5cSAndroid Build Coastguard Worker    ; -- process the first column block
270*dfc6aa5cSAndroid Build Coastguard Worker
271*dfc6aa5cSAndroid Build Coastguard Worker    vmovdqu     ymm0, YMMWORD [rbx+0*SIZEOF_YMMWORD]  ; ymm0=row[ 0][0]
272*dfc6aa5cSAndroid Build Coastguard Worker    vmovdqu     ymm1, YMMWORD [rcx+0*SIZEOF_YMMWORD]  ; ymm1=row[-1][0]
273*dfc6aa5cSAndroid Build Coastguard Worker    vmovdqu     ymm2, YMMWORD [rsi+0*SIZEOF_YMMWORD]  ; ymm2=row[+1][0]
274*dfc6aa5cSAndroid Build Coastguard Worker
275*dfc6aa5cSAndroid Build Coastguard Worker    vpunpckhbw  ymm4, ymm0, ymm8        ; ymm4=row[ 0]( 8  9 10 11 12 13 14 15 24 25 26 27 28 29 30 31)
276*dfc6aa5cSAndroid Build Coastguard Worker    vpunpcklbw  ymm5, ymm0, ymm8        ; ymm5=row[ 0]( 0  1  2  3  4  5  6  7 16 17 18 19 20 21 22 23)
277*dfc6aa5cSAndroid Build Coastguard Worker    vperm2i128  ymm0, ymm5, ymm4, 0x20  ; ymm0=row[ 0]( 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15)
278*dfc6aa5cSAndroid Build Coastguard Worker    vperm2i128  ymm4, ymm5, ymm4, 0x31  ; ymm4=row[ 0](16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31)
279*dfc6aa5cSAndroid Build Coastguard Worker
280*dfc6aa5cSAndroid Build Coastguard Worker    vpunpckhbw  ymm5, ymm1, ymm8        ; ymm5=row[-1]( 8  9 10 11 12 13 14 15 24 25 26 27 28 29 30 31)
281*dfc6aa5cSAndroid Build Coastguard Worker    vpunpcklbw  ymm6, ymm1, ymm8        ; ymm6=row[-1]( 0  1  2  3  4  5  6  7 16 17 18 19 20 21 22 23)
282*dfc6aa5cSAndroid Build Coastguard Worker    vperm2i128  ymm1, ymm6, ymm5, 0x20  ; ymm1=row[-1]( 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15)
283*dfc6aa5cSAndroid Build Coastguard Worker    vperm2i128  ymm5, ymm6, ymm5, 0x31  ; ymm5=row[-1](16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31)
284*dfc6aa5cSAndroid Build Coastguard Worker
285*dfc6aa5cSAndroid Build Coastguard Worker    vpunpckhbw  ymm6, ymm2, ymm8        ; ymm6=row[+1]( 8  9 10 11 12 13 14 15 24 25 26 27 28 29 30 31)
286*dfc6aa5cSAndroid Build Coastguard Worker    vpunpcklbw  ymm3, ymm2, ymm8        ; ymm3=row[+1]( 0  1  2  3  4  5  6  7 16 17 18 19 20 21 22 23)
287*dfc6aa5cSAndroid Build Coastguard Worker    vperm2i128  ymm2, ymm3, ymm6, 0x20  ; ymm2=row[+1]( 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15)
288*dfc6aa5cSAndroid Build Coastguard Worker    vperm2i128  ymm6, ymm3, ymm6, 0x31  ; ymm6=row[+1](16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31)
289*dfc6aa5cSAndroid Build Coastguard Worker
290*dfc6aa5cSAndroid Build Coastguard Worker    vpmullw     ymm0, ymm0, [rel PW_THREE]
291*dfc6aa5cSAndroid Build Coastguard Worker    vpmullw     ymm4, ymm4, [rel PW_THREE]
292*dfc6aa5cSAndroid Build Coastguard Worker
293*dfc6aa5cSAndroid Build Coastguard Worker    vpaddw      ymm1, ymm1, ymm0        ; ymm1=Int0L=( 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15)
294*dfc6aa5cSAndroid Build Coastguard Worker    vpaddw      ymm5, ymm5, ymm4        ; ymm5=Int0H=(16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31)
295*dfc6aa5cSAndroid Build Coastguard Worker    vpaddw      ymm2, ymm2, ymm0        ; ymm2=Int1L=( 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15)
296*dfc6aa5cSAndroid Build Coastguard Worker    vpaddw      ymm6, ymm6, ymm4        ; ymm6=Int1H=(16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31)
297*dfc6aa5cSAndroid Build Coastguard Worker
298*dfc6aa5cSAndroid Build Coastguard Worker    vmovdqu     YMMWORD [rdx+0*SIZEOF_YMMWORD], ymm1  ; temporarily save
299*dfc6aa5cSAndroid Build Coastguard Worker    vmovdqu     YMMWORD [rdx+1*SIZEOF_YMMWORD], ymm5  ; the intermediate data
300*dfc6aa5cSAndroid Build Coastguard Worker    vmovdqu     YMMWORD [rdi+0*SIZEOF_YMMWORD], ymm2
301*dfc6aa5cSAndroid Build Coastguard Worker    vmovdqu     YMMWORD [rdi+1*SIZEOF_YMMWORD], ymm6
302*dfc6aa5cSAndroid Build Coastguard Worker
303*dfc6aa5cSAndroid Build Coastguard Worker    vpand       ymm1, ymm1, ymm10       ; ymm1=( 0 -- -- -- -- -- -- -- -- -- -- -- -- -- -- --)
304*dfc6aa5cSAndroid Build Coastguard Worker    vpand       ymm2, ymm2, ymm10       ; ymm2=( 0 -- -- -- -- -- -- -- -- -- -- -- -- -- -- --)
305*dfc6aa5cSAndroid Build Coastguard Worker
306*dfc6aa5cSAndroid Build Coastguard Worker    vmovdqa     YMMWORD [wk(0)], ymm1
307*dfc6aa5cSAndroid Build Coastguard Worker    vmovdqa     YMMWORD [wk(1)], ymm2
308*dfc6aa5cSAndroid Build Coastguard Worker
309*dfc6aa5cSAndroid Build Coastguard Worker    add         rax, byte SIZEOF_YMMWORD-1
310*dfc6aa5cSAndroid Build Coastguard Worker    and         rax, byte -SIZEOF_YMMWORD
311*dfc6aa5cSAndroid Build Coastguard Worker    cmp         rax, byte SIZEOF_YMMWORD
312*dfc6aa5cSAndroid Build Coastguard Worker    ja          short .columnloop
313*dfc6aa5cSAndroid Build Coastguard Worker
314*dfc6aa5cSAndroid Build Coastguard Worker.columnloop_last:
315*dfc6aa5cSAndroid Build Coastguard Worker    ; -- process the last column block
316*dfc6aa5cSAndroid Build Coastguard Worker
317*dfc6aa5cSAndroid Build Coastguard Worker    vpand       ymm1, ymm9, YMMWORD [rdx+1*SIZEOF_YMMWORD]
318*dfc6aa5cSAndroid Build Coastguard Worker    vpand       ymm2, ymm9, YMMWORD [rdi+1*SIZEOF_YMMWORD]
319*dfc6aa5cSAndroid Build Coastguard Worker
320*dfc6aa5cSAndroid Build Coastguard Worker    vmovdqa     YMMWORD [wk(2)], ymm1   ; ymm1=(-- -- -- -- -- -- -- -- -- -- -- -- -- -- -- 31)
321*dfc6aa5cSAndroid Build Coastguard Worker    vmovdqa     YMMWORD [wk(3)], ymm2   ; ymm2=(-- -- -- -- -- -- -- -- -- -- -- -- -- -- -- 31)
322*dfc6aa5cSAndroid Build Coastguard Worker
323*dfc6aa5cSAndroid Build Coastguard Worker    jmp         near .upsample
324*dfc6aa5cSAndroid Build Coastguard Worker
325*dfc6aa5cSAndroid Build Coastguard Worker.columnloop:
326*dfc6aa5cSAndroid Build Coastguard Worker    ; -- process the next column block
327*dfc6aa5cSAndroid Build Coastguard Worker
328*dfc6aa5cSAndroid Build Coastguard Worker    vmovdqu     ymm0, YMMWORD [rbx+1*SIZEOF_YMMWORD]  ; ymm0=row[ 0][1]
329*dfc6aa5cSAndroid Build Coastguard Worker    vmovdqu     ymm1, YMMWORD [rcx+1*SIZEOF_YMMWORD]  ; ymm1=row[-1][1]
330*dfc6aa5cSAndroid Build Coastguard Worker    vmovdqu     ymm2, YMMWORD [rsi+1*SIZEOF_YMMWORD]  ; ymm2=row[+1][1]
331*dfc6aa5cSAndroid Build Coastguard Worker
332*dfc6aa5cSAndroid Build Coastguard Worker    vpunpckhbw  ymm4, ymm0, ymm8        ; ymm4=row[ 0]( 8  9 10 11 12 13 14 15 24 25 26 27 28 29 30 31)
333*dfc6aa5cSAndroid Build Coastguard Worker    vpunpcklbw  ymm5, ymm0, ymm8        ; ymm5=row[ 0]( 0  1  2  3  4  5  6  7 16 17 18 19 20 21 22 23)
334*dfc6aa5cSAndroid Build Coastguard Worker    vperm2i128  ymm0, ymm5, ymm4, 0x20  ; ymm0=row[ 0]( 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15)
335*dfc6aa5cSAndroid Build Coastguard Worker    vperm2i128  ymm4, ymm5, ymm4, 0x31  ; ymm4=row[ 0](16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31)
336*dfc6aa5cSAndroid Build Coastguard Worker
337*dfc6aa5cSAndroid Build Coastguard Worker    vpunpckhbw  ymm5, ymm1, ymm8        ; ymm5=row[-1]( 8  9 10 11 12 13 14 15 24 25 26 27 28 29 30 31)
338*dfc6aa5cSAndroid Build Coastguard Worker    vpunpcklbw  ymm6, ymm1, ymm8        ; ymm6=row[-1]( 0  1  2  3  4  5  6  7 16 17 18 19 20 21 22 23)
339*dfc6aa5cSAndroid Build Coastguard Worker    vperm2i128  ymm1, ymm6, ymm5, 0x20  ; ymm1=row[-1]( 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15)
340*dfc6aa5cSAndroid Build Coastguard Worker    vperm2i128  ymm5, ymm6, ymm5, 0x31  ; ymm5=row[-1](16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31)
341*dfc6aa5cSAndroid Build Coastguard Worker
342*dfc6aa5cSAndroid Build Coastguard Worker    vpunpckhbw  ymm6, ymm2, ymm8        ; ymm6=row[+1]( 8  9 10 11 12 13 14 15 24 25 26 27 28 29 30 31)
343*dfc6aa5cSAndroid Build Coastguard Worker    vpunpcklbw  ymm7, ymm2, ymm8        ; ymm7=row[+1]( 0  1  2  3  4  5  6  7 16 17 18 19 20 21 22 23)
344*dfc6aa5cSAndroid Build Coastguard Worker    vperm2i128  ymm2, ymm7, ymm6, 0x20  ; ymm2=row[+1]( 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15)
345*dfc6aa5cSAndroid Build Coastguard Worker    vperm2i128  ymm6, ymm7, ymm6, 0x31  ; ymm6=row[+1](16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31)
346*dfc6aa5cSAndroid Build Coastguard Worker
347*dfc6aa5cSAndroid Build Coastguard Worker    vpmullw     ymm0, ymm0, [rel PW_THREE]
348*dfc6aa5cSAndroid Build Coastguard Worker    vpmullw     ymm4, ymm4, [rel PW_THREE]
349*dfc6aa5cSAndroid Build Coastguard Worker
350*dfc6aa5cSAndroid Build Coastguard Worker    vpaddw      ymm1, ymm1, ymm0        ; ymm1=Int0L=( 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15)
351*dfc6aa5cSAndroid Build Coastguard Worker    vpaddw      ymm5, ymm5, ymm4        ; ymm5=Int0H=(16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31)
352*dfc6aa5cSAndroid Build Coastguard Worker    vpaddw      ymm2, ymm2, ymm0        ; ymm2=Int1L=( 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15)
353*dfc6aa5cSAndroid Build Coastguard Worker    vpaddw      ymm6, ymm6, ymm4        ; ymm6=Int1H=(16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31)
354*dfc6aa5cSAndroid Build Coastguard Worker
355*dfc6aa5cSAndroid Build Coastguard Worker    vmovdqu     YMMWORD [rdx+2*SIZEOF_YMMWORD], ymm1  ; temporarily save
356*dfc6aa5cSAndroid Build Coastguard Worker    vmovdqu     YMMWORD [rdx+3*SIZEOF_YMMWORD], ymm5  ; the intermediate data
357*dfc6aa5cSAndroid Build Coastguard Worker    vmovdqu     YMMWORD [rdi+2*SIZEOF_YMMWORD], ymm2
358*dfc6aa5cSAndroid Build Coastguard Worker    vmovdqu     YMMWORD [rdi+3*SIZEOF_YMMWORD], ymm6
359*dfc6aa5cSAndroid Build Coastguard Worker
360*dfc6aa5cSAndroid Build Coastguard Worker    vperm2i128  ymm1, ymm8, ymm1, 0x20
361*dfc6aa5cSAndroid Build Coastguard Worker    vpslldq     ymm1, ymm1, 14          ; ymm1=(-- -- -- -- -- -- -- -- -- -- -- -- -- -- --  0)
362*dfc6aa5cSAndroid Build Coastguard Worker    vperm2i128  ymm2, ymm8, ymm2, 0x20
363*dfc6aa5cSAndroid Build Coastguard Worker    vpslldq     ymm2, ymm2, 14          ; ymm2=(-- -- -- -- -- -- -- -- -- -- -- -- -- -- --  0)
364*dfc6aa5cSAndroid Build Coastguard Worker
365*dfc6aa5cSAndroid Build Coastguard Worker    vmovdqa     YMMWORD [wk(2)], ymm1
366*dfc6aa5cSAndroid Build Coastguard Worker    vmovdqa     YMMWORD [wk(3)], ymm2
367*dfc6aa5cSAndroid Build Coastguard Worker
368*dfc6aa5cSAndroid Build Coastguard Worker.upsample:
369*dfc6aa5cSAndroid Build Coastguard Worker    ; -- process the upper row
370*dfc6aa5cSAndroid Build Coastguard Worker
371*dfc6aa5cSAndroid Build Coastguard Worker    vmovdqu     ymm7, YMMWORD [rdx+0*SIZEOF_YMMWORD]  ; ymm7=Int0L=( 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15)
372*dfc6aa5cSAndroid Build Coastguard Worker    vmovdqu     ymm3, YMMWORD [rdx+1*SIZEOF_YMMWORD]  ; ymm3=Int0H=(16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31)
373*dfc6aa5cSAndroid Build Coastguard Worker
374*dfc6aa5cSAndroid Build Coastguard Worker    vperm2i128  ymm0, ymm8, ymm7, 0x03
375*dfc6aa5cSAndroid Build Coastguard Worker    vpalignr    ymm0, ymm0, ymm7, 2     ; ymm0=( 1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 --)
376*dfc6aa5cSAndroid Build Coastguard Worker    vperm2i128  ymm4, ymm8, ymm3, 0x20
377*dfc6aa5cSAndroid Build Coastguard Worker    vpslldq     ymm4, ymm4, 14          ; ymm4=(-- -- -- -- -- -- -- -- -- -- -- -- -- -- -- 16)
378*dfc6aa5cSAndroid Build Coastguard Worker
379*dfc6aa5cSAndroid Build Coastguard Worker    vperm2i128  ymm5, ymm8, ymm7, 0x03
380*dfc6aa5cSAndroid Build Coastguard Worker    vpsrldq     ymm5, ymm5, 14          ; ymm5=(15 -- -- -- -- -- -- -- -- -- -- -- -- -- -- --)
381*dfc6aa5cSAndroid Build Coastguard Worker    vperm2i128  ymm6, ymm8, ymm3, 0x20
382*dfc6aa5cSAndroid Build Coastguard Worker    vpalignr    ymm6, ymm3, ymm6, 14    ; ymm6=(-- 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30)
383*dfc6aa5cSAndroid Build Coastguard Worker
384*dfc6aa5cSAndroid Build Coastguard Worker    vpor        ymm0, ymm0, ymm4        ; ymm0=( 1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16)
385*dfc6aa5cSAndroid Build Coastguard Worker    vpor        ymm5, ymm5, ymm6        ; ymm5=(15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30)
386*dfc6aa5cSAndroid Build Coastguard Worker
387*dfc6aa5cSAndroid Build Coastguard Worker    vperm2i128  ymm2, ymm8, ymm3, 0x03
388*dfc6aa5cSAndroid Build Coastguard Worker    vpalignr    ymm2, ymm2, ymm3, 2     ; ymm2=(17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 --)
389*dfc6aa5cSAndroid Build Coastguard Worker    vperm2i128  ymm4, ymm8, ymm3, 0x03
390*dfc6aa5cSAndroid Build Coastguard Worker    vpsrldq     ymm4, ymm4, 14          ; ymm4=(31 -- -- -- -- -- -- -- -- -- -- -- -- -- -- --)
391*dfc6aa5cSAndroid Build Coastguard Worker    vperm2i128  ymm1, ymm8, ymm7, 0x20
392*dfc6aa5cSAndroid Build Coastguard Worker    vpalignr    ymm1, ymm7, ymm1, 14    ; ymm1=(--  0  1  2  3  4  5  6  7  8  9 10 11 12 13 14)
393*dfc6aa5cSAndroid Build Coastguard Worker
394*dfc6aa5cSAndroid Build Coastguard Worker    vpor        ymm1, ymm1, YMMWORD [wk(0)]  ; ymm1=(-1  0  1  2  3  4  5  6  7  8  9 10 11 12 13 14)
395*dfc6aa5cSAndroid Build Coastguard Worker    vpor        ymm2, ymm2, YMMWORD [wk(2)]  ; ymm2=(17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32)
396*dfc6aa5cSAndroid Build Coastguard Worker
397*dfc6aa5cSAndroid Build Coastguard Worker    vmovdqa     YMMWORD [wk(0)], ymm4
398*dfc6aa5cSAndroid Build Coastguard Worker
399*dfc6aa5cSAndroid Build Coastguard Worker    vpmullw     ymm7, ymm7, [rel PW_THREE]
400*dfc6aa5cSAndroid Build Coastguard Worker    vpmullw     ymm3, ymm3, [rel PW_THREE]
401*dfc6aa5cSAndroid Build Coastguard Worker    vpaddw      ymm1, ymm1, [rel PW_EIGHT]
402*dfc6aa5cSAndroid Build Coastguard Worker    vpaddw      ymm5, ymm5, [rel PW_EIGHT]
403*dfc6aa5cSAndroid Build Coastguard Worker    vpaddw      ymm0, ymm0, [rel PW_SEVEN]
404*dfc6aa5cSAndroid Build Coastguard Worker    vpaddw      ymm2, [rel PW_SEVEN]
405*dfc6aa5cSAndroid Build Coastguard Worker
406*dfc6aa5cSAndroid Build Coastguard Worker    vpaddw      ymm1, ymm1, ymm7
407*dfc6aa5cSAndroid Build Coastguard Worker    vpaddw      ymm5, ymm5, ymm3
408*dfc6aa5cSAndroid Build Coastguard Worker    vpsrlw      ymm1, ymm1, 4           ; ymm1=Out0LE=( 0  2  4  6  8 10 12 14 16 18 20 22 24 26 28 30)
409*dfc6aa5cSAndroid Build Coastguard Worker    vpsrlw      ymm5, ymm5, 4           ; ymm5=Out0HE=(32 34 36 38 40 42 44 46 48 50 52 54 56 58 60 62)
410*dfc6aa5cSAndroid Build Coastguard Worker    vpaddw      ymm0, ymm0, ymm7
411*dfc6aa5cSAndroid Build Coastguard Worker    vpaddw      ymm2, ymm2, ymm3
412*dfc6aa5cSAndroid Build Coastguard Worker    vpsrlw      ymm0, ymm0, 4           ; ymm0=Out0LO=( 1  3  5  7  9 11 13 15 17 19 21 23 25 27 29 31)
413*dfc6aa5cSAndroid Build Coastguard Worker    vpsrlw      ymm2, ymm2, 4           ; ymm2=Out0HO=(33 35 37 39 41 43 45 47 49 51 53 55 57 59 61 63)
414*dfc6aa5cSAndroid Build Coastguard Worker
415*dfc6aa5cSAndroid Build Coastguard Worker    vpsllw      ymm0, ymm0, BYTE_BIT
416*dfc6aa5cSAndroid Build Coastguard Worker    vpsllw      ymm2, ymm2, BYTE_BIT
417*dfc6aa5cSAndroid Build Coastguard Worker    vpor        ymm1, ymm1, ymm0        ; ymm1=Out0L=( 0  1  2 ... 29 30 31)
418*dfc6aa5cSAndroid Build Coastguard Worker    vpor        ymm5, ymm5, ymm2        ; ymm5=Out0H=(32 33 34 ... 61 62 63)
419*dfc6aa5cSAndroid Build Coastguard Worker
420*dfc6aa5cSAndroid Build Coastguard Worker    vmovdqu     YMMWORD [rdx+0*SIZEOF_YMMWORD], ymm1
421*dfc6aa5cSAndroid Build Coastguard Worker    vmovdqu     YMMWORD [rdx+1*SIZEOF_YMMWORD], ymm5
422*dfc6aa5cSAndroid Build Coastguard Worker
423*dfc6aa5cSAndroid Build Coastguard Worker    ; -- process the lower row
424*dfc6aa5cSAndroid Build Coastguard Worker
425*dfc6aa5cSAndroid Build Coastguard Worker    vmovdqu     ymm6, YMMWORD [rdi+0*SIZEOF_YMMWORD]  ; ymm6=Int1L=( 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15)
426*dfc6aa5cSAndroid Build Coastguard Worker    vmovdqu     ymm4, YMMWORD [rdi+1*SIZEOF_YMMWORD]  ; ymm4=Int1H=(16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31)
427*dfc6aa5cSAndroid Build Coastguard Worker
428*dfc6aa5cSAndroid Build Coastguard Worker    vperm2i128  ymm7, ymm8, ymm6, 0x03
429*dfc6aa5cSAndroid Build Coastguard Worker    vpalignr    ymm7, ymm7, ymm6, 2     ; ymm7=( 1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 --)
430*dfc6aa5cSAndroid Build Coastguard Worker    vperm2i128  ymm3, ymm8, ymm4, 0x20
431*dfc6aa5cSAndroid Build Coastguard Worker    vpslldq     ymm3, ymm3, 14          ; ymm3=(-- -- -- -- -- -- -- -- -- -- -- -- -- -- -- 16)
432*dfc6aa5cSAndroid Build Coastguard Worker
433*dfc6aa5cSAndroid Build Coastguard Worker    vperm2i128  ymm0, ymm8, ymm6, 0x03
434*dfc6aa5cSAndroid Build Coastguard Worker    vpsrldq     ymm0, ymm0, 14          ; ymm0=(15 -- -- -- -- -- -- -- -- -- -- -- -- -- -- --)
435*dfc6aa5cSAndroid Build Coastguard Worker    vperm2i128  ymm2, ymm8, ymm4, 0x20
436*dfc6aa5cSAndroid Build Coastguard Worker    vpalignr    ymm2, ymm4, ymm2, 14    ; ymm2=(-- 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30)
437*dfc6aa5cSAndroid Build Coastguard Worker
438*dfc6aa5cSAndroid Build Coastguard Worker    vpor        ymm7, ymm7, ymm3        ; ymm7=( 1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16)
439*dfc6aa5cSAndroid Build Coastguard Worker    vpor        ymm0, ymm0, ymm2        ; ymm0=(15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30)
440*dfc6aa5cSAndroid Build Coastguard Worker
441*dfc6aa5cSAndroid Build Coastguard Worker    vperm2i128  ymm5, ymm8, ymm4, 0x03
442*dfc6aa5cSAndroid Build Coastguard Worker    vpalignr    ymm5, ymm5, ymm4, 2     ; ymm5=(17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 --)
443*dfc6aa5cSAndroid Build Coastguard Worker    vperm2i128  ymm3, ymm8, ymm4, 0x03
444*dfc6aa5cSAndroid Build Coastguard Worker    vpsrldq     ymm3, ymm3, 14          ; ymm3=(31 -- -- -- -- -- -- -- -- -- -- -- -- -- -- --)
445*dfc6aa5cSAndroid Build Coastguard Worker    vperm2i128  ymm1, ymm8, ymm6, 0x20
446*dfc6aa5cSAndroid Build Coastguard Worker    vpalignr    ymm1, ymm6, ymm1, 14    ; ymm1=(--  0  1  2  3  4  5  6  7  8  9 10 11 12 13 14)
447*dfc6aa5cSAndroid Build Coastguard Worker
448*dfc6aa5cSAndroid Build Coastguard Worker    vpor        ymm1, ymm1, YMMWORD [wk(1)]  ; ymm1=(-1  0  1  2  3  4  5  6  7  8  9 10 11 12 13 14)
449*dfc6aa5cSAndroid Build Coastguard Worker    vpor        ymm5, ymm5, YMMWORD [wk(3)]  ; ymm5=(17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32)
450*dfc6aa5cSAndroid Build Coastguard Worker
451*dfc6aa5cSAndroid Build Coastguard Worker    vmovdqa     YMMWORD [wk(1)], ymm3
452*dfc6aa5cSAndroid Build Coastguard Worker
453*dfc6aa5cSAndroid Build Coastguard Worker    vpmullw     ymm6, ymm6, [rel PW_THREE]
454*dfc6aa5cSAndroid Build Coastguard Worker    vpmullw     ymm4, ymm4, [rel PW_THREE]
455*dfc6aa5cSAndroid Build Coastguard Worker    vpaddw      ymm1, ymm1, [rel PW_EIGHT]
456*dfc6aa5cSAndroid Build Coastguard Worker    vpaddw      ymm0, ymm0, [rel PW_EIGHT]
457*dfc6aa5cSAndroid Build Coastguard Worker    vpaddw      ymm7, ymm7, [rel PW_SEVEN]
458*dfc6aa5cSAndroid Build Coastguard Worker    vpaddw      ymm5, ymm5, [rel PW_SEVEN]
459*dfc6aa5cSAndroid Build Coastguard Worker
460*dfc6aa5cSAndroid Build Coastguard Worker    vpaddw      ymm1, ymm1, ymm6
461*dfc6aa5cSAndroid Build Coastguard Worker    vpaddw      ymm0, ymm0, ymm4
462*dfc6aa5cSAndroid Build Coastguard Worker    vpsrlw      ymm1, ymm1, 4           ; ymm1=Out1LE=( 0  2  4  6  8 10 12 14 16 18 20 22 24 26 28 30)
463*dfc6aa5cSAndroid Build Coastguard Worker    vpsrlw      ymm0, ymm0, 4           ; ymm0=Out1HE=(32 34 36 38 40 42 44 46 48 50 52 54 56 58 60 62)
464*dfc6aa5cSAndroid Build Coastguard Worker    vpaddw      ymm7, ymm7, ymm6
465*dfc6aa5cSAndroid Build Coastguard Worker    vpaddw      ymm5, ymm5, ymm4
466*dfc6aa5cSAndroid Build Coastguard Worker    vpsrlw      ymm7, ymm7, 4           ; ymm7=Out1LO=( 1  3  5  7  9 11 13 15 17 19 21 23 25 27 29 31)
467*dfc6aa5cSAndroid Build Coastguard Worker    vpsrlw      ymm5, ymm5, 4           ; ymm5=Out1HO=(33 35 37 39 41 43 45 47 49 51 53 55 57 59 61 63)
468*dfc6aa5cSAndroid Build Coastguard Worker
469*dfc6aa5cSAndroid Build Coastguard Worker    vpsllw      ymm7, ymm7, BYTE_BIT
470*dfc6aa5cSAndroid Build Coastguard Worker    vpsllw      ymm5, ymm5, BYTE_BIT
471*dfc6aa5cSAndroid Build Coastguard Worker    vpor        ymm1, ymm1, ymm7        ; ymm1=Out1L=( 0  1  2 ... 29 30 31)
472*dfc6aa5cSAndroid Build Coastguard Worker    vpor        ymm0, ymm0, ymm5        ; ymm0=Out1H=(32 33 34 ... 61 62 63)
473*dfc6aa5cSAndroid Build Coastguard Worker
474*dfc6aa5cSAndroid Build Coastguard Worker    vmovdqu     YMMWORD [rdi+0*SIZEOF_YMMWORD], ymm1
475*dfc6aa5cSAndroid Build Coastguard Worker    vmovdqu     YMMWORD [rdi+1*SIZEOF_YMMWORD], ymm0
476*dfc6aa5cSAndroid Build Coastguard Worker
477*dfc6aa5cSAndroid Build Coastguard Worker    sub         rax, byte SIZEOF_YMMWORD
478*dfc6aa5cSAndroid Build Coastguard Worker    add         rcx, byte 1*SIZEOF_YMMWORD  ; inptr1(above)
479*dfc6aa5cSAndroid Build Coastguard Worker    add         rbx, byte 1*SIZEOF_YMMWORD  ; inptr0
480*dfc6aa5cSAndroid Build Coastguard Worker    add         rsi, byte 1*SIZEOF_YMMWORD  ; inptr1(below)
481*dfc6aa5cSAndroid Build Coastguard Worker    add         rdx, byte 2*SIZEOF_YMMWORD  ; outptr0
482*dfc6aa5cSAndroid Build Coastguard Worker    add         rdi, byte 2*SIZEOF_YMMWORD  ; outptr1
483*dfc6aa5cSAndroid Build Coastguard Worker    cmp         rax, byte SIZEOF_YMMWORD
484*dfc6aa5cSAndroid Build Coastguard Worker    ja          near .columnloop
485*dfc6aa5cSAndroid Build Coastguard Worker    test        rax, rax
486*dfc6aa5cSAndroid Build Coastguard Worker    jnz         near .columnloop_last
487*dfc6aa5cSAndroid Build Coastguard Worker
488*dfc6aa5cSAndroid Build Coastguard Worker    pop         rsi
489*dfc6aa5cSAndroid Build Coastguard Worker    pop         rdi
490*dfc6aa5cSAndroid Build Coastguard Worker    pop         rcx
491*dfc6aa5cSAndroid Build Coastguard Worker    pop         rax
492*dfc6aa5cSAndroid Build Coastguard Worker
493*dfc6aa5cSAndroid Build Coastguard Worker    add         rsi, byte 1*SIZEOF_JSAMPROW  ; input_data
494*dfc6aa5cSAndroid Build Coastguard Worker    add         rdi, byte 2*SIZEOF_JSAMPROW  ; output_data
495*dfc6aa5cSAndroid Build Coastguard Worker    sub         rcx, byte 2                  ; rowctr
496*dfc6aa5cSAndroid Build Coastguard Worker    jg          near .rowloop
497*dfc6aa5cSAndroid Build Coastguard Worker
498*dfc6aa5cSAndroid Build Coastguard Worker.return:
499*dfc6aa5cSAndroid Build Coastguard Worker    pop         rbx
500*dfc6aa5cSAndroid Build Coastguard Worker    vzeroupper
501*dfc6aa5cSAndroid Build Coastguard Worker    uncollect_args 4
502*dfc6aa5cSAndroid Build Coastguard Worker    pop_xmm     3
503*dfc6aa5cSAndroid Build Coastguard Worker    mov         rsp, rbp                ; rsp <- aligned rbp
504*dfc6aa5cSAndroid Build Coastguard Worker    pop         rsp                     ; rsp <- original rbp
505*dfc6aa5cSAndroid Build Coastguard Worker    pop         rbp
506*dfc6aa5cSAndroid Build Coastguard Worker    ret
507*dfc6aa5cSAndroid Build Coastguard Worker
508*dfc6aa5cSAndroid Build Coastguard Worker; --------------------------------------------------------------------------
509*dfc6aa5cSAndroid Build Coastguard Worker;
510*dfc6aa5cSAndroid Build Coastguard Worker; Fast processing for the common case of 2:1 horizontal and 1:1 vertical.
511*dfc6aa5cSAndroid Build Coastguard Worker; It's still a box filter: each input sample is simply emitted twice.
512*dfc6aa5cSAndroid Build Coastguard Worker;
513*dfc6aa5cSAndroid Build Coastguard Worker; GLOBAL(void)
514*dfc6aa5cSAndroid Build Coastguard Worker; jsimd_h2v1_upsample_avx2(int max_v_samp_factor, JDIMENSION output_width,
515*dfc6aa5cSAndroid Build Coastguard Worker;                          JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr);
516*dfc6aa5cSAndroid Build Coastguard Worker;
; What the code below does:
;   - For each of max_v_samp_factor rows, one input row pointer and one
;     output row pointer are fetched, and every input byte is written twice
;     in succession to the output row (byte-wise self-interleave).
;   - The column counter is output_width rounded UP to a multiple of
;     SIZEOF_YMMWORD, and all loads/stores are whole vectors.  Output rows
;     are therefore written up to that rounded width, which can exceed
;     output_width.  NOTE(review): presumably the row buffers are padded
;     accordingly by the caller - confirm.
;   - Nothing is done when the rounded width or the row count is zero.
;
; Register roles inside the function:
;   rdx = output width rounded up to a multiple of SIZEOF_YMMWORD
;   rcx = rows still to process
;   rsi = cursor into input_data  (row loop) / input sample pointer (column loop)
;   rdi = cursor into output_data (row loop) / output sample pointer (column loop)
;   rax = output columns still to produce in the current row
;   xmm0/xmm1 (ymm0/ymm1) = scratch vectors
;
; collect_args, uncollect_args, rdip/rsip, EXTN and GLOBAL_FUNCTION are macros
; defined outside this part of the file (the file includes jsimdext.inc).
;
517*dfc6aa5cSAndroid Build Coastguard Worker
518*dfc6aa5cSAndroid Build Coastguard Worker; r10 = int max_v_samp_factor
519*dfc6aa5cSAndroid Build Coastguard Worker; r11d = JDIMENSION output_width
520*dfc6aa5cSAndroid Build Coastguard Worker; r12 = JSAMPARRAY input_data
521*dfc6aa5cSAndroid Build Coastguard Worker; r13 = JSAMPARRAY *output_data_ptr
522*dfc6aa5cSAndroid Build Coastguard Worker
523*dfc6aa5cSAndroid Build Coastguard Worker    align       32
524*dfc6aa5cSAndroid Build Coastguard Worker    GLOBAL_FUNCTION(jsimd_h2v1_upsample_avx2)
525*dfc6aa5cSAndroid Build Coastguard Worker
526*dfc6aa5cSAndroid Build Coastguard WorkerEXTN(jsimd_h2v1_upsample_avx2):
527*dfc6aa5cSAndroid Build Coastguard Worker    push        rbp
    ; rax receives the post-push stack pointer; it is not read again by the
    ; visible code before being overwritten in .rowloop.
    ; NOTE(review): presumably consumed by collect_args - confirm in jsimdext.inc.
528*dfc6aa5cSAndroid Build Coastguard Worker    mov         rax, rsp
529*dfc6aa5cSAndroid Build Coastguard Worker    mov         rbp, rsp
530*dfc6aa5cSAndroid Build Coastguard Worker    collect_args 4                      ; per the list above: arguments end up in r10-r13
531*dfc6aa5cSAndroid Build Coastguard Worker
532*dfc6aa5cSAndroid Build Coastguard Worker    mov         edx, r11d               ; 32-bit move: upper half of rdx is zeroed
533*dfc6aa5cSAndroid Build Coastguard Worker    add         rdx, byte (SIZEOF_YMMWORD-1)
534*dfc6aa5cSAndroid Build Coastguard Worker    and         rdx, -SIZEOF_YMMWORD    ; rdx = output_width rounded up to a multiple of SIZEOF_YMMWORD
535*dfc6aa5cSAndroid Build Coastguard Worker    jz          near .return            ; rounded width is zero: nothing to do
536*dfc6aa5cSAndroid Build Coastguard Worker
    ; NOTE(review): this copies all 64 bits of r10 although the argument is an
    ; int; the counter relies on the upper 32 bits of r10 being well defined.
    ; Confirm against collect_args / the calling convention.
537*dfc6aa5cSAndroid Build Coastguard Worker    mov         rcx, r10                ; rowctr
538*dfc6aa5cSAndroid Build Coastguard Worker    test        rcx, rcx
539*dfc6aa5cSAndroid Build Coastguard Worker    jz          short .return           ; zero rows: nothing to do
540*dfc6aa5cSAndroid Build Coastguard Worker
541*dfc6aa5cSAndroid Build Coastguard Worker    mov         rsi, r12                ; input_data
542*dfc6aa5cSAndroid Build Coastguard Worker    mov         rdi, r13                ; rdi = output_data_ptr
543*dfc6aa5cSAndroid Build Coastguard Worker    mov         rdip, JSAMPARRAY [rdi]  ; output_data (dereference of output_data_ptr)
544*dfc6aa5cSAndroid Build Coastguard Worker.rowloop:
    ; Save the row-array cursors; rsi/rdi are reused as sample pointers below
    ; and restored (in reverse order) at .nextrow.
545*dfc6aa5cSAndroid Build Coastguard Worker    push        rdi
546*dfc6aa5cSAndroid Build Coastguard Worker    push        rsi
547*dfc6aa5cSAndroid Build Coastguard Worker
548*dfc6aa5cSAndroid Build Coastguard Worker    mov         rsip, JSAMPROW [rsi]    ; inptr  = current input row
549*dfc6aa5cSAndroid Build Coastguard Worker    mov         rdip, JSAMPROW [rdi]    ; outptr = current output row
550*dfc6aa5cSAndroid Build Coastguard Worker    mov         rax, rdx                ; colctr = rounded output width
551*dfc6aa5cSAndroid Build Coastguard Worker.columnloop:
    ; Invariant: rax is a non-zero multiple of SIZEOF_YMMWORD here (rdx is
    ; masked to such a multiple, and the loop exits as soon as rax reaches 0).
552*dfc6aa5cSAndroid Build Coastguard Worker
553*dfc6aa5cSAndroid Build Coastguard Worker    cmp         rax, byte SIZEOF_YMMWORD
554*dfc6aa5cSAndroid Build Coastguard Worker    ja          near .above_16          ; unsigned: more than SIZEOF_YMMWORD columns left -> YMM step
555*dfc6aa5cSAndroid Build Coastguard Worker
    ; Final step of the row (rax == SIZEOF_YMMWORD): 16 input bytes are
    ; expanded into two XMM stores.  The load offset evaluates to 0 whichever
    ; size constant is used in the expression.
556*dfc6aa5cSAndroid Build Coastguard Worker    vmovdqu     xmm0, XMMWORD [rsi+0*SIZEOF_YMMWORD]
557*dfc6aa5cSAndroid Build Coastguard Worker    vpunpckhbw  xmm1, xmm0, xmm0        ; xmm1 = input bytes 8..15, each duplicated
558*dfc6aa5cSAndroid Build Coastguard Worker    vpunpcklbw  xmm0, xmm0, xmm0        ; xmm0 = input bytes 0..7,  each duplicated
559*dfc6aa5cSAndroid Build Coastguard Worker
560*dfc6aa5cSAndroid Build Coastguard Worker    vmovdqu     XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm0
561*dfc6aa5cSAndroid Build Coastguard Worker    vmovdqu     XMMWORD [rdi+1*SIZEOF_XMMWORD], xmm1
562*dfc6aa5cSAndroid Build Coastguard Worker
563*dfc6aa5cSAndroid Build Coastguard Worker    jmp         short .nextrow          ; row finished
564*dfc6aa5cSAndroid Build Coastguard Worker
565*dfc6aa5cSAndroid Build Coastguard Worker.above_16:
    ; Full step: 32 input bytes are expanded into two YMM stores.
566*dfc6aa5cSAndroid Build Coastguard Worker    vmovdqu     ymm0, YMMWORD [rsi+0*SIZEOF_YMMWORD]
567*dfc6aa5cSAndroid Build Coastguard Worker
    ; The unpack instructions work independently on each 128-bit lane, so the
    ; quadwords are first reordered to (q0 q2 q1 q3); the low unpack then
    ; yields input bytes 0..15 and the high unpack bytes 16..31, in order.
568*dfc6aa5cSAndroid Build Coastguard Worker    vpermq      ymm0, ymm0, 0xd8
569*dfc6aa5cSAndroid Build Coastguard Worker    vpunpckhbw  ymm1, ymm0, ymm0        ; ymm1 = input bytes 16..31, each duplicated
570*dfc6aa5cSAndroid Build Coastguard Worker    vpunpcklbw  ymm0, ymm0, ymm0        ; ymm0 = input bytes 0..15,  each duplicated
571*dfc6aa5cSAndroid Build Coastguard Worker
572*dfc6aa5cSAndroid Build Coastguard Worker    vmovdqu     YMMWORD [rdi+0*SIZEOF_YMMWORD], ymm0
573*dfc6aa5cSAndroid Build Coastguard Worker    vmovdqu     YMMWORD [rdi+1*SIZEOF_YMMWORD], ymm1
574*dfc6aa5cSAndroid Build Coastguard Worker
575*dfc6aa5cSAndroid Build Coastguard Worker    sub         rax, byte 2*SIZEOF_YMMWORD  ; two YMMWORDs of output columns produced
576*dfc6aa5cSAndroid Build Coastguard Worker    jz          short .nextrow              ; flags come from the sub: row finished
577*dfc6aa5cSAndroid Build Coastguard Worker
578*dfc6aa5cSAndroid Build Coastguard Worker    add         rsi, byte SIZEOF_YMMWORD    ; inptr
579*dfc6aa5cSAndroid Build Coastguard Worker    add         rdi, byte 2*SIZEOF_YMMWORD  ; outptr
580*dfc6aa5cSAndroid Build Coastguard Worker    jmp         short .columnloop
581*dfc6aa5cSAndroid Build Coastguard Worker
582*dfc6aa5cSAndroid Build Coastguard Worker.nextrow:
583*dfc6aa5cSAndroid Build Coastguard Worker    pop         rsi                     ; restore the row-array cursors saved in .rowloop
584*dfc6aa5cSAndroid Build Coastguard Worker    pop         rdi
585*dfc6aa5cSAndroid Build Coastguard Worker
586*dfc6aa5cSAndroid Build Coastguard Worker    add         rsi, byte SIZEOF_JSAMPROW  ; input_data:  next row pointer
587*dfc6aa5cSAndroid Build Coastguard Worker    add         rdi, byte SIZEOF_JSAMPROW  ; output_data: next row pointer
588*dfc6aa5cSAndroid Build Coastguard Worker    dec         rcx                        ; rowctr
589*dfc6aa5cSAndroid Build Coastguard Worker    jg          short .rowloop             ; signed: continue while rowctr > 0
590*dfc6aa5cSAndroid Build Coastguard Worker
591*dfc6aa5cSAndroid Build Coastguard Worker.return:
592*dfc6aa5cSAndroid Build Coastguard Worker    vzeroupper                          ; clear upper YMM halves before returning to the caller
593*dfc6aa5cSAndroid Build Coastguard Worker    uncollect_args 4                    ; counterpart of collect_args 4 above
594*dfc6aa5cSAndroid Build Coastguard Worker    pop         rbp
595*dfc6aa5cSAndroid Build Coastguard Worker    ret
596*dfc6aa5cSAndroid Build Coastguard Worker
597*dfc6aa5cSAndroid Build Coastguard Worker; --------------------------------------------------------------------------
598*dfc6aa5cSAndroid Build Coastguard Worker;
599*dfc6aa5cSAndroid Build Coastguard Worker; Fast processing for the common case of 2:1 horizontal and 2:1 vertical.
600*dfc6aa5cSAndroid Build Coastguard Worker; It's still a box filter: each input sample is emitted twice, into two rows.
601*dfc6aa5cSAndroid Build Coastguard Worker;
602*dfc6aa5cSAndroid Build Coastguard Worker; GLOBAL(void)
603*dfc6aa5cSAndroid Build Coastguard Worker; jsimd_h2v2_upsample_avx2(int max_v_samp_factor, JDIMENSION output_width,
604*dfc6aa5cSAndroid Build Coastguard Worker;                          JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr);
605*dfc6aa5cSAndroid Build Coastguard Worker;
; What the code below does:
;   - Each loop iteration consumes ONE input row pointer and TWO consecutive
;     output row pointers; the row counter is decreased by 2 per iteration.
;   - Every input byte is written twice in succession, and the identical
;     expanded vectors are stored to both output rows.
;   - The column counter is output_width rounded UP to a multiple of
;     SIZEOF_YMMWORD, and all loads/stores are whole vectors.  Output rows
;     are therefore written up to that rounded width, which can exceed
;     output_width.  NOTE(review): presumably the row buffers are padded
;     accordingly by the caller - confirm.
;   - Nothing is done when the rounded width or the row count is zero.
;
; Register roles inside the function:
;   rdx = output width rounded up to a multiple of SIZEOF_YMMWORD
;   rcx = output rows still to process
;   rsi = cursor into input_data  (row loop) / input sample pointer (column loop)
;   rdi = cursor into output_data (row loop) / second output row pointer (column loop)
;   rbx = first output row pointer (column loop); saved/restored around the body
;   rax = output columns still to produce in the current row pair
;   xmm0/xmm1 (ymm0/ymm1) = scratch vectors
;
; collect_args, uncollect_args, rdip/rsip/rbxp, EXTN and GLOBAL_FUNCTION are
; macros defined outside this part of the file (the file includes jsimdext.inc).
;
606*dfc6aa5cSAndroid Build Coastguard Worker
607*dfc6aa5cSAndroid Build Coastguard Worker; r10 = int max_v_samp_factor
608*dfc6aa5cSAndroid Build Coastguard Worker; r11d = JDIMENSION output_width
609*dfc6aa5cSAndroid Build Coastguard Worker; r12 = JSAMPARRAY input_data
610*dfc6aa5cSAndroid Build Coastguard Worker; r13 = JSAMPARRAY *output_data_ptr
611*dfc6aa5cSAndroid Build Coastguard Worker
612*dfc6aa5cSAndroid Build Coastguard Worker    align       32
613*dfc6aa5cSAndroid Build Coastguard Worker    GLOBAL_FUNCTION(jsimd_h2v2_upsample_avx2)
614*dfc6aa5cSAndroid Build Coastguard Worker
615*dfc6aa5cSAndroid Build Coastguard WorkerEXTN(jsimd_h2v2_upsample_avx2):
616*dfc6aa5cSAndroid Build Coastguard Worker    push        rbp
    ; rax receives the post-push stack pointer; it is not read again by the
    ; visible code before being overwritten in .rowloop.
    ; NOTE(review): presumably consumed by collect_args - confirm in jsimdext.inc.
617*dfc6aa5cSAndroid Build Coastguard Worker    mov         rax, rsp
618*dfc6aa5cSAndroid Build Coastguard Worker    mov         rbp, rsp
619*dfc6aa5cSAndroid Build Coastguard Worker    collect_args 4                      ; per the list above: arguments end up in r10-r13
620*dfc6aa5cSAndroid Build Coastguard Worker    push        rbx                     ; rbx is used as outptr0 below; popped at .return on every path
621*dfc6aa5cSAndroid Build Coastguard Worker
622*dfc6aa5cSAndroid Build Coastguard Worker    mov         edx, r11d               ; 32-bit move: upper half of rdx is zeroed
623*dfc6aa5cSAndroid Build Coastguard Worker    add         rdx, byte (SIZEOF_YMMWORD-1)
624*dfc6aa5cSAndroid Build Coastguard Worker    and         rdx, -SIZEOF_YMMWORD    ; rdx = output_width rounded up to a multiple of SIZEOF_YMMWORD
625*dfc6aa5cSAndroid Build Coastguard Worker    jz          near .return            ; rounded width is zero: nothing to do
626*dfc6aa5cSAndroid Build Coastguard Worker
    ; NOTE(review): this copies all 64 bits of r10 although the argument is an
    ; int; the counter relies on the upper 32 bits of r10 being well defined.
    ; Confirm against collect_args / the calling convention.
627*dfc6aa5cSAndroid Build Coastguard Worker    mov         rcx, r10                ; rowctr
628*dfc6aa5cSAndroid Build Coastguard Worker    test        rcx, rcx
629*dfc6aa5cSAndroid Build Coastguard Worker    jz          near .return            ; zero rows: nothing to do
630*dfc6aa5cSAndroid Build Coastguard Worker
631*dfc6aa5cSAndroid Build Coastguard Worker    mov         rsi, r12                ; input_data
632*dfc6aa5cSAndroid Build Coastguard Worker    mov         rdi, r13                ; rdi = output_data_ptr
633*dfc6aa5cSAndroid Build Coastguard Worker    mov         rdip, JSAMPARRAY [rdi]  ; output_data (dereference of output_data_ptr)
634*dfc6aa5cSAndroid Build Coastguard Worker.rowloop:
    ; Save the row-array cursors; rsi/rdi are reused as sample pointers below
    ; and restored (in reverse order) at .nextrow.
635*dfc6aa5cSAndroid Build Coastguard Worker    push        rdi
636*dfc6aa5cSAndroid Build Coastguard Worker    push        rsi
637*dfc6aa5cSAndroid Build Coastguard Worker
    ; outptr0 must be fetched before rdi is overwritten with outptr1.
638*dfc6aa5cSAndroid Build Coastguard Worker    mov         rsip, JSAMPROW [rsi]                   ; inptr
639*dfc6aa5cSAndroid Build Coastguard Worker    mov         rbxp, JSAMPROW [rdi+0*SIZEOF_JSAMPROW] ; outptr0
640*dfc6aa5cSAndroid Build Coastguard Worker    mov         rdip, JSAMPROW [rdi+1*SIZEOF_JSAMPROW] ; outptr1
641*dfc6aa5cSAndroid Build Coastguard Worker    mov         rax, rdx                               ; colctr = rounded output width
642*dfc6aa5cSAndroid Build Coastguard Worker.columnloop:
    ; Invariant: rax is a non-zero multiple of SIZEOF_YMMWORD here (rdx is
    ; masked to such a multiple, and the loop exits as soon as rax reaches 0).
643*dfc6aa5cSAndroid Build Coastguard Worker
644*dfc6aa5cSAndroid Build Coastguard Worker    cmp         rax, byte SIZEOF_YMMWORD
645*dfc6aa5cSAndroid Build Coastguard Worker    ja          short .above_16         ; unsigned: more than SIZEOF_YMMWORD columns left -> YMM step
646*dfc6aa5cSAndroid Build Coastguard Worker
    ; Final step of the row pair (rax == SIZEOF_YMMWORD): 16 input bytes are
    ; expanded into two XMM vectors, stored to both output rows.
647*dfc6aa5cSAndroid Build Coastguard Worker    vmovdqu     xmm0, XMMWORD [rsi+0*SIZEOF_XMMWORD]
648*dfc6aa5cSAndroid Build Coastguard Worker    vpunpckhbw  xmm1, xmm0, xmm0        ; xmm1 = input bytes 8..15, each duplicated
649*dfc6aa5cSAndroid Build Coastguard Worker    vpunpcklbw  xmm0, xmm0, xmm0        ; xmm0 = input bytes 0..7,  each duplicated
650*dfc6aa5cSAndroid Build Coastguard Worker
651*dfc6aa5cSAndroid Build Coastguard Worker    vmovdqu     XMMWORD [rbx+0*SIZEOF_XMMWORD], xmm0  ; outptr0
652*dfc6aa5cSAndroid Build Coastguard Worker    vmovdqu     XMMWORD [rbx+1*SIZEOF_XMMWORD], xmm1
653*dfc6aa5cSAndroid Build Coastguard Worker    vmovdqu     XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm0  ; outptr1 gets the same data
654*dfc6aa5cSAndroid Build Coastguard Worker    vmovdqu     XMMWORD [rdi+1*SIZEOF_XMMWORD], xmm1
655*dfc6aa5cSAndroid Build Coastguard Worker
656*dfc6aa5cSAndroid Build Coastguard Worker    jmp         near .nextrow           ; row pair finished
657*dfc6aa5cSAndroid Build Coastguard Worker
658*dfc6aa5cSAndroid Build Coastguard Worker.above_16:
    ; Full step: 32 input bytes are expanded into two YMM vectors, stored to
    ; both output rows.
659*dfc6aa5cSAndroid Build Coastguard Worker    vmovdqu     ymm0, YMMWORD [rsi+0*SIZEOF_YMMWORD]
660*dfc6aa5cSAndroid Build Coastguard Worker
    ; The unpack instructions work independently on each 128-bit lane, so the
    ; quadwords are first reordered to (q0 q2 q1 q3); the low unpack then
    ; yields input bytes 0..15 and the high unpack bytes 16..31, in order.
661*dfc6aa5cSAndroid Build Coastguard Worker    vpermq      ymm0, ymm0, 0xd8
662*dfc6aa5cSAndroid Build Coastguard Worker    vpunpckhbw  ymm1, ymm0, ymm0        ; ymm1 = input bytes 16..31, each duplicated
663*dfc6aa5cSAndroid Build Coastguard Worker    vpunpcklbw  ymm0, ymm0, ymm0        ; ymm0 = input bytes 0..15,  each duplicated
664*dfc6aa5cSAndroid Build Coastguard Worker
665*dfc6aa5cSAndroid Build Coastguard Worker    vmovdqu     YMMWORD [rbx+0*SIZEOF_YMMWORD], ymm0  ; outptr0
666*dfc6aa5cSAndroid Build Coastguard Worker    vmovdqu     YMMWORD [rbx+1*SIZEOF_YMMWORD], ymm1
667*dfc6aa5cSAndroid Build Coastguard Worker    vmovdqu     YMMWORD [rdi+0*SIZEOF_YMMWORD], ymm0  ; outptr1 gets the same data
668*dfc6aa5cSAndroid Build Coastguard Worker    vmovdqu     YMMWORD [rdi+1*SIZEOF_YMMWORD], ymm1
669*dfc6aa5cSAndroid Build Coastguard Worker
670*dfc6aa5cSAndroid Build Coastguard Worker    sub         rax, byte 2*SIZEOF_YMMWORD  ; two YMMWORDs of output columns produced
671*dfc6aa5cSAndroid Build Coastguard Worker    jz          short .nextrow              ; flags come from the sub: row pair finished
672*dfc6aa5cSAndroid Build Coastguard Worker
673*dfc6aa5cSAndroid Build Coastguard Worker    add         rsi, byte SIZEOF_YMMWORD  ; inptr
674*dfc6aa5cSAndroid Build Coastguard Worker    add         rbx, 2*SIZEOF_YMMWORD     ; outptr0
675*dfc6aa5cSAndroid Build Coastguard Worker    add         rdi, 2*SIZEOF_YMMWORD     ; outptr1
676*dfc6aa5cSAndroid Build Coastguard Worker    jmp         short .columnloop
677*dfc6aa5cSAndroid Build Coastguard Worker
678*dfc6aa5cSAndroid Build Coastguard Worker.nextrow:
679*dfc6aa5cSAndroid Build Coastguard Worker    pop         rsi                     ; restore the row-array cursors saved in .rowloop
680*dfc6aa5cSAndroid Build Coastguard Worker    pop         rdi
681*dfc6aa5cSAndroid Build Coastguard Worker
682*dfc6aa5cSAndroid Build Coastguard Worker    add         rsi, byte 1*SIZEOF_JSAMPROW  ; input_data:  one row consumed
683*dfc6aa5cSAndroid Build Coastguard Worker    add         rdi, byte 2*SIZEOF_JSAMPROW  ; output_data: two rows produced
684*dfc6aa5cSAndroid Build Coastguard Worker    sub         rcx, byte 2                  ; rowctr
685*dfc6aa5cSAndroid Build Coastguard Worker    jg          near .rowloop                ; signed: continue while rowctr > 0
686*dfc6aa5cSAndroid Build Coastguard Worker
687*dfc6aa5cSAndroid Build Coastguard Worker.return:
688*dfc6aa5cSAndroid Build Coastguard Worker    pop         rbx                     ; matches the push rbx after collect_args
689*dfc6aa5cSAndroid Build Coastguard Worker    vzeroupper                          ; clear upper YMM halves before returning to the caller
690*dfc6aa5cSAndroid Build Coastguard Worker    uncollect_args 4                    ; counterpart of collect_args 4 above
691*dfc6aa5cSAndroid Build Coastguard Worker    pop         rbp
692*dfc6aa5cSAndroid Build Coastguard Worker    ret
693*dfc6aa5cSAndroid Build Coastguard Worker
694*dfc6aa5cSAndroid Build Coastguard Worker; For some reason, the OS X linker does not honor the request to align the
695*dfc6aa5cSAndroid Build Coastguard Worker; segment unless we do this.
696*dfc6aa5cSAndroid Build Coastguard Worker    align       32
697