; xref: /aosp_15_r20/external/libjpeg-turbo/simd/x86_64/jcsample-sse2.asm (revision dfc6aa5c1cfd4bc4e2018dc74aa96e29ee49c6da)
;
; jcsample.asm - downsampling (64-bit SSE2)
;
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
; Copyright (C) 2009, 2016, D. R. Commander.
; Copyright (C) 2018, Matthias Räncker.
;
; Based on the x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru.
; For conditions of distribution and use, see copyright notice in jsimdext.inc
;
; This file should be assembled with NASM (Netwide Assembler),
; can *not* be assembled with Microsoft's MASM or any compatible
; assembler (including Borland's Turbo Assembler).
; NASM is available from http://nasm.sourceforge.net/ or
; http://sourceforge.net/project/showfiles.php?group_id=6208

; None of the macros and constants used in this file (SEG_TEXT, EXTN,
; GLOBAL_FUNCTION, collect_args/uncollect_args, rsip/rdip/rdxp, JSAMPROW,
; JSAMPLE, SIZEOF_*, BYTE_BIT) are defined here; they are expected to come
; from jsimdext.inc.
%include "jsimdext.inc"

; --------------------------------------------------------------------------
    SECTION     SEG_TEXT
    BITS        64
;
; Downsample pixel values of a single component.
; This version handles the common case of 2:1 horizontal and 1:1 vertical,
; without smoothing.
;
; GLOBAL(void)
; jsimd_h2v1_downsample_sse2(JDIMENSION image_width, int max_v_samp_factor,
;                            JDIMENSION v_samp_factor,
;                            JDIMENSION width_in_blocks, JSAMPARRAY input_data,
;                            JSAMPARRAY output_data);
;
; Phase 1 (expand_right_edge): when 2 * output_cols exceeds image_width, the
; last sample of each of the first max_v_samp_factor input rows is replicated
; to the right until the row holds 2 * output_cols samples.
;
; Phase 2 (h2v1_downsample): for each of v_samp_factor rows, every pair of
; horizontally adjacent input samples is summed, a bias alternating 0, 1 is
; added, and the sum is halved to produce one output sample.
;
; Rows are read and written with movdqa, one XMMWORD at a time, and a partial
; final group is still stored as a full XMMWORD.
; NOTE(review): this presumably relies on the caller providing XMMWORD-aligned
; row buffers with padding past output_cols -- confirm against the allocator.
;

; r10d = JDIMENSION image_width
; r11d = int max_v_samp_factor
; r12d = JDIMENSION v_samp_factor
; r13d = JDIMENSION width_in_blocks
; r14 = JSAMPARRAY input_data
; r15 = JSAMPARRAY output_data

    align       32
    GLOBAL_FUNCTION(jsimd_h2v1_downsample_sse2)

EXTN(jsimd_h2v1_downsample_sse2):
    push        rbp
    mov         rax, rsp                ; not read in this file; presumably
                                        ; consumed by collect_args
                                        ; (jsimdext.inc) -- TODO confirm
    mov         rbp, rsp
    collect_args 6                      ; arguments -> r10..r15 (see map above)

    mov         ecx, r13d
    shl         rcx, 3                  ; imul rcx,DCTSIZE (rcx = output_cols)
    jz          near .return            ; width_in_blocks == 0: nothing to do

    mov         edx, r10d               ; rdx = image_width (zero-extended)

    ; -- expand_right_edge

    push        rcx                     ; save output_cols
    shl         rcx, 1                  ; output_cols * 2
    sub         rcx, rdx                ; rcx = samples to append per row
    jle         short .expand_end       ; rows are already wide enough

    ; max_v_samp_factor is a 32-bit int, so only the low half of r11 is
    ; defined.  Read it through r11d, as is done for the other 32-bit
    ; arguments; the write to eax also clears the upper half of rax, which
    ; the 64-bit "dec rax" below depends on.
    mov         eax, r11d               ; rax = number of rows to pad
    test        eax, eax
    jle         short .expand_end

    cld
    mov         rsi, r14                ; input_data
.expandloop:
    push        rax                     ; al is reused for the fill value
    push        rcx                     ; rep stosb consumes rcx

    mov         rdip, JSAMPROW [rsi]
    add         rdi, rdx                ; rdi -> first sample past image_width
    mov         al, JSAMPLE [rdi-1]     ; al = last real sample of the row

    rep stosb                           ; replicate it rcx times

    pop         rcx
    pop         rax

    add         rsi, byte SIZEOF_JSAMPROW
    dec         rax
    jg          short .expandloop

.expand_end:
    pop         rcx                     ; output_cols

    ; -- h2v1_downsample

    mov         eax, r12d               ; rowctr
    test        eax, eax
    jle         near .return

    mov         edx, 0x00010000         ; bias pattern (only edx is consumed)
    movd        xmm7, edx
    pcmpeqw     xmm6, xmm6
    pshufd      xmm7, xmm7, 0x00        ; xmm7={0, 1, 0, 1, 0, 1, 0, 1}
    psrlw       xmm6, BYTE_BIT          ; xmm6={0xFF 0x00 0xFF 0x00 ..}

    mov         rsi, r14                ; input_data
    mov         rdi, r15                ; output_data
.rowloop:
    ; The column loop consumes rcx and replaces rsi/rdi with row pointers,
    ; so the per-row state is saved here and restored after the row.
    push        rcx
    push        rdi
    push        rsi

    mov         rsip, JSAMPROW [rsi]    ; inptr
    mov         rdip, JSAMPROW [rdi]    ; outptr

    cmp         rcx, byte SIZEOF_XMMWORD
    jae         short .columnloop

.columnloop_r8:
    ; Fewer than SIZEOF_XMMWORD output columns remain: read only one input
    ; XMMWORD, use zero for the second, and force rcx so that the
    ; subtraction below ends the row.
    movdqa      xmm0, XMMWORD [rsi+0*SIZEOF_XMMWORD]
    pxor        xmm1, xmm1
    mov         rcx, SIZEOF_XMMWORD
    jmp         short .downsample

.columnloop:
    movdqa      xmm0, XMMWORD [rsi+0*SIZEOF_XMMWORD]
    movdqa      xmm1, XMMWORD [rsi+1*SIZEOF_XMMWORD]

.downsample:
    movdqa      xmm2, xmm0
    movdqa      xmm3, xmm1

    pand        xmm0, xmm6              ; low byte of each word
    psrlw       xmm2, BYTE_BIT          ; high byte of each word
    pand        xmm1, xmm6
    psrlw       xmm3, BYTE_BIT

    paddw       xmm0, xmm2              ; sum of each horizontal pair
    paddw       xmm1, xmm3
    paddw       xmm0, xmm7              ; + alternating bias
    paddw       xmm1, xmm7
    psrlw       xmm0, 1                 ; / 2
    psrlw       xmm1, 1

    packuswb    xmm0, xmm1              ; 2 x 8 words -> 16 output samples

    movdqa      XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm0

    sub         rcx, byte SIZEOF_XMMWORD    ; outcol
    add         rsi, byte 2*SIZEOF_XMMWORD  ; inptr
    add         rdi, byte 1*SIZEOF_XMMWORD  ; outptr
    cmp         rcx, byte SIZEOF_XMMWORD
    jae         short .columnloop
    test        rcx, rcx
    jnz         short .columnloop_r8

    pop         rsi
    pop         rdi
    pop         rcx

    add         rsi, byte SIZEOF_JSAMPROW  ; input_data
    add         rdi, byte SIZEOF_JSAMPROW  ; output_data
    dec         rax                        ; rowctr
    jg          near .rowloop

.return:
    uncollect_args 6
    pop         rbp
    ret

; --------------------------------------------------------------------------
;
; Downsample pixel values of a single component.
; This version handles the standard case of 2:1 horizontal and 2:1 vertical,
; without smoothing.
;
; GLOBAL(void)
; jsimd_h2v2_downsample_sse2(JDIMENSION image_width, int max_v_samp_factor,
;                            JDIMENSION v_samp_factor,
;                            JDIMENSION width_in_blocks, JSAMPARRAY input_data,
;                            JSAMPARRAY output_data);
;
; Phase 1 (expand_right_edge): when 2 * output_cols exceeds image_width, the
; last sample of each of the first max_v_samp_factor input rows is replicated
; to the right until the row holds 2 * output_cols samples.
;
; Phase 2 (h2v2_downsample): for each of v_samp_factor output rows, two
; consecutive input rows are consumed; every 2x2 group of input samples is
; summed, a bias alternating 1, 2 is added, and the sum is divided by four
; to produce one output sample.
;
; Rows are read and written with movdqa, one XMMWORD at a time, and a partial
; final group is still stored as a full XMMWORD.
; NOTE(review): this presumably relies on the caller providing XMMWORD-aligned
; row buffers with padding past output_cols -- confirm against the allocator.
;

; r10d = JDIMENSION image_width
; r11d = int max_v_samp_factor
; r12d = JDIMENSION v_samp_factor
; r13d = JDIMENSION width_in_blocks
; r14 = JSAMPARRAY input_data
; r15 = JSAMPARRAY output_data

    align       32
    GLOBAL_FUNCTION(jsimd_h2v2_downsample_sse2)

EXTN(jsimd_h2v2_downsample_sse2):
    push        rbp
    mov         rax, rsp                ; not read in this file; presumably
                                        ; consumed by collect_args
                                        ; (jsimdext.inc) -- TODO confirm
    mov         rbp, rsp
    collect_args 6                      ; arguments -> r10..r15 (see map above)

    mov         ecx, r13d
    shl         rcx, 3                  ; imul rcx,DCTSIZE (rcx = output_cols)
    jz          near .return            ; width_in_blocks == 0: nothing to do

    mov         edx, r10d               ; rdx = image_width (zero-extended)

    ; -- expand_right_edge

    push        rcx                     ; save output_cols
    shl         rcx, 1                  ; output_cols * 2
    sub         rcx, rdx                ; rcx = samples to append per row
    jle         short .expand_end       ; rows are already wide enough

    ; max_v_samp_factor is a 32-bit int, so only the low half of r11 is
    ; defined.  Read it through r11d, as is done for the other 32-bit
    ; arguments; the write to eax also clears the upper half of rax, which
    ; the 64-bit "dec rax" below depends on.
    mov         eax, r11d               ; rax = number of rows to pad
    test        eax, eax
    jle         short .expand_end

    cld
    mov         rsi, r14                ; input_data
.expandloop:
    push        rax                     ; al is reused for the fill value
    push        rcx                     ; rep stosb consumes rcx

    mov         rdip, JSAMPROW [rsi]
    add         rdi, rdx                ; rdi -> first sample past image_width
    mov         al, JSAMPLE [rdi-1]     ; al = last real sample of the row

    rep stosb                           ; replicate it rcx times

    pop         rcx
    pop         rax

    add         rsi, byte SIZEOF_JSAMPROW
    dec         rax
    jg          short .expandloop

.expand_end:
    pop         rcx                     ; output_cols

    ; -- h2v2_downsample

    mov         eax, r12d               ; rowctr
    test        rax, rax
    jle         near .return

    mov         edx, 0x00020001         ; bias pattern (only edx is consumed)
    movd        xmm7, edx
    pcmpeqw     xmm6, xmm6
    pshufd      xmm7, xmm7, 0x00        ; xmm7={1, 2, 1, 2, 1, 2, 1, 2}
    psrlw       xmm6, BYTE_BIT          ; xmm6={0xFF 0x00 0xFF 0x00 ..}

    mov         rsi, r14                ; input_data
    mov         rdi, r15                ; output_data
.rowloop:
    ; The column loop consumes rcx and replaces rsi/rdi with row pointers,
    ; so the per-row state is saved here and restored after the row.
    push        rcx
    push        rdi
    push        rsi

    mov         rdxp, JSAMPROW [rsi+0*SIZEOF_JSAMPROW]  ; inptr0
    mov         rsip, JSAMPROW [rsi+1*SIZEOF_JSAMPROW]  ; inptr1
    mov         rdip, JSAMPROW [rdi]                    ; outptr

    cmp         rcx, byte SIZEOF_XMMWORD
    jae         short .columnloop

.columnloop_r8:
    ; Fewer than SIZEOF_XMMWORD output columns remain: read only one
    ; XMMWORD from each input row, use zero for the second pair, and force
    ; rcx so that the subtraction below ends the row.
    movdqa      xmm0, XMMWORD [rdx+0*SIZEOF_XMMWORD]
    movdqa      xmm1, XMMWORD [rsi+0*SIZEOF_XMMWORD]
    pxor        xmm2, xmm2
    pxor        xmm3, xmm3
    mov         rcx, SIZEOF_XMMWORD
    jmp         short .downsample

.columnloop:
    movdqa      xmm0, XMMWORD [rdx+0*SIZEOF_XMMWORD]
    movdqa      xmm1, XMMWORD [rsi+0*SIZEOF_XMMWORD]
    movdqa      xmm2, XMMWORD [rdx+1*SIZEOF_XMMWORD]
    movdqa      xmm3, XMMWORD [rsi+1*SIZEOF_XMMWORD]

.downsample:
    ; First XMMWORD of each row: horizontal pair sums.
    movdqa      xmm4, xmm0
    movdqa      xmm5, xmm1
    pand        xmm0, xmm6              ; low byte of each word
    psrlw       xmm4, BYTE_BIT          ; high byte of each word
    pand        xmm1, xmm6
    psrlw       xmm5, BYTE_BIT
    paddw       xmm0, xmm4
    paddw       xmm1, xmm5

    ; Second XMMWORD of each row: horizontal pair sums.
    movdqa      xmm4, xmm2
    movdqa      xmm5, xmm3
    pand        xmm2, xmm6
    psrlw       xmm4, BYTE_BIT
    pand        xmm3, xmm6
    psrlw       xmm5, BYTE_BIT
    paddw       xmm2, xmm4
    paddw       xmm3, xmm5

    paddw       xmm0, xmm1              ; row 0 + row 1 = 2x2 sums
    paddw       xmm2, xmm3
    paddw       xmm0, xmm7              ; + alternating bias
    paddw       xmm2, xmm7
    psrlw       xmm0, 2                 ; / 4
    psrlw       xmm2, 2

    packuswb    xmm0, xmm2              ; 2 x 8 words -> 16 output samples

    movdqa      XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm0

    sub         rcx, byte SIZEOF_XMMWORD    ; outcol
    add         rdx, byte 2*SIZEOF_XMMWORD  ; inptr0
    add         rsi, byte 2*SIZEOF_XMMWORD  ; inptr1
    add         rdi, byte 1*SIZEOF_XMMWORD  ; outptr
    cmp         rcx, byte SIZEOF_XMMWORD
    jae         near .columnloop
    test        rcx, rcx
    jnz         near .columnloop_r8

    pop         rsi
    pop         rdi
    pop         rcx

    add         rsi, byte 2*SIZEOF_JSAMPROW  ; input_data
    add         rdi, byte 1*SIZEOF_JSAMPROW  ; output_data
    dec         rax                          ; rowctr
    jg          near .rowloop

.return:
    uncollect_args 6
    pop         rbp
    ret

; For some reason, the OS X linker does not honor the request to align the
; segment unless we do this.
    align       32