xref: /aosp_15_r20/external/libjpeg-turbo/simd/x86_64/jccolext-sse2.asm (revision dfc6aa5c1cfd4bc4e2018dc74aa96e29ee49c6da)
;
; jccolext.asm - colorspace conversion (64-bit SSE2)
;
; Copyright (C) 2009, 2016, D. R. Commander.
; Copyright (C) 2018, Matthias Räncker.
;
; Based on the x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru.
; For conditions of distribution and use, see copyright notice in jsimdext.inc
;
; This file should be assembled with NASM (Netwide Assembler),
; can *not* be assembled with Microsoft's MASM or any compatible
; assembler (including Borland's Turbo Assembler).
; NASM is available from http://nasm.sourceforge.net/ or
; http://sourceforge.net/project/showfiles.php?group_id=6208

%include "jcolsamp.inc"

; --------------------------------------------------------------------------
;
; Convert some rows of samples to the output colorspace.
;
; GLOBAL(void)
; jsimd_rgb_ycc_convert_sse2(JDIMENSION img_width, JSAMPARRAY input_buf,
;                            JSAMPIMAGE output_buf, JDIMENSION output_row,
;                            int num_rows);
;
; For each of num_rows rows, one row of interleaved RGB_PIXELSIZE-byte pixels
; is read through input_buf and converted to separate Y, Cb and Cr rows,
; which are stored through output_buf[0], output_buf[1] and output_buf[2]
; starting at row index output_row.
;
; Notes:
; - Nothing is done when img_width == 0 or num_rows <= 0.
; - Input rows are read with unaligned loads (movdqu).  The final partial
;   group of a row is assembled from progressively larger pieces selected by
;   the bits of the remaining size (.column_ld1 ... .column_ld32).  Lanes
;   beyond the remaining width hold leftover register contents.
; - Every iteration stores one full XMMWORD to each output row with movdqa,
;   including the final partial group.
;   NOTE(review): this assumes the output rows are XMMWORD-aligned and padded
;   to a multiple of SIZEOF_XMMWORD samples -- confirm against the caller.
; - xmmA..xmmH are macro names from the included headers; the comment after
;   the %endif below gives the register roles the arithmetic relies on.
;

; r10d = JDIMENSION img_width
; r11 = JSAMPARRAY input_buf
; r12 = JSAMPIMAGE output_buf
; r13d = JDIMENSION output_row
; r14d = int num_rows

%define wk(i)   rbp - (WK_NUM - (i)) * SIZEOF_XMMWORD  ; xmmword wk[WK_NUM]
%define WK_NUM  8

    align       32
    GLOBAL_FUNCTION(jsimd_rgb_ycc_convert_sse2)

EXTN(jsimd_rgb_ycc_convert_sse2):
    ; Build an XMMWORD-aligned frame: rbp points at an aligned slot holding
    ; the pre-alignment rsp, and wk(0)..wk(WK_NUM-1) sit directly below rbp.
    push        rbp
    mov         rax, rsp                     ; rax = rsp after the push (-> saved rbp)
    sub         rsp, byte 4
    and         rsp, byte (-SIZEOF_XMMWORD)  ; align to 128 bits
    mov         [rsp], rax                   ; remember the unaligned rsp
    mov         rbp, rsp                     ; rbp = aligned rbp
    lea         rsp, [wk(0)]                 ; reserve the wk[] scratch area
    collect_args 5
    push        rbx

    mov         ecx, r10d               ; rcx = img_width (zero-extended)
    test        rcx, rcx
    jz          near .return            ; nothing to do for an empty row

    push        rcx                     ; rcx is borrowed for output_row below

    ; rdi/rbx/rdx = &output_buf[0..2][output_row]
    mov         rsi, r12
    mov         ecx, r13d
    mov         rdip, JSAMPARRAY [rsi+0*SIZEOF_JSAMPARRAY]
    mov         rbxp, JSAMPARRAY [rsi+1*SIZEOF_JSAMPARRAY]
    mov         rdxp, JSAMPARRAY [rsi+2*SIZEOF_JSAMPARRAY]
    lea         rdi, [rdi+rcx*SIZEOF_JSAMPROW]
    lea         rbx, [rbx+rcx*SIZEOF_JSAMPROW]
    lea         rdx, [rdx+rcx*SIZEOF_JSAMPROW]

    pop         rcx                     ; rcx = img_width again

    mov         rsi, r11
    mov         eax, r14d
    ; num_rows is a signed int.  The load above zero-extends it into rax, so
    ; the sign must be tested on eax; a 64-bit test could never observe a
    ; negative row count and would start a near-endless loop instead.
    test        eax, eax
    jle         near .return
.rowloop:
    ; Save the row-array cursors and the width; the registers are reused as
    ; per-row sample pointers and as the running column count.
    push        rdx
    push        rbx
    push        rdi
    push        rsi
    push        rcx                     ; col

    mov         rsip, JSAMPROW [rsi]    ; inptr
    mov         rdip, JSAMPROW [rdi]    ; outptr0
    mov         rbxp, JSAMPROW [rbx]    ; outptr1
    mov         rdxp, JSAMPROW [rdx]    ; outptr2

    cmp         rcx, byte SIZEOF_XMMWORD
    jae         near .columnloop

%if RGB_PIXELSIZE == 3  ; ---------------

    ; Partial group: rcx becomes the number of remaining input bytes and is
    ; consumed bit by bit, highest addresses first.  rax and rdx are scratch
    ; here, so the row counter and outptr2 are saved around their use.
.column_ld1:
    push        rax
    push        rdx
    lea         rcx, [rcx+rcx*2]        ; imul ecx,RGB_PIXELSIZE
    test        cl, SIZEOF_BYTE
    jz          short .column_ld2
    sub         rcx, byte SIZEOF_BYTE
    movzx       rax, byte [rsi+rcx]
.column_ld2:
    test        cl, SIZEOF_WORD
    jz          short .column_ld4
    sub         rcx, byte SIZEOF_WORD
    movzx       rdx, word [rsi+rcx]
    shl         rax, WORD_BIT
    or          rax, rdx
.column_ld4:
    movd        xmmA, eax
    pop         rdx
    pop         rax
    test        cl, SIZEOF_DWORD
    jz          short .column_ld8
    sub         rcx, byte SIZEOF_DWORD
    movd        xmmF, XMM_DWORD [rsi+rcx]
    pslldq      xmmA, SIZEOF_DWORD
    por         xmmA, xmmF
.column_ld8:
    test        cl, SIZEOF_MMWORD
    jz          short .column_ld16
    sub         rcx, byte SIZEOF_MMWORD
    movq        xmmB, XMM_MMWORD [rsi+rcx]
    pslldq      xmmA, SIZEOF_MMWORD
    por         xmmA, xmmB
.column_ld16:
    test        cl, SIZEOF_XMMWORD
    jz          short .column_ld32
    movdqa      xmmF, xmmA
    movdqu      xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD]
    mov         rcx, SIZEOF_XMMWORD     ; treat the tail as one full group
    jmp         short .rgb_ycc_cnv
.column_ld32:
    test        cl, 2*SIZEOF_XMMWORD
    mov         rcx, SIZEOF_XMMWORD     ; mov leaves the flags from test intact
    jz          short .rgb_ycc_cnv
    movdqa      xmmB, xmmA
    movdqu      xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD]
    movdqu      xmmF, XMMWORD [rsi+1*SIZEOF_XMMWORD]
    jmp         short .rgb_ycc_cnv

.columnloop:
    movdqu      xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD]
    movdqu      xmmF, XMMWORD [rsi+1*SIZEOF_XMMWORD]
    movdqu      xmmB, XMMWORD [rsi+2*SIZEOF_XMMWORD]

.rgb_ycc_cnv:
    ; De-interleave the packed 3-byte pixels into per-component words.
    ; Notation: "cp" = component c of pixel p.
    ; xmmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05)
    ; xmmF=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A)
    ; xmmB=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F)

    movdqa      xmmG, xmmA
    pslldq      xmmA, 8     ; xmmA=(-- -- -- -- -- -- -- -- 00 10 20 01 11 21 02 12)
    psrldq      xmmG, 8     ; xmmG=(22 03 13 23 04 14 24 05 -- -- -- -- -- -- -- --)

    punpckhbw   xmmA, xmmF  ; xmmA=(00 08 10 18 20 28 01 09 11 19 21 29 02 0A 12 1A)
    pslldq      xmmF, 8     ; xmmF=(-- -- -- -- -- -- -- -- 15 25 06 16 26 07 17 27)

    punpcklbw   xmmG, xmmB  ; xmmG=(22 2A 03 0B 13 1B 23 2B 04 0C 14 1C 24 2C 05 0D)
    punpckhbw   xmmF, xmmB  ; xmmF=(15 1D 25 2D 06 0E 16 1E 26 2E 07 0F 17 1F 27 2F)

    movdqa      xmmD, xmmA
    pslldq      xmmA, 8     ; xmmA=(-- -- -- -- -- -- -- -- 00 08 10 18 20 28 01 09)
    psrldq      xmmD, 8     ; xmmD=(11 19 21 29 02 0A 12 1A -- -- -- -- -- -- -- --)

    punpckhbw   xmmA, xmmG  ; xmmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 01 05 09 0D)
    pslldq      xmmG, 8     ; xmmG=(-- -- -- -- -- -- -- -- 22 2A 03 0B 13 1B 23 2B)

    punpcklbw   xmmD, xmmF  ; xmmD=(11 15 19 1D 21 25 29 2D 02 06 0A 0E 12 16 1A 1E)
    punpckhbw   xmmG, xmmF  ; xmmG=(22 26 2A 2E 03 07 0B 0F 13 17 1B 1F 23 27 2B 2F)

    movdqa      xmmE, xmmA
    pslldq      xmmA, 8     ; xmmA=(-- -- -- -- -- -- -- -- 00 04 08 0C 10 14 18 1C)
    psrldq      xmmE, 8     ; xmmE=(20 24 28 2C 01 05 09 0D -- -- -- -- -- -- -- --)

    punpckhbw   xmmA, xmmD  ; xmmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E)
    pslldq      xmmD, 8     ; xmmD=(-- -- -- -- -- -- -- -- 11 15 19 1D 21 25 29 2D)

    punpcklbw   xmmE, xmmG  ; xmmE=(20 22 24 26 28 2A 2C 2E 01 03 05 07 09 0B 0D 0F)
    punpckhbw   xmmD, xmmG  ; xmmD=(11 13 15 17 19 1B 1D 1F 21 23 25 27 29 2B 2D 2F)

    pxor        xmmH, xmmH  ; zero, used to widen bytes to words

    movdqa      xmmC, xmmA
    punpcklbw   xmmA, xmmH  ; xmmA=(00 02 04 06 08 0A 0C 0E)
    punpckhbw   xmmC, xmmH  ; xmmC=(10 12 14 16 18 1A 1C 1E)

    movdqa      xmmB, xmmE
    punpcklbw   xmmE, xmmH  ; xmmE=(20 22 24 26 28 2A 2C 2E)
    punpckhbw   xmmB, xmmH  ; xmmB=(01 03 05 07 09 0B 0D 0F)

    movdqa      xmmF, xmmD
    punpcklbw   xmmD, xmmH  ; xmmD=(11 13 15 17 19 1B 1D 1F)
    punpckhbw   xmmF, xmmH  ; xmmF=(21 23 25 27 29 2B 2D 2F)

%else  ; RGB_PIXELSIZE == 4 ; -----------

    ; Partial group: rcx is the number of remaining pixels and is consumed
    ; bit by bit, highest addresses first.
.column_ld1:
    test        cl, SIZEOF_XMMWORD/16
    jz          short .column_ld2
    sub         rcx, byte SIZEOF_XMMWORD/16
    movd        xmmA, XMM_DWORD [rsi+rcx*RGB_PIXELSIZE]
.column_ld2:
    test        cl, SIZEOF_XMMWORD/8
    jz          short .column_ld4
    sub         rcx, byte SIZEOF_XMMWORD/8
    movq        xmmE, XMM_MMWORD [rsi+rcx*RGB_PIXELSIZE]
    pslldq      xmmA, SIZEOF_MMWORD
    por         xmmA, xmmE
.column_ld4:
    test        cl, SIZEOF_XMMWORD/4
    jz          short .column_ld8
    sub         rcx, byte SIZEOF_XMMWORD/4
    movdqa      xmmE, xmmA
    movdqu      xmmA, XMMWORD [rsi+rcx*RGB_PIXELSIZE]
.column_ld8:
    test        cl, SIZEOF_XMMWORD/2
    mov         rcx, SIZEOF_XMMWORD     ; mov leaves the flags from test intact
    jz          short .rgb_ycc_cnv
    movdqa      xmmF, xmmA
    movdqa      xmmH, xmmE
    movdqu      xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD]
    movdqu      xmmE, XMMWORD [rsi+1*SIZEOF_XMMWORD]
    jmp         short .rgb_ycc_cnv

.columnloop:
    movdqu      xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD]
    movdqu      xmmE, XMMWORD [rsi+1*SIZEOF_XMMWORD]
    movdqu      xmmF, XMMWORD [rsi+2*SIZEOF_XMMWORD]
    movdqu      xmmH, XMMWORD [rsi+3*SIZEOF_XMMWORD]

.rgb_ycc_cnv:
    ; De-interleave the packed 4-byte pixels into per-component words.
    ; Notation: "cp" = component c of pixel p.
    ; xmmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33)
    ; xmmE=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37)
    ; xmmF=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B)
    ; xmmH=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F)

    movdqa      xmmD, xmmA
    punpcklbw   xmmA, xmmE      ; xmmA=(00 04 10 14 20 24 30 34 01 05 11 15 21 25 31 35)
    punpckhbw   xmmD, xmmE      ; xmmD=(02 06 12 16 22 26 32 36 03 07 13 17 23 27 33 37)

    movdqa      xmmC, xmmF
    punpcklbw   xmmF, xmmH      ; xmmF=(08 0C 18 1C 28 2C 38 3C 09 0D 19 1D 29 2D 39 3D)
    punpckhbw   xmmC, xmmH      ; xmmC=(0A 0E 1A 1E 2A 2E 3A 3E 0B 0F 1B 1F 2B 2F 3B 3F)

    movdqa      xmmB, xmmA
    punpcklwd   xmmA, xmmF      ; xmmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 30 34 38 3C)
    punpckhwd   xmmB, xmmF      ; xmmB=(01 05 09 0D 11 15 19 1D 21 25 29 2D 31 35 39 3D)

    movdqa      xmmG, xmmD
    punpcklwd   xmmD, xmmC      ; xmmD=(02 06 0A 0E 12 16 1A 1E 22 26 2A 2E 32 36 3A 3E)
    punpckhwd   xmmG, xmmC      ; xmmG=(03 07 0B 0F 13 17 1B 1F 23 27 2B 2F 33 37 3B 3F)

    movdqa      xmmE, xmmA
    punpcklbw   xmmA, xmmD      ; xmmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E)
    punpckhbw   xmmE, xmmD      ; xmmE=(20 22 24 26 28 2A 2C 2E 30 32 34 36 38 3A 3C 3E)

    movdqa      xmmH, xmmB
    punpcklbw   xmmB, xmmG      ; xmmB=(01 03 05 07 09 0B 0D 0F 11 13 15 17 19 1B 1D 1F)
    punpckhbw   xmmH, xmmG      ; xmmH=(21 23 25 27 29 2B 2D 2F 31 33 35 37 39 3B 3D 3F)

    pxor        xmmF, xmmF      ; zero, used to widen bytes to words

    movdqa      xmmC, xmmA
    punpcklbw   xmmA, xmmF      ; xmmA=(00 02 04 06 08 0A 0C 0E)
    punpckhbw   xmmC, xmmF      ; xmmC=(10 12 14 16 18 1A 1C 1E)

    movdqa      xmmD, xmmB
    punpcklbw   xmmB, xmmF      ; xmmB=(01 03 05 07 09 0B 0D 0F)
    punpckhbw   xmmD, xmmF      ; xmmD=(11 13 15 17 19 1B 1D 1F)

    movdqa      xmmG, xmmE
    punpcklbw   xmmE, xmmF      ; xmmE=(20 22 24 26 28 2A 2C 2E)
    punpckhbw   xmmG, xmmF      ; xmmG=(30 32 34 36 38 3A 3C 3E)

    ; xmmF (the zero register) and xmmH are both needed as destinations, so
    ; the bytes are placed in the high half of each word and shifted down.
    punpcklbw   xmmF, xmmH
    punpckhbw   xmmH, xmmH
    psrlw       xmmF, BYTE_BIT  ; xmmF=(21 23 25 27 29 2B 2D 2F)
    psrlw       xmmH, BYTE_BIT  ; xmmH=(31 33 35 37 39 3B 3D 3F)

%endif  ; RGB_PIXELSIZE ; ---------------

    ; xmm0=R(02468ACE)=RE, xmm2=G(02468ACE)=GE, xmm4=B(02468ACE)=BE
    ; xmm1=R(13579BDF)=RO, xmm3=G(13579BDF)=GO, xmm5=B(13579BDF)=BO

    ; (Original)
    ; Y  =  0.29900 * R + 0.58700 * G + 0.11400 * B
    ; Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + CENTERJSAMPLE
    ; Cr =  0.50000 * R - 0.41869 * G - 0.08131 * B + CENTERJSAMPLE
    ;
    ; (This implementation)
    ; Y  =  0.29900 * R + 0.33700 * G + 0.11400 * B + 0.25000 * G
    ; Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + CENTERJSAMPLE
    ; Cr =  0.50000 * R - 0.41869 * G - 0.08131 * B + CENTERJSAMPLE
    ;
    ; Suffixes below: E/O = even/odd pixels, L/H = low/high four lanes.
    ; The 0.5 terms are formed without a multiply: the word is unpacked into
    ; the high half of a dword (value << 16) and shifted right by one.

    movdqa      XMMWORD [wk(0)], xmm0   ; wk(0)=RE
    movdqa      XMMWORD [wk(1)], xmm1   ; wk(1)=RO
    movdqa      XMMWORD [wk(2)], xmm4   ; wk(2)=BE
    movdqa      XMMWORD [wk(3)], xmm5   ; wk(3)=BO

    ; ---- odd pixels: R/G part of Y (kept in wk) and Cb ----

    movdqa      xmm6, xmm1
    punpcklwd   xmm1, xmm3
    punpckhwd   xmm6, xmm3
    movdqa      xmm7, xmm1
    movdqa      xmm4, xmm6
    pmaddwd     xmm1, [rel PW_F0299_F0337]  ; xmm1=ROL*FIX(0.299)+GOL*FIX(0.337)
    pmaddwd     xmm6, [rel PW_F0299_F0337]  ; xmm6=ROH*FIX(0.299)+GOH*FIX(0.337)
    pmaddwd     xmm7, [rel PW_MF016_MF033]  ; xmm7=ROL*-FIX(0.168)+GOL*-FIX(0.331)
    pmaddwd     xmm4, [rel PW_MF016_MF033]  ; xmm4=ROH*-FIX(0.168)+GOH*-FIX(0.331)

    movdqa      XMMWORD [wk(4)], xmm1   ; wk(4)=ROL*FIX(0.299)+GOL*FIX(0.337)
    movdqa      XMMWORD [wk(5)], xmm6   ; wk(5)=ROH*FIX(0.299)+GOH*FIX(0.337)

    pxor        xmm1, xmm1
    pxor        xmm6, xmm6
    punpcklwd   xmm1, xmm5              ; xmm1=BOL
    punpckhwd   xmm6, xmm5              ; xmm6=BOH
    psrld       xmm1, 1                 ; xmm1=BOL*FIX(0.500)
    psrld       xmm6, 1                 ; xmm6=BOH*FIX(0.500)

    movdqa      xmm5, [rel PD_ONEHALFM1_CJ]  ; xmm5=[PD_ONEHALFM1_CJ]

    paddd       xmm7, xmm1
    paddd       xmm4, xmm6
    paddd       xmm7, xmm5
    paddd       xmm4, xmm5
    psrld       xmm7, SCALEBITS         ; xmm7=CbOL
    psrld       xmm4, SCALEBITS         ; xmm4=CbOH
    packssdw    xmm7, xmm4              ; xmm7=CbO

    ; ---- even pixels: R/G part of Y (kept in wk) and Cb ----

    movdqa      xmm1, XMMWORD [wk(2)]   ; xmm1=BE

    movdqa      xmm6, xmm0
    punpcklwd   xmm0, xmm2
    punpckhwd   xmm6, xmm2
    movdqa      xmm5, xmm0
    movdqa      xmm4, xmm6
    pmaddwd     xmm0, [rel PW_F0299_F0337]  ; xmm0=REL*FIX(0.299)+GEL*FIX(0.337)
    pmaddwd     xmm6, [rel PW_F0299_F0337]  ; xmm6=REH*FIX(0.299)+GEH*FIX(0.337)
    pmaddwd     xmm5, [rel PW_MF016_MF033]  ; xmm5=REL*-FIX(0.168)+GEL*-FIX(0.331)
    pmaddwd     xmm4, [rel PW_MF016_MF033]  ; xmm4=REH*-FIX(0.168)+GEH*-FIX(0.331)

    movdqa      XMMWORD [wk(6)], xmm0   ; wk(6)=REL*FIX(0.299)+GEL*FIX(0.337)
    movdqa      XMMWORD [wk(7)], xmm6   ; wk(7)=REH*FIX(0.299)+GEH*FIX(0.337)

    pxor        xmm0, xmm0
    pxor        xmm6, xmm6
    punpcklwd   xmm0, xmm1              ; xmm0=BEL
    punpckhwd   xmm6, xmm1              ; xmm6=BEH
    psrld       xmm0, 1                 ; xmm0=BEL*FIX(0.500)
    psrld       xmm6, 1                 ; xmm6=BEH*FIX(0.500)

    movdqa      xmm1, [rel PD_ONEHALFM1_CJ]  ; xmm1=[PD_ONEHALFM1_CJ]

    paddd       xmm5, xmm0
    paddd       xmm4, xmm6
    paddd       xmm5, xmm1
    paddd       xmm4, xmm1
    psrld       xmm5, SCALEBITS         ; xmm5=CbEL
    psrld       xmm4, SCALEBITS         ; xmm4=CbEH
    packssdw    xmm5, xmm4              ; xmm5=CbE

    ; Re-interleave: odd samples go to the high byte of each word.
    psllw       xmm7, BYTE_BIT
    por         xmm5, xmm7              ; xmm5=Cb
    movdqa      XMMWORD [rbx], xmm5     ; Save Cb

    ; ---- odd pixels: finish Y, compute Cr ----

    movdqa      xmm0, XMMWORD [wk(3)]   ; xmm0=BO
    movdqa      xmm6, XMMWORD [wk(2)]   ; xmm6=BE
    movdqa      xmm1, XMMWORD [wk(1)]   ; xmm1=RO

    movdqa      xmm4, xmm0
    punpcklwd   xmm0, xmm3
    punpckhwd   xmm4, xmm3
    movdqa      xmm7, xmm0
    movdqa      xmm5, xmm4
    pmaddwd     xmm0, [rel PW_F0114_F0250]  ; xmm0=BOL*FIX(0.114)+GOL*FIX(0.250)
    pmaddwd     xmm4, [rel PW_F0114_F0250]  ; xmm4=BOH*FIX(0.114)+GOH*FIX(0.250)
    pmaddwd     xmm7, [rel PW_MF008_MF041]  ; xmm7=BOL*-FIX(0.081)+GOL*-FIX(0.418)
    pmaddwd     xmm5, [rel PW_MF008_MF041]  ; xmm5=BOH*-FIX(0.081)+GOH*-FIX(0.418)

    movdqa      xmm3, [rel PD_ONEHALF]      ; xmm3=[PD_ONEHALF]

    paddd       xmm0, XMMWORD [wk(4)]
    paddd       xmm4, XMMWORD [wk(5)]
    paddd       xmm0, xmm3
    paddd       xmm4, xmm3
    psrld       xmm0, SCALEBITS         ; xmm0=YOL
    psrld       xmm4, SCALEBITS         ; xmm4=YOH
    packssdw    xmm0, xmm4              ; xmm0=YO

    pxor        xmm3, xmm3
    pxor        xmm4, xmm4
    punpcklwd   xmm3, xmm1              ; xmm3=ROL
    punpckhwd   xmm4, xmm1              ; xmm4=ROH
    psrld       xmm3, 1                 ; xmm3=ROL*FIX(0.500)
    psrld       xmm4, 1                 ; xmm4=ROH*FIX(0.500)

    movdqa      xmm1, [rel PD_ONEHALFM1_CJ]  ; xmm1=[PD_ONEHALFM1_CJ]

    paddd       xmm7, xmm3
    paddd       xmm5, xmm4
    paddd       xmm7, xmm1
    paddd       xmm5, xmm1
    psrld       xmm7, SCALEBITS         ; xmm7=CrOL
    psrld       xmm5, SCALEBITS         ; xmm5=CrOH
    packssdw    xmm7, xmm5              ; xmm7=CrO

    ; ---- even pixels: finish Y, compute Cr ----

    movdqa      xmm3, XMMWORD [wk(0)]   ; xmm3=RE

    movdqa      xmm4, xmm6
    punpcklwd   xmm6, xmm2
    punpckhwd   xmm4, xmm2
    movdqa      xmm1, xmm6
    movdqa      xmm5, xmm4
    pmaddwd     xmm6, [rel PW_F0114_F0250]  ; xmm6=BEL*FIX(0.114)+GEL*FIX(0.250)
    pmaddwd     xmm4, [rel PW_F0114_F0250]  ; xmm4=BEH*FIX(0.114)+GEH*FIX(0.250)
    pmaddwd     xmm1, [rel PW_MF008_MF041]  ; xmm1=BEL*-FIX(0.081)+GEL*-FIX(0.418)
    pmaddwd     xmm5, [rel PW_MF008_MF041]  ; xmm5=BEH*-FIX(0.081)+GEH*-FIX(0.418)

    movdqa      xmm2, [rel PD_ONEHALF]      ; xmm2=[PD_ONEHALF]

    paddd       xmm6, XMMWORD [wk(6)]
    paddd       xmm4, XMMWORD [wk(7)]
    paddd       xmm6, xmm2
    paddd       xmm4, xmm2
    psrld       xmm6, SCALEBITS         ; xmm6=YEL
    psrld       xmm4, SCALEBITS         ; xmm4=YEH
    packssdw    xmm6, xmm4              ; xmm6=YE

    psllw       xmm0, BYTE_BIT
    por         xmm6, xmm0              ; xmm6=Y
    movdqa      XMMWORD [rdi], xmm6     ; Save Y

    pxor        xmm2, xmm2
    pxor        xmm4, xmm4
    punpcklwd   xmm2, xmm3              ; xmm2=REL
    punpckhwd   xmm4, xmm3              ; xmm4=REH
    psrld       xmm2, 1                 ; xmm2=REL*FIX(0.500)
    psrld       xmm4, 1                 ; xmm4=REH*FIX(0.500)

    movdqa      xmm0, [rel PD_ONEHALFM1_CJ]  ; xmm0=[PD_ONEHALFM1_CJ]

    paddd       xmm1, xmm2
    paddd       xmm5, xmm4
    paddd       xmm1, xmm0
    paddd       xmm5, xmm0
    psrld       xmm1, SCALEBITS         ; xmm1=CrEL
    psrld       xmm5, SCALEBITS         ; xmm5=CrEH
    packssdw    xmm1, xmm5              ; xmm1=CrE

    psllw       xmm7, BYTE_BIT
    por         xmm1, xmm7              ; xmm1=Cr
    movdqa      XMMWORD [rdx], xmm1     ; Save Cr

    ; Advance to the next group; a remainder smaller than one XMMWORD of
    ; pixels goes through the piecewise tail loader.
    sub         rcx, byte SIZEOF_XMMWORD
    add         rsi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD  ; inptr
    add         rdi, byte SIZEOF_XMMWORD                ; outptr0
    add         rbx, byte SIZEOF_XMMWORD                ; outptr1
    add         rdx, byte SIZEOF_XMMWORD                ; outptr2
    cmp         rcx, byte SIZEOF_XMMWORD
    jae         near .columnloop
    test        rcx, rcx
    jnz         near .column_ld1

    ; Row finished: restore the width and the row-array cursors.
    pop         rcx                     ; col
    pop         rsi
    pop         rdi
    pop         rbx
    pop         rdx

    add         rsi, byte SIZEOF_JSAMPROW  ; input_buf
    add         rdi, byte SIZEOF_JSAMPROW
    add         rbx, byte SIZEOF_JSAMPROW
    add         rdx, byte SIZEOF_JSAMPROW
    dec         rax                        ; num_rows
    jg          near .rowloop

.return:
    pop         rbx
    uncollect_args 5
    mov         rsp, rbp                ; rsp <- aligned rbp
    pop         rsp                     ; rsp <- pre-alignment rsp (-> saved rbp)
    pop         rbp
    ret

; For some reason, the OS X linker does not honor the request to align the
; segment unless we do this.
    align       32
485