;
; jccolext.asm - colorspace conversion (64-bit SSE2)
;
; Copyright (C) 2009, 2016, D. R. Commander.
; Copyright (C) 2018, Matthias Räncker.
;
; Based on the x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru.
; For conditions of distribution and use, see copyright notice in jsimdext.inc
;
; This file should be assembled with NASM (Netwide Assembler),
; can *not* be assembled with Microsoft's MASM or any compatible
; assembler (including Borland's Turbo Assembler).
; NASM is available from http://nasm.sourceforge.net/ or
; http://sourceforge.net/project/showfiles.php?group_id=6208
;
; NOTE(review): RGB_PIXELSIZE, the function name macros, and the xmmA..xmmH
; register aliases used below are presumably bound by jcolsamp.inc before this
; file is included, so that one body serves multiple RGB orderings/widths —
; confirm in jcolsamp.inc.

%include "jcolsamp.inc"

; --------------------------------------------------------------------------
;
; Convert some rows of samples to the output colorspace.
;
; GLOBAL(void)
; jsimd_rgb_ycc_convert_sse2(JDIMENSION img_width, JSAMPARRAY input_buf,
;                            JSAMPIMAGE output_buf, JDIMENSION output_row,
;                            int num_rows);
;

; Argument registers after collect_args (see jsimdext.inc):
; r10d = JDIMENSION img_width
; r11  = JSAMPARRAY input_buf
; r12  = JSAMPIMAGE output_buf
; r13d = JDIMENSION output_row
; r14d = int num_rows

; Temporary xmmword spill area below the aligned frame pointer:
; wk(0)..wk(WK_NUM-1), addressed relative to rbp.
%define wk(i)   rbp - (WK_NUM - (i)) * SIZEOF_XMMWORD  ; xmmword wk[WK_NUM]
%define WK_NUM  8

    align       32
    GLOBAL_FUNCTION(jsimd_rgb_ycc_convert_sse2)

EXTN(jsimd_rgb_ycc_convert_sse2):
    push        rbp
    mov         rax, rsp                     ; rax = original rbp
    sub         rsp, byte 4                  ; make room so the AND below cannot land on the saved slot
    and         rsp, byte (-SIZEOF_XMMWORD)  ; align to 128 bits
    mov         [rsp], rax                   ; remember the original rsp for the epilogue
    mov         rbp, rsp                     ; rbp = aligned rbp
    lea         rsp, [wk(0)]                 ; reserve the wk[] spill area
    collect_args 5                           ; load the 5 C args into r10-r14 (register map above)
    push        rbx                          ; rbx is callee-saved and used as outptr1 below

    mov         ecx, r10d                    ; ecx = img_width (zero-extends into rcx)
    test        rcx, rcx
    jz          near .return                 ; zero-width image: nothing to do

    push        rcx                          ; preserve img_width across the plane-pointer setup

    ; output_buf is an array of 3 component planes (Y, Cb, Cr); index each
    ; plane by output_row to get the first row pointer for this call.
    mov         rsi, r12
    mov         ecx, r13d                    ; ecx = output_row
    ; NOTE(review): rdip/rbxp/rdxp look like pointer-size register aliases
    ; (2018 change for i386/ILP32 compatibility) — confirm in jsimdext.inc.
    mov         rdip, JSAMPARRAY [rsi+0*SIZEOF_JSAMPARRAY]
    mov         rbxp, JSAMPARRAY [rsi+1*SIZEOF_JSAMPARRAY]
    mov         rdxp, JSAMPARRAY [rsi+2*SIZEOF_JSAMPARRAY]
    lea         rdi, [rdi+rcx*SIZEOF_JSAMPROW]  ; rdi = &Y  rows[output_row]
    lea         rbx, [rbx+rcx*SIZEOF_JSAMPROW]  ; rbx = &Cb rows[output_row]
    lea         rdx, [rdx+rcx*SIZEOF_JSAMPROW]  ; rdx = &Cr rows[output_row]

    pop         rcx                          ; rcx = img_width again

    mov         rsi, r11                     ; rsi = input_buf (array of RGB row pointers)
    mov         eax, r14d                    ; rax = num_rows
    test        rax, rax
    jle         near .return                 ; num_rows <= 0: nothing to do

.rowloop:
    ; Save the row-pointer cursors and the column count; the loop body
    ; repurposes these registers as the actual sample pointers.
    push        rdx
    push        rbx
    push        rdi
    push        rsi
    push        rcx                          ; col

    mov         rsip, JSAMPROW [rsi]         ; inptr   (packed RGB samples)
    mov         rdip, JSAMPROW [rdi]         ; outptr0 (Y)
    mov         rbxp, JSAMPROW [rbx]         ; outptr1 (Cb)
    mov         rdxp, JSAMPROW [rdx]         ; outptr2 (Cr)

    cmp         rcx, byte SIZEOF_XMMWORD
    jae         near .columnloop             ; full 16-pixel group available

%if RGB_PIXELSIZE == 3  ; ---------------

    ; Partial-load ladder for a trailing group of fewer than 16 pixels:
    ; rcx is first scaled to a byte count (pixels * 3), then each power-of-two
    ; slice of that count is loaded and shifted into place so that xmmA/xmmF/
    ; xmmB end up holding the same layout .columnloop would have produced
    ; (bytes past the image edge are never read).
.column_ld1:
    push        rax                          ; rax/rdx are live (num_rows / outptr2)
    push        rdx
    lea         rcx, [rcx+rcx*2]             ; imul ecx,RGB_PIXELSIZE
    test        cl, SIZEOF_BYTE              ; odd byte left over?
    jz          short .column_ld2
    sub         rcx, byte SIZEOF_BYTE
    movzx       rax, byte [rsi+rcx]
.column_ld2:
    test        cl, SIZEOF_WORD              ; 2-byte slice left over?
    jz          short .column_ld4
    sub         rcx, byte SIZEOF_WORD
    movzx       rdx, word [rsi+rcx]
    shl         rax, WORD_BIT                ; make room below the byte slice
    or          rax, rdx
.column_ld4:
    movd        xmmA, eax                    ; low 1-3 bytes gathered into xmmA
    pop         rdx
    pop         rax
    test        cl, SIZEOF_DWORD             ; 4-byte slice?
    jz          short .column_ld8
    sub         rcx, byte SIZEOF_DWORD
    movd        xmmF, XMM_DWORD [rsi+rcx]
    pslldq      xmmA, SIZEOF_DWORD           ; shift previous bytes up, merge below
    por         xmmA, xmmF
.column_ld8:
    test        cl, SIZEOF_MMWORD            ; 8-byte slice?
    jz          short .column_ld16
    sub         rcx, byte SIZEOF_MMWORD
    movq        xmmB, XMM_MMWORD [rsi+rcx]
    pslldq      xmmA, SIZEOF_MMWORD
    por         xmmA, xmmB
.column_ld16:
    test        cl, SIZEOF_XMMWORD           ; full 16-byte slice?
    jz          short .column_ld32
    movdqa      xmmF, xmmA                   ; gathered tail becomes the second xmmword
    movdqu      xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD]
    mov         rcx, SIZEOF_XMMWORD          ; process exactly one 16-pixel group
    jmp         short .rgb_ycc_cnv
.column_ld32:
    test        cl, 2*SIZEOF_XMMWORD         ; full 32-byte slice?
    mov         rcx, SIZEOF_XMMWORD          ; (flags set above; mov preserves them)
    jz          short .rgb_ycc_cnv
    movdqa      xmmB, xmmA                   ; gathered tail becomes the third xmmword
    movdqu      xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD]
    movdqu      xmmF, XMMWORD [rsi+1*SIZEOF_XMMWORD]
    jmp         short .rgb_ycc_cnv

.columnloop:
    ; Full group: 16 pixels = 48 bytes of packed 3-byte RGB.
    movdqu      xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD]
    movdqu      xmmF, XMMWORD [rsi+1*SIZEOF_XMMWORD]
    movdqu      xmmB, XMMWORD [rsi+2*SIZEOF_XMMWORD]

.rgb_ycc_cnv:
    ; De-interleave 16 packed 3-byte pixels into separate R/G/B planes,
    ; split into even- and odd-numbered pixels.  Byte maps below use
    ; "component row" digits: 0x = R, 1x = G, 2x = B, x = pixel index.
    ; xmmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05)
    ; xmmF=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A)
    ; xmmB=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F)

    movdqa      xmmG, xmmA
    pslldq      xmmA, 8     ; xmmA=(-- -- -- -- -- -- -- -- 00 10 20 01 11 21 02 12)
    psrldq      xmmG, 8     ; xmmG=(22 03 13 23 04 14 24 05 -- -- -- -- -- -- -- --)

    punpckhbw   xmmA, xmmF  ; xmmA=(00 08 10 18 20 28 01 09 11 19 21 29 02 0A 12 1A)
    pslldq      xmmF, 8     ; xmmF=(-- -- -- -- -- -- -- -- 15 25 06 16 26 07 17 27)

    punpcklbw   xmmG, xmmB  ; xmmG=(22 2A 03 0B 13 1B 23 2B 04 0C 14 1C 24 2C 05 0D)
    punpckhbw   xmmF, xmmB  ; xmmF=(15 1D 25 2D 06 0E 16 1E 26 2E 07 0F 17 1F 27 2F)

    movdqa      xmmD, xmmA
    pslldq      xmmA, 8     ; xmmA=(-- -- -- -- -- -- -- -- 00 08 10 18 20 28 01 09)
    psrldq      xmmD, 8     ; xmmD=(11 19 21 29 02 0A 12 1A -- -- -- -- -- -- -- --)

    punpckhbw   xmmA, xmmG  ; xmmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 01 05 09 0D)
    pslldq      xmmG, 8     ; xmmG=(-- -- -- -- -- -- -- -- 22 2A 03 0B 13 1B 23 2B)

    punpcklbw   xmmD, xmmF  ; xmmD=(11 15 19 1D 21 25 29 2D 02 06 0A 0E 12 16 1A 1E)
    punpckhbw   xmmG, xmmF  ; xmmG=(22 26 2A 2E 03 07 0B 0F 13 17 1B 1F 23 27 2B 2F)

    movdqa      xmmE, xmmA
    pslldq      xmmA, 8     ; xmmA=(-- -- -- -- -- -- -- -- 00 04 08 0C 10 14 18 1C)
    psrldq      xmmE, 8     ; xmmE=(20 24 28 2C 01 05 09 0D -- -- -- -- -- -- -- --)

    punpckhbw   xmmA, xmmD  ; xmmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E)
    pslldq      xmmD, 8     ; xmmD=(-- -- -- -- -- -- -- -- 11 15 19 1D 21 25 29 2D)

    punpcklbw   xmmE, xmmG  ; xmmE=(20 22 24 26 28 2A 2C 2E 01 03 05 07 09 0B 0D 0F)
    punpckhbw   xmmD, xmmG  ; xmmD=(11 13 15 17 19 1B 1D 1F 21 23 25 27 29 2B 2D 2F)

    ; Widen each plane's bytes to words (zero-extend against xmmH=0).
    pxor        xmmH, xmmH

    movdqa      xmmC, xmmA
    punpcklbw   xmmA, xmmH  ; xmmA=(00 02 04 06 08 0A 0C 0E)  R even
    punpckhbw   xmmC, xmmH  ; xmmC=(10 12 14 16 18 1A 1C 1E)  G even

    movdqa      xmmB, xmmE
    punpcklbw   xmmE, xmmH  ; xmmE=(20 22 24 26 28 2A 2C 2E)  B even
    punpckhbw   xmmB, xmmH  ; xmmB=(01 03 05 07 09 0B 0D 0F)  R odd

    movdqa      xmmF, xmmD
    punpcklbw   xmmD, xmmH  ; xmmD=(11 13 15 17 19 1B 1D 1F)  G odd
    punpckhbw   xmmF, xmmH  ; xmmF=(21 23 25 27 29 2B 2D 2F)  B odd

%else  ; RGB_PIXELSIZE == 4 ; -----------

    ; Partial-load ladder for fewer than 16 pixels of 4-byte RGBX: rcx counts
    ; pixels, and each power-of-two pixel slice (1, 2, 4, 8) is merged so that
    ; xmmA/xmmE/xmmF/xmmH match the layout .columnloop would have produced.
.column_ld1:
    test        cl, SIZEOF_XMMWORD/16        ; 1 trailing pixel?
    jz          short .column_ld2
    sub         rcx, byte SIZEOF_XMMWORD/16
    movd        xmmA, XMM_DWORD [rsi+rcx*RGB_PIXELSIZE]
.column_ld2:
    test        cl, SIZEOF_XMMWORD/8         ; 2-pixel slice?
    jz          short .column_ld4
    sub         rcx, byte SIZEOF_XMMWORD/8
    movq        xmmE, XMM_MMWORD [rsi+rcx*RGB_PIXELSIZE]
    pslldq      xmmA, SIZEOF_MMWORD          ; shift previous pixels up, merge below
    por         xmmA, xmmE
.column_ld4:
    test        cl, SIZEOF_XMMWORD/4         ; 4-pixel slice?
    jz          short .column_ld8
    sub         rcx, byte SIZEOF_XMMWORD/4
    movdqa      xmmE, xmmA                   ; gathered pixels become the second xmmword
    movdqu      xmmA, XMMWORD [rsi+rcx*RGB_PIXELSIZE]
.column_ld8:
    test        cl, SIZEOF_XMMWORD/2         ; 8-pixel slice?
    mov         rcx, SIZEOF_XMMWORD          ; process exactly one 16-pixel group
    jz          short .rgb_ycc_cnv
    movdqa      xmmF, xmmA                   ; gathered pixels become words 3 and 4
    movdqa      xmmH, xmmE
    movdqu      xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD]
    movdqu      xmmE, XMMWORD [rsi+1*SIZEOF_XMMWORD]
    jmp         short .rgb_ycc_cnv

.columnloop:
    ; Full group: 16 pixels = 64 bytes of packed 4-byte RGBX.
    movdqu      xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD]
    movdqu      xmmE, XMMWORD [rsi+1*SIZEOF_XMMWORD]
    movdqu      xmmF, XMMWORD [rsi+2*SIZEOF_XMMWORD]
    movdqu      xmmH, XMMWORD [rsi+3*SIZEOF_XMMWORD]

.rgb_ycc_cnv:
    ; De-interleave 16 packed 4-byte pixels (0x=R, 1x=G, 2x=B, 3x=4th byte).
    ; xmmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33)
    ; xmmE=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37)
    ; xmmF=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B)
    ; xmmH=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F)

    movdqa      xmmD, xmmA
    punpcklbw   xmmA, xmmE  ; xmmA=(00 04 10 14 20 24 30 34 01 05 11 15 21 25 31 35)
    punpckhbw   xmmD, xmmE  ; xmmD=(02 06 12 16 22 26 32 36 03 07 13 17 23 27 33 37)

    movdqa      xmmC, xmmF
    punpcklbw   xmmF, xmmH  ; xmmF=(08 0C 18 1C 28 2C 38 3C 09 0D 19 1D 29 2D 39 3D)
    punpckhbw   xmmC, xmmH  ; xmmC=(0A 0E 1A 1E 2A 2E 3A 3E 0B 0F 1B 1F 2B 2F 3B 3F)

    movdqa      xmmB, xmmA
    punpcklwd   xmmA, xmmF  ; xmmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 30 34 38 3C)
    punpckhwd   xmmB, xmmF  ; xmmB=(01 05 09 0D 11 15 19 1D 21 25 29 2D 31 35 39 3D)

    movdqa      xmmG, xmmD
    punpcklwd   xmmD, xmmC  ; xmmD=(02 06 0A 0E 12 16 1A 1E 22 26 2A 2E 32 36 3A 3E)
    punpckhwd   xmmG, xmmC  ; xmmG=(03 07 0B 0F 13 17 1B 1F 23 27 2B 2F 33 37 3B 3F)

    movdqa      xmmE, xmmA
    punpcklbw   xmmA, xmmD  ; xmmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E)
    punpckhbw   xmmE, xmmD  ; xmmE=(20 22 24 26 28 2A 2C 2E 30 32 34 36 38 3A 3C 3E)

    movdqa      xmmH, xmmB
    punpcklbw   xmmB, xmmG  ; xmmB=(01 03 05 07 09 0B 0D 0F 11 13 15 17 19 1B 1D 1F)
    punpckhbw   xmmH, xmmG  ; xmmH=(21 23 25 27 29 2B 2D 2F 31 33 35 37 39 3B 3D 3F)

    ; Widen each plane's bytes to words (zero-extend against xmmF=0).
    pxor        xmmF, xmmF

    movdqa      xmmC, xmmA
    punpcklbw   xmmA, xmmF  ; xmmA=(00 02 04 06 08 0A 0C 0E)  R even
    punpckhbw   xmmC, xmmF  ; xmmC=(10 12 14 16 18 1A 1C 1E)  G even

    movdqa      xmmD, xmmB
    punpcklbw   xmmB, xmmF  ; xmmB=(01 03 05 07 09 0B 0D 0F)  R odd
    punpckhbw   xmmD, xmmF  ; xmmD=(11 13 15 17 19 1B 1D 1F)  G odd

    movdqa      xmmG, xmmE
    punpcklbw   xmmE, xmmF  ; xmmE=(20 22 24 26 28 2A 2C 2E)  B even
    punpckhbw   xmmG, xmmF  ; xmmG=(30 32 34 36 38 3A 3C 3E)  4th-byte even (unused)

    ; xmmF is the zero register here, so these unpacks put the B-odd bytes in
    ; each word's high half; the shift right by BYTE_BIT then zero-extends.
    punpcklbw   xmmF, xmmH
    punpckhbw   xmmH, xmmH
    psrlw       xmmF, BYTE_BIT  ; xmmF=(21 23 25 27 29 2B 2D 2F)  B odd
    psrlw       xmmH, BYTE_BIT  ; xmmH=(31 33 35 37 39 3B 3D 3F)  4th-byte odd (unused)

%endif  ; RGB_PIXELSIZE ; ---------------

    ; From here on both paths use the numeric aliases of the planes:
    ; xmm0=R(02468ACE)=RE, xmm2=G(02468ACE)=GE, xmm4=B(02468ACE)=BE
    ; xmm1=R(13579BDF)=RO, xmm3=G(13579BDF)=GO, xmm5=B(13579BDF)=BO

    ; (Original)
    ; Y  =  0.29900 * R + 0.58700 * G + 0.11400 * B
    ; Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + CENTERJSAMPLE
    ; Cr =  0.50000 * R - 0.41869 * G - 0.08131 * B + CENTERJSAMPLE
    ;
    ; (This implementation)
    ; Y  =  0.29900 * R + 0.33700 * G + 0.11400 * B + 0.25000 * G
    ; Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + CENTERJSAMPLE
    ; Cr =  0.50000 * R - 0.41869 * G - 0.08131 * B + CENTERJSAMPLE
    ;
    ; The 0.587*G term is split as 0.337*G + 0.25*G so every pmaddwd constant
    ; fits a signed word; the 0.25*G, 0.5*B, and 0.5*R terms are produced for
    ; free by placing the component in each dword's high word (punpck?wd with
    ; a zero low half) and shifting the dword right by 1 (psrld 1).

    movdqa      XMMWORD [wk(0)], xmm0  ; wk(0)=RE
    movdqa      XMMWORD [wk(1)], xmm1  ; wk(1)=RO
    movdqa      XMMWORD [wk(2)], xmm4  ; wk(2)=BE
    movdqa      XMMWORD [wk(3)], xmm5  ; wk(3)=BO

    ; --- Odd pixels: R/G cross terms for Y and Cb ---
    movdqa      xmm6, xmm1
    punpcklwd   xmm1, xmm3             ; interleave RO/GO pairs for pmaddwd
    punpckhwd   xmm6, xmm3
    movdqa      xmm7, xmm1
    movdqa      xmm4, xmm6
    pmaddwd     xmm1, [rel PW_F0299_F0337]  ; xmm1=ROL*FIX(0.299)+GOL*FIX(0.337)
    pmaddwd     xmm6, [rel PW_F0299_F0337]  ; xmm6=ROH*FIX(0.299)+GOH*FIX(0.337)
    pmaddwd     xmm7, [rel PW_MF016_MF033]  ; xmm7=ROL*-FIX(0.168)+GOL*-FIX(0.331)
    pmaddwd     xmm4, [rel PW_MF016_MF033]  ; xmm4=ROH*-FIX(0.168)+GOH*-FIX(0.331)

    movdqa      XMMWORD [wk(4)], xmm1  ; wk(4)=ROL*FIX(0.299)+GOL*FIX(0.337)
    movdqa      XMMWORD [wk(5)], xmm6  ; wk(5)=ROH*FIX(0.299)+GOH*FIX(0.337)

    ; --- Odd pixels: Cb = (R,G terms) + 0.5*B + round/center constant ---
    pxor        xmm1, xmm1
    pxor        xmm6, xmm6
    punpcklwd   xmm1, xmm5             ; xmm1=BOL (BO in high word of each dword)
    punpckhwd   xmm6, xmm5             ; xmm6=BOH
    psrld       xmm1, 1                ; xmm1=BOL*FIX(0.500)
    psrld       xmm6, 1                ; xmm6=BOH*FIX(0.500)

    movdqa      xmm5, [rel PD_ONEHALFM1_CJ]  ; xmm5=[PD_ONEHALFM1_CJ]

    paddd       xmm7, xmm1
    paddd       xmm4, xmm6
    paddd       xmm7, xmm5
    paddd       xmm4, xmm5
    psrld       xmm7, SCALEBITS        ; xmm7=CbOL
    psrld       xmm4, SCALEBITS        ; xmm4=CbOH
    packssdw    xmm7, xmm4             ; xmm7=CbO

    movdqa      xmm1, XMMWORD [wk(2)]  ; xmm1=BE

    ; --- Even pixels: R/G cross terms for Y and Cb ---
    movdqa      xmm6, xmm0
    punpcklwd   xmm0, xmm2
    punpckhwd   xmm6, xmm2
    movdqa      xmm5, xmm0
    movdqa      xmm4, xmm6
    pmaddwd     xmm0, [rel PW_F0299_F0337]  ; xmm0=REL*FIX(0.299)+GEL*FIX(0.337)
    pmaddwd     xmm6, [rel PW_F0299_F0337]  ; xmm6=REH*FIX(0.299)+GEH*FIX(0.337)
    pmaddwd     xmm5, [rel PW_MF016_MF033]  ; xmm5=REL*-FIX(0.168)+GEL*-FIX(0.331)
    pmaddwd     xmm4, [rel PW_MF016_MF033]  ; xmm4=REH*-FIX(0.168)+GEH*-FIX(0.331)

    movdqa      XMMWORD [wk(6)], xmm0  ; wk(6)=REL*FIX(0.299)+GEL*FIX(0.337)
    movdqa      XMMWORD [wk(7)], xmm6  ; wk(7)=REH*FIX(0.299)+GEH*FIX(0.337)

    ; --- Even pixels: Cb ---
    pxor        xmm0, xmm0
    pxor        xmm6, xmm6
    punpcklwd   xmm0, xmm1             ; xmm0=BEL
    punpckhwd   xmm6, xmm1             ; xmm6=BEH
    psrld       xmm0, 1                ; xmm0=BEL*FIX(0.500)
    psrld       xmm6, 1                ; xmm6=BEH*FIX(0.500)

    movdqa      xmm1, [rel PD_ONEHALFM1_CJ]  ; xmm1=[PD_ONEHALFM1_CJ]

    paddd       xmm5, xmm0
    paddd       xmm4, xmm6
    paddd       xmm5, xmm1
    paddd       xmm4, xmm1
    psrld       xmm5, SCALEBITS        ; xmm5=CbEL
    psrld       xmm4, SCALEBITS        ; xmm4=CbEH
    packssdw    xmm5, xmm4             ; xmm5=CbE

    ; Re-interleave even/odd samples: odd bytes go to the high byte of each
    ; word, even bytes stay in the low byte.
    psllw       xmm7, BYTE_BIT
    por         xmm5, xmm7             ; xmm5=Cb
    movdqa      XMMWORD [rbx], xmm5    ; Save Cb

    movdqa      xmm0, XMMWORD [wk(3)]  ; xmm0=BO
    movdqa      xmm6, XMMWORD [wk(2)]  ; xmm6=BE
    movdqa      xmm1, XMMWORD [wk(1)]  ; xmm1=RO

    ; --- Odd pixels: B/G cross terms for Y and Cr ---
    movdqa      xmm4, xmm0
    punpcklwd   xmm0, xmm3
    punpckhwd   xmm4, xmm3
    movdqa      xmm7, xmm0
    movdqa      xmm5, xmm4
    pmaddwd     xmm0, [rel PW_F0114_F0250]  ; xmm0=BOL*FIX(0.114)+GOL*FIX(0.250)
    pmaddwd     xmm4, [rel PW_F0114_F0250]  ; xmm4=BOH*FIX(0.114)+GOH*FIX(0.250)
    pmaddwd     xmm7, [rel PW_MF008_MF041]  ; xmm7=BOL*-FIX(0.081)+GOL*-FIX(0.418)
    pmaddwd     xmm5, [rel PW_MF008_MF041]  ; xmm5=BOH*-FIX(0.081)+GOH*-FIX(0.418)

    movdqa      xmm3, [rel PD_ONEHALF]  ; xmm3=[PD_ONEHALF]

    ; --- Odd pixels: Y = wk(4/5) + (B,G terms) + rounding ---
    paddd       xmm0, XMMWORD [wk(4)]
    paddd       xmm4, XMMWORD [wk(5)]
    paddd       xmm0, xmm3
    paddd       xmm4, xmm3
    psrld       xmm0, SCALEBITS        ; xmm0=YOL
    psrld       xmm4, SCALEBITS        ; xmm4=YOH
    packssdw    xmm0, xmm4             ; xmm0=YO

    ; --- Odd pixels: Cr = (B,G terms) + 0.5*R + round/center constant ---
    pxor        xmm3, xmm3
    pxor        xmm4, xmm4
    punpcklwd   xmm3, xmm1             ; xmm3=ROL
    punpckhwd   xmm4, xmm1             ; xmm4=ROH
    psrld       xmm3, 1                ; xmm3=ROL*FIX(0.500)
    psrld       xmm4, 1                ; xmm4=ROH*FIX(0.500)

    movdqa      xmm1, [rel PD_ONEHALFM1_CJ]  ; xmm1=[PD_ONEHALFM1_CJ]

    paddd       xmm7, xmm3
    paddd       xmm5, xmm4
    paddd       xmm7, xmm1
    paddd       xmm5, xmm1
    psrld       xmm7, SCALEBITS        ; xmm7=CrOL
    psrld       xmm5, SCALEBITS        ; xmm5=CrOH
    packssdw    xmm7, xmm5             ; xmm7=CrO

    movdqa      xmm3, XMMWORD [wk(0)]  ; xmm3=RE

    ; --- Even pixels: B/G cross terms for Y and Cr ---
    movdqa      xmm4, xmm6
    punpcklwd   xmm6, xmm2
    punpckhwd   xmm4, xmm2
    movdqa      xmm1, xmm6
    movdqa      xmm5, xmm4
    pmaddwd     xmm6, [rel PW_F0114_F0250]  ; xmm6=BEL*FIX(0.114)+GEL*FIX(0.250)
    pmaddwd     xmm4, [rel PW_F0114_F0250]  ; xmm4=BEH*FIX(0.114)+GEH*FIX(0.250)
    pmaddwd     xmm1, [rel PW_MF008_MF041]  ; xmm1=BEL*-FIX(0.081)+GEL*-FIX(0.418)
    pmaddwd     xmm5, [rel PW_MF008_MF041]  ; xmm5=BEH*-FIX(0.081)+GEH*-FIX(0.418)

    movdqa      xmm2, [rel PD_ONEHALF]  ; xmm2=[PD_ONEHALF]

    ; --- Even pixels: Y ---
    paddd       xmm6, XMMWORD [wk(6)]
    paddd       xmm4, XMMWORD [wk(7)]
    paddd       xmm6, xmm2
    paddd       xmm4, xmm2
    psrld       xmm6, SCALEBITS        ; xmm6=YEL
    psrld       xmm4, SCALEBITS        ; xmm4=YEH
    packssdw    xmm6, xmm4             ; xmm6=YE

    psllw       xmm0, BYTE_BIT
    por         xmm6, xmm0             ; xmm6=Y
    movdqa      XMMWORD [rdi], xmm6    ; Save Y

    ; --- Even pixels: Cr ---
    pxor        xmm2, xmm2
    pxor        xmm4, xmm4
    punpcklwd   xmm2, xmm3             ; xmm2=REL
    punpckhwd   xmm4, xmm3             ; xmm4=REH
    psrld       xmm2, 1                ; xmm2=REL*FIX(0.500)
    psrld       xmm4, 1                ; xmm4=REH*FIX(0.500)

    movdqa      xmm0, [rel PD_ONEHALFM1_CJ]  ; xmm0=[PD_ONEHALFM1_CJ]

    paddd       xmm1, xmm2
    paddd       xmm5, xmm4
    paddd       xmm1, xmm0
    paddd       xmm5, xmm0
    psrld       xmm1, SCALEBITS        ; xmm1=CrEL
    psrld       xmm5, SCALEBITS        ; xmm5=CrEH
    packssdw    xmm1, xmm5             ; xmm1=CrE

    psllw       xmm7, BYTE_BIT
    por         xmm1, xmm7             ; xmm1=Cr
    movdqa      XMMWORD [rdx], xmm1    ; Save Cr

    ; Advance to the next 16-pixel group in this row.
    sub         rcx, byte SIZEOF_XMMWORD
    add         rsi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD  ; inptr
    add         rdi, byte SIZEOF_XMMWORD                ; outptr0
    add         rbx, byte SIZEOF_XMMWORD                ; outptr1
    add         rdx, byte SIZEOF_XMMWORD                ; outptr2
    cmp         rcx, byte SIZEOF_XMMWORD
    jae         near .columnloop       ; another full group remains
    test        rcx, rcx
    jnz         near .column_ld1       ; partial tail group remains

    ; Restore the per-row cursors and step to the next row.
    pop         rcx                    ; col
    pop         rsi
    pop         rdi
    pop         rbx
    pop         rdx

    add         rsi, byte SIZEOF_JSAMPROW  ; input_buf
    add         rdi, byte SIZEOF_JSAMPROW
    add         rbx, byte SIZEOF_JSAMPROW
    add         rdx, byte SIZEOF_JSAMPROW
    dec         rax                    ; num_rows
    jg          near .rowloop

.return:
    pop         rbx
    uncollect_args 5
    mov         rsp, rbp               ; rsp <- aligned rbp
    pop         rsp                    ; rsp <- original rbp
    pop         rbp
    ret

; For some reason, the OS X linker does not honor the request to align the
; segment unless we do this.
    align       32