;
; jdsample.asm - upsampling (64-bit SSE2)
;
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
; Copyright (C) 2009, 2016, D. R. Commander.
; Copyright (C) 2018, Matthias Räncker.
;
; Based on the x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru.
; For conditions of distribution and use, see copyright notice in jsimdext.inc
;
; This file should be assembled with NASM (Netwide Assembler),
; can *not* be assembled with Microsoft's MASM or any compatible
; assembler (including Borland's Turbo Assembler).
; NASM is available from http://nasm.sourceforge.net/ or
; http://sourceforge.net/project/showfiles.php?group_id=6208

%include "jsimdext.inc"

; --------------------------------------------------------------------------
    SECTION     SEG_CONST

    alignz      32
    GLOBAL_DATA(jconst_fancy_upsample_sse2)

EXTN(jconst_fancy_upsample_sse2):

PW_ONE    times 8 dw 1
PW_TWO    times 8 dw 2
PW_THREE  times 8 dw 3
PW_SEVEN  times 8 dw 7
PW_EIGHT  times 8 dw 8

    alignz      32

; --------------------------------------------------------------------------
    SECTION     SEG_TEXT
    BITS        64
;
; Fancy processing for the common case of 2:1 horizontal and 1:1 vertical.
;
; The upsampling algorithm is linear interpolation between pixel centers,
; also known as a "triangle filter".  This is a good compromise between
; speed and visual quality.  The centers of the output pixels are 1/4 and 3/4
; of the way between input pixel centers.
;
; GLOBAL(void)
; jsimd_h2v1_fancy_upsample_sse2(int max_v_samp_factor,
;                                JDIMENSION downsampled_width,
;                                JSAMPARRAY input_data,
;                                JSAMPARRAY *output_data_ptr);
;

; r10 = int max_v_samp_factor
; r11d = JDIMENSION downsampled_width
; r12 = JSAMPARRAY input_data
; r13 = JSAMPARRAY *output_data_ptr
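;
; As a rough scalar sketch (illustrative only; the row-edge replication and
; dummy-sample padding done below are omitted), each pair of output samples
; is a weighted average of three neighboring input samples:
;
;   out[2*i]   = (3 * in[i] + in[i-1] + 1) >> 2;
;   out[2*i+1] = (3 * in[i] + in[i+1] + 2) >> 2;
;
; The SSE2 code below evaluates 16 input samples (32 output samples) per
; iteration, using PW_THREE for the center weight and PW_ONE/PW_TWO for the
; two rounding biases.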

    align       32
    GLOBAL_FUNCTION(jsimd_h2v1_fancy_upsample_sse2)

EXTN(jsimd_h2v1_fancy_upsample_sse2):
    push        rbp
    mov         rax, rsp
    mov         rbp, rsp
    collect_args 4

    mov         eax, r11d               ; colctr
    test        rax, rax
    jz          near .return

    mov         rcx, r10                ; rowctr
    test        rcx, rcx
    jz          near .return

    mov         rsi, r12                ; input_data
    mov         rdi, r13
    mov         rdip, JSAMPARRAY [rdi]  ; output_data
.rowloop:
    push        rax                     ; colctr
    push        rdi
    push        rsi

    mov         rsip, JSAMPROW [rsi]    ; inptr
    mov         rdip, JSAMPROW [rdi]    ; outptr

    test        rax, SIZEOF_XMMWORD-1
    jz          short .skip
    mov         dl, JSAMPLE [rsi+(rax-1)*SIZEOF_JSAMPLE]
    mov         JSAMPLE [rsi+rax*SIZEOF_JSAMPLE], dl  ; insert a dummy sample
.skip:
    pxor        xmm0, xmm0              ; xmm0=(all 0's)
    pcmpeqb     xmm7, xmm7
    psrldq      xmm7, (SIZEOF_XMMWORD-1)
    pand        xmm7, XMMWORD [rsi+0*SIZEOF_XMMWORD]

    add         rax, byte SIZEOF_XMMWORD-1
    and         rax, byte -SIZEOF_XMMWORD
    cmp         rax, byte SIZEOF_XMMWORD
    ja          short .columnloop

.columnloop_last:
    pcmpeqb     xmm6, xmm6
    pslldq      xmm6, (SIZEOF_XMMWORD-1)
    pand        xmm6, XMMWORD [rsi+0*SIZEOF_XMMWORD]
    jmp         short .upsample

.columnloop:
    movdqa      xmm6, XMMWORD [rsi+1*SIZEOF_XMMWORD]
    pslldq      xmm6, (SIZEOF_XMMWORD-1)

.upsample:
    movdqa      xmm1, XMMWORD [rsi+0*SIZEOF_XMMWORD]
    movdqa      xmm2, xmm1
    movdqa      xmm3, xmm1              ; xmm1=( 0 1 2 ... 13 14 15)
    pslldq      xmm2, 1                 ; xmm2=(-- 0 1 ... 12 13 14)
    psrldq      xmm3, 1                 ; xmm3=( 1 2 3 ... 14 15 --)

    por         xmm2, xmm7              ; xmm2=(-1 0 1 ... 12 13 14)
    por         xmm3, xmm6              ; xmm3=( 1 2 3 ... 14 15 16)

    movdqa      xmm7, xmm1
    psrldq      xmm7, (SIZEOF_XMMWORD-1)  ; xmm7=(15 -- -- ... -- -- --)

    movdqa      xmm4, xmm1
    punpcklbw   xmm1, xmm0              ; xmm1=( 0 1 2 3 4 5 6 7)
    punpckhbw   xmm4, xmm0              ; xmm4=( 8 9 10 11 12 13 14 15)
    movdqa      xmm5, xmm2
    punpcklbw   xmm2, xmm0              ; xmm2=(-1 0 1 2 3 4 5 6)
    punpckhbw   xmm5, xmm0              ; xmm5=( 7 8 9 10 11 12 13 14)
    movdqa      xmm6, xmm3
    punpcklbw   xmm3, xmm0              ; xmm3=( 1 2 3 4 5 6 7 8)
    punpckhbw   xmm6, xmm0              ; xmm6=( 9 10 11 12 13 14 15 16)

    pmullw      xmm1, [rel PW_THREE]
    pmullw      xmm4, [rel PW_THREE]
    paddw       xmm2, [rel PW_ONE]
    paddw       xmm5, [rel PW_ONE]
    paddw       xmm3, [rel PW_TWO]
    paddw       xmm6, [rel PW_TWO]

    paddw       xmm2, xmm1
    paddw       xmm5, xmm4
    psrlw       xmm2, 2                 ; xmm2=OutLE=( 0 2 4 6 8 10 12 14)
    psrlw       xmm5, 2                 ; xmm5=OutHE=(16 18 20 22 24 26 28 30)
    paddw       xmm3, xmm1
    paddw       xmm6, xmm4
    psrlw       xmm3, 2                 ; xmm3=OutLO=( 1 3 5 7 9 11 13 15)
    psrlw       xmm6, 2                 ; xmm6=OutHO=(17 19 21 23 25 27 29 31)

    psllw       xmm3, BYTE_BIT
    psllw       xmm6, BYTE_BIT
    por         xmm2, xmm3              ; xmm2=OutL=( 0 1 2 ... 13 14 15)
    por         xmm5, xmm6              ; xmm5=OutH=(16 17 18 ... 29 30 31)

    movdqa      XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm2
    movdqa      XMMWORD [rdi+1*SIZEOF_XMMWORD], xmm5

    sub         rax, byte SIZEOF_XMMWORD
    add         rsi, byte 1*SIZEOF_XMMWORD  ; inptr
    add         rdi, byte 2*SIZEOF_XMMWORD  ; outptr
    cmp         rax, byte SIZEOF_XMMWORD
    ja          near .columnloop
    test        eax, eax
    jnz         near .columnloop_last

    pop         rsi
    pop         rdi
    pop         rax

    add         rsi, byte SIZEOF_JSAMPROW  ; input_data
    add         rdi, byte SIZEOF_JSAMPROW  ; output_data
    dec         rcx                     ; rowctr
    jg          near .rowloop

.return:
    uncollect_args 4
    pop         rbp
    ret

; --------------------------------------------------------------------------
;
; Fancy processing for the common case of 2:1 horizontal and 2:1 vertical.
; Again a triangle filter; see comments for h2v1 case, above.
;
; GLOBAL(void)
; jsimd_h2v2_fancy_upsample_sse2(int max_v_samp_factor,
;                                JDIMENSION downsampled_width,
;                                JSAMPARRAY input_data,
;                                JSAMPARRAY *output_data_ptr);
;

; r10 = int max_v_samp_factor
; r11d = JDIMENSION downsampled_width
; r12 = JSAMPARRAY input_data
; r13 = JSAMPARRAY *output_data_ptr
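;
; As a rough scalar sketch (illustrative only; edge handling omitted), each
; output row is formed by a 3:1 vertical interpolation into an intermediate
; row, followed by the same horizontal interpolation as the h2v1 case:
;
;   Int0[i] = 3 * row[0][i] + row[-1][i];
;   Int1[i] = 3 * row[0][i] + row[+1][i];
;   out0[2*i]   = (3 * Int0[i] + Int0[i-1] + 8) >> 4;
;   out0[2*i+1] = (3 * Int0[i] + Int0[i+1] + 7) >> 4;
;   /* out1[] is derived from Int1[] in the same way */
;
; The code below stages the intermediate rows in the output buffers and keeps
; the boundary words in the wk[] scratch area declared below.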

%define wk(i)  rbp - (WK_NUM - (i)) * SIZEOF_XMMWORD  ; xmmword wk[WK_NUM]
%define WK_NUM  4

    align       32
    GLOBAL_FUNCTION(jsimd_h2v2_fancy_upsample_sse2)

EXTN(jsimd_h2v2_fancy_upsample_sse2):
    push        rbp
    mov         rax, rsp                ; rax = original rbp
    sub         rsp, byte 4
    and         rsp, byte (-SIZEOF_XMMWORD)  ; align to 128 bits
    mov         [rsp], rax
    mov         rbp, rsp                ; rbp = aligned rbp
    lea         rsp, [wk(0)]
    collect_args 4
    push        rbx

    mov         eax, r11d               ; colctr
    test        rax, rax
    jz          near .return

    mov         rcx, r10                ; rowctr
    test        rcx, rcx
    jz          near .return

    mov         rsi, r12                ; input_data
    mov         rdi, r13
    mov         rdip, JSAMPARRAY [rdi]  ; output_data
.rowloop:
    push        rax                     ; colctr
    push        rcx
    push        rdi
    push        rsi

    mov         rcxp, JSAMPROW [rsi-1*SIZEOF_JSAMPROW]  ; inptr1(above)
    mov         rbxp, JSAMPROW [rsi+0*SIZEOF_JSAMPROW]  ; inptr0
    mov         rsip, JSAMPROW [rsi+1*SIZEOF_JSAMPROW]  ; inptr1(below)
    mov         rdxp, JSAMPROW [rdi+0*SIZEOF_JSAMPROW]  ; outptr0
    mov         rdip, JSAMPROW [rdi+1*SIZEOF_JSAMPROW]  ; outptr1

    test        rax, SIZEOF_XMMWORD-1
    jz          short .skip
    push        rdx
    mov         dl, JSAMPLE [rcx+(rax-1)*SIZEOF_JSAMPLE]
    mov         JSAMPLE [rcx+rax*SIZEOF_JSAMPLE], dl
    mov         dl, JSAMPLE [rbx+(rax-1)*SIZEOF_JSAMPLE]
    mov         JSAMPLE [rbx+rax*SIZEOF_JSAMPLE], dl
    mov         dl, JSAMPLE [rsi+(rax-1)*SIZEOF_JSAMPLE]
    mov         JSAMPLE [rsi+rax*SIZEOF_JSAMPLE], dl  ; insert a dummy sample
    pop         rdx
.skip:
    ; -- process the first column block

    movdqa      xmm0, XMMWORD [rbx+0*SIZEOF_XMMWORD]  ; xmm0=row[ 0][0]
    movdqa      xmm1, XMMWORD [rcx+0*SIZEOF_XMMWORD]  ; xmm1=row[-1][0]
    movdqa      xmm2, XMMWORD [rsi+0*SIZEOF_XMMWORD]  ; xmm2=row[+1][0]

    pxor        xmm3, xmm3              ; xmm3=(all 0's)
    movdqa      xmm4, xmm0
    punpcklbw   xmm0, xmm3              ; xmm0=row[ 0]( 0 1 2 3 4 5 6 7)
    punpckhbw   xmm4, xmm3              ; xmm4=row[ 0]( 8 9 10 11 12 13 14 15)
    movdqa      xmm5, xmm1
    punpcklbw   xmm1, xmm3              ; xmm1=row[-1]( 0 1 2 3 4 5 6 7)
    punpckhbw   xmm5, xmm3              ; xmm5=row[-1]( 8 9 10 11 12 13 14 15)
    movdqa      xmm6, xmm2
    punpcklbw   xmm2, xmm3              ; xmm2=row[+1]( 0 1 2 3 4 5 6 7)
    punpckhbw   xmm6, xmm3              ; xmm6=row[+1]( 8 9 10 11 12 13 14 15)

    pmullw      xmm0, [rel PW_THREE]
    pmullw      xmm4, [rel PW_THREE]

    pcmpeqb     xmm7, xmm7
    psrldq      xmm7, (SIZEOF_XMMWORD-2)

    paddw       xmm1, xmm0              ; xmm1=Int0L=( 0 1 2 3 4 5 6 7)
    paddw       xmm5, xmm4              ; xmm5=Int0H=( 8 9 10 11 12 13 14 15)
    paddw       xmm2, xmm0              ; xmm2=Int1L=( 0 1 2 3 4 5 6 7)
    paddw       xmm6, xmm4              ; xmm6=Int1H=( 8 9 10 11 12 13 14 15)

    movdqa      XMMWORD [rdx+0*SIZEOF_XMMWORD], xmm1  ; temporarily save
    movdqa      XMMWORD [rdx+1*SIZEOF_XMMWORD], xmm5  ; the intermediate data
    movdqa      XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm2
    movdqa      XMMWORD [rdi+1*SIZEOF_XMMWORD], xmm6

    pand        xmm1, xmm7              ; xmm1=( 0 -- -- -- -- -- -- --)
    pand        xmm2, xmm7              ; xmm2=( 0 -- -- -- -- -- -- --)

    movdqa      XMMWORD [wk(0)], xmm1
    movdqa      XMMWORD [wk(1)], xmm2

    add         rax, byte SIZEOF_XMMWORD-1
    and         rax, byte -SIZEOF_XMMWORD
    cmp         rax, byte SIZEOF_XMMWORD
    ja          short .columnloop

.columnloop_last:
    ; -- process the last column block

    pcmpeqb     xmm1, xmm1
    pslldq      xmm1, (SIZEOF_XMMWORD-2)
    movdqa      xmm2, xmm1
xmm1 299 300 pand xmm1, XMMWORD [rdx+1*SIZEOF_XMMWORD] 301 pand xmm2, XMMWORD [rdi+1*SIZEOF_XMMWORD] 302 303 movdqa XMMWORD [wk(2)], xmm1 ; xmm1=(-- -- -- -- -- -- -- 15) 304 movdqa XMMWORD [wk(3)], xmm2 ; xmm2=(-- -- -- -- -- -- -- 15) 305 306 jmp near .upsample 307 308.columnloop: 309 ; -- process the next column block 310 311 movdqa xmm0, XMMWORD [rbx+1*SIZEOF_XMMWORD] ; xmm0=row[ 0][1] 312 movdqa xmm1, XMMWORD [rcx+1*SIZEOF_XMMWORD] ; xmm1=row[-1][1] 313 movdqa xmm2, XMMWORD [rsi+1*SIZEOF_XMMWORD] ; xmm2=row[+1][1] 314 315 pxor xmm3, xmm3 ; xmm3=(all 0's) 316 movdqa xmm4, xmm0 317 punpcklbw xmm0, xmm3 ; xmm0=row[ 0]( 0 1 2 3 4 5 6 7) 318 punpckhbw xmm4, xmm3 ; xmm4=row[ 0]( 8 9 10 11 12 13 14 15) 319 movdqa xmm5, xmm1 320 punpcklbw xmm1, xmm3 ; xmm1=row[-1]( 0 1 2 3 4 5 6 7) 321 punpckhbw xmm5, xmm3 ; xmm5=row[-1]( 8 9 10 11 12 13 14 15) 322 movdqa xmm6, xmm2 323 punpcklbw xmm2, xmm3 ; xmm2=row[+1]( 0 1 2 3 4 5 6 7) 324 punpckhbw xmm6, xmm3 ; xmm6=row[+1]( 8 9 10 11 12 13 14 15) 325 326 pmullw xmm0, [rel PW_THREE] 327 pmullw xmm4, [rel PW_THREE] 328 329 paddw xmm1, xmm0 ; xmm1=Int0L=( 0 1 2 3 4 5 6 7) 330 paddw xmm5, xmm4 ; xmm5=Int0H=( 8 9 10 11 12 13 14 15) 331 paddw xmm2, xmm0 ; xmm2=Int1L=( 0 1 2 3 4 5 6 7) 332 paddw xmm6, xmm4 ; xmm6=Int1H=( 8 9 10 11 12 13 14 15) 333 334 movdqa XMMWORD [rdx+2*SIZEOF_XMMWORD], xmm1 ; temporarily save 335 movdqa XMMWORD [rdx+3*SIZEOF_XMMWORD], xmm5 ; the intermediate data 336 movdqa XMMWORD [rdi+2*SIZEOF_XMMWORD], xmm2 337 movdqa XMMWORD [rdi+3*SIZEOF_XMMWORD], xmm6 338 339 pslldq xmm1, (SIZEOF_XMMWORD-2) ; xmm1=(-- -- -- -- -- -- -- 0) 340 pslldq xmm2, (SIZEOF_XMMWORD-2) ; xmm2=(-- -- -- -- -- -- -- 0) 341 342 movdqa XMMWORD [wk(2)], xmm1 343 movdqa XMMWORD [wk(3)], xmm2 344 345.upsample: 346 ; -- process the upper row 347 348 movdqa xmm7, XMMWORD [rdx+0*SIZEOF_XMMWORD] 349 movdqa xmm3, XMMWORD [rdx+1*SIZEOF_XMMWORD] 350 351 movdqa xmm0, xmm7 ; xmm7=Int0L=( 0 1 2 3 4 5 6 7) 352 movdqa xmm4, xmm3 ; xmm3=Int0H=( 8 9 10 11 12 13 14 15) 353 psrldq xmm0, 2 ; xmm0=( 1 2 3 4 5 6 7 --) 354 pslldq xmm4, (SIZEOF_XMMWORD-2) ; xmm4=(-- -- -- -- -- -- -- 8) 355 movdqa xmm5, xmm7 356 movdqa xmm6, xmm3 357 psrldq xmm5, (SIZEOF_XMMWORD-2) ; xmm5=( 7 -- -- -- -- -- -- --) 358 pslldq xmm6, 2 ; xmm6=(-- 8 9 10 11 12 13 14) 359 360 por xmm0, xmm4 ; xmm0=( 1 2 3 4 5 6 7 8) 361 por xmm5, xmm6 ; xmm5=( 7 8 9 10 11 12 13 14) 362 363 movdqa xmm1, xmm7 364 movdqa xmm2, xmm3 365 pslldq xmm1, 2 ; xmm1=(-- 0 1 2 3 4 5 6) 366 psrldq xmm2, 2 ; xmm2=( 9 10 11 12 13 14 15 --) 367 movdqa xmm4, xmm3 368 psrldq xmm4, (SIZEOF_XMMWORD-2) ; xmm4=(15 -- -- -- -- -- -- --) 369 370 por xmm1, XMMWORD [wk(0)] ; xmm1=(-1 0 1 2 3 4 5 6) 371 por xmm2, XMMWORD [wk(2)] ; xmm2=( 9 10 11 12 13 14 15 16) 372 373 movdqa XMMWORD [wk(0)], xmm4 374 375 pmullw xmm7, [rel PW_THREE] 376 pmullw xmm3, [rel PW_THREE] 377 paddw xmm1, [rel PW_EIGHT] 378 paddw xmm5, [rel PW_EIGHT] 379 paddw xmm0, [rel PW_SEVEN] 380 paddw xmm2, [rel PW_SEVEN] 381 382 paddw xmm1, xmm7 383 paddw xmm5, xmm3 384 psrlw xmm1, 4 ; xmm1=Out0LE=( 0 2 4 6 8 10 12 14) 385 psrlw xmm5, 4 ; xmm5=Out0HE=(16 18 20 22 24 26 28 30) 386 paddw xmm0, xmm7 387 paddw xmm2, xmm3 388 psrlw xmm0, 4 ; xmm0=Out0LO=( 1 3 5 7 9 11 13 15) 389 psrlw xmm2, 4 ; xmm2=Out0HO=(17 19 21 23 25 27 29 31) 390 391 psllw xmm0, BYTE_BIT 392 psllw xmm2, BYTE_BIT 393 por xmm1, xmm0 ; xmm1=Out0L=( 0 1 2 ... 13 14 15) 394 por xmm5, xmm2 ; xmm5=Out0H=(16 17 18 ... 

    movdqa      XMMWORD [rdx+0*SIZEOF_XMMWORD], xmm1
    movdqa      XMMWORD [rdx+1*SIZEOF_XMMWORD], xmm5

    ; -- process the lower row

    movdqa      xmm6, XMMWORD [rdi+0*SIZEOF_XMMWORD]
    movdqa      xmm4, XMMWORD [rdi+1*SIZEOF_XMMWORD]

    movdqa      xmm7, xmm6              ; xmm6=Int1L=( 0 1 2 3 4 5 6 7)
    movdqa      xmm3, xmm4              ; xmm4=Int1H=( 8 9 10 11 12 13 14 15)
    psrldq      xmm7, 2                 ; xmm7=( 1 2 3 4 5 6 7 --)
    pslldq      xmm3, (SIZEOF_XMMWORD-2)  ; xmm3=(-- -- -- -- -- -- -- 8)
    movdqa      xmm0, xmm6
    movdqa      xmm2, xmm4
    psrldq      xmm0, (SIZEOF_XMMWORD-2)  ; xmm0=( 7 -- -- -- -- -- -- --)
    pslldq      xmm2, 2                 ; xmm2=(-- 8 9 10 11 12 13 14)

    por         xmm7, xmm3              ; xmm7=( 1 2 3 4 5 6 7 8)
    por         xmm0, xmm2              ; xmm0=( 7 8 9 10 11 12 13 14)

    movdqa      xmm1, xmm6
    movdqa      xmm5, xmm4
    pslldq      xmm1, 2                 ; xmm1=(-- 0 1 2 3 4 5 6)
    psrldq      xmm5, 2                 ; xmm5=( 9 10 11 12 13 14 15 --)
    movdqa      xmm3, xmm4
    psrldq      xmm3, (SIZEOF_XMMWORD-2)  ; xmm3=(15 -- -- -- -- -- -- --)

    por         xmm1, XMMWORD [wk(1)]   ; xmm1=(-1 0 1 2 3 4 5 6)
    por         xmm5, XMMWORD [wk(3)]   ; xmm5=( 9 10 11 12 13 14 15 16)

    movdqa      XMMWORD [wk(1)], xmm3

    pmullw      xmm6, [rel PW_THREE]
    pmullw      xmm4, [rel PW_THREE]
    paddw       xmm1, [rel PW_EIGHT]
    paddw       xmm0, [rel PW_EIGHT]
    paddw       xmm7, [rel PW_SEVEN]
    paddw       xmm5, [rel PW_SEVEN]

    paddw       xmm1, xmm6
    paddw       xmm0, xmm4
    psrlw       xmm1, 4                 ; xmm1=Out1LE=( 0 2 4 6 8 10 12 14)
    psrlw       xmm0, 4                 ; xmm0=Out1HE=(16 18 20 22 24 26 28 30)
    paddw       xmm7, xmm6
    paddw       xmm5, xmm4
    psrlw       xmm7, 4                 ; xmm7=Out1LO=( 1 3 5 7 9 11 13 15)
    psrlw       xmm5, 4                 ; xmm5=Out1HO=(17 19 21 23 25 27 29 31)

    psllw       xmm7, BYTE_BIT
    psllw       xmm5, BYTE_BIT
    por         xmm1, xmm7              ; xmm1=Out1L=( 0 1 2 ... 13 14 15)
    por         xmm0, xmm5              ; xmm0=Out1H=(16 17 18 ... 29 30 31)

    movdqa      XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm1
    movdqa      XMMWORD [rdi+1*SIZEOF_XMMWORD], xmm0

    sub         rax, byte SIZEOF_XMMWORD
    add         rcx, byte 1*SIZEOF_XMMWORD  ; inptr1(above)
    add         rbx, byte 1*SIZEOF_XMMWORD  ; inptr0
    add         rsi, byte 1*SIZEOF_XMMWORD  ; inptr1(below)
    add         rdx, byte 2*SIZEOF_XMMWORD  ; outptr0
    add         rdi, byte 2*SIZEOF_XMMWORD  ; outptr1
    cmp         rax, byte SIZEOF_XMMWORD
    ja          near .columnloop
    test        rax, rax
    jnz         near .columnloop_last

    pop         rsi
    pop         rdi
    pop         rcx
    pop         rax

    add         rsi, byte 1*SIZEOF_JSAMPROW  ; input_data
    add         rdi, byte 2*SIZEOF_JSAMPROW  ; output_data
    sub         rcx, byte 2             ; rowctr
    jg          near .rowloop

.return:
    pop         rbx
    uncollect_args 4
    mov         rsp, rbp                ; rsp <- aligned rbp
    pop         rsp                     ; rsp <- original rbp
    pop         rbp
    ret

; --------------------------------------------------------------------------
;
; Fast processing for the common case of 2:1 horizontal and 1:1 vertical.
; It's still a box filter.
;
; GLOBAL(void)
; jsimd_h2v1_upsample_sse2(int max_v_samp_factor, JDIMENSION output_width,
;                          JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr);
;

; r10 = int max_v_samp_factor
; r11d = JDIMENSION output_width
; r12 = JSAMPARRAY input_data
; r13 = JSAMPARRAY *output_data_ptr
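;
; As a rough scalar sketch (illustrative only), each input sample is simply
; replicated into two adjacent output samples:
;
;   out[2*i] = out[2*i+1] = in[i];
;
; The code below does this 16 samples at a time with punpcklbw/punpckhbw.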

    align       32
    GLOBAL_FUNCTION(jsimd_h2v1_upsample_sse2)

EXTN(jsimd_h2v1_upsample_sse2):
    push        rbp
    mov         rax, rsp
    mov         rbp, rsp
    collect_args 4

    mov         edx, r11d
    add         rdx, byte (2*SIZEOF_XMMWORD)-1
    and         rdx, byte -(2*SIZEOF_XMMWORD)
    jz          near .return

    mov         rcx, r10                ; rowctr
    test        rcx, rcx
    jz          short .return

    mov         rsi, r12                ; input_data
    mov         rdi, r13
    mov         rdip, JSAMPARRAY [rdi]  ; output_data
.rowloop:
    push        rdi
    push        rsi

    mov         rsip, JSAMPROW [rsi]    ; inptr
    mov         rdip, JSAMPROW [rdi]    ; outptr
    mov         rax, rdx                ; colctr
.columnloop:

    movdqa      xmm0, XMMWORD [rsi+0*SIZEOF_XMMWORD]

    movdqa      xmm1, xmm0
    punpcklbw   xmm0, xmm0
    punpckhbw   xmm1, xmm1

    movdqa      XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm0
    movdqa      XMMWORD [rdi+1*SIZEOF_XMMWORD], xmm1

    sub         rax, byte 2*SIZEOF_XMMWORD
    jz          short .nextrow

    movdqa      xmm2, XMMWORD [rsi+1*SIZEOF_XMMWORD]

    movdqa      xmm3, xmm2
    punpcklbw   xmm2, xmm2
    punpckhbw   xmm3, xmm3

    movdqa      XMMWORD [rdi+2*SIZEOF_XMMWORD], xmm2
    movdqa      XMMWORD [rdi+3*SIZEOF_XMMWORD], xmm3

    sub         rax, byte 2*SIZEOF_XMMWORD
    jz          short .nextrow

    add         rsi, byte 2*SIZEOF_XMMWORD  ; inptr
    add         rdi, byte 4*SIZEOF_XMMWORD  ; outptr
    jmp         short .columnloop

.nextrow:
    pop         rsi
    pop         rdi

    add         rsi, byte SIZEOF_JSAMPROW  ; input_data
    add         rdi, byte SIZEOF_JSAMPROW  ; output_data
    dec         rcx                     ; rowctr
    jg          short .rowloop

.return:
    uncollect_args 4
    pop         rbp
    ret

; --------------------------------------------------------------------------
;
; Fast processing for the common case of 2:1 horizontal and 2:1 vertical.
; It's still a box filter.
;
; GLOBAL(void)
; jsimd_h2v2_upsample_sse2(int max_v_samp_factor, JDIMENSION output_width,
;                          JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr);
;

; r10 = int max_v_samp_factor
; r11d = JDIMENSION output_width
; r12 = JSAMPARRAY input_data
; r13 = JSAMPARRAY *output_data_ptr
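;
; As a rough scalar sketch (illustrative only): the same horizontal
; replication as the h2v1 case, with each result written to two output rows:
;
;   out0[2*i] = out0[2*i+1] = out1[2*i] = out1[2*i+1] = in[i];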

    align       32
    GLOBAL_FUNCTION(jsimd_h2v2_upsample_sse2)

EXTN(jsimd_h2v2_upsample_sse2):
    push        rbp
    mov         rax, rsp
    mov         rbp, rsp
    collect_args 4
    push        rbx

    mov         edx, r11d
    add         rdx, byte (2*SIZEOF_XMMWORD)-1
    and         rdx, byte -(2*SIZEOF_XMMWORD)
    jz          near .return

    mov         rcx, r10                ; rowctr
    test        rcx, rcx
    jz          near .return

    mov         rsi, r12                ; input_data
    mov         rdi, r13
    mov         rdip, JSAMPARRAY [rdi]  ; output_data
.rowloop:
    push        rdi
    push        rsi

    mov         rsip, JSAMPROW [rsi]    ; inptr
    mov         rbxp, JSAMPROW [rdi+0*SIZEOF_JSAMPROW]  ; outptr0
    mov         rdip, JSAMPROW [rdi+1*SIZEOF_JSAMPROW]  ; outptr1
    mov         rax, rdx                ; colctr
.columnloop:

    movdqa      xmm0, XMMWORD [rsi+0*SIZEOF_XMMWORD]

    movdqa      xmm1, xmm0
    punpcklbw   xmm0, xmm0
    punpckhbw   xmm1, xmm1

    movdqa      XMMWORD [rbx+0*SIZEOF_XMMWORD], xmm0
    movdqa      XMMWORD [rbx+1*SIZEOF_XMMWORD], xmm1
    movdqa      XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm0
    movdqa      XMMWORD [rdi+1*SIZEOF_XMMWORD], xmm1

    sub         rax, byte 2*SIZEOF_XMMWORD
    jz          short .nextrow

    movdqa      xmm2, XMMWORD [rsi+1*SIZEOF_XMMWORD]

    movdqa      xmm3, xmm2
    punpcklbw   xmm2, xmm2
    punpckhbw   xmm3, xmm3

    movdqa      XMMWORD [rbx+2*SIZEOF_XMMWORD], xmm2
    movdqa      XMMWORD [rbx+3*SIZEOF_XMMWORD], xmm3
    movdqa      XMMWORD [rdi+2*SIZEOF_XMMWORD], xmm2
    movdqa      XMMWORD [rdi+3*SIZEOF_XMMWORD], xmm3

    sub         rax, byte 2*SIZEOF_XMMWORD
    jz          short .nextrow

    add         rsi, byte 2*SIZEOF_XMMWORD  ; inptr
    add         rbx, byte 4*SIZEOF_XMMWORD  ; outptr0
    add         rdi, byte 4*SIZEOF_XMMWORD  ; outptr1
    jmp         short .columnloop

.nextrow:
    pop         rsi
    pop         rdi

    add         rsi, byte 1*SIZEOF_JSAMPROW  ; input_data
    add         rdi, byte 2*SIZEOF_JSAMPROW  ; output_data
    sub         rcx, byte 2             ; rowctr
    jg          near .rowloop

.return:
    pop         rbx
    uncollect_args 4
    pop         rbp
    ret

; For some reason, the OS X linker does not honor the request to align the
; segment unless we do this.
    align       32