;
; jcsample.asm - downsampling (64-bit AVX2)
;
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
; Copyright (C) 2009, 2016, D. R. Commander.
; Copyright (C) 2015, Intel Corporation.
; Copyright (C) 2018, Matthias Räncker.
;
; Based on the x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru.
; For conditions of distribution and use, see copyright notice in jsimdext.inc
;
; This file should be assembled with NASM (Netwide Assembler),
; can *not* be assembled with Microsoft's MASM or any compatible
; assembler (including Borland's Turbo Assembler).
; NASM is available from http://nasm.sourceforge.net/ or
; http://sourceforge.net/project/showfiles.php?group_id=6208

%include "jsimdext.inc"

; --------------------------------------------------------------------------
    SECTION     SEG_TEXT
    BITS        64
;
; Downsample pixel values of a single component.
; This version handles the common case of 2:1 horizontal and 1:1 vertical,
; without smoothing.
;
; GLOBAL(void)
; jsimd_h2v1_downsample_avx2(JDIMENSION image_width, int max_v_samp_factor,
;                            JDIMENSION v_samp_factor,
;                            JDIMENSION width_in_blocks, JSAMPARRAY input_data,
;                            JSAMPARRAY output_data);
;

; r10d = JDIMENSION image_width
; r11 = int max_v_samp_factor
; r12d = JDIMENSION v_samp_factor
; r13d = JDIMENSION width_in_blocks
; r14 = JSAMPARRAY input_data
; r15 = JSAMPARRAY output_data
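
; The kernel below averages pairs of horizontally adjacent samples, with an
; alternating bias of 0, 1, 0, 1, ... added before the shift so that rounding
; does not drift in one direction.  A minimal scalar C sketch of one row of
; this operation (illustrative only; the function and variable names are not
; part of the library):
;
;   #include <stdint.h>
;
;   static void h2v1_downsample_row(const uint8_t *inptr, uint8_t *outptr,
;                                   unsigned int output_cols)
;   {
;     unsigned int outcol, bias = 0;      /* bias pattern: 0, 1, 0, 1, ... */
;     for (outcol = 0; outcol < output_cols; outcol++) {
;       outptr[outcol] =
;         (uint8_t)((inptr[2 * outcol] + inptr[2 * outcol + 1] + bias) >> 1);
;       bias ^= 1;                        /* toggle between 0 and 1 */
;     }
;   }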

    align       32
    GLOBAL_FUNCTION(jsimd_h2v1_downsample_avx2)

EXTN(jsimd_h2v1_downsample_avx2):
    push        rbp
    mov         rax, rsp
    mov         rbp, rsp
    collect_args 6

    mov         ecx, r13d
    shl         rcx, 3                  ; imul rcx,DCTSIZE (rcx = output_cols)
    jz          near .return

    mov         edx, r10d

    ; -- expand_right_edge

    push        rcx
    shl         rcx, 1                  ; output_cols * 2
    sub         rcx, rdx
    jle         short .expand_end

    mov         rax, r11
    test        rax, rax
    jle         short .expand_end

    cld
    mov         rsi, r14                ; input_data
.expandloop:
    push        rax
    push        rcx

    mov         rdip, JSAMPROW [rsi]
    add         rdi, rdx
    mov         al, JSAMPLE [rdi-1]

    rep stosb

    pop         rcx
    pop         rax

    add         rsi, byte SIZEOF_JSAMPROW
    dec         rax
    jg          short .expandloop

.expand_end:
    pop         rcx                     ; output_cols

    ; -- h2v1_downsample

    mov         eax, r12d               ; rowctr
    test        eax, eax
    jle         near .return

    mov         rdx, 0x00010000         ; bias pattern
    vmovd       xmm7, edx
    vpshufd     xmm7, xmm7, 0x00        ; xmm7={0, 1, 0, 1, 0, 1, 0, 1}
    vperm2i128  ymm7, ymm7, ymm7, 0     ; ymm7={xmm7, xmm7}
    vpcmpeqw    ymm6, ymm6, ymm6
    vpsrlw      ymm6, ymm6, BYTE_BIT    ; ymm6={0xFF 0x00 0xFF 0x00 ..}

    mov         rsi, r14                ; input_data
    mov         rdi, r15                ; output_data
.rowloop:
    push        rcx
    push        rdi
    push        rsi

    mov         rsip, JSAMPROW [rsi]    ; inptr
    mov         rdip, JSAMPROW [rdi]    ; outptr

    cmp         rcx, byte SIZEOF_YMMWORD
    jae         short .columnloop

.columnloop_r24:
    ; rcx can possibly be 8, 16, 24
    cmp         rcx, 24
    jne         .columnloop_r16
    vmovdqu     ymm0, YMMWORD [rsi+0*SIZEOF_YMMWORD]
    vmovdqu     xmm1, XMMWORD [rsi+1*SIZEOF_YMMWORD]
    mov         rcx, SIZEOF_YMMWORD
    jmp         short .downsample

.columnloop_r16:
    cmp         rcx, 16
    jne         .columnloop_r8
    vmovdqu     ymm0, YMMWORD [rsi+0*SIZEOF_YMMWORD]
    vpxor       ymm1, ymm1, ymm1
    mov         rcx, SIZEOF_YMMWORD
    jmp         short .downsample

.columnloop_r8:
    vmovdqu     xmm0, XMMWORD [rsi+0*SIZEOF_YMMWORD]
    vpxor       ymm1, ymm1, ymm1
    mov         rcx, SIZEOF_YMMWORD
    jmp         short .downsample

.columnloop:
    vmovdqu     ymm0, YMMWORD [rsi+0*SIZEOF_YMMWORD]
    vmovdqu     ymm1, YMMWORD [rsi+1*SIZEOF_YMMWORD]

.downsample:
    vpsrlw      ymm2, ymm0, BYTE_BIT
    vpand       ymm0, ymm0, ymm6
    vpsrlw      ymm3, ymm1, BYTE_BIT
    vpand       ymm1, ymm1, ymm6

    vpaddw      ymm0, ymm0, ymm2
    vpaddw      ymm1, ymm1, ymm3
    vpaddw      ymm0, ymm0, ymm7
    vpaddw      ymm1, ymm1, ymm7
    vpsrlw      ymm0, ymm0, 1
    vpsrlw      ymm1, ymm1, 1

    vpackuswb   ymm0, ymm0, ymm1
    vpermq      ymm0, ymm0, 0xd8

    vmovdqu     YMMWORD [rdi+0*SIZEOF_YMMWORD], ymm0

    sub         rcx, byte SIZEOF_YMMWORD    ; outcol
    add         rsi, byte 2*SIZEOF_YMMWORD  ; inptr
    add         rdi, byte 1*SIZEOF_YMMWORD  ; outptr
    cmp         rcx, byte SIZEOF_YMMWORD
    jae         short .columnloop
    test        rcx, rcx
    jnz         near .columnloop_r24

    pop         rsi
    pop         rdi
    pop         rcx

    add         rsi, byte SIZEOF_JSAMPROW   ; input_data
    add         rdi, byte SIZEOF_JSAMPROW   ; output_data
    dec         rax                         ; rowctr
    jg          near .rowloop

.return:
    vzeroupper
    uncollect_args 6
    pop         rbp
    ret

; --------------------------------------------------------------------------
;
; Downsample pixel values of a single component.
; This version handles the standard case of 2:1 horizontal and 2:1 vertical,
; without smoothing.
;
; GLOBAL(void)
; jsimd_h2v2_downsample_avx2(JDIMENSION image_width, int max_v_samp_factor,
;                            JDIMENSION v_samp_factor,
;                            JDIMENSION width_in_blocks, JSAMPARRAY input_data,
;                            JSAMPARRAY output_data);
;

; r10d = JDIMENSION image_width
; r11 = int max_v_samp_factor
; r12d = JDIMENSION v_samp_factor
; r13d = JDIMENSION width_in_blocks
; r14 = JSAMPARRAY input_data
; r15 = JSAMPARRAY output_data
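
; The kernel below averages each 2x2 block of input samples, with an
; alternating bias of 1, 2, 1, 2, ... added before the shift so that rounding
; does not drift in one direction.  A minimal scalar C sketch of one output
; row of this operation (illustrative only; the function and variable names
; are not part of the library):
;
;   #include <stdint.h>
;
;   static void h2v2_downsample_row(const uint8_t *inptr0,
;                                   const uint8_t *inptr1, uint8_t *outptr,
;                                   unsigned int output_cols)
;   {
;     unsigned int outcol, bias = 1;      /* bias pattern: 1, 2, 1, 2, ... */
;     for (outcol = 0; outcol < output_cols; outcol++) {
;       outptr[outcol] =
;         (uint8_t)((inptr0[2 * outcol] + inptr0[2 * outcol + 1] +
;                    inptr1[2 * outcol] + inptr1[2 * outcol + 1] + bias) >> 2);
;       bias ^= 3;                        /* toggle between 1 and 2 */
;     }
;   }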

    align       32
    GLOBAL_FUNCTION(jsimd_h2v2_downsample_avx2)

EXTN(jsimd_h2v2_downsample_avx2):
    push        rbp
    mov         rax, rsp
    mov         rbp, rsp
    collect_args 6

    mov         ecx, r13d
    shl         rcx, 3                  ; imul rcx,DCTSIZE (rcx = output_cols)
    jz          near .return

    mov         edx, r10d

    ; -- expand_right_edge

    push        rcx
    shl         rcx, 1                  ; output_cols * 2
    sub         rcx, rdx
    jle         short .expand_end

    mov         rax, r11
    test        rax, rax
    jle         short .expand_end

    cld
    mov         rsi, r14                ; input_data
.expandloop:
    push        rax
    push        rcx

    mov         rdip, JSAMPROW [rsi]
    add         rdi, rdx
    mov         al, JSAMPLE [rdi-1]

    rep stosb

    pop         rcx
    pop         rax

    add         rsi, byte SIZEOF_JSAMPROW
    dec         rax
    jg          short .expandloop

.expand_end:
    pop         rcx                     ; output_cols

    ; -- h2v2_downsample

    mov         eax, r12d               ; rowctr
    test        rax, rax
    jle         near .return

    mov         rdx, 0x00020001         ; bias pattern
    vmovd       xmm7, edx
    vpcmpeqw    ymm6, ymm6, ymm6
    vpshufd     xmm7, xmm7, 0x00        ; ymm7={1, 2, 1, 2, 1, 2, 1, 2}
    vperm2i128  ymm7, ymm7, ymm7, 0
    vpsrlw      ymm6, ymm6, BYTE_BIT    ; ymm6={0xFF 0x00 0xFF 0x00 ..}

    mov         rsi, r14                ; input_data
    mov         rdi, r15                ; output_data
.rowloop:
    push        rcx
    push        rdi
    push        rsi

    mov         rdxp, JSAMPROW [rsi+0*SIZEOF_JSAMPROW]  ; inptr0
    mov         rsip, JSAMPROW [rsi+1*SIZEOF_JSAMPROW]  ; inptr1
    mov         rdip, JSAMPROW [rdi]                    ; outptr

    cmp         rcx, byte SIZEOF_YMMWORD
    jae         short .columnloop

.columnloop_r24:
    cmp         rcx, 24
    jne         .columnloop_r16
    vmovdqu     ymm0, YMMWORD [rdx+0*SIZEOF_YMMWORD]
    vmovdqu     ymm1, YMMWORD [rsi+0*SIZEOF_YMMWORD]
    vmovdqu     xmm2, XMMWORD [rdx+1*SIZEOF_YMMWORD]
    vmovdqu     xmm3, XMMWORD [rsi+1*SIZEOF_YMMWORD]
    mov         rcx, SIZEOF_YMMWORD
    jmp         short .downsample

.columnloop_r16:
    cmp         rcx, 16
    jne         .columnloop_r8
    vmovdqu     ymm0, YMMWORD [rdx+0*SIZEOF_YMMWORD]
    vmovdqu     ymm1, YMMWORD [rsi+0*SIZEOF_YMMWORD]
    vpxor       ymm2, ymm2, ymm2
    vpxor       ymm3, ymm3, ymm3
    mov         rcx, SIZEOF_YMMWORD
    jmp         short .downsample

.columnloop_r8:
    vmovdqu     xmm0, XMMWORD [rdx+0*SIZEOF_XMMWORD]
    vmovdqu     xmm1, XMMWORD [rsi+0*SIZEOF_XMMWORD]
    vpxor       ymm2, ymm2, ymm2
    vpxor       ymm3, ymm3, ymm3
    mov         rcx, SIZEOF_YMMWORD
    jmp         short .downsample

.columnloop:
    vmovdqu     ymm0, YMMWORD [rdx+0*SIZEOF_YMMWORD]
    vmovdqu     ymm1, YMMWORD [rsi+0*SIZEOF_YMMWORD]
    vmovdqu     ymm2, YMMWORD [rdx+1*SIZEOF_YMMWORD]
    vmovdqu     ymm3, YMMWORD [rsi+1*SIZEOF_YMMWORD]

.downsample:
    vpand       ymm4, ymm0, ymm6
    vpsrlw      ymm0, ymm0, BYTE_BIT
    vpand       ymm5, ymm1, ymm6
    vpsrlw      ymm1, ymm1, BYTE_BIT
    vpaddw      ymm0, ymm0, ymm4
    vpaddw      ymm1, ymm1, ymm5

    vpand       ymm4, ymm2, ymm6
    vpsrlw      ymm2, ymm2, BYTE_BIT
    vpand       ymm5, ymm3, ymm6
    vpsrlw      ymm3, ymm3, BYTE_BIT
    vpaddw      ymm2, ymm2, ymm4
    vpaddw      ymm3, ymm3, ymm5

    vpaddw      ymm0, ymm0, ymm1
    vpaddw      ymm2, ymm2, ymm3
    vpaddw      ymm0, ymm0, ymm7
    vpaddw      ymm2, ymm2, ymm7
    vpsrlw      ymm0, ymm0, 2
    vpsrlw      ymm2, ymm2, 2

    vpackuswb   ymm0, ymm0, ymm2
    vpermq      ymm0, ymm0, 0xd8

    vmovdqu     YMMWORD [rdi+0*SIZEOF_YMMWORD], ymm0

    sub         rcx, byte SIZEOF_YMMWORD    ; outcol
    add         rdx, byte 2*SIZEOF_YMMWORD  ; inptr0
    add         rsi, byte 2*SIZEOF_YMMWORD  ; inptr1
    add         rdi, byte 1*SIZEOF_YMMWORD  ; outptr
    cmp         rcx, byte SIZEOF_YMMWORD
    jae         near .columnloop
    test        rcx, rcx
    jnz         near .columnloop_r24

    pop         rsi
    pop         rdi
    pop         rcx

    add         rsi, byte 2*SIZEOF_JSAMPROW ; input_data
    add         rdi, byte 1*SIZEOF_JSAMPROW ; output_data
    dec         rax                         ; rowctr
    jg          near .rowloop

.return:
    vzeroupper
    uncollect_args 6
    pop         rbp
    ret

; For some reason, the OS X linker does not honor the request to align the
; segment unless we do this.
    align       32