;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;


%include "vpx_ports/x86_abi_support.asm"

;-----------------------------------------------------------------------
; Macros shared by the deblock functions.
;
; All three expect the symbol `flimit` to name a 16-byte memory operand
; that movdqa can read (i.e. 16-byte aligned) holding one threshold byte
; per pixel lane.
;-----------------------------------------------------------------------

;-----------------------------------------------------------------------
; FIRST_2_ROWS
; In:    xmm0 = 16 centre pixels
;        xmm1, xmm3 = 16 pixels from each of two neighbouring taps
; Out:   xmm5 = pavgb(xmm1, xmm3)
;        xmm7 = per-byte mask, 0xFF where |xmm0 - tap| >= flimit for
;               either tap, 0x00 otherwise
;        xmm0 is left untouched
; Clobb: xmm1 (ends up zero), xmm2, xmm3, xmm4, xmm6
;-----------------------------------------------------------------------
%macro FIRST_2_ROWS 0
        ; rounded average of the two taps (taken before they are destroyed)
        movdqa      xmm5,       xmm1
        pavgb       xmm5,       xmm3

        ; xmm4 = |centre - tap A|: one of the two saturating differences
        ; is always zero, so adding them yields the absolute value
        movdqa      xmm4,       xmm0
        psubusb     xmm4,       xmm1
        psubusb     xmm1,       xmm0
        paddusb     xmm4,       xmm1

        ; xmm6 = |centre - tap B|
        movdqa      xmm6,       xmm0
        psubusb     xmm6,       xmm3
        psubusb     xmm3,       xmm0
        paddusb     xmm6,       xmm3

        ; fetch the thresholds twice, once per tap
        movdqa      xmm2,       flimit
        movdqa      xmm7,       xmm2
        pxor        xmm1,       xmm1

        ; (flimit -sat- |diff|) == 0  <=>  |diff| >= flimit
        psubusb     xmm2,       xmm4
        pcmpeqb     xmm2,       xmm1
        psubusb     xmm7,       xmm6
        pcmpeqb     xmm7,       xmm1

        ; a lane is masked if either tap exceeded the threshold
        por         xmm7,       xmm2
%endmacro

;-----------------------------------------------------------------------
; SECOND_2_ROWS
; In:    xmm0 = 16 centre pixels (same as given to FIRST_2_ROWS)
;        xmm1, xmm3 = 16 pixels from each of the remaining two taps
;        xmm5, xmm7 = outputs of FIRST_2_ROWS
; Out:   xmm0 = per byte: the original centre pixel where any of the
;               four taps differed from it by >= flimit, otherwise
;               pavgb(pavgb(xmm5, pavgb(xmm1, xmm3)), centre)
; Clobb: xmm1 (ends up zero), xmm2, xmm3, xmm4, xmm5, xmm6, xmm7
;-----------------------------------------------------------------------
%macro SECOND_2_ROWS 0
        ; keep a copy of tap C, then fold both new taps into the average.
        ; pavgb rounds, so the order of the three averages matters.
        movdqa      xmm2,       xmm1
        pavgb       xmm1,       xmm3
        pavgb       xmm5,       xmm1
        pavgb       xmm5,       xmm0

        ; xmm6 = |centre - tap C|
        movdqa      xmm6,       xmm0
        psubusb     xmm6,       xmm2
        psubusb     xmm2,       xmm0
        paddusb     xmm6,       xmm2

        ; xmm4 = |centre - tap D|
        movdqa      xmm4,       xmm0
        psubusb     xmm4,       xmm3
        psubusb     xmm3,       xmm0
        paddusb     xmm4,       xmm3

        ; fetch the thresholds twice, once per tap
        movdqa      xmm2,       flimit
        movdqa      xmm3,       xmm2
        pxor        xmm1,       xmm1

        ; accumulate "difference too large" lanes into the mask
        psubusb     xmm2,       xmm6
        pcmpeqb     xmm2,       xmm1
        por         xmm7,       xmm2

        psubusb     xmm3,       xmm4
        pcmpeqb     xmm3,       xmm1
        por         xmm7,       xmm3

        ; select: masked lanes keep the original, the rest take the
        ; filtered value. The two halves are disjoint, so the saturating
        ; add simply merges them.
        pand        xmm0,       xmm7
        pandn       xmm7,       xmm5
        paddusb     xmm0,       xmm7
%endmacro

;-----------------------------------------------------------------------
; UPDATE_FLIMIT
; Copies the next 16 threshold bytes from [rbx] to [rsp] and advances
; rbx by 16.
; In:    rbx = read cursor into the threshold table
;        rsp = address of the 16-byte scratch slot
; Clobb: xmm2, rbx (advanced), flags
;-----------------------------------------------------------------------
%macro UPDATE_FLIMIT 0
        movdqu      xmm2,       XMMWORD PTR [rbx]
        movdqu      [rsp],      xmm2
        add         rbx,        16
%endmacro

SECTION .text

;void vpx_post_proc_down_and_across_mb_row_sse2
;(
;    unsigned char *src_ptr,
;    unsigned char *dst_ptr,
;    int src_pixels_per_line,
;    int dst_pixels_per_line,
;    int cols,
;    int *flimits,
;    int size
;)
; NOTE(review): UPDATE_FLIMIT consumes the threshold table 16 bytes at a
; time and the masks above compare byte-wise, which suggests one byte per
; column rather than one int - confirm against the C prototype.
;-----------------------------------------------------------------------
; vpx_post_proc_down_and_across_mb_row_sse2
;
; For each of arg(6) rows: filters the source row vertically (taps at
; rows -2, -1, +1, +2) into the destination row, then filters that
; destination row horizontally (taps at columns -2, -1, +1, +2) in place.
;
; Arguments, read through the arg() macro:
;   arg(0) src_ptr               arg(1) dst_ptr
;   arg(2) src_pixels_per_line   arg(3) dst_pixels_per_line
;   arg(4) cols                  arg(5) flimits
;   arg(6) number of rows to process
;
; Register roles:
;   rsi = current source position     rdi = current destination position
;   rax = source stride (negated temporarily to reach the rows above)
;   rbx = read cursor into flimits    rcx = rows remaining
;   rdx = column offset inside the current row
;   [rsp] (flimit) = threshold bytes for the current 16-column group
;   mm0:mm1 = previous 16 horizontally filtered pixels, stored one
;             iteration late so later taps still see unfiltered data
;
; Memory touched outside the row proper:
;   * reads source rows two above and two below the current row
;   * columns are handled 16 at a time, so the vertical pass reads and
;     writes up to the next multiple of 16 when cols is not one
;   * writes dst[-16 .. -1] and dst[cols .. cols+7] of every row
;
; Uses MMX registers (mm0, mm1) and contains no EMMS instruction.
;-----------------------------------------------------------------------
globalsym(vpx_post_proc_down_and_across_mb_row_sse2)
sym(vpx_post_proc_down_and_across_mb_row_sse2):
        push        rbp
        mov         rbp,        rsp
        SHADOW_ARGS_TO_STACK 7
        SAVE_XMM 7
        push        rbx
        push        rsi
        push        rdi
        ; end prolog

        ; 16-byte aligned scratch slot for the thresholds (read by movdqa)
        ALIGN_STACK 16, rax
        sub         rsp,        16

        ; put flimit on stack
        mov         rbx,        arg(5)                  ; flimits ptr
        UPDATE_FLIMIT

%define flimit [rsp]

        mov         rsi,        arg(0)                  ; src_ptr
        mov         rdi,        arg(1)                  ; dst_ptr

        movsxd      rax,        DWORD PTR arg(2)        ; src_pixels_per_line
        movsxd      rcx,        DWORD PTR arg(6)        ; rows in a macroblock

.nextrow:
        xor         rdx,        rdx                     ; col = 0

.nextcol:
        ; load the current row and the two rows below it
        movdqu      xmm0,       XMMWORD PTR [rsi]
        movdqu      xmm1,       XMMWORD PTR [rsi + rax]
        movdqu      xmm3,       XMMWORD PTR [rsi + 2*rax]

        FIRST_2_ROWS

        ; load the two rows above (stride negated for the addressing)
        neg         rax
        movdqu      xmm1,       XMMWORD PTR [rsi + 2*rax]
        movdqu      xmm3,       XMMWORD PTR [rsi + rax]

        SECOND_2_ROWS

        movdqu      XMMWORD PTR [rdi], xmm0

        neg         rax                                 ; back to positive stride
        add         rsi,        16
        add         rdi,        16

        add         rdx,        16
        cmp         edx,        dword arg(4)            ; cols
        jge         .downdone
        UPDATE_FLIMIT                                   ; thresholds for next 16 columns
        jmp         .nextcol

.downdone:
        ; all columns done; rewind both pointers to the row start and run
        ; the across filter in place on the destination row
        sub         rsi,        rdx
        sub         rdi,        rdx

        mov         rbx,        arg(5)                  ; flimits, from the start again
        UPDATE_FLIMIT

        ; dup the first byte into the left border 8 times
        movq        mm1,        [rdi]
        punpcklbw   mm1,        mm1
        punpcklwd   mm1,        mm1
        punpckldq   mm1,        mm1
        mov         rdx,        -8
        movq        [rdi+rdx],  mm1

        ; dup the last byte into the right border 8 times
        movsxd      rdx,        dword arg(4)
        movq        mm1,        [rdi + rdx + -1]
        punpcklbw   mm1,        mm1
        punpcklwd   mm1,        mm1
        punpckldq   mm1,        mm1
        movq        [rdi+rdx],  mm1

        ; prime the delayed-store registers with what is already in
        ; dst[-16 .. -1], so the first delayed store rewrites it unchanged
        xor         rdx,        rdx
        movq        mm0,        QWORD PTR [rdi-16];
        movq        mm1,        QWORD PTR [rdi-8];

.acrossnextcol:
        ; centre pixels and the two taps to the left
        movdqu      xmm0,       XMMWORD PTR [rdi + rdx]
        movdqu      xmm1,       XMMWORD PTR [rdi + rdx -2]
        movdqu      xmm3,       XMMWORD PTR [rdi + rdx -1]

        FIRST_2_ROWS

        ; the two taps to the right
        movdqu      xmm1,       XMMWORD PTR [rdi + rdx +1]
        movdqu      xmm3,       XMMWORD PTR [rdi + rdx +2]

        SECOND_2_ROWS

        ; write back the previous group only now that this group has
        ; read its left taps, then park the new result in mm0:mm1
        movq        QWORD PTR [rdi+rdx-16], mm0         ; store previous 8 bytes
        movq        QWORD PTR [rdi+rdx-8],  mm1         ; store previous 8 bytes
        movdq2q     mm0,        xmm0
        psrldq      xmm0,       8
        movdq2q     mm1,        xmm0

        add         rdx,        16
        cmp         edx,        dword arg(4)            ; cols
        jge         .acrossdone
        UPDATE_FLIMIT
        jmp         .acrossnextcol

.acrossdone:
        ; flush the last 16 pixels still held in mm0:mm1
        movq        QWORD PTR [rdi+rdx-16], mm0

        ; the upper 8 are stored only when the loop ended exactly at cols;
        ; otherwise they are dropped
        cmp         edx,        dword arg(4)
        jne         .throw_last_8
        movq        QWORD PTR [rdi+rdx-8],  mm1

.throw_last_8:
        ; done with this row
        add         rsi,        rax                     ; next src line
        ; Sign-extend both strides, matching the initial load of arg(2)
        ; above; a zero-extending reload would break negative strides
        ; from the second row onwards.
        movsxd      rax,        dword arg(3)            ; dst_pixels_per_line
        add         rdi,        rax                     ; next destination line
        movsxd      rax,        dword arg(2)            ; src_pixels_per_line

        mov         rbx,        arg(5)                  ; flimits
        UPDATE_FLIMIT

        dec         rcx                                 ; decrement row count
        jnz         .nextrow                            ; next row

        add         rsp,        16                      ; release the flimit slot
        pop         rsp                                 ; counterpart of ALIGN_STACK
        ; begin epilog
        pop         rdi
        pop         rsi
        pop         rbx
        RESTORE_XMM
        UNSHADOW_ARGS
        pop         rbp
        ret
%undef flimit


;void vpx_mbpost_proc_across_ip_sse2(unsigned char *src,
;                                int pitch, int rows, int cols,int flimit)
;-----------------------------------------------------------------------
; For every row, slides a 15-pixel window (s[c-7] .. s[c+7]) along the
; row while maintaining sum and sumsq of the window. Wherever
; 15*sumsq - sum*sum < flimit the pixel is replaced by
; (8 + sum + s[c]) >> 4. Four columns are handled per iteration.
;
; Arguments, read through the arg() macro:
;   arg(0) src (advanced in place by pitch after each row)
;   arg(1) pitch   arg(2) rows (decremented in place)
;   arg(3) cols    arg(4) flimit
;
; Register roles:
;   rsi = start of the current row (s)
;   rdi = index for the border write and the initial window loop
;   rcx = sum while priming the window, then the column index c
;   rdx = sumsq while priming the window, then the loop bound cols + 8
;   xmm0 = zero          xmm6 = running sum     xmm7 = running sumsq
;   [rsp] (flimit4) = flimit replicated into four dwords
;   mm0, mm1 = results of the two previous iterations; each group of
;              four pixels is written 8 columns behind where it was
;              computed so later reads of s[c-8] still see unfiltered
;              data
;
; Memory touched outside the row proper:
;   * writes s[-8 .. -1] and s[cols .. cols+7] of every row
;   * reads up to s[cols + 14]
;
; Uses MMX registers (mm0, mm1) and contains no EMMS instruction.
;-----------------------------------------------------------------------
globalsym(vpx_mbpost_proc_across_ip_sse2)
sym(vpx_mbpost_proc_across_ip_sse2):
        push        rbp
        mov         rbp,        rsp
        SHADOW_ARGS_TO_STACK 5
        SAVE_XMM 7
        GET_GOT     rbx
        push        rsi
        push        rdi
        ; end prolog

        ; 16-byte aligned scratch slot, used as a psubd memory operand
        ALIGN_STACK 16, rax
        sub         rsp,        16

        ; create flimit4 at [rsp]
        mov         eax,        dword ptr arg(4)        ; flimit
        mov         [rsp],      eax
        mov         [rsp+4],    eax
        mov         [rsp+8],    eax
        mov         [rsp+12],   eax
%define flimit4 [rsp]


        ;for(r=0;r<rows;r++)
.ip_row_loop:

        xor         rdx,        rdx                     ; sumsq=0;
        xor         rcx,        rcx                     ; sum=0;
        mov         rsi,        arg(0)                  ; s


        ; dup the first byte into the left border 8 times
        movq        mm1,        [rsi]
        punpcklbw   mm1,        mm1
        punpcklwd   mm1,        mm1
        punpckldq   mm1,        mm1

        mov         rdi,        -8
        movq        [rsi+rdi],  mm1

        ; dup the last byte into the right border 8 times
        ; NOTE(review): rdx was zeroed above as the sumsq accumulator but
        ; is reused here as the right-border index, so the loop below
        ; starts accumulating from sumsq == cols rather than 0. Left as
        ; is because it changes the filter decision - confirm against the
        ; reference implementation before altering.
        movsxd      rdx,        dword arg(3)
        movq        mm1,        [rsi + rdx + -1]
        punpcklbw   mm1,        mm1
        punpcklwd   mm1,        mm1
        punpckldq   mm1,        mm1
        movq        [rsi+rdx],  mm1

.ip_var_loop:
        ;for(i=-8;i<=6;i++)
        ;{
        ;   sumsq += s[i]*s[i];
        ;   sum   += s[i];
        ;}
        movzx       eax,        byte [rsi+rdi]
        add         ecx,        eax
        mul         al                                  ; ax = s[i]*s[i]; upper eax already 0
        add         edx,        eax
        add         rdi,        1
        cmp         rdi,        6
        jle         .ip_var_loop


        ;mov         rax,    sumsq
        ;movd        xmm7,   rax
        movd        xmm7,       edx                     ; lane 0 = sumsq, other lanes 0

        ;mov         rax,    sum
        ;movd        xmm6,   rax
        movd        xmm6,       ecx                     ; lane 0 = sum, other lanes 0

        mov         rsi,        arg(0)                  ; s
        xor         rcx,        rcx                     ; c = 0

        movsxd      rdx,        dword arg(3)            ; cols
        add         rdx,        8                       ; loop bound: cols + 8
        pxor        mm0,        mm0
        pxor        mm1,        mm1

        pxor        xmm0,       xmm0

        ; Lane listings in the comments below run from lane 0 upwards;
        ; "7--8" means s[c+7] - s[c-8].
.nextcol4:

        movd        xmm1,       DWORD PTR [rsi+rcx-8]   ; -8 -7 -6 -5
        movd        xmm2,       DWORD PTR [rsi+rcx+7]   ; +7 +8 +9 +10

        punpcklbw   xmm1,       xmm0                    ; expanding
        punpcklbw   xmm2,       xmm0                    ; expanding

        punpcklwd   xmm1,       xmm0                    ; expanding to dwords
        punpcklwd   xmm2,       xmm0                    ; expanding to dwords

        psubd       xmm2,       xmm1                    ; 7--8   8--7   9--6   10--5
        paddd       xmm1,       xmm1                    ; -8*2   -7*2   -6*2   -5*2

        paddd       xmm1,       xmm2                    ; 7+-8   8+-7   9+-6   10+-5
        pmaddwd     xmm1,       xmm2                    ; (a+b)*(a-b) = a*a - b*b per lane

        ; fold the first column's delta into the carried totals ...
        paddd       xmm6,       xmm2
        paddd       xmm7,       xmm1

        ; ... and broadcast that total to all four lanes
        pshufd      xmm6,       xmm6, 0                 ; duplicate the last ones
        pshufd      xmm7,       xmm7, 0                 ; duplicate the last ones

        psrldq      xmm1,       4                       ; 8--7   9--6   10--5  0000 squared
        psrldq      xmm2,       4                       ; 8--7   9--6   10--5  0000

        ; build per-lane prefix sums: lane k receives deltas 1..k
        pshufd      xmm3,       xmm1, 3                 ; 0000  8--7   8--7   8--7 squared
        pshufd      xmm4,       xmm2, 3                 ; 0000  8--7   8--7   8--7

        paddd       xmm6,       xmm4
        paddd       xmm7,       xmm3

        pshufd      xmm3,       xmm1, 01011111b         ; 0000  0000   9--6   9--6 squared
        pshufd      xmm4,       xmm2, 01011111b         ; 0000  0000   9--6   9--6

        paddd       xmm7,       xmm3
        paddd       xmm6,       xmm4

        pshufd      xmm3,       xmm1, 10111111b         ; 0000  0000   0000   10--5 squared
        pshufd      xmm4,       xmm2, 10111111b         ; 0000  0000   0000   10--5

        paddd       xmm7,       xmm3
        paddd       xmm6,       xmm4

        ; xmm3 = sum*sum
        movdqa      xmm3,       xmm6
        pmaddwd     xmm3,       xmm3

        ; xmm5 = sumsq*16 - sumsq = sumsq*15
        movdqa      xmm5,       xmm7
        pslld       xmm5,       4

        psubd       xmm5,       xmm7
        psubd       xmm5,       xmm3

        ; sign bit set  <=>  sumsq*15 - sum*sum < flimit
        psubd       xmm5,       flimit4
        psrad       xmm5,       31

        ; narrow the four dword masks to four byte masks
        packssdw    xmm5,       xmm0
        packsswb    xmm5,       xmm0

        movd        xmm1,       DWORD PTR [rsi+rcx]
        movq        xmm2,       xmm1                    ; keep the unfiltered pixels

        punpcklbw   xmm1,       xmm0
        punpcklwd   xmm1,       xmm0

        ; (8 + sum + s[c]) >> 4
        paddd       xmm1,       xmm6
        paddd       xmm1,       [GLOBAL(four8s)]

        psrad       xmm1,       4
        packssdw    xmm1,       xmm0

        packuswb    xmm1,       xmm0

        ; filtered value where the mask is set, original elsewhere
        pand        xmm1,       xmm5

        pandn       xmm5,       xmm2
        por         xmm5,       xmm1

        ; delayed store: emit the group from two iterations ago
        movd        [rsi+rcx-8], mm0
        movq        mm0,        mm1

        movdq2q     mm1,        xmm5

        ; carry the lane-3 totals down to lane 0 for the next iteration
        psrldq      xmm7,       12

        psrldq      xmm6,       12
        add         rcx,        4

        cmp         rcx,        rdx
        jl          .nextcol4

        ;s+=pitch;
        movsxd      rax,        dword arg(1)
        add         arg(0),     rax

        sub         dword arg(2), 1                     ; rows-=1
        cmp         dword arg(2), 0
        jg          .ip_row_loop

        add         rsp,        16                      ; release the flimit4 slot
        pop         rsp                                 ; counterpart of ALIGN_STACK

        ; begin epilog
        pop         rdi
        pop         rsi
        RESTORE_GOT
        RESTORE_XMM
        UNSHADOW_ARGS
        pop         rbp
        ret
%undef flimit4


SECTION_RODATA
align 16
; rounding term for the >> 4 above, one copy per dword lane
four8s:
        times 4 dd 8