;
;  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS. All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;

%include "vpx_ports/x86_abi_support.asm"

;-----------------------------------------------------------------------------
; Two-tap SSSE3 filter kernels for 4-, 8- and 16-pixel-wide blocks.
;
; Every entry point in this file fetches the same six arguments via arg(n):
;   arg(0)  src_ptr          source pixels (bytes)
;   arg(1)  pixels_per_line  source stride, read as a signed 32-bit value
;   arg(2)  output_ptr       destination pixels (bytes)
;   arg(3)  out_pitch        destination stride, read as a signed 32-bit value
;   arg(4)  output_height    row count, read as a signed 32-bit value
;   arg(5)  filter           table of 16-bit coefficients; it is loaded with
;                            movdqa, so it has to be 16-byte aligned
;
; Only coefficients 3 and 4 of the table (k3, k4) are used. For every output
; pixel the kernels compute
;       (a * k3 + b * k4 + 64) >> 7        saturated to 0..255
; where a is a source pixel and b is the pixel one source row further on
; (_v2 kernels) or one byte further on in the same row (_h2 kernels).
; The _avg kernels additionally pavgb the result with the bytes already
; present at the destination before storing.
;
; All row loops test the counter only after a row has been processed, so
; output_height is assumed to be at least 1.
;-----------------------------------------------------------------------------

; Two 16-bit words of 0x0100. Used as the pmulhrsw multiplier, which turns
; the multiply into "add 64, arithmetic shift right by 7".
%define ROUND_SHIFT_PAIR 0x01000100

;-----------------------------------------------------------------------------
; GET_PARAM_4 - argument and constant setup for the 4-wide kernels.
; On exit:
;   rsi  = src_ptr            rdi = output_ptr
;   rax  = pixels_per_line    rdx = out_pitch      rcx = output_height
;   xmm3 = byte pair (k3, k4) replicated over the low four words
;   xmm2 = 0x0100 in every word
; rdx and ecx serve as temporaries first, so the three movsxd loads must
; remain the last instructions of the macro.
;-----------------------------------------------------------------------------
%macro GET_PARAM_4 0
    mov         rdx, arg(5)                     ; filter table
    mov         rsi, arg(0)                     ; source
    mov         rdi, arg(2)                     ; destination
    mov         ecx, ROUND_SHIFT_PAIR

    movdqa      xmm3, [rdx]                     ; eight 16-bit coefficients
    psrldq      xmm3, 6                         ; drop entries 0..2: word0 = k3, word1 = k4
    packsswb    xmm3, xmm3                      ; words -> signed bytes
    pshuflw     xmm3, xmm3, 0b                  ; copy (k3, k4) into low words 0..3

    movd        xmm2, ecx
    pshufd      xmm2, xmm2, 0                   ; 0x0100 in all eight words

    movsxd      rax, DWORD PTR arg(1)           ; source stride
    movsxd      rdx, DWORD PTR arg(3)           ; destination stride
    movsxd      rcx, DWORD PTR arg(4)           ; rows to produce
%endm

;-----------------------------------------------------------------------------
; APPLY_FILTER_4 avg - filter and store one 4-pixel row.
;   %1 = 0: plain store, %1 = 1: pavgb with the current destination bytes.
; In:   xmm0 = "a" pixels, xmm1 = "b" pixels (low bytes)
;       xmm3 / xmm2 as prepared by GET_PARAM_4
; Out:  4 bytes written at [rdi]; rsi += rax; rdi += rdx; rcx -= 1.
;       ZF reflects the decremented rcx for the caller's jnz (lea leaves
;       the flags alone and dec is the final instruction).
; Only the low four results are stored; the upper lanes are don't-care.
;-----------------------------------------------------------------------------
%macro APPLY_FILTER_4 1
    punpcklbw   xmm0, xmm1                      ; interleave to (a, b) byte pairs
    pmaddubsw   xmm0, xmm3                      ; a * k3 + b * k4 per word

    pmulhrsw    xmm0, xmm2                      ; (sum + 64) >> 7
    packuswb    xmm0, xmm0                      ; saturate words to unsigned bytes

%if %1
    movd        xmm1, [rdi]                     ; existing destination pixels
    pavgb       xmm0, xmm1
%endif
    movd        [rdi], xmm0
    lea         rsi, [rsi + rax]                ; next source row
    lea         rdi, [rdi + rdx]                ; next destination row
    dec         rcx
%endm

;-----------------------------------------------------------------------------
; GET_PARAM - argument and constant setup for the 8- and 16-wide kernels.
; Same register assignment as GET_PARAM_4, except that the constants live in
; xmm7 (byte pair (k3, k4) in all eight words) and xmm6 (0x0100 in every
; word). As in GET_PARAM_4, rdx and ecx are temporaries until the final
; three loads.
;-----------------------------------------------------------------------------
%macro GET_PARAM 0
    mov         rdx, arg(5)                     ; filter table
    mov         rsi, arg(0)                     ; source
    mov         rdi, arg(2)                     ; destination
    mov         ecx, ROUND_SHIFT_PAIR

    movdqa      xmm7, [rdx]                     ; eight 16-bit coefficients
    psrldq      xmm7, 6                         ; drop entries 0..2: word0 = k3, word1 = k4
    packsswb    xmm7, xmm7                      ; words -> signed bytes
    pshuflw     xmm7, xmm7, 0b                  ; copy (k3, k4) into low words 0..3
    punpcklwd   xmm7, xmm7                      ; ...and from there into all eight words

    movd        xmm6, ecx
    pshufd      xmm6, xmm6, 0                   ; 0x0100 in all eight words

    movsxd      rax, DWORD PTR arg(1)           ; source stride
    movsxd      rdx, DWORD PTR arg(3)           ; destination stride
    movsxd      rcx, DWORD PTR arg(4)           ; rows to produce
%endm

;-----------------------------------------------------------------------------
; APPLY_FILTER_8 avg - filter and store one 8-pixel row.
;   %1 = 0: plain store, %1 = 1: pavgb with the current destination bytes.
; In:   xmm0 = "a" pixels, xmm1 = "b" pixels (low 8 bytes)
;       xmm7 / xmm6 as prepared by GET_PARAM
; Out:  8 bytes written at [rdi]; rsi += rax; rdi += rdx; rcx -= 1,
;       with ZF left set by the trailing dec for the caller's jnz.
;-----------------------------------------------------------------------------
%macro APPLY_FILTER_8 1
    punpcklbw   xmm0, xmm1                      ; interleave to (a, b) byte pairs
    pmaddubsw   xmm0, xmm7                      ; a * k3 + b * k4 per word

    pmulhrsw    xmm0, xmm6                      ; (sum + 64) >> 7
    packuswb    xmm0, xmm0                      ; saturate words to unsigned bytes

%if %1
    movq        xmm1, [rdi]                     ; existing destination pixels
    pavgb       xmm0, xmm1
%endif
    movq        [rdi], xmm0

    lea         rsi, [rsi + rax]                ; next source row
    lea         rdi, [rdi + rdx]                ; next destination row
    dec         rcx
%endm

;-----------------------------------------------------------------------------
; APPLY_FILTER_16 avg - filter and store one 16-pixel row.
;   %1 = 0: plain store, %1 = 1: pavgb with the current destination bytes.
; In:   xmm0 = xmm2 = "a" pixels (two copies), xmm1 = "b" pixels
;       xmm7 / xmm6 as prepared by GET_PARAM
; Out:  16 bytes written at [rdi] with an unaligned store; rsi += rax;
;       rdi += rdx; rcx -= 1, with ZF left set by the trailing dec.
;-----------------------------------------------------------------------------
%macro APPLY_FILTER_16 1
    punpcklbw   xmm0, xmm1                      ; (a, b) pairs for pixels 0..7
    punpckhbw   xmm2, xmm1                      ; (a, b) pairs for pixels 8..15
    pmaddubsw   xmm0, xmm7                      ; a * k3 + b * k4 per word
    pmaddubsw   xmm2, xmm7

    pmulhrsw    xmm0, xmm6                      ; (sum + 64) >> 7
    pmulhrsw    xmm2, xmm6
    packuswb    xmm0, xmm2                      ; both halves -> 16 unsigned bytes

%if %1
    movdqu      xmm1, [rdi]                     ; existing destination pixels
    pavgb       xmm0, xmm1
%endif
    movdqu      [rdi], xmm0

    lea         rsi, [rsi + rax]                ; next source row
    lea         rdi, [rdi + rdx]                ; next destination row
    dec         rcx
%endm

SECTION .text

;-----------------------------------------------------------------------------
; Vertical kernels: "b" is the pixel one source stride below "a".
;-----------------------------------------------------------------------------

; 4 pixels per row, vertical, plain store.
globalsym(vpx_filter_block1d4_v2_ssse3)
sym(vpx_filter_block1d4_v2_ssse3):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    push        rsi
    push        rdi
    ; end prolog

    GET_PARAM_4
.next_row:
    movd        xmm0, [rsi]                     ; current row
    movd        xmm1, [rsi + rax]               ; row below

    APPLY_FILTER_4 0
    jnz         .next_row                       ; until rcx reaches zero

    ; begin epilog
    pop         rdi
    pop         rsi
    UNSHADOW_ARGS
    pop         rbp
    ret

; 8 pixels per row, vertical, plain store.
; xmm6/xmm7 are used, hence the SAVE_XMM / RESTORE_XMM bracket.
globalsym(vpx_filter_block1d8_v2_ssse3)
sym(vpx_filter_block1d8_v2_ssse3):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    push        rsi
    push        rdi
    ; end prolog

    GET_PARAM
.next_row:
    movq        xmm0, [rsi]                     ; current row
    movq        xmm1, [rsi + rax]               ; row below

    APPLY_FILTER_8 0
    jnz         .next_row                       ; until rcx reaches zero

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret

; 16 pixels per row, vertical, plain store.
globalsym(vpx_filter_block1d16_v2_ssse3)
sym(vpx_filter_block1d16_v2_ssse3):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    push        rsi
    push        rdi
    ; end prolog

    GET_PARAM
.next_row:
    movdqu      xmm0, [rsi]                     ; current row
    movdqu      xmm1, [rsi + rax]               ; row below
    movdqa      xmm2, xmm0                      ; second copy for the high half

    APPLY_FILTER_16 0
    jnz         .next_row                       ; until rcx reaches zero

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret

; 4 pixels per row, vertical, averaged with the destination.
globalsym(vpx_filter_block1d4_v2_avg_ssse3)
sym(vpx_filter_block1d4_v2_avg_ssse3):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    push        rsi
    push        rdi
    ; end prolog

    GET_PARAM_4
.next_row:
    movd        xmm0, [rsi]                     ; current row
    movd        xmm1, [rsi + rax]               ; row below

    APPLY_FILTER_4 1
    jnz         .next_row                       ; until rcx reaches zero

    ; begin epilog
    pop         rdi
    pop         rsi
    UNSHADOW_ARGS
    pop         rbp
    ret

; 8 pixels per row, vertical, averaged with the destination.
globalsym(vpx_filter_block1d8_v2_avg_ssse3)
sym(vpx_filter_block1d8_v2_avg_ssse3):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    push        rsi
    push        rdi
    ; end prolog

    GET_PARAM
.next_row:
    movq        xmm0, [rsi]                     ; current row
    movq        xmm1, [rsi + rax]               ; row below

    APPLY_FILTER_8 1
    jnz         .next_row                       ; until rcx reaches zero

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret

; 16 pixels per row, vertical, averaged with the destination.
globalsym(vpx_filter_block1d16_v2_avg_ssse3)
sym(vpx_filter_block1d16_v2_avg_ssse3):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    push        rsi
    push        rdi
    ; end prolog

    GET_PARAM
.next_row:
    movdqu      xmm0, [rsi]                     ; current row
    movdqu      xmm1, [rsi + rax]               ; row below
    movdqa      xmm2, xmm0                      ; second copy for the high half

    APPLY_FILTER_16 1
    jnz         .next_row                       ; until rcx reaches zero

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret

;-----------------------------------------------------------------------------
; Horizontal kernels: "b" is the pixel one byte to the right of "a".
;-----------------------------------------------------------------------------

; 4 pixels per row, horizontal, plain store.
; NOTE(review): the movdqu reads 16 source bytes per row although only the
; first 5 influence the 4 bytes that are stored - confirm callers guarantee
; that much readable memory.
globalsym(vpx_filter_block1d4_h2_ssse3)
sym(vpx_filter_block1d4_h2_ssse3):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    push        rsi
    push        rdi
    ; end prolog

    GET_PARAM_4
.next_row:
    movdqu      xmm0, [rsi]                     ; pixels x
    movdqa      xmm1, xmm0
    psrldq      xmm1, 1                         ; pixels x + 1

    APPLY_FILTER_4 0
    jnz         .next_row                       ; until rcx reaches zero

    ; begin epilog
    pop         rdi
    pop         rsi
    UNSHADOW_ARGS
    pop         rbp
    ret

; 8 pixels per row, horizontal, plain store.
; NOTE(review): the movdqu reads 16 source bytes per row although only the
; first 9 influence the 8 bytes that are stored - confirm callers guarantee
; that much readable memory.
globalsym(vpx_filter_block1d8_h2_ssse3)
sym(vpx_filter_block1d8_h2_ssse3):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    push        rsi
    push        rdi
    ; end prolog

    GET_PARAM
.next_row:
    movdqu      xmm0, [rsi]                     ; pixels x
    movdqa      xmm1, xmm0
    psrldq      xmm1, 1                         ; pixels x + 1

    APPLY_FILTER_8 0
    jnz         .next_row                       ; until rcx reaches zero

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret

; 16 pixels per row, horizontal, plain store.
; The neighbour pixels come from a second unaligned load at [rsi + 1]
; because a byte shift of the first load would lose pixel 16.
globalsym(vpx_filter_block1d16_h2_ssse3)
sym(vpx_filter_block1d16_h2_ssse3):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    push        rsi
    push        rdi
    ; end prolog

    GET_PARAM
.next_row:
    movdqu      xmm0, [rsi]                     ; pixels x
    movdqu      xmm1, [rsi + 1]                 ; pixels x + 1
    movdqa      xmm2, xmm0                      ; second copy for the high half

    APPLY_FILTER_16 0
    jnz         .next_row                       ; until rcx reaches zero

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret

; 4 pixels per row, horizontal, averaged with the destination.
; NOTE(review): reads 16 source bytes per row; only the first 5 matter.
globalsym(vpx_filter_block1d4_h2_avg_ssse3)
sym(vpx_filter_block1d4_h2_avg_ssse3):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    push        rsi
    push        rdi
    ; end prolog

    GET_PARAM_4
.next_row:
    movdqu      xmm0, [rsi]                     ; pixels x
    movdqa      xmm1, xmm0
    psrldq      xmm1, 1                         ; pixels x + 1

    APPLY_FILTER_4 1
    jnz         .next_row                       ; until rcx reaches zero

    ; begin epilog
    pop         rdi
    pop         rsi
    UNSHADOW_ARGS
    pop         rbp
    ret

; 8 pixels per row, horizontal, averaged with the destination.
; NOTE(review): reads 16 source bytes per row; only the first 9 matter.
globalsym(vpx_filter_block1d8_h2_avg_ssse3)
sym(vpx_filter_block1d8_h2_avg_ssse3):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    push        rsi
    push        rdi
    ; end prolog

    GET_PARAM
.next_row:
    movdqu      xmm0, [rsi]                     ; pixels x
    movdqa      xmm1, xmm0
    psrldq      xmm1, 1                         ; pixels x + 1

    APPLY_FILTER_8 1
    jnz         .next_row                       ; until rcx reaches zero

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret

; 16 pixels per row, horizontal, averaged with the destination.
globalsym(vpx_filter_block1d16_h2_avg_ssse3)
sym(vpx_filter_block1d16_h2_avg_ssse3):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    push        rsi
    push        rdi
    ; end prolog

    GET_PARAM
.next_row:
    movdqu      xmm0, [rsi]                     ; pixels x
    movdqu      xmm1, [rsi + 1]                 ; pixels x + 1
    movdqa      xmm2, xmm0                      ; second copy for the high half

    APPLY_FILTER_16 1
    jnz         .next_row                       ; until rcx reaches zero

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret