;
;  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;

;------------------------------------------------------------------------------
; SSE2 intra predictors (DC, V and TM) for blocks of 16-bit samples.
;
; Syntax: NASM (Intel operand order) on top of the x86inc.asm macro layer.
;   cglobal name, n_args, n_gprs, n_xmm, arg names...  declares a function and
;   binds the names to argument registers (<name>q / <name>d views);
;   DEFINE_ARGS re-binds the same registers to new names; m0..m7 are XMM regs.
;
; Conventions visible in this file:
;   * dst/above/left point at 16-bit samples; every row address is formed as
;     dst + stride*2, so stride is counted in samples, not bytes.
;   * All 16-byte accesses use mova, i.e. they require 16-byte alignment of
;     the address being accessed.
;   * The TM predictors read one sample at above[-1] (the top-left sample)
;     and take a fifth argument, bd, used to build the clamp ceiling
;     (1 << bd) - 1.
;   NOTE(review): the exported symbol prefix/suffix is decided by x86inc's
;   cglobal configuration, which is not visible here.
;------------------------------------------------------------------------------

%include "third_party/x86inc/x86inc.asm"

SECTION_RODATA
; Rounding terms added before the final right shift of each DC predictor.
; pw_4 / pw_8 are packed words (used with paddw); pw_16 / pw_32 are packed
; dwords (used with paddd) even though they carry the "pw_" prefix.
pw_4:  times 8 dw 4
pw_8:  times 8 dw 8
pw_16: times 4 dd 16
pw_32: times 4 dd 32

SECTION .text

;------------------------------------------------------------------------------
; highbd_dc_predictor_4x4(dst, stride, above, left)
;   Fills a 4x4 block with (sum(above[0..3]) + sum(left[0..3]) + 4) >> 3.
;   goffset is a scratch register handed to GET_GOT, not an argument.
;------------------------------------------------------------------------------
INIT_XMM sse2
cglobal highbd_dc_predictor_4x4, 4, 5, 4, dst, stride, above, left, goffset
  GET_GOT               goffsetq

  movq                  m0, [aboveq]            ; a0 a1 a2 a3
  movq                  m2, [leftq]             ; l0 l1 l2 l3
  paddw                 m0, m2                  ; s0 s1 s2 s3  (s = a + l)
  pshuflw               m1, m0, 0xe             ; s2 s3 .. ..
  paddw                 m0, m1                  ; s0+s2 s1+s3 .. ..
  pshuflw               m1, m0, 0x1             ; lane 0 <- lane 1
  paddw                 m0, m1                  ; lane 0 = sum of all 8 samples
  paddw                 m0, [GLOBAL(pw_4)]      ; rounding
  psraw                 m0, 3
  pshuflw               m0, m0, 0x0             ; broadcast dc to 4 words
  movq    [dstq          ], m0                  ; row 0
  movq    [dstq+strideq*2], m0                  ; row 1
  lea                 dstq, [dstq+strideq*4]    ; advance two rows
  movq    [dstq          ], m0                  ; row 2
  movq    [dstq+strideq*2], m0                  ; row 3

  RESTORE_GOT
  RET

;------------------------------------------------------------------------------
; highbd_dc_predictor_8x8(dst, stride, above, left)
;   Fills an 8x8 block with (sum(above[0..7]) + sum(left[0..7]) + 8) >> 4.
;   The horizontal sum is done with pmaddwd against a vector of word 1s
;   (adds adjacent word pairs into dwords) followed by packssdw to narrow
;   the partial sums back to words; three rounds leave the total in lane 0.
;------------------------------------------------------------------------------
INIT_XMM sse2
cglobal highbd_dc_predictor_8x8, 4, 5, 4, dst, stride, above, left, goffset
  GET_GOT               goffsetq

  pxor                  m1, m1                  ; zero, upper half for packssdw
  mova                  m0, [aboveq]
  mova                  m2, [leftq]
  ; above/left are fully loaded; reuse their registers for stride3 and one.
  DEFINE_ARGS dst, stride, stride3, one
  mov                 oned, 0x00010001          ; two words of value 1
  lea             stride3q, [strideq*3]
  movd                  m3, oned
  pshufd                m3, m3, 0x0             ; m3 = eight words of 1
  paddw                 m0, m2                  ; a_i + l_i per lane
  pmaddwd               m0, m3                  ; 8 words  -> 4 dword pair sums
  packssdw              m0, m1                  ; 4 dwords -> 4 words, rest 0
  pmaddwd               m0, m3                  ; -> 2 dword sums
  packssdw              m0, m1
  pmaddwd               m0, m3                  ; dword 0 = sum of 16 samples
  paddw                 m0, [GLOBAL(pw_8)]      ; rounding
  psrlw                 m0, 4
  pshuflw               m0, m0, 0x0
  punpcklqdq            m0, m0                  ; broadcast dc to 8 words
  mova   [dstq           ], m0                  ; row 0
  mova   [dstq+strideq*2 ], m0                  ; row 1
  mova   [dstq+strideq*4 ], m0                  ; row 2
  mova   [dstq+stride3q*2], m0                  ; row 3
  lea                 dstq, [dstq+strideq*8]    ; advance four rows
  mova   [dstq           ], m0                  ; row 4
  mova   [dstq+strideq*2 ], m0                  ; row 5
  mova   [dstq+strideq*4 ], m0                  ; row 6
  mova   [dstq+stride3q*2], m0                  ; row 7

  RESTORE_GOT
  RET

;------------------------------------------------------------------------------
; highbd_dc_predictor_16x16(dst, stride, above, left)
;   Fills a 16x16 block with (sum(above[0..15]) + sum(left[0..15]) + 16) >> 5.
;   Partial sums are widened to dwords (interleave with zero) before the
;   last two folding steps.
;------------------------------------------------------------------------------
INIT_XMM sse2
cglobal highbd_dc_predictor_16x16, 4, 5, 5, dst, stride, above, left, goffset
  GET_GOT               goffsetq

  pxor                  m1, m1                  ; zero for the unpack steps
  mova                  m0, [aboveq]
  mova                  m3, [aboveq+16]
  mova                  m2, [leftq]
  mova                  m4, [leftq+16]
  ; above/left are fully loaded; reuse their registers.
  DEFINE_ARGS dst, stride, stride3, lines4
  lea             stride3q, [strideq*3]
  mov              lines4d, 4                   ; 4 iterations x 4 rows
  paddw                 m0, m2
  paddw                 m0, m3
  paddw                 m0, m4                  ; 8 lanes, 4 samples each
  movhlps               m2, m0
  paddw                 m0, m2                  ; low 4 lanes, 8 samples each
  punpcklwd             m0, m1                  ; zero-extend to 4 dwords
  movhlps               m2, m0
  paddd                 m0, m2                  ; 2 dword sums
  punpckldq             m0, m1                  ; spread them to dwords 0 and 2
  movhlps               m2, m0
  paddd                 m0, m2                  ; dword 0 = sum of 32 samples
  paddd                 m0, [GLOBAL(pw_16)]     ; rounding
  psrad                 m0, 5
  pshuflw               m0, m0, 0x0
  punpcklqdq            m0, m0                  ; broadcast dc to 8 words
.loop:
  mova   [dstq              ], m0
  mova   [dstq           +16], m0
  mova   [dstq+strideq*2    ], m0
  mova   [dstq+strideq*2 +16], m0
  mova   [dstq+strideq*4    ], m0
  mova   [dstq+strideq*4 +16], m0
  mova   [dstq+stride3q*2   ], m0
  mova   [dstq+stride3q*2+16], m0
  lea                 dstq, [dstq+strideq*8]    ; advance four rows
  dec              lines4d
  jnz .loop

  RESTORE_GOT
  REP_RET

;------------------------------------------------------------------------------
; highbd_dc_predictor_32x32(dst, stride, above, left)
;   Fills a 32x32 block with (sum(above[0..31]) + sum(left[0..31]) + 32) >> 6.
;------------------------------------------------------------------------------
INIT_XMM sse2
cglobal highbd_dc_predictor_32x32, 4, 5, 7, dst, stride, above, left, goffset
  GET_GOT               goffsetq

  mova                  m0, [aboveq]
  mova                  m2, [aboveq+16]
  mova                  m3, [aboveq+32]
  mova                  m4, [aboveq+48]
  paddw                 m0, m2
  paddw                 m3, m4
  mova                  m2, [leftq]
  mova                  m4, [leftq+16]
  mova                  m5, [leftq+32]
  mova                  m6, [leftq+48]
  paddw                 m2, m4
  paddw                 m5, m6
  paddw                 m0, m3                  ; above: 8 lanes, 4 samples each
  paddw                 m2, m5                  ; left:  8 lanes, 4 samples each
  pxor                  m1, m1                  ; zero for the unpack steps
  paddw                 m0, m2                  ; 8 lanes, 8 samples each
  ; above/left are fully loaded; reuse their registers.
  DEFINE_ARGS dst, stride, stride3, lines4
  lea             stride3q, [strideq*3]
  mov              lines4d, 8                   ; 8 iterations x 4 rows
  movhlps               m2, m0
  paddw                 m0, m2                  ; low 4 lanes, 16 samples each
  punpcklwd             m0, m1                  ; zero-extend to 4 dwords
  movhlps               m2, m0
  paddd                 m0, m2
  punpckldq             m0, m1
  movhlps               m2, m0
  paddd                 m0, m2                  ; dword 0 = sum of 64 samples
  paddd                 m0, [GLOBAL(pw_32)]     ; rounding
  psrad                 m0, 6
  pshuflw               m0, m0, 0x0
  punpcklqdq            m0, m0                  ; broadcast dc to 8 words
.loop:
  mova   [dstq              ], m0
  mova   [dstq           +16], m0
  mova   [dstq           +32], m0
  mova   [dstq           +48], m0
  mova   [dstq+strideq*2    ], m0
  mova   [dstq+strideq*2 +16], m0
  mova   [dstq+strideq*2 +32], m0
  mova   [dstq+strideq*2 +48], m0
  mova   [dstq+strideq*4    ], m0
  mova   [dstq+strideq*4 +16], m0
  mova   [dstq+strideq*4 +32], m0
  mova   [dstq+strideq*4 +48], m0
  mova   [dstq+stride3q*2   ], m0
  mova   [dstq+stride3q*2+16], m0
  mova   [dstq+stride3q*2+32], m0
  mova   [dstq+stride3q*2+48], m0
  lea                 dstq, [dstq+strideq*8]    ; advance four rows
  dec              lines4d
  jnz .loop

  RESTORE_GOT
  REP_RET

;------------------------------------------------------------------------------
; highbd_v_predictor_4x4(dst, stride, above)
;   Copies the four samples at above[0..3] into each of 4 rows.
;------------------------------------------------------------------------------
INIT_XMM sse2
cglobal highbd_v_predictor_4x4, 3, 3, 1, dst, stride, above
  movq                  m0, [aboveq]
  movq    [dstq          ], m0                  ; row 0
  movq    [dstq+strideq*2], m0                  ; row 1
  lea                 dstq, [dstq+strideq*4]    ; advance two rows
  movq    [dstq          ], m0                  ; row 2
  movq    [dstq+strideq*2], m0                  ; row 3
  RET

;------------------------------------------------------------------------------
; highbd_v_predictor_8x8(dst, stride, above)
;   Copies the eight samples at above[0..7] into each of 8 rows.
;------------------------------------------------------------------------------
INIT_XMM sse2
cglobal highbd_v_predictor_8x8, 3, 3, 1, dst, stride, above
  mova                  m0, [aboveq]
  ; above is loaded; reuse its register for stride*3.
  DEFINE_ARGS dst, stride, stride3
  lea             stride3q, [strideq*3]
  mova   [dstq           ], m0                  ; row 0
  mova   [dstq+strideq*2 ], m0                  ; row 1
  mova   [dstq+strideq*4 ], m0                  ; row 2
  mova   [dstq+stride3q*2], m0                  ; row 3
  lea                 dstq, [dstq+strideq*8]    ; advance four rows
  mova   [dstq           ], m0                  ; row 4
  mova   [dstq+strideq*2 ], m0                  ; row 5
  mova   [dstq+strideq*4 ], m0                  ; row 6
  mova   [dstq+stride3q*2], m0                  ; row 7
  RET

;------------------------------------------------------------------------------
; highbd_v_predictor_16x16(dst, stride, above)
;   Copies the sixteen samples at above[0..15] into each of 16 rows.
;------------------------------------------------------------------------------
INIT_XMM sse2
cglobal highbd_v_predictor_16x16, 3, 4, 2, dst, stride, above
  mova                  m0, [aboveq]
  mova                  m1, [aboveq+16]
  ; above is loaded; reuse its register for stride*3.
  DEFINE_ARGS dst, stride, stride3, nlines4
  lea             stride3q, [strideq*3]
  mov             nlines4d, 4                   ; 4 iterations x 4 rows
.loop:
  mova   [dstq              ], m0
  mova   [dstq           +16], m1
  mova   [dstq+strideq*2    ], m0
  mova   [dstq+strideq*2 +16], m1
  mova   [dstq+strideq*4    ], m0
  mova   [dstq+strideq*4 +16], m1
  mova   [dstq+stride3q*2   ], m0
  mova   [dstq+stride3q*2+16], m1
  lea                 dstq, [dstq+strideq*8]    ; advance four rows
  dec             nlines4d
  jnz .loop
  REP_RET

;------------------------------------------------------------------------------
; highbd_v_predictor_32x32(dst, stride, above)
;   Copies the thirty-two samples at above[0..31] into each of 32 rows.
;------------------------------------------------------------------------------
INIT_XMM sse2
cglobal highbd_v_predictor_32x32, 3, 4, 4, dst, stride, above
  mova                  m0, [aboveq]
  mova                  m1, [aboveq+16]
  mova                  m2, [aboveq+32]
  mova                  m3, [aboveq+48]
  ; above is loaded; reuse its register for stride*3.
  DEFINE_ARGS dst, stride, stride3, nlines4
  lea             stride3q, [strideq*3]
  mov             nlines4d, 8                   ; 8 iterations x 4 rows
.loop:
  mova   [dstq              ], m0
  mova   [dstq           +16], m1
  mova   [dstq           +32], m2
  mova   [dstq           +48], m3
  mova   [dstq+strideq*2    ], m0
  mova   [dstq+strideq*2 +16], m1
  mova   [dstq+strideq*2 +32], m2
  mova   [dstq+strideq*2 +48], m3
  mova   [dstq+strideq*4    ], m0
  mova   [dstq+strideq*4 +16], m1
  mova   [dstq+strideq*4 +32], m2
  mova   [dstq+strideq*4 +48], m3
  mova   [dstq+stride3q*2   ], m0
  mova   [dstq+stride3q*2+16], m1
  mova   [dstq+stride3q*2+32], m2
  mova   [dstq+stride3q*2+48], m3
  lea                 dstq, [dstq+strideq*8]    ; advance four rows
  dec             nlines4d
  jnz .loop
  REP_RET

;------------------------------------------------------------------------------
; highbd_tm_predictor_4x4(dst, stride, above, left, bd)
;   dst[r][c] = clamp(above[c] - above[-1] + left[r], 0, (1 << bd) - 1)
;   for a 4x4 block.  Two rows are produced per 128-bit register.
;------------------------------------------------------------------------------
INIT_XMM sse2
cglobal highbd_tm_predictor_4x4, 5, 5, 6, dst, stride, above, left, bd
  movd                  m1, [aboveq-2]          ; top-left sample in word 0
  movq                  m0, [aboveq]
  pshuflw               m1, m1, 0x0
  movlhps               m0, m0                  ; t1 t2 t3 t4 t1 t2 t3 t4
  movlhps               m1, m1                  ; tl tl tl tl tl tl tl tl
  ; Get the values to compute the maximum value at this bit depth
  pcmpeqw               m3, m3                  ; all ones
  movd                  m4, bdd
  psubw                 m0, m1                  ; t1-tl t2-tl t3-tl t4-tl (x2)
  psllw                 m3, m4                  ; 0xffff << bd
  pcmpeqw               m2, m2
  pxor                  m4, m4                  ; min possible value
  pxor                  m3, m2                  ; max possible value
  ; Only the four left samples (low 8 bytes) are consumed below: pshuflw
  ; reads the low quadword and movlhps overwrites the high one.  Load just
  ; those 8 bytes so the access neither over-reads past left[3] nor demands
  ; 16-byte alignment of left.
  movq                  m1, [leftq]
  pshuflw               m2, m1, 0x0
  pshuflw               m5, m1, 0x55
  movlhps               m2, m5                  ; l1 l1 l1 l1 l2 l2 l2 l2
  paddw                 m2, m0
  ; Clamp to the bit-depth
  pminsw                m2, m3
  pmaxsw                m2, m4
  ; Store the values
  movq    [dstq          ], m2                  ; row 0
  movhpd  [dstq+strideq*2], m2                  ; row 1
  lea                 dstq, [dstq+strideq*4]    ; advance two rows
  pshuflw               m2, m1, 0xaa
  pshuflw               m5, m1, 0xff
  movlhps               m2, m5                  ; l3 l3 l3 l3 l4 l4 l4 l4
  paddw                 m2, m0
  ; Clamp to the bit-depth
  pminsw                m2, m3
  pmaxsw                m2, m4
  ; Store the values
  movq    [dstq          ], m2                  ; row 2
  movhpd  [dstq+strideq*2], m2                  ; row 3
  RET

;------------------------------------------------------------------------------
; highbd_tm_predictor_8x8(dst, stride, above, left, bd)
;   dst[r][c] = clamp(above[c] - above[-1] + left[r], 0, (1 << bd) - 1)
;   for an 8x8 block.  Each loop iteration emits two rows.
;------------------------------------------------------------------------------
INIT_XMM sse2
cglobal highbd_tm_predictor_8x8, 5, 6, 5, dst, stride, above, left, bd, one
  movd                  m1, [aboveq-2]          ; top-left sample in word 0
  mova                  m0, [aboveq]
  pshuflw               m1, m1, 0x0
  ; Get the values to compute the maximum value at this bit depth
  mov                 oned, 1
  pxor                  m3, m3
  pxor                  m4, m4
  pinsrw                m3, oned, 0
  pinsrw                m4, bdd, 0              ; shift count = bd
  pshuflw               m3, m3, 0x0
  ; above, bd and one are consumed; above's register becomes the row counter.
  DEFINE_ARGS dst, stride, line, left
  punpcklqdq            m3, m3                  ; eight words of 1
  mov                lineq, -4                  ; counts up to 0, 2 rows each
  mova                  m2, m3
  punpcklqdq            m1, m1                  ; tl in all 8 lanes
  psllw                 m3, m4                  ; 1 << bd
  add                leftq, 16                  ; point past left[7]; indexed
                                                ; with the negative counter
  psubw                 m3, m2                  ; max possible value
  pxor                  m4, m4                  ; min possible value
  psubw                 m0, m1                  ; t[c] - tl
.loop:
  movd                  m1, [leftq+lineq*4]     ; left sample for even row
  movd                  m2, [leftq+lineq*4+2]   ; left sample for odd row
  pshuflw               m1, m1, 0x0
  pshuflw               m2, m2, 0x0
  punpcklqdq            m1, m1
  punpcklqdq            m2, m2
  paddw                 m1, m0
  paddw                 m2, m0
  ; Clamp to the bit-depth
  pminsw                m1, m3
  pminsw                m2, m3
  pmaxsw                m1, m4
  pmaxsw                m2, m4
  ; Store the values
  mova   [dstq          ], m1
  mova   [dstq+strideq*2], m2
  lea                 dstq, [dstq+strideq*4]    ; advance two rows
  inc                lineq
  jnz .loop
  REP_RET

;------------------------------------------------------------------------------
; highbd_tm_predictor_16x16(dst, stride, above, left, bd)
;   dst[r][c] = clamp(above[c] - above[-1] + left[r], 0, (1 << bd) - 1)
;   for a 16x16 block.  Each loop iteration emits two rows.
;------------------------------------------------------------------------------
INIT_XMM sse2
cglobal highbd_tm_predictor_16x16, 5, 5, 8, dst, stride, above, left, bd
  movd                  m2, [aboveq-2]          ; top-left sample in word 0
  mova                  m0, [aboveq]
  mova                  m1, [aboveq+16]
  pshuflw               m2, m2, 0x0
  ; Get the values to compute the maximum value at this bit depth
  pcmpeqw               m3, m3                  ; all ones
  movd                  m4, bdd
  punpcklqdq            m2, m2                  ; tl in all 8 lanes
  psllw                 m3, m4                  ; 0xffff << bd
  pcmpeqw               m5, m5
  pxor                  m4, m4                  ; min possible value
  pxor                  m3, m5                  ; max possible value
  ; above and bd are consumed; above's register becomes the row counter.
  DEFINE_ARGS dst, stride, line, left
  mov                lineq, -8                  ; counts up to 0, 2 rows each
  psubw                 m0, m2                  ; t[0..7]  - tl
  psubw                 m1, m2                  ; t[8..15] - tl
.loop:
  movd                  m7, [leftq]             ; two consecutive left samples
  pshuflw               m5, m7, 0x0
  pshuflw               m2, m7, 0x55
  punpcklqdq            m5, m5                  ; l1 l1 l1 l1 l1 l1 l1 l1
  punpcklqdq            m2, m2                  ; l2 l2 l2 l2 l2 l2 l2 l2
  paddw                 m6, m5, m0              ; row l1, columns 0-7
  paddw                 m5, m1                  ; row l1, columns 8-15
  pminsw                m6, m3
  pminsw                m5, m3
  pmaxsw                m6, m4                  ; Clamp to the bit-depth
  pmaxsw                m5, m4
  mova   [dstq              ], m6
  mova   [dstq           +16], m5
  paddw                 m6, m2, m0              ; row l2, columns 0-7
  paddw                 m2, m1                  ; row l2, columns 8-15
  pminsw                m6, m3
  pminsw                m2, m3
  pmaxsw                m6, m4
  pmaxsw                m2, m4
  mova   [dstq+strideq*2    ], m6
  mova   [dstq+strideq*2 +16], m2
  lea                 dstq, [dstq+strideq*4]    ; advance two rows
  lea                leftq, [leftq+4]           ; advance two left samples
  inc                lineq                      ; sets ZF for the branch
  jnz .loop
  REP_RET

;------------------------------------------------------------------------------
; highbd_tm_predictor_32x32(dst, stride, above, left, bd)
;   dst[r][c] = clamp(above[c] - above[-1] + left[r], 0, (1 << bd) - 1)
;   for a 32x32 block.  Each loop iteration emits two rows; m0 is reused as
;   the per-store scratch register once the top-left value has been applied.
;------------------------------------------------------------------------------
INIT_XMM sse2
cglobal highbd_tm_predictor_32x32, 5, 5, 8, dst, stride, above, left, bd
  movd                  m0, [aboveq-2]          ; top-left sample in word 0
  mova                  m1, [aboveq]
  mova                  m2, [aboveq+16]
  mova                  m3, [aboveq+32]
  mova                  m4, [aboveq+48]
  pshuflw               m0, m0, 0x0
  ; Get the values to compute the maximum value at this bit depth
  pcmpeqw               m5, m5                  ; all ones
  movd                  m6, bdd
  psllw                 m5, m6                  ; 0xffff << bd
  pcmpeqw               m7, m7
  pxor                  m6, m6                  ; min possible value
  pxor                  m5, m7                  ; max possible value
  punpcklqdq            m0, m0                  ; tl in all 8 lanes
  ; above and bd are consumed; above's register becomes the row counter.
  DEFINE_ARGS dst, stride, line, left
  mov                lineq, -16                 ; counts up to 0, 2 rows each
  psubw                 m1, m0                  ; t[0..7]   - tl
  psubw                 m2, m0                  ; t[8..15]  - tl
  psubw                 m3, m0                  ; t[16..23] - tl
  psubw                 m4, m0                  ; t[24..31] - tl
.loop:
  movd                  m7, [leftq]
  pshuflw               m7, m7, 0x0
  punpcklqdq            m7, m7                  ; l1 l1 l1 l1 l1 l1 l1 l1
  paddw                 m0, m7, m1
  pminsw                m0, m5
  pmaxsw                m0, m6
  mova   [dstq              ], m0
  paddw                 m0, m7, m2
  pminsw                m0, m5
  pmaxsw                m0, m6
  mova   [dstq           +16], m0
  paddw                 m0, m7, m3
  pminsw                m0, m5
  pmaxsw                m0, m6
  mova   [dstq           +32], m0
  paddw                 m0, m7, m4
  pminsw                m0, m5
  pmaxsw                m0, m6
  mova   [dstq           +48], m0
  movd                  m7, [leftq+2]
  pshuflw               m7, m7, 0x0
  punpcklqdq            m7, m7                  ; l2 l2 l2 l2 l2 l2 l2 l2
  paddw                 m0, m7, m1
  pminsw                m0, m5
  pmaxsw                m0, m6
  mova   [dstq+strideq*2    ], m0
  paddw                 m0, m7, m2
  pminsw                m0, m5
  pmaxsw                m0, m6
  mova   [dstq+strideq*2 +16], m0
  paddw                 m0, m7, m3
  pminsw                m0, m5
  pmaxsw                m0, m6
  mova   [dstq+strideq*2 +32], m0
  paddw                 m0, m7, m4
  pminsw                m0, m5
  pmaxsw                m0, m6
  mova   [dstq+strideq*2 +48], m0
  lea                 dstq, [dstq+strideq*4]    ; advance two rows
  lea                leftq, [leftq+4]           ; advance two left samples
  inc                lineq                      ; sets ZF for the branch
  jnz .loop
  REP_RET