; Copyright © 2021, VideoLAN and dav1d authors
; Copyright © 2021, Two Orioles, LLC
; All rights reserved.
;
; Redistribution and use in source and binary forms, with or without
; modification, are permitted provided that the following conditions are met:
;
; 1. Redistributions of source code must retain the above copyright notice, this
;    list of conditions and the following disclaimer.
;
; 2. Redistributions in binary form must reproduce the above copyright notice,
;    this list of conditions and the following disclaimer in the documentation
;    and/or other materials provided with the distribution.
;
; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

%include "config.asm"
%include "ext/x86/x86inc.asm"
%include "x86/filmgrain_common.asm"

SECTION_RODATA 16
pd_16: times 4 dd 16
pw_1: times 8 dw 1
pw_16384: times 8 dw 16384
pw_8192: times 8 dw 8192
pw_23_22: dw 23, 22
          times 3 dw 0, 32
; byte lookup table: used as the pshufb source when generate_grain_y_16bpc
; derives the next seed bits
pb_mask: db 0, 0x80, 0x80, 0, 0x80, 0, 0, 0x80, 0x80, 0, 0, 0x80, 0, 0x80, 0x80, 0
pw_27_17_17_27: dw 27, 17, 17, 27
                times 2 dw 0, 32
; per-lane AND mask / multipliers for the four-seeds-at-a-time update
; (see rnd_next_upperbit_mask, mul_bits and hmul_bits uses in .loop)
rnd_next_upperbit_mask: dw 0x100B, 0x2016, 0x402C, 0x8058
pw_seed_xor: times 2 dw 0xb524
             times 2 dw 0x49d8
pb_1: times 4 db 1
hmul_bits: dw 32768, 16384, 8192, 4096
; pmulhrsw multipliers, indexed by an adjusted grain_scale_shift
round: dw 2048, 1024, 512
mul_bits: dw 256, 128, 64, 32, 16
; rounding terms, indexed by ar_coeff_shift (accessed as round_vals+shift*2-12)
round_vals: dw 32, 64, 128, 256, 512, 1024
max: dw 256*4-1, 240*4, 235*4, 256*16-1, 240*16, 235*16
min: dw 0, 16*4, 16*16
; these two should be next to each other
pw_4: times 2 dw 4
pw_16: times 2 dw 16

; JMP_TABLE func, suffix0, suffix1, ...
;
; Emits a table with one dword per listed suffix N. Each entry holds the
; offset of the local label `.arN` inside the mangled function `func`,
; measured from the start of the table. Also defines `func_table` as the
; table's label.
;
; A consumer loads the sign-extended entry, adds the table address back and
; jumps to the result (see the auto-regression dispatch in
; generate_grain_y_16bpc).
;
;   %1     function name including cpu suffix (without private_prefix)
;   %2...  label suffixes; each is appended to `.ar`
%macro JMP_TABLE 1-*
    %xdefine %1_table %%table
    %xdefine %%base %1_table
    %xdefine %%prefix mangle(private_prefix %+ _%1)
    %%table:
    %rep %0 - 1
        dd %%prefix %+ .ar%2 - %%base
        %rotate 1               ; bring the next suffix into %2
    %endrep
%endmacro

JMP_TABLE generate_grain_y_16bpc_ssse3, 0, 1, 2, 3
JMP_TABLE generate_grain_uv_420_16bpc_ssse3, 0, 1, 2, 3
JMP_TABLE generate_grain_uv_422_16bpc_ssse3, 0, 1, 2, 3
JMP_TABLE generate_grain_uv_444_16bpc_ssse3, 0, 1, 2, 3

SECTION .text

; PIC_ptr(a): address expression for constant `a`.
; On x86-32 it is formed relative to `base`; otherwise the symbol is used
; directly.
%if ARCH_X86_32
%undef base
%define PIC_ptr(a) base+a
%else
%define PIC_ptr(a) a
%endif

; m(x): mangled name of function `x` for the currently selected cpu SUFFIX.
%define m(x) mangle(private_prefix %+ _ %+ x %+ SUFFIX)

; vpgatherdw dst, src, base, tmp_gpr0, tmp_gpr1 [, cnt=8 [, stride=1 [, tmp_xmm]]]
;
; Emulated word gather: for each of the first `cnt` 16-bit indices in `src`,
; loads the word at [base + index*stride] into the matching word lane of `dst`.
;
;   %1  dst       xmm register receiving the gathered words
;   %2  src       xmm register holding the 16-bit indices
;   %3  base      base address expression of the table
;   %4  tmp_gpr0  scratch GPR (name without size suffix); gets the even index
;   %5  tmp_gpr1  scratch GPR (name without size suffix); gets the odd index
;   %6  cnt       number of elements to gather (default 8); the only uses
;                 visible in this file pass 4
;   %7  stride    scale applied to each index (default 1)
;   %8  tmp_xmm   optional scratch xmm register; when omitted, `src` itself
;                 is used as scratch and is therefore clobbered
;
; Indices are processed two at a time: one dword of indices is moved to
; tmp_gpr1, then split into its low word (tmp_gpr0) and high word (tmp_gpr1).
; Clobbers: tmp_gpr0, tmp_gpr1, flags, and tmp_xmm (or src).
; Note: the very first element is fetched with a 32-bit movd; the extra upper
; word is overwritten by the pinsrw into lane 1 that follows.
%macro vpgatherdw 5-8 8, 1 ; dst, src, base, tmp_gpr[x2], cnt, stride, tmp_xmm_reg
%assign %%idx 0
%define %%tmp %2
%if %0 == 8
%define %%tmp %8
%endif
%rep (%6/2)
%if %%idx == 0
    movd        %5 %+ d, %2
    pshuflw       %%tmp, %2, q3232      ; index pair 1 -> low dword of tmp
%else
    movd        %5 %+ d, %%tmp
%if %6 == 8
    ; further index pairs are only needed when gathering all 8 words
%if %%idx == 2
    punpckhqdq    %%tmp, %%tmp          ; index pair 2 -> low dword of tmp
%elif %%idx == 4
    psrlq         %%tmp, 32             ; index pair 3 -> low dword of tmp
%endif
%endif
%endif
    movzx       %4 %+ d, %5 %+ w        ; even index
    shr         %5 %+ d, 16             ; odd index

%if %%idx == 0
    movd             %1, [%3+%4*%7]
%else
    pinsrw           %1, [%3+%4*%7], %%idx + 0
%endif
    pinsrw           %1, [%3+%5*%7], %%idx + 1
%assign %%idx %%idx+2
%endrep
%endmacro

; SPLATD dst, src
; Broadcasts the low dword of `src` to all four dwords of `dst`.
; `src` is first moved into `dst` with movd unless both operands are the
; same token.
%macro SPLATD 2 ; dst, src
%ifnidn %1, %2
    movd            %1, %2
%endif
    pshufd          %1, %1, q0000
%endmacro

; SPLATW dst, src
; Broadcasts the low word of `src` to all eight words of `dst`.
; `src` is first moved into `dst` with movd unless both operands are the
; same token.
%macro SPLATW 2 ; dst, src
%ifnidn %1, %2
    movd            %1, %2
%endif
    pshuflw         %1, %1, q0000       ; fill the low four words
    punpcklqdq      %1, %1              ; duplicate the low qword into the high one
%endmacro


INIT_XMM ssse3
%if ARCH_X86_64
cglobal generate_grain_y_16bpc, 3, 8, 16, buf, fg_data, bdmax
    lea             r4, [pb_mask]
    ; constants are addressed relative to pb_mask, whose address is in r4
%define base r4-pb_mask
%else
    ; NOTE(review): the argument list of the declaration below continues on
    ; the next line of the file
cglobal generate_grain_y_16bpc, 3,
6, 8, buf, fg_data, bdmax 137*c0909341SAndroid Build Coastguard Worker LEA r4, $$ 138*c0909341SAndroid Build Coastguard Worker%define base r4-$$ 139*c0909341SAndroid Build Coastguard Worker%endif 140*c0909341SAndroid Build Coastguard Worker movq m1, [base+rnd_next_upperbit_mask] 141*c0909341SAndroid Build Coastguard Worker movq m4, [base+mul_bits] 142*c0909341SAndroid Build Coastguard Worker movq m7, [base+hmul_bits] 143*c0909341SAndroid Build Coastguard Worker mov r3d, [fg_dataq+FGData.grain_scale_shift] 144*c0909341SAndroid Build Coastguard Worker lea r5d, [bdmaxq+1] 145*c0909341SAndroid Build Coastguard Worker shr r5d, 11 ; 0 for 10bpc, 2 for 12bpc 146*c0909341SAndroid Build Coastguard Worker sub r3, r5 147*c0909341SAndroid Build Coastguard Worker SPLATW m6, [base+round+r3*2-2] 148*c0909341SAndroid Build Coastguard Worker mova m5, [base+pb_mask] 149*c0909341SAndroid Build Coastguard Worker SPLATW m0, [fg_dataq+FGData.seed] 150*c0909341SAndroid Build Coastguard Worker mov r3, -73*82*2 151*c0909341SAndroid Build Coastguard Worker sub bufq, r3 152*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64 153*c0909341SAndroid Build Coastguard Worker lea r6, [gaussian_sequence] 154*c0909341SAndroid Build Coastguard Worker%endif 155*c0909341SAndroid Build Coastguard Worker.loop: 156*c0909341SAndroid Build Coastguard Worker pand m2, m0, m1 157*c0909341SAndroid Build Coastguard Worker psrlw m3, m2, 10 158*c0909341SAndroid Build Coastguard Worker por m2, m3 ; bits 0xf, 0x1e, 0x3c and 0x78 are set 159*c0909341SAndroid Build Coastguard Worker pmullw m2, m4 ; bits 0x0f00 are set 160*c0909341SAndroid Build Coastguard Worker pshufb m3, m5, m2 ; set 15th bit for next 4 seeds 161*c0909341SAndroid Build Coastguard Worker psllq m2, m3, 30 162*c0909341SAndroid Build Coastguard Worker por m2, m3 163*c0909341SAndroid Build Coastguard Worker psllq m3, m2, 15 164*c0909341SAndroid Build Coastguard Worker por m2, m3 ; aggregate each bit into next seed's high bit 165*c0909341SAndroid Build 
Coastguard Worker pmulhuw m3, m0, m7 166*c0909341SAndroid Build Coastguard Worker por m2, m3 ; 4 next output seeds 167*c0909341SAndroid Build Coastguard Worker pshuflw m0, m2, q3333 168*c0909341SAndroid Build Coastguard Worker psrlw m2, 5 169*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64 170*c0909341SAndroid Build Coastguard Worker vpgatherdw m3, m2, r6, r5, r7, 4, 2 171*c0909341SAndroid Build Coastguard Worker%else 172*c0909341SAndroid Build Coastguard Worker vpgatherdw m3, m2, base+gaussian_sequence, r5, r2, 4, 2 173*c0909341SAndroid Build Coastguard Worker%endif 174*c0909341SAndroid Build Coastguard Worker paddw m3, m3 ; otherwise bpc=12 w/ grain_scale_shift=0 175*c0909341SAndroid Build Coastguard Worker ; shifts by 0, which pmulhrsw does not support 176*c0909341SAndroid Build Coastguard Worker pmulhrsw m3, m6 177*c0909341SAndroid Build Coastguard Worker movq [bufq+r3], m3 178*c0909341SAndroid Build Coastguard Worker add r3, 4*2 179*c0909341SAndroid Build Coastguard Worker jl .loop 180*c0909341SAndroid Build Coastguard Worker 181*c0909341SAndroid Build Coastguard Worker ; auto-regression code 182*c0909341SAndroid Build Coastguard Worker movsxd r3, [fg_dataq+FGData.ar_coeff_lag] 183*c0909341SAndroid Build Coastguard Worker movsxd r3, [base+generate_grain_y_16bpc_ssse3_table+r3*4] 184*c0909341SAndroid Build Coastguard Worker lea r3, [r3+base+generate_grain_y_16bpc_ssse3_table] 185*c0909341SAndroid Build Coastguard Worker jmp r3 186*c0909341SAndroid Build Coastguard Worker 187*c0909341SAndroid Build Coastguard Worker.ar1: 188*c0909341SAndroid Build Coastguard Worker%if WIN64 189*c0909341SAndroid Build Coastguard Worker DEFINE_ARGS shift, fg_data, max, buf, val3, min, cf3, x, val0 190*c0909341SAndroid Build Coastguard Worker lea bufq, [r0-2*(82*73-(82*3+79))] 191*c0909341SAndroid Build Coastguard Worker PUSH r8 192*c0909341SAndroid Build Coastguard Worker%else 193*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64 194*c0909341SAndroid Build Coastguard 
Worker DEFINE_ARGS buf, fg_data, max, shift, val3, min, cf3, x, val0 195*c0909341SAndroid Build Coastguard Worker%else ; x86-32 196*c0909341SAndroid Build Coastguard Worker DEFINE_ARGS buf, fg_data, min, val3, x, cf3, val0 197*c0909341SAndroid Build Coastguard Worker PUSH r6 198*c0909341SAndroid Build Coastguard Worker%define shiftd r1d 199*c0909341SAndroid Build Coastguard Worker%endif 200*c0909341SAndroid Build Coastguard Worker sub bufq, 2*(82*73-(82*3+79)) 201*c0909341SAndroid Build Coastguard Worker%endif 202*c0909341SAndroid Build Coastguard Worker movsx cf3d, byte [fg_dataq+FGData.ar_coeffs_y+3] 203*c0909341SAndroid Build Coastguard Worker movd m4, [fg_dataq+FGData.ar_coeffs_y] 204*c0909341SAndroid Build Coastguard Worker mov shiftd, [fg_dataq+FGData.ar_coeff_shift] 205*c0909341SAndroid Build Coastguard Worker%if WIN64 206*c0909341SAndroid Build Coastguard Worker DEFINE_ARGS shift, h, max, buf, val3, min, cf3, x, val0 207*c0909341SAndroid Build Coastguard Worker%elif ARCH_X86_64 208*c0909341SAndroid Build Coastguard Worker DEFINE_ARGS buf, h, max, shift, val3, min, cf3, x, val0 209*c0909341SAndroid Build Coastguard Worker%else ; x86-32 210*c0909341SAndroid Build Coastguard Worker%undef shiftd 211*c0909341SAndroid Build Coastguard Worker DEFINE_ARGS buf, shift, min, val3, x, cf3, val0 212*c0909341SAndroid Build Coastguard Worker%define hd dword r0m 213*c0909341SAndroid Build Coastguard Worker%define maxd dword minm 214*c0909341SAndroid Build Coastguard Worker%endif 215*c0909341SAndroid Build Coastguard Worker%if cpuflag(sse4) 216*c0909341SAndroid Build Coastguard Worker pmovsxbw m4, m4 217*c0909341SAndroid Build Coastguard Worker%else 218*c0909341SAndroid Build Coastguard Worker pxor m3, m3 219*c0909341SAndroid Build Coastguard Worker pcmpgtb m3, m4 220*c0909341SAndroid Build Coastguard Worker punpcklbw m4, m3 221*c0909341SAndroid Build Coastguard Worker%endif 222*c0909341SAndroid Build Coastguard Worker pinsrw m4, [base+pw_1], 3 223*c0909341SAndroid Build 
Coastguard Worker pshufd m5, m4, q1111 224*c0909341SAndroid Build Coastguard Worker pshufd m4, m4, q0000 225*c0909341SAndroid Build Coastguard Worker SPLATW m3, [base+round_vals+shiftq*2-12] ; rnd 226*c0909341SAndroid Build Coastguard Worker mov hd, 70 227*c0909341SAndroid Build Coastguard Worker sar maxd, 1 228*c0909341SAndroid Build Coastguard Worker mov mind, maxd 229*c0909341SAndroid Build Coastguard Worker xor mind, -1 230*c0909341SAndroid Build Coastguard Worker.y_loop_ar1: 231*c0909341SAndroid Build Coastguard Worker mov xq, -76 232*c0909341SAndroid Build Coastguard Worker movsx val3d, word [bufq+xq*2-2] 233*c0909341SAndroid Build Coastguard Worker.x_loop_ar1: 234*c0909341SAndroid Build Coastguard Worker movu m0, [bufq+xq*2-82*2-2] ; top/left 235*c0909341SAndroid Build Coastguard Worker psrldq m2, m0, 2 ; top 236*c0909341SAndroid Build Coastguard Worker psrldq m1, m0, 4 ; top/right 237*c0909341SAndroid Build Coastguard Worker punpcklwd m0, m2 238*c0909341SAndroid Build Coastguard Worker punpcklwd m1, m3 239*c0909341SAndroid Build Coastguard Worker pmaddwd m0, m4 240*c0909341SAndroid Build Coastguard Worker pmaddwd m1, m5 241*c0909341SAndroid Build Coastguard Worker paddd m0, m1 242*c0909341SAndroid Build Coastguard Worker.x_loop_ar1_inner: 243*c0909341SAndroid Build Coastguard Worker movd val0d, m0 244*c0909341SAndroid Build Coastguard Worker psrldq m0, 4 245*c0909341SAndroid Build Coastguard Worker imul val3d, cf3d 246*c0909341SAndroid Build Coastguard Worker add val3d, val0d 247*c0909341SAndroid Build Coastguard Worker sar val3d, shiftb 248*c0909341SAndroid Build Coastguard Worker movsx val0d, word [bufq+xq*2] 249*c0909341SAndroid Build Coastguard Worker add val3d, val0d 250*c0909341SAndroid Build Coastguard Worker cmp val3d, maxd 251*c0909341SAndroid Build Coastguard Worker cmovg val3d, maxd 252*c0909341SAndroid Build Coastguard Worker cmp val3d, mind 253*c0909341SAndroid Build Coastguard Worker cmovl val3d, mind 254*c0909341SAndroid Build Coastguard 
Worker mov word [bufq+xq*2], val3w 255*c0909341SAndroid Build Coastguard Worker ; keep val3d in-place as left for next x iteration 256*c0909341SAndroid Build Coastguard Worker inc xq 257*c0909341SAndroid Build Coastguard Worker jz .x_loop_ar1_end 258*c0909341SAndroid Build Coastguard Worker test xq, 3 259*c0909341SAndroid Build Coastguard Worker jnz .x_loop_ar1_inner 260*c0909341SAndroid Build Coastguard Worker jmp .x_loop_ar1 261*c0909341SAndroid Build Coastguard Worker 262*c0909341SAndroid Build Coastguard Worker.x_loop_ar1_end: 263*c0909341SAndroid Build Coastguard Worker add bufq, 82*2 264*c0909341SAndroid Build Coastguard Worker dec hd 265*c0909341SAndroid Build Coastguard Worker jg .y_loop_ar1 266*c0909341SAndroid Build Coastguard Worker%if WIN64 267*c0909341SAndroid Build Coastguard Worker POP r8 268*c0909341SAndroid Build Coastguard Worker%elif ARCH_X86_32 269*c0909341SAndroid Build Coastguard Worker POP r6 270*c0909341SAndroid Build Coastguard Worker%undef maxd 271*c0909341SAndroid Build Coastguard Worker%undef hd 272*c0909341SAndroid Build Coastguard Worker%endif 273*c0909341SAndroid Build Coastguard Worker.ar0: 274*c0909341SAndroid Build Coastguard Worker RET 275*c0909341SAndroid Build Coastguard Worker 276*c0909341SAndroid Build Coastguard Worker.ar2: 277*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32 278*c0909341SAndroid Build Coastguard Worker ALLOC_STACK -16*8 279*c0909341SAndroid Build Coastguard Worker%endif 280*c0909341SAndroid Build Coastguard Worker DEFINE_ARGS buf, fg_data, bdmax, shift 281*c0909341SAndroid Build Coastguard Worker mov shiftd, [fg_dataq+FGData.ar_coeff_shift] 282*c0909341SAndroid Build Coastguard Worker movd m0, [base+round_vals-12+shiftq*2] 283*c0909341SAndroid Build Coastguard Worker pshuflw m0, m0, q0000 284*c0909341SAndroid Build Coastguard Worker movu m6, [fg_dataq+FGData.ar_coeffs_y+0] ; cf0-11 285*c0909341SAndroid Build Coastguard Worker pxor m2, m2 286*c0909341SAndroid Build Coastguard Worker punpcklwd m0, m2 
287*c0909341SAndroid Build Coastguard Worker pcmpgtb m2, m6 288*c0909341SAndroid Build Coastguard Worker punpckhbw m3, m6, m2 289*c0909341SAndroid Build Coastguard Worker punpcklbw m6, m2 290*c0909341SAndroid Build Coastguard Worker pshufd m2, m6, q3333 291*c0909341SAndroid Build Coastguard Worker pshufd m1, m6, q2222 292*c0909341SAndroid Build Coastguard Worker pshufd m7, m6, q1111 293*c0909341SAndroid Build Coastguard Worker pshufd m6, m6, q0000 294*c0909341SAndroid Build Coastguard Worker pshufd m4, m3, q1111 295*c0909341SAndroid Build Coastguard Worker pshufd m3, m3, q0000 296*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64 297*c0909341SAndroid Build Coastguard Worker SWAP 0, 12 298*c0909341SAndroid Build Coastguard Worker SWAP 1, 8 299*c0909341SAndroid Build Coastguard Worker SWAP 2, 9 300*c0909341SAndroid Build Coastguard Worker SWAP 3, 10 301*c0909341SAndroid Build Coastguard Worker SWAP 4, 11 302*c0909341SAndroid Build Coastguard Worker%else 303*c0909341SAndroid Build Coastguard Worker%define m12 [rsp+0*16] 304*c0909341SAndroid Build Coastguard Worker%define m8 [rsp+1*16] 305*c0909341SAndroid Build Coastguard Worker%define m9 [rsp+2*16] 306*c0909341SAndroid Build Coastguard Worker%define m10 [rsp+3*16] 307*c0909341SAndroid Build Coastguard Worker%define m11 [rsp+4*16] 308*c0909341SAndroid Build Coastguard Worker mova m12, m0 309*c0909341SAndroid Build Coastguard Worker mova m8, m1 310*c0909341SAndroid Build Coastguard Worker mova m9, m2 311*c0909341SAndroid Build Coastguard Worker mova m10, m3 312*c0909341SAndroid Build Coastguard Worker mova m11, m4 313*c0909341SAndroid Build Coastguard Worker mov bdmaxd, bdmaxm 314*c0909341SAndroid Build Coastguard Worker%endif 315*c0909341SAndroid Build Coastguard Worker sar bdmaxd, 1 316*c0909341SAndroid Build Coastguard Worker SPLATW m0, bdmaxd ; max_grain 317*c0909341SAndroid Build Coastguard Worker pcmpeqw m1, m1 318*c0909341SAndroid Build Coastguard Worker%if !cpuflag(sse4) 319*c0909341SAndroid Build 
Coastguard Worker pcmpeqw m2, m2 320*c0909341SAndroid Build Coastguard Worker psrldq m2, 14 321*c0909341SAndroid Build Coastguard Worker pslldq m2, 2 322*c0909341SAndroid Build Coastguard Worker pxor m2, m1 323*c0909341SAndroid Build Coastguard Worker%endif 324*c0909341SAndroid Build Coastguard Worker pxor m1, m0 ; min_grain 325*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64 326*c0909341SAndroid Build Coastguard Worker SWAP 0, 13 327*c0909341SAndroid Build Coastguard Worker SWAP 1, 14 328*c0909341SAndroid Build Coastguard Worker SWAP 2, 15 329*c0909341SAndroid Build Coastguard Worker%else 330*c0909341SAndroid Build Coastguard Worker%define m13 [rsp+5*16] 331*c0909341SAndroid Build Coastguard Worker%define m14 [rsp+6*16] 332*c0909341SAndroid Build Coastguard Worker mova m13, m0 333*c0909341SAndroid Build Coastguard Worker mova m14, m1 334*c0909341SAndroid Build Coastguard Worker%if !cpuflag(sse4) 335*c0909341SAndroid Build Coastguard Worker%define m15 [rsp+7*16] 336*c0909341SAndroid Build Coastguard Worker mova m15, m2 337*c0909341SAndroid Build Coastguard Worker%endif 338*c0909341SAndroid Build Coastguard Worker%endif 339*c0909341SAndroid Build Coastguard Worker sub bufq, 2*(82*73-(82*3+79)) 340*c0909341SAndroid Build Coastguard Worker DEFINE_ARGS buf, fg_data, h, x 341*c0909341SAndroid Build Coastguard Worker mov hd, 70 342*c0909341SAndroid Build Coastguard Worker.y_loop_ar2: 343*c0909341SAndroid Build Coastguard Worker mov xq, -76 344*c0909341SAndroid Build Coastguard Worker 345*c0909341SAndroid Build Coastguard Worker.x_loop_ar2: 346*c0909341SAndroid Build Coastguard Worker movu m0, [bufq+xq*2-82*4-4] ; y=-2,x=[-2,+5] 347*c0909341SAndroid Build Coastguard Worker movu m1, [bufq+xq*2-82*2-4] ; y=-1,x=[-2,+5] 348*c0909341SAndroid Build Coastguard Worker psrldq m2, m0, 2 349*c0909341SAndroid Build Coastguard Worker psrldq m3, m0, 4 350*c0909341SAndroid Build Coastguard Worker psrldq m4, m0, 6 351*c0909341SAndroid Build Coastguard Worker psrldq m5, m0, 8 
352*c0909341SAndroid Build Coastguard Worker punpcklwd m0, m2 353*c0909341SAndroid Build Coastguard Worker punpcklwd m3, m4 354*c0909341SAndroid Build Coastguard Worker punpcklwd m5, m1 355*c0909341SAndroid Build Coastguard Worker psrldq m2, m1, 2 356*c0909341SAndroid Build Coastguard Worker psrldq m4, m1, 4 357*c0909341SAndroid Build Coastguard Worker punpcklwd m2, m4 358*c0909341SAndroid Build Coastguard Worker psrldq m4, m1, 6 359*c0909341SAndroid Build Coastguard Worker psrldq m1, 8 360*c0909341SAndroid Build Coastguard Worker punpcklwd m4, m1 361*c0909341SAndroid Build Coastguard Worker pmaddwd m0, m6 362*c0909341SAndroid Build Coastguard Worker pmaddwd m3, m7 363*c0909341SAndroid Build Coastguard Worker pmaddwd m5, m8 364*c0909341SAndroid Build Coastguard Worker pmaddwd m2, m9 365*c0909341SAndroid Build Coastguard Worker pmaddwd m4, m10 366*c0909341SAndroid Build Coastguard Worker paddd m0, m3 367*c0909341SAndroid Build Coastguard Worker paddd m5, m2 368*c0909341SAndroid Build Coastguard Worker paddd m0, m4 369*c0909341SAndroid Build Coastguard Worker paddd m0, m5 ; accumulated top 2 rows 370*c0909341SAndroid Build Coastguard Worker paddd m0, m12 371*c0909341SAndroid Build Coastguard Worker 372*c0909341SAndroid Build Coastguard Worker movu m1, [bufq+xq*2-4] ; y=0,x=[-2,+5] 373*c0909341SAndroid Build Coastguard Worker pshufd m4, m1, q3321 374*c0909341SAndroid Build Coastguard Worker pxor m2, m2 375*c0909341SAndroid Build Coastguard Worker pcmpgtw m2, m4 376*c0909341SAndroid Build Coastguard Worker punpcklwd m4, m2 ; in dwords, y=0,x=[0,3] 377*c0909341SAndroid Build Coastguard Worker.x_loop_ar2_inner: 378*c0909341SAndroid Build Coastguard Worker pmaddwd m2, m1, m11 379*c0909341SAndroid Build Coastguard Worker paddd m2, m0 380*c0909341SAndroid Build Coastguard Worker psrldq m0, 4 ; shift top to next pixel 381*c0909341SAndroid Build Coastguard Worker psrad m2, [fg_dataq+FGData.ar_coeff_shift] 382*c0909341SAndroid Build Coastguard Worker paddd m2, m4 
383*c0909341SAndroid Build Coastguard Worker packssdw m2, m2 384*c0909341SAndroid Build Coastguard Worker pminsw m2, m13 385*c0909341SAndroid Build Coastguard Worker pmaxsw m2, m14 386*c0909341SAndroid Build Coastguard Worker psrldq m4, 4 387*c0909341SAndroid Build Coastguard Worker pslldq m2, 2 388*c0909341SAndroid Build Coastguard Worker psrldq m1, 2 389*c0909341SAndroid Build Coastguard Worker%if cpuflag(sse4) 390*c0909341SAndroid Build Coastguard Worker pblendw m1, m2, 00000010b 391*c0909341SAndroid Build Coastguard Worker%else 392*c0909341SAndroid Build Coastguard Worker pand m1, m15 393*c0909341SAndroid Build Coastguard Worker pandn m3, m15, m2 394*c0909341SAndroid Build Coastguard Worker por m1, m3 395*c0909341SAndroid Build Coastguard Worker%endif 396*c0909341SAndroid Build Coastguard Worker ; overwrite previous pixel, this should be ok 397*c0909341SAndroid Build Coastguard Worker movd [bufq+xq*2-2], m1 398*c0909341SAndroid Build Coastguard Worker inc xq 399*c0909341SAndroid Build Coastguard Worker jz .x_loop_ar2_end 400*c0909341SAndroid Build Coastguard Worker test xq, 3 401*c0909341SAndroid Build Coastguard Worker jnz .x_loop_ar2_inner 402*c0909341SAndroid Build Coastguard Worker jmp .x_loop_ar2 403*c0909341SAndroid Build Coastguard Worker 404*c0909341SAndroid Build Coastguard Worker.x_loop_ar2_end: 405*c0909341SAndroid Build Coastguard Worker add bufq, 82*2 406*c0909341SAndroid Build Coastguard Worker dec hd 407*c0909341SAndroid Build Coastguard Worker jg .y_loop_ar2 408*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32 409*c0909341SAndroid Build Coastguard Worker%undef m8 410*c0909341SAndroid Build Coastguard Worker%undef m9 411*c0909341SAndroid Build Coastguard Worker%undef m10 412*c0909341SAndroid Build Coastguard Worker%undef m11 413*c0909341SAndroid Build Coastguard Worker%undef m12 414*c0909341SAndroid Build Coastguard Worker%undef m13 415*c0909341SAndroid Build Coastguard Worker%undef m14 416*c0909341SAndroid Build Coastguard Worker%undef 
m15 417*c0909341SAndroid Build Coastguard Worker%endif 418*c0909341SAndroid Build Coastguard Worker RET 419*c0909341SAndroid Build Coastguard Worker 420*c0909341SAndroid Build Coastguard Worker.ar3: 421*c0909341SAndroid Build Coastguard Worker DEFINE_ARGS buf, fg_data, bdmax, shift 422*c0909341SAndroid Build Coastguard Worker%if WIN64 423*c0909341SAndroid Build Coastguard Worker mov r6, rsp 424*c0909341SAndroid Build Coastguard Worker and rsp, ~15 425*c0909341SAndroid Build Coastguard Worker sub rsp, 64 426*c0909341SAndroid Build Coastguard Worker %define tmp rsp 427*c0909341SAndroid Build Coastguard Worker%elif ARCH_X86_64 428*c0909341SAndroid Build Coastguard Worker %define tmp rsp+stack_offset-72 429*c0909341SAndroid Build Coastguard Worker%else 430*c0909341SAndroid Build Coastguard Worker ALLOC_STACK -16*12 431*c0909341SAndroid Build Coastguard Worker %define tmp rsp 432*c0909341SAndroid Build Coastguard Worker mov bdmaxd, bdmaxm 433*c0909341SAndroid Build Coastguard Worker%endif 434*c0909341SAndroid Build Coastguard Worker sar bdmaxd, 1 435*c0909341SAndroid Build Coastguard Worker SPLATW m7, bdmaxd ; max_grain 436*c0909341SAndroid Build Coastguard Worker pcmpeqw m6, m6 437*c0909341SAndroid Build Coastguard Worker%if !cpuflag(sse4) 438*c0909341SAndroid Build Coastguard Worker pcmpeqw m4, m4 439*c0909341SAndroid Build Coastguard Worker psrldq m4, 14 440*c0909341SAndroid Build Coastguard Worker pslldq m4, 4 441*c0909341SAndroid Build Coastguard Worker pxor m4, m6 442*c0909341SAndroid Build Coastguard Worker%endif 443*c0909341SAndroid Build Coastguard Worker pxor m6, m7 ; min_grain 444*c0909341SAndroid Build Coastguard Worker mov shiftd, [fg_dataq+FGData.ar_coeff_shift] 445*c0909341SAndroid Build Coastguard Worker 446*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64 447*c0909341SAndroid Build Coastguard Worker SWAP 6, 14 448*c0909341SAndroid Build Coastguard Worker SWAP 7, 15 449*c0909341SAndroid Build Coastguard Worker%else 450*c0909341SAndroid Build 
Coastguard Worker%define m14 [rsp+10*16] 451*c0909341SAndroid Build Coastguard Worker%define m15 [esp+11*16] 452*c0909341SAndroid Build Coastguard Worker mova m14, m6 453*c0909341SAndroid Build Coastguard Worker mova m15, m7 454*c0909341SAndroid Build Coastguard Worker%endif 455*c0909341SAndroid Build Coastguard Worker 456*c0909341SAndroid Build Coastguard Worker ; build cf0-1 until 18-19 in m5-12 and r0/1 457*c0909341SAndroid Build Coastguard Worker pxor m1, m1 458*c0909341SAndroid Build Coastguard Worker movu m0, [fg_dataq+FGData.ar_coeffs_y+ 0] ; cf0-15 459*c0909341SAndroid Build Coastguard Worker pcmpgtb m1, m0 460*c0909341SAndroid Build Coastguard Worker punpckhbw m2, m0, m1 461*c0909341SAndroid Build Coastguard Worker punpcklbw m0, m1 462*c0909341SAndroid Build Coastguard Worker 463*c0909341SAndroid Build Coastguard Worker%if cpuflag(sse4) 464*c0909341SAndroid Build Coastguard Worker pshufd m4, m2, q3333 465*c0909341SAndroid Build Coastguard Worker%else 466*c0909341SAndroid Build Coastguard Worker pshufd m5, m2, q3333 467*c0909341SAndroid Build Coastguard Worker mova [tmp+48], m5 468*c0909341SAndroid Build Coastguard Worker%endif 469*c0909341SAndroid Build Coastguard Worker pshufd m3, m2, q2222 470*c0909341SAndroid Build Coastguard Worker pshufd m1, m2, q0000 471*c0909341SAndroid Build Coastguard Worker pshufd m2, m2, q1111 472*c0909341SAndroid Build Coastguard Worker pshufd m7, m0, q2222 473*c0909341SAndroid Build Coastguard Worker pshufd m6, m0, q1111 474*c0909341SAndroid Build Coastguard Worker pshufd m5, m0, q0000 475*c0909341SAndroid Build Coastguard Worker pshufd m0, m0, q3333 476*c0909341SAndroid Build Coastguard Worker 477*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64 478*c0909341SAndroid Build Coastguard Worker SWAP 0, 8 479*c0909341SAndroid Build Coastguard Worker SWAP 1, 9 480*c0909341SAndroid Build Coastguard Worker SWAP 2, 10 481*c0909341SAndroid Build Coastguard Worker SWAP 3, 11 482*c0909341SAndroid Build Coastguard Worker SWAP 4, 12 
483*c0909341SAndroid Build Coastguard Worker%else 484*c0909341SAndroid Build Coastguard Worker%define m8 [rsp+4*16] 485*c0909341SAndroid Build Coastguard Worker%define m9 [esp+5*16] 486*c0909341SAndroid Build Coastguard Worker%define m10 [rsp+6*16] 487*c0909341SAndroid Build Coastguard Worker%define m11 [esp+7*16] 488*c0909341SAndroid Build Coastguard Worker%define m12 [rsp+8*16] 489*c0909341SAndroid Build Coastguard Worker mova m8, m0 490*c0909341SAndroid Build Coastguard Worker mova m9, m1 491*c0909341SAndroid Build Coastguard Worker mova m10, m2 492*c0909341SAndroid Build Coastguard Worker mova m11, m3 493*c0909341SAndroid Build Coastguard Worker mova m12, m4 494*c0909341SAndroid Build Coastguard Worker%endif 495*c0909341SAndroid Build Coastguard Worker 496*c0909341SAndroid Build Coastguard Worker ; build cf20,round in r2 497*c0909341SAndroid Build Coastguard Worker ; build cf21-23,round*2 in m13 498*c0909341SAndroid Build Coastguard Worker pxor m1, m1 499*c0909341SAndroid Build Coastguard Worker movq m0, [fg_dataq+FGData.ar_coeffs_y+16] ; cf16-23 500*c0909341SAndroid Build Coastguard Worker pcmpgtb m1, m0 501*c0909341SAndroid Build Coastguard Worker punpcklbw m0, m1 502*c0909341SAndroid Build Coastguard Worker pshufd m1, m0, q0000 503*c0909341SAndroid Build Coastguard Worker pshufd m2, m0, q1111 504*c0909341SAndroid Build Coastguard Worker mova [tmp+ 0], m1 505*c0909341SAndroid Build Coastguard Worker mova [tmp+16], m2 506*c0909341SAndroid Build Coastguard Worker psrldq m3, m0, 10 507*c0909341SAndroid Build Coastguard Worker pinsrw m3, [base+round_vals+shiftq*2-10], 3 508*c0909341SAndroid Build Coastguard Worker 509*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64 510*c0909341SAndroid Build Coastguard Worker SWAP 3, 13 511*c0909341SAndroid Build Coastguard Worker%else 512*c0909341SAndroid Build Coastguard Worker%define m13 [esp+9*16] 513*c0909341SAndroid Build Coastguard Worker mova m13, m3 514*c0909341SAndroid Build Coastguard Worker%endif 
515*c0909341SAndroid Build Coastguard Worker 516*c0909341SAndroid Build Coastguard Worker pinsrw m0, [base+round_vals+shiftq*2-12], 5 517*c0909341SAndroid Build Coastguard Worker pshufd m3, m0, q2222 518*c0909341SAndroid Build Coastguard Worker mova [tmp+32], m3 519*c0909341SAndroid Build Coastguard Worker 520*c0909341SAndroid Build Coastguard Worker DEFINE_ARGS buf, fg_data, h, x 521*c0909341SAndroid Build Coastguard Worker sub bufq, 2*(82*73-(82*3+79)) 522*c0909341SAndroid Build Coastguard Worker mov hd, 70 523*c0909341SAndroid Build Coastguard Worker.y_loop_ar3: 524*c0909341SAndroid Build Coastguard Worker mov xq, -76 525*c0909341SAndroid Build Coastguard Worker 526*c0909341SAndroid Build Coastguard Worker.x_loop_ar3: 527*c0909341SAndroid Build Coastguard Worker movu m0, [bufq+xq*2-82*6-6+ 0] ; y=-3,x=[-3,+4] 528*c0909341SAndroid Build Coastguard Worker movd m1, [bufq+xq*2-82*6-6+16] ; y=-3,x=[+5,+6] 529*c0909341SAndroid Build Coastguard Worker palignr m2, m1, m0, 2 ; y=-3,x=[-2,+5] 530*c0909341SAndroid Build Coastguard Worker palignr m1, m1, m0, 12 ; y=-3,x=[+3,+6] 531*c0909341SAndroid Build Coastguard Worker punpckhwd m3, m0, m2 ; y=-3,x=[+1/+2,+2/+3,+3/+4,+4/+5] 532*c0909341SAndroid Build Coastguard Worker punpcklwd m0, m2 ; y=-3,x=[-3/-2,-2/-1,-1/+0,+0/+1] 533*c0909341SAndroid Build Coastguard Worker shufps m2, m0, m3, q1032 ; y=-3,x=[-1/+0,+0/+1,+1/+2,+2/+3] 534*c0909341SAndroid Build Coastguard Worker 535*c0909341SAndroid Build Coastguard Worker pmaddwd m0, m5 536*c0909341SAndroid Build Coastguard Worker pmaddwd m2, m6 537*c0909341SAndroid Build Coastguard Worker pmaddwd m3, m7 538*c0909341SAndroid Build Coastguard Worker paddd m0, m2 539*c0909341SAndroid Build Coastguard Worker paddd m0, m3 540*c0909341SAndroid Build Coastguard Worker ; m0 = top line first 6 multiplied by cf, m1 = top line last entry 541*c0909341SAndroid Build Coastguard Worker 542*c0909341SAndroid Build Coastguard Worker movu m2, [bufq+xq*2-82*4-6+ 0] ; y=-2,x=[-3,+4] 
543*c0909341SAndroid Build Coastguard Worker movd m3, [bufq+xq*2-82*4-6+16] ; y=-2,x=[+5,+6] 544*c0909341SAndroid Build Coastguard Worker punpcklwd m1, m2 ; y=-3/-2,x=[+3/-3,+4/-2,+5/-1,+6/+0] 545*c0909341SAndroid Build Coastguard Worker palignr m4, m3, m2, 2 ; y=-3,x=[-2,+5] 546*c0909341SAndroid Build Coastguard Worker palignr m3, m3, m2, 4 ; y=-3,x=[-1,+6] 547*c0909341SAndroid Build Coastguard Worker punpckhwd m2, m4, m3 ; y=-2,x=[+2/+3,+3/+4,+4/+5,+5/+6] 548*c0909341SAndroid Build Coastguard Worker punpcklwd m4, m3 ; y=-2,x=[-2/-1,-1/+0,+0/+1,+1/+2] 549*c0909341SAndroid Build Coastguard Worker shufps m3, m4, m2, q1032 ; y=-2,x=[+0/+1,+1/+2,+2/+3,+3/+4] 550*c0909341SAndroid Build Coastguard Worker 551*c0909341SAndroid Build Coastguard Worker pmaddwd m1, m8 552*c0909341SAndroid Build Coastguard Worker pmaddwd m4, m9 553*c0909341SAndroid Build Coastguard Worker pmaddwd m3, m10 554*c0909341SAndroid Build Coastguard Worker pmaddwd m2, m11 555*c0909341SAndroid Build Coastguard Worker paddd m1, m4 556*c0909341SAndroid Build Coastguard Worker paddd m3, m2 557*c0909341SAndroid Build Coastguard Worker paddd m0, m1 558*c0909341SAndroid Build Coastguard Worker paddd m0, m3 559*c0909341SAndroid Build Coastguard Worker ; m0 = top 2 lines multiplied by cf 560*c0909341SAndroid Build Coastguard Worker 561*c0909341SAndroid Build Coastguard Worker movu m1, [bufq+xq*2-82*2-6+ 0] ; y=-1,x=[-3,+4] 562*c0909341SAndroid Build Coastguard Worker movd m2, [bufq+xq*2-82*2-6+16] ; y=-1,x=[+5,+6] 563*c0909341SAndroid Build Coastguard Worker palignr m3, m2, m1, 2 ; y=-1,x=[-2,+5] 564*c0909341SAndroid Build Coastguard Worker palignr m2, m2, m1, 12 ; y=-1,x=[+3,+6] 565*c0909341SAndroid Build Coastguard Worker punpckhwd m4, m1, m3 ; y=-1,x=[+1/+2,+2/+3,+3/+4,+4/+5] 566*c0909341SAndroid Build Coastguard Worker punpcklwd m1, m3 ; y=-1,x=[-3/-2,-2/-1,-1/+0,+0/+1] 567*c0909341SAndroid Build Coastguard Worker shufps m3, m1, m4, q1032 ; y=-1,x=[-1/+0,+0/+1,+1/+2,+2/+3] 568*c0909341SAndroid Build 
Coastguard Worker punpcklwd m2, [base+pw_1] 569*c0909341SAndroid Build Coastguard Worker 570*c0909341SAndroid Build Coastguard Worker%if cpuflag(sse4) 571*c0909341SAndroid Build Coastguard Worker pmaddwd m1, m12 572*c0909341SAndroid Build Coastguard Worker%else 573*c0909341SAndroid Build Coastguard Worker pmaddwd m1, [tmp+48] 574*c0909341SAndroid Build Coastguard Worker%endif 575*c0909341SAndroid Build Coastguard Worker pmaddwd m3, [tmp+ 0] 576*c0909341SAndroid Build Coastguard Worker pmaddwd m4, [tmp+16] 577*c0909341SAndroid Build Coastguard Worker pmaddwd m2, [tmp+32] 578*c0909341SAndroid Build Coastguard Worker paddd m1, m3 579*c0909341SAndroid Build Coastguard Worker paddd m4, m2 580*c0909341SAndroid Build Coastguard Worker paddd m0, m1 581*c0909341SAndroid Build Coastguard Worker paddd m0, m4 582*c0909341SAndroid Build Coastguard Worker ; m0 = top 3 lines multiplied by cf plus rounding for downshift 583*c0909341SAndroid Build Coastguard Worker 584*c0909341SAndroid Build Coastguard Worker movu m1, [bufq+xq*2-6] ; y=0,x=[-3,+4] 585*c0909341SAndroid Build Coastguard Worker.x_loop_ar3_inner: 586*c0909341SAndroid Build Coastguard Worker pmaddwd m2, m1, m13 587*c0909341SAndroid Build Coastguard Worker pshufd m3, m2, q1111 588*c0909341SAndroid Build Coastguard Worker paddd m2, m3 ; left+cur 589*c0909341SAndroid Build Coastguard Worker paddd m2, m0 ; add top 590*c0909341SAndroid Build Coastguard Worker psrldq m0, 4 591*c0909341SAndroid Build Coastguard Worker psrad m2, [fg_dataq+FGData.ar_coeff_shift] 592*c0909341SAndroid Build Coastguard Worker packssdw m2, m2 593*c0909341SAndroid Build Coastguard Worker pminsw m2, m15 594*c0909341SAndroid Build Coastguard Worker pmaxsw m2, m14 595*c0909341SAndroid Build Coastguard Worker pslldq m2, 4 596*c0909341SAndroid Build Coastguard Worker psrldq m1, 2 597*c0909341SAndroid Build Coastguard Worker%if cpuflag(sse4) 598*c0909341SAndroid Build Coastguard Worker pblendw m1, m2, 00000100b 599*c0909341SAndroid Build Coastguard 
Worker%else 600*c0909341SAndroid Build Coastguard Worker pand m1, m12 601*c0909341SAndroid Build Coastguard Worker pandn m3, m12, m2 602*c0909341SAndroid Build Coastguard Worker por m1, m3 603*c0909341SAndroid Build Coastguard Worker%endif 604*c0909341SAndroid Build Coastguard Worker ; overwrite a couple of pixels, should be ok 605*c0909341SAndroid Build Coastguard Worker movq [bufq+xq*2-4], m1 606*c0909341SAndroid Build Coastguard Worker inc xq 607*c0909341SAndroid Build Coastguard Worker jz .x_loop_ar3_end 608*c0909341SAndroid Build Coastguard Worker test xq, 3 609*c0909341SAndroid Build Coastguard Worker jnz .x_loop_ar3_inner 610*c0909341SAndroid Build Coastguard Worker jmp .x_loop_ar3 611*c0909341SAndroid Build Coastguard Worker 612*c0909341SAndroid Build Coastguard Worker.x_loop_ar3_end: 613*c0909341SAndroid Build Coastguard Worker add bufq, 82*2 614*c0909341SAndroid Build Coastguard Worker dec hd 615*c0909341SAndroid Build Coastguard Worker jg .y_loop_ar3 616*c0909341SAndroid Build Coastguard Worker%if WIN64 617*c0909341SAndroid Build Coastguard Worker mov rsp, r6 618*c0909341SAndroid Build Coastguard Worker%elif ARCH_X86_32 619*c0909341SAndroid Build Coastguard Worker%undef m8 620*c0909341SAndroid Build Coastguard Worker%undef m9 621*c0909341SAndroid Build Coastguard Worker%undef m10 622*c0909341SAndroid Build Coastguard Worker%undef m11 623*c0909341SAndroid Build Coastguard Worker%undef m12 624*c0909341SAndroid Build Coastguard Worker%undef m13 625*c0909341SAndroid Build Coastguard Worker%undef m14 626*c0909341SAndroid Build Coastguard Worker%undef m15 627*c0909341SAndroid Build Coastguard Worker%endif 628*c0909341SAndroid Build Coastguard Worker RET 629*c0909341SAndroid Build Coastguard Worker 630*c0909341SAndroid Build Coastguard Worker%macro generate_grain_uv_fn 3 ; ss_name, ss_x, ss_y 631*c0909341SAndroid Build Coastguard WorkerINIT_XMM ssse3 632*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64 633*c0909341SAndroid Build Coastguard Workercglobal 
generate_grain_uv_%1_16bpc, 4, 11, 16, buf, bufy, fg_data, uv, bdmax, x, gaussian_reg, h, pic_reg 634*c0909341SAndroid Build Coastguard Worker%define base r8-pb_mask 635*c0909341SAndroid Build Coastguard Worker lea r8, [pb_mask] 636*c0909341SAndroid Build Coastguard Worker movifnidn bdmaxd, bdmaxm 637*c0909341SAndroid Build Coastguard Worker lea r6d, [bdmaxq+1] 638*c0909341SAndroid Build Coastguard Worker%else 639*c0909341SAndroid Build Coastguard Workercglobal generate_grain_uv_%1_16bpc, 1, 7, 8, buf, x, pic_reg, fg_data, h 640*c0909341SAndroid Build Coastguard Worker%define base r2-$$ 641*c0909341SAndroid Build Coastguard Worker LEA r2, $$ 642*c0909341SAndroid Build Coastguard Worker mov fg_dataq, r2m 643*c0909341SAndroid Build Coastguard Worker mov r6d, r4m 644*c0909341SAndroid Build Coastguard Worker inc r6d 645*c0909341SAndroid Build Coastguard Worker%endif 646*c0909341SAndroid Build Coastguard Worker movq m1, [base+rnd_next_upperbit_mask] 647*c0909341SAndroid Build Coastguard Worker movq m4, [base+mul_bits] 648*c0909341SAndroid Build Coastguard Worker movq m7, [base+hmul_bits] 649*c0909341SAndroid Build Coastguard Worker mov r5d, [fg_dataq+FGData.grain_scale_shift] 650*c0909341SAndroid Build Coastguard Worker shr r6d, 11 ; 0 for 10bpc, 2 for 12bpc 651*c0909341SAndroid Build Coastguard Worker sub r5, r6 652*c0909341SAndroid Build Coastguard Worker SPLATW m6, [base+round+r5*2-2] 653*c0909341SAndroid Build Coastguard Worker mova m5, [base+pb_mask] 654*c0909341SAndroid Build Coastguard Worker SPLATW m0, [fg_dataq+FGData.seed] 655*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64 656*c0909341SAndroid Build Coastguard Worker SPLATW m2, [base+pw_seed_xor+uvq*4] 657*c0909341SAndroid Build Coastguard Worker%else 658*c0909341SAndroid Build Coastguard Worker mov r5d, r3m 659*c0909341SAndroid Build Coastguard Worker SPLATW m2, [base+pw_seed_xor+r5*4] 660*c0909341SAndroid Build Coastguard Worker%endif 661*c0909341SAndroid Build Coastguard Worker pxor m0, m2 
662*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64 663*c0909341SAndroid Build Coastguard Worker lea r6, [gaussian_sequence] 664*c0909341SAndroid Build Coastguard Worker%endif 665*c0909341SAndroid Build Coastguard Worker%if %2 666*c0909341SAndroid Build Coastguard Worker mov hd, 73-35*%3 667*c0909341SAndroid Build Coastguard Worker add bufq, 44*2 668*c0909341SAndroid Build Coastguard Worker.loop_y: 669*c0909341SAndroid Build Coastguard Worker mov xq, -44 670*c0909341SAndroid Build Coastguard Worker%else 671*c0909341SAndroid Build Coastguard Worker mov xq, -82*73 672*c0909341SAndroid Build Coastguard Worker add bufq, 82*73*2 673*c0909341SAndroid Build Coastguard Worker%endif 674*c0909341SAndroid Build Coastguard Worker.loop_x: 675*c0909341SAndroid Build Coastguard Worker pand m2, m0, m1 676*c0909341SAndroid Build Coastguard Worker psrlw m3, m2, 10 677*c0909341SAndroid Build Coastguard Worker por m2, m3 ; bits 0xf, 0x1e, 0x3c and 0x78 are set 678*c0909341SAndroid Build Coastguard Worker pmullw m2, m4 ; bits 0x0f00 are set 679*c0909341SAndroid Build Coastguard Worker pshufb m3, m5, m2 ; set 15th bit for next 4 seeds 680*c0909341SAndroid Build Coastguard Worker psllq m2, m3, 30 681*c0909341SAndroid Build Coastguard Worker por m2, m3 682*c0909341SAndroid Build Coastguard Worker psllq m3, m2, 15 683*c0909341SAndroid Build Coastguard Worker por m2, m3 ; aggregate each bit into next seed's high bit 684*c0909341SAndroid Build Coastguard Worker pmulhuw m3, m0, m7 685*c0909341SAndroid Build Coastguard Worker por m2, m3 ; 4 next output seeds 686*c0909341SAndroid Build Coastguard Worker pshuflw m0, m2, q3333 687*c0909341SAndroid Build Coastguard Worker psrlw m2, 5 688*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64 689*c0909341SAndroid Build Coastguard Worker vpgatherdw m3, m2, r6, r9, r10, 4, 2 690*c0909341SAndroid Build Coastguard Worker%else 691*c0909341SAndroid Build Coastguard Worker vpgatherdw m3, m2, base+gaussian_sequence, r5, r6, 4, 2 
692*c0909341SAndroid Build Coastguard Worker%endif 693*c0909341SAndroid Build Coastguard Worker paddw m3, m3 ; otherwise bpc=12 w/ grain_scale_shift=0 694*c0909341SAndroid Build Coastguard Worker ; shifts by 0, which pmulhrsw does not support 695*c0909341SAndroid Build Coastguard Worker pmulhrsw m3, m6 696*c0909341SAndroid Build Coastguard Worker movq [bufq+xq*2], m3 697*c0909341SAndroid Build Coastguard Worker add xq, 4 698*c0909341SAndroid Build Coastguard Worker jl .loop_x 699*c0909341SAndroid Build Coastguard Worker%if %2 700*c0909341SAndroid Build Coastguard Worker add bufq, 82*2 701*c0909341SAndroid Build Coastguard Worker dec hd 702*c0909341SAndroid Build Coastguard Worker jg .loop_y 703*c0909341SAndroid Build Coastguard Worker%endif 704*c0909341SAndroid Build Coastguard Worker 705*c0909341SAndroid Build Coastguard Worker ; auto-regression code 706*c0909341SAndroid Build Coastguard Worker movsxd r5, [fg_dataq+FGData.ar_coeff_lag] 707*c0909341SAndroid Build Coastguard Worker movsxd r5, [base+generate_grain_uv_%1_16bpc_ssse3_table+r5*4] 708*c0909341SAndroid Build Coastguard Worker lea r5, [r5+base+generate_grain_uv_%1_16bpc_ssse3_table] 709*c0909341SAndroid Build Coastguard Worker jmp r5 710*c0909341SAndroid Build Coastguard Worker 711*c0909341SAndroid Build Coastguard Worker.ar0: 712*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64 713*c0909341SAndroid Build Coastguard Worker DEFINE_ARGS buf, bufy, fg_data, uv, bdmax, shift 714*c0909341SAndroid Build Coastguard Worker%else 715*c0909341SAndroid Build Coastguard Worker DEFINE_ARGS buf, bufy, pic_reg, fg_data, uv, shift 716*c0909341SAndroid Build Coastguard Worker ALLOC_STACK -16*2 717*c0909341SAndroid Build Coastguard Worker mov bufyq, r1m 718*c0909341SAndroid Build Coastguard Worker mov uvd, r3m 719*c0909341SAndroid Build Coastguard Worker%endif 720*c0909341SAndroid Build Coastguard Worker imul uvd, 28 721*c0909341SAndroid Build Coastguard Worker mov shiftd, [fg_dataq+FGData.ar_coeff_shift] 
722*c0909341SAndroid Build Coastguard Worker movd m4, [fg_dataq+FGData.ar_coeffs_uv+uvq] 723*c0909341SAndroid Build Coastguard Worker SPLATW m3, [base+hmul_bits+shiftq*2-10] 724*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64 725*c0909341SAndroid Build Coastguard Worker sar bdmaxd, 1 726*c0909341SAndroid Build Coastguard Worker SPLATW m1, bdmaxd ; max_gain 727*c0909341SAndroid Build Coastguard Worker%else 728*c0909341SAndroid Build Coastguard Worker SPLATW m1, r4m 729*c0909341SAndroid Build Coastguard Worker psraw m1, 1 730*c0909341SAndroid Build Coastguard Worker%endif 731*c0909341SAndroid Build Coastguard Worker pcmpeqw m7, m7 732*c0909341SAndroid Build Coastguard Worker pxor m7, m1 ; min_grain 733*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64 734*c0909341SAndroid Build Coastguard Worker SWAP 1, 14 735*c0909341SAndroid Build Coastguard Worker DEFINE_ARGS buf, bufy, h, x 736*c0909341SAndroid Build Coastguard Worker%else 737*c0909341SAndroid Build Coastguard Worker%define m14 [rsp+0*16] 738*c0909341SAndroid Build Coastguard Worker mova m14, m1 739*c0909341SAndroid Build Coastguard Worker DEFINE_ARGS buf, bufy, pic_reg, h, x 740*c0909341SAndroid Build Coastguard Worker%endif 741*c0909341SAndroid Build Coastguard Worker pxor m5, m5 742*c0909341SAndroid Build Coastguard Worker pcmpgtb m5, m4 743*c0909341SAndroid Build Coastguard Worker punpcklbw m4, m5 744*c0909341SAndroid Build Coastguard Worker%if %2 745*c0909341SAndroid Build Coastguard Worker SPLATW m6, [base+hmul_bits+2+%3*2] 746*c0909341SAndroid Build Coastguard Worker%endif 747*c0909341SAndroid Build Coastguard Worker SPLATW m4, m4 748*c0909341SAndroid Build Coastguard Worker pxor m5, m5 749*c0909341SAndroid Build Coastguard Worker%if %2 750*c0909341SAndroid Build Coastguard Worker%if !cpuflag(sse4) 751*c0909341SAndroid Build Coastguard Worker pcmpeqw m2, m2 752*c0909341SAndroid Build Coastguard Worker pslldq m2, 12 753*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64 
754*c0909341SAndroid Build Coastguard Worker SWAP 2, 12 755*c0909341SAndroid Build Coastguard Worker%else 756*c0909341SAndroid Build Coastguard Worker%define m12 [rsp+1*16] 757*c0909341SAndroid Build Coastguard Worker mova m12, m2 758*c0909341SAndroid Build Coastguard Worker%endif 759*c0909341SAndroid Build Coastguard Worker%endif 760*c0909341SAndroid Build Coastguard Worker%endif 761*c0909341SAndroid Build Coastguard Worker%if %2 762*c0909341SAndroid Build Coastguard Worker sub bufq, 2*(82*(73-35*%3)+82-(82*3+41)) 763*c0909341SAndroid Build Coastguard Worker%else 764*c0909341SAndroid Build Coastguard Worker sub bufq, 2*(82*70-3) 765*c0909341SAndroid Build Coastguard Worker%endif 766*c0909341SAndroid Build Coastguard Worker add bufyq, 2*(3+82*3) 767*c0909341SAndroid Build Coastguard Worker mov hd, 70-35*%3 768*c0909341SAndroid Build Coastguard Worker.y_loop_ar0: 769*c0909341SAndroid Build Coastguard Worker ; first 32 pixels 770*c0909341SAndroid Build Coastguard Worker xor xd, xd 771*c0909341SAndroid Build Coastguard Worker.x_loop_ar0: 772*c0909341SAndroid Build Coastguard Worker movu m0, [bufyq+xq*(2<<%2)] 773*c0909341SAndroid Build Coastguard Worker%if %2 774*c0909341SAndroid Build Coastguard Worker%if %3 775*c0909341SAndroid Build Coastguard Worker movu m2, [bufyq+xq*4+82*2] 776*c0909341SAndroid Build Coastguard Worker paddw m0, m2 777*c0909341SAndroid Build Coastguard Worker%endif 778*c0909341SAndroid Build Coastguard Worker movu m1, [bufyq+xq*4 +16] 779*c0909341SAndroid Build Coastguard Worker%if %3 780*c0909341SAndroid Build Coastguard Worker movu m2, [bufyq+xq*4+82*2+16] 781*c0909341SAndroid Build Coastguard Worker paddw m1, m2 782*c0909341SAndroid Build Coastguard Worker%endif 783*c0909341SAndroid Build Coastguard Worker phaddw m0, m1 784*c0909341SAndroid Build Coastguard Worker pmulhrsw m0, m6 785*c0909341SAndroid Build Coastguard Worker%endif 786*c0909341SAndroid Build Coastguard Worker punpckhwd m1, m0, m5 787*c0909341SAndroid Build Coastguard Worker 
punpcklwd m0, m5 788*c0909341SAndroid Build Coastguard Worker REPX {pmaddwd x, m4}, m0, m1 789*c0909341SAndroid Build Coastguard Worker REPX {psrad x, 5}, m0, m1 790*c0909341SAndroid Build Coastguard Worker packssdw m0, m1 791*c0909341SAndroid Build Coastguard Worker pmulhrsw m0, m3 792*c0909341SAndroid Build Coastguard Worker movu m1, [bufq+xq*2] 793*c0909341SAndroid Build Coastguard Worker paddw m0, m1 794*c0909341SAndroid Build Coastguard Worker pminsw m0, m14 795*c0909341SAndroid Build Coastguard Worker pmaxsw m0, m7 796*c0909341SAndroid Build Coastguard Worker cmp xd, 72-40*%2 797*c0909341SAndroid Build Coastguard Worker je .end 798*c0909341SAndroid Build Coastguard Worker movu [bufq+xq*2], m0 799*c0909341SAndroid Build Coastguard Worker add xd, 8 800*c0909341SAndroid Build Coastguard Worker jmp .x_loop_ar0 801*c0909341SAndroid Build Coastguard Worker 802*c0909341SAndroid Build Coastguard Worker ; last 6/4 pixels 803*c0909341SAndroid Build Coastguard Worker.end: 804*c0909341SAndroid Build Coastguard Worker%if %2 805*c0909341SAndroid Build Coastguard Worker%if cpuflag(sse4) 806*c0909341SAndroid Build Coastguard Worker pblendw m0, m1, 11000000b 807*c0909341SAndroid Build Coastguard Worker%else 808*c0909341SAndroid Build Coastguard Worker pand m1, m12 809*c0909341SAndroid Build Coastguard Worker pandn m2, m12, m0 810*c0909341SAndroid Build Coastguard Worker por m0, m1, m2 811*c0909341SAndroid Build Coastguard Worker%endif 812*c0909341SAndroid Build Coastguard Worker movu [bufq+xq*2], m0 813*c0909341SAndroid Build Coastguard Worker%else 814*c0909341SAndroid Build Coastguard Worker movq [bufq+xq*2], m0 815*c0909341SAndroid Build Coastguard Worker%endif 816*c0909341SAndroid Build Coastguard Worker 817*c0909341SAndroid Build Coastguard Worker add bufq, 82*2 818*c0909341SAndroid Build Coastguard Worker add bufyq, 82*(2<<%3) 819*c0909341SAndroid Build Coastguard Worker dec hd 820*c0909341SAndroid Build Coastguard Worker jg .y_loop_ar0 821*c0909341SAndroid Build 
Coastguard Worker%if ARCH_X86_32 822*c0909341SAndroid Build Coastguard Worker%undef m12 823*c0909341SAndroid Build Coastguard Worker%undef m14 824*c0909341SAndroid Build Coastguard Worker%endif 825*c0909341SAndroid Build Coastguard Worker RET 826*c0909341SAndroid Build Coastguard Worker 827*c0909341SAndroid Build Coastguard Worker.ar1: 828*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64 829*c0909341SAndroid Build Coastguard Worker DEFINE_ARGS buf, bufy, fg_data, uv, max, cf3, min, val3, x 830*c0909341SAndroid Build Coastguard Worker%else 831*c0909341SAndroid Build Coastguard Worker RESET_STACK_STATE 832*c0909341SAndroid Build Coastguard Worker DEFINE_ARGS buf, shift, pic_reg, fg_data, uv, bufy, cf3 833*c0909341SAndroid Build Coastguard Worker mov bufyq, r1m 834*c0909341SAndroid Build Coastguard Worker mov uvd, r3m 835*c0909341SAndroid Build Coastguard Worker%endif 836*c0909341SAndroid Build Coastguard Worker imul uvd, 28 837*c0909341SAndroid Build Coastguard Worker movsx cf3d, byte [fg_dataq+FGData.ar_coeffs_uv+uvq+3] 838*c0909341SAndroid Build Coastguard Worker movq m4, [fg_dataq+FGData.ar_coeffs_uv+uvq] 839*c0909341SAndroid Build Coastguard Worker%if WIN64 840*c0909341SAndroid Build Coastguard Worker DEFINE_ARGS shift, bufy, h, buf, max, cf3, min, val3, x, val0 841*c0909341SAndroid Build Coastguard Worker%if %2 842*c0909341SAndroid Build Coastguard Worker lea bufq, [r0-2*(82*(73-35*%3)+44-(82*3+41))] 843*c0909341SAndroid Build Coastguard Worker%else 844*c0909341SAndroid Build Coastguard Worker lea bufq, [r0-2*(82*69+3)] 845*c0909341SAndroid Build Coastguard Worker%endif 846*c0909341SAndroid Build Coastguard Worker%else 847*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64 848*c0909341SAndroid Build Coastguard Worker DEFINE_ARGS buf, bufy, h, shift, max, cf3, min, val3, x, val0 849*c0909341SAndroid Build Coastguard Worker%else 850*c0909341SAndroid Build Coastguard Worker DEFINE_ARGS buf, shift, pic_reg, fg_data, val0, bufy, cf3 851*c0909341SAndroid 
Build Coastguard Worker%define hd dword r1m 852*c0909341SAndroid Build Coastguard Worker%define mind dword r3m 853*c0909341SAndroid Build Coastguard Worker%define maxd dword r4m 854*c0909341SAndroid Build Coastguard Worker%endif 855*c0909341SAndroid Build Coastguard Worker%if %2 856*c0909341SAndroid Build Coastguard Worker sub bufq, 2*(82*(73-35*%3)+44-(82*3+41)) 857*c0909341SAndroid Build Coastguard Worker%else 858*c0909341SAndroid Build Coastguard Worker sub bufq, 2*(82*69+3) 859*c0909341SAndroid Build Coastguard Worker%endif 860*c0909341SAndroid Build Coastguard Worker%endif 861*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64 862*c0909341SAndroid Build Coastguard Worker mov shiftd, [r2+FGData.ar_coeff_shift] 863*c0909341SAndroid Build Coastguard Worker%else 864*c0909341SAndroid Build Coastguard Worker mov shiftd, [r3+FGData.ar_coeff_shift] 865*c0909341SAndroid Build Coastguard Worker%endif 866*c0909341SAndroid Build Coastguard Worker pxor m5, m5 867*c0909341SAndroid Build Coastguard Worker pcmpgtb m5, m4 868*c0909341SAndroid Build Coastguard Worker punpcklbw m4, m5 ; cf0-4 in words 869*c0909341SAndroid Build Coastguard Worker pshuflw m4, m4, q2100 870*c0909341SAndroid Build Coastguard Worker psrldq m4, 2 ; cf0-3,4 in words 871*c0909341SAndroid Build Coastguard Worker pshufd m5, m4, q1111 872*c0909341SAndroid Build Coastguard Worker pshufd m4, m4, q0000 873*c0909341SAndroid Build Coastguard Worker movd m3, [base+round_vals+shiftq*2-12] ; rnd 874*c0909341SAndroid Build Coastguard Worker pxor m6, m6 875*c0909341SAndroid Build Coastguard Worker punpcklwd m3, m6 876*c0909341SAndroid Build Coastguard Worker%if %2 877*c0909341SAndroid Build Coastguard Worker SPLATW m6, [base+hmul_bits+2+%3*2] 878*c0909341SAndroid Build Coastguard Worker%endif 879*c0909341SAndroid Build Coastguard Worker SPLATD m3, m3 880*c0909341SAndroid Build Coastguard Worker add bufyq, 2*(79+82*3) 881*c0909341SAndroid Build Coastguard Worker mov hd, 70-35*%3 882*c0909341SAndroid Build 
Coastguard Worker sar maxd, 1 883*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64 884*c0909341SAndroid Build Coastguard Worker mov mind, maxd 885*c0909341SAndroid Build Coastguard Worker xor mind, -1 886*c0909341SAndroid Build Coastguard Worker%else 887*c0909341SAndroid Build Coastguard Worker DEFINE_ARGS buf, shift, val3, x, val0, bufy, cf3 888*c0909341SAndroid Build Coastguard Worker mov r2, maxd 889*c0909341SAndroid Build Coastguard Worker xor r2, -1 890*c0909341SAndroid Build Coastguard Worker mov mind, r2 891*c0909341SAndroid Build Coastguard Worker%endif 892*c0909341SAndroid Build Coastguard Worker.y_loop_ar1: 893*c0909341SAndroid Build Coastguard Worker mov xq, -(76>>%2) 894*c0909341SAndroid Build Coastguard Worker movsx val3d, word [bufq+xq*2-2] 895*c0909341SAndroid Build Coastguard Worker.x_loop_ar1: 896*c0909341SAndroid Build Coastguard Worker movu m0, [bufq+xq*2-82*2-2] ; top/left 897*c0909341SAndroid Build Coastguard Worker%if %2 898*c0909341SAndroid Build Coastguard Worker movu m7, [bufyq+xq*4] 899*c0909341SAndroid Build Coastguard Worker%if %3 900*c0909341SAndroid Build Coastguard Worker movu m1, [bufyq+xq*4+82*2] 901*c0909341SAndroid Build Coastguard Worker phaddw m7, m1 902*c0909341SAndroid Build Coastguard Worker%else 903*c0909341SAndroid Build Coastguard Worker phaddw m7, m7 904*c0909341SAndroid Build Coastguard Worker%endif 905*c0909341SAndroid Build Coastguard Worker%else 906*c0909341SAndroid Build Coastguard Worker movq m7, [bufyq+xq*2] 907*c0909341SAndroid Build Coastguard Worker%endif 908*c0909341SAndroid Build Coastguard Worker psrldq m2, m0, 2 ; top 909*c0909341SAndroid Build Coastguard Worker psrldq m1, m0, 4 ; top/right 910*c0909341SAndroid Build Coastguard Worker punpcklwd m0, m2 911*c0909341SAndroid Build Coastguard Worker%if %2 912*c0909341SAndroid Build Coastguard Worker%if %3 913*c0909341SAndroid Build Coastguard Worker pshufd m2, m7, q3232 914*c0909341SAndroid Build Coastguard Worker paddw m7, m2 915*c0909341SAndroid Build 
Coastguard Worker%endif 916*c0909341SAndroid Build Coastguard Worker pmulhrsw m7, m6 917*c0909341SAndroid Build Coastguard Worker%endif 918*c0909341SAndroid Build Coastguard Worker punpcklwd m1, m7 919*c0909341SAndroid Build Coastguard Worker pmaddwd m0, m4 920*c0909341SAndroid Build Coastguard Worker pmaddwd m1, m5 921*c0909341SAndroid Build Coastguard Worker paddd m0, m1 922*c0909341SAndroid Build Coastguard Worker paddd m0, m3 923*c0909341SAndroid Build Coastguard Worker.x_loop_ar1_inner: 924*c0909341SAndroid Build Coastguard Worker movd val0d, m0 925*c0909341SAndroid Build Coastguard Worker psrldq m0, 4 926*c0909341SAndroid Build Coastguard Worker imul val3d, cf3d 927*c0909341SAndroid Build Coastguard Worker add val3d, val0d 928*c0909341SAndroid Build Coastguard Worker sar val3d, shiftb 929*c0909341SAndroid Build Coastguard Worker movsx val0d, word [bufq+xq*2] 930*c0909341SAndroid Build Coastguard Worker add val3d, val0d 931*c0909341SAndroid Build Coastguard Worker cmp val3d, maxd 932*c0909341SAndroid Build Coastguard Worker cmovg val3d, maxd 933*c0909341SAndroid Build Coastguard Worker cmp val3d, mind 934*c0909341SAndroid Build Coastguard Worker cmovl val3d, mind 935*c0909341SAndroid Build Coastguard Worker mov word [bufq+xq*2], val3w 936*c0909341SAndroid Build Coastguard Worker ; keep val3d in-place as left for next x iteration 937*c0909341SAndroid Build Coastguard Worker inc xq 938*c0909341SAndroid Build Coastguard Worker jz .x_loop_ar1_end 939*c0909341SAndroid Build Coastguard Worker test xq, 3 940*c0909341SAndroid Build Coastguard Worker jnz .x_loop_ar1_inner 941*c0909341SAndroid Build Coastguard Worker jmp .x_loop_ar1 942*c0909341SAndroid Build Coastguard Worker 943*c0909341SAndroid Build Coastguard Worker.x_loop_ar1_end: 944*c0909341SAndroid Build Coastguard Worker add bufq, 82*2 945*c0909341SAndroid Build Coastguard Worker add bufyq, 82*2<<%3 946*c0909341SAndroid Build Coastguard Worker dec hd 947*c0909341SAndroid Build Coastguard Worker jg .y_loop_ar1 
948*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32 949*c0909341SAndroid Build Coastguard Worker%undef maxd 950*c0909341SAndroid Build Coastguard Worker%undef mind 951*c0909341SAndroid Build Coastguard Worker%undef hd 952*c0909341SAndroid Build Coastguard Worker%endif 953*c0909341SAndroid Build Coastguard Worker RET 954*c0909341SAndroid Build Coastguard Worker 955*c0909341SAndroid Build Coastguard Worker.ar2: 956*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64 957*c0909341SAndroid Build Coastguard Worker DEFINE_ARGS buf, bufy, fg_data, uv, bdmax, shift 958*c0909341SAndroid Build Coastguard Worker%else 959*c0909341SAndroid Build Coastguard Worker DEFINE_ARGS buf, bufy, pic_reg, fg_data, uv, shift 960*c0909341SAndroid Build Coastguard Worker ALLOC_STACK -16*8 961*c0909341SAndroid Build Coastguard Worker mov bufyq, r1m 962*c0909341SAndroid Build Coastguard Worker mov uvd, r3m 963*c0909341SAndroid Build Coastguard Worker%endif 964*c0909341SAndroid Build Coastguard Worker mov shiftd, [fg_dataq+FGData.ar_coeff_shift] 965*c0909341SAndroid Build Coastguard Worker imul uvd, 28 966*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64 967*c0909341SAndroid Build Coastguard Worker sar bdmaxd, 1 968*c0909341SAndroid Build Coastguard Worker SPLATW m5, bdmaxd ; max_grain 969*c0909341SAndroid Build Coastguard Worker%else 970*c0909341SAndroid Build Coastguard Worker SPLATW m5, r4m 971*c0909341SAndroid Build Coastguard Worker psraw m5, 1 972*c0909341SAndroid Build Coastguard Worker%endif 973*c0909341SAndroid Build Coastguard Worker pcmpeqw m6, m6 974*c0909341SAndroid Build Coastguard Worker%if !cpuflag(sse4) 975*c0909341SAndroid Build Coastguard Worker pcmpeqw m7, m7 976*c0909341SAndroid Build Coastguard Worker psrldq m7, 14 977*c0909341SAndroid Build Coastguard Worker pslldq m7, 2 978*c0909341SAndroid Build Coastguard Worker pxor m7, m6 979*c0909341SAndroid Build Coastguard Worker%endif 980*c0909341SAndroid Build Coastguard Worker pxor m6, m5 ; min_grain 
981*c0909341SAndroid Build Coastguard Worker%if %2 && cpuflag(sse4) 982*c0909341SAndroid Build Coastguard Worker SPLATW m7, [base+hmul_bits+2+%3*2] 983*c0909341SAndroid Build Coastguard Worker%endif 984*c0909341SAndroid Build Coastguard Worker 985*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64 986*c0909341SAndroid Build Coastguard Worker SWAP 5, 13 987*c0909341SAndroid Build Coastguard Worker SWAP 6, 14 988*c0909341SAndroid Build Coastguard Worker SWAP 7, 15 989*c0909341SAndroid Build Coastguard Worker%else 990*c0909341SAndroid Build Coastguard Worker%define m13 [rsp+5*16] 991*c0909341SAndroid Build Coastguard Worker%define m14 [rsp+6*16] 992*c0909341SAndroid Build Coastguard Worker%define m15 [rsp+7*16] 993*c0909341SAndroid Build Coastguard Worker mova m13, m5 994*c0909341SAndroid Build Coastguard Worker mova m14, m6 995*c0909341SAndroid Build Coastguard Worker mova m15, m7 996*c0909341SAndroid Build Coastguard Worker%endif 997*c0909341SAndroid Build Coastguard Worker 998*c0909341SAndroid Build Coastguard Worker ; coef values 999*c0909341SAndroid Build Coastguard Worker movu m0, [fg_dataq+FGData.ar_coeffs_uv+uvq+0] 1000*c0909341SAndroid Build Coastguard Worker pxor m1, m1 1001*c0909341SAndroid Build Coastguard Worker pcmpgtb m1, m0 1002*c0909341SAndroid Build Coastguard Worker punpckhbw m2, m0, m1 1003*c0909341SAndroid Build Coastguard Worker punpcklbw m0, m1 1004*c0909341SAndroid Build Coastguard Worker pinsrw m2, [base+round_vals-12+shiftq*2], 5 1005*c0909341SAndroid Build Coastguard Worker 1006*c0909341SAndroid Build Coastguard Worker pshufd m6, m0, q0000 1007*c0909341SAndroid Build Coastguard Worker pshufd m7, m0, q1111 1008*c0909341SAndroid Build Coastguard Worker pshufd m1, m0, q3333 1009*c0909341SAndroid Build Coastguard Worker pshufd m0, m0, q2222 1010*c0909341SAndroid Build Coastguard Worker pshufd m3, m2, q1111 1011*c0909341SAndroid Build Coastguard Worker pshufd m4, m2, q2222 1012*c0909341SAndroid Build Coastguard Worker pshufd m2, m2, q0000 
1013*c0909341SAndroid Build Coastguard Worker 1014*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64 1015*c0909341SAndroid Build Coastguard Worker SWAP 0, 8 1016*c0909341SAndroid Build Coastguard Worker SWAP 1, 9 1017*c0909341SAndroid Build Coastguard Worker SWAP 2, 10 1018*c0909341SAndroid Build Coastguard Worker SWAP 3, 11 1019*c0909341SAndroid Build Coastguard Worker SWAP 4, 12 1020*c0909341SAndroid Build Coastguard Worker%else 1021*c0909341SAndroid Build Coastguard Worker%define m8 [rsp+0*16] 1022*c0909341SAndroid Build Coastguard Worker%define m9 [rsp+1*16] 1023*c0909341SAndroid Build Coastguard Worker%define m10 [rsp+2*16] 1024*c0909341SAndroid Build Coastguard Worker%define m11 [rsp+3*16] 1025*c0909341SAndroid Build Coastguard Worker%define m12 [rsp+4*16] 1026*c0909341SAndroid Build Coastguard Worker mova m8, m0 1027*c0909341SAndroid Build Coastguard Worker mova m9, m1 1028*c0909341SAndroid Build Coastguard Worker mova m10, m2 1029*c0909341SAndroid Build Coastguard Worker mova m11, m3 1030*c0909341SAndroid Build Coastguard Worker mova m12, m4 1031*c0909341SAndroid Build Coastguard Worker%endif 1032*c0909341SAndroid Build Coastguard Worker 1033*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64 1034*c0909341SAndroid Build Coastguard Worker DEFINE_ARGS buf, bufy, fg_data, h, x 1035*c0909341SAndroid Build Coastguard Worker%else 1036*c0909341SAndroid Build Coastguard Worker DEFINE_ARGS buf, bufy, pic_reg, fg_data, h, x 1037*c0909341SAndroid Build Coastguard Worker%endif 1038*c0909341SAndroid Build Coastguard Worker%if %2 1039*c0909341SAndroid Build Coastguard Worker sub bufq, 2*(82*(73-35*%3)+44-(82*3+41)) 1040*c0909341SAndroid Build Coastguard Worker%else 1041*c0909341SAndroid Build Coastguard Worker sub bufq, 2*(82*69+3) 1042*c0909341SAndroid Build Coastguard Worker%endif 1043*c0909341SAndroid Build Coastguard Worker add bufyq, 2*(79+82*3) 1044*c0909341SAndroid Build Coastguard Worker mov hd, 70-35*%3 1045*c0909341SAndroid Build Coastguard 
Worker.y_loop_ar2: 1046*c0909341SAndroid Build Coastguard Worker mov xq, -(76>>%2) 1047*c0909341SAndroid Build Coastguard Worker 1048*c0909341SAndroid Build Coastguard Worker.x_loop_ar2: 1049*c0909341SAndroid Build Coastguard Worker movu m0, [bufq+xq*2-82*4-4] ; y=-2,x=[-2,+5] 1050*c0909341SAndroid Build Coastguard Worker movu m5, [bufq+xq*2-82*2-4] ; y=-1,x=[-2,+5] 1051*c0909341SAndroid Build Coastguard Worker psrldq m4, m0, 2 ; y=-2,x=[-1,+5] 1052*c0909341SAndroid Build Coastguard Worker psrldq m1, m0, 4 ; y=-2,x=[-0,+5] 1053*c0909341SAndroid Build Coastguard Worker psrldq m3, m0, 6 ; y=-2,x=[+1,+5] 1054*c0909341SAndroid Build Coastguard Worker psrldq m2, m0, 8 ; y=-2,x=[+2,+5] 1055*c0909341SAndroid Build Coastguard Worker punpcklwd m0, m4 ; y=-2,x=[-2/-1,-1/+0,+0/+1,+1/+2] 1056*c0909341SAndroid Build Coastguard Worker punpcklwd m1, m3 ; y=-2,x=[+0/+1,+1/+2,+2/+3,+3/+4] 1057*c0909341SAndroid Build Coastguard Worker punpcklwd m2, m5 ; y=-2/-1,x=[+2/-2,+3/-1,+4/+0,+5/+1] 1058*c0909341SAndroid Build Coastguard Worker pmaddwd m0, m6 1059*c0909341SAndroid Build Coastguard Worker pmaddwd m1, m7 1060*c0909341SAndroid Build Coastguard Worker pmaddwd m2, m8 1061*c0909341SAndroid Build Coastguard Worker paddd m0, m1 1062*c0909341SAndroid Build Coastguard Worker paddd m0, m2 1063*c0909341SAndroid Build Coastguard Worker psrldq m3, m5, 2 ; y=-1,x=[-1,+5] 1064*c0909341SAndroid Build Coastguard Worker psrldq m1, m5, 4 ; y=-1,x=[-0,+5] 1065*c0909341SAndroid Build Coastguard Worker psrldq m4, m5, 6 ; y=-1,x=[+1,+5] 1066*c0909341SAndroid Build Coastguard Worker psrldq m2, m5, 8 ; y=-1,x=[+2,+5] 1067*c0909341SAndroid Build Coastguard Worker punpcklwd m3, m1 1068*c0909341SAndroid Build Coastguard Worker punpcklwd m4, m2 1069*c0909341SAndroid Build Coastguard Worker pmaddwd m3, m9 1070*c0909341SAndroid Build Coastguard Worker pmaddwd m4, m10 1071*c0909341SAndroid Build Coastguard Worker paddd m3, m4 1072*c0909341SAndroid Build Coastguard Worker paddd m0, m3 1073*c0909341SAndroid 
Build Coastguard Worker 1074*c0909341SAndroid Build Coastguard Worker ; luma component & rounding 1075*c0909341SAndroid Build Coastguard Worker%if %2 1076*c0909341SAndroid Build Coastguard Worker movu m1, [bufyq+xq*4] 1077*c0909341SAndroid Build Coastguard Worker%if %3 1078*c0909341SAndroid Build Coastguard Worker movu m2, [bufyq+xq*4+82*2] 1079*c0909341SAndroid Build Coastguard Worker phaddw m1, m2 1080*c0909341SAndroid Build Coastguard Worker pshufd m2, m1, q3232 1081*c0909341SAndroid Build Coastguard Worker paddw m1, m2 1082*c0909341SAndroid Build Coastguard Worker%else 1083*c0909341SAndroid Build Coastguard Worker phaddw m1, m1 1084*c0909341SAndroid Build Coastguard Worker%endif 1085*c0909341SAndroid Build Coastguard Worker%if cpuflag(sse4) 1086*c0909341SAndroid Build Coastguard Worker pmulhrsw m1, m15 1087*c0909341SAndroid Build Coastguard Worker%elif %3 1088*c0909341SAndroid Build Coastguard Worker pmulhrsw m1, [base+pw_8192] 1089*c0909341SAndroid Build Coastguard Worker%else 1090*c0909341SAndroid Build Coastguard Worker pmulhrsw m1, [base+pw_16384] 1091*c0909341SAndroid Build Coastguard Worker%endif 1092*c0909341SAndroid Build Coastguard Worker%else 1093*c0909341SAndroid Build Coastguard Worker movq m1, [bufyq+xq*2] 1094*c0909341SAndroid Build Coastguard Worker%endif 1095*c0909341SAndroid Build Coastguard Worker punpcklwd m1, [base+pw_1] 1096*c0909341SAndroid Build Coastguard Worker pmaddwd m1, m12 1097*c0909341SAndroid Build Coastguard Worker paddd m0, m1 1098*c0909341SAndroid Build Coastguard Worker 1099*c0909341SAndroid Build Coastguard Worker movu m1, [bufq+xq*2-4] ; y=0,x=[-2,+5] 1100*c0909341SAndroid Build Coastguard Worker pshufd m2, m1, q3321 1101*c0909341SAndroid Build Coastguard Worker pxor m3, m3 1102*c0909341SAndroid Build Coastguard Worker pcmpgtw m3, m2 1103*c0909341SAndroid Build Coastguard Worker punpcklwd m2, m3 ; y=0,x=[0,3] in dword 1104*c0909341SAndroid Build Coastguard Worker.x_loop_ar2_inner: 1105*c0909341SAndroid Build Coastguard 
Worker pmaddwd m3, m1, m11 1106*c0909341SAndroid Build Coastguard Worker paddd m3, m0 1107*c0909341SAndroid Build Coastguard Worker psrldq m0, 4 ; shift top to next pixel 1108*c0909341SAndroid Build Coastguard Worker psrad m3, [fg_dataq+FGData.ar_coeff_shift] 1109*c0909341SAndroid Build Coastguard Worker ; we do not need to packssdw since we only care about one value 1110*c0909341SAndroid Build Coastguard Worker paddd m3, m2 1111*c0909341SAndroid Build Coastguard Worker packssdw m3, m3 1112*c0909341SAndroid Build Coastguard Worker pminsw m3, m13 1113*c0909341SAndroid Build Coastguard Worker pmaxsw m3, m14 1114*c0909341SAndroid Build Coastguard Worker psrldq m1, 2 1115*c0909341SAndroid Build Coastguard Worker pslldq m3, 2 1116*c0909341SAndroid Build Coastguard Worker psrldq m2, 4 1117*c0909341SAndroid Build Coastguard Worker%if cpuflag(sse4) 1118*c0909341SAndroid Build Coastguard Worker pblendw m1, m3, 00000010b 1119*c0909341SAndroid Build Coastguard Worker%else 1120*c0909341SAndroid Build Coastguard Worker pand m1, m15 1121*c0909341SAndroid Build Coastguard Worker pandn m4, m15, m3 1122*c0909341SAndroid Build Coastguard Worker por m1, m4 1123*c0909341SAndroid Build Coastguard Worker%endif 1124*c0909341SAndroid Build Coastguard Worker ; overwrite previous pixel, should be ok 1125*c0909341SAndroid Build Coastguard Worker movd [bufq+xq*2-2], m1 1126*c0909341SAndroid Build Coastguard Worker inc xq 1127*c0909341SAndroid Build Coastguard Worker jz .x_loop_ar2_end 1128*c0909341SAndroid Build Coastguard Worker test xq, 3 1129*c0909341SAndroid Build Coastguard Worker jnz .x_loop_ar2_inner 1130*c0909341SAndroid Build Coastguard Worker jmp .x_loop_ar2 1131*c0909341SAndroid Build Coastguard Worker 1132*c0909341SAndroid Build Coastguard Worker.x_loop_ar2_end: 1133*c0909341SAndroid Build Coastguard Worker add bufq, 82*2 1134*c0909341SAndroid Build Coastguard Worker add bufyq, 82*2<<%3 1135*c0909341SAndroid Build Coastguard Worker dec hd 1136*c0909341SAndroid Build Coastguard 
Worker jg .y_loop_ar2 1137*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32 1138*c0909341SAndroid Build Coastguard Worker%undef m13 1139*c0909341SAndroid Build Coastguard Worker%undef m14 1140*c0909341SAndroid Build Coastguard Worker%undef m15 1141*c0909341SAndroid Build Coastguard Worker%endif 1142*c0909341SAndroid Build Coastguard Worker RET 1143*c0909341SAndroid Build Coastguard Worker 1144*c0909341SAndroid Build Coastguard Worker.ar3: 1145*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64 1146*c0909341SAndroid Build Coastguard Worker DEFINE_ARGS buf, bufy, fg_data, uv, bdmax, shift 1147*c0909341SAndroid Build Coastguard Worker%if WIN64 1148*c0909341SAndroid Build Coastguard Worker mov r6, rsp 1149*c0909341SAndroid Build Coastguard Worker and rsp, ~15 1150*c0909341SAndroid Build Coastguard Worker sub rsp, 96 1151*c0909341SAndroid Build Coastguard Worker %define tmp rsp 1152*c0909341SAndroid Build Coastguard Worker%else 1153*c0909341SAndroid Build Coastguard Worker %define tmp rsp+stack_offset-120 1154*c0909341SAndroid Build Coastguard Worker%endif 1155*c0909341SAndroid Build Coastguard Worker%else 1156*c0909341SAndroid Build Coastguard Worker DEFINE_ARGS buf, bufy, pic_reg, fg_data, uv, shift 1157*c0909341SAndroid Build Coastguard Worker ALLOC_STACK -16*14 1158*c0909341SAndroid Build Coastguard Worker mov bufyq, r1m 1159*c0909341SAndroid Build Coastguard Worker mov uvd, r3m 1160*c0909341SAndroid Build Coastguard Worker %define tmp rsp 1161*c0909341SAndroid Build Coastguard Worker%endif 1162*c0909341SAndroid Build Coastguard Worker mov shiftd, [fg_dataq+FGData.ar_coeff_shift] 1163*c0909341SAndroid Build Coastguard Worker imul uvd, 28 1164*c0909341SAndroid Build Coastguard Worker SPLATW m4, [base+round_vals-12+shiftq*2] 1165*c0909341SAndroid Build Coastguard Worker pxor m5, m5 1166*c0909341SAndroid Build Coastguard Worker pcmpgtw m5, m4 1167*c0909341SAndroid Build Coastguard Worker punpcklwd m4, m5 1168*c0909341SAndroid Build Coastguard Worker%if 
ARCH_X86_64 1169*c0909341SAndroid Build Coastguard Worker sar bdmaxd, 1 1170*c0909341SAndroid Build Coastguard Worker SPLATW m6, bdmaxd ; max_grain 1171*c0909341SAndroid Build Coastguard Worker%else 1172*c0909341SAndroid Build Coastguard Worker SPLATW m6, r4m 1173*c0909341SAndroid Build Coastguard Worker psraw m6, 1 1174*c0909341SAndroid Build Coastguard Worker%endif 1175*c0909341SAndroid Build Coastguard Worker pcmpeqw m7, m7 1176*c0909341SAndroid Build Coastguard Worker%if !cpuflag(sse4) 1177*c0909341SAndroid Build Coastguard Worker pcmpeqw m3, m3 1178*c0909341SAndroid Build Coastguard Worker psrldq m3, 14 1179*c0909341SAndroid Build Coastguard Worker pslldq m3, 4 1180*c0909341SAndroid Build Coastguard Worker pxor m3, m7 1181*c0909341SAndroid Build Coastguard Worker%endif 1182*c0909341SAndroid Build Coastguard Worker pxor m7, m6 ; min_grain 1183*c0909341SAndroid Build Coastguard Worker%if %2 && cpuflag(sse4) 1184*c0909341SAndroid Build Coastguard Worker SPLATW m3, [base+hmul_bits+2+%3*2] 1185*c0909341SAndroid Build Coastguard Worker%endif 1186*c0909341SAndroid Build Coastguard Worker 1187*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64 1188*c0909341SAndroid Build Coastguard Worker SWAP 3, 11 1189*c0909341SAndroid Build Coastguard Worker SWAP 4, 12 1190*c0909341SAndroid Build Coastguard Worker SWAP 6, 14 1191*c0909341SAndroid Build Coastguard Worker SWAP 7, 15 1192*c0909341SAndroid Build Coastguard Worker%else 1193*c0909341SAndroid Build Coastguard Worker%define m11 [rsp+ 9*16] 1194*c0909341SAndroid Build Coastguard Worker%define m12 [rsp+10*16] 1195*c0909341SAndroid Build Coastguard Worker%define m14 [rsp+12*16] 1196*c0909341SAndroid Build Coastguard Worker%define m15 [rsp+13*16] 1197*c0909341SAndroid Build Coastguard Worker mova m11, m3 1198*c0909341SAndroid Build Coastguard Worker mova m12, m4 1199*c0909341SAndroid Build Coastguard Worker mova m14, m6 1200*c0909341SAndroid Build Coastguard Worker mova m15, m7 1201*c0909341SAndroid Build Coastguard 
Worker%endif 1202*c0909341SAndroid Build Coastguard Worker 1203*c0909341SAndroid Build Coastguard Worker ; cf from y=-3,x=-3 until y=-3,x=-2 1204*c0909341SAndroid Build Coastguard Worker movu m0, [fg_dataq+FGData.ar_coeffs_uv+uvq+ 0] 1205*c0909341SAndroid Build Coastguard Worker pxor m1, m1 1206*c0909341SAndroid Build Coastguard Worker pcmpgtb m1, m0 1207*c0909341SAndroid Build Coastguard Worker punpckhbw m2, m0, m1 1208*c0909341SAndroid Build Coastguard Worker punpcklbw m0, m1 1209*c0909341SAndroid Build Coastguard Worker pshufd m1, m0, q0000 1210*c0909341SAndroid Build Coastguard Worker pshufd m3, m0, q1111 1211*c0909341SAndroid Build Coastguard Worker pshufd m4, m0, q2222 1212*c0909341SAndroid Build Coastguard Worker pshufd m0, m0, q3333 1213*c0909341SAndroid Build Coastguard Worker pshufd m5, m2, q0000 1214*c0909341SAndroid Build Coastguard Worker pshufd m6, m2, q1111 1215*c0909341SAndroid Build Coastguard Worker mova [tmp+16*0], m1 1216*c0909341SAndroid Build Coastguard Worker mova [tmp+16*1], m3 1217*c0909341SAndroid Build Coastguard Worker mova [tmp+16*2], m4 1218*c0909341SAndroid Build Coastguard Worker mova [tmp+16*3], m0 1219*c0909341SAndroid Build Coastguard Worker mova [tmp+16*4], m5 1220*c0909341SAndroid Build Coastguard Worker mova [tmp+16*5], m6 1221*c0909341SAndroid Build Coastguard Worker pshufd m6, m2, q2222 1222*c0909341SAndroid Build Coastguard Worker pshufd m7, m2, q3333 1223*c0909341SAndroid Build Coastguard Worker 1224*c0909341SAndroid Build Coastguard Worker ; cf from y=-1,x=-1 to y=0,x=-1 + luma component 1225*c0909341SAndroid Build Coastguard Worker movu m0, [fg_dataq+FGData.ar_coeffs_uv+uvq+16] 1226*c0909341SAndroid Build Coastguard Worker pxor m1, m1 1227*c0909341SAndroid Build Coastguard Worker pcmpgtb m1, m0 1228*c0909341SAndroid Build Coastguard Worker punpckhbw m2, m0, m1 ; luma 1229*c0909341SAndroid Build Coastguard Worker punpcklbw m0, m1 1230*c0909341SAndroid Build Coastguard Worker pshufd m3, m0, q3232 1231*c0909341SAndroid Build 
Coastguard Worker psrldq m5, m0, 10 1232*c0909341SAndroid Build Coastguard Worker ; y=0,x=[-3 to -1] + "1.0" for current pixel 1233*c0909341SAndroid Build Coastguard Worker pinsrw m5, [base+round_vals-10+shiftq*2], 3 1234*c0909341SAndroid Build Coastguard Worker ; y=-1,x=[-1 to +2] 1235*c0909341SAndroid Build Coastguard Worker pshufd m1, m0, q0000 1236*c0909341SAndroid Build Coastguard Worker pshufd m0, m0, q1111 1237*c0909341SAndroid Build Coastguard Worker ; y=-1,x=+3 + luma 1238*c0909341SAndroid Build Coastguard Worker punpcklwd m3, m2 1239*c0909341SAndroid Build Coastguard Worker pshufd m3, m3, q0000 1240*c0909341SAndroid Build Coastguard Worker 1241*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64 1242*c0909341SAndroid Build Coastguard Worker SWAP 1, 8 1243*c0909341SAndroid Build Coastguard Worker SWAP 0, 9 1244*c0909341SAndroid Build Coastguard Worker SWAP 3, 10 1245*c0909341SAndroid Build Coastguard Worker SWAP 5, 13 1246*c0909341SAndroid Build Coastguard Worker DEFINE_ARGS buf, bufy, fg_data, h, x 1247*c0909341SAndroid Build Coastguard Worker%else 1248*c0909341SAndroid Build Coastguard Worker%define m8 [rsp+ 6*16] 1249*c0909341SAndroid Build Coastguard Worker%define m9 [rsp+ 7*16] 1250*c0909341SAndroid Build Coastguard Worker%define m10 [rsp+ 8*16] 1251*c0909341SAndroid Build Coastguard Worker%define m13 [rsp+11*16] 1252*c0909341SAndroid Build Coastguard Worker mova m8, m1 1253*c0909341SAndroid Build Coastguard Worker mova m9, m0 1254*c0909341SAndroid Build Coastguard Worker mova m10, m3 1255*c0909341SAndroid Build Coastguard Worker mova m13, m5 1256*c0909341SAndroid Build Coastguard Worker DEFINE_ARGS buf, bufy, pic_reg, fg_data, h, x 1257*c0909341SAndroid Build Coastguard Worker%endif 1258*c0909341SAndroid Build Coastguard Worker%if %2 1259*c0909341SAndroid Build Coastguard Worker sub bufq, 2*(82*(73-35*%3)+44-(82*3+41)) 1260*c0909341SAndroid Build Coastguard Worker%else 1261*c0909341SAndroid Build Coastguard Worker sub bufq, 2*(82*69+3) 
1262*c0909341SAndroid Build Coastguard Worker%endif 1263*c0909341SAndroid Build Coastguard Worker add bufyq, 2*(79+82*3) 1264*c0909341SAndroid Build Coastguard Worker mov hd, 70-35*%3 1265*c0909341SAndroid Build Coastguard Worker.y_loop_ar3: 1266*c0909341SAndroid Build Coastguard Worker mov xq, -(76>>%2) 1267*c0909341SAndroid Build Coastguard Worker 1268*c0909341SAndroid Build Coastguard Worker.x_loop_ar3: 1269*c0909341SAndroid Build Coastguard Worker ; first line 1270*c0909341SAndroid Build Coastguard Worker movu m0, [bufq+xq*2-82*6-6+ 0] ; y=-3,x=[-3,+4] 1271*c0909341SAndroid Build Coastguard Worker movd m1, [bufq+xq*2-82*6-6+16] ; y=-3,x=[+5,+6] 1272*c0909341SAndroid Build Coastguard Worker palignr m2, m1, m0, 2 ; y=-3,x=[-2,+5] 1273*c0909341SAndroid Build Coastguard Worker palignr m1, m1, m0, 12 ; y=-3,x=[+3,+6] 1274*c0909341SAndroid Build Coastguard Worker punpckhwd m3, m0, m2 ; y=-3,x=[+1/+2,+2/+3,+3/+4,+4/+5] 1275*c0909341SAndroid Build Coastguard Worker punpcklwd m0, m2 ; y=-3,x=[-3/-2,-2/-1,-1/+0,+0/+1] 1276*c0909341SAndroid Build Coastguard Worker shufps m2, m0, m3, q1032 ; y=-3,x=[-1/+0,+0/+1,+1/+2,+2/+3] 1277*c0909341SAndroid Build Coastguard Worker 1278*c0909341SAndroid Build Coastguard Worker pmaddwd m0, [tmp+0*16] 1279*c0909341SAndroid Build Coastguard Worker pmaddwd m2, [tmp+1*16] 1280*c0909341SAndroid Build Coastguard Worker pmaddwd m3, [tmp+2*16] 1281*c0909341SAndroid Build Coastguard Worker paddd m0, m2 1282*c0909341SAndroid Build Coastguard Worker paddd m0, m3 ; first 6 x of top y 1283*c0909341SAndroid Build Coastguard Worker 1284*c0909341SAndroid Build Coastguard Worker ; second line [m0/1 are busy] 1285*c0909341SAndroid Build Coastguard Worker movu m2, [bufq+xq*2-82*4-6+ 0] ; y=-2,x=[-3,+4] 1286*c0909341SAndroid Build Coastguard Worker movd m3, [bufq+xq*2-82*4-6+16] ; y=-2,x=[+5,+6] 1287*c0909341SAndroid Build Coastguard Worker punpcklwd m1, m2 ; y=-3/-2,x=[+3/-3,+4/-2,+5/-1,+6/+0] 1288*c0909341SAndroid Build Coastguard Worker palignr m4, m3, 
m2, 2 ; y=-2,x=[-2,+5] 1289*c0909341SAndroid Build Coastguard Worker palignr m3, m3, m2, 4 ; y=-2,x=[-2,+5] 1290*c0909341SAndroid Build Coastguard Worker punpckhwd m5, m4, m3 ; y=-2,x=[+2/+3,+3/+4,+4/+5,+5/+6] 1291*c0909341SAndroid Build Coastguard Worker punpcklwd m4, m3 ; y=-2,x=[-2/-1,-1/+0,+0/+1,+1/+2] 1292*c0909341SAndroid Build Coastguard Worker shufps m3, m4, m5, q1032 ; t=-2,x=[+0/+1,+1/+2,+2/+3,+3/+4] 1293*c0909341SAndroid Build Coastguard Worker pmaddwd m1, [tmp+3*16] 1294*c0909341SAndroid Build Coastguard Worker pmaddwd m4, [tmp+4*16] 1295*c0909341SAndroid Build Coastguard Worker pmaddwd m3, [tmp+5*16] 1296*c0909341SAndroid Build Coastguard Worker pmaddwd m5, m6 1297*c0909341SAndroid Build Coastguard Worker paddd m1, m4 1298*c0909341SAndroid Build Coastguard Worker paddd m3, m5 1299*c0909341SAndroid Build Coastguard Worker paddd m0, m1 1300*c0909341SAndroid Build Coastguard Worker paddd m0, m3 ; top 2 lines 1301*c0909341SAndroid Build Coastguard Worker 1302*c0909341SAndroid Build Coastguard Worker ; third line [m0 is busy] & luma + round 1303*c0909341SAndroid Build Coastguard Worker movu m1, [bufq+xq*2-82*2-6+ 0] ; y=-1,x=[-3,+4] 1304*c0909341SAndroid Build Coastguard Worker movd m2, [bufq+xq*2-82*2-6+16] ; y=-1,x=[+5,+6] 1305*c0909341SAndroid Build Coastguard Worker%if %2 1306*c0909341SAndroid Build Coastguard Worker movu m5, [bufyq+xq*4] 1307*c0909341SAndroid Build Coastguard Worker%if %3 1308*c0909341SAndroid Build Coastguard Worker movu m4, [bufyq+xq*4+82*2] 1309*c0909341SAndroid Build Coastguard Worker phaddw m5, m4 1310*c0909341SAndroid Build Coastguard Worker%else 1311*c0909341SAndroid Build Coastguard Worker phaddw m5, m5 1312*c0909341SAndroid Build Coastguard Worker%endif 1313*c0909341SAndroid Build Coastguard Worker%else 1314*c0909341SAndroid Build Coastguard Worker movq m5, [bufyq+xq*2] 1315*c0909341SAndroid Build Coastguard Worker%endif 1316*c0909341SAndroid Build Coastguard Worker palignr m3, m2, m1, 2 ; y=-1,x=[-2,+5] 1317*c0909341SAndroid 
Build Coastguard Worker palignr m2, m2, m1, 12 ; y=-1,x=[+3,+6] 1318*c0909341SAndroid Build Coastguard Worker%if %3 1319*c0909341SAndroid Build Coastguard Worker pshufd m4, m5, q3232 1320*c0909341SAndroid Build Coastguard Worker paddw m5, m4 1321*c0909341SAndroid Build Coastguard Worker%endif 1322*c0909341SAndroid Build Coastguard Worker%if %2 1323*c0909341SAndroid Build Coastguard Worker%if cpuflag(sse4) 1324*c0909341SAndroid Build Coastguard Worker pmulhrsw m5, m11 1325*c0909341SAndroid Build Coastguard Worker%elif %3 1326*c0909341SAndroid Build Coastguard Worker pmulhrsw m5, [base+pw_8192] 1327*c0909341SAndroid Build Coastguard Worker%else 1328*c0909341SAndroid Build Coastguard Worker pmulhrsw m5, [base+pw_16384] 1329*c0909341SAndroid Build Coastguard Worker%endif 1330*c0909341SAndroid Build Coastguard Worker%endif 1331*c0909341SAndroid Build Coastguard Worker punpckhwd m4, m1, m3 ; y=-1,x=[+1/+2,+2/+3,+3/+4,+4/+5] 1332*c0909341SAndroid Build Coastguard Worker punpcklwd m1, m3 ; y=-1,x=[-3/-2,-2/-1,-1/+0,+0/+1] 1333*c0909341SAndroid Build Coastguard Worker shufps m3, m1, m4, q1032 ; y=-1,x=[-1/+0,+0/+1,+1/+2,+2/+3] 1334*c0909341SAndroid Build Coastguard Worker punpcklwd m2, m5 1335*c0909341SAndroid Build Coastguard Worker pmaddwd m1, m7 1336*c0909341SAndroid Build Coastguard Worker pmaddwd m3, m8 1337*c0909341SAndroid Build Coastguard Worker pmaddwd m4, m9 1338*c0909341SAndroid Build Coastguard Worker pmaddwd m2, m10 1339*c0909341SAndroid Build Coastguard Worker paddd m1, m3 1340*c0909341SAndroid Build Coastguard Worker paddd m4, m2 1341*c0909341SAndroid Build Coastguard Worker paddd m0, m12 ; += round 1342*c0909341SAndroid Build Coastguard Worker paddd m1, m4 1343*c0909341SAndroid Build Coastguard Worker paddd m0, m1 1344*c0909341SAndroid Build Coastguard Worker 1345*c0909341SAndroid Build Coastguard Worker movu m1, [bufq+xq*2-6] ; y=0,x=[-3,+4] 1346*c0909341SAndroid Build Coastguard Worker.x_loop_ar3_inner: 1347*c0909341SAndroid Build Coastguard Worker pmaddwd 
m2, m1, m13 1348*c0909341SAndroid Build Coastguard Worker pshufd m3, m2, q1111 1349*c0909341SAndroid Build Coastguard Worker paddd m2, m3 ; left+cur 1350*c0909341SAndroid Build Coastguard Worker paddd m2, m0 ; add top 1351*c0909341SAndroid Build Coastguard Worker psrldq m0, 4 1352*c0909341SAndroid Build Coastguard Worker psrad m2, [fg_dataq+FGData.ar_coeff_shift] 1353*c0909341SAndroid Build Coastguard Worker packssdw m2, m2 1354*c0909341SAndroid Build Coastguard Worker pminsw m2, m14 1355*c0909341SAndroid Build Coastguard Worker pmaxsw m2, m15 1356*c0909341SAndroid Build Coastguard Worker pslldq m2, 4 1357*c0909341SAndroid Build Coastguard Worker psrldq m1, 2 1358*c0909341SAndroid Build Coastguard Worker%if cpuflag(sse4) 1359*c0909341SAndroid Build Coastguard Worker pblendw m1, m2, 00000100b 1360*c0909341SAndroid Build Coastguard Worker%else 1361*c0909341SAndroid Build Coastguard Worker pand m1, m11 1362*c0909341SAndroid Build Coastguard Worker pandn m3, m11, m2 1363*c0909341SAndroid Build Coastguard Worker por m1, m3 1364*c0909341SAndroid Build Coastguard Worker%endif 1365*c0909341SAndroid Build Coastguard Worker ; overwrite previous pixels, should be ok 1366*c0909341SAndroid Build Coastguard Worker movq [bufq+xq*2-4], m1 1367*c0909341SAndroid Build Coastguard Worker inc xq 1368*c0909341SAndroid Build Coastguard Worker jz .x_loop_ar3_end 1369*c0909341SAndroid Build Coastguard Worker test xq, 3 1370*c0909341SAndroid Build Coastguard Worker jnz .x_loop_ar3_inner 1371*c0909341SAndroid Build Coastguard Worker jmp .x_loop_ar3 1372*c0909341SAndroid Build Coastguard Worker 1373*c0909341SAndroid Build Coastguard Worker.x_loop_ar3_end: 1374*c0909341SAndroid Build Coastguard Worker add bufq, 82*2 1375*c0909341SAndroid Build Coastguard Worker add bufyq, 82*2<<%3 1376*c0909341SAndroid Build Coastguard Worker dec hd 1377*c0909341SAndroid Build Coastguard Worker jg .y_loop_ar3 1378*c0909341SAndroid Build Coastguard Worker%if WIN64 1379*c0909341SAndroid Build Coastguard Worker 
mov rsp, r6 1380*c0909341SAndroid Build Coastguard Worker%elif ARCH_X86_32 1381*c0909341SAndroid Build Coastguard Worker%undef m8 1382*c0909341SAndroid Build Coastguard Worker%undef m9 1383*c0909341SAndroid Build Coastguard Worker%undef m10 1384*c0909341SAndroid Build Coastguard Worker%undef m11 1385*c0909341SAndroid Build Coastguard Worker%undef m12 1386*c0909341SAndroid Build Coastguard Worker%undef m13 1387*c0909341SAndroid Build Coastguard Worker%undef m14 1388*c0909341SAndroid Build Coastguard Worker%undef m15 1389*c0909341SAndroid Build Coastguard Worker%endif 1390*c0909341SAndroid Build Coastguard Worker RET 1391*c0909341SAndroid Build Coastguard Worker%endmacro 1392*c0909341SAndroid Build Coastguard Worker 1393*c0909341SAndroid Build Coastguard Workergenerate_grain_uv_fn 420, 1, 1 1394*c0909341SAndroid Build Coastguard Workergenerate_grain_uv_fn 422, 1, 0 1395*c0909341SAndroid Build Coastguard Workergenerate_grain_uv_fn 444, 0, 0 1396*c0909341SAndroid Build Coastguard Worker 1397*c0909341SAndroid Build Coastguard Worker%macro SCRATCH 3 1398*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32 1399*c0909341SAndroid Build Coastguard Worker mova [rsp+%3*mmsize], m%1 1400*c0909341SAndroid Build Coastguard Worker%define m%2 [rsp+%3*mmsize] 1401*c0909341SAndroid Build Coastguard Worker%else 1402*c0909341SAndroid Build Coastguard Worker SWAP %1, %2 1403*c0909341SAndroid Build Coastguard Worker%endif 1404*c0909341SAndroid Build Coastguard Worker%endmacro 1405*c0909341SAndroid Build Coastguard Worker 1406*c0909341SAndroid Build Coastguard WorkerINIT_XMM ssse3 1407*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32 1408*c0909341SAndroid Build Coastguard Worker%if STACK_ALIGNMENT < mmsize 1409*c0909341SAndroid Build Coastguard Workercglobal fgy_32x32xn_16bpc, 0, 7, 8, 0-(8 * mmsize + 12 * gprsize), \ 1410*c0909341SAndroid Build Coastguard Worker dst, src, scaling, unused1, fg_data, picptr, unused2 1411*c0909341SAndroid Build Coastguard Worker ; copy stack 
arguments to new position post-alignment, so that we 1412*c0909341SAndroid Build Coastguard Worker ; don't have to keep the old stack location in a separate register 1413*c0909341SAndroid Build Coastguard Worker mov r0, r0m 1414*c0909341SAndroid Build Coastguard Worker mov r1, r2m 1415*c0909341SAndroid Build Coastguard Worker mov r2, r4m 1416*c0909341SAndroid Build Coastguard Worker mov r3, r6m 1417*c0909341SAndroid Build Coastguard Worker mov r4, r7m 1418*c0909341SAndroid Build Coastguard Worker mov r5, r8m 1419*c0909341SAndroid Build Coastguard Worker 1420*c0909341SAndroid Build Coastguard Worker%define r0m [rsp+8*mmsize+ 3*gprsize] 1421*c0909341SAndroid Build Coastguard Worker%define r2m [rsp+8*mmsize+ 5*gprsize] 1422*c0909341SAndroid Build Coastguard Worker%define r4m [rsp+8*mmsize+ 7*gprsize] 1423*c0909341SAndroid Build Coastguard Worker%define r6m [rsp+8*mmsize+ 9*gprsize] 1424*c0909341SAndroid Build Coastguard Worker%define r7m [rsp+8*mmsize+10*gprsize] 1425*c0909341SAndroid Build Coastguard Worker%define r8m [rsp+8*mmsize+11*gprsize] 1426*c0909341SAndroid Build Coastguard Worker 1427*c0909341SAndroid Build Coastguard Worker mov r0m, r0 1428*c0909341SAndroid Build Coastguard Worker mov r2m, r1 1429*c0909341SAndroid Build Coastguard Worker mov r4m, r2 1430*c0909341SAndroid Build Coastguard Worker mov r6m, r3 1431*c0909341SAndroid Build Coastguard Worker mov r7m, r4 1432*c0909341SAndroid Build Coastguard Worker mov r8m, r5 1433*c0909341SAndroid Build Coastguard Worker%else 1434*c0909341SAndroid Build Coastguard Workercglobal fgy_32x32xn_16bpc, 0, 7, 8, 8 * mmsize + 4 * gprsize, \ 1435*c0909341SAndroid Build Coastguard Worker dst, src, scaling, unused1, fg_data, picptr, unused2 1436*c0909341SAndroid Build Coastguard Worker%endif 1437*c0909341SAndroid Build Coastguard Worker mov srcq, srcm 1438*c0909341SAndroid Build Coastguard Worker mov scalingq, r5m 1439*c0909341SAndroid Build Coastguard Worker mov fg_dataq, r3m 1440*c0909341SAndroid Build Coastguard 
Worker%if STACK_ALIGNMENT < mmsize 1441*c0909341SAndroid Build Coastguard Worker mov r6, r9m 1442*c0909341SAndroid Build Coastguard Worker 1443*c0909341SAndroid Build Coastguard Worker%define r9m [rsp+8*mmsize+ 4*gprsize] 1444*c0909341SAndroid Build Coastguard Worker%define r3m [rsp+8*mmsize+ 6*gprsize] 1445*c0909341SAndroid Build Coastguard Worker%define r5m [rsp+8*mmsize+ 8*gprsize] 1446*c0909341SAndroid Build Coastguard Worker 1447*c0909341SAndroid Build Coastguard Worker mov r9m, r6 1448*c0909341SAndroid Build Coastguard Worker%endif 1449*c0909341SAndroid Build Coastguard Worker LEA r5, $$ 1450*c0909341SAndroid Build Coastguard Worker%define base r5-$$ 1451*c0909341SAndroid Build Coastguard Worker mov r5m, picptrq 1452*c0909341SAndroid Build Coastguard Worker%else 1453*c0909341SAndroid Build Coastguard Workercglobal fgy_32x32xn_16bpc, 6, 15, 16, dst, src, stride, fg_data, w, scaling, grain_lut 1454*c0909341SAndroid Build Coastguard Worker lea r8, [pb_mask] 1455*c0909341SAndroid Build Coastguard Worker%define base r8-pb_mask 1456*c0909341SAndroid Build Coastguard Worker%endif 1457*c0909341SAndroid Build Coastguard Worker mov r6d, [fg_dataq+FGData.scaling_shift] 1458*c0909341SAndroid Build Coastguard Worker SPLATW m3, [base+mul_bits+r6*2-14] 1459*c0909341SAndroid Build Coastguard Worker mov r6d, [fg_dataq+FGData.clip_to_restricted_range] 1460*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32 1461*c0909341SAndroid Build Coastguard Worker DECLARE_REG_TMP 0, 3 1462*c0909341SAndroid Build Coastguard Worker%else 1463*c0909341SAndroid Build Coastguard Worker DECLARE_REG_TMP 9, 10 1464*c0909341SAndroid Build Coastguard Worker%endif 1465*c0909341SAndroid Build Coastguard Worker mov t0d, r9m ; bdmax 1466*c0909341SAndroid Build Coastguard Worker sar t0d, 11 ; is_12bpc 1467*c0909341SAndroid Build Coastguard Worker inc t0d 1468*c0909341SAndroid Build Coastguard Worker mov t1d, r6d 1469*c0909341SAndroid Build Coastguard Worker imul t1d, t0d 1470*c0909341SAndroid Build 
Coastguard Worker dec t0d 1471*c0909341SAndroid Build Coastguard Worker SPLATW m5, [base+min+t1*2] 1472*c0909341SAndroid Build Coastguard Worker lea t0d, [t0d*3] 1473*c0909341SAndroid Build Coastguard Worker lea t0d, [r6d*2+t0d] 1474*c0909341SAndroid Build Coastguard Worker SPLATW m4, [base+max+t0*2] 1475*c0909341SAndroid Build Coastguard Worker SPLATW m2, r9m 1476*c0909341SAndroid Build Coastguard Worker 1477*c0909341SAndroid Build Coastguard Worker pcmpeqw m1, m1 1478*c0909341SAndroid Build Coastguard Worker psraw m7, m2, 1 ; max_grain 1479*c0909341SAndroid Build Coastguard Worker pxor m1, m7 ; min_grain 1480*c0909341SAndroid Build Coastguard Worker SPLATD m6, [base+pd_16] 1481*c0909341SAndroid Build Coastguard Worker 1482*c0909341SAndroid Build Coastguard Worker SCRATCH 1, 9, 0 1483*c0909341SAndroid Build Coastguard Worker SCRATCH 2, 10, 1 1484*c0909341SAndroid Build Coastguard Worker SCRATCH 3, 11, 2 1485*c0909341SAndroid Build Coastguard Worker SCRATCH 4, 12, 3 1486*c0909341SAndroid Build Coastguard Worker SCRATCH 5, 13, 4 1487*c0909341SAndroid Build Coastguard Worker SCRATCH 6, 14, 5 1488*c0909341SAndroid Build Coastguard Worker SCRATCH 7, 15, 6 1489*c0909341SAndroid Build Coastguard Worker 1490*c0909341SAndroid Build Coastguard Worker mova m6, [base+pw_27_17_17_27] ; for horizontal filter 1491*c0909341SAndroid Build Coastguard Worker 1492*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32 1493*c0909341SAndroid Build Coastguard Worker DEFINE_ARGS dst, src, scaling, sby, fg_data, picptr, unused2 1494*c0909341SAndroid Build Coastguard Worker DECLARE_REG_TMP 0 1495*c0909341SAndroid Build Coastguard Worker%else 1496*c0909341SAndroid Build Coastguard Worker DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, unused1, \ 1497*c0909341SAndroid Build Coastguard Worker sby, see 1498*c0909341SAndroid Build Coastguard Worker DECLARE_REG_TMP 7 1499*c0909341SAndroid Build Coastguard Worker%endif 1500*c0909341SAndroid Build Coastguard Worker 
1501*c0909341SAndroid Build Coastguard Worker mov sbyd, r8m 1502*c0909341SAndroid Build Coastguard Worker movzx t0d, byte [fg_dataq+FGData.overlap_flag] 1503*c0909341SAndroid Build Coastguard Worker test t0d, t0d 1504*c0909341SAndroid Build Coastguard Worker jz .no_vertical_overlap 1505*c0909341SAndroid Build Coastguard Worker test sbyd, sbyd 1506*c0909341SAndroid Build Coastguard Worker jnz .vertical_overlap 1507*c0909341SAndroid Build Coastguard Worker.no_vertical_overlap: 1508*c0909341SAndroid Build Coastguard Worker mov dword r8m, t0d 1509*c0909341SAndroid Build Coastguard Worker 1510*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32 1511*c0909341SAndroid Build Coastguard Worker DEFINE_ARGS dst, src, scaling, see, fg_data, picptr, unused 1512*c0909341SAndroid Build Coastguard Worker imul seed, (173 << 24) | 37 1513*c0909341SAndroid Build Coastguard Worker%else 1514*c0909341SAndroid Build Coastguard Worker imul seed, sbyd, (173 << 24) | 37 1515*c0909341SAndroid Build Coastguard Worker%endif 1516*c0909341SAndroid Build Coastguard Worker add seed, (105 << 24) | 178 1517*c0909341SAndroid Build Coastguard Worker rol seed, 8 1518*c0909341SAndroid Build Coastguard Worker movzx seed, seew 1519*c0909341SAndroid Build Coastguard Worker xor seed, [fg_dataq+FGData.seed] 1520*c0909341SAndroid Build Coastguard Worker 1521*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32 1522*c0909341SAndroid Build Coastguard Worker DEFINE_ARGS dst, src, scaling, see, w, picptr, src_bak 1523*c0909341SAndroid Build Coastguard Worker 1524*c0909341SAndroid Build Coastguard Worker mov r3m, seed 1525*c0909341SAndroid Build Coastguard Worker mov wq, r4m 1526*c0909341SAndroid Build Coastguard Worker%else 1527*c0909341SAndroid Build Coastguard Worker DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ 1528*c0909341SAndroid Build Coastguard Worker unused1, unused2, see, src_bak 1529*c0909341SAndroid Build Coastguard Worker%endif 1530*c0909341SAndroid Build Coastguard Worker 
1531*c0909341SAndroid Build Coastguard Worker lea src_bakq, [srcq+wq*2] 1532*c0909341SAndroid Build Coastguard Worker mov r9mp, src_bakq 1533*c0909341SAndroid Build Coastguard Worker neg wq 1534*c0909341SAndroid Build Coastguard Worker sub dstmp, srcq 1535*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32 1536*c0909341SAndroid Build Coastguard Worker mov r4m, wq 1537*c0909341SAndroid Build Coastguard Worker%endif 1538*c0909341SAndroid Build Coastguard Worker 1539*c0909341SAndroid Build Coastguard Worker.loop_x: 1540*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32 1541*c0909341SAndroid Build Coastguard Worker mov seed, r3m 1542*c0909341SAndroid Build Coastguard Worker%endif 1543*c0909341SAndroid Build Coastguard Worker mov r6d, seed 1544*c0909341SAndroid Build Coastguard Worker or seed, 0xEFF4 1545*c0909341SAndroid Build Coastguard Worker shr r6d, 1 1546*c0909341SAndroid Build Coastguard Worker test seeb, seeh 1547*c0909341SAndroid Build Coastguard Worker lea seed, [r6+0x8000] 1548*c0909341SAndroid Build Coastguard Worker cmovp seed, r6d ; updated seed 1549*c0909341SAndroid Build Coastguard Worker 1550*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32 1551*c0909341SAndroid Build Coastguard Worker mov r3m, seed 1552*c0909341SAndroid Build Coastguard Worker 1553*c0909341SAndroid Build Coastguard Worker DEFINE_ARGS dst, src, scaling, offy, unused1, unused2, offx 1554*c0909341SAndroid Build Coastguard Worker 1555*c0909341SAndroid Build Coastguard Worker mov offxd, offyd 1556*c0909341SAndroid Build Coastguard Worker%else 1557*c0909341SAndroid Build Coastguard Worker DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ 1558*c0909341SAndroid Build Coastguard Worker offx, offy, see, src_bak 1559*c0909341SAndroid Build Coastguard Worker 1560*c0909341SAndroid Build Coastguard Worker mov offyd, seed 1561*c0909341SAndroid Build Coastguard Worker mov offxd, seed 1562*c0909341SAndroid Build Coastguard Worker%endif 1563*c0909341SAndroid Build 
Coastguard Worker ror offyd, 8 1564*c0909341SAndroid Build Coastguard Worker shr offxd, 12 1565*c0909341SAndroid Build Coastguard Worker and offyd, 0xf 1566*c0909341SAndroid Build Coastguard Worker imul offyd, 164 1567*c0909341SAndroid Build Coastguard Worker lea offyq, [offyq+offxq*2+747] ; offy*stride+offx 1568*c0909341SAndroid Build Coastguard Worker 1569*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32 1570*c0909341SAndroid Build Coastguard Worker DEFINE_ARGS dst, src, scaling, offxy, h, picptr, grain_lut 1571*c0909341SAndroid Build Coastguard Worker%else 1572*c0909341SAndroid Build Coastguard Worker DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ 1573*c0909341SAndroid Build Coastguard Worker h, offxy, see, src_bak 1574*c0909341SAndroid Build Coastguard Worker%endif 1575*c0909341SAndroid Build Coastguard Worker 1576*c0909341SAndroid Build Coastguard Worker.loop_x_odd: 1577*c0909341SAndroid Build Coastguard Worker movzx hd, word r7m 1578*c0909341SAndroid Build Coastguard Worker mov grain_lutq, grain_lutmp 1579*c0909341SAndroid Build Coastguard Worker.loop_y: 1580*c0909341SAndroid Build Coastguard Worker ; src 1581*c0909341SAndroid Build Coastguard Worker pand m0, m10, [srcq+ 0] 1582*c0909341SAndroid Build Coastguard Worker pand m1, m10, [srcq+16] ; m0-1: src as word 1583*c0909341SAndroid Build Coastguard Worker 1584*c0909341SAndroid Build Coastguard Worker ; scaling[src] 1585*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32 1586*c0909341SAndroid Build Coastguard Worker vpgatherdw m2, m0, scalingq-1, r0, r5, 8, 1, m4 1587*c0909341SAndroid Build Coastguard Worker vpgatherdw m3, m1, scalingq-1, r0, r5, 8, 1, m4 1588*c0909341SAndroid Build Coastguard Worker%else 1589*c0909341SAndroid Build Coastguard Worker vpgatherdw m2, m0, scalingq-1, r11, r13, 8, 1, m4 1590*c0909341SAndroid Build Coastguard Worker vpgatherdw m3, m1, scalingq-1, r11, r13, 8, 1, m4 1591*c0909341SAndroid Build Coastguard Worker%endif 1592*c0909341SAndroid Build 
Coastguard Worker REPX {psrlw x, 8}, m2, m3 1593*c0909341SAndroid Build Coastguard Worker 1594*c0909341SAndroid Build Coastguard Worker ; grain = grain_lut[offy+y][offx+x] 1595*c0909341SAndroid Build Coastguard Worker movu m4, [grain_lutq+offxyq*2] 1596*c0909341SAndroid Build Coastguard Worker movu m5, [grain_lutq+offxyq*2+16] 1597*c0909341SAndroid Build Coastguard Worker 1598*c0909341SAndroid Build Coastguard Worker ; noise = round2(scaling[src] * grain, scaling_shift) 1599*c0909341SAndroid Build Coastguard Worker REPX {pmullw x, m11}, m2, m3 1600*c0909341SAndroid Build Coastguard Worker pmulhrsw m4, m2 1601*c0909341SAndroid Build Coastguard Worker pmulhrsw m5, m3 1602*c0909341SAndroid Build Coastguard Worker 1603*c0909341SAndroid Build Coastguard Worker ; dst = clip_pixel(src, noise) 1604*c0909341SAndroid Build Coastguard Worker paddw m0, m4 1605*c0909341SAndroid Build Coastguard Worker paddw m1, m5 1606*c0909341SAndroid Build Coastguard Worker pmaxsw m0, m13 1607*c0909341SAndroid Build Coastguard Worker pmaxsw m1, m13 1608*c0909341SAndroid Build Coastguard Worker pminsw m0, m12 1609*c0909341SAndroid Build Coastguard Worker pminsw m1, m12 1610*c0909341SAndroid Build Coastguard Worker movifnidn dstq, dstmp 1611*c0909341SAndroid Build Coastguard Worker mova [dstq+srcq+ 0], m0 1612*c0909341SAndroid Build Coastguard Worker mova [dstq+srcq+16], m1 1613*c0909341SAndroid Build Coastguard Worker 1614*c0909341SAndroid Build Coastguard Worker add srcq, r2mp ; src += stride 1615*c0909341SAndroid Build Coastguard Worker add grain_lutq, 82*2 1616*c0909341SAndroid Build Coastguard Worker dec hd 1617*c0909341SAndroid Build Coastguard Worker jg .loop_y 1618*c0909341SAndroid Build Coastguard Worker 1619*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32 1620*c0909341SAndroid Build Coastguard Worker add r4mp, 16 1621*c0909341SAndroid Build Coastguard Worker%else 1622*c0909341SAndroid Build Coastguard Worker add wq, 16 1623*c0909341SAndroid Build Coastguard Worker%endif 
1624*c0909341SAndroid Build Coastguard Worker jge .end 1625*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32 1626*c0909341SAndroid Build Coastguard Worker mov srcq, r9mp 1627*c0909341SAndroid Build Coastguard Worker add srcq, r4mp 1628*c0909341SAndroid Build Coastguard Worker add srcq, r4mp 1629*c0909341SAndroid Build Coastguard Worker%else 1630*c0909341SAndroid Build Coastguard Worker mov src_bakq, r9mp 1631*c0909341SAndroid Build Coastguard Worker lea srcq, [src_bakq+wq*2] 1632*c0909341SAndroid Build Coastguard Worker%endif 1633*c0909341SAndroid Build Coastguard Worker btc dword r8m, 2 1634*c0909341SAndroid Build Coastguard Worker jc .next_blk 1635*c0909341SAndroid Build Coastguard Worker add offxyd, 16 1636*c0909341SAndroid Build Coastguard Worker test dword r8m, 2 1637*c0909341SAndroid Build Coastguard Worker jz .loop_x_odd 1638*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32 1639*c0909341SAndroid Build Coastguard Worker add dword [rsp+8*mmsize+1*gprsize], 16 1640*c0909341SAndroid Build Coastguard Worker%else 1641*c0909341SAndroid Build Coastguard Worker add r12d, 16 ; top_offxy += 16 1642*c0909341SAndroid Build Coastguard Worker%endif 1643*c0909341SAndroid Build Coastguard Worker jmp .loop_x_odd_v_overlap 1644*c0909341SAndroid Build Coastguard Worker 1645*c0909341SAndroid Build Coastguard Worker.next_blk: 1646*c0909341SAndroid Build Coastguard Worker test dword r8m, 1 1647*c0909341SAndroid Build Coastguard Worker jz .loop_x 1648*c0909341SAndroid Build Coastguard Worker 1649*c0909341SAndroid Build Coastguard Worker ; r8m = sbym 1650*c0909341SAndroid Build Coastguard Worker test dword r8m, 2 1651*c0909341SAndroid Build Coastguard Worker jnz .loop_x_hv_overlap 1652*c0909341SAndroid Build Coastguard Worker 1653*c0909341SAndroid Build Coastguard Worker ; horizontal overlap (without vertical overlap) 1654*c0909341SAndroid Build Coastguard Worker.loop_x_h_overlap: 1655*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32 1656*c0909341SAndroid Build 
Coastguard Worker add offxyd, 16 1657*c0909341SAndroid Build Coastguard Worker mov [rsp+8*mmsize+0*gprsize], offxyd 1658*c0909341SAndroid Build Coastguard Worker DEFINE_ARGS dst, src, scaling, see, w, picptr, src_bak 1659*c0909341SAndroid Build Coastguard Worker mov seed, r3m 1660*c0909341SAndroid Build Coastguard Worker%endif 1661*c0909341SAndroid Build Coastguard Worker 1662*c0909341SAndroid Build Coastguard Worker mov r6d, seed 1663*c0909341SAndroid Build Coastguard Worker or seed, 0xEFF4 1664*c0909341SAndroid Build Coastguard Worker shr r6d, 1 1665*c0909341SAndroid Build Coastguard Worker test seeb, seeh 1666*c0909341SAndroid Build Coastguard Worker lea seed, [r6+0x8000] 1667*c0909341SAndroid Build Coastguard Worker cmovp seed, r6d ; updated seed 1668*c0909341SAndroid Build Coastguard Worker 1669*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32 1670*c0909341SAndroid Build Coastguard Worker mov r3m, seed 1671*c0909341SAndroid Build Coastguard Worker 1672*c0909341SAndroid Build Coastguard Worker DEFINE_ARGS dst, src, scaling, offy, h, picptr, offx 1673*c0909341SAndroid Build Coastguard Worker 1674*c0909341SAndroid Build Coastguard Worker mov offxd, offyd 1675*c0909341SAndroid Build Coastguard Worker%else 1676*c0909341SAndroid Build Coastguard Worker DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ 1677*c0909341SAndroid Build Coastguard Worker offx, offy, see, src_bak, left_offxy 1678*c0909341SAndroid Build Coastguard Worker 1679*c0909341SAndroid Build Coastguard Worker lea left_offxyd, [offyd+16] ; previous column's offy*stride+offx 1680*c0909341SAndroid Build Coastguard Worker 1681*c0909341SAndroid Build Coastguard Worker mov offyd, seed 1682*c0909341SAndroid Build Coastguard Worker mov offxd, seed 1683*c0909341SAndroid Build Coastguard Worker%endif 1684*c0909341SAndroid Build Coastguard Worker ror offyd, 8 1685*c0909341SAndroid Build Coastguard Worker shr offxd, 12 1686*c0909341SAndroid Build Coastguard Worker and offyd, 0xf 
1687*c0909341SAndroid Build Coastguard Worker imul offyd, 164 1688*c0909341SAndroid Build Coastguard Worker lea offyq, [offyq+offxq*2+747] ; offy*stride+offx 1689*c0909341SAndroid Build Coastguard Worker 1690*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32 1691*c0909341SAndroid Build Coastguard Worker DEFINE_ARGS dst, src, scaling, offxy, h, picptr, grain_lut 1692*c0909341SAndroid Build Coastguard Worker%else 1693*c0909341SAndroid Build Coastguard Worker DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ 1694*c0909341SAndroid Build Coastguard Worker h, offxy, see, src_bak, left_offxy 1695*c0909341SAndroid Build Coastguard Worker%endif 1696*c0909341SAndroid Build Coastguard Worker 1697*c0909341SAndroid Build Coastguard Worker mov hd, dword r7m 1698*c0909341SAndroid Build Coastguard Worker mov grain_lutq, grain_lutmp 1699*c0909341SAndroid Build Coastguard Worker.loop_y_h_overlap: 1700*c0909341SAndroid Build Coastguard Worker ; grain = grain_lut[offy+y][offx+x] 1701*c0909341SAndroid Build Coastguard Worker movu m5, [grain_lutq+offxyq*2] 1702*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32 1703*c0909341SAndroid Build Coastguard Worker mov r5, [rsp+8*mmsize+0*gprsize] 1704*c0909341SAndroid Build Coastguard Worker movd m4, [grain_lutq+r5*2] 1705*c0909341SAndroid Build Coastguard Worker%else 1706*c0909341SAndroid Build Coastguard Worker movd m4, [grain_lutq+left_offxyq*2] 1707*c0909341SAndroid Build Coastguard Worker%endif 1708*c0909341SAndroid Build Coastguard Worker punpcklwd m4, m5 1709*c0909341SAndroid Build Coastguard Worker pmaddwd m4, m6 1710*c0909341SAndroid Build Coastguard Worker paddd m4, m14 1711*c0909341SAndroid Build Coastguard Worker psrad m4, 5 1712*c0909341SAndroid Build Coastguard Worker packssdw m4, m4 1713*c0909341SAndroid Build Coastguard Worker pminsw m4, m15 1714*c0909341SAndroid Build Coastguard Worker pmaxsw m4, m9 1715*c0909341SAndroid Build Coastguard Worker shufps m4, m5, q3210 1716*c0909341SAndroid Build Coastguard 
Worker 1717*c0909341SAndroid Build Coastguard Worker ; src 1718*c0909341SAndroid Build Coastguard Worker pand m0, m10, [srcq+ 0] 1719*c0909341SAndroid Build Coastguard Worker pand m1, m10, [srcq+16] ; m0-1: src as word 1720*c0909341SAndroid Build Coastguard Worker 1721*c0909341SAndroid Build Coastguard Worker ; scaling[src] 1722*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32 1723*c0909341SAndroid Build Coastguard Worker vpgatherdw m2, m0, scalingq-1, r0, r5, 8, 1, m5 1724*c0909341SAndroid Build Coastguard Worker vpgatherdw m3, m1, scalingq-1, r0, r5, 8, 1, m5 1725*c0909341SAndroid Build Coastguard Worker%else 1726*c0909341SAndroid Build Coastguard Worker vpgatherdw m2, m0, scalingq-1, r13, r14, 8, 1, m5 1727*c0909341SAndroid Build Coastguard Worker vpgatherdw m3, m1, scalingq-1, r13, r14, 8, 1, m5 1728*c0909341SAndroid Build Coastguard Worker%endif 1729*c0909341SAndroid Build Coastguard Worker REPX {psrlw x, 8}, m2, m3 1730*c0909341SAndroid Build Coastguard Worker 1731*c0909341SAndroid Build Coastguard Worker ; noise = round2(scaling[src] * grain, scaling_shift) 1732*c0909341SAndroid Build Coastguard Worker movu m5, [grain_lutq+offxyq*2+16] 1733*c0909341SAndroid Build Coastguard Worker REPX {pmullw x, m11}, m2, m3 1734*c0909341SAndroid Build Coastguard Worker pmulhrsw m4, m2 1735*c0909341SAndroid Build Coastguard Worker pmulhrsw m5, m3 1736*c0909341SAndroid Build Coastguard Worker 1737*c0909341SAndroid Build Coastguard Worker ; dst = clip_pixel(src, noise) 1738*c0909341SAndroid Build Coastguard Worker paddw m0, m4 1739*c0909341SAndroid Build Coastguard Worker paddw m1, m5 1740*c0909341SAndroid Build Coastguard Worker pmaxsw m0, m13 1741*c0909341SAndroid Build Coastguard Worker pmaxsw m1, m13 1742*c0909341SAndroid Build Coastguard Worker pminsw m0, m12 1743*c0909341SAndroid Build Coastguard Worker pminsw m1, m12 1744*c0909341SAndroid Build Coastguard Worker movifnidn dstq, dstmp 1745*c0909341SAndroid Build Coastguard Worker mova [dstq+srcq+ 0], m0 
1746*c0909341SAndroid Build Coastguard Worker mova [dstq+srcq+16], m1 1747*c0909341SAndroid Build Coastguard Worker 1748*c0909341SAndroid Build Coastguard Worker add srcq, r2mp 1749*c0909341SAndroid Build Coastguard Worker add grain_lutq, 82*2 1750*c0909341SAndroid Build Coastguard Worker dec hd 1751*c0909341SAndroid Build Coastguard Worker jg .loop_y_h_overlap 1752*c0909341SAndroid Build Coastguard Worker 1753*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32 1754*c0909341SAndroid Build Coastguard Worker add r4mp, 16 1755*c0909341SAndroid Build Coastguard Worker%else 1756*c0909341SAndroid Build Coastguard Worker add wq, 16 1757*c0909341SAndroid Build Coastguard Worker%endif 1758*c0909341SAndroid Build Coastguard Worker jge .end 1759*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32 1760*c0909341SAndroid Build Coastguard Worker mov srcq, r9mp 1761*c0909341SAndroid Build Coastguard Worker add srcq, r4mp 1762*c0909341SAndroid Build Coastguard Worker add srcq, r4mp 1763*c0909341SAndroid Build Coastguard Worker%else 1764*c0909341SAndroid Build Coastguard Worker mov src_bakq, r9mp 1765*c0909341SAndroid Build Coastguard Worker lea srcq, [src_bakq+wq*2] 1766*c0909341SAndroid Build Coastguard Worker%endif 1767*c0909341SAndroid Build Coastguard Worker or dword r8m, 4 1768*c0909341SAndroid Build Coastguard Worker add offxyd, 16 1769*c0909341SAndroid Build Coastguard Worker 1770*c0909341SAndroid Build Coastguard Worker ; r8m = sbym 1771*c0909341SAndroid Build Coastguard Worker test dword r8m, 2 1772*c0909341SAndroid Build Coastguard Worker jz .loop_x_odd 1773*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32 1774*c0909341SAndroid Build Coastguard Worker add dword [rsp+8*mmsize+1*gprsize], 16 1775*c0909341SAndroid Build Coastguard Worker%else 1776*c0909341SAndroid Build Coastguard Worker add r12d, 16 ; top_offxy += 16 1777*c0909341SAndroid Build Coastguard Worker%endif 1778*c0909341SAndroid Build Coastguard Worker jmp .loop_x_odd_v_overlap 
1779*c0909341SAndroid Build Coastguard Worker 1780*c0909341SAndroid Build Coastguard Worker.end: 1781*c0909341SAndroid Build Coastguard Worker RET 1782*c0909341SAndroid Build Coastguard Worker 1783*c0909341SAndroid Build Coastguard Worker.vertical_overlap: 1784*c0909341SAndroid Build Coastguard Worker or t0d, 2 1785*c0909341SAndroid Build Coastguard Worker mov r8m, t0d 1786*c0909341SAndroid Build Coastguard Worker 1787*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32 1788*c0909341SAndroid Build Coastguard Worker DEFINE_ARGS dst, src, scaling, sby, fg_data, picptr, unused 1789*c0909341SAndroid Build Coastguard Worker%else 1790*c0909341SAndroid Build Coastguard Worker DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, unused1, \ 1791*c0909341SAndroid Build Coastguard Worker sby, see 1792*c0909341SAndroid Build Coastguard Worker%endif 1793*c0909341SAndroid Build Coastguard Worker 1794*c0909341SAndroid Build Coastguard Worker movzx sbyd, sbyb 1795*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32 1796*c0909341SAndroid Build Coastguard Worker imul r4, [fg_dataq+FGData.seed], 0x00010001 1797*c0909341SAndroid Build Coastguard Worker DEFINE_ARGS dst, src, scaling, sby, see, picptr, unused 1798*c0909341SAndroid Build Coastguard Worker%else 1799*c0909341SAndroid Build Coastguard Worker imul seed, [fg_dataq+FGData.seed], 0x00010001 1800*c0909341SAndroid Build Coastguard Worker%endif 1801*c0909341SAndroid Build Coastguard Worker imul t0d, sbyd, 173 * 0x00010001 1802*c0909341SAndroid Build Coastguard Worker imul sbyd, 37 * 0x01000100 1803*c0909341SAndroid Build Coastguard Worker add t0d, (105 << 16) | 188 1804*c0909341SAndroid Build Coastguard Worker add sbyd, (178 << 24) | (141 << 8) 1805*c0909341SAndroid Build Coastguard Worker and t0d, 0x00ff00ff 1806*c0909341SAndroid Build Coastguard Worker and sbyd, 0xff00ff00 1807*c0909341SAndroid Build Coastguard Worker xor seed, t0d 1808*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32 
1809*c0909341SAndroid Build Coastguard Worker xor sbyd, seed 1810*c0909341SAndroid Build Coastguard Worker 1811*c0909341SAndroid Build Coastguard Worker DEFINE_ARGS dst, src, scaling, see, w, picptr, src_bak 1812*c0909341SAndroid Build Coastguard Worker 1813*c0909341SAndroid Build Coastguard Worker mov r3m, seed 1814*c0909341SAndroid Build Coastguard Worker mov wq, r4m 1815*c0909341SAndroid Build Coastguard Worker%else 1816*c0909341SAndroid Build Coastguard Worker xor seed, sbyd ; (cur_seed << 16) | top_seed 1817*c0909341SAndroid Build Coastguard Worker 1818*c0909341SAndroid Build Coastguard Worker DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ 1819*c0909341SAndroid Build Coastguard Worker unused1, unused2, see, src_bak 1820*c0909341SAndroid Build Coastguard Worker%endif 1821*c0909341SAndroid Build Coastguard Worker 1822*c0909341SAndroid Build Coastguard Worker lea src_bakq, [srcq+wq*2] 1823*c0909341SAndroid Build Coastguard Worker mov r9mp, src_bakq 1824*c0909341SAndroid Build Coastguard Worker neg wq 1825*c0909341SAndroid Build Coastguard Worker sub dstmp, srcq 1826*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32 1827*c0909341SAndroid Build Coastguard Worker mov r4m, wq 1828*c0909341SAndroid Build Coastguard Worker%endif 1829*c0909341SAndroid Build Coastguard Worker 1830*c0909341SAndroid Build Coastguard Worker.loop_x_v_overlap: 1831*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32 1832*c0909341SAndroid Build Coastguard Worker mov r5, r5m 1833*c0909341SAndroid Build Coastguard Worker SPLATD m7, [base+pw_27_17_17_27] 1834*c0909341SAndroid Build Coastguard Worker mov seed, r3m 1835*c0909341SAndroid Build Coastguard Worker%else 1836*c0909341SAndroid Build Coastguard Worker SPLATD m7, [pw_27_17_17_27] 1837*c0909341SAndroid Build Coastguard Worker%endif 1838*c0909341SAndroid Build Coastguard Worker 1839*c0909341SAndroid Build Coastguard Worker ; we assume from the block above that bits 8-15 of r7d are zero'ed 1840*c0909341SAndroid Build 
Coastguard Worker mov r6d, seed 1841*c0909341SAndroid Build Coastguard Worker or seed, 0xeff4eff4 1842*c0909341SAndroid Build Coastguard Worker test seeb, seeh 1843*c0909341SAndroid Build Coastguard Worker setp t0b ; parity of top_seed 1844*c0909341SAndroid Build Coastguard Worker shr seed, 16 1845*c0909341SAndroid Build Coastguard Worker shl t0d, 16 1846*c0909341SAndroid Build Coastguard Worker test seeb, seeh 1847*c0909341SAndroid Build Coastguard Worker setp t0b ; parity of cur_seed 1848*c0909341SAndroid Build Coastguard Worker or r6d, 0x00010001 1849*c0909341SAndroid Build Coastguard Worker xor t0d, r6d 1850*c0909341SAndroid Build Coastguard Worker mov seed, t0d 1851*c0909341SAndroid Build Coastguard Worker ror seed, 1 ; updated (cur_seed << 16) | top_seed 1852*c0909341SAndroid Build Coastguard Worker 1853*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32 1854*c0909341SAndroid Build Coastguard Worker mov r3m, seed 1855*c0909341SAndroid Build Coastguard Worker 1856*c0909341SAndroid Build Coastguard Worker DEFINE_ARGS dst, src, scaling, offy, unused1, unused2, offx 1857*c0909341SAndroid Build Coastguard Worker 1858*c0909341SAndroid Build Coastguard Worker mov offxd, offyd 1859*c0909341SAndroid Build Coastguard Worker%else 1860*c0909341SAndroid Build Coastguard Worker DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ 1861*c0909341SAndroid Build Coastguard Worker offx, offy, see, src_bak, unused, top_offxy 1862*c0909341SAndroid Build Coastguard Worker 1863*c0909341SAndroid Build Coastguard Worker mov offyd, seed 1864*c0909341SAndroid Build Coastguard Worker mov offxd, seed 1865*c0909341SAndroid Build Coastguard Worker%endif 1866*c0909341SAndroid Build Coastguard Worker ror offyd, 8 1867*c0909341SAndroid Build Coastguard Worker ror offxd, 12 1868*c0909341SAndroid Build Coastguard Worker and offyd, 0xf000f 1869*c0909341SAndroid Build Coastguard Worker and offxd, 0xf000f 1870*c0909341SAndroid Build Coastguard Worker imul offyd, 164 
1871*c0909341SAndroid Build Coastguard Worker ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy 1872*c0909341SAndroid Build Coastguard Worker lea offyq, [offyq+offxq*2+0x10001*747+32*82] 1873*c0909341SAndroid Build Coastguard Worker 1874*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32 1875*c0909341SAndroid Build Coastguard Worker DEFINE_ARGS top_offxy, src, scaling, offxy, h, picptr, grain_lut 1876*c0909341SAndroid Build Coastguard Worker%else 1877*c0909341SAndroid Build Coastguard Worker DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ 1878*c0909341SAndroid Build Coastguard Worker h, offxy, see, src_bak, unused, top_offxy 1879*c0909341SAndroid Build Coastguard Worker%endif 1880*c0909341SAndroid Build Coastguard Worker 1881*c0909341SAndroid Build Coastguard Worker movzx top_offxyd, offxyw 1882*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32 1883*c0909341SAndroid Build Coastguard Worker mov [rsp+8*mmsize+1*gprsize], top_offxyd 1884*c0909341SAndroid Build Coastguard Worker 1885*c0909341SAndroid Build Coastguard Worker DEFINE_ARGS dst, src, scaling, offxy, h, picptr, grain_lut 1886*c0909341SAndroid Build Coastguard Worker%endif 1887*c0909341SAndroid Build Coastguard Worker shr offxyd, 16 1888*c0909341SAndroid Build Coastguard Worker 1889*c0909341SAndroid Build Coastguard Worker.loop_x_odd_v_overlap: 1890*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32 1891*c0909341SAndroid Build Coastguard Worker mov r5, r5m 1892*c0909341SAndroid Build Coastguard Worker%endif 1893*c0909341SAndroid Build Coastguard Worker SPLATD m7, [PIC_ptr(pw_27_17_17_27)] 1894*c0909341SAndroid Build Coastguard Worker mov hd, dword r7m 1895*c0909341SAndroid Build Coastguard Worker mov grain_lutq, grain_lutmp 1896*c0909341SAndroid Build Coastguard Worker.loop_y_v_overlap: 1897*c0909341SAndroid Build Coastguard Worker ; grain = grain_lut[offy+y][offx+x] 1898*c0909341SAndroid Build Coastguard Worker movu m3, [grain_lutq+offxyq*2] 1899*c0909341SAndroid Build 
Coastguard Worker%if ARCH_X86_32 1900*c0909341SAndroid Build Coastguard Worker mov r5, [rsp+8*mmsize+1*gprsize] 1901*c0909341SAndroid Build Coastguard Worker movu m2, [grain_lutq+r5*2] 1902*c0909341SAndroid Build Coastguard Worker%else 1903*c0909341SAndroid Build Coastguard Worker movu m2, [grain_lutq+top_offxyq*2] 1904*c0909341SAndroid Build Coastguard Worker%endif 1905*c0909341SAndroid Build Coastguard Worker punpckhwd m4, m2, m3 1906*c0909341SAndroid Build Coastguard Worker punpcklwd m2, m3 1907*c0909341SAndroid Build Coastguard Worker REPX {pmaddwd x, m7}, m4, m2 1908*c0909341SAndroid Build Coastguard Worker REPX {paddd x, m14}, m4, m2 1909*c0909341SAndroid Build Coastguard Worker REPX {psrad x, 5}, m4, m2 1910*c0909341SAndroid Build Coastguard Worker packssdw m2, m4 1911*c0909341SAndroid Build Coastguard Worker pminsw m2, m15 1912*c0909341SAndroid Build Coastguard Worker pmaxsw m2, m9 1913*c0909341SAndroid Build Coastguard Worker movu m4, [grain_lutq+offxyq*2+16] 1914*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32 1915*c0909341SAndroid Build Coastguard Worker movu m3, [grain_lutq+r5*2+16] 1916*c0909341SAndroid Build Coastguard Worker%else 1917*c0909341SAndroid Build Coastguard Worker movu m3, [grain_lutq+top_offxyq*2+16] 1918*c0909341SAndroid Build Coastguard Worker%endif 1919*c0909341SAndroid Build Coastguard Worker punpckhwd m5, m3, m4 1920*c0909341SAndroid Build Coastguard Worker punpcklwd m3, m4 1921*c0909341SAndroid Build Coastguard Worker REPX {pmaddwd x, m7}, m5, m3 1922*c0909341SAndroid Build Coastguard Worker REPX {paddd x, m14}, m5, m3 1923*c0909341SAndroid Build Coastguard Worker REPX {psrad x, 5}, m5, m3 1924*c0909341SAndroid Build Coastguard Worker packssdw m3, m5 1925*c0909341SAndroid Build Coastguard Worker pminsw m3, m15 1926*c0909341SAndroid Build Coastguard Worker pmaxsw m3, m9 1927*c0909341SAndroid Build Coastguard Worker 1928*c0909341SAndroid Build Coastguard Worker ; src 1929*c0909341SAndroid Build Coastguard Worker pand m0, m10, 
[srcq+ 0] ; m0-1: src as word 1930*c0909341SAndroid Build Coastguard Worker pand m1, m10, [srcq+16] ; m0-1: src as word 1931*c0909341SAndroid Build Coastguard Worker 1932*c0909341SAndroid Build Coastguard Worker ; scaling[src] 1933*c0909341SAndroid Build Coastguard Worker ; noise = round2(scaling[src] * grain, scaling_shift) 1934*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32 1935*c0909341SAndroid Build Coastguard Worker vpgatherdw m4, m0, scalingq-1, r0, r5, 8, 1, m5 1936*c0909341SAndroid Build Coastguard Worker%else 1937*c0909341SAndroid Build Coastguard Worker vpgatherdw m4, m0, scalingq-1, r11, r13, 8, 1, m5 1938*c0909341SAndroid Build Coastguard Worker%endif 1939*c0909341SAndroid Build Coastguard Worker psrlw m4, 8 1940*c0909341SAndroid Build Coastguard Worker pmullw m4, m11 1941*c0909341SAndroid Build Coastguard Worker pmulhrsw m4, m2 1942*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32 1943*c0909341SAndroid Build Coastguard Worker vpgatherdw m5, m1, scalingq-1, r0, r5, 8, 1, m2 1944*c0909341SAndroid Build Coastguard Worker%else 1945*c0909341SAndroid Build Coastguard Worker vpgatherdw m5, m1, scalingq-1, r11, r13, 8, 1, m2 1946*c0909341SAndroid Build Coastguard Worker%endif 1947*c0909341SAndroid Build Coastguard Worker psrlw m5, 8 1948*c0909341SAndroid Build Coastguard Worker pmullw m5, m11 1949*c0909341SAndroid Build Coastguard Worker pmulhrsw m5, m3 1950*c0909341SAndroid Build Coastguard Worker 1951*c0909341SAndroid Build Coastguard Worker ; dst = clip_pixel(src, noise) 1952*c0909341SAndroid Build Coastguard Worker paddw m0, m4 1953*c0909341SAndroid Build Coastguard Worker paddw m1, m5 1954*c0909341SAndroid Build Coastguard Worker pmaxsw m0, m13 1955*c0909341SAndroid Build Coastguard Worker pmaxsw m1, m13 1956*c0909341SAndroid Build Coastguard Worker pminsw m0, m12 1957*c0909341SAndroid Build Coastguard Worker pminsw m1, m12 1958*c0909341SAndroid Build Coastguard Worker movifnidn dstq, dstmp 1959*c0909341SAndroid Build Coastguard Worker 
mova [dstq+srcq+ 0], m0 1960*c0909341SAndroid Build Coastguard Worker mova [dstq+srcq+16], m1 1961*c0909341SAndroid Build Coastguard Worker 1962*c0909341SAndroid Build Coastguard Worker add srcq, r2mp 1963*c0909341SAndroid Build Coastguard Worker add grain_lutq, 82*2 1964*c0909341SAndroid Build Coastguard Worker dec hw 1965*c0909341SAndroid Build Coastguard Worker jz .end_y_v_overlap 1966*c0909341SAndroid Build Coastguard Worker ; 2 lines get vertical overlap, then fall back to non-overlap code for 1967*c0909341SAndroid Build Coastguard Worker ; remaining (up to) 30 lines 1968*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32 1969*c0909341SAndroid Build Coastguard Worker mov r5, r5m 1970*c0909341SAndroid Build Coastguard Worker%endif 1971*c0909341SAndroid Build Coastguard Worker SPLATD m7, [PIC_ptr(pw_27_17_17_27)+4] 1972*c0909341SAndroid Build Coastguard Worker xor hd, 0x10000 1973*c0909341SAndroid Build Coastguard Worker test hd, 0x10000 1974*c0909341SAndroid Build Coastguard Worker jnz .loop_y_v_overlap 1975*c0909341SAndroid Build Coastguard Worker jmp .loop_y 1976*c0909341SAndroid Build Coastguard Worker 1977*c0909341SAndroid Build Coastguard Worker.end_y_v_overlap: 1978*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32 1979*c0909341SAndroid Build Coastguard Worker add r4mp, 16 1980*c0909341SAndroid Build Coastguard Worker%else 1981*c0909341SAndroid Build Coastguard Worker add wq, 16 1982*c0909341SAndroid Build Coastguard Worker%endif 1983*c0909341SAndroid Build Coastguard Worker jge .end_hv 1984*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32 1985*c0909341SAndroid Build Coastguard Worker mov srcq, r9mp 1986*c0909341SAndroid Build Coastguard Worker add srcq, r4mp 1987*c0909341SAndroid Build Coastguard Worker add srcq, r4mp 1988*c0909341SAndroid Build Coastguard Worker%else 1989*c0909341SAndroid Build Coastguard Worker mov src_bakq, r9mp 1990*c0909341SAndroid Build Coastguard Worker lea srcq, [src_bakq+wq*2] 1991*c0909341SAndroid Build 
Coastguard Worker%endif 1992*c0909341SAndroid Build Coastguard Worker btc dword r8m, 2 1993*c0909341SAndroid Build Coastguard Worker jc .next_blk_v 1994*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32 1995*c0909341SAndroid Build Coastguard Worker add dword [rsp+8*mmsize+1*gprsize], 16 1996*c0909341SAndroid Build Coastguard Worker%else 1997*c0909341SAndroid Build Coastguard Worker add top_offxyd, 16 1998*c0909341SAndroid Build Coastguard Worker%endif 1999*c0909341SAndroid Build Coastguard Worker add offxyd, 16 2000*c0909341SAndroid Build Coastguard Worker jmp .loop_x_odd_v_overlap 2001*c0909341SAndroid Build Coastguard Worker 2002*c0909341SAndroid Build Coastguard Worker.next_blk_v: 2003*c0909341SAndroid Build Coastguard Worker ; since fg_dataq.overlap is guaranteed to be set, we never jump 2004*c0909341SAndroid Build Coastguard Worker ; back to .loop_x_v_overlap, and instead always fall-through to 2005*c0909341SAndroid Build Coastguard Worker ; h+v overlap 2006*c0909341SAndroid Build Coastguard Worker 2007*c0909341SAndroid Build Coastguard Worker.loop_x_hv_overlap: 2008*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32 2009*c0909341SAndroid Build Coastguard Worker DEFINE_ARGS dst, src, scaling, see, w, picptr, src_bak 2010*c0909341SAndroid Build Coastguard Worker 2011*c0909341SAndroid Build Coastguard Worker mov r0, [rsp+8*mmsize+1*gprsize] 2012*c0909341SAndroid Build Coastguard Worker add r3, 16 2013*c0909341SAndroid Build Coastguard Worker add r0, 16 2014*c0909341SAndroid Build Coastguard Worker mov [rsp+8*mmsize+0*gprsize], r3 ; left_offxy 2015*c0909341SAndroid Build Coastguard Worker mov [rsp+8*mmsize+2*gprsize], r0 ; topleft_offxy 2016*c0909341SAndroid Build Coastguard Worker 2017*c0909341SAndroid Build Coastguard Worker mov seed, r3m 2018*c0909341SAndroid Build Coastguard Worker xor r0, r0 2019*c0909341SAndroid Build Coastguard Worker%else 2020*c0909341SAndroid Build Coastguard Worker ; we assume from the block above that bits 8-15 of r7d are 
zero'ed 2021*c0909341SAndroid Build Coastguard Worker%endif 2022*c0909341SAndroid Build Coastguard Worker mov r6d, seed 2023*c0909341SAndroid Build Coastguard Worker or seed, 0xeff4eff4 2024*c0909341SAndroid Build Coastguard Worker test seeb, seeh 2025*c0909341SAndroid Build Coastguard Worker setp t0b ; parity of top_seed 2026*c0909341SAndroid Build Coastguard Worker shr seed, 16 2027*c0909341SAndroid Build Coastguard Worker shl t0d, 16 2028*c0909341SAndroid Build Coastguard Worker test seeb, seeh 2029*c0909341SAndroid Build Coastguard Worker setp t0b ; parity of cur_seed 2030*c0909341SAndroid Build Coastguard Worker or r6d, 0x00010001 2031*c0909341SAndroid Build Coastguard Worker xor t0d, r6d 2032*c0909341SAndroid Build Coastguard Worker mov seed, t0d 2033*c0909341SAndroid Build Coastguard Worker ror seed, 1 ; updated (cur_seed << 16) | top_seed 2034*c0909341SAndroid Build Coastguard Worker 2035*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32 2036*c0909341SAndroid Build Coastguard Worker mov r3m, seed 2037*c0909341SAndroid Build Coastguard Worker 2038*c0909341SAndroid Build Coastguard Worker DEFINE_ARGS dst, src, scaling, offy, w, picptr, offx 2039*c0909341SAndroid Build Coastguard Worker 2040*c0909341SAndroid Build Coastguard Worker mov offxd, offyd 2041*c0909341SAndroid Build Coastguard Worker%else 2042*c0909341SAndroid Build Coastguard Worker DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ 2043*c0909341SAndroid Build Coastguard Worker offx, offy, see, src_bak, left_offxy, top_offxy, topleft_offxy 2044*c0909341SAndroid Build Coastguard Worker 2045*c0909341SAndroid Build Coastguard Worker lea topleft_offxyq, [top_offxyq+16] 2046*c0909341SAndroid Build Coastguard Worker lea left_offxyq, [offyq+16] 2047*c0909341SAndroid Build Coastguard Worker mov offyd, seed 2048*c0909341SAndroid Build Coastguard Worker mov offxd, seed 2049*c0909341SAndroid Build Coastguard Worker%endif 2050*c0909341SAndroid Build Coastguard Worker ror offyd, 8 
2051*c0909341SAndroid Build Coastguard Worker ror offxd, 12 2052*c0909341SAndroid Build Coastguard Worker and offyd, 0xf000f 2053*c0909341SAndroid Build Coastguard Worker and offxd, 0xf000f 2054*c0909341SAndroid Build Coastguard Worker imul offyd, 164 2055*c0909341SAndroid Build Coastguard Worker ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy 2056*c0909341SAndroid Build Coastguard Worker lea offyq, [offyq+offxq*2+0x10001*747+32*82] 2057*c0909341SAndroid Build Coastguard Worker 2058*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32 2059*c0909341SAndroid Build Coastguard Worker DEFINE_ARGS top_offxy, src, scaling, offxy, w, picptr, grain_lut 2060*c0909341SAndroid Build Coastguard Worker%else 2061*c0909341SAndroid Build Coastguard Worker DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ 2062*c0909341SAndroid Build Coastguard Worker h, offxy, see, src_bak, left_offxy, top_offxy, topleft_offxy 2063*c0909341SAndroid Build Coastguard Worker%endif 2064*c0909341SAndroid Build Coastguard Worker 2065*c0909341SAndroid Build Coastguard Worker movzx top_offxyd, offxyw 2066*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32 2067*c0909341SAndroid Build Coastguard Worker mov [rsp+8*mmsize+1*gprsize], top_offxyd 2068*c0909341SAndroid Build Coastguard Worker 2069*c0909341SAndroid Build Coastguard Worker DEFINE_ARGS dst, src, scaling, offxy, h, picptr, grain_lut 2070*c0909341SAndroid Build Coastguard Worker%endif 2071*c0909341SAndroid Build Coastguard Worker shr offxyd, 16 2072*c0909341SAndroid Build Coastguard Worker 2073*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32 2074*c0909341SAndroid Build Coastguard Worker mov r5, r5m 2075*c0909341SAndroid Build Coastguard Worker%endif 2076*c0909341SAndroid Build Coastguard Worker SPLATD m7, [PIC_ptr(pw_27_17_17_27)] 2077*c0909341SAndroid Build Coastguard Worker 2078*c0909341SAndroid Build Coastguard Worker movzx hd, word r7m 2079*c0909341SAndroid Build Coastguard Worker mov grain_lutq, grain_lutmp 
2080*c0909341SAndroid Build Coastguard Worker.loop_y_hv_overlap: 2081*c0909341SAndroid Build Coastguard Worker ; grain = grain_lut[offy+y][offx+x] 2082*c0909341SAndroid Build Coastguard Worker movu m2, [grain_lutq+offxyq*2] 2083*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32 2084*c0909341SAndroid Build Coastguard Worker mov r0, [rsp+8*mmsize+1*gprsize] ; top_offxy 2085*c0909341SAndroid Build Coastguard Worker mov r5, [rsp+8*mmsize+0*gprsize] ; left_offxy 2086*c0909341SAndroid Build Coastguard Worker movu m4, [grain_lutq+r0*2] 2087*c0909341SAndroid Build Coastguard Worker movd m5, [grain_lutq+r5*2] 2088*c0909341SAndroid Build Coastguard Worker mov r5, [rsp+8*mmsize+2*gprsize] ; topleft_offxy 2089*c0909341SAndroid Build Coastguard Worker movd m3, [grain_lutq+r5*2] 2090*c0909341SAndroid Build Coastguard Worker%else 2091*c0909341SAndroid Build Coastguard Worker movu m4, [grain_lutq+top_offxyq*2] 2092*c0909341SAndroid Build Coastguard Worker movd m5, [grain_lutq+left_offxyq*2] 2093*c0909341SAndroid Build Coastguard Worker movd m3, [grain_lutq+topleft_offxyq*2] 2094*c0909341SAndroid Build Coastguard Worker%endif 2095*c0909341SAndroid Build Coastguard Worker ; do h interpolation first (so top | top/left -> top, left | cur -> cur) 2096*c0909341SAndroid Build Coastguard Worker punpcklwd m5, m2 2097*c0909341SAndroid Build Coastguard Worker punpcklwd m3, m4 2098*c0909341SAndroid Build Coastguard Worker REPX {pmaddwd x, m6}, m5, m3 2099*c0909341SAndroid Build Coastguard Worker REPX {paddd x, m14}, m5, m3 2100*c0909341SAndroid Build Coastguard Worker REPX {psrad x, 5}, m5, m3 2101*c0909341SAndroid Build Coastguard Worker packssdw m5, m3 2102*c0909341SAndroid Build Coastguard Worker pminsw m5, m15 2103*c0909341SAndroid Build Coastguard Worker pmaxsw m5, m9 2104*c0909341SAndroid Build Coastguard Worker shufps m3, m5, m2, q3210 2105*c0909341SAndroid Build Coastguard Worker shufps m5, m4, q3232 2106*c0909341SAndroid Build Coastguard Worker ; followed by v interpolation 
(top | cur -> cur) 2107*c0909341SAndroid Build Coastguard Worker movu m0, [grain_lutq+offxyq*2+16] 2108*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32 2109*c0909341SAndroid Build Coastguard Worker movu m1, [grain_lutq+r0*2+16] 2110*c0909341SAndroid Build Coastguard Worker%else 2111*c0909341SAndroid Build Coastguard Worker movu m1, [grain_lutq+top_offxyq*2+16] 2112*c0909341SAndroid Build Coastguard Worker%endif 2113*c0909341SAndroid Build Coastguard Worker punpcklwd m2, m5, m3 2114*c0909341SAndroid Build Coastguard Worker punpckhwd m5, m3 2115*c0909341SAndroid Build Coastguard Worker punpcklwd m3, m1, m0 2116*c0909341SAndroid Build Coastguard Worker punpckhwd m1, m0 2117*c0909341SAndroid Build Coastguard Worker REPX {pmaddwd x, m7}, m2, m5, m3, m1 2118*c0909341SAndroid Build Coastguard Worker REPX {paddd x, m14}, m2, m5, m3, m1 2119*c0909341SAndroid Build Coastguard Worker REPX {psrad x, 5}, m2, m5, m3, m1 2120*c0909341SAndroid Build Coastguard Worker packssdw m2, m5 2121*c0909341SAndroid Build Coastguard Worker packssdw m3, m1 2122*c0909341SAndroid Build Coastguard Worker REPX {pminsw x, m15}, m2, m3 2123*c0909341SAndroid Build Coastguard Worker REPX {pmaxsw x, m9}, m2, m3 2124*c0909341SAndroid Build Coastguard Worker 2125*c0909341SAndroid Build Coastguard Worker ; src 2126*c0909341SAndroid Build Coastguard Worker pand m0, m10, [srcq+ 0] 2127*c0909341SAndroid Build Coastguard Worker pand m1, m10, [srcq+16] ; m0-1: src as word 2128*c0909341SAndroid Build Coastguard Worker 2129*c0909341SAndroid Build Coastguard Worker ; scaling[src] 2130*c0909341SAndroid Build Coastguard Worker ; noise = round2(scaling[src] * grain, scaling_shift) 2131*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32 2132*c0909341SAndroid Build Coastguard Worker vpgatherdw m4, m0, scalingq-1, r0, r5, 8, 1, m5 2133*c0909341SAndroid Build Coastguard Worker%else 2134*c0909341SAndroid Build Coastguard Worker vpgatherdw m4, m0, scalingq-1, r14, r10, 8, 1, m5 2135*c0909341SAndroid Build 
Coastguard Worker%endif 2136*c0909341SAndroid Build Coastguard Worker psrlw m4, 8 2137*c0909341SAndroid Build Coastguard Worker pmullw m4, m11 2138*c0909341SAndroid Build Coastguard Worker pmulhrsw m2, m4 2139*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32 2140*c0909341SAndroid Build Coastguard Worker vpgatherdw m5, m1, scalingq-1, r0, r5, 8, 1, m4 2141*c0909341SAndroid Build Coastguard Worker%else 2142*c0909341SAndroid Build Coastguard Worker vpgatherdw m5, m1, scalingq-1, r14, r10, 8, 1, m4 2143*c0909341SAndroid Build Coastguard Worker%endif 2144*c0909341SAndroid Build Coastguard Worker psrlw m5, 8 2145*c0909341SAndroid Build Coastguard Worker pmullw m5, m11 2146*c0909341SAndroid Build Coastguard Worker pmulhrsw m3, m5 2147*c0909341SAndroid Build Coastguard Worker 2148*c0909341SAndroid Build Coastguard Worker ; dst = clip_pixel(src, noise) 2149*c0909341SAndroid Build Coastguard Worker paddw m0, m2 2150*c0909341SAndroid Build Coastguard Worker paddw m1, m3 2151*c0909341SAndroid Build Coastguard Worker pmaxsw m0, m13 2152*c0909341SAndroid Build Coastguard Worker pmaxsw m1, m13 2153*c0909341SAndroid Build Coastguard Worker pminsw m0, m12 2154*c0909341SAndroid Build Coastguard Worker pminsw m1, m12 2155*c0909341SAndroid Build Coastguard Worker movifnidn dstq, dstmp 2156*c0909341SAndroid Build Coastguard Worker mova [dstq+srcq+ 0], m0 2157*c0909341SAndroid Build Coastguard Worker mova [dstq+srcq+16], m1 2158*c0909341SAndroid Build Coastguard Worker 2159*c0909341SAndroid Build Coastguard Worker add srcq, r2mp 2160*c0909341SAndroid Build Coastguard Worker add grain_lutq, 82*2 2161*c0909341SAndroid Build Coastguard Worker dec hw 2162*c0909341SAndroid Build Coastguard Worker jz .end_y_hv_overlap 2163*c0909341SAndroid Build Coastguard Worker ; 2 lines get vertical overlap, then fall back to non-overlap code for 2164*c0909341SAndroid Build Coastguard Worker ; remaining (up to) 30 lines 2165*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32 
2166*c0909341SAndroid Build Coastguard Worker mov r5, r5m 2167*c0909341SAndroid Build Coastguard Worker%endif 2168*c0909341SAndroid Build Coastguard Worker SPLATD m7, [PIC_ptr(pw_27_17_17_27)+4] 2169*c0909341SAndroid Build Coastguard Worker xor hd, 0x10000 2170*c0909341SAndroid Build Coastguard Worker test hd, 0x10000 2171*c0909341SAndroid Build Coastguard Worker jnz .loop_y_hv_overlap 2172*c0909341SAndroid Build Coastguard Worker jmp .loop_y_h_overlap 2173*c0909341SAndroid Build Coastguard Worker 2174*c0909341SAndroid Build Coastguard Worker.end_y_hv_overlap: 2175*c0909341SAndroid Build Coastguard Worker or dword r8m, 4 2176*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32 2177*c0909341SAndroid Build Coastguard Worker add r4mp, 16 2178*c0909341SAndroid Build Coastguard Worker%else 2179*c0909341SAndroid Build Coastguard Worker add wq, 16 2180*c0909341SAndroid Build Coastguard Worker%endif 2181*c0909341SAndroid Build Coastguard Worker jge .end_hv 2182*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32 2183*c0909341SAndroid Build Coastguard Worker mov r5, r5m 2184*c0909341SAndroid Build Coastguard Worker add offxyd, 16 2185*c0909341SAndroid Build Coastguard Worker add dword [rsp+8*mmsize+1*gprsize], 16 ; top_offxy += 16 2186*c0909341SAndroid Build Coastguard Worker mov srcq, r9mp 2187*c0909341SAndroid Build Coastguard Worker add srcq, r4mp 2188*c0909341SAndroid Build Coastguard Worker add srcq, r4mp 2189*c0909341SAndroid Build Coastguard Worker%else 2190*c0909341SAndroid Build Coastguard Worker add offxyd, 16 2191*c0909341SAndroid Build Coastguard Worker add top_offxyd, 16 2192*c0909341SAndroid Build Coastguard Worker mov src_bakq, r9mp 2193*c0909341SAndroid Build Coastguard Worker lea srcq, [src_bakq+wq*2] 2194*c0909341SAndroid Build Coastguard Worker%endif 2195*c0909341SAndroid Build Coastguard Worker jmp .loop_x_odd_v_overlap 2196*c0909341SAndroid Build Coastguard Worker 2197*c0909341SAndroid Build Coastguard Worker.end_hv: 2198*c0909341SAndroid Build 
Coastguard Worker RET 2199*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32 2200*c0909341SAndroid Build Coastguard Worker DECLARE_ARG 0, 1, 2, 3, 4, 5, 6, 7, 8, 9 2201*c0909341SAndroid Build Coastguard Worker%endif 2202*c0909341SAndroid Build Coastguard Worker 2203*c0909341SAndroid Build Coastguard Worker%macro FGUV_FN 3 ; name, ss_hor, ss_ver 2204*c0909341SAndroid Build Coastguard WorkerINIT_XMM ssse3 2205*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32 2206*c0909341SAndroid Build Coastguard Worker%if STACK_ALIGNMENT < mmsize 2207*c0909341SAndroid Build Coastguard Workercglobal fguv_32x32xn_i%1_16bpc, 0, 7, 8, 0-(8 * mmsize + 16 * gprsize), \ 2208*c0909341SAndroid Build Coastguard Worker tmp, src, scaling, h, fg_data, picptr, unused 2209*c0909341SAndroid Build Coastguard Worker mov r0, r0m 2210*c0909341SAndroid Build Coastguard Worker mov r1, r1m 2211*c0909341SAndroid Build Coastguard Worker mov r2, r2m 2212*c0909341SAndroid Build Coastguard Worker mov r4, r3m 2213*c0909341SAndroid Build Coastguard Worker mov r3, r4m 2214*c0909341SAndroid Build Coastguard Worker mov r5, r5m 2215*c0909341SAndroid Build Coastguard Worker%define r0m [rsp+8*mmsize+ 3*gprsize] 2216*c0909341SAndroid Build Coastguard Worker%define r1m [rsp+8*mmsize+ 4*gprsize] 2217*c0909341SAndroid Build Coastguard Worker%define r2m [rsp+8*mmsize+ 5*gprsize] 2218*c0909341SAndroid Build Coastguard Worker%define r3m [rsp+8*mmsize+ 6*gprsize] 2219*c0909341SAndroid Build Coastguard Worker%define r4m [rsp+8*mmsize+ 7*gprsize] 2220*c0909341SAndroid Build Coastguard Worker%define r5m [rsp+8*mmsize+ 8*gprsize] 2221*c0909341SAndroid Build Coastguard Worker mov r0m, r0 2222*c0909341SAndroid Build Coastguard Worker mov r2m, r2 2223*c0909341SAndroid Build Coastguard Worker mov r4m, r3 2224*c0909341SAndroid Build Coastguard Worker mov r5m, r5 2225*c0909341SAndroid Build Coastguard Worker 2226*c0909341SAndroid Build Coastguard Worker mov r0, r6m 2227*c0909341SAndroid Build Coastguard Worker mov r2, r7m 
2228*c0909341SAndroid Build Coastguard Worker mov r3, r8m 2229*c0909341SAndroid Build Coastguard Worker mov r5, r9m 2230*c0909341SAndroid Build Coastguard Worker%define r6m [rsp+8*mmsize+ 9*gprsize] 2231*c0909341SAndroid Build Coastguard Worker%define r7m [rsp+8*mmsize+10*gprsize] 2232*c0909341SAndroid Build Coastguard Worker%define r8m [rsp+8*mmsize+11*gprsize] 2233*c0909341SAndroid Build Coastguard Worker%define r9m [rsp+8*mmsize+12*gprsize] 2234*c0909341SAndroid Build Coastguard Worker mov r6m, r0 2235*c0909341SAndroid Build Coastguard Worker mov r7m, r2 2236*c0909341SAndroid Build Coastguard Worker mov r8m, r3 2237*c0909341SAndroid Build Coastguard Worker mov r9m, r5 2238*c0909341SAndroid Build Coastguard Worker 2239*c0909341SAndroid Build Coastguard Worker mov r2, r10m 2240*c0909341SAndroid Build Coastguard Worker mov r3, r11m 2241*c0909341SAndroid Build Coastguard Worker mov r5, r12m 2242*c0909341SAndroid Build Coastguard Worker mov r0, r13m 2243*c0909341SAndroid Build Coastguard Worker%define r10m [rsp+8*mmsize+13*gprsize] 2244*c0909341SAndroid Build Coastguard Worker%define r11m [rsp+8*mmsize+14*gprsize] 2245*c0909341SAndroid Build Coastguard Worker%define r12m [rsp+8*mmsize+15*gprsize] 2246*c0909341SAndroid Build Coastguard Worker mov r10m, r2 2247*c0909341SAndroid Build Coastguard Worker mov r11m, r3 2248*c0909341SAndroid Build Coastguard Worker mov r12m, r5 2249*c0909341SAndroid Build Coastguard Worker 2250*c0909341SAndroid Build Coastguard Worker SPLATW m2, r13m 2251*c0909341SAndroid Build Coastguard Worker%else 2252*c0909341SAndroid Build Coastguard Workercglobal fguv_32x32xn_i%1_16bpc, 0, 7, 8, 8 * mmsize + (4) * gprsize, \ 2253*c0909341SAndroid Build Coastguard Worker tmp, src, scaling, h, fg_data, picptr, unused 2254*c0909341SAndroid Build Coastguard Worker mov srcq, srcm 2255*c0909341SAndroid Build Coastguard Worker mov fg_dataq, r3m 2256*c0909341SAndroid Build Coastguard Worker%endif 2257*c0909341SAndroid Build Coastguard Worker LEA r5, $$ 
2258*c0909341SAndroid Build Coastguard Worker%define base r5-$$ 2259*c0909341SAndroid Build Coastguard Worker 2260*c0909341SAndroid Build Coastguard Worker DECLARE_REG_TMP 0, 2, 3 2261*c0909341SAndroid Build Coastguard Worker%else 2262*c0909341SAndroid Build Coastguard Workercglobal fguv_32x32xn_i%1_16bpc, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \ 2263*c0909341SAndroid Build Coastguard Worker grain_lut, h, sby, luma, lstride, uv_pl, is_id 2264*c0909341SAndroid Build Coastguard Worker%define base r8-pb_mask 2265*c0909341SAndroid Build Coastguard Worker lea r8, [pb_mask] 2266*c0909341SAndroid Build Coastguard Worker 2267*c0909341SAndroid Build Coastguard Worker DECLARE_REG_TMP 9, 10, 11 2268*c0909341SAndroid Build Coastguard Worker%endif 2269*c0909341SAndroid Build Coastguard Worker mov r6d, [fg_dataq+FGData.scaling_shift] 2270*c0909341SAndroid Build Coastguard Worker SPLATW m3, [base+mul_bits+r6*2-14] 2271*c0909341SAndroid Build Coastguard Worker mov r6d, [fg_dataq+FGData.clip_to_restricted_range] 2272*c0909341SAndroid Build Coastguard Worker%if STACK_ALIGNMENT >= mmsize 2273*c0909341SAndroid Build Coastguard Worker mov t0d, r13m ; bdmax 2274*c0909341SAndroid Build Coastguard Worker%endif 2275*c0909341SAndroid Build Coastguard Worker sar t0d, 11 ; is_12bpc 2276*c0909341SAndroid Build Coastguard Worker inc t0d 2277*c0909341SAndroid Build Coastguard Worker mov t1d, r6d 2278*c0909341SAndroid Build Coastguard Worker imul t1d, t0d 2279*c0909341SAndroid Build Coastguard Worker dec t0d 2280*c0909341SAndroid Build Coastguard Worker SPLATW m5, [base+min+t1*2] 2281*c0909341SAndroid Build Coastguard Worker lea t1d, [t0d*3] 2282*c0909341SAndroid Build Coastguard Worker mov t2d, r12m 2283*c0909341SAndroid Build Coastguard Worker inc t2d 2284*c0909341SAndroid Build Coastguard Worker imul r6d, t2d 2285*c0909341SAndroid Build Coastguard Worker add t1d, r6d 2286*c0909341SAndroid Build Coastguard Worker SPLATW m4, [base+max+t1*2] 2287*c0909341SAndroid Build Coastguard 
Worker%if STACK_ALIGNMENT >= mmsize 2288*c0909341SAndroid Build Coastguard Worker SPLATW m2, r13m 2289*c0909341SAndroid Build Coastguard Worker%endif 2290*c0909341SAndroid Build Coastguard Worker 2291*c0909341SAndroid Build Coastguard Worker SCRATCH 2, 10, 2 2292*c0909341SAndroid Build Coastguard Worker SCRATCH 3, 11, 3 2293*c0909341SAndroid Build Coastguard Worker SCRATCH 4, 12, 4 2294*c0909341SAndroid Build Coastguard Worker SCRATCH 5, 13, 5 2295*c0909341SAndroid Build Coastguard Worker 2296*c0909341SAndroid Build Coastguard Worker%define mzero m7 2297*c0909341SAndroid Build Coastguard Worker 2298*c0909341SAndroid Build Coastguard Worker%if %3 2299*c0909341SAndroid Build Coastguard Worker SPLATD m2, [base+pw_23_22] 2300*c0909341SAndroid Build Coastguard Worker%endif 2301*c0909341SAndroid Build Coastguard Worker 2302*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32 2303*c0909341SAndroid Build Coastguard Worker mov scalingq, r5m 2304*c0909341SAndroid Build Coastguard Worker mov r5m, r5 2305*c0909341SAndroid Build Coastguard Worker%else 2306*c0909341SAndroid Build Coastguard Worker mov r13mp, strideq 2307*c0909341SAndroid Build Coastguard Worker%endif 2308*c0909341SAndroid Build Coastguard Worker 2309*c0909341SAndroid Build Coastguard Worker pcmpeqw m0, m0 2310*c0909341SAndroid Build Coastguard Worker psraw m1, m10, 1 2311*c0909341SAndroid Build Coastguard Worker pxor m0, m1 2312*c0909341SAndroid Build Coastguard Worker 2313*c0909341SAndroid Build Coastguard Worker SCRATCH 0, 8, 0 2314*c0909341SAndroid Build Coastguard Worker SCRATCH 1, 9, 1 2315*c0909341SAndroid Build Coastguard Worker 2316*c0909341SAndroid Build Coastguard Worker cmp byte [fg_dataq+FGData.chroma_scaling_from_luma], 0 2317*c0909341SAndroid Build Coastguard Worker jne .csfl 2318*c0909341SAndroid Build Coastguard Worker 2319*c0909341SAndroid Build Coastguard Worker%macro %%FGUV_32x32xN_LOOP 3 ; not-csfl, ss_h, ss_v 2320*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32 
2321*c0909341SAndroid Build Coastguard Worker DEFINE_ARGS dst, src, scaling, sby, fg_data, picptr, overlap 2322*c0909341SAndroid Build Coastguard Worker 2323*c0909341SAndroid Build Coastguard Worker DECLARE_REG_TMP 0 2324*c0909341SAndroid Build Coastguard Worker%else 2325*c0909341SAndroid Build Coastguard Worker DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, unused, sby, see, overlap 2326*c0909341SAndroid Build Coastguard Worker 2327*c0909341SAndroid Build Coastguard Worker DECLARE_REG_TMP 9 2328*c0909341SAndroid Build Coastguard Worker%endif 2329*c0909341SAndroid Build Coastguard Worker 2330*c0909341SAndroid Build Coastguard Worker%if %1 2331*c0909341SAndroid Build Coastguard Worker mov r6d, r11m 2332*c0909341SAndroid Build Coastguard Worker SPLATW m0, [fg_dataq+FGData.uv_mult+r6*4] 2333*c0909341SAndroid Build Coastguard Worker SPLATW m1, [fg_dataq+FGData.uv_luma_mult+r6*4] 2334*c0909341SAndroid Build Coastguard Worker punpcklwd m6, m1, m0 2335*c0909341SAndroid Build Coastguard Worker SPLATW m5, [fg_dataq+FGData.uv_offset+r6*4] 2336*c0909341SAndroid Build Coastguard Worker SPLATD m7, [base+pw_4+t0*4] 2337*c0909341SAndroid Build Coastguard Worker pmullw m5, m7 2338*c0909341SAndroid Build Coastguard Worker%else 2339*c0909341SAndroid Build Coastguard Worker SPLATD m6, [base+pd_16] 2340*c0909341SAndroid Build Coastguard Worker%if %2 2341*c0909341SAndroid Build Coastguard Worker mova m5, [base+pw_23_22] 2342*c0909341SAndroid Build Coastguard Worker%else 2343*c0909341SAndroid Build Coastguard Worker mova m5, [base+pw_27_17_17_27] 2344*c0909341SAndroid Build Coastguard Worker%endif 2345*c0909341SAndroid Build Coastguard Worker%endif 2346*c0909341SAndroid Build Coastguard Worker 2347*c0909341SAndroid Build Coastguard Worker SCRATCH 6, 14, 6 2348*c0909341SAndroid Build Coastguard Worker SCRATCH 5, 15, 7 2349*c0909341SAndroid Build Coastguard Worker 2350*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32 2351*c0909341SAndroid Build Coastguard Worker 
DECLARE_REG_TMP 0 2352*c0909341SAndroid Build Coastguard Worker%else 2353*c0909341SAndroid Build Coastguard Worker DECLARE_REG_TMP 7 2354*c0909341SAndroid Build Coastguard Worker%endif 2355*c0909341SAndroid Build Coastguard Worker 2356*c0909341SAndroid Build Coastguard Worker mov sbyd, r8m 2357*c0909341SAndroid Build Coastguard Worker mov t0d, [fg_dataq+FGData.overlap_flag] 2358*c0909341SAndroid Build Coastguard Worker test t0d, t0d 2359*c0909341SAndroid Build Coastguard Worker jz %%no_vertical_overlap 2360*c0909341SAndroid Build Coastguard Worker test sbyd, sbyd 2361*c0909341SAndroid Build Coastguard Worker jnz %%vertical_overlap 2362*c0909341SAndroid Build Coastguard Worker 2363*c0909341SAndroid Build Coastguard Worker%%no_vertical_overlap: 2364*c0909341SAndroid Build Coastguard Worker mov r8m, t0d 2365*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32 2366*c0909341SAndroid Build Coastguard Worker DEFINE_ARGS dst, src, scaling, see, fg_data, picptr, overlap 2367*c0909341SAndroid Build Coastguard Worker imul seed, (173 << 24) | 37 2368*c0909341SAndroid Build Coastguard Worker%else 2369*c0909341SAndroid Build Coastguard Worker imul seed, sbyd, (173 << 24) | 37 2370*c0909341SAndroid Build Coastguard Worker%endif 2371*c0909341SAndroid Build Coastguard Worker add seed, (105 << 24) | 178 2372*c0909341SAndroid Build Coastguard Worker rol seed, 8 2373*c0909341SAndroid Build Coastguard Worker movzx seed, seew 2374*c0909341SAndroid Build Coastguard Worker xor seed, [fg_dataq+FGData.seed] 2375*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32 2376*c0909341SAndroid Build Coastguard Worker mov r3m, seed 2377*c0909341SAndroid Build Coastguard Worker 2378*c0909341SAndroid Build Coastguard Worker DEFINE_ARGS dst, src, scaling, see, w, picptr, luma 2379*c0909341SAndroid Build Coastguard Worker 2380*c0909341SAndroid Build Coastguard Worker mov dstq, r0mp 2381*c0909341SAndroid Build Coastguard Worker mov lumaq, r9mp 2382*c0909341SAndroid Build Coastguard Worker mov wq, 
r4m 2383*c0909341SAndroid Build Coastguard Worker lea r3, [srcq+wq*2] 2384*c0909341SAndroid Build Coastguard Worker mov r1mp, r3 2385*c0909341SAndroid Build Coastguard Worker lea r3, [dstq+wq*2] 2386*c0909341SAndroid Build Coastguard Worker mov r11mp, r3 2387*c0909341SAndroid Build Coastguard Worker lea r3, [lumaq+wq*(2<<%2)] 2388*c0909341SAndroid Build Coastguard Worker mov r12mp, r3 2389*c0909341SAndroid Build Coastguard Worker%if %3 2390*c0909341SAndroid Build Coastguard Worker shl r10mp, 1 2391*c0909341SAndroid Build Coastguard Worker%endif 2392*c0909341SAndroid Build Coastguard Worker%else 2393*c0909341SAndroid Build Coastguard Worker DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ 2394*c0909341SAndroid Build Coastguard Worker unused2, unused3, see, unused4, unused5, unused6, luma, lstride 2395*c0909341SAndroid Build Coastguard Worker 2396*c0909341SAndroid Build Coastguard Worker mov lstrideq, r10mp 2397*c0909341SAndroid Build Coastguard Worker%if %3 2398*c0909341SAndroid Build Coastguard Worker add lstrideq, lstrideq 2399*c0909341SAndroid Build Coastguard Worker%endif 2400*c0909341SAndroid Build Coastguard Worker mov lumaq, r9mp 2401*c0909341SAndroid Build Coastguard Worker lea r10, [srcq+wq*2] 2402*c0909341SAndroid Build Coastguard Worker lea r11, [dstq+wq*2] 2403*c0909341SAndroid Build Coastguard Worker lea r12, [lumaq+wq*(2<<%2)] 2404*c0909341SAndroid Build Coastguard Worker mov r10mp, r10 2405*c0909341SAndroid Build Coastguard Worker mov r11mp, r11 2406*c0909341SAndroid Build Coastguard Worker mov r12mp, r12 2407*c0909341SAndroid Build Coastguard Worker%endif 2408*c0909341SAndroid Build Coastguard Worker neg wq 2409*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32 2410*c0909341SAndroid Build Coastguard Worker mov r4mp, wq 2411*c0909341SAndroid Build Coastguard Worker%endif 2412*c0909341SAndroid Build Coastguard Worker 2413*c0909341SAndroid Build Coastguard Worker%%loop_x: 2414*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32 
2415*c0909341SAndroid Build Coastguard Worker mov seed, r3m 2416*c0909341SAndroid Build Coastguard Worker%endif 2417*c0909341SAndroid Build Coastguard Worker 2418*c0909341SAndroid Build Coastguard Worker mov r6d, seed 2419*c0909341SAndroid Build Coastguard Worker or seed, 0xEFF4 2420*c0909341SAndroid Build Coastguard Worker shr r6d, 1 2421*c0909341SAndroid Build Coastguard Worker test seeb, seeh 2422*c0909341SAndroid Build Coastguard Worker lea seed, [r6+0x8000] 2423*c0909341SAndroid Build Coastguard Worker cmovp seed, r6d ; updated seed 2424*c0909341SAndroid Build Coastguard Worker 2425*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32 2426*c0909341SAndroid Build Coastguard Worker mov r3m, seed 2427*c0909341SAndroid Build Coastguard Worker 2428*c0909341SAndroid Build Coastguard Worker DEFINE_ARGS dst, src, scaling, offy, w, picptr, offx 2429*c0909341SAndroid Build Coastguard Worker 2430*c0909341SAndroid Build Coastguard Worker mov offxd, offyd 2431*c0909341SAndroid Build Coastguard Worker%else 2432*c0909341SAndroid Build Coastguard Worker DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ 2433*c0909341SAndroid Build Coastguard Worker offx, offy, see, unused1, unused2, unused3, luma, lstride 2434*c0909341SAndroid Build Coastguard Worker 2435*c0909341SAndroid Build Coastguard Worker mov offxd, seed 2436*c0909341SAndroid Build Coastguard Worker mov offyd, seed 2437*c0909341SAndroid Build Coastguard Worker%endif 2438*c0909341SAndroid Build Coastguard Worker ror offyd, 8 2439*c0909341SAndroid Build Coastguard Worker shr offxd, 12 2440*c0909341SAndroid Build Coastguard Worker and offyd, 0xf 2441*c0909341SAndroid Build Coastguard Worker imul offyd, 164>>%3 2442*c0909341SAndroid Build Coastguard Worker lea offyq, [offyq+offxq*(2-%2)+(3+(6>>%3))*82+3+(6>>%2)] ; offy*stride+offx 2443*c0909341SAndroid Build Coastguard Worker 2444*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32 2445*c0909341SAndroid Build Coastguard Worker DEFINE_ARGS dst, src, 
scaling, offxy, h, picptr, grain_lut 2446*c0909341SAndroid Build Coastguard Worker%else 2447*c0909341SAndroid Build Coastguard Worker DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ 2448*c0909341SAndroid Build Coastguard Worker h, offxy, see, unused1, unused2, unused3, luma, lstride 2449*c0909341SAndroid Build Coastguard Worker%endif 2450*c0909341SAndroid Build Coastguard Worker 2451*c0909341SAndroid Build Coastguard Worker%if %2 == 0 2452*c0909341SAndroid Build Coastguard Worker%%loop_x_odd: 2453*c0909341SAndroid Build Coastguard Worker%endif 2454*c0909341SAndroid Build Coastguard Worker mov hd, r7m 2455*c0909341SAndroid Build Coastguard Worker mov grain_lutq, grain_lutmp 2456*c0909341SAndroid Build Coastguard Worker%%loop_y: 2457*c0909341SAndroid Build Coastguard Worker ; src 2458*c0909341SAndroid Build Coastguard Worker mova m0, [srcq] 2459*c0909341SAndroid Build Coastguard Worker mova m1, [srcq+16] ; m0-1: src as word 2460*c0909341SAndroid Build Coastguard Worker 2461*c0909341SAndroid Build Coastguard Worker ; luma_src 2462*c0909341SAndroid Build Coastguard Worker pxor mzero, mzero 2463*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32 2464*c0909341SAndroid Build Coastguard Worker DEFINE_ARGS dst, src, scaling, offxy, h, luma, grain_lut 2465*c0909341SAndroid Build Coastguard Worker 2466*c0909341SAndroid Build Coastguard Worker mov lumaq, r9m 2467*c0909341SAndroid Build Coastguard Worker%endif 2468*c0909341SAndroid Build Coastguard Worker mova m4, [lumaq+ 0] 2469*c0909341SAndroid Build Coastguard Worker mova m6, [lumaq+(16<<%2)] 2470*c0909341SAndroid Build Coastguard Worker%if %2 2471*c0909341SAndroid Build Coastguard Worker phaddw m4, [lumaq+16] 2472*c0909341SAndroid Build Coastguard Worker phaddw m6, [lumaq+48] 2473*c0909341SAndroid Build Coastguard Worker%endif 2474*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32 2475*c0909341SAndroid Build Coastguard Worker add lumaq, r10mp 2476*c0909341SAndroid Build Coastguard Worker mov r9m, 
lumaq 2477*c0909341SAndroid Build Coastguard Worker%endif 2478*c0909341SAndroid Build Coastguard Worker%if %2 2479*c0909341SAndroid Build Coastguard Worker pavgw m4, mzero 2480*c0909341SAndroid Build Coastguard Worker pavgw m6, mzero 2481*c0909341SAndroid Build Coastguard Worker%endif 2482*c0909341SAndroid Build Coastguard Worker 2483*c0909341SAndroid Build Coastguard Worker%if %1 2484*c0909341SAndroid Build Coastguard Worker punpckhwd m3, m4, m0 2485*c0909341SAndroid Build Coastguard Worker punpcklwd m4, m0 2486*c0909341SAndroid Build Coastguard Worker punpckhwd m5, m6, m1 2487*c0909341SAndroid Build Coastguard Worker punpcklwd m6, m1 ; { luma, chroma } 2488*c0909341SAndroid Build Coastguard Worker REPX {pmaddwd x, m14}, m3, m4, m5, m6 2489*c0909341SAndroid Build Coastguard Worker REPX {psrad x, 6}, m3, m4, m5, m6 2490*c0909341SAndroid Build Coastguard Worker packssdw m4, m3 2491*c0909341SAndroid Build Coastguard Worker packssdw m6, m5 2492*c0909341SAndroid Build Coastguard Worker REPX {paddw x, m15}, m4, m6 2493*c0909341SAndroid Build Coastguard Worker REPX {pmaxsw x, mzero}, m4, m6 2494*c0909341SAndroid Build Coastguard Worker REPX {pminsw x, m10}, m4, m6 ; clip_pixel() 2495*c0909341SAndroid Build Coastguard Worker%else 2496*c0909341SAndroid Build Coastguard Worker REPX {pand x, m10}, m4, m6 2497*c0909341SAndroid Build Coastguard Worker%endif 2498*c0909341SAndroid Build Coastguard Worker 2499*c0909341SAndroid Build Coastguard Worker ; scaling[luma_src] 2500*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32 2501*c0909341SAndroid Build Coastguard Worker vpgatherdw m3, m4, scalingq-1, r0, r5, 8, 1 2502*c0909341SAndroid Build Coastguard Worker vpgatherdw m5, m6, scalingq-1, r0, r5, 8, 1 2503*c0909341SAndroid Build Coastguard Worker%else 2504*c0909341SAndroid Build Coastguard Worker vpgatherdw m3, m4, scalingq-1, r10, r12, 8, 1 2505*c0909341SAndroid Build Coastguard Worker vpgatherdw m5, m6, scalingq-1, r10, r12, 8, 1 2506*c0909341SAndroid Build Coastguard 
Worker%endif 2507*c0909341SAndroid Build Coastguard Worker REPX {psrlw x, 8}, m3, m5 2508*c0909341SAndroid Build Coastguard Worker 2509*c0909341SAndroid Build Coastguard Worker ; grain = grain_lut[offy+y][offx+x] 2510*c0909341SAndroid Build Coastguard Worker movu m4, [grain_lutq+offxyq*2] 2511*c0909341SAndroid Build Coastguard Worker movu m6, [grain_lutq+offxyq*2+16] 2512*c0909341SAndroid Build Coastguard Worker 2513*c0909341SAndroid Build Coastguard Worker ; noise = round2(scaling[luma_src] * grain, scaling_shift) 2514*c0909341SAndroid Build Coastguard Worker REPX {pmullw x, m11}, m3, m5 2515*c0909341SAndroid Build Coastguard Worker pmulhrsw m4, m3 2516*c0909341SAndroid Build Coastguard Worker pmulhrsw m6, m5 2517*c0909341SAndroid Build Coastguard Worker 2518*c0909341SAndroid Build Coastguard Worker ; dst = clip_pixel(src, noise) 2519*c0909341SAndroid Build Coastguard Worker paddw m0, m4 2520*c0909341SAndroid Build Coastguard Worker paddw m1, m6 2521*c0909341SAndroid Build Coastguard Worker pmaxsw m0, m13 2522*c0909341SAndroid Build Coastguard Worker pmaxsw m1, m13 2523*c0909341SAndroid Build Coastguard Worker pminsw m0, m12 2524*c0909341SAndroid Build Coastguard Worker pminsw m1, m12 2525*c0909341SAndroid Build Coastguard Worker movifnidn dstq, dstmp 2526*c0909341SAndroid Build Coastguard Worker mova [dstq+ 0], m0 2527*c0909341SAndroid Build Coastguard Worker mova [dstq+16], m1 2528*c0909341SAndroid Build Coastguard Worker 2529*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32 2530*c0909341SAndroid Build Coastguard Worker add srcq, r2mp 2531*c0909341SAndroid Build Coastguard Worker add dstq, r2mp 2532*c0909341SAndroid Build Coastguard Worker mov dstmp, dstq 2533*c0909341SAndroid Build Coastguard Worker%else 2534*c0909341SAndroid Build Coastguard Worker add srcq, r13mp 2535*c0909341SAndroid Build Coastguard Worker add dstq, r13mp 2536*c0909341SAndroid Build Coastguard Worker add lumaq, lstrideq 2537*c0909341SAndroid Build Coastguard Worker%endif 
2538*c0909341SAndroid Build Coastguard Worker add grain_lutq, 82*2 2539*c0909341SAndroid Build Coastguard Worker dec hd 2540*c0909341SAndroid Build Coastguard Worker jg %%loop_y 2541*c0909341SAndroid Build Coastguard Worker 2542*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32 2543*c0909341SAndroid Build Coastguard Worker DEFINE_ARGS dst, src, scaling, offxy, w, picptr, luma 2544*c0909341SAndroid Build Coastguard Worker 2545*c0909341SAndroid Build Coastguard Worker mov wq, r4mp 2546*c0909341SAndroid Build Coastguard Worker%endif 2547*c0909341SAndroid Build Coastguard Worker add wq, 16 2548*c0909341SAndroid Build Coastguard Worker jge %%end 2549*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32 2550*c0909341SAndroid Build Coastguard Worker mov srcq, r1mp 2551*c0909341SAndroid Build Coastguard Worker%else 2552*c0909341SAndroid Build Coastguard Worker mov srcq, r10mp 2553*c0909341SAndroid Build Coastguard Worker%endif 2554*c0909341SAndroid Build Coastguard Worker mov dstq, r11mp 2555*c0909341SAndroid Build Coastguard Worker mov lumaq, r12mp 2556*c0909341SAndroid Build Coastguard Worker lea srcq, [srcq+wq*2] 2557*c0909341SAndroid Build Coastguard Worker lea dstq, [dstq+wq*2] 2558*c0909341SAndroid Build Coastguard Worker lea lumaq, [lumaq+wq*(2<<%2)] 2559*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32 2560*c0909341SAndroid Build Coastguard Worker mov r0m, dstq 2561*c0909341SAndroid Build Coastguard Worker mov r9m, lumaq 2562*c0909341SAndroid Build Coastguard Worker mov r4m, wq 2563*c0909341SAndroid Build Coastguard Worker%endif 2564*c0909341SAndroid Build Coastguard Worker%if %2 == 0 2565*c0909341SAndroid Build Coastguard Worker btc dword r8m, 2 2566*c0909341SAndroid Build Coastguard Worker jc %%next_blk 2567*c0909341SAndroid Build Coastguard Worker add offxyd, 16 2568*c0909341SAndroid Build Coastguard Worker test dword r8m, 2 2569*c0909341SAndroid Build Coastguard Worker jz %%loop_x_odd 2570*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32 
2571*c0909341SAndroid Build Coastguard Worker add dword [rsp+8*mmsize+1*gprsize], 16 2572*c0909341SAndroid Build Coastguard Worker%else 2573*c0909341SAndroid Build Coastguard Worker add r11d, 16 2574*c0909341SAndroid Build Coastguard Worker%endif 2575*c0909341SAndroid Build Coastguard Worker jmp %%loop_x_odd_v_overlap 2576*c0909341SAndroid Build Coastguard Worker%%next_blk: 2577*c0909341SAndroid Build Coastguard Worker%endif 2578*c0909341SAndroid Build Coastguard Worker test dword r8m, 1 2579*c0909341SAndroid Build Coastguard Worker je %%loop_x 2580*c0909341SAndroid Build Coastguard Worker 2581*c0909341SAndroid Build Coastguard Worker ; r8m = sbym 2582*c0909341SAndroid Build Coastguard Worker test dword r8m, 2 2583*c0909341SAndroid Build Coastguard Worker jnz %%loop_x_hv_overlap 2584*c0909341SAndroid Build Coastguard Worker 2585*c0909341SAndroid Build Coastguard Worker ; horizontal overlap (without vertical overlap) 2586*c0909341SAndroid Build Coastguard Worker%%loop_x_h_overlap: 2587*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32 2588*c0909341SAndroid Build Coastguard Worker add offxyd, 16 2589*c0909341SAndroid Build Coastguard Worker mov [rsp+8*mmsize+0*gprsize], offxyd 2590*c0909341SAndroid Build Coastguard Worker 2591*c0909341SAndroid Build Coastguard Worker DEFINE_ARGS dst, src, scaling, see, w, picptr, grain_lut 2592*c0909341SAndroid Build Coastguard Worker 2593*c0909341SAndroid Build Coastguard Worker mov seed, r3m 2594*c0909341SAndroid Build Coastguard Worker%endif 2595*c0909341SAndroid Build Coastguard Worker mov r6d, seed 2596*c0909341SAndroid Build Coastguard Worker or seed, 0xEFF4 2597*c0909341SAndroid Build Coastguard Worker shr r6d, 1 2598*c0909341SAndroid Build Coastguard Worker test seeb, seeh 2599*c0909341SAndroid Build Coastguard Worker lea seed, [r6+0x8000] 2600*c0909341SAndroid Build Coastguard Worker cmovp seed, r6d ; updated seed 2601*c0909341SAndroid Build Coastguard Worker 2602*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32 
2603*c0909341SAndroid Build Coastguard Worker mov r3m, seed 2604*c0909341SAndroid Build Coastguard Worker 2605*c0909341SAndroid Build Coastguard Worker DEFINE_ARGS dst, src, scaling, offy, w, picptr, offx 2606*c0909341SAndroid Build Coastguard Worker 2607*c0909341SAndroid Build Coastguard Worker mov offxd, offyd 2608*c0909341SAndroid Build Coastguard Worker%else 2609*c0909341SAndroid Build Coastguard Worker DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ 2610*c0909341SAndroid Build Coastguard Worker offx, offy, see, left_offxy, unused1, unused2, luma, lstride 2611*c0909341SAndroid Build Coastguard Worker 2612*c0909341SAndroid Build Coastguard Worker lea left_offxyd, [offyd+16] ; previous column's offy*stride+offx 2613*c0909341SAndroid Build Coastguard Worker mov offxd, seed 2614*c0909341SAndroid Build Coastguard Worker mov offyd, seed 2615*c0909341SAndroid Build Coastguard Worker%endif 2616*c0909341SAndroid Build Coastguard Worker ror offyd, 8 2617*c0909341SAndroid Build Coastguard Worker shr offxd, 12 2618*c0909341SAndroid Build Coastguard Worker and offyd, 0xf 2619*c0909341SAndroid Build Coastguard Worker imul offyd, 164>>%3 2620*c0909341SAndroid Build Coastguard Worker lea offyq, [offyq+offxq*(2-%2)+(3+(6>>%3))*82+3+(6>>%2)] ; offy*stride+offx 2621*c0909341SAndroid Build Coastguard Worker 2622*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32 2623*c0909341SAndroid Build Coastguard Worker DEFINE_ARGS dst, src, scaling, offxy, h, picptr, grain_lut 2624*c0909341SAndroid Build Coastguard Worker%else 2625*c0909341SAndroid Build Coastguard Worker DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ 2626*c0909341SAndroid Build Coastguard Worker h, offxy, see, left_offxy, unused1, unused2, luma, lstride 2627*c0909341SAndroid Build Coastguard Worker%endif 2628*c0909341SAndroid Build Coastguard Worker 2629*c0909341SAndroid Build Coastguard Worker mov hd, r7m 2630*c0909341SAndroid Build Coastguard Worker mov grain_lutq, grain_lutmp 
2631*c0909341SAndroid Build Coastguard Worker%%loop_y_h_overlap: 2632*c0909341SAndroid Build Coastguard Worker mova m0, [srcq] 2633*c0909341SAndroid Build Coastguard Worker mova m1, [srcq+16] 2634*c0909341SAndroid Build Coastguard Worker 2635*c0909341SAndroid Build Coastguard Worker ; luma_src 2636*c0909341SAndroid Build Coastguard Worker pxor mzero, mzero 2637*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32 2638*c0909341SAndroid Build Coastguard Worker DEFINE_ARGS dst, src, scaling, offxy, h, luma, grain_lut 2639*c0909341SAndroid Build Coastguard Worker mov lumaq, r9m 2640*c0909341SAndroid Build Coastguard Worker%endif 2641*c0909341SAndroid Build Coastguard Worker mova m4, [lumaq+ 0] 2642*c0909341SAndroid Build Coastguard Worker mova m6, [lumaq+(16<<%2)] 2643*c0909341SAndroid Build Coastguard Worker%if %2 2644*c0909341SAndroid Build Coastguard Worker phaddw m4, [lumaq+16] 2645*c0909341SAndroid Build Coastguard Worker phaddw m6, [lumaq+48] 2646*c0909341SAndroid Build Coastguard Worker%endif 2647*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32 2648*c0909341SAndroid Build Coastguard Worker add lumaq, r10mp 2649*c0909341SAndroid Build Coastguard Worker mov r9m, lumaq 2650*c0909341SAndroid Build Coastguard Worker%endif 2651*c0909341SAndroid Build Coastguard Worker%if %2 2652*c0909341SAndroid Build Coastguard Worker pavgw m4, mzero 2653*c0909341SAndroid Build Coastguard Worker pavgw m6, mzero 2654*c0909341SAndroid Build Coastguard Worker%endif 2655*c0909341SAndroid Build Coastguard Worker 2656*c0909341SAndroid Build Coastguard Worker%if %1 2657*c0909341SAndroid Build Coastguard Worker punpckhwd m3, m4, m0 2658*c0909341SAndroid Build Coastguard Worker punpcklwd m4, m0 2659*c0909341SAndroid Build Coastguard Worker punpckhwd m5, m6, m1 2660*c0909341SAndroid Build Coastguard Worker punpcklwd m6, m1 ; { luma, chroma } 2661*c0909341SAndroid Build Coastguard Worker REPX {pmaddwd x, m14}, m3, m4, m5, m6 2662*c0909341SAndroid Build Coastguard Worker REPX {psrad 
x, 6}, m3, m4, m5, m6 2663*c0909341SAndroid Build Coastguard Worker packssdw m4, m3 2664*c0909341SAndroid Build Coastguard Worker packssdw m6, m5 2665*c0909341SAndroid Build Coastguard Worker REPX {paddw x, m15}, m4, m6 2666*c0909341SAndroid Build Coastguard Worker REPX {pmaxsw x, mzero}, m4, m6 2667*c0909341SAndroid Build Coastguard Worker REPX {pminsw x, m10}, m4, m6 ; clip_pixel() 2668*c0909341SAndroid Build Coastguard Worker%else 2669*c0909341SAndroid Build Coastguard Worker REPX {pand x, m10}, m4, m6 2670*c0909341SAndroid Build Coastguard Worker%endif 2671*c0909341SAndroid Build Coastguard Worker 2672*c0909341SAndroid Build Coastguard Worker ; grain = grain_lut[offy+y][offx+x] 2673*c0909341SAndroid Build Coastguard Worker movu m7, [grain_lutq+offxyq*2] 2674*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32 2675*c0909341SAndroid Build Coastguard Worker mov r5, [rsp+8*mmsize+0*gprsize] 2676*c0909341SAndroid Build Coastguard Worker movd m5, [grain_lutq+r5*2] 2677*c0909341SAndroid Build Coastguard Worker%else 2678*c0909341SAndroid Build Coastguard Worker movd m5, [grain_lutq+left_offxyq*2+ 0] 2679*c0909341SAndroid Build Coastguard Worker%endif 2680*c0909341SAndroid Build Coastguard Worker punpcklwd m5, m7 ; {left0, cur0} 2681*c0909341SAndroid Build Coastguard Worker%if %1 2682*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32 2683*c0909341SAndroid Build Coastguard Worker mov r5, r5m 2684*c0909341SAndroid Build Coastguard Worker%endif 2685*c0909341SAndroid Build Coastguard Worker%if %2 2686*c0909341SAndroid Build Coastguard Worker pmaddwd m5, [PIC_ptr(pw_23_22)] 2687*c0909341SAndroid Build Coastguard Worker%else 2688*c0909341SAndroid Build Coastguard Worker pmaddwd m5, [PIC_ptr(pw_27_17_17_27)] 2689*c0909341SAndroid Build Coastguard Worker%endif 2690*c0909341SAndroid Build Coastguard Worker paddd m5, [PIC_ptr(pd_16)] 2691*c0909341SAndroid Build Coastguard Worker%else 2692*c0909341SAndroid Build Coastguard Worker pmaddwd m5, m15 2693*c0909341SAndroid 
Build Coastguard Worker paddd m5, m14 2694*c0909341SAndroid Build Coastguard Worker%endif 2695*c0909341SAndroid Build Coastguard Worker psrad m5, 5 2696*c0909341SAndroid Build Coastguard Worker packssdw m5, m5 2697*c0909341SAndroid Build Coastguard Worker pmaxsw m5, m8 2698*c0909341SAndroid Build Coastguard Worker pminsw m5, m9 2699*c0909341SAndroid Build Coastguard Worker shufps m5, m7, q3210 2700*c0909341SAndroid Build Coastguard Worker movu m3, [grain_lutq+offxyq*2+16] 2701*c0909341SAndroid Build Coastguard Worker 2702*c0909341SAndroid Build Coastguard Worker ; scaling[luma_src] 2703*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32 2704*c0909341SAndroid Build Coastguard Worker vpgatherdw m7, m4, scalingq-1, r0, r5, 8, 1 2705*c0909341SAndroid Build Coastguard Worker vpgatherdw m4, m6, scalingq-1, r0, r5, 8, 1 2706*c0909341SAndroid Build Coastguard Worker%else 2707*c0909341SAndroid Build Coastguard Worker vpgatherdw m7, m4, scalingq-1, r2, r12, 8, 1 2708*c0909341SAndroid Build Coastguard Worker vpgatherdw m4, m6, scalingq-1, r2, r12, 8, 1 2709*c0909341SAndroid Build Coastguard Worker%endif 2710*c0909341SAndroid Build Coastguard Worker REPX {psrlw x, 8}, m7, m4 2711*c0909341SAndroid Build Coastguard Worker 2712*c0909341SAndroid Build Coastguard Worker ; noise = round2(scaling[luma_src] * grain, scaling_shift) 2713*c0909341SAndroid Build Coastguard Worker REPX {pmullw x, m11}, m7, m4 2714*c0909341SAndroid Build Coastguard Worker pmulhrsw m5, m7 2715*c0909341SAndroid Build Coastguard Worker pmulhrsw m3, m4 2716*c0909341SAndroid Build Coastguard Worker 2717*c0909341SAndroid Build Coastguard Worker ; dst = clip_pixel(src, noise) 2718*c0909341SAndroid Build Coastguard Worker paddw m0, m5 2719*c0909341SAndroid Build Coastguard Worker paddw m1, m3 2720*c0909341SAndroid Build Coastguard Worker pmaxsw m0, m13 2721*c0909341SAndroid Build Coastguard Worker pmaxsw m1, m13 2722*c0909341SAndroid Build Coastguard Worker pminsw m0, m12 2723*c0909341SAndroid Build Coastguard 
Worker pminsw m1, m12 2724*c0909341SAndroid Build Coastguard Worker movifnidn dstq, dstmp 2725*c0909341SAndroid Build Coastguard Worker mova [dstq+ 0], m0 2726*c0909341SAndroid Build Coastguard Worker mova [dstq+16], m1 2727*c0909341SAndroid Build Coastguard Worker 2728*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32 2729*c0909341SAndroid Build Coastguard Worker add srcq, r2mp 2730*c0909341SAndroid Build Coastguard Worker add dstq, r2mp 2731*c0909341SAndroid Build Coastguard Worker mov dstmp, dstq 2732*c0909341SAndroid Build Coastguard Worker%else 2733*c0909341SAndroid Build Coastguard Worker add srcq, r13mp 2734*c0909341SAndroid Build Coastguard Worker add dstq, r13mp 2735*c0909341SAndroid Build Coastguard Worker add lumaq, lstrideq 2736*c0909341SAndroid Build Coastguard Worker%endif 2737*c0909341SAndroid Build Coastguard Worker add grain_lutq, 82*2 2738*c0909341SAndroid Build Coastguard Worker dec hd 2739*c0909341SAndroid Build Coastguard Worker jg %%loop_y_h_overlap 2740*c0909341SAndroid Build Coastguard Worker 2741*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32 2742*c0909341SAndroid Build Coastguard Worker DEFINE_ARGS dst, src, scaling, offxy, w, luma, grain_lut 2743*c0909341SAndroid Build Coastguard Worker mov wq, r4mp 2744*c0909341SAndroid Build Coastguard Worker%endif 2745*c0909341SAndroid Build Coastguard Worker add wq, 16 2746*c0909341SAndroid Build Coastguard Worker jge %%end 2747*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32 2748*c0909341SAndroid Build Coastguard Worker mov srcq, r1mp 2749*c0909341SAndroid Build Coastguard Worker%else 2750*c0909341SAndroid Build Coastguard Worker mov srcq, r10mp 2751*c0909341SAndroid Build Coastguard Worker%endif 2752*c0909341SAndroid Build Coastguard Worker mov dstq, r11mp 2753*c0909341SAndroid Build Coastguard Worker mov lumaq, r12mp 2754*c0909341SAndroid Build Coastguard Worker lea srcq, [srcq+wq*2] 2755*c0909341SAndroid Build Coastguard Worker lea dstq, [dstq+wq*2] 2756*c0909341SAndroid 
Build Coastguard Worker lea lumaq, [lumaq+wq*(2<<%2)] 2757*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32 2758*c0909341SAndroid Build Coastguard Worker mov r0mp, dstq 2759*c0909341SAndroid Build Coastguard Worker mov r9mp, lumaq 2760*c0909341SAndroid Build Coastguard Worker mov r4m, wq 2761*c0909341SAndroid Build Coastguard Worker%endif 2762*c0909341SAndroid Build Coastguard Worker 2763*c0909341SAndroid Build Coastguard Worker%if %2 2764*c0909341SAndroid Build Coastguard Worker ; r8m = sbym 2765*c0909341SAndroid Build Coastguard Worker test dword r8m, 2 2766*c0909341SAndroid Build Coastguard Worker jne %%loop_x_hv_overlap 2767*c0909341SAndroid Build Coastguard Worker jmp %%loop_x_h_overlap 2768*c0909341SAndroid Build Coastguard Worker%else 2769*c0909341SAndroid Build Coastguard Worker or dword r8m, 4 2770*c0909341SAndroid Build Coastguard Worker add offxyd, 16 2771*c0909341SAndroid Build Coastguard Worker 2772*c0909341SAndroid Build Coastguard Worker ; r8m = sbym 2773*c0909341SAndroid Build Coastguard Worker test dword r8m, 2 2774*c0909341SAndroid Build Coastguard Worker jz %%loop_x_odd 2775*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32 2776*c0909341SAndroid Build Coastguard Worker add dword [rsp+8*mmsize+1*gprsize], 16 2777*c0909341SAndroid Build Coastguard Worker%else 2778*c0909341SAndroid Build Coastguard Worker add r11d, 16 ; top_offxy += 16 2779*c0909341SAndroid Build Coastguard Worker%endif 2780*c0909341SAndroid Build Coastguard Worker jmp %%loop_x_odd_v_overlap 2781*c0909341SAndroid Build Coastguard Worker%endif 2782*c0909341SAndroid Build Coastguard Worker 2783*c0909341SAndroid Build Coastguard Worker%%end: 2784*c0909341SAndroid Build Coastguard Worker RET 2785*c0909341SAndroid Build Coastguard Worker 2786*c0909341SAndroid Build Coastguard Worker%%vertical_overlap: 2787*c0909341SAndroid Build Coastguard Worker or t0d, 2 2788*c0909341SAndroid Build Coastguard Worker mov r8m, t0d 2789*c0909341SAndroid Build Coastguard Worker 
2790*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32 2791*c0909341SAndroid Build Coastguard Worker DEFINE_ARGS dst, src, scaling, sby, fg_data, picptr, overlap 2792*c0909341SAndroid Build Coastguard Worker%else 2793*c0909341SAndroid Build Coastguard Worker DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, unused, \ 2794*c0909341SAndroid Build Coastguard Worker sby, see, unused1, unused2, unused3, lstride 2795*c0909341SAndroid Build Coastguard Worker%endif 2796*c0909341SAndroid Build Coastguard Worker 2797*c0909341SAndroid Build Coastguard Worker movzx sbyd, sbyb 2798*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32 2799*c0909341SAndroid Build Coastguard Worker imul r4, [fg_dataq+FGData.seed], 0x00010001 2800*c0909341SAndroid Build Coastguard Worker 2801*c0909341SAndroid Build Coastguard Worker DEFINE_ARGS tmp, src, scaling, sby, see, picptr, unused 2802*c0909341SAndroid Build Coastguard Worker%else 2803*c0909341SAndroid Build Coastguard Worker imul seed, [fg_dataq+FGData.seed], 0x00010001 2804*c0909341SAndroid Build Coastguard Worker%endif 2805*c0909341SAndroid Build Coastguard Worker imul t0d, sbyd, 173 * 0x00010001 2806*c0909341SAndroid Build Coastguard Worker imul sbyd, 37 * 0x01000100 2807*c0909341SAndroid Build Coastguard Worker add t0d, (105 << 16) | 188 2808*c0909341SAndroid Build Coastguard Worker add sbyd, (178 << 24) | (141 << 8) 2809*c0909341SAndroid Build Coastguard Worker and t0d, 0x00ff00ff 2810*c0909341SAndroid Build Coastguard Worker and sbyd, 0xff00ff00 2811*c0909341SAndroid Build Coastguard Worker xor seed, t0d 2812*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32 2813*c0909341SAndroid Build Coastguard Worker xor sbyd, seed 2814*c0909341SAndroid Build Coastguard Worker 2815*c0909341SAndroid Build Coastguard Worker DEFINE_ARGS dst, src, scaling, see, w, picptr, luma 2816*c0909341SAndroid Build Coastguard Worker 2817*c0909341SAndroid Build Coastguard Worker mov r3m, seed 2818*c0909341SAndroid Build Coastguard Worker 
mov dstq, r0mp 2819*c0909341SAndroid Build Coastguard Worker mov lumaq, r9mp 2820*c0909341SAndroid Build Coastguard Worker mov wq, r4m 2821*c0909341SAndroid Build Coastguard Worker lea r3, [srcq+wq*2] 2822*c0909341SAndroid Build Coastguard Worker mov r1mp, r3 2823*c0909341SAndroid Build Coastguard Worker lea r3, [dstq+wq*2] 2824*c0909341SAndroid Build Coastguard Worker mov r11mp, r3 2825*c0909341SAndroid Build Coastguard Worker lea r3, [lumaq+wq*(2<<%2)] 2826*c0909341SAndroid Build Coastguard Worker mov r12mp, r3 2827*c0909341SAndroid Build Coastguard Worker%if %3 2828*c0909341SAndroid Build Coastguard Worker shl r10mp, 1 2829*c0909341SAndroid Build Coastguard Worker%endif 2830*c0909341SAndroid Build Coastguard Worker%else 2831*c0909341SAndroid Build Coastguard Worker xor seed, sbyd ; (cur_seed << 16) | top_seed 2832*c0909341SAndroid Build Coastguard Worker 2833*c0909341SAndroid Build Coastguard Worker DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ 2834*c0909341SAndroid Build Coastguard Worker unused1, unused2, see, unused3, unused4, unused5, luma, lstride 2835*c0909341SAndroid Build Coastguard Worker 2836*c0909341SAndroid Build Coastguard Worker mov lstrideq, r10mp 2837*c0909341SAndroid Build Coastguard Worker%if %3 2838*c0909341SAndroid Build Coastguard Worker add lstrideq, lstrideq 2839*c0909341SAndroid Build Coastguard Worker%endif 2840*c0909341SAndroid Build Coastguard Worker mov lumaq, r9mp 2841*c0909341SAndroid Build Coastguard Worker lea r10, [srcq+wq*2] 2842*c0909341SAndroid Build Coastguard Worker lea r11, [dstq+wq*2] 2843*c0909341SAndroid Build Coastguard Worker lea r12, [lumaq+wq*(2<<%2)] 2844*c0909341SAndroid Build Coastguard Worker mov r10mp, r10 2845*c0909341SAndroid Build Coastguard Worker mov r11mp, r11 2846*c0909341SAndroid Build Coastguard Worker mov r12mp, r12 2847*c0909341SAndroid Build Coastguard Worker%endif 2848*c0909341SAndroid Build Coastguard Worker neg wq 2849*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32 
2850*c0909341SAndroid Build Coastguard Worker mov r4m, wq 2851*c0909341SAndroid Build Coastguard Worker%endif 2852*c0909341SAndroid Build Coastguard Worker 2853*c0909341SAndroid Build Coastguard Worker%%loop_x_v_overlap: 2854*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32 2855*c0909341SAndroid Build Coastguard Worker mov seed, r3m 2856*c0909341SAndroid Build Coastguard Worker xor t0d, t0d 2857*c0909341SAndroid Build Coastguard Worker%else 2858*c0909341SAndroid Build Coastguard Worker ; we assume from the block above that bits 8-15 of r7d are zero'ed 2859*c0909341SAndroid Build Coastguard Worker%endif 2860*c0909341SAndroid Build Coastguard Worker mov r6d, seed 2861*c0909341SAndroid Build Coastguard Worker or seed, 0xeff4eff4 2862*c0909341SAndroid Build Coastguard Worker test seeb, seeh 2863*c0909341SAndroid Build Coastguard Worker setp t0b ; parity of top_seed 2864*c0909341SAndroid Build Coastguard Worker shr seed, 16 2865*c0909341SAndroid Build Coastguard Worker shl t0d, 16 2866*c0909341SAndroid Build Coastguard Worker test seeb, seeh 2867*c0909341SAndroid Build Coastguard Worker setp t0b ; parity of cur_seed 2868*c0909341SAndroid Build Coastguard Worker or r6d, 0x00010001 2869*c0909341SAndroid Build Coastguard Worker xor t0d, r6d 2870*c0909341SAndroid Build Coastguard Worker mov seed, t0d 2871*c0909341SAndroid Build Coastguard Worker ror seed, 1 ; updated (cur_seed << 16) | top_seed 2872*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32 2873*c0909341SAndroid Build Coastguard Worker mov r3m, seed 2874*c0909341SAndroid Build Coastguard Worker 2875*c0909341SAndroid Build Coastguard Worker DEFINE_ARGS dst, src, scaling, offy, w, picptr, offx 2876*c0909341SAndroid Build Coastguard Worker 2877*c0909341SAndroid Build Coastguard Worker mov offxd, offyd 2878*c0909341SAndroid Build Coastguard Worker%else 2879*c0909341SAndroid Build Coastguard Worker DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ 2880*c0909341SAndroid Build Coastguard Worker 
offx, offy, see, unused1, top_offxy, unused2, luma, lstride 2881*c0909341SAndroid Build Coastguard Worker 2882*c0909341SAndroid Build Coastguard Worker mov offyd, seed 2883*c0909341SAndroid Build Coastguard Worker mov offxd, seed 2884*c0909341SAndroid Build Coastguard Worker%endif 2885*c0909341SAndroid Build Coastguard Worker ror offyd, 8 2886*c0909341SAndroid Build Coastguard Worker ror offxd, 12 2887*c0909341SAndroid Build Coastguard Worker and offyd, 0xf000f 2888*c0909341SAndroid Build Coastguard Worker and offxd, 0xf000f 2889*c0909341SAndroid Build Coastguard Worker imul offyd, 164>>%3 2890*c0909341SAndroid Build Coastguard Worker ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy 2891*c0909341SAndroid Build Coastguard Worker lea offyq, [offyq+offxq*(2-%2)+0x10001*((3+(6>>%3))*82+3+(6>>%2))+(32>>%3)*82] 2892*c0909341SAndroid Build Coastguard Worker 2893*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32 2894*c0909341SAndroid Build Coastguard Worker DEFINE_ARGS top_offxy, src, scaling, offxy, h, picptr, grain_lut 2895*c0909341SAndroid Build Coastguard Worker%else 2896*c0909341SAndroid Build Coastguard Worker DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ 2897*c0909341SAndroid Build Coastguard Worker h, offxy, see, unused1, top_offxy, unused2, luma, lstride 2898*c0909341SAndroid Build Coastguard Worker%endif 2899*c0909341SAndroid Build Coastguard Worker movzx top_offxyd, offxyw 2900*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32 2901*c0909341SAndroid Build Coastguard Worker mov [rsp+8*mmsize+1*gprsize], top_offxyd 2902*c0909341SAndroid Build Coastguard Worker DEFINE_ARGS dst, src, scaling, offxy, h, picptr, grain_lut 2903*c0909341SAndroid Build Coastguard Worker%endif 2904*c0909341SAndroid Build Coastguard Worker shr offxyd, 16 2905*c0909341SAndroid Build Coastguard Worker 2906*c0909341SAndroid Build Coastguard Worker%if %2 == 0 2907*c0909341SAndroid Build Coastguard Worker%%loop_x_odd_v_overlap: 2908*c0909341SAndroid Build 
Coastguard Worker%endif 2909*c0909341SAndroid Build Coastguard Worker%if %3 == 0 2910*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32 2911*c0909341SAndroid Build Coastguard Worker mov r5, r5m 2912*c0909341SAndroid Build Coastguard Worker%endif 2913*c0909341SAndroid Build Coastguard Worker SPLATD m2, [PIC_ptr(pw_27_17_17_27)] 2914*c0909341SAndroid Build Coastguard Worker%endif 2915*c0909341SAndroid Build Coastguard Worker 2916*c0909341SAndroid Build Coastguard Worker mov hd, r7m 2917*c0909341SAndroid Build Coastguard Worker mov grain_lutq, grain_lutmp 2918*c0909341SAndroid Build Coastguard Worker%%loop_y_v_overlap: 2919*c0909341SAndroid Build Coastguard Worker ; grain = grain_lut[offy+y][offx+x] 2920*c0909341SAndroid Build Coastguard Worker movu m3, [grain_lutq+offxyq*2] 2921*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32 2922*c0909341SAndroid Build Coastguard Worker mov r0, [rsp+mmsize*8+gprsize*1] ; top_offxy 2923*c0909341SAndroid Build Coastguard Worker movu m5, [grain_lutq+r0*2] 2924*c0909341SAndroid Build Coastguard Worker%else 2925*c0909341SAndroid Build Coastguard Worker movu m5, [grain_lutq+top_offxyq*2] 2926*c0909341SAndroid Build Coastguard Worker%endif 2927*c0909341SAndroid Build Coastguard Worker punpckhwd m7, m5, m3 2928*c0909341SAndroid Build Coastguard Worker punpcklwd m5, m3 ; {top/cur interleaved} 2929*c0909341SAndroid Build Coastguard Worker REPX {pmaddwd x, m2}, m7, m5 2930*c0909341SAndroid Build Coastguard Worker%if %1 2931*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32 2932*c0909341SAndroid Build Coastguard Worker mov r5, r5m 2933*c0909341SAndroid Build Coastguard Worker%endif 2934*c0909341SAndroid Build Coastguard Worker REPX {paddd x, [PIC_ptr(pd_16)]}, m7, m5 2935*c0909341SAndroid Build Coastguard Worker%else 2936*c0909341SAndroid Build Coastguard Worker REPX {paddd x, m14}, m7, m5 2937*c0909341SAndroid Build Coastguard Worker%endif 2938*c0909341SAndroid Build Coastguard Worker REPX {psrad x, 5}, m7, m5 
2939*c0909341SAndroid Build Coastguard Worker packssdw m3, m5, m7 2940*c0909341SAndroid Build Coastguard Worker pmaxsw m3, m8 2941*c0909341SAndroid Build Coastguard Worker pminsw m3, m9 2942*c0909341SAndroid Build Coastguard Worker 2943*c0909341SAndroid Build Coastguard Worker ; grain = grain_lut[offy+y][offx+x] 2944*c0909341SAndroid Build Coastguard Worker movu m4, [grain_lutq+offxyq*2+16] 2945*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32 2946*c0909341SAndroid Build Coastguard Worker movu m5, [grain_lutq+r0*2+16] 2947*c0909341SAndroid Build Coastguard Worker%else 2948*c0909341SAndroid Build Coastguard Worker movu m5, [grain_lutq+top_offxyq*2+16] 2949*c0909341SAndroid Build Coastguard Worker%endif 2950*c0909341SAndroid Build Coastguard Worker punpckhwd m7, m5, m4 2951*c0909341SAndroid Build Coastguard Worker punpcklwd m5, m4 ; {top/cur interleaved} 2952*c0909341SAndroid Build Coastguard Worker REPX {pmaddwd x, m2}, m7, m5 2953*c0909341SAndroid Build Coastguard Worker%if %1 2954*c0909341SAndroid Build Coastguard Worker REPX {paddd x, [PIC_ptr(pd_16)]}, m7, m5 2955*c0909341SAndroid Build Coastguard Worker%else 2956*c0909341SAndroid Build Coastguard Worker REPX {paddd x, m14}, m7, m5 2957*c0909341SAndroid Build Coastguard Worker%endif 2958*c0909341SAndroid Build Coastguard Worker REPX {psrad x, 5}, m7, m5 2959*c0909341SAndroid Build Coastguard Worker packssdw m4, m5, m7 2960*c0909341SAndroid Build Coastguard Worker pmaxsw m4, m8 2961*c0909341SAndroid Build Coastguard Worker pminsw m4, m9 2962*c0909341SAndroid Build Coastguard Worker 2963*c0909341SAndroid Build Coastguard Worker ; src 2964*c0909341SAndroid Build Coastguard Worker mova m0, [srcq] 2965*c0909341SAndroid Build Coastguard Worker mova m1, [srcq+16] 2966*c0909341SAndroid Build Coastguard Worker 2967*c0909341SAndroid Build Coastguard Worker ; luma_src 2968*c0909341SAndroid Build Coastguard Worker pxor mzero, mzero 2969*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32 2970*c0909341SAndroid 
Build Coastguard Worker DEFINE_ARGS dst, src, scaling, offxy, h, luma, grain_lut 2971*c0909341SAndroid Build Coastguard Worker 2972*c0909341SAndroid Build Coastguard Worker mov lumaq, r9mp 2973*c0909341SAndroid Build Coastguard Worker%endif 2974*c0909341SAndroid Build Coastguard Worker mova m5, [lumaq+ 0] 2975*c0909341SAndroid Build Coastguard Worker mova m6, [lumaq+(16<<%2)] 2976*c0909341SAndroid Build Coastguard Worker%if %2 2977*c0909341SAndroid Build Coastguard Worker phaddw m5, [lumaq+16] 2978*c0909341SAndroid Build Coastguard Worker phaddw m6, [lumaq+48] 2979*c0909341SAndroid Build Coastguard Worker%endif 2980*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32 2981*c0909341SAndroid Build Coastguard Worker add lumaq, r10mp 2982*c0909341SAndroid Build Coastguard Worker mov r9mp, lumaq 2983*c0909341SAndroid Build Coastguard Worker%endif 2984*c0909341SAndroid Build Coastguard Worker%if %2 2985*c0909341SAndroid Build Coastguard Worker pavgw m5, mzero 2986*c0909341SAndroid Build Coastguard Worker pavgw m6, mzero 2987*c0909341SAndroid Build Coastguard Worker%endif 2988*c0909341SAndroid Build Coastguard Worker 2989*c0909341SAndroid Build Coastguard Worker%if %1 2990*c0909341SAndroid Build Coastguard Worker punpckhwd m7, m5, m0 2991*c0909341SAndroid Build Coastguard Worker punpcklwd m5, m0 2992*c0909341SAndroid Build Coastguard Worker REPX {pmaddwd x, m14}, m7, m5 2993*c0909341SAndroid Build Coastguard Worker REPX {psrad x, 6}, m7, m5 2994*c0909341SAndroid Build Coastguard Worker packssdw m5, m7 2995*c0909341SAndroid Build Coastguard Worker punpckhwd m7, m6, m1 2996*c0909341SAndroid Build Coastguard Worker punpcklwd m6, m1 ; { luma, chroma } 2997*c0909341SAndroid Build Coastguard Worker REPX {pmaddwd x, m14}, m7, m6 2998*c0909341SAndroid Build Coastguard Worker REPX {psrad x, 6}, m7, m6 2999*c0909341SAndroid Build Coastguard Worker packssdw m6, m7 3000*c0909341SAndroid Build Coastguard Worker pxor mzero, mzero 3001*c0909341SAndroid Build Coastguard Worker REPX 
{paddw x, m15}, m5, m6 3002*c0909341SAndroid Build Coastguard Worker REPX {pmaxsw x, mzero}, m5, m6 3003*c0909341SAndroid Build Coastguard Worker REPX {pminsw x, m10}, m5, m6 ; clip_pixel() 3004*c0909341SAndroid Build Coastguard Worker%else 3005*c0909341SAndroid Build Coastguard Worker REPX {pand x, m10}, m5, m6 3006*c0909341SAndroid Build Coastguard Worker%endif 3007*c0909341SAndroid Build Coastguard Worker 3008*c0909341SAndroid Build Coastguard Worker ; scaling[luma_src] 3009*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32 3010*c0909341SAndroid Build Coastguard Worker vpgatherdw m7, m5, scalingq-1, r0, r5, 8, 1 3011*c0909341SAndroid Build Coastguard Worker vpgatherdw m5, m6, scalingq-1, r0, r5, 8, 1 3012*c0909341SAndroid Build Coastguard Worker%else 3013*c0909341SAndroid Build Coastguard Worker vpgatherdw m7, m5, scalingq-1, r10, r12, 8, 1 3014*c0909341SAndroid Build Coastguard Worker vpgatherdw m5, m6, scalingq-1, r10, r12, 8, 1 3015*c0909341SAndroid Build Coastguard Worker%endif 3016*c0909341SAndroid Build Coastguard Worker REPX {psrlw x, 8}, m7, m5 3017*c0909341SAndroid Build Coastguard Worker 3018*c0909341SAndroid Build Coastguard Worker ; noise = round2(scaling[luma_src] * grain, scaling_shift) 3019*c0909341SAndroid Build Coastguard Worker REPX {pmullw x, m11}, m7, m5 3020*c0909341SAndroid Build Coastguard Worker pmulhrsw m3, m7 3021*c0909341SAndroid Build Coastguard Worker pmulhrsw m4, m5 3022*c0909341SAndroid Build Coastguard Worker 3023*c0909341SAndroid Build Coastguard Worker ; dst = clip_pixel(src, noise) 3024*c0909341SAndroid Build Coastguard Worker paddw m0, m3 3025*c0909341SAndroid Build Coastguard Worker paddw m1, m4 3026*c0909341SAndroid Build Coastguard Worker pmaxsw m0, m13 3027*c0909341SAndroid Build Coastguard Worker pmaxsw m1, m13 3028*c0909341SAndroid Build Coastguard Worker pminsw m0, m12 3029*c0909341SAndroid Build Coastguard Worker pminsw m1, m12 3030*c0909341SAndroid Build Coastguard Worker movifnidn dstq, dstmp 
3031*c0909341SAndroid Build Coastguard Worker mova [dstq+ 0], m0 3032*c0909341SAndroid Build Coastguard Worker mova [dstq+16], m1 3033*c0909341SAndroid Build Coastguard Worker 3034*c0909341SAndroid Build Coastguard Worker dec hw 3035*c0909341SAndroid Build Coastguard Worker jle %%end_y_v_overlap 3036*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32 3037*c0909341SAndroid Build Coastguard Worker add srcq, r2mp 3038*c0909341SAndroid Build Coastguard Worker add dstq, r2mp 3039*c0909341SAndroid Build Coastguard Worker mov dstmp, dstq 3040*c0909341SAndroid Build Coastguard Worker%else 3041*c0909341SAndroid Build Coastguard Worker add srcq, r13mp 3042*c0909341SAndroid Build Coastguard Worker add dstq, r13mp 3043*c0909341SAndroid Build Coastguard Worker add lumaq, lstrideq 3044*c0909341SAndroid Build Coastguard Worker%endif 3045*c0909341SAndroid Build Coastguard Worker add grain_lutq, 82*2 3046*c0909341SAndroid Build Coastguard Worker%if %3 3047*c0909341SAndroid Build Coastguard Worker jmp %%loop_y 3048*c0909341SAndroid Build Coastguard Worker%else 3049*c0909341SAndroid Build Coastguard Worker btc hd, 16 3050*c0909341SAndroid Build Coastguard Worker jc %%loop_y 3051*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32 3052*c0909341SAndroid Build Coastguard Worker mov r5, r5m 3053*c0909341SAndroid Build Coastguard Worker%endif 3054*c0909341SAndroid Build Coastguard Worker SPLATD m2, [PIC_ptr(pw_27_17_17_27)+4] 3055*c0909341SAndroid Build Coastguard Worker jmp %%loop_y_v_overlap 3056*c0909341SAndroid Build Coastguard Worker%endif 3057*c0909341SAndroid Build Coastguard Worker 3058*c0909341SAndroid Build Coastguard Worker%%end_y_v_overlap: 3059*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32 3060*c0909341SAndroid Build Coastguard Worker DEFINE_ARGS dst, src, scaling, offxy, w, luma, grain_lut 3061*c0909341SAndroid Build Coastguard Worker 3062*c0909341SAndroid Build Coastguard Worker mov wq, r4m 3063*c0909341SAndroid Build Coastguard Worker%endif 
3064*c0909341SAndroid Build Coastguard Worker add wq, 16 3065*c0909341SAndroid Build Coastguard Worker jge %%end_hv 3066*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32 3067*c0909341SAndroid Build Coastguard Worker mov srcq, r1mp 3068*c0909341SAndroid Build Coastguard Worker%else 3069*c0909341SAndroid Build Coastguard Worker mov srcq, r10mp 3070*c0909341SAndroid Build Coastguard Worker%endif 3071*c0909341SAndroid Build Coastguard Worker mov dstq, r11mp 3072*c0909341SAndroid Build Coastguard Worker mov lumaq, r12mp 3073*c0909341SAndroid Build Coastguard Worker lea srcq, [srcq+wq*2] 3074*c0909341SAndroid Build Coastguard Worker lea dstq, [dstq+wq*2] 3075*c0909341SAndroid Build Coastguard Worker lea lumaq, [lumaq+wq*(2<<%2)] 3076*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32 3077*c0909341SAndroid Build Coastguard Worker mov r0mp, dstq 3078*c0909341SAndroid Build Coastguard Worker mov r9mp, lumaq 3079*c0909341SAndroid Build Coastguard Worker mov r4m, wq 3080*c0909341SAndroid Build Coastguard Worker%endif 3081*c0909341SAndroid Build Coastguard Worker 3082*c0909341SAndroid Build Coastguard Worker%if %2 3083*c0909341SAndroid Build Coastguard Worker ; since fg_dataq.overlap is guaranteed to be set, we never jump 3084*c0909341SAndroid Build Coastguard Worker ; back to .loop_x_v_overlap, and instead always fall-through to 3085*c0909341SAndroid Build Coastguard Worker ; h+v overlap 3086*c0909341SAndroid Build Coastguard Worker%else 3087*c0909341SAndroid Build Coastguard Worker btc dword r8m, 2 3088*c0909341SAndroid Build Coastguard Worker jc %%loop_x_hv_overlap 3089*c0909341SAndroid Build Coastguard Worker add offxyd, 16 3090*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32 3091*c0909341SAndroid Build Coastguard Worker add dword [rsp+8*mmsize+1*gprsize], 16 3092*c0909341SAndroid Build Coastguard Worker%else 3093*c0909341SAndroid Build Coastguard Worker add r11d, 16 3094*c0909341SAndroid Build Coastguard Worker%endif 3095*c0909341SAndroid Build 
Coastguard Worker jmp %%loop_x_odd_v_overlap 3096*c0909341SAndroid Build Coastguard Worker%endif 3097*c0909341SAndroid Build Coastguard Worker 3098*c0909341SAndroid Build Coastguard Worker%%loop_x_hv_overlap: 3099*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32 3100*c0909341SAndroid Build Coastguard Worker DEFINE_ARGS dst, src, scaling, offxy, w, picptr, grain_lut 3101*c0909341SAndroid Build Coastguard Worker 3102*c0909341SAndroid Build Coastguard Worker mov t0d, [rsp+mmsize*8+gprsize*1] ; top_offxy 3103*c0909341SAndroid Build Coastguard Worker add offxyd, 16 3104*c0909341SAndroid Build Coastguard Worker add t0d, 16 3105*c0909341SAndroid Build Coastguard Worker mov [rsp+mmsize*8+gprsize*0], offxyd ; left_offxyd 3106*c0909341SAndroid Build Coastguard Worker mov [rsp+mmsize*8+gprsize*2], t0d ; topleft_offxyd 3107*c0909341SAndroid Build Coastguard Worker 3108*c0909341SAndroid Build Coastguard Worker DEFINE_ARGS dst, src, scaling, see, w, picptr, grain_lut 3109*c0909341SAndroid Build Coastguard Worker 3110*c0909341SAndroid Build Coastguard Worker mov seed, r3m 3111*c0909341SAndroid Build Coastguard Worker xor t0d, t0d 3112*c0909341SAndroid Build Coastguard Worker%else 3113*c0909341SAndroid Build Coastguard Worker ; we assume from the block above that bits 8-15 of r7d are zero'ed 3114*c0909341SAndroid Build Coastguard Worker%endif 3115*c0909341SAndroid Build Coastguard Worker mov r6d, seed 3116*c0909341SAndroid Build Coastguard Worker or seed, 0xeff4eff4 3117*c0909341SAndroid Build Coastguard Worker test seeb, seeh 3118*c0909341SAndroid Build Coastguard Worker setp t0b ; parity of top_seed 3119*c0909341SAndroid Build Coastguard Worker shr seed, 16 3120*c0909341SAndroid Build Coastguard Worker shl t0d, 16 3121*c0909341SAndroid Build Coastguard Worker test seeb, seeh 3122*c0909341SAndroid Build Coastguard Worker setp t0b ; parity of cur_seed 3123*c0909341SAndroid Build Coastguard Worker or r6d, 0x00010001 3124*c0909341SAndroid Build Coastguard Worker xor t0d, r6d 
3125*c0909341SAndroid Build Coastguard Worker mov seed, t0d 3126*c0909341SAndroid Build Coastguard Worker ror seed, 1 ; updated (cur_seed << 16) | top_seed 3127*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32 3128*c0909341SAndroid Build Coastguard Worker mov r3m, seed 3129*c0909341SAndroid Build Coastguard Worker 3130*c0909341SAndroid Build Coastguard Worker DEFINE_ARGS dst, src, scaling, offy, w, picptr, offx 3131*c0909341SAndroid Build Coastguard Worker 3132*c0909341SAndroid Build Coastguard Worker mov offxd, offyd 3133*c0909341SAndroid Build Coastguard Worker%else 3134*c0909341SAndroid Build Coastguard Worker DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ 3135*c0909341SAndroid Build Coastguard Worker offx, offy, see, left_offxy, top_offxy, topleft_offxy, luma, lstride 3136*c0909341SAndroid Build Coastguard Worker 3137*c0909341SAndroid Build Coastguard Worker lea topleft_offxyq, [top_offxyq+16] 3138*c0909341SAndroid Build Coastguard Worker lea left_offxyq, [offyq+16] 3139*c0909341SAndroid Build Coastguard Worker mov offyd, seed 3140*c0909341SAndroid Build Coastguard Worker mov offxd, seed 3141*c0909341SAndroid Build Coastguard Worker%endif 3142*c0909341SAndroid Build Coastguard Worker ror offyd, 8 3143*c0909341SAndroid Build Coastguard Worker ror offxd, 12 3144*c0909341SAndroid Build Coastguard Worker and offyd, 0xf000f 3145*c0909341SAndroid Build Coastguard Worker and offxd, 0xf000f 3146*c0909341SAndroid Build Coastguard Worker imul offyd, 164>>%3 3147*c0909341SAndroid Build Coastguard Worker ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy 3148*c0909341SAndroid Build Coastguard Worker lea offyq, [offyq+offxq*(2-%2)+0x10001*((3+(6>>%3))*82+3+(6>>%2))+(32>>%3)*82] 3149*c0909341SAndroid Build Coastguard Worker 3150*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32 3151*c0909341SAndroid Build Coastguard Worker DEFINE_ARGS dst, src, scaling, offxy, h, picptr, top_offxy 3152*c0909341SAndroid Build Coastguard Worker%else 
3153*c0909341SAndroid Build Coastguard Worker DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ 3154*c0909341SAndroid Build Coastguard Worker h, offxy, see, left_offxy, top_offxy, topleft_offxy, luma, lstride 3155*c0909341SAndroid Build Coastguard Worker%endif 3156*c0909341SAndroid Build Coastguard Worker movzx top_offxyd, offxyw 3157*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32 3158*c0909341SAndroid Build Coastguard Worker mov [rsp+8*mmsize+1*gprsize], top_offxyd 3159*c0909341SAndroid Build Coastguard Worker 3160*c0909341SAndroid Build Coastguard Worker DEFINE_ARGS dst, src, scaling, offxy, h, picptr, grain_lut 3161*c0909341SAndroid Build Coastguard Worker%endif 3162*c0909341SAndroid Build Coastguard Worker shr offxyd, 16 3163*c0909341SAndroid Build Coastguard Worker 3164*c0909341SAndroid Build Coastguard Worker%if %3 == 0 3165*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32 3166*c0909341SAndroid Build Coastguard Worker mov r5, r5m 3167*c0909341SAndroid Build Coastguard Worker%endif 3168*c0909341SAndroid Build Coastguard Worker SPLATD m2, [PIC_ptr(pw_27_17_17_27)] 3169*c0909341SAndroid Build Coastguard Worker%endif 3170*c0909341SAndroid Build Coastguard Worker 3171*c0909341SAndroid Build Coastguard Worker mov hd, r7m 3172*c0909341SAndroid Build Coastguard Worker mov grain_lutq, grain_lutmp 3173*c0909341SAndroid Build Coastguard Worker%%loop_y_hv_overlap: 3174*c0909341SAndroid Build Coastguard Worker ; grain = grain_lut[offy+y][offx+x] 3175*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32 3176*c0909341SAndroid Build Coastguard Worker mov r5, [rsp+8*mmsize+0*gprsize] ; left_offxy 3177*c0909341SAndroid Build Coastguard Worker mov r0, [rsp+8*mmsize+1*gprsize] ; top_offxy 3178*c0909341SAndroid Build Coastguard Worker movd m5, [grain_lutq+r5*2] 3179*c0909341SAndroid Build Coastguard Worker%else 3180*c0909341SAndroid Build Coastguard Worker movd m5, [grain_lutq+left_offxyq*2] 3181*c0909341SAndroid Build Coastguard Worker%endif 
3182*c0909341SAndroid Build Coastguard Worker movu m7, [grain_lutq+offxyq*2] 3183*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32 3184*c0909341SAndroid Build Coastguard Worker mov r5, [rsp+8*mmsize+2*gprsize] 3185*c0909341SAndroid Build Coastguard Worker movu m4, [grain_lutq+r0*2] 3186*c0909341SAndroid Build Coastguard Worker%if %2 3187*c0909341SAndroid Build Coastguard Worker pinsrw m5, [grain_lutq+r5*2], 2 3188*c0909341SAndroid Build Coastguard Worker%else 3189*c0909341SAndroid Build Coastguard Worker movd m3, [grain_lutq+r5*2] 3190*c0909341SAndroid Build Coastguard Worker%endif 3191*c0909341SAndroid Build Coastguard Worker%else 3192*c0909341SAndroid Build Coastguard Worker movu m4, [grain_lutq+top_offxyq*2] 3193*c0909341SAndroid Build Coastguard Worker%if %2 3194*c0909341SAndroid Build Coastguard Worker pinsrw m5, [grain_lutq+topleft_offxyq*2], 2 ; { left, _, top/left } 3195*c0909341SAndroid Build Coastguard Worker%else 3196*c0909341SAndroid Build Coastguard Worker movd m3, [grain_lutq+topleft_offxyq*2] 3197*c0909341SAndroid Build Coastguard Worker%endif 3198*c0909341SAndroid Build Coastguard Worker%endif 3199*c0909341SAndroid Build Coastguard Worker%if %2 == 0 3200*c0909341SAndroid Build Coastguard Worker punpckldq m5, m3 3201*c0909341SAndroid Build Coastguard Worker%endif 3202*c0909341SAndroid Build Coastguard Worker punpckldq m3, m7, m4 ; { cur0/1,top0/1,cur2/3,top2/3 } 3203*c0909341SAndroid Build Coastguard Worker punpcklwd m5, m3 ; { left/cur0,_/cur1,topleft/top0,_/top1 } 3204*c0909341SAndroid Build Coastguard Worker%if %1 3205*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32 3206*c0909341SAndroid Build Coastguard Worker mov r5, r5m 3207*c0909341SAndroid Build Coastguard Worker%endif 3208*c0909341SAndroid Build Coastguard Worker%if %2 3209*c0909341SAndroid Build Coastguard Worker movddup m0, [PIC_ptr(pw_23_22)] 3210*c0909341SAndroid Build Coastguard Worker%else 3211*c0909341SAndroid Build Coastguard Worker movddup m0, 
[PIC_ptr(pw_27_17_17_27)] 3212*c0909341SAndroid Build Coastguard Worker%endif 3213*c0909341SAndroid Build Coastguard Worker%else 3214*c0909341SAndroid Build Coastguard Worker pshufd m0, m15, q1010 3215*c0909341SAndroid Build Coastguard Worker%endif 3216*c0909341SAndroid Build Coastguard Worker pmaddwd m5, m0 3217*c0909341SAndroid Build Coastguard Worker%if %1 3218*c0909341SAndroid Build Coastguard Worker paddd m5, [PIC_ptr(pd_16)] 3219*c0909341SAndroid Build Coastguard Worker%else 3220*c0909341SAndroid Build Coastguard Worker paddd m5, m14 3221*c0909341SAndroid Build Coastguard Worker%endif 3222*c0909341SAndroid Build Coastguard Worker psrad m5, 5 3223*c0909341SAndroid Build Coastguard Worker packssdw m5, m5 3224*c0909341SAndroid Build Coastguard Worker pmaxsw m5, m8 3225*c0909341SAndroid Build Coastguard Worker pminsw m5, m9 3226*c0909341SAndroid Build Coastguard Worker shufps m5, m3, q3210 ; cur0/1,top0/1,cur2/3,top2/3 3227*c0909341SAndroid Build Coastguard Worker shufps m3, m5, m7, q3220 ; cur0-7 post-h_filter 3228*c0909341SAndroid Build Coastguard Worker shufps m5, m4, q3231 ; top0-7 post-h_filter 3229*c0909341SAndroid Build Coastguard Worker 3230*c0909341SAndroid Build Coastguard Worker punpckhwd m7, m5, m3 3231*c0909341SAndroid Build Coastguard Worker punpcklwd m5, m3 ; {top/cur interleaved} 3232*c0909341SAndroid Build Coastguard Worker REPX {pmaddwd x, m2}, m7, m5 3233*c0909341SAndroid Build Coastguard Worker%if %1 3234*c0909341SAndroid Build Coastguard Worker REPX {paddd x, [PIC_ptr(pd_16)]}, m5, m7 3235*c0909341SAndroid Build Coastguard Worker%else 3236*c0909341SAndroid Build Coastguard Worker REPX {paddd x, m14}, m5, m7 3237*c0909341SAndroid Build Coastguard Worker%endif 3238*c0909341SAndroid Build Coastguard Worker REPX {psrad x, 5}, m5, m7 3239*c0909341SAndroid Build Coastguard Worker packssdw m3, m5, m7 3240*c0909341SAndroid Build Coastguard Worker pmaxsw m3, m8 3241*c0909341SAndroid Build Coastguard Worker pminsw m3, m9 3242*c0909341SAndroid Build 
Coastguard Worker 3243*c0909341SAndroid Build Coastguard Worker ; right half 3244*c0909341SAndroid Build Coastguard Worker movu m4, [grain_lutq+offxyq*2+16] 3245*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32 3246*c0909341SAndroid Build Coastguard Worker movu m0, [grain_lutq+r0*2+16] 3247*c0909341SAndroid Build Coastguard Worker%else 3248*c0909341SAndroid Build Coastguard Worker movu m0, [grain_lutq+top_offxyq*2+16] 3249*c0909341SAndroid Build Coastguard Worker%endif 3250*c0909341SAndroid Build Coastguard Worker punpckhwd m1, m0, m4 3251*c0909341SAndroid Build Coastguard Worker punpcklwd m0, m4 ; {top/cur interleaved} 3252*c0909341SAndroid Build Coastguard Worker REPX {pmaddwd x, m2}, m1, m0 3253*c0909341SAndroid Build Coastguard Worker%if %1 3254*c0909341SAndroid Build Coastguard Worker REPX {paddd x, [PIC_ptr(pd_16)]}, m1, m0 3255*c0909341SAndroid Build Coastguard Worker%else 3256*c0909341SAndroid Build Coastguard Worker REPX {paddd x, m14}, m1, m0 3257*c0909341SAndroid Build Coastguard Worker%endif 3258*c0909341SAndroid Build Coastguard Worker REPX {psrad x, 5}, m1, m0 3259*c0909341SAndroid Build Coastguard Worker packssdw m4, m0, m1 3260*c0909341SAndroid Build Coastguard Worker pmaxsw m4, m8 3261*c0909341SAndroid Build Coastguard Worker pminsw m4, m9 3262*c0909341SAndroid Build Coastguard Worker 3263*c0909341SAndroid Build Coastguard Worker ; src 3264*c0909341SAndroid Build Coastguard Worker mova m0, [srcq] 3265*c0909341SAndroid Build Coastguard Worker mova m1, [srcq+16] 3266*c0909341SAndroid Build Coastguard Worker 3267*c0909341SAndroid Build Coastguard Worker ; luma_src 3268*c0909341SAndroid Build Coastguard Worker pxor mzero, mzero 3269*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32 3270*c0909341SAndroid Build Coastguard Worker DEFINE_ARGS dst, src, scaling, offxy, h, luma, grain_lut 3271*c0909341SAndroid Build Coastguard Worker 3272*c0909341SAndroid Build Coastguard Worker mov lumaq, r9mp 3273*c0909341SAndroid Build Coastguard Worker%endif 
3274*c0909341SAndroid Build Coastguard Worker mova m6, [lumaq+ 0] 3275*c0909341SAndroid Build Coastguard Worker mova m5, [lumaq+(16<<%2)] 3276*c0909341SAndroid Build Coastguard Worker%if %2 3277*c0909341SAndroid Build Coastguard Worker phaddw m6, [lumaq+16] 3278*c0909341SAndroid Build Coastguard Worker phaddw m5, [lumaq+48] 3279*c0909341SAndroid Build Coastguard Worker%endif 3280*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32 3281*c0909341SAndroid Build Coastguard Worker add lumaq, r10mp 3282*c0909341SAndroid Build Coastguard Worker mov r9mp, lumaq 3283*c0909341SAndroid Build Coastguard Worker%endif 3284*c0909341SAndroid Build Coastguard Worker%if %2 3285*c0909341SAndroid Build Coastguard Worker pavgw m6, mzero 3286*c0909341SAndroid Build Coastguard Worker pavgw m5, mzero 3287*c0909341SAndroid Build Coastguard Worker%endif 3288*c0909341SAndroid Build Coastguard Worker 3289*c0909341SAndroid Build Coastguard Worker%if %1 3290*c0909341SAndroid Build Coastguard Worker punpckhwd m7, m6, m0 3291*c0909341SAndroid Build Coastguard Worker punpcklwd m6, m0 3292*c0909341SAndroid Build Coastguard Worker REPX {pmaddwd x, m14}, m7, m6 3293*c0909341SAndroid Build Coastguard Worker REPX {psrad x, 6}, m7, m6 3294*c0909341SAndroid Build Coastguard Worker packssdw m6, m7 3295*c0909341SAndroid Build Coastguard Worker punpckhwd m7, m5, m1 3296*c0909341SAndroid Build Coastguard Worker punpcklwd m5, m1 ; { luma, chroma } 3297*c0909341SAndroid Build Coastguard Worker REPX {pmaddwd x, m14}, m7, m5 3298*c0909341SAndroid Build Coastguard Worker REPX {psrad x, 6}, m7, m5 3299*c0909341SAndroid Build Coastguard Worker packssdw m5, m7 3300*c0909341SAndroid Build Coastguard Worker pxor mzero, mzero 3301*c0909341SAndroid Build Coastguard Worker REPX {paddw x, m15}, m6, m5 3302*c0909341SAndroid Build Coastguard Worker REPX {pmaxsw x, mzero}, m6, m5 3303*c0909341SAndroid Build Coastguard Worker REPX {pminsw x, m10}, m6, m5 ; clip_pixel() 3304*c0909341SAndroid Build Coastguard Worker%else 
3305*c0909341SAndroid Build Coastguard Worker REPX {pand x, m10}, m6, m5 3306*c0909341SAndroid Build Coastguard Worker%endif 3307*c0909341SAndroid Build Coastguard Worker 3308*c0909341SAndroid Build Coastguard Worker ; scaling[luma_src] 3309*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32 3310*c0909341SAndroid Build Coastguard Worker vpgatherdw m7, m6, scalingq-1, r0, r5, 8, 1 3311*c0909341SAndroid Build Coastguard Worker vpgatherdw m6, m5, scalingq-1, r0, r5, 8, 1 3312*c0909341SAndroid Build Coastguard Worker%else 3313*c0909341SAndroid Build Coastguard Worker%if %3 == 0 3314*c0909341SAndroid Build Coastguard Worker ; register shortage :) 3315*c0909341SAndroid Build Coastguard Worker push r12 3316*c0909341SAndroid Build Coastguard Worker%endif 3317*c0909341SAndroid Build Coastguard Worker vpgatherdw m7, m6, scalingq-1, r2, r12, 8, 1 3318*c0909341SAndroid Build Coastguard Worker vpgatherdw m6, m5, scalingq-1, r2, r12, 8, 1 3319*c0909341SAndroid Build Coastguard Worker%if %3 == 0 3320*c0909341SAndroid Build Coastguard Worker pop r12 3321*c0909341SAndroid Build Coastguard Worker%endif 3322*c0909341SAndroid Build Coastguard Worker%endif 3323*c0909341SAndroid Build Coastguard Worker REPX {psrlw x, 8}, m7, m6 3324*c0909341SAndroid Build Coastguard Worker 3325*c0909341SAndroid Build Coastguard Worker ; noise = round2(scaling[luma_src] * grain, scaling_shift) 3326*c0909341SAndroid Build Coastguard Worker REPX {pmullw x, m11}, m7, m6 3327*c0909341SAndroid Build Coastguard Worker pmulhrsw m3, m7 3328*c0909341SAndroid Build Coastguard Worker pmulhrsw m4, m6 3329*c0909341SAndroid Build Coastguard Worker 3330*c0909341SAndroid Build Coastguard Worker ; dst = clip_pixel(src, noise) 3331*c0909341SAndroid Build Coastguard Worker paddw m0, m3 3332*c0909341SAndroid Build Coastguard Worker paddw m1, m4 3333*c0909341SAndroid Build Coastguard Worker pmaxsw m0, m13 3334*c0909341SAndroid Build Coastguard Worker pmaxsw m1, m13 3335*c0909341SAndroid Build Coastguard Worker pminsw 
m0, m12 3336*c0909341SAndroid Build Coastguard Worker pminsw m1, m12 3337*c0909341SAndroid Build Coastguard Worker movifnidn dstq, dstmp 3338*c0909341SAndroid Build Coastguard Worker mova [dstq+ 0], m0 3339*c0909341SAndroid Build Coastguard Worker mova [dstq+16], m1 3340*c0909341SAndroid Build Coastguard Worker 3341*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32 3342*c0909341SAndroid Build Coastguard Worker add srcq, r2mp 3343*c0909341SAndroid Build Coastguard Worker add dstq, r2mp 3344*c0909341SAndroid Build Coastguard Worker mov dstmp, dstq 3345*c0909341SAndroid Build Coastguard Worker%else 3346*c0909341SAndroid Build Coastguard Worker add srcq, r13mp 3347*c0909341SAndroid Build Coastguard Worker add dstq, r13mp 3348*c0909341SAndroid Build Coastguard Worker add lumaq, lstrideq 3349*c0909341SAndroid Build Coastguard Worker%endif 3350*c0909341SAndroid Build Coastguard Worker add grain_lutq, 82*2 3351*c0909341SAndroid Build Coastguard Worker dec hw 3352*c0909341SAndroid Build Coastguard Worker%if %3 3353*c0909341SAndroid Build Coastguard Worker jg %%loop_y_h_overlap 3354*c0909341SAndroid Build Coastguard Worker%else 3355*c0909341SAndroid Build Coastguard Worker jle %%end_y_hv_overlap 3356*c0909341SAndroid Build Coastguard Worker btc hd, 16 3357*c0909341SAndroid Build Coastguard Worker jc %%loop_y_h_overlap 3358*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32 3359*c0909341SAndroid Build Coastguard Worker mov r5, r5m 3360*c0909341SAndroid Build Coastguard Worker%endif 3361*c0909341SAndroid Build Coastguard Worker SPLATD m2, [PIC_ptr(pw_27_17_17_27)+4] 3362*c0909341SAndroid Build Coastguard Worker jmp %%loop_y_hv_overlap 3363*c0909341SAndroid Build Coastguard Worker%%end_y_hv_overlap: 3364*c0909341SAndroid Build Coastguard Worker%endif 3365*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32 3366*c0909341SAndroid Build Coastguard Worker DEFINE_ARGS dst, src, scaling, offxy, w, luma, grain_lut 3367*c0909341SAndroid Build Coastguard Worker 
3368*c0909341SAndroid Build Coastguard Worker mov wq, r4m 3369*c0909341SAndroid Build Coastguard Worker%endif 3370*c0909341SAndroid Build Coastguard Worker add wq, 16 3371*c0909341SAndroid Build Coastguard Worker jge %%end_hv 3372*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32 3373*c0909341SAndroid Build Coastguard Worker mov srcq, r1mp 3374*c0909341SAndroid Build Coastguard Worker%else 3375*c0909341SAndroid Build Coastguard Worker mov srcq, r10mp 3376*c0909341SAndroid Build Coastguard Worker%endif 3377*c0909341SAndroid Build Coastguard Worker mov dstq, r11mp 3378*c0909341SAndroid Build Coastguard Worker mov lumaq, r12mp 3379*c0909341SAndroid Build Coastguard Worker lea srcq, [srcq+wq*2] 3380*c0909341SAndroid Build Coastguard Worker lea dstq, [dstq+wq*2] 3381*c0909341SAndroid Build Coastguard Worker lea lumaq, [lumaq+wq*(2<<%2)] 3382*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32 3383*c0909341SAndroid Build Coastguard Worker mov dstmp, dstq 3384*c0909341SAndroid Build Coastguard Worker mov r9mp, lumaq 3385*c0909341SAndroid Build Coastguard Worker mov r4m, wq 3386*c0909341SAndroid Build Coastguard Worker%endif 3387*c0909341SAndroid Build Coastguard Worker%if %2 3388*c0909341SAndroid Build Coastguard Worker jmp %%loop_x_hv_overlap 3389*c0909341SAndroid Build Coastguard Worker%else 3390*c0909341SAndroid Build Coastguard Worker or dword r8m, 4 3391*c0909341SAndroid Build Coastguard Worker add offxyd, 16 3392*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32 3393*c0909341SAndroid Build Coastguard Worker add dword [rsp+8*mmsize+1*gprsize], 16 3394*c0909341SAndroid Build Coastguard Worker%else 3395*c0909341SAndroid Build Coastguard Worker add r11d, 16 ; top_offxy += 16 3396*c0909341SAndroid Build Coastguard Worker%endif 3397*c0909341SAndroid Build Coastguard Worker jmp %%loop_x_odd_v_overlap 3398*c0909341SAndroid Build Coastguard Worker%endif 3399*c0909341SAndroid Build Coastguard Worker 3400*c0909341SAndroid Build Coastguard Worker%%end_hv: 
3401*c0909341SAndroid Build Coastguard Worker RET 3402*c0909341SAndroid Build Coastguard Worker%endmacro 3403*c0909341SAndroid Build Coastguard Worker 3404*c0909341SAndroid Build Coastguard Worker %%FGUV_32x32xN_LOOP 1, %2, %3 3405*c0909341SAndroid Build Coastguard Worker.csfl: 3406*c0909341SAndroid Build Coastguard Worker %%FGUV_32x32xN_LOOP 0, %2, %3 3407*c0909341SAndroid Build Coastguard Worker 3408*c0909341SAndroid Build Coastguard Worker%if STACK_ALIGNMENT < mmsize 3409*c0909341SAndroid Build Coastguard WorkerDECLARE_ARG 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12 3410*c0909341SAndroid Build Coastguard Worker%endif 3411*c0909341SAndroid Build Coastguard Worker%endmacro 3412*c0909341SAndroid Build Coastguard Worker 3413*c0909341SAndroid Build Coastguard WorkerFGUV_FN 420, 1, 1 3414*c0909341SAndroid Build Coastguard WorkerFGUV_FN 422, 1, 0 3415*c0909341SAndroid Build Coastguard WorkerFGUV_FN 444, 0, 0 3416
; The three FGUV_FN expansions above differ only in their arguments.
; FGUV_FN forwards its 2nd and 3rd arguments unchanged as %2 / %3 to both
; %%FGUV_32x32xN_LOOP expansions: the first is expanded with %1 = 1, the
; second (placed after the .csfl label) with %1 = 0.
; Effect of the flags in the hv-overlap part of the loop macro:
;   %2 != 0: two luma vectors are read per output vector and adjacent samples
;            are folded with phaddw + pavgw; luma advances by wq*(2<<%2).
;   %3 != 0: the row-offset multiplier is halved (164>>%3) and the hv-overlap
;            row loop hands over to %%loop_y_h_overlap after a single row.
;   %3 == 0: a second hv-overlap row is processed with the coefficients at
;            pw_27_17_17_27+4 before handing over (bit 16 of hd is the toggle).
; NOTE(review): 420/422/444 presumably name the chroma layout (matching the
; %2/%3 subsampling flags); how the first argument is consumed is defined
; earlier in FGUV_FN, outside this excerpt -- confirm there.