; Copyright © 2019-2021, VideoLAN and dav1d authors
; Copyright © 2019, Two Orioles, LLC
; All rights reserved.
;
; Redistribution and use in source and binary forms, with or without
; modification, are permitted provided that the following conditions are met:
;
; 1. Redistributions of source code must retain the above copyright notice, this
;    list of conditions and the following disclaimer.
;
; 2. Redistributions in binary form must reproduce the above copyright notice,
;    this list of conditions and the following disclaimer in the documentation
;    and/or other materials provided with the distribution.
;
; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

%include "config.asm"
%include "ext/x86/x86inc.asm"
%include "x86/filmgrain_common.asm"

SECTION_RODATA

; Constants below that are not referenced by generate_grain_y_8bpc are
; presumably consumed by kernels later in this file.
pw_1024: times 8 dw 1024
pb_27_17_17_27: db 27, 17, 17, 27
                times 6 db 0, 32
pb_23_22_h: db 23, 22
            times 7 db 0, 32
pb_27_17: times 8 db 27, 17
pb_17_27: times 8 db 17, 27
pb_23_22: times 8 db 23, 22
; pshufb lookup table used by the seed update (selects which seeds get bit 15)
pb_mask: db 0, 0x80, 0x80, 0, 0x80, 0, 0, 0x80, 0x80, 0, 0, 0x80, 0, 0x80, 0x80, 0
; per-lane feedback-tap masks ANDed with the broadcast seed (one word per seed)
rnd_next_upperbit_mask: dw 0x100B, 0x2016, 0x402C, 0x8058
; byte-select mask for merging one freshly filtered pixel into a loaded row
; (.ar3 loads it at +0, .ar2 at +1)
byte_blend: db 0, 0, 0, 0xff, 0, 0, 0, 0
; indexed as [pw_seed_xor+uv*4] and XORed into the seed by the chroma kernels
pw_seed_xor: times 2 dw 0xb524
             times 2 dw 0x49d8
pb_1: times 4 db 1
; pmulhuw multipliers: per-lane right shifts of the seed by 1, 2, 3 and 4
hmul_bits: dw 32768, 16384, 8192, 4096
; pmulhrsw multipliers, indexed by FGData.grain_scale_shift
round: dw 2048, 1024, 512
; pmullw multipliers used by the seed update
mul_bits: dw 256, 128, 64, 32, 16
; rounding terms, addressed as [round_vals+shift*2-12], i.e. entry 0 is used
; for ar_coeff_shift == 6 and equals 1 << (shift - 1)
round_vals: dw 32, 64, 128, 256, 512
max: dw 255, 240, 235
min: dw 0, 16
pw_1: dw 1

; JMP_TABLE fn, isa, lag0, lag1, ...
; Emits one 32-bit entry per lag argument N holding the distance from the
; table to the local label `.arN` of the mangled function fn_8bpc_isa, and
; defines fn_8bpc_isa_table as the table's label.  Users load an entry and add
; the table address back to obtain the jump target.
%macro JMP_TABLE 2-*
    %xdefine %1_8bpc_%2_table %%table
    %xdefine %%base %1_8bpc_%2_table
    %xdefine %%prefix mangle(private_prefix %+ _%1_8bpc_%2)
    %%table:
    %rep %0 - 2
        dd %%prefix %+ .ar%3 - %%base
        %rotate 1                       ; next lag argument becomes %3
    %endrep
%endmacro

JMP_TABLE generate_grain_y, ssse3, 0, 1, 2, 3
JMP_TABLE generate_grain_uv_420, ssse3, 0, 1, 2, 3
JMP_TABLE generate_grain_uv_422, ssse3, 0, 1, 2, 3
JMP_TABLE generate_grain_uv_444, ssse3, 0, 1, 2, 3

SECTION .text
%if ARCH_X86_32
%define PIC_ptr(a) base+a
%else
%define PIC_ptr(a) a
%endif

; SCRATCH src, dst, slot
; x86-32: spill m<src> to stack slot <slot> and make m<dst> an alias for that
;         memory operand (only m0-m7 exist there).
; x86-64: simply rename m<src> to m<dst> via SWAP.
%macro SCRATCH 3
%if ARCH_X86_32
    mova            [rsp+%3*mmsize], m%1
%define m%2 [rsp+%3*mmsize]
%else
    SWAP            %1, %2
%endif
%endmacro

;-----------------------------------------------------------------------------
; generate_grain_y_8bpc(buf, fg_data)
;
; 1. Fills buf with signed 8-bit grain: four 16-bit seeds are advanced per
;    iteration, each seed >> 5 indexes the 16-bit gaussian_sequence table, and
;    the samples are scaled with pmulhrsw by round[grain_scale_shift] and
;    packed to bytes.  The loop covers 73*82 bytes in 4-byte stores.
;    NOTE(review): 73*82 is not a multiple of 4, so the final store writes
;    2 bytes past buf+73*82 - confirm the caller's buffer has that slack.
; 2. Dispatches on FGData.ar_coeff_lag through generate_grain_y_8bpc_ssse3_table
;    to .ar0/.ar1/.ar2/.ar3.  Each non-zero lag filters 70 rows of 76 pixels in
;    place (row stride 82), starting at row 3, column 3, adding
;    (sum(coeff * neighbour) + rounding) >> ar_coeff_shift to every pixel and
;    saturating to int8.
;
; Register counts follow x86inc's cglobal convention (2 args; 7 GPRs on x86-32,
; 9 on x86-64; 16 vector registers requested).
;-----------------------------------------------------------------------------
INIT_XMM ssse3
cglobal generate_grain_y_8bpc, 2, 7 + 2 * ARCH_X86_64, 16, buf, fg_data
    LEA             r4, $$
%define base r4-$$
    movq            m1, [base+rnd_next_upperbit_mask]
    movq            m4, [base+mul_bits]
    movq            m7, [base+hmul_bits]
    mov             r2d, [fg_dataq+FGData.grain_scale_shift]
    movd            m2, [base+round+r2*2]
    movd            m0, [fg_dataq+FGData.seed]
    mova            m5, [base+pb_mask]
    pshuflw         m2, m2, q0000       ; broadcast scale multiplier
    pshuflw         m0, m0, q0000       ; broadcast seed
    mov             r2, -73*82          ; negative byte offset, counts up to 0
    sub             bufq, r2            ; bufq = end of the 73*82 area
    lea             r3, [base+gaussian_sequence]
.loop:
    pand            m6, m0, m1
    psrlw           m3, m6, 10
    por             m6, m3              ; bits 0xf, 0x1e, 0x3c and 0x78 are set
    pmullw          m6, m4              ; bits 0x0f00 are set
    pshufb          m3, m5, m6          ; set 15th bit for next 4 seeds
    psllq           m6, m3, 30
    por             m3, m6
    psllq           m6, m3, 15
    por             m3, m6              ; aggregate each bit into next seed's high bit
    pmulhuw         m6, m0, m7
    por             m3, m6              ; 4 next output seeds
    pshuflw         m0, m3, q3333       ; last seed feeds the next iteration
    psrlw           m3, 5               ; table index = seed >> 5
%if ARCH_X86_64
    ; split the four 16-bit indices into r5..r8
    movq            r6, m3
    mov             r8, r6
    movzx           r5d, r6w
    shr             r6d, 16
    shr             r8, 32
    movzx           r7, r8w
    shr             r8, 16

    movd            m6, [r3+r5*2]
    pinsrw          m6, [r3+r6*2], 1
    pinsrw          m6, [r3+r7*2], 2
    pinsrw          m6, [r3+r8*2], 3
%else
    ; only r5/r6 are used here, so handle the indices two at a time
    movd            r6, m3
    pshuflw         m3, m3, q3232
    movzx           r5, r6w
    shr             r6, 16

    movd            m6, [r3+r5*2]
    pinsrw          m6, [r3+r6*2], 1

    movd            r6, m3
    movzx           r5, r6w
    shr             r6, 16

    pinsrw          m6, [r3+r5*2], 2
    pinsrw          m6, [r3+r6*2], 3
%endif
    pmulhrsw        m6, m2              ; rounded scale by grain_scale_shift
    packsswb        m6, m6
    movd            [bufq+r2], m6       ; store 4 grain bytes
    add             r2, 4
    jl              .loop

    ; auto-regression code
    movsxd          r2, [fg_dataq+FGData.ar_coeff_lag]
    movsxd          r2, [base+generate_grain_y_8bpc_ssse3_table+r2*4]
    lea             r2, [r2+base+generate_grain_y_8bpc_ssse3_table]
    jmp             r2

    ; ---- lag 1: 3 top taps via pmaddwd, left tap in scalar code ------------
.ar1:
    ; `sar val3d, shiftb` needs its count in cl and the count is loaded into
    ; ecx below; the layouts here presumably put `shift` on the ecx/rcx
    ; register for each ABI (per x86inc's register mapping - confirm).
%if ARCH_X86_32
    DEFINE_ARGS buf, fg_data, cf3, unused, val3, min, max
%elif WIN64
    DEFINE_ARGS shift, fg_data, cf3, buf, val3, min, max, x, val0
    mov             bufq, r0
%else
    DEFINE_ARGS buf, fg_data, cf3, shift, val3, min, max, x, val0
%endif
    movsx           cf3d, byte [fg_dataq+FGData.ar_coeffs_y+3] ; left coefficient
    movd            m4, [fg_dataq+FGData.ar_coeffs_y]
    mov             ecx, [fg_dataq+FGData.ar_coeff_shift]
%if ARCH_X86_32
    ; keep cf3 and the row counter in argument stack slots
    mov             r1m, cf3d
    DEFINE_ARGS buf, shift, val3, min, max, x, val0
%define hd r0mp
%define cf3d r1mp
%elif WIN64
    DEFINE_ARGS shift, h, cf3, buf, val3, min, max, x, val0
%else
    DEFINE_ARGS buf, h, cf3, shift, val3, min, max, x, val0
%endif
    pxor            m6, m6
    pcmpgtb         m7, m6, m4
    punpcklbw       m4, m7              ; sign-extend coefficients to words
    pinsrw          m4, [base+pw_1], 3  ; pair cf2 with 1 so pmaddwd adds rnd
    pshufd          m5, m4, q1111       ; (cf2, 1) pairs
    pshufd          m4, m4, q0000       ; (cf0, cf1) pairs
    movd            m3, [base+round_vals+shiftq*2-12]    ; rnd
    pshuflw         m3, m3, q0000
    sub             bufq, 82*73-(82*3+79) ; bufq = buf + 82*3 + 79
    mov             hd, 70
    mov             mind, -128
    mov             maxd, 127
.y_loop_ar1:
    mov             xq, -76
    movsx           val3d, byte [bufq+xq-1] ; left neighbour of first pixel
.x_loop_ar1:
    movq            m0, [bufq+xq-82-1]  ; top/left
    pcmpgtb         m7, m6, m0
    punpcklbw       m0, m7
    psrldq          m2, m0, 2           ; top
    psrldq          m1, m0, 4           ; top/right
    punpcklwd       m0, m2
    punpcklwd       m1, m3
    pmaddwd         m0, m4
    pmaddwd         m1, m5
    paddd           m0, m1              ; 4 top sums (incl. rounding)
.x_loop_ar1_inner:
    movd            val0d, m0
    psrldq          m0, 4
    imul            val3d, cf3d
    add             val3d, val0d
    sar             val3d, shiftb
    movsx           val0d, byte [bufq+xq]
    add             val3d, val0d
    ; clamp to [mind, maxd]
    cmp             val3d, maxd
    cmovns          val3d, maxd
    cmp             val3d, mind
    cmovs           val3d, mind
    mov             byte [bufq+xq], val3b
    ; keep val3d in-place as left for next x iteration
    inc             xq
    jz              .x_loop_ar1_end
    test            xq, 3
    jnz             .x_loop_ar1_inner
    jmp             .x_loop_ar1

.x_loop_ar1_end:
    add             bufq, 82
    dec             hd
    jg              .y_loop_ar1
.ar0:
    RET

    ; ---- lag 2 -------------------------------------------------------------
    ; m8-m13: coefficient pairs, m14: rounding (dwords), m15: byte_blend mask,
    ; m7: zero (all of m8-m15 are stack slots on x86-32, see SCRATCH)
.ar2:
%if ARCH_X86_32
    ALLOC_STACK -16*8
%endif
    DEFINE_ARGS buf, fg_data, shift
    mov             shiftd, [fg_dataq+FGData.ar_coeff_shift]
    movd            m6, [base+round_vals-12+shiftq*2]
    movd            m7, [base+byte_blend+1]
    SCRATCH         7, 15, 7
    movq            m0, [fg_dataq+FGData.ar_coeffs_y+0]    ; cf0-7
    movd            m1, [fg_dataq+FGData.ar_coeffs_y+8]    ; cf8-11
    pxor            m7, m7
    pshuflw         m6, m6, q0000
    punpcklwd       m6, m7              ; rounding as dwords
    pcmpgtb         m4, m7, m0
    pcmpgtb         m5, m7, m1
    punpcklbw       m0, m4              ; sign-extend coefficients to words
    punpcklbw       m1, m5
    DEFINE_ARGS buf, fg_data, h, x
    pshufd          m4, m1, q0000
    pshufd          m5, m1, q1111
    pshufd          m3, m0, q3333
    pshufd          m2, m0, q2222
    pshufd          m1, m0, q1111
    pshufd          m0, m0, q0000
    SCRATCH         0, 8, 0
    SCRATCH         1, 9, 1
    SCRATCH         2, 10, 2
    SCRATCH         3, 11, 3
    SCRATCH         4, 12, 4
    SCRATCH         5, 13, 5
    SCRATCH         6, 14, 6
    sub             bufq, 82*73-(82*3+79)
    mov             hd, 70
.y_loop_ar2:
    mov             xq, -76

.x_loop_ar2:
    movq            m0, [bufq+xq-82*2-2] ; y=-2,x=[-2,+5]
    movhps          m0, [bufq+xq-82*1-2] ; y=-1,x=[-2,+5]
    pcmpgtb         m2, m7, m0
    punpckhbw       m1, m0, m2
    punpcklbw       m0, m2
    psrldq          m5, m0, 2           ; y=-2,x=[-1,+5]
    psrldq          m3, m1, 2           ; y=-1,x=[-1,+5]
    psrldq          m4, m1, 4           ; y=-1,x=[+0,+5]
    punpcklwd       m2, m0, m5
    punpcklwd       m3, m4
    pmaddwd         m2, m8
    pmaddwd         m3, m11
    paddd           m2, m3

    psrldq          m4, m0, 4           ; y=-2,x=[+0,+5]
    psrldq          m5, m0, 6           ; y=-2,x=[+1,+5]
    psrldq          m6, m0, 8           ; y=-2,x=[+2,+5]
    punpcklwd       m4, m5
    punpcklwd       m6, m1
    psrldq          m5, m1, 6           ; y=-1,x=[+1,+5]
    psrldq          m1, m1, 8           ; y=-1,x=[+2,+5]
    punpcklwd       m5, m1
    pmaddwd         m4, m9
    pmaddwd         m6, m10
    pmaddwd         m5, m12
    paddd           m4, m6
    paddd           m2, m5
    paddd           m2, m4
    paddd           m2, m14             ; + rounding

    movq            m0, [bufq+xq-2]     ; y=0,x=[-2,+5]
.x_loop_ar2_inner:
    pcmpgtb         m4, m7, m0
    punpcklbw       m1, m0, m4
    pmaddwd         m3, m1, m13
    paddd           m3, m2
    psrldq          m1, 4               ; y=0,x=0
    psrldq          m2, 4               ; shift top to next pixel
    ; NOTE(review): 128-bit memory operand; legacy-SSE psrad takes the count
    ; from the low 64 bits and expects 16-byte alignment - this relies on the
    ; FGData layout, confirm against its declaration.
    psrad           m3, [fg_dataq+FGData.ar_coeff_shift]
    ; don't packssdw since we only care about one value
    paddw           m3, m1
    packsswb        m3, m3
    pslldq          m3, 2
    pand            m3, m15
    pandn           m1, m15, m0
    por             m0, m1, m3          ; merge new pixel into the current row
    psrldq          m0, 1
    ; overwrite 2 pixels, but that's ok
    movd            [bufq+xq-1], m0
    inc             xq
    jz              .x_loop_ar2_end
    test            xq, 3
    jnz             .x_loop_ar2_inner
    jmp             .x_loop_ar2

.x_loop_ar2_end:
    add             bufq, 82
    dec             hd
    jg              .y_loop_ar2
    RET

    ; ---- lag 3 -------------------------------------------------------------
    ; [rsp+0*16]..[rsp+5*16] and m8-m13: coefficient pairs,
    ; m14: rounding (words), m15: byte_blend mask
.ar3:
    DEFINE_ARGS buf, fg_data, shift
%if ARCH_X86_32
    ALLOC_STACK -16*14
%elif WIN64
    SUB             rsp, 16*6
%assign stack_size_padded (stack_size_padded+16*6)
%assign stack_size (stack_size+16*6)
%else
    ALLOC_STACK -16*6
%endif
    mov             shiftd, [fg_dataq+FGData.ar_coeff_shift]
    movd            m6, [base+round_vals-12+shiftq*2]
    movd            m7, [base+byte_blend]
    movu            m0, [fg_dataq+FGData.ar_coeffs_y+ 0]   ; cf0-15
    movq            m2, [fg_dataq+FGData.ar_coeffs_y+16]   ; cf16-23
    pxor            m3, m3
    pcmpgtb         m4, m3, m0
    pcmpgtb         m3, m2
    pshuflw         m6, m6, q0000
    SCRATCH         6, 14, 12
    SCRATCH         7, 15, 13
    punpckhbw       m1, m0, m4          ; sign-extend coefficients to words
    punpcklbw       m0, m4
    punpcklbw       m2, m3
    pshufd          m3, m0, q1111
    pshufd          m4, m0, q2222
    pshufd          m5, m0, q3333
    pshufd          m0, m0, q0000
    mova            [rsp+ 0*16], m0
    mova            [rsp+ 1*16], m3
    mova            [rsp+ 2*16], m4
    mova            [rsp+ 3*16], m5
    pshufd          m6, m1, q1111
    pshufd          m7, m1, q2222
    pshufd          m5, m1, q3333
    pshufd          m1, m1, q0000
    pshufd          m3, m2, q1111
    psrldq          m0, m2, 10
    pinsrw          m2, [base+pw_1], 5
    pshufd          m4, m2, q2222
    pshufd          m2, m2, q0000
    pinsrw          m0, [base+round_vals+shiftq*2-10], 3
    mova            [rsp+ 4*16], m1
    mova            [rsp+ 5*16], m6
    SCRATCH         7, 8, 6
    SCRATCH         5, 9, 7
    SCRATCH         2, 10, 8
    SCRATCH         3, 11, 9
    SCRATCH         4, 12, 10
    SCRATCH         0, 13, 11
    DEFINE_ARGS buf, fg_data, h, x
    sub             bufq, 82*73-(82*3+79)
    mov             hd, 70
.y_loop_ar3:
    mov             xq, -76

.x_loop_ar3:
    movu            m0, [bufq+xq-82*3-3] ; y=-3,x=[-3,+12]
    pxor            m3, m3
    pcmpgtb         m3, m0
    punpckhbw       m2, m0, m3
    punpcklbw       m0, m3

    psrldq          m5, m0, 2
    psrldq          m6, m0, 4
    psrldq          m7, m0, 6
    punpcklwd       m4, m0, m5
    punpcklwd       m6, m7
    pmaddwd         m4, [rsp+ 0*16]
    pmaddwd         m6, [rsp+ 1*16]
    paddd           m4, m6

    movu            m1, [bufq+xq-82*2-3] ; y=-2,x=[-3,+12]
    pxor            m5, m5
    pcmpgtb         m5, m1
    punpckhbw       m3, m1, m5
    punpcklbw       m1, m5
    palignr         m6, m2, m0, 10
    palignr         m7, m2, m0, 12
    psrldq          m0, 8
    punpcklwd       m0, m6
    punpcklwd       m7, m1
    pmaddwd         m0, [rsp+ 2*16]
    pmaddwd         m7, [rsp+ 3*16]
    paddd           m0, m7
    paddd           m0, m4

    psrldq          m4, m1, 2
    psrldq          m5, m1, 4
    psrldq          m6, m1, 6
    psrldq          m7, m1, 8
    punpcklwd       m4, m5
    punpcklwd       m6, m7
    pmaddwd         m4, [rsp+ 4*16]
    pmaddwd         m6, [rsp+ 5*16]
    paddd           m4, m6
    paddd           m0, m4

    movu            m2, [bufq+xq-82*1-3] ; y=-1,x=[-3,+12]
    pxor            m7, m7
    pcmpgtb         m7, m2
    punpckhbw       m5, m2, m7
    punpcklbw       m2, m7
    palignr         m7, m3, m1, 10
    palignr         m3, m1, 12
    psrldq          m1, m2, 2
    punpcklwd       m7, m3
    punpcklwd       m3, m2, m1
    pmaddwd         m7, m8
    pmaddwd         m3, m9
    paddd           m7, m3
    paddd           m0, m7

    psrldq          m6, m2, 4
    psrldq          m1, m2, 6
    psrldq          m3, m2, 8
    palignr         m4, m5, m2, 10
    palignr         m5, m5, m2, 12

    punpcklwd       m6, m1
    punpcklwd       m3, m4
    punpcklwd       m5, m14             ; pair last top tap with rounding
    pmaddwd         m6, m10
    pmaddwd         m3, m11
    pmaddwd         m5, m12
    paddd           m0, m6
    paddd           m3, m5
    paddd           m0, m3

    movq            m1, [bufq+xq-3]     ; y=0,x=[-3,+4]
.x_loop_ar3_inner:
    pxor            m5, m5
    pcmpgtb         m5, m1
    punpcklbw       m2, m1, m5
    pmaddwd         m2, m13
    pshufd          m3, m2, q1111
    paddd           m2, m3              ; left+cur
    paddd           m2, m0              ; add top
    psrldq          m0, 4
    ; NOTE(review): 128-bit memory operand; legacy-SSE psrad takes the count
    ; from the low 64 bits and expects 16-byte alignment - this relies on the
    ; FGData layout, confirm against its declaration.
    psrad           m2, [fg_dataq+FGData.ar_coeff_shift]
    ; don't packssdw since we only care about one value
    packsswb        m2, m2
    pslldq          m2, 3
    pand            m2, m15
    pandn           m3, m15, m1
    por             m1, m2, m3          ; merge new pixel into the current row
    movd            [bufq+xq-3], m1
    psrldq          m1, 1
    inc             xq
    jz              .x_loop_ar3_end
    test            xq, 3
    jnz             .x_loop_ar3_inner
    jmp             .x_loop_ar3

.x_loop_ar3_end:
    add             bufq, 82
    dec             hd
    jg              .y_loop_ar3
    RET

; NOTE(review): the line below is the untouched, still-mangled start of the
; generate_grain_uv_fn macro, which continues past the end of this chunk.
493*c0909341SAndroid Build Coastguard Worker%macro generate_grain_uv_fn 3 ; ss_name, ss_x, ss_y 494*c0909341SAndroid Build Coastguard WorkerINIT_XMM ssse3 495*c0909341SAndroid Build Coastguard Workercglobal generate_grain_uv_%1_8bpc, 1, 7 + 3 * ARCH_X86_64, 16, buf, bufy, fg_data, uv 496*c0909341SAndroid Build Coastguard Worker movifnidn r2, r2mp 497*c0909341SAndroid Build Coastguard Worker movifnidn r3, r3mp 498*c0909341SAndroid Build Coastguard Worker LEA r4, $$ 499*c0909341SAndroid Build Coastguard Worker%define base r4-$$ 500*c0909341SAndroid Build Coastguard Worker movq m1, [base+rnd_next_upperbit_mask] 501*c0909341SAndroid Build Coastguard Worker movq m4, [base+mul_bits] 502*c0909341SAndroid Build Coastguard Worker movq m7, [base+hmul_bits] 503*c0909341SAndroid Build Coastguard Worker mov r5d, [fg_dataq+FGData.grain_scale_shift] 504*c0909341SAndroid
Build Coastguard Worker movd m6, [base+round+r5*2] 505*c0909341SAndroid Build Coastguard Worker mova m5, [base+pb_mask] 506*c0909341SAndroid Build Coastguard Worker movd m0, [fg_dataq+FGData.seed] 507*c0909341SAndroid Build Coastguard Worker movd m2, [base+pw_seed_xor+uvq*4] 508*c0909341SAndroid Build Coastguard Worker pxor m0, m2 509*c0909341SAndroid Build Coastguard Worker pshuflw m6, m6, q0000 510*c0909341SAndroid Build Coastguard Worker pshuflw m0, m0, q0000 511*c0909341SAndroid Build Coastguard Worker lea r6, [base+gaussian_sequence] 512*c0909341SAndroid Build Coastguard Worker%if %2 513*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64 514*c0909341SAndroid Build Coastguard Worker mov r7d, 73-35*%3 515*c0909341SAndroid Build Coastguard Worker%else 516*c0909341SAndroid Build Coastguard Worker mov r3mp, 73-35*%3 517*c0909341SAndroid Build Coastguard Worker%endif 518*c0909341SAndroid Build Coastguard Worker add bufq, 44 519*c0909341SAndroid Build Coastguard Worker.loop_y: 520*c0909341SAndroid Build Coastguard Worker mov r5, -44 521*c0909341SAndroid Build Coastguard Worker.loop_x: 522*c0909341SAndroid Build Coastguard Worker%else 523*c0909341SAndroid Build Coastguard Worker mov r5, -82*73 524*c0909341SAndroid Build Coastguard Worker sub bufq, r5 525*c0909341SAndroid Build Coastguard Worker.loop: 526*c0909341SAndroid Build Coastguard Worker%endif 527*c0909341SAndroid Build Coastguard Worker pand m2, m0, m1 528*c0909341SAndroid Build Coastguard Worker psrlw m3, m2, 10 529*c0909341SAndroid Build Coastguard Worker por m2, m3 ; bits 0xf, 0x1e, 0x3c and 0x78 are set 530*c0909341SAndroid Build Coastguard Worker pmullw m2, m4 ; bits 0x0f00 are set 531*c0909341SAndroid Build Coastguard Worker pshufb m3, m5, m2 ; set 15th bit for next 4 seeds 532*c0909341SAndroid Build Coastguard Worker psllq m2, m3, 30 533*c0909341SAndroid Build Coastguard Worker por m3, m2 534*c0909341SAndroid Build Coastguard Worker psllq m2, m3, 15 535*c0909341SAndroid Build Coastguard Worker por 
m3, m2 ; aggregate each bit into next seed's high bit 536*c0909341SAndroid Build Coastguard Worker pmulhuw m2, m0, m7 537*c0909341SAndroid Build Coastguard Worker por m2, m3 ; 4 next output seeds 538*c0909341SAndroid Build Coastguard Worker pshuflw m0, m2, q3333 539*c0909341SAndroid Build Coastguard Worker psrlw m2, 5 540*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64 541*c0909341SAndroid Build Coastguard Worker movd r9d, m2 542*c0909341SAndroid Build Coastguard Worker pshuflw m2, m2, q3232 543*c0909341SAndroid Build Coastguard Worker movzx r8, r9w 544*c0909341SAndroid Build Coastguard Worker shr r9, 16 545*c0909341SAndroid Build Coastguard Worker 546*c0909341SAndroid Build Coastguard Worker movd m3, [r6+r8*2] 547*c0909341SAndroid Build Coastguard Worker pinsrw m3, [r6+r9*2], 1 548*c0909341SAndroid Build Coastguard Worker 549*c0909341SAndroid Build Coastguard Worker movd r9d, m2 550*c0909341SAndroid Build Coastguard Worker movzx r8, r9w 551*c0909341SAndroid Build Coastguard Worker shr r9, 16 552*c0909341SAndroid Build Coastguard Worker 553*c0909341SAndroid Build Coastguard Worker pinsrw m3, [r6+r8*2], 2 554*c0909341SAndroid Build Coastguard Worker pinsrw m3, [r6+r9*2], 3 555*c0909341SAndroid Build Coastguard Worker%else 556*c0909341SAndroid Build Coastguard Worker movd r2, m2 557*c0909341SAndroid Build Coastguard Worker pshuflw m2, m2, q3232 558*c0909341SAndroid Build Coastguard Worker movzx r1, r2w 559*c0909341SAndroid Build Coastguard Worker shr r2, 16 560*c0909341SAndroid Build Coastguard Worker 561*c0909341SAndroid Build Coastguard Worker movd m3, [r6+r1*2] 562*c0909341SAndroid Build Coastguard Worker pinsrw m3, [r6+r2*2], 1 563*c0909341SAndroid Build Coastguard Worker 564*c0909341SAndroid Build Coastguard Worker movd r2, m2 565*c0909341SAndroid Build Coastguard Worker movzx r1, r2w 566*c0909341SAndroid Build Coastguard Worker shr r2, 16 567*c0909341SAndroid Build Coastguard Worker 568*c0909341SAndroid Build Coastguard Worker pinsrw m3, [r6+r1*2], 2 
569*c0909341SAndroid Build Coastguard Worker pinsrw m3, [r6+r2*2], 3 570*c0909341SAndroid Build Coastguard Worker%endif 571*c0909341SAndroid Build Coastguard Worker pmulhrsw m3, m6 572*c0909341SAndroid Build Coastguard Worker packsswb m3, m3 573*c0909341SAndroid Build Coastguard Worker movd [bufq+r5], m3 574*c0909341SAndroid Build Coastguard Worker add r5, 4 575*c0909341SAndroid Build Coastguard Worker%if %2 576*c0909341SAndroid Build Coastguard Worker jl .loop_x 577*c0909341SAndroid Build Coastguard Worker add bufq, 82 578*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64 579*c0909341SAndroid Build Coastguard Worker dec r7d 580*c0909341SAndroid Build Coastguard Worker%else 581*c0909341SAndroid Build Coastguard Worker dec r3mp 582*c0909341SAndroid Build Coastguard Worker%endif 583*c0909341SAndroid Build Coastguard Worker jg .loop_y 584*c0909341SAndroid Build Coastguard Worker%else 585*c0909341SAndroid Build Coastguard Worker jl .loop 586*c0909341SAndroid Build Coastguard Worker%endif 587*c0909341SAndroid Build Coastguard Worker 588*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32 589*c0909341SAndroid Build Coastguard Worker mov r2, r2mp 590*c0909341SAndroid Build Coastguard Worker%endif 591*c0909341SAndroid Build Coastguard Worker 592*c0909341SAndroid Build Coastguard Worker ; auto-regression code 593*c0909341SAndroid Build Coastguard Worker movsxd r5, [fg_dataq+FGData.ar_coeff_lag] 594*c0909341SAndroid Build Coastguard Worker movsxd r5, [base+generate_grain_uv_%1_8bpc_ssse3_table+r5*4] 595*c0909341SAndroid Build Coastguard Worker lea r5, [r5+base+generate_grain_uv_%1_8bpc_ssse3_table] 596*c0909341SAndroid Build Coastguard Worker jmp r5 597*c0909341SAndroid Build Coastguard Worker 598*c0909341SAndroid Build Coastguard Worker.ar0: 599*c0909341SAndroid Build Coastguard Worker DEFINE_ARGS buf, bufy, fg_data, uv, unused, shift 600*c0909341SAndroid Build Coastguard Worker movifnidn bufyq, bufymp 601*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32 
602*c0909341SAndroid Build Coastguard Worker ALLOC_STACK -2*16 603*c0909341SAndroid Build Coastguard Worker%endif 604*c0909341SAndroid Build Coastguard Worker imul uvd, 28 605*c0909341SAndroid Build Coastguard Worker mov shiftd, [fg_dataq+FGData.ar_coeff_shift] 606*c0909341SAndroid Build Coastguard Worker movd m5, [fg_dataq+FGData.ar_coeffs_uv+uvq] 607*c0909341SAndroid Build Coastguard Worker movd m4, [base+hmul_bits+shiftq*2] 608*c0909341SAndroid Build Coastguard Worker DEFINE_ARGS buf, bufy, h, x 609*c0909341SAndroid Build Coastguard Worker pxor m0, m0 610*c0909341SAndroid Build Coastguard Worker pcmpgtb m0, m5 611*c0909341SAndroid Build Coastguard Worker punpcklbw m5, m0 612*c0909341SAndroid Build Coastguard Worker movd m7, [base+pb_1] 613*c0909341SAndroid Build Coastguard Worker%if %2 614*c0909341SAndroid Build Coastguard Worker movd m6, [base+hmul_bits+2+%3*2] 615*c0909341SAndroid Build Coastguard Worker%endif 616*c0909341SAndroid Build Coastguard Worker pshuflw m5, m5, q0000 617*c0909341SAndroid Build Coastguard Worker pshuflw m4, m4, q0000 618*c0909341SAndroid Build Coastguard Worker pshufd m7, m7, q0000 619*c0909341SAndroid Build Coastguard Worker%if %2 620*c0909341SAndroid Build Coastguard Worker pshuflw m6, m6, q0000 621*c0909341SAndroid Build Coastguard Worker%endif 622*c0909341SAndroid Build Coastguard Worker punpcklqdq m5, m5 623*c0909341SAndroid Build Coastguard Worker punpcklqdq m4, m4 624*c0909341SAndroid Build Coastguard Worker%if %2 625*c0909341SAndroid Build Coastguard Worker punpcklqdq m6, m6 626*c0909341SAndroid Build Coastguard Worker%endif 627*c0909341SAndroid Build Coastguard Worker pcmpeqw m1, m1 628*c0909341SAndroid Build Coastguard Worker pslldq m1, 12>>%2 629*c0909341SAndroid Build Coastguard Worker SCRATCH 1, 8, 0 630*c0909341SAndroid Build Coastguard Worker SCRATCH 4, 9, 1 631*c0909341SAndroid Build Coastguard Worker%if %2 632*c0909341SAndroid Build Coastguard Worker sub bufq, 82*(73-35*%3)+82-(82*3+41) 633*c0909341SAndroid Build 
Coastguard Worker%else 634*c0909341SAndroid Build Coastguard Worker sub bufq, 82*70-3 635*c0909341SAndroid Build Coastguard Worker%endif 636*c0909341SAndroid Build Coastguard Worker add bufyq, 3+82*3 637*c0909341SAndroid Build Coastguard Worker mov hd, 70-35*%3 638*c0909341SAndroid Build Coastguard Worker.y_loop_ar0: 639*c0909341SAndroid Build Coastguard Worker xor xd, xd 640*c0909341SAndroid Build Coastguard Worker.x_loop_ar0: 641*c0909341SAndroid Build Coastguard Worker ; first 32 pixels 642*c0909341SAndroid Build Coastguard Worker%if %2 643*c0909341SAndroid Build Coastguard Worker movu m1, [bufyq+xq*2] 644*c0909341SAndroid Build Coastguard Worker%if %3 645*c0909341SAndroid Build Coastguard Worker movu m2, [bufyq+xq*2+82] 646*c0909341SAndroid Build Coastguard Worker%endif 647*c0909341SAndroid Build Coastguard Worker movu m3, [bufyq+xq*2+16] 648*c0909341SAndroid Build Coastguard Worker%if %3 649*c0909341SAndroid Build Coastguard Worker movu m4, [bufyq+xq*2+82+16] 650*c0909341SAndroid Build Coastguard Worker%endif 651*c0909341SAndroid Build Coastguard Worker pmaddubsw m0, m7, m1 652*c0909341SAndroid Build Coastguard Worker%if %3 653*c0909341SAndroid Build Coastguard Worker pmaddubsw m1, m7, m2 654*c0909341SAndroid Build Coastguard Worker%endif 655*c0909341SAndroid Build Coastguard Worker pmaddubsw m2, m7, m3 656*c0909341SAndroid Build Coastguard Worker%if %3 657*c0909341SAndroid Build Coastguard Worker pmaddubsw m3, m7, m4 658*c0909341SAndroid Build Coastguard Worker paddw m0, m1 659*c0909341SAndroid Build Coastguard Worker paddw m2, m3 660*c0909341SAndroid Build Coastguard Worker%endif 661*c0909341SAndroid Build Coastguard Worker pmulhrsw m0, m6 662*c0909341SAndroid Build Coastguard Worker pmulhrsw m2, m6 663*c0909341SAndroid Build Coastguard Worker%else 664*c0909341SAndroid Build Coastguard Worker movu m0, [bufyq+xq] 665*c0909341SAndroid Build Coastguard Worker pxor m6, m6 666*c0909341SAndroid Build Coastguard Worker pcmpgtb m6, m0 667*c0909341SAndroid Build 
Coastguard Worker punpckhbw m2, m0, m6 668*c0909341SAndroid Build Coastguard Worker punpcklbw m0, m6 669*c0909341SAndroid Build Coastguard Worker%endif 670*c0909341SAndroid Build Coastguard Worker pmullw m0, m5 671*c0909341SAndroid Build Coastguard Worker pmullw m2, m5 672*c0909341SAndroid Build Coastguard Worker pmulhrsw m0, m9 673*c0909341SAndroid Build Coastguard Worker pmulhrsw m2, m9 674*c0909341SAndroid Build Coastguard Worker movu m1, [bufq+xq] 675*c0909341SAndroid Build Coastguard Worker pxor m4, m4 676*c0909341SAndroid Build Coastguard Worker pcmpgtb m4, m1 677*c0909341SAndroid Build Coastguard Worker punpckhbw m3, m1, m4 678*c0909341SAndroid Build Coastguard Worker%if %2 679*c0909341SAndroid Build Coastguard Worker punpcklbw m1, m4 680*c0909341SAndroid Build Coastguard Worker paddw m2, m3 681*c0909341SAndroid Build Coastguard Worker paddw m0, m1 682*c0909341SAndroid Build Coastguard Worker%else 683*c0909341SAndroid Build Coastguard Worker punpcklbw m6, m1, m4 684*c0909341SAndroid Build Coastguard Worker paddw m2, m3 685*c0909341SAndroid Build Coastguard Worker paddw m0, m6 686*c0909341SAndroid Build Coastguard Worker%endif 687*c0909341SAndroid Build Coastguard Worker packsswb m0, m2 688*c0909341SAndroid Build Coastguard Worker%if %2 689*c0909341SAndroid Build Coastguard Worker movu [bufq+xq], m0 690*c0909341SAndroid Build Coastguard Worker add xd, 16 691*c0909341SAndroid Build Coastguard Worker cmp xd, 32 692*c0909341SAndroid Build Coastguard Worker jl .x_loop_ar0 693*c0909341SAndroid Build Coastguard Worker 694*c0909341SAndroid Build Coastguard Worker ; last 6/12 pixels 695*c0909341SAndroid Build Coastguard Worker movu m1, [bufyq+xq*(1+%2)] 696*c0909341SAndroid Build Coastguard Worker%if %3 697*c0909341SAndroid Build Coastguard Worker movu m2, [bufyq+xq*2+82] 698*c0909341SAndroid Build Coastguard Worker%endif 699*c0909341SAndroid Build Coastguard Worker pmaddubsw m0, m7, m1 700*c0909341SAndroid Build Coastguard Worker%if %3 701*c0909341SAndroid Build 
Coastguard Worker pmaddubsw m1, m7, m2 702*c0909341SAndroid Build Coastguard Worker paddw m0, m1 703*c0909341SAndroid Build Coastguard Worker%endif 704*c0909341SAndroid Build Coastguard Worker pmulhrsw m0, m6 705*c0909341SAndroid Build Coastguard Worker pmullw m0, m5 706*c0909341SAndroid Build Coastguard Worker pmulhrsw m0, m9 707*c0909341SAndroid Build Coastguard Worker movq m1, [bufq+xq] 708*c0909341SAndroid Build Coastguard Worker pxor m4, m4 709*c0909341SAndroid Build Coastguard Worker pcmpgtb m4, m1 710*c0909341SAndroid Build Coastguard Worker punpcklbw m2, m1, m4 711*c0909341SAndroid Build Coastguard Worker paddw m0, m2 712*c0909341SAndroid Build Coastguard Worker packsswb m0, m0 713*c0909341SAndroid Build Coastguard Worker pandn m2, m8, m0 714*c0909341SAndroid Build Coastguard Worker pand m1, m8 715*c0909341SAndroid Build Coastguard Worker por m2, m1 716*c0909341SAndroid Build Coastguard Worker movq [bufq+xq], m2 717*c0909341SAndroid Build Coastguard Worker%else 718*c0909341SAndroid Build Coastguard Worker add xd, 16 719*c0909341SAndroid Build Coastguard Worker cmp xd, 80 720*c0909341SAndroid Build Coastguard Worker je .y_loop_final_ar0 721*c0909341SAndroid Build Coastguard Worker movu [bufq+xq-16], m0 722*c0909341SAndroid Build Coastguard Worker jmp .x_loop_ar0 723*c0909341SAndroid Build Coastguard Worker.y_loop_final_ar0: 724*c0909341SAndroid Build Coastguard Worker pandn m2, m8, m0 725*c0909341SAndroid Build Coastguard Worker pand m1, m8 726*c0909341SAndroid Build Coastguard Worker por m2, m1 727*c0909341SAndroid Build Coastguard Worker movu [bufq+xq-16], m2 728*c0909341SAndroid Build Coastguard Worker%endif 729*c0909341SAndroid Build Coastguard Worker 730*c0909341SAndroid Build Coastguard Worker add bufq, 82 731*c0909341SAndroid Build Coastguard Worker add bufyq, 82<<%3 732*c0909341SAndroid Build Coastguard Worker dec hd 733*c0909341SAndroid Build Coastguard Worker jg .y_loop_ar0 734*c0909341SAndroid Build Coastguard Worker RET 735*c0909341SAndroid Build 
Coastguard Worker 736*c0909341SAndroid Build Coastguard Worker.ar1: 737*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32 738*c0909341SAndroid Build Coastguard Worker RESET_STACK_STATE 739*c0909341SAndroid Build Coastguard Worker%endif 740*c0909341SAndroid Build Coastguard Worker DEFINE_ARGS buf, bufy, fg_data, uv, val3, cf3, min, max, x 741*c0909341SAndroid Build Coastguard Worker imul uvd, 28 742*c0909341SAndroid Build Coastguard Worker movsx cf3d, byte [fg_dataq+FGData.ar_coeffs_uv+uvq+3] 743*c0909341SAndroid Build Coastguard Worker movd m4, [fg_dataq+FGData.ar_coeffs_uv+uvq-1] 744*c0909341SAndroid Build Coastguard Worker pinsrw m4, [fg_dataq+FGData.ar_coeffs_uv+uvq+4], 2 745*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32 746*c0909341SAndroid Build Coastguard Worker mov r3mp, cf3d 747*c0909341SAndroid Build Coastguard Worker DEFINE_ARGS buf, shift, fg_data, val3, min, max, x 748*c0909341SAndroid Build Coastguard Worker%elif WIN64 749*c0909341SAndroid Build Coastguard Worker DEFINE_ARGS shift, bufy, fg_data, buf, val3, cf3, min, max, x 750*c0909341SAndroid Build Coastguard Worker mov bufq, r0 751*c0909341SAndroid Build Coastguard Worker%else 752*c0909341SAndroid Build Coastguard Worker DEFINE_ARGS buf, bufy, fg_data, shift, val3, cf3, min, max, x 753*c0909341SAndroid Build Coastguard Worker%endif 754*c0909341SAndroid Build Coastguard Worker mov shiftd, [fg_dataq+FGData.ar_coeff_shift] 755*c0909341SAndroid Build Coastguard Worker movd m3, [base+round_vals+shiftq*2-12] ; rnd 756*c0909341SAndroid Build Coastguard Worker%if %2 757*c0909341SAndroid Build Coastguard Worker movd m7, [base+pb_1] 758*c0909341SAndroid Build Coastguard Worker movd m6, [base+hmul_bits+2+%3*2] 759*c0909341SAndroid Build Coastguard Worker%endif 760*c0909341SAndroid Build Coastguard Worker psrldq m4, 1 761*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32 762*c0909341SAndroid Build Coastguard Worker DEFINE_ARGS buf, shift, val0, val3, min, max, x 763*c0909341SAndroid Build 
Coastguard Worker%elif WIN64 764*c0909341SAndroid Build Coastguard Worker DEFINE_ARGS shift, bufy, h, buf, val3, cf3, min, max, x, val0 765*c0909341SAndroid Build Coastguard Worker%else 766*c0909341SAndroid Build Coastguard Worker DEFINE_ARGS buf, bufy, h, shift, val3, cf3, min, max, x, val0 767*c0909341SAndroid Build Coastguard Worker%endif 768*c0909341SAndroid Build Coastguard Worker pxor m5, m5 769*c0909341SAndroid Build Coastguard Worker punpcklwd m3, m5 770*c0909341SAndroid Build Coastguard Worker%if %2 771*c0909341SAndroid Build Coastguard Worker punpcklwd m6, m6 772*c0909341SAndroid Build Coastguard Worker%endif 773*c0909341SAndroid Build Coastguard Worker pcmpgtb m5, m4 774*c0909341SAndroid Build Coastguard Worker punpcklbw m4, m5 775*c0909341SAndroid Build Coastguard Worker pshufd m5, m4, q1111 776*c0909341SAndroid Build Coastguard Worker pshufd m4, m4, q0000 777*c0909341SAndroid Build Coastguard Worker pshufd m3, m3, q0000 778*c0909341SAndroid Build Coastguard Worker%if %2 779*c0909341SAndroid Build Coastguard Worker pshufd m7, m7, q0000 780*c0909341SAndroid Build Coastguard Worker pshufd m6, m6, q0000 781*c0909341SAndroid Build Coastguard Worker sub bufq, 82*(73-35*%3)+44-(82*3+41) 782*c0909341SAndroid Build Coastguard Worker%else 783*c0909341SAndroid Build Coastguard Worker sub bufq, 82*69+3 784*c0909341SAndroid Build Coastguard Worker%endif 785*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32 786*c0909341SAndroid Build Coastguard Worker add r1mp, 79+82*3 787*c0909341SAndroid Build Coastguard Worker mov r0mp, 70-35*%3 788*c0909341SAndroid Build Coastguard Worker%else 789*c0909341SAndroid Build Coastguard Worker add bufyq, 79+82*3 790*c0909341SAndroid Build Coastguard Worker mov hd, 70-35*%3 791*c0909341SAndroid Build Coastguard Worker%endif 792*c0909341SAndroid Build Coastguard Worker mov mind, -128 793*c0909341SAndroid Build Coastguard Worker mov maxd, 127 794*c0909341SAndroid Build Coastguard Worker.y_loop_ar1: 795*c0909341SAndroid Build 
Coastguard Worker mov xq, -(76>>%2) 796*c0909341SAndroid Build Coastguard Worker movsx val3d, byte [bufq+xq-1] 797*c0909341SAndroid Build Coastguard Worker.x_loop_ar1: 798*c0909341SAndroid Build Coastguard Worker%if %2 799*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32 800*c0909341SAndroid Build Coastguard Worker mov r2, r1mp 801*c0909341SAndroid Build Coastguard Worker movq m0, [r2+xq*2] 802*c0909341SAndroid Build Coastguard Worker%if %3 803*c0909341SAndroid Build Coastguard Worker movq m1, [r2+xq*2+82] 804*c0909341SAndroid Build Coastguard Worker%endif 805*c0909341SAndroid Build Coastguard Worker%else 806*c0909341SAndroid Build Coastguard Worker movq m0, [bufyq+xq*2] 807*c0909341SAndroid Build Coastguard Worker%if %3 808*c0909341SAndroid Build Coastguard Worker movq m1, [bufyq+xq*2+82] 809*c0909341SAndroid Build Coastguard Worker%endif 810*c0909341SAndroid Build Coastguard Worker%endif 811*c0909341SAndroid Build Coastguard Worker pmaddubsw m2, m7, m0 812*c0909341SAndroid Build Coastguard Worker%if %3 813*c0909341SAndroid Build Coastguard Worker pmaddubsw m0, m7, m1 814*c0909341SAndroid Build Coastguard Worker paddw m2, m0 815*c0909341SAndroid Build Coastguard Worker%endif 816*c0909341SAndroid Build Coastguard Worker pmulhrsw m2, m6 817*c0909341SAndroid Build Coastguard Worker%else 818*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32 819*c0909341SAndroid Build Coastguard Worker mov r2, r1mp 820*c0909341SAndroid Build Coastguard Worker movd m2, [r2+xq] 821*c0909341SAndroid Build Coastguard Worker%else 822*c0909341SAndroid Build Coastguard Worker movd m2, [bufyq+xq] 823*c0909341SAndroid Build Coastguard Worker%endif 824*c0909341SAndroid Build Coastguard Worker pxor m0, m0 825*c0909341SAndroid Build Coastguard Worker pcmpgtb m0, m2 826*c0909341SAndroid Build Coastguard Worker punpcklbw m2, m0 827*c0909341SAndroid Build Coastguard Worker%endif 828*c0909341SAndroid Build Coastguard Worker 829*c0909341SAndroid Build Coastguard Worker movq m0, 
[bufq+xq-82-1] ; top/left 830*c0909341SAndroid Build Coastguard Worker pxor m1, m1 831*c0909341SAndroid Build Coastguard Worker pcmpgtb m1, m0 832*c0909341SAndroid Build Coastguard Worker punpcklbw m0, m1 833*c0909341SAndroid Build Coastguard Worker psrldq m1, m0, 4 ; top/right 834*c0909341SAndroid Build Coastguard Worker punpcklwd m1, m2 835*c0909341SAndroid Build Coastguard Worker psrldq m2, m0, 2 ; top 836*c0909341SAndroid Build Coastguard Worker punpcklwd m0, m2 837*c0909341SAndroid Build Coastguard Worker pmaddwd m0, m4 838*c0909341SAndroid Build Coastguard Worker pmaddwd m1, m5 839*c0909341SAndroid Build Coastguard Worker paddd m0, m1 840*c0909341SAndroid Build Coastguard Worker paddd m0, m3 841*c0909341SAndroid Build Coastguard Worker.x_loop_ar1_inner: 842*c0909341SAndroid Build Coastguard Worker movd val0d, m0 843*c0909341SAndroid Build Coastguard Worker psrldq m0, 4 844*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32 845*c0909341SAndroid Build Coastguard Worker imul val3d, r3mp 846*c0909341SAndroid Build Coastguard Worker%else 847*c0909341SAndroid Build Coastguard Worker imul val3d, cf3d 848*c0909341SAndroid Build Coastguard Worker%endif 849*c0909341SAndroid Build Coastguard Worker add val3d, val0d 850*c0909341SAndroid Build Coastguard Worker sar val3d, shiftb 851*c0909341SAndroid Build Coastguard Worker movsx val0d, byte [bufq+xq] 852*c0909341SAndroid Build Coastguard Worker add val3d, val0d 853*c0909341SAndroid Build Coastguard Worker cmp val3d, maxd 854*c0909341SAndroid Build Coastguard Worker cmovns val3d, maxd 855*c0909341SAndroid Build Coastguard Worker cmp val3d, mind 856*c0909341SAndroid Build Coastguard Worker cmovs val3d, mind 857*c0909341SAndroid Build Coastguard Worker mov byte [bufq+xq], val3b 858*c0909341SAndroid Build Coastguard Worker ; keep val3d in-place as left for next x iteration 859*c0909341SAndroid Build Coastguard Worker inc xq 860*c0909341SAndroid Build Coastguard Worker jz .x_loop_ar1_end 861*c0909341SAndroid Build 
Coastguard Worker test xq, 3 862*c0909341SAndroid Build Coastguard Worker jnz .x_loop_ar1_inner 863*c0909341SAndroid Build Coastguard Worker jmp .x_loop_ar1 864*c0909341SAndroid Build Coastguard Worker 865*c0909341SAndroid Build Coastguard Worker.x_loop_ar1_end: 866*c0909341SAndroid Build Coastguard Worker add bufq, 82 867*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32 868*c0909341SAndroid Build Coastguard Worker add r1mp, 82<<%3 869*c0909341SAndroid Build Coastguard Worker dec r0mp 870*c0909341SAndroid Build Coastguard Worker%else 871*c0909341SAndroid Build Coastguard Worker add bufyq, 82<<%3 872*c0909341SAndroid Build Coastguard Worker dec hd 873*c0909341SAndroid Build Coastguard Worker%endif 874*c0909341SAndroid Build Coastguard Worker jg .y_loop_ar1 875*c0909341SAndroid Build Coastguard Worker RET 876*c0909341SAndroid Build Coastguard Worker 877*c0909341SAndroid Build Coastguard Worker.ar2: 878*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32 879*c0909341SAndroid Build Coastguard Worker ALLOC_STACK -8*16 880*c0909341SAndroid Build Coastguard Worker%endif 881*c0909341SAndroid Build Coastguard Worker DEFINE_ARGS buf, bufy, fg_data, uv, unused, shift 882*c0909341SAndroid Build Coastguard Worker movifnidn bufyq, bufymp 883*c0909341SAndroid Build Coastguard Worker mov shiftd, [fg_dataq+FGData.ar_coeff_shift] 884*c0909341SAndroid Build Coastguard Worker imul uvd, 28 885*c0909341SAndroid Build Coastguard Worker movd m7, [base+round_vals-12+shiftq*2] 886*c0909341SAndroid Build Coastguard Worker movu m0, [fg_dataq+FGData.ar_coeffs_uv+uvq+0] ; cf0-12 887*c0909341SAndroid Build Coastguard Worker pxor m2, m2 888*c0909341SAndroid Build Coastguard Worker pcmpgtb m2, m0 889*c0909341SAndroid Build Coastguard Worker punpckhbw m1, m0, m2 890*c0909341SAndroid Build Coastguard Worker punpcklbw m0, m2 891*c0909341SAndroid Build Coastguard Worker pinsrw m1, [base+pw_1], 5 892*c0909341SAndroid Build Coastguard Worker punpcklwd m7, m7 893*c0909341SAndroid Build 
Coastguard Worker pshufd m7, m7, q0000 894*c0909341SAndroid Build Coastguard Worker DEFINE_ARGS buf, bufy, fg_data, h, unused, x 895*c0909341SAndroid Build Coastguard Worker pshufd m4, m1, q0000 896*c0909341SAndroid Build Coastguard Worker pshufd m5, m1, q1111 897*c0909341SAndroid Build Coastguard Worker pshufd m6, m1, q2222 898*c0909341SAndroid Build Coastguard Worker pshufd m3, m0, q3333 899*c0909341SAndroid Build Coastguard Worker pshufd m2, m0, q2222 900*c0909341SAndroid Build Coastguard Worker pshufd m1, m0, q1111 901*c0909341SAndroid Build Coastguard Worker pshufd m0, m0, q0000 902*c0909341SAndroid Build Coastguard Worker SCRATCH 0, 8, 0 903*c0909341SAndroid Build Coastguard Worker SCRATCH 1, 9, 1 904*c0909341SAndroid Build Coastguard Worker SCRATCH 2, 10, 2 905*c0909341SAndroid Build Coastguard Worker SCRATCH 3, 11, 3 906*c0909341SAndroid Build Coastguard Worker SCRATCH 4, 12, 4 907*c0909341SAndroid Build Coastguard Worker SCRATCH 5, 13, 5 908*c0909341SAndroid Build Coastguard Worker SCRATCH 6, 14, 6 909*c0909341SAndroid Build Coastguard Worker SCRATCH 7, 15, 7 910*c0909341SAndroid Build Coastguard Worker%if %2 911*c0909341SAndroid Build Coastguard Worker movd m7, [base+hmul_bits+2+%3*2] 912*c0909341SAndroid Build Coastguard Worker movd m6, [base+pb_1] 913*c0909341SAndroid Build Coastguard Worker punpcklwd m7, m7 914*c0909341SAndroid Build Coastguard Worker pshufd m6, m6, q0000 915*c0909341SAndroid Build Coastguard Worker pshufd m7, m7, q0000 916*c0909341SAndroid Build Coastguard Worker sub bufq, 82*(73-35*%3)+44-(82*3+41) 917*c0909341SAndroid Build Coastguard Worker%else 918*c0909341SAndroid Build Coastguard Worker sub bufq, 82*69+3 919*c0909341SAndroid Build Coastguard Worker%endif 920*c0909341SAndroid Build Coastguard Worker add bufyq, 79+82*3 921*c0909341SAndroid Build Coastguard Worker mov hd, 70-35*%3 922*c0909341SAndroid Build Coastguard Worker.y_loop_ar2: 923*c0909341SAndroid Build Coastguard Worker mov xq, -(76>>%2) 924*c0909341SAndroid Build 
Coastguard Worker 925*c0909341SAndroid Build Coastguard Worker.x_loop_ar2: 926*c0909341SAndroid Build Coastguard Worker pxor m2, m2 927*c0909341SAndroid Build Coastguard Worker movq m0, [bufq+xq-82*2-2] ; y=-2,x=[-2,+5] 928*c0909341SAndroid Build Coastguard Worker movhps m0, [bufq+xq-82*1-2] ; y=-1,x=[-2,+5] 929*c0909341SAndroid Build Coastguard Worker pcmpgtb m2, m0 930*c0909341SAndroid Build Coastguard Worker punpckhbw m1, m0, m2 931*c0909341SAndroid Build Coastguard Worker punpcklbw m0, m2 932*c0909341SAndroid Build Coastguard Worker psrldq m5, m0, 2 ; y=-2,x=[-1,+5] 933*c0909341SAndroid Build Coastguard Worker psrldq m3, m1, 2 ; y=-1,x=[-1,+5] 934*c0909341SAndroid Build Coastguard Worker psrldq m4, m1, 4 ; y=-1,x=[+0,+5] 935*c0909341SAndroid Build Coastguard Worker punpcklwd m2, m0, m5 936*c0909341SAndroid Build Coastguard Worker punpcklwd m3, m4 937*c0909341SAndroid Build Coastguard Worker pmaddwd m2, m8 938*c0909341SAndroid Build Coastguard Worker pmaddwd m3, m11 939*c0909341SAndroid Build Coastguard Worker paddd m2, m3 940*c0909341SAndroid Build Coastguard Worker 941*c0909341SAndroid Build Coastguard Worker psrldq m4, m0, 4 ; y=-2,x=[+0,+5] 942*c0909341SAndroid Build Coastguard Worker psrldq m5, m0, 6 ; y=-2,x=[+1,+5] 943*c0909341SAndroid Build Coastguard Worker psrldq m0, 8 ; y=-2,x=[+2,+5] 944*c0909341SAndroid Build Coastguard Worker punpcklwd m4, m5 945*c0909341SAndroid Build Coastguard Worker punpcklwd m0, m1 946*c0909341SAndroid Build Coastguard Worker psrldq m3, m1, 6 ; y=-1,x=[+1,+5] 947*c0909341SAndroid Build Coastguard Worker psrldq m1, m1, 8 ; y=-1,x=[+2,+5] 948*c0909341SAndroid Build Coastguard Worker punpcklwd m3, m1 949*c0909341SAndroid Build Coastguard Worker pmaddwd m4, m9 950*c0909341SAndroid Build Coastguard Worker pmaddwd m0, m10 951*c0909341SAndroid Build Coastguard Worker pmaddwd m3, m12 952*c0909341SAndroid Build Coastguard Worker paddd m4, m0 953*c0909341SAndroid Build Coastguard Worker paddd m2, m3 954*c0909341SAndroid Build Coastguard 
Worker paddd m2, m4 955*c0909341SAndroid Build Coastguard Worker 956*c0909341SAndroid Build Coastguard Worker%if %2 957*c0909341SAndroid Build Coastguard Worker movq m1, [bufyq+xq*2] 958*c0909341SAndroid Build Coastguard Worker%if %3 959*c0909341SAndroid Build Coastguard Worker movq m3, [bufyq+xq*2+82] 960*c0909341SAndroid Build Coastguard Worker%endif 961*c0909341SAndroid Build Coastguard Worker pmaddubsw m0, m6, m1 962*c0909341SAndroid Build Coastguard Worker%if %3 963*c0909341SAndroid Build Coastguard Worker pmaddubsw m1, m6, m3 964*c0909341SAndroid Build Coastguard Worker paddw m0, m1 965*c0909341SAndroid Build Coastguard Worker%endif 966*c0909341SAndroid Build Coastguard Worker pmulhrsw m0, m7 967*c0909341SAndroid Build Coastguard Worker%else 968*c0909341SAndroid Build Coastguard Worker movd m0, [bufyq+xq] 969*c0909341SAndroid Build Coastguard Worker pxor m1, m1 970*c0909341SAndroid Build Coastguard Worker pcmpgtb m1, m0 971*c0909341SAndroid Build Coastguard Worker punpcklbw m0, m1 972*c0909341SAndroid Build Coastguard Worker%endif 973*c0909341SAndroid Build Coastguard Worker punpcklwd m0, m15 974*c0909341SAndroid Build Coastguard Worker pmaddwd m0, m14 975*c0909341SAndroid Build Coastguard Worker paddd m2, m0 976*c0909341SAndroid Build Coastguard Worker 977*c0909341SAndroid Build Coastguard Worker movq m0, [bufq+xq-2] ; y=0,x=[-2,+5] 978*c0909341SAndroid Build Coastguard Worker pxor m4, m4 979*c0909341SAndroid Build Coastguard Worker movd m5, [base+byte_blend+1] 980*c0909341SAndroid Build Coastguard Worker punpcklbw m5, m5 981*c0909341SAndroid Build Coastguard Worker.x_loop_ar2_inner: 982*c0909341SAndroid Build Coastguard Worker pcmpgtb m1, m4, m0 983*c0909341SAndroid Build Coastguard Worker punpcklbw m0, m1 984*c0909341SAndroid Build Coastguard Worker pmaddwd m3, m0, m13 985*c0909341SAndroid Build Coastguard Worker paddd m3, m2 986*c0909341SAndroid Build Coastguard Worker psrldq m2, 4 ; shift top to next pixel 987*c0909341SAndroid Build Coastguard Worker 
psrad m3, [fg_dataq+FGData.ar_coeff_shift] 988*c0909341SAndroid Build Coastguard Worker pslldq m3, 4 989*c0909341SAndroid Build Coastguard Worker pand m3, m5 990*c0909341SAndroid Build Coastguard Worker paddw m0, m3 991*c0909341SAndroid Build Coastguard Worker packsswb m0, m0 992*c0909341SAndroid Build Coastguard Worker movd [bufq+xq-2], m0 993*c0909341SAndroid Build Coastguard Worker psrldq m0, 1 994*c0909341SAndroid Build Coastguard Worker inc xq 995*c0909341SAndroid Build Coastguard Worker jz .x_loop_ar2_end 996*c0909341SAndroid Build Coastguard Worker test xq, 3 997*c0909341SAndroid Build Coastguard Worker jnz .x_loop_ar2_inner 998*c0909341SAndroid Build Coastguard Worker jmp .x_loop_ar2 999*c0909341SAndroid Build Coastguard Worker 1000*c0909341SAndroid Build Coastguard Worker.x_loop_ar2_end: 1001*c0909341SAndroid Build Coastguard Worker add bufq, 82 1002*c0909341SAndroid Build Coastguard Worker add bufyq, 82<<%3 1003*c0909341SAndroid Build Coastguard Worker dec hd 1004*c0909341SAndroid Build Coastguard Worker jg .y_loop_ar2 1005*c0909341SAndroid Build Coastguard Worker RET 1006*c0909341SAndroid Build Coastguard Worker 1007*c0909341SAndroid Build Coastguard Worker.ar3: 1008*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32 1009*c0909341SAndroid Build Coastguard Worker RESET_STACK_STATE 1010*c0909341SAndroid Build Coastguard Worker%endif 1011*c0909341SAndroid Build Coastguard Worker DEFINE_ARGS buf, bufy, fg_data, uv, unused, shift 1012*c0909341SAndroid Build Coastguard Worker movifnidn bufyq, bufymp 1013*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32 1014*c0909341SAndroid Build Coastguard Worker ALLOC_STACK -15*16 1015*c0909341SAndroid Build Coastguard Worker%else 1016*c0909341SAndroid Build Coastguard Worker SUB rsp, 16*7 1017*c0909341SAndroid Build Coastguard Worker%assign stack_size_padded (stack_size_padded+16*7) 1018*c0909341SAndroid Build Coastguard Worker%assign stack_size (stack_size+16*7) 1019*c0909341SAndroid Build Coastguard 
Worker%endif 1020*c0909341SAndroid Build Coastguard Worker mov shiftd, [fg_dataq+FGData.ar_coeff_shift] 1021*c0909341SAndroid Build Coastguard Worker imul uvd, 28 1022*c0909341SAndroid Build Coastguard Worker 1023*c0909341SAndroid Build Coastguard Worker movu m0, [fg_dataq+FGData.ar_coeffs_uv+uvq+ 0] ; cf0-15 1024*c0909341SAndroid Build Coastguard Worker pxor m3, m3 1025*c0909341SAndroid Build Coastguard Worker pcmpgtb m3, m0 1026*c0909341SAndroid Build Coastguard Worker punpckhbw m1, m0, m3 1027*c0909341SAndroid Build Coastguard Worker punpcklbw m0, m3 1028*c0909341SAndroid Build Coastguard Worker pshufd m2, m0, q1111 1029*c0909341SAndroid Build Coastguard Worker pshufd m3, m0, q2222 1030*c0909341SAndroid Build Coastguard Worker pshufd m4, m0, q3333 1031*c0909341SAndroid Build Coastguard Worker pshufd m0, m0, q0000 1032*c0909341SAndroid Build Coastguard Worker pshufd m5, m1, q1111 1033*c0909341SAndroid Build Coastguard Worker pshufd m6, m1, q2222 1034*c0909341SAndroid Build Coastguard Worker pshufd m7, m1, q3333 1035*c0909341SAndroid Build Coastguard Worker pshufd m1, m1, q0000 1036*c0909341SAndroid Build Coastguard Worker mova [rsp+ 0*16], m0 1037*c0909341SAndroid Build Coastguard Worker mova [rsp+ 1*16], m2 1038*c0909341SAndroid Build Coastguard Worker mova [rsp+ 2*16], m3 1039*c0909341SAndroid Build Coastguard Worker mova [rsp+ 3*16], m4 1040*c0909341SAndroid Build Coastguard Worker mova [rsp+ 4*16], m1 1041*c0909341SAndroid Build Coastguard Worker mova [rsp+ 5*16], m5 1042*c0909341SAndroid Build Coastguard Worker mova [rsp+ 6*16], m6 1043*c0909341SAndroid Build Coastguard Worker SCRATCH 7, 8, 7 1044*c0909341SAndroid Build Coastguard Worker 1045*c0909341SAndroid Build Coastguard Worker movu m2, [fg_dataq+FGData.ar_coeffs_uv+uvq+16] ; cf16-24 [24=luma] 1046*c0909341SAndroid Build Coastguard Worker pxor m4, m4 1047*c0909341SAndroid Build Coastguard Worker pcmpgtb m4, m2 1048*c0909341SAndroid Build Coastguard Worker punpckhbw m5, m2, m4 1049*c0909341SAndroid Build 
Coastguard Worker punpcklbw m2, m4 1050*c0909341SAndroid Build Coastguard Worker pshufd m4, m2, q3232 1051*c0909341SAndroid Build Coastguard Worker punpcklwd m3, m4, m5 1052*c0909341SAndroid Build Coastguard Worker pshuflw m5, m4, q3321 1053*c0909341SAndroid Build Coastguard Worker pshufd m4, m3, q0000 1054*c0909341SAndroid Build Coastguard Worker pshufd m3, m2, q1111 1055*c0909341SAndroid Build Coastguard Worker pshufd m2, m2, q0000 1056*c0909341SAndroid Build Coastguard Worker pinsrw m5, [base+round_vals+shiftq*2-10], 3 1057*c0909341SAndroid Build Coastguard Worker SCRATCH 2, 9, 8 1058*c0909341SAndroid Build Coastguard Worker SCRATCH 3, 10, 9 1059*c0909341SAndroid Build Coastguard Worker SCRATCH 4, 11, 10 1060*c0909341SAndroid Build Coastguard Worker SCRATCH 5, 12, 11 1061*c0909341SAndroid Build Coastguard Worker 1062*c0909341SAndroid Build Coastguard Worker movd m2, [base+round_vals-12+shiftq*2] 1063*c0909341SAndroid Build Coastguard Worker%if %2 1064*c0909341SAndroid Build Coastguard Worker movd m1, [base+pb_1] 1065*c0909341SAndroid Build Coastguard Worker movd m3, [base+hmul_bits+2+%3*2] 1066*c0909341SAndroid Build Coastguard Worker%endif 1067*c0909341SAndroid Build Coastguard Worker pxor m0, m0 1068*c0909341SAndroid Build Coastguard Worker punpcklwd m2, m0 1069*c0909341SAndroid Build Coastguard Worker%if %2 1070*c0909341SAndroid Build Coastguard Worker punpcklwd m3, m3 1071*c0909341SAndroid Build Coastguard Worker%endif 1072*c0909341SAndroid Build Coastguard Worker pshufd m2, m2, q0000 1073*c0909341SAndroid Build Coastguard Worker%if %2 1074*c0909341SAndroid Build Coastguard Worker pshufd m1, m1, q0000 1075*c0909341SAndroid Build Coastguard Worker pshufd m3, m3, q0000 1076*c0909341SAndroid Build Coastguard Worker SCRATCH 1, 13, 12 1077*c0909341SAndroid Build Coastguard Worker%endif 1078*c0909341SAndroid Build Coastguard Worker SCRATCH 2, 14, 13 1079*c0909341SAndroid Build Coastguard Worker%if %2 1080*c0909341SAndroid Build Coastguard Worker SCRATCH 3, 15, 14 
1081*c0909341SAndroid Build Coastguard Worker%endif 1082*c0909341SAndroid Build Coastguard Worker 1083*c0909341SAndroid Build Coastguard Worker DEFINE_ARGS buf, bufy, fg_data, h, unused, x 1084*c0909341SAndroid Build Coastguard Worker%if %2 1085*c0909341SAndroid Build Coastguard Worker sub bufq, 82*(73-35*%3)+44-(82*3+41) 1086*c0909341SAndroid Build Coastguard Worker%else 1087*c0909341SAndroid Build Coastguard Worker sub bufq, 82*69+3 1088*c0909341SAndroid Build Coastguard Worker%endif 1089*c0909341SAndroid Build Coastguard Worker add bufyq, 79+82*3 1090*c0909341SAndroid Build Coastguard Worker mov hd, 70-35*%3 1091*c0909341SAndroid Build Coastguard Worker.y_loop_ar3: 1092*c0909341SAndroid Build Coastguard Worker mov xq, -(76>>%2) 1093*c0909341SAndroid Build Coastguard Worker 1094*c0909341SAndroid Build Coastguard Worker.x_loop_ar3: 1095*c0909341SAndroid Build Coastguard Worker movu m0, [bufq+xq-82*3-3] ; y=-3,x=[-3,+12] 1096*c0909341SAndroid Build Coastguard Worker pxor m4, m4 1097*c0909341SAndroid Build Coastguard Worker pcmpgtb m4, m0 1098*c0909341SAndroid Build Coastguard Worker punpckhbw m3, m0, m4 1099*c0909341SAndroid Build Coastguard Worker punpcklbw m0, m4 1100*c0909341SAndroid Build Coastguard Worker 1101*c0909341SAndroid Build Coastguard Worker psrldq m5, m0, 2 1102*c0909341SAndroid Build Coastguard Worker psrldq m6, m0, 4 1103*c0909341SAndroid Build Coastguard Worker psrldq m7, m0, 6 1104*c0909341SAndroid Build Coastguard Worker punpcklwd m4, m0, m5 1105*c0909341SAndroid Build Coastguard Worker punpcklwd m6, m7 1106*c0909341SAndroid Build Coastguard Worker pmaddwd m4, [rsp+ 0*16] 1107*c0909341SAndroid Build Coastguard Worker pmaddwd m6, [rsp+ 1*16] 1108*c0909341SAndroid Build Coastguard Worker paddd m4, m6 1109*c0909341SAndroid Build Coastguard Worker 1110*c0909341SAndroid Build Coastguard Worker palignr m2, m3, m0, 10 1111*c0909341SAndroid Build Coastguard Worker palignr m3, m0, 12 1112*c0909341SAndroid Build Coastguard Worker psrldq m0, 8 
1113*c0909341SAndroid Build Coastguard Worker 1114*c0909341SAndroid Build Coastguard Worker movu m1, [bufq+xq-82*2-3] ; y=-2,x=[-3,+12] 1115*c0909341SAndroid Build Coastguard Worker pxor m6, m6 1116*c0909341SAndroid Build Coastguard Worker pcmpgtb m6, m1 1117*c0909341SAndroid Build Coastguard Worker punpckhbw m5, m1, m6 1118*c0909341SAndroid Build Coastguard Worker punpcklbw m1, m6 1119*c0909341SAndroid Build Coastguard Worker 1120*c0909341SAndroid Build Coastguard Worker punpcklwd m0, m2 1121*c0909341SAndroid Build Coastguard Worker punpcklwd m3, m1 1122*c0909341SAndroid Build Coastguard Worker pmaddwd m0, [rsp+ 2*16] 1123*c0909341SAndroid Build Coastguard Worker pmaddwd m3, [rsp+ 3*16] 1124*c0909341SAndroid Build Coastguard Worker paddd m0, m3 1125*c0909341SAndroid Build Coastguard Worker paddd m0, m4 1126*c0909341SAndroid Build Coastguard Worker 1127*c0909341SAndroid Build Coastguard Worker movu m2, [bufq+xq-82*1-3] ; y=-1,x=[-3,+12] 1128*c0909341SAndroid Build Coastguard Worker pxor m7, m7 1129*c0909341SAndroid Build Coastguard Worker pcmpgtb m7, m2 1130*c0909341SAndroid Build Coastguard Worker punpckhbw m6, m2, m7 1131*c0909341SAndroid Build Coastguard Worker punpcklbw m2, m7 1132*c0909341SAndroid Build Coastguard Worker 1133*c0909341SAndroid Build Coastguard Worker palignr m3, m5, m1, 10 1134*c0909341SAndroid Build Coastguard Worker palignr m5, m1, 12 1135*c0909341SAndroid Build Coastguard Worker psrldq m4, m2, 2 1136*c0909341SAndroid Build Coastguard Worker 1137*c0909341SAndroid Build Coastguard Worker punpcklwd m3, m5 1138*c0909341SAndroid Build Coastguard Worker punpcklwd m5, m2, m4 1139*c0909341SAndroid Build Coastguard Worker pmaddwd m3, [rsp+ 6*16] 1140*c0909341SAndroid Build Coastguard Worker pmaddwd m5, m8 1141*c0909341SAndroid Build Coastguard Worker paddd m3, m5 1142*c0909341SAndroid Build Coastguard Worker paddd m0, m3 1143*c0909341SAndroid Build Coastguard Worker 1144*c0909341SAndroid Build Coastguard Worker psrldq m3, m1, 2 1145*c0909341SAndroid 
Build Coastguard Worker psrldq m4, m1, 4 1146*c0909341SAndroid Build Coastguard Worker psrldq m5, m1, 6 1147*c0909341SAndroid Build Coastguard Worker psrldq m1, 8 1148*c0909341SAndroid Build Coastguard Worker 1149*c0909341SAndroid Build Coastguard Worker punpcklwd m3, m4 1150*c0909341SAndroid Build Coastguard Worker punpcklwd m5, m1 1151*c0909341SAndroid Build Coastguard Worker pmaddwd m3, [rsp+ 4*16] 1152*c0909341SAndroid Build Coastguard Worker pmaddwd m5, [rsp+ 5*16] 1153*c0909341SAndroid Build Coastguard Worker paddd m3, m5 1154*c0909341SAndroid Build Coastguard Worker paddd m0, m3 1155*c0909341SAndroid Build Coastguard Worker 1156*c0909341SAndroid Build Coastguard Worker%if %2 1157*c0909341SAndroid Build Coastguard Worker movq m1, [bufyq+xq*2] 1158*c0909341SAndroid Build Coastguard Worker%if %3 1159*c0909341SAndroid Build Coastguard Worker movq m3, [bufyq+xq*2+82] 1160*c0909341SAndroid Build Coastguard Worker%endif 1161*c0909341SAndroid Build Coastguard Worker pmaddubsw m7, m13, m1 1162*c0909341SAndroid Build Coastguard Worker%if %3 1163*c0909341SAndroid Build Coastguard Worker pmaddubsw m5, m13, m3 1164*c0909341SAndroid Build Coastguard Worker paddw m7, m5 1165*c0909341SAndroid Build Coastguard Worker%endif 1166*c0909341SAndroid Build Coastguard Worker pmulhrsw m7, m15 1167*c0909341SAndroid Build Coastguard Worker%else 1168*c0909341SAndroid Build Coastguard Worker movd m7, [bufyq+xq] 1169*c0909341SAndroid Build Coastguard Worker pxor m1, m1 1170*c0909341SAndroid Build Coastguard Worker pcmpgtb m1, m7 1171*c0909341SAndroid Build Coastguard Worker punpcklbw m7, m1 1172*c0909341SAndroid Build Coastguard Worker%endif 1173*c0909341SAndroid Build Coastguard Worker 1174*c0909341SAndroid Build Coastguard Worker psrldq m1, m2, 4 1175*c0909341SAndroid Build Coastguard Worker psrldq m3, m2, 6 1176*c0909341SAndroid Build Coastguard Worker palignr m4, m6, m2, 10 1177*c0909341SAndroid Build Coastguard Worker palignr m6, m2, 12 1178*c0909341SAndroid Build Coastguard Worker 
psrldq m2, 8 1179*c0909341SAndroid Build Coastguard Worker 1180*c0909341SAndroid Build Coastguard Worker punpcklwd m1, m3 1181*c0909341SAndroid Build Coastguard Worker punpcklwd m2, m4 1182*c0909341SAndroid Build Coastguard Worker punpcklwd m6, m7 1183*c0909341SAndroid Build Coastguard Worker pmaddwd m1, m9 1184*c0909341SAndroid Build Coastguard Worker pmaddwd m2, m10 1185*c0909341SAndroid Build Coastguard Worker pmaddwd m6, m11 1186*c0909341SAndroid Build Coastguard Worker paddd m1, m2 1187*c0909341SAndroid Build Coastguard Worker paddd m0, m6 1188*c0909341SAndroid Build Coastguard Worker paddd m0, m1 1189*c0909341SAndroid Build Coastguard Worker paddd m0, m14 1190*c0909341SAndroid Build Coastguard Worker 1191*c0909341SAndroid Build Coastguard Worker movq m1, [bufq+xq-3] ; y=0,x=[-3,+4] 1192*c0909341SAndroid Build Coastguard Worker pxor m4, m4 1193*c0909341SAndroid Build Coastguard Worker movd m5, [base+byte_blend] 1194*c0909341SAndroid Build Coastguard Worker.x_loop_ar3_inner: 1195*c0909341SAndroid Build Coastguard Worker pcmpgtb m2, m4, m1 1196*c0909341SAndroid Build Coastguard Worker punpcklbw m3, m1, m2 1197*c0909341SAndroid Build Coastguard Worker pmaddwd m2, m3, m12 1198*c0909341SAndroid Build Coastguard Worker pshufd m3, m2, q1111 1199*c0909341SAndroid Build Coastguard Worker paddd m2, m3 ; left+cur 1200*c0909341SAndroid Build Coastguard Worker paddd m2, m0 ; add top 1201*c0909341SAndroid Build Coastguard Worker psrldq m0, 4 1202*c0909341SAndroid Build Coastguard Worker psrad m2, [fg_dataq+FGData.ar_coeff_shift] 1203*c0909341SAndroid Build Coastguard Worker ; don't packssdw, we only care about one value 1204*c0909341SAndroid Build Coastguard Worker packsswb m2, m2 1205*c0909341SAndroid Build Coastguard Worker pandn m3, m5, m1 1206*c0909341SAndroid Build Coastguard Worker pslld m2, 24 1207*c0909341SAndroid Build Coastguard Worker pand m2, m5 1208*c0909341SAndroid Build Coastguard Worker por m1, m2, m3 1209*c0909341SAndroid Build Coastguard Worker movd 
[bufq+xq-3], m1                         ; operands of the `movd` that ends the previous line
    psrldq          m1, 1
    inc             xq
    jz .x_loop_ar3_end                  ; xq counts up from a negative start; 0 ends the row
    test            xq, 3
    jnz .x_loop_ar3_inner               ; stay in the inner loop until xq is a multiple of 4
    jmp .x_loop_ar3

.x_loop_ar3_end:
    add           bufq, 82
    add          bufyq, 82<<%3
    dec             hd
    jg .y_loop_ar3
    RET
%endmacro

generate_grain_uv_fn 420, 1, 1
generate_grain_uv_fn 422, 1, 0
generate_grain_uv_fn 444, 0, 0

;-----------------------------------------------------------------------
; vpgatherdw dst, src, base, tmp_gpr0, tmp_gpr1 [, tmp_xmm]
;
; Emulated 8-lane word gather. For every 16-bit lane i of src, a word is
; loaded from [base + src.word[i]] and placed in lane i of dst.
;
;   dst       xmm register receiving the eight gathered words
;   src       xmm register holding eight 16-bit offsets (zero-extended
;             before being used as addresses)
;   base      address expression the offsets are added to
;   tmp_gpr0  \ GPR names without a size suffix; the macro appends d/w
;   tmp_gpr1  / itself and uses the bare name inside the address
;   tmp_xmm   optional xmm scratch used to walk through src
;
; Clobbers: tmp_gpr0, tmp_gpr1, flags (shr), and the xmm scratch.
; When tmp_xmm is omitted, src itself serves as the scratch and is
; destroyed; when it is given, src is only read.
;
; The first lane pair is seeded with a 32-bit movd load from
; base+offset0 (which also clears the rest of dst); lane 1 is then
; overwritten by the following pinsrw. That first access therefore
; reads four bytes rather than two.
;
; Call sites in this file pass "scalingq-1" as base and follow the
; gather with "psrlw x, 8".
;-----------------------------------------------------------------------
%macro vpgatherdw 5-6 ; dst, src, base, tmp_gpr[x2], tmp_xmm_reg
%assign %%idx 0
%define %%tmp %2                        ; default scratch: src itself
%if %0 == 6
%define %%tmp %6                        ; caller supplied a separate scratch
%endif
%rep 4                                  ; two lanes per iteration
%if %%idx == 0
    movd        %5 %+ d, %2             ; words 0-1 straight from src
    pshuflw       %%tmp, %2, q3232      ; scratch low dword = words 2-3
%else
    movd        %5 %+ d, %%tmp          ; next two words from scratch
%if %%idx == 2
    punpckhqdq    %%tmp, %%tmp          ; bring words 4-7 into the low qword
%elif %%idx == 4
    psrlq         %%tmp, 32             ; bring words 6-7 into the low dword
%endif
%endif
    movzx       %4 %+ d, %5 %+ w        ; tmp_gpr0 = even-lane offset
    shr         %5 %+ d, 16             ; tmp_gpr1 = odd-lane offset

%if %%idx == 0
    movd            %1, [%3+%4]         ; seeds lane 0 and zeroes the rest of dst
%else
    pinsrw          %1, [%3+%4], %%idx + 0
%endif
    pinsrw          %1, [%3+%5], %%idx + 1
%assign %%idx %%idx+2
%endrep
%endmacro

INIT_XMM ssse3
; fgy_32x32xn(dst, src, stride, fg_data, w, scaling, grain_lut, h, sby)
%if ARCH_X86_32
%if STACK_ALIGNMENT < mmsize
cglobal fgy_32x32xn_8bpc, 0, 7, 16, 0 - (5 * mmsize + 16 * gprsize), \
        dst, src, scaling, unused1, fg_data, picptr, unused2
    ; copy stack arguments to new position post-alignment, so that we
    ; don't have to keep the old stack location in a separate register
    mov             r0, r0m
    mov             r1, r2m
    mov             r2, r4m
    mov             r3, r6m
Build Coastguard Worker mov r3, r6m 1272*c0909341SAndroid Build Coastguard Worker mov r4, r7m 1273*c0909341SAndroid Build Coastguard Worker mov r5, r8m 1274*c0909341SAndroid Build Coastguard Worker 1275*c0909341SAndroid Build Coastguard Worker mov [rsp+5*mmsize+ 4*gprsize], r0 1276*c0909341SAndroid Build Coastguard Worker mov [rsp+5*mmsize+ 6*gprsize], r1 1277*c0909341SAndroid Build Coastguard Worker mov [rsp+5*mmsize+ 8*gprsize], r2 1278*c0909341SAndroid Build Coastguard Worker mov [rsp+5*mmsize+10*gprsize], r3 1279*c0909341SAndroid Build Coastguard Worker mov [rsp+5*mmsize+11*gprsize], r4 1280*c0909341SAndroid Build Coastguard Worker mov [rsp+5*mmsize+12*gprsize], r5 1281*c0909341SAndroid Build Coastguard Worker%else 1282*c0909341SAndroid Build Coastguard Workercglobal fgy_32x32xn_8bpc, 0, 7, 16, 5 * mmsize + 4 * gprsize, \ 1283*c0909341SAndroid Build Coastguard Worker dst, src, scaling, unused1, fg_data, picptr, unused2 1284*c0909341SAndroid Build Coastguard Worker%endif 1285*c0909341SAndroid Build Coastguard Worker mov srcq, srcm 1286*c0909341SAndroid Build Coastguard Worker mov fg_dataq, r3m 1287*c0909341SAndroid Build Coastguard Worker mov scalingq, r5m 1288*c0909341SAndroid Build Coastguard Worker%if STACK_ALIGNMENT < mmsize 1289*c0909341SAndroid Build Coastguard Worker%define r0m [rsp+5*mmsize+ 4*gprsize] 1290*c0909341SAndroid Build Coastguard Worker%define r1m [rsp+5*mmsize+ 5*gprsize] 1291*c0909341SAndroid Build Coastguard Worker%define r2m [rsp+5*mmsize+ 6*gprsize] 1292*c0909341SAndroid Build Coastguard Worker%define r3m [rsp+5*mmsize+ 7*gprsize] 1293*c0909341SAndroid Build Coastguard Worker%define r4m [rsp+5*mmsize+ 8*gprsize] 1294*c0909341SAndroid Build Coastguard Worker%define r5m [rsp+5*mmsize+ 9*gprsize] 1295*c0909341SAndroid Build Coastguard Worker%define r6m [rsp+5*mmsize+10*gprsize] 1296*c0909341SAndroid Build Coastguard Worker%define r7m [rsp+5*mmsize+11*gprsize] 1297*c0909341SAndroid Build Coastguard Worker%define r8m [rsp+5*mmsize+12*gprsize] 
1298*c0909341SAndroid Build Coastguard Worker%endif 1299*c0909341SAndroid Build Coastguard Worker LEA r5, pb_mask 1300*c0909341SAndroid Build Coastguard Worker%define base r5-pb_mask 1301*c0909341SAndroid Build Coastguard Worker mov r5m, picptrq 1302*c0909341SAndroid Build Coastguard Worker%else 1303*c0909341SAndroid Build Coastguard Workercglobal fgy_32x32xn_8bpc, 6, 15, 16, dst, src, stride, fg_data, w, scaling, grain_lut 1304*c0909341SAndroid Build Coastguard Worker lea r7, [pb_mask] 1305*c0909341SAndroid Build Coastguard Worker%define base r7-pb_mask 1306*c0909341SAndroid Build Coastguard Worker%endif 1307*c0909341SAndroid Build Coastguard Worker mov r6d, [fg_dataq+FGData.scaling_shift] 1308*c0909341SAndroid Build Coastguard Worker movd m3, [base+mul_bits+r6*2-14] 1309*c0909341SAndroid Build Coastguard Worker mov r6d, [fg_dataq+FGData.clip_to_restricted_range] 1310*c0909341SAndroid Build Coastguard Worker movd m4, [base+max+r6*4] 1311*c0909341SAndroid Build Coastguard Worker movd m5, [base+min+r6*2] 1312*c0909341SAndroid Build Coastguard Worker punpcklwd m3, m3 1313*c0909341SAndroid Build Coastguard Worker punpcklwd m4, m4 1314*c0909341SAndroid Build Coastguard Worker punpcklwd m5, m5 1315*c0909341SAndroid Build Coastguard Worker pshufd m3, m3, q0000 1316*c0909341SAndroid Build Coastguard Worker pshufd m4, m4, q0000 1317*c0909341SAndroid Build Coastguard Worker pshufd m5, m5, q0000 1318*c0909341SAndroid Build Coastguard Worker SCRATCH 3, 11, 0 1319*c0909341SAndroid Build Coastguard Worker SCRATCH 4, 12, 1 1320*c0909341SAndroid Build Coastguard Worker SCRATCH 5, 13, 2 1321*c0909341SAndroid Build Coastguard Worker 1322*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32 1323*c0909341SAndroid Build Coastguard Worker DEFINE_ARGS dst, src, scaling, sby, fg_data, picptr, overlap 1324*c0909341SAndroid Build Coastguard Worker%else 1325*c0909341SAndroid Build Coastguard Worker DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, unused, sby, see, overlap 
1326*c0909341SAndroid Build Coastguard Worker%endif 1327*c0909341SAndroid Build Coastguard Worker 1328*c0909341SAndroid Build Coastguard Worker mov sbyd, r8m 1329*c0909341SAndroid Build Coastguard Worker mov overlapd, [fg_dataq+FGData.overlap_flag] ; left_overlap: overlap & 1 1330*c0909341SAndroid Build Coastguard Worker test overlapd, overlapd 1331*c0909341SAndroid Build Coastguard Worker jz .no_vertical_overlap 1332*c0909341SAndroid Build Coastguard Worker mova m6, [base+pw_1024] 1333*c0909341SAndroid Build Coastguard Worker mova m7, [base+pb_27_17_17_27] 1334*c0909341SAndroid Build Coastguard Worker SCRATCH 6, 14, 3 1335*c0909341SAndroid Build Coastguard Worker SCRATCH 7, 15, 4 1336*c0909341SAndroid Build Coastguard Worker test sbyd, sbyd 1337*c0909341SAndroid Build Coastguard Worker jnz .vertical_overlap 1338*c0909341SAndroid Build Coastguard Worker ; fall-through 1339*c0909341SAndroid Build Coastguard Worker 1340*c0909341SAndroid Build Coastguard Worker.no_vertical_overlap: 1341*c0909341SAndroid Build Coastguard Worker mov r8m, overlapd 1342*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32 1343*c0909341SAndroid Build Coastguard Worker DEFINE_ARGS dst, src, scaling, see, fg_data, picptr, unused 1344*c0909341SAndroid Build Coastguard Worker imul seed, (173 << 24) | 37 1345*c0909341SAndroid Build Coastguard Worker%else 1346*c0909341SAndroid Build Coastguard Worker imul seed, sbyd, (173 << 24) | 37 1347*c0909341SAndroid Build Coastguard Worker%endif 1348*c0909341SAndroid Build Coastguard Worker add seed, (105 << 24) | 178 1349*c0909341SAndroid Build Coastguard Worker rol seed, 8 1350*c0909341SAndroid Build Coastguard Worker movzx seed, seew 1351*c0909341SAndroid Build Coastguard Worker xor seed, [fg_dataq+FGData.seed] 1352*c0909341SAndroid Build Coastguard Worker 1353*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32 1354*c0909341SAndroid Build Coastguard Worker DEFINE_ARGS dst, src, scaling, see, w, picptr, src_bak 1355*c0909341SAndroid Build 
Coastguard Worker 1356*c0909341SAndroid Build Coastguard Worker mov r3m, seed 1357*c0909341SAndroid Build Coastguard Worker mov wq, r4m 1358*c0909341SAndroid Build Coastguard Worker%else 1359*c0909341SAndroid Build Coastguard Worker DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \ 1360*c0909341SAndroid Build Coastguard Worker unused1, unused2, see, unused3 1361*c0909341SAndroid Build Coastguard Worker%endif 1362*c0909341SAndroid Build Coastguard Worker 1363*c0909341SAndroid Build Coastguard Worker lea src_bakq, [srcq+wq] 1364*c0909341SAndroid Build Coastguard Worker neg wq 1365*c0909341SAndroid Build Coastguard Worker sub dstmp, srcq 1366*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32 1367*c0909341SAndroid Build Coastguard Worker mov r1m, src_bakq 1368*c0909341SAndroid Build Coastguard Worker mov r4m, wq 1369*c0909341SAndroid Build Coastguard Worker DEFINE_ARGS dst, src, scaling, see, unused1, unused2, unused3 1370*c0909341SAndroid Build Coastguard Worker%endif 1371*c0909341SAndroid Build Coastguard Worker 1372*c0909341SAndroid Build Coastguard Worker.loop_x: 1373*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32 1374*c0909341SAndroid Build Coastguard Worker mov seed, r3m 1375*c0909341SAndroid Build Coastguard Worker%endif 1376*c0909341SAndroid Build Coastguard Worker mov r6d, seed 1377*c0909341SAndroid Build Coastguard Worker or seed, 0xEFF4 1378*c0909341SAndroid Build Coastguard Worker shr r6d, 1 1379*c0909341SAndroid Build Coastguard Worker test seeb, seeh 1380*c0909341SAndroid Build Coastguard Worker lea seed, [r6+0x8000] 1381*c0909341SAndroid Build Coastguard Worker cmovp seed, r6d ; updated seed 1382*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32 1383*c0909341SAndroid Build Coastguard Worker mov r3m, seed 1384*c0909341SAndroid Build Coastguard Worker 1385*c0909341SAndroid Build Coastguard Worker DEFINE_ARGS dst, src, scaling, offy, unused1, unused2, offx 1386*c0909341SAndroid Build Coastguard Worker 1387*c0909341SAndroid 
Build Coastguard Worker mov offxd, offyd 1388*c0909341SAndroid Build Coastguard Worker%else 1389*c0909341SAndroid Build Coastguard Worker DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \ 1390*c0909341SAndroid Build Coastguard Worker offx, offy, see, unused 1391*c0909341SAndroid Build Coastguard Worker 1392*c0909341SAndroid Build Coastguard Worker mov offyd, seed 1393*c0909341SAndroid Build Coastguard Worker mov offxd, seed 1394*c0909341SAndroid Build Coastguard Worker%endif 1395*c0909341SAndroid Build Coastguard Worker ror offyd, 8 1396*c0909341SAndroid Build Coastguard Worker shr offxd, 12 1397*c0909341SAndroid Build Coastguard Worker and offyd, 0xf 1398*c0909341SAndroid Build Coastguard Worker imul offyd, 164 1399*c0909341SAndroid Build Coastguard Worker lea offyq, [offyq+offxq*2+747] ; offy*stride+offx 1400*c0909341SAndroid Build Coastguard Worker 1401*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32 1402*c0909341SAndroid Build Coastguard Worker ; r0m=dst, r1m=src_bak, r2m=stride, r3m=see, r4m=w, r5m=picptr, 1403*c0909341SAndroid Build Coastguard Worker ; r6m=grain_lut, r7m=h, r8m=overlap_v|h 1404*c0909341SAndroid Build Coastguard Worker DEFINE_ARGS dst, src, scaling, offxy, h, picptr, grain_lut 1405*c0909341SAndroid Build Coastguard Worker%else 1406*c0909341SAndroid Build Coastguard Worker DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \ 1407*c0909341SAndroid Build Coastguard Worker h, offxy, see, unused 1408*c0909341SAndroid Build Coastguard Worker%endif 1409*c0909341SAndroid Build Coastguard Worker 1410*c0909341SAndroid Build Coastguard Worker.loop_x_odd: 1411*c0909341SAndroid Build Coastguard Worker mov hd, r7m 1412*c0909341SAndroid Build Coastguard Worker mov grain_lutq, grain_lutmp 1413*c0909341SAndroid Build Coastguard Worker.loop_y: 1414*c0909341SAndroid Build Coastguard Worker ; src 1415*c0909341SAndroid Build Coastguard Worker mova m0, [srcq] 1416*c0909341SAndroid Build Coastguard Worker pxor m2, m2 
1417*c0909341SAndroid Build Coastguard Worker punpckhbw m1, m0, m2 1418*c0909341SAndroid Build Coastguard Worker punpcklbw m0, m2 ; m0-1: src as word 1419*c0909341SAndroid Build Coastguard Worker 1420*c0909341SAndroid Build Coastguard Worker ; scaling[src] 1421*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32 1422*c0909341SAndroid Build Coastguard Worker vpgatherdw m4, m0, scalingq-1, r0, r5, m3 1423*c0909341SAndroid Build Coastguard Worker vpgatherdw m5, m1, scalingq-1, r0, r5, m3 1424*c0909341SAndroid Build Coastguard Worker%else 1425*c0909341SAndroid Build Coastguard Worker vpgatherdw m4, m0, scalingq-1, r12, r13, m3 1426*c0909341SAndroid Build Coastguard Worker vpgatherdw m5, m1, scalingq-1, r12, r13, m3 1427*c0909341SAndroid Build Coastguard Worker%endif 1428*c0909341SAndroid Build Coastguard Worker REPX {psrlw x, 8}, m4, m5 1429*c0909341SAndroid Build Coastguard Worker 1430*c0909341SAndroid Build Coastguard Worker ; grain = grain_lut[offy+y][offx+x] 1431*c0909341SAndroid Build Coastguard Worker movu m3, [grain_lutq+offxyq] 1432*c0909341SAndroid Build Coastguard Worker pcmpgtb m7, m2, m3 1433*c0909341SAndroid Build Coastguard Worker punpcklbw m2, m3, m7 1434*c0909341SAndroid Build Coastguard Worker punpckhbw m3, m7 1435*c0909341SAndroid Build Coastguard Worker 1436*c0909341SAndroid Build Coastguard Worker ; noise = round2(scaling[src] * grain, scaling_shift) 1437*c0909341SAndroid Build Coastguard Worker pmullw m2, m4 1438*c0909341SAndroid Build Coastguard Worker pmullw m3, m5 1439*c0909341SAndroid Build Coastguard Worker pmulhrsw m2, m11 1440*c0909341SAndroid Build Coastguard Worker pmulhrsw m3, m11 1441*c0909341SAndroid Build Coastguard Worker 1442*c0909341SAndroid Build Coastguard Worker ; dst = clip_pixel(src, noise) 1443*c0909341SAndroid Build Coastguard Worker paddw m0, m2 1444*c0909341SAndroid Build Coastguard Worker paddw m1, m3 1445*c0909341SAndroid Build Coastguard Worker pmaxsw m0, m13 1446*c0909341SAndroid Build Coastguard Worker pmaxsw m1, 
m13 1447*c0909341SAndroid Build Coastguard Worker pminsw m0, m12 1448*c0909341SAndroid Build Coastguard Worker pminsw m1, m12 1449*c0909341SAndroid Build Coastguard Worker packuswb m0, m1 1450*c0909341SAndroid Build Coastguard Worker movifnidn dstq, dstmp 1451*c0909341SAndroid Build Coastguard Worker mova [dstq+srcq], m0 1452*c0909341SAndroid Build Coastguard Worker 1453*c0909341SAndroid Build Coastguard Worker add srcq, r2mp 1454*c0909341SAndroid Build Coastguard Worker add grain_lutq, 82 1455*c0909341SAndroid Build Coastguard Worker dec hd 1456*c0909341SAndroid Build Coastguard Worker jg .loop_y 1457*c0909341SAndroid Build Coastguard Worker 1458*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32 1459*c0909341SAndroid Build Coastguard Worker add r4mp, 16 1460*c0909341SAndroid Build Coastguard Worker%else 1461*c0909341SAndroid Build Coastguard Worker add wq, 16 1462*c0909341SAndroid Build Coastguard Worker%endif 1463*c0909341SAndroid Build Coastguard Worker jge .end 1464*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32 1465*c0909341SAndroid Build Coastguard Worker mov srcq, r1mp 1466*c0909341SAndroid Build Coastguard Worker add srcq, r4mp 1467*c0909341SAndroid Build Coastguard Worker%else 1468*c0909341SAndroid Build Coastguard Worker lea srcq, [src_bakq+wq] 1469*c0909341SAndroid Build Coastguard Worker%endif 1470*c0909341SAndroid Build Coastguard Worker btc dword r8m, 2 1471*c0909341SAndroid Build Coastguard Worker jc .next_blk 1472*c0909341SAndroid Build Coastguard Worker 1473*c0909341SAndroid Build Coastguard Worker add offxyd, 16 1474*c0909341SAndroid Build Coastguard Worker test dword r8m, 2 ; r8m & 2 = have_top_overlap 1475*c0909341SAndroid Build Coastguard Worker jz .loop_x_odd 1476*c0909341SAndroid Build Coastguard Worker 1477*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32 1478*c0909341SAndroid Build Coastguard Worker add dword [rsp+5*mmsize+1*gprsize], 16 1479*c0909341SAndroid Build Coastguard Worker%else 1480*c0909341SAndroid Build 
Coastguard Worker add r11d, 16 ; top_offxyd 1481*c0909341SAndroid Build Coastguard Worker%endif 1482*c0909341SAndroid Build Coastguard Worker jnz .loop_x_odd_v_overlap 1483*c0909341SAndroid Build Coastguard Worker 1484*c0909341SAndroid Build Coastguard Worker.next_blk: 1485*c0909341SAndroid Build Coastguard Worker test dword r8m, 1 1486*c0909341SAndroid Build Coastguard Worker jz .loop_x 1487*c0909341SAndroid Build Coastguard Worker 1488*c0909341SAndroid Build Coastguard Worker test dword r8m, 2 1489*c0909341SAndroid Build Coastguard Worker jnz .loop_x_hv_overlap 1490*c0909341SAndroid Build Coastguard Worker 1491*c0909341SAndroid Build Coastguard Worker ; horizontal overlap (without vertical overlap) 1492*c0909341SAndroid Build Coastguard Worker.loop_x_h_overlap: 1493*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32 1494*c0909341SAndroid Build Coastguard Worker ; r0m=dst, r1m=src_bak, r2m=stride, r3m=see, r4m=w, r5m=picptr, 1495*c0909341SAndroid Build Coastguard Worker ; r6m=grain_lut, r7m=h, r8m=overlap_v|h 1496*c0909341SAndroid Build Coastguard Worker DEFINE_ARGS dst, src, scaling, offxy, unused1, unused2, unused3 1497*c0909341SAndroid Build Coastguard Worker 1498*c0909341SAndroid Build Coastguard Worker add offxyd, 16 ; left_offxyd 1499*c0909341SAndroid Build Coastguard Worker mov [rsp+5*mmsize+0*gprsize], offxyd 1500*c0909341SAndroid Build Coastguard Worker 1501*c0909341SAndroid Build Coastguard Worker DEFINE_ARGS dst, src, scaling, see, unused1, unused2, unused3 1502*c0909341SAndroid Build Coastguard Worker 1503*c0909341SAndroid Build Coastguard Worker mov seed, r3m 1504*c0909341SAndroid Build Coastguard Worker%else 1505*c0909341SAndroid Build Coastguard Worker DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \ 1506*c0909341SAndroid Build Coastguard Worker offx, offy, see, left_offxy 1507*c0909341SAndroid Build Coastguard Worker 1508*c0909341SAndroid Build Coastguard Worker lea left_offxyd, [offyd+16] ; previous column's offy*stride+offx 
1509*c0909341SAndroid Build Coastguard Worker%endif 1510*c0909341SAndroid Build Coastguard Worker 1511*c0909341SAndroid Build Coastguard Worker mov r6d, seed 1512*c0909341SAndroid Build Coastguard Worker or seed, 0xEFF4 1513*c0909341SAndroid Build Coastguard Worker shr r6d, 1 1514*c0909341SAndroid Build Coastguard Worker test seeb, seeh 1515*c0909341SAndroid Build Coastguard Worker lea seed, [r6+0x8000] 1516*c0909341SAndroid Build Coastguard Worker cmovp seed, r6d ; updated seed 1517*c0909341SAndroid Build Coastguard Worker 1518*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32 1519*c0909341SAndroid Build Coastguard Worker mov r3m, seed 1520*c0909341SAndroid Build Coastguard Worker 1521*c0909341SAndroid Build Coastguard Worker DEFINE_ARGS dst, src, scaling, offy, unused1, unused2, offx 1522*c0909341SAndroid Build Coastguard Worker 1523*c0909341SAndroid Build Coastguard Worker mov offxd, offyd 1524*c0909341SAndroid Build Coastguard Worker%else 1525*c0909341SAndroid Build Coastguard Worker mov offyd, seed 1526*c0909341SAndroid Build Coastguard Worker mov offxd, seed 1527*c0909341SAndroid Build Coastguard Worker%endif 1528*c0909341SAndroid Build Coastguard Worker ror offyd, 8 1529*c0909341SAndroid Build Coastguard Worker shr offxd, 12 1530*c0909341SAndroid Build Coastguard Worker and offyd, 0xf 1531*c0909341SAndroid Build Coastguard Worker imul offyd, 164 1532*c0909341SAndroid Build Coastguard Worker lea offyq, [offyq+offxq*2+747] ; offy*stride+offx 1533*c0909341SAndroid Build Coastguard Worker 1534*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32 1535*c0909341SAndroid Build Coastguard Worker DEFINE_ARGS dst, src, scaling, offxy, h, picptr, grain_lut 1536*c0909341SAndroid Build Coastguard Worker%else 1537*c0909341SAndroid Build Coastguard Worker DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \ 1538*c0909341SAndroid Build Coastguard Worker h, offxy, see, left_offxy 1539*c0909341SAndroid Build Coastguard Worker%endif 1540*c0909341SAndroid 
Build Coastguard Worker 1541*c0909341SAndroid Build Coastguard Worker mov hd, r7m 1542*c0909341SAndroid Build Coastguard Worker mov grain_lutq, grain_lutmp 1543*c0909341SAndroid Build Coastguard Worker.loop_y_h_overlap: 1544*c0909341SAndroid Build Coastguard Worker ; src 1545*c0909341SAndroid Build Coastguard Worker mova m0, [srcq] 1546*c0909341SAndroid Build Coastguard Worker pxor m2, m2 1547*c0909341SAndroid Build Coastguard Worker punpckhbw m1, m0, m2 1548*c0909341SAndroid Build Coastguard Worker punpcklbw m0, m2 ; m0-1: src as word 1549*c0909341SAndroid Build Coastguard Worker 1550*c0909341SAndroid Build Coastguard Worker ; scaling[src] 1551*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32 1552*c0909341SAndroid Build Coastguard Worker vpgatherdw m4, m0, scalingq-1, r0, r5, m3 1553*c0909341SAndroid Build Coastguard Worker vpgatherdw m5, m1, scalingq-1, r0, r5, m3 1554*c0909341SAndroid Build Coastguard Worker%else 1555*c0909341SAndroid Build Coastguard Worker vpgatherdw m4, m0, scalingq-1, r12, r13, m3 1556*c0909341SAndroid Build Coastguard Worker vpgatherdw m5, m1, scalingq-1, r12, r13, m3 1557*c0909341SAndroid Build Coastguard Worker%endif 1558*c0909341SAndroid Build Coastguard Worker REPX {psrlw x, 8}, m4, m5 1559*c0909341SAndroid Build Coastguard Worker 1560*c0909341SAndroid Build Coastguard Worker ; grain = grain_lut[offy+y][offx+x] 1561*c0909341SAndroid Build Coastguard Worker movu m3, [grain_lutq+offxyq] 1562*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32 1563*c0909341SAndroid Build Coastguard Worker mov r5, [rsp+5*mmsize+0*gprsize] 1564*c0909341SAndroid Build Coastguard Worker movd m7, [grain_lutq+r5] 1565*c0909341SAndroid Build Coastguard Worker%else 1566*c0909341SAndroid Build Coastguard Worker movd m7, [grain_lutq+left_offxyq] 1567*c0909341SAndroid Build Coastguard Worker%endif 1568*c0909341SAndroid Build Coastguard Worker punpcklbw m7, m3 1569*c0909341SAndroid Build Coastguard Worker pmaddubsw m6, m15, m7 1570*c0909341SAndroid Build 
Coastguard Worker pmulhrsw m6, m14 1571*c0909341SAndroid Build Coastguard Worker packsswb m6, m6 1572*c0909341SAndroid Build Coastguard Worker shufps m6, m3, q3210 1573*c0909341SAndroid Build Coastguard Worker pcmpgtb m2, m6 1574*c0909341SAndroid Build Coastguard Worker punpcklbw m7, m6, m2 1575*c0909341SAndroid Build Coastguard Worker punpckhbw m6, m2 1576*c0909341SAndroid Build Coastguard Worker 1577*c0909341SAndroid Build Coastguard Worker ; noise = round2(scaling[src] * grain, scaling_shift) 1578*c0909341SAndroid Build Coastguard Worker pmullw m7, m4 1579*c0909341SAndroid Build Coastguard Worker pmullw m6, m5 1580*c0909341SAndroid Build Coastguard Worker pmulhrsw m7, m11 1581*c0909341SAndroid Build Coastguard Worker pmulhrsw m6, m11 1582*c0909341SAndroid Build Coastguard Worker 1583*c0909341SAndroid Build Coastguard Worker ; dst = clip_pixel(src, noise) 1584*c0909341SAndroid Build Coastguard Worker paddw m0, m7 1585*c0909341SAndroid Build Coastguard Worker paddw m1, m6 1586*c0909341SAndroid Build Coastguard Worker pmaxsw m0, m13 1587*c0909341SAndroid Build Coastguard Worker pmaxsw m1, m13 1588*c0909341SAndroid Build Coastguard Worker pminsw m0, m12 1589*c0909341SAndroid Build Coastguard Worker pminsw m1, m12 1590*c0909341SAndroid Build Coastguard Worker packuswb m0, m1 1591*c0909341SAndroid Build Coastguard Worker movifnidn dstq, dstmp 1592*c0909341SAndroid Build Coastguard Worker mova [dstq+srcq], m0 1593*c0909341SAndroid Build Coastguard Worker 1594*c0909341SAndroid Build Coastguard Worker add srcq, r2mp 1595*c0909341SAndroid Build Coastguard Worker add grain_lutq, 82 1596*c0909341SAndroid Build Coastguard Worker dec hd 1597*c0909341SAndroid Build Coastguard Worker jg .loop_y_h_overlap 1598*c0909341SAndroid Build Coastguard Worker 1599*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32 1600*c0909341SAndroid Build Coastguard Worker add r4mp, 16 1601*c0909341SAndroid Build Coastguard Worker%else 1602*c0909341SAndroid Build Coastguard Worker add wq, 16 
1603*c0909341SAndroid Build Coastguard Worker%endif 1604*c0909341SAndroid Build Coastguard Worker jge .end 1605*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32 1606*c0909341SAndroid Build Coastguard Worker mov srcq, r1m 1607*c0909341SAndroid Build Coastguard Worker add srcq, r4m 1608*c0909341SAndroid Build Coastguard Worker%else 1609*c0909341SAndroid Build Coastguard Worker lea srcq, [src_bakq+wq] 1610*c0909341SAndroid Build Coastguard Worker%endif 1611*c0909341SAndroid Build Coastguard Worker xor dword r8m, 4 1612*c0909341SAndroid Build Coastguard Worker add offxyd, 16 1613*c0909341SAndroid Build Coastguard Worker 1614*c0909341SAndroid Build Coastguard Worker ; since this half-block had left-overlap, the next does not 1615*c0909341SAndroid Build Coastguard Worker test dword r8m, 2 ; have_top_overlap 1616*c0909341SAndroid Build Coastguard Worker jz .loop_x_odd 1617*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32 1618*c0909341SAndroid Build Coastguard Worker add dword [rsp+5*mmsize+1*gprsize], 16 1619*c0909341SAndroid Build Coastguard Worker%else 1620*c0909341SAndroid Build Coastguard Worker add r11d, 16 ; top_offxyd 1621*c0909341SAndroid Build Coastguard Worker%endif 1622*c0909341SAndroid Build Coastguard Worker jmp .loop_x_odd_v_overlap 1623*c0909341SAndroid Build Coastguard Worker 1624*c0909341SAndroid Build Coastguard Worker.end: 1625*c0909341SAndroid Build Coastguard Worker RET 1626*c0909341SAndroid Build Coastguard Worker 1627*c0909341SAndroid Build Coastguard Worker.vertical_overlap: 1628*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32 1629*c0909341SAndroid Build Coastguard Worker DEFINE_ARGS dst, src, scaling, sby, fg_data, picptr, overlap 1630*c0909341SAndroid Build Coastguard Worker%else 1631*c0909341SAndroid Build Coastguard Worker DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, tmp, sby, see, overlap 1632*c0909341SAndroid Build Coastguard Worker%endif 1633*c0909341SAndroid Build Coastguard Worker 
1634*c0909341SAndroid Build Coastguard Worker or overlapd, 2 ; top_overlap: overlap & 2 1635*c0909341SAndroid Build Coastguard Worker mov r8m, overlapd 1636*c0909341SAndroid Build Coastguard Worker movzx sbyd, sbyb 1637*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32 1638*c0909341SAndroid Build Coastguard Worker imul r4, [fg_dataq+FGData.seed], 0x00010001 1639*c0909341SAndroid Build Coastguard Worker DEFINE_ARGS tmp, src, scaling, sby, see, picptr, unused 1640*c0909341SAndroid Build Coastguard Worker%else 1641*c0909341SAndroid Build Coastguard Worker imul seed, [fg_dataq+FGData.seed], 0x00010001 1642*c0909341SAndroid Build Coastguard Worker%endif 1643*c0909341SAndroid Build Coastguard Worker imul tmpd, sbyd, 173 * 0x00010001 1644*c0909341SAndroid Build Coastguard Worker imul sbyd, 37 * 0x01000100 1645*c0909341SAndroid Build Coastguard Worker add tmpd, (105 << 16) | 188 1646*c0909341SAndroid Build Coastguard Worker add sbyd, (178 << 24) | (141 << 8) 1647*c0909341SAndroid Build Coastguard Worker and tmpd, 0x00ff00ff 1648*c0909341SAndroid Build Coastguard Worker and sbyd, 0xff00ff00 1649*c0909341SAndroid Build Coastguard Worker xor seed, tmpd 1650*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32 1651*c0909341SAndroid Build Coastguard Worker xor sbyd, seed ; (cur_seed << 16) | top_seed 1652*c0909341SAndroid Build Coastguard Worker 1653*c0909341SAndroid Build Coastguard Worker DEFINE_ARGS dst, src, scaling, see, w, picptr, src_bak 1654*c0909341SAndroid Build Coastguard Worker 1655*c0909341SAndroid Build Coastguard Worker mov r3m, seed 1656*c0909341SAndroid Build Coastguard Worker mov wq, r4m 1657*c0909341SAndroid Build Coastguard Worker%else 1658*c0909341SAndroid Build Coastguard Worker xor seed, sbyd ; (cur_seed << 16) | top_seed 1659*c0909341SAndroid Build Coastguard Worker 1660*c0909341SAndroid Build Coastguard Worker DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \ 1661*c0909341SAndroid Build Coastguard Worker tmp, unused2, see, unused3 
1662*c0909341SAndroid Build Coastguard Worker%endif 1663*c0909341SAndroid Build Coastguard Worker 1664*c0909341SAndroid Build Coastguard Worker lea src_bakq, [srcq+wq] 1665*c0909341SAndroid Build Coastguard Worker neg wq 1666*c0909341SAndroid Build Coastguard Worker sub dstmp, srcq 1667*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32 1668*c0909341SAndroid Build Coastguard Worker mov r1m, src_bakq 1669*c0909341SAndroid Build Coastguard Worker mov r4m, wq 1670*c0909341SAndroid Build Coastguard Worker DEFINE_ARGS tmp, src, scaling, see, unused1, picptr, unused2 1671*c0909341SAndroid Build Coastguard Worker%endif 1672*c0909341SAndroid Build Coastguard Worker 1673*c0909341SAndroid Build Coastguard Worker.loop_x_v_overlap: 1674*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32 1675*c0909341SAndroid Build Coastguard Worker mov seed, r3m 1676*c0909341SAndroid Build Coastguard Worker%endif 1677*c0909341SAndroid Build Coastguard Worker ; we assume from the block above that bits 8-15 of tmpd are zero'ed, 1678*c0909341SAndroid Build Coastguard Worker ; because of the 'and tmpd, 0x00ff00ff' above 1679*c0909341SAndroid Build Coastguard Worker mov r6d, seed 1680*c0909341SAndroid Build Coastguard Worker or seed, 0xeff4eff4 1681*c0909341SAndroid Build Coastguard Worker test seeb, seeh 1682*c0909341SAndroid Build Coastguard Worker setp tmpb ; parity of top_seed 1683*c0909341SAndroid Build Coastguard Worker shr seed, 16 1684*c0909341SAndroid Build Coastguard Worker shl tmpd, 16 1685*c0909341SAndroid Build Coastguard Worker test seeb, seeh 1686*c0909341SAndroid Build Coastguard Worker setp tmpb ; parity of cur_seed 1687*c0909341SAndroid Build Coastguard Worker or r6d, 0x00010001 1688*c0909341SAndroid Build Coastguard Worker xor tmpd, r6d 1689*c0909341SAndroid Build Coastguard Worker mov seed, tmpd 1690*c0909341SAndroid Build Coastguard Worker ror seed, 1 ; updated (cur_seed << 16) | top_seed 1691*c0909341SAndroid Build Coastguard Worker 1692*c0909341SAndroid Build 
Coastguard Worker%if ARCH_X86_32 1693*c0909341SAndroid Build Coastguard Worker mov r3m, seed 1694*c0909341SAndroid Build Coastguard Worker 1695*c0909341SAndroid Build Coastguard Worker DEFINE_ARGS dst, src, scaling, offy, unused1, unused2, offx 1696*c0909341SAndroid Build Coastguard Worker 1697*c0909341SAndroid Build Coastguard Worker mov offxd, offyd 1698*c0909341SAndroid Build Coastguard Worker%else 1699*c0909341SAndroid Build Coastguard Worker DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \ 1700*c0909341SAndroid Build Coastguard Worker offx, offy, see, unused, top_offxy 1701*c0909341SAndroid Build Coastguard Worker 1702*c0909341SAndroid Build Coastguard Worker mov offyd, seed 1703*c0909341SAndroid Build Coastguard Worker mov offxd, seed 1704*c0909341SAndroid Build Coastguard Worker%endif 1705*c0909341SAndroid Build Coastguard Worker 1706*c0909341SAndroid Build Coastguard Worker ror offyd, 8 1707*c0909341SAndroid Build Coastguard Worker ror offxd, 12 1708*c0909341SAndroid Build Coastguard Worker and offyd, 0xf000f 1709*c0909341SAndroid Build Coastguard Worker and offxd, 0xf000f 1710*c0909341SAndroid Build Coastguard Worker imul offyd, 164 1711*c0909341SAndroid Build Coastguard Worker ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy 1712*c0909341SAndroid Build Coastguard Worker lea offyq, [offyq+offxq*2+0x10001*747+32*82] 1713*c0909341SAndroid Build Coastguard Worker 1714*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32 1715*c0909341SAndroid Build Coastguard Worker DEFINE_ARGS top_offxy, src, scaling, offxy, h, picptr, grain_lut 1716*c0909341SAndroid Build Coastguard Worker%else 1717*c0909341SAndroid Build Coastguard Worker DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \ 1718*c0909341SAndroid Build Coastguard Worker h, offxy, see, unused, top_offxy 1719*c0909341SAndroid Build Coastguard Worker%endif 1720*c0909341SAndroid Build Coastguard Worker 1721*c0909341SAndroid Build Coastguard Worker movzx top_offxyd, offxyw 
1722*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32 1723*c0909341SAndroid Build Coastguard Worker mov [rsp+5*mmsize+1*gprsize], top_offxyd 1724*c0909341SAndroid Build Coastguard Worker 1725*c0909341SAndroid Build Coastguard Worker DEFINE_ARGS dst, src, scaling, offxy, h, picptr, grain_lut 1726*c0909341SAndroid Build Coastguard Worker%endif 1727*c0909341SAndroid Build Coastguard Worker shr offxyd, 16 1728*c0909341SAndroid Build Coastguard Worker 1729*c0909341SAndroid Build Coastguard Worker.loop_x_odd_v_overlap: 1730*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32 1731*c0909341SAndroid Build Coastguard Worker mov r5, r5m 1732*c0909341SAndroid Build Coastguard Worker lea r5, [base+pb_27_17] 1733*c0909341SAndroid Build Coastguard Worker mov [rsp+5*mmsize+12], r5 1734*c0909341SAndroid Build Coastguard Worker%else 1735*c0909341SAndroid Build Coastguard Worker mova m8, [pb_27_17] 1736*c0909341SAndroid Build Coastguard Worker%endif 1737*c0909341SAndroid Build Coastguard Worker mov hd, r7m 1738*c0909341SAndroid Build Coastguard Worker mov grain_lutq, grain_lutmp 1739*c0909341SAndroid Build Coastguard Worker.loop_y_v_overlap: 1740*c0909341SAndroid Build Coastguard Worker ; src 1741*c0909341SAndroid Build Coastguard Worker mova m0, [srcq] 1742*c0909341SAndroid Build Coastguard Worker pxor m2, m2 1743*c0909341SAndroid Build Coastguard Worker punpckhbw m1, m0, m2 1744*c0909341SAndroid Build Coastguard Worker punpcklbw m0, m2 ; m0-1: src as word 1745*c0909341SAndroid Build Coastguard Worker 1746*c0909341SAndroid Build Coastguard Worker ; scaling[src] 1747*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32 1748*c0909341SAndroid Build Coastguard Worker vpgatherdw m4, m0, scalingq-1, r0, r5, m3 1749*c0909341SAndroid Build Coastguard Worker vpgatherdw m5, m1, scalingq-1, r0, r5, m3 1750*c0909341SAndroid Build Coastguard Worker%else 1751*c0909341SAndroid Build Coastguard Worker vpgatherdw m4, m0, scalingq-1, r12, r13, m3 1752*c0909341SAndroid Build Coastguard 
Worker vpgatherdw m5, m1, scalingq-1, r12, r13, m3 1753*c0909341SAndroid Build Coastguard Worker%endif 1754*c0909341SAndroid Build Coastguard Worker REPX {psrlw x, 8}, m4, m5 1755*c0909341SAndroid Build Coastguard Worker 1756*c0909341SAndroid Build Coastguard Worker ; grain = grain_lut[offy+y][offx+x] 1757*c0909341SAndroid Build Coastguard Worker movu m3, [grain_lutq+offxyq] 1758*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32 1759*c0909341SAndroid Build Coastguard Worker mov r5, [rsp+5*mmsize+1*gprsize] 1760*c0909341SAndroid Build Coastguard Worker movu m7, [grain_lutq+r5] 1761*c0909341SAndroid Build Coastguard Worker%else 1762*c0909341SAndroid Build Coastguard Worker movu m7, [grain_lutq+top_offxyq] 1763*c0909341SAndroid Build Coastguard Worker%endif 1764*c0909341SAndroid Build Coastguard Worker punpckhbw m6, m7, m3 1765*c0909341SAndroid Build Coastguard Worker punpcklbw m7, m3 1766*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32 1767*c0909341SAndroid Build Coastguard Worker mov r5, [rsp+5*mmsize+12] 1768*c0909341SAndroid Build Coastguard Worker pmaddubsw m3, [r5], m6 1769*c0909341SAndroid Build Coastguard Worker pmaddubsw m6, [r5], m7 1770*c0909341SAndroid Build Coastguard Worker%else 1771*c0909341SAndroid Build Coastguard Worker pmaddubsw m3, m8, m6 1772*c0909341SAndroid Build Coastguard Worker pmaddubsw m6, m8, m7 1773*c0909341SAndroid Build Coastguard Worker%endif 1774*c0909341SAndroid Build Coastguard Worker pmulhrsw m3, m14 1775*c0909341SAndroid Build Coastguard Worker pmulhrsw m6, m14 1776*c0909341SAndroid Build Coastguard Worker packsswb m6, m3 1777*c0909341SAndroid Build Coastguard Worker pcmpgtb m7, m2, m6 1778*c0909341SAndroid Build Coastguard Worker punpcklbw m2, m6, m7 1779*c0909341SAndroid Build Coastguard Worker punpckhbw m6, m7 1780*c0909341SAndroid Build Coastguard Worker 1781*c0909341SAndroid Build Coastguard Worker ; noise = round2(scaling[src] * grain, scaling_shift) 1782*c0909341SAndroid Build Coastguard Worker pmullw m2, m4 
1783*c0909341SAndroid Build Coastguard Worker pmullw m6, m5 1784*c0909341SAndroid Build Coastguard Worker pmulhrsw m2, m11 1785*c0909341SAndroid Build Coastguard Worker pmulhrsw m6, m11 1786*c0909341SAndroid Build Coastguard Worker 1787*c0909341SAndroid Build Coastguard Worker ; dst = clip_pixel(src, noise) 1788*c0909341SAndroid Build Coastguard Worker paddw m0, m2 1789*c0909341SAndroid Build Coastguard Worker paddw m1, m6 1790*c0909341SAndroid Build Coastguard Worker pmaxsw m0, m13 1791*c0909341SAndroid Build Coastguard Worker pmaxsw m1, m13 1792*c0909341SAndroid Build Coastguard Worker pminsw m0, m12 1793*c0909341SAndroid Build Coastguard Worker pminsw m1, m12 1794*c0909341SAndroid Build Coastguard Worker packuswb m0, m1 1795*c0909341SAndroid Build Coastguard Worker movifnidn dstq, dstmp 1796*c0909341SAndroid Build Coastguard Worker mova [dstq+srcq], m0 1797*c0909341SAndroid Build Coastguard Worker 1798*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32 1799*c0909341SAndroid Build Coastguard Worker add dword [rsp+5*mmsize+12], mmsize 1800*c0909341SAndroid Build Coastguard Worker%else 1801*c0909341SAndroid Build Coastguard Worker mova m8, [pb_17_27] 1802*c0909341SAndroid Build Coastguard Worker%endif 1803*c0909341SAndroid Build Coastguard Worker add srcq, r2mp 1804*c0909341SAndroid Build Coastguard Worker add grain_lutq, 82 1805*c0909341SAndroid Build Coastguard Worker dec hw 1806*c0909341SAndroid Build Coastguard Worker jz .end_y_v_overlap 1807*c0909341SAndroid Build Coastguard Worker ; 2 lines get vertical overlap, then fall back to non-overlap code for 1808*c0909341SAndroid Build Coastguard Worker ; remaining (up to) 30 lines 1809*c0909341SAndroid Build Coastguard Worker btc hd, 16 1810*c0909341SAndroid Build Coastguard Worker jnc .loop_y_v_overlap 1811*c0909341SAndroid Build Coastguard Worker jmp .loop_y 1812*c0909341SAndroid Build Coastguard Worker 1813*c0909341SAndroid Build Coastguard Worker.end_y_v_overlap: 1814*c0909341SAndroid Build Coastguard 
Worker%if ARCH_X86_32 1815*c0909341SAndroid Build Coastguard Worker add r4mp, 16 1816*c0909341SAndroid Build Coastguard Worker%else 1817*c0909341SAndroid Build Coastguard Worker add wq, 16 1818*c0909341SAndroid Build Coastguard Worker%endif 1819*c0909341SAndroid Build Coastguard Worker jge .end_hv 1820*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32 1821*c0909341SAndroid Build Coastguard Worker mov srcq, r1mp 1822*c0909341SAndroid Build Coastguard Worker add srcq, r4mp 1823*c0909341SAndroid Build Coastguard Worker%else 1824*c0909341SAndroid Build Coastguard Worker lea srcq, [src_bakq+wq] 1825*c0909341SAndroid Build Coastguard Worker%endif 1826*c0909341SAndroid Build Coastguard Worker btc dword r8m, 2 1827*c0909341SAndroid Build Coastguard Worker jc .loop_x_hv_overlap 1828*c0909341SAndroid Build Coastguard Worker add offxyd, 16 1829*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32 1830*c0909341SAndroid Build Coastguard Worker add dword [rsp+5*mmsize+1*gprsize], 16 1831*c0909341SAndroid Build Coastguard Worker%else 1832*c0909341SAndroid Build Coastguard Worker add top_offxyd, 16 1833*c0909341SAndroid Build Coastguard Worker%endif 1834*c0909341SAndroid Build Coastguard Worker jmp .loop_x_odd_v_overlap 1835*c0909341SAndroid Build Coastguard Worker 1836*c0909341SAndroid Build Coastguard Worker.loop_x_hv_overlap: 1837*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32 1838*c0909341SAndroid Build Coastguard Worker mov r5, r5m 1839*c0909341SAndroid Build Coastguard Worker lea r5, [base+pb_27_17] 1840*c0909341SAndroid Build Coastguard Worker mov [rsp+5*mmsize+12], r5 1841*c0909341SAndroid Build Coastguard Worker 1842*c0909341SAndroid Build Coastguard Worker DEFINE_ARGS tmp, src, scaling, offxy, w, picptr, src_bak 1843*c0909341SAndroid Build Coastguard Worker 1844*c0909341SAndroid Build Coastguard Worker mov r5, [rsp+5*mmsize+1*gprsize] 1845*c0909341SAndroid Build Coastguard Worker mov r4, offxyd 1846*c0909341SAndroid Build Coastguard Worker add r5, 16 
1847*c0909341SAndroid Build Coastguard Worker add r4, 16 1848*c0909341SAndroid Build Coastguard Worker mov [rsp+5*mmsize+2*gprsize], r5 ; topleft_offxy 1849*c0909341SAndroid Build Coastguard Worker mov [rsp+5*mmsize+0*gprsize], r4 ; left_offxy 1850*c0909341SAndroid Build Coastguard Worker 1851*c0909341SAndroid Build Coastguard Worker DEFINE_ARGS tmp, src, scaling, see, w, picptr, src_bak 1852*c0909341SAndroid Build Coastguard Worker 1853*c0909341SAndroid Build Coastguard Worker xor tmpd, tmpd 1854*c0909341SAndroid Build Coastguard Worker mov seed, r3m 1855*c0909341SAndroid Build Coastguard Worker%else 1856*c0909341SAndroid Build Coastguard Worker mova m8, [pb_27_17] 1857*c0909341SAndroid Build Coastguard Worker 1858*c0909341SAndroid Build Coastguard Worker DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \ 1859*c0909341SAndroid Build Coastguard Worker tmp, unused2, see, unused3 1860*c0909341SAndroid Build Coastguard Worker 1861*c0909341SAndroid Build Coastguard Worker ; we assume from the block above that bits 8-15 of tmpd are zero'ed 1862*c0909341SAndroid Build Coastguard Worker%endif 1863*c0909341SAndroid Build Coastguard Worker mov r6d, seed 1864*c0909341SAndroid Build Coastguard Worker or seed, 0xeff4eff4 1865*c0909341SAndroid Build Coastguard Worker test seeb, seeh 1866*c0909341SAndroid Build Coastguard Worker setp tmpb ; parity of top_seed 1867*c0909341SAndroid Build Coastguard Worker shr seed, 16 1868*c0909341SAndroid Build Coastguard Worker shl tmpd, 16 1869*c0909341SAndroid Build Coastguard Worker test seeb, seeh 1870*c0909341SAndroid Build Coastguard Worker setp tmpb ; parity of cur_seed 1871*c0909341SAndroid Build Coastguard Worker or r6d, 0x00010001 1872*c0909341SAndroid Build Coastguard Worker xor tmpd, r6d 1873*c0909341SAndroid Build Coastguard Worker mov seed, tmpd 1874*c0909341SAndroid Build Coastguard Worker ror seed, 1 ; updated (cur_seed << 16) | top_seed 1875*c0909341SAndroid Build Coastguard Worker 1876*c0909341SAndroid Build 
Coastguard Worker%if ARCH_X86_32 1877*c0909341SAndroid Build Coastguard Worker mov r3m, seed 1878*c0909341SAndroid Build Coastguard Worker 1879*c0909341SAndroid Build Coastguard Worker DEFINE_ARGS dst, src, scaling, offy, unused1, unused2, offx 1880*c0909341SAndroid Build Coastguard Worker 1881*c0909341SAndroid Build Coastguard Worker mov offxd, offyd 1882*c0909341SAndroid Build Coastguard Worker%else 1883*c0909341SAndroid Build Coastguard Worker DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \ 1884*c0909341SAndroid Build Coastguard Worker offx, offy, see, left_offxy, top_offxy, topleft_offxy 1885*c0909341SAndroid Build Coastguard Worker 1886*c0909341SAndroid Build Coastguard Worker lea topleft_offxyq, [top_offxyq+16] 1887*c0909341SAndroid Build Coastguard Worker lea left_offxyq, [offyq+16] 1888*c0909341SAndroid Build Coastguard Worker mov offyd, seed 1889*c0909341SAndroid Build Coastguard Worker mov offxd, seed 1890*c0909341SAndroid Build Coastguard Worker%endif 1891*c0909341SAndroid Build Coastguard Worker ror offyd, 8 1892*c0909341SAndroid Build Coastguard Worker ror offxd, 12 1893*c0909341SAndroid Build Coastguard Worker and offyd, 0xf000f 1894*c0909341SAndroid Build Coastguard Worker and offxd, 0xf000f 1895*c0909341SAndroid Build Coastguard Worker imul offyd, 164 1896*c0909341SAndroid Build Coastguard Worker ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy 1897*c0909341SAndroid Build Coastguard Worker lea offyq, [offyq+offxq*2+0x10001*747+32*82] 1898*c0909341SAndroid Build Coastguard Worker 1899*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32 1900*c0909341SAndroid Build Coastguard Worker DEFINE_ARGS dst, src, scaling, offxy, h, picptr, grain_lut 1901*c0909341SAndroid Build Coastguard Worker 1902*c0909341SAndroid Build Coastguard Worker movzx r5, offxyw ; top_offxy 1903*c0909341SAndroid Build Coastguard Worker mov [rsp+5*mmsize+1*gprsize], r5 1904*c0909341SAndroid Build Coastguard Worker%else 1905*c0909341SAndroid Build Coastguard 
Worker DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \ 1906*c0909341SAndroid Build Coastguard Worker h, offxy, see, left_offxy, top_offxy, topleft_offxy 1907*c0909341SAndroid Build Coastguard Worker 1908*c0909341SAndroid Build Coastguard Worker movzx top_offxyd, offxyw 1909*c0909341SAndroid Build Coastguard Worker%endif 1910*c0909341SAndroid Build Coastguard Worker shr offxyd, 16 1911*c0909341SAndroid Build Coastguard Worker 1912*c0909341SAndroid Build Coastguard Worker mov hd, r7m 1913*c0909341SAndroid Build Coastguard Worker mov grain_lutq, grain_lutmp 1914*c0909341SAndroid Build Coastguard Worker.loop_y_hv_overlap: 1915*c0909341SAndroid Build Coastguard Worker ; grain = grain_lut[offy+y][offx+x] 1916*c0909341SAndroid Build Coastguard Worker movu m3, [grain_lutq+offxyq] 1917*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32 1918*c0909341SAndroid Build Coastguard Worker mov r5, [rsp+5*mmsize+1*gprsize] ; top_offxy 1919*c0909341SAndroid Build Coastguard Worker mov r0, [rsp+5*mmsize+0*gprsize] ; left_offxy 1920*c0909341SAndroid Build Coastguard Worker movu m6, [grain_lutq+r5] 1921*c0909341SAndroid Build Coastguard Worker mov r5, [rsp+5*mmsize+2*gprsize] ; topleft_offxy 1922*c0909341SAndroid Build Coastguard Worker movd m4, [grain_lutq+r0] 1923*c0909341SAndroid Build Coastguard Worker movd m7, [grain_lutq+r5] 1924*c0909341SAndroid Build Coastguard Worker%else 1925*c0909341SAndroid Build Coastguard Worker movu m6, [grain_lutq+top_offxyq] 1926*c0909341SAndroid Build Coastguard Worker movd m4, [grain_lutq+left_offxyq] 1927*c0909341SAndroid Build Coastguard Worker movd m7, [grain_lutq+topleft_offxyq] 1928*c0909341SAndroid Build Coastguard Worker%endif 1929*c0909341SAndroid Build Coastguard Worker ; do h interpolation first (so top | top/left -> top, left | cur -> cur) 1930*c0909341SAndroid Build Coastguard Worker punpcklbw m4, m3 1931*c0909341SAndroid Build Coastguard Worker punpcklbw m7, m6 1932*c0909341SAndroid Build Coastguard Worker pmaddubsw 
m2, m15, m4 1933*c0909341SAndroid Build Coastguard Worker pmaddubsw m4, m15, m7 1934*c0909341SAndroid Build Coastguard Worker pmulhrsw m2, m14 1935*c0909341SAndroid Build Coastguard Worker pmulhrsw m4, m14 1936*c0909341SAndroid Build Coastguard Worker packsswb m2, m2 1937*c0909341SAndroid Build Coastguard Worker packsswb m4, m4 1938*c0909341SAndroid Build Coastguard Worker shufps m2, m3, q3210 1939*c0909341SAndroid Build Coastguard Worker shufps m4, m6, q3210 1940*c0909341SAndroid Build Coastguard Worker ; followed by v interpolation (top | cur -> cur) 1941*c0909341SAndroid Build Coastguard Worker punpcklbw m3, m4, m2 1942*c0909341SAndroid Build Coastguard Worker punpckhbw m4, m2 1943*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32 1944*c0909341SAndroid Build Coastguard Worker mov r5, [rsp+5*mmsize+12] 1945*c0909341SAndroid Build Coastguard Worker pmaddubsw m7, [r5], m4 1946*c0909341SAndroid Build Coastguard Worker pmaddubsw m4, [r5], m3 1947*c0909341SAndroid Build Coastguard Worker%else 1948*c0909341SAndroid Build Coastguard Worker pmaddubsw m7, m8, m4 1949*c0909341SAndroid Build Coastguard Worker pmaddubsw m4, m8, m3 1950*c0909341SAndroid Build Coastguard Worker%endif 1951*c0909341SAndroid Build Coastguard Worker pmulhrsw m7, m14 1952*c0909341SAndroid Build Coastguard Worker pmulhrsw m4, m14 1953*c0909341SAndroid Build Coastguard Worker packsswb m4, m7 1954*c0909341SAndroid Build Coastguard Worker pxor m2, m2 1955*c0909341SAndroid Build Coastguard Worker pcmpgtb m7, m2, m4 1956*c0909341SAndroid Build Coastguard Worker punpcklbw m3, m4, m7 1957*c0909341SAndroid Build Coastguard Worker punpckhbw m4, m7 1958*c0909341SAndroid Build Coastguard Worker 1959*c0909341SAndroid Build Coastguard Worker ; src 1960*c0909341SAndroid Build Coastguard Worker mova m0, [srcq] 1961*c0909341SAndroid Build Coastguard Worker punpckhbw m1, m0, m2 1962*c0909341SAndroid Build Coastguard Worker punpcklbw m0, m2 ; m0-1: src as word 1963*c0909341SAndroid Build Coastguard Worker 
1964*c0909341SAndroid Build Coastguard Worker ; scaling[src] 1965*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32 1966*c0909341SAndroid Build Coastguard Worker vpgatherdw m5, m0, scalingq-1, r0, r5, m7 1967*c0909341SAndroid Build Coastguard Worker vpgatherdw m6, m1, scalingq-1, r0, r5, m7 1968*c0909341SAndroid Build Coastguard Worker%else 1969*c0909341SAndroid Build Coastguard Worker vpgatherdw m5, m0, scalingq-1, r13, r14, m7 1970*c0909341SAndroid Build Coastguard Worker vpgatherdw m6, m1, scalingq-1, r13, r14, m7 1971*c0909341SAndroid Build Coastguard Worker%endif 1972*c0909341SAndroid Build Coastguard Worker REPX {psrlw x, 8}, m5, m6 1973*c0909341SAndroid Build Coastguard Worker 1974*c0909341SAndroid Build Coastguard Worker ; noise = round2(scaling[src] * grain, scaling_shift) 1975*c0909341SAndroid Build Coastguard Worker pmullw m3, m5 1976*c0909341SAndroid Build Coastguard Worker pmullw m4, m6 1977*c0909341SAndroid Build Coastguard Worker pmulhrsw m3, m11 1978*c0909341SAndroid Build Coastguard Worker pmulhrsw m4, m11 1979*c0909341SAndroid Build Coastguard Worker 1980*c0909341SAndroid Build Coastguard Worker ; dst = clip_pixel(src, noise) 1981*c0909341SAndroid Build Coastguard Worker paddw m0, m3 1982*c0909341SAndroid Build Coastguard Worker paddw m1, m4 1983*c0909341SAndroid Build Coastguard Worker pmaxsw m0, m13 1984*c0909341SAndroid Build Coastguard Worker pmaxsw m1, m13 1985*c0909341SAndroid Build Coastguard Worker pminsw m0, m12 1986*c0909341SAndroid Build Coastguard Worker pminsw m1, m12 1987*c0909341SAndroid Build Coastguard Worker packuswb m0, m1 1988*c0909341SAndroid Build Coastguard Worker movifnidn dstq, dstmp 1989*c0909341SAndroid Build Coastguard Worker mova [dstq+srcq], m0 1990*c0909341SAndroid Build Coastguard Worker 1991*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32 1992*c0909341SAndroid Build Coastguard Worker add dword [rsp+5*mmsize+12], mmsize 1993*c0909341SAndroid Build Coastguard Worker%else 1994*c0909341SAndroid Build 
Coastguard Worker mova m8, [pb_17_27] 1995*c0909341SAndroid Build Coastguard Worker%endif 1996*c0909341SAndroid Build Coastguard Worker add srcq, r2mp 1997*c0909341SAndroid Build Coastguard Worker add grain_lutq, 82 1998*c0909341SAndroid Build Coastguard Worker dec hw 1999*c0909341SAndroid Build Coastguard Worker jz .end_y_hv_overlap 2000*c0909341SAndroid Build Coastguard Worker ; 2 lines get vertical overlap, then fall back to non-overlap code for 2001*c0909341SAndroid Build Coastguard Worker ; remaining (up to) 30 lines 2002*c0909341SAndroid Build Coastguard Worker btc hd, 16 2003*c0909341SAndroid Build Coastguard Worker jnc .loop_y_hv_overlap 2004*c0909341SAndroid Build Coastguard Worker jmp .loop_y_h_overlap 2005*c0909341SAndroid Build Coastguard Worker 2006*c0909341SAndroid Build Coastguard Worker.end_y_hv_overlap: 2007*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32 2008*c0909341SAndroid Build Coastguard Worker add r4mp, 16 2009*c0909341SAndroid Build Coastguard Worker%else 2010*c0909341SAndroid Build Coastguard Worker add wq, 16 2011*c0909341SAndroid Build Coastguard Worker%endif 2012*c0909341SAndroid Build Coastguard Worker jge .end_hv 2013*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32 2014*c0909341SAndroid Build Coastguard Worker mov srcq, r1m 2015*c0909341SAndroid Build Coastguard Worker add srcq, r4m 2016*c0909341SAndroid Build Coastguard Worker%else 2017*c0909341SAndroid Build Coastguard Worker lea srcq, [src_bakq+wq] 2018*c0909341SAndroid Build Coastguard Worker%endif 2019*c0909341SAndroid Build Coastguard Worker xor dword r8m, 4 2020*c0909341SAndroid Build Coastguard Worker add offxyd, 16 2021*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32 2022*c0909341SAndroid Build Coastguard Worker add dword [rsp+5*mmsize+1*gprsize], 16 2023*c0909341SAndroid Build Coastguard Worker%else 2024*c0909341SAndroid Build Coastguard Worker add top_offxyd, 16 2025*c0909341SAndroid Build Coastguard Worker%endif 2026*c0909341SAndroid Build 
Coastguard Worker jmp .loop_x_odd_v_overlap 2027*c0909341SAndroid Build Coastguard Worker 2028*c0909341SAndroid Build Coastguard Worker.end_hv: 2029*c0909341SAndroid Build Coastguard Worker RET 2030*c0909341SAndroid Build Coastguard Worker 2031*c0909341SAndroid Build Coastguard Worker%macro FGUV_FN 3 ; name, ss_hor, ss_ver 2032*c0909341SAndroid Build Coastguard WorkerINIT_XMM ssse3 2033*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32 2034*c0909341SAndroid Build Coastguard Worker; fguv_32x32xn_i420_ssse3(dst, src, stride, fg_data, w, scaling, grain_lut, h, 2035*c0909341SAndroid Build Coastguard Worker; sby, luma, lstride, uv_pl, is_id) 2036*c0909341SAndroid Build Coastguard Worker%if STACK_ALIGNMENT < mmsize 2037*c0909341SAndroid Build Coastguard WorkerDECLARE_ARG 0, 1, 2, 3, 4, 5, 6, 7, 8 2038*c0909341SAndroid Build Coastguard Workercglobal fguv_32x32xn_i%1_8bpc, 0, 7, 8, 0 - (7 * mmsize + (13 + 3) * gprsize), \ 2039*c0909341SAndroid Build Coastguard Worker tmp, src, scaling, h, fg_data, picptr, unused 2040*c0909341SAndroid Build Coastguard Worker mov r0, r0m 2041*c0909341SAndroid Build Coastguard Worker mov r1, r2m 2042*c0909341SAndroid Build Coastguard Worker mov r2, r4m 2043*c0909341SAndroid Build Coastguard Worker mov r3, r6m 2044*c0909341SAndroid Build Coastguard Worker mov r4, r7m 2045*c0909341SAndroid Build Coastguard Worker mov [rsp+7*mmsize+3*gprsize], r0 2046*c0909341SAndroid Build Coastguard Worker mov [rsp+7*mmsize+5*gprsize], r1 2047*c0909341SAndroid Build Coastguard Worker mov [rsp+7*mmsize+7*gprsize], r2 2048*c0909341SAndroid Build Coastguard Worker mov [rsp+7*mmsize+9*gprsize], r3 2049*c0909341SAndroid Build Coastguard Worker mov [rsp+7*mmsize+10*gprsize], r4 2050*c0909341SAndroid Build Coastguard Worker 2051*c0909341SAndroid Build Coastguard Worker mov r0, r8m 2052*c0909341SAndroid Build Coastguard Worker mov r1, r9m 2053*c0909341SAndroid Build Coastguard Worker mov r2, r10m 2054*c0909341SAndroid Build Coastguard Worker mov r4, r11m 
2055*c0909341SAndroid Build Coastguard Worker mov r3, r12m 2056*c0909341SAndroid Build Coastguard Worker mov [rsp+7*mmsize+11*gprsize], r0 2057*c0909341SAndroid Build Coastguard Worker mov [rsp+7*mmsize+12*gprsize], r1 2058*c0909341SAndroid Build Coastguard Worker mov [rsp+7*mmsize+13*gprsize], r2 2059*c0909341SAndroid Build Coastguard Worker mov [rsp+7*mmsize+14*gprsize], r4 2060*c0909341SAndroid Build Coastguard Worker%else 2061*c0909341SAndroid Build Coastguard Workercglobal fguv_32x32xn_i%1_8bpc, 0, 7, 8, 7 * mmsize + (4) * gprsize, \ 2062*c0909341SAndroid Build Coastguard Worker tmp, src, scaling, h, fg_data, picptr, unused 2063*c0909341SAndroid Build Coastguard Worker%endif 2064*c0909341SAndroid Build Coastguard Worker mov srcq, srcm 2065*c0909341SAndroid Build Coastguard Worker mov fg_dataq, r3m 2066*c0909341SAndroid Build Coastguard Worker mov scalingq, r5m 2067*c0909341SAndroid Build Coastguard Worker%if STACK_ALIGNMENT < mmsize 2068*c0909341SAndroid Build Coastguard Worker%define r0m [rsp+7*mmsize+ 3*gprsize] 2069*c0909341SAndroid Build Coastguard Worker%define r1m [rsp+7*mmsize+ 4*gprsize] 2070*c0909341SAndroid Build Coastguard Worker%define r2m [rsp+7*mmsize+ 5*gprsize] 2071*c0909341SAndroid Build Coastguard Worker%define r3m [rsp+7*mmsize+ 6*gprsize] 2072*c0909341SAndroid Build Coastguard Worker%define r4m [rsp+7*mmsize+ 7*gprsize] 2073*c0909341SAndroid Build Coastguard Worker%define r5m [rsp+7*mmsize+ 8*gprsize] 2074*c0909341SAndroid Build Coastguard Worker%define r6m [rsp+7*mmsize+ 9*gprsize] 2075*c0909341SAndroid Build Coastguard Worker%define r7m [rsp+7*mmsize+10*gprsize] 2076*c0909341SAndroid Build Coastguard Worker%define r8m [rsp+7*mmsize+11*gprsize] 2077*c0909341SAndroid Build Coastguard Worker%define r9m [rsp+7*mmsize+12*gprsize] 2078*c0909341SAndroid Build Coastguard Worker%define r10m [rsp+7*mmsize+13*gprsize] 2079*c0909341SAndroid Build Coastguard Worker%define r11m [rsp+7*mmsize+14*gprsize] 2080*c0909341SAndroid Build Coastguard 
Worker%define r12m [rsp+7*mmsize+15*gprsize] 2081*c0909341SAndroid Build Coastguard Worker%endif 2082*c0909341SAndroid Build Coastguard Worker LEA r5, pb_mask 2083*c0909341SAndroid Build Coastguard Worker%define base r5-pb_mask 2084*c0909341SAndroid Build Coastguard Worker mov r5m, r5 2085*c0909341SAndroid Build Coastguard Worker%else 2086*c0909341SAndroid Build Coastguard Workercglobal fguv_32x32xn_i%1_8bpc, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \ 2087*c0909341SAndroid Build Coastguard Worker grain_lut, tmp, sby, luma, lstride, uv_pl, is_id 2088*c0909341SAndroid Build Coastguard Worker lea r8, [pb_mask] 2089*c0909341SAndroid Build Coastguard Worker%define base r8-pb_mask 2090*c0909341SAndroid Build Coastguard Worker%endif 2091*c0909341SAndroid Build Coastguard Worker mov r6d, [fg_dataq+FGData.scaling_shift] 2092*c0909341SAndroid Build Coastguard Worker movd m3, [base+mul_bits+r6*2-14] 2093*c0909341SAndroid Build Coastguard Worker mov r6d, [fg_dataq+FGData.clip_to_restricted_range] 2094*c0909341SAndroid Build Coastguard Worker lea tmpd, [r6d*2] 2095*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32 && STACK_ALIGNMENT < mmsize 2096*c0909341SAndroid Build Coastguard Worker test r3, r3 2097*c0909341SAndroid Build Coastguard Worker%else 2098*c0909341SAndroid Build Coastguard Worker cmp dword r12m, 0 ; is_idm 2099*c0909341SAndroid Build Coastguard Worker%endif 2100*c0909341SAndroid Build Coastguard Worker movd m5, [base+min+r6*2] 2101*c0909341SAndroid Build Coastguard Worker cmovne r6d, tmpd 2102*c0909341SAndroid Build Coastguard Worker movd m4, [base+max+r6*2] 2103*c0909341SAndroid Build Coastguard Worker punpcklwd m3, m3 2104*c0909341SAndroid Build Coastguard Worker punpcklwd m5, m5 2105*c0909341SAndroid Build Coastguard Worker punpcklwd m4, m4 2106*c0909341SAndroid Build Coastguard Worker pshufd m3, m3, q0000 2107*c0909341SAndroid Build Coastguard Worker pshufd m5, m5, q0000 2108*c0909341SAndroid Build Coastguard Worker pshufd m4, m4, q0000 
2109*c0909341SAndroid Build Coastguard Worker SCRATCH 3, 11, 0 2110*c0909341SAndroid Build Coastguard Worker SCRATCH 4, 12, 1 2111*c0909341SAndroid Build Coastguard Worker SCRATCH 5, 13, 2 2112*c0909341SAndroid Build Coastguard Worker 2113*c0909341SAndroid Build Coastguard Worker cmp byte [fg_dataq+FGData.chroma_scaling_from_luma], 0 2114*c0909341SAndroid Build Coastguard Worker jne .csfl 2115*c0909341SAndroid Build Coastguard Worker 2116*c0909341SAndroid Build Coastguard Worker%macro %%FGUV_32x32xN_LOOP 3 ; not-csfl, ss_hor, ss_ver 2117*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32 2118*c0909341SAndroid Build Coastguard Worker DEFINE_ARGS dst, src, scaling, sby, fg_data, picptr, overlap 2119*c0909341SAndroid Build Coastguard Worker%else 2120*c0909341SAndroid Build Coastguard Worker DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, unused, sby, see, overlap 2121*c0909341SAndroid Build Coastguard Worker%endif 2122*c0909341SAndroid Build Coastguard Worker 2123*c0909341SAndroid Build Coastguard Worker%if %1 2124*c0909341SAndroid Build Coastguard Worker mov r6d, dword r11m 2125*c0909341SAndroid Build Coastguard Worker movd m0, [fg_dataq+FGData.uv_mult+r6*4] 2126*c0909341SAndroid Build Coastguard Worker movd m1, [fg_dataq+FGData.uv_luma_mult+r6*4] 2127*c0909341SAndroid Build Coastguard Worker punpcklbw m6, m1, m0 2128*c0909341SAndroid Build Coastguard Worker movd m7, [fg_dataq+FGData.uv_offset+r6*4] 2129*c0909341SAndroid Build Coastguard Worker punpcklwd m6, m6 2130*c0909341SAndroid Build Coastguard Worker punpcklwd m7, m7 2131*c0909341SAndroid Build Coastguard Worker pshufd m6, m6, q0000 2132*c0909341SAndroid Build Coastguard Worker pshufd m7, m7, q0000 2133*c0909341SAndroid Build Coastguard Worker SCRATCH 6, 14, 3 2134*c0909341SAndroid Build Coastguard Worker SCRATCH 7, 15, 4 2135*c0909341SAndroid Build Coastguard Worker%endif 2136*c0909341SAndroid Build Coastguard Worker 2137*c0909341SAndroid Build Coastguard Worker mov sbyd, r8m 
2138*c0909341SAndroid Build Coastguard Worker mov overlapd, [fg_dataq+FGData.overlap_flag] ; left_overlap: overlap & 1 2139*c0909341SAndroid Build Coastguard Worker test overlapd, overlapd 2140*c0909341SAndroid Build Coastguard Worker jz %%no_vertical_overlap 2141*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32 2142*c0909341SAndroid Build Coastguard Worker%if %2 2143*c0909341SAndroid Build Coastguard Worker mova m1, [base+pb_23_22_h] 2144*c0909341SAndroid Build Coastguard Worker%else 2145*c0909341SAndroid Build Coastguard Worker mova m1, [base+pb_27_17_17_27] 2146*c0909341SAndroid Build Coastguard Worker%endif 2147*c0909341SAndroid Build Coastguard Worker mova m0, [base+pw_1024] 2148*c0909341SAndroid Build Coastguard Worker%else 2149*c0909341SAndroid Build Coastguard Worker%if %2 2150*c0909341SAndroid Build Coastguard Worker mova m1, [pb_23_22_h] 2151*c0909341SAndroid Build Coastguard Worker%else 2152*c0909341SAndroid Build Coastguard Worker mova m1, [pb_27_17_17_27] 2153*c0909341SAndroid Build Coastguard Worker%endif 2154*c0909341SAndroid Build Coastguard Worker mova m0, [pw_1024] 2155*c0909341SAndroid Build Coastguard Worker%endif 2156*c0909341SAndroid Build Coastguard Worker SCRATCH 0, 8, 5 2157*c0909341SAndroid Build Coastguard Worker SCRATCH 1, 9, 6 2158*c0909341SAndroid Build Coastguard Worker test sbyd, sbyd 2159*c0909341SAndroid Build Coastguard Worker jnz %%vertical_overlap 2160*c0909341SAndroid Build Coastguard Worker ; fall-through 2161*c0909341SAndroid Build Coastguard Worker 2162*c0909341SAndroid Build Coastguard Worker%%no_vertical_overlap: 2163*c0909341SAndroid Build Coastguard Worker mov r8m, overlapd 2164*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32 2165*c0909341SAndroid Build Coastguard Worker DEFINE_ARGS dst, src, scaling, see, fg_data, picptr, overlap 2166*c0909341SAndroid Build Coastguard Worker imul seed, (173 << 24) | 37 2167*c0909341SAndroid Build Coastguard Worker%else 2168*c0909341SAndroid Build Coastguard Worker imul 
seed, sbyd, (173 << 24) | 37 2169*c0909341SAndroid Build Coastguard Worker%endif 2170*c0909341SAndroid Build Coastguard Worker add seed, (105 << 24) | 178 2171*c0909341SAndroid Build Coastguard Worker rol seed, 8 2172*c0909341SAndroid Build Coastguard Worker movzx seed, seew 2173*c0909341SAndroid Build Coastguard Worker xor seed, [fg_dataq+FGData.seed] 2174*c0909341SAndroid Build Coastguard Worker 2175*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32 2176*c0909341SAndroid Build Coastguard Worker mov r3m, seed 2177*c0909341SAndroid Build Coastguard Worker 2178*c0909341SAndroid Build Coastguard Worker DEFINE_ARGS luma, src, scaling, see, w, picptr, src_bak 2179*c0909341SAndroid Build Coastguard Worker%define luma_bakq lumaq 2180*c0909341SAndroid Build Coastguard Worker 2181*c0909341SAndroid Build Coastguard Worker mov wq, r4m 2182*c0909341SAndroid Build Coastguard Worker%if %3 2183*c0909341SAndroid Build Coastguard Worker shl r10mp, 1 2184*c0909341SAndroid Build Coastguard Worker%endif 2185*c0909341SAndroid Build Coastguard Worker%else 2186*c0909341SAndroid Build Coastguard Worker DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \ 2187*c0909341SAndroid Build Coastguard Worker unused2, unused3, see, overlap, unused4, src_bak, lstride, luma_bak 2188*c0909341SAndroid Build Coastguard Worker 2189*c0909341SAndroid Build Coastguard Worker mov lstrideq, r10mp 2190*c0909341SAndroid Build Coastguard Worker%endif 2191*c0909341SAndroid Build Coastguard Worker 2192*c0909341SAndroid Build Coastguard Worker mov lumaq, r9mp 2193*c0909341SAndroid Build Coastguard Worker lea src_bakq, [srcq+wq] 2194*c0909341SAndroid Build Coastguard Worker lea luma_bakq, [lumaq+wq*(1+%2)] 2195*c0909341SAndroid Build Coastguard Worker neg wq 2196*c0909341SAndroid Build Coastguard Worker sub r0mp, srcq 2197*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32 2198*c0909341SAndroid Build Coastguard Worker mov r1m, src_bakq 2199*c0909341SAndroid Build Coastguard Worker mov r11m, 
luma_bakq 2200*c0909341SAndroid Build Coastguard Worker mov r4m, wq 2201*c0909341SAndroid Build Coastguard Worker 2202*c0909341SAndroid Build Coastguard Worker DEFINE_ARGS tmp, src, scaling, see, unused1, picptr, unused2 2203*c0909341SAndroid Build Coastguard Worker%else 2204*c0909341SAndroid Build Coastguard Worker mov r11mp, src_bakq 2205*c0909341SAndroid Build Coastguard Worker mov r12mp, strideq 2206*c0909341SAndroid Build Coastguard Worker%endif 2207*c0909341SAndroid Build Coastguard Worker 2208*c0909341SAndroid Build Coastguard Worker%%loop_x: 2209*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32 2210*c0909341SAndroid Build Coastguard Worker mov seed, r3m 2211*c0909341SAndroid Build Coastguard Worker%endif 2212*c0909341SAndroid Build Coastguard Worker mov r6d, seed 2213*c0909341SAndroid Build Coastguard Worker or seed, 0xEFF4 2214*c0909341SAndroid Build Coastguard Worker shr r6d, 1 2215*c0909341SAndroid Build Coastguard Worker test seeb, seeh 2216*c0909341SAndroid Build Coastguard Worker lea seed, [r6+0x8000] 2217*c0909341SAndroid Build Coastguard Worker cmovp seed, r6d ; updated seed 2218*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32 2219*c0909341SAndroid Build Coastguard Worker mov r3m, seed 2220*c0909341SAndroid Build Coastguard Worker 2221*c0909341SAndroid Build Coastguard Worker DEFINE_ARGS dst, src, scaling, offy, w, picptr, offx 2222*c0909341SAndroid Build Coastguard Worker 2223*c0909341SAndroid Build Coastguard Worker mov offxd, offyd 2224*c0909341SAndroid Build Coastguard Worker%else 2225*c0909341SAndroid Build Coastguard Worker DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \ 2226*c0909341SAndroid Build Coastguard Worker offx, offy, see, overlap, unused1, unused2, lstride 2227*c0909341SAndroid Build Coastguard Worker 2228*c0909341SAndroid Build Coastguard Worker mov offyd, seed 2229*c0909341SAndroid Build Coastguard Worker mov offxd, seed 2230*c0909341SAndroid Build Coastguard Worker%endif 2231*c0909341SAndroid Build 
Coastguard Worker ror offyd, 8 2232*c0909341SAndroid Build Coastguard Worker shr offxd, 12 2233*c0909341SAndroid Build Coastguard Worker and offyd, 0xf 2234*c0909341SAndroid Build Coastguard Worker imul offyd, 164>>%3 2235*c0909341SAndroid Build Coastguard Worker lea offyq, [offyq+offxq*(2-%2)+(3+(6>>%3))*82+(3+(6>>%2))] ; offy*stride+offx 2236*c0909341SAndroid Build Coastguard Worker 2237*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32 2238*c0909341SAndroid Build Coastguard Worker DEFINE_ARGS luma, src, scaling, offxy, h, picptr, grain_lut 2239*c0909341SAndroid Build Coastguard Worker%else 2240*c0909341SAndroid Build Coastguard Worker DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \ 2241*c0909341SAndroid Build Coastguard Worker h, offxy, see, overlap, unused1, unused2, lstride, luma_bak 2242*c0909341SAndroid Build Coastguard Worker%endif 2243*c0909341SAndroid Build Coastguard Worker 2244*c0909341SAndroid Build Coastguard Worker%%loop_x_odd: 2245*c0909341SAndroid Build Coastguard Worker mov hd, r7m 2246*c0909341SAndroid Build Coastguard Worker mov grain_lutq, grain_lutmp 2247*c0909341SAndroid Build Coastguard Worker%%loop_y: 2248*c0909341SAndroid Build Coastguard Worker ; src 2249*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32 2250*c0909341SAndroid Build Coastguard Worker mov lumaq, r9mp 2251*c0909341SAndroid Build Coastguard Worker%endif 2252*c0909341SAndroid Build Coastguard Worker%if %2 2253*c0909341SAndroid Build Coastguard Worker mova m4, [lumaq+ 0] 2254*c0909341SAndroid Build Coastguard Worker mova m6, [lumaq+16] 2255*c0909341SAndroid Build Coastguard Worker mova m0, [srcq] 2256*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32 2257*c0909341SAndroid Build Coastguard Worker add lumaq, r10mp 2258*c0909341SAndroid Build Coastguard Worker mov r9mp, lumaq 2259*c0909341SAndroid Build Coastguard Worker mov r5, r5m 2260*c0909341SAndroid Build Coastguard Worker movd m7, [base+pb_1] 2261*c0909341SAndroid Build Coastguard Worker%else 
2262*c0909341SAndroid Build Coastguard Worker movd m7, [pb_1] 2263*c0909341SAndroid Build Coastguard Worker%endif 2264*c0909341SAndroid Build Coastguard Worker pshufd m7, m7, q0000 2265*c0909341SAndroid Build Coastguard Worker pxor m2, m2 2266*c0909341SAndroid Build Coastguard Worker pmaddubsw m4, m7 2267*c0909341SAndroid Build Coastguard Worker pmaddubsw m6, m7 2268*c0909341SAndroid Build Coastguard Worker pavgw m4, m2 2269*c0909341SAndroid Build Coastguard Worker pavgw m6, m2 2270*c0909341SAndroid Build Coastguard Worker%else 2271*c0909341SAndroid Build Coastguard Worker mova m4, [lumaq] 2272*c0909341SAndroid Build Coastguard Worker mova m0, [srcq] 2273*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32 2274*c0909341SAndroid Build Coastguard Worker add lumaq, r10mp 2275*c0909341SAndroid Build Coastguard Worker mov r9mp, lumaq 2276*c0909341SAndroid Build Coastguard Worker%endif 2277*c0909341SAndroid Build Coastguard Worker pxor m2, m2 2278*c0909341SAndroid Build Coastguard Worker%endif 2279*c0909341SAndroid Build Coastguard Worker 2280*c0909341SAndroid Build Coastguard Worker%if %1 2281*c0909341SAndroid Build Coastguard Worker%if %2 2282*c0909341SAndroid Build Coastguard Worker packuswb m4, m6 ; luma 2283*c0909341SAndroid Build Coastguard Worker%endif 2284*c0909341SAndroid Build Coastguard Worker punpckhbw m6, m4, m0 2285*c0909341SAndroid Build Coastguard Worker punpcklbw m4, m0 ; { luma, chroma } 2286*c0909341SAndroid Build Coastguard Worker pmaddubsw m6, m14 2287*c0909341SAndroid Build Coastguard Worker pmaddubsw m4, m14 2288*c0909341SAndroid Build Coastguard Worker psraw m6, 6 2289*c0909341SAndroid Build Coastguard Worker psraw m4, 6 2290*c0909341SAndroid Build Coastguard Worker paddw m6, m15 2291*c0909341SAndroid Build Coastguard Worker paddw m4, m15 2292*c0909341SAndroid Build Coastguard Worker packuswb m4, m6 ; pack+unpack = clip 2293*c0909341SAndroid Build Coastguard Worker punpckhbw m6, m4, m2 2294*c0909341SAndroid Build Coastguard Worker punpcklbw 
m4, m2 2295*c0909341SAndroid Build Coastguard Worker%elif %2 == 0 2296*c0909341SAndroid Build Coastguard Worker punpckhbw m6, m4, m2 2297*c0909341SAndroid Build Coastguard Worker punpcklbw m4, m2 2298*c0909341SAndroid Build Coastguard Worker%endif 2299*c0909341SAndroid Build Coastguard Worker 2300*c0909341SAndroid Build Coastguard Worker ; scaling[luma_src] 2301*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32 2302*c0909341SAndroid Build Coastguard Worker vpgatherdw m7, m4, scalingq-1, r0, r5 2303*c0909341SAndroid Build Coastguard Worker vpgatherdw m5, m6, scalingq-1, r0, r5 2304*c0909341SAndroid Build Coastguard Worker%else 2305*c0909341SAndroid Build Coastguard Worker vpgatherdw m7, m4, scalingq-1, r12, r2 2306*c0909341SAndroid Build Coastguard Worker vpgatherdw m5, m6, scalingq-1, r12, r2 2307*c0909341SAndroid Build Coastguard Worker%endif 2308*c0909341SAndroid Build Coastguard Worker REPX {psrlw x, 8}, m7, m5 2309*c0909341SAndroid Build Coastguard Worker 2310*c0909341SAndroid Build Coastguard Worker ; unpack chroma_source 2311*c0909341SAndroid Build Coastguard Worker punpckhbw m1, m0, m2 2312*c0909341SAndroid Build Coastguard Worker punpcklbw m0, m2 ; m0-1: src as word 2313*c0909341SAndroid Build Coastguard Worker 2314*c0909341SAndroid Build Coastguard Worker ; grain = grain_lut[offy+y][offx+x] 2315*c0909341SAndroid Build Coastguard Worker movu m3, [grain_lutq+offxyq+ 0] 2316*c0909341SAndroid Build Coastguard Worker pcmpgtb m6, m2, m3 2317*c0909341SAndroid Build Coastguard Worker punpcklbw m2, m3, m6 2318*c0909341SAndroid Build Coastguard Worker punpckhbw m3, m6 2319*c0909341SAndroid Build Coastguard Worker 2320*c0909341SAndroid Build Coastguard Worker ; noise = round2(scaling[luma_src] * grain, scaling_shift) 2321*c0909341SAndroid Build Coastguard Worker pmullw m2, m7 2322*c0909341SAndroid Build Coastguard Worker pmullw m3, m5 2323*c0909341SAndroid Build Coastguard Worker pmulhrsw m2, m11 2324*c0909341SAndroid Build Coastguard Worker pmulhrsw m3, m11 
2325*c0909341SAndroid Build Coastguard Worker 2326*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32 2327*c0909341SAndroid Build Coastguard Worker DEFINE_ARGS dst, src, scaling, offxy, h, picptr, grain_lut 2328*c0909341SAndroid Build Coastguard Worker%endif 2329*c0909341SAndroid Build Coastguard Worker 2330*c0909341SAndroid Build Coastguard Worker ; dst = clip_pixel(src, noise) 2331*c0909341SAndroid Build Coastguard Worker paddw m0, m2 2332*c0909341SAndroid Build Coastguard Worker paddw m1, m3 2333*c0909341SAndroid Build Coastguard Worker pmaxsw m0, m13 2334*c0909341SAndroid Build Coastguard Worker pmaxsw m1, m13 2335*c0909341SAndroid Build Coastguard Worker pminsw m0, m12 2336*c0909341SAndroid Build Coastguard Worker pminsw m1, m12 2337*c0909341SAndroid Build Coastguard Worker packuswb m0, m1 2338*c0909341SAndroid Build Coastguard Worker movifnidn dstq, dstmp 2339*c0909341SAndroid Build Coastguard Worker mova [dstq+srcq], m0 2340*c0909341SAndroid Build Coastguard Worker 2341*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32 2342*c0909341SAndroid Build Coastguard Worker add srcq, r2mp 2343*c0909341SAndroid Build Coastguard Worker ; we already incremented lumaq above 2344*c0909341SAndroid Build Coastguard Worker%else 2345*c0909341SAndroid Build Coastguard Worker add srcq, r12mp 2346*c0909341SAndroid Build Coastguard Worker%if %3 2347*c0909341SAndroid Build Coastguard Worker lea lumaq, [lumaq+lstrideq*2] 2348*c0909341SAndroid Build Coastguard Worker%else 2349*c0909341SAndroid Build Coastguard Worker add lumaq, lstrideq 2350*c0909341SAndroid Build Coastguard Worker%endif 2351*c0909341SAndroid Build Coastguard Worker%endif 2352*c0909341SAndroid Build Coastguard Worker add grain_lutq, 82 2353*c0909341SAndroid Build Coastguard Worker dec hw 2354*c0909341SAndroid Build Coastguard Worker jg %%loop_y 2355*c0909341SAndroid Build Coastguard Worker 2356*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32 2357*c0909341SAndroid Build Coastguard Worker DEFINE_ARGS 
luma, src, scaling, offxy, w, picptr, grain_lut 2358*c0909341SAndroid Build Coastguard Worker 2359*c0909341SAndroid Build Coastguard Worker mov wq, r4m 2360*c0909341SAndroid Build Coastguard Worker%endif 2361*c0909341SAndroid Build Coastguard Worker add wq, 16 2362*c0909341SAndroid Build Coastguard Worker jge %%end 2363*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32 2364*c0909341SAndroid Build Coastguard Worker mov srcq, r1mp 2365*c0909341SAndroid Build Coastguard Worker mov lumaq, r11mp 2366*c0909341SAndroid Build Coastguard Worker%else 2367*c0909341SAndroid Build Coastguard Worker mov srcq, r11mp 2368*c0909341SAndroid Build Coastguard Worker%endif 2369*c0909341SAndroid Build Coastguard Worker lea lumaq, [luma_bakq+wq*(1+%2)] 2370*c0909341SAndroid Build Coastguard Worker add srcq, wq 2371*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32 2372*c0909341SAndroid Build Coastguard Worker mov r4m, wq 2373*c0909341SAndroid Build Coastguard Worker mov r9m, lumaq 2374*c0909341SAndroid Build Coastguard Worker%endif 2375*c0909341SAndroid Build Coastguard Worker%if %2 == 0 2376*c0909341SAndroid Build Coastguard Worker ; adjust top_offxy 2377*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32 2378*c0909341SAndroid Build Coastguard Worker add dword [rsp+7*mmsize+1*gprsize], 16 2379*c0909341SAndroid Build Coastguard Worker%else 2380*c0909341SAndroid Build Coastguard Worker add r11d, 16 2381*c0909341SAndroid Build Coastguard Worker%endif 2382*c0909341SAndroid Build Coastguard Worker add offxyd, 16 2383*c0909341SAndroid Build Coastguard Worker btc dword r8m, 2 2384*c0909341SAndroid Build Coastguard Worker jc %%loop_x_even 2385*c0909341SAndroid Build Coastguard Worker test dword r8m, 2 2386*c0909341SAndroid Build Coastguard Worker jz %%loop_x_odd 2387*c0909341SAndroid Build Coastguard Worker jmp %%loop_x_odd_v_overlap 2388*c0909341SAndroid Build Coastguard Worker%%loop_x_even: 2389*c0909341SAndroid Build Coastguard Worker%endif 2390*c0909341SAndroid Build 
Coastguard Worker test dword r8m, 1 2391*c0909341SAndroid Build Coastguard Worker jz %%loop_x 2392*c0909341SAndroid Build Coastguard Worker 2393*c0909341SAndroid Build Coastguard Worker ; r8m = sbym 2394*c0909341SAndroid Build Coastguard Worker test dword r8m, 2 2395*c0909341SAndroid Build Coastguard Worker jne %%loop_x_hv_overlap 2396*c0909341SAndroid Build Coastguard Worker 2397*c0909341SAndroid Build Coastguard Worker ; horizontal overlap (without vertical overlap) 2398*c0909341SAndroid Build Coastguard Worker%%loop_x_h_overlap: 2399*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32 2400*c0909341SAndroid Build Coastguard Worker%if %2 2401*c0909341SAndroid Build Coastguard Worker lea r6, [offxyd+16] 2402*c0909341SAndroid Build Coastguard Worker mov [rsp+7*mmsize+0*gprsize], r6 2403*c0909341SAndroid Build Coastguard Worker%else 2404*c0909341SAndroid Build Coastguard Worker mov [rsp+7*mmsize+0*gprsize], offxyd 2405*c0909341SAndroid Build Coastguard Worker%endif 2406*c0909341SAndroid Build Coastguard Worker 2407*c0909341SAndroid Build Coastguard Worker DEFINE_ARGS luma, src, scaling, see, w, picptr, grain_lut 2408*c0909341SAndroid Build Coastguard Worker 2409*c0909341SAndroid Build Coastguard Worker mov seed, r3m 2410*c0909341SAndroid Build Coastguard Worker%else 2411*c0909341SAndroid Build Coastguard Worker DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \ 2412*c0909341SAndroid Build Coastguard Worker offx, offy, see, left_offxy, unused1, unused2, lstride 2413*c0909341SAndroid Build Coastguard Worker 2414*c0909341SAndroid Build Coastguard Worker%if %2 2415*c0909341SAndroid Build Coastguard Worker lea left_offxyd, [offyd+16] ; previous column's offy*stride+offx 2416*c0909341SAndroid Build Coastguard Worker%else 2417*c0909341SAndroid Build Coastguard Worker mov left_offxyd, offyd 2418*c0909341SAndroid Build Coastguard Worker%endif 2419*c0909341SAndroid Build Coastguard Worker%endif 2420*c0909341SAndroid Build Coastguard Worker mov r6d, seed 
2421*c0909341SAndroid Build Coastguard Worker or seed, 0xEFF4 2422*c0909341SAndroid Build Coastguard Worker shr r6d, 1 2423*c0909341SAndroid Build Coastguard Worker test seeb, seeh 2424*c0909341SAndroid Build Coastguard Worker lea seed, [r6+0x8000] 2425*c0909341SAndroid Build Coastguard Worker cmovp seed, r6d ; updated seed 2426*c0909341SAndroid Build Coastguard Worker 2427*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32 2428*c0909341SAndroid Build Coastguard Worker mov r3m, seed 2429*c0909341SAndroid Build Coastguard Worker 2430*c0909341SAndroid Build Coastguard Worker DEFINE_ARGS luma, src, scaling, offy, w, picptr, offx 2431*c0909341SAndroid Build Coastguard Worker 2432*c0909341SAndroid Build Coastguard Worker mov offxd, offyd 2433*c0909341SAndroid Build Coastguard Worker%else 2434*c0909341SAndroid Build Coastguard Worker DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \ 2435*c0909341SAndroid Build Coastguard Worker offx, offy, see, left_offxy, unused1, unused2, lstride 2436*c0909341SAndroid Build Coastguard Worker 2437*c0909341SAndroid Build Coastguard Worker mov offyd, seed 2438*c0909341SAndroid Build Coastguard Worker mov offxd, seed 2439*c0909341SAndroid Build Coastguard Worker%endif 2440*c0909341SAndroid Build Coastguard Worker ror offyd, 8 2441*c0909341SAndroid Build Coastguard Worker shr offxd, 12 2442*c0909341SAndroid Build Coastguard Worker and offyd, 0xf 2443*c0909341SAndroid Build Coastguard Worker imul offyd, 164>>%3 2444*c0909341SAndroid Build Coastguard Worker lea offyq, [offyq+offxq*(2-%2)+(3+(6>>%3))*82+3+(6>>%2)] ; offy*stride+offx 2445*c0909341SAndroid Build Coastguard Worker 2446*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32 2447*c0909341SAndroid Build Coastguard Worker DEFINE_ARGS luma, src, scaling, offxy, h, picptr, grain_lut 2448*c0909341SAndroid Build Coastguard Worker%else 2449*c0909341SAndroid Build Coastguard Worker DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \ 2450*c0909341SAndroid Build 
Coastguard Worker h, offxy, see, left_offxy, unused1, unused2, lstride, luma_bak 2451*c0909341SAndroid Build Coastguard Worker%endif 2452*c0909341SAndroid Build Coastguard Worker 2453*c0909341SAndroid Build Coastguard Worker mov hd, r7m 2454*c0909341SAndroid Build Coastguard Worker mov grain_lutq, grain_lutmp 2455*c0909341SAndroid Build Coastguard Worker%%loop_y_h_overlap: 2456*c0909341SAndroid Build Coastguard Worker ; src 2457*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32 2458*c0909341SAndroid Build Coastguard Worker mov lumaq, r9mp 2459*c0909341SAndroid Build Coastguard Worker%endif 2460*c0909341SAndroid Build Coastguard Worker%if %2 2461*c0909341SAndroid Build Coastguard Worker mova m4, [lumaq+ 0] 2462*c0909341SAndroid Build Coastguard Worker mova m6, [lumaq+16] 2463*c0909341SAndroid Build Coastguard Worker mova m0, [srcq] 2464*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32 2465*c0909341SAndroid Build Coastguard Worker add lumaq, r10mp 2466*c0909341SAndroid Build Coastguard Worker mov r9mp, lumaq 2467*c0909341SAndroid Build Coastguard Worker mov r5, r5m 2468*c0909341SAndroid Build Coastguard Worker movd m7, [base+pb_1] 2469*c0909341SAndroid Build Coastguard Worker%else 2470*c0909341SAndroid Build Coastguard Worker movd m7, [pb_1] 2471*c0909341SAndroid Build Coastguard Worker%endif 2472*c0909341SAndroid Build Coastguard Worker pshufd m7, m7, q0000 2473*c0909341SAndroid Build Coastguard Worker pxor m2, m2 2474*c0909341SAndroid Build Coastguard Worker pmaddubsw m4, m7 2475*c0909341SAndroid Build Coastguard Worker pmaddubsw m6, m7 2476*c0909341SAndroid Build Coastguard Worker pavgw m4, m2 2477*c0909341SAndroid Build Coastguard Worker pavgw m6, m2 2478*c0909341SAndroid Build Coastguard Worker%else 2479*c0909341SAndroid Build Coastguard Worker mova m4, [lumaq] 2480*c0909341SAndroid Build Coastguard Worker mova m0, [srcq] 2481*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32 2482*c0909341SAndroid Build Coastguard Worker add lumaq, r10mp 
2483*c0909341SAndroid Build Coastguard Worker mov r9mp, lumaq 2484*c0909341SAndroid Build Coastguard Worker%endif 2485*c0909341SAndroid Build Coastguard Worker pxor m2, m2 2486*c0909341SAndroid Build Coastguard Worker%endif 2487*c0909341SAndroid Build Coastguard Worker 2488*c0909341SAndroid Build Coastguard Worker%if %1 2489*c0909341SAndroid Build Coastguard Worker%if %2 2490*c0909341SAndroid Build Coastguard Worker packuswb m4, m6 ; luma 2491*c0909341SAndroid Build Coastguard Worker%endif 2492*c0909341SAndroid Build Coastguard Worker punpckhbw m6, m4, m0 2493*c0909341SAndroid Build Coastguard Worker punpcklbw m4, m0 ; { luma, chroma } 2494*c0909341SAndroid Build Coastguard Worker pmaddubsw m6, m14 2495*c0909341SAndroid Build Coastguard Worker pmaddubsw m4, m14 2496*c0909341SAndroid Build Coastguard Worker psraw m6, 6 2497*c0909341SAndroid Build Coastguard Worker psraw m4, 6 2498*c0909341SAndroid Build Coastguard Worker paddw m6, m15 2499*c0909341SAndroid Build Coastguard Worker paddw m4, m15 2500*c0909341SAndroid Build Coastguard Worker packuswb m4, m6 ; pack+unpack = clip 2501*c0909341SAndroid Build Coastguard Worker punpckhbw m6, m4, m2 2502*c0909341SAndroid Build Coastguard Worker punpcklbw m4, m2 2503*c0909341SAndroid Build Coastguard Worker%elif %2 == 0 2504*c0909341SAndroid Build Coastguard Worker punpckhbw m6, m4, m2 2505*c0909341SAndroid Build Coastguard Worker punpcklbw m4, m2 2506*c0909341SAndroid Build Coastguard Worker%endif 2507*c0909341SAndroid Build Coastguard Worker 2508*c0909341SAndroid Build Coastguard Worker ; scaling[luma_src] 2509*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32 2510*c0909341SAndroid Build Coastguard Worker vpgatherdw m7, m4, scalingq-1, r0, r5 2511*c0909341SAndroid Build Coastguard Worker vpgatherdw m5, m6, scalingq-1, r0, r5 2512*c0909341SAndroid Build Coastguard Worker%else 2513*c0909341SAndroid Build Coastguard Worker vpgatherdw m7, m4, scalingq-1, r12, r2 2514*c0909341SAndroid Build Coastguard Worker vpgatherdw m5, 
m6, scalingq-1, r12, r2 2515*c0909341SAndroid Build Coastguard Worker%endif 2516*c0909341SAndroid Build Coastguard Worker REPX {psrlw x, 8}, m7, m5 2517*c0909341SAndroid Build Coastguard Worker 2518*c0909341SAndroid Build Coastguard Worker ; unpack chroma_source 2519*c0909341SAndroid Build Coastguard Worker punpckhbw m1, m0, m2 2520*c0909341SAndroid Build Coastguard Worker punpcklbw m0, m2 ; m0-1: src as word 2521*c0909341SAndroid Build Coastguard Worker 2522*c0909341SAndroid Build Coastguard Worker ; grain = grain_lut[offy+y][offx+x] 2523*c0909341SAndroid Build Coastguard Worker movu m4, [grain_lutq+offxyq+ 0] 2524*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32 2525*c0909341SAndroid Build Coastguard Worker mov r0, [rsp+7*mmsize+0*gprsize] 2526*c0909341SAndroid Build Coastguard Worker movd m2, [grain_lutq+r0+ 0] 2527*c0909341SAndroid Build Coastguard Worker%else 2528*c0909341SAndroid Build Coastguard Worker movd m2, [grain_lutq+left_offxyq+ 0] 2529*c0909341SAndroid Build Coastguard Worker%endif 2530*c0909341SAndroid Build Coastguard Worker punpcklbw m2, m4 2531*c0909341SAndroid Build Coastguard Worker pmaddubsw m3, m9, m2 2532*c0909341SAndroid Build Coastguard Worker pmulhrsw m3, m8 2533*c0909341SAndroid Build Coastguard Worker packsswb m3, m3 2534*c0909341SAndroid Build Coastguard Worker shufps m3, m4, q3210 2535*c0909341SAndroid Build Coastguard Worker pxor m4, m4 2536*c0909341SAndroid Build Coastguard Worker pcmpgtb m4, m3 2537*c0909341SAndroid Build Coastguard Worker punpcklbw m2, m3, m4 2538*c0909341SAndroid Build Coastguard Worker punpckhbw m3, m4 2539*c0909341SAndroid Build Coastguard Worker 2540*c0909341SAndroid Build Coastguard Worker ; noise = round2(scaling[luma_src] * grain, scaling_shift) 2541*c0909341SAndroid Build Coastguard Worker pmullw m2, m7 2542*c0909341SAndroid Build Coastguard Worker pmullw m3, m5 2543*c0909341SAndroid Build Coastguard Worker pmulhrsw m2, m11 2544*c0909341SAndroid Build Coastguard Worker pmulhrsw m3, m11 
2545*c0909341SAndroid Build Coastguard Worker 2546*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32 2547*c0909341SAndroid Build Coastguard Worker DEFINE_ARGS dst, src, scaling, offxy, h, picptr, grain_lut 2548*c0909341SAndroid Build Coastguard Worker%endif 2549*c0909341SAndroid Build Coastguard Worker 2550*c0909341SAndroid Build Coastguard Worker ; dst = clip_pixel(src, noise) 2551*c0909341SAndroid Build Coastguard Worker paddw m0, m2 2552*c0909341SAndroid Build Coastguard Worker paddw m1, m3 2553*c0909341SAndroid Build Coastguard Worker pmaxsw m0, m13 2554*c0909341SAndroid Build Coastguard Worker pmaxsw m1, m13 2555*c0909341SAndroid Build Coastguard Worker pminsw m0, m12 2556*c0909341SAndroid Build Coastguard Worker pminsw m1, m12 2557*c0909341SAndroid Build Coastguard Worker packuswb m0, m1 2558*c0909341SAndroid Build Coastguard Worker movifnidn dstq, dstmp 2559*c0909341SAndroid Build Coastguard Worker mova [dstq+srcq], m0 2560*c0909341SAndroid Build Coastguard Worker 2561*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32 2562*c0909341SAndroid Build Coastguard Worker add srcq, r2mp 2563*c0909341SAndroid Build Coastguard Worker ; lumaq has already been incremented above 2564*c0909341SAndroid Build Coastguard Worker%else 2565*c0909341SAndroid Build Coastguard Worker add srcq, r12mp 2566*c0909341SAndroid Build Coastguard Worker%if %3 2567*c0909341SAndroid Build Coastguard Worker lea lumaq, [lumaq+lstrideq*2] 2568*c0909341SAndroid Build Coastguard Worker%else 2569*c0909341SAndroid Build Coastguard Worker add lumaq, lstrideq 2570*c0909341SAndroid Build Coastguard Worker%endif 2571*c0909341SAndroid Build Coastguard Worker%endif 2572*c0909341SAndroid Build Coastguard Worker add grain_lutq, 82 2573*c0909341SAndroid Build Coastguard Worker dec hw 2574*c0909341SAndroid Build Coastguard Worker jg %%loop_y_h_overlap 2575*c0909341SAndroid Build Coastguard Worker 2576*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32 2577*c0909341SAndroid Build Coastguard 
Worker DEFINE_ARGS luma, src, scaling, offxy, w, picptr, grain_lut 2578*c0909341SAndroid Build Coastguard Worker 2579*c0909341SAndroid Build Coastguard Worker mov wq, r4m 2580*c0909341SAndroid Build Coastguard Worker%endif 2581*c0909341SAndroid Build Coastguard Worker add wq, 16 2582*c0909341SAndroid Build Coastguard Worker jge %%end 2583*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32 2584*c0909341SAndroid Build Coastguard Worker mov srcq, r1mp 2585*c0909341SAndroid Build Coastguard Worker mov lumaq, r11mp 2586*c0909341SAndroid Build Coastguard Worker%else 2587*c0909341SAndroid Build Coastguard Worker mov srcq, r11mp 2588*c0909341SAndroid Build Coastguard Worker%endif 2589*c0909341SAndroid Build Coastguard Worker lea lumaq, [luma_bakq+wq*(1+%2)] 2590*c0909341SAndroid Build Coastguard Worker add srcq, wq 2591*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32 2592*c0909341SAndroid Build Coastguard Worker mov r4m, wq 2593*c0909341SAndroid Build Coastguard Worker mov r9m, lumaq 2594*c0909341SAndroid Build Coastguard Worker%endif 2595*c0909341SAndroid Build Coastguard Worker%if %2 == 0 2596*c0909341SAndroid Build Coastguard Worker xor dword r8m, 4 2597*c0909341SAndroid Build Coastguard Worker ; adjust top_offxyd 2598*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32 2599*c0909341SAndroid Build Coastguard Worker add dword [rsp+7*mmsize+1*gprsize], 16 2600*c0909341SAndroid Build Coastguard Worker%else 2601*c0909341SAndroid Build Coastguard Worker add r11d, 16 2602*c0909341SAndroid Build Coastguard Worker%endif 2603*c0909341SAndroid Build Coastguard Worker add offxyd, 16 2604*c0909341SAndroid Build Coastguard Worker%endif 2605*c0909341SAndroid Build Coastguard Worker 2606*c0909341SAndroid Build Coastguard Worker ; r8m = sbym 2607*c0909341SAndroid Build Coastguard Worker test dword r8m, 2 2608*c0909341SAndroid Build Coastguard Worker%if %2 2609*c0909341SAndroid Build Coastguard Worker jne %%loop_x_hv_overlap 2610*c0909341SAndroid Build Coastguard Worker 
jmp %%loop_x_h_overlap 2611*c0909341SAndroid Build Coastguard Worker%else 2612*c0909341SAndroid Build Coastguard Worker jne %%loop_x_odd_v_overlap 2613*c0909341SAndroid Build Coastguard Worker jmp %%loop_x_odd 2614*c0909341SAndroid Build Coastguard Worker%endif 2615*c0909341SAndroid Build Coastguard Worker 2616*c0909341SAndroid Build Coastguard Worker%%end: 2617*c0909341SAndroid Build Coastguard Worker RET 2618*c0909341SAndroid Build Coastguard Worker 2619*c0909341SAndroid Build Coastguard Worker%%vertical_overlap: 2620*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32 2621*c0909341SAndroid Build Coastguard Worker DEFINE_ARGS dst, src, scaling, sby, fg_data, picptr, overlap 2622*c0909341SAndroid Build Coastguard Worker%else 2623*c0909341SAndroid Build Coastguard Worker DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, tmp, sby, see, overlap 2624*c0909341SAndroid Build Coastguard Worker%endif 2625*c0909341SAndroid Build Coastguard Worker 2626*c0909341SAndroid Build Coastguard Worker or overlapd, 2 ; top_overlap: overlap & 2 2627*c0909341SAndroid Build Coastguard Worker mov r8m, overlapd 2628*c0909341SAndroid Build Coastguard Worker movzx sbyd, sbyb 2629*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32 2630*c0909341SAndroid Build Coastguard Worker imul r4, [fg_dataq+FGData.seed], 0x00010001 2631*c0909341SAndroid Build Coastguard Worker DEFINE_ARGS tmp, src, scaling, sby, see, picptr, unused 2632*c0909341SAndroid Build Coastguard Worker%else 2633*c0909341SAndroid Build Coastguard Worker imul seed, [fg_dataq+FGData.seed], 0x00010001 2634*c0909341SAndroid Build Coastguard Worker%endif 2635*c0909341SAndroid Build Coastguard Worker imul tmpd, sbyd, 173 * 0x00010001 2636*c0909341SAndroid Build Coastguard Worker imul sbyd, 37 * 0x01000100 2637*c0909341SAndroid Build Coastguard Worker add tmpd, (105 << 16) | 188 2638*c0909341SAndroid Build Coastguard Worker add sbyd, (178 << 24) | (141 << 8) 2639*c0909341SAndroid Build Coastguard Worker and tmpd, 
0x00ff00ff 2640*c0909341SAndroid Build Coastguard Worker and sbyd, 0xff00ff00 2641*c0909341SAndroid Build Coastguard Worker xor seed, tmpd 2642*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32 2643*c0909341SAndroid Build Coastguard Worker xor sbyd, seed ; (cur_seed << 16) | top_seed 2644*c0909341SAndroid Build Coastguard Worker 2645*c0909341SAndroid Build Coastguard Worker DEFINE_ARGS luma, src, scaling, see, w, picptr, src_bak 2646*c0909341SAndroid Build Coastguard Worker 2647*c0909341SAndroid Build Coastguard Worker mov r3m, seed 2648*c0909341SAndroid Build Coastguard Worker mov wq, r4m 2649*c0909341SAndroid Build Coastguard Worker%if %3 2650*c0909341SAndroid Build Coastguard Worker shl r10mp, 1 2651*c0909341SAndroid Build Coastguard Worker%endif 2652*c0909341SAndroid Build Coastguard Worker%else 2653*c0909341SAndroid Build Coastguard Worker xor seed, sbyd ; (cur_seed << 16) | top_seed 2654*c0909341SAndroid Build Coastguard Worker 2655*c0909341SAndroid Build Coastguard Worker DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \ 2656*c0909341SAndroid Build Coastguard Worker tmp, unused2, see, overlap, unused3, src_bak, lstride, luma_bak 2657*c0909341SAndroid Build Coastguard Worker 2658*c0909341SAndroid Build Coastguard Worker mov lstrideq, r10mp 2659*c0909341SAndroid Build Coastguard Worker%endif 2660*c0909341SAndroid Build Coastguard Worker 2661*c0909341SAndroid Build Coastguard Worker mov lumaq, r9mp 2662*c0909341SAndroid Build Coastguard Worker lea src_bakq, [srcq+wq] 2663*c0909341SAndroid Build Coastguard Worker lea luma_bakq, [lumaq+wq*(1+%2)] 2664*c0909341SAndroid Build Coastguard Worker neg wq 2665*c0909341SAndroid Build Coastguard Worker sub r0mp, srcq 2666*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32 2667*c0909341SAndroid Build Coastguard Worker mov r1m, src_bakq 2668*c0909341SAndroid Build Coastguard Worker mov r11m, luma_bakq 2669*c0909341SAndroid Build Coastguard Worker mov r4m, wq 2670*c0909341SAndroid Build Coastguard 
Worker 2671*c0909341SAndroid Build Coastguard Worker DEFINE_ARGS tmp, src, scaling, see, unused1, picptr, unused2 2672*c0909341SAndroid Build Coastguard Worker%else 2673*c0909341SAndroid Build Coastguard Worker mov r11mp, src_bakq 2674*c0909341SAndroid Build Coastguard Worker mov r12mp, strideq 2675*c0909341SAndroid Build Coastguard Worker%endif 2676*c0909341SAndroid Build Coastguard Worker 2677*c0909341SAndroid Build Coastguard Worker%%loop_x_v_overlap: 2678*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32 2679*c0909341SAndroid Build Coastguard Worker mov seed, r3m 2680*c0909341SAndroid Build Coastguard Worker xor tmpd, tmpd 2681*c0909341SAndroid Build Coastguard Worker%endif 2682*c0909341SAndroid Build Coastguard Worker ; we assume from the block above that bits 8-15 of tmpd are zero'ed 2683*c0909341SAndroid Build Coastguard Worker mov r6d, seed 2684*c0909341SAndroid Build Coastguard Worker or seed, 0xeff4eff4 2685*c0909341SAndroid Build Coastguard Worker test seeb, seeh 2686*c0909341SAndroid Build Coastguard Worker setp tmpb ; parity of top_seed 2687*c0909341SAndroid Build Coastguard Worker shr seed, 16 2688*c0909341SAndroid Build Coastguard Worker shl tmpd, 16 2689*c0909341SAndroid Build Coastguard Worker test seeb, seeh 2690*c0909341SAndroid Build Coastguard Worker setp tmpb ; parity of cur_seed 2691*c0909341SAndroid Build Coastguard Worker or r6d, 0x00010001 2692*c0909341SAndroid Build Coastguard Worker xor tmpd, r6d 2693*c0909341SAndroid Build Coastguard Worker mov seed, tmpd 2694*c0909341SAndroid Build Coastguard Worker ror seed, 1 ; updated (cur_seed << 16) | top_seed 2695*c0909341SAndroid Build Coastguard Worker 2696*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32 2697*c0909341SAndroid Build Coastguard Worker mov r3m, seed 2698*c0909341SAndroid Build Coastguard Worker 2699*c0909341SAndroid Build Coastguard Worker DEFINE_ARGS dst, src, scaling, offy, h, picptr, offx 2700*c0909341SAndroid Build Coastguard Worker 2701*c0909341SAndroid Build 
Coastguard Worker mov offxd, offyd 2702*c0909341SAndroid Build Coastguard Worker%else 2703*c0909341SAndroid Build Coastguard Worker DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \ 2704*c0909341SAndroid Build Coastguard Worker offx, offy, see, overlap, top_offxy, unused, lstride 2705*c0909341SAndroid Build Coastguard Worker 2706*c0909341SAndroid Build Coastguard Worker mov offxd, seed 2707*c0909341SAndroid Build Coastguard Worker mov offyd, seed 2708*c0909341SAndroid Build Coastguard Worker%endif 2709*c0909341SAndroid Build Coastguard Worker ror offyd, 8 2710*c0909341SAndroid Build Coastguard Worker ror offxd, 12 2711*c0909341SAndroid Build Coastguard Worker and offyd, 0xf000f 2712*c0909341SAndroid Build Coastguard Worker and offxd, 0xf000f 2713*c0909341SAndroid Build Coastguard Worker imul offyd, 164>>%3 2714*c0909341SAndroid Build Coastguard Worker ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy 2715*c0909341SAndroid Build Coastguard Worker lea offyq, [offyq+offxq*(2-%2)+0x10001*((3+(6>>%3))*82+3+(6>>%2))+(32>>%3)*82] 2716*c0909341SAndroid Build Coastguard Worker 2717*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32 2718*c0909341SAndroid Build Coastguard Worker DEFINE_ARGS tmp, src, scaling, offxy, h, picptr, top_offxy 2719*c0909341SAndroid Build Coastguard Worker%else 2720*c0909341SAndroid Build Coastguard Worker DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \ 2721*c0909341SAndroid Build Coastguard Worker h, offxy, see, overlap, top_offxy, unused, lstride, luma_bak 2722*c0909341SAndroid Build Coastguard Worker%endif 2723*c0909341SAndroid Build Coastguard Worker 2724*c0909341SAndroid Build Coastguard Worker movzx top_offxyd, offxyw 2725*c0909341SAndroid Build Coastguard Worker shr offxyd, 16 2726*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32 2727*c0909341SAndroid Build Coastguard Worker mov [rsp+7*mmsize+1*gprsize], top_offxyd 2728*c0909341SAndroid Build Coastguard Worker 2729*c0909341SAndroid Build Coastguard 
Worker DEFINE_ARGS luma, src, scaling, offxy, h, picptr, grain_lut 2730*c0909341SAndroid Build Coastguard Worker%endif 2731*c0909341SAndroid Build Coastguard Worker 2732*c0909341SAndroid Build Coastguard Worker%%loop_x_odd_v_overlap: 2733*c0909341SAndroid Build Coastguard Worker mov hd, r7m 2734*c0909341SAndroid Build Coastguard Worker mov grain_lutq, grain_lutmp 2735*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32 2736*c0909341SAndroid Build Coastguard Worker mov r5, r5m 2737*c0909341SAndroid Build Coastguard Worker%endif 2738*c0909341SAndroid Build Coastguard Worker%if %3 2739*c0909341SAndroid Build Coastguard Worker mova m1, [PIC_ptr(pb_23_22)] 2740*c0909341SAndroid Build Coastguard Worker%else 2741*c0909341SAndroid Build Coastguard Worker mova m1, [PIC_ptr(pb_27_17)] 2742*c0909341SAndroid Build Coastguard Worker%endif 2743*c0909341SAndroid Build Coastguard Worker%%loop_y_v_overlap: 2744*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32 2745*c0909341SAndroid Build Coastguard Worker mov lumaq, r9mp 2746*c0909341SAndroid Build Coastguard Worker%endif 2747*c0909341SAndroid Build Coastguard Worker%if %2 2748*c0909341SAndroid Build Coastguard Worker mova m4, [lumaq+ 0] 2749*c0909341SAndroid Build Coastguard Worker mova m6, [lumaq+16] 2750*c0909341SAndroid Build Coastguard Worker mova m0, [srcq] 2751*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32 2752*c0909341SAndroid Build Coastguard Worker add lumaq, r10mp 2753*c0909341SAndroid Build Coastguard Worker mov r9mp, lumaq 2754*c0909341SAndroid Build Coastguard Worker mov r5, r5m 2755*c0909341SAndroid Build Coastguard Worker movd m7, [base+pb_1] 2756*c0909341SAndroid Build Coastguard Worker%else 2757*c0909341SAndroid Build Coastguard Worker movd m7, [pb_1] 2758*c0909341SAndroid Build Coastguard Worker%endif 2759*c0909341SAndroid Build Coastguard Worker pshufd m7, m7, q0000 2760*c0909341SAndroid Build Coastguard Worker pxor m2, m2 2761*c0909341SAndroid Build Coastguard Worker pmaddubsw m4, m7 
2762*c0909341SAndroid Build Coastguard Worker pmaddubsw m6, m7 2763*c0909341SAndroid Build Coastguard Worker pavgw m4, m2 2764*c0909341SAndroid Build Coastguard Worker pavgw m6, m2 2765*c0909341SAndroid Build Coastguard Worker%else 2766*c0909341SAndroid Build Coastguard Worker mova m4, [lumaq] 2767*c0909341SAndroid Build Coastguard Worker mova m0, [srcq] 2768*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32 2769*c0909341SAndroid Build Coastguard Worker add lumaq, r10mp 2770*c0909341SAndroid Build Coastguard Worker mov r9mp, lumaq 2771*c0909341SAndroid Build Coastguard Worker%endif 2772*c0909341SAndroid Build Coastguard Worker pxor m2, m2 2773*c0909341SAndroid Build Coastguard Worker%endif 2774*c0909341SAndroid Build Coastguard Worker 2775*c0909341SAndroid Build Coastguard Worker%if %1 2776*c0909341SAndroid Build Coastguard Worker%if %2 2777*c0909341SAndroid Build Coastguard Worker packuswb m4, m6 ; luma 2778*c0909341SAndroid Build Coastguard Worker%endif 2779*c0909341SAndroid Build Coastguard Worker punpckhbw m6, m4, m0 2780*c0909341SAndroid Build Coastguard Worker punpcklbw m4, m0 ; { luma, chroma } 2781*c0909341SAndroid Build Coastguard Worker pmaddubsw m6, m14 2782*c0909341SAndroid Build Coastguard Worker pmaddubsw m4, m14 2783*c0909341SAndroid Build Coastguard Worker psraw m6, 6 2784*c0909341SAndroid Build Coastguard Worker psraw m4, 6 2785*c0909341SAndroid Build Coastguard Worker paddw m6, m15 2786*c0909341SAndroid Build Coastguard Worker paddw m4, m15 2787*c0909341SAndroid Build Coastguard Worker packuswb m4, m6 ; pack+unpack = clip 2788*c0909341SAndroid Build Coastguard Worker punpckhbw m6, m4, m2 2789*c0909341SAndroid Build Coastguard Worker punpcklbw m4, m2 2790*c0909341SAndroid Build Coastguard Worker%elif %2 == 0 2791*c0909341SAndroid Build Coastguard Worker punpckhbw m6, m4, m2 2792*c0909341SAndroid Build Coastguard Worker punpcklbw m4, m2 2793*c0909341SAndroid Build Coastguard Worker%endif 2794*c0909341SAndroid Build Coastguard Worker 
2795*c0909341SAndroid Build Coastguard Worker ; scaling[luma_src] 2796*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32 2797*c0909341SAndroid Build Coastguard Worker vpgatherdw m7, m4, scalingq-1, r0, r5 2798*c0909341SAndroid Build Coastguard Worker vpgatherdw m5, m6, scalingq-1, r0, r5 2799*c0909341SAndroid Build Coastguard Worker%else 2800*c0909341SAndroid Build Coastguard Worker vpgatherdw m7, m4, scalingq-1, r12, r2 2801*c0909341SAndroid Build Coastguard Worker vpgatherdw m5, m6, scalingq-1, r12, r2 2802*c0909341SAndroid Build Coastguard Worker%endif 2803*c0909341SAndroid Build Coastguard Worker REPX {psrlw x, 8}, m7, m5 2804*c0909341SAndroid Build Coastguard Worker 2805*c0909341SAndroid Build Coastguard Worker ; grain = grain_lut[offy+y][offx+x] 2806*c0909341SAndroid Build Coastguard Worker movu m3, [grain_lutq+offxyq] 2807*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32 2808*c0909341SAndroid Build Coastguard Worker mov r0, [rsp+7*mmsize+1*gprsize] 2809*c0909341SAndroid Build Coastguard Worker movu m4, [grain_lutq+r0] 2810*c0909341SAndroid Build Coastguard Worker%else 2811*c0909341SAndroid Build Coastguard Worker movu m4, [grain_lutq+top_offxyq] 2812*c0909341SAndroid Build Coastguard Worker%endif 2813*c0909341SAndroid Build Coastguard Worker punpckhbw m6, m4, m3 2814*c0909341SAndroid Build Coastguard Worker punpcklbw m4, m3 2815*c0909341SAndroid Build Coastguard Worker pmaddubsw m2, m1, m6 2816*c0909341SAndroid Build Coastguard Worker pmaddubsw m3, m1, m4 2817*c0909341SAndroid Build Coastguard Worker pmulhrsw m2, m8 2818*c0909341SAndroid Build Coastguard Worker pmulhrsw m3, m8 2819*c0909341SAndroid Build Coastguard Worker packsswb m3, m2 2820*c0909341SAndroid Build Coastguard Worker pxor m6, m6 2821*c0909341SAndroid Build Coastguard Worker pcmpgtb m6, m3 2822*c0909341SAndroid Build Coastguard Worker punpcklbw m2, m3, m6 2823*c0909341SAndroid Build Coastguard Worker punpckhbw m3, m6 2824*c0909341SAndroid Build Coastguard Worker 
2825*c0909341SAndroid Build Coastguard Worker ; noise = round2(scaling[luma_src] * grain, scaling_shift) 2826*c0909341SAndroid Build Coastguard Worker pmullw m2, m7 2827*c0909341SAndroid Build Coastguard Worker pmullw m3, m5 2828*c0909341SAndroid Build Coastguard Worker pmulhrsw m2, m11 2829*c0909341SAndroid Build Coastguard Worker pmulhrsw m3, m11 2830*c0909341SAndroid Build Coastguard Worker 2831*c0909341SAndroid Build Coastguard Worker ; unpack chroma_source 2832*c0909341SAndroid Build Coastguard Worker pxor m4, m4 2833*c0909341SAndroid Build Coastguard Worker punpckhbw m6, m0, m4 2834*c0909341SAndroid Build Coastguard Worker punpcklbw m0, m4 ; m0-1: src as word 2835*c0909341SAndroid Build Coastguard Worker 2836*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32 2837*c0909341SAndroid Build Coastguard Worker DEFINE_ARGS dst, src, scaling, offxy, h, picptr, grain_lut 2838*c0909341SAndroid Build Coastguard Worker%endif 2839*c0909341SAndroid Build Coastguard Worker 2840*c0909341SAndroid Build Coastguard Worker ; dst = clip_pixel(src, noise) 2841*c0909341SAndroid Build Coastguard Worker paddw m0, m2 2842*c0909341SAndroid Build Coastguard Worker paddw m6, m3 2843*c0909341SAndroid Build Coastguard Worker pmaxsw m0, m13 2844*c0909341SAndroid Build Coastguard Worker pmaxsw m6, m13 2845*c0909341SAndroid Build Coastguard Worker pminsw m0, m12 2846*c0909341SAndroid Build Coastguard Worker pminsw m6, m12 2847*c0909341SAndroid Build Coastguard Worker packuswb m0, m6 2848*c0909341SAndroid Build Coastguard Worker movifnidn dstq, dstmp 2849*c0909341SAndroid Build Coastguard Worker mova [dstq+srcq], m0 2850*c0909341SAndroid Build Coastguard Worker 2851*c0909341SAndroid Build Coastguard Worker dec hw 2852*c0909341SAndroid Build Coastguard Worker je %%end_y_v_overlap 2853*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32 2854*c0909341SAndroid Build Coastguard Worker add srcq, r2mp 2855*c0909341SAndroid Build Coastguard Worker ; lumaq has already been incremented above 
2856*c0909341SAndroid Build Coastguard Worker%else 2857*c0909341SAndroid Build Coastguard Worker add srcq, r12mp 2858*c0909341SAndroid Build Coastguard Worker%if %3 2859*c0909341SAndroid Build Coastguard Worker lea lumaq, [lumaq+lstrideq*2] 2860*c0909341SAndroid Build Coastguard Worker%else 2861*c0909341SAndroid Build Coastguard Worker add lumaq, lstrideq 2862*c0909341SAndroid Build Coastguard Worker%endif 2863*c0909341SAndroid Build Coastguard Worker%endif 2864*c0909341SAndroid Build Coastguard Worker add grain_lutq, 82 2865*c0909341SAndroid Build Coastguard Worker%if %3 == 0 2866*c0909341SAndroid Build Coastguard Worker btc hd, 16 2867*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32 2868*c0909341SAndroid Build Coastguard Worker mov r5, r5m 2869*c0909341SAndroid Build Coastguard Worker%endif 2870*c0909341SAndroid Build Coastguard Worker mova m1, [PIC_ptr(pb_17_27)] 2871*c0909341SAndroid Build Coastguard Worker jnc %%loop_y_v_overlap 2872*c0909341SAndroid Build Coastguard Worker%endif 2873*c0909341SAndroid Build Coastguard Worker jmp %%loop_y 2874*c0909341SAndroid Build Coastguard Worker 2875*c0909341SAndroid Build Coastguard Worker%%end_y_v_overlap: 2876*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32 2877*c0909341SAndroid Build Coastguard Worker DEFINE_ARGS luma, src, scaling, offxy, w, picptr, grain_lut 2878*c0909341SAndroid Build Coastguard Worker 2879*c0909341SAndroid Build Coastguard Worker mov wq, r4m 2880*c0909341SAndroid Build Coastguard Worker%endif 2881*c0909341SAndroid Build Coastguard Worker add wq, 16 2882*c0909341SAndroid Build Coastguard Worker jge %%end_hv 2883*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32 2884*c0909341SAndroid Build Coastguard Worker mov srcq, r1mp 2885*c0909341SAndroid Build Coastguard Worker mov lumaq, r11mp 2886*c0909341SAndroid Build Coastguard Worker%else 2887*c0909341SAndroid Build Coastguard Worker mov srcq, r11mp 2888*c0909341SAndroid Build Coastguard Worker%endif 2889*c0909341SAndroid Build 
Coastguard Worker lea lumaq, [luma_bakq+wq*(1+%2)] 2890*c0909341SAndroid Build Coastguard Worker add srcq, wq 2891*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32 2892*c0909341SAndroid Build Coastguard Worker mov r4m, wq 2893*c0909341SAndroid Build Coastguard Worker mov r9m, lumaq 2894*c0909341SAndroid Build Coastguard Worker%endif 2895*c0909341SAndroid Build Coastguard Worker 2896*c0909341SAndroid Build Coastguard Worker%if %2 2897*c0909341SAndroid Build Coastguard Worker ; since fg_dataq.overlap is guaranteed to be set, we never jump 2898*c0909341SAndroid Build Coastguard Worker ; back to .loop_x_v_overlap, and instead always fall-through to 2899*c0909341SAndroid Build Coastguard Worker ; h+v overlap 2900*c0909341SAndroid Build Coastguard Worker%else 2901*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32 2902*c0909341SAndroid Build Coastguard Worker add dword [rsp+7*mmsize+1*gprsize], 16 2903*c0909341SAndroid Build Coastguard Worker%else 2904*c0909341SAndroid Build Coastguard Worker add top_offxyd, 16 2905*c0909341SAndroid Build Coastguard Worker%endif 2906*c0909341SAndroid Build Coastguard Worker add offxyd, 16 2907*c0909341SAndroid Build Coastguard Worker btc dword r8m, 2 2908*c0909341SAndroid Build Coastguard Worker jnc %%loop_x_odd_v_overlap 2909*c0909341SAndroid Build Coastguard Worker%endif 2910*c0909341SAndroid Build Coastguard Worker 2911*c0909341SAndroid Build Coastguard Worker%%loop_x_hv_overlap: 2912*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32 2913*c0909341SAndroid Build Coastguard Worker DEFINE_ARGS tmp, src, scaling, offxy, w, picptr, unused 2914*c0909341SAndroid Build Coastguard Worker 2915*c0909341SAndroid Build Coastguard Worker mov r6, [rsp+7*mmsize+1*gprsize] 2916*c0909341SAndroid Build Coastguard Worker%if %2 2917*c0909341SAndroid Build Coastguard Worker lea r0, [r3d+16] 2918*c0909341SAndroid Build Coastguard Worker add r6, 16 2919*c0909341SAndroid Build Coastguard Worker mov [rsp+7*mmsize+0*gprsize], r0 ; left_offxy 
2920*c0909341SAndroid Build Coastguard Worker%else 2921*c0909341SAndroid Build Coastguard Worker mov [rsp+7*mmsize+0*gprsize], r3 ; left_offxy 2922*c0909341SAndroid Build Coastguard Worker%endif 2923*c0909341SAndroid Build Coastguard Worker mov [rsp+7*mmsize+2*gprsize], r6 ; topleft_offxy 2924*c0909341SAndroid Build Coastguard Worker 2925*c0909341SAndroid Build Coastguard Worker DEFINE_ARGS tmp, src, scaling, see, w, picptr, unused 2926*c0909341SAndroid Build Coastguard Worker 2927*c0909341SAndroid Build Coastguard Worker mov seed, r3m 2928*c0909341SAndroid Build Coastguard Worker xor tmpd, tmpd 2929*c0909341SAndroid Build Coastguard Worker%else 2930*c0909341SAndroid Build Coastguard Worker DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \ 2931*c0909341SAndroid Build Coastguard Worker tmp, offxy, see, left_offxy, top_offxy, topleft_offxy, lstride 2932*c0909341SAndroid Build Coastguard Worker 2933*c0909341SAndroid Build Coastguard Worker%if %2 2934*c0909341SAndroid Build Coastguard Worker lea topleft_offxyq, [top_offxyq+16] 2935*c0909341SAndroid Build Coastguard Worker lea left_offxyq, [offxyq+16] 2936*c0909341SAndroid Build Coastguard Worker%else 2937*c0909341SAndroid Build Coastguard Worker mov topleft_offxyq, top_offxyq 2938*c0909341SAndroid Build Coastguard Worker mov left_offxyq, offxyq 2939*c0909341SAndroid Build Coastguard Worker%endif 2940*c0909341SAndroid Build Coastguard Worker 2941*c0909341SAndroid Build Coastguard Worker ; we assume from the block above that bits 8-15 of tmpd are zero'ed 2942*c0909341SAndroid Build Coastguard Worker%endif 2943*c0909341SAndroid Build Coastguard Worker mov r6d, seed 2944*c0909341SAndroid Build Coastguard Worker or seed, 0xeff4eff4 2945*c0909341SAndroid Build Coastguard Worker test seeb, seeh 2946*c0909341SAndroid Build Coastguard Worker setp tmpb ; parity of top_seed 2947*c0909341SAndroid Build Coastguard Worker shr seed, 16 2948*c0909341SAndroid Build Coastguard Worker shl tmpd, 16 2949*c0909341SAndroid Build 
Coastguard Worker test seeb, seeh 2950*c0909341SAndroid Build Coastguard Worker setp tmpb ; parity of cur_seed 2951*c0909341SAndroid Build Coastguard Worker or r6d, 0x00010001 2952*c0909341SAndroid Build Coastguard Worker xor tmpd, r6d 2953*c0909341SAndroid Build Coastguard Worker mov seed, tmpd 2954*c0909341SAndroid Build Coastguard Worker ror seed, 1 ; updated (cur_seed << 16) | top_seed 2955*c0909341SAndroid Build Coastguard Worker 2956*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32 2957*c0909341SAndroid Build Coastguard Worker mov r3m, seed 2958*c0909341SAndroid Build Coastguard Worker 2959*c0909341SAndroid Build Coastguard Worker DEFINE_ARGS tmp, src, scaling, offy, w, picptr, offx 2960*c0909341SAndroid Build Coastguard Worker 2961*c0909341SAndroid Build Coastguard Worker mov offxd, offyd 2962*c0909341SAndroid Build Coastguard Worker%else 2963*c0909341SAndroid Build Coastguard Worker DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \ 2964*c0909341SAndroid Build Coastguard Worker offx, offy, see, left_offxy, top_offxy, topleft_offxy, lstride 2965*c0909341SAndroid Build Coastguard Worker 2966*c0909341SAndroid Build Coastguard Worker mov offxd, seed 2967*c0909341SAndroid Build Coastguard Worker mov offyd, seed 2968*c0909341SAndroid Build Coastguard Worker%endif 2969*c0909341SAndroid Build Coastguard Worker ror offyd, 8 2970*c0909341SAndroid Build Coastguard Worker ror offxd, 12 2971*c0909341SAndroid Build Coastguard Worker and offyd, 0xf000f 2972*c0909341SAndroid Build Coastguard Worker and offxd, 0xf000f 2973*c0909341SAndroid Build Coastguard Worker imul offyd, 164>>%3 2974*c0909341SAndroid Build Coastguard Worker ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy 2975*c0909341SAndroid Build Coastguard Worker lea offyq, [offyq+offxq*(2-%2)+0x10001*((3+(6>>%3))*82+3+(6>>%2))+(32>>%3)*82] 2976*c0909341SAndroid Build Coastguard Worker 2977*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32 2978*c0909341SAndroid Build Coastguard Worker 
DEFINE_ARGS top_offxy, src, scaling, offxy, h, picptr, grain_lut 2979*c0909341SAndroid Build Coastguard Worker%else 2980*c0909341SAndroid Build Coastguard Worker DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \ 2981*c0909341SAndroid Build Coastguard Worker h, offxy, see, left_offxy, top_offxy, topleft_offxy, lstride, luma_bak 2982*c0909341SAndroid Build Coastguard Worker%endif 2983*c0909341SAndroid Build Coastguard Worker 2984*c0909341SAndroid Build Coastguard Worker movzx top_offxyd, offxyw 2985*c0909341SAndroid Build Coastguard Worker shr offxyd, 16 2986*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32 2987*c0909341SAndroid Build Coastguard Worker mov [rsp+7*mmsize+1*gprsize], top_offxyd 2988*c0909341SAndroid Build Coastguard Worker%endif 2989*c0909341SAndroid Build Coastguard Worker 2990*c0909341SAndroid Build Coastguard Worker mov hd, r7m 2991*c0909341SAndroid Build Coastguard Worker mov grain_lutq, grain_lutmp 2992*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32 2993*c0909341SAndroid Build Coastguard Worker mov r5, r5m 2994*c0909341SAndroid Build Coastguard Worker%endif 2995*c0909341SAndroid Build Coastguard Worker%if %3 2996*c0909341SAndroid Build Coastguard Worker mova m3, [PIC_ptr(pb_23_22)] 2997*c0909341SAndroid Build Coastguard Worker%else 2998*c0909341SAndroid Build Coastguard Worker mova m3, [PIC_ptr(pb_27_17)] 2999*c0909341SAndroid Build Coastguard Worker%endif 3000*c0909341SAndroid Build Coastguard Worker%%loop_y_hv_overlap: 3001*c0909341SAndroid Build Coastguard Worker ; grain = grain_lut[offy+y][offx+x] 3002*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32 3003*c0909341SAndroid Build Coastguard Worker mov r0, [rsp+7*mmsize+2*gprsize] ; topleft_offxy 3004*c0909341SAndroid Build Coastguard Worker mov r5, [rsp+7*mmsize+1*gprsize] ; top_offxy 3005*c0909341SAndroid Build Coastguard Worker movd m1, [grain_lutq+r0] 3006*c0909341SAndroid Build Coastguard Worker mov r0, [rsp+7*mmsize+0*gprsize] ; left_offxy 
3007*c0909341SAndroid Build Coastguard Worker%else 3008*c0909341SAndroid Build Coastguard Worker movd m1, [grain_lutq+topleft_offxyq] 3009*c0909341SAndroid Build Coastguard Worker%endif 3010*c0909341SAndroid Build Coastguard Worker movu m2, [grain_lutq+offxyq] 3011*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32 3012*c0909341SAndroid Build Coastguard Worker movu m6, [grain_lutq+r5] 3013*c0909341SAndroid Build Coastguard Worker movd m4, [grain_lutq+r0] 3014*c0909341SAndroid Build Coastguard Worker%else 3015*c0909341SAndroid Build Coastguard Worker movu m6, [grain_lutq+top_offxyq] 3016*c0909341SAndroid Build Coastguard Worker movd m4, [grain_lutq+left_offxyq] 3017*c0909341SAndroid Build Coastguard Worker%endif 3018*c0909341SAndroid Build Coastguard Worker ; do h interpolation first (so top | top/left -> top, left | cur -> cur) 3019*c0909341SAndroid Build Coastguard Worker punpcklbw m1, m6 3020*c0909341SAndroid Build Coastguard Worker punpcklbw m4, m2 3021*c0909341SAndroid Build Coastguard Worker pmaddubsw m0, m9, m1 3022*c0909341SAndroid Build Coastguard Worker pmaddubsw m1, m9, m4 3023*c0909341SAndroid Build Coastguard Worker REPX {pmulhrsw x, m8}, m0, m1 3024*c0909341SAndroid Build Coastguard Worker packsswb m0, m1 3025*c0909341SAndroid Build Coastguard Worker shufps m4, m0, m2, q3232 3026*c0909341SAndroid Build Coastguard Worker shufps m0, m6, q3210 3027*c0909341SAndroid Build Coastguard Worker ; followed by v interpolation (top | cur -> cur) 3028*c0909341SAndroid Build Coastguard Worker punpcklbw m2, m0, m4 3029*c0909341SAndroid Build Coastguard Worker punpckhbw m0, m4 3030*c0909341SAndroid Build Coastguard Worker pmaddubsw m4, m3, m0 3031*c0909341SAndroid Build Coastguard Worker pmaddubsw m1, m3, m2 3032*c0909341SAndroid Build Coastguard Worker pmulhrsw m4, m8 3033*c0909341SAndroid Build Coastguard Worker pmulhrsw m1, m8 3034*c0909341SAndroid Build Coastguard Worker packsswb m1, m4 3035*c0909341SAndroid Build Coastguard Worker 3036*c0909341SAndroid Build 
Coastguard Worker ; src 3037*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32 3038*c0909341SAndroid Build Coastguard Worker DEFINE_ARGS luma, src, scaling, offxy, w, picptr, grain_lut 3039*c0909341SAndroid Build Coastguard Worker 3040*c0909341SAndroid Build Coastguard Worker mov lumaq, r9mp 3041*c0909341SAndroid Build Coastguard Worker%endif 3042*c0909341SAndroid Build Coastguard Worker%if %2 3043*c0909341SAndroid Build Coastguard Worker mova m4, [lumaq+ 0] 3044*c0909341SAndroid Build Coastguard Worker mova m6, [lumaq+16] 3045*c0909341SAndroid Build Coastguard Worker mova m0, [srcq] 3046*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32 3047*c0909341SAndroid Build Coastguard Worker add lumaq, r10mp 3048*c0909341SAndroid Build Coastguard Worker mov r9mp, lumaq 3049*c0909341SAndroid Build Coastguard Worker mov r5, r5m 3050*c0909341SAndroid Build Coastguard Worker movd m7, [base+pb_1] 3051*c0909341SAndroid Build Coastguard Worker%else 3052*c0909341SAndroid Build Coastguard Worker movd m7, [pb_1] 3053*c0909341SAndroid Build Coastguard Worker%endif 3054*c0909341SAndroid Build Coastguard Worker pshufd m7, m7, q0000 3055*c0909341SAndroid Build Coastguard Worker pxor m2, m2 3056*c0909341SAndroid Build Coastguard Worker pmaddubsw m4, m7 3057*c0909341SAndroid Build Coastguard Worker pmaddubsw m6, m7 3058*c0909341SAndroid Build Coastguard Worker pavgw m4, m2 3059*c0909341SAndroid Build Coastguard Worker pavgw m6, m2 3060*c0909341SAndroid Build Coastguard Worker%else 3061*c0909341SAndroid Build Coastguard Worker mova m4, [lumaq] 3062*c0909341SAndroid Build Coastguard Worker mova m0, [srcq] 3063*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32 3064*c0909341SAndroid Build Coastguard Worker add lumaq, r10mp 3065*c0909341SAndroid Build Coastguard Worker mov r9mp, lumaq 3066*c0909341SAndroid Build Coastguard Worker%endif 3067*c0909341SAndroid Build Coastguard Worker pxor m2, m2 3068*c0909341SAndroid Build Coastguard Worker%endif 3069*c0909341SAndroid Build 
Coastguard Worker 3070*c0909341SAndroid Build Coastguard Worker%if %1 3071*c0909341SAndroid Build Coastguard Worker%if %2 3072*c0909341SAndroid Build Coastguard Worker packuswb m4, m6 ; luma 3073*c0909341SAndroid Build Coastguard Worker%endif 3074*c0909341SAndroid Build Coastguard Worker punpckhbw m6, m4, m0 3075*c0909341SAndroid Build Coastguard Worker punpcklbw m4, m0 ; { luma, chroma } 3076*c0909341SAndroid Build Coastguard Worker pmaddubsw m6, m14 3077*c0909341SAndroid Build Coastguard Worker pmaddubsw m4, m14 3078*c0909341SAndroid Build Coastguard Worker psraw m6, 6 3079*c0909341SAndroid Build Coastguard Worker psraw m4, 6 3080*c0909341SAndroid Build Coastguard Worker paddw m6, m15 3081*c0909341SAndroid Build Coastguard Worker paddw m4, m15 3082*c0909341SAndroid Build Coastguard Worker packuswb m4, m6 ; pack+unpack = clip 3083*c0909341SAndroid Build Coastguard Worker punpckhbw m6, m4, m2 3084*c0909341SAndroid Build Coastguard Worker punpcklbw m4, m2 3085*c0909341SAndroid Build Coastguard Worker%elif %2 == 0 3086*c0909341SAndroid Build Coastguard Worker punpckhbw m6, m4, m2 3087*c0909341SAndroid Build Coastguard Worker punpcklbw m4, m2 3088*c0909341SAndroid Build Coastguard Worker%endif 3089*c0909341SAndroid Build Coastguard Worker 3090*c0909341SAndroid Build Coastguard Worker ; scaling[src] 3091*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32 3092*c0909341SAndroid Build Coastguard Worker vpgatherdw m7, m4, scalingq-1, r0, r5 3093*c0909341SAndroid Build Coastguard Worker vpgatherdw m5, m6, scalingq-1, r0, r5 3094*c0909341SAndroid Build Coastguard Worker%else 3095*c0909341SAndroid Build Coastguard Worker%if %3 3096*c0909341SAndroid Build Coastguard Worker vpgatherdw m7, m4, scalingq-1, r2, r12 3097*c0909341SAndroid Build Coastguard Worker vpgatherdw m5, m6, scalingq-1, r2, r12 3098*c0909341SAndroid Build Coastguard Worker%else 3099*c0909341SAndroid Build Coastguard Worker vpgatherdw m7, m4, scalingq-1, r2, r13 3100*c0909341SAndroid Build Coastguard 
Worker vpgatherdw m5, m6, scalingq-1, r2, r13 3101*c0909341SAndroid Build Coastguard Worker%endif 3102*c0909341SAndroid Build Coastguard Worker%endif 3103*c0909341SAndroid Build Coastguard Worker REPX {psrlw x, 8}, m7, m5 3104*c0909341SAndroid Build Coastguard Worker 3105*c0909341SAndroid Build Coastguard Worker ; unpack grain 3106*c0909341SAndroid Build Coastguard Worker pxor m4, m4 3107*c0909341SAndroid Build Coastguard Worker pcmpgtb m4, m1 3108*c0909341SAndroid Build Coastguard Worker punpcklbw m2, m1, m4 3109*c0909341SAndroid Build Coastguard Worker punpckhbw m1, m4 3110*c0909341SAndroid Build Coastguard Worker 3111*c0909341SAndroid Build Coastguard Worker ; noise = round2(scaling[src] * grain, scaling_shift) 3112*c0909341SAndroid Build Coastguard Worker pmullw m2, m7 3113*c0909341SAndroid Build Coastguard Worker pmullw m1, m5 3114*c0909341SAndroid Build Coastguard Worker pmulhrsw m2, m11 3115*c0909341SAndroid Build Coastguard Worker pmulhrsw m1, m11 3116*c0909341SAndroid Build Coastguard Worker 3117*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32 3118*c0909341SAndroid Build Coastguard Worker DEFINE_ARGS dst, src, scaling, offxy, h, picptr, grain_lut 3119*c0909341SAndroid Build Coastguard Worker%endif 3120*c0909341SAndroid Build Coastguard Worker 3121*c0909341SAndroid Build Coastguard Worker ; unpack chroma source 3122*c0909341SAndroid Build Coastguard Worker pxor m4, m4 3123*c0909341SAndroid Build Coastguard Worker punpckhbw m5, m0, m4 3124*c0909341SAndroid Build Coastguard Worker punpcklbw m0, m4 ; m0-1: src as word 3125*c0909341SAndroid Build Coastguard Worker 3126*c0909341SAndroid Build Coastguard Worker ; dst = clip_pixel(src, noise) 3127*c0909341SAndroid Build Coastguard Worker paddw m0, m2 3128*c0909341SAndroid Build Coastguard Worker paddw m5, m1 3129*c0909341SAndroid Build Coastguard Worker pmaxsw m0, m13 3130*c0909341SAndroid Build Coastguard Worker pmaxsw m5, m13 3131*c0909341SAndroid Build Coastguard Worker pminsw m0, m12 
3132*c0909341SAndroid Build Coastguard Worker pminsw m5, m12 3133*c0909341SAndroid Build Coastguard Worker packuswb m0, m5 3134*c0909341SAndroid Build Coastguard Worker movifnidn dstq, dstmp 3135*c0909341SAndroid Build Coastguard Worker mova [dstq+srcq], m0 3136*c0909341SAndroid Build Coastguard Worker 3137*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32 3138*c0909341SAndroid Build Coastguard Worker add srcq, r2mp 3139*c0909341SAndroid Build Coastguard Worker ; lumaq has been adjusted above already 3140*c0909341SAndroid Build Coastguard Worker%else 3141*c0909341SAndroid Build Coastguard Worker add srcq, r12mp 3142*c0909341SAndroid Build Coastguard Worker%if %3 3143*c0909341SAndroid Build Coastguard Worker lea lumaq, [lumaq+lstrideq*(1+%2)] 3144*c0909341SAndroid Build Coastguard Worker%else 3145*c0909341SAndroid Build Coastguard Worker add lumaq, r10mp 3146*c0909341SAndroid Build Coastguard Worker%endif 3147*c0909341SAndroid Build Coastguard Worker%endif 3148*c0909341SAndroid Build Coastguard Worker add grain_lutq, 82 3149*c0909341SAndroid Build Coastguard Worker dec hw 3150*c0909341SAndroid Build Coastguard Worker%if %3 3151*c0909341SAndroid Build Coastguard Worker jg %%loop_y_h_overlap 3152*c0909341SAndroid Build Coastguard Worker%else 3153*c0909341SAndroid Build Coastguard Worker jle %%end_y_hv_overlap 3154*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32 3155*c0909341SAndroid Build Coastguard Worker mov r5, r5m 3156*c0909341SAndroid Build Coastguard Worker%endif 3157*c0909341SAndroid Build Coastguard Worker mova m3, [PIC_ptr(pb_17_27)] 3158*c0909341SAndroid Build Coastguard Worker btc hd, 16 3159*c0909341SAndroid Build Coastguard Worker jnc %%loop_y_hv_overlap 3160*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64 3161*c0909341SAndroid Build Coastguard Worker mov lstrideq, r10mp 3162*c0909341SAndroid Build Coastguard Worker%endif 3163*c0909341SAndroid Build Coastguard Worker jmp %%loop_y_h_overlap 3164*c0909341SAndroid Build Coastguard 
Worker%%end_y_hv_overlap: 3165*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64 3166*c0909341SAndroid Build Coastguard Worker mov lstrideq, r10mp 3167*c0909341SAndroid Build Coastguard Worker%endif 3168*c0909341SAndroid Build Coastguard Worker%endif 3169*c0909341SAndroid Build Coastguard Worker 3170*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32 3171*c0909341SAndroid Build Coastguard Worker DEFINE_ARGS luma, src, scaling, offxy, w, picptr, grain_lut 3172*c0909341SAndroid Build Coastguard Worker 3173*c0909341SAndroid Build Coastguard Worker mov wq, r4m 3174*c0909341SAndroid Build Coastguard Worker%endif 3175*c0909341SAndroid Build Coastguard Worker add wq, 16 3176*c0909341SAndroid Build Coastguard Worker jge %%end_hv 3177*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32 3178*c0909341SAndroid Build Coastguard Worker mov srcq, r1mp 3179*c0909341SAndroid Build Coastguard Worker mov lumaq, r11mp 3180*c0909341SAndroid Build Coastguard Worker%else 3181*c0909341SAndroid Build Coastguard Worker mov srcq, r11mp 3182*c0909341SAndroid Build Coastguard Worker%endif 3183*c0909341SAndroid Build Coastguard Worker lea lumaq, [luma_bakq+wq*(1+%2)] 3184*c0909341SAndroid Build Coastguard Worker add srcq, wq 3185*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32 3186*c0909341SAndroid Build Coastguard Worker mov r4m, wq 3187*c0909341SAndroid Build Coastguard Worker mov r9m, lumaq 3188*c0909341SAndroid Build Coastguard Worker%endif 3189*c0909341SAndroid Build Coastguard Worker%if %2 3190*c0909341SAndroid Build Coastguard Worker jmp %%loop_x_hv_overlap 3191*c0909341SAndroid Build Coastguard Worker%else 3192*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32 3193*c0909341SAndroid Build Coastguard Worker add dword [rsp+7*mmsize+1*gprsize], 16 3194*c0909341SAndroid Build Coastguard Worker%else 3195*c0909341SAndroid Build Coastguard Worker add top_offxyd, 16 3196*c0909341SAndroid Build Coastguard Worker%endif 3197*c0909341SAndroid Build Coastguard Worker add 
offxyd, 16                  ; operand list of the add whose mnemonic closes the preceding line
    xor       dword r8m, 4  ; toggle bit 2 of the r8m slot
                            ; NOTE(review): the meaning of this bit is set by code outside
                            ; this excerpt; confirm at the jump target below
    jmp %%loop_x_odd_v_overlap
%endif

; Exit reached once the column counter (wq) is no longer negative.
; RET is the x86inc epilogue macro.
%%end_hv:
    RET
%endmacro

    ; The loop body is expanded twice. With a first argument of 1 the index
    ; used for the scaling lookup comes from a weighted { luma, chroma } blend
    ; (pmaddubsw by m14, shift right by 6, add m15, clip); with 0 that blend
    ; is omitted. The second expansion begins at .csfl.
    %%FGUV_32x32xN_LOOP 1, %2, %3
.csfl:
    %%FGUV_32x32xN_LOOP 0, %2, %3
%endmacro

; Instantiate the chroma routines, one per layout tag (420, 422, 444).
; The two 0/1 flags are passed on to the loop macro as its second and third
; arguments (presumably horizontal and vertical chroma subsampling; confirm
; against the FGUV_FN definition).
FGUV_FN 420, 1, 1

; NOTE(review): presumably restores the default r0m..r12m argument accessors
; that the previous expansion redefined when it had to realign the stack by
; hand; confirm against DECLARE_ARG in x86inc.asm.
%if STACK_ALIGNMENT < mmsize
DECLARE_ARG 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12
%endif

FGUV_FN 422, 1, 0

; Same accessor reset as above, ahead of the final instantiation.
%if STACK_ALIGNMENT < mmsize
DECLARE_ARG 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12
%endif

FGUV_FN 444, 0, 0