; Copyright © 2019-2022, VideoLAN and dav1d authors
; Copyright © 2019-2022, Two Orioles, LLC
; All rights reserved.
;
; Redistribution and use in source and binary forms, with or without
; modification, are permitted provided that the following conditions are met:
;
; 1. Redistributions of source code must retain the above copyright notice, this
;    list of conditions and the following disclaimer.
;
; 2. Redistributions in binary form must reproduce the above copyright notice,
;    this list of conditions and the following disclaimer in the documentation
;    and/or other materials provided with the distribution.
;
; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

%include "config.asm"
%include "ext/x86/x86inc.asm"
%include "x86/filmgrain_common.asm"

%if ARCH_X86_64

SECTION_RODATA 32
; Layout note: .ar3 reads gen_shufA and gen_shufC with 32-byte aligned loads
; (mova m8/m9) and gen_shufE with a 32-byte unaligned load (movu m10), so each
; of those loads also picks up the table that follows it. Do not reorder or
; resize the first six tables without revisiting those loads.
pb_mask:            db  0,128,128,  0,128,  0,  0,128,128,  0,  0,128,  0,128,128,  0
gen_shufE:          db  0,  1,  8,  9,  2,  3, 10, 11,  4,  5, 12, 13,  6,  7, 14, 15
gen_shufA:          db  0,  1,  2,  3,  2,  3,  4,  5,  4,  5,  6,  7,  6,  7,  8,  9
gen_shufB:          db  2,  3,  4,  5,  4,  5,  6,  7,  6,  7,  8,  9,  8,  9, 10, 11
gen_shufC:          db  4,  5,  6,  7,  6,  7,  8,  9,  8,  9, 10, 11, 10, 11, 12, 13
gen_shufD:          db  6,  7,  8,  9,  8,  9, 10, 11, 10, 11, 12, 13, 12, 13, 14, 15
; note: the order of (some of) the following constants matters
; (e.g. .ar2 loads 4 bytes from byte_blend+1, so the first byte of
; pb_27_17_17_27 becomes part of that blend mask; its top bit must stay clear)
pb_27_17:           times 2 db 27, 17
byte_blend:         db  0,  0,  0, -1
pb_27_17_17_27:     db 27, 17, 17, 27,  0, 32,  0, 32
pb_17_27:           times 2 db 17, 27
pb_1:               times 4 db 1
pb_23_22:           db 23, 22,  0, 32,  0, 32,  0, 32
next_upperbit_mask: dw 0x100B, 0x2016, 0x402C, 0x8058
pw_seed_xor:        times 2 dw 0xb524
                    times 2 dw 0x49d8
fg_min:             times 4 db 0
                    times 4 db 16
fg_max:             times 4 db 255
                    times 4 db 240
                    times 4 db 235
pd_m65536:          dd -65536
pw_8:               times 2 dw 8
pw_1024:            times 2 dw 1024
hmul_bits:          dw 32768, 16384,  8192,  4096
round:              dw  2048,  1024,   512
mul_bits:           dw   256,   128,    64,    32,    16
round_vals:         dw    32,    64,   128,   256,   512
pw_1:               dw 1

; JMP_TABLE name, isa, suffix...
; Emits the label <name>_8bpc_<isa>_table followed by one 32-bit entry per
; suffix, each holding the offset of the function-local label .ar<suffix>
; relative to the table itself. Callers add the table address back before
; jumping.
%macro JMP_TABLE 2-*
    %1_8bpc_%2_table:
    %xdefine %%base %1_8bpc_%2_table
    %xdefine %%prefix mangle(private_prefix %+ _%1_8bpc_%2)
    %rep %0 - 2
        dd %%prefix %+ .ar%3 - %%base
        %rotate 1
    %endrep
%endmacro

JMP_TABLE generate_grain_y,      avx2, 0, 1, 2, 3
JMP_TABLE generate_grain_uv_420, avx2, 0, 1, 2, 3
JMP_TABLE generate_grain_uv_422, avx2, 0, 1, 2, 3
JMP_TABLE generate_grain_uv_444, avx2, 0, 1, 2, 3

SECTION .text

;-----------------------------------------------------------------------------
; generate_grain_y_8bpc(buf, fg_data)
;
; Phase 1 (.loop): fills buf with noise. Each iteration advances a 16-bit
;   shift-register seed eight times (two batches of four), uses every seed's
;   upper 11 bits (seed >> 5) as an index into the 16-bit gaussian_sequence
;   table, scales by round[grain_scale_shift] via pmulhrsw, saturates to int8
;   and stores 8 bytes.
; Phase 2: dispatches through the jump table on FGData.ar_coeff_lag to
;   .ar0 (nothing to do), .ar1, .ar2 or .ar3, which run the auto-regressive
;   filter in place over 70 rows x 76 columns starting at row 3, column 3
;   of the 82-byte-stride buffer.
;
; In:    buf      (r0) destination grain buffer, stride 82 bytes
;        fg_data  (r1) pointer to FGData
; Regs:  cglobal declares 2 args, 9 GPRs, 8 vector registers; .ar2/.ar3 use
;        more vector registers / stack and set that up themselves.
; NOTE(review): 73*82 is not a multiple of 8, so the last .loop store writes
;   6 bytes past buf+73*82. Presumably the caller's buffer has that slack
;   (e.g. an extra row) - confirm against the C-side declaration.
;-----------------------------------------------------------------------------
INIT_YMM avx2
cglobal generate_grain_y_8bpc, 2, 9, 8, buf, fg_data
%define base r4-generate_grain_y_8bpc_avx2_table
    lea             r4, [generate_grain_y_8bpc_avx2_table]
    vpbroadcastw    xm0, [fg_dataq+FGData.seed]
    mov             r6d, [fg_dataq+FGData.grain_scale_shift]
    movq            xm1, [base+next_upperbit_mask]
    movsxd          r5, [fg_dataq+FGData.ar_coeff_lag]
    movq            xm4, [base+mul_bits]
    movq            xm5, [base+hmul_bits]
    mov             r7, -73*82              ; negative byte counter, counts up to 0
    mova            xm6, [base+pb_mask]
    sub             bufq, r7                ; bufq = buf + 73*82 (end of region)
    vpbroadcastw    xm7, [base+round+r6*2]  ; pmulhrsw factor: >> (4 + grain_scale_shift), rounded
    lea             r6, [gaussian_sequence]
    movsxd          r5, [r4+r5*4]           ; table-relative offset of .ar<lag>
.loop:
    ; --- first batch of 4 seeds ---
    pand            xm2, xm0, xm1
    psrlw           xm3, xm2, 10
    por             xm2, xm3                ; bits 0xf, 0x1e, 0x3c and 0x78 are set
    pmullw          xm2, xm4                ; bits 0x0f00 are set
    pmulhuw         xm0, xm5
    pshufb          xm3, xm6, xm2           ; set 15th bit for next 4 seeds
    psllq           xm2, xm3, 30
    por             xm2, xm3
    psllq           xm3, xm2, 15
    por             xm2, xm0                ; aggregate each bit into next seed's high bit
    por             xm3, xm2                ; 4 next output seeds
    pshuflw         xm0, xm3, q3333         ; last seed becomes input of next batch
    psrlw           xm3, 5                  ; table index = seed >> 5
    ; --- second batch of 4 seeds, interleaved with lookups for the first ---
    pand            xm2, xm0, xm1
    movq            r2, xm3                 ; r2 = 4 packed 16-bit indices
    psrlw           xm3, xm2, 10
    por             xm2, xm3
    pmullw          xm2, xm4
    pmulhuw         xm0, xm5
    movzx           r3d, r2w
    pshufb          xm3, xm6, xm2
    psllq           xm2, xm3, 30
    por             xm2, xm3
    psllq           xm3, xm2, 15
    por             xm0, xm2
    movd            xm2, [r6+r3*2]          ; only word 0 is kept; words 1-7 are overwritten below
    rorx            r3, r2, 32
    por             xm3, xm0
    shr             r2d, 16
    pinsrw          xm2, [r6+r2*2], 1
    pshuflw         xm0, xm3, q3333
    movzx           r2d, r3w
    psrlw           xm3, 5
    pinsrw          xm2, [r6+r2*2], 2
    shr             r3d, 16
    movq            r2, xm3
    pinsrw          xm2, [r6+r3*2], 3
    movzx           r3d, r2w
    pinsrw          xm2, [r6+r3*2], 4
    rorx            r3, r2, 32
    shr             r2d, 16
    pinsrw          xm2, [r6+r2*2], 5
    movzx           r2d, r3w
    pinsrw          xm2, [r6+r2*2], 6
    shr             r3d, 16
    pinsrw          xm2, [r6+r3*2], 7
    pmulhrsw        xm2, xm7
    packsswb        xm2, xm2                ; saturate to int8
    movq            [bufq+r7], xm2
    add             r7, 8
    jl .loop

    ; auto-regression code
    add             r5, r4                  ; table-relative offset -> absolute address
    jmp             r5

; lag 1: out[x] = clip8(cur[x] + ((cf0*top[x-1] + cf1*top[x] + cf2*top[x+1]
;                                   + cf3*left + rnd) >> shift))
; The three top-row taps (+rnd) are computed 4 pixels at a time with SIMD;
; the left tap is a serial dependency and is handled in scalar code.
.ar1:
    DEFINE_ARGS buf, fg_data, cf3, shift, val3, min, max, x, val0
    mov             shiftd, [fg_dataq+FGData.ar_coeff_shift]
    movsx           cf3d, byte [fg_dataq+FGData.ar_coeffs_y+3]
    movd            xm5, [fg_dataq+FGData.ar_coeffs_y]
    mova            xm2, [base+gen_shufC]
    DEFINE_ARGS buf, h, cf3, shift, val3, min, max, x, val0 ; fg_data (r1) is dead from here; reused as h
    pinsrb          xm5, [base+pb_1], 3     ; bytes: cf0, cf1, cf2, 1
    vpbroadcastw    xm3, [base+round_vals+shiftq*2-12]    ; rnd
    pmovsxbw        xm5, xm5
    pshufd          xm4, xm5, q0000         ; (cf0, cf1) word pairs
    pshufd          xm5, xm5, q1111         ; (cf2, 1) word pairs; the 1 multiplies rnd
    sub             bufq, 82*73-(82*3+79)   ; bufq = buf + 82*3 + 79
    mov             hd, 70
    mov             mind, -128
    mov             maxd, 127
.y_loop_ar1:
    mov             xq, -76
    movsx           val3d, byte [bufq+xq-1] ; left neighbour of the first pixel
.x_loop_ar1:
    pmovsxbw        xm1, [bufq+xq-82-3]     ; y=-1,x=[-3,+4]
    pshufb          xm0, xm1, xm2
    punpckhwd       xm1, xm3                ; interleave top[x+1..] with rnd
    pmaddwd         xm0, xm4
    pmaddwd         xm1, xm5
    paddd           xm0, xm1                ; 4 top-row sums, one per dword
.x_loop_ar1_inner:
    movd            val0d, xm0
    psrldq          xm0, 4
    imul            val3d, cf3d
    add             val3d, val0d
    movsx           val0d, byte [bufq+xq]
    sarx            val3d, val3d, shiftd
    add             val3d, val0d
    cmp             val3d, maxd
    cmovns          val3d, maxd             ; clamp to  127
    cmp             val3d, mind
    cmovs           val3d, mind             ; clamp to -128
    mov             [bufq+xq], val3b
    ; keep val3d in-place as left for next x iteration
    inc             xq
    jz .x_loop_ar1_end
    test            xb, 3
    jnz .x_loop_ar1_inner
    jmp .x_loop_ar1
.x_loop_ar1_end:
    add             bufq, 82
    dec             hd
    jg .y_loop_ar1
.ar0:
    RET

; lag 2: 12 coefficients (5 taps on each of the two rows above, 2 to the
; left). Uses xm8-xm15, which cglobal above did not declare.
.ar2:
%if WIN64
    ; NOTE(review): manual save of the extra vector registers on WIN64; the
    ; exact stack bookkeeping depends on the bundled x86inc version - confirm
    ; before changing.
    %assign stack_size_padded 168
    SUB             rsp, stack_size_padded
    WIN64_PUSH_XMM 16, 8
%endif
    DEFINE_ARGS buf, fg_data, h, x
    mov             r6d, [fg_dataq+FGData.ar_coeff_shift]
    pmovsxbw        xm7, [fg_dataq+FGData.ar_coeffs_y+0]    ; cf0-7
    movd            xm9, [fg_dataq+FGData.ar_coeffs_y+8]    ; cf8-11
    vpbroadcastd    xm10, [base+round_vals-14+r6*2]
    movd            xm11, [base+byte_blend+1]               ; blend mask: only byte 2 has its top bit set
    pmovsxbw        xm9, xm9
    pshufd          xm4, xm7, q0000
    mova            xm12, [base+gen_shufA]
    pshufd          xm5, xm7, q3333
    mova            xm13, [base+gen_shufB]
    pshufd          xm6, xm7, q1111
    mova            xm14, [base+gen_shufC]
    pshufd          xm7, xm7, q2222
    mova            xm15, [base+gen_shufD]
    pshufd          xm8, xm9, q0000
    psrld           xm10, 16                ; keep the high word: rnd as a dword
    pshufd          xm9, xm9, q1111
    sub             bufq, 82*73-(82*3+79)   ; bufq = buf + 82*3 + 79
    mov             hd, 70
.y_loop_ar2:
    mov             xq, -76
.x_loop_ar2:
    pmovsxbw        xm0, [bufq+xq-82*2-2]   ; y=-2,x=[-2,+5]
    pmovsxbw        xm1, [bufq+xq-82*1-2]   ; y=-1,x=[-2,+5]
    pshufb          xm2, xm0, xm12
    pmaddwd         xm2, xm4
    pshufb          xm3, xm1, xm13
    pmaddwd         xm3, xm5
    paddd           xm2, xm3
    pshufb          xm3, xm0, xm14
    pmaddwd         xm3, xm6
    punpckhqdq      xm0, xm0
    punpcklwd       xm0, xm1
    pmaddwd         xm0, xm7
    pshufb          xm1, xm15
    pmaddwd         xm1, xm8
    paddd           xm2, xm10
    paddd           xm2, xm3
    paddd           xm0, xm1
    paddd           xm2, xm0                ; xm2 = top-rows sum + rnd for 4 pixels
    movq            xm0, [bufq+xq-2]        ; y=0,x=[-2,+5]
.x_loop_ar2_inner:
    pmovsxbw        xm1, xm0
    pmaddwd         xm3, xm9, xm1
    psrldq          xm1, 4                  ; y=0,x=0
    paddd           xm3, xm2
    psrldq          xm2, 4                  ; shift top to next pixel
    ; 128-bit memory operand: the shift count is the low 64 bits at this
    ; address - TODO confirm the field is declared 64 bits wide
    psrad           xm3, [fg_dataq+FGData.ar_coeff_shift]
    ; don't packssdw since we only care about one value
    paddw           xm3, xm1
    packsswb        xm3, xm3
    pextrb          [bufq+xq], xm3, 0
    pslldq          xm3, 2                  ; move new pixel to byte 2 (x=0 slot)
    vpblendvb       xm0, xm3, xm11          ; feed it back as a left neighbour
    psrldq          xm0, 1                  ; advance the current-row window by one pixel
    inc             xq
    jz .x_loop_ar2_end
    test            xb, 3
    jnz .x_loop_ar2_inner
    jmp .x_loop_ar2
.x_loop_ar2_end:
    add             bufq, 82
    dec             hd
    jg .y_loop_ar2
    RET

; lag 3: 24 coefficients (7 taps on each of the three rows above, 3 to the
; left). Broadcast coefficient pairs are spilled to the stack:
;   [rsp+16*0..7]  pairs from cf0-15 (256-bit entries)
;   [rsp+16*8..9]  pairs from cf16-19
;   [rsp+16*10]    (cf20, 1) - the 1 multiplies the rounding constant
INIT_YMM avx2
.ar3:
%if WIN64
    ALLOC_STACK   16*14
    %assign stack_size stack_size - 16*4
    WIN64_PUSH_XMM 12, 8
%else
    ALLOC_STACK   16*12
%endif
    mov             r6d, [fg_dataq+FGData.ar_coeff_shift]
    movd            xm11, [base+byte_blend]                 ; blend mask: byte 3 only
    pmovsxbw        m1, [fg_dataq+FGData.ar_coeffs_y+ 0]    ; cf0-15
    pmovsxbw        xm2, [fg_dataq+FGData.ar_coeffs_y+16]   ; cf16-23
    pshufd          m0, m1, q0000
    mova            [rsp+16* 0], m0
    pshufd          m0, m1, q1111
    mova            [rsp+16* 2], m0
    pshufd          m0, m1, q2222
    mova            [rsp+16* 4], m0
    pshufd          m1, m1, q3333
    mova            [rsp+16* 6], m1
    pshufd          xm0, xm2, q0000
    mova            [rsp+16* 8], xm0
    pshufd          xm0, xm2, q1111
    mova            [rsp+16* 9], xm0
    psrldq          xm7, xm2, 10            ; words: cf21, cf22, cf23, 0, ...
    mova            m8, [base+gen_shufA]    ; 32-byte load: gen_shufA | gen_shufB
    pinsrw          xm2, [base+pw_1], 5
    mova            m9, [base+gen_shufC]    ; 32-byte load: gen_shufC | gen_shufD
    pshufd          xm2, xm2, q2222         ; (cf20, 1) word pairs
    movu            m10, [base+gen_shufE]   ; 32-byte load: gen_shufE | gen_shufA
    vpbroadcastw    xm6, [base+round_vals-12+r6*2]          ; rnd
    ; word 3 = round_vals[shift-5], i.e. 2*rnd: multiplying the current pixel
    ; by it pre-scales the pixel so it survives the psrad below unchanged
    pinsrw          xm7, [base+round_vals+r6*2-10], 3
    mova            [rsp+16*10], xm2
    DEFINE_ARGS buf, fg_data, h, x
    sub             bufq, 82*73-(82*3+79)   ; bufq = buf + 82*3 + 79
    mov             hd, 70
.y_loop_ar3:
    mov             xq, -76
.x_loop_ar3:
    movu            xm5, [bufq+xq-82*3-3]       ; y=-3,x=[-3,+12]
    vinserti128     m5, [bufq+xq-82*2-3], 1     ; y=-2,x=[-3,+12]
    movu            xm4, [bufq+xq-82*1-3]       ; y=-1,x=[-3,+12]
    punpcklbw       m3, m5, m5
    punpckhwd       m5, m4
    psraw           m3, 8                   ; punpck*bw x,x + psraw 8 = sign-extend bytes to words
    punpcklbw       m5, m5
    psraw           m5, 8
    punpcklbw       xm4, xm4
    psraw           xm4, 8
    pshufb          m0, m3, m8
    pmaddwd         m0, [rsp+16*0]
    pshufb          m1, m3, m9
    pmaddwd         m1, [rsp+16*2]
    shufps          m2, m3, m5, q1032
    paddd           m0, m1
    pshufb          m1, m2, m8
    vperm2i128      m3, m4, 0x21
    pmaddwd         m1, [rsp+16*4]
    shufps          xm2, xm3, q1021
    vpblendd        m2, m3, 0xf0
    pshufb          m2, m10
    paddd           m0, m1
    pmaddwd         m2, [rsp+16*6]
    pshufb          xm1, xm4, xm9
    pmaddwd         xm1, [rsp+16*8]
    shufps          xm4, xm5, q1132
    paddd           m0, m2
    pshufb          xm2, xm4, xm8
    pshufd          xm4, xm4, q2121
    pmaddwd         xm2, [rsp+16*9]
    punpcklwd       xm4, xm6
    pmaddwd         xm4, [rsp+16*10]
    vextracti128    xm3, m0, 1
    paddd           xm0, xm1
    movq            xm1, [bufq+xq-3]        ; y=0,x=[-3,+4]
    paddd           xm2, xm4
    paddd           xm0, xm2
    paddd           xm0, xm3                ; xm0 = top-rows sum + rnd for 4 pixels
.x_loop_ar3_inner:
    pmovsxbw        xm2, xm1
    pmaddwd         xm2, xm7
    pshufd          xm3, xm2, q1111
    paddd           xm2, xm0                ; add top
    paddd           xm2, xm3                ; left+cur
    psrldq          xm0, 4
    ; 128-bit memory operand: the shift count is the low 64 bits at this
    ; address - TODO confirm the field is declared 64 bits wide
    psrad           xm2, [fg_dataq+FGData.ar_coeff_shift]
    ; don't packssdw since we only care about one value
    packsswb        xm2, xm2
    pextrb          [bufq+xq], xm2, 0
    pslldq          xm2, 3                  ; move new pixel to byte 3 (x=0 slot)
    vpblendvb       xm1, xm2, xm11          ; feed it back as a left neighbour
    psrldq          xm1, 1                  ; advance the current-row window by one pixel
    inc             xq
    jz .x_loop_ar3_end
    test            xb, 3
    jnz .x_loop_ar3_inner
    jmp .x_loop_ar3
.x_loop_ar3_end:
    add             bufq, 82
    dec             hd
    jg .y_loop_ar3
    RET

%macro GEN_GRAIN_UV_FN 3 ; ss_name, ss_x, ss_y
INIT_XMM avx2
cglobal generate_grain_uv_%1_8bpc, 4, 10, 16, buf, bufy, fg_data, uv
%define base r4-generate_grain_uv_%1_8bpc_avx2_table
    lea             r4, [generate_grain_uv_%1_8bpc_avx2_table]
    vpbroadcastw    xm0, [fg_dataq+FGData.seed]
    mov             r6d, [fg_dataq+FGData.grain_scale_shift]
    movq            xm1, [base+next_upperbit_mask]
    movq            xm4, [base+mul_bits]
    movq            xm5, [base+hmul_bits]
    mova            xm6, [base+pb_mask]
    vpbroadcastw    xm7, [base+round+r6*2]
vpbroadcastd xm2, [base+pw_seed_xor+uvq*4] 395*c0909341SAndroid Build Coastguard Worker pxor xm0, xm2 396*c0909341SAndroid Build Coastguard Worker lea r6, [gaussian_sequence] 397*c0909341SAndroid Build Coastguard Worker%if %2 398*c0909341SAndroid Build Coastguard Worker mov r7d, 73-35*%3 399*c0909341SAndroid Build Coastguard Worker add bufq, 44 400*c0909341SAndroid Build Coastguard Worker.loop_y: 401*c0909341SAndroid Build Coastguard Worker mov r5, -44 402*c0909341SAndroid Build Coastguard Worker%else 403*c0909341SAndroid Build Coastguard Worker mov r5, -73*82 404*c0909341SAndroid Build Coastguard Worker sub bufq, r5 405*c0909341SAndroid Build Coastguard Worker%endif 406*c0909341SAndroid Build Coastguard Worker.loop: 407*c0909341SAndroid Build Coastguard Worker pand xm2, xm0, xm1 408*c0909341SAndroid Build Coastguard Worker psrlw xm3, xm2, 10 409*c0909341SAndroid Build Coastguard Worker por xm2, xm3 ; bits 0xf, 0x1e, 0x3c and 0x78 are set 410*c0909341SAndroid Build Coastguard Worker pmullw xm2, xm4 ; bits 0x0f00 are set 411*c0909341SAndroid Build Coastguard Worker pmulhuw xm0, xm5 412*c0909341SAndroid Build Coastguard Worker pshufb xm3, xm6, xm2 ; set 15th bit for next 4 seeds 413*c0909341SAndroid Build Coastguard Worker psllq xm2, xm3, 30 414*c0909341SAndroid Build Coastguard Worker por xm2, xm3 415*c0909341SAndroid Build Coastguard Worker psllq xm3, xm2, 15 416*c0909341SAndroid Build Coastguard Worker por xm2, xm0 ; aggregate each bit into next seed's high bit 417*c0909341SAndroid Build Coastguard Worker por xm2, xm3 ; 4 next output seeds 418*c0909341SAndroid Build Coastguard Worker pshuflw xm0, xm2, q3333 419*c0909341SAndroid Build Coastguard Worker psrlw xm2, 5 420*c0909341SAndroid Build Coastguard Worker movq r8, xm2 421*c0909341SAndroid Build Coastguard Worker movzx r9d, r8w 422*c0909341SAndroid Build Coastguard Worker movd xm2, [r6+r9*2] 423*c0909341SAndroid Build Coastguard Worker rorx r9, r8, 32 424*c0909341SAndroid Build Coastguard Worker shr r8d, 16 
425*c0909341SAndroid Build Coastguard Worker pinsrw xm2, [r6+r8*2], 1 426*c0909341SAndroid Build Coastguard Worker movzx r8d, r9w 427*c0909341SAndroid Build Coastguard Worker pinsrw xm2, [r6+r8*2], 2 428*c0909341SAndroid Build Coastguard Worker shr r9d, 16 429*c0909341SAndroid Build Coastguard Worker pinsrw xm2, [r6+r9*2], 3 430*c0909341SAndroid Build Coastguard Worker pmulhrsw xm2, xm7 431*c0909341SAndroid Build Coastguard Worker packsswb xm2, xm2 432*c0909341SAndroid Build Coastguard Worker movd [bufq+r5], xm2 433*c0909341SAndroid Build Coastguard Worker add r5, 4 434*c0909341SAndroid Build Coastguard Worker jl .loop 435*c0909341SAndroid Build Coastguard Worker%if %2 436*c0909341SAndroid Build Coastguard Worker add bufq, 82 437*c0909341SAndroid Build Coastguard Worker dec r7d 438*c0909341SAndroid Build Coastguard Worker jg .loop_y 439*c0909341SAndroid Build Coastguard Worker%endif 440*c0909341SAndroid Build Coastguard Worker 441*c0909341SAndroid Build Coastguard Worker ; auto-regression code 442*c0909341SAndroid Build Coastguard Worker movsxd r6, [fg_dataq+FGData.ar_coeff_lag] 443*c0909341SAndroid Build Coastguard Worker movsxd r6, [base+generate_grain_uv_%1_8bpc_avx2_table+r6*4] 444*c0909341SAndroid Build Coastguard Worker add r6, r4 445*c0909341SAndroid Build Coastguard Worker jmp r6 446*c0909341SAndroid Build Coastguard Worker 447*c0909341SAndroid Build Coastguard WorkerINIT_YMM avx2 448*c0909341SAndroid Build Coastguard Worker.ar0: 449*c0909341SAndroid Build Coastguard Worker DEFINE_ARGS buf, bufy, fg_data, uv, unused, shift 450*c0909341SAndroid Build Coastguard Worker imul uvd, 28 451*c0909341SAndroid Build Coastguard Worker mov shiftd, [fg_dataq+FGData.ar_coeff_shift] 452*c0909341SAndroid Build Coastguard Worker movd xm2, [fg_dataq+FGData.ar_coeffs_uv+uvq] 453*c0909341SAndroid Build Coastguard Worker movd xm3, [base+hmul_bits+shiftq*2] 454*c0909341SAndroid Build Coastguard Worker DEFINE_ARGS buf, bufy, h 455*c0909341SAndroid Build Coastguard Worker pmovsxbw 
xm2, xm2 456*c0909341SAndroid Build Coastguard Worker%if %2 457*c0909341SAndroid Build Coastguard Worker vpbroadcastd m7, [base+pb_1] 458*c0909341SAndroid Build Coastguard Worker vpbroadcastw m6, [base+hmul_bits+2+%3*2] 459*c0909341SAndroid Build Coastguard Worker%endif 460*c0909341SAndroid Build Coastguard Worker vpbroadcastw m2, xm2 461*c0909341SAndroid Build Coastguard Worker vpbroadcastw m3, xm3 462*c0909341SAndroid Build Coastguard Worker pxor m12, m12 463*c0909341SAndroid Build Coastguard Worker%if %2 464*c0909341SAndroid Build Coastguard Worker sub bufq, 82*(73-35*%3)+82-(82*3+41) 465*c0909341SAndroid Build Coastguard Worker%else 466*c0909341SAndroid Build Coastguard Worker sub bufq, 82*70-3 467*c0909341SAndroid Build Coastguard Worker%endif 468*c0909341SAndroid Build Coastguard Worker add bufyq, 3+82*3 469*c0909341SAndroid Build Coastguard Worker mov hd, 70-35*%3 470*c0909341SAndroid Build Coastguard Worker.y_loop_ar0: 471*c0909341SAndroid Build Coastguard Worker%if %2 472*c0909341SAndroid Build Coastguard Worker ; first 32 pixels 473*c0909341SAndroid Build Coastguard Worker movu xm4, [bufyq] 474*c0909341SAndroid Build Coastguard Worker vinserti128 m4, [bufyq+32], 1 475*c0909341SAndroid Build Coastguard Worker%if %3 476*c0909341SAndroid Build Coastguard Worker movu xm0, [bufyq+82] 477*c0909341SAndroid Build Coastguard Worker vinserti128 m0, [bufyq+82+32], 1 478*c0909341SAndroid Build Coastguard Worker%endif 479*c0909341SAndroid Build Coastguard Worker movu xm5, [bufyq+16] 480*c0909341SAndroid Build Coastguard Worker vinserti128 m5, [bufyq+48], 1 481*c0909341SAndroid Build Coastguard Worker%if %3 482*c0909341SAndroid Build Coastguard Worker movu xm1, [bufyq+82+16] 483*c0909341SAndroid Build Coastguard Worker vinserti128 m1, [bufyq+82+48], 1 484*c0909341SAndroid Build Coastguard Worker%endif 485*c0909341SAndroid Build Coastguard Worker pmaddubsw m4, m7, m4 486*c0909341SAndroid Build Coastguard Worker%if %3 487*c0909341SAndroid Build Coastguard Worker 
pmaddubsw m0, m7, m0 488*c0909341SAndroid Build Coastguard Worker%endif 489*c0909341SAndroid Build Coastguard Worker pmaddubsw m5, m7, m5 490*c0909341SAndroid Build Coastguard Worker%if %3 491*c0909341SAndroid Build Coastguard Worker pmaddubsw m1, m7, m1 492*c0909341SAndroid Build Coastguard Worker paddw m4, m0 493*c0909341SAndroid Build Coastguard Worker paddw m5, m1 494*c0909341SAndroid Build Coastguard Worker%endif 495*c0909341SAndroid Build Coastguard Worker pmulhrsw m4, m6 496*c0909341SAndroid Build Coastguard Worker pmulhrsw m5, m6 497*c0909341SAndroid Build Coastguard Worker%else 498*c0909341SAndroid Build Coastguard Worker xor r3d, r3d 499*c0909341SAndroid Build Coastguard Worker ; first 32x2 pixels 500*c0909341SAndroid Build Coastguard Worker.x_loop_ar0: 501*c0909341SAndroid Build Coastguard Worker movu m4, [bufyq+r3] 502*c0909341SAndroid Build Coastguard Worker pcmpgtb m0, m12, m4 503*c0909341SAndroid Build Coastguard Worker punpckhbw m5, m4, m0 504*c0909341SAndroid Build Coastguard Worker punpcklbw m4, m0 505*c0909341SAndroid Build Coastguard Worker%endif 506*c0909341SAndroid Build Coastguard Worker pmullw m4, m2 507*c0909341SAndroid Build Coastguard Worker pmullw m5, m2 508*c0909341SAndroid Build Coastguard Worker pmulhrsw m4, m3 509*c0909341SAndroid Build Coastguard Worker pmulhrsw m5, m3 510*c0909341SAndroid Build Coastguard Worker%if %2 511*c0909341SAndroid Build Coastguard Worker movu m1, [bufq] 512*c0909341SAndroid Build Coastguard Worker%else 513*c0909341SAndroid Build Coastguard Worker movu m1, [bufq+r3] 514*c0909341SAndroid Build Coastguard Worker%endif 515*c0909341SAndroid Build Coastguard Worker pcmpgtb m8, m12, m1 516*c0909341SAndroid Build Coastguard Worker punpcklbw m0, m1, m8 517*c0909341SAndroid Build Coastguard Worker punpckhbw m1, m8 518*c0909341SAndroid Build Coastguard Worker paddw m0, m4 519*c0909341SAndroid Build Coastguard Worker paddw m1, m5 520*c0909341SAndroid Build Coastguard Worker packsswb m0, m1 521*c0909341SAndroid Build 
Coastguard Worker%if %2 522*c0909341SAndroid Build Coastguard Worker movu [bufq], m0 523*c0909341SAndroid Build Coastguard Worker%else 524*c0909341SAndroid Build Coastguard Worker movu [bufq+r3], m0 525*c0909341SAndroid Build Coastguard Worker add r3d, 32 526*c0909341SAndroid Build Coastguard Worker cmp r3d, 64 527*c0909341SAndroid Build Coastguard Worker jl .x_loop_ar0 528*c0909341SAndroid Build Coastguard Worker%endif 529*c0909341SAndroid Build Coastguard Worker 530*c0909341SAndroid Build Coastguard Worker ; last 6/12 pixels 531*c0909341SAndroid Build Coastguard Worker movu xm4, [bufyq+32*2] 532*c0909341SAndroid Build Coastguard Worker%if %2 533*c0909341SAndroid Build Coastguard Worker%if %3 534*c0909341SAndroid Build Coastguard Worker movu xm5, [bufyq+32*2+82] 535*c0909341SAndroid Build Coastguard Worker%endif 536*c0909341SAndroid Build Coastguard Worker pmaddubsw xm4, xm7, xm4 537*c0909341SAndroid Build Coastguard Worker%if %3 538*c0909341SAndroid Build Coastguard Worker pmaddubsw xm5, xm7, xm5 539*c0909341SAndroid Build Coastguard Worker paddw xm4, xm5 540*c0909341SAndroid Build Coastguard Worker%endif 541*c0909341SAndroid Build Coastguard Worker movq xm0, [bufq+32] 542*c0909341SAndroid Build Coastguard Worker pmulhrsw xm4, xm6 543*c0909341SAndroid Build Coastguard Worker pmullw xm4, xm2 544*c0909341SAndroid Build Coastguard Worker pmulhrsw xm4, xm3 545*c0909341SAndroid Build Coastguard Worker pcmpgtb xm5, xm12, xm0 546*c0909341SAndroid Build Coastguard Worker punpcklbw xm5, xm0, xm5 547*c0909341SAndroid Build Coastguard Worker paddw xm4, xm5 548*c0909341SAndroid Build Coastguard Worker packsswb xm4, xm4 549*c0909341SAndroid Build Coastguard Worker pblendw xm0, xm4, xm0, 1000b 550*c0909341SAndroid Build Coastguard Worker movq [bufq+32], xm0 551*c0909341SAndroid Build Coastguard Worker%else 552*c0909341SAndroid Build Coastguard Worker movu xm0, [bufq+64] 553*c0909341SAndroid Build Coastguard Worker pcmpgtb xm1, xm12, xm4 554*c0909341SAndroid Build Coastguard 
Worker punpckhbw xm5, xm4, xm1 555*c0909341SAndroid Build Coastguard Worker punpcklbw xm4, xm1 556*c0909341SAndroid Build Coastguard Worker pmullw xm5, xm2 557*c0909341SAndroid Build Coastguard Worker pmullw xm4, xm2 558*c0909341SAndroid Build Coastguard Worker vpblendd xm1, xm3, xm12, 0x0c 559*c0909341SAndroid Build Coastguard Worker pmulhrsw xm5, xm1 560*c0909341SAndroid Build Coastguard Worker pmulhrsw xm4, xm3 561*c0909341SAndroid Build Coastguard Worker pcmpgtb xm1, xm12, xm0 562*c0909341SAndroid Build Coastguard Worker punpckhbw xm8, xm0, xm1 563*c0909341SAndroid Build Coastguard Worker punpcklbw xm0, xm1 564*c0909341SAndroid Build Coastguard Worker paddw xm5, xm8 565*c0909341SAndroid Build Coastguard Worker paddw xm0, xm4 566*c0909341SAndroid Build Coastguard Worker packsswb xm0, xm5 567*c0909341SAndroid Build Coastguard Worker movu [bufq+64], xm0 568*c0909341SAndroid Build Coastguard Worker%endif 569*c0909341SAndroid Build Coastguard Worker add bufq, 82 570*c0909341SAndroid Build Coastguard Worker add bufyq, 82<<%3 571*c0909341SAndroid Build Coastguard Worker dec hd 572*c0909341SAndroid Build Coastguard Worker jg .y_loop_ar0 573*c0909341SAndroid Build Coastguard Worker RET 574*c0909341SAndroid Build Coastguard Worker 575*c0909341SAndroid Build Coastguard WorkerINIT_XMM avx2 576*c0909341SAndroid Build Coastguard Worker.ar1: 577*c0909341SAndroid Build Coastguard Worker DEFINE_ARGS buf, bufy, fg_data, uv, val3, cf3, min, max, x, shift 578*c0909341SAndroid Build Coastguard Worker imul uvd, 28 579*c0909341SAndroid Build Coastguard Worker mov shiftd, [fg_dataq+FGData.ar_coeff_shift] 580*c0909341SAndroid Build Coastguard Worker movsx cf3d, byte [fg_dataq+FGData.ar_coeffs_uv+uvq+3] 581*c0909341SAndroid Build Coastguard Worker movd xm4, [fg_dataq+FGData.ar_coeffs_uv+uvq] 582*c0909341SAndroid Build Coastguard Worker pinsrb xm4, [fg_dataq+FGData.ar_coeffs_uv+uvq+4], 3 583*c0909341SAndroid Build Coastguard Worker DEFINE_ARGS buf, bufy, h, val0, val3, cf3, min, max, x, 
shift 584*c0909341SAndroid Build Coastguard Worker pmovsxbw xm4, xm4 585*c0909341SAndroid Build Coastguard Worker pshufd xm5, xm4, q1111 586*c0909341SAndroid Build Coastguard Worker pshufd xm4, xm4, q0000 587*c0909341SAndroid Build Coastguard Worker pmovsxwd xm3, [base+round_vals+shiftq*2-12] ; rnd 588*c0909341SAndroid Build Coastguard Worker%if %2 589*c0909341SAndroid Build Coastguard Worker vpbroadcastd xm7, [base+pb_1] 590*c0909341SAndroid Build Coastguard Worker vpbroadcastw xm6, [base+hmul_bits+2+%3*2] 591*c0909341SAndroid Build Coastguard Worker%endif 592*c0909341SAndroid Build Coastguard Worker vpbroadcastd xm3, xm3 593*c0909341SAndroid Build Coastguard Worker%if %2 594*c0909341SAndroid Build Coastguard Worker sub bufq, 82*(73-35*%3)+44-(82*3+41) 595*c0909341SAndroid Build Coastguard Worker%else 596*c0909341SAndroid Build Coastguard Worker sub bufq, 82*70-(82-3) 597*c0909341SAndroid Build Coastguard Worker%endif 598*c0909341SAndroid Build Coastguard Worker add bufyq, 79+82*3 599*c0909341SAndroid Build Coastguard Worker mov hd, 70-35*%3 600*c0909341SAndroid Build Coastguard Worker mov mind, -128 601*c0909341SAndroid Build Coastguard Worker mov maxd, 127 602*c0909341SAndroid Build Coastguard Worker.y_loop_ar1: 603*c0909341SAndroid Build Coastguard Worker mov xq, -(76>>%2) 604*c0909341SAndroid Build Coastguard Worker movsx val3d, byte [bufq+xq-1] 605*c0909341SAndroid Build Coastguard Worker.x_loop_ar1: 606*c0909341SAndroid Build Coastguard Worker pmovsxbw xm0, [bufq+xq-82-1] ; top/left 607*c0909341SAndroid Build Coastguard Worker%if %2 608*c0909341SAndroid Build Coastguard Worker movq xm8, [bufyq+xq*2] 609*c0909341SAndroid Build Coastguard Worker%if %3 610*c0909341SAndroid Build Coastguard Worker movq xm9, [bufyq+xq*2+82] 611*c0909341SAndroid Build Coastguard Worker%endif 612*c0909341SAndroid Build Coastguard Worker%endif 613*c0909341SAndroid Build Coastguard Worker psrldq xm2, xm0, 2 ; top 614*c0909341SAndroid Build Coastguard Worker psrldq xm1, xm0, 4 ; 
top/right 615*c0909341SAndroid Build Coastguard Worker%if %2 616*c0909341SAndroid Build Coastguard Worker pmaddubsw xm8, xm7, xm8 617*c0909341SAndroid Build Coastguard Worker%if %3 618*c0909341SAndroid Build Coastguard Worker pmaddubsw xm9, xm7, xm9 619*c0909341SAndroid Build Coastguard Worker paddw xm8, xm9 620*c0909341SAndroid Build Coastguard Worker%endif 621*c0909341SAndroid Build Coastguard Worker pmulhrsw xm8, xm6 622*c0909341SAndroid Build Coastguard Worker%else 623*c0909341SAndroid Build Coastguard Worker pmovsxbw xm8, [bufyq+xq] 624*c0909341SAndroid Build Coastguard Worker%endif 625*c0909341SAndroid Build Coastguard Worker punpcklwd xm0, xm2 626*c0909341SAndroid Build Coastguard Worker punpcklwd xm1, xm8 627*c0909341SAndroid Build Coastguard Worker pmaddwd xm0, xm4 628*c0909341SAndroid Build Coastguard Worker pmaddwd xm1, xm5 629*c0909341SAndroid Build Coastguard Worker paddd xm0, xm1 630*c0909341SAndroid Build Coastguard Worker paddd xm0, xm3 631*c0909341SAndroid Build Coastguard Worker.x_loop_ar1_inner: 632*c0909341SAndroid Build Coastguard Worker movd val0d, xm0 633*c0909341SAndroid Build Coastguard Worker psrldq xm0, 4 634*c0909341SAndroid Build Coastguard Worker imul val3d, cf3d 635*c0909341SAndroid Build Coastguard Worker add val3d, val0d 636*c0909341SAndroid Build Coastguard Worker sarx val3d, val3d, shiftd 637*c0909341SAndroid Build Coastguard Worker movsx val0d, byte [bufq+xq] 638*c0909341SAndroid Build Coastguard Worker add val3d, val0d 639*c0909341SAndroid Build Coastguard Worker cmp val3d, maxd 640*c0909341SAndroid Build Coastguard Worker cmovns val3d, maxd 641*c0909341SAndroid Build Coastguard Worker cmp val3d, mind 642*c0909341SAndroid Build Coastguard Worker cmovs val3d, mind 643*c0909341SAndroid Build Coastguard Worker mov byte [bufq+xq], val3b 644*c0909341SAndroid Build Coastguard Worker ; keep val3d in-place as left for next x iteration 645*c0909341SAndroid Build Coastguard Worker inc xq 646*c0909341SAndroid Build Coastguard Worker jz 
.x_loop_ar1_end 647*c0909341SAndroid Build Coastguard Worker test xq, 3 648*c0909341SAndroid Build Coastguard Worker jnz .x_loop_ar1_inner 649*c0909341SAndroid Build Coastguard Worker jmp .x_loop_ar1 650*c0909341SAndroid Build Coastguard Worker 651*c0909341SAndroid Build Coastguard Worker.x_loop_ar1_end: 652*c0909341SAndroid Build Coastguard Worker add bufq, 82 653*c0909341SAndroid Build Coastguard Worker add bufyq, 82<<%3 654*c0909341SAndroid Build Coastguard Worker dec hd 655*c0909341SAndroid Build Coastguard Worker jg .y_loop_ar1 656*c0909341SAndroid Build Coastguard Worker RET 657*c0909341SAndroid Build Coastguard Worker 658*c0909341SAndroid Build Coastguard Worker.ar2: 659*c0909341SAndroid Build Coastguard Worker DEFINE_ARGS buf, bufy, fg_data, uv, unused, shift 660*c0909341SAndroid Build Coastguard Worker mov shiftd, [fg_dataq+FGData.ar_coeff_shift] 661*c0909341SAndroid Build Coastguard Worker imul uvd, 28 662*c0909341SAndroid Build Coastguard Worker vpbroadcastw xm13, [base+round_vals-12+shiftq*2] 663*c0909341SAndroid Build Coastguard Worker pmovsxbw xm7, [fg_dataq+FGData.ar_coeffs_uv+uvq+0] ; cf0-7 664*c0909341SAndroid Build Coastguard Worker pmovsxbw xm0, [fg_dataq+FGData.ar_coeffs_uv+uvq+8] ; cf8-12 665*c0909341SAndroid Build Coastguard Worker pinsrw xm0, [base+pw_1], 5 666*c0909341SAndroid Build Coastguard Worker%if %2 667*c0909341SAndroid Build Coastguard Worker vpbroadcastw xm12, [base+hmul_bits+2+%3*2] 668*c0909341SAndroid Build Coastguard Worker vpbroadcastd xm11, [base+pb_1] 669*c0909341SAndroid Build Coastguard Worker%endif 670*c0909341SAndroid Build Coastguard Worker DEFINE_ARGS buf, bufy, fg_data, h, unused, x 671*c0909341SAndroid Build Coastguard Worker pshufd xm4, xm7, q0000 672*c0909341SAndroid Build Coastguard Worker pshufd xm5, xm7, q3333 673*c0909341SAndroid Build Coastguard Worker pshufd xm6, xm7, q1111 674*c0909341SAndroid Build Coastguard Worker pshufd xm7, xm7, q2222 675*c0909341SAndroid Build Coastguard Worker pshufd xm8, xm0, q0000 
676*c0909341SAndroid Build Coastguard Worker pshufd xm9, xm0, q1111 677*c0909341SAndroid Build Coastguard Worker pshufd xm10, xm0, q2222 678*c0909341SAndroid Build Coastguard Worker%if %2 679*c0909341SAndroid Build Coastguard Worker sub bufq, 82*(73-35*%3)+44-(82*3+41) 680*c0909341SAndroid Build Coastguard Worker%else 681*c0909341SAndroid Build Coastguard Worker sub bufq, 82*70-(82-3) 682*c0909341SAndroid Build Coastguard Worker%endif 683*c0909341SAndroid Build Coastguard Worker add bufyq, 79+82*3 684*c0909341SAndroid Build Coastguard Worker mov hd, 70-35*%3 685*c0909341SAndroid Build Coastguard Worker.y_loop_ar2: 686*c0909341SAndroid Build Coastguard Worker mov xq, -(76>>%2) 687*c0909341SAndroid Build Coastguard Worker 688*c0909341SAndroid Build Coastguard Worker.x_loop_ar2: 689*c0909341SAndroid Build Coastguard Worker pmovsxbw xm0, [bufq+xq-82*2-2] ; y=-2,x=[-2,+5] 690*c0909341SAndroid Build Coastguard Worker pmovsxbw xm1, [bufq+xq-82*1-2] ; y=-1,x=[-2,+5] 691*c0909341SAndroid Build Coastguard Worker pshufb xm2, xm0, [base+gen_shufA] 692*c0909341SAndroid Build Coastguard Worker pmaddwd xm2, xm4 693*c0909341SAndroid Build Coastguard Worker pshufb xm3, xm1, [base+gen_shufB] 694*c0909341SAndroid Build Coastguard Worker pmaddwd xm3, xm5 695*c0909341SAndroid Build Coastguard Worker paddd xm2, xm3 696*c0909341SAndroid Build Coastguard Worker pshufb xm3, xm0, [base+gen_shufC] 697*c0909341SAndroid Build Coastguard Worker pmaddwd xm3, xm6 698*c0909341SAndroid Build Coastguard Worker punpckhqdq xm0, xm0 ; y=-2,x=[+2,+5] 699*c0909341SAndroid Build Coastguard Worker punpcklwd xm0, xm1 700*c0909341SAndroid Build Coastguard Worker pmaddwd xm0, xm7 701*c0909341SAndroid Build Coastguard Worker pshufb xm1, [gen_shufD] 702*c0909341SAndroid Build Coastguard Worker pmaddwd xm1, xm8 703*c0909341SAndroid Build Coastguard Worker paddd xm2, xm3 704*c0909341SAndroid Build Coastguard Worker paddd xm0, xm1 705*c0909341SAndroid Build Coastguard Worker paddd xm2, xm0 706*c0909341SAndroid 
Build Coastguard Worker 707*c0909341SAndroid Build Coastguard Worker%if %2 708*c0909341SAndroid Build Coastguard Worker movq xm0, [bufyq+xq*2] 709*c0909341SAndroid Build Coastguard Worker%if %3 710*c0909341SAndroid Build Coastguard Worker movq xm3, [bufyq+xq*2+82] 711*c0909341SAndroid Build Coastguard Worker%endif 712*c0909341SAndroid Build Coastguard Worker pmaddubsw xm0, xm11, xm0 713*c0909341SAndroid Build Coastguard Worker%if %3 714*c0909341SAndroid Build Coastguard Worker pmaddubsw xm3, xm11, xm3 715*c0909341SAndroid Build Coastguard Worker paddw xm0, xm3 716*c0909341SAndroid Build Coastguard Worker%endif 717*c0909341SAndroid Build Coastguard Worker pmulhrsw xm0, xm12 718*c0909341SAndroid Build Coastguard Worker%else 719*c0909341SAndroid Build Coastguard Worker pmovsxbw xm0, [bufyq+xq] 720*c0909341SAndroid Build Coastguard Worker%endif 721*c0909341SAndroid Build Coastguard Worker punpcklwd xm0, xm13 722*c0909341SAndroid Build Coastguard Worker pmaddwd xm0, xm10 723*c0909341SAndroid Build Coastguard Worker paddd xm2, xm0 724*c0909341SAndroid Build Coastguard Worker 725*c0909341SAndroid Build Coastguard Worker movq xm0, [bufq+xq-2] ; y=0,x=[-2,+5] 726*c0909341SAndroid Build Coastguard Worker.x_loop_ar2_inner: 727*c0909341SAndroid Build Coastguard Worker pmovsxbw xm0, xm0 728*c0909341SAndroid Build Coastguard Worker pmaddwd xm3, xm0, xm9 729*c0909341SAndroid Build Coastguard Worker psrldq xm0, 2 730*c0909341SAndroid Build Coastguard Worker paddd xm3, xm2 731*c0909341SAndroid Build Coastguard Worker psrldq xm2, 4 ; shift top to next pixel 732*c0909341SAndroid Build Coastguard Worker psrad xm3, [fg_dataq+FGData.ar_coeff_shift] 733*c0909341SAndroid Build Coastguard Worker pslldq xm3, 2 734*c0909341SAndroid Build Coastguard Worker paddw xm3, xm0 735*c0909341SAndroid Build Coastguard Worker pblendw xm0, xm3, 00000010b 736*c0909341SAndroid Build Coastguard Worker packsswb xm0, xm0 737*c0909341SAndroid Build Coastguard Worker pextrb [bufq+xq], xm0, 1 
738*c0909341SAndroid Build Coastguard Worker inc xq 739*c0909341SAndroid Build Coastguard Worker jz .x_loop_ar2_end 740*c0909341SAndroid Build Coastguard Worker test xb, 3 741*c0909341SAndroid Build Coastguard Worker jnz .x_loop_ar2_inner 742*c0909341SAndroid Build Coastguard Worker jmp .x_loop_ar2 743*c0909341SAndroid Build Coastguard Worker 744*c0909341SAndroid Build Coastguard Worker.x_loop_ar2_end: 745*c0909341SAndroid Build Coastguard Worker add bufq, 82 746*c0909341SAndroid Build Coastguard Worker add bufyq, 82<<%3 747*c0909341SAndroid Build Coastguard Worker dec hd 748*c0909341SAndroid Build Coastguard Worker jg .y_loop_ar2 749*c0909341SAndroid Build Coastguard Worker RET 750*c0909341SAndroid Build Coastguard Worker 751*c0909341SAndroid Build Coastguard WorkerINIT_YMM avx2 752*c0909341SAndroid Build Coastguard Worker.ar3: 753*c0909341SAndroid Build Coastguard Worker DEFINE_ARGS buf, bufy, fg_data, uv, unused, shift 754*c0909341SAndroid Build Coastguard Worker mov shiftd, [fg_dataq+FGData.ar_coeff_shift] 755*c0909341SAndroid Build Coastguard Worker imul uvd, 28 756*c0909341SAndroid Build Coastguard Worker pmovsxbw m0, [fg_dataq+FGData.ar_coeffs_uv+uvq+ 0] ; cf0-15 757*c0909341SAndroid Build Coastguard Worker pmovsxbw xm1, [fg_dataq+FGData.ar_coeffs_uv+uvq+16] ; cf16-23 758*c0909341SAndroid Build Coastguard Worker vpbroadcastb xm2, [fg_dataq+FGData.ar_coeffs_uv+uvq+24] ; cf24 [luma] 759*c0909341SAndroid Build Coastguard Worker movd xm13, [base+round_vals-10+shiftq*2] 760*c0909341SAndroid Build Coastguard Worker vpbroadcastd xm14, [base+round_vals-14+shiftq*2] 761*c0909341SAndroid Build Coastguard Worker pshufd m6, m0, q0000 762*c0909341SAndroid Build Coastguard Worker pshufd m7, m0, q1111 763*c0909341SAndroid Build Coastguard Worker pshufd m8, m0, q2222 764*c0909341SAndroid Build Coastguard Worker pshufd m9, m0, q3333 765*c0909341SAndroid Build Coastguard Worker pshufd xm10, xm1, q0000 766*c0909341SAndroid Build Coastguard Worker pshufd xm11, xm1, q1111 
767*c0909341SAndroid Build Coastguard Worker pshufhw xm12, xm1, q0000 768*c0909341SAndroid Build Coastguard Worker psraw xm2, 8 769*c0909341SAndroid Build Coastguard Worker palignr xm13, xm1, 10 770*c0909341SAndroid Build Coastguard Worker punpckhwd xm12, xm2 ; interleave luma cf 771*c0909341SAndroid Build Coastguard Worker psrld xm14, 16 772*c0909341SAndroid Build Coastguard Worker DEFINE_ARGS buf, bufy, fg_data, h, unused, x 773*c0909341SAndroid Build Coastguard Worker%if %2 774*c0909341SAndroid Build Coastguard Worker vpbroadcastw xm15, [base+hmul_bits+2+%3*2] 775*c0909341SAndroid Build Coastguard Worker sub bufq, 82*(73-35*%3)+44-(82*3+41) 776*c0909341SAndroid Build Coastguard Worker%else 777*c0909341SAndroid Build Coastguard Worker sub bufq, 82*70-(82-3) 778*c0909341SAndroid Build Coastguard Worker%endif 779*c0909341SAndroid Build Coastguard Worker add bufyq, 79+82*3 780*c0909341SAndroid Build Coastguard Worker mov hd, 70-35*%3 781*c0909341SAndroid Build Coastguard Worker.y_loop_ar3: 782*c0909341SAndroid Build Coastguard Worker mov xq, -(76>>%2) 783*c0909341SAndroid Build Coastguard Worker.x_loop_ar3: 784*c0909341SAndroid Build Coastguard Worker vbroadcasti128 m3, [bufq+xq-82*2-3] ; y=-2,x=[-3,+12] 785*c0909341SAndroid Build Coastguard Worker palignr xm1, xm3, [bufq+xq-82*3-9], 6 ; y=-3,x=[-3,+12] 786*c0909341SAndroid Build Coastguard Worker vbroadcasti128 m4, [bufq+xq-82*1-3] ; y=-1,x=[-3,+12] 787*c0909341SAndroid Build Coastguard Worker vpblendd m3, m1, 0x0f 788*c0909341SAndroid Build Coastguard Worker pxor m0, m0 789*c0909341SAndroid Build Coastguard Worker pcmpgtb m2, m0, m3 790*c0909341SAndroid Build Coastguard Worker pcmpgtb m0, m4 791*c0909341SAndroid Build Coastguard Worker punpcklbw m1, m3, m2 792*c0909341SAndroid Build Coastguard Worker punpckhbw m3, m2 793*c0909341SAndroid Build Coastguard Worker punpcklbw m2, m4, m0 794*c0909341SAndroid Build Coastguard Worker punpckhbw xm4, xm0 795*c0909341SAndroid Build Coastguard Worker pshufb m0, m1, 
[base+gen_shufA] 796*c0909341SAndroid Build Coastguard Worker pmaddwd m0, m6 797*c0909341SAndroid Build Coastguard Worker pshufb m5, m1, [base+gen_shufC] 798*c0909341SAndroid Build Coastguard Worker pmaddwd m5, m7 799*c0909341SAndroid Build Coastguard Worker shufps m1, m3, q1032 800*c0909341SAndroid Build Coastguard Worker paddd m0, m5 801*c0909341SAndroid Build Coastguard Worker pshufb m5, m1, [base+gen_shufA] 802*c0909341SAndroid Build Coastguard Worker pmaddwd m5, m8 803*c0909341SAndroid Build Coastguard Worker shufps xm1, xm3, q2121 804*c0909341SAndroid Build Coastguard Worker vpblendd m1, m2, 0xf0 805*c0909341SAndroid Build Coastguard Worker pshufb m1, [base+gen_shufE] 806*c0909341SAndroid Build Coastguard Worker pmaddwd m1, m9 807*c0909341SAndroid Build Coastguard Worker paddd m0, m5 808*c0909341SAndroid Build Coastguard Worker pshufb xm3, xm2, [base+gen_shufC] 809*c0909341SAndroid Build Coastguard Worker paddd m0, m1 810*c0909341SAndroid Build Coastguard Worker pmaddwd xm3, xm10 811*c0909341SAndroid Build Coastguard Worker palignr xm1, xm4, xm2, 2 812*c0909341SAndroid Build Coastguard Worker punpckhwd xm1, xm2, xm1 813*c0909341SAndroid Build Coastguard Worker pmaddwd xm1, xm11 814*c0909341SAndroid Build Coastguard Worker palignr xm4, xm2, 12 815*c0909341SAndroid Build Coastguard Worker paddd xm3, xm1 816*c0909341SAndroid Build Coastguard Worker%if %2 817*c0909341SAndroid Build Coastguard Worker vpbroadcastd xm5, [base+pb_1] 818*c0909341SAndroid Build Coastguard Worker movq xm1, [bufyq+xq*2] 819*c0909341SAndroid Build Coastguard Worker pmaddubsw xm1, xm5, xm1 820*c0909341SAndroid Build Coastguard Worker%if %3 821*c0909341SAndroid Build Coastguard Worker movq xm2, [bufyq+xq*2+82] 822*c0909341SAndroid Build Coastguard Worker pmaddubsw xm5, xm2 823*c0909341SAndroid Build Coastguard Worker paddw xm1, xm5 824*c0909341SAndroid Build Coastguard Worker%endif 825*c0909341SAndroid Build Coastguard Worker pmulhrsw xm1, xm15 826*c0909341SAndroid Build Coastguard 
Worker%else 827*c0909341SAndroid Build Coastguard Worker pmovsxbw xm1, [bufyq+xq] 828*c0909341SAndroid Build Coastguard Worker%endif 829*c0909341SAndroid Build Coastguard Worker punpcklwd xm4, xm1 830*c0909341SAndroid Build Coastguard Worker pmaddwd xm4, xm12 831*c0909341SAndroid Build Coastguard Worker movq xm1, [bufq+xq-3] ; y=0,x=[-3,+4] 832*c0909341SAndroid Build Coastguard Worker vextracti128 xm2, m0, 1 833*c0909341SAndroid Build Coastguard Worker paddd xm0, xm14 834*c0909341SAndroid Build Coastguard Worker paddd xm3, xm4 835*c0909341SAndroid Build Coastguard Worker paddd xm0, xm3 836*c0909341SAndroid Build Coastguard Worker paddd xm0, xm2 837*c0909341SAndroid Build Coastguard Worker.x_loop_ar3_inner: 838*c0909341SAndroid Build Coastguard Worker pmovsxbw xm1, xm1 839*c0909341SAndroid Build Coastguard Worker pmaddwd xm2, xm13, xm1 840*c0909341SAndroid Build Coastguard Worker pshuflw xm3, xm2, q1032 841*c0909341SAndroid Build Coastguard Worker paddd xm2, xm0 ; add top 842*c0909341SAndroid Build Coastguard Worker paddd xm2, xm3 ; left+cur 843*c0909341SAndroid Build Coastguard Worker psrldq xm0, 4 844*c0909341SAndroid Build Coastguard Worker psrad xm2, [fg_dataq+FGData.ar_coeff_shift] 845*c0909341SAndroid Build Coastguard Worker psrldq xm1, 2 846*c0909341SAndroid Build Coastguard Worker ; don't packssdw, we only care about one value 847*c0909341SAndroid Build Coastguard Worker punpckldq xm2, xm2 848*c0909341SAndroid Build Coastguard Worker pblendw xm1, xm2, 0100b 849*c0909341SAndroid Build Coastguard Worker packsswb xm1, xm1 850*c0909341SAndroid Build Coastguard Worker pextrb [bufq+xq], xm1, 2 851*c0909341SAndroid Build Coastguard Worker inc xq 852*c0909341SAndroid Build Coastguard Worker jz .x_loop_ar3_end 853*c0909341SAndroid Build Coastguard Worker test xb, 3 854*c0909341SAndroid Build Coastguard Worker jnz .x_loop_ar3_inner 855*c0909341SAndroid Build Coastguard Worker jmp .x_loop_ar3 856*c0909341SAndroid Build Coastguard Worker.x_loop_ar3_end: 
857*c0909341SAndroid Build Coastguard Worker add bufq, 82 858*c0909341SAndroid Build Coastguard Worker add bufyq, 82<<%3 859*c0909341SAndroid Build Coastguard Worker dec hd 860*c0909341SAndroid Build Coastguard Worker jg .y_loop_ar3 861*c0909341SAndroid Build Coastguard Worker RET 862*c0909341SAndroid Build Coastguard Worker%endmacro 863*c0909341SAndroid Build Coastguard Worker 864*c0909341SAndroid Build Coastguard WorkerINIT_YMM avx2 865*c0909341SAndroid Build Coastguard Workercglobal fgy_32x32xn_8bpc, 6, 13, 15, dst, src, stride, fg_data, w, scaling, \ 866*c0909341SAndroid Build Coastguard Worker grain_lut, h, sby, see, overlap 867*c0909341SAndroid Build Coastguard Worker%define base r9-pd_m65536 868*c0909341SAndroid Build Coastguard Worker lea r9, [pd_m65536] 869*c0909341SAndroid Build Coastguard Worker mov r6d, [fg_dataq+FGData.scaling_shift] 870*c0909341SAndroid Build Coastguard Worker mov r7d, [fg_dataq+FGData.clip_to_restricted_range] 871*c0909341SAndroid Build Coastguard Worker mov sbyd, sbym 872*c0909341SAndroid Build Coastguard Worker mov overlapd, [fg_dataq+FGData.overlap_flag] 873*c0909341SAndroid Build Coastguard Worker vpbroadcastd m8, [base+pd_m65536] 874*c0909341SAndroid Build Coastguard Worker vpbroadcastw m9, [base+mul_bits+r6*2-14] 875*c0909341SAndroid Build Coastguard Worker vpbroadcastd m10, [base+fg_min+r7*4] 876*c0909341SAndroid Build Coastguard Worker vpbroadcastd m11, [base+fg_max+r7*8] 877*c0909341SAndroid Build Coastguard Worker vpbroadcastd m12, [base+pw_1024] 878*c0909341SAndroid Build Coastguard Worker movq xm13, [base+pb_27_17_17_27] 879*c0909341SAndroid Build Coastguard Worker test sbyd, sbyd 880*c0909341SAndroid Build Coastguard Worker setnz r7b 881*c0909341SAndroid Build Coastguard Worker pxor m7, m7 882*c0909341SAndroid Build Coastguard Worker test r7b, overlapb 883*c0909341SAndroid Build Coastguard Worker jnz .vertical_overlap 884*c0909341SAndroid Build Coastguard Worker 885*c0909341SAndroid Build Coastguard Worker imul seed, 
sbyd, (173 << 24) | 37 886*c0909341SAndroid Build Coastguard Worker add seed, (105 << 24) | 178 887*c0909341SAndroid Build Coastguard Worker rorx seed, seed, 24 888*c0909341SAndroid Build Coastguard Worker movzx seed, seew 889*c0909341SAndroid Build Coastguard Worker xor seed, [fg_dataq+FGData.seed] 890*c0909341SAndroid Build Coastguard Worker 891*c0909341SAndroid Build Coastguard Worker DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \ 892*c0909341SAndroid Build Coastguard Worker offx, offy, see, overlap 893*c0909341SAndroid Build Coastguard Worker 894*c0909341SAndroid Build Coastguard Worker lea src_bakq, [srcq+wq] 895*c0909341SAndroid Build Coastguard Worker neg wq 896*c0909341SAndroid Build Coastguard Worker sub dstq, srcq 897*c0909341SAndroid Build Coastguard Worker 898*c0909341SAndroid Build Coastguard Worker.loop_x: 899*c0909341SAndroid Build Coastguard Worker rorx r6, seeq, 1 900*c0909341SAndroid Build Coastguard Worker or seed, 0xEFF4 901*c0909341SAndroid Build Coastguard Worker test seeb, seeh 902*c0909341SAndroid Build Coastguard Worker lea seed, [r6+0x8000] 903*c0909341SAndroid Build Coastguard Worker cmovp seed, r6d ; updated seed 904*c0909341SAndroid Build Coastguard Worker 905*c0909341SAndroid Build Coastguard Worker rorx offyd, seed, 8 906*c0909341SAndroid Build Coastguard Worker rorx offxq, seeq, 12 907*c0909341SAndroid Build Coastguard Worker and offyd, 0xf 908*c0909341SAndroid Build Coastguard Worker imul offyd, 164 909*c0909341SAndroid Build Coastguard Worker lea offyd, [offyq+offxq*2+747] ; offy*stride+offx 910*c0909341SAndroid Build Coastguard Worker 911*c0909341SAndroid Build Coastguard Worker DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \ 912*c0909341SAndroid Build Coastguard Worker h, offxy, see, overlap 913*c0909341SAndroid Build Coastguard Worker 914*c0909341SAndroid Build Coastguard Worker mov hd, hm 915*c0909341SAndroid Build Coastguard Worker mov grain_lutq, grain_lutmp 916*c0909341SAndroid Build 
Coastguard Worker.loop_y: 917*c0909341SAndroid Build Coastguard Worker ; src 918*c0909341SAndroid Build Coastguard Worker mova m2, [srcq] 919*c0909341SAndroid Build Coastguard Worker punpcklbw m0, m2, m7 920*c0909341SAndroid Build Coastguard Worker punpckhbw m1, m2, m7 921*c0909341SAndroid Build Coastguard Worker 922*c0909341SAndroid Build Coastguard Worker ; scaling[src] 923*c0909341SAndroid Build Coastguard Worker pandn m4, m8, m0 924*c0909341SAndroid Build Coastguard Worker mova m6, m8 925*c0909341SAndroid Build Coastguard Worker vpgatherdd m2, [scalingq+m4-0], m8 926*c0909341SAndroid Build Coastguard Worker psrld m3, m0, 16 927*c0909341SAndroid Build Coastguard Worker mova m8, m6 928*c0909341SAndroid Build Coastguard Worker vpgatherdd m4, [scalingq+m3-2], m6 929*c0909341SAndroid Build Coastguard Worker pandn m5, m8, m1 930*c0909341SAndroid Build Coastguard Worker mova m6, m8 931*c0909341SAndroid Build Coastguard Worker vpgatherdd m3, [scalingq+m5-0], m8 932*c0909341SAndroid Build Coastguard Worker pblendw m2, m4, 0xaa 933*c0909341SAndroid Build Coastguard Worker psrld m4, m1, 16 934*c0909341SAndroid Build Coastguard Worker mova m8, m6 935*c0909341SAndroid Build Coastguard Worker vpgatherdd m5, [scalingq+m4-2], m6 936*c0909341SAndroid Build Coastguard Worker pblendw m3, m5, 0xaa 937*c0909341SAndroid Build Coastguard Worker 938*c0909341SAndroid Build Coastguard Worker ; grain = grain_lut[offy+y][offx+x] 939*c0909341SAndroid Build Coastguard Worker movu m5, [grain_lutq+offxyq] 940*c0909341SAndroid Build Coastguard Worker punpcklbw m4, m5, m7 941*c0909341SAndroid Build Coastguard Worker punpckhbw m5, m7 942*c0909341SAndroid Build Coastguard Worker 943*c0909341SAndroid Build Coastguard Worker ; noise = round2(scaling[src] * grain, scaling_shift) 944*c0909341SAndroid Build Coastguard Worker pmaddubsw m2, m4 945*c0909341SAndroid Build Coastguard Worker pmaddubsw m3, m5 946*c0909341SAndroid Build Coastguard Worker pmulhrsw m2, m9 947*c0909341SAndroid Build Coastguard 
Worker pmulhrsw m3, m9 948*c0909341SAndroid Build Coastguard Worker 949*c0909341SAndroid Build Coastguard Worker ; dst = clip_pixel(src, noise) 950*c0909341SAndroid Build Coastguard Worker paddw m0, m2 951*c0909341SAndroid Build Coastguard Worker paddw m1, m3 952*c0909341SAndroid Build Coastguard Worker packuswb m0, m1 953*c0909341SAndroid Build Coastguard Worker pmaxub m0, m10 954*c0909341SAndroid Build Coastguard Worker pminub m0, m11 955*c0909341SAndroid Build Coastguard Worker mova [dstq+srcq], m0 956*c0909341SAndroid Build Coastguard Worker 957*c0909341SAndroid Build Coastguard Worker add srcq, strideq 958*c0909341SAndroid Build Coastguard Worker add grain_lutq, 82 959*c0909341SAndroid Build Coastguard Worker dec hd 960*c0909341SAndroid Build Coastguard Worker jg .loop_y 961*c0909341SAndroid Build Coastguard Worker 962*c0909341SAndroid Build Coastguard Worker add wq, 32 963*c0909341SAndroid Build Coastguard Worker jge .end 964*c0909341SAndroid Build Coastguard Worker lea srcq, [src_bakq+wq] 965*c0909341SAndroid Build Coastguard Worker test overlapd, overlapd 966*c0909341SAndroid Build Coastguard Worker jz .loop_x 967*c0909341SAndroid Build Coastguard Worker 968*c0909341SAndroid Build Coastguard Worker ; r8m = sbym 969*c0909341SAndroid Build Coastguard Worker cmp dword r8m, 0 970*c0909341SAndroid Build Coastguard Worker jne .loop_x_hv_overlap 971*c0909341SAndroid Build Coastguard Worker 972*c0909341SAndroid Build Coastguard Worker ; horizontal overlap (without vertical overlap) 973*c0909341SAndroid Build Coastguard Worker.loop_x_h_overlap: 974*c0909341SAndroid Build Coastguard Worker rorx r6, seeq, 1 975*c0909341SAndroid Build Coastguard Worker or seed, 0xEFF4 976*c0909341SAndroid Build Coastguard Worker test seeb, seeh 977*c0909341SAndroid Build Coastguard Worker lea seed, [r6+0x8000] 978*c0909341SAndroid Build Coastguard Worker cmovp seed, r6d ; updated seed 979*c0909341SAndroid Build Coastguard Worker 980*c0909341SAndroid Build Coastguard Worker DEFINE_ARGS 
dst, src, stride, src_bak, w, scaling, grain_lut, \ 981*c0909341SAndroid Build Coastguard Worker offx, offy, see, left_offxy 982*c0909341SAndroid Build Coastguard Worker 983*c0909341SAndroid Build Coastguard Worker lea left_offxyd, [offyq+32] ; previous column's offy*stride+offx 984*c0909341SAndroid Build Coastguard Worker rorx offyd, seed, 8 985*c0909341SAndroid Build Coastguard Worker rorx offxq, seeq, 12 986*c0909341SAndroid Build Coastguard Worker and offyd, 0xf 987*c0909341SAndroid Build Coastguard Worker imul offyd, 164 988*c0909341SAndroid Build Coastguard Worker lea offyd, [offyq+offxq*2+747] ; offy*stride+offx 989*c0909341SAndroid Build Coastguard Worker 990*c0909341SAndroid Build Coastguard Worker DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \ 991*c0909341SAndroid Build Coastguard Worker h, offxy, see, left_offxy 992*c0909341SAndroid Build Coastguard Worker 993*c0909341SAndroid Build Coastguard Worker mov grain_lutq, grain_lutmp 994*c0909341SAndroid Build Coastguard Worker mov hd, hm 995*c0909341SAndroid Build Coastguard Worker.loop_y_h_overlap: 996*c0909341SAndroid Build Coastguard Worker ; src 997*c0909341SAndroid Build Coastguard Worker mova m2, [srcq] 998*c0909341SAndroid Build Coastguard Worker punpcklbw m0, m2, m7 999*c0909341SAndroid Build Coastguard Worker punpckhbw m1, m2, m7 1000*c0909341SAndroid Build Coastguard Worker 1001*c0909341SAndroid Build Coastguard Worker ; scaling[src] 1002*c0909341SAndroid Build Coastguard Worker pandn m4, m8, m0 1003*c0909341SAndroid Build Coastguard Worker mova m6, m8 1004*c0909341SAndroid Build Coastguard Worker vpgatherdd m2, [scalingq+m4-0], m8 1005*c0909341SAndroid Build Coastguard Worker psrld m3, m0, 16 1006*c0909341SAndroid Build Coastguard Worker mova m8, m6 1007*c0909341SAndroid Build Coastguard Worker vpgatherdd m4, [scalingq+m3-2], m6 1008*c0909341SAndroid Build Coastguard Worker pandn m5, m8, m1 1009*c0909341SAndroid Build Coastguard Worker mova m6, m8 1010*c0909341SAndroid Build 
Coastguard Worker vpgatherdd m3, [scalingq+m5-0], m8 1011*c0909341SAndroid Build Coastguard Worker pblendw m2, m4, 0xaa 1012*c0909341SAndroid Build Coastguard Worker psrld m4, m1, 16 1013*c0909341SAndroid Build Coastguard Worker mova m8, m6 1014*c0909341SAndroid Build Coastguard Worker vpgatherdd m5, [scalingq+m4-2], m6 1015*c0909341SAndroid Build Coastguard Worker pblendw m3, m5, 0xaa 1016*c0909341SAndroid Build Coastguard Worker 1017*c0909341SAndroid Build Coastguard Worker ; grain = grain_lut[offy+y][offx+x] 1018*c0909341SAndroid Build Coastguard Worker movu m5, [grain_lutq+offxyq] 1019*c0909341SAndroid Build Coastguard Worker movd xm4, [grain_lutq+left_offxyq] 1020*c0909341SAndroid Build Coastguard Worker punpcklbw xm4, xm5 1021*c0909341SAndroid Build Coastguard Worker pmaddubsw xm4, xm13, xm4 1022*c0909341SAndroid Build Coastguard Worker pmulhrsw xm4, xm12 1023*c0909341SAndroid Build Coastguard Worker packsswb xm4, xm4 1024*c0909341SAndroid Build Coastguard Worker vpblendd m4, m5, 0xfe 1025*c0909341SAndroid Build Coastguard Worker punpckhbw m5, m7 1026*c0909341SAndroid Build Coastguard Worker punpcklbw m4, m7 1027*c0909341SAndroid Build Coastguard Worker 1028*c0909341SAndroid Build Coastguard Worker ; noise = round2(scaling[src] * grain, scaling_shift) 1029*c0909341SAndroid Build Coastguard Worker pmaddubsw m2, m4 1030*c0909341SAndroid Build Coastguard Worker pmaddubsw m3, m5 1031*c0909341SAndroid Build Coastguard Worker pmulhrsw m2, m9 1032*c0909341SAndroid Build Coastguard Worker pmulhrsw m3, m9 1033*c0909341SAndroid Build Coastguard Worker 1034*c0909341SAndroid Build Coastguard Worker ; dst = clip_pixel(src, noise) 1035*c0909341SAndroid Build Coastguard Worker paddw m0, m2 1036*c0909341SAndroid Build Coastguard Worker paddw m1, m3 1037*c0909341SAndroid Build Coastguard Worker packuswb m0, m1 1038*c0909341SAndroid Build Coastguard Worker pmaxub m0, m10 1039*c0909341SAndroid Build Coastguard Worker pminub m0, m11 1040*c0909341SAndroid Build Coastguard Worker 
mova [dstq+srcq], m0 1041*c0909341SAndroid Build Coastguard Worker 1042*c0909341SAndroid Build Coastguard Worker add srcq, strideq 1043*c0909341SAndroid Build Coastguard Worker add grain_lutq, 82 1044*c0909341SAndroid Build Coastguard Worker dec hd 1045*c0909341SAndroid Build Coastguard Worker jg .loop_y_h_overlap 1046*c0909341SAndroid Build Coastguard Worker 1047*c0909341SAndroid Build Coastguard Worker add wq, 32 1048*c0909341SAndroid Build Coastguard Worker jge .end 1049*c0909341SAndroid Build Coastguard Worker lea srcq, [src_bakq+wq] 1050*c0909341SAndroid Build Coastguard Worker 1051*c0909341SAndroid Build Coastguard Worker ; r8m = sbym 1052*c0909341SAndroid Build Coastguard Worker cmp dword r8m, 0 1053*c0909341SAndroid Build Coastguard Worker jne .loop_x_hv_overlap 1054*c0909341SAndroid Build Coastguard Worker jmp .loop_x_h_overlap 1055*c0909341SAndroid Build Coastguard Worker 1056*c0909341SAndroid Build Coastguard Worker.vertical_overlap: 1057*c0909341SAndroid Build Coastguard Worker DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ 1058*c0909341SAndroid Build Coastguard Worker unused, sby, see, overlap 1059*c0909341SAndroid Build Coastguard Worker 1060*c0909341SAndroid Build Coastguard Worker movzx sbyd, sbyb 1061*c0909341SAndroid Build Coastguard Worker imul seed, [fg_dataq+FGData.seed], 0x00010001 1062*c0909341SAndroid Build Coastguard Worker imul r7d, sbyd, 173 * 0x00010001 1063*c0909341SAndroid Build Coastguard Worker imul sbyd, 37 * 0x01000100 1064*c0909341SAndroid Build Coastguard Worker add r7d, (105 << 16) | 188 1065*c0909341SAndroid Build Coastguard Worker add sbyd, (178 << 24) | (141 << 8) 1066*c0909341SAndroid Build Coastguard Worker and r7d, 0x00ff00ff 1067*c0909341SAndroid Build Coastguard Worker and sbyd, 0xff00ff00 1068*c0909341SAndroid Build Coastguard Worker xor seed, r7d 1069*c0909341SAndroid Build Coastguard Worker xor seed, sbyd ; (cur_seed << 16) | top_seed 1070*c0909341SAndroid Build Coastguard Worker 1071*c0909341SAndroid 
Build Coastguard Worker DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \ 1072*c0909341SAndroid Build Coastguard Worker offx, offy, see, overlap 1073*c0909341SAndroid Build Coastguard Worker 1074*c0909341SAndroid Build Coastguard Worker lea src_bakq, [srcq+wq] 1075*c0909341SAndroid Build Coastguard Worker neg wq 1076*c0909341SAndroid Build Coastguard Worker sub dstq, srcq 1077*c0909341SAndroid Build Coastguard Worker 1078*c0909341SAndroid Build Coastguard Worker.loop_x_v_overlap: 1079*c0909341SAndroid Build Coastguard Worker vpbroadcastd m14, [pb_27_17] 1080*c0909341SAndroid Build Coastguard Worker 1081*c0909341SAndroid Build Coastguard Worker ; we assume from the block above that bits 8-15 of r7d are zero'ed 1082*c0909341SAndroid Build Coastguard Worker mov r6d, seed 1083*c0909341SAndroid Build Coastguard Worker or seed, 0xeff4eff4 1084*c0909341SAndroid Build Coastguard Worker test seeb, seeh 1085*c0909341SAndroid Build Coastguard Worker setp r7b ; parity of top_seed 1086*c0909341SAndroid Build Coastguard Worker shr seed, 16 1087*c0909341SAndroid Build Coastguard Worker shl r7d, 16 1088*c0909341SAndroid Build Coastguard Worker test seeb, seeh 1089*c0909341SAndroid Build Coastguard Worker setp r7b ; parity of cur_seed 1090*c0909341SAndroid Build Coastguard Worker or r6d, 0x00010001 1091*c0909341SAndroid Build Coastguard Worker xor r7d, r6d 1092*c0909341SAndroid Build Coastguard Worker rorx seed, r7d, 1 ; updated (cur_seed << 16) | top_seed 1093*c0909341SAndroid Build Coastguard Worker 1094*c0909341SAndroid Build Coastguard Worker rorx offyd, seed, 8 1095*c0909341SAndroid Build Coastguard Worker rorx offxd, seed, 12 1096*c0909341SAndroid Build Coastguard Worker and offyd, 0xf000f 1097*c0909341SAndroid Build Coastguard Worker and offxd, 0xf000f 1098*c0909341SAndroid Build Coastguard Worker imul offyd, 164 1099*c0909341SAndroid Build Coastguard Worker ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy 1100*c0909341SAndroid Build Coastguard Worker 
lea offyd, [offyq+offxq*2+0x10001*747+32*82] 1101*c0909341SAndroid Build Coastguard Worker 1102*c0909341SAndroid Build Coastguard Worker DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \ 1103*c0909341SAndroid Build Coastguard Worker h, offxy, see, overlap, top_offxy 1104*c0909341SAndroid Build Coastguard Worker 1105*c0909341SAndroid Build Coastguard Worker mov grain_lutq, grain_lutmp 1106*c0909341SAndroid Build Coastguard Worker mov hd, hm 1107*c0909341SAndroid Build Coastguard Worker movzx top_offxyd, offxyw 1108*c0909341SAndroid Build Coastguard Worker shr offxyd, 16 1109*c0909341SAndroid Build Coastguard Worker.loop_y_v_overlap: 1110*c0909341SAndroid Build Coastguard Worker ; src 1111*c0909341SAndroid Build Coastguard Worker mova m2, [srcq] 1112*c0909341SAndroid Build Coastguard Worker punpcklbw m0, m2, m7 1113*c0909341SAndroid Build Coastguard Worker punpckhbw m1, m2, m7 1114*c0909341SAndroid Build Coastguard Worker 1115*c0909341SAndroid Build Coastguard Worker ; scaling[src] 1116*c0909341SAndroid Build Coastguard Worker pandn m4, m8, m0 1117*c0909341SAndroid Build Coastguard Worker mova m6, m8 1118*c0909341SAndroid Build Coastguard Worker vpgatherdd m2, [scalingq+m4-0], m8 1119*c0909341SAndroid Build Coastguard Worker psrld m3, m0, 16 1120*c0909341SAndroid Build Coastguard Worker mova m8, m6 1121*c0909341SAndroid Build Coastguard Worker vpgatherdd m4, [scalingq+m3-2], m6 1122*c0909341SAndroid Build Coastguard Worker pandn m5, m8, m1 1123*c0909341SAndroid Build Coastguard Worker mova m6, m8 1124*c0909341SAndroid Build Coastguard Worker vpgatherdd m3, [scalingq+m5-0], m8 1125*c0909341SAndroid Build Coastguard Worker pblendw m2, m4, 0xaa 1126*c0909341SAndroid Build Coastguard Worker psrld m4, m1, 16 1127*c0909341SAndroid Build Coastguard Worker mova m8, m6 1128*c0909341SAndroid Build Coastguard Worker vpgatherdd m5, [scalingq+m4-2], m6 1129*c0909341SAndroid Build Coastguard Worker pblendw m3, m5, 0xaa 1130*c0909341SAndroid Build Coastguard Worker 
1131*c0909341SAndroid Build Coastguard Worker ; grain = grain_lut[offy+y][offx+x] 1132*c0909341SAndroid Build Coastguard Worker movu m6, [grain_lutq+offxyq] 1133*c0909341SAndroid Build Coastguard Worker movu m4, [grain_lutq+top_offxyq] 1134*c0909341SAndroid Build Coastguard Worker punpcklbw m5, m4, m6 1135*c0909341SAndroid Build Coastguard Worker punpckhbw m4, m6 1136*c0909341SAndroid Build Coastguard Worker pmaddubsw m5, m14, m5 1137*c0909341SAndroid Build Coastguard Worker pmaddubsw m4, m14, m4 1138*c0909341SAndroid Build Coastguard Worker pmulhrsw m5, m12 1139*c0909341SAndroid Build Coastguard Worker pmulhrsw m4, m12 1140*c0909341SAndroid Build Coastguard Worker packsswb m5, m4 1141*c0909341SAndroid Build Coastguard Worker punpcklbw m4, m5, m7 1142*c0909341SAndroid Build Coastguard Worker punpckhbw m5, m7 1143*c0909341SAndroid Build Coastguard Worker 1144*c0909341SAndroid Build Coastguard Worker ; noise = round2(scaling[src] * grain, scaling_shift) 1145*c0909341SAndroid Build Coastguard Worker pmaddubsw m2, m4 1146*c0909341SAndroid Build Coastguard Worker pmaddubsw m3, m5 1147*c0909341SAndroid Build Coastguard Worker pmulhrsw m2, m9 1148*c0909341SAndroid Build Coastguard Worker pmulhrsw m3, m9 1149*c0909341SAndroid Build Coastguard Worker 1150*c0909341SAndroid Build Coastguard Worker ; dst = clip_pixel(src, noise) 1151*c0909341SAndroid Build Coastguard Worker paddw m0, m2 1152*c0909341SAndroid Build Coastguard Worker paddw m1, m3 1153*c0909341SAndroid Build Coastguard Worker packuswb m0, m1 1154*c0909341SAndroid Build Coastguard Worker pmaxub m0, m10 1155*c0909341SAndroid Build Coastguard Worker pminub m0, m11 1156*c0909341SAndroid Build Coastguard Worker mova [dstq+srcq], m0 1157*c0909341SAndroid Build Coastguard Worker 1158*c0909341SAndroid Build Coastguard Worker add srcq, strideq 1159*c0909341SAndroid Build Coastguard Worker add grain_lutq, 82 1160*c0909341SAndroid Build Coastguard Worker dec hb 1161*c0909341SAndroid Build Coastguard Worker jz 
.end_y_v_overlap 1162*c0909341SAndroid Build Coastguard Worker vpbroadcastd m14, [pb_17_27] ; swap weights for second v-overlap line 1163*c0909341SAndroid Build Coastguard Worker ; 2 lines get vertical overlap, then fall back to non-overlap code for 1164*c0909341SAndroid Build Coastguard Worker ; remaining (up to) 30 lines 1165*c0909341SAndroid Build Coastguard Worker add hd, 0x80000000 1166*c0909341SAndroid Build Coastguard Worker jnc .loop_y_v_overlap 1167*c0909341SAndroid Build Coastguard Worker jmp .loop_y 1168*c0909341SAndroid Build Coastguard Worker.end_y_v_overlap: 1169*c0909341SAndroid Build Coastguard Worker add wq, 32 1170*c0909341SAndroid Build Coastguard Worker jge .end 1171*c0909341SAndroid Build Coastguard Worker lea srcq, [src_bakq+wq] 1172*c0909341SAndroid Build Coastguard Worker 1173*c0909341SAndroid Build Coastguard Worker ; since fg_dataq.overlap is guaranteed to be set, we never jump 1174*c0909341SAndroid Build Coastguard Worker ; back to .loop_x_v_overlap, and instead always fall-through to 1175*c0909341SAndroid Build Coastguard Worker ; h+v overlap 1176*c0909341SAndroid Build Coastguard Worker.loop_x_hv_overlap: 1177*c0909341SAndroid Build Coastguard Worker vpbroadcastd m14, [pb_27_17] 1178*c0909341SAndroid Build Coastguard Worker 1179*c0909341SAndroid Build Coastguard Worker ; we assume from the block above that bits 8-15 of r7d are zero'ed 1180*c0909341SAndroid Build Coastguard Worker mov r6d, seed 1181*c0909341SAndroid Build Coastguard Worker or seed, 0xeff4eff4 1182*c0909341SAndroid Build Coastguard Worker test seeb, seeh 1183*c0909341SAndroid Build Coastguard Worker setp r7b ; parity of top_seed 1184*c0909341SAndroid Build Coastguard Worker shr seed, 16 1185*c0909341SAndroid Build Coastguard Worker shl r7d, 16 1186*c0909341SAndroid Build Coastguard Worker test seeb, seeh 1187*c0909341SAndroid Build Coastguard Worker setp r7b ; parity of cur_seed 1188*c0909341SAndroid Build Coastguard Worker or r6d, 0x00010001 1189*c0909341SAndroid Build 
Coastguard Worker xor r7d, r6d 1190*c0909341SAndroid Build Coastguard Worker rorx seed, r7d, 1 ; updated (cur_seed << 16) | top_seed 1191*c0909341SAndroid Build Coastguard Worker 1192*c0909341SAndroid Build Coastguard Worker DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \ 1193*c0909341SAndroid Build Coastguard Worker offx, offy, see, left_offxy, top_offxy, topleft_offxy 1194*c0909341SAndroid Build Coastguard Worker 1195*c0909341SAndroid Build Coastguard Worker lea topleft_offxyd, [top_offxyq+32] 1196*c0909341SAndroid Build Coastguard Worker lea left_offxyd, [offyq+32] 1197*c0909341SAndroid Build Coastguard Worker rorx offyd, seed, 8 1198*c0909341SAndroid Build Coastguard Worker rorx offxd, seed, 12 1199*c0909341SAndroid Build Coastguard Worker and offyd, 0xf000f 1200*c0909341SAndroid Build Coastguard Worker and offxd, 0xf000f 1201*c0909341SAndroid Build Coastguard Worker imul offyd, 164 1202*c0909341SAndroid Build Coastguard Worker ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy 1203*c0909341SAndroid Build Coastguard Worker lea offyd, [offyq+offxq*2+0x10001*747+32*82] 1204*c0909341SAndroid Build Coastguard Worker 1205*c0909341SAndroid Build Coastguard Worker DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \ 1206*c0909341SAndroid Build Coastguard Worker h, offxy, see, left_offxy, top_offxy, topleft_offxy 1207*c0909341SAndroid Build Coastguard Worker 1208*c0909341SAndroid Build Coastguard Worker mov grain_lutq, grain_lutmp 1209*c0909341SAndroid Build Coastguard Worker mov hd, hm 1210*c0909341SAndroid Build Coastguard Worker movzx top_offxyd, offxyw 1211*c0909341SAndroid Build Coastguard Worker shr offxyd, 16 1212*c0909341SAndroid Build Coastguard Worker.loop_y_hv_overlap: 1213*c0909341SAndroid Build Coastguard Worker ; src 1214*c0909341SAndroid Build Coastguard Worker mova m2, [srcq] 1215*c0909341SAndroid Build Coastguard Worker punpcklbw m0, m2, m7 1216*c0909341SAndroid Build Coastguard Worker punpckhbw m1, m2, m7 
1217*c0909341SAndroid Build Coastguard Worker 1218*c0909341SAndroid Build Coastguard Worker ; scaling[src] 1219*c0909341SAndroid Build Coastguard Worker pandn m4, m8, m0 1220*c0909341SAndroid Build Coastguard Worker mova m6, m8 1221*c0909341SAndroid Build Coastguard Worker vpgatherdd m2, [scalingq+m4-0], m8 1222*c0909341SAndroid Build Coastguard Worker psrld m3, m0, 16 1223*c0909341SAndroid Build Coastguard Worker mova m8, m6 1224*c0909341SAndroid Build Coastguard Worker vpgatherdd m4, [scalingq+m3-2], m6 1225*c0909341SAndroid Build Coastguard Worker pandn m5, m8, m1 1226*c0909341SAndroid Build Coastguard Worker mova m6, m8 1227*c0909341SAndroid Build Coastguard Worker vpgatherdd m3, [scalingq+m5-0], m8 1228*c0909341SAndroid Build Coastguard Worker pblendw m2, m4, 0xaa 1229*c0909341SAndroid Build Coastguard Worker psrld m4, m1, 16 1230*c0909341SAndroid Build Coastguard Worker mova m8, m6 1231*c0909341SAndroid Build Coastguard Worker vpgatherdd m5, [scalingq+m4-2], m6 1232*c0909341SAndroid Build Coastguard Worker pblendw m3, m5, 0xaa 1233*c0909341SAndroid Build Coastguard Worker 1234*c0909341SAndroid Build Coastguard Worker ; grain = grain_lut[offy+y][offx+x] 1235*c0909341SAndroid Build Coastguard Worker movu m6, [grain_lutq+offxyq] 1236*c0909341SAndroid Build Coastguard Worker movd xm7, [grain_lutq+left_offxyq] 1237*c0909341SAndroid Build Coastguard Worker movu m4, [grain_lutq+top_offxyq] 1238*c0909341SAndroid Build Coastguard Worker movd xm5, [grain_lutq+topleft_offxyq] 1239*c0909341SAndroid Build Coastguard Worker ; do h interpolation first (so top | top/left -> top, left | cur -> cur) 1240*c0909341SAndroid Build Coastguard Worker punpcklbw xm7, xm6 1241*c0909341SAndroid Build Coastguard Worker punpcklbw xm5, xm4 1242*c0909341SAndroid Build Coastguard Worker pmaddubsw xm7, xm13, xm7 1243*c0909341SAndroid Build Coastguard Worker pmaddubsw xm5, xm13, xm5 1244*c0909341SAndroid Build Coastguard Worker pmulhrsw xm7, xm12 1245*c0909341SAndroid Build Coastguard Worker 
pmulhrsw xm5, xm12 1246*c0909341SAndroid Build Coastguard Worker packsswb xm7, xm7 1247*c0909341SAndroid Build Coastguard Worker packsswb xm5, xm5 1248*c0909341SAndroid Build Coastguard Worker vpblendd m7, m6, 0xfe 1249*c0909341SAndroid Build Coastguard Worker vpblendd m5, m4, 0xfe 1250*c0909341SAndroid Build Coastguard Worker ; followed by v interpolation (top | cur -> cur) 1251*c0909341SAndroid Build Coastguard Worker punpckhbw m4, m6 1252*c0909341SAndroid Build Coastguard Worker punpcklbw m5, m7 1253*c0909341SAndroid Build Coastguard Worker pmaddubsw m4, m14, m4 1254*c0909341SAndroid Build Coastguard Worker pmaddubsw m5, m14, m5 1255*c0909341SAndroid Build Coastguard Worker pmulhrsw m4, m12 1256*c0909341SAndroid Build Coastguard Worker pmulhrsw m5, m12 1257*c0909341SAndroid Build Coastguard Worker pxor m7, m7 1258*c0909341SAndroid Build Coastguard Worker packsswb m5, m4 1259*c0909341SAndroid Build Coastguard Worker punpcklbw m4, m5, m7 1260*c0909341SAndroid Build Coastguard Worker punpckhbw m5, m7 1261*c0909341SAndroid Build Coastguard Worker 1262*c0909341SAndroid Build Coastguard Worker ; noise = round2(scaling[src] * grain, scaling_shift) 1263*c0909341SAndroid Build Coastguard Worker pmaddubsw m2, m4 1264*c0909341SAndroid Build Coastguard Worker pmaddubsw m3, m5 1265*c0909341SAndroid Build Coastguard Worker pmulhrsw m2, m9 1266*c0909341SAndroid Build Coastguard Worker pmulhrsw m3, m9 1267*c0909341SAndroid Build Coastguard Worker 1268*c0909341SAndroid Build Coastguard Worker ; dst = clip_pixel(src, noise) 1269*c0909341SAndroid Build Coastguard Worker paddw m0, m2 1270*c0909341SAndroid Build Coastguard Worker paddw m1, m3 1271*c0909341SAndroid Build Coastguard Worker packuswb m0, m1 1272*c0909341SAndroid Build Coastguard Worker pmaxub m0, m10 1273*c0909341SAndroid Build Coastguard Worker pminub m0, m11 1274*c0909341SAndroid Build Coastguard Worker mova [dstq+srcq], m0 1275*c0909341SAndroid Build Coastguard Worker 1276*c0909341SAndroid Build Coastguard Worker add 
srcq, strideq 1277*c0909341SAndroid Build Coastguard Worker add grain_lutq, 82 1278*c0909341SAndroid Build Coastguard Worker dec hb 1279*c0909341SAndroid Build Coastguard Worker jz .end_y_hv_overlap 1280*c0909341SAndroid Build Coastguard Worker vpbroadcastd m14, [pb_17_27] ; swap weights for second v-overlap line 1281*c0909341SAndroid Build Coastguard Worker ; 2 lines get vertical overlap, then fall back to non-overlap code for 1282*c0909341SAndroid Build Coastguard Worker ; remaining (up to) 30 lines 1283*c0909341SAndroid Build Coastguard Worker add hd, 0x80000000 1284*c0909341SAndroid Build Coastguard Worker jnc .loop_y_hv_overlap 1285*c0909341SAndroid Build Coastguard Worker jmp .loop_y_h_overlap 1286*c0909341SAndroid Build Coastguard Worker.end_y_hv_overlap: 1287*c0909341SAndroid Build Coastguard Worker add wq, 32 1288*c0909341SAndroid Build Coastguard Worker lea srcq, [src_bakq+wq] 1289*c0909341SAndroid Build Coastguard Worker jl .loop_x_hv_overlap 1290*c0909341SAndroid Build Coastguard Worker.end: 1291*c0909341SAndroid Build Coastguard Worker RET 1292*c0909341SAndroid Build Coastguard Worker 1293*c0909341SAndroid Build Coastguard Worker%macro FGUV_FN 3 ; name, ss_hor, ss_ver 1294*c0909341SAndroid Build Coastguard Workercglobal fguv_32x32xn_i%1_8bpc, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \ 1295*c0909341SAndroid Build Coastguard Worker grain_lut, h, sby, luma, overlap, uv_pl, is_id 1296*c0909341SAndroid Build Coastguard Worker%define base r11-pd_m65536 1297*c0909341SAndroid Build Coastguard Worker lea r11, [pd_m65536] 1298*c0909341SAndroid Build Coastguard Worker mov r6d, [fg_dataq+FGData.scaling_shift] 1299*c0909341SAndroid Build Coastguard Worker mov r7d, [fg_dataq+FGData.clip_to_restricted_range] 1300*c0909341SAndroid Build Coastguard Worker mov r9d, is_idm 1301*c0909341SAndroid Build Coastguard Worker mov sbyd, sbym 1302*c0909341SAndroid Build Coastguard Worker mov overlapd, [fg_dataq+FGData.overlap_flag] 1303*c0909341SAndroid Build Coastguard 
Worker vpbroadcastd m8, [base+pd_m65536] 1304*c0909341SAndroid Build Coastguard Worker vpbroadcastw m9, [base+mul_bits+r6*2-14] 1305*c0909341SAndroid Build Coastguard Worker vpbroadcastd m10, [base+fg_min+r7*4] 1306*c0909341SAndroid Build Coastguard Worker shlx r7d, r7d, r9d 1307*c0909341SAndroid Build Coastguard Worker vpbroadcastd m11, [base+fg_max+r7*4] 1308*c0909341SAndroid Build Coastguard Worker vpbroadcastd m12, [base+pw_1024] 1309*c0909341SAndroid Build Coastguard Worker pxor m7, m7 1310*c0909341SAndroid Build Coastguard Worker test sbyd, sbyd 1311*c0909341SAndroid Build Coastguard Worker setnz r7b 1312*c0909341SAndroid Build Coastguard Worker cmp byte [fg_dataq+FGData.chroma_scaling_from_luma], 0 1313*c0909341SAndroid Build Coastguard Worker jne .csfl 1314*c0909341SAndroid Build Coastguard Worker 1315*c0909341SAndroid Build Coastguard Worker%macro %%FGUV_32x32xN_LOOP 3 ; not-csfl, ss_hor, ss_ver 1316*c0909341SAndroid Build Coastguard Worker DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ 1317*c0909341SAndroid Build Coastguard Worker h, sby, see, overlap, uv_pl 1318*c0909341SAndroid Build Coastguard Worker%if %1 1319*c0909341SAndroid Build Coastguard Worker mov r6d, uv_plm 1320*c0909341SAndroid Build Coastguard Worker vpbroadcastd m0, [base+pw_8] 1321*c0909341SAndroid Build Coastguard Worker vbroadcasti128 m14, [fg_dataq+FGData.uv_mult+r6*4] 1322*c0909341SAndroid Build Coastguard Worker vpbroadcastw m15, [fg_dataq+FGData.uv_offset+r6*4] 1323*c0909341SAndroid Build Coastguard Worker pshufb m14, m0 ; uv_luma_mult, uv_mult 1324*c0909341SAndroid Build Coastguard Worker%elif %2 1325*c0909341SAndroid Build Coastguard Worker vpbroadcastq m15, [base+pb_23_22] 1326*c0909341SAndroid Build Coastguard Worker%else 1327*c0909341SAndroid Build Coastguard Worker vpbroadcastq xm15, [base+pb_27_17_17_27] 1328*c0909341SAndroid Build Coastguard Worker%endif 1329*c0909341SAndroid Build Coastguard Worker%if %3 1330*c0909341SAndroid Build Coastguard Worker 
vpbroadcastw m13, [base+pb_23_22] 1331*c0909341SAndroid Build Coastguard Worker%elif %2 1332*c0909341SAndroid Build Coastguard Worker pshufd m13, [base+pb_27_17], q0000 ; 8x27_17, 8x17_27 1333*c0909341SAndroid Build Coastguard Worker%endif 1334*c0909341SAndroid Build Coastguard Worker test r7b, overlapb 1335*c0909341SAndroid Build Coastguard Worker jnz %%vertical_overlap 1336*c0909341SAndroid Build Coastguard Worker 1337*c0909341SAndroid Build Coastguard Worker imul seed, sbyd, (173 << 24) | 37 1338*c0909341SAndroid Build Coastguard Worker add seed, (105 << 24) | 178 1339*c0909341SAndroid Build Coastguard Worker rorx seed, seed, 24 1340*c0909341SAndroid Build Coastguard Worker movzx seed, seew 1341*c0909341SAndroid Build Coastguard Worker xor seed, [fg_dataq+FGData.seed] 1342*c0909341SAndroid Build Coastguard Worker 1343*c0909341SAndroid Build Coastguard Worker DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \ 1344*c0909341SAndroid Build Coastguard Worker unused2, unused3, see, overlap, unused4, unused5, lstride 1345*c0909341SAndroid Build Coastguard Worker 1346*c0909341SAndroid Build Coastguard Worker mov lumaq, r9mp 1347*c0909341SAndroid Build Coastguard Worker lea r12, [srcq+wq] 1348*c0909341SAndroid Build Coastguard Worker lea r13, [dstq+wq] 1349*c0909341SAndroid Build Coastguard Worker lea r14, [lumaq+wq*(1+%2)] 1350*c0909341SAndroid Build Coastguard Worker mov r11mp, r12 1351*c0909341SAndroid Build Coastguard Worker mov r12mp, r13 1352*c0909341SAndroid Build Coastguard Worker mov lstrideq, r10mp 1353*c0909341SAndroid Build Coastguard Worker neg wq 1354*c0909341SAndroid Build Coastguard Worker 1355*c0909341SAndroid Build Coastguard Worker%%loop_x: 1356*c0909341SAndroid Build Coastguard Worker rorx r6, seeq, 1 1357*c0909341SAndroid Build Coastguard Worker or seed, 0xEFF4 1358*c0909341SAndroid Build Coastguard Worker test seeb, seeh 1359*c0909341SAndroid Build Coastguard Worker lea seed, [r6+0x8000] 1360*c0909341SAndroid Build Coastguard Worker cmovp 
seed, r6d ; updated seed 1361*c0909341SAndroid Build Coastguard Worker 1362*c0909341SAndroid Build Coastguard Worker DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \ 1363*c0909341SAndroid Build Coastguard Worker offx, offy, see, overlap, unused1, unused2, lstride 1364*c0909341SAndroid Build Coastguard Worker 1365*c0909341SAndroid Build Coastguard Worker rorx offyd, seed, 8 1366*c0909341SAndroid Build Coastguard Worker rorx offxq, seeq, 12 1367*c0909341SAndroid Build Coastguard Worker and offyd, 0xf 1368*c0909341SAndroid Build Coastguard Worker imul offyd, 164>>%3 1369*c0909341SAndroid Build Coastguard Worker lea offyd, [offyq+offxq*(2-%2)+(3+(6>>%3))*82+3+(6>>%2)] ; offy*stride+offx 1370*c0909341SAndroid Build Coastguard Worker 1371*c0909341SAndroid Build Coastguard Worker DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \ 1372*c0909341SAndroid Build Coastguard Worker h, offxy, see, overlap, unused1, unused2, lstride 1373*c0909341SAndroid Build Coastguard Worker 1374*c0909341SAndroid Build Coastguard Worker mov grain_lutq, grain_lutmp 1375*c0909341SAndroid Build Coastguard Worker mov hd, hm 1376*c0909341SAndroid Build Coastguard Worker%%loop_y: 1377*c0909341SAndroid Build Coastguard Worker ; src 1378*c0909341SAndroid Build Coastguard Worker%if %2 1379*c0909341SAndroid Build Coastguard Worker mova xm3, [lumaq+lstrideq*0+ 0] 1380*c0909341SAndroid Build Coastguard Worker vinserti128 m3, [lumaq+lstrideq*(1+%3) +0], 1 1381*c0909341SAndroid Build Coastguard Worker vpbroadcastd m2, [pb_1] 1382*c0909341SAndroid Build Coastguard Worker mova xm0, [lumaq+lstrideq*0+16] 1383*c0909341SAndroid Build Coastguard Worker vinserti128 m0, [lumaq+lstrideq*(1+%3)+16], 1 1384*c0909341SAndroid Build Coastguard Worker mova xm1, [srcq] 1385*c0909341SAndroid Build Coastguard Worker vinserti128 m1, [srcq+strideq], 1 1386*c0909341SAndroid Build Coastguard Worker pmaddubsw m3, m2 1387*c0909341SAndroid Build Coastguard Worker pmaddubsw m0, m2 1388*c0909341SAndroid Build 
Coastguard Worker pavgw m3, m7 1389*c0909341SAndroid Build Coastguard Worker pavgw m0, m7 1390*c0909341SAndroid Build Coastguard Worker%else 1391*c0909341SAndroid Build Coastguard Worker mova m2, [lumaq] 1392*c0909341SAndroid Build Coastguard Worker mova m1, [srcq] 1393*c0909341SAndroid Build Coastguard Worker%endif 1394*c0909341SAndroid Build Coastguard Worker%if %1 1395*c0909341SAndroid Build Coastguard Worker%if %2 1396*c0909341SAndroid Build Coastguard Worker packuswb m2, m3, m0 ; luma 1397*c0909341SAndroid Build Coastguard Worker%endif 1398*c0909341SAndroid Build Coastguard Worker punpckhbw m3, m2, m1 1399*c0909341SAndroid Build Coastguard Worker punpcklbw m2, m1 ; { luma, chroma } 1400*c0909341SAndroid Build Coastguard Worker pmaddubsw m3, m14 1401*c0909341SAndroid Build Coastguard Worker pmaddubsw m2, m14 1402*c0909341SAndroid Build Coastguard Worker psraw m3, 6 1403*c0909341SAndroid Build Coastguard Worker psraw m2, 6 1404*c0909341SAndroid Build Coastguard Worker paddw m3, m15 1405*c0909341SAndroid Build Coastguard Worker paddw m2, m15 1406*c0909341SAndroid Build Coastguard Worker packuswb m2, m3 ; pack+unpack = clip 1407*c0909341SAndroid Build Coastguard Worker%endif 1408*c0909341SAndroid Build Coastguard Worker%if %1 || %2 == 0 1409*c0909341SAndroid Build Coastguard Worker punpcklbw m3, m2, m7 1410*c0909341SAndroid Build Coastguard Worker punpckhbw m0, m2, m7 1411*c0909341SAndroid Build Coastguard Worker%endif 1412*c0909341SAndroid Build Coastguard Worker 1413*c0909341SAndroid Build Coastguard Worker ; scaling[luma_src] 1414*c0909341SAndroid Build Coastguard Worker pandn m4, m8, m3 1415*c0909341SAndroid Build Coastguard Worker mova m6, m8 1416*c0909341SAndroid Build Coastguard Worker vpgatherdd m2, [scalingq+m4-0], m8 1417*c0909341SAndroid Build Coastguard Worker psrld m3, 16 1418*c0909341SAndroid Build Coastguard Worker mova m8, m6 1419*c0909341SAndroid Build Coastguard Worker vpgatherdd m4, [scalingq+m3-2], m6 1420*c0909341SAndroid Build Coastguard 
Worker pandn m5, m8, m0 1421*c0909341SAndroid Build Coastguard Worker mova m6, m8 1422*c0909341SAndroid Build Coastguard Worker vpgatherdd m3, [scalingq+m5-0], m8 1423*c0909341SAndroid Build Coastguard Worker psrld m0, 16 1424*c0909341SAndroid Build Coastguard Worker mova m8, m6 1425*c0909341SAndroid Build Coastguard Worker vpgatherdd m5, [scalingq+m0-2], m6 1426*c0909341SAndroid Build Coastguard Worker pblendw m2, m4, 0xaa 1427*c0909341SAndroid Build Coastguard Worker pblendw m3, m5, 0xaa 1428*c0909341SAndroid Build Coastguard Worker 1429*c0909341SAndroid Build Coastguard Worker ; grain = grain_lut[offy+y][offx+x] 1430*c0909341SAndroid Build Coastguard Worker%if %2 1431*c0909341SAndroid Build Coastguard Worker movu xm5, [grain_lutq+offxyq+ 0] 1432*c0909341SAndroid Build Coastguard Worker vinserti128 m5, [grain_lutq+offxyq+82], 1 1433*c0909341SAndroid Build Coastguard Worker%else 1434*c0909341SAndroid Build Coastguard Worker movu m5, [grain_lutq+offxyq] 1435*c0909341SAndroid Build Coastguard Worker%endif 1436*c0909341SAndroid Build Coastguard Worker punpcklbw m4, m5, m7 1437*c0909341SAndroid Build Coastguard Worker punpckhbw m5, m7 1438*c0909341SAndroid Build Coastguard Worker 1439*c0909341SAndroid Build Coastguard Worker ; noise = round2(scaling[luma_src] * grain, scaling_shift) 1440*c0909341SAndroid Build Coastguard Worker pmaddubsw m2, m4 1441*c0909341SAndroid Build Coastguard Worker pmaddubsw m3, m5 1442*c0909341SAndroid Build Coastguard Worker pmulhrsw m2, m9 1443*c0909341SAndroid Build Coastguard Worker pmulhrsw m3, m9 1444*c0909341SAndroid Build Coastguard Worker 1445*c0909341SAndroid Build Coastguard Worker ; unpack chroma_source 1446*c0909341SAndroid Build Coastguard Worker punpcklbw m0, m1, m7 1447*c0909341SAndroid Build Coastguard Worker punpckhbw m1, m7 1448*c0909341SAndroid Build Coastguard Worker 1449*c0909341SAndroid Build Coastguard Worker ; dst = clip_pixel(src, noise) 1450*c0909341SAndroid Build Coastguard Worker paddw m0, m2 1451*c0909341SAndroid 
Build Coastguard Worker paddw m1, m3 1452*c0909341SAndroid Build Coastguard Worker packuswb m0, m1 1453*c0909341SAndroid Build Coastguard Worker pmaxub m0, m10 1454*c0909341SAndroid Build Coastguard Worker pminub m0, m11 1455*c0909341SAndroid Build Coastguard Worker%if %2 1456*c0909341SAndroid Build Coastguard Worker mova [dstq], xm0 1457*c0909341SAndroid Build Coastguard Worker vextracti128 [dstq+strideq], m0, 1 1458*c0909341SAndroid Build Coastguard Worker%else 1459*c0909341SAndroid Build Coastguard Worker mova [dstq], m0 1460*c0909341SAndroid Build Coastguard Worker%endif 1461*c0909341SAndroid Build Coastguard Worker 1462*c0909341SAndroid Build Coastguard Worker%if %2 1463*c0909341SAndroid Build Coastguard Worker lea srcq, [srcq+strideq*2] 1464*c0909341SAndroid Build Coastguard Worker lea dstq, [dstq+strideq*2] 1465*c0909341SAndroid Build Coastguard Worker lea lumaq, [lumaq+lstrideq*(2<<%3)] 1466*c0909341SAndroid Build Coastguard Worker%else 1467*c0909341SAndroid Build Coastguard Worker add srcq, strideq 1468*c0909341SAndroid Build Coastguard Worker add dstq, strideq 1469*c0909341SAndroid Build Coastguard Worker add lumaq, lstrideq 1470*c0909341SAndroid Build Coastguard Worker%endif 1471*c0909341SAndroid Build Coastguard Worker add grain_lutq, 82<<%2 1472*c0909341SAndroid Build Coastguard Worker sub hb, 1+%2 1473*c0909341SAndroid Build Coastguard Worker jg %%loop_y 1474*c0909341SAndroid Build Coastguard Worker 1475*c0909341SAndroid Build Coastguard Worker add wq, 32>>%2 1476*c0909341SAndroid Build Coastguard Worker jge .end 1477*c0909341SAndroid Build Coastguard Worker mov srcq, r11mp 1478*c0909341SAndroid Build Coastguard Worker mov dstq, r12mp 1479*c0909341SAndroid Build Coastguard Worker lea lumaq, [r14+wq*(1+%2)] 1480*c0909341SAndroid Build Coastguard Worker add srcq, wq 1481*c0909341SAndroid Build Coastguard Worker add dstq, wq 1482*c0909341SAndroid Build Coastguard Worker test overlapd, overlapd 1483*c0909341SAndroid Build Coastguard Worker jz %%loop_x 
1484*c0909341SAndroid Build Coastguard Worker 1485*c0909341SAndroid Build Coastguard Worker ; r8m = sbym 1486*c0909341SAndroid Build Coastguard Worker cmp dword r8m, 0 1487*c0909341SAndroid Build Coastguard Worker jne %%loop_x_hv_overlap 1488*c0909341SAndroid Build Coastguard Worker 1489*c0909341SAndroid Build Coastguard Worker ; horizontal overlap (without vertical overlap) 1490*c0909341SAndroid Build Coastguard Worker%%loop_x_h_overlap: 1491*c0909341SAndroid Build Coastguard Worker rorx r6, seeq, 1 1492*c0909341SAndroid Build Coastguard Worker or seed, 0xEFF4 1493*c0909341SAndroid Build Coastguard Worker test seeb, seeh 1494*c0909341SAndroid Build Coastguard Worker lea seed, [r6+0x8000] 1495*c0909341SAndroid Build Coastguard Worker cmovp seed, r6d ; updated seed 1496*c0909341SAndroid Build Coastguard Worker 1497*c0909341SAndroid Build Coastguard Worker DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \ 1498*c0909341SAndroid Build Coastguard Worker offx, offy, see, left_offxy, unused1, unused2, lstride 1499*c0909341SAndroid Build Coastguard Worker 1500*c0909341SAndroid Build Coastguard Worker lea left_offxyd, [offyq+(32>>%2)] ; previous column's offy*stride+offx 1501*c0909341SAndroid Build Coastguard Worker rorx offyd, seed, 8 1502*c0909341SAndroid Build Coastguard Worker rorx offxq, seeq, 12 1503*c0909341SAndroid Build Coastguard Worker and offyd, 0xf 1504*c0909341SAndroid Build Coastguard Worker imul offyd, 164>>%3 1505*c0909341SAndroid Build Coastguard Worker lea offyd, [offyq+offxq*(2-%2)+(3+(6>>%3))*82+3+(6>>%2)] ; offy*stride+offx 1506*c0909341SAndroid Build Coastguard Worker 1507*c0909341SAndroid Build Coastguard Worker DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \ 1508*c0909341SAndroid Build Coastguard Worker h, offxy, see, left_offxy, unused1, unused2, lstride 1509*c0909341SAndroid Build Coastguard Worker 1510*c0909341SAndroid Build Coastguard Worker mov grain_lutq, grain_lutmp 1511*c0909341SAndroid Build Coastguard Worker mov hd, 
hm 1512*c0909341SAndroid Build Coastguard Worker%%loop_y_h_overlap: 1513*c0909341SAndroid Build Coastguard Worker ; src 1514*c0909341SAndroid Build Coastguard Worker%if %2 1515*c0909341SAndroid Build Coastguard Worker mova xm3, [lumaq+lstrideq*0+ 0] 1516*c0909341SAndroid Build Coastguard Worker vinserti128 m3, [lumaq+lstrideq*(1+%3)+ 0], 1 1517*c0909341SAndroid Build Coastguard Worker vpbroadcastd m2, [pb_1] 1518*c0909341SAndroid Build Coastguard Worker mova xm0, [lumaq+lstrideq*0+16] 1519*c0909341SAndroid Build Coastguard Worker vinserti128 m0, [lumaq+lstrideq*(1+%3)+16], 1 1520*c0909341SAndroid Build Coastguard Worker mova xm1, [srcq] 1521*c0909341SAndroid Build Coastguard Worker vinserti128 m1, [srcq+strideq], 1 1522*c0909341SAndroid Build Coastguard Worker pmaddubsw m3, m2 1523*c0909341SAndroid Build Coastguard Worker pmaddubsw m0, m2 1524*c0909341SAndroid Build Coastguard Worker pavgw m3, m7 1525*c0909341SAndroid Build Coastguard Worker pavgw m0, m7 1526*c0909341SAndroid Build Coastguard Worker%else 1527*c0909341SAndroid Build Coastguard Worker mova m2, [lumaq] 1528*c0909341SAndroid Build Coastguard Worker mova m1, [srcq] 1529*c0909341SAndroid Build Coastguard Worker%endif 1530*c0909341SAndroid Build Coastguard Worker%if %1 1531*c0909341SAndroid Build Coastguard Worker%if %2 1532*c0909341SAndroid Build Coastguard Worker packuswb m2, m3, m0 ; luma 1533*c0909341SAndroid Build Coastguard Worker%endif 1534*c0909341SAndroid Build Coastguard Worker punpckhbw m3, m2, m1 1535*c0909341SAndroid Build Coastguard Worker punpcklbw m2, m1 ; { luma, chroma } 1536*c0909341SAndroid Build Coastguard Worker pmaddubsw m3, m14 1537*c0909341SAndroid Build Coastguard Worker pmaddubsw m2, m14 1538*c0909341SAndroid Build Coastguard Worker psraw m3, 6 1539*c0909341SAndroid Build Coastguard Worker psraw m2, 6 1540*c0909341SAndroid Build Coastguard Worker paddw m3, m15 1541*c0909341SAndroid Build Coastguard Worker paddw m2, m15 1542*c0909341SAndroid Build Coastguard Worker packuswb m2, 
m3 ; pack+unpack = clip 1543*c0909341SAndroid Build Coastguard Worker%endif 1544*c0909341SAndroid Build Coastguard Worker%if %1 || %2 == 0 1545*c0909341SAndroid Build Coastguard Worker punpcklbw m3, m2, m7 1546*c0909341SAndroid Build Coastguard Worker punpckhbw m0, m2, m7 1547*c0909341SAndroid Build Coastguard Worker%endif 1548*c0909341SAndroid Build Coastguard Worker 1549*c0909341SAndroid Build Coastguard Worker ; scaling[luma_src] 1550*c0909341SAndroid Build Coastguard Worker pandn m4, m8, m3 1551*c0909341SAndroid Build Coastguard Worker mova m6, m8 1552*c0909341SAndroid Build Coastguard Worker vpgatherdd m2, [scalingq+m4-0], m8 1553*c0909341SAndroid Build Coastguard Worker psrld m3, 16 1554*c0909341SAndroid Build Coastguard Worker mova m8, m6 1555*c0909341SAndroid Build Coastguard Worker vpgatherdd m4, [scalingq+m3-2], m6 1556*c0909341SAndroid Build Coastguard Worker pandn m5, m8, m0 1557*c0909341SAndroid Build Coastguard Worker mova m6, m8 1558*c0909341SAndroid Build Coastguard Worker vpgatherdd m3, [scalingq+m5-0], m8 1559*c0909341SAndroid Build Coastguard Worker psrld m0, 16 1560*c0909341SAndroid Build Coastguard Worker mova m8, m6 1561*c0909341SAndroid Build Coastguard Worker vpgatherdd m5, [scalingq+m0-2], m6 1562*c0909341SAndroid Build Coastguard Worker pblendw m2, m4, 0xaa 1563*c0909341SAndroid Build Coastguard Worker pblendw m3, m5, 0xaa 1564*c0909341SAndroid Build Coastguard Worker 1565*c0909341SAndroid Build Coastguard Worker ; grain = grain_lut[offy+y][offx+x] 1566*c0909341SAndroid Build Coastguard Worker%if %2 1567*c0909341SAndroid Build Coastguard Worker movu xm5, [grain_lutq+offxyq+ 0] 1568*c0909341SAndroid Build Coastguard Worker vinserti128 m5, [grain_lutq+offxyq+82], 1 1569*c0909341SAndroid Build Coastguard Worker movd xm4, [grain_lutq+left_offxyq+ 0] 1570*c0909341SAndroid Build Coastguard Worker vinserti128 m4, [grain_lutq+left_offxyq+82], 1 1571*c0909341SAndroid Build Coastguard Worker punpcklbw m4, m5 1572*c0909341SAndroid Build Coastguard 
Worker%if %1 1573*c0909341SAndroid Build Coastguard Worker vpbroadcastq m0, [pb_23_22] 1574*c0909341SAndroid Build Coastguard Worker pmaddubsw m4, m0, m4 1575*c0909341SAndroid Build Coastguard Worker%else 1576*c0909341SAndroid Build Coastguard Worker pmaddubsw m4, m15, m4 1577*c0909341SAndroid Build Coastguard Worker%endif 1578*c0909341SAndroid Build Coastguard Worker pmulhrsw m4, m12 1579*c0909341SAndroid Build Coastguard Worker packsswb m4, m4 1580*c0909341SAndroid Build Coastguard Worker vpblendd m4, m5, 0xee 1581*c0909341SAndroid Build Coastguard Worker%else 1582*c0909341SAndroid Build Coastguard Worker movu m5, [grain_lutq+offxyq] 1583*c0909341SAndroid Build Coastguard Worker movd xm4, [grain_lutq+left_offxyq] 1584*c0909341SAndroid Build Coastguard Worker punpcklbw xm4, xm5 1585*c0909341SAndroid Build Coastguard Worker%if %1 1586*c0909341SAndroid Build Coastguard Worker movq xm0, [pb_27_17_17_27] 1587*c0909341SAndroid Build Coastguard Worker pmaddubsw xm4, xm0, xm4 1588*c0909341SAndroid Build Coastguard Worker%else 1589*c0909341SAndroid Build Coastguard Worker pmaddubsw xm4, xm15, xm4 1590*c0909341SAndroid Build Coastguard Worker%endif 1591*c0909341SAndroid Build Coastguard Worker pmulhrsw xm4, xm12 1592*c0909341SAndroid Build Coastguard Worker packsswb xm4, xm4 1593*c0909341SAndroid Build Coastguard Worker vpblendd m4, m5, 0xfe 1594*c0909341SAndroid Build Coastguard Worker%endif 1595*c0909341SAndroid Build Coastguard Worker punpckhbw m5, m7 1596*c0909341SAndroid Build Coastguard Worker punpcklbw m4, m7 1597*c0909341SAndroid Build Coastguard Worker 1598*c0909341SAndroid Build Coastguard Worker ; noise = round2(scaling[luma_src] * grain, scaling_shift) 1599*c0909341SAndroid Build Coastguard Worker pmaddubsw m2, m4 1600*c0909341SAndroid Build Coastguard Worker pmaddubsw m3, m5 1601*c0909341SAndroid Build Coastguard Worker pmulhrsw m2, m9 1602*c0909341SAndroid Build Coastguard Worker pmulhrsw m3, m9 1603*c0909341SAndroid Build Coastguard Worker 
1604*c0909341SAndroid Build Coastguard Worker ; unpack chroma_source 1605*c0909341SAndroid Build Coastguard Worker punpcklbw m0, m1, m7 1606*c0909341SAndroid Build Coastguard Worker punpckhbw m1, m7 1607*c0909341SAndroid Build Coastguard Worker 1608*c0909341SAndroid Build Coastguard Worker ; dst = clip_pixel(src, noise) 1609*c0909341SAndroid Build Coastguard Worker paddw m0, m2 1610*c0909341SAndroid Build Coastguard Worker paddw m1, m3 1611*c0909341SAndroid Build Coastguard Worker packuswb m0, m1 1612*c0909341SAndroid Build Coastguard Worker pmaxub m0, m10 1613*c0909341SAndroid Build Coastguard Worker pminub m0, m11 1614*c0909341SAndroid Build Coastguard Worker%if %2 1615*c0909341SAndroid Build Coastguard Worker mova [dstq], xm0 1616*c0909341SAndroid Build Coastguard Worker vextracti128 [dstq+strideq], m0, 1 1617*c0909341SAndroid Build Coastguard Worker%else 1618*c0909341SAndroid Build Coastguard Worker mova [dstq], m0 1619*c0909341SAndroid Build Coastguard Worker%endif 1620*c0909341SAndroid Build Coastguard Worker 1621*c0909341SAndroid Build Coastguard Worker%if %2 1622*c0909341SAndroid Build Coastguard Worker lea srcq, [srcq+strideq*2] 1623*c0909341SAndroid Build Coastguard Worker lea dstq, [dstq+strideq*2] 1624*c0909341SAndroid Build Coastguard Worker lea lumaq, [lumaq+lstrideq*(2<<%3)] 1625*c0909341SAndroid Build Coastguard Worker%else 1626*c0909341SAndroid Build Coastguard Worker add srcq, strideq 1627*c0909341SAndroid Build Coastguard Worker add dstq, strideq 1628*c0909341SAndroid Build Coastguard Worker add lumaq, lstrideq 1629*c0909341SAndroid Build Coastguard Worker%endif 1630*c0909341SAndroid Build Coastguard Worker add grain_lutq, 82*(1+%2) 1631*c0909341SAndroid Build Coastguard Worker sub hb, 1+%2 1632*c0909341SAndroid Build Coastguard Worker jg %%loop_y_h_overlap 1633*c0909341SAndroid Build Coastguard Worker 1634*c0909341SAndroid Build Coastguard Worker add wq, 32>>%2 1635*c0909341SAndroid Build Coastguard Worker jge .end 1636*c0909341SAndroid Build 
Coastguard Worker mov srcq, r11mp 1637*c0909341SAndroid Build Coastguard Worker mov dstq, r12mp 1638*c0909341SAndroid Build Coastguard Worker lea lumaq, [r14+wq*(1+%2)] 1639*c0909341SAndroid Build Coastguard Worker add srcq, wq 1640*c0909341SAndroid Build Coastguard Worker add dstq, wq 1641*c0909341SAndroid Build Coastguard Worker 1642*c0909341SAndroid Build Coastguard Worker ; r8m = sbym 1643*c0909341SAndroid Build Coastguard Worker cmp dword r8m, 0 1644*c0909341SAndroid Build Coastguard Worker jne %%loop_x_hv_overlap 1645*c0909341SAndroid Build Coastguard Worker jmp %%loop_x_h_overlap 1646*c0909341SAndroid Build Coastguard Worker 1647*c0909341SAndroid Build Coastguard Worker%%vertical_overlap: 1648*c0909341SAndroid Build Coastguard Worker DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, unused, \ 1649*c0909341SAndroid Build Coastguard Worker sby, see, overlap, unused1, unused2, lstride 1650*c0909341SAndroid Build Coastguard Worker 1651*c0909341SAndroid Build Coastguard Worker movzx sbyd, sbyb 1652*c0909341SAndroid Build Coastguard Worker imul seed, [fg_dataq+FGData.seed], 0x00010001 1653*c0909341SAndroid Build Coastguard Worker imul r7d, sbyd, 173 * 0x00010001 1654*c0909341SAndroid Build Coastguard Worker imul sbyd, 37 * 0x01000100 1655*c0909341SAndroid Build Coastguard Worker add r7d, (105 << 16) | 188 1656*c0909341SAndroid Build Coastguard Worker add sbyd, (178 << 24) | (141 << 8) 1657*c0909341SAndroid Build Coastguard Worker and r7d, 0x00ff00ff 1658*c0909341SAndroid Build Coastguard Worker and sbyd, 0xff00ff00 1659*c0909341SAndroid Build Coastguard Worker xor seed, r7d 1660*c0909341SAndroid Build Coastguard Worker xor seed, sbyd ; (cur_seed << 16) | top_seed 1661*c0909341SAndroid Build Coastguard Worker 1662*c0909341SAndroid Build Coastguard Worker DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \ 1663*c0909341SAndroid Build Coastguard Worker unused1, unused2, see, overlap, unused3, unused4, lstride 1664*c0909341SAndroid Build 
Coastguard Worker 1665*c0909341SAndroid Build Coastguard Worker mov lumaq, r9mp 1666*c0909341SAndroid Build Coastguard Worker lea r12, [srcq+wq] 1667*c0909341SAndroid Build Coastguard Worker lea r13, [dstq+wq] 1668*c0909341SAndroid Build Coastguard Worker lea r14, [lumaq+wq*(1+%2)] 1669*c0909341SAndroid Build Coastguard Worker mov r11mp, r12 1670*c0909341SAndroid Build Coastguard Worker mov r12mp, r13 1671*c0909341SAndroid Build Coastguard Worker mov lstrideq, r10mp 1672*c0909341SAndroid Build Coastguard Worker neg wq 1673*c0909341SAndroid Build Coastguard Worker 1674*c0909341SAndroid Build Coastguard Worker%%loop_x_v_overlap: 1675*c0909341SAndroid Build Coastguard Worker ; we assume from the block above that bits 8-15 of r7d are zero'ed 1676*c0909341SAndroid Build Coastguard Worker mov r6d, seed 1677*c0909341SAndroid Build Coastguard Worker or seed, 0xeff4eff4 1678*c0909341SAndroid Build Coastguard Worker test seeb, seeh 1679*c0909341SAndroid Build Coastguard Worker setp r7b ; parity of top_seed 1680*c0909341SAndroid Build Coastguard Worker shr seed, 16 1681*c0909341SAndroid Build Coastguard Worker shl r7d, 16 1682*c0909341SAndroid Build Coastguard Worker test seeb, seeh 1683*c0909341SAndroid Build Coastguard Worker setp r7b ; parity of cur_seed 1684*c0909341SAndroid Build Coastguard Worker or r6d, 0x00010001 1685*c0909341SAndroid Build Coastguard Worker xor r7d, r6d 1686*c0909341SAndroid Build Coastguard Worker rorx seed, r7d, 1 ; updated (cur_seed << 16) | top_seed 1687*c0909341SAndroid Build Coastguard Worker 1688*c0909341SAndroid Build Coastguard Worker DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \ 1689*c0909341SAndroid Build Coastguard Worker offx, offy, see, overlap, top_offxy, unused, lstride 1690*c0909341SAndroid Build Coastguard Worker 1691*c0909341SAndroid Build Coastguard Worker rorx offyd, seed, 8 1692*c0909341SAndroid Build Coastguard Worker rorx offxd, seed, 12 1693*c0909341SAndroid Build Coastguard Worker and offyd, 0xf000f 
1694*c0909341SAndroid Build Coastguard Worker and offxd, 0xf000f 1695*c0909341SAndroid Build Coastguard Worker imul offyd, 164>>%3 1696*c0909341SAndroid Build Coastguard Worker ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy 1697*c0909341SAndroid Build Coastguard Worker lea offyd, [offyq+offxq*(2-%2)+0x10001*((3+(6>>%3))*82+3+(6>>%2))+(32>>%3)*82] 1698*c0909341SAndroid Build Coastguard Worker 1699*c0909341SAndroid Build Coastguard Worker DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \ 1700*c0909341SAndroid Build Coastguard Worker h, offxy, see, overlap, top_offxy, unused, lstride 1701*c0909341SAndroid Build Coastguard Worker 1702*c0909341SAndroid Build Coastguard Worker mov grain_lutq, grain_lutmp 1703*c0909341SAndroid Build Coastguard Worker mov hd, hm 1704*c0909341SAndroid Build Coastguard Worker movzx top_offxyd, offxyw 1705*c0909341SAndroid Build Coastguard Worker shr offxyd, 16 1706*c0909341SAndroid Build Coastguard Worker%if %2 == 0 1707*c0909341SAndroid Build Coastguard Worker vpbroadcastd m13, [pb_27_17] 1708*c0909341SAndroid Build Coastguard Worker%endif 1709*c0909341SAndroid Build Coastguard Worker%%loop_y_v_overlap: 1710*c0909341SAndroid Build Coastguard Worker ; src 1711*c0909341SAndroid Build Coastguard Worker%if %2 1712*c0909341SAndroid Build Coastguard Worker mova xm3, [lumaq+lstrideq*0+ 0] 1713*c0909341SAndroid Build Coastguard Worker vinserti128 m3, [lumaq+lstrideq*(1+%3)+ 0], 1 1714*c0909341SAndroid Build Coastguard Worker vpbroadcastd m2, [pb_1] 1715*c0909341SAndroid Build Coastguard Worker mova xm0, [lumaq+lstrideq*0+16] 1716*c0909341SAndroid Build Coastguard Worker vinserti128 m0, [lumaq+lstrideq*(1+%3)+16], 1 1717*c0909341SAndroid Build Coastguard Worker mova xm1, [srcq] 1718*c0909341SAndroid Build Coastguard Worker vinserti128 m1, [srcq+strideq], 1 1719*c0909341SAndroid Build Coastguard Worker pmaddubsw m3, m2 1720*c0909341SAndroid Build Coastguard Worker pmaddubsw m0, m2 1721*c0909341SAndroid Build Coastguard Worker 
pavgw m3, m7 1722*c0909341SAndroid Build Coastguard Worker pavgw m0, m7 1723*c0909341SAndroid Build Coastguard Worker%else 1724*c0909341SAndroid Build Coastguard Worker mova m2, [lumaq] 1725*c0909341SAndroid Build Coastguard Worker mova m1, [srcq] 1726*c0909341SAndroid Build Coastguard Worker%endif 1727*c0909341SAndroid Build Coastguard Worker%if %1 1728*c0909341SAndroid Build Coastguard Worker%if %2 1729*c0909341SAndroid Build Coastguard Worker packuswb m2, m3, m0 ; luma 1730*c0909341SAndroid Build Coastguard Worker%endif 1731*c0909341SAndroid Build Coastguard Worker punpckhbw m3, m2, m1 1732*c0909341SAndroid Build Coastguard Worker punpcklbw m2, m1 ; { luma, chroma } 1733*c0909341SAndroid Build Coastguard Worker pmaddubsw m3, m14 1734*c0909341SAndroid Build Coastguard Worker pmaddubsw m2, m14 1735*c0909341SAndroid Build Coastguard Worker psraw m3, 6 1736*c0909341SAndroid Build Coastguard Worker psraw m2, 6 1737*c0909341SAndroid Build Coastguard Worker paddw m3, m15 1738*c0909341SAndroid Build Coastguard Worker paddw m2, m15 1739*c0909341SAndroid Build Coastguard Worker packuswb m2, m3 ; pack+unpack = clip 1740*c0909341SAndroid Build Coastguard Worker%endif 1741*c0909341SAndroid Build Coastguard Worker%if %1 || %2 == 0 1742*c0909341SAndroid Build Coastguard Worker punpcklbw m3, m2, m7 1743*c0909341SAndroid Build Coastguard Worker punpckhbw m0, m2, m7 1744*c0909341SAndroid Build Coastguard Worker%endif 1745*c0909341SAndroid Build Coastguard Worker 1746*c0909341SAndroid Build Coastguard Worker ; scaling[luma_src] 1747*c0909341SAndroid Build Coastguard Worker pandn m4, m8, m3 1748*c0909341SAndroid Build Coastguard Worker mova m6, m8 1749*c0909341SAndroid Build Coastguard Worker vpgatherdd m2, [scalingq+m4-0], m8 1750*c0909341SAndroid Build Coastguard Worker psrld m3, 16 1751*c0909341SAndroid Build Coastguard Worker mova m8, m6 1752*c0909341SAndroid Build Coastguard Worker vpgatherdd m4, [scalingq+m3-2], m6 1753*c0909341SAndroid Build Coastguard Worker pandn m5, m8, 
m0 1754*c0909341SAndroid Build Coastguard Worker mova m6, m8 1755*c0909341SAndroid Build Coastguard Worker vpgatherdd m3, [scalingq+m5-0], m8 1756*c0909341SAndroid Build Coastguard Worker psrld m0, 16 1757*c0909341SAndroid Build Coastguard Worker mova m8, m6 1758*c0909341SAndroid Build Coastguard Worker vpgatherdd m5, [scalingq+m0-2], m6 1759*c0909341SAndroid Build Coastguard Worker pblendw m2, m4, 0xaa 1760*c0909341SAndroid Build Coastguard Worker pblendw m3, m5, 0xaa 1761*c0909341SAndroid Build Coastguard Worker 1762*c0909341SAndroid Build Coastguard Worker ; grain = grain_lut[offy+y][offx+x] 1763*c0909341SAndroid Build Coastguard Worker%if %3 == 0 1764*c0909341SAndroid Build Coastguard Worker%if %2 1765*c0909341SAndroid Build Coastguard Worker movu xm0, [grain_lutq+offxyq] 1766*c0909341SAndroid Build Coastguard Worker vinserti128 m0, [grain_lutq+offxyq+82], 1 1767*c0909341SAndroid Build Coastguard Worker movu xm4, [grain_lutq+top_offxyq] 1768*c0909341SAndroid Build Coastguard Worker vinserti128 m4, [grain_lutq+top_offxyq+82], 1 1769*c0909341SAndroid Build Coastguard Worker%else 1770*c0909341SAndroid Build Coastguard Worker movu m0, [grain_lutq+offxyq] 1771*c0909341SAndroid Build Coastguard Worker movu m4, [grain_lutq+top_offxyq] 1772*c0909341SAndroid Build Coastguard Worker%endif 1773*c0909341SAndroid Build Coastguard Worker punpcklbw m5, m4, m0 1774*c0909341SAndroid Build Coastguard Worker punpckhbw m4, m0 1775*c0909341SAndroid Build Coastguard Worker pmaddubsw m5, m13, m5 1776*c0909341SAndroid Build Coastguard Worker pmaddubsw m4, m13, m4 1777*c0909341SAndroid Build Coastguard Worker pmulhrsw m5, m12 1778*c0909341SAndroid Build Coastguard Worker pmulhrsw m4, m12 1779*c0909341SAndroid Build Coastguard Worker packsswb m5, m4 1780*c0909341SAndroid Build Coastguard Worker%else 1781*c0909341SAndroid Build Coastguard Worker movq xm4, [grain_lutq+offxyq] 1782*c0909341SAndroid Build Coastguard Worker vinserti128 m4, [grain_lutq+offxyq+8], 1 1783*c0909341SAndroid Build 
Coastguard Worker movq xm5, [grain_lutq+top_offxyq] 1784*c0909341SAndroid Build Coastguard Worker vinserti128 m5, [grain_lutq+top_offxyq+8], 1 1785*c0909341SAndroid Build Coastguard Worker punpcklbw m5, m4 1786*c0909341SAndroid Build Coastguard Worker pmaddubsw m5, m13, m5 1787*c0909341SAndroid Build Coastguard Worker pmulhrsw m5, m12 1788*c0909341SAndroid Build Coastguard Worker vextracti128 xm4, m5, 1 1789*c0909341SAndroid Build Coastguard Worker packsswb xm5, xm4 1790*c0909341SAndroid Build Coastguard Worker ; only interpolate first line, insert second line unmodified 1791*c0909341SAndroid Build Coastguard Worker vinserti128 m5, [grain_lutq+offxyq+82], 1 1792*c0909341SAndroid Build Coastguard Worker%endif 1793*c0909341SAndroid Build Coastguard Worker punpcklbw m4, m5, m7 1794*c0909341SAndroid Build Coastguard Worker punpckhbw m5, m7 1795*c0909341SAndroid Build Coastguard Worker 1796*c0909341SAndroid Build Coastguard Worker ; noise = round2(scaling[luma_src] * grain, scaling_shift) 1797*c0909341SAndroid Build Coastguard Worker pmaddubsw m2, m4 1798*c0909341SAndroid Build Coastguard Worker pmaddubsw m3, m5 1799*c0909341SAndroid Build Coastguard Worker pmulhrsw m2, m9 1800*c0909341SAndroid Build Coastguard Worker pmulhrsw m3, m9 1801*c0909341SAndroid Build Coastguard Worker 1802*c0909341SAndroid Build Coastguard Worker ; unpack chroma_source 1803*c0909341SAndroid Build Coastguard Worker punpcklbw m0, m1, m7 1804*c0909341SAndroid Build Coastguard Worker punpckhbw m1, m7 1805*c0909341SAndroid Build Coastguard Worker 1806*c0909341SAndroid Build Coastguard Worker ; dst = clip_pixel(src, noise) 1807*c0909341SAndroid Build Coastguard Worker paddw m0, m2 1808*c0909341SAndroid Build Coastguard Worker paddw m1, m3 1809*c0909341SAndroid Build Coastguard Worker packuswb m0, m1 1810*c0909341SAndroid Build Coastguard Worker pmaxub m0, m10 1811*c0909341SAndroid Build Coastguard Worker pminub m0, m11 1812*c0909341SAndroid Build Coastguard Worker%if %2 1813*c0909341SAndroid Build 
Coastguard Worker mova [dstq], xm0 1814*c0909341SAndroid Build Coastguard Worker vextracti128 [dstq+strideq], m0, 1 1815*c0909341SAndroid Build Coastguard Worker%else 1816*c0909341SAndroid Build Coastguard Worker mova [dstq], m0 1817*c0909341SAndroid Build Coastguard Worker%endif 1818*c0909341SAndroid Build Coastguard Worker 1819*c0909341SAndroid Build Coastguard Worker sub hb, 1+%2 1820*c0909341SAndroid Build Coastguard Worker jle %%end_y_v_overlap 1821*c0909341SAndroid Build Coastguard Worker%if %2 1822*c0909341SAndroid Build Coastguard Worker lea srcq, [srcq+strideq*2] 1823*c0909341SAndroid Build Coastguard Worker lea dstq, [dstq+strideq*2] 1824*c0909341SAndroid Build Coastguard Worker lea lumaq, [lumaq+lstrideq*(2<<%3)] 1825*c0909341SAndroid Build Coastguard Worker%else 1826*c0909341SAndroid Build Coastguard Worker add srcq, strideq 1827*c0909341SAndroid Build Coastguard Worker add dstq, strideq 1828*c0909341SAndroid Build Coastguard Worker add lumaq, lstrideq 1829*c0909341SAndroid Build Coastguard Worker%endif 1830*c0909341SAndroid Build Coastguard Worker add grain_lutq, 82<<%2 1831*c0909341SAndroid Build Coastguard Worker%if %2 == 0 1832*c0909341SAndroid Build Coastguard Worker vpbroadcastd m13, [pb_17_27] 1833*c0909341SAndroid Build Coastguard Worker add hd, 0x80000000 1834*c0909341SAndroid Build Coastguard Worker jnc %%loop_y_v_overlap 1835*c0909341SAndroid Build Coastguard Worker%endif 1836*c0909341SAndroid Build Coastguard Worker jmp %%loop_y 1837*c0909341SAndroid Build Coastguard Worker 1838*c0909341SAndroid Build Coastguard Worker%%end_y_v_overlap: 1839*c0909341SAndroid Build Coastguard Worker add wq, 32>>%2 1840*c0909341SAndroid Build Coastguard Worker jge .end 1841*c0909341SAndroid Build Coastguard Worker mov srcq, r11mp 1842*c0909341SAndroid Build Coastguard Worker mov dstq, r12mp 1843*c0909341SAndroid Build Coastguard Worker lea lumaq, [r14+wq*(1+%2)] 1844*c0909341SAndroid Build Coastguard Worker add srcq, wq 1845*c0909341SAndroid Build Coastguard 
Worker add dstq, wq 1846*c0909341SAndroid Build Coastguard Worker 1847*c0909341SAndroid Build Coastguard Worker ; since fg_dataq.overlap is guaranteed to be set, we never jump 1848*c0909341SAndroid Build Coastguard Worker ; back to .loop_x_v_overlap, and instead always fall-through to 1849*c0909341SAndroid Build Coastguard Worker ; h+v overlap 1850*c0909341SAndroid Build Coastguard Worker 1851*c0909341SAndroid Build Coastguard Worker%%loop_x_hv_overlap: 1852*c0909341SAndroid Build Coastguard Worker ; we assume from the block above that bits 8-15 of r7d are zero'ed 1853*c0909341SAndroid Build Coastguard Worker mov r6d, seed 1854*c0909341SAndroid Build Coastguard Worker or seed, 0xeff4eff4 1855*c0909341SAndroid Build Coastguard Worker test seeb, seeh 1856*c0909341SAndroid Build Coastguard Worker setp r7b ; parity of top_seed 1857*c0909341SAndroid Build Coastguard Worker shr seed, 16 1858*c0909341SAndroid Build Coastguard Worker shl r7d, 16 1859*c0909341SAndroid Build Coastguard Worker test seeb, seeh 1860*c0909341SAndroid Build Coastguard Worker setp r7b ; parity of cur_seed 1861*c0909341SAndroid Build Coastguard Worker or r6d, 0x00010001 1862*c0909341SAndroid Build Coastguard Worker xor r7d, r6d 1863*c0909341SAndroid Build Coastguard Worker rorx seed, r7d, 1 ; updated (cur_seed << 16) | top_seed 1864*c0909341SAndroid Build Coastguard Worker 1865*c0909341SAndroid Build Coastguard Worker DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \ 1866*c0909341SAndroid Build Coastguard Worker offx, offy, see, left_offxy, top_offxy, topleft_offxy, lstride 1867*c0909341SAndroid Build Coastguard Worker 1868*c0909341SAndroid Build Coastguard Worker lea topleft_offxyd, [top_offxyq+(32>>%2)] 1869*c0909341SAndroid Build Coastguard Worker lea left_offxyd, [offyq+(32>>%2)] 1870*c0909341SAndroid Build Coastguard Worker rorx offyd, seed, 8 1871*c0909341SAndroid Build Coastguard Worker rorx offxd, seed, 12 1872*c0909341SAndroid Build Coastguard Worker and offyd, 0xf000f 
1873*c0909341SAndroid Build Coastguard Worker and offxd, 0xf000f 1874*c0909341SAndroid Build Coastguard Worker imul offyd, 164>>%3 1875*c0909341SAndroid Build Coastguard Worker ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy 1876*c0909341SAndroid Build Coastguard Worker lea offyd, [offyq+offxq*(2-%2)+0x10001*((3+(6>>%3))*82+3+(6>>%2))+(32>>%3)*82] 1877*c0909341SAndroid Build Coastguard Worker 1878*c0909341SAndroid Build Coastguard Worker DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \ 1879*c0909341SAndroid Build Coastguard Worker h, offxy, see, left_offxy, top_offxy, topleft_offxy, lstride 1880*c0909341SAndroid Build Coastguard Worker 1881*c0909341SAndroid Build Coastguard Worker mov grain_lutq, grain_lutmp 1882*c0909341SAndroid Build Coastguard Worker mov hd, hm 1883*c0909341SAndroid Build Coastguard Worker movzx top_offxyd, offxyw 1884*c0909341SAndroid Build Coastguard Worker shr offxyd, 16 1885*c0909341SAndroid Build Coastguard Worker%if %2 == 0 1886*c0909341SAndroid Build Coastguard Worker vpbroadcastd m13, [pb_27_17] 1887*c0909341SAndroid Build Coastguard Worker%endif 1888*c0909341SAndroid Build Coastguard Worker%%loop_y_hv_overlap: 1889*c0909341SAndroid Build Coastguard Worker ; src 1890*c0909341SAndroid Build Coastguard Worker%if %2 1891*c0909341SAndroid Build Coastguard Worker mova xm3, [lumaq+lstrideq*0+ 0] 1892*c0909341SAndroid Build Coastguard Worker vinserti128 m3, [lumaq+lstrideq*(1+%3)+ 0], 1 1893*c0909341SAndroid Build Coastguard Worker vpbroadcastd m2, [pb_1] 1894*c0909341SAndroid Build Coastguard Worker mova xm0, [lumaq+lstrideq*0+16] 1895*c0909341SAndroid Build Coastguard Worker vinserti128 m0, [lumaq+lstrideq*(1+%3)+16], 1 1896*c0909341SAndroid Build Coastguard Worker mova xm1, [srcq] 1897*c0909341SAndroid Build Coastguard Worker vinserti128 m1, [srcq+strideq], 1 1898*c0909341SAndroid Build Coastguard Worker pmaddubsw m3, m2 1899*c0909341SAndroid Build Coastguard Worker pmaddubsw m0, m2 1900*c0909341SAndroid Build Coastguard 
Worker pavgw m3, m7 1901*c0909341SAndroid Build Coastguard Worker pavgw m0, m7 1902*c0909341SAndroid Build Coastguard Worker%else 1903*c0909341SAndroid Build Coastguard Worker mova m2, [lumaq] 1904*c0909341SAndroid Build Coastguard Worker mova m1, [srcq] 1905*c0909341SAndroid Build Coastguard Worker%endif 1906*c0909341SAndroid Build Coastguard Worker%if %1 1907*c0909341SAndroid Build Coastguard Worker%if %2 1908*c0909341SAndroid Build Coastguard Worker packuswb m2, m3, m0 ; luma 1909*c0909341SAndroid Build Coastguard Worker%endif 1910*c0909341SAndroid Build Coastguard Worker punpckhbw m3, m2, m1 1911*c0909341SAndroid Build Coastguard Worker punpcklbw m2, m1 ; { luma, chroma } 1912*c0909341SAndroid Build Coastguard Worker pmaddubsw m3, m14 1913*c0909341SAndroid Build Coastguard Worker pmaddubsw m2, m14 1914*c0909341SAndroid Build Coastguard Worker psraw m3, 6 1915*c0909341SAndroid Build Coastguard Worker psraw m2, 6 1916*c0909341SAndroid Build Coastguard Worker paddw m3, m15 1917*c0909341SAndroid Build Coastguard Worker paddw m2, m15 1918*c0909341SAndroid Build Coastguard Worker packuswb m2, m3 ; pack+unpack = clip 1919*c0909341SAndroid Build Coastguard Worker%endif 1920*c0909341SAndroid Build Coastguard Worker%if %1 || %2 == 0 1921*c0909341SAndroid Build Coastguard Worker punpcklbw m3, m2, m7 1922*c0909341SAndroid Build Coastguard Worker punpckhbw m0, m2, m7 1923*c0909341SAndroid Build Coastguard Worker%endif 1924*c0909341SAndroid Build Coastguard Worker 1925*c0909341SAndroid Build Coastguard Worker ; scaling[luma_src] 1926*c0909341SAndroid Build Coastguard Worker pandn m4, m8, m3 1927*c0909341SAndroid Build Coastguard Worker mova m6, m8 1928*c0909341SAndroid Build Coastguard Worker vpgatherdd m2, [scalingq+m4-0], m8 1929*c0909341SAndroid Build Coastguard Worker psrld m3, 16 1930*c0909341SAndroid Build Coastguard Worker mova m8, m6 1931*c0909341SAndroid Build Coastguard Worker vpgatherdd m4, [scalingq+m3-2], m6 1932*c0909341SAndroid Build Coastguard Worker pandn 
m5, m8, m0 1933*c0909341SAndroid Build Coastguard Worker mova m6, m8 1934*c0909341SAndroid Build Coastguard Worker vpgatherdd m3, [scalingq+m5-0], m8 1935*c0909341SAndroid Build Coastguard Worker psrld m0, 16 1936*c0909341SAndroid Build Coastguard Worker mova m8, m6 1937*c0909341SAndroid Build Coastguard Worker vpgatherdd m5, [scalingq+m0-2], m6 1938*c0909341SAndroid Build Coastguard Worker pblendw m2, m4, 0xaa 1939*c0909341SAndroid Build Coastguard Worker pblendw m3, m5, 0xaa 1940*c0909341SAndroid Build Coastguard Worker 1941*c0909341SAndroid Build Coastguard Worker ; grain = grain_lut[offy+y][offx+x] 1942*c0909341SAndroid Build Coastguard Worker%if %2 1943*c0909341SAndroid Build Coastguard Worker movu xm4, [grain_lutq+offxyq] 1944*c0909341SAndroid Build Coastguard Worker vinserti128 m4, [grain_lutq+offxyq+82], 1 1945*c0909341SAndroid Build Coastguard Worker movd xm0, [grain_lutq+left_offxyq] 1946*c0909341SAndroid Build Coastguard Worker vinserti128 m0, [grain_lutq+left_offxyq+82], 1 1947*c0909341SAndroid Build Coastguard Worker movd xm6, [grain_lutq+topleft_offxyq] 1948*c0909341SAndroid Build Coastguard Worker%if %3 1949*c0909341SAndroid Build Coastguard Worker movq xm5, [grain_lutq+top_offxyq] 1950*c0909341SAndroid Build Coastguard Worker vinserti128 m5, [grain_lutq+top_offxyq+8], 1 1951*c0909341SAndroid Build Coastguard Worker%else 1952*c0909341SAndroid Build Coastguard Worker vinserti128 m6, [grain_lutq+topleft_offxyq+82], 1 1953*c0909341SAndroid Build Coastguard Worker movu xm5, [grain_lutq+top_offxyq] 1954*c0909341SAndroid Build Coastguard Worker vinserti128 m5, [grain_lutq+top_offxyq+82], 1 1955*c0909341SAndroid Build Coastguard Worker%endif 1956*c0909341SAndroid Build Coastguard Worker 1957*c0909341SAndroid Build Coastguard Worker ; do h interpolation first (so top | top/left -> top, left | cur -> cur) 1958*c0909341SAndroid Build Coastguard Worker punpcklbw m0, m4 1959*c0909341SAndroid Build Coastguard Worker%if %3 1960*c0909341SAndroid Build Coastguard 
Worker punpcklbw xm6, xm5 1961*c0909341SAndroid Build Coastguard Worker%else 1962*c0909341SAndroid Build Coastguard Worker punpcklbw m6, m5 1963*c0909341SAndroid Build Coastguard Worker%endif 1964*c0909341SAndroid Build Coastguard Worker punpcklqdq m0, m6 1965*c0909341SAndroid Build Coastguard Worker%if %1 1966*c0909341SAndroid Build Coastguard Worker vpbroadcastq m6, [pb_23_22] 1967*c0909341SAndroid Build Coastguard Worker pmaddubsw m0, m6, m0 1968*c0909341SAndroid Build Coastguard Worker%else 1969*c0909341SAndroid Build Coastguard Worker pmaddubsw m0, m15, m0 1970*c0909341SAndroid Build Coastguard Worker%endif 1971*c0909341SAndroid Build Coastguard Worker pmulhrsw m0, m12 1972*c0909341SAndroid Build Coastguard Worker packsswb m0, m0 1973*c0909341SAndroid Build Coastguard Worker vpblendd m4, m0, 0x11 1974*c0909341SAndroid Build Coastguard Worker%if %3 1975*c0909341SAndroid Build Coastguard Worker pshuflw xm0, xm0, q1032 1976*c0909341SAndroid Build Coastguard Worker vpblendd m5, m0, 0x01 1977*c0909341SAndroid Build Coastguard Worker%else 1978*c0909341SAndroid Build Coastguard Worker pshuflw m0, m0, q1032 1979*c0909341SAndroid Build Coastguard Worker vpblendd m5, m0, 0x11 1980*c0909341SAndroid Build Coastguard Worker%endif 1981*c0909341SAndroid Build Coastguard Worker%else 1982*c0909341SAndroid Build Coastguard Worker movu m4, [grain_lutq+offxyq] 1983*c0909341SAndroid Build Coastguard Worker movd xm0, [grain_lutq+left_offxyq] 1984*c0909341SAndroid Build Coastguard Worker movu m5, [grain_lutq+top_offxyq] 1985*c0909341SAndroid Build Coastguard Worker movd xm6, [grain_lutq+topleft_offxyq] 1986*c0909341SAndroid Build Coastguard Worker punpcklbw xm0, xm4 1987*c0909341SAndroid Build Coastguard Worker punpcklbw xm6, xm5 1988*c0909341SAndroid Build Coastguard Worker punpcklqdq xm0, xm6 1989*c0909341SAndroid Build Coastguard Worker%if %1 1990*c0909341SAndroid Build Coastguard Worker vpbroadcastq xm6, [pb_27_17_17_27] 1991*c0909341SAndroid Build Coastguard Worker pmaddubsw 
xm0, xm6, xm0 1992*c0909341SAndroid Build Coastguard Worker%else 1993*c0909341SAndroid Build Coastguard Worker pmaddubsw xm0, xm15, xm0 1994*c0909341SAndroid Build Coastguard Worker%endif 1995*c0909341SAndroid Build Coastguard Worker pmulhrsw xm0, xm12 1996*c0909341SAndroid Build Coastguard Worker packsswb xm0, xm0 1997*c0909341SAndroid Build Coastguard Worker vpblendd m4, m0, 0x01 1998*c0909341SAndroid Build Coastguard Worker pshuflw xm0, xm0, q1032 1999*c0909341SAndroid Build Coastguard Worker vpblendd m5, m0, 0x01 2000*c0909341SAndroid Build Coastguard Worker%endif 2001*c0909341SAndroid Build Coastguard Worker 2002*c0909341SAndroid Build Coastguard Worker ; followed by v interpolation (top | cur -> cur) 2003*c0909341SAndroid Build Coastguard Worker%if %3 2004*c0909341SAndroid Build Coastguard Worker vpermq m0, m4, q3120 2005*c0909341SAndroid Build Coastguard Worker punpcklbw m5, m0 2006*c0909341SAndroid Build Coastguard Worker pmaddubsw m5, m13, m5 2007*c0909341SAndroid Build Coastguard Worker pmulhrsw m5, m12 2008*c0909341SAndroid Build Coastguard Worker vextracti128 xm0, m5, 1 2009*c0909341SAndroid Build Coastguard Worker packsswb xm5, xm0 2010*c0909341SAndroid Build Coastguard Worker vpblendd m5, m4, 0xf0 2011*c0909341SAndroid Build Coastguard Worker%else 2012*c0909341SAndroid Build Coastguard Worker punpckhbw m0, m5, m4 2013*c0909341SAndroid Build Coastguard Worker punpcklbw m5, m4 2014*c0909341SAndroid Build Coastguard Worker pmaddubsw m4, m13, m0 2015*c0909341SAndroid Build Coastguard Worker pmaddubsw m5, m13, m5 2016*c0909341SAndroid Build Coastguard Worker pmulhrsw m4, m12 2017*c0909341SAndroid Build Coastguard Worker pmulhrsw m5, m12 2018*c0909341SAndroid Build Coastguard Worker packsswb m5, m4 2019*c0909341SAndroid Build Coastguard Worker%endif 2020*c0909341SAndroid Build Coastguard Worker punpcklbw m4, m5, m7 2021*c0909341SAndroid Build Coastguard Worker punpckhbw m5, m7 2022*c0909341SAndroid Build Coastguard Worker 2023*c0909341SAndroid Build 
Coastguard Worker ; noise = round2(scaling[src] * grain, scaling_shift) 2024*c0909341SAndroid Build Coastguard Worker pmaddubsw m2, m4 2025*c0909341SAndroid Build Coastguard Worker pmaddubsw m3, m5 2026*c0909341SAndroid Build Coastguard Worker pmulhrsw m2, m9 2027*c0909341SAndroid Build Coastguard Worker pmulhrsw m3, m9 2028*c0909341SAndroid Build Coastguard Worker 2029*c0909341SAndroid Build Coastguard Worker ; unpack chroma source 2030*c0909341SAndroid Build Coastguard Worker punpcklbw m0, m1, m7 2031*c0909341SAndroid Build Coastguard Worker punpckhbw m1, m7 2032*c0909341SAndroid Build Coastguard Worker 2033*c0909341SAndroid Build Coastguard Worker ; dst = clip_pixel(src, noise) 2034*c0909341SAndroid Build Coastguard Worker paddw m0, m2 2035*c0909341SAndroid Build Coastguard Worker paddw m1, m3 2036*c0909341SAndroid Build Coastguard Worker packuswb m0, m1 2037*c0909341SAndroid Build Coastguard Worker pmaxub m0, m10 2038*c0909341SAndroid Build Coastguard Worker pminub m0, m11 2039*c0909341SAndroid Build Coastguard Worker%if %2 2040*c0909341SAndroid Build Coastguard Worker mova [dstq], xm0 2041*c0909341SAndroid Build Coastguard Worker vextracti128 [dstq+strideq], m0, 1 2042*c0909341SAndroid Build Coastguard Worker%else 2043*c0909341SAndroid Build Coastguard Worker mova [dstq], m0 2044*c0909341SAndroid Build Coastguard Worker%endif 2045*c0909341SAndroid Build Coastguard Worker 2046*c0909341SAndroid Build Coastguard Worker%if %2 2047*c0909341SAndroid Build Coastguard Worker lea srcq, [srcq+strideq*2] 2048*c0909341SAndroid Build Coastguard Worker lea dstq, [dstq+strideq*2] 2049*c0909341SAndroid Build Coastguard Worker lea lumaq, [lumaq+lstrideq*(2<<%3)] 2050*c0909341SAndroid Build Coastguard Worker%else 2051*c0909341SAndroid Build Coastguard Worker add srcq, strideq 2052*c0909341SAndroid Build Coastguard Worker add dstq, strideq 2053*c0909341SAndroid Build Coastguard Worker add lumaq, lstrideq 2054*c0909341SAndroid Build Coastguard Worker%endif 2055*c0909341SAndroid 
Build Coastguard Worker add grain_lutq, 82<<%2 2056*c0909341SAndroid Build Coastguard Worker sub hb, 1+%2 2057*c0909341SAndroid Build Coastguard Worker%if %2 2058*c0909341SAndroid Build Coastguard Worker jg %%loop_y_h_overlap 2059*c0909341SAndroid Build Coastguard Worker%else 2060*c0909341SAndroid Build Coastguard Worker je %%end_y_hv_overlap 2061*c0909341SAndroid Build Coastguard Worker vpbroadcastd m13, [pb_17_27] 2062*c0909341SAndroid Build Coastguard Worker add hd, 0x80000000 2063*c0909341SAndroid Build Coastguard Worker jnc %%loop_y_hv_overlap 2064*c0909341SAndroid Build Coastguard Worker jmp %%loop_y_h_overlap 2065*c0909341SAndroid Build Coastguard Worker%endif 2066*c0909341SAndroid Build Coastguard Worker 2067*c0909341SAndroid Build Coastguard Worker%%end_y_hv_overlap: 2068*c0909341SAndroid Build Coastguard Worker add wq, 32>>%2 2069*c0909341SAndroid Build Coastguard Worker jge .end 2070*c0909341SAndroid Build Coastguard Worker mov srcq, r11mp 2071*c0909341SAndroid Build Coastguard Worker mov dstq, r12mp 2072*c0909341SAndroid Build Coastguard Worker lea lumaq, [r14+wq*(1+%2)] 2073*c0909341SAndroid Build Coastguard Worker add srcq, wq 2074*c0909341SAndroid Build Coastguard Worker add dstq, wq 2075*c0909341SAndroid Build Coastguard Worker jmp %%loop_x_hv_overlap 2076*c0909341SAndroid Build Coastguard Worker%endmacro 2077*c0909341SAndroid Build Coastguard Worker 2078*c0909341SAndroid Build Coastguard Worker %%FGUV_32x32xN_LOOP 1, %2, %3 2079*c0909341SAndroid Build Coastguard Worker.csfl: 2080*c0909341SAndroid Build Coastguard Worker %%FGUV_32x32xN_LOOP 0, %2, %3 2081*c0909341SAndroid Build Coastguard Worker.end: 2082*c0909341SAndroid Build Coastguard Worker RET 2083*c0909341SAndroid Build Coastguard Worker%endmacro 2084*c0909341SAndroid Build Coastguard Worker 2085*c0909341SAndroid Build Coastguard WorkerGEN_GRAIN_UV_FN 420, 1, 1 2086*c0909341SAndroid Build Coastguard WorkerFGUV_FN 420, 1, 1 2087*c0909341SAndroid Build Coastguard WorkerGEN_GRAIN_UV_FN 422, 1, 0 
; Remaining per-layout instantiations (4:2:2 apply function, then the 4:4:4
; generator/apply pair). As invoked in this file, each macro receives a layout
; tag (420/422/444) followed by two 0/1 flags. Inside FGUV_FN those flags are
; forwarded unchanged as %2/%3 to the inner loop macro, where %2 scales the
; per-iteration column step (32>>%2) and the rows handled per iteration (1+%2),
; and %3 scales the luma row advance (2<<%3).
; NOTE(review): GEN_GRAIN_UV_FN is defined outside this view; presumably it
; takes the same (layout, ss_hor, ss_ver) triple -- confirm against its
; definition.
FGUV_FN 422, 1, 0
GEN_GRAIN_UV_FN 444, 0, 0
FGUV_FN 444, 0, 0

%endif ; ARCH_X86_64