; Copyright © 2022, VideoLAN and dav1d authors
; Copyright © 2022, Two Orioles, LLC
; All rights reserved.
;
; Redistribution and use in source and binary forms, with or without
; modification, are permitted provided that the following conditions are met:
;
; 1. Redistributions of source code must retain the above copyright notice, this
;    list of conditions and the following disclaimer.
;
; 2. Redistributions in binary form must reproduce the above copyright notice,
;    this list of conditions and the following disclaimer in the documentation
;    and/or other materials provided with the distribution.
;
; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
; DISCLAIMED.
; IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

%include "config.asm"
%include "ext/x86/x86inc.asm"
%include "x86/filmgrain_common.asm"

%if ARCH_X86_64

SECTION_RODATA 16
; byte shuffle used in .add_noise: byte 0 of every dword is moved to byte 2,
; all other selected bytes are zeroed (-1)
scale_mask:     db -1, -1,  0, -1, -1, -1,  4, -1, -1, -1,  8, -1, -1, -1, 12, -1
scale_shift:    dw   7,  7,  6,   6,  5,  5,  4,  4
; overlap weights; the first four words are the last four multiplied by 4 and
; are selected when is_12bpc == 0
pw_27_17_17_27: dw 108, 68, 68, 108, 27, 17, 17, 27
pw_23_22:       dw  92, 88,  0, 128, 23, 22,  0, 32
; lower clamp, indexed by (clip_to_restricted_range << is_12bpc)
fg_min:         times 2 dw 0
                times 2 dw 64
                times 2 dw 256
; upper clamp, indexed by is_12bpc plus a multiple of the clip flag
fg_max:         times 2 dw 1023
                times 2 dw 4095
                times 2 dw 960
                times 2 dw 3840
                times 2 dw 940
                times 2 dw 3760
scale_rnd:      dd 64
                dd 16
uv_offset_mul:  dd 256
                dd 1024
pb_8_9_0_1:     db 8, 9, 0, 1

cextern pb_0to63

SECTION .text

;------------------------------------------------------------------------------
; fgy_32x32xn_16bpc
;
; Adds scaled grain taken from grain_lut to the pixels at src and stores the
; clamped result at dst. Work is done in columns of 32 pixels (wq advances by
; 32 per column) and two rows per .add_noise call (hb decreases by 2).
; Besides the register arguments it reads hm, sbym, grain_lutmp and r9m
; (bdmax) from the argument area.
;
; Column-loop variants, selected from FGData.overlap_flag and sby:
;   .loop_x            no blending
;   .loop_x_h_overlap  first two grain samples of each row are blended with
;                      samples addressed through left_offxy
;   .v_overlap         first two rows are blended with rows addressed through
;                      top_offxy (only ever used for the first column)
;   .hv_overlap        both of the above
;
; Constant registers set up in the prologue:
;   m6       bdmax broadcast to every dword
;   m7       scale_mask byte shuffle
;   m8/m9    lower/upper clamp (fg_min/fg_max entry chosen from
;            clip_to_restricted_range and is_12bpc)
;   m10      per-word left shift for the gathered scaling values
;            (scale_shift entry chosen from FGData.scaling_shift)
;   m11      per-word arithmetic right shift applied to blended grain
;   m12/m13  vertical-overlap weights for the first/second row
;   m19      rounding constant (scale_rnd) for the overlap dot products
;   xm20     horizontal-overlap weights
;   k1 = 0xffff, k2 = 0xf, k3 = 0xeeeeeeeeeeeeeeee
;------------------------------------------------------------------------------
INIT_ZMM avx512icl
cglobal fgy_32x32xn_16bpc, 6, 15, 21, dst, src, stride, fg_data, w, scaling, \
                                      grain_lut, offx, sby, see, offy, src_bak
%define base r11-fg_min
    lea             r11, [fg_min]
    mov             r6d, r9m    ; bdmax
    mov             r9d, [fg_dataq+FGData.clip_to_restricted_range]
    mov             r7d, [fg_dataq+FGData.scaling_shift]
    mov            sbyd, sbym
    vpbroadcastd     m6, r9m
    shr             r6d, 11     ; is_12bpc
    vbroadcasti32x4  m7, [base+scale_mask]
    shlx           r10d, r9d, r6d
    vpbroadcastd    m10, [base+scale_shift+r7*4-32]
    lea             r9d, [r6+r9*4]
    vpbroadcastd     m8, [base+fg_min+r10*4]
    kxnorw           k1, k1, k1 ; 0xffff
    vpbroadcastd     m9, [base+fg_max+r9*4]
    mov             r12, 0xeeeeeeeeeeeeeeee
    vpbroadcastd    m19, [base+scale_rnd+r6*4]
    kshiftrb         k2, k1, 4  ; 0xf
    vpbroadcastq   xm20, [base+pw_27_17_17_27+r6*8]
    kmovq            k3, r12
    vpbroadcastd    m11, [base+scale_shift+r6*8+4]
    test           sbyd, sbyd
    setnz           r7b
    vpbroadcastd    m12, [base+pw_27_17_17_27+r6*8+0]
    vpbroadcastd    m13, [base+pw_27_17_17_27+r6*8+4]
    ; vertical overlap is taken only when sby != 0 and the overlap flag is set
    test            r7b, [fg_dataq+FGData.overlap_flag]
    jnz .v_overlap

    imul           seed, sbyd, (173 << 24) | 37
    add            seed, (105 << 24) | 178
    rorx           seed, seed, 24
    movzx          seed, seew
    xor            seed, [fg_dataq+FGData.seed]
    ; src_bak points past the end of the row; wq becomes a negative column
    ; counter and dstq becomes the dst-src delta so [dstq+srcq] addresses dst
    lea        src_bakq, [srcq+wq*2]
    neg              wq
    sub            dstq, srcq

.loop_x:
    rorx             r6, seeq, 1
    or             seed, 0xeff4
    test           seeb, seeh
    lea            seed, [r6+0x8000]
    cmovp          seed, r6d               ; updated seed
    rorx          offyd, seed, 8
    rorx          offxq, seeq, 12
    and           offyd, 0xf
    imul          offyd, 164
    lea           offyd, [offyq+offxq*2+747] ; offy*stride+offx

    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, h, \
                sby, see, offxy, src_bak

    mov      grain_lutq, grain_lutmp
    mov              hd, hm
.loop_y:
    movu             m4, [grain_lutq+offxyq*2+82*0]
    movu             m5, [grain_lutq+offxyq*2+82*2]
    call .add_noise
    sub              hb, 2
    jg .loop_y
    add              wq, 32
    jge .end
    lea            srcq, [src_bakq+wq*2]
    cmp byte [fg_dataq+FGData.overlap_flag], 0
    je .loop_x
    test           sbyd, sbyd
    jnz .hv_overlap

    ; horizontal overlap (without vertical overlap)
.loop_x_h_overlap:
    rorx             r6, seeq, 1
    or             seed, 0xeff4
    test           seeb, seeh
    lea            seed, [r6+0x8000]
    cmovp          seed, r6d               ; updated seed

    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, offx, \
                sby, see, offy, src_bak, left_offxy

    ; +73 = 32+41: the loads below compensate with -82*1/+82*1 byte offsets
    lea     left_offxyd, [offyq+73]        ; previous column's offy*stride+offx
    rorx          offyd, seed, 8
    rorx          offxq, seeq, 12
    and           offyd, 0xf
    imul          offyd, 164
    lea           offyd, [offyq+offxq*2+747] ; offy*stride+offx

    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, h, \
                sby, see, offxy, src_bak, left_offxy

    mov      grain_lutq, grain_lutmp
    mov              hd, hm
.loop_y_h_overlap:
    movu             m4, [grain_lutq+offxyq*2+82*0]
    movu             m5, [grain_lutq+offxyq*2+82*2]
    movd           xm17, [grain_lutq+left_offxyq*2-82*1]
    pinsrd         xm17, [grain_lutq+left_offxyq*2+82*1], 1
    ; interleave left/current samples of both rows and blend them with xm20
    punpckldq      xm16, xm4, xm5
    punpcklwd      xm17, xm16
    mova           xm16, xm19
    vpdpwssd       xm16, xm20, xm17
    psrad          xm16, 1
    packssdw       xm16, xm16
    vpsravw        xm16, xm11
    ; k2 = 0xf: only the first four bytes (two samples) of each row are replaced
    vmovdqu8     m4{k2}, m16
    vpalignr     m5{k2}, m16, m16, 4
    call .add_noise
    sub              hb, 2
    jg .loop_y_h_overlap
    add              wq, 32
    jge .end
    lea            srcq, [src_bakq+wq*2]
    test           sbyd, sbyd
    jnz .hv_overlap
    jmp .loop_x_h_overlap

.v_overlap:
    movzx          sbyd, sbyb
    imul           seed, [fg_dataq+FGData.seed], 0x00010001
    imul            r7d, sbyd, 173 * 0x00010001
    imul           sbyd, 37 * 0x01000100
    add             r7d, (105 << 16) | 188
    add            sbyd, (178 << 24) | (141 << 8)
    and             r7d, 0x00ff00ff
    and            sbyd, 0xff00ff00
    xor            seed, r7d
    xor            seed, sbyd              ; (cur_seed << 16) | top_seed
    lea        src_bakq, [srcq+wq*2]
    neg              wq
    sub            dstq, srcq

    ; we assume from the block above that bits 8-15 of r7d are zero'ed
    mov             r6d, seed
    or             seed, 0xeff4eff4
    test           seeb, seeh
    setp            r7b                    ; parity of top_seed
    shr            seed, 16
    shl             r7d, 16
    test           seeb, seeh
    setp            r7b                    ; parity of cur_seed
    or              r6d, 0x00010001
    xor             r7d, r6d
    rorx           seed, r7d, 1            ; updated (cur_seed << 16) | top_seed

    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, offx, \
                sby, see, offy, src_bak, _, top_offxy

    rorx          offyd, seed, 8
    rorx          offxd, seed, 12
    and           offyd, 0xf000f
    and           offxd, 0xf000f
    imul          offyd, 164
    ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy
    lea           offyd, [offyq+offxq*2+0x10001*747+32*82]

    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, h, \
                sby, see, offxy, src_bak, _, top_offxy

    mov      grain_lutq, grain_lutmp
    mov              hd, hm
    movzx    top_offxyd, offxyw
    shr          offxyd, 16

    movu            m16, [grain_lutq+offxyq*2+82*0]
    movu             m0, [grain_lutq+top_offxyq*2+82*0]
    movu            m17, [grain_lutq+offxyq*2+82*2]
    movu             m1, [grain_lutq+top_offxyq*2+82*2]
    ; interleave top/current samples so .add_noise_v can blend them pairwise
    punpckhwd        m4, m0, m16
    punpcklwd        m0, m16
    punpckhwd        m5, m1, m17
    punpcklwd        m1, m17
    call .add_noise_v
    sub              hb, 2
    jg .loop_y                             ; remaining rows need no blending
    add              wq, 32
    jge .end
    lea            srcq, [src_bakq+wq*2]

    ; since fg_dataq.overlap is guaranteed to be set, we never jump back
    ; to .v_overlap, and instead always fall-through to .hv_overlap
.hv_overlap:
    ; we assume from the block above that bits 8-15 of r7d are zero'ed
    mov             r6d, seed
    or             seed, 0xeff4eff4
    test           seeb, seeh
    setp            r7b                    ; parity of top_seed
    shr            seed, 16
    shl             r7d, 16
    test           seeb, seeh
    setp            r7b                    ; parity of cur_seed
    or              r6d, 0x00010001
    xor             r7d, r6d
    rorx           seed, r7d, 1            ; updated (cur_seed << 16) | top_seed

    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, offx, \
                sby, see, offy, src_bak, left_offxy, top_offxy, topleft_offxy

    lea  topleft_offxyd, [top_offxyq+73]
    lea     left_offxyd, [offyq+73]
    rorx          offyd, seed, 8
    rorx          offxd, seed, 12
    and           offyd, 0xf000f
    and           offxd, 0xf000f
    imul          offyd, 164
    ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy
    lea           offyd, [offyq+offxq*2+0x10001*747+32*82]

    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, h, \
                sby, see, offxy, src_bak, left_offxy, top_offxy, topleft_offxy

    mov      grain_lutq, grain_lutmp
    mov              hd, hm
    movzx    top_offxyd, offxyw
    shr          offxyd, 16

    movu             m5, [grain_lutq+offxyq*2+82*0]
    movu             m0, [grain_lutq+top_offxyq*2+82*0]
    movd           xm17, [grain_lutq+left_offxyq*2-82*1]
    pinsrd         xm17, [grain_lutq+topleft_offxyq*2-82*1], 1
    movu             m2, [grain_lutq+offxyq*2+82*2]
    movu             m1, [grain_lutq+top_offxyq*2+82*2]
    movd           xm18, [grain_lutq+left_offxyq*2+82*1]
    pinsrd         xm18, [grain_lutq+topleft_offxyq*2+82*1], 1
    ; horizontal blend of the first two samples of the current and top rows
    punpckldq      xm16, xm5, xm0
    punpcklwd      xm17, xm16
    mova           xm16, xm19
    vpdpwssd       xm16, xm20, xm17
    punpckldq      xm17, xm2, xm1
    punpcklwd      xm18, xm17
    mova           xm17, xm19
    vpdpwssd       xm17, xm20, xm18
    punpckhwd        m4, m0, m5
    punpcklwd        m0, m5
    punpckhwd        m5, m1, m2
    punpcklwd        m1, m2
    psrad          xm16, 1
    psrad          xm17, 1
    packssdw       xm16, xm17
    vpsravw        xm16, xm11
    ; merge the horizontally blended samples into the interleaved top/current
    ; vectors before the vertical blend in .add_noise_v
    vpshuflw     m0{k2}, m16, q1302
    punpckhqdq     xm16, xm16
    vpshuflw     m1{k2}, m16, q1302
    call .add_noise_v
    sub              hb, 2
    jg .loop_y_h_overlap
    add              wq, 32
    lea            srcq, [src_bakq+wq*2]
    jl .hv_overlap
.end:
    RET
ALIGN function_align
; Vertical blend: m0/m4 (row 0) and m1/m5 (row 1) hold interleaved top/current
; sample pairs; they are weighted with m12/m13, rounded with m19 and shifted,
; leaving the blended grain rows in m4/m5. Falls through into .add_noise.
.add_noise_v:
    mova             m2, m19
    vpdpwssd         m2, m12, m4
    mova             m3, m19
    vpdpwssd         m3, m13, m5
    mova             m4, m19
    vpdpwssd         m4, m12, m0
    mova             m5, m19
    vpdpwssd         m5, m13, m1
    REPX   {psrad x, 1}, m2, m3, m4, m5
    packssdw         m4, m2
    packssdw         m5, m3
    vpsravw          m4, m11
    vpsravw          m5, m11
; Gathers a scaling value for every pixel of two source rows, scales the grain
; in m4/m5 with it, adds the result to the source rows, clamps to [m8, m9] and
; stores to dst. Advances srcq by two rows and grain_lutq by 82*4 bytes.
.add_noise:
    mova             m0, [srcq+strideq*0]
    mova             m1, [srcq+strideq*1]
    kmovw            k4, k1                ; gathers clear their mask; reload it
    pand            m16, m6, m0            ; low pixel of each dword
    psrld            m3, m0, 16            ; high pixel of each dword
    vpgatherdd   m2{k4}, [scalingq+m16]
    vpcmpud          k4, m3, m6, 2 ; px <= bdmax
    vpgatherdd  m16{k4}, [scalingq+m3]
    kmovw            k4, k1
    pand            m17, m6, m1
    vpgatherdd   m3{k4}, [scalingq+m17]
    ; k3 leaves byte 0 of each dword untouched and writes byte 0 of the
    ; high-pixel gather into byte 2, zeroing bytes 1 and 3
    vpshufb      m2{k3}, m16, m7
    psrld           m16, m1, 16
    vpcmpud          k4, m16, m6, 2
    vpgatherdd  m17{k4}, [scalingq+m16]
    vpshufb      m3{k3}, m17, m7
    vpsllvw          m2, m10
    vpsllvw          m3, m10
    pmulhrsw         m4, m2
    pmulhrsw         m5, m3
    add      grain_lutq, 82*4
    paddw            m0, m4
    paddw            m1, m5
    pmaxsw           m0, m8
    pmaxsw           m1, m8
    pminsw           m0, m9
    pminsw           m1, m9
    mova    [dstq+srcq], m0
    add            srcq, strideq
    mova    [dstq+srcq], m1
    add            srcq, strideq
    ret

%macro FGUV_FN 3 ; name, ss_hor, ss_ver
cglobal fguv_32x32xn_i%1_16bpc, 6, 15, 22, dst, src, stride, fg_data, w, scaling, \
                                           grain_lut, h, sby, luma, lstride, uv_pl, is_id
%define base r12-fg_min
    lea             r12, [fg_min]
    mov             r9d, r13m            ; bdmax
    mov             r7d,
[fg_dataq+FGData.scaling_shift] 364*c0909341SAndroid Build Coastguard Worker mov r6d, [fg_dataq+FGData.clip_to_restricted_range] 365*c0909341SAndroid Build Coastguard Worker mov r11d, is_idm 366*c0909341SAndroid Build Coastguard Worker kxnorw k1, k1, k1 ; 0xffff 367*c0909341SAndroid Build Coastguard Worker vpbroadcastd m5, r13m 368*c0909341SAndroid Build Coastguard Worker mov r13, 0xeeeeeeeeeeeeeeee 369*c0909341SAndroid Build Coastguard Worker vbroadcasti32x4 m6, [base+scale_mask] 370*c0909341SAndroid Build Coastguard Worker shr r9d, 11 ; is_12bpc 371*c0909341SAndroid Build Coastguard Worker vpbroadcastd m7, [base+scale_shift+r7*4-32] 372*c0909341SAndroid Build Coastguard Worker shlx r10d, r6d, r9d 373*c0909341SAndroid Build Coastguard Worker mov sbyd, sbym 374*c0909341SAndroid Build Coastguard Worker shlx r6d, r6d, r11d 375*c0909341SAndroid Build Coastguard Worker vpbroadcastd m8, [base+fg_min+r10*4] 376*c0909341SAndroid Build Coastguard Worker lea r6d, [r9+r6*2] 377*c0909341SAndroid Build Coastguard Worker vpbroadcastd m9, [base+fg_max+r6*4] 378*c0909341SAndroid Build Coastguard Worker kmovq k2, r13 379*c0909341SAndroid Build Coastguard Worker vpbroadcastd m20, [base+scale_rnd+r9*4] 380*c0909341SAndroid Build Coastguard Worker packssdw m4, m5, m5 381*c0909341SAndroid Build Coastguard Worker vpbroadcastd m21, [base+scale_shift+r9*8+4] 382*c0909341SAndroid Build Coastguard Worker%if %2 383*c0909341SAndroid Build Coastguard Worker mova m12, [pb_0to63] ; pw_even 384*c0909341SAndroid Build Coastguard Worker mov r13d, 0x0101 385*c0909341SAndroid Build Coastguard Worker vpbroadcastq m10, [base+pw_23_22+r9*8] 386*c0909341SAndroid Build Coastguard Worker kmovw k3, r13d 387*c0909341SAndroid Build Coastguard Worker%if %3 388*c0909341SAndroid Build Coastguard Worker pshufd m11, m10, q0000 389*c0909341SAndroid Build Coastguard Worker%else 390*c0909341SAndroid Build Coastguard Worker vpbroadcastd ym16, [base+pw_27_17_17_27+r9*8+0] 391*c0909341SAndroid Build Coastguard Worker 
vpbroadcastd m11, [base+pw_27_17_17_27+r9*8+4] 392*c0909341SAndroid Build Coastguard Worker vmovdqu16 m11{k1}, m16 393*c0909341SAndroid Build Coastguard Worker%endif 394*c0909341SAndroid Build Coastguard Worker psrlw m13, m12, 8 ; pw_odd 395*c0909341SAndroid Build Coastguard Worker%else 396*c0909341SAndroid Build Coastguard Worker vpbroadcastq m10, [base+pw_27_17_17_27+r9*8] 397*c0909341SAndroid Build Coastguard Worker kshiftrb k3, k1, 7 ; 0x01 398*c0909341SAndroid Build Coastguard Worker kshiftrb k4, k1, 4 ; 0x0f 399*c0909341SAndroid Build Coastguard Worker pshufd m11, m10, q0000 400*c0909341SAndroid Build Coastguard Worker%endif 401*c0909341SAndroid Build Coastguard Worker mov lstrideq, r10mp 402*c0909341SAndroid Build Coastguard Worker test sbyd, sbyd 403*c0909341SAndroid Build Coastguard Worker setnz r7b 404*c0909341SAndroid Build Coastguard Worker cmp byte [fg_dataq+FGData.chroma_scaling_from_luma], 0 405*c0909341SAndroid Build Coastguard Worker jne .csfl 406*c0909341SAndroid Build Coastguard Worker 407*c0909341SAndroid Build Coastguard Worker%macro %%FGUV_32x32xN_LOOP 3 ; not-csfl, ss_hor, ss_ver 408*c0909341SAndroid Build Coastguard Worker DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ 409*c0909341SAndroid Build Coastguard Worker _, sby, see, lstride 410*c0909341SAndroid Build Coastguard Worker 411*c0909341SAndroid Build Coastguard Worker%if %1 412*c0909341SAndroid Build Coastguard Worker mov r6d, r11m 413*c0909341SAndroid Build Coastguard Worker vpbroadcastd m0, [base+uv_offset_mul+r9*4] 414*c0909341SAndroid Build Coastguard Worker vpbroadcastd m1, [base+pb_8_9_0_1] 415*c0909341SAndroid Build Coastguard Worker vpbroadcastd m14, [fg_dataq+FGData.uv_offset+r6*4] 416*c0909341SAndroid Build Coastguard Worker vbroadcasti32x4 m15, [fg_dataq+FGData.uv_mult+r6*4] 417*c0909341SAndroid Build Coastguard Worker pmaddwd m14, m0 418*c0909341SAndroid Build Coastguard Worker pshufb m15, m1 ; { uv_luma_mult, uv_mult } 419*c0909341SAndroid Build Coastguard 
Worker%endif 420*c0909341SAndroid Build Coastguard Worker test r7b, [fg_dataq+FGData.overlap_flag] 421*c0909341SAndroid Build Coastguard Worker jnz %%v_overlap 422*c0909341SAndroid Build Coastguard Worker 423*c0909341SAndroid Build Coastguard Worker imul seed, sbyd, (173 << 24) | 37 424*c0909341SAndroid Build Coastguard Worker add seed, (105 << 24) | 178 425*c0909341SAndroid Build Coastguard Worker rorx seed, seed, 24 426*c0909341SAndroid Build Coastguard Worker movzx seed, seew 427*c0909341SAndroid Build Coastguard Worker xor seed, [fg_dataq+FGData.seed] 428*c0909341SAndroid Build Coastguard Worker 429*c0909341SAndroid Build Coastguard Worker DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ 430*c0909341SAndroid Build Coastguard Worker offx, offy, see, lstride, luma 431*c0909341SAndroid Build Coastguard Worker 432*c0909341SAndroid Build Coastguard Worker mov lumaq, r9mp 433*c0909341SAndroid Build Coastguard Worker lea r12, [srcq+wq*2] 434*c0909341SAndroid Build Coastguard Worker lea r13, [dstq+wq*2] 435*c0909341SAndroid Build Coastguard Worker lea r14, [lumaq+wq*(2<<%2)] 436*c0909341SAndroid Build Coastguard Worker mov r9mp, r12 437*c0909341SAndroid Build Coastguard Worker mov r10mp, r13 438*c0909341SAndroid Build Coastguard Worker mov r11mp, r14 439*c0909341SAndroid Build Coastguard Worker neg wq 440*c0909341SAndroid Build Coastguard Worker 441*c0909341SAndroid Build Coastguard Worker%%loop_x: 442*c0909341SAndroid Build Coastguard Worker rorx r6, seeq, 1 443*c0909341SAndroid Build Coastguard Worker or seed, 0xeff4 444*c0909341SAndroid Build Coastguard Worker test seeb, seeh 445*c0909341SAndroid Build Coastguard Worker lea seed, [r6+0x8000] 446*c0909341SAndroid Build Coastguard Worker cmovp seed, r6d ; updated seed 447*c0909341SAndroid Build Coastguard Worker rorx offyd, seed, 8 448*c0909341SAndroid Build Coastguard Worker rorx offxq, seeq, 12 449*c0909341SAndroid Build Coastguard Worker and offyd, 0xf 450*c0909341SAndroid Build Coastguard Worker 
; -----------------------------------------------------------------------------
; FGUV_32x32xN_LOOP body, part 1: the rest of the no-overlap column loop, the
; horizontal-overlap column loop, and the vertical-overlap entry path.
; The macro header and the %%loop_x label jumped to below sit earlier in the
; file.
;
; Macro arguments as they are used in this part:
;   %2 - set for the 420 and 422 instantiations, clear for 444
;        (presumably the horizontal subsampling flag)
;   %3 - set for the 420 instantiation only
;        (presumably the vertical subsampling flag)
;
; DEFINE_ARGS maps names to registers by position, so offx shares a register
; with h, and offy shares one with offxy and sby.  Code that reads "offy"
; right after a DEFINE_ARGS switch is therefore reading the previous offxy.
; grain_lut is the 7th name, i.e. r6, which the seed update uses as scratch;
; that is why grain_lutq is reloaded from its stack slot before each row loop.
;
; m10, m20, m21 and the k3 mask are initialised outside this chunk; the roles
; mentioned below are inferred from how they are used here (TODO confirm
; against the function prologue).
; -----------------------------------------------------------------------------
    ; Tail of the no-overlap column setup (its first instructions precede this
    ; point): scale the row offset and combine it with the column offset.
    imul            offyd, 164>>%3
    lea             offyd, [offyq+offxq*(2-%2)+(3+(6>>%3))*82+(3+(6>>%2))] ; offy*stride+offx

    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
                h, offxy, see, lstride, luma

    mov             grain_lutq, grain_lutmp
    mov             hd, hm
%%loop_y:
    ; Load this iteration's grain into m18/m19.  Entries are 2 bytes wide and
    ; consecutive grain rows are 82*2 bytes apart: four half-width rows when
    ; %2 is set, two full-width rows otherwise.
%if %2
    movu            ym18, [grain_lutq+offxyq*2+82*0]
    vinserti32x8    m18, [grain_lutq+offxyq*2+82*2], 1
    movu            ym19, [grain_lutq+offxyq*2+82*4]
    vinserti32x8    m19, [grain_lutq+offxyq*2+82*6], 1
%else
    movu            m18, [grain_lutq+offxyq*2+82*0]
    movu            m19, [grain_lutq+offxyq*2+82*2]
%endif
    call            %%add_noise
    sub             hb, 2<<%2
    jg              %%loop_y
    ; Next column: w counts up towards zero, stop once it is no longer negative.
    add             wq, 32>>%2
    jge             .end
    ; Reload the per-plane base pointers kept in the r9m/r10m/r11m stack slots
    ; and index them by w.
    mov             srcq, r9mp
    mov             dstq, r10mp
    mov             lumaq, r11mp
    lea             srcq, [srcq+wq*2]
    lea             dstq, [dstq+wq*2]
    lea             lumaq, [lumaq+wq*(2<<%2)]
    cmp             byte [fg_dataq+FGData.overlap_flag], 0
    je              %%loop_x
    cmp             dword r8m, 0 ; sby
    jne             %%hv_overlap

    ; horizontal overlap (without vertical overlap)
%%loop_x_h_overlap:
    ; Advance the seed.  OR-ing with 0xEFF4 forces every bit except 0, 1, 3
    ; and 12, so the parity flag produced by the test reflects the XOR of
    ; those four seed bits: even parity keeps the plain rotated value (r6),
    ; odd parity keeps r6+0x8000.
    rorx            r6, seeq, 1
    or              seed, 0xEFF4
    test            seeb, seeh
    lea             seed, [r6+0x8000]
    cmovp           seed, r6d ; updated seed

    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
                offx, offy, see, lstride, luma, left_offxy

    lea             left_offxyd, [offyq+(32>>%2)] ; previous column's offy*stride+offx
    rorx            offyd, seed, 8
    ; The 64-bit rotate leaves seed bits 12-15 in bits 0-3 and parks the lower
    ; seed bits near the top of the register, where the 32-bit lea below drops
    ; them; no mask is applied to offx (assumes the bits of seeq above the
    ; seed are zero - TODO confirm).
    rorx            offxq, seeq, 12
    and             offyd, 0xf
    imul            offyd, 164>>%3
    lea             offyd, [offyq+offxq*(2-%2)+(3+(6>>%3))*82+3+(6>>%2)] ; offy*stride+offx

    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
                h, offxy, see, lstride, luma, left_offxy

    mov             grain_lutq, grain_lutmp
    mov             hd, hm
%%loop_y_h_overlap:
%if %2
    movu            ym18, [grain_lutq+offxyq*2+82*0]
    vinserti32x8    m18, [grain_lutq+offxyq*2+82*2], 1
    movu            ym19, [grain_lutq+offxyq*2+82*4]
    vinserti32x8    m19, [grain_lutq+offxyq*2+82*6], 1
    movd            xm16, [grain_lutq+left_offxyq*2+82*0]
    vinserti32x4    m16, [grain_lutq+left_offxyq*2+82*2], 2
    movd            xm17, [grain_lutq+left_offxyq*2+82*4]
    vinserti32x4    m17, [grain_lutq+left_offxyq*2+82*6], 2
    ; pair each left-column word with the matching current-column word
    punpckldq       m16, m17
    punpckldq       m17, m18, m19
    punpcklwd       m16, m17
    ; m20 + dot((left, cur), m10), >> 1, pack, then a per-word shift by m21
    mova            m17, m20
    vpdpwssd        m17, m16, m10
    psrad           m17, 1
    packssdw        m17, m17
    vpsravw         m17, m21
%else
    movu            m18, [grain_lutq+offxyq*2+82*0]
    movu            m19, [grain_lutq+offxyq*2+82*2]
    movd            xm16, [grain_lutq+left_offxyq*2+82*0]
    pinsrd          xm16, [grain_lutq+left_offxyq*2+82*2], 1
    punpckldq       xm17, xm18, xm19
    punpcklwd       xm16, xm17
    mova            xm17, xm20
    vpdpwssd        xm17, xm16, xm10
    psrad           xm17, 1
    packssdw        xm17, xm17
    vpsravw         xm17, xm21
%endif
    ; patch the blended entries into the grain rows where k3 is set
    vmovdqa32       m18{k3}, m17
    vpshufd         m19{k3}, m17, q0321
    call            %%add_noise
    sub             hb, 2<<%2
    jg              %%loop_y_h_overlap
    add             wq, 32>>%2
    jge             .end
    mov             srcq, r9mp
    mov             dstq, r10mp
    mov             lumaq, r11mp
    lea             srcq, [srcq+wq*2]
    lea             dstq, [dstq+wq*2]
    lea             lumaq, [lumaq+wq*(2<<%2)]
    cmp             dword r8m, 0 ; sby
    jne             %%hv_overlap
    jmp             %%loop_x_h_overlap

%%v_overlap:
    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
                _, sby, see, lstride

    ; Build two 16-bit seeds in one register.  The upper half mixes sby with
    ; the constants 173/105 (low byte) and 37/178 (high byte).  The lower half
    ; uses 188 and 141, which equal 105-173 and 178-37 modulo 256, i.e. the
    ; same mix evaluated for sby-1.
    movzx           sbyd, sbyb
    imul            seed, [fg_dataq+FGData.seed], 0x00010001
    imul            r7d, sbyd, 173 * 0x00010001
    imul            sbyd, 37 * 0x01000100
    add             r7d, (105 << 16) | 188
    add             sbyd, (178 << 24) | (141 << 8)
    and             r7d, 0x00ff00ff
    and             sbyd, 0xff00ff00
    xor             seed, r7d
    xor             seed, sbyd ; (cur_seed << 16) | top_seed

    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
                offx, offy, see, lstride, luma, _, top_offxy

    ; Store src/dst/luma advanced by w in the r9m/r10m/r11m slots, then negate
    ; w so that it indexes backwards from those pointers.
    mov             lumaq, r9mp
    lea             r12, [srcq+wq*2]
    lea             r13, [dstq+wq*2]
    lea             r14, [lumaq+wq*(2<<%2)]
    mov             r9mp, r12
    mov             r10mp, r13
    mov             r11mp, r14
    neg             wq

    ; we assume from the block above that bits 8-15 of r7d are zero'ed
    ; Step both packed seeds at once.  setp records the tap parity of each
    ; half (top_seed ends up in bit 16 of r7d, cur_seed in bit 0).  XOR-ing
    ; with the seed whose bits 0 and 16 are forced to 1 turns those two bits
    ; into the feedback bits, and the 1-bit rotate then carries bit 0 into
    ; bit 31 (top of cur_seed) and bit 16 into bit 15 (top of top_seed).
    mov             r6d, seed
    or              seed, 0xeff4eff4
    test            seeb, seeh
    setp            r7b ; parity of top_seed
    shr             seed, 16
    shl             r7d, 16
    test            seeb, seeh
    setp            r7b ; parity of cur_seed
    or              r6d, 0x00010001
    xor             r7d, r6d
    rorx            seed, r7d, 1 ; updated (cur_seed << 16) | top_seed

    rorx            offyd, seed, 8
    rorx            offxd, seed, 12
    and             offyd, 0xf000f
    and             offxd, 0xf000f
    imul            offyd, 164>>%3
    ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy
    lea             offyd, [offyq+offxq*(2-%2)+0x10001*((3+(6>>%3))*82+3+(6>>%2))+(32>>%3)*82]

    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
                h, offxy, see, lstride, luma, _, top_offxy

    mov             grain_lutq, grain_lutmp
    mov             hd, hm
    ; split the packed offsets: low half -> top_offxy, high half -> offxy
    movzx           top_offxyd, offxyw
    shr             offxyd, 16

    ; Load the current grain rows plus the rows from the block above, and
    ; interleave them into word pairs for %%add_noise_v.  With %3 set only the
    ; row at 82*0 is paired with a top row; otherwise the rows at 82*0 and
    ; 82*2 both are.
%if %3
    movu            ym16, [grain_lutq+offxyq*2+82*0]
    movu            ym1, [grain_lutq+top_offxyq*2+82*0]
    vbroadcasti32x8 m18, [grain_lutq+offxyq*2+82*2]
    movu            ym19, [grain_lutq+offxyq*2+82*4]
    vinserti32x8    m19, [grain_lutq+offxyq*2+82*6], 1
    punpcklwd       ym17, ym1, ym16
    punpckhwd       ym1, ym16
%elif %2
    movu            ym18, [grain_lutq+offxyq*2+82*0]
    vinserti32x8    m18, [grain_lutq+offxyq*2+82*2], 1
    movu            ym17, [grain_lutq+top_offxyq*2+82*0]
    vinserti32x8    m17, [grain_lutq+top_offxyq*2+82*2], 1
    movu            ym19, [grain_lutq+offxyq*2+82*4]
    vinserti32x8    m19, [grain_lutq+offxyq*2+82*6], 1
    punpcklwd       m16, m17, m18
    punpckhwd       m17, m18
%else
    movu            m18, [grain_lutq+offxyq*2+82*0]
    movu            m19, [grain_lutq+top_offxyq*2+82*0]
    movu            m2, [grain_lutq+offxyq*2+82*2]
    movu            m16, [grain_lutq+top_offxyq*2+82*2]
    ; first row is paired as (top, cur), second row as (cur, top)
    punpckhwd       m1, m19, m18
    punpcklwd       m19, m18
    punpckhwd       m18, m2, m16
    punpcklwd       m2, m16
%endif
    call            %%add_noise_v
    ; only the first iteration is blended vertically; the remaining rows of
    ; this column go through the plain row loop
    sub             hb, 2<<%2
    jg              %%loop_y
    add             wq, 32>>%2
    jge             .end
    mov             srcq, r9mp
    mov             dstq, r10mp
    mov             lumaq, r11mp
    lea             srcq, [srcq+wq*2]
    lea             dstq, [dstq+wq*2]
    lea             lumaq, [lumaq+wq*(2<<%2)]
; -----------------------------------------------------------------------------
; FGUV_32x32xN_LOOP body, part 2: the combined horizontal+vertical overlap
; column loop, followed by the %%add_noise_v / %%add_noise subroutine and the
; end of the macro.
;
; Macro arguments as they are used in this part:
;   %1 - when set, luma and chroma are combined with a dot product before the
;        scaling lookup; when clear, the (masked) luma is used directly
;   %2 - set for the 420 and 422 instantiations, clear for 444
;        (presumably the horizontal subsampling flag)
;   %3 - set for the 420 instantiation only
;        (presumably the vertical subsampling flag)
;
; DEFINE_ARGS maps names to registers by position, so offx shares a register
; with h, and offy shares one with offxy.  "offy" read right after a
; DEFINE_ARGS switch therefore still holds the previous column's offxy.
;
; m4-m15, m20, m21 and the k1-k4 masks are initialised outside this chunk;
; the roles mentioned below are inferred from how they are used here (TODO
; confirm against the function prologue).
; -----------------------------------------------------------------------------
    ; since fg_dataq.overlap is guaranteed to be set, we never jump back
    ; to %%v_overlap, and instead always fall-through to %%hv_overlap
%%hv_overlap:
    ; we assume from the block above that bits 8-15 of r7d are zero'ed
    ; Step both packed seeds at once.  setp records the tap parity of each
    ; half (top_seed ends up in bit 16 of r7d, cur_seed in bit 0).  XOR-ing
    ; with the seed whose bits 0 and 16 are forced to 1 turns those two bits
    ; into the feedback bits, and the 1-bit rotate then carries bit 0 into
    ; bit 31 (top of cur_seed) and bit 16 into bit 15 (top of top_seed).
    mov             r6d, seed
    or              seed, 0xeff4eff4
    test            seeb, seeh
    setp            r7b ; parity of top_seed
    shr             seed, 16
    shl             r7d, 16
    test            seeb, seeh
    setp            r7b ; parity of cur_seed
    or              r6d, 0x00010001
    xor             r7d, r6d
    rorx            seed, r7d, 1 ; updated (cur_seed << 16) | top_seed

    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
                offx, offy, see, lstride, luma, left_offxy, top_offxy, topleft_offxy

    ; derive the left / top-left offsets from the previous column's offsets
    ; before they are overwritten
    lea             topleft_offxyq, [top_offxyq+(32>>%2)]
    lea             left_offxyq, [offyq+(32>>%2)]
    rorx            offyd, seed, 8
    rorx            offxd, seed, 12
    and             offyd, 0xf000f
    and             offxd, 0xf000f
    imul            offyd, 164>>%3
    ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy
    lea             offyd, [offyq+offxq*(2-%2)+0x10001*((3+(6>>%3))*82+3+(6>>%2))+(32>>%3)*82]

    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
                h, offxy, see, lstride, luma, left_offxy, top_offxy, topleft_offxy

    mov             grain_lutq, grain_lutmp
    mov             hd, hm
    ; split the packed offsets: low half -> top_offxy, high half -> offxy
    movzx           top_offxyd, offxyw
    shr             offxyd, 16

    ; grain = grain_lut[offy+y][offx+x]
    ; The left edge is blended horizontally first, for the current rows
    ; (left with cur) and for the top rows (topleft with top).  The results
    ; are patched into the grain registers, and top/current word pairs are
    ; then formed for the vertical blend in %%add_noise_v.
%if %2
    movd            xm16, [grain_lutq+left_offxyq*2+82*0]
    vinserti32x4    m16, [grain_lutq+left_offxyq*2+82*2], 2
    movd            xm17, [grain_lutq+left_offxyq*2+82*4]
    vinserti32x4    m17, [grain_lutq+left_offxyq*2+82*6], 2
    movu            ym18, [grain_lutq+offxyq*2+82*0]
    vinserti32x8    m18, [grain_lutq+offxyq*2+82*2], 1
    movu            ym19, [grain_lutq+offxyq*2+82*4]
    vinserti32x8    m19, [grain_lutq+offxyq*2+82*6], 1
    punpckldq       m16, m17
    punpckldq       m17, m18, m19
    punpcklwd       m16, m17
    movu            ym1, [grain_lutq+top_offxyq*2+82*0]
    movd            xm17, [grain_lutq+topleft_offxyq*2+82*0]
    mova            m0, m20
    vpdpwssd        m0, m16, m10
%if %3
    punpcklwd       xm17, xm1
    mova            xm16, xm20
    vpdpwssd        xm16, xm17, xm10
    psrad           xm16, 1
%else
    vinserti32x8    m1, [grain_lutq+top_offxyq*2+82*2], 1
    vinserti32x4    m17, [grain_lutq+topleft_offxyq*2+82*2], 2
    punpcklwd       m17, m1
    mova            m16, m20
    vpdpwssd        m16, m17, m10
    psrad           m16, 1
%endif
    psrad           m0, 1
    packssdw        m0, m16
    vpsravw         m0, m21
    vmovdqa32       m18{k3}, m0
    vpshufd         m19{k3}, m0, q0321
%if %3
    vpunpckhdq      ym1{k3}, ym0, ym0
    punpcklwd       ym17, ym1, ym18
    punpckhwd       ym1, ym18
%else
    vpunpckhdq      m1{k3}, m0, m0
    punpcklwd       m16, m1, m18
    punpckhwd       m17, m1, m18
%endif
%else
    movu            m18, [grain_lutq+offxyq*2+82*0]
    movu            m19, [grain_lutq+top_offxyq*2+82*0]
    movd            xm17, [grain_lutq+left_offxyq*2+82*0]
    pinsrd          xm17, [grain_lutq+topleft_offxyq*2+82*0], 1
    punpckldq       xm16, xm18, xm19
    punpcklwd       xm17, xm16
    movu            m2, [grain_lutq+offxyq*2+82*2]
    movu            m0, [grain_lutq+top_offxyq*2+82*2]
    movd            xm16, [grain_lutq+left_offxyq*2+82*2]
    pinsrd          xm16, [grain_lutq+topleft_offxyq*2+82*2], 1
    punpckldq       xm1, xm2, xm0
    punpcklwd       xm1, xm16, xm1
    mova            xm16, xm20
    vpdpwssd        xm16, xm17, xm10
    mova            xm17, xm20
    vpdpwssd        xm17, xm1, xm10
    ; first row is paired as (top, cur), second row as (cur, top)
    punpckhwd       m1, m19, m18
    punpcklwd       m19, m18
    punpckhwd       m18, m2, m0
    punpcklwd       m2, m0
    psrad           xm16, 1
    psrad           xm17, 1
    packssdw        xm16, xm17
    vpsravw         xm16, xm21
    vpshuflw        m19{k4}, m16, q1302
    punpckhqdq      xm16, xm16
    vpshuflw        m2{k4}, m16, q3120
%endif
    call            %%add_noise_v
    ; only the first iteration is blended vertically; the remaining rows of
    ; this column continue in the horizontal-overlap row loop
    sub             hb, 2<<%2
    jg              %%loop_y_h_overlap
    ; Next column: w counts up towards zero, stop once it is no longer negative.
    add             wq, 32>>%2
    jge             .end
    mov             srcq, r9mp
    mov             dstq, r10mp
    mov             lumaq, r11mp
    lea             srcq, [srcq+wq*2]
    lea             dstq, [dstq+wq*2]
    lea             lumaq, [lumaq+wq*(2<<%2)]
    jmp             %%hv_overlap

; -----------------------------------------------------------------------------
; %%add_noise_v: vertical blend of the top/current word pairs prepared by the
; caller (m20 + dot(pair, m11), >> 1, pack, per-word shift by m21), leaving
; the blended grain in m18 (and m19 for the non-subsampled layout).  It has no
; ret of its own and falls through into %%add_noise.
; -----------------------------------------------------------------------------
ALIGN function_align
%%add_noise_v:
%if %3
    mova            ym16, ym20
    vpdpwssd        ym16, ym17, ym11
    mova            ym17, ym20
    vpdpwssd        ym17, ym1, ym11
    psrad           ym16, 1
    psrad           ym17, 1
    packssdw        ym16, ym17
    vpsravw         m18{k1}, m16, m21
%elif %2
    mova            m18, m20
    vpdpwssd        m18, m16, m11
    mova            m16, m20
    vpdpwssd        m16, m17, m11
    psrad           m18, 1
    psrad           m16, 1
    packssdw        m18, m16
    vpsravw         m18, m21
%else
    mova            m16, m20
    vpdpwssd        m16, m1, m11
    mova            m17, m20
    vpdpwssd        m17, m18, m11
    mova            m18, m20
    vpdpwssd        m18, m19, m11
    mova            m19, m20
    vpdpwssd        m19, m2, m11
    REPX {psrad x, 1}, m16, m17, m18, m19
    packssdw        m18, m16
    packssdw        m19, m17
    vpsravw         m18, m21
    vpsravw         m19, m21
%endif
; -----------------------------------------------------------------------------
; %%add_noise: apply the grain held in m18/m19 to one step of rows.
;   in:  m18, m19 = grain; srcq/dstq/lumaq/grain_lutq = current row pointers
;   out: rows stored at dstq; srcq, dstq, lumaq and grain_lutq advanced to the
;        next step
; Clobbers m0-m3, m16-m19 and k5.
; -----------------------------------------------------------------------------
%%add_noise:
%if %2
    ; Four luma rows are read, spaced lstride*(1<<%3) apart.  m12/m13 are
    ; permutation index vectors (presumably selecting the even and odd luma
    ; samples), so pavgw averages the two selections.
    mova            m2, [lumaq+lstrideq*(0<<%3)]
    mova            m0, [lumaq+lstrideq*(1<<%3)]
    lea             lumaq, [lumaq+lstrideq*(2<<%3)]
    mova            m3, [lumaq+lstrideq*(0<<%3)]
    mova            m1, [lumaq+lstrideq*(1<<%3)]
    mova            m16, m12
    vpermi2w        m16, m2, m0
    vpermt2w        m2, m13, m0
    mova            m17, m12
    vpermi2w        m17, m3, m1
    vpermt2w        m3, m13, m1
    pavgw           m2, m16
    pavgw           m3, m17
%elif %1
    mova            m2, [lumaq+lstrideq*0]
    mova            m3, [lumaq+lstrideq*1]
%endif
    ; chroma source rows -> m16 (first half of the step)
%if %2
    mova            ym16, [srcq+strideq*0]
    vinserti32x8    m16, [srcq+strideq*1], 1
    lea             srcq, [srcq+strideq*2]
%else
    mova            m16, [srcq+strideq*0]
%endif
%if %1
    ; scaling index = (m14 + dot((luma, chroma), m15)) >> 6, saturated to
    ; unsigned words and limited to m4
    punpckhwd       m17, m2, m16
    mova            m0, m14
    vpdpwssd        m0, m17, m15
    punpcklwd       m17, m2, m16
    mova            m2, m14
    vpdpwssd        m2, m17, m15
%endif
    ; chroma source rows -> m17 (second half of the step)
%if %2
    mova            ym17, [srcq+strideq*0]
    vinserti32x8    m17, [srcq+strideq*1], 1
%else
    mova            m17, [srcq+strideq*1]
%endif
%if %1
    psrad           m0, 6
    psrad           m2, 6
    packusdw        m2, m0
    punpckhwd       m0, m3, m17
    mova            m1, m14
    vpdpwssd        m1, m15, m0
    punpcklwd       m0, m3, m17
    mova            m3, m14
    vpdpwssd        m3, m15, m0
    psrad           m1, 6
    psrad           m3, 6
    packusdw        m3, m1
    pminuw          m2, m4
    pminuw          m3, m4

    ; Shared tail.  The label is emitted only by the expansion with %1 set;
    ; the expansion with %1 clear jumps here (see the other branch below).
.add_noise_main:
    ; scaling[luma_src]
    ; Gathers consume their mask register, so k5 is re-armed from k1 before
    ; each one.  Even and odd words are looked up separately and merged under
    ; k2 with the m6 byte shuffle.
    kmovw           k5, k1
    pand            m1, m5, m2
    vpgatherdd      m0{k5}, [scalingq+m1]
    kmovw           k5, k1
    psrld           m2, 16
    vpgatherdd      m1{k5}, [scalingq+m2]
    vpshufb         m0{k2}, m1, m6
    kmovw           k5, k1
    psrld           m1, m3, 16
    vpgatherdd      m2{k5}, [scalingq+m1]
    kmovw           k5, k1
    pand            m3, m5
    vpgatherdd      m1{k5}, [scalingq+m3]
    vpshufb         m1{k2}, m2, m6

    ; noise = round2(scaling[luma_src] * grain, scaling_shift)
    vpsllvw         m0, m7
    vpsllvw         m1, m7
    pmulhrsw        m18, m0
    pmulhrsw        m19, m1
    ; advance the grain, luma and source pointers to the next step
    add             grain_lutq, 82*(4<<%2)
    lea             lumaq, [lumaq+lstrideq*(2<<%3)]
    lea             srcq, [srcq+strideq*2]
    ; dst = clamp(src + noise, m8, m9)
    paddw           m16, m18
    paddw           m17, m19
    pmaxsw          m16, m8
    pmaxsw          m17, m8
    pminsw          m16, m9
    pminsw          m17, m9
%if %2
    mova            [dstq+strideq*0], ym16
    vextracti32x8   [dstq+strideq*1], m16, 1
    lea             dstq, [dstq+strideq*2]
    mova            [dstq+strideq*0], ym17
    vextracti32x8   [dstq+strideq*1], m17, 1
%else
    mova            [dstq+strideq*0], m16
    mova            [dstq+strideq*1], m17
%endif
    lea             dstq, [dstq+strideq*2]
    ret
%else
    ; %1 clear: the luma value, masked with m4, is the scaling index itself.
    ; The rest of the routine (including its ret) is reused from the
    ; expansion that defines .add_noise_main.
%if %2
    pand            m2, m4
    pand            m3, m4
%else
    pand            m2, m4, [lumaq+lstrideq*0]
    pand            m3, m4, [lumaq+lstrideq*1]
%endif
    jmp             .add_noise_main
%endif
%endmacro
; Tail of the FGUV_FN macro (its header precedes this point).  The loop body
; is expanded twice: first with its first argument set, then again under the
; .csfl label with it clear.  The order matters because the first expansion
; defines .add_noise_main, which the second one jumps to.  Every column loop
; leaves through .end.
    %%FGUV_32x32xN_LOOP 1, %2, %3
.csfl:
    %%FGUV_32x32xN_LOOP 0, %2, %3
.end:
    RET
%endmacro

; One function per chroma layout; the second and third arguments are forwarded
; to the loop macro as its %2 and %3 (presumably the horizontal and vertical
; subsampling flags).
FGUV_FN 420, 1, 1
FGUV_FN 422, 1, 0
FGUV_FN 444, 0, 0

%endif