; Copyright © 2022, VideoLAN and dav1d authors
; Copyright © 2022, Two Orioles, LLC
; All rights reserved.
;
; Redistribution and use in source and binary forms, with or without
; modification, are permitted provided that the following conditions are met:
;
; 1. Redistributions of source code must retain the above copyright notice, this
;    list of conditions and the following disclaimer.
;
; 2. Redistributions in binary form must reproduce the above copyright notice,
;    this list of conditions and the following disclaimer in the documentation
;    and/or other materials provided with the distribution.
;
; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

%include "config.asm"
%include "ext/x86/x86inc.asm"
%include "x86/filmgrain_common.asm"

%if ARCH_X86_64

SECTION_RODATA 64

; Byte index vectors 0,2,..,126 and 1,3,..,127 (64 bytes each).
pb_even: db  0,  2,  4,  6,  8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30
         db 32, 34, 36, 38, 40, 42, 44, 46, 48, 50, 52, 54, 56, 58, 60, 62
         db 64, 66, 68, 70, 72, 74, 76, 78, 80, 82, 84, 86, 88, 90, 92, 94
         db 96, 98,100,102,104,106,108,110,112,114,116,118,120,122,124,126
pb_odd:  db  1,  3,  5,  7,  9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31
         db 33, 35, 37, 39, 41, 43, 45, 47, 49, 51, 53, 55, 57, 59, 61, 63
         db 65, 67, 69, 71, 73, 75, 77, 79, 81, 83, 85, 87, 89, 91, 93, 95
         db 97, 99,101,103,105,107,109,111,113,115,117,119,121,123,125,127
; pshufb control that reorders each 16-byte lane to (b8,b0,b9,b1,...,b15,b7),
; i.e. pairs byte k+8 with byte k (used on the scaling values in .add_noise_h).
interleave_hl:  db  8,  0,  9,  1, 10,  2, 11,  3, 12,  4, 13,  5, 14,  6, 15,  7
; Horizontal overlap weights as (left, cur) byte pairs for pmaddubsw; the
; trailing (0, 32) pairs pass the current grain through unchanged after the
; rounded divide by 32 (pmulhrsw with pw_1024).
pb_27_17_17_27: db 27, 17, 17, 27,  0, 32,  0, 32
pb_23_22_0_32:  db 23, 22,  0, 32,  0, 32,  0, 32
; Vertical overlap weights as (top, cur) byte pairs: pb_27_17 is applied to the
; first row of a pair and pb_17_27 to the second (see the m12 setup below).
pb_27_17:       times 2 db 27, 17
pb_23_22:       times 2 db 23, 22
pw_8:           times 2 dw 8
; pmulhrsw by 1024 == rounded arithmetic shift right by 5: (x + 16) >> 5
pw_1024:        times 2 dw 1024
pb_17_27:       times 2 db 17, 27
; Output clamp bounds, indexed from clip_to_restricted_range (and, in the
; chroma macro, additionally is_id): 255 / 240 / 235 and 0 / 16.
fg_max:         times 4 db 255
                times 4 db 240
                times 4 db 235
fg_min:         times 4 db 0
                times 4 db 16
; pmulhrsw multipliers 1 << (15 - scaling_shift); the table is indexed with
; (scaling_shift - 8), so the four entries correspond to shifts 8, 9, 10, 11.
noise_rnd:      times 2 dw 128
                times 2 dw 64
                times 2 dw 32
                times 2 dw 16

SECTION .text

INIT_ZMM avx512icl
;------------------------------------------------------------------------------
; fgy_32x32xn_8bpc: add scaled film-grain noise to 8-bit luma, processing the
; picture in columns of 32 pixels and two rows per iteration.
;
; Named arguments (x86inc cglobal): dst, src, stride, fg_data, w, scaling,
; grain_lut, h, sby. The first six are loaded into registers; grain_lut, h and
; sby are fetched through their argument-slot aliases (grain_lutmp, hm, sbym).
; `see` and `overlap` name additional scratch registers.
;
; Constant registers after the prologue:
;   m0-m3  scaling[0..255] (four 64-byte table loads; scalingq must be aligned)
;   m4     interleave_hl shuffle        m5   zero
;   m6     noise_rnd[scaling_shift - 8] m7/m8  min/max clamp bytes
;   m9     pw_1024                      m10  horizontal overlap weights
;   m12    vertical overlap weights: (27,17) in the low 256 bits (first row),
;          (17,27) in the high 256 bits (second row)
;   k1     0x0000000f0000000f: as a byte mask it selects bytes 0-3 and 32-35
;          (the first four grain bytes of each of the two rows)
;
; Control flow: the no-overlap, h-overlap, v-overlap and hv-overlap paths each
; prepare the grain for one pair of rows and call one of the .add_noise*
; helpers. Vertical blending is only applied to the first row pair of a
; column; the remaining rows continue in .loop_y / .loop_y_h_overlap.
;------------------------------------------------------------------------------
cglobal fgy_32x32xn_8bpc, 6, 13, 22, dst, src, stride, fg_data, w, scaling, \
                                     grain_lut, h, sby, see, overlap
; constants are addressed relative to fg_min through r11
%define base r11-fg_min
    lea             r11, [fg_min]
    mov             r6d, [fg_dataq+FGData.scaling_shift]
    mov             r7d, [fg_dataq+FGData.clip_to_restricted_range]
    mov            sbyd, sbym
    mov        overlapd, [fg_dataq+FGData.overlap_flag]
    mov             r12, 0x0000000f0000000f ; h_overlap mask
    mova             m0, [scalingq+64*0]
    mova             m1, [scalingq+64*1]
    mova             m2, [scalingq+64*2]
    mova             m3, [scalingq+64*3]
    kmovq            k1, r12
    vbroadcasti32x4  m4, [base+interleave_hl]
    vpbroadcastd   ym16, [base+pb_27_17]
    vpbroadcastd    m12, [base+pb_17_27]
    ; -32 == -8*4: rebases the index so that scaling_shift 8 selects entry 0
    vpbroadcastd     m6, [base+noise_rnd+r6*4-32]
    test           sbyd, sbyd
    setnz           r6b                     ; r6b = (sby != 0)
    vpbroadcastd     m7, [base+fg_min+r7*4] ; 0, or 16 when clipping
    vpbroadcastd     m8, [base+fg_max+r7*8] ; 255, or 235 when clipping
    pxor             m5, m5
    vpbroadcastd     m9, [base+pw_1024]
    vpbroadcastq    m10, [base+pb_27_17_17_27]
    ; with a qword-granular mask only bits 0-3 of k1 matter here: the low
    ; 256 bits of m12 take the (27,17) weights, the high 256 bits keep (17,27)
    vmovdqa64   m12{k1}, m16
    ; vertical overlap is needed when this is not the first block row and the
    ; overlap flag is set
    test            r6b, overlapb
    jnz .v_overlap

    ; per-row seed: low byte from (sby*173 + 105), next byte from
    ; (sby*37 + 178), both computed in one multiply and swapped into place by
    ; the rotate, then xor'ed with the stream seed
    imul           seed, sbyd, (173 << 24) | 37
    add            seed, (105 << 24) | 178
    rorx           seed, seed, 24
    movzx          seed, seew
    xor            seed, [fg_dataq+FGData.seed]

    DEFINE_ARGS dst, src, stride, src_bak, w, offx, offy, \
                h, sby, see, overlap

    ; src_bak = end of the row, w = negative count running up to zero,
    ; dst is turned into the offset (dst - src) so [dstq+srcq] addresses dst
    lea        src_bakq, [srcq+wq]
    neg              wq
    sub            dstq, srcq
.loop_x:
    ; advance the 16-bit seed: shift right by one and insert the xor of bits
    ; 0, 1, 3 and 12 at bit 15. or-ing 0xeff4 forces every other bit to one so
    ; the parity flag of (low byte & high byte) reflects just those four taps.
    rorx             r6, seeq, 1
    or             seed, 0xeff4
    test           seeb, seeh
    lea            seed, [r6+0x8000]
    cmovp          seed, r6d               ; updated seed
    rorx          offyd, seed, 8
    rorx          offxq, seeq, 12
    and           offyd, 0xf
    imul          offyd, 164                ; 164 == 2*82, two grain_lut rows
    lea           offxd, [offyq+offxq*2+829] ; offy*stride+offx

    DEFINE_ARGS dst, src, stride, src_bak, w, offxy, grain_lut, \
                h, sby, see, overlap

    mov      grain_lutq, grain_lutmp
    mov              hd, hm
.loop_y:
    ; two consecutive grain rows (82 bytes apart), 32 bytes each:
    ; first row in the low 256 bits, second row in the high 256 bits
    movu           ym21, [grain_lutq+offxyq-82]
    vinserti32x8    m21, [grain_lutq+offxyq+ 0], 1
    call .add_noise
    sub              hb, 2
    jg .loop_y
    ; next 32-pixel column
    add              wq, 32
    jge .end
    lea            srcq, [src_bakq+wq]
    test       overlapd, overlapd
    jz .loop_x
    ; this point is also reached from .v_overlap (which continues its column
    ; in .loop_y), hence the sby check
    test           sbyd, sbyd
    jnz .hv_overlap

.loop_x_h_overlap:
    ; same seed update as in .loop_x
    rorx             r6, seeq, 1
    or             seed, 0xeff4
    test           seeb, seeh
    lea            seed, [r6+0x8000]
    cmovp          seed, r6d               ; updated seed

    DEFINE_ARGS dst, src, stride, src_bak, w, offx, offy, \
                h, sby, see, left_offxy

    rorx          offyd, seed, 8
    mov     left_offxyd, offxd             ; previous column's offy*stride
    rorx          offxq, seeq, 12
    and           offyd, 0xf
    imul          offyd, 164
    lea           offxd, [offyq+offxq*2+829] ; offy*stride+offx

    DEFINE_ARGS dst, src, stride, src_bak, w, offxy, grain_lut, \
                h, sby, see, left_offxy

    mov      grain_lutq, grain_lutmp
    mov              hd, hm
.loop_y_h_overlap:
    movu           ym20, [grain_lutq+offxyq-82]
    vinserti32x8    m20, [grain_lutq+offxyq+ 0], 1
    ; four bytes of the previous column's grain, 32 bytes past its row start
    ; (-50 == -82+32 for the first row, +32 for the second); the second row
    ; goes to 128-bit lane 2 so that it lines up with row 2 of m20
    movd           xm19, [grain_lutq+left_offxyq-50]
    vinserti32x4    m19, [grain_lutq+left_offxyq+32], 2
    punpcklbw       m19, m20               ; (left, cur) byte pairs
    pmaddubsw       m19, m10, m19          ; weighted sum per pixel
    pmulhrsw        m19, m9                ; rounded >> 5
    ; high 8 bytes of each lane widened to words for .add_noise_h
    punpckhbw       m21, m20, m5
    ; write the blended values back over the first four bytes of each row
    packsswb    m20{k1}, m19, m19
    ; low 8 bytes of each lane moved into the high byte of each word
    punpcklbw       m20, m5, m20
    call .add_noise_h
    sub              hb, 2
    jg .loop_y_h_overlap
    add              wq, 32
    jge .end
    lea            srcq, [src_bakq+wq]
    test           sbyd, sbyd
    jnz .hv_overlap
    jmp .loop_x_h_overlap

.v_overlap:
    DEFINE_ARGS dst, src, stride, fg_data, w, offy, offx, \
                h, sby, see, overlap

    ; build the seeds of the current and the previous block row at once, one
    ; per 16-bit half. The low-half constants 188 and 141 equal 105-173 and
    ; 178-37 modulo 256, i.e. the same formula evaluated for sby-1.
    movzx           r6d, sbyb
    imul           seed, [fg_dataq+FGData.seed], 0x00010001
    imul            r7d, r6d, 173 * 0x00010001
    imul            r6d, 37 * 0x01000100
    add             r7d, (105 << 16) | 188
    add             r6d, (178 << 24) | (141 << 8)
    and             r7d, 0x00ff00ff
    and             r6d, 0xff00ff00
    xor            seed, r7d
    xor            seed, r6d               ; (cur_seed << 16) | top_seed

    DEFINE_ARGS dst, src, stride, src_bak, w, offx, offy, \
                h, sby, see, overlap

    lea        src_bakq, [srcq+wq]
    neg              wq
    sub            dstq, srcq

    ; we assume from the block above that bits 8-15 of r7d are zero'ed
    ; Both packed seeds are advanced together: each half's tap parity is
    ; collected in r7d, xor'ed against a forced-one bit 0 / bit 16 of the seed
    ; copy, and the final rotate moves those bits into bit 15 of each half.
    mov             r6d, seed
    or             seed, 0xeff4eff4
    test           seeb, seeh
    setp            r7b                    ; parity of top_seed
    shr            seed, 16
    shl             r7d, 16
    test           seeb, seeh
    setp            r7b                    ; parity of cur_seed
    or              r6d, 0x00010001
    xor             r7d, r6d
    rorx           seed, r7d, 1            ; updated (cur_seed << 16) | top_seed
    rorx          offyd, seed, 8
    rorx          offxd, seed, 12
    and           offyd, 0xf000f
    and           offxd, 0xf000f
    imul          offyd, 164
    ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy
    ; (the extra 32*82 lands in the low half, i.e. in top_offxy only)
    lea           offxd, [offyq+offxq*2+0x10001*829+32*82]

    DEFINE_ARGS dst, src, stride, src_bak, w, offxy, grain_lut, \
                h, sby, see, overlap, top_offxy

    mov      grain_lutq, grain_lutmp
    mov              hd, hm
    ; unpack the two 16-bit offsets
    movzx    top_offxyd, offxyw
    shr          offxyd, 16
    movu           ym19, [grain_lutq+offxyq-82]
    vinserti32x8    m19, [grain_lutq+offxyq+ 0], 1
    movu           ym21, [grain_lutq+top_offxyq-82]
    vinserti32x8    m21, [grain_lutq+top_offxyq+ 0], 1
    ; (top, cur) byte pairs: high 8 bytes of each lane in m20, low 8 in m21
    punpckhbw       m20, m21, m19
    punpcklbw       m21, m19
    call .add_noise_v
    ; only the first row pair is blended vertically; the rest of this column
    ; is handled by the plain loop
    sub              hb, 2
    jg .loop_y
    add              wq, 32
    jge .end
    lea            srcq, [src_bakq+wq]

    ; since fg_dataq.overlap is guaranteed to be set, we never jump back
    ; to .v_overlap, and instead always fall-through to h+v overlap
.hv_overlap:
    ; we assume from the block above that bits 8-15 of r7d are zero'ed
    mov             r6d, seed
    or             seed, 0xeff4eff4
    test           seeb, seeh
    setp            r7b                    ; parity of top_seed
    shr            seed, 16
    shl             r7d, 16
    test           seeb, seeh
    setp            r7b                    ; parity of cur_seed
    or              r6d, 0x00010001
    xor             r7d, r6d
    rorx           seed, r7d, 1            ; updated (cur_seed << 16) | top_seed

    DEFINE_ARGS dst, src, stride, src_bak, w, offx, offy, \
                h, sby, see, left_offxy, top_offxy, topleft_offxy

    ; the previous column's top/cur offsets become this column's
    ; topleft/left offsets
    mov  topleft_offxyd, top_offxyd
    rorx          offyd, seed, 8
    mov     left_offxyd, offxd
    rorx          offxd, seed, 12
    and           offyd, 0xf000f
    and           offxd, 0xf000f
    imul          offyd, 164
    ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy
    lea           offxd, [offyq+offxq*2+0x10001*829+32*82]

    DEFINE_ARGS dst, src, stride, src_bak, w, offxy, grain_lut, \
                h, sby, see, left_offxy, top_offxy, topleft_offxy

    mov      grain_lutq, grain_lutmp
    mov              hd, hm
    movzx    top_offxyd, offxyw
    shr          offxyd, 16
    ; m19 = cur, m16 = left, m21 = top, m17 = topleft (two rows each)
    movu           ym19, [grain_lutq+offxyq-82]
    vinserti32x8    m19, [grain_lutq+offxyq+ 0], 1
    movd           xm16, [grain_lutq+left_offxyq-50]
    vinserti32x4    m16, [grain_lutq+left_offxyq+32], 2
    movu           ym21, [grain_lutq+top_offxyq-82]
    vinserti32x8    m21, [grain_lutq+top_offxyq+ 0], 1
    movd           xm17, [grain_lutq+topleft_offxyq-50]
    vinserti32x4    m17, [grain_lutq+topleft_offxyq+32], 2
    ; do h interpolation first (so top | top/left -> top, left | cur -> cur)
    punpcklbw       m16, m19
    pmaddubsw       m16, m10, m16
    punpcklbw       m17, m21
    pmaddubsw       m17, m10, m17
    ; the high 8 bytes of each lane are untouched by the h blend (k1 only
    ; covers bytes 0-3), so their (top, cur) pairs can be formed already
    punpckhbw       m20, m21, m19
    pmulhrsw        m16, m9
    pmulhrsw        m17, m9
    packsswb    m19{k1}, m16, m16
    packsswb    m21{k1}, m17, m17
    ; followed by v interpolation (top | cur -> cur)
    punpcklbw       m21, m19
    call .add_noise_v
    sub              hb, 2
    jg .loop_y_h_overlap
    add              wq, 32
    lea            srcq, [src_bakq+wq]     ; lea keeps the flags of the add
    jl .hv_overlap
.end:
    RET
ALIGN function_align
; .add_noise_v
;   in:  m20 / m21 = (top, cur) grain byte pairs for the high / low 8 bytes of
;        each 128-bit lane
;   Blends them with the per-row weights in m12, rounds (>> 5 via m9), packs
;   back to 64 grain bytes in m21 and falls through to .add_noise.
.add_noise_v:
    pmaddubsw       m20, m12, m20
    pmaddubsw       m21, m12, m21
    pmulhrsw        m20, m9
    pmulhrsw        m21, m9
    packsswb        m21, m20
; .add_noise
;   in:  m21 = 64 grain bytes (first row in the low 256 bits, second row in
;        the high 256 bits)
;   Splits them into the two word layouts expected by .add_noise_h.
.add_noise:
    punpcklbw       m20, m5, m21           ; low 8 bytes/lane  -> high byte of word
    punpckhbw       m21, m5                ; high 8 bytes/lane -> low byte of word
; .add_noise_h
;   in:  m20 / m21 as produced above, srcq = current source row,
;        dstq = dst - src
;   out: two rows written to dst; srcq advanced by 2*stride, grain_lutq by
;        two rows (82*2)
;   clobbers m16-m21 and k2
.add_noise_h:
    ; two source rows (mova: both row addresses are accessed as aligned)
    mova           ym18, [srcq+strideq*0]
    vinserti32x8    m18, [srcq+strideq*1], 1
    mova            m19, m0
    punpcklbw       m16, m18, m5           ; src low 8 bytes/lane as words
    vpermt2b        m19, m18, m1 ; scaling[  0..127]
    vpmovb2m         k2, m18               ; k2 = pixels >= 128 (sign bit set)
    punpckhbw       m17, m18, m5           ; src high 8 bytes/lane as words
    vpermi2b        m18, m2, m3  ; scaling[128..255]
    vmovdqu8    m19{k2}, m18     ; scaling[src]
    ; pair scaling[k+8] with scaling[k]; since m20/m21 each carry a zero in one
    ; byte of every pair, the same shuffled vector yields scaling*grain for
    ; the low (m20) and the high (m21) 8 pixels of each lane
    pshufb          m19, m4
    pmaddubsw       m18, m19, m20
    pmaddubsw       m19, m21
    add      grain_lutq, 82*2
    pmulhrsw        m18, m6      ; noise
    pmulhrsw        m19, m6
    paddw           m16, m18
    paddw           m17, m19
    packuswb        m16, m17               ; saturate back to unsigned bytes
    pmaxub          m16, m7                ; clamp to [min, max]
    pminub          m16, m8
    mova    [dstq+srcq], ym16
    add            srcq, strideq
    vextracti32x8 [dstq+srcq], m16, 1
    add            srcq, strideq
    ret

%macro FGUV_FN 3 ; name, ss_hor, ss_ver
cglobal fguv_32x32xn_i%1_8bpc, 6, 14+%2, 22, dst, src, stride, fg_data, w, \
                                             scaling, grain_lut, h, sby, luma, \
                                             overlap, uv_pl, is_id, _, stride3
    lea             r11, [fg_min]
    mov             r6d, [fg_dataq+FGData.scaling_shift]
    mov             r7d, [fg_dataq+FGData.clip_to_restricted_range]
    mov             r9d, is_idm
    mov            sbyd, sbym
    mov        overlapd, [fg_dataq+FGData.overlap_flag]
%if %2
    mov             r12, 0x000f000f000f000f ; h_overlap mask
    vpbroadcastq    m10, [base+pb_23_22_0_32]
    lea        stride3q, [strideq*3]
%else
    mov             r12,
0x0000000f0000000f 357*c0909341SAndroid Build Coastguard Worker vpbroadcastq m10, [base+pb_27_17_17_27] 358*c0909341SAndroid Build Coastguard Worker%endif 359*c0909341SAndroid Build Coastguard Worker mova m0, [scalingq+64*0] 360*c0909341SAndroid Build Coastguard Worker mova m1, [scalingq+64*1] 361*c0909341SAndroid Build Coastguard Worker mova m2, [scalingq+64*2] 362*c0909341SAndroid Build Coastguard Worker mova m3, [scalingq+64*3] 363*c0909341SAndroid Build Coastguard Worker kmovq k1, r12 364*c0909341SAndroid Build Coastguard Worker vbroadcasti32x4 m4, [base+interleave_hl] 365*c0909341SAndroid Build Coastguard Worker vpbroadcastd m6, [base+noise_rnd+r6*4-32] 366*c0909341SAndroid Build Coastguard Worker vpbroadcastd m7, [base+fg_min+r7*4] 367*c0909341SAndroid Build Coastguard Worker shlx r7d, r7d, r9d 368*c0909341SAndroid Build Coastguard Worker vpbroadcastd m8, [base+fg_max+r7*4] 369*c0909341SAndroid Build Coastguard Worker test sbyd, sbyd 370*c0909341SAndroid Build Coastguard Worker setnz r7b 371*c0909341SAndroid Build Coastguard Worker vpbroadcastd m9, [base+pw_1024] 372*c0909341SAndroid Build Coastguard Worker mova m11, [base+pb_even] 373*c0909341SAndroid Build Coastguard Worker mova m12, [base+pb_odd] 374*c0909341SAndroid Build Coastguard Worker pxor m5, m5 375*c0909341SAndroid Build Coastguard Worker mov r5, r10mp ; lstride 376*c0909341SAndroid Build Coastguard Worker cmp byte [fg_dataq+FGData.chroma_scaling_from_luma], 0 377*c0909341SAndroid Build Coastguard Worker jne .csfl 378*c0909341SAndroid Build Coastguard Worker 379*c0909341SAndroid Build Coastguard Worker%macro %%FGUV_32x32xN_LOOP 3 ; not-csfl, ss_hor, ss_ver 380*c0909341SAndroid Build Coastguard Worker DEFINE_ARGS dst, src, stride, fg_data, w, lstride, grain_lut, \ 381*c0909341SAndroid Build Coastguard Worker h, sby, see, overlap, uv_pl, _, _, stride3 382*c0909341SAndroid Build Coastguard Worker%if %1 383*c0909341SAndroid Build Coastguard Worker mov r6d, uv_plm 384*c0909341SAndroid Build Coastguard 
Worker vpbroadcastd m16, [base+pw_8] 385*c0909341SAndroid Build Coastguard Worker vbroadcasti32x4 m14, [fg_dataq+FGData.uv_mult+r6*4] 386*c0909341SAndroid Build Coastguard Worker vpbroadcastw m15, [fg_dataq+FGData.uv_offset+r6*4] 387*c0909341SAndroid Build Coastguard Worker pshufb m14, m16 ; uv_luma_mult, uv_mult 388*c0909341SAndroid Build Coastguard Worker%endif 389*c0909341SAndroid Build Coastguard Worker test r7b, overlapb 390*c0909341SAndroid Build Coastguard Worker jnz %%v_overlap 391*c0909341SAndroid Build Coastguard Worker 392*c0909341SAndroid Build Coastguard Worker imul seed, sbyd, (173 << 24) | 37 393*c0909341SAndroid Build Coastguard Worker add seed, (105 << 24) | 178 394*c0909341SAndroid Build Coastguard Worker rorx seed, seed, 24 395*c0909341SAndroid Build Coastguard Worker movzx seed, seew 396*c0909341SAndroid Build Coastguard Worker xor seed, [fg_dataq+FGData.seed] 397*c0909341SAndroid Build Coastguard Worker 398*c0909341SAndroid Build Coastguard Worker DEFINE_ARGS dst, src, stride, luma, w, lstride, grain_lut, \ 399*c0909341SAndroid Build Coastguard Worker offx, offy, see, overlap, _, _, _, stride3 400*c0909341SAndroid Build Coastguard Worker 401*c0909341SAndroid Build Coastguard Worker mov lumaq, r9mp 402*c0909341SAndroid Build Coastguard Worker lea r11, [srcq+wq] 403*c0909341SAndroid Build Coastguard Worker lea r12, [dstq+wq] 404*c0909341SAndroid Build Coastguard Worker lea r13, [lumaq+wq*(1+%2)] 405*c0909341SAndroid Build Coastguard Worker mov r11mp, r11 406*c0909341SAndroid Build Coastguard Worker mov r12mp, r12 407*c0909341SAndroid Build Coastguard Worker neg wq 408*c0909341SAndroid Build Coastguard Worker 409*c0909341SAndroid Build Coastguard Worker%%loop_x: 410*c0909341SAndroid Build Coastguard Worker rorx r6, seeq, 1 411*c0909341SAndroid Build Coastguard Worker or seed, 0xeff4 412*c0909341SAndroid Build Coastguard Worker test seeb, seeh 413*c0909341SAndroid Build Coastguard Worker lea seed, [r6+0x8000] 414*c0909341SAndroid Build Coastguard 
Worker cmovp seed, r6d ; updated seed 415*c0909341SAndroid Build Coastguard Worker rorx offyd, seed, 8 416*c0909341SAndroid Build Coastguard Worker rorx offxq, seeq, 12 417*c0909341SAndroid Build Coastguard Worker and offyd, 0xf 418*c0909341SAndroid Build Coastguard Worker imul offyd, 164>>%3 419*c0909341SAndroid Build Coastguard Worker lea offyd, [offyq+offxq*(2-%2)+(3+(6>>%3))*82+3+(6>>%2)] ; offy*stride+offx 420*c0909341SAndroid Build Coastguard Worker 421*c0909341SAndroid Build Coastguard Worker DEFINE_ARGS dst, src, stride, luma, w, lstride, grain_lut, \ 422*c0909341SAndroid Build Coastguard Worker h, offxy, see, overlap, _, _, _, stride3 423*c0909341SAndroid Build Coastguard Worker 424*c0909341SAndroid Build Coastguard Worker mov grain_lutq, grain_lutmp 425*c0909341SAndroid Build Coastguard Worker mov hd, hm 426*c0909341SAndroid Build Coastguard Worker%%loop_y: 427*c0909341SAndroid Build Coastguard Worker%if %2 428*c0909341SAndroid Build Coastguard Worker movu xm21, [grain_lutq+offxyq+82*0] 429*c0909341SAndroid Build Coastguard Worker vinserti128 ym21, [grain_lutq+offxyq+82*1], 1 430*c0909341SAndroid Build Coastguard Worker vinserti32x4 m21, [grain_lutq+offxyq+82*2], 2 431*c0909341SAndroid Build Coastguard Worker vinserti32x4 m21, [grain_lutq+offxyq+82*3], 3 432*c0909341SAndroid Build Coastguard Worker%else 433*c0909341SAndroid Build Coastguard Worker movu ym21, [grain_lutq+offxyq+82*0] 434*c0909341SAndroid Build Coastguard Worker vinserti32x8 m21, [grain_lutq+offxyq+82*1], 1 435*c0909341SAndroid Build Coastguard Worker%endif 436*c0909341SAndroid Build Coastguard Worker call %%add_noise 437*c0909341SAndroid Build Coastguard Worker sub hb, 2<<%2 438*c0909341SAndroid Build Coastguard Worker jg %%loop_y 439*c0909341SAndroid Build Coastguard Worker add wq, 32>>%2 440*c0909341SAndroid Build Coastguard Worker jge .end 441*c0909341SAndroid Build Coastguard Worker mov srcq, r11mp 442*c0909341SAndroid Build Coastguard Worker mov dstq, r12mp 443*c0909341SAndroid Build 
; Remainder of the column-advance sequence of the non-overlapped column loop
; (its first instructions sit on the preceding line). w is a negative offset
; that counts up towards zero (see `add wq` / `jge .end` below), so the
; end-relative pointers are re-biased by it for the next column.
    lea               lumaq, [r13+wq*(1<<%2)]
    add                srcq, wq
    add                dstq, wq
    test           overlapd, overlapd
    jz %%loop_x                         ; overlap disabled: plain column loop
    cmp           dword r8m, 0 ; sby
    jne %%hv_overlap                    ; sby != 0: take the path that also uses the top block

    ; horizontal overlap (without vertical overlap)
%%loop_x_h_overlap:
    ; Step the 16-bit seed: shift right by one and insert a feedback bit at
    ; bit 15. After the OR only bits 0, 1, 3 and 12 of the seed can still be
    ; clear, so PF from (seeb & seeh) is the parity of exactly those bits.
    ; NOTE(review): relies on the upper bits of see being zero - confirm
    ; against the seed setup earlier in the file.
    rorx                 r6, seeq, 1
    or                 seed, 0xeff4
    test               seeb, seeh
    lea                seed, [r6+0x8000]       ; shifted seed with bit 15 set
    cmovp              seed, r6d               ; updated seed

    DEFINE_ARGS dst, src, stride, luma, w, lstride, grain_lut, \
                offx, offy, see, left_offxy, _, _, _, stride3

    ; offy names the register that was called offxy in the previous column,
    ; so it still holds that column's grain offset here.
    lea         left_offxyd, [offyq+(32>>%2)]         ; previous column's offy*stride+offx
    ; offy = (seed >> 8) & 15, offx = seed >> 12. Grain rows are 82 bytes
    ; apart, so 164>>%3 is a step of two rows (one row when %3 is set).
    rorx              offyd, seed, 8
    rorx              offxq, seeq, 12
    and               offyd, 0xf
    imul              offyd, 164>>%3
    lea               offyd, [offyq+offxq*(2-%2)+(3+(6>>%3))*82+3+(6>>%2)] ; offy*stride+offx

    DEFINE_ARGS dst, src, stride, luma, w, lstride, grain_lut, \
                h, offxy, see, left_offxy, _, _, _, stride3

    mov          grain_lutq, grain_lutmp
    mov                  hd, hm
%%loop_y_h_overlap:
%if %2
    ; four 16-byte grain rows, one per 128-bit lane; m19 gets the matching
    ; rows of the left block (only the leading bytes are consumed below)
    movu               xm20, [grain_lutq+offxyq     +82*0]
    movd               xm19, [grain_lutq+left_offxyq+82*0]
    vinserti32x4       ym20, [grain_lutq+offxyq     +82*1], 1
    vinserti32x4       ym19, [grain_lutq+left_offxyq+82*1], 1
    vinserti32x4        m20, [grain_lutq+offxyq     +82*2], 2
    vinserti32x4        m19, [grain_lutq+left_offxyq+82*2], 2
    vinserti32x4        m20, [grain_lutq+offxyq     +82*3], 3
    vinserti32x4        m19, [grain_lutq+left_offxyq+82*3], 3
%else
    ; two 32-byte grain rows; the left strips go to lanes 0 and 2 so that
    ; they line up with the start of each row
    movu               ym20, [grain_lutq+offxyq     + 0]
    movd               xm19, [grain_lutq+left_offxyq+ 0]
    vinserti32x8        m20, [grain_lutq+offxyq     +82], 1
    vinserti32x4        m19, [grain_lutq+left_offxyq+82], 2
%endif
    ; Blend the left strip into the first pixels of each row.
    ; NOTE(review): m10 (weights), m9 (rounding multiplier), m5 and k1 are
    ; loaded before this point in the file - confirm their values there.
    punpcklbw           m19, m20        ; interleave { left, cur } bytes
    pmaddubsw           m19, m10, m19   ; weighted sum per byte pair
    punpckhbw           m21, m20, m5    ; upper grain bytes interleaved with m5
    pmulhrsw            m19, m9
    vpacksswb       m20{k1}, m19, m19   ; merge blended bytes back under k1
    punpcklbw           m20, m5, m20    ; lower grain bytes interleaved with m5
    call %%add_noise_h                  ; entry point that takes m20/m21 as prepared above
    sub                  hb, 2<<%2      ; 2<<%2 rows are handled per call
    jg %%loop_y_h_overlap
    add                  wq, 32>>%2     ; next column
    jge .end
    mov                srcq, r11mp
    mov                dstq, r12mp
    lea               lumaq, [r13+wq*(1<<%2)]
    add                srcq, wq
    add                dstq, wq
    cmp           dword r8m, 0 ; sby
    jne %%hv_overlap
    jmp %%loop_x_h_overlap

%%v_overlap:
    DEFINE_ARGS dst, src, stride, fg_data, w, lstride, grain_lut, \
                _, sby, see, overlap, _, _, _, stride3

    ; Derive two 16-bit seeds at once; both live in one 32-bit register
    ; (layout given by the comment on the last xor).
    movzx              sbyd, sbyb
    imul               seed, [fg_dataq+FGData.seed], 0x00010001
    imul                r7d, sbyd, 173 * 0x00010001
    imul               sbyd, 37 * 0x01000100
    add                 r7d, (105 << 16) | 188
    add                sbyd, (178 << 24) | (141 << 8)
    and                 r7d, 0x00ff00ff
    and                sbyd, 0xff00ff00
    xor                seed, r7d
    xor                seed, sbyd ; (cur_seed << 16) | top_seed

    ; m13 = byte weights for the vertical blend done in %%add_noise_v.
    ; k3 = bytes of m21 that the blended rows may overwrite there
    ; (16 bytes with kxnorw, 32 bytes with kxnord).
%if %3
    vpbroadcastd        m13, [base+pb_23_22]
    kxnorw               k3, k3, k3      ; v_overlap mask
%elif %2
    vbroadcasti32x8     m13, [base+pb_27_17]
    kxnord               k3, k3, k3
    pshufd              m13, m13, q0000  ; 8x27_17, 8x17_27
%else
    vpbroadcastd       ym16, [base+pb_27_17]
    vpbroadcastd        m13, [base+pb_17_27]
    vmovdqa64       m13{k1}, m16         ; NOTE(review): k1 is set before this point
%endif

    DEFINE_ARGS dst, src, stride, luma, w, lstride, grain_lut, \
                offx, offy, see, overlap, top_offxy, _, _, stride3

    ; Keep end-of-row pointers and let w run from -width up to zero.
    mov               lumaq, r9mp
    lea                 r11, [srcq+wq]
    lea                 r12, [dstq+wq]
    lea                 r13, [lumaq+wq*(1<<%2)]
    mov               r11mp, r11        ; reloaded when stepping to the next column
    mov               r12mp, r12
    neg                  wq

    ; we assume from the block above that bits 8-15 of r7d are zero'ed
    mov                 r6d, seed
    or                 seed, 0xeff4eff4
    test               seeb, seeh
    setp                r7b             ; parity of top_seed
    shr                seed, 16
    shl                 r7d, 16
    test               seeb, seeh
    setp                r7b             ; parity of cur_seed
    or                  r6d, 0x00010001
    xor                 r7d, r6d
    rorx               seed, r7d, 1     ; updated (cur_seed << 16) | top_seed
    rorx              offyd, seed, 8
    rorx              offxd, seed, 12
    and               offyd, 0x000f000f
    and               offxd, 0x000f000f
    imul              offyd, 164>>%3
    ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy
    ; (the trailing (32>>%3)*82 term only reaches the low half, i.e. top_offxy)
    lea               offyd, [offyq+offxq*(2-%2)+0x10001*((3+(6>>%3))*82+3+(6>>%2))+(32>>%3)*82]

    DEFINE_ARGS dst, src, stride, luma, w, lstride, grain_lut, \
                h, offxy, see, overlap, top_offxy, _, _, stride3

    mov          grain_lutq, grain_lutmp
    mov                  hd, hm
    movzx        top_offxyd, offxyw     ; split the packed pair of offsets
    shr              offxyd, 16

%if %3
    movu               xm18, [grain_lutq+offxyq+82*0]
    movu               xm20, [grain_lutq+top_offxyq+82*0]
    ; only interpolate first line, insert remaining line unmodified
    vbroadcasti128     ym21, [grain_lutq+offxyq+82*1]
    vinserti32x4        m21, [grain_lutq+offxyq+82*2], 2
    vinserti32x4        m21, [grain_lutq+offxyq+82*3], 3
    punpcklbw          xm19, xm20, xm18 ; { top, cur } byte pairs, low half
    punpckhbw          xm20, xm18       ; { top, cur } byte pairs, high half
%elif %2
    ; rows 0-1 are blended with the top block, rows 2-3 are used as loaded
    movu               xm18, [grain_lutq+offxyq+82*0]
    vinserti128        ym18, [grain_lutq+offxyq+82*1], 1
    movu               xm20, [grain_lutq+top_offxyq+82*0]
    vinserti32x4       ym20, [grain_lutq+top_offxyq+82*1], 1
    vbroadcasti32x4     m21, [grain_lutq+offxyq+82*2]
    vinserti32x4        m21, [grain_lutq+offxyq+82*3], 3
    punpcklbw          ym19, ym20, ym18
    punpckhbw          ym20, ym18
%else
    ; m21 = two current rows, m20 = two top rows; %%add_noise_v interleaves them
    movu               ym21, [grain_lutq+offxyq+82*0]
    vinserti32x8        m21, [grain_lutq+offxyq+82*1], 1
    movu               ym20, [grain_lutq+top_offxyq+82*0]
    vinserti32x8        m20, [grain_lutq+top_offxyq+82*1], 1
%endif
    call %%add_noise_v
    sub                  hb, 2<<%2
    jg %%loop_y                         ; remaining rows: continue without vertical blend
    add                  wq, 32>>%2
    jge .end
    mov                srcq, r11mp
    mov                dstq, r12mp
    lea               lumaq, [r13+wq*(1<<%2)]
    add                srcq, wq
    add                dstq, wq
    ; execution continues with the %%hv_overlap code that follows

%%hv_overlap:
    ; Column with both a left and a top neighbour. The two packed seeds are
    ; stepped together, one per 16-bit half of see.
    ; we assume from the block above that bits 8-15 of r7d are zero'ed
    mov                 r6d, seed
    or                 seed, 0xeff4eff4
    test               seeb, seeh
    setp                r7b             ; parity of top_seed
    shr                seed, 16
    shl                 r7d, 16
    test               seeb, seeh
    setp                r7b             ; parity of cur_seed
    or                  r6d, 0x00010001
    xor                 r7d, r6d
    rorx               seed, r7d, 1     ; updated (cur_seed << 16) | top_seed

    DEFINE_ARGS dst, src, stride, luma, w, lstride, grain_lut, \
                offx, offy, see, left_offxy, top_offxy, topleft_offxy, _, stride3

    ; offsets of the previous column (offy still names the old offxy register),
    ; moved right by one column width
    lea      topleft_offxyd, [top_offxyq+(32>>%2)]
    lea         left_offxyd, [offyq+(32>>%2)]
    rorx              offyd, seed, 8
    rorx              offxd, seed, 12
    and               offyd, 0x000f000f
    and               offxd, 0x000f000f
    imul              offyd, 164>>%3
    ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy
    lea               offyd, [offyq+offxq*(2-%2)+0x10001*((3+(6>>%3))*82+3+(6>>%2))+(32>>%3)*82]

    DEFINE_ARGS dst, src, stride, luma, w, lstride, grain_lut, \
                h, offxy, see, left_offxy, top_offxy, topleft_offxy, _, stride3

    mov          grain_lutq, grain_lutmp
    mov                  hd, hm
    movzx        top_offxyd, offxyw     ; split the packed pair of offsets
    shr              offxyd, 16

    ; NOTE(review): m10 (horizontal weights), m9 (rounding multiplier) and k1
    ; are loaded before this point in the file - confirm their values there.
%if %2
    ; m21 = four current rows, m16 = the matching rows of the left block
    movu               xm21, [grain_lutq+offxyq+82*0]
    movd               xm16, [grain_lutq+left_offxyq+82*0]
    vinserti128        ym21, [grain_lutq+offxyq+82*1], 1
    vinserti128        ym16, [grain_lutq+left_offxyq+82*1], 1
    vinserti32x4        m21, [grain_lutq+offxyq+82*2], 2
    vinserti32x4        m16, [grain_lutq+left_offxyq+82*2], 2
    vinserti32x4        m21, [grain_lutq+offxyq+82*3], 3
    vinserti32x4        m16, [grain_lutq+left_offxyq+82*3], 3
    movd               xm18, [grain_lutq+topleft_offxyq+82*0]
    movu               xm20, [grain_lutq+top_offxyq]
    ; do h interpolation first (so top | top/left -> top, left | cur -> cur)
    punpcklbw           m16, m21
%if %3
    punpcklbw          xm18, xm20       ; a single top row when %3 is set
%else
    vinserti128        ym18, [grain_lutq+topleft_offxyq+82*1], 1
    vinserti128        ym20, [grain_lutq+top_offxyq+82*1], 1
    punpcklbw          ym18, ym20
%endif
    punpcklqdq          m16, m18        ; { left|cur pairs, topleft|top pairs } per lane
    pmaddubsw           m16, m10, m16
    pmulhrsw            m16, m9
    packsswb            m16, m16
    vmovdqu8        m21{k1}, m16        ; blended bytes into the current rows
    ; the top results sit 4 bytes further up in each lane; rotate them down
    ; and merge into the top rows, then pair { top, cur } for %%add_noise_v
%if %3
    vpalignr       xm20{k1}, xm16, xm16, 4
    punpcklbw          xm19, xm20, xm21
    punpckhbw          xm20, xm21
%else
    vpalignr       ym20{k1}, ym16, ym16, 4
    punpcklbw          ym19, ym20, ym21
    punpckhbw          ym20, ym21
%endif
%else
    ; two 32-byte rows each of cur (m21) and top (m20); the left and topleft
    ; strips go to lanes 0 and 2 so that they line up with the row starts
    movu               ym21, [grain_lutq+offxyq+82*0]
    vinserti32x8        m21, [grain_lutq+offxyq+82*1], 1
    movd               xm16, [grain_lutq+left_offxyq+82*0]
    vinserti32x4        m16, [grain_lutq+left_offxyq+82*1], 2
    movu               ym20, [grain_lutq+top_offxyq+82*0]
    vinserti32x8        m20, [grain_lutq+top_offxyq+82*1], 1
    movd               xm18, [grain_lutq+topleft_offxyq+82*0]
    vinserti32x4        m18, [grain_lutq+topleft_offxyq+82*1], 2
    punpcklbw           m16, m21
    punpcklbw           m18, m20
    punpcklqdq          m16, m18
    pmaddubsw           m16, m10, m16
    pmulhrsw            m16, m9
    packsswb            m16, m16
    vpalignr        m20{k1}, m16, m16, 4
    vmovdqu8        m21{k1}, m16
%endif
    call %%add_noise_v
    sub                  hb, 2<<%2
    jg %%loop_y_h_overlap               ; remaining rows: horizontal blend only
    add                  wq, 32>>%2
    jge .end
    mov                srcq, r11mp
    mov                dstq, r12mp
    lea               lumaq, [r13+wq*(1<<%2)]
    add                srcq, wq
    add                dstq, wq
    jmp %%hv_overlap

; Row kernel with three entry points that fall through into each other:
;   %%add_noise_v  blends the top rows into m21 first. Input is m19/m20 as
;                  { top, cur } byte pairs, or m20 = top rows when %2 is 0.
;   %%add_noise    m21 holds the final grain bytes
;   %%add_noise_h  m20/m21 hold grain already interleaved with m5
; Handles 2<<%2 chroma rows per call and advances srcq, dstq, lumaq and
; grain_lutq past them.
; NOTE(review): m0-m12, m14 and m15 are loaded before this point in the file;
; their roles below are read off how they are used here.
ALIGN function_align
%%add_noise_v:
%if %3
    pmaddubsw          xm19, xm13, xm19
    pmaddubsw          xm20, xm13, xm20
    pmulhrsw           xm19, xm9
    pmulhrsw           xm20, xm9
    vpacksswb       m21{k3}, m19, m20   ; only the bytes selected by k3 are replaced
%elif %2
    pmaddubsw          ym19, ym13, ym19
    pmaddubsw          ym20, ym13, ym20
    pmulhrsw           ym19, ym9
    pmulhrsw           ym20, ym9
    vpacksswb       m21{k3}, m19, m20
%else
    punpcklbw           m19, m20, m21   ; { top, cur } byte pairs
    punpckhbw           m20, m21
    pmaddubsw           m19, m13, m19
    pmaddubsw           m20, m13, m20
    pmulhrsw            m19, m9
    pmulhrsw            m20, m9
    packsswb            m21, m19, m20
%endif
%%add_noise:
    punpcklbw           m20, m5, m21
    punpckhbw           m21, m5
%%add_noise_h:
    ; luma rows are lstride*(1<<%3) apart
    mova               ym18, [lumaq+lstrideq*(0<<%3)]
    vinserti32x8        m18, [lumaq+lstrideq*(1<<%3)], 1
%if %2
    ; four luma rows are needed for the four 16-pixel chroma rows. The two
    ; byte permutes select alternating luma bytes and pavgb averages each pair.
    ; NOTE(review): m11/m12 presumably hold the pb_odd/pb_even index tables.
    lea               lumaq, [lumaq+lstrideq*(2<<%3)]
    mova               ym16, [lumaq+lstrideq*(0<<%3)]
    vinserti32x8        m16, [lumaq+lstrideq*(1<<%3)], 1
    mova               xm17, [srcq+strideq*0]
    mova                m19, m11
    vpermi2b            m19, m18, m16
    vinserti128        ym17, [srcq+strideq*1], 1
    vpermt2b            m18, m12, m16
    vinserti32x4        m17, [srcq+strideq*2], 2
    pavgb               m18, m19
    vinserti32x4        m17, [srcq+stride3q ], 3
%else
    mova               ym17, [srcq+strideq*0]
    vinserti32x8        m17, [srcq+strideq*1], 1
%endif
%if %1
    ; %1 set: the scaling index is a combination of luma and chroma,
    ; (pair . m14) >> 6, plus m15, saturated to 8 bits
    punpckhbw           m19, m18, m17
    punpcklbw           m18, m17        ; { luma, chroma }
    pmaddubsw           m19, m14
    pmaddubsw           m18, m14
    psraw               m19, 6
    psraw               m18, 6
    paddw               m19, m15
    paddw               m18, m15
    packuswb            m18, m19
.add_noise_main:
    ; 256-entry byte lookup split over m0-m3: bit 7 of each index (k2) picks
    ; the upper-half result
    mova                m19, m0
    vpermt2b            m19, m18, m1    ; scaling[  0..127]
    vpmovb2m             k2, m18
    vpermi2b            m18, m2, m3     ; scaling[128..255]
    vmovdqu8        m19{k2}, m18        ; scaling[src]
    pshufb              m19, m4
    pmaddubsw           m18, m19, m20   ; scaling * grain
    pmaddubsw           m19, m21
    add          grain_lutq, 82*2<<%2
    lea               lumaq, [lumaq+lstrideq*(2<<%3)]
    lea                srcq, [srcq+strideq*(2<<%2)]
    pmulhrsw            m18, m6         ; noise
    pmulhrsw            m19, m6
    punpcklbw           m16, m17, m5    ; chroma
    punpckhbw           m17, m5
    paddw               m16, m18
    paddw               m17, m19
    packuswb            m16, m17
    pmaxub              m16, m7         ; clamp to [m7, m8]
    pminub              m16, m8
%if %2
    mova          [dstq+strideq*0], xm16
    vextracti128  [dstq+strideq*1], ym16, 1
    vextracti32x4 [dstq+strideq*2], m16, 2
    vextracti32x4 [dstq+stride3q ], m16, 3
%else
    mova          [dstq+strideq*0], ym16
    vextracti32x8 [dstq+strideq*1], m16, 1
%endif
    lea                dstq, [dstq+strideq*(2<<%2)]
    ret
%else
    ; %1 clear: m18 (luma) is used as the scaling index as is. The shared
    ; tail was emitted by the %1 = 1 expansion within the same function.
    jmp .add_noise_main
%endif
%endmacro

    ; the loop is expanded twice per function: with the luma/chroma mix
    ; (%1 = 1), and after .csfl without it (%1 = 0)
    %%FGUV_32x32xN_LOOP 1, %2, %3
.csfl:
    %%FGUV_32x32xN_LOOP 0, %2, %3
.end:
    RET
%endmacro

; One instantiation per layout tag. In the code above the second argument
; scales the horizontal quantities (column width, luma step) and the third
; the vertical ones (grain and luma row steps).
FGUV_FN 420, 1, 1
FGUV_FN 422, 1, 0
FGUV_FN 444, 0, 0

%endif ; ARCH_X86_64