xref: /aosp_15_r20/external/libdav1d/src/x86/filmgrain_avx2.asm (revision c09093415860a1c2373dacd84c4fde00c507cdfd)
1*c0909341SAndroid Build Coastguard Worker; Copyright © 2019-2022, VideoLAN and dav1d authors
2*c0909341SAndroid Build Coastguard Worker; Copyright © 2019-2022, Two Orioles, LLC
3*c0909341SAndroid Build Coastguard Worker; All rights reserved.
4*c0909341SAndroid Build Coastguard Worker;
5*c0909341SAndroid Build Coastguard Worker; Redistribution and use in source and binary forms, with or without
6*c0909341SAndroid Build Coastguard Worker; modification, are permitted provided that the following conditions are met:
7*c0909341SAndroid Build Coastguard Worker;
8*c0909341SAndroid Build Coastguard Worker; 1. Redistributions of source code must retain the above copyright notice, this
9*c0909341SAndroid Build Coastguard Worker;    list of conditions and the following disclaimer.
10*c0909341SAndroid Build Coastguard Worker;
11*c0909341SAndroid Build Coastguard Worker; 2. Redistributions in binary form must reproduce the above copyright notice,
12*c0909341SAndroid Build Coastguard Worker;    this list of conditions and the following disclaimer in the documentation
13*c0909341SAndroid Build Coastguard Worker;    and/or other materials provided with the distribution.
14*c0909341SAndroid Build Coastguard Worker;
15*c0909341SAndroid Build Coastguard Worker; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
16*c0909341SAndroid Build Coastguard Worker; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
17*c0909341SAndroid Build Coastguard Worker; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
18*c0909341SAndroid Build Coastguard Worker; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
19*c0909341SAndroid Build Coastguard Worker; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
20*c0909341SAndroid Build Coastguard Worker; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
21*c0909341SAndroid Build Coastguard Worker; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
22*c0909341SAndroid Build Coastguard Worker; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
23*c0909341SAndroid Build Coastguard Worker; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
24*c0909341SAndroid Build Coastguard Worker; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
25*c0909341SAndroid Build Coastguard Worker
26*c0909341SAndroid Build Coastguard Worker%include "config.asm"
27*c0909341SAndroid Build Coastguard Worker%include "ext/x86/x86inc.asm"
28*c0909341SAndroid Build Coastguard Worker%include "x86/filmgrain_common.asm"
29*c0909341SAndroid Build Coastguard Worker
30*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
31*c0909341SAndroid Build Coastguard Worker
; Read-only constant tables. The generators below address them as
; [base+label], where `base` is defined relative to a jump-table label.
32*c0909341SAndroid Build Coastguard WorkerSECTION_RODATA 32
; pb_mask[i] is 128 when i has an odd number of set bits, 0 otherwise
; (16-entry table used as the pshufb source in the seed-update loops).
33*c0909341SAndroid Build Coastguard Workerpb_mask:       db  0,128,128,  0,128,  0,  0,128,128,  0,  0,128,  0,128,128,  0
; gen_shufE: pshufb mask producing words 0,4,1,5,2,6,3,7.
; gen_shufA..D: pshufb masks selecting four overlapping pairs of adjacent
; words, starting at word 0, 1, 2 and 3 respectively.
; .ar3 loads these rows as 32-byte pairs (A+B and C+D aligned, E+A unaligned),
; so their order and their position after pb_mask must be preserved.
34*c0909341SAndroid Build Coastguard Workergen_shufE:     db  0,  1,  8,  9,  2,  3, 10, 11,  4,  5, 12, 13,  6,  7, 14, 15
35*c0909341SAndroid Build Coastguard Workergen_shufA:     db  0,  1,  2,  3,  2,  3,  4,  5,  4,  5,  6,  7,  6,  7,  8,  9
36*c0909341SAndroid Build Coastguard Workergen_shufB:     db  2,  3,  4,  5,  4,  5,  6,  7,  6,  7,  8,  9,  8,  9, 10, 11
37*c0909341SAndroid Build Coastguard Workergen_shufC:     db  4,  5,  6,  7,  6,  7,  8,  9,  8,  9, 10, 11, 10, 11, 12, 13
38*c0909341SAndroid Build Coastguard Workergen_shufD:     db  6,  7,  8,  9,  8,  9, 10, 11, 10, 11, 12, 13, 12, 13, 14, 15
39*c0909341SAndroid Build Coastguard Worker; note: the order of (some of) the following constants matter
40*c0909341SAndroid Build Coastguard Workerpb_27_17:      times 2 db 27, 17
; byte_blend: source of the vpblendvb masks. .ar3 loads 4 bytes at +0 (only
; byte 3 has its top bit set); .ar2 loads 4 bytes at +1 (only byte 2 has its
; top bit set), which also pulls in the first byte of pb_27_17_17_27 (27,
; top bit clear).
41*c0909341SAndroid Build Coastguard Workerbyte_blend:            db  0,  0,  0, -1
42*c0909341SAndroid Build Coastguard Workerpb_27_17_17_27:        db 27, 17, 17, 27,  0, 32,  0, 32
43*c0909341SAndroid Build Coastguard Workerpb_17_27:      times 2 db 17, 27
; pb_1: .ar1 inserts its first byte into the coefficient vector so that the
; rounding term is multiplied by 1 inside pmaddwd.
44*c0909341SAndroid Build Coastguard Workerpb_1:          times 4 db 1
45*c0909341SAndroid Build Coastguard Workerpb_23_22:              db 23, 22,  0, 32,  0, 32,  0, 32
; next_upperbit_mask: per-lane AND masks applied to the seed in the
; seed-update loops (four words, one per generated seed).
46*c0909341SAndroid Build Coastguard Workernext_upperbit_mask:    dw 0x100B, 0x2016, 0x402C, 0x8058
; pw_seed_xor: the uv generator XORs the seed with the dword at index uv.
47*c0909341SAndroid Build Coastguard Workerpw_seed_xor:   times 2 dw 0xb524
48*c0909341SAndroid Build Coastguard Worker               times 2 dw 0x49d8
49*c0909341SAndroid Build Coastguard Workerfg_min:        times 4 db 0
50*c0909341SAndroid Build Coastguard Worker               times 4 db 16
51*c0909341SAndroid Build Coastguard Workerfg_max:        times 4 db 255
52*c0909341SAndroid Build Coastguard Worker               times 4 db 240
53*c0909341SAndroid Build Coastguard Worker               times 4 db 235
54*c0909341SAndroid Build Coastguard Workerpd_m65536:             dd -65536
55*c0909341SAndroid Build Coastguard Workerpw_8:          times 2 dw 8
56*c0909341SAndroid Build Coastguard Workerpw_1024:       times 2 dw 1024
; hmul_bits / mul_bits: pmulhuw / pmullw multipliers of the seed update
; (only the first four words of mul_bits are loaded there).
56*c0909341SAndroid Build Coastguard Workerhmul_bits:             dw 32768, 16384,  8192,  4096
; round: pmulhrsw scale factor, indexed by FGData.grain_scale_shift.
; NOTE(review): an index of 3 would read mul_bits[0] (256); presumably this
; is one of the intended order dependencies - confirm the field's range.
57*c0909341SAndroid Build Coastguard Workerround:                 dw  2048,  1024,   512
58*c0909341SAndroid Build Coastguard Workermul_bits:              dw   256,   128,    64,    32,    16
; round_vals: indexed by FGData.ar_coeff_shift with byte offsets of
; shift*2-12 (.ar1, .ar3), shift*2-14 as a dword (.ar2) and shift*2-10 (.ar3).
; The shift*2-14 dword load can start in the last word of mul_bits, so
; round_vals must directly follow mul_bits.
; NOTE(review): the offsets suggest ar_coeff_shift is expected in 6..9 - confirm.
59*c0909341SAndroid Build Coastguard Workerround_vals:            dw    32,    64,   128,   256,   512
; pw_1: inserted by .ar3 as the multiplier of the rounding term.
60*c0909341SAndroid Build Coastguard Workerpw_1:                  dw 1
62*c0909341SAndroid Build Coastguard Worker
; JMP_TABLE function, suffix, idx...
; Emits the label function_8bpc_suffix_table followed by one dword per idx
; argument. Each dword is the distance from the table label to the local
; label .ar{idx} of the (name-mangled) function, so the dispatcher can load
; an entry, add the table address and jump.
63*c0909341SAndroid Build Coastguard Worker%macro JMP_TABLE 2-*
64*c0909341SAndroid Build Coastguard Worker    %1_8bpc_%2_table:
65*c0909341SAndroid Build Coastguard Worker    %xdefine %%base %1_8bpc_%2_table
66*c0909341SAndroid Build Coastguard Worker    %xdefine %%prefix mangle(private_prefix %+ _%1_8bpc_%2)
    ; one entry per remaining argument; %rotate moves the next idx into %3
67*c0909341SAndroid Build Coastguard Worker    %rep %0 - 2
68*c0909341SAndroid Build Coastguard Worker        dd %%prefix %+ .ar%3 - %%base
69*c0909341SAndroid Build Coastguard Worker        %rotate 1
70*c0909341SAndroid Build Coastguard Worker    %endrep
71*c0909341SAndroid Build Coastguard Worker%endmacro
72*c0909341SAndroid Build Coastguard Worker
; One dispatch table per generator (y, uv_420, uv_422, uv_444), each with
; entries for the local labels .ar0 through .ar3.
73*c0909341SAndroid Build Coastguard WorkerJMP_TABLE generate_grain_y,      avx2, 0, 1, 2, 3
74*c0909341SAndroid Build Coastguard WorkerJMP_TABLE generate_grain_uv_420, avx2, 0, 1, 2, 3
75*c0909341SAndroid Build Coastguard WorkerJMP_TABLE generate_grain_uv_422, avx2, 0, 1, 2, 3
76*c0909341SAndroid Build Coastguard WorkerJMP_TABLE generate_grain_uv_444, avx2, 0, 1, 2, 3
77*c0909341SAndroid Build Coastguard Worker
78*c0909341SAndroid Build Coastguard WorkerSECTION .text
79*c0909341SAndroid Build Coastguard Worker
;-----------------------------------------------------------------------
; generate_grain_y_8bpc(buf, fg_data)
;   buf     : grain buffer to fill
;   fg_data : parameter block, read through the FGData.* offsets
;
; Step 1 (.loop): fill 73*82 bytes of buf, 8 per iteration, with entries of
;   gaussian_sequence selected by a 16-bit seed, scaled with rounding by
;   round[FGData.grain_scale_shift] and saturated to signed bytes.
; Step 2: jump to .ar0/.ar1/.ar2/.ar3 through the jump table, indexed by
;   FGData.ar_coeff_lag.
;-----------------------------------------------------------------------
80*c0909341SAndroid Build Coastguard WorkerINIT_YMM avx2
81*c0909341SAndroid Build Coastguard Workercglobal generate_grain_y_8bpc, 2, 9, 8, buf, fg_data
; r4 holds the jump-table address; every constant is reached as [base+label].
82*c0909341SAndroid Build Coastguard Worker%define base r4-generate_grain_y_8bpc_avx2_table
83*c0909341SAndroid Build Coastguard Worker    lea              r4, [generate_grain_y_8bpc_avx2_table]
84*c0909341SAndroid Build Coastguard Worker    vpbroadcastw    xm0, [fg_dataq+FGData.seed]
85*c0909341SAndroid Build Coastguard Worker    mov             r6d, [fg_dataq+FGData.grain_scale_shift]
86*c0909341SAndroid Build Coastguard Worker    movq            xm1, [base+next_upperbit_mask]
87*c0909341SAndroid Build Coastguard Worker    movsxd           r5, [fg_dataq+FGData.ar_coeff_lag]
88*c0909341SAndroid Build Coastguard Worker    movq            xm4, [base+mul_bits]
89*c0909341SAndroid Build Coastguard Worker    movq            xm5, [base+hmul_bits]
    ; r7 counts up from -73*82 towards 0 while bufq is moved to the end of
    ; the area, so [bufq+r7] walks the buffer front to back.
90*c0909341SAndroid Build Coastguard Worker    mov              r7, -73*82
91*c0909341SAndroid Build Coastguard Worker    mova            xm6, [base+pb_mask]
92*c0909341SAndroid Build Coastguard Worker    sub            bufq, r7
93*c0909341SAndroid Build Coastguard Worker    vpbroadcastw    xm7, [base+round+r6*2]
    ; r6 is reused as the gaussian_sequence base once the scale is loaded
94*c0909341SAndroid Build Coastguard Worker    lea              r6, [gaussian_sequence]
    ; r5 = jump-table entry for ar_coeff_lag (consumed after the loop)
95*c0909341SAndroid Build Coastguard Worker    movsxd           r5, [r4+r5*4]
96*c0909341SAndroid Build Coastguard Worker.loop:
    ; First batch: derive the next four seeds (one per word lane) from the
    ; seed broadcast in xm0.
97*c0909341SAndroid Build Coastguard Worker    pand            xm2, xm0, xm1
98*c0909341SAndroid Build Coastguard Worker    psrlw           xm3, xm2, 10
99*c0909341SAndroid Build Coastguard Worker    por             xm2, xm3            ; bits 0xf, 0x1e, 0x3c and 0x78 are set
100*c0909341SAndroid Build Coastguard Worker    pmullw          xm2, xm4            ; bits 0x0f00 are set
101*c0909341SAndroid Build Coastguard Worker    pmulhuw         xm0, xm5
102*c0909341SAndroid Build Coastguard Worker    pshufb          xm3, xm6, xm2       ; set 15th bit for next 4 seeds
103*c0909341SAndroid Build Coastguard Worker    psllq           xm2, xm3, 30
104*c0909341SAndroid Build Coastguard Worker    por             xm2, xm3
105*c0909341SAndroid Build Coastguard Worker    psllq           xm3, xm2, 15
106*c0909341SAndroid Build Coastguard Worker    por             xm2, xm0            ; aggregate each bit into next seed's high bit
107*c0909341SAndroid Build Coastguard Worker    por             xm3, xm2            ; 4 next output seeds
    ; Word 3 (the last seed) is broadcast as input of the next batch; the
    ; seeds shifted right by 5 become the gaussian_sequence indices.
108*c0909341SAndroid Build Coastguard Worker    pshuflw         xm0, xm3, q3333
109*c0909341SAndroid Build Coastguard Worker    psrlw           xm3, 5
    ; Second batch of four seeds, interleaved with the table lookups of the
    ; first batch. r2 holds four 16-bit indices; they are peeled off with
    ; movzx / shr 16 / rorx 32 and used as word offsets into [r6].
110*c0909341SAndroid Build Coastguard Worker    pand            xm2, xm0, xm1
111*c0909341SAndroid Build Coastguard Worker    movq             r2, xm3
112*c0909341SAndroid Build Coastguard Worker    psrlw           xm3, xm2, 10
113*c0909341SAndroid Build Coastguard Worker    por             xm2, xm3
114*c0909341SAndroid Build Coastguard Worker    pmullw          xm2, xm4
115*c0909341SAndroid Build Coastguard Worker    pmulhuw         xm0, xm5
116*c0909341SAndroid Build Coastguard Worker    movzx           r3d, r2w
117*c0909341SAndroid Build Coastguard Worker    pshufb          xm3, xm6, xm2
118*c0909341SAndroid Build Coastguard Worker    psllq           xm2, xm3, 30
119*c0909341SAndroid Build Coastguard Worker    por             xm2, xm3
120*c0909341SAndroid Build Coastguard Worker    psllq           xm3, xm2, 15
121*c0909341SAndroid Build Coastguard Worker    por             xm0, xm2
    ; xm2 is free again from here on and collects the 8 looked-up samples
122*c0909341SAndroid Build Coastguard Worker    movd            xm2, [r6+r3*2]
123*c0909341SAndroid Build Coastguard Worker    rorx             r3, r2, 32
124*c0909341SAndroid Build Coastguard Worker    por             xm3, xm0
125*c0909341SAndroid Build Coastguard Worker    shr             r2d, 16
126*c0909341SAndroid Build Coastguard Worker    pinsrw          xm2, [r6+r2*2], 1
127*c0909341SAndroid Build Coastguard Worker    pshuflw         xm0, xm3, q3333
128*c0909341SAndroid Build Coastguard Worker    movzx           r2d, r3w
129*c0909341SAndroid Build Coastguard Worker    psrlw           xm3, 5
130*c0909341SAndroid Build Coastguard Worker    pinsrw          xm2, [r6+r2*2], 2
131*c0909341SAndroid Build Coastguard Worker    shr             r3d, 16
    ; indices of the second batch
132*c0909341SAndroid Build Coastguard Worker    movq             r2, xm3
133*c0909341SAndroid Build Coastguard Worker    pinsrw          xm2, [r6+r3*2], 3
134*c0909341SAndroid Build Coastguard Worker    movzx           r3d, r2w
135*c0909341SAndroid Build Coastguard Worker    pinsrw          xm2, [r6+r3*2], 4
136*c0909341SAndroid Build Coastguard Worker    rorx             r3, r2, 32
137*c0909341SAndroid Build Coastguard Worker    shr             r2d, 16
138*c0909341SAndroid Build Coastguard Worker    pinsrw          xm2, [r6+r2*2], 5
139*c0909341SAndroid Build Coastguard Worker    movzx           r2d, r3w
140*c0909341SAndroid Build Coastguard Worker    pinsrw          xm2, [r6+r2*2], 6
141*c0909341SAndroid Build Coastguard Worker    shr             r3d, 16
142*c0909341SAndroid Build Coastguard Worker    pinsrw          xm2, [r6+r3*2], 7
    ; scale the 8 samples with rounding, saturate to int8 and store 8 bytes
143*c0909341SAndroid Build Coastguard Worker    pmulhrsw        xm2, xm7
144*c0909341SAndroid Build Coastguard Worker    packsswb        xm2, xm2
145*c0909341SAndroid Build Coastguard Worker    movq      [bufq+r7], xm2
    ; 73*82 is not a multiple of 8: the last iteration runs with r7 = -2 and
    ; its store covers 6 bytes past offset 73*82.
    ; NOTE(review): assumes the caller's buffer has room for that - confirm.
146*c0909341SAndroid Build Coastguard Worker    add              r7, 8
147*c0909341SAndroid Build Coastguard Worker    jl .loop
148*c0909341SAndroid Build Coastguard Worker
149*c0909341SAndroid Build Coastguard Worker    ; auto-regression code
    ; r5 holds the table entry loaded before the loop; adding the table
    ; address gives the address of the .arN label for this lag.
150*c0909341SAndroid Build Coastguard Worker    add              r5, r4
151*c0909341SAndroid Build Coastguard Worker    jmp              r5
152*c0909341SAndroid Build Coastguard Worker
; Lag-1 auto-regression pass over the generated grain (70 rows x 76 columns).
; For every sample, in raster order:
;   cur = clamp(cur + ((cf0*top[x-1] + cf1*top[x] + cf2*top[x+1]
;                       + cf3*left + rnd) >> shift), -128, 127)
; The three top-row products plus rnd are computed for four x at a time with
; SIMD; the left-neighbour term is applied per sample in scalar code because
; it depends on the previous output.
153*c0909341SAndroid Build Coastguard Worker.ar1:
    ; NOTE(review): DEFINE_ARGS names registers in order, so val3 shares r4
    ; with `base`; all [base+...] loads below happen before val3 is written.
154*c0909341SAndroid Build Coastguard Worker    DEFINE_ARGS buf, fg_data, cf3, shift, val3, min, max, x, val0
155*c0909341SAndroid Build Coastguard Worker    mov          shiftd, [fg_dataq+FGData.ar_coeff_shift]
156*c0909341SAndroid Build Coastguard Worker    movsx          cf3d, byte [fg_dataq+FGData.ar_coeffs_y+3]
157*c0909341SAndroid Build Coastguard Worker    movd            xm5, [fg_dataq+FGData.ar_coeffs_y]
158*c0909341SAndroid Build Coastguard Worker    mova            xm2, [base+gen_shufC]
    ; fg_data is no longer needed; its register becomes the row counter h
159*c0909341SAndroid Build Coastguard Worker    DEFINE_ARGS buf, h, cf3, shift, val3, min, max, x, val0
    ; replace byte 3 (cf3) by 1 so that the fourth word multiplies rnd
160*c0909341SAndroid Build Coastguard Worker    pinsrb          xm5, [base+pb_1], 3
161*c0909341SAndroid Build Coastguard Worker    vpbroadcastw    xm3, [base+round_vals+shiftq*2-12]    ; rnd
162*c0909341SAndroid Build Coastguard Worker    pmovsxbw        xm5, xm5
    ; xm4 = (cf0, cf1) in every dword, xm5 = (cf2, 1) in every dword
163*c0909341SAndroid Build Coastguard Worker    pshufd          xm4, xm5, q0000
164*c0909341SAndroid Build Coastguard Worker    pshufd          xm5, xm5, q1111
    ; bufq was advanced by 73*82 by the fill loop setup; this leaves it at
    ; row 3, column 79, and x = -76..-1 then addresses columns 3..78.
165*c0909341SAndroid Build Coastguard Worker    sub            bufq, 82*73-(82*3+79)
166*c0909341SAndroid Build Coastguard Worker    mov              hd, 70
167*c0909341SAndroid Build Coastguard Worker    mov            mind, -128
168*c0909341SAndroid Build Coastguard Worker    mov            maxd, 127
169*c0909341SAndroid Build Coastguard Worker.y_loop_ar1:
170*c0909341SAndroid Build Coastguard Worker    mov              xq, -76
    ; left neighbour of the first sample in the row
171*c0909341SAndroid Build Coastguard Worker    movsx         val3d, byte [bufq+xq-1]
172*c0909341SAndroid Build Coastguard Worker.x_loop_ar1:
    ; xm1 = row above, x-3..x+4, sign-extended to words
173*c0909341SAndroid Build Coastguard Worker    pmovsxbw        xm1, [bufq+xq-82-3]
    ; xm0 = (top[x-1], top[x]) pairs, xm1 = (top[x+1], rnd) pairs for 4 x
174*c0909341SAndroid Build Coastguard Worker    pshufb          xm0, xm1, xm2
175*c0909341SAndroid Build Coastguard Worker    punpckhwd       xm1, xm3
176*c0909341SAndroid Build Coastguard Worker    pmaddwd         xm0, xm4
177*c0909341SAndroid Build Coastguard Worker    pmaddwd         xm1, xm5
    ; xm0 = four dword sums of the top-row terms plus rnd
178*c0909341SAndroid Build Coastguard Worker    paddd           xm0, xm1
179*c0909341SAndroid Build Coastguard Worker.x_loop_ar1_inner:
    ; consume one dword sum per sample
180*c0909341SAndroid Build Coastguard Worker    movd          val0d, xm0
181*c0909341SAndroid Build Coastguard Worker    psrldq          xm0, 4
    ; val3 still holds the previous output (the left neighbour)
182*c0909341SAndroid Build Coastguard Worker    imul          val3d, cf3d
183*c0909341SAndroid Build Coastguard Worker    add           val3d, val0d
184*c0909341SAndroid Build Coastguard Worker    movsx         val0d, byte [bufq+xq]
185*c0909341SAndroid Build Coastguard Worker    sarx          val3d, val3d, shiftd
186*c0909341SAndroid Build Coastguard Worker    add           val3d, val0d
    ; clamp to [min, max] using the sign of the differences
187*c0909341SAndroid Build Coastguard Worker    cmp           val3d, maxd
188*c0909341SAndroid Build Coastguard Worker    cmovns        val3d, maxd
189*c0909341SAndroid Build Coastguard Worker    cmp           val3d, mind
190*c0909341SAndroid Build Coastguard Worker    cmovs         val3d, mind
191*c0909341SAndroid Build Coastguard Worker    mov       [bufq+xq], val3b
192*c0909341SAndroid Build Coastguard Worker    ; keep val3d in-place as left for next x iteration
193*c0909341SAndroid Build Coastguard Worker    inc              xq
194*c0909341SAndroid Build Coastguard Worker    jz .x_loop_ar1_end
    ; x starts at a multiple of 4, so the SIMD sums are refreshed every 4 x
195*c0909341SAndroid Build Coastguard Worker    test             xb, 3
196*c0909341SAndroid Build Coastguard Worker    jnz .x_loop_ar1_inner
197*c0909341SAndroid Build Coastguard Worker    jmp .x_loop_ar1
198*c0909341SAndroid Build Coastguard Worker.x_loop_ar1_end:
    ; next row
199*c0909341SAndroid Build Coastguard Worker    add            bufq, 82
200*c0909341SAndroid Build Coastguard Worker    dec              hd
201*c0909341SAndroid Build Coastguard Worker    jg .y_loop_ar1
; Lag 0: no auto-regression, the generated grain is returned as is.
202*c0909341SAndroid Build Coastguard Worker.ar0:
203*c0909341SAndroid Build Coastguard Worker    RET
204*c0909341SAndroid Build Coastguard Worker
; Lag-2 auto-regression pass (70 rows x 76 columns). Twelve coefficients:
; cf0-4 weight row y-2 (x-2..x+2), cf5-9 weight row y-1 (x-2..x+2) and
; cf10-11 weight the current row (x-2, x-1). The two upper rows are summed
; for four x at a time; the current-row term is applied per sample.
205*c0909341SAndroid Build Coastguard Worker.ar2:
; NOTE(review): Win64-only stack reservation and xmm save through x86inc
; macros (this path uses xmm registers up to xm15) - see x86inc.asm for the
; exact save-area layout.
206*c0909341SAndroid Build Coastguard Worker%if WIN64
207*c0909341SAndroid Build Coastguard Worker    %assign stack_size_padded 168
208*c0909341SAndroid Build Coastguard Worker    SUB             rsp, stack_size_padded
209*c0909341SAndroid Build Coastguard Worker    WIN64_PUSH_XMM   16, 8
210*c0909341SAndroid Build Coastguard Worker%endif
211*c0909341SAndroid Build Coastguard Worker    DEFINE_ARGS buf, fg_data, h, x
212*c0909341SAndroid Build Coastguard Worker    mov             r6d, [fg_dataq+FGData.ar_coeff_shift]
213*c0909341SAndroid Build Coastguard Worker    pmovsxbw        xm7, [fg_dataq+FGData.ar_coeffs_y+0]    ; cf0-7
214*c0909341SAndroid Build Coastguard Worker    movd            xm9, [fg_dataq+FGData.ar_coeffs_y+8]    ; cf8-11
    ; dword load of two adjacent words; the psrld 16 below keeps only
    ; round_vals[shift-6], zero-extended, in every dword of xm10
215*c0909341SAndroid Build Coastguard Worker    vpbroadcastd   xm10, [base+round_vals-14+r6*2]
    ; blend mask whose only selected byte is byte 2
216*c0909341SAndroid Build Coastguard Worker    movd           xm11, [base+byte_blend+1]
217*c0909341SAndroid Build Coastguard Worker    pmovsxbw        xm9, xm9
    ; coefficient pairs broadcast to every dword:
    ;   xm4 = (cf0,cf1)  xm6 = (cf2,cf3)  xm7 = (cf4,cf5)
    ;   xm5 = (cf6,cf7)  xm8 = (cf8,cf9)  xm9 = (cf10,cf11)
    ; xm12..xm15 = shuffle masks gen_shufA..gen_shufD
218*c0909341SAndroid Build Coastguard Worker    pshufd          xm4, xm7, q0000
219*c0909341SAndroid Build Coastguard Worker    mova           xm12, [base+gen_shufA]
220*c0909341SAndroid Build Coastguard Worker    pshufd          xm5, xm7, q3333
221*c0909341SAndroid Build Coastguard Worker    mova           xm13, [base+gen_shufB]
222*c0909341SAndroid Build Coastguard Worker    pshufd          xm6, xm7, q1111
223*c0909341SAndroid Build Coastguard Worker    mova           xm14, [base+gen_shufC]
224*c0909341SAndroid Build Coastguard Worker    pshufd          xm7, xm7, q2222
225*c0909341SAndroid Build Coastguard Worker    mova           xm15, [base+gen_shufD]
226*c0909341SAndroid Build Coastguard Worker    pshufd          xm8, xm9, q0000
227*c0909341SAndroid Build Coastguard Worker    psrld          xm10, 16
228*c0909341SAndroid Build Coastguard Worker    pshufd          xm9, xm9, q1111
    ; bufq was advanced by 73*82 by the fill loop setup; this leaves it at
    ; row 3, column 79, and x = -76..-1 then addresses columns 3..78.
229*c0909341SAndroid Build Coastguard Worker    sub            bufq, 82*73-(82*3+79)
230*c0909341SAndroid Build Coastguard Worker    mov              hd, 70
231*c0909341SAndroid Build Coastguard Worker.y_loop_ar2:
232*c0909341SAndroid Build Coastguard Worker    mov              xq, -76
233*c0909341SAndroid Build Coastguard Worker.x_loop_ar2:
234*c0909341SAndroid Build Coastguard Worker    pmovsxbw        xm0, [bufq+xq-82*2-2]   ; y=-2,x=[-2,+5]
235*c0909341SAndroid Build Coastguard Worker    pmovsxbw        xm1, [bufq+xq-82*1-2]   ; y=-1,x=[-2,+5]
    ; y=-2: x-2,x-1 with (cf0,cf1)
236*c0909341SAndroid Build Coastguard Worker    pshufb          xm2, xm0, xm12
237*c0909341SAndroid Build Coastguard Worker    pmaddwd         xm2, xm4
    ; y=-1: x-1,x with (cf6,cf7)
238*c0909341SAndroid Build Coastguard Worker    pshufb          xm3, xm1, xm13
239*c0909341SAndroid Build Coastguard Worker    pmaddwd         xm3, xm5
240*c0909341SAndroid Build Coastguard Worker    paddd           xm2, xm3
    ; y=-2: x,x+1 with (cf2,cf3)
241*c0909341SAndroid Build Coastguard Worker    pshufb          xm3, xm0, xm14
242*c0909341SAndroid Build Coastguard Worker    pmaddwd         xm3, xm6
    ; pair y=-2,x+2 with y=-1,x-2 and weight with (cf4,cf5)
243*c0909341SAndroid Build Coastguard Worker    punpckhqdq      xm0, xm0
244*c0909341SAndroid Build Coastguard Worker    punpcklwd       xm0, xm1
245*c0909341SAndroid Build Coastguard Worker    pmaddwd         xm0, xm7
    ; y=-1: x+1,x+2 with (cf8,cf9)
246*c0909341SAndroid Build Coastguard Worker    pshufb          xm1, xm15
247*c0909341SAndroid Build Coastguard Worker    pmaddwd         xm1, xm8
    ; xm2 = rounding term + all upper-row products, one dword per x
248*c0909341SAndroid Build Coastguard Worker    paddd           xm2, xm10
249*c0909341SAndroid Build Coastguard Worker    paddd           xm2, xm3
250*c0909341SAndroid Build Coastguard Worker    paddd           xm0, xm1
251*c0909341SAndroid Build Coastguard Worker    paddd           xm2, xm0
252*c0909341SAndroid Build Coastguard Worker    movq            xm0, [bufq+xq-2]        ; y=0,x=[-2,+5]
253*c0909341SAndroid Build Coastguard Worker.x_loop_ar2_inner:
    ; dword 0 of xm3 = cf10*cur[x-2] + cf11*cur[x-1]
254*c0909341SAndroid Build Coastguard Worker    pmovsxbw        xm1, xm0
255*c0909341SAndroid Build Coastguard Worker    pmaddwd         xm3, xm9, xm1
256*c0909341SAndroid Build Coastguard Worker    psrldq          xm1, 4                  ; y=0,x=0
257*c0909341SAndroid Build Coastguard Worker    paddd           xm3, xm2
258*c0909341SAndroid Build Coastguard Worker    psrldq          xm2, 4                  ; shift top to next pixel
    ; shift count is read straight from memory.
    ; NOTE(review): relies on FGData.ar_coeff_shift being stored as a 64-bit
    ; field (and on the wider memory read being safe) - confirm in
    ; filmgrain_common.asm.
259*c0909341SAndroid Build Coastguard Worker    psrad           xm3, [fg_dataq+FGData.ar_coeff_shift]
260*c0909341SAndroid Build Coastguard Worker    ; don't packssdw since we only care about one value
    ; add the current sample, saturate to int8 and store it
261*c0909341SAndroid Build Coastguard Worker    paddw           xm3, xm1
262*c0909341SAndroid Build Coastguard Worker    packsswb        xm3, xm3
263*c0909341SAndroid Build Coastguard Worker    pextrb    [bufq+xq], xm3, 0
    ; write the new value into the x=0 slot (byte 2) of the current-row
    ; window, then slide the window one sample to the right
264*c0909341SAndroid Build Coastguard Worker    pslldq          xm3, 2
265*c0909341SAndroid Build Coastguard Worker    vpblendvb       xm0, xm3, xm11
266*c0909341SAndroid Build Coastguard Worker    psrldq          xm0, 1
267*c0909341SAndroid Build Coastguard Worker    inc              xq
268*c0909341SAndroid Build Coastguard Worker    jz .x_loop_ar2_end
    ; x starts at a multiple of 4, so the upper-row sums are refreshed every 4 x
269*c0909341SAndroid Build Coastguard Worker    test             xb, 3
270*c0909341SAndroid Build Coastguard Worker    jnz .x_loop_ar2_inner
271*c0909341SAndroid Build Coastguard Worker    jmp .x_loop_ar2
272*c0909341SAndroid Build Coastguard Worker.x_loop_ar2_end:
    ; next row
273*c0909341SAndroid Build Coastguard Worker    add            bufq, 82
274*c0909341SAndroid Build Coastguard Worker    dec              hd
275*c0909341SAndroid Build Coastguard Worker    jg .y_loop_ar2
276*c0909341SAndroid Build Coastguard Worker    RET
277*c0909341SAndroid Build Coastguard Worker
; Lag-3 auto-regression pass (70 rows x 76 columns). Twenty-four
; coefficients: cf0-6 weight row y-3, cf7-13 row y-2, cf14-20 row y-1 (each
; x-3..x+3) and cf21-23 the current row (x-3..x-1). The three upper rows are
; summed for four x at a time using 256-bit registers; the current-row term
; is applied per sample.
; Coefficient pairs are spilled to the stack:
;   [rsp+16*0] = (cf0,cf1)   low lane | (cf8,cf9)   high lane
;   [rsp+16*2] = (cf2,cf3)   low lane | (cf10,cf11) high lane
;   [rsp+16*4] = (cf4,cf5)   low lane | (cf12,cf13) high lane
;   [rsp+16*6] = (cf6,cf7)   low lane | (cf14,cf15) high lane
;   [rsp+16*8] = (cf16,cf17)   [rsp+16*9] = (cf18,cf19)
;   [rsp+16*10] = (cf20, 1)
278*c0909341SAndroid Build Coastguard WorkerINIT_YMM avx2
279*c0909341SAndroid Build Coastguard Worker.ar3:
; NOTE(review): the Win64 branch allocates extra stack and adjusts x86inc's
; stack_size bookkeeping before saving xmm registers - see x86inc.asm for
; the exact save-area layout.
280*c0909341SAndroid Build Coastguard Worker%if WIN64
281*c0909341SAndroid Build Coastguard Worker    ALLOC_STACK   16*14
282*c0909341SAndroid Build Coastguard Worker    %assign stack_size stack_size - 16*4
283*c0909341SAndroid Build Coastguard Worker    WIN64_PUSH_XMM   12, 8
284*c0909341SAndroid Build Coastguard Worker%else
285*c0909341SAndroid Build Coastguard Worker    ALLOC_STACK   16*12
286*c0909341SAndroid Build Coastguard Worker%endif
287*c0909341SAndroid Build Coastguard Worker    mov             r6d, [fg_dataq+FGData.ar_coeff_shift]
    ; blend mask whose only selected byte is byte 3
288*c0909341SAndroid Build Coastguard Worker    movd           xm11, [base+byte_blend]
289*c0909341SAndroid Build Coastguard Worker    pmovsxbw         m1, [fg_dataq+FGData.ar_coeffs_y+ 0]   ; cf0-15
290*c0909341SAndroid Build Coastguard Worker    pmovsxbw        xm2, [fg_dataq+FGData.ar_coeffs_y+16]   ; cf16-23
291*c0909341SAndroid Build Coastguard Worker    pshufd           m0, m1, q0000
292*c0909341SAndroid Build Coastguard Worker    mova    [rsp+16* 0], m0
293*c0909341SAndroid Build Coastguard Worker    pshufd           m0, m1, q1111
294*c0909341SAndroid Build Coastguard Worker    mova    [rsp+16* 2], m0
295*c0909341SAndroid Build Coastguard Worker    pshufd           m0, m1, q2222
296*c0909341SAndroid Build Coastguard Worker    mova    [rsp+16* 4], m0
297*c0909341SAndroid Build Coastguard Worker    pshufd           m1, m1, q3333
298*c0909341SAndroid Build Coastguard Worker    mova    [rsp+16* 6], m1
299*c0909341SAndroid Build Coastguard Worker    pshufd          xm0, xm2, q0000
300*c0909341SAndroid Build Coastguard Worker    mova    [rsp+16* 8], xm0
301*c0909341SAndroid Build Coastguard Worker    pshufd          xm0, xm2, q1111
302*c0909341SAndroid Build Coastguard Worker    mova    [rsp+16* 9], xm0
    ; xm7 words 0-2 = cf21,cf22,cf23 (taken before word 5 of xm2 is replaced)
303*c0909341SAndroid Build Coastguard Worker    psrldq          xm7, xm2, 10
    ; m8 = gen_shufA | gen_shufB, m9 = gen_shufC | gen_shufD,
    ; m10 = gen_shufE | gen_shufA (low | high lane)
304*c0909341SAndroid Build Coastguard Worker    mova             m8, [base+gen_shufA]
    ; word 5 (cf21) becomes 1 so that dword 2 of xm2 is (cf20, 1)
305*c0909341SAndroid Build Coastguard Worker    pinsrw          xm2, [base+pw_1], 5
306*c0909341SAndroid Build Coastguard Worker    mova             m9, [base+gen_shufC]
307*c0909341SAndroid Build Coastguard Worker    pshufd          xm2, xm2, q2222
308*c0909341SAndroid Build Coastguard Worker    movu            m10, [base+gen_shufE]
    ; xm6 = round_vals[shift-6] in every word: the rounding term, later
    ; multiplied by the 1 stored next to cf20
309*c0909341SAndroid Build Coastguard Worker    vpbroadcastw    xm6, [base+round_vals-12+r6*2]
    ; xm7 word 3 = round_vals[shift-5]; in the inner loop it multiplies the
    ; current sample. NOTE(review): for shift in 6..9 this value equals
    ; 1 << shift, so the later shift restores the sample - confirm the range.
310*c0909341SAndroid Build Coastguard Worker    pinsrw          xm7, [base+round_vals+r6*2-10], 3
311*c0909341SAndroid Build Coastguard Worker    mova    [rsp+16*10], xm2
312*c0909341SAndroid Build Coastguard Worker    DEFINE_ARGS buf, fg_data, h, x
    ; bufq was advanced by 73*82 by the fill loop setup; this leaves it at
    ; row 3, column 79, and x = -76..-1 then addresses columns 3..78.
313*c0909341SAndroid Build Coastguard Worker    sub            bufq, 82*73-(82*3+79)
314*c0909341SAndroid Build Coastguard Worker    mov              hd, 70
315*c0909341SAndroid Build Coastguard Worker.y_loop_ar3:
316*c0909341SAndroid Build Coastguard Worker    mov              xq, -76
317*c0909341SAndroid Build Coastguard Worker.x_loop_ar3:
318*c0909341SAndroid Build Coastguard Worker    movu            xm5, [bufq+xq-82*3-3]    ; y=-3,x=[-3,+12]
319*c0909341SAndroid Build Coastguard Worker    vinserti128      m5, [bufq+xq-82*2-3], 1 ; y=-2,x=[-3,+12]
320*c0909341SAndroid Build Coastguard Worker    movu            xm4, [bufq+xq-82*1-3]    ; y=-1,x=[-3,+12]
    ; punpcklbw with itself followed by psraw 8 sign-extends bytes to words:
    ; m3 = rows y-3 | y-2, x-3..x+4; xm4 = row y-1, x-3..x+4;
    ; m5 = further-right samples of the rows, regrouped by punpckhwd
321*c0909341SAndroid Build Coastguard Worker    punpcklbw        m3, m5, m5
322*c0909341SAndroid Build Coastguard Worker    punpckhwd        m5, m4
323*c0909341SAndroid Build Coastguard Worker    psraw            m3, 8
324*c0909341SAndroid Build Coastguard Worker    punpcklbw        m5, m5
325*c0909341SAndroid Build Coastguard Worker    psraw            m5, 8
326*c0909341SAndroid Build Coastguard Worker    punpcklbw       xm4, xm4
327*c0909341SAndroid Build Coastguard Worker    psraw           xm4, 8
    ; Multiply-accumulate the upper rows against the stacked coefficient
    ; pairs; low and high lanes are summed separately in m0 and merged below.
328*c0909341SAndroid Build Coastguard Worker    pshufb           m0, m3, m8
329*c0909341SAndroid Build Coastguard Worker    pmaddwd          m0, [rsp+16*0]
330*c0909341SAndroid Build Coastguard Worker    pshufb           m1, m3, m9
331*c0909341SAndroid Build Coastguard Worker    pmaddwd          m1, [rsp+16*2]
332*c0909341SAndroid Build Coastguard Worker    shufps           m2, m3, m5, q1032
333*c0909341SAndroid Build Coastguard Worker    paddd            m0, m1
334*c0909341SAndroid Build Coastguard Worker    pshufb           m1, m2, m8
335*c0909341SAndroid Build Coastguard Worker    vperm2i128       m3, m4, 0x21
336*c0909341SAndroid Build Coastguard Worker    pmaddwd          m1, [rsp+16*4]
337*c0909341SAndroid Build Coastguard Worker    shufps          xm2, xm3, q1021
338*c0909341SAndroid Build Coastguard Worker    vpblendd         m2, m3, 0xf0
339*c0909341SAndroid Build Coastguard Worker    pshufb           m2, m10
340*c0909341SAndroid Build Coastguard Worker    paddd            m0, m1
341*c0909341SAndroid Build Coastguard Worker    pmaddwd          m2, [rsp+16*6]
342*c0909341SAndroid Build Coastguard Worker    pshufb          xm1, xm4, xm9
343*c0909341SAndroid Build Coastguard Worker    pmaddwd         xm1, [rsp+16*8]
344*c0909341SAndroid Build Coastguard Worker    shufps          xm4, xm5, q1132
345*c0909341SAndroid Build Coastguard Worker    paddd            m0, m2
346*c0909341SAndroid Build Coastguard Worker    pshufb          xm2, xm4, xm8
347*c0909341SAndroid Build Coastguard Worker    pshufd          xm4, xm4, q2121
348*c0909341SAndroid Build Coastguard Worker    pmaddwd         xm2, [rsp+16*9]
    ; pair samples with the rounding value and weight with (cf20, 1)
349*c0909341SAndroid Build Coastguard Worker    punpcklwd       xm4, xm6
350*c0909341SAndroid Build Coastguard Worker    pmaddwd         xm4, [rsp+16*10]
    ; fold the high lane of m0 into the low lane together with the xmm sums
351*c0909341SAndroid Build Coastguard Worker    vextracti128    xm3, m0, 1
352*c0909341SAndroid Build Coastguard Worker    paddd           xm0, xm1
353*c0909341SAndroid Build Coastguard Worker    movq            xm1, [bufq+xq-3]        ; y=0,x=[-3,+4]
354*c0909341SAndroid Build Coastguard Worker    paddd           xm2, xm4
355*c0909341SAndroid Build Coastguard Worker    paddd           xm0, xm2
    ; xm0 = upper-row sums plus rounding, one dword per x for four x
356*c0909341SAndroid Build Coastguard Worker    paddd           xm0, xm3
357*c0909341SAndroid Build Coastguard Worker.x_loop_ar3_inner:
    ; dword 0 = cf21*cur[x-3] + cf22*cur[x-2],
    ; dword 1 = cf23*cur[x-1] + xm7.word3*cur[x]
358*c0909341SAndroid Build Coastguard Worker    pmovsxbw        xm2, xm1
359*c0909341SAndroid Build Coastguard Worker    pmaddwd         xm2, xm7
360*c0909341SAndroid Build Coastguard Worker    pshufd          xm3, xm2, q1111
361*c0909341SAndroid Build Coastguard Worker    paddd           xm2, xm0                ; add top
362*c0909341SAndroid Build Coastguard Worker    paddd           xm2, xm3                ; left+cur
    ; advance the upper-row sums to the next sample
363*c0909341SAndroid Build Coastguard Worker    psrldq          xm0, 4
    ; shift count is read straight from memory.
    ; NOTE(review): relies on FGData.ar_coeff_shift being stored as a 64-bit
    ; field (and on the wider memory read being safe) - confirm in
    ; filmgrain_common.asm.
364*c0909341SAndroid Build Coastguard Worker    psrad           xm2, [fg_dataq+FGData.ar_coeff_shift]
365*c0909341SAndroid Build Coastguard Worker    ; don't packssdw since we only care about one value
366*c0909341SAndroid Build Coastguard Worker    packsswb        xm2, xm2
367*c0909341SAndroid Build Coastguard Worker    pextrb    [bufq+xq], xm2, 0
    ; write the new value into the x=0 slot (byte 3) of the current-row
    ; window, then slide the window one sample to the right
368*c0909341SAndroid Build Coastguard Worker    pslldq          xm2, 3
369*c0909341SAndroid Build Coastguard Worker    vpblendvb       xm1, xm2, xm11
370*c0909341SAndroid Build Coastguard Worker    psrldq          xm1, 1
371*c0909341SAndroid Build Coastguard Worker    inc              xq
372*c0909341SAndroid Build Coastguard Worker    jz .x_loop_ar3_end
    ; x starts at a multiple of 4, so the upper-row sums are refreshed every 4 x
373*c0909341SAndroid Build Coastguard Worker    test             xb, 3
374*c0909341SAndroid Build Coastguard Worker    jnz .x_loop_ar3_inner
375*c0909341SAndroid Build Coastguard Worker    jmp .x_loop_ar3
376*c0909341SAndroid Build Coastguard Worker.x_loop_ar3_end:
    ; next row
377*c0909341SAndroid Build Coastguard Worker    add            bufq, 82
378*c0909341SAndroid Build Coastguard Worker    dec              hd
379*c0909341SAndroid Build Coastguard Worker    jg .y_loop_ar3
380*c0909341SAndroid Build Coastguard Worker    RET
381*c0909341SAndroid Build Coastguard Worker
382*c0909341SAndroid Build Coastguard Worker%macro GEN_GRAIN_UV_FN 3 ; ss_name, ss_x, ss_y
; ---------------------------------------------------------------------------
; Emits generate_grain_uv_<ss_name>_8bpc(buf, bufy, fg_data, uv).
;
; Stage 1 (noise): fills buf with signed bytes looked up in gaussian_sequence.
;   The lookup index comes from a 16-bit seed sequence that starts at
;   FGData.seed xor pw_seed_xor[uv]; each value is scaled with pmulhrsw by
;   round[FGData.grain_scale_shift] and saturated to int8.
;   - ss_x != 0: (73 - 35*ss_y) rows are written, 44 bytes each, 82 bytes apart.
;   - ss_x == 0: one flat run starting at buf and covering 73*82 bytes.
; Stage 2 (auto-regression): loads a 32-bit offset from the function's jump
;   table, indexed by FGData.ar_coeff_lag, and jumps to it. The table is
;   defined elsewhere; presumably its entries are .ar0/.ar1/.ar2/.ar3 below.
;   Each of those filters buf in place using FGData.ar_coeffs_uv (28 bytes per
;   uv index) plus the co-located values read from bufy, then returns.
;
; ss_x / ss_y are presumably the horizontal / vertical chroma subsampling
; flags (0 or 1) - confirm at the macro's instantiation sites.
; Both buf and bufy are walked with a row stride of 82 bytes.
; ---------------------------------------------------------------------------
383*c0909341SAndroid Build Coastguard WorkerINIT_XMM avx2
384*c0909341SAndroid Build Coastguard Workercglobal generate_grain_uv_%1_8bpc, 4, 10, 16, buf, bufy, fg_data, uv
385*c0909341SAndroid Build Coastguard Worker%define base r4-generate_grain_uv_%1_8bpc_avx2_table
    ; r4 holds the jump-table address for the whole function; every constant
    ; is addressed as [base+sym], i.e. relative to that single pointer.
386*c0909341SAndroid Build Coastguard Worker    lea              r4, [generate_grain_uv_%1_8bpc_avx2_table]
387*c0909341SAndroid Build Coastguard Worker    vpbroadcastw    xm0, [fg_dataq+FGData.seed]  ; seed in every word lane
388*c0909341SAndroid Build Coastguard Worker    mov             r6d, [fg_dataq+FGData.grain_scale_shift]
389*c0909341SAndroid Build Coastguard Worker    movq            xm1, [base+next_upperbit_mask]
390*c0909341SAndroid Build Coastguard Worker    movq            xm4, [base+mul_bits]
391*c0909341SAndroid Build Coastguard Worker    movq            xm5, [base+hmul_bits]
392*c0909341SAndroid Build Coastguard Worker    mova            xm6, [base+pb_mask]
393*c0909341SAndroid Build Coastguard Worker    vpbroadcastw    xm7, [base+round+r6*2]  ; pmulhrsw scale picked by grain_scale_shift
394*c0909341SAndroid Build Coastguard Worker    vpbroadcastd    xm2, [base+pw_seed_xor+uvq*4]  ; per-plane xor constant (4 bytes per uv index)
395*c0909341SAndroid Build Coastguard Worker    pxor            xm0, xm2
396*c0909341SAndroid Build Coastguard Worker    lea              r6, [gaussian_sequence]  ; r6 is reused: now the lookup table base
397*c0909341SAndroid Build Coastguard Worker%if %2
398*c0909341SAndroid Build Coastguard Worker    mov             r7d, 73-35*%3  ; r7d = number of rows to generate
399*c0909341SAndroid Build Coastguard Worker    add            bufq, 44  ; point at end of the 44-byte row so r5 can count up to 0
400*c0909341SAndroid Build Coastguard Worker.loop_y:
401*c0909341SAndroid Build Coastguard Worker    mov              r5, -44
402*c0909341SAndroid Build Coastguard Worker%else
403*c0909341SAndroid Build Coastguard Worker    mov              r5, -73*82
404*c0909341SAndroid Build Coastguard Worker    sub            bufq, r5  ; bufq = buf + 73*82; r5 counts up towards 0
    ; NOTE(review): 73*82 is not a multiple of 4, so the final 4-byte store
    ; below is issued at r5 = -2 and covers 2 bytes past buf + 73*82.
    ; Presumably the caller's buffer has room for that - TODO confirm.
405*c0909341SAndroid Build Coastguard Worker%endif
    ; Each pass advances the seed state and emits 4 grain bytes at [bufq+r5].
406*c0909341SAndroid Build Coastguard Worker.loop:
407*c0909341SAndroid Build Coastguard Worker    pand            xm2, xm0, xm1
408*c0909341SAndroid Build Coastguard Worker    psrlw           xm3, xm2, 10
409*c0909341SAndroid Build Coastguard Worker    por             xm2, xm3            ; bits 0xf, 0x1e, 0x3c and 0x78 are set
410*c0909341SAndroid Build Coastguard Worker    pmullw          xm2, xm4            ; bits 0x0f00 are set
411*c0909341SAndroid Build Coastguard Worker    pmulhuw         xm0, xm5
412*c0909341SAndroid Build Coastguard Worker    pshufb          xm3, xm6, xm2       ; set 15th bit for next 4 seeds
413*c0909341SAndroid Build Coastguard Worker    psllq           xm2, xm3, 30
414*c0909341SAndroid Build Coastguard Worker    por             xm2, xm3
415*c0909341SAndroid Build Coastguard Worker    psllq           xm3, xm2, 15
416*c0909341SAndroid Build Coastguard Worker    por             xm2, xm0            ; aggregate each bit into next seed's high bit
417*c0909341SAndroid Build Coastguard Worker    por             xm2, xm3            ; 4 next output seeds
418*c0909341SAndroid Build Coastguard Worker    pshuflw         xm0, xm2, q3333     ; last seed becomes the state for the next pass
419*c0909341SAndroid Build Coastguard Worker    psrlw           xm2, 5              ; top 11 bits of each seed = table index
420*c0909341SAndroid Build Coastguard Worker    movq             r8, xm2            ; four 16-bit indices packed in one GPR
421*c0909341SAndroid Build Coastguard Worker    movzx           r9d, r8w
422*c0909341SAndroid Build Coastguard Worker    movd            xm2, [r6+r9*2]      ; entry 0 (16-bit entries; word 1 is overwritten next)
423*c0909341SAndroid Build Coastguard Worker    rorx             r9, r8, 32         ; r9 low half = indices 2 and 3
424*c0909341SAndroid Build Coastguard Worker    shr             r8d, 16
425*c0909341SAndroid Build Coastguard Worker    pinsrw          xm2, [r6+r8*2], 1
426*c0909341SAndroid Build Coastguard Worker    movzx           r8d, r9w
427*c0909341SAndroid Build Coastguard Worker    pinsrw          xm2, [r6+r8*2], 2
428*c0909341SAndroid Build Coastguard Worker    shr             r9d, 16
429*c0909341SAndroid Build Coastguard Worker    pinsrw          xm2, [r6+r9*2], 3
430*c0909341SAndroid Build Coastguard Worker    pmulhrsw        xm2, xm7            ; apply the grain_scale_shift scaling
431*c0909341SAndroid Build Coastguard Worker    packsswb        xm2, xm2            ; saturate to signed bytes
432*c0909341SAndroid Build Coastguard Worker    movd      [bufq+r5], xm2            ; store 4 grain values
433*c0909341SAndroid Build Coastguard Worker    add              r5, 4
434*c0909341SAndroid Build Coastguard Worker    jl .loop
435*c0909341SAndroid Build Coastguard Worker%if %2
436*c0909341SAndroid Build Coastguard Worker    add            bufq, 82             ; next row
437*c0909341SAndroid Build Coastguard Worker    dec             r7d
438*c0909341SAndroid Build Coastguard Worker    jg .loop_y
439*c0909341SAndroid Build Coastguard Worker%endif
440*c0909341SAndroid Build Coastguard Worker
441*c0909341SAndroid Build Coastguard Worker    ; auto-regression code
    ; Dispatch: table entries are 32-bit offsets relative to the table itself
    ; (r4), selected by ar_coeff_lag. bufq is left pointing past the generated
    ; data; every .arN entry rewinds it before use.
442*c0909341SAndroid Build Coastguard Worker    movsxd           r6, [fg_dataq+FGData.ar_coeff_lag]
443*c0909341SAndroid Build Coastguard Worker    movsxd           r6, [base+generate_grain_uv_%1_8bpc_avx2_table+r6*4]
444*c0909341SAndroid Build Coastguard Worker    add              r6, r4
445*c0909341SAndroid Build Coastguard Worker    jmp              r6
446*c0909341SAndroid Build Coastguard Worker
    ; -----------------------------------------------------------------------
    ; .ar0: only the luma tap is applied. For every output byte:
    ;   buf[x] = sat8(buf[x] + pmulhrsw(luma * coeff0, hmul_bits[shift]))
    ; where luma is the bufy value (ss_x == 0) or the pmulhrsw-scaled sum of
    ; the 2 (ss_y == 0) or 4 (ss_y == 1) co-located bufy values (ss_x != 0).
    ; hmul_bits contents are not visible here; presumably the multiplier
    ; implements the rounding shift by ar_coeff_shift - TODO confirm.
    ; -----------------------------------------------------------------------
447*c0909341SAndroid Build Coastguard WorkerINIT_YMM avx2
448*c0909341SAndroid Build Coastguard Worker.ar0:
449*c0909341SAndroid Build Coastguard Worker    DEFINE_ARGS buf, bufy, fg_data, uv, unused, shift
450*c0909341SAndroid Build Coastguard Worker    imul            uvd, 28             ; byte offset of this plane's coefficients
451*c0909341SAndroid Build Coastguard Worker    mov          shiftd, [fg_dataq+FGData.ar_coeff_shift]
452*c0909341SAndroid Build Coastguard Worker    movd            xm2, [fg_dataq+FGData.ar_coeffs_uv+uvq]  ; only byte 0 ends up being used
453*c0909341SAndroid Build Coastguard Worker    movd            xm3, [base+hmul_bits+shiftq*2]
    ; fg_data is not needed past this point: its register is re-aliased as h.
454*c0909341SAndroid Build Coastguard Worker    DEFINE_ARGS buf, bufy, h
455*c0909341SAndroid Build Coastguard Worker    pmovsxbw        xm2, xm2
456*c0909341SAndroid Build Coastguard Worker%if %2
457*c0909341SAndroid Build Coastguard Worker    vpbroadcastd     m7, [base+pb_1]    ; all-ones bytes: pmaddubsw then sums adjacent pairs
458*c0909341SAndroid Build Coastguard Worker    vpbroadcastw     m6, [base+hmul_bits+2+%3*2]  ; pmulhrsw scale for the 2- or 4-sample sum
459*c0909341SAndroid Build Coastguard Worker%endif
460*c0909341SAndroid Build Coastguard Worker    vpbroadcastw     m2, xm2            ; m2 = coefficient 0 in every word
461*c0909341SAndroid Build Coastguard Worker    vpbroadcastw     m3, xm3            ; m3 = hmul_bits[shift] in every word
462*c0909341SAndroid Build Coastguard Worker    pxor            m12, m12            ; zero, used to build sign masks via pcmpgtb
    ; Rewind bufq from the end of the generated data to buf + 82*3 + 3
    ; (row 3, column 3). Both variants of the subtraction yield that address.
463*c0909341SAndroid Build Coastguard Worker%if %2
464*c0909341SAndroid Build Coastguard Worker    sub            bufq, 82*(73-35*%3)+82-(82*3+41)
465*c0909341SAndroid Build Coastguard Worker%else
466*c0909341SAndroid Build Coastguard Worker    sub            bufq, 82*70-3
467*c0909341SAndroid Build Coastguard Worker%endif
468*c0909341SAndroid Build Coastguard Worker    add           bufyq, 3+82*3         ; same starting position in bufy
469*c0909341SAndroid Build Coastguard Worker    mov              hd, 70-35*%3       ; rows to filter
470*c0909341SAndroid Build Coastguard Worker.y_loop_ar0:
471*c0909341SAndroid Build Coastguard Worker%if %2
472*c0909341SAndroid Build Coastguard Worker    ; first 32 pixels
    ; 64 bufy bytes are loaded as m4 = [0..15 | 32..47], m5 = [16..31 | 48..63]
    ; so that after pair-summing each 128-bit lane lines up with the
    ; punpcklbw/punpckhbw split of the 32 buf bytes below.
473*c0909341SAndroid Build Coastguard Worker    movu            xm4, [bufyq]
474*c0909341SAndroid Build Coastguard Worker    vinserti128      m4, [bufyq+32], 1
475*c0909341SAndroid Build Coastguard Worker%if %3
476*c0909341SAndroid Build Coastguard Worker    movu            xm0, [bufyq+82]
477*c0909341SAndroid Build Coastguard Worker    vinserti128      m0, [bufyq+82+32], 1
478*c0909341SAndroid Build Coastguard Worker%endif
479*c0909341SAndroid Build Coastguard Worker    movu            xm5, [bufyq+16]
480*c0909341SAndroid Build Coastguard Worker    vinserti128      m5, [bufyq+48], 1
481*c0909341SAndroid Build Coastguard Worker%if %3
482*c0909341SAndroid Build Coastguard Worker    movu            xm1, [bufyq+82+16]
483*c0909341SAndroid Build Coastguard Worker    vinserti128      m1, [bufyq+82+48], 1
484*c0909341SAndroid Build Coastguard Worker%endif
485*c0909341SAndroid Build Coastguard Worker    pmaddubsw        m4, m7, m4         ; horizontal pair sums (signed bytes * 1)
486*c0909341SAndroid Build Coastguard Worker%if %3
487*c0909341SAndroid Build Coastguard Worker    pmaddubsw        m0, m7, m0
488*c0909341SAndroid Build Coastguard Worker%endif
489*c0909341SAndroid Build Coastguard Worker    pmaddubsw        m5, m7, m5
490*c0909341SAndroid Build Coastguard Worker%if %3
491*c0909341SAndroid Build Coastguard Worker    pmaddubsw        m1, m7, m1
492*c0909341SAndroid Build Coastguard Worker    paddw            m4, m0             ; add the second bufy row
493*c0909341SAndroid Build Coastguard Worker    paddw            m5, m1
494*c0909341SAndroid Build Coastguard Worker%endif
495*c0909341SAndroid Build Coastguard Worker    pmulhrsw         m4, m6
496*c0909341SAndroid Build Coastguard Worker    pmulhrsw         m5, m6
497*c0909341SAndroid Build Coastguard Worker%else
498*c0909341SAndroid Build Coastguard Worker    xor             r3d, r3d            ; r3 = byte offset within the row (0, then 32)
499*c0909341SAndroid Build Coastguard Worker    ; first 32x2 pixels
500*c0909341SAndroid Build Coastguard Worker.x_loop_ar0:
501*c0909341SAndroid Build Coastguard Worker    movu             m4, [bufyq+r3]
502*c0909341SAndroid Build Coastguard Worker    pcmpgtb          m0, m12, m4        ; sign mask, used to sign-extend bytes to words
503*c0909341SAndroid Build Coastguard Worker    punpckhbw        m5, m4, m0
504*c0909341SAndroid Build Coastguard Worker    punpcklbw        m4, m0
505*c0909341SAndroid Build Coastguard Worker%endif
506*c0909341SAndroid Build Coastguard Worker    pmullw           m4, m2             ; luma * coefficient
507*c0909341SAndroid Build Coastguard Worker    pmullw           m5, m2
508*c0909341SAndroid Build Coastguard Worker    pmulhrsw         m4, m3
509*c0909341SAndroid Build Coastguard Worker    pmulhrsw         m5, m3
510*c0909341SAndroid Build Coastguard Worker%if %2
511*c0909341SAndroid Build Coastguard Worker    movu             m1, [bufq]
512*c0909341SAndroid Build Coastguard Worker%else
513*c0909341SAndroid Build Coastguard Worker    movu             m1, [bufq+r3]
514*c0909341SAndroid Build Coastguard Worker%endif
515*c0909341SAndroid Build Coastguard Worker    pcmpgtb          m8, m12, m1        ; sign-extend the existing grain bytes
516*c0909341SAndroid Build Coastguard Worker    punpcklbw        m0, m1, m8
517*c0909341SAndroid Build Coastguard Worker    punpckhbw        m1, m8
518*c0909341SAndroid Build Coastguard Worker    paddw            m0, m4
519*c0909341SAndroid Build Coastguard Worker    paddw            m1, m5
520*c0909341SAndroid Build Coastguard Worker    packsswb         m0, m1             ; saturate back to signed bytes
521*c0909341SAndroid Build Coastguard Worker%if %2
522*c0909341SAndroid Build Coastguard Worker    movu         [bufq], m0
523*c0909341SAndroid Build Coastguard Worker%else
524*c0909341SAndroid Build Coastguard Worker    movu      [bufq+r3], m0
525*c0909341SAndroid Build Coastguard Worker    add             r3d, 32
526*c0909341SAndroid Build Coastguard Worker    cmp             r3d, 64
527*c0909341SAndroid Build Coastguard Worker    jl .x_loop_ar0
528*c0909341SAndroid Build Coastguard Worker%endif
529*c0909341SAndroid Build Coastguard Worker
530*c0909341SAndroid Build Coastguard Worker    ; last 6/12 pixels
531*c0909341SAndroid Build Coastguard Worker    movu            xm4, [bufyq+32*2]
532*c0909341SAndroid Build Coastguard Worker%if %2
533*c0909341SAndroid Build Coastguard Worker%if %3
534*c0909341SAndroid Build Coastguard Worker    movu            xm5, [bufyq+32*2+82]
535*c0909341SAndroid Build Coastguard Worker%endif
536*c0909341SAndroid Build Coastguard Worker    pmaddubsw       xm4, xm7, xm4
537*c0909341SAndroid Build Coastguard Worker%if %3
538*c0909341SAndroid Build Coastguard Worker    pmaddubsw       xm5, xm7, xm5
539*c0909341SAndroid Build Coastguard Worker    paddw           xm4, xm5
540*c0909341SAndroid Build Coastguard Worker%endif
541*c0909341SAndroid Build Coastguard Worker    movq            xm0, [bufq+32]      ; 8 bytes loaded, only the first 6 are updated
542*c0909341SAndroid Build Coastguard Worker    pmulhrsw        xm4, xm6
543*c0909341SAndroid Build Coastguard Worker    pmullw          xm4, xm2
544*c0909341SAndroid Build Coastguard Worker    pmulhrsw        xm4, xm3
545*c0909341SAndroid Build Coastguard Worker    pcmpgtb         xm5, xm12, xm0
546*c0909341SAndroid Build Coastguard Worker    punpcklbw       xm5, xm0, xm5
547*c0909341SAndroid Build Coastguard Worker    paddw           xm4, xm5
548*c0909341SAndroid Build Coastguard Worker    packsswb        xm4, xm4
549*c0909341SAndroid Build Coastguard Worker    pblendw         xm0, xm4, xm0, 1000b  ; word 3 (bytes 6-7) keeps its original value
550*c0909341SAndroid Build Coastguard Worker    movq      [bufq+32], xm0
551*c0909341SAndroid Build Coastguard Worker%else
552*c0909341SAndroid Build Coastguard Worker    movu            xm0, [bufq+64]      ; 16 bytes loaded, only the first 12 are updated
553*c0909341SAndroid Build Coastguard Worker    pcmpgtb         xm1, xm12, xm4
554*c0909341SAndroid Build Coastguard Worker    punpckhbw       xm5, xm4, xm1
555*c0909341SAndroid Build Coastguard Worker    punpcklbw       xm4, xm1
556*c0909341SAndroid Build Coastguard Worker    pmullw          xm5, xm2
557*c0909341SAndroid Build Coastguard Worker    pmullw          xm4, xm2
558*c0909341SAndroid Build Coastguard Worker    vpblendd        xm1, xm3, xm12, 0x0c  ; zero the multiplier for words 4-7 of the high half
559*c0909341SAndroid Build Coastguard Worker    pmulhrsw        xm5, xm1            ; so bytes 12-15 receive a zero luma term
560*c0909341SAndroid Build Coastguard Worker    pmulhrsw        xm4, xm3
561*c0909341SAndroid Build Coastguard Worker    pcmpgtb         xm1, xm12, xm0
562*c0909341SAndroid Build Coastguard Worker    punpckhbw       xm8, xm0, xm1
563*c0909341SAndroid Build Coastguard Worker    punpcklbw       xm0, xm1
564*c0909341SAndroid Build Coastguard Worker    paddw           xm5, xm8
565*c0909341SAndroid Build Coastguard Worker    paddw           xm0, xm4
566*c0909341SAndroid Build Coastguard Worker    packsswb        xm0, xm5
567*c0909341SAndroid Build Coastguard Worker    movu      [bufq+64], xm0
568*c0909341SAndroid Build Coastguard Worker%endif
569*c0909341SAndroid Build Coastguard Worker    add            bufq, 82
570*c0909341SAndroid Build Coastguard Worker    add           bufyq, 82<<%3         ; one or two bufy rows per output row
571*c0909341SAndroid Build Coastguard Worker    dec              hd
572*c0909341SAndroid Build Coastguard Worker    jg .y_loop_ar0
573*c0909341SAndroid Build Coastguard Worker    RET
574*c0909341SAndroid Build Coastguard Worker
    ; -----------------------------------------------------------------------
    ; .ar1: taps are top-left, top, top-right (coefficients 0-2), left
    ; (coefficient 3, applied in scalar code) and luma (coefficient 4).
    ; The top-row and luma terms are computed for 4 pixels at a time; the left
    ; term depends on the pixel just written, so the inner loop is serial.
    ; -----------------------------------------------------------------------
575*c0909341SAndroid Build Coastguard WorkerINIT_XMM avx2
576*c0909341SAndroid Build Coastguard Worker.ar1:
577*c0909341SAndroid Build Coastguard Worker    DEFINE_ARGS buf, bufy, fg_data, uv, val3, cf3, min, max, x, shift
578*c0909341SAndroid Build Coastguard Worker    imul            uvd, 28             ; byte offset of this plane's coefficients
579*c0909341SAndroid Build Coastguard Worker    mov          shiftd, [fg_dataq+FGData.ar_coeff_shift]
580*c0909341SAndroid Build Coastguard Worker    movsx          cf3d, byte [fg_dataq+FGData.ar_coeffs_uv+uvq+3]  ; left-neighbour coefficient
581*c0909341SAndroid Build Coastguard Worker    movd            xm4, [fg_dataq+FGData.ar_coeffs_uv+uvq]
582*c0909341SAndroid Build Coastguard Worker    pinsrb          xm4, [fg_dataq+FGData.ar_coeffs_uv+uvq+4], 3  ; bytes now: cf0, cf1, cf2, cf4
583*c0909341SAndroid Build Coastguard Worker    DEFINE_ARGS buf, bufy, h, val0, val3, cf3, min, max, x, shift
584*c0909341SAndroid Build Coastguard Worker    pmovsxbw        xm4, xm4
585*c0909341SAndroid Build Coastguard Worker    pshufd          xm5, xm4, q1111     ; (cf2, cf4) word pair in every dword
586*c0909341SAndroid Build Coastguard Worker    pshufd          xm4, xm4, q0000     ; (cf0, cf1) word pair in every dword
587*c0909341SAndroid Build Coastguard Worker    pmovsxwd        xm3, [base+round_vals+shiftq*2-12]    ; rnd
588*c0909341SAndroid Build Coastguard Worker%if %2
589*c0909341SAndroid Build Coastguard Worker    vpbroadcastd    xm7, [base+pb_1]
590*c0909341SAndroid Build Coastguard Worker    vpbroadcastw    xm6, [base+hmul_bits+2+%3*2]
591*c0909341SAndroid Build Coastguard Worker%endif
592*c0909341SAndroid Build Coastguard Worker    vpbroadcastd    xm3, xm3            ; rounding term in all four dwords
    ; Rewind bufq to one past the last filtered column of row 3:
    ; buf + 82*3 + 41 when ss_x is set, buf + 82*3 + 79 otherwise.
    ; x then runs from -(76 >> ss_x) up to 0.
593*c0909341SAndroid Build Coastguard Worker%if %2
594*c0909341SAndroid Build Coastguard Worker    sub            bufq, 82*(73-35*%3)+44-(82*3+41)
595*c0909341SAndroid Build Coastguard Worker%else
596*c0909341SAndroid Build Coastguard Worker    sub            bufq, 82*70-(82-3)
597*c0909341SAndroid Build Coastguard Worker%endif
598*c0909341SAndroid Build Coastguard Worker    add           bufyq, 79+82*3
599*c0909341SAndroid Build Coastguard Worker    mov              hd, 70-35*%3
600*c0909341SAndroid Build Coastguard Worker    mov            mind, -128           ; int8 clamp bounds
601*c0909341SAndroid Build Coastguard Worker    mov            maxd, 127
602*c0909341SAndroid Build Coastguard Worker.y_loop_ar1:
603*c0909341SAndroid Build Coastguard Worker    mov              xq, -(76>>%2)
604*c0909341SAndroid Build Coastguard Worker    movsx         val3d, byte [bufq+xq-1]  ; left neighbour of the first pixel in the row
605*c0909341SAndroid Build Coastguard Worker.x_loop_ar1:
606*c0909341SAndroid Build Coastguard Worker    pmovsxbw        xm0, [bufq+xq-82-1]     ; top/left
607*c0909341SAndroid Build Coastguard Worker%if %2
608*c0909341SAndroid Build Coastguard Worker    movq            xm8, [bufyq+xq*2]       ; 8 bufy bytes cover 4 output pixels
609*c0909341SAndroid Build Coastguard Worker%if %3
610*c0909341SAndroid Build Coastguard Worker    movq            xm9, [bufyq+xq*2+82]
611*c0909341SAndroid Build Coastguard Worker%endif
612*c0909341SAndroid Build Coastguard Worker%endif
613*c0909341SAndroid Build Coastguard Worker    psrldq          xm2, xm0, 2             ; top
614*c0909341SAndroid Build Coastguard Worker    psrldq          xm1, xm0, 4             ; top/right
615*c0909341SAndroid Build Coastguard Worker%if %2
616*c0909341SAndroid Build Coastguard Worker    pmaddubsw       xm8, xm7, xm8
617*c0909341SAndroid Build Coastguard Worker%if %3
618*c0909341SAndroid Build Coastguard Worker    pmaddubsw       xm9, xm7, xm9
619*c0909341SAndroid Build Coastguard Worker    paddw           xm8, xm9
620*c0909341SAndroid Build Coastguard Worker%endif
621*c0909341SAndroid Build Coastguard Worker    pmulhrsw        xm8, xm6
622*c0909341SAndroid Build Coastguard Worker%else
623*c0909341SAndroid Build Coastguard Worker    pmovsxbw        xm8, [bufyq+xq]
624*c0909341SAndroid Build Coastguard Worker%endif
625*c0909341SAndroid Build Coastguard Worker    punpcklwd       xm0, xm2                ; (top-left, top) pairs
626*c0909341SAndroid Build Coastguard Worker    punpcklwd       xm1, xm8                ; (top-right, luma) pairs
627*c0909341SAndroid Build Coastguard Worker    pmaddwd         xm0, xm4
628*c0909341SAndroid Build Coastguard Worker    pmaddwd         xm1, xm5
629*c0909341SAndroid Build Coastguard Worker    paddd           xm0, xm1
630*c0909341SAndroid Build Coastguard Worker    paddd           xm0, xm3                ; xm0 = 4 partial sums incl. rounding
    ; Scalar tail, one pixel per pass:
    ;   val3 = clamp(((left * cf3 + partial) >> shift) + cur, -128, 127)
631*c0909341SAndroid Build Coastguard Worker.x_loop_ar1_inner:
632*c0909341SAndroid Build Coastguard Worker    movd          val0d, xm0
633*c0909341SAndroid Build Coastguard Worker    psrldq          xm0, 4                  ; expose the next pixel's partial sum
634*c0909341SAndroid Build Coastguard Worker    imul          val3d, cf3d
635*c0909341SAndroid Build Coastguard Worker    add           val3d, val0d
636*c0909341SAndroid Build Coastguard Worker    sarx          val3d, val3d, shiftd
637*c0909341SAndroid Build Coastguard Worker    movsx         val0d, byte [bufq+xq]
638*c0909341SAndroid Build Coastguard Worker    add           val3d, val0d
639*c0909341SAndroid Build Coastguard Worker    cmp           val3d, maxd
640*c0909341SAndroid Build Coastguard Worker    cmovns        val3d, maxd               ; val3 - max not negative: clamp to max
641*c0909341SAndroid Build Coastguard Worker    cmp           val3d, mind
642*c0909341SAndroid Build Coastguard Worker    cmovs         val3d, mind               ; val3 - min negative: clamp to min
643*c0909341SAndroid Build Coastguard Worker    mov  byte [bufq+xq], val3b
644*c0909341SAndroid Build Coastguard Worker    ; keep val3d in-place as left for next x iteration
645*c0909341SAndroid Build Coastguard Worker    inc              xq
646*c0909341SAndroid Build Coastguard Worker    jz .x_loop_ar1_end
647*c0909341SAndroid Build Coastguard Worker    test             xq, 3                  ; every 4th pixel, recompute the vector partial sums
648*c0909341SAndroid Build Coastguard Worker    jnz .x_loop_ar1_inner
649*c0909341SAndroid Build Coastguard Worker    jmp .x_loop_ar1
650*c0909341SAndroid Build Coastguard Worker
651*c0909341SAndroid Build Coastguard Worker.x_loop_ar1_end:
652*c0909341SAndroid Build Coastguard Worker    add            bufq, 82
653*c0909341SAndroid Build Coastguard Worker    add           bufyq, 82<<%3
654*c0909341SAndroid Build Coastguard Worker    dec              hd
655*c0909341SAndroid Build Coastguard Worker    jg .y_loop_ar1
656*c0909341SAndroid Build Coastguard Worker    RET
657*c0909341SAndroid Build Coastguard Worker
    ; -----------------------------------------------------------------------
    ; .ar2: coefficients 0-9 cover the two rows above, 10-11 the two pixels to
    ; the left in the current row, 12 the luma term. The gen_shuf* tables are
    ; defined elsewhere; they select the word pairs that get multiplied with
    ; each coefficient pair.
    ; fg_data stays live here because the inner loop reads the shift count
    ; straight from FGData.ar_coeff_shift.
    ; -----------------------------------------------------------------------
658*c0909341SAndroid Build Coastguard Worker.ar2:
659*c0909341SAndroid Build Coastguard Worker    DEFINE_ARGS buf, bufy, fg_data, uv, unused, shift
660*c0909341SAndroid Build Coastguard Worker    mov          shiftd, [fg_dataq+FGData.ar_coeff_shift]
661*c0909341SAndroid Build Coastguard Worker    imul            uvd, 28
662*c0909341SAndroid Build Coastguard Worker    vpbroadcastw   xm13, [base+round_vals-12+shiftq*2]  ; rounding term, one per word
663*c0909341SAndroid Build Coastguard Worker    pmovsxbw        xm7, [fg_dataq+FGData.ar_coeffs_uv+uvq+0]   ; cf0-7
664*c0909341SAndroid Build Coastguard Worker    pmovsxbw        xm0, [fg_dataq+FGData.ar_coeffs_uv+uvq+8]   ; cf8-12
665*c0909341SAndroid Build Coastguard Worker    pinsrw          xm0, [base+pw_1], 5     ; pair (cf12, pw_1 word) multiplies (luma, rnd) below
666*c0909341SAndroid Build Coastguard Worker%if %2
667*c0909341SAndroid Build Coastguard Worker    vpbroadcastw   xm12, [base+hmul_bits+2+%3*2]
668*c0909341SAndroid Build Coastguard Worker    vpbroadcastd   xm11, [base+pb_1]
669*c0909341SAndroid Build Coastguard Worker%endif
670*c0909341SAndroid Build Coastguard Worker    DEFINE_ARGS buf, bufy, fg_data, h, unused, x
671*c0909341SAndroid Build Coastguard Worker    pshufd          xm4, xm7, q0000         ; (cf0, cf1)
672*c0909341SAndroid Build Coastguard Worker    pshufd          xm5, xm7, q3333         ; (cf6, cf7)
673*c0909341SAndroid Build Coastguard Worker    pshufd          xm6, xm7, q1111         ; (cf2, cf3)
674*c0909341SAndroid Build Coastguard Worker    pshufd          xm7, xm7, q2222         ; (cf4, cf5)
675*c0909341SAndroid Build Coastguard Worker    pshufd          xm8, xm0, q0000         ; (cf8, cf9)
676*c0909341SAndroid Build Coastguard Worker    pshufd          xm9, xm0, q1111         ; (cf10, cf11): current-row taps
677*c0909341SAndroid Build Coastguard Worker    pshufd         xm10, xm0, q2222         ; (cf12, inserted pw_1 word): luma + rounding
678*c0909341SAndroid Build Coastguard Worker%if %2
679*c0909341SAndroid Build Coastguard Worker    sub            bufq, 82*(73-35*%3)+44-(82*3+41)
680*c0909341SAndroid Build Coastguard Worker%else
681*c0909341SAndroid Build Coastguard Worker    sub            bufq, 82*70-(82-3)
682*c0909341SAndroid Build Coastguard Worker%endif
683*c0909341SAndroid Build Coastguard Worker    add           bufyq, 79+82*3
684*c0909341SAndroid Build Coastguard Worker    mov              hd, 70-35*%3
685*c0909341SAndroid Build Coastguard Worker.y_loop_ar2:
686*c0909341SAndroid Build Coastguard Worker    mov              xq, -(76>>%2)
687*c0909341SAndroid Build Coastguard Worker
    ; Vector part: xm2 accumulates the y=-2 / y=-1 taps for 4 output pixels.
688*c0909341SAndroid Build Coastguard Worker.x_loop_ar2:
689*c0909341SAndroid Build Coastguard Worker    pmovsxbw        xm0, [bufq+xq-82*2-2]   ; y=-2,x=[-2,+5]
690*c0909341SAndroid Build Coastguard Worker    pmovsxbw        xm1, [bufq+xq-82*1-2]   ; y=-1,x=[-2,+5]
691*c0909341SAndroid Build Coastguard Worker    pshufb          xm2, xm0, [base+gen_shufA]
692*c0909341SAndroid Build Coastguard Worker    pmaddwd         xm2, xm4
693*c0909341SAndroid Build Coastguard Worker    pshufb          xm3, xm1, [base+gen_shufB]
694*c0909341SAndroid Build Coastguard Worker    pmaddwd         xm3, xm5
695*c0909341SAndroid Build Coastguard Worker    paddd           xm2, xm3
696*c0909341SAndroid Build Coastguard Worker    pshufb          xm3, xm0, [base+gen_shufC]
697*c0909341SAndroid Build Coastguard Worker    pmaddwd         xm3, xm6
698*c0909341SAndroid Build Coastguard Worker    punpckhqdq      xm0, xm0                 ; y=-2,x=[+2,+5]
699*c0909341SAndroid Build Coastguard Worker    punpcklwd       xm0, xm1
700*c0909341SAndroid Build Coastguard Worker    pmaddwd         xm0, xm7
    ; NOTE(review): gen_shufD is addressed without base+, unlike the other
    ; gen_shuf* loads in this macro - presumably assembled as a plain
    ; RIP-relative reference; confirm this is intentional.
701*c0909341SAndroid Build Coastguard Worker    pshufb          xm1, [gen_shufD]
702*c0909341SAndroid Build Coastguard Worker    pmaddwd         xm1, xm8
703*c0909341SAndroid Build Coastguard Worker    paddd           xm2, xm3
704*c0909341SAndroid Build Coastguard Worker    paddd           xm0, xm1
705*c0909341SAndroid Build Coastguard Worker    paddd           xm2, xm0
706*c0909341SAndroid Build Coastguard Worker
    ; Luma term: 4 words, from bufy directly or from the scaled 2/4-sample sum.
707*c0909341SAndroid Build Coastguard Worker%if %2
708*c0909341SAndroid Build Coastguard Worker    movq            xm0, [bufyq+xq*2]
709*c0909341SAndroid Build Coastguard Worker%if %3
710*c0909341SAndroid Build Coastguard Worker    movq            xm3, [bufyq+xq*2+82]
711*c0909341SAndroid Build Coastguard Worker%endif
712*c0909341SAndroid Build Coastguard Worker    pmaddubsw       xm0, xm11, xm0
713*c0909341SAndroid Build Coastguard Worker%if %3
714*c0909341SAndroid Build Coastguard Worker    pmaddubsw       xm3, xm11, xm3
715*c0909341SAndroid Build Coastguard Worker    paddw           xm0, xm3
716*c0909341SAndroid Build Coastguard Worker%endif
717*c0909341SAndroid Build Coastguard Worker    pmulhrsw        xm0, xm12
718*c0909341SAndroid Build Coastguard Worker%else
719*c0909341SAndroid Build Coastguard Worker    pmovsxbw        xm0, [bufyq+xq]
720*c0909341SAndroid Build Coastguard Worker%endif
721*c0909341SAndroid Build Coastguard Worker    punpcklwd       xm0, xm13               ; (luma, rnd) pairs
722*c0909341SAndroid Build Coastguard Worker    pmaddwd         xm0, xm10               ; luma*cf12 + rnd*(inserted pw_1 word)
723*c0909341SAndroid Build Coastguard Worker    paddd           xm2, xm0
724*c0909341SAndroid Build Coastguard Worker
725*c0909341SAndroid Build Coastguard Worker    movq            xm0, [bufq+xq-2]        ; y=0,x=[-2,+5]
    ; Serial part, one pixel per pass. xm0 holds current-row bytes starting
    ; at x-2; the byte just produced is blended back so that it acts as the
    ; left neighbour for the following pixel.
726*c0909341SAndroid Build Coastguard Worker.x_loop_ar2_inner:
727*c0909341SAndroid Build Coastguard Worker    pmovsxbw        xm0, xm0
728*c0909341SAndroid Build Coastguard Worker    pmaddwd         xm3, xm0, xm9           ; dword 0 = p[x-2]*cf10 + p[x-1]*cf11
729*c0909341SAndroid Build Coastguard Worker    psrldq          xm0, 2
730*c0909341SAndroid Build Coastguard Worker    paddd           xm3, xm2
731*c0909341SAndroid Build Coastguard Worker    psrldq          xm2, 4                  ; shift top to next pixel
    ; NOTE(review): the shift count is read from memory; psrad takes its count
    ; from the low 64 bits of the operand, so this relies on the layout of
    ; FGData.ar_coeff_shift (declared elsewhere) - confirm against the struc.
732*c0909341SAndroid Build Coastguard Worker    psrad           xm3, [fg_dataq+FGData.ar_coeff_shift]
733*c0909341SAndroid Build Coastguard Worker    pslldq          xm3, 2                  ; move the result into word 1
734*c0909341SAndroid Build Coastguard Worker    paddw           xm3, xm0                ; word 1 = result + current pixel
735*c0909341SAndroid Build Coastguard Worker    pblendw         xm0, xm3, 00000010b     ; replace word 1 only
736*c0909341SAndroid Build Coastguard Worker    packsswb        xm0, xm0                ; saturate to signed bytes
737*c0909341SAndroid Build Coastguard Worker    pextrb    [bufq+xq], xm0, 1
738*c0909341SAndroid Build Coastguard Worker    inc              xq
739*c0909341SAndroid Build Coastguard Worker    jz .x_loop_ar2_end
740*c0909341SAndroid Build Coastguard Worker    test             xb, 3                  ; every 4th pixel, recompute the vector part
741*c0909341SAndroid Build Coastguard Worker    jnz .x_loop_ar2_inner
742*c0909341SAndroid Build Coastguard Worker    jmp .x_loop_ar2
743*c0909341SAndroid Build Coastguard Worker
744*c0909341SAndroid Build Coastguard Worker.x_loop_ar2_end:
745*c0909341SAndroid Build Coastguard Worker    add            bufq, 82
746*c0909341SAndroid Build Coastguard Worker    add           bufyq, 82<<%3
747*c0909341SAndroid Build Coastguard Worker    dec              hd
748*c0909341SAndroid Build Coastguard Worker    jg .y_loop_ar2
749*c0909341SAndroid Build Coastguard Worker    RET
750*c0909341SAndroid Build Coastguard Worker
    ; -----------------------------------------------------------------------
    ; .ar3: coefficients 0-20 cover the three rows above, 21-23 the three
    ; pixels to the left in the current row, 24 the luma term.
    ; Register plan after setup:
    ;   m6-m9    coefficient pairs from cf0-15 (low lane cf0-7, high lane cf8-15)
    ;   xm10-11  (cf16, cf17), (cf18, cf19)
    ;   xm12     (cf20, cf24) pairs in its upper half
    ;   xm13     cf21, cf22, cf23 followed by the dword read from round_vals
    ;   xm14     rounding term as dwords
    ; -----------------------------------------------------------------------
751*c0909341SAndroid Build Coastguard WorkerINIT_YMM avx2
752*c0909341SAndroid Build Coastguard Worker.ar3:
753*c0909341SAndroid Build Coastguard Worker    DEFINE_ARGS buf, bufy, fg_data, uv, unused, shift
754*c0909341SAndroid Build Coastguard Worker    mov          shiftd, [fg_dataq+FGData.ar_coeff_shift]
755*c0909341SAndroid Build Coastguard Worker    imul            uvd, 28
756*c0909341SAndroid Build Coastguard Worker    pmovsxbw         m0, [fg_dataq+FGData.ar_coeffs_uv+uvq+ 0] ; cf0-15
757*c0909341SAndroid Build Coastguard Worker    pmovsxbw        xm1, [fg_dataq+FGData.ar_coeffs_uv+uvq+16] ; cf16-23
758*c0909341SAndroid Build Coastguard Worker    vpbroadcastb    xm2, [fg_dataq+FGData.ar_coeffs_uv+uvq+24] ; cf24 [luma]
    ; NOTE(review): the low word of this load lands next to cf23 in xm13 and
    ; therefore multiplies the current pixel in the inner loop; presumably it
    ; equals 1 << shift so the pixel survives the final psrad - round_vals is
    ; not visible here, confirm.
759*c0909341SAndroid Build Coastguard Worker    movd           xm13, [base+round_vals-10+shiftq*2]
760*c0909341SAndroid Build Coastguard Worker    vpbroadcastd   xm14, [base+round_vals-14+shiftq*2]
761*c0909341SAndroid Build Coastguard Worker    pshufd           m6, m0, q0000
762*c0909341SAndroid Build Coastguard Worker    pshufd           m7, m0, q1111
763*c0909341SAndroid Build Coastguard Worker    pshufd           m8, m0, q2222
764*c0909341SAndroid Build Coastguard Worker    pshufd           m9, m0, q3333
765*c0909341SAndroid Build Coastguard Worker    pshufd         xm10, xm1, q0000
766*c0909341SAndroid Build Coastguard Worker    pshufd         xm11, xm1, q1111
767*c0909341SAndroid Build Coastguard Worker    pshufhw        xm12, xm1, q0000              ; cf20 in all four high words
768*c0909341SAndroid Build Coastguard Worker    psraw           xm2, 8                       ; broadcast bytes -> sign-extended words
769*c0909341SAndroid Build Coastguard Worker    palignr        xm13, xm1, 10                 ; cf21-23 in words 0-2, loaded dword after them
770*c0909341SAndroid Build Coastguard Worker    punpckhwd      xm12, xm2                     ; interleave luma cf
771*c0909341SAndroid Build Coastguard Worker    psrld          xm14, 16                      ; keep the upper word of each dword (round_vals-12+shift*2)
772*c0909341SAndroid Build Coastguard Worker    DEFINE_ARGS buf, bufy, fg_data, h, unused, x
773*c0909341SAndroid Build Coastguard Worker%if %2
774*c0909341SAndroid Build Coastguard Worker    vpbroadcastw   xm15, [base+hmul_bits+2+%3*2]
775*c0909341SAndroid Build Coastguard Worker    sub            bufq, 82*(73-35*%3)+44-(82*3+41)
776*c0909341SAndroid Build Coastguard Worker%else
777*c0909341SAndroid Build Coastguard Worker    sub            bufq, 82*70-(82-3)
778*c0909341SAndroid Build Coastguard Worker%endif
779*c0909341SAndroid Build Coastguard Worker    add           bufyq, 79+82*3
780*c0909341SAndroid Build Coastguard Worker    mov              hd, 70-35*%3
781*c0909341SAndroid Build Coastguard Worker.y_loop_ar3:
782*c0909341SAndroid Build Coastguard Worker    mov              xq, -(76>>%2)
    ; Vector part: sums of the three rows above for 4 output pixels. The ymm
    ; lanes hold separate partial sums that are folded together with
    ; vextracti128 further down.
783*c0909341SAndroid Build Coastguard Worker.x_loop_ar3:
784*c0909341SAndroid Build Coastguard Worker    vbroadcasti128   m3, [bufq+xq-82*2-3]         ; y=-2,x=[-3,+12]
785*c0909341SAndroid Build Coastguard Worker    palignr         xm1, xm3, [bufq+xq-82*3-9], 6 ; y=-3,x=[-3,+12]
786*c0909341SAndroid Build Coastguard Worker    vbroadcasti128   m4, [bufq+xq-82*1-3]    ; y=-1,x=[-3,+12]
787*c0909341SAndroid Build Coastguard Worker    vpblendd         m3, m1, 0x0f                 ; low lane = y=-3 row, high lane = y=-2 row
788*c0909341SAndroid Build Coastguard Worker    pxor             m0, m0
789*c0909341SAndroid Build Coastguard Worker    pcmpgtb          m2, m0, m3                   ; sign masks for byte -> word extension
790*c0909341SAndroid Build Coastguard Worker    pcmpgtb          m0, m4
791*c0909341SAndroid Build Coastguard Worker    punpcklbw        m1, m3, m2
792*c0909341SAndroid Build Coastguard Worker    punpckhbw        m3, m2
793*c0909341SAndroid Build Coastguard Worker    punpcklbw        m2, m4, m0
794*c0909341SAndroid Build Coastguard Worker    punpckhbw       xm4, xm0
795*c0909341SAndroid Build Coastguard Worker    pshufb           m0, m1, [base+gen_shufA]
796*c0909341SAndroid Build Coastguard Worker    pmaddwd          m0, m6
797*c0909341SAndroid Build Coastguard Worker    pshufb           m5, m1, [base+gen_shufC]
798*c0909341SAndroid Build Coastguard Worker    pmaddwd          m5, m7
799*c0909341SAndroid Build Coastguard Worker    shufps           m1, m3, q1032
800*c0909341SAndroid Build Coastguard Worker    paddd            m0, m5
801*c0909341SAndroid Build Coastguard Worker    pshufb           m5, m1, [base+gen_shufA]
802*c0909341SAndroid Build Coastguard Worker    pmaddwd          m5, m8
803*c0909341SAndroid Build Coastguard Worker    shufps          xm1, xm3, q2121
804*c0909341SAndroid Build Coastguard Worker    vpblendd         m1, m2, 0xf0
805*c0909341SAndroid Build Coastguard Worker    pshufb           m1, [base+gen_shufE]
806*c0909341SAndroid Build Coastguard Worker    pmaddwd          m1, m9
807*c0909341SAndroid Build Coastguard Worker    paddd            m0, m5
808*c0909341SAndroid Build Coastguard Worker    pshufb          xm3, xm2, [base+gen_shufC]
809*c0909341SAndroid Build Coastguard Worker    paddd            m0, m1
810*c0909341SAndroid Build Coastguard Worker    pmaddwd         xm3, xm10
811*c0909341SAndroid Build Coastguard Worker    palignr         xm1, xm4, xm2, 2
812*c0909341SAndroid Build Coastguard Worker    punpckhwd       xm1, xm2, xm1
813*c0909341SAndroid Build Coastguard Worker    pmaddwd         xm1, xm11
814*c0909341SAndroid Build Coastguard Worker    palignr         xm4, xm2, 12
815*c0909341SAndroid Build Coastguard Worker    paddd           xm3, xm1
816*c0909341SAndroid Build Coastguard Worker%if %2
    ; pb_1 is reloaded on every pass: xm5 doubles as scratch above and is
    ; overwritten again below when ss_y is set.
817*c0909341SAndroid Build Coastguard Worker    vpbroadcastd    xm5, [base+pb_1]
818*c0909341SAndroid Build Coastguard Worker    movq            xm1, [bufyq+xq*2]
819*c0909341SAndroid Build Coastguard Worker    pmaddubsw       xm1, xm5, xm1
820*c0909341SAndroid Build Coastguard Worker%if %3
821*c0909341SAndroid Build Coastguard Worker    movq            xm2, [bufyq+xq*2+82]
822*c0909341SAndroid Build Coastguard Worker    pmaddubsw       xm5, xm2
823*c0909341SAndroid Build Coastguard Worker    paddw           xm1, xm5
824*c0909341SAndroid Build Coastguard Worker%endif
825*c0909341SAndroid Build Coastguard Worker    pmulhrsw        xm1, xm15
826*c0909341SAndroid Build Coastguard Worker%else
827*c0909341SAndroid Build Coastguard Worker    pmovsxbw        xm1, [bufyq+xq]
828*c0909341SAndroid Build Coastguard Worker%endif
829*c0909341SAndroid Build Coastguard Worker    punpcklwd       xm4, xm1                ; pair the y=-1 words left in xm4 with luma
830*c0909341SAndroid Build Coastguard Worker    pmaddwd         xm4, xm12               ; * (cf20, cf24)
831*c0909341SAndroid Build Coastguard Worker    movq            xm1, [bufq+xq-3]        ; y=0,x=[-3,+4]
832*c0909341SAndroid Build Coastguard Worker    vextracti128    xm2, m0, 1              ; fold the high-lane partial sums
833*c0909341SAndroid Build Coastguard Worker    paddd           xm0, xm14               ; add rounding
834*c0909341SAndroid Build Coastguard Worker    paddd           xm3, xm4
835*c0909341SAndroid Build Coastguard Worker    paddd           xm0, xm3
836*c0909341SAndroid Build Coastguard Worker    paddd           xm0, xm2
    ; Serial part, one pixel per pass. xm1 holds current-row bytes starting
    ; at x-3; the new byte is blended back in so the next pixel sees it.
837*c0909341SAndroid Build Coastguard Worker.x_loop_ar3_inner:
838*c0909341SAndroid Build Coastguard Worker    pmovsxbw        xm1, xm1
839*c0909341SAndroid Build Coastguard Worker    pmaddwd         xm2, xm13, xm1
840*c0909341SAndroid Build Coastguard Worker    pshuflw         xm3, xm2, q1032         ; swap dwords 0 and 1 so both products can be summed
841*c0909341SAndroid Build Coastguard Worker    paddd           xm2, xm0                ; add top
842*c0909341SAndroid Build Coastguard Worker    paddd           xm2, xm3                ; left+cur
843*c0909341SAndroid Build Coastguard Worker    psrldq          xm0, 4                  ; expose the next pixel's top sum
    ; NOTE(review): shift count is read from memory, as in .ar2 - relies on
    ; the FGData.ar_coeff_shift layout declared elsewhere.
844*c0909341SAndroid Build Coastguard Worker    psrad           xm2, [fg_dataq+FGData.ar_coeff_shift]
845*c0909341SAndroid Build Coastguard Worker    psrldq          xm1, 2
846*c0909341SAndroid Build Coastguard Worker    ; don't packssdw, we only care about one value
847*c0909341SAndroid Build Coastguard Worker    punpckldq       xm2, xm2                ; duplicate dword 0 so its low word sits in word 2
848*c0909341SAndroid Build Coastguard Worker    pblendw         xm1, xm2, 0100b         ; replace word 2 only
849*c0909341SAndroid Build Coastguard Worker    packsswb        xm1, xm1                ; saturate to signed bytes
850*c0909341SAndroid Build Coastguard Worker    pextrb    [bufq+xq], xm1, 2
851*c0909341SAndroid Build Coastguard Worker    inc              xq
852*c0909341SAndroid Build Coastguard Worker    jz .x_loop_ar3_end
853*c0909341SAndroid Build Coastguard Worker    test             xb, 3                  ; every 4th pixel, recompute the vector part
854*c0909341SAndroid Build Coastguard Worker    jnz .x_loop_ar3_inner
855*c0909341SAndroid Build Coastguard Worker    jmp .x_loop_ar3
856*c0909341SAndroid Build Coastguard Worker.x_loop_ar3_end:
857*c0909341SAndroid Build Coastguard Worker    add            bufq, 82
858*c0909341SAndroid Build Coastguard Worker    add           bufyq, 82<<%3
859*c0909341SAndroid Build Coastguard Worker    dec              hd
860*c0909341SAndroid Build Coastguard Worker    jg .y_loop_ar3
861*c0909341SAndroid Build Coastguard Worker    RET
862*c0909341SAndroid Build Coastguard Worker%endmacro
863*c0909341SAndroid Build Coastguard Worker
864*c0909341SAndroid Build Coastguard WorkerINIT_YMM avx2
865*c0909341SAndroid Build Coastguard Workercglobal fgy_32x32xn_8bpc, 6, 13, 15, dst, src, stride, fg_data, w, scaling, \
866*c0909341SAndroid Build Coastguard Worker                                     grain_lut, h, sby, see, overlap
867*c0909341SAndroid Build Coastguard Worker%define base r9-pd_m65536
868*c0909341SAndroid Build Coastguard Worker    lea              r9, [pd_m65536]
869*c0909341SAndroid Build Coastguard Worker    mov             r6d, [fg_dataq+FGData.scaling_shift]
870*c0909341SAndroid Build Coastguard Worker    mov             r7d, [fg_dataq+FGData.clip_to_restricted_range]
871*c0909341SAndroid Build Coastguard Worker    mov            sbyd, sbym
872*c0909341SAndroid Build Coastguard Worker    mov        overlapd, [fg_dataq+FGData.overlap_flag]
873*c0909341SAndroid Build Coastguard Worker    vpbroadcastd     m8, [base+pd_m65536]
874*c0909341SAndroid Build Coastguard Worker    vpbroadcastw     m9, [base+mul_bits+r6*2-14]
875*c0909341SAndroid Build Coastguard Worker    vpbroadcastd    m10, [base+fg_min+r7*4]
876*c0909341SAndroid Build Coastguard Worker    vpbroadcastd    m11, [base+fg_max+r7*8]
877*c0909341SAndroid Build Coastguard Worker    vpbroadcastd    m12, [base+pw_1024]
878*c0909341SAndroid Build Coastguard Worker    movq           xm13, [base+pb_27_17_17_27]
879*c0909341SAndroid Build Coastguard Worker    test           sbyd, sbyd
880*c0909341SAndroid Build Coastguard Worker    setnz           r7b
881*c0909341SAndroid Build Coastguard Worker    pxor             m7, m7
882*c0909341SAndroid Build Coastguard Worker    test            r7b, overlapb
883*c0909341SAndroid Build Coastguard Worker    jnz .vertical_overlap
884*c0909341SAndroid Build Coastguard Worker
885*c0909341SAndroid Build Coastguard Worker    imul           seed, sbyd, (173 << 24) | 37
886*c0909341SAndroid Build Coastguard Worker    add            seed, (105 << 24) | 178
887*c0909341SAndroid Build Coastguard Worker    rorx           seed, seed, 24
888*c0909341SAndroid Build Coastguard Worker    movzx          seed, seew
889*c0909341SAndroid Build Coastguard Worker    xor            seed, [fg_dataq+FGData.seed]
890*c0909341SAndroid Build Coastguard Worker
891*c0909341SAndroid Build Coastguard Worker    DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \
892*c0909341SAndroid Build Coastguard Worker                offx, offy, see, overlap
893*c0909341SAndroid Build Coastguard Worker
894*c0909341SAndroid Build Coastguard Worker    lea        src_bakq, [srcq+wq]
895*c0909341SAndroid Build Coastguard Worker    neg              wq
896*c0909341SAndroid Build Coastguard Worker    sub            dstq, srcq
897*c0909341SAndroid Build Coastguard Worker
898*c0909341SAndroid Build Coastguard Worker.loop_x:
899*c0909341SAndroid Build Coastguard Worker    rorx             r6, seeq, 1
900*c0909341SAndroid Build Coastguard Worker    or             seed, 0xEFF4
901*c0909341SAndroid Build Coastguard Worker    test           seeb, seeh
902*c0909341SAndroid Build Coastguard Worker    lea            seed, [r6+0x8000]
903*c0909341SAndroid Build Coastguard Worker    cmovp          seed, r6d                ; updated seed
904*c0909341SAndroid Build Coastguard Worker
905*c0909341SAndroid Build Coastguard Worker    rorx          offyd, seed, 8
906*c0909341SAndroid Build Coastguard Worker    rorx          offxq, seeq, 12
907*c0909341SAndroid Build Coastguard Worker    and           offyd, 0xf
908*c0909341SAndroid Build Coastguard Worker    imul          offyd, 164
909*c0909341SAndroid Build Coastguard Worker    lea           offyd, [offyq+offxq*2+747] ; offy*stride+offx
910*c0909341SAndroid Build Coastguard Worker
911*c0909341SAndroid Build Coastguard Worker    DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \
912*c0909341SAndroid Build Coastguard Worker                h, offxy, see, overlap
913*c0909341SAndroid Build Coastguard Worker
914*c0909341SAndroid Build Coastguard Worker    mov              hd, hm
915*c0909341SAndroid Build Coastguard Worker    mov      grain_lutq, grain_lutmp
916*c0909341SAndroid Build Coastguard Worker.loop_y:
917*c0909341SAndroid Build Coastguard Worker    ; src
918*c0909341SAndroid Build Coastguard Worker    mova             m2, [srcq]
919*c0909341SAndroid Build Coastguard Worker    punpcklbw        m0, m2, m7
920*c0909341SAndroid Build Coastguard Worker    punpckhbw        m1, m2, m7
921*c0909341SAndroid Build Coastguard Worker
922*c0909341SAndroid Build Coastguard Worker    ; scaling[src]
923*c0909341SAndroid Build Coastguard Worker    pandn            m4, m8, m0
924*c0909341SAndroid Build Coastguard Worker    mova             m6, m8
925*c0909341SAndroid Build Coastguard Worker    vpgatherdd       m2, [scalingq+m4-0], m8
926*c0909341SAndroid Build Coastguard Worker    psrld            m3, m0, 16
927*c0909341SAndroid Build Coastguard Worker    mova             m8, m6
928*c0909341SAndroid Build Coastguard Worker    vpgatherdd       m4, [scalingq+m3-2], m6
929*c0909341SAndroid Build Coastguard Worker    pandn            m5, m8, m1
930*c0909341SAndroid Build Coastguard Worker    mova             m6, m8
931*c0909341SAndroid Build Coastguard Worker    vpgatherdd       m3, [scalingq+m5-0], m8
932*c0909341SAndroid Build Coastguard Worker    pblendw          m2, m4, 0xaa
933*c0909341SAndroid Build Coastguard Worker    psrld            m4, m1, 16
934*c0909341SAndroid Build Coastguard Worker    mova             m8, m6
935*c0909341SAndroid Build Coastguard Worker    vpgatherdd       m5, [scalingq+m4-2], m6
936*c0909341SAndroid Build Coastguard Worker    pblendw          m3, m5, 0xaa
937*c0909341SAndroid Build Coastguard Worker
938*c0909341SAndroid Build Coastguard Worker    ; grain = grain_lut[offy+y][offx+x]
939*c0909341SAndroid Build Coastguard Worker    movu             m5, [grain_lutq+offxyq]
940*c0909341SAndroid Build Coastguard Worker    punpcklbw        m4, m5, m7
941*c0909341SAndroid Build Coastguard Worker    punpckhbw        m5, m7
942*c0909341SAndroid Build Coastguard Worker
943*c0909341SAndroid Build Coastguard Worker    ; noise = round2(scaling[src] * grain, scaling_shift)
944*c0909341SAndroid Build Coastguard Worker    pmaddubsw        m2, m4
945*c0909341SAndroid Build Coastguard Worker    pmaddubsw        m3, m5
946*c0909341SAndroid Build Coastguard Worker    pmulhrsw         m2, m9
947*c0909341SAndroid Build Coastguard Worker    pmulhrsw         m3, m9
948*c0909341SAndroid Build Coastguard Worker
949*c0909341SAndroid Build Coastguard Worker    ; dst = clip_pixel(src, noise)
950*c0909341SAndroid Build Coastguard Worker    paddw            m0, m2
951*c0909341SAndroid Build Coastguard Worker    paddw            m1, m3
952*c0909341SAndroid Build Coastguard Worker    packuswb         m0, m1
953*c0909341SAndroid Build Coastguard Worker    pmaxub           m0, m10
954*c0909341SAndroid Build Coastguard Worker    pminub           m0, m11
955*c0909341SAndroid Build Coastguard Worker    mova    [dstq+srcq], m0
956*c0909341SAndroid Build Coastguard Worker
957*c0909341SAndroid Build Coastguard Worker    add            srcq, strideq
958*c0909341SAndroid Build Coastguard Worker    add      grain_lutq, 82
959*c0909341SAndroid Build Coastguard Worker    dec              hd
960*c0909341SAndroid Build Coastguard Worker    jg .loop_y
961*c0909341SAndroid Build Coastguard Worker
962*c0909341SAndroid Build Coastguard Worker    add              wq, 32
963*c0909341SAndroid Build Coastguard Worker    jge .end
964*c0909341SAndroid Build Coastguard Worker    lea            srcq, [src_bakq+wq]
965*c0909341SAndroid Build Coastguard Worker    test       overlapd, overlapd
966*c0909341SAndroid Build Coastguard Worker    jz .loop_x
967*c0909341SAndroid Build Coastguard Worker
968*c0909341SAndroid Build Coastguard Worker    ; r8m = sbym
969*c0909341SAndroid Build Coastguard Worker    cmp       dword r8m, 0
970*c0909341SAndroid Build Coastguard Worker    jne .loop_x_hv_overlap
971*c0909341SAndroid Build Coastguard Worker
972*c0909341SAndroid Build Coastguard Worker    ; horizontal overlap (without vertical overlap)
973*c0909341SAndroid Build Coastguard Worker.loop_x_h_overlap:
974*c0909341SAndroid Build Coastguard Worker    rorx             r6, seeq, 1
975*c0909341SAndroid Build Coastguard Worker    or             seed, 0xEFF4
976*c0909341SAndroid Build Coastguard Worker    test           seeb, seeh
977*c0909341SAndroid Build Coastguard Worker    lea            seed, [r6+0x8000]
978*c0909341SAndroid Build Coastguard Worker    cmovp          seed, r6d                ; updated seed
979*c0909341SAndroid Build Coastguard Worker
980*c0909341SAndroid Build Coastguard Worker    DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \
981*c0909341SAndroid Build Coastguard Worker                offx, offy, see, left_offxy
982*c0909341SAndroid Build Coastguard Worker
983*c0909341SAndroid Build Coastguard Worker    lea     left_offxyd, [offyq+32]         ; previous column's offy*stride+offx
984*c0909341SAndroid Build Coastguard Worker    rorx          offyd, seed, 8
985*c0909341SAndroid Build Coastguard Worker    rorx          offxq, seeq, 12
986*c0909341SAndroid Build Coastguard Worker    and           offyd, 0xf
987*c0909341SAndroid Build Coastguard Worker    imul          offyd, 164
988*c0909341SAndroid Build Coastguard Worker    lea           offyd, [offyq+offxq*2+747] ; offy*stride+offx
989*c0909341SAndroid Build Coastguard Worker
990*c0909341SAndroid Build Coastguard Worker    DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \
991*c0909341SAndroid Build Coastguard Worker                h, offxy, see, left_offxy
992*c0909341SAndroid Build Coastguard Worker
993*c0909341SAndroid Build Coastguard Worker    mov      grain_lutq, grain_lutmp
994*c0909341SAndroid Build Coastguard Worker    mov              hd, hm
995*c0909341SAndroid Build Coastguard Worker.loop_y_h_overlap:
996*c0909341SAndroid Build Coastguard Worker    ; src
997*c0909341SAndroid Build Coastguard Worker    mova             m2, [srcq]
998*c0909341SAndroid Build Coastguard Worker    punpcklbw        m0, m2, m7
999*c0909341SAndroid Build Coastguard Worker    punpckhbw        m1, m2, m7
1000*c0909341SAndroid Build Coastguard Worker
1001*c0909341SAndroid Build Coastguard Worker    ; scaling[src]
1002*c0909341SAndroid Build Coastguard Worker    pandn            m4, m8, m0
1003*c0909341SAndroid Build Coastguard Worker    mova             m6, m8
1004*c0909341SAndroid Build Coastguard Worker    vpgatherdd       m2, [scalingq+m4-0], m8
1005*c0909341SAndroid Build Coastguard Worker    psrld            m3, m0, 16
1006*c0909341SAndroid Build Coastguard Worker    mova             m8, m6
1007*c0909341SAndroid Build Coastguard Worker    vpgatherdd       m4, [scalingq+m3-2], m6
1008*c0909341SAndroid Build Coastguard Worker    pandn            m5, m8, m1
1009*c0909341SAndroid Build Coastguard Worker    mova             m6, m8
1010*c0909341SAndroid Build Coastguard Worker    vpgatherdd       m3, [scalingq+m5-0], m8
1011*c0909341SAndroid Build Coastguard Worker    pblendw          m2, m4, 0xaa
1012*c0909341SAndroid Build Coastguard Worker    psrld            m4, m1, 16
1013*c0909341SAndroid Build Coastguard Worker    mova             m8, m6
1014*c0909341SAndroid Build Coastguard Worker    vpgatherdd       m5, [scalingq+m4-2], m6
1015*c0909341SAndroid Build Coastguard Worker    pblendw          m3, m5, 0xaa
1016*c0909341SAndroid Build Coastguard Worker
1017*c0909341SAndroid Build Coastguard Worker    ; grain = grain_lut[offy+y][offx+x]
1018*c0909341SAndroid Build Coastguard Worker    movu             m5, [grain_lutq+offxyq]
1019*c0909341SAndroid Build Coastguard Worker    movd            xm4, [grain_lutq+left_offxyq]
1020*c0909341SAndroid Build Coastguard Worker    punpcklbw       xm4, xm5
1021*c0909341SAndroid Build Coastguard Worker    pmaddubsw       xm4, xm13, xm4
1022*c0909341SAndroid Build Coastguard Worker    pmulhrsw        xm4, xm12
1023*c0909341SAndroid Build Coastguard Worker    packsswb        xm4, xm4
1024*c0909341SAndroid Build Coastguard Worker    vpblendd         m4, m5, 0xfe
1025*c0909341SAndroid Build Coastguard Worker    punpckhbw        m5, m7
1026*c0909341SAndroid Build Coastguard Worker    punpcklbw        m4, m7
1027*c0909341SAndroid Build Coastguard Worker
1028*c0909341SAndroid Build Coastguard Worker    ; noise = round2(scaling[src] * grain, scaling_shift)
1029*c0909341SAndroid Build Coastguard Worker    pmaddubsw        m2, m4
1030*c0909341SAndroid Build Coastguard Worker    pmaddubsw        m3, m5
1031*c0909341SAndroid Build Coastguard Worker    pmulhrsw         m2, m9
1032*c0909341SAndroid Build Coastguard Worker    pmulhrsw         m3, m9
1033*c0909341SAndroid Build Coastguard Worker
1034*c0909341SAndroid Build Coastguard Worker    ; dst = clip_pixel(src, noise)
1035*c0909341SAndroid Build Coastguard Worker    paddw            m0, m2
1036*c0909341SAndroid Build Coastguard Worker    paddw            m1, m3
1037*c0909341SAndroid Build Coastguard Worker    packuswb         m0, m1
1038*c0909341SAndroid Build Coastguard Worker    pmaxub           m0, m10
1039*c0909341SAndroid Build Coastguard Worker    pminub           m0, m11
1040*c0909341SAndroid Build Coastguard Worker    mova    [dstq+srcq], m0
1041*c0909341SAndroid Build Coastguard Worker
1042*c0909341SAndroid Build Coastguard Worker    add            srcq, strideq
1043*c0909341SAndroid Build Coastguard Worker    add      grain_lutq, 82
1044*c0909341SAndroid Build Coastguard Worker    dec              hd
1045*c0909341SAndroid Build Coastguard Worker    jg .loop_y_h_overlap
1046*c0909341SAndroid Build Coastguard Worker
1047*c0909341SAndroid Build Coastguard Worker    add              wq, 32
1048*c0909341SAndroid Build Coastguard Worker    jge .end
1049*c0909341SAndroid Build Coastguard Worker    lea            srcq, [src_bakq+wq]
1050*c0909341SAndroid Build Coastguard Worker
1051*c0909341SAndroid Build Coastguard Worker    ; r8m = sbym
1052*c0909341SAndroid Build Coastguard Worker    cmp       dword r8m, 0
1053*c0909341SAndroid Build Coastguard Worker    jne .loop_x_hv_overlap
1054*c0909341SAndroid Build Coastguard Worker    jmp .loop_x_h_overlap
1055*c0909341SAndroid Build Coastguard Worker
1056*c0909341SAndroid Build Coastguard Worker.vertical_overlap:
1057*c0909341SAndroid Build Coastguard Worker    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
1058*c0909341SAndroid Build Coastguard Worker                unused, sby, see, overlap
1059*c0909341SAndroid Build Coastguard Worker
1060*c0909341SAndroid Build Coastguard Worker    movzx          sbyd, sbyb
1061*c0909341SAndroid Build Coastguard Worker    imul           seed, [fg_dataq+FGData.seed], 0x00010001
1062*c0909341SAndroid Build Coastguard Worker    imul            r7d, sbyd, 173 * 0x00010001
1063*c0909341SAndroid Build Coastguard Worker    imul           sbyd, 37 * 0x01000100
1064*c0909341SAndroid Build Coastguard Worker    add             r7d, (105 << 16) | 188
1065*c0909341SAndroid Build Coastguard Worker    add            sbyd, (178 << 24) | (141 << 8)
1066*c0909341SAndroid Build Coastguard Worker    and             r7d, 0x00ff00ff
1067*c0909341SAndroid Build Coastguard Worker    and            sbyd, 0xff00ff00
1068*c0909341SAndroid Build Coastguard Worker    xor            seed, r7d
1069*c0909341SAndroid Build Coastguard Worker    xor            seed, sbyd               ; (cur_seed << 16) | top_seed
1070*c0909341SAndroid Build Coastguard Worker
1071*c0909341SAndroid Build Coastguard Worker    DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \
1072*c0909341SAndroid Build Coastguard Worker                offx, offy, see, overlap
1073*c0909341SAndroid Build Coastguard Worker
1074*c0909341SAndroid Build Coastguard Worker    lea        src_bakq, [srcq+wq]
1075*c0909341SAndroid Build Coastguard Worker    neg              wq
1076*c0909341SAndroid Build Coastguard Worker    sub            dstq, srcq
1077*c0909341SAndroid Build Coastguard Worker
1078*c0909341SAndroid Build Coastguard Worker.loop_x_v_overlap:
1079*c0909341SAndroid Build Coastguard Worker    vpbroadcastd    m14, [pb_27_17]
1080*c0909341SAndroid Build Coastguard Worker
1081*c0909341SAndroid Build Coastguard Worker    ; we assume from the block above that bits 8-15 of r7d are zeroed
1082*c0909341SAndroid Build Coastguard Worker    mov             r6d, seed
1083*c0909341SAndroid Build Coastguard Worker    or             seed, 0xeff4eff4
1084*c0909341SAndroid Build Coastguard Worker    test           seeb, seeh
1085*c0909341SAndroid Build Coastguard Worker    setp            r7b                     ; parity of top_seed
1086*c0909341SAndroid Build Coastguard Worker    shr            seed, 16
1087*c0909341SAndroid Build Coastguard Worker    shl             r7d, 16
1088*c0909341SAndroid Build Coastguard Worker    test           seeb, seeh
1089*c0909341SAndroid Build Coastguard Worker    setp            r7b                     ; parity of cur_seed
1090*c0909341SAndroid Build Coastguard Worker    or              r6d, 0x00010001
1091*c0909341SAndroid Build Coastguard Worker    xor             r7d, r6d
1092*c0909341SAndroid Build Coastguard Worker    rorx           seed, r7d, 1             ; updated (cur_seed << 16) | top_seed
1093*c0909341SAndroid Build Coastguard Worker
1094*c0909341SAndroid Build Coastguard Worker    rorx          offyd, seed, 8
1095*c0909341SAndroid Build Coastguard Worker    rorx          offxd, seed, 12
1096*c0909341SAndroid Build Coastguard Worker    and           offyd, 0xf000f
1097*c0909341SAndroid Build Coastguard Worker    and           offxd, 0xf000f
1098*c0909341SAndroid Build Coastguard Worker    imul          offyd, 164
1099*c0909341SAndroid Build Coastguard Worker    ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy
1100*c0909341SAndroid Build Coastguard Worker    lea           offyd, [offyq+offxq*2+0x10001*747+32*82]
1101*c0909341SAndroid Build Coastguard Worker
1102*c0909341SAndroid Build Coastguard Worker    DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \
1103*c0909341SAndroid Build Coastguard Worker                h, offxy, see, overlap, top_offxy
1104*c0909341SAndroid Build Coastguard Worker
1105*c0909341SAndroid Build Coastguard Worker    mov      grain_lutq, grain_lutmp
1106*c0909341SAndroid Build Coastguard Worker    mov              hd, hm
1107*c0909341SAndroid Build Coastguard Worker    movzx    top_offxyd, offxyw
1108*c0909341SAndroid Build Coastguard Worker    shr          offxyd, 16
1109*c0909341SAndroid Build Coastguard Worker.loop_y_v_overlap:
1110*c0909341SAndroid Build Coastguard Worker    ; src
1111*c0909341SAndroid Build Coastguard Worker    mova             m2, [srcq]
1112*c0909341SAndroid Build Coastguard Worker    punpcklbw        m0, m2, m7
1113*c0909341SAndroid Build Coastguard Worker    punpckhbw        m1, m2, m7
1114*c0909341SAndroid Build Coastguard Worker
1115*c0909341SAndroid Build Coastguard Worker    ; scaling[src]
1116*c0909341SAndroid Build Coastguard Worker    pandn            m4, m8, m0
1117*c0909341SAndroid Build Coastguard Worker    mova             m6, m8
1118*c0909341SAndroid Build Coastguard Worker    vpgatherdd       m2, [scalingq+m4-0], m8
1119*c0909341SAndroid Build Coastguard Worker    psrld            m3, m0, 16
1120*c0909341SAndroid Build Coastguard Worker    mova             m8, m6
1121*c0909341SAndroid Build Coastguard Worker    vpgatherdd       m4, [scalingq+m3-2], m6
1122*c0909341SAndroid Build Coastguard Worker    pandn            m5, m8, m1
1123*c0909341SAndroid Build Coastguard Worker    mova             m6, m8
1124*c0909341SAndroid Build Coastguard Worker    vpgatherdd       m3, [scalingq+m5-0], m8
1125*c0909341SAndroid Build Coastguard Worker    pblendw          m2, m4, 0xaa
1126*c0909341SAndroid Build Coastguard Worker    psrld            m4, m1, 16
1127*c0909341SAndroid Build Coastguard Worker    mova             m8, m6
1128*c0909341SAndroid Build Coastguard Worker    vpgatherdd       m5, [scalingq+m4-2], m6
1129*c0909341SAndroid Build Coastguard Worker    pblendw          m3, m5, 0xaa
1130*c0909341SAndroid Build Coastguard Worker
1131*c0909341SAndroid Build Coastguard Worker    ; grain = grain_lut[offy+y][offx+x]
1132*c0909341SAndroid Build Coastguard Worker    movu             m6, [grain_lutq+offxyq]
1133*c0909341SAndroid Build Coastguard Worker    movu             m4, [grain_lutq+top_offxyq]
1134*c0909341SAndroid Build Coastguard Worker    punpcklbw        m5, m4, m6
1135*c0909341SAndroid Build Coastguard Worker    punpckhbw        m4, m6
1136*c0909341SAndroid Build Coastguard Worker    pmaddubsw        m5, m14, m5
1137*c0909341SAndroid Build Coastguard Worker    pmaddubsw        m4, m14, m4
1138*c0909341SAndroid Build Coastguard Worker    pmulhrsw         m5, m12
1139*c0909341SAndroid Build Coastguard Worker    pmulhrsw         m4, m12
1140*c0909341SAndroid Build Coastguard Worker    packsswb         m5, m4
1141*c0909341SAndroid Build Coastguard Worker    punpcklbw        m4, m5, m7
1142*c0909341SAndroid Build Coastguard Worker    punpckhbw        m5, m7
1143*c0909341SAndroid Build Coastguard Worker
1144*c0909341SAndroid Build Coastguard Worker    ; noise = round2(scaling[src] * grain, scaling_shift)
1145*c0909341SAndroid Build Coastguard Worker    pmaddubsw        m2, m4
1146*c0909341SAndroid Build Coastguard Worker    pmaddubsw        m3, m5
1147*c0909341SAndroid Build Coastguard Worker    pmulhrsw         m2, m9
1148*c0909341SAndroid Build Coastguard Worker    pmulhrsw         m3, m9
1149*c0909341SAndroid Build Coastguard Worker
1150*c0909341SAndroid Build Coastguard Worker    ; dst = clip_pixel(src, noise)
1151*c0909341SAndroid Build Coastguard Worker    paddw            m0, m2
1152*c0909341SAndroid Build Coastguard Worker    paddw            m1, m3
1153*c0909341SAndroid Build Coastguard Worker    packuswb         m0, m1
1154*c0909341SAndroid Build Coastguard Worker    pmaxub           m0, m10
1155*c0909341SAndroid Build Coastguard Worker    pminub           m0, m11
1156*c0909341SAndroid Build Coastguard Worker    mova    [dstq+srcq], m0
1157*c0909341SAndroid Build Coastguard Worker
1158*c0909341SAndroid Build Coastguard Worker    add            srcq, strideq
1159*c0909341SAndroid Build Coastguard Worker    add      grain_lutq, 82
1160*c0909341SAndroid Build Coastguard Worker    dec              hb
1161*c0909341SAndroid Build Coastguard Worker    jz .end_y_v_overlap
1162*c0909341SAndroid Build Coastguard Worker    vpbroadcastd    m14, [pb_17_27] ; swap weights for second v-overlap line
1163*c0909341SAndroid Build Coastguard Worker    ; 2 lines get vertical overlap, then fall back to non-overlap code for
1164*c0909341SAndroid Build Coastguard Worker    ; remaining (up to) 30 lines
1165*c0909341SAndroid Build Coastguard Worker    add              hd, 0x80000000
1166*c0909341SAndroid Build Coastguard Worker    jnc .loop_y_v_overlap
1167*c0909341SAndroid Build Coastguard Worker    jmp .loop_y
1168*c0909341SAndroid Build Coastguard Worker.end_y_v_overlap:
1169*c0909341SAndroid Build Coastguard Worker    add              wq, 32
1170*c0909341SAndroid Build Coastguard Worker    jge .end
1171*c0909341SAndroid Build Coastguard Worker    lea            srcq, [src_bakq+wq]
1172*c0909341SAndroid Build Coastguard Worker
1173*c0909341SAndroid Build Coastguard Worker    ; since FGData.overlap_flag is guaranteed to be set on this path, we
1174*c0909341SAndroid Build Coastguard Worker    ; never jump back to .loop_x_v_overlap, and instead always fall through
1175*c0909341SAndroid Build Coastguard Worker    ; to h+v overlap
1176*c0909341SAndroid Build Coastguard Worker.loop_x_hv_overlap:
1177*c0909341SAndroid Build Coastguard Worker    vpbroadcastd    m14, [pb_27_17]
1178*c0909341SAndroid Build Coastguard Worker
1179*c0909341SAndroid Build Coastguard Worker    ; we assume from the block above that bits 8-15 of r7d are zeroed
1180*c0909341SAndroid Build Coastguard Worker    mov             r6d, seed
1181*c0909341SAndroid Build Coastguard Worker    or             seed, 0xeff4eff4
1182*c0909341SAndroid Build Coastguard Worker    test           seeb, seeh
1183*c0909341SAndroid Build Coastguard Worker    setp            r7b                     ; parity of top_seed
1184*c0909341SAndroid Build Coastguard Worker    shr            seed, 16
1185*c0909341SAndroid Build Coastguard Worker    shl             r7d, 16
1186*c0909341SAndroid Build Coastguard Worker    test           seeb, seeh
1187*c0909341SAndroid Build Coastguard Worker    setp            r7b                     ; parity of cur_seed
1188*c0909341SAndroid Build Coastguard Worker    or              r6d, 0x00010001
1189*c0909341SAndroid Build Coastguard Worker    xor             r7d, r6d
1190*c0909341SAndroid Build Coastguard Worker    rorx           seed, r7d, 1             ; updated (cur_seed << 16) | top_seed
1191*c0909341SAndroid Build Coastguard Worker
1192*c0909341SAndroid Build Coastguard Worker    DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \
1193*c0909341SAndroid Build Coastguard Worker                offx, offy, see, left_offxy, top_offxy, topleft_offxy
1194*c0909341SAndroid Build Coastguard Worker
1195*c0909341SAndroid Build Coastguard Worker    lea  topleft_offxyd, [top_offxyq+32]
1196*c0909341SAndroid Build Coastguard Worker    lea     left_offxyd, [offyq+32]
1197*c0909341SAndroid Build Coastguard Worker    rorx          offyd, seed, 8
1198*c0909341SAndroid Build Coastguard Worker    rorx          offxd, seed, 12
1199*c0909341SAndroid Build Coastguard Worker    and           offyd, 0xf000f
1200*c0909341SAndroid Build Coastguard Worker    and           offxd, 0xf000f
1201*c0909341SAndroid Build Coastguard Worker    imul          offyd, 164
1202*c0909341SAndroid Build Coastguard Worker    ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy
1203*c0909341SAndroid Build Coastguard Worker    lea           offyd, [offyq+offxq*2+0x10001*747+32*82]
1204*c0909341SAndroid Build Coastguard Worker
1205*c0909341SAndroid Build Coastguard Worker    DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \
1206*c0909341SAndroid Build Coastguard Worker                h, offxy, see, left_offxy, top_offxy, topleft_offxy
1207*c0909341SAndroid Build Coastguard Worker
1208*c0909341SAndroid Build Coastguard Worker    mov      grain_lutq, grain_lutmp
1209*c0909341SAndroid Build Coastguard Worker    mov              hd, hm
1210*c0909341SAndroid Build Coastguard Worker    movzx    top_offxyd, offxyw
1211*c0909341SAndroid Build Coastguard Worker    shr          offxyd, 16
1212*c0909341SAndroid Build Coastguard Worker.loop_y_hv_overlap:
1213*c0909341SAndroid Build Coastguard Worker    ; src
1214*c0909341SAndroid Build Coastguard Worker    mova             m2, [srcq]
1215*c0909341SAndroid Build Coastguard Worker    punpcklbw        m0, m2, m7
1216*c0909341SAndroid Build Coastguard Worker    punpckhbw        m1, m2, m7
1217*c0909341SAndroid Build Coastguard Worker
1218*c0909341SAndroid Build Coastguard Worker    ; scaling[src]
1219*c0909341SAndroid Build Coastguard Worker    pandn            m4, m8, m0
1220*c0909341SAndroid Build Coastguard Worker    mova             m6, m8
1221*c0909341SAndroid Build Coastguard Worker    vpgatherdd       m2, [scalingq+m4-0], m8
1222*c0909341SAndroid Build Coastguard Worker    psrld            m3, m0, 16
1223*c0909341SAndroid Build Coastguard Worker    mova             m8, m6
1224*c0909341SAndroid Build Coastguard Worker    vpgatherdd       m4, [scalingq+m3-2], m6
1225*c0909341SAndroid Build Coastguard Worker    pandn            m5, m8, m1
1226*c0909341SAndroid Build Coastguard Worker    mova             m6, m8
1227*c0909341SAndroid Build Coastguard Worker    vpgatherdd       m3, [scalingq+m5-0], m8
1228*c0909341SAndroid Build Coastguard Worker    pblendw          m2, m4, 0xaa
1229*c0909341SAndroid Build Coastguard Worker    psrld            m4, m1, 16
1230*c0909341SAndroid Build Coastguard Worker    mova             m8, m6
1231*c0909341SAndroid Build Coastguard Worker    vpgatherdd       m5, [scalingq+m4-2], m6
1232*c0909341SAndroid Build Coastguard Worker    pblendw          m3, m5, 0xaa
1233*c0909341SAndroid Build Coastguard Worker
1234*c0909341SAndroid Build Coastguard Worker    ; grain = grain_lut[offy+y][offx+x]
1235*c0909341SAndroid Build Coastguard Worker    movu             m6, [grain_lutq+offxyq]
1236*c0909341SAndroid Build Coastguard Worker    movd            xm7, [grain_lutq+left_offxyq]
1237*c0909341SAndroid Build Coastguard Worker    movu             m4, [grain_lutq+top_offxyq]
1238*c0909341SAndroid Build Coastguard Worker    movd            xm5, [grain_lutq+topleft_offxyq]
1239*c0909341SAndroid Build Coastguard Worker    ; do h interpolation first (so top | top/left -> top, left | cur -> cur)
1240*c0909341SAndroid Build Coastguard Worker    punpcklbw       xm7, xm6
1241*c0909341SAndroid Build Coastguard Worker    punpcklbw       xm5, xm4
1242*c0909341SAndroid Build Coastguard Worker    pmaddubsw       xm7, xm13, xm7
1243*c0909341SAndroid Build Coastguard Worker    pmaddubsw       xm5, xm13, xm5
1244*c0909341SAndroid Build Coastguard Worker    pmulhrsw        xm7, xm12
1245*c0909341SAndroid Build Coastguard Worker    pmulhrsw        xm5, xm12
1246*c0909341SAndroid Build Coastguard Worker    packsswb        xm7, xm7
1247*c0909341SAndroid Build Coastguard Worker    packsswb        xm5, xm5
1248*c0909341SAndroid Build Coastguard Worker    vpblendd         m7, m6, 0xfe
1249*c0909341SAndroid Build Coastguard Worker    vpblendd         m5, m4, 0xfe
1250*c0909341SAndroid Build Coastguard Worker    ; followed by v interpolation (top | cur -> cur)
1251*c0909341SAndroid Build Coastguard Worker    punpckhbw        m4, m6
1252*c0909341SAndroid Build Coastguard Worker    punpcklbw        m5, m7
1253*c0909341SAndroid Build Coastguard Worker    pmaddubsw        m4, m14, m4
1254*c0909341SAndroid Build Coastguard Worker    pmaddubsw        m5, m14, m5
1255*c0909341SAndroid Build Coastguard Worker    pmulhrsw         m4, m12
1256*c0909341SAndroid Build Coastguard Worker    pmulhrsw         m5, m12
1257*c0909341SAndroid Build Coastguard Worker    pxor             m7, m7
1258*c0909341SAndroid Build Coastguard Worker    packsswb         m5, m4
1259*c0909341SAndroid Build Coastguard Worker    punpcklbw        m4, m5, m7
1260*c0909341SAndroid Build Coastguard Worker    punpckhbw        m5, m7
1261*c0909341SAndroid Build Coastguard Worker
1262*c0909341SAndroid Build Coastguard Worker    ; noise = round2(scaling[src] * grain, scaling_shift)
1263*c0909341SAndroid Build Coastguard Worker    pmaddubsw        m2, m4
1264*c0909341SAndroid Build Coastguard Worker    pmaddubsw        m3, m5
1265*c0909341SAndroid Build Coastguard Worker    pmulhrsw         m2, m9
1266*c0909341SAndroid Build Coastguard Worker    pmulhrsw         m3, m9
1267*c0909341SAndroid Build Coastguard Worker
1268*c0909341SAndroid Build Coastguard Worker    ; dst = clip_pixel(src, noise)
1269*c0909341SAndroid Build Coastguard Worker    paddw            m0, m2
1270*c0909341SAndroid Build Coastguard Worker    paddw            m1, m3
1271*c0909341SAndroid Build Coastguard Worker    packuswb         m0, m1
1272*c0909341SAndroid Build Coastguard Worker    pmaxub           m0, m10
1273*c0909341SAndroid Build Coastguard Worker    pminub           m0, m11
1274*c0909341SAndroid Build Coastguard Worker    mova    [dstq+srcq], m0
1275*c0909341SAndroid Build Coastguard Worker
1276*c0909341SAndroid Build Coastguard Worker    add            srcq, strideq
1277*c0909341SAndroid Build Coastguard Worker    add      grain_lutq, 82
1278*c0909341SAndroid Build Coastguard Worker    dec              hb
1279*c0909341SAndroid Build Coastguard Worker    jz .end_y_hv_overlap
1280*c0909341SAndroid Build Coastguard Worker    vpbroadcastd    m14, [pb_17_27] ; swap weights for second v-overlap line
1281*c0909341SAndroid Build Coastguard Worker    ; 2 lines get vertical overlap, then fall back to non-overlap code for
1282*c0909341SAndroid Build Coastguard Worker    ; remaining (up to) 30 lines
1283*c0909341SAndroid Build Coastguard Worker    add              hd, 0x80000000
1284*c0909341SAndroid Build Coastguard Worker    jnc .loop_y_hv_overlap
1285*c0909341SAndroid Build Coastguard Worker    jmp .loop_y_h_overlap
1286*c0909341SAndroid Build Coastguard Worker.end_y_hv_overlap:
1287*c0909341SAndroid Build Coastguard Worker    add              wq, 32
1288*c0909341SAndroid Build Coastguard Worker    lea            srcq, [src_bakq+wq]
1289*c0909341SAndroid Build Coastguard Worker    jl .loop_x_hv_overlap
1290*c0909341SAndroid Build Coastguard Worker.end:
1291*c0909341SAndroid Build Coastguard Worker    RET
1292*c0909341SAndroid Build Coastguard Worker
1293*c0909341SAndroid Build Coastguard Worker%macro FGUV_FN 3 ; name, ss_hor, ss_ver
1294*c0909341SAndroid Build Coastguard Workercglobal fguv_32x32xn_i%1_8bpc, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \
1295*c0909341SAndroid Build Coastguard Worker                                          grain_lut, h, sby, luma, overlap, uv_pl, is_id
1296*c0909341SAndroid Build Coastguard Worker%define base r11-pd_m65536
1297*c0909341SAndroid Build Coastguard Worker    lea             r11, [pd_m65536]
1298*c0909341SAndroid Build Coastguard Worker    mov             r6d, [fg_dataq+FGData.scaling_shift]
1299*c0909341SAndroid Build Coastguard Worker    mov             r7d, [fg_dataq+FGData.clip_to_restricted_range]
1300*c0909341SAndroid Build Coastguard Worker    mov             r9d, is_idm
1301*c0909341SAndroid Build Coastguard Worker    mov            sbyd, sbym
1302*c0909341SAndroid Build Coastguard Worker    mov        overlapd, [fg_dataq+FGData.overlap_flag]
1303*c0909341SAndroid Build Coastguard Worker    vpbroadcastd     m8, [base+pd_m65536]
1304*c0909341SAndroid Build Coastguard Worker    vpbroadcastw     m9, [base+mul_bits+r6*2-14]
1305*c0909341SAndroid Build Coastguard Worker    vpbroadcastd    m10, [base+fg_min+r7*4]
1306*c0909341SAndroid Build Coastguard Worker    shlx            r7d, r7d, r9d
1307*c0909341SAndroid Build Coastguard Worker    vpbroadcastd    m11, [base+fg_max+r7*4]
1308*c0909341SAndroid Build Coastguard Worker    vpbroadcastd    m12, [base+pw_1024]
1309*c0909341SAndroid Build Coastguard Worker    pxor             m7, m7
1310*c0909341SAndroid Build Coastguard Worker    test           sbyd, sbyd
1311*c0909341SAndroid Build Coastguard Worker    setnz           r7b
1312*c0909341SAndroid Build Coastguard Worker    cmp byte [fg_dataq+FGData.chroma_scaling_from_luma], 0
1313*c0909341SAndroid Build Coastguard Worker    jne .csfl
1314*c0909341SAndroid Build Coastguard Worker
1315*c0909341SAndroid Build Coastguard Worker%macro %%FGUV_32x32xN_LOOP 3 ; not-csfl, ss_hor, ss_ver
1316*c0909341SAndroid Build Coastguard Worker    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
1317*c0909341SAndroid Build Coastguard Worker                h, sby, see, overlap, uv_pl
1318*c0909341SAndroid Build Coastguard Worker%if %1
1319*c0909341SAndroid Build Coastguard Worker    mov             r6d, uv_plm
1320*c0909341SAndroid Build Coastguard Worker    vpbroadcastd     m0, [base+pw_8]
1321*c0909341SAndroid Build Coastguard Worker    vbroadcasti128  m14, [fg_dataq+FGData.uv_mult+r6*4]
1322*c0909341SAndroid Build Coastguard Worker    vpbroadcastw    m15, [fg_dataq+FGData.uv_offset+r6*4]
1323*c0909341SAndroid Build Coastguard Worker    pshufb          m14, m0 ; uv_luma_mult, uv_mult
1324*c0909341SAndroid Build Coastguard Worker%elif %2
1325*c0909341SAndroid Build Coastguard Worker    vpbroadcastq    m15, [base+pb_23_22]
1326*c0909341SAndroid Build Coastguard Worker%else
1327*c0909341SAndroid Build Coastguard Worker    vpbroadcastq   xm15, [base+pb_27_17_17_27]
1328*c0909341SAndroid Build Coastguard Worker%endif
1329*c0909341SAndroid Build Coastguard Worker%if %3
1330*c0909341SAndroid Build Coastguard Worker    vpbroadcastw    m13, [base+pb_23_22]
1331*c0909341SAndroid Build Coastguard Worker%elif %2
1332*c0909341SAndroid Build Coastguard Worker    pshufd          m13, [base+pb_27_17], q0000 ; 8x27_17, 8x17_27
1333*c0909341SAndroid Build Coastguard Worker%endif
1334*c0909341SAndroid Build Coastguard Worker    test            r7b, overlapb
1335*c0909341SAndroid Build Coastguard Worker    jnz %%vertical_overlap
1336*c0909341SAndroid Build Coastguard Worker
1337*c0909341SAndroid Build Coastguard Worker    imul           seed, sbyd, (173 << 24) | 37
1338*c0909341SAndroid Build Coastguard Worker    add            seed, (105 << 24) | 178
1339*c0909341SAndroid Build Coastguard Worker    rorx           seed, seed, 24
1340*c0909341SAndroid Build Coastguard Worker    movzx          seed, seew
1341*c0909341SAndroid Build Coastguard Worker    xor            seed, [fg_dataq+FGData.seed]
1342*c0909341SAndroid Build Coastguard Worker
1343*c0909341SAndroid Build Coastguard Worker    DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \
1344*c0909341SAndroid Build Coastguard Worker                unused2, unused3, see, overlap, unused4, unused5, lstride
1345*c0909341SAndroid Build Coastguard Worker
1346*c0909341SAndroid Build Coastguard Worker    mov           lumaq, r9mp
1347*c0909341SAndroid Build Coastguard Worker    lea             r12, [srcq+wq]
1348*c0909341SAndroid Build Coastguard Worker    lea             r13, [dstq+wq]
1349*c0909341SAndroid Build Coastguard Worker    lea             r14, [lumaq+wq*(1+%2)]
1350*c0909341SAndroid Build Coastguard Worker    mov           r11mp, r12
1351*c0909341SAndroid Build Coastguard Worker    mov           r12mp, r13
1352*c0909341SAndroid Build Coastguard Worker    mov        lstrideq, r10mp
1353*c0909341SAndroid Build Coastguard Worker    neg              wq
1354*c0909341SAndroid Build Coastguard Worker
1355*c0909341SAndroid Build Coastguard Worker%%loop_x:
1356*c0909341SAndroid Build Coastguard Worker    rorx             r6, seeq, 1
1357*c0909341SAndroid Build Coastguard Worker    or             seed, 0xEFF4
1358*c0909341SAndroid Build Coastguard Worker    test           seeb, seeh
1359*c0909341SAndroid Build Coastguard Worker    lea            seed, [r6+0x8000]
1360*c0909341SAndroid Build Coastguard Worker    cmovp          seed, r6d               ; updated seed
1361*c0909341SAndroid Build Coastguard Worker
1362*c0909341SAndroid Build Coastguard Worker    DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \
1363*c0909341SAndroid Build Coastguard Worker                offx, offy, see, overlap, unused1, unused2, lstride
1364*c0909341SAndroid Build Coastguard Worker
1365*c0909341SAndroid Build Coastguard Worker    rorx          offyd, seed, 8
1366*c0909341SAndroid Build Coastguard Worker    rorx          offxq, seeq, 12
1367*c0909341SAndroid Build Coastguard Worker    and           offyd, 0xf
1368*c0909341SAndroid Build Coastguard Worker    imul          offyd, 164>>%3
1369*c0909341SAndroid Build Coastguard Worker    lea           offyd, [offyq+offxq*(2-%2)+(3+(6>>%3))*82+3+(6>>%2)]  ; offy*stride+offx
1370*c0909341SAndroid Build Coastguard Worker
1371*c0909341SAndroid Build Coastguard Worker    DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \
1372*c0909341SAndroid Build Coastguard Worker                h, offxy, see, overlap, unused1, unused2, lstride
1373*c0909341SAndroid Build Coastguard Worker
1374*c0909341SAndroid Build Coastguard Worker    mov      grain_lutq, grain_lutmp
1375*c0909341SAndroid Build Coastguard Worker    mov              hd, hm
1376*c0909341SAndroid Build Coastguard Worker%%loop_y:
1377*c0909341SAndroid Build Coastguard Worker    ; src
1378*c0909341SAndroid Build Coastguard Worker%if %2
1379*c0909341SAndroid Build Coastguard Worker    mova            xm3, [lumaq+lstrideq*0+ 0]
1380*c0909341SAndroid Build Coastguard Worker    vinserti128      m3, [lumaq+lstrideq*(1+%3) +0], 1
1381*c0909341SAndroid Build Coastguard Worker    vpbroadcastd     m2, [pb_1]
1382*c0909341SAndroid Build Coastguard Worker    mova            xm0, [lumaq+lstrideq*0+16]
1383*c0909341SAndroid Build Coastguard Worker    vinserti128      m0, [lumaq+lstrideq*(1+%3)+16], 1
1384*c0909341SAndroid Build Coastguard Worker    mova            xm1, [srcq]
1385*c0909341SAndroid Build Coastguard Worker    vinserti128      m1, [srcq+strideq], 1
1386*c0909341SAndroid Build Coastguard Worker    pmaddubsw        m3, m2
1387*c0909341SAndroid Build Coastguard Worker    pmaddubsw        m0, m2
1388*c0909341SAndroid Build Coastguard Worker    pavgw            m3, m7
1389*c0909341SAndroid Build Coastguard Worker    pavgw            m0, m7
1390*c0909341SAndroid Build Coastguard Worker%else
1391*c0909341SAndroid Build Coastguard Worker    mova             m2, [lumaq]
1392*c0909341SAndroid Build Coastguard Worker    mova             m1, [srcq]
1393*c0909341SAndroid Build Coastguard Worker%endif
1394*c0909341SAndroid Build Coastguard Worker%if %1
1395*c0909341SAndroid Build Coastguard Worker%if %2
1396*c0909341SAndroid Build Coastguard Worker    packuswb         m2, m3, m0             ; luma
1397*c0909341SAndroid Build Coastguard Worker%endif
1398*c0909341SAndroid Build Coastguard Worker    punpckhbw        m3, m2, m1
1399*c0909341SAndroid Build Coastguard Worker    punpcklbw        m2, m1                 ; { luma, chroma }
1400*c0909341SAndroid Build Coastguard Worker    pmaddubsw        m3, m14
1401*c0909341SAndroid Build Coastguard Worker    pmaddubsw        m2, m14
1402*c0909341SAndroid Build Coastguard Worker    psraw            m3, 6
1403*c0909341SAndroid Build Coastguard Worker    psraw            m2, 6
1404*c0909341SAndroid Build Coastguard Worker    paddw            m3, m15
1405*c0909341SAndroid Build Coastguard Worker    paddw            m2, m15
1406*c0909341SAndroid Build Coastguard Worker    packuswb         m2, m3                 ; pack+unpack = clip
1407*c0909341SAndroid Build Coastguard Worker%endif
1408*c0909341SAndroid Build Coastguard Worker%if %1 || %2 == 0
1409*c0909341SAndroid Build Coastguard Worker    punpcklbw        m3, m2, m7
1410*c0909341SAndroid Build Coastguard Worker    punpckhbw        m0, m2, m7
1411*c0909341SAndroid Build Coastguard Worker%endif
1412*c0909341SAndroid Build Coastguard Worker
1413*c0909341SAndroid Build Coastguard Worker    ; scaling[luma_src]
1414*c0909341SAndroid Build Coastguard Worker    pandn            m4, m8, m3
1415*c0909341SAndroid Build Coastguard Worker    mova             m6, m8
1416*c0909341SAndroid Build Coastguard Worker    vpgatherdd       m2, [scalingq+m4-0], m8
1417*c0909341SAndroid Build Coastguard Worker    psrld            m3, 16
1418*c0909341SAndroid Build Coastguard Worker    mova             m8, m6
1419*c0909341SAndroid Build Coastguard Worker    vpgatherdd       m4, [scalingq+m3-2], m6
1420*c0909341SAndroid Build Coastguard Worker    pandn            m5, m8, m0
1421*c0909341SAndroid Build Coastguard Worker    mova             m6, m8
1422*c0909341SAndroid Build Coastguard Worker    vpgatherdd       m3, [scalingq+m5-0], m8
1423*c0909341SAndroid Build Coastguard Worker    psrld            m0, 16
1424*c0909341SAndroid Build Coastguard Worker    mova             m8, m6
1425*c0909341SAndroid Build Coastguard Worker    vpgatherdd       m5, [scalingq+m0-2], m6
1426*c0909341SAndroid Build Coastguard Worker    pblendw          m2, m4, 0xaa
1427*c0909341SAndroid Build Coastguard Worker    pblendw          m3, m5, 0xaa
1428*c0909341SAndroid Build Coastguard Worker
1429*c0909341SAndroid Build Coastguard Worker    ; grain = grain_lut[offy+y][offx+x]
1430*c0909341SAndroid Build Coastguard Worker%if %2
1431*c0909341SAndroid Build Coastguard Worker    movu            xm5, [grain_lutq+offxyq+ 0]
1432*c0909341SAndroid Build Coastguard Worker    vinserti128      m5, [grain_lutq+offxyq+82], 1
1433*c0909341SAndroid Build Coastguard Worker%else
1434*c0909341SAndroid Build Coastguard Worker    movu             m5, [grain_lutq+offxyq]
1435*c0909341SAndroid Build Coastguard Worker%endif
1436*c0909341SAndroid Build Coastguard Worker    punpcklbw        m4, m5, m7
1437*c0909341SAndroid Build Coastguard Worker    punpckhbw        m5, m7
1438*c0909341SAndroid Build Coastguard Worker
1439*c0909341SAndroid Build Coastguard Worker    ; noise = round2(scaling[luma_src] * grain, scaling_shift)
1440*c0909341SAndroid Build Coastguard Worker    pmaddubsw        m2, m4
1441*c0909341SAndroid Build Coastguard Worker    pmaddubsw        m3, m5
1442*c0909341SAndroid Build Coastguard Worker    pmulhrsw         m2, m9
1443*c0909341SAndroid Build Coastguard Worker    pmulhrsw         m3, m9
1444*c0909341SAndroid Build Coastguard Worker
1445*c0909341SAndroid Build Coastguard Worker    ; unpack chroma_source
1446*c0909341SAndroid Build Coastguard Worker    punpcklbw        m0, m1, m7
1447*c0909341SAndroid Build Coastguard Worker    punpckhbw        m1, m7
1448*c0909341SAndroid Build Coastguard Worker
1449*c0909341SAndroid Build Coastguard Worker    ; dst = clip_pixel(src, noise)
1450*c0909341SAndroid Build Coastguard Worker    paddw            m0, m2
1451*c0909341SAndroid Build Coastguard Worker    paddw            m1, m3
1452*c0909341SAndroid Build Coastguard Worker    packuswb         m0, m1
1453*c0909341SAndroid Build Coastguard Worker    pmaxub           m0, m10
1454*c0909341SAndroid Build Coastguard Worker    pminub           m0, m11
1455*c0909341SAndroid Build Coastguard Worker%if %2
1456*c0909341SAndroid Build Coastguard Worker    mova         [dstq], xm0
1457*c0909341SAndroid Build Coastguard Worker    vextracti128 [dstq+strideq], m0, 1
1458*c0909341SAndroid Build Coastguard Worker%else
1459*c0909341SAndroid Build Coastguard Worker    mova         [dstq], m0
1460*c0909341SAndroid Build Coastguard Worker%endif
1461*c0909341SAndroid Build Coastguard Worker
1462*c0909341SAndroid Build Coastguard Worker%if %2
1463*c0909341SAndroid Build Coastguard Worker    lea            srcq, [srcq+strideq*2]
1464*c0909341SAndroid Build Coastguard Worker    lea            dstq, [dstq+strideq*2]
1465*c0909341SAndroid Build Coastguard Worker    lea           lumaq, [lumaq+lstrideq*(2<<%3)]
1466*c0909341SAndroid Build Coastguard Worker%else
1467*c0909341SAndroid Build Coastguard Worker    add            srcq, strideq
1468*c0909341SAndroid Build Coastguard Worker    add            dstq, strideq
1469*c0909341SAndroid Build Coastguard Worker    add           lumaq, lstrideq
1470*c0909341SAndroid Build Coastguard Worker%endif
1471*c0909341SAndroid Build Coastguard Worker    add      grain_lutq, 82<<%2
1472*c0909341SAndroid Build Coastguard Worker    sub              hb, 1+%2
1473*c0909341SAndroid Build Coastguard Worker    jg %%loop_y
1474*c0909341SAndroid Build Coastguard Worker
1475*c0909341SAndroid Build Coastguard Worker    add              wq, 32>>%2
1476*c0909341SAndroid Build Coastguard Worker    jge .end
1477*c0909341SAndroid Build Coastguard Worker    mov            srcq, r11mp
1478*c0909341SAndroid Build Coastguard Worker    mov            dstq, r12mp
1479*c0909341SAndroid Build Coastguard Worker    lea           lumaq, [r14+wq*(1+%2)]
1480*c0909341SAndroid Build Coastguard Worker    add            srcq, wq
1481*c0909341SAndroid Build Coastguard Worker    add            dstq, wq
1482*c0909341SAndroid Build Coastguard Worker    test       overlapd, overlapd
1483*c0909341SAndroid Build Coastguard Worker    jz %%loop_x
1484*c0909341SAndroid Build Coastguard Worker
1485*c0909341SAndroid Build Coastguard Worker    ; r8m = sbym
1486*c0909341SAndroid Build Coastguard Worker    cmp       dword r8m, 0
1487*c0909341SAndroid Build Coastguard Worker    jne %%loop_x_hv_overlap
1488*c0909341SAndroid Build Coastguard Worker
1489*c0909341SAndroid Build Coastguard Worker    ; horizontal overlap (without vertical overlap)
1490*c0909341SAndroid Build Coastguard Worker%%loop_x_h_overlap:
1491*c0909341SAndroid Build Coastguard Worker    rorx             r6, seeq, 1
1492*c0909341SAndroid Build Coastguard Worker    or             seed, 0xEFF4
1493*c0909341SAndroid Build Coastguard Worker    test           seeb, seeh
1494*c0909341SAndroid Build Coastguard Worker    lea            seed, [r6+0x8000]
1495*c0909341SAndroid Build Coastguard Worker    cmovp          seed, r6d               ; updated seed
1496*c0909341SAndroid Build Coastguard Worker
1497*c0909341SAndroid Build Coastguard Worker    DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \
1498*c0909341SAndroid Build Coastguard Worker                offx, offy, see, left_offxy, unused1, unused2, lstride
1499*c0909341SAndroid Build Coastguard Worker
1500*c0909341SAndroid Build Coastguard Worker    lea     left_offxyd, [offyq+(32>>%2)]         ; previous column's offy*stride+offx
1501*c0909341SAndroid Build Coastguard Worker    rorx          offyd, seed, 8
1502*c0909341SAndroid Build Coastguard Worker    rorx          offxq, seeq, 12
1503*c0909341SAndroid Build Coastguard Worker    and           offyd, 0xf
1504*c0909341SAndroid Build Coastguard Worker    imul          offyd, 164>>%3
1505*c0909341SAndroid Build Coastguard Worker    lea           offyd, [offyq+offxq*(2-%2)+(3+(6>>%3))*82+3+(6>>%2)]  ; offy*stride+offx
1506*c0909341SAndroid Build Coastguard Worker
1507*c0909341SAndroid Build Coastguard Worker    DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \
1508*c0909341SAndroid Build Coastguard Worker                h, offxy, see, left_offxy, unused1, unused2, lstride
1509*c0909341SAndroid Build Coastguard Worker
1510*c0909341SAndroid Build Coastguard Worker    mov      grain_lutq, grain_lutmp
1511*c0909341SAndroid Build Coastguard Worker    mov              hd, hm
1512*c0909341SAndroid Build Coastguard Worker%%loop_y_h_overlap:
1513*c0909341SAndroid Build Coastguard Worker    ; src
1514*c0909341SAndroid Build Coastguard Worker%if %2
1515*c0909341SAndroid Build Coastguard Worker    mova            xm3, [lumaq+lstrideq*0+ 0]
1516*c0909341SAndroid Build Coastguard Worker    vinserti128      m3, [lumaq+lstrideq*(1+%3)+ 0], 1
1517*c0909341SAndroid Build Coastguard Worker    vpbroadcastd     m2, [pb_1]
1518*c0909341SAndroid Build Coastguard Worker    mova            xm0, [lumaq+lstrideq*0+16]
1519*c0909341SAndroid Build Coastguard Worker    vinserti128      m0, [lumaq+lstrideq*(1+%3)+16], 1
1520*c0909341SAndroid Build Coastguard Worker    mova            xm1, [srcq]
1521*c0909341SAndroid Build Coastguard Worker    vinserti128      m1, [srcq+strideq], 1
1522*c0909341SAndroid Build Coastguard Worker    pmaddubsw        m3, m2
1523*c0909341SAndroid Build Coastguard Worker    pmaddubsw        m0, m2
1524*c0909341SAndroid Build Coastguard Worker    pavgw            m3, m7
1525*c0909341SAndroid Build Coastguard Worker    pavgw            m0, m7
1526*c0909341SAndroid Build Coastguard Worker%else
1527*c0909341SAndroid Build Coastguard Worker    mova             m2, [lumaq]
1528*c0909341SAndroid Build Coastguard Worker    mova             m1, [srcq]
1529*c0909341SAndroid Build Coastguard Worker%endif
1530*c0909341SAndroid Build Coastguard Worker%if %1
1531*c0909341SAndroid Build Coastguard Worker%if %2
1532*c0909341SAndroid Build Coastguard Worker    packuswb         m2, m3, m0             ; luma
1533*c0909341SAndroid Build Coastguard Worker%endif
1534*c0909341SAndroid Build Coastguard Worker    punpckhbw        m3, m2, m1
1535*c0909341SAndroid Build Coastguard Worker    punpcklbw        m2, m1                 ; { luma, chroma }
1536*c0909341SAndroid Build Coastguard Worker    pmaddubsw        m3, m14
1537*c0909341SAndroid Build Coastguard Worker    pmaddubsw        m2, m14
1538*c0909341SAndroid Build Coastguard Worker    psraw            m3, 6
1539*c0909341SAndroid Build Coastguard Worker    psraw            m2, 6
1540*c0909341SAndroid Build Coastguard Worker    paddw            m3, m15
1541*c0909341SAndroid Build Coastguard Worker    paddw            m2, m15
1542*c0909341SAndroid Build Coastguard Worker    packuswb         m2, m3                 ; pack+unpack = clip
1543*c0909341SAndroid Build Coastguard Worker%endif
1544*c0909341SAndroid Build Coastguard Worker%if %1 || %2 == 0
1545*c0909341SAndroid Build Coastguard Worker    punpcklbw        m3, m2, m7
1546*c0909341SAndroid Build Coastguard Worker    punpckhbw        m0, m2, m7
1547*c0909341SAndroid Build Coastguard Worker%endif
1548*c0909341SAndroid Build Coastguard Worker
1549*c0909341SAndroid Build Coastguard Worker    ; scaling[luma_src]
1550*c0909341SAndroid Build Coastguard Worker    pandn            m4, m8, m3
1551*c0909341SAndroid Build Coastguard Worker    mova             m6, m8
1552*c0909341SAndroid Build Coastguard Worker    vpgatherdd       m2, [scalingq+m4-0], m8
1553*c0909341SAndroid Build Coastguard Worker    psrld            m3, 16
1554*c0909341SAndroid Build Coastguard Worker    mova             m8, m6
1555*c0909341SAndroid Build Coastguard Worker    vpgatherdd       m4, [scalingq+m3-2], m6
1556*c0909341SAndroid Build Coastguard Worker    pandn            m5, m8, m0
1557*c0909341SAndroid Build Coastguard Worker    mova             m6, m8
1558*c0909341SAndroid Build Coastguard Worker    vpgatherdd       m3, [scalingq+m5-0], m8
1559*c0909341SAndroid Build Coastguard Worker    psrld            m0, 16
1560*c0909341SAndroid Build Coastguard Worker    mova             m8, m6
1561*c0909341SAndroid Build Coastguard Worker    vpgatherdd       m5, [scalingq+m0-2], m6
1562*c0909341SAndroid Build Coastguard Worker    pblendw          m2, m4, 0xaa
1563*c0909341SAndroid Build Coastguard Worker    pblendw          m3, m5, 0xaa
1564*c0909341SAndroid Build Coastguard Worker
1565*c0909341SAndroid Build Coastguard Worker    ; grain = grain_lut[offy+y][offx+x]
1566*c0909341SAndroid Build Coastguard Worker%if %2
1567*c0909341SAndroid Build Coastguard Worker    movu            xm5, [grain_lutq+offxyq+ 0]
1568*c0909341SAndroid Build Coastguard Worker    vinserti128      m5, [grain_lutq+offxyq+82], 1
1569*c0909341SAndroid Build Coastguard Worker    movd            xm4, [grain_lutq+left_offxyq+ 0]
1570*c0909341SAndroid Build Coastguard Worker    vinserti128      m4, [grain_lutq+left_offxyq+82], 1
1571*c0909341SAndroid Build Coastguard Worker    punpcklbw        m4, m5
1572*c0909341SAndroid Build Coastguard Worker%if %1
1573*c0909341SAndroid Build Coastguard Worker    vpbroadcastq     m0, [pb_23_22]
1574*c0909341SAndroid Build Coastguard Worker    pmaddubsw        m4, m0, m4
1575*c0909341SAndroid Build Coastguard Worker%else
1576*c0909341SAndroid Build Coastguard Worker    pmaddubsw        m4, m15, m4
1577*c0909341SAndroid Build Coastguard Worker%endif
1578*c0909341SAndroid Build Coastguard Worker    pmulhrsw         m4, m12
1579*c0909341SAndroid Build Coastguard Worker    packsswb         m4, m4
1580*c0909341SAndroid Build Coastguard Worker    vpblendd         m4, m5, 0xee
1581*c0909341SAndroid Build Coastguard Worker%else
1582*c0909341SAndroid Build Coastguard Worker    movu             m5, [grain_lutq+offxyq]
1583*c0909341SAndroid Build Coastguard Worker    movd            xm4, [grain_lutq+left_offxyq]
1584*c0909341SAndroid Build Coastguard Worker    punpcklbw       xm4, xm5
1585*c0909341SAndroid Build Coastguard Worker%if %1
1586*c0909341SAndroid Build Coastguard Worker    movq            xm0, [pb_27_17_17_27]
1587*c0909341SAndroid Build Coastguard Worker    pmaddubsw       xm4, xm0, xm4
1588*c0909341SAndroid Build Coastguard Worker%else
1589*c0909341SAndroid Build Coastguard Worker    pmaddubsw       xm4, xm15, xm4
1590*c0909341SAndroid Build Coastguard Worker%endif
1591*c0909341SAndroid Build Coastguard Worker    pmulhrsw        xm4, xm12
1592*c0909341SAndroid Build Coastguard Worker    packsswb        xm4, xm4
1593*c0909341SAndroid Build Coastguard Worker    vpblendd         m4, m5, 0xfe
1594*c0909341SAndroid Build Coastguard Worker%endif
1595*c0909341SAndroid Build Coastguard Worker    punpckhbw        m5, m7
1596*c0909341SAndroid Build Coastguard Worker    punpcklbw        m4, m7
1597*c0909341SAndroid Build Coastguard Worker
1598*c0909341SAndroid Build Coastguard Worker    ; noise = round2(scaling[luma_src] * grain, scaling_shift)
1599*c0909341SAndroid Build Coastguard Worker    pmaddubsw        m2, m4
1600*c0909341SAndroid Build Coastguard Worker    pmaddubsw        m3, m5
1601*c0909341SAndroid Build Coastguard Worker    pmulhrsw         m2, m9
1602*c0909341SAndroid Build Coastguard Worker    pmulhrsw         m3, m9
1603*c0909341SAndroid Build Coastguard Worker
1604*c0909341SAndroid Build Coastguard Worker    ; unpack chroma_source
1605*c0909341SAndroid Build Coastguard Worker    punpcklbw        m0, m1, m7
1606*c0909341SAndroid Build Coastguard Worker    punpckhbw        m1, m7
1607*c0909341SAndroid Build Coastguard Worker
1608*c0909341SAndroid Build Coastguard Worker    ; dst = clip_pixel(src, noise)
1609*c0909341SAndroid Build Coastguard Worker    paddw            m0, m2
1610*c0909341SAndroid Build Coastguard Worker    paddw            m1, m3
1611*c0909341SAndroid Build Coastguard Worker    packuswb         m0, m1
1612*c0909341SAndroid Build Coastguard Worker    pmaxub           m0, m10
1613*c0909341SAndroid Build Coastguard Worker    pminub           m0, m11
1614*c0909341SAndroid Build Coastguard Worker%if %2
1615*c0909341SAndroid Build Coastguard Worker    mova         [dstq], xm0
1616*c0909341SAndroid Build Coastguard Worker    vextracti128 [dstq+strideq], m0, 1
1617*c0909341SAndroid Build Coastguard Worker%else
1618*c0909341SAndroid Build Coastguard Worker    mova         [dstq], m0
1619*c0909341SAndroid Build Coastguard Worker%endif
1620*c0909341SAndroid Build Coastguard Worker
1621*c0909341SAndroid Build Coastguard Worker%if %2
1622*c0909341SAndroid Build Coastguard Worker    lea            srcq, [srcq+strideq*2]
1623*c0909341SAndroid Build Coastguard Worker    lea            dstq, [dstq+strideq*2]
1624*c0909341SAndroid Build Coastguard Worker    lea           lumaq, [lumaq+lstrideq*(2<<%3)]
1625*c0909341SAndroid Build Coastguard Worker%else
1626*c0909341SAndroid Build Coastguard Worker    add            srcq, strideq
1627*c0909341SAndroid Build Coastguard Worker    add            dstq, strideq
1628*c0909341SAndroid Build Coastguard Worker    add           lumaq, lstrideq
1629*c0909341SAndroid Build Coastguard Worker%endif
1630*c0909341SAndroid Build Coastguard Worker    add      grain_lutq, 82*(1+%2)
1631*c0909341SAndroid Build Coastguard Worker    sub              hb, 1+%2
1632*c0909341SAndroid Build Coastguard Worker    jg %%loop_y_h_overlap
1633*c0909341SAndroid Build Coastguard Worker
1634*c0909341SAndroid Build Coastguard Worker    add              wq, 32>>%2
1635*c0909341SAndroid Build Coastguard Worker    jge .end
1636*c0909341SAndroid Build Coastguard Worker    mov            srcq, r11mp
1637*c0909341SAndroid Build Coastguard Worker    mov            dstq, r12mp
1638*c0909341SAndroid Build Coastguard Worker    lea           lumaq, [r14+wq*(1+%2)]
1639*c0909341SAndroid Build Coastguard Worker    add            srcq, wq
1640*c0909341SAndroid Build Coastguard Worker    add            dstq, wq
1641*c0909341SAndroid Build Coastguard Worker
1642*c0909341SAndroid Build Coastguard Worker    ; r8m = sbym
1643*c0909341SAndroid Build Coastguard Worker    cmp       dword r8m, 0
1644*c0909341SAndroid Build Coastguard Worker    jne %%loop_x_hv_overlap
1645*c0909341SAndroid Build Coastguard Worker    jmp %%loop_x_h_overlap
1646*c0909341SAndroid Build Coastguard Worker
1647*c0909341SAndroid Build Coastguard Worker%%vertical_overlap:
1648*c0909341SAndroid Build Coastguard Worker    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, unused, \
1649*c0909341SAndroid Build Coastguard Worker                sby, see, overlap, unused1, unused2, lstride
1650*c0909341SAndroid Build Coastguard Worker
1651*c0909341SAndroid Build Coastguard Worker    movzx          sbyd, sbyb
1652*c0909341SAndroid Build Coastguard Worker    imul           seed, [fg_dataq+FGData.seed], 0x00010001
1653*c0909341SAndroid Build Coastguard Worker    imul            r7d, sbyd, 173 * 0x00010001
1654*c0909341SAndroid Build Coastguard Worker    imul           sbyd, 37 * 0x01000100
1655*c0909341SAndroid Build Coastguard Worker    add             r7d, (105 << 16) | 188
1656*c0909341SAndroid Build Coastguard Worker    add            sbyd, (178 << 24) | (141 << 8)
1657*c0909341SAndroid Build Coastguard Worker    and             r7d, 0x00ff00ff
1658*c0909341SAndroid Build Coastguard Worker    and            sbyd, 0xff00ff00
1659*c0909341SAndroid Build Coastguard Worker    xor            seed, r7d
1660*c0909341SAndroid Build Coastguard Worker    xor            seed, sbyd               ; (cur_seed << 16) | top_seed
1661*c0909341SAndroid Build Coastguard Worker
1662*c0909341SAndroid Build Coastguard Worker    DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \
1663*c0909341SAndroid Build Coastguard Worker                unused1, unused2, see, overlap, unused3, unused4, lstride
1664*c0909341SAndroid Build Coastguard Worker
1665*c0909341SAndroid Build Coastguard Worker    mov           lumaq, r9mp
1666*c0909341SAndroid Build Coastguard Worker    lea             r12, [srcq+wq]
1667*c0909341SAndroid Build Coastguard Worker    lea             r13, [dstq+wq]
1668*c0909341SAndroid Build Coastguard Worker    lea             r14, [lumaq+wq*(1+%2)]
1669*c0909341SAndroid Build Coastguard Worker    mov           r11mp, r12
1670*c0909341SAndroid Build Coastguard Worker    mov           r12mp, r13
1671*c0909341SAndroid Build Coastguard Worker    mov        lstrideq, r10mp
1672*c0909341SAndroid Build Coastguard Worker    neg              wq
1673*c0909341SAndroid Build Coastguard Worker
1674*c0909341SAndroid Build Coastguard Worker%%loop_x_v_overlap:
1675*c0909341SAndroid Build Coastguard Worker    ; we assume from the block above that bits 8-15 of r7d are zeroed
1676*c0909341SAndroid Build Coastguard Worker    mov             r6d, seed
1677*c0909341SAndroid Build Coastguard Worker    or             seed, 0xeff4eff4
1678*c0909341SAndroid Build Coastguard Worker    test           seeb, seeh
1679*c0909341SAndroid Build Coastguard Worker    setp            r7b                     ; parity of top_seed
1680*c0909341SAndroid Build Coastguard Worker    shr            seed, 16
1681*c0909341SAndroid Build Coastguard Worker    shl             r7d, 16
1682*c0909341SAndroid Build Coastguard Worker    test           seeb, seeh
1683*c0909341SAndroid Build Coastguard Worker    setp            r7b                     ; parity of cur_seed
1684*c0909341SAndroid Build Coastguard Worker    or              r6d, 0x00010001
1685*c0909341SAndroid Build Coastguard Worker    xor             r7d, r6d
1686*c0909341SAndroid Build Coastguard Worker    rorx           seed, r7d, 1             ; updated (cur_seed << 16) | top_seed
1687*c0909341SAndroid Build Coastguard Worker
1688*c0909341SAndroid Build Coastguard Worker    DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \
1689*c0909341SAndroid Build Coastguard Worker                offx, offy, see, overlap, top_offxy, unused, lstride
1690*c0909341SAndroid Build Coastguard Worker
1691*c0909341SAndroid Build Coastguard Worker    rorx          offyd, seed, 8
1692*c0909341SAndroid Build Coastguard Worker    rorx          offxd, seed, 12
1693*c0909341SAndroid Build Coastguard Worker    and           offyd, 0xf000f
1694*c0909341SAndroid Build Coastguard Worker    and           offxd, 0xf000f
1695*c0909341SAndroid Build Coastguard Worker    imul          offyd, 164>>%3
1696*c0909341SAndroid Build Coastguard Worker    ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy
1697*c0909341SAndroid Build Coastguard Worker    lea           offyd, [offyq+offxq*(2-%2)+0x10001*((3+(6>>%3))*82+3+(6>>%2))+(32>>%3)*82]
1698*c0909341SAndroid Build Coastguard Worker
1699*c0909341SAndroid Build Coastguard Worker    DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \
1700*c0909341SAndroid Build Coastguard Worker                h, offxy, see, overlap, top_offxy, unused, lstride
1701*c0909341SAndroid Build Coastguard Worker
1702*c0909341SAndroid Build Coastguard Worker    mov      grain_lutq, grain_lutmp
1703*c0909341SAndroid Build Coastguard Worker    mov              hd, hm
1704*c0909341SAndroid Build Coastguard Worker    movzx    top_offxyd, offxyw
1705*c0909341SAndroid Build Coastguard Worker    shr          offxyd, 16
1706*c0909341SAndroid Build Coastguard Worker%if %2 == 0
1707*c0909341SAndroid Build Coastguard Worker    vpbroadcastd    m13, [pb_27_17]
1708*c0909341SAndroid Build Coastguard Worker%endif
1709*c0909341SAndroid Build Coastguard Worker%%loop_y_v_overlap:
1710*c0909341SAndroid Build Coastguard Worker    ; src
1711*c0909341SAndroid Build Coastguard Worker%if %2
1712*c0909341SAndroid Build Coastguard Worker    mova            xm3, [lumaq+lstrideq*0+ 0]
1713*c0909341SAndroid Build Coastguard Worker    vinserti128      m3, [lumaq+lstrideq*(1+%3)+ 0], 1
1714*c0909341SAndroid Build Coastguard Worker    vpbroadcastd     m2, [pb_1]
1715*c0909341SAndroid Build Coastguard Worker    mova            xm0, [lumaq+lstrideq*0+16]
1716*c0909341SAndroid Build Coastguard Worker    vinserti128      m0, [lumaq+lstrideq*(1+%3)+16], 1
1717*c0909341SAndroid Build Coastguard Worker    mova            xm1, [srcq]
1718*c0909341SAndroid Build Coastguard Worker    vinserti128      m1, [srcq+strideq], 1
1719*c0909341SAndroid Build Coastguard Worker    pmaddubsw        m3, m2
1720*c0909341SAndroid Build Coastguard Worker    pmaddubsw        m0, m2
1721*c0909341SAndroid Build Coastguard Worker    pavgw            m3, m7
1722*c0909341SAndroid Build Coastguard Worker    pavgw            m0, m7
1723*c0909341SAndroid Build Coastguard Worker%else
1724*c0909341SAndroid Build Coastguard Worker    mova             m2, [lumaq]
1725*c0909341SAndroid Build Coastguard Worker    mova             m1, [srcq]
1726*c0909341SAndroid Build Coastguard Worker%endif
1727*c0909341SAndroid Build Coastguard Worker%if %1
1728*c0909341SAndroid Build Coastguard Worker%if %2
1729*c0909341SAndroid Build Coastguard Worker    packuswb         m2, m3, m0             ; luma
1730*c0909341SAndroid Build Coastguard Worker%endif
1731*c0909341SAndroid Build Coastguard Worker    punpckhbw        m3, m2, m1
1732*c0909341SAndroid Build Coastguard Worker    punpcklbw        m2, m1                 ; { luma, chroma }
1733*c0909341SAndroid Build Coastguard Worker    pmaddubsw        m3, m14
1734*c0909341SAndroid Build Coastguard Worker    pmaddubsw        m2, m14
1735*c0909341SAndroid Build Coastguard Worker    psraw            m3, 6
1736*c0909341SAndroid Build Coastguard Worker    psraw            m2, 6
1737*c0909341SAndroid Build Coastguard Worker    paddw            m3, m15
1738*c0909341SAndroid Build Coastguard Worker    paddw            m2, m15
1739*c0909341SAndroid Build Coastguard Worker    packuswb         m2, m3                 ; pack+unpack = clip
1740*c0909341SAndroid Build Coastguard Worker%endif
1741*c0909341SAndroid Build Coastguard Worker%if %1 || %2 == 0
1742*c0909341SAndroid Build Coastguard Worker    punpcklbw        m3, m2, m7
1743*c0909341SAndroid Build Coastguard Worker    punpckhbw        m0, m2, m7
1744*c0909341SAndroid Build Coastguard Worker%endif
1745*c0909341SAndroid Build Coastguard Worker
1746*c0909341SAndroid Build Coastguard Worker    ; scaling[luma_src]
1747*c0909341SAndroid Build Coastguard Worker    pandn            m4, m8, m3
1748*c0909341SAndroid Build Coastguard Worker    mova             m6, m8
1749*c0909341SAndroid Build Coastguard Worker    vpgatherdd       m2, [scalingq+m4-0], m8
1750*c0909341SAndroid Build Coastguard Worker    psrld            m3, 16
1751*c0909341SAndroid Build Coastguard Worker    mova             m8, m6
1752*c0909341SAndroid Build Coastguard Worker    vpgatherdd       m4, [scalingq+m3-2], m6
1753*c0909341SAndroid Build Coastguard Worker    pandn            m5, m8, m0
1754*c0909341SAndroid Build Coastguard Worker    mova             m6, m8
1755*c0909341SAndroid Build Coastguard Worker    vpgatherdd       m3, [scalingq+m5-0], m8
1756*c0909341SAndroid Build Coastguard Worker    psrld            m0, 16
1757*c0909341SAndroid Build Coastguard Worker    mova             m8, m6
1758*c0909341SAndroid Build Coastguard Worker    vpgatherdd       m5, [scalingq+m0-2], m6
1759*c0909341SAndroid Build Coastguard Worker    pblendw          m2, m4, 0xaa
1760*c0909341SAndroid Build Coastguard Worker    pblendw          m3, m5, 0xaa
1761*c0909341SAndroid Build Coastguard Worker
1762*c0909341SAndroid Build Coastguard Worker    ; grain = grain_lut[offy+y][offx+x]
1763*c0909341SAndroid Build Coastguard Worker%if %3 == 0
1764*c0909341SAndroid Build Coastguard Worker%if %2
1765*c0909341SAndroid Build Coastguard Worker    movu            xm0, [grain_lutq+offxyq]
1766*c0909341SAndroid Build Coastguard Worker    vinserti128      m0, [grain_lutq+offxyq+82], 1
1767*c0909341SAndroid Build Coastguard Worker    movu            xm4, [grain_lutq+top_offxyq]
1768*c0909341SAndroid Build Coastguard Worker    vinserti128      m4, [grain_lutq+top_offxyq+82], 1
1769*c0909341SAndroid Build Coastguard Worker%else
1770*c0909341SAndroid Build Coastguard Worker    movu             m0, [grain_lutq+offxyq]
1771*c0909341SAndroid Build Coastguard Worker    movu             m4, [grain_lutq+top_offxyq]
1772*c0909341SAndroid Build Coastguard Worker%endif
1773*c0909341SAndroid Build Coastguard Worker    punpcklbw        m5, m4, m0
1774*c0909341SAndroid Build Coastguard Worker    punpckhbw        m4, m0
1775*c0909341SAndroid Build Coastguard Worker    pmaddubsw        m5, m13, m5
1776*c0909341SAndroid Build Coastguard Worker    pmaddubsw        m4, m13, m4
1777*c0909341SAndroid Build Coastguard Worker    pmulhrsw         m5, m12
1778*c0909341SAndroid Build Coastguard Worker    pmulhrsw         m4, m12
1779*c0909341SAndroid Build Coastguard Worker    packsswb         m5, m4
1780*c0909341SAndroid Build Coastguard Worker%else
1781*c0909341SAndroid Build Coastguard Worker    movq            xm4, [grain_lutq+offxyq]
1782*c0909341SAndroid Build Coastguard Worker    vinserti128      m4, [grain_lutq+offxyq+8], 1
1783*c0909341SAndroid Build Coastguard Worker    movq            xm5, [grain_lutq+top_offxyq]
1784*c0909341SAndroid Build Coastguard Worker    vinserti128      m5, [grain_lutq+top_offxyq+8], 1
1785*c0909341SAndroid Build Coastguard Worker    punpcklbw        m5, m4
1786*c0909341SAndroid Build Coastguard Worker    pmaddubsw        m5, m13, m5
1787*c0909341SAndroid Build Coastguard Worker    pmulhrsw         m5, m12
1788*c0909341SAndroid Build Coastguard Worker    vextracti128    xm4, m5, 1
1789*c0909341SAndroid Build Coastguard Worker    packsswb        xm5, xm4
1790*c0909341SAndroid Build Coastguard Worker    ; only interpolate first line, insert second line unmodified
1791*c0909341SAndroid Build Coastguard Worker    vinserti128      m5, [grain_lutq+offxyq+82], 1
1792*c0909341SAndroid Build Coastguard Worker%endif
1793*c0909341SAndroid Build Coastguard Worker    punpcklbw        m4, m5, m7
1794*c0909341SAndroid Build Coastguard Worker    punpckhbw        m5, m7
1795*c0909341SAndroid Build Coastguard Worker
1796*c0909341SAndroid Build Coastguard Worker    ; noise = round2(scaling[luma_src] * grain, scaling_shift)
1797*c0909341SAndroid Build Coastguard Worker    pmaddubsw        m2, m4
1798*c0909341SAndroid Build Coastguard Worker    pmaddubsw        m3, m5
1799*c0909341SAndroid Build Coastguard Worker    pmulhrsw         m2, m9
1800*c0909341SAndroid Build Coastguard Worker    pmulhrsw         m3, m9
1801*c0909341SAndroid Build Coastguard Worker
1802*c0909341SAndroid Build Coastguard Worker    ; unpack chroma_source
1803*c0909341SAndroid Build Coastguard Worker    punpcklbw        m0, m1, m7
1804*c0909341SAndroid Build Coastguard Worker    punpckhbw        m1, m7
1805*c0909341SAndroid Build Coastguard Worker
1806*c0909341SAndroid Build Coastguard Worker    ; dst = clip_pixel(src, noise)
1807*c0909341SAndroid Build Coastguard Worker    paddw            m0, m2
1808*c0909341SAndroid Build Coastguard Worker    paddw            m1, m3
1809*c0909341SAndroid Build Coastguard Worker    packuswb         m0, m1
1810*c0909341SAndroid Build Coastguard Worker    pmaxub           m0, m10
1811*c0909341SAndroid Build Coastguard Worker    pminub           m0, m11
1812*c0909341SAndroid Build Coastguard Worker%if %2
1813*c0909341SAndroid Build Coastguard Worker    mova         [dstq], xm0
1814*c0909341SAndroid Build Coastguard Worker    vextracti128 [dstq+strideq], m0, 1
1815*c0909341SAndroid Build Coastguard Worker%else
1816*c0909341SAndroid Build Coastguard Worker    mova         [dstq], m0
1817*c0909341SAndroid Build Coastguard Worker%endif
1818*c0909341SAndroid Build Coastguard Worker
1819*c0909341SAndroid Build Coastguard Worker    sub              hb, 1+%2
1820*c0909341SAndroid Build Coastguard Worker    jle %%end_y_v_overlap
1821*c0909341SAndroid Build Coastguard Worker%if %2
1822*c0909341SAndroid Build Coastguard Worker    lea            srcq, [srcq+strideq*2]
1823*c0909341SAndroid Build Coastguard Worker    lea            dstq, [dstq+strideq*2]
1824*c0909341SAndroid Build Coastguard Worker    lea           lumaq, [lumaq+lstrideq*(2<<%3)]
1825*c0909341SAndroid Build Coastguard Worker%else
1826*c0909341SAndroid Build Coastguard Worker    add            srcq, strideq
1827*c0909341SAndroid Build Coastguard Worker    add            dstq, strideq
1828*c0909341SAndroid Build Coastguard Worker    add           lumaq, lstrideq
1829*c0909341SAndroid Build Coastguard Worker%endif
1830*c0909341SAndroid Build Coastguard Worker    add      grain_lutq, 82<<%2
1831*c0909341SAndroid Build Coastguard Worker%if %2 == 0
1832*c0909341SAndroid Build Coastguard Worker    vpbroadcastd    m13, [pb_17_27]
1833*c0909341SAndroid Build Coastguard Worker    add              hd, 0x80000000
1834*c0909341SAndroid Build Coastguard Worker    jnc %%loop_y_v_overlap
1835*c0909341SAndroid Build Coastguard Worker%endif
1836*c0909341SAndroid Build Coastguard Worker    jmp %%loop_y
1837*c0909341SAndroid Build Coastguard Worker
1838*c0909341SAndroid Build Coastguard Worker%%end_y_v_overlap:
1839*c0909341SAndroid Build Coastguard Worker    add              wq, 32>>%2
1840*c0909341SAndroid Build Coastguard Worker    jge .end
1841*c0909341SAndroid Build Coastguard Worker    mov            srcq, r11mp
1842*c0909341SAndroid Build Coastguard Worker    mov            dstq, r12mp
1843*c0909341SAndroid Build Coastguard Worker    lea           lumaq, [r14+wq*(1+%2)]
1844*c0909341SAndroid Build Coastguard Worker    add            srcq, wq
1845*c0909341SAndroid Build Coastguard Worker    add            dstq, wq
1846*c0909341SAndroid Build Coastguard Worker
1847*c0909341SAndroid Build Coastguard Worker    ; since fg_dataq.overlap is guaranteed to be set, we never jump
1848*c0909341SAndroid Build Coastguard Worker    ; back to %%loop_x_v_overlap, and instead always fall through to
1849*c0909341SAndroid Build Coastguard Worker    ; h+v overlap
1850*c0909341SAndroid Build Coastguard Worker
1851*c0909341SAndroid Build Coastguard Worker%%loop_x_hv_overlap:
1852*c0909341SAndroid Build Coastguard Worker    ; we assume from the block above that bits 8-15 of r7d are zero'ed
1853*c0909341SAndroid Build Coastguard Worker    mov             r6d, seed
1854*c0909341SAndroid Build Coastguard Worker    or             seed, 0xeff4eff4
1855*c0909341SAndroid Build Coastguard Worker    test           seeb, seeh
1856*c0909341SAndroid Build Coastguard Worker    setp            r7b                     ; parity of top_seed
1857*c0909341SAndroid Build Coastguard Worker    shr            seed, 16
1858*c0909341SAndroid Build Coastguard Worker    shl             r7d, 16
1859*c0909341SAndroid Build Coastguard Worker    test           seeb, seeh
1860*c0909341SAndroid Build Coastguard Worker    setp            r7b                     ; parity of cur_seed
1861*c0909341SAndroid Build Coastguard Worker    or              r6d, 0x00010001
1862*c0909341SAndroid Build Coastguard Worker    xor             r7d, r6d
1863*c0909341SAndroid Build Coastguard Worker    rorx           seed, r7d, 1             ; updated (cur_seed << 16) | top_seed
1864*c0909341SAndroid Build Coastguard Worker
1865*c0909341SAndroid Build Coastguard Worker    DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \
1866*c0909341SAndroid Build Coastguard Worker                offx, offy, see, left_offxy, top_offxy, topleft_offxy, lstride
1867*c0909341SAndroid Build Coastguard Worker
1868*c0909341SAndroid Build Coastguard Worker    lea  topleft_offxyd, [top_offxyq+(32>>%2)]
1869*c0909341SAndroid Build Coastguard Worker    lea     left_offxyd, [offyq+(32>>%2)]
1870*c0909341SAndroid Build Coastguard Worker    rorx          offyd, seed, 8
1871*c0909341SAndroid Build Coastguard Worker    rorx          offxd, seed, 12
1872*c0909341SAndroid Build Coastguard Worker    and           offyd, 0xf000f
1873*c0909341SAndroid Build Coastguard Worker    and           offxd, 0xf000f
1874*c0909341SAndroid Build Coastguard Worker    imul          offyd, 164>>%3
1875*c0909341SAndroid Build Coastguard Worker    ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy
1876*c0909341SAndroid Build Coastguard Worker    lea           offyd, [offyq+offxq*(2-%2)+0x10001*((3+(6>>%3))*82+3+(6>>%2))+(32>>%3)*82]
1877*c0909341SAndroid Build Coastguard Worker
1878*c0909341SAndroid Build Coastguard Worker    DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \
1879*c0909341SAndroid Build Coastguard Worker                h, offxy, see, left_offxy, top_offxy, topleft_offxy, lstride
1880*c0909341SAndroid Build Coastguard Worker
1881*c0909341SAndroid Build Coastguard Worker    mov      grain_lutq, grain_lutmp
1882*c0909341SAndroid Build Coastguard Worker    mov              hd, hm
1883*c0909341SAndroid Build Coastguard Worker    movzx    top_offxyd, offxyw
1884*c0909341SAndroid Build Coastguard Worker    shr          offxyd, 16
1885*c0909341SAndroid Build Coastguard Worker%if %2 == 0
1886*c0909341SAndroid Build Coastguard Worker    vpbroadcastd    m13, [pb_27_17]
1887*c0909341SAndroid Build Coastguard Worker%endif
1888*c0909341SAndroid Build Coastguard Worker%%loop_y_hv_overlap:
1889*c0909341SAndroid Build Coastguard Worker    ; src
1890*c0909341SAndroid Build Coastguard Worker%if %2
1891*c0909341SAndroid Build Coastguard Worker    mova            xm3, [lumaq+lstrideq*0+ 0]
1892*c0909341SAndroid Build Coastguard Worker    vinserti128      m3, [lumaq+lstrideq*(1+%3)+ 0], 1
1893*c0909341SAndroid Build Coastguard Worker    vpbroadcastd     m2, [pb_1]
1894*c0909341SAndroid Build Coastguard Worker    mova            xm0, [lumaq+lstrideq*0+16]
1895*c0909341SAndroid Build Coastguard Worker    vinserti128      m0, [lumaq+lstrideq*(1+%3)+16], 1
1896*c0909341SAndroid Build Coastguard Worker    mova            xm1, [srcq]
1897*c0909341SAndroid Build Coastguard Worker    vinserti128      m1, [srcq+strideq], 1
1898*c0909341SAndroid Build Coastguard Worker    pmaddubsw        m3, m2
1899*c0909341SAndroid Build Coastguard Worker    pmaddubsw        m0, m2
1900*c0909341SAndroid Build Coastguard Worker    pavgw            m3, m7
1901*c0909341SAndroid Build Coastguard Worker    pavgw            m0, m7
1902*c0909341SAndroid Build Coastguard Worker%else
1903*c0909341SAndroid Build Coastguard Worker    mova             m2, [lumaq]
1904*c0909341SAndroid Build Coastguard Worker    mova             m1, [srcq]
1905*c0909341SAndroid Build Coastguard Worker%endif
1906*c0909341SAndroid Build Coastguard Worker%if %1
1907*c0909341SAndroid Build Coastguard Worker%if %2
1908*c0909341SAndroid Build Coastguard Worker    packuswb         m2, m3, m0             ; luma
1909*c0909341SAndroid Build Coastguard Worker%endif
1910*c0909341SAndroid Build Coastguard Worker    punpckhbw        m3, m2, m1
1911*c0909341SAndroid Build Coastguard Worker    punpcklbw        m2, m1                 ; { luma, chroma }
1912*c0909341SAndroid Build Coastguard Worker    pmaddubsw        m3, m14
1913*c0909341SAndroid Build Coastguard Worker    pmaddubsw        m2, m14
1914*c0909341SAndroid Build Coastguard Worker    psraw            m3, 6
1915*c0909341SAndroid Build Coastguard Worker    psraw            m2, 6
1916*c0909341SAndroid Build Coastguard Worker    paddw            m3, m15
1917*c0909341SAndroid Build Coastguard Worker    paddw            m2, m15
1918*c0909341SAndroid Build Coastguard Worker    packuswb         m2, m3                 ; pack+unpack = clip
1919*c0909341SAndroid Build Coastguard Worker%endif
1920*c0909341SAndroid Build Coastguard Worker%if %1 || %2 == 0
1921*c0909341SAndroid Build Coastguard Worker    punpcklbw        m3, m2, m7
1922*c0909341SAndroid Build Coastguard Worker    punpckhbw        m0, m2, m7
1923*c0909341SAndroid Build Coastguard Worker%endif
1924*c0909341SAndroid Build Coastguard Worker
1925*c0909341SAndroid Build Coastguard Worker    ; scaling[luma_src]
1926*c0909341SAndroid Build Coastguard Worker    pandn            m4, m8, m3
1927*c0909341SAndroid Build Coastguard Worker    mova             m6, m8
1928*c0909341SAndroid Build Coastguard Worker    vpgatherdd       m2, [scalingq+m4-0], m8
1929*c0909341SAndroid Build Coastguard Worker    psrld            m3, 16
1930*c0909341SAndroid Build Coastguard Worker    mova             m8, m6
1931*c0909341SAndroid Build Coastguard Worker    vpgatherdd       m4, [scalingq+m3-2], m6
1932*c0909341SAndroid Build Coastguard Worker    pandn            m5, m8, m0
1933*c0909341SAndroid Build Coastguard Worker    mova             m6, m8
1934*c0909341SAndroid Build Coastguard Worker    vpgatherdd       m3, [scalingq+m5-0], m8
1935*c0909341SAndroid Build Coastguard Worker    psrld            m0, 16
1936*c0909341SAndroid Build Coastguard Worker    mova             m8, m6
1937*c0909341SAndroid Build Coastguard Worker    vpgatherdd       m5, [scalingq+m0-2], m6
1938*c0909341SAndroid Build Coastguard Worker    pblendw          m2, m4, 0xaa
1939*c0909341SAndroid Build Coastguard Worker    pblendw          m3, m5, 0xaa
1940*c0909341SAndroid Build Coastguard Worker
1941*c0909341SAndroid Build Coastguard Worker    ; grain = grain_lut[offy+y][offx+x]
1942*c0909341SAndroid Build Coastguard Worker%if %2
1943*c0909341SAndroid Build Coastguard Worker    movu            xm4, [grain_lutq+offxyq]
1944*c0909341SAndroid Build Coastguard Worker    vinserti128      m4, [grain_lutq+offxyq+82], 1
1945*c0909341SAndroid Build Coastguard Worker    movd            xm0, [grain_lutq+left_offxyq]
1946*c0909341SAndroid Build Coastguard Worker    vinserti128      m0, [grain_lutq+left_offxyq+82], 1
1947*c0909341SAndroid Build Coastguard Worker    movd            xm6, [grain_lutq+topleft_offxyq]
1948*c0909341SAndroid Build Coastguard Worker%if %3
1949*c0909341SAndroid Build Coastguard Worker    movq            xm5, [grain_lutq+top_offxyq]
1950*c0909341SAndroid Build Coastguard Worker    vinserti128      m5, [grain_lutq+top_offxyq+8], 1
1951*c0909341SAndroid Build Coastguard Worker%else
1952*c0909341SAndroid Build Coastguard Worker    vinserti128      m6, [grain_lutq+topleft_offxyq+82], 1
1953*c0909341SAndroid Build Coastguard Worker    movu            xm5, [grain_lutq+top_offxyq]
1954*c0909341SAndroid Build Coastguard Worker    vinserti128      m5, [grain_lutq+top_offxyq+82], 1
1955*c0909341SAndroid Build Coastguard Worker%endif
1956*c0909341SAndroid Build Coastguard Worker
1957*c0909341SAndroid Build Coastguard Worker    ; do h interpolation first (so top | top/left -> top, left | cur -> cur)
1958*c0909341SAndroid Build Coastguard Worker    punpcklbw        m0, m4
1959*c0909341SAndroid Build Coastguard Worker%if %3
1960*c0909341SAndroid Build Coastguard Worker    punpcklbw       xm6, xm5
1961*c0909341SAndroid Build Coastguard Worker%else
1962*c0909341SAndroid Build Coastguard Worker    punpcklbw        m6, m5
1963*c0909341SAndroid Build Coastguard Worker%endif
1964*c0909341SAndroid Build Coastguard Worker    punpcklqdq       m0, m6
1965*c0909341SAndroid Build Coastguard Worker%if %1
1966*c0909341SAndroid Build Coastguard Worker    vpbroadcastq     m6, [pb_23_22]
1967*c0909341SAndroid Build Coastguard Worker    pmaddubsw        m0, m6, m0
1968*c0909341SAndroid Build Coastguard Worker%else
1969*c0909341SAndroid Build Coastguard Worker    pmaddubsw        m0, m15, m0
1970*c0909341SAndroid Build Coastguard Worker%endif
1971*c0909341SAndroid Build Coastguard Worker    pmulhrsw         m0, m12
1972*c0909341SAndroid Build Coastguard Worker    packsswb         m0, m0
1973*c0909341SAndroid Build Coastguard Worker    vpblendd         m4, m0, 0x11
1974*c0909341SAndroid Build Coastguard Worker%if %3
1975*c0909341SAndroid Build Coastguard Worker    pshuflw         xm0, xm0, q1032
1976*c0909341SAndroid Build Coastguard Worker    vpblendd         m5, m0, 0x01
1977*c0909341SAndroid Build Coastguard Worker%else
1978*c0909341SAndroid Build Coastguard Worker    pshuflw          m0, m0, q1032
1979*c0909341SAndroid Build Coastguard Worker    vpblendd         m5, m0, 0x11
1980*c0909341SAndroid Build Coastguard Worker%endif
1981*c0909341SAndroid Build Coastguard Worker%else
1982*c0909341SAndroid Build Coastguard Worker    movu             m4, [grain_lutq+offxyq]
1983*c0909341SAndroid Build Coastguard Worker    movd            xm0, [grain_lutq+left_offxyq]
1984*c0909341SAndroid Build Coastguard Worker    movu             m5, [grain_lutq+top_offxyq]
1985*c0909341SAndroid Build Coastguard Worker    movd            xm6, [grain_lutq+topleft_offxyq]
1986*c0909341SAndroid Build Coastguard Worker    punpcklbw       xm0, xm4
1987*c0909341SAndroid Build Coastguard Worker    punpcklbw       xm6, xm5
1988*c0909341SAndroid Build Coastguard Worker    punpcklqdq      xm0, xm6
1989*c0909341SAndroid Build Coastguard Worker%if %1
1990*c0909341SAndroid Build Coastguard Worker    vpbroadcastq    xm6, [pb_27_17_17_27]
1991*c0909341SAndroid Build Coastguard Worker    pmaddubsw       xm0, xm6, xm0
1992*c0909341SAndroid Build Coastguard Worker%else
1993*c0909341SAndroid Build Coastguard Worker    pmaddubsw       xm0, xm15, xm0
1994*c0909341SAndroid Build Coastguard Worker%endif
1995*c0909341SAndroid Build Coastguard Worker    pmulhrsw        xm0, xm12
1996*c0909341SAndroid Build Coastguard Worker    packsswb        xm0, xm0
1997*c0909341SAndroid Build Coastguard Worker    vpblendd         m4, m0, 0x01
1998*c0909341SAndroid Build Coastguard Worker    pshuflw         xm0, xm0, q1032
1999*c0909341SAndroid Build Coastguard Worker    vpblendd         m5, m0, 0x01
2000*c0909341SAndroid Build Coastguard Worker%endif
2001*c0909341SAndroid Build Coastguard Worker
2002*c0909341SAndroid Build Coastguard Worker    ; followed by v interpolation (top | cur -> cur)
2003*c0909341SAndroid Build Coastguard Worker%if %3
2004*c0909341SAndroid Build Coastguard Worker    vpermq           m0, m4, q3120
2005*c0909341SAndroid Build Coastguard Worker    punpcklbw        m5, m0
2006*c0909341SAndroid Build Coastguard Worker    pmaddubsw        m5, m13, m5
2007*c0909341SAndroid Build Coastguard Worker    pmulhrsw         m5, m12
2008*c0909341SAndroid Build Coastguard Worker    vextracti128    xm0, m5, 1
2009*c0909341SAndroid Build Coastguard Worker    packsswb        xm5, xm0
2010*c0909341SAndroid Build Coastguard Worker    vpblendd         m5, m4, 0xf0
2011*c0909341SAndroid Build Coastguard Worker%else
2012*c0909341SAndroid Build Coastguard Worker    punpckhbw        m0, m5, m4
2013*c0909341SAndroid Build Coastguard Worker    punpcklbw        m5, m4
2014*c0909341SAndroid Build Coastguard Worker    pmaddubsw        m4, m13, m0
2015*c0909341SAndroid Build Coastguard Worker    pmaddubsw        m5, m13, m5
2016*c0909341SAndroid Build Coastguard Worker    pmulhrsw         m4, m12
2017*c0909341SAndroid Build Coastguard Worker    pmulhrsw         m5, m12
2018*c0909341SAndroid Build Coastguard Worker    packsswb         m5, m4
2019*c0909341SAndroid Build Coastguard Worker%endif
2020*c0909341SAndroid Build Coastguard Worker    punpcklbw        m4, m5, m7
2021*c0909341SAndroid Build Coastguard Worker    punpckhbw        m5, m7
2022*c0909341SAndroid Build Coastguard Worker
2023*c0909341SAndroid Build Coastguard Worker    ; noise = round2(scaling[src] * grain, scaling_shift)
2024*c0909341SAndroid Build Coastguard Worker    pmaddubsw        m2, m4
2025*c0909341SAndroid Build Coastguard Worker    pmaddubsw        m3, m5
2026*c0909341SAndroid Build Coastguard Worker    pmulhrsw         m2, m9
2027*c0909341SAndroid Build Coastguard Worker    pmulhrsw         m3, m9
2028*c0909341SAndroid Build Coastguard Worker
2029*c0909341SAndroid Build Coastguard Worker    ; unpack chroma source
2030*c0909341SAndroid Build Coastguard Worker    punpcklbw        m0, m1, m7
2031*c0909341SAndroid Build Coastguard Worker    punpckhbw        m1, m7
2032*c0909341SAndroid Build Coastguard Worker
2033*c0909341SAndroid Build Coastguard Worker    ; dst = clip_pixel(src, noise)
2034*c0909341SAndroid Build Coastguard Worker    paddw            m0, m2
2035*c0909341SAndroid Build Coastguard Worker    paddw            m1, m3
2036*c0909341SAndroid Build Coastguard Worker    packuswb         m0, m1
2037*c0909341SAndroid Build Coastguard Worker    pmaxub           m0, m10
2038*c0909341SAndroid Build Coastguard Worker    pminub           m0, m11
2039*c0909341SAndroid Build Coastguard Worker%if %2
2040*c0909341SAndroid Build Coastguard Worker    mova         [dstq], xm0
2041*c0909341SAndroid Build Coastguard Worker    vextracti128 [dstq+strideq], m0, 1
2042*c0909341SAndroid Build Coastguard Worker%else
2043*c0909341SAndroid Build Coastguard Worker    mova         [dstq], m0
2044*c0909341SAndroid Build Coastguard Worker%endif
2045*c0909341SAndroid Build Coastguard Worker
2046*c0909341SAndroid Build Coastguard Worker%if %2
2047*c0909341SAndroid Build Coastguard Worker    lea            srcq, [srcq+strideq*2]
2048*c0909341SAndroid Build Coastguard Worker    lea            dstq, [dstq+strideq*2]
2049*c0909341SAndroid Build Coastguard Worker    lea           lumaq, [lumaq+lstrideq*(2<<%3)]
2050*c0909341SAndroid Build Coastguard Worker%else
2051*c0909341SAndroid Build Coastguard Worker    add            srcq, strideq
2052*c0909341SAndroid Build Coastguard Worker    add            dstq, strideq
2053*c0909341SAndroid Build Coastguard Worker    add           lumaq, lstrideq
2054*c0909341SAndroid Build Coastguard Worker%endif
2055*c0909341SAndroid Build Coastguard Worker    add      grain_lutq, 82<<%2
2056*c0909341SAndroid Build Coastguard Worker    sub              hb, 1+%2
2057*c0909341SAndroid Build Coastguard Worker%if %2
2058*c0909341SAndroid Build Coastguard Worker    jg %%loop_y_h_overlap
2059*c0909341SAndroid Build Coastguard Worker%else
2060*c0909341SAndroid Build Coastguard Worker    je %%end_y_hv_overlap
2061*c0909341SAndroid Build Coastguard Worker    vpbroadcastd    m13, [pb_17_27]
2062*c0909341SAndroid Build Coastguard Worker    add              hd, 0x80000000
2063*c0909341SAndroid Build Coastguard Worker    jnc %%loop_y_hv_overlap
2064*c0909341SAndroid Build Coastguard Worker    jmp %%loop_y_h_overlap
2065*c0909341SAndroid Build Coastguard Worker%endif
2066*c0909341SAndroid Build Coastguard Worker
2067*c0909341SAndroid Build Coastguard Worker%%end_y_hv_overlap:
2068*c0909341SAndroid Build Coastguard Worker    add              wq, 32>>%2
2069*c0909341SAndroid Build Coastguard Worker    jge .end
2070*c0909341SAndroid Build Coastguard Worker    mov            srcq, r11mp
2071*c0909341SAndroid Build Coastguard Worker    mov            dstq, r12mp
2072*c0909341SAndroid Build Coastguard Worker    lea           lumaq, [r14+wq*(1+%2)]
2073*c0909341SAndroid Build Coastguard Worker    add            srcq, wq
2074*c0909341SAndroid Build Coastguard Worker    add            dstq, wq
2075*c0909341SAndroid Build Coastguard Worker    jmp %%loop_x_hv_overlap
2076*c0909341SAndroid Build Coastguard Worker%endmacro
2077*c0909341SAndroid Build Coastguard Worker
2078*c0909341SAndroid Build Coastguard Worker    %%FGUV_32x32xN_LOOP 1, %2, %3
2079*c0909341SAndroid Build Coastguard Worker.csfl:
2080*c0909341SAndroid Build Coastguard Worker    %%FGUV_32x32xN_LOOP 0, %2, %3
2081*c0909341SAndroid Build Coastguard Worker.end:
2082*c0909341SAndroid Build Coastguard Worker    RET
2083*c0909341SAndroid Build Coastguard Worker%endmacro
2084*c0909341SAndroid Build Coastguard Worker
; Expand the chroma function-generating macros once per argument triple.
;
; FGUV_FN forwards its 2nd and 3rd arguments to %%FGUV_32x32xN_LOOP, where:
;   - 2nd argument = 1: each x step covers 16 chroma pixels instead of 32
;     (32>>%2), two chroma rows are handled per iteration (1+%2), and luma is
;     averaged in horizontal pairs before the scaling lookup.
;   - 3rd argument = 1: the second luma row is read at lstride*2 instead of
;     lstride*1 (1+%3), and luma advances 4 rows per iteration instead of 2
;     (2<<%3).
; The first argument (420/422/444) is presumably the chroma layout used in the
; generated symbol names, and GEN_GRAIN_UV_FN presumably takes the same
; triple -- neither definition is visible here; confirm against the macro
; headers earlier in the file.
2085*c0909341SAndroid Build Coastguard WorkerGEN_GRAIN_UV_FN 420, 1, 1
2086*c0909341SAndroid Build Coastguard WorkerFGUV_FN         420, 1, 1
2087*c0909341SAndroid Build Coastguard WorkerGEN_GRAIN_UV_FN 422, 1, 0
2088*c0909341SAndroid Build Coastguard WorkerFGUV_FN         422, 1, 0
2089*c0909341SAndroid Build Coastguard WorkerGEN_GRAIN_UV_FN 444, 0, 0
2090*c0909341SAndroid Build Coastguard WorkerFGUV_FN         444, 0, 0
2091*c0909341SAndroid Build Coastguard Worker
2092*c0909341SAndroid Build Coastguard Worker%endif ; ARCH_X86_64
2093