xref: /aosp_15_r20/external/libdav1d/src/x86/filmgrain16_sse.asm (revision c09093415860a1c2373dacd84c4fde00c507cdfd)
1*c0909341SAndroid Build Coastguard Worker; Copyright © 2021, VideoLAN and dav1d authors
2*c0909341SAndroid Build Coastguard Worker; Copyright © 2021, Two Orioles, LLC
3*c0909341SAndroid Build Coastguard Worker; All rights reserved.
4*c0909341SAndroid Build Coastguard Worker;
5*c0909341SAndroid Build Coastguard Worker; Redistribution and use in source and binary forms, with or without
6*c0909341SAndroid Build Coastguard Worker; modification, are permitted provided that the following conditions are met:
7*c0909341SAndroid Build Coastguard Worker;
8*c0909341SAndroid Build Coastguard Worker; 1. Redistributions of source code must retain the above copyright notice, this
9*c0909341SAndroid Build Coastguard Worker;    list of conditions and the following disclaimer.
10*c0909341SAndroid Build Coastguard Worker;
11*c0909341SAndroid Build Coastguard Worker; 2. Redistributions in binary form must reproduce the above copyright notice,
12*c0909341SAndroid Build Coastguard Worker;    this list of conditions and the following disclaimer in the documentation
13*c0909341SAndroid Build Coastguard Worker;    and/or other materials provided with the distribution.
14*c0909341SAndroid Build Coastguard Worker;
15*c0909341SAndroid Build Coastguard Worker; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
16*c0909341SAndroid Build Coastguard Worker; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
17*c0909341SAndroid Build Coastguard Worker; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
18*c0909341SAndroid Build Coastguard Worker; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
19*c0909341SAndroid Build Coastguard Worker; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
20*c0909341SAndroid Build Coastguard Worker; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
21*c0909341SAndroid Build Coastguard Worker; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
22*c0909341SAndroid Build Coastguard Worker; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
23*c0909341SAndroid Build Coastguard Worker; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
24*c0909341SAndroid Build Coastguard Worker; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
25*c0909341SAndroid Build Coastguard Worker
26*c0909341SAndroid Build Coastguard Worker%include "config.asm"
27*c0909341SAndroid Build Coastguard Worker%include "ext/x86/x86inc.asm"
28*c0909341SAndroid Build Coastguard Worker%include "x86/filmgrain_common.asm"
29*c0909341SAndroid Build Coastguard Worker
; Read-only constant tables for the film-grain kernels in this file.
; Label prefixes mirror the element width of each definition:
;   pb_ = bytes (db), pw_ = words (dw), pd_ = dwords (dd).
; Several tables below are not referenced in the part of the file reviewed
; here (pd_16, pw_16384, pw_8192, pw_23_22, pw_27_17_17_27, pw_seed_xor, pb_1,
; max, min); presumably they serve functions further down - TODO confirm.
30*c0909341SAndroid Build Coastguard WorkerSECTION_RODATA 16
31*c0909341SAndroid Build Coastguard Workerpd_16: times 4 dd 16
; pw_1: generate_grain_y_16bpc (.ar1) inserts its first word into the
; coefficient vector with pinsrw.
32*c0909341SAndroid Build Coastguard Workerpw_1: times 8 dw 1
33*c0909341SAndroid Build Coastguard Workerpw_16384: times 8 dw 16384
34*c0909341SAndroid Build Coastguard Workerpw_8192: times 8 dw 8192
35*c0909341SAndroid Build Coastguard Workerpw_23_22: dw 23, 22
36*c0909341SAndroid Build Coastguard Worker          times 3 dw 0, 32
; pb_mask: pshufb lookup table used by the seed-update loop of
; generate_grain_y_16bpc. It is read with mova, so the five 16-byte tables
; above must keep it on a 16-byte boundary (offset 80 from the section start).
37*c0909341SAndroid Build Coastguard Workerpb_mask: db 0, 0x80, 0x80, 0, 0x80, 0, 0, 0x80, 0x80, 0, 0, 0x80, 0, 0x80, 0x80, 0
38*c0909341SAndroid Build Coastguard Workerpw_27_17_17_27: dw 27, 17, 17, 27
39*c0909341SAndroid Build Coastguard Worker                times 2 dw 0, 32
; rnd_next_upperbit_mask: four per-lane word masks ANDed with the broadcast
; seed at the top of the seed-update loop.
40*c0909341SAndroid Build Coastguard Workerrnd_next_upperbit_mask: dw 0x100B, 0x2016, 0x402C, 0x8058
41*c0909341SAndroid Build Coastguard Workerpw_seed_xor: times 2 dw 0xb524
42*c0909341SAndroid Build Coastguard Worker             times 2 dw 0x49d8
43*c0909341SAndroid Build Coastguard Workerpb_1: times 4 db 1
; hmul_bits: pmulhuw multipliers applied to the seed in the seed-update loop.
; round: generate_grain_y_16bpc loads a scale from [round + r3*2 - 2], where
; r3 = FGData.grain_scale_shift minus 0 or 2. For small r3 that address lies
; inside the hmul_bits words directly above, so hmul_bits and round appear to
; form one descending power-of-two run and must stay adjacent, in this order.
; NOTE(review): confirm the valid range of grain_scale_shift against callers.
44*c0909341SAndroid Build Coastguard Workerhmul_bits: dw 32768, 16384, 8192, 4096
45*c0909341SAndroid Build Coastguard Workerround: dw 2048, 1024, 512
; mul_bits: pmullw multipliers in the seed-update loop (loaded with movq, so
; only the first four words are used there).
46*c0909341SAndroid Build Coastguard Workermul_bits: dw 256, 128, 64, 32, 16
; round_vals: rounding terms for the auto-regression filters, read from
; [round_vals + shift*2 - 12] with shift = FGData.ar_coeff_shift. Entry 0
; therefore corresponds to shift 6, and each entry equals 1 << (shift - 1).
47*c0909341SAndroid Build Coastguard Workerround_vals: dw 32, 64, 128, 256, 512, 1024
48*c0909341SAndroid Build Coastguard Workermax: dw 256*4-1, 240*4, 235*4, 256*16-1, 240*16, 235*16
49*c0909341SAndroid Build Coastguard Workermin: dw 0, 16*4, 16*16
50*c0909341SAndroid Build Coastguard Worker; these two should be next to each other
51*c0909341SAndroid Build Coastguard Workerpw_4: times 2 dw 4
52*c0909341SAndroid Build Coastguard Workerpw_16: times 2 dw 16
53*c0909341SAndroid Build Coastguard Worker
; JMP_TABLE name, suffix0 [, suffix1 ...]
;
; Emits a table of 32-bit offsets at the current position and makes it
; reachable as <name>_table. One dword is emitted per suffix argument; each
; holds the distance from the table start to the local label
; <mangled private_prefix_name>.ar<suffix>.
;
; Consumers (see the auto-regression dispatch in generate_grain_y_16bpc) load
; entry [table + index*4] sign-extended, add the table address back and jump
; to the result.
;
; The table name and the mangled function prefix are captured with xdefine
; BEFORE the rotate loop starts, so they keep referring to the original first
; argument while the loop rotates the suffixes through the second slot.
54*c0909341SAndroid Build Coastguard Worker%macro JMP_TABLE 1-*
55*c0909341SAndroid Build Coastguard Worker    %xdefine %1_table %%table
56*c0909341SAndroid Build Coastguard Worker    %xdefine %%base %1_table
57*c0909341SAndroid Build Coastguard Worker    %xdefine %%prefix mangle(private_prefix %+ _%1)
58*c0909341SAndroid Build Coastguard Worker    %%table:
    ; one entry per suffix argument (argument count minus the name)
59*c0909341SAndroid Build Coastguard Worker    %rep %0 - 1
60*c0909341SAndroid Build Coastguard Worker        dd %%prefix %+ .ar%2 - %%base
61*c0909341SAndroid Build Coastguard Worker        %rotate 1
62*c0909341SAndroid Build Coastguard Worker    %endrep
63*c0909341SAndroid Build Coastguard Worker%endmacro
64*c0909341SAndroid Build Coastguard Worker
; Dispatch tables for the auto-regression stage, one per grain generator.
; Each lists the local labels .ar0 to .ar3 of the named function; the index
; used at the dispatch site is FGData.ar_coeff_lag. Only the luma (y) table is
; consumed in the part of the file reviewed here; the uv tables presumably
; belong to generators defined further down - TODO confirm.
65*c0909341SAndroid Build Coastguard WorkerJMP_TABLE generate_grain_y_16bpc_ssse3, 0, 1, 2, 3
66*c0909341SAndroid Build Coastguard WorkerJMP_TABLE generate_grain_uv_420_16bpc_ssse3, 0, 1, 2, 3
67*c0909341SAndroid Build Coastguard WorkerJMP_TABLE generate_grain_uv_422_16bpc_ssse3, 0, 1, 2, 3
68*c0909341SAndroid Build Coastguard WorkerJMP_TABLE generate_grain_uv_444_16bpc_ssse3, 0, 1, 2, 3
69*c0909341SAndroid Build Coastguard Worker
70*c0909341SAndroid Build Coastguard WorkerSECTION .text
71*c0909341SAndroid Build Coastguard Worker
; PIC_ptr(a): address expression for the data symbol a.
;   x86-32: expands to base+a. Each function defines base itself after loading
;           an anchor address into a register (generate_grain_y_16bpc uses
;           r4-$$), so any stale definition of base is cleared here first.
;   x86-64: expands to the bare symbol.
72*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
73*c0909341SAndroid Build Coastguard Worker%undef base
74*c0909341SAndroid Build Coastguard Worker%define PIC_ptr(a) base+a
75*c0909341SAndroid Build Coastguard Worker%else
76*c0909341SAndroid Build Coastguard Worker%define PIC_ptr(a) a
77*c0909341SAndroid Build Coastguard Worker%endif
78*c0909341SAndroid Build Coastguard Worker
; m(x): mangled symbol name of function x, built from private_prefix, an
; underscore, x and SUFFIX. SUFFIX is presumably the instruction-set suffix
; set by INIT_XMM - TODO confirm against x86inc.
79*c0909341SAndroid Build Coastguard Worker%define m(x) mangle(private_prefix %+ _ %+ x %+ SUFFIX)
80*c0909341SAndroid Build Coastguard Worker
; vpgatherdw dst, src, base, gpr_a, gpr_b [, cnt = 8 [, stride = 1 [, tmp_xmm]]]
;
; Software gather of 16-bit elements: for each of the first cnt words in src,
; treated as an unsigned index, loads the word at [base + index*stride] into
; the matching word lane of dst.
;
;   dst      vector register receiving the gathered words
;   src      vector register holding the word indices
;   base     address expression the indices are applied to
;   gpr_a/b  scratch general-purpose registers, given WITHOUT a size suffix
;            (the macro appends d and w itself); both are clobbered
;   cnt      number of indices to process; two are handled per step. Only 8
;            triggers the extra shuffles that expose words 4 to 7.
;   stride   scale applied to each index inside the address expression
;   tmp_xmm  optional scratch vector register; when omitted, src doubles as
;            the scratch register and is destroyed
;
; Side effects: shr modifies the flags. The first element is fetched with
; movd, a 32-bit load: it clears the upper lanes of dst and reads two bytes
; beyond the addressed word (lane 1 is overwritten by the following pinsrw).
81*c0909341SAndroid Build Coastguard Worker%macro vpgatherdw 5-8 8, 1 ; dst, src, base, tmp_gpr[x2], cnt, stride, tmp_xmm_reg
82*c0909341SAndroid Build Coastguard Worker%assign %%idx 0
83*c0909341SAndroid Build Coastguard Worker%define %%tmp %2
84*c0909341SAndroid Build Coastguard Worker%if %0 == 8
85*c0909341SAndroid Build Coastguard Worker%define %%tmp %8
86*c0909341SAndroid Build Coastguard Worker%endif
87*c0909341SAndroid Build Coastguard Worker%rep (%6/2)
88*c0909341SAndroid Build Coastguard Worker%if %%idx == 0
    ; first pair: take indices 0 and 1 straight from src, then stage indices
    ; 2 and 3 in the low dword of the scratch register for the next step
89*c0909341SAndroid Build Coastguard Worker    movd        %5 %+ d, %2
90*c0909341SAndroid Build Coastguard Worker    pshuflw       %%tmp, %2, q3232
91*c0909341SAndroid Build Coastguard Worker%else
92*c0909341SAndroid Build Coastguard Worker    movd        %5 %+ d, %%tmp
93*c0909341SAndroid Build Coastguard Worker%if %6 == 8
    ; eight-index form only: after consuming a pair, bring the next pair down
    ; into the low dword (upper qword first, then its upper half)
94*c0909341SAndroid Build Coastguard Worker%if %%idx == 2
95*c0909341SAndroid Build Coastguard Worker    punpckhqdq    %%tmp, %%tmp
96*c0909341SAndroid Build Coastguard Worker%elif %%idx == 4
97*c0909341SAndroid Build Coastguard Worker    psrlq         %%tmp, 32
98*c0909341SAndroid Build Coastguard Worker%endif
99*c0909341SAndroid Build Coastguard Worker%endif
100*c0909341SAndroid Build Coastguard Worker%endif
    ; split the packed pair: even index into the first scratch gpr, odd index
    ; stays in the second one after the shift
101*c0909341SAndroid Build Coastguard Worker    movzx       %4 %+ d, %5 %+ w
102*c0909341SAndroid Build Coastguard Worker    shr         %5 %+ d, 16
103*c0909341SAndroid Build Coastguard Worker
104*c0909341SAndroid Build Coastguard Worker%if %%idx == 0
105*c0909341SAndroid Build Coastguard Worker    movd             %1, [%3+%4*%7]
106*c0909341SAndroid Build Coastguard Worker%else
107*c0909341SAndroid Build Coastguard Worker    pinsrw           %1, [%3+%4*%7], %%idx + 0
108*c0909341SAndroid Build Coastguard Worker%endif
109*c0909341SAndroid Build Coastguard Worker    pinsrw           %1, [%3+%5*%7], %%idx + 1
110*c0909341SAndroid Build Coastguard Worker%assign %%idx %%idx+2
111*c0909341SAndroid Build Coastguard Worker%endrep
112*c0909341SAndroid Build Coastguard Worker%endmacro
113*c0909341SAndroid Build Coastguard Worker
; SPLATD dst, src
;
; Broadcasts the low dword of src to all four dword lanes of dst.
; When src is textually different from dst (memory operand or another
; register) it is first loaded with movd; when both arguments are the same
; register the load is skipped and the register is shuffled in place.
114*c0909341SAndroid Build Coastguard Worker%macro SPLATD 2 ; dst, src
115*c0909341SAndroid Build Coastguard Worker%ifnidn %1, %2
116*c0909341SAndroid Build Coastguard Worker    movd %1, %2
117*c0909341SAndroid Build Coastguard Worker%endif
118*c0909341SAndroid Build Coastguard Worker    pshufd %1, %1, q0000
119*c0909341SAndroid Build Coastguard Worker%endmacro
120*c0909341SAndroid Build Coastguard Worker
; SPLATW dst, src
;
; Broadcasts the low word of src to all eight word lanes of dst.
; When src is textually different from dst it is first loaded with movd. That
; is a 32-bit load, so a memory src must have four readable bytes even though
; only the low two are used. The word is then replicated across the low qword
; (pshuflw) and the low qword is copied into the high qword (punpcklqdq).
121*c0909341SAndroid Build Coastguard Worker%macro SPLATW 2 ; dst, src
122*c0909341SAndroid Build Coastguard Worker%ifnidn %1, %2
123*c0909341SAndroid Build Coastguard Worker    movd %1, %2
124*c0909341SAndroid Build Coastguard Worker%endif
125*c0909341SAndroid Build Coastguard Worker    pshuflw %1, %1, q0000
126*c0909341SAndroid Build Coastguard Worker    punpcklqdq %1, %1
127*c0909341SAndroid Build Coastguard Worker%endmacro
128*c0909341SAndroid Build Coastguard Worker
129*c0909341SAndroid Build Coastguard Worker
130*c0909341SAndroid Build Coastguard WorkerINIT_XMM ssse3
131*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
132*c0909341SAndroid Build Coastguard Workercglobal generate_grain_y_16bpc, 3, 8, 16, buf, fg_data, bdmax
133*c0909341SAndroid Build Coastguard Worker    lea              r4, [pb_mask]
134*c0909341SAndroid Build Coastguard Worker%define base r4-pb_mask
135*c0909341SAndroid Build Coastguard Worker%else
136*c0909341SAndroid Build Coastguard Workercglobal generate_grain_y_16bpc, 3, 6, 8, buf, fg_data, bdmax
137*c0909341SAndroid Build Coastguard Worker    LEA              r4, $$
138*c0909341SAndroid Build Coastguard Worker%define base r4-$$
139*c0909341SAndroid Build Coastguard Worker%endif
140*c0909341SAndroid Build Coastguard Worker    movq             m1, [base+rnd_next_upperbit_mask]
141*c0909341SAndroid Build Coastguard Worker    movq             m4, [base+mul_bits]
142*c0909341SAndroid Build Coastguard Worker    movq             m7, [base+hmul_bits]
143*c0909341SAndroid Build Coastguard Worker    mov             r3d, [fg_dataq+FGData.grain_scale_shift]
144*c0909341SAndroid Build Coastguard Worker    lea             r5d, [bdmaxq+1]
145*c0909341SAndroid Build Coastguard Worker    shr             r5d, 11             ; 0 for 10bpc, 2 for 12bpc
146*c0909341SAndroid Build Coastguard Worker    sub              r3, r5
147*c0909341SAndroid Build Coastguard Worker    SPLATW           m6, [base+round+r3*2-2]
148*c0909341SAndroid Build Coastguard Worker    mova             m5, [base+pb_mask]
149*c0909341SAndroid Build Coastguard Worker    SPLATW           m0, [fg_dataq+FGData.seed]
150*c0909341SAndroid Build Coastguard Worker    mov              r3, -73*82*2
151*c0909341SAndroid Build Coastguard Worker    sub            bufq, r3
152*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
153*c0909341SAndroid Build Coastguard Worker    lea              r6, [gaussian_sequence]
154*c0909341SAndroid Build Coastguard Worker%endif
155*c0909341SAndroid Build Coastguard Worker.loop:
156*c0909341SAndroid Build Coastguard Worker    pand             m2, m0, m1
157*c0909341SAndroid Build Coastguard Worker    psrlw            m3, m2, 10
158*c0909341SAndroid Build Coastguard Worker    por              m2, m3             ; bits 0xf, 0x1e, 0x3c and 0x78 are set
159*c0909341SAndroid Build Coastguard Worker    pmullw           m2, m4             ; bits 0x0f00 are set
160*c0909341SAndroid Build Coastguard Worker    pshufb           m3, m5, m2         ; set 15th bit for next 4 seeds
161*c0909341SAndroid Build Coastguard Worker    psllq            m2, m3, 30
162*c0909341SAndroid Build Coastguard Worker    por              m2, m3
163*c0909341SAndroid Build Coastguard Worker    psllq            m3, m2, 15
164*c0909341SAndroid Build Coastguard Worker    por              m2, m3             ; aggregate each bit into next seed's high bit
165*c0909341SAndroid Build Coastguard Worker    pmulhuw          m3, m0, m7
166*c0909341SAndroid Build Coastguard Worker    por              m2, m3             ; 4 next output seeds
167*c0909341SAndroid Build Coastguard Worker    pshuflw          m0, m2, q3333
168*c0909341SAndroid Build Coastguard Worker    psrlw            m2, 5
169*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
170*c0909341SAndroid Build Coastguard Worker    vpgatherdw       m3, m2, r6, r5, r7, 4, 2
171*c0909341SAndroid Build Coastguard Worker%else
172*c0909341SAndroid Build Coastguard Worker    vpgatherdw       m3, m2, base+gaussian_sequence, r5, r2, 4, 2
173*c0909341SAndroid Build Coastguard Worker%endif
174*c0909341SAndroid Build Coastguard Worker    paddw            m3, m3             ; otherwise bpc=12 w/ grain_scale_shift=0
175*c0909341SAndroid Build Coastguard Worker                                        ; shifts by 0, which pmulhrsw does not support
176*c0909341SAndroid Build Coastguard Worker    pmulhrsw         m3, m6
177*c0909341SAndroid Build Coastguard Worker    movq      [bufq+r3], m3
178*c0909341SAndroid Build Coastguard Worker    add              r3, 4*2
179*c0909341SAndroid Build Coastguard Worker    jl .loop
180*c0909341SAndroid Build Coastguard Worker
181*c0909341SAndroid Build Coastguard Worker    ; auto-regression code
182*c0909341SAndroid Build Coastguard Worker    movsxd           r3, [fg_dataq+FGData.ar_coeff_lag]
183*c0909341SAndroid Build Coastguard Worker    movsxd           r3, [base+generate_grain_y_16bpc_ssse3_table+r3*4]
184*c0909341SAndroid Build Coastguard Worker    lea              r3, [r3+base+generate_grain_y_16bpc_ssse3_table]
185*c0909341SAndroid Build Coastguard Worker    jmp              r3
186*c0909341SAndroid Build Coastguard Worker
187*c0909341SAndroid Build Coastguard Worker.ar1:
188*c0909341SAndroid Build Coastguard Worker%if WIN64
189*c0909341SAndroid Build Coastguard Worker    DEFINE_ARGS shift, fg_data, max, buf, val3, min, cf3, x, val0
190*c0909341SAndroid Build Coastguard Worker    lea            bufq, [r0-2*(82*73-(82*3+79))]
191*c0909341SAndroid Build Coastguard Worker    PUSH             r8
192*c0909341SAndroid Build Coastguard Worker%else
193*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
194*c0909341SAndroid Build Coastguard Worker    DEFINE_ARGS buf, fg_data, max, shift, val3, min, cf3, x, val0
195*c0909341SAndroid Build Coastguard Worker%else ; x86-32
196*c0909341SAndroid Build Coastguard Worker    DEFINE_ARGS buf, fg_data, min, val3, x, cf3, val0
197*c0909341SAndroid Build Coastguard Worker    PUSH             r6
198*c0909341SAndroid Build Coastguard Worker%define shiftd r1d
199*c0909341SAndroid Build Coastguard Worker%endif
200*c0909341SAndroid Build Coastguard Worker    sub            bufq, 2*(82*73-(82*3+79))
201*c0909341SAndroid Build Coastguard Worker%endif
202*c0909341SAndroid Build Coastguard Worker    movsx          cf3d, byte [fg_dataq+FGData.ar_coeffs_y+3]
203*c0909341SAndroid Build Coastguard Worker    movd             m4, [fg_dataq+FGData.ar_coeffs_y]
204*c0909341SAndroid Build Coastguard Worker    mov          shiftd, [fg_dataq+FGData.ar_coeff_shift]
205*c0909341SAndroid Build Coastguard Worker%if WIN64
206*c0909341SAndroid Build Coastguard Worker    DEFINE_ARGS shift, h, max, buf, val3, min, cf3, x, val0
207*c0909341SAndroid Build Coastguard Worker%elif ARCH_X86_64
208*c0909341SAndroid Build Coastguard Worker    DEFINE_ARGS buf, h, max, shift, val3, min, cf3, x, val0
209*c0909341SAndroid Build Coastguard Worker%else ; x86-32
210*c0909341SAndroid Build Coastguard Worker%undef shiftd
211*c0909341SAndroid Build Coastguard Worker    DEFINE_ARGS buf, shift, min, val3, x, cf3, val0
212*c0909341SAndroid Build Coastguard Worker%define hd dword r0m
213*c0909341SAndroid Build Coastguard Worker%define maxd dword minm
214*c0909341SAndroid Build Coastguard Worker%endif
215*c0909341SAndroid Build Coastguard Worker%if cpuflag(sse4)
216*c0909341SAndroid Build Coastguard Worker    pmovsxbw         m4, m4
217*c0909341SAndroid Build Coastguard Worker%else
218*c0909341SAndroid Build Coastguard Worker    pxor             m3, m3
219*c0909341SAndroid Build Coastguard Worker    pcmpgtb          m3, m4
220*c0909341SAndroid Build Coastguard Worker    punpcklbw        m4, m3
221*c0909341SAndroid Build Coastguard Worker%endif
222*c0909341SAndroid Build Coastguard Worker    pinsrw           m4, [base+pw_1], 3
223*c0909341SAndroid Build Coastguard Worker    pshufd           m5, m4, q1111
224*c0909341SAndroid Build Coastguard Worker    pshufd           m4, m4, q0000
225*c0909341SAndroid Build Coastguard Worker    SPLATW           m3, [base+round_vals+shiftq*2-12]    ; rnd
226*c0909341SAndroid Build Coastguard Worker    mov              hd, 70
227*c0909341SAndroid Build Coastguard Worker    sar            maxd, 1
228*c0909341SAndroid Build Coastguard Worker    mov            mind, maxd
229*c0909341SAndroid Build Coastguard Worker    xor            mind, -1
230*c0909341SAndroid Build Coastguard Worker.y_loop_ar1:
231*c0909341SAndroid Build Coastguard Worker    mov              xq, -76
232*c0909341SAndroid Build Coastguard Worker    movsx         val3d, word [bufq+xq*2-2]
233*c0909341SAndroid Build Coastguard Worker.x_loop_ar1:
234*c0909341SAndroid Build Coastguard Worker    movu             m0, [bufq+xq*2-82*2-2]     ; top/left
235*c0909341SAndroid Build Coastguard Worker    psrldq           m2, m0, 2                  ; top
236*c0909341SAndroid Build Coastguard Worker    psrldq           m1, m0, 4                  ; top/right
237*c0909341SAndroid Build Coastguard Worker    punpcklwd        m0, m2
238*c0909341SAndroid Build Coastguard Worker    punpcklwd        m1, m3
239*c0909341SAndroid Build Coastguard Worker    pmaddwd          m0, m4
240*c0909341SAndroid Build Coastguard Worker    pmaddwd          m1, m5
241*c0909341SAndroid Build Coastguard Worker    paddd            m0, m1
242*c0909341SAndroid Build Coastguard Worker.x_loop_ar1_inner:
243*c0909341SAndroid Build Coastguard Worker    movd          val0d, m0
244*c0909341SAndroid Build Coastguard Worker    psrldq           m0, 4
245*c0909341SAndroid Build Coastguard Worker    imul          val3d, cf3d
246*c0909341SAndroid Build Coastguard Worker    add           val3d, val0d
247*c0909341SAndroid Build Coastguard Worker    sar           val3d, shiftb
248*c0909341SAndroid Build Coastguard Worker    movsx         val0d, word [bufq+xq*2]
249*c0909341SAndroid Build Coastguard Worker    add           val3d, val0d
250*c0909341SAndroid Build Coastguard Worker    cmp           val3d, maxd
251*c0909341SAndroid Build Coastguard Worker    cmovg         val3d, maxd
252*c0909341SAndroid Build Coastguard Worker    cmp           val3d, mind
253*c0909341SAndroid Build Coastguard Worker    cmovl         val3d, mind
254*c0909341SAndroid Build Coastguard Worker    mov word [bufq+xq*2], val3w
255*c0909341SAndroid Build Coastguard Worker    ; keep val3d in-place as left for next x iteration
256*c0909341SAndroid Build Coastguard Worker    inc              xq
257*c0909341SAndroid Build Coastguard Worker    jz .x_loop_ar1_end
258*c0909341SAndroid Build Coastguard Worker    test             xq, 3
259*c0909341SAndroid Build Coastguard Worker    jnz .x_loop_ar1_inner
260*c0909341SAndroid Build Coastguard Worker    jmp .x_loop_ar1
261*c0909341SAndroid Build Coastguard Worker
262*c0909341SAndroid Build Coastguard Worker.x_loop_ar1_end:
263*c0909341SAndroid Build Coastguard Worker    add            bufq, 82*2
264*c0909341SAndroid Build Coastguard Worker    dec              hd
265*c0909341SAndroid Build Coastguard Worker    jg .y_loop_ar1
266*c0909341SAndroid Build Coastguard Worker%if WIN64
267*c0909341SAndroid Build Coastguard Worker    POP              r8
268*c0909341SAndroid Build Coastguard Worker%elif ARCH_X86_32
269*c0909341SAndroid Build Coastguard Worker    POP              r6
270*c0909341SAndroid Build Coastguard Worker%undef maxd
271*c0909341SAndroid Build Coastguard Worker%undef hd
272*c0909341SAndroid Build Coastguard Worker%endif
273*c0909341SAndroid Build Coastguard Worker.ar0:
274*c0909341SAndroid Build Coastguard Worker    RET
275*c0909341SAndroid Build Coastguard Worker
276*c0909341SAndroid Build Coastguard Worker.ar2:
277*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
278*c0909341SAndroid Build Coastguard Worker    ALLOC_STACK -16*8
279*c0909341SAndroid Build Coastguard Worker%endif
280*c0909341SAndroid Build Coastguard Worker    DEFINE_ARGS buf, fg_data, bdmax, shift
281*c0909341SAndroid Build Coastguard Worker    mov          shiftd, [fg_dataq+FGData.ar_coeff_shift]
282*c0909341SAndroid Build Coastguard Worker    movd             m0, [base+round_vals-12+shiftq*2]
283*c0909341SAndroid Build Coastguard Worker    pshuflw          m0, m0, q0000
284*c0909341SAndroid Build Coastguard Worker    movu             m6, [fg_dataq+FGData.ar_coeffs_y+0]    ; cf0-11
285*c0909341SAndroid Build Coastguard Worker    pxor             m2, m2
286*c0909341SAndroid Build Coastguard Worker    punpcklwd        m0, m2
287*c0909341SAndroid Build Coastguard Worker    pcmpgtb          m2, m6
288*c0909341SAndroid Build Coastguard Worker    punpckhbw        m3, m6, m2
289*c0909341SAndroid Build Coastguard Worker    punpcklbw        m6, m2
290*c0909341SAndroid Build Coastguard Worker    pshufd           m2, m6, q3333
291*c0909341SAndroid Build Coastguard Worker    pshufd           m1, m6, q2222
292*c0909341SAndroid Build Coastguard Worker    pshufd           m7, m6, q1111
293*c0909341SAndroid Build Coastguard Worker    pshufd           m6, m6, q0000
294*c0909341SAndroid Build Coastguard Worker    pshufd           m4, m3, q1111
295*c0909341SAndroid Build Coastguard Worker    pshufd           m3, m3, q0000
296*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
297*c0909341SAndroid Build Coastguard Worker    SWAP              0, 12
298*c0909341SAndroid Build Coastguard Worker    SWAP              1, 8
299*c0909341SAndroid Build Coastguard Worker    SWAP              2, 9
300*c0909341SAndroid Build Coastguard Worker    SWAP              3, 10
301*c0909341SAndroid Build Coastguard Worker    SWAP              4, 11
302*c0909341SAndroid Build Coastguard Worker%else
303*c0909341SAndroid Build Coastguard Worker%define m12 [rsp+0*16]
304*c0909341SAndroid Build Coastguard Worker%define m8 [rsp+1*16]
305*c0909341SAndroid Build Coastguard Worker%define m9 [rsp+2*16]
306*c0909341SAndroid Build Coastguard Worker%define m10 [rsp+3*16]
307*c0909341SAndroid Build Coastguard Worker%define m11 [rsp+4*16]
308*c0909341SAndroid Build Coastguard Worker    mova            m12, m0
309*c0909341SAndroid Build Coastguard Worker    mova             m8, m1
310*c0909341SAndroid Build Coastguard Worker    mova             m9, m2
311*c0909341SAndroid Build Coastguard Worker    mova            m10, m3
312*c0909341SAndroid Build Coastguard Worker    mova            m11, m4
313*c0909341SAndroid Build Coastguard Worker    mov          bdmaxd, bdmaxm
314*c0909341SAndroid Build Coastguard Worker%endif
315*c0909341SAndroid Build Coastguard Worker    sar          bdmaxd, 1
316*c0909341SAndroid Build Coastguard Worker    SPLATW           m0, bdmaxd                             ; max_grain
317*c0909341SAndroid Build Coastguard Worker    pcmpeqw          m1, m1
318*c0909341SAndroid Build Coastguard Worker%if !cpuflag(sse4)
319*c0909341SAndroid Build Coastguard Worker    pcmpeqw          m2, m2
320*c0909341SAndroid Build Coastguard Worker    psrldq           m2, 14
321*c0909341SAndroid Build Coastguard Worker    pslldq           m2, 2
322*c0909341SAndroid Build Coastguard Worker    pxor             m2, m1
323*c0909341SAndroid Build Coastguard Worker%endif
324*c0909341SAndroid Build Coastguard Worker    pxor             m1, m0                                 ; min_grain
325*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
326*c0909341SAndroid Build Coastguard Worker    SWAP              0, 13
327*c0909341SAndroid Build Coastguard Worker    SWAP              1, 14
328*c0909341SAndroid Build Coastguard Worker    SWAP              2, 15
329*c0909341SAndroid Build Coastguard Worker%else
330*c0909341SAndroid Build Coastguard Worker%define m13 [rsp+5*16]
331*c0909341SAndroid Build Coastguard Worker%define m14 [rsp+6*16]
332*c0909341SAndroid Build Coastguard Worker    mova            m13, m0
333*c0909341SAndroid Build Coastguard Worker    mova            m14, m1
334*c0909341SAndroid Build Coastguard Worker%if !cpuflag(sse4)
335*c0909341SAndroid Build Coastguard Worker%define m15 [rsp+7*16]
336*c0909341SAndroid Build Coastguard Worker    mova            m15, m2
337*c0909341SAndroid Build Coastguard Worker%endif
338*c0909341SAndroid Build Coastguard Worker%endif
339*c0909341SAndroid Build Coastguard Worker    sub            bufq, 2*(82*73-(82*3+79))
340*c0909341SAndroid Build Coastguard Worker    DEFINE_ARGS buf, fg_data, h, x
341*c0909341SAndroid Build Coastguard Worker    mov              hd, 70
342*c0909341SAndroid Build Coastguard Worker.y_loop_ar2:
343*c0909341SAndroid Build Coastguard Worker    mov              xq, -76
344*c0909341SAndroid Build Coastguard Worker
345*c0909341SAndroid Build Coastguard Worker.x_loop_ar2:
346*c0909341SAndroid Build Coastguard Worker    movu             m0, [bufq+xq*2-82*4-4]     ; y=-2,x=[-2,+5]
347*c0909341SAndroid Build Coastguard Worker    movu             m1, [bufq+xq*2-82*2-4]     ; y=-1,x=[-2,+5]
348*c0909341SAndroid Build Coastguard Worker    psrldq           m2, m0, 2
349*c0909341SAndroid Build Coastguard Worker    psrldq           m3, m0, 4
350*c0909341SAndroid Build Coastguard Worker    psrldq           m4, m0, 6
351*c0909341SAndroid Build Coastguard Worker    psrldq           m5, m0, 8
352*c0909341SAndroid Build Coastguard Worker    punpcklwd        m0, m2
353*c0909341SAndroid Build Coastguard Worker    punpcklwd        m3, m4
354*c0909341SAndroid Build Coastguard Worker    punpcklwd        m5, m1
355*c0909341SAndroid Build Coastguard Worker    psrldq           m2, m1, 2
356*c0909341SAndroid Build Coastguard Worker    psrldq           m4, m1, 4
357*c0909341SAndroid Build Coastguard Worker    punpcklwd        m2, m4
358*c0909341SAndroid Build Coastguard Worker    psrldq           m4, m1, 6
359*c0909341SAndroid Build Coastguard Worker    psrldq           m1, 8
360*c0909341SAndroid Build Coastguard Worker    punpcklwd        m4, m1
361*c0909341SAndroid Build Coastguard Worker    pmaddwd          m0, m6
362*c0909341SAndroid Build Coastguard Worker    pmaddwd          m3, m7
363*c0909341SAndroid Build Coastguard Worker    pmaddwd          m5, m8
364*c0909341SAndroid Build Coastguard Worker    pmaddwd          m2, m9
365*c0909341SAndroid Build Coastguard Worker    pmaddwd          m4, m10
366*c0909341SAndroid Build Coastguard Worker    paddd            m0, m3
367*c0909341SAndroid Build Coastguard Worker    paddd            m5, m2
368*c0909341SAndroid Build Coastguard Worker    paddd            m0, m4
369*c0909341SAndroid Build Coastguard Worker    paddd            m0, m5                     ; accumulated top 2 rows
370*c0909341SAndroid Build Coastguard Worker    paddd            m0, m12
371*c0909341SAndroid Build Coastguard Worker
372*c0909341SAndroid Build Coastguard Worker    movu             m1, [bufq+xq*2-4]      ; y=0,x=[-2,+5]
373*c0909341SAndroid Build Coastguard Worker    pshufd           m4, m1, q3321
374*c0909341SAndroid Build Coastguard Worker    pxor             m2, m2
375*c0909341SAndroid Build Coastguard Worker    pcmpgtw          m2, m4
376*c0909341SAndroid Build Coastguard Worker    punpcklwd        m4, m2                 ; in dwords, y=0,x=[0,3]
377*c0909341SAndroid Build Coastguard Worker.x_loop_ar2_inner:
378*c0909341SAndroid Build Coastguard Worker    pmaddwd          m2, m1, m11
379*c0909341SAndroid Build Coastguard Worker    paddd            m2, m0
380*c0909341SAndroid Build Coastguard Worker    psrldq           m0, 4                  ; shift top to next pixel
381*c0909341SAndroid Build Coastguard Worker    psrad            m2, [fg_dataq+FGData.ar_coeff_shift]
382*c0909341SAndroid Build Coastguard Worker    paddd            m2, m4
383*c0909341SAndroid Build Coastguard Worker    packssdw         m2, m2
384*c0909341SAndroid Build Coastguard Worker    pminsw           m2, m13
385*c0909341SAndroid Build Coastguard Worker    pmaxsw           m2, m14
386*c0909341SAndroid Build Coastguard Worker    psrldq           m4, 4
387*c0909341SAndroid Build Coastguard Worker    pslldq           m2, 2
388*c0909341SAndroid Build Coastguard Worker    psrldq           m1, 2
389*c0909341SAndroid Build Coastguard Worker%if cpuflag(sse4)
390*c0909341SAndroid Build Coastguard Worker    pblendw          m1, m2, 00000010b
391*c0909341SAndroid Build Coastguard Worker%else
392*c0909341SAndroid Build Coastguard Worker    pand             m1, m15
393*c0909341SAndroid Build Coastguard Worker    pandn            m3, m15, m2
394*c0909341SAndroid Build Coastguard Worker    por              m1, m3
395*c0909341SAndroid Build Coastguard Worker%endif
396*c0909341SAndroid Build Coastguard Worker    ; overwrite previous pixel, this should be ok
397*c0909341SAndroid Build Coastguard Worker    movd  [bufq+xq*2-2], m1
398*c0909341SAndroid Build Coastguard Worker    inc              xq
399*c0909341SAndroid Build Coastguard Worker    jz .x_loop_ar2_end
400*c0909341SAndroid Build Coastguard Worker    test             xq, 3
401*c0909341SAndroid Build Coastguard Worker    jnz .x_loop_ar2_inner
402*c0909341SAndroid Build Coastguard Worker    jmp .x_loop_ar2
403*c0909341SAndroid Build Coastguard Worker
404*c0909341SAndroid Build Coastguard Worker.x_loop_ar2_end:
405*c0909341SAndroid Build Coastguard Worker    add            bufq, 82*2
406*c0909341SAndroid Build Coastguard Worker    dec              hd
407*c0909341SAndroid Build Coastguard Worker    jg .y_loop_ar2
408*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
409*c0909341SAndroid Build Coastguard Worker%undef m8
410*c0909341SAndroid Build Coastguard Worker%undef m9
411*c0909341SAndroid Build Coastguard Worker%undef m10
412*c0909341SAndroid Build Coastguard Worker%undef m11
413*c0909341SAndroid Build Coastguard Worker%undef m12
414*c0909341SAndroid Build Coastguard Worker%undef m13
415*c0909341SAndroid Build Coastguard Worker%undef m14
416*c0909341SAndroid Build Coastguard Worker%undef m15
417*c0909341SAndroid Build Coastguard Worker%endif
418*c0909341SAndroid Build Coastguard Worker    RET
419*c0909341SAndroid Build Coastguard Worker
; .ar3 -- auto-regression pass over the luma grain buffer using taps from the
; three rows above (x=-3..+3) and the three pixels to the left.
;
; Setup: max_grain = bdmax >> 1 and min_grain = ~max_grain end up in m15/m14.
; The signed byte coefficients at FGData.ar_coeffs_y are sign-extended to
; words and spread as (cfN, cfN+1) pairs over m5-m12 and the 64-byte scratch
; area `tmp`, ready to be used as pmaddwd operands. On x86-32, m8-m15 are not
; registers but %defines for stack slots.
;
; Loop: 70 rows, x running from -76 up to 0. For each group of four pixels
; the contribution of the three rows above (plus rounding) is computed at
; once into m0; the current-row taps are then applied one pixel at a time in
; .x_loop_ar3_inner, since every output pixel is an input of the next one.
420*c0909341SAndroid Build Coastguard Worker.ar3:
421*c0909341SAndroid Build Coastguard Worker    DEFINE_ARGS buf, fg_data, bdmax, shift
422*c0909341SAndroid Build Coastguard Worker%if WIN64
    ; keep the incoming rsp in r6 (restored before RET) and carve out an
    ; aligned 64-byte scratch area
423*c0909341SAndroid Build Coastguard Worker    mov              r6, rsp
424*c0909341SAndroid Build Coastguard Worker    and             rsp, ~15
425*c0909341SAndroid Build Coastguard Worker    sub             rsp, 64
426*c0909341SAndroid Build Coastguard Worker    %define         tmp  rsp
427*c0909341SAndroid Build Coastguard Worker%elif ARCH_X86_64
    ; NOTE(review): rsp is not adjusted here, so this presumably relies on
    ; the area below the return address being usable -- confirm
428*c0909341SAndroid Build Coastguard Worker    %define         tmp  rsp+stack_offset-72
429*c0909341SAndroid Build Coastguard Worker%else
    ; 12 slots of 16 bytes: tmp uses the first four, m8-m15 the remaining
    ; eight (see the %defines below)
430*c0909341SAndroid Build Coastguard Worker    ALLOC_STACK  -16*12
431*c0909341SAndroid Build Coastguard Worker    %define         tmp  rsp
432*c0909341SAndroid Build Coastguard Worker    mov          bdmaxd, bdmaxm
433*c0909341SAndroid Build Coastguard Worker%endif
434*c0909341SAndroid Build Coastguard Worker    sar          bdmaxd, 1
435*c0909341SAndroid Build Coastguard Worker    SPLATW           m7, bdmaxd                                 ; max_grain
436*c0909341SAndroid Build Coastguard Worker    pcmpeqw          m6, m6
437*c0909341SAndroid Build Coastguard Worker%if !cpuflag(sse4)
    ; m4 = all-ones except word 2; used in the inner loop as the and/andn
    ; mask that stands in for pblendw m1, m2, 00000100b
438*c0909341SAndroid Build Coastguard Worker    pcmpeqw          m4, m4
439*c0909341SAndroid Build Coastguard Worker    psrldq           m4, 14
440*c0909341SAndroid Build Coastguard Worker    pslldq           m4, 4
441*c0909341SAndroid Build Coastguard Worker    pxor             m4, m6
442*c0909341SAndroid Build Coastguard Worker%endif
443*c0909341SAndroid Build Coastguard Worker    pxor             m6, m7                                    ; min_grain
444*c0909341SAndroid Build Coastguard Worker    mov          shiftd, [fg_dataq+FGData.ar_coeff_shift]
445*c0909341SAndroid Build Coastguard Worker
446*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
447*c0909341SAndroid Build Coastguard Worker    SWAP              6, 14
448*c0909341SAndroid Build Coastguard Worker    SWAP              7, 15
449*c0909341SAndroid Build Coastguard Worker%else
450*c0909341SAndroid Build Coastguard Worker%define m14 [rsp+10*16]
451*c0909341SAndroid Build Coastguard Worker%define m15 [esp+11*16]
452*c0909341SAndroid Build Coastguard Worker    mova            m14, m6
453*c0909341SAndroid Build Coastguard Worker    mova            m15, m7
454*c0909341SAndroid Build Coastguard Worker%endif
455*c0909341SAndroid Build Coastguard Worker
456*c0909341SAndroid Build Coastguard Worker    ; build cf0-1 until 18-19 in m5-12 and tmp slots 0/1 ([tmp+0], [tmp+16])
    ; (without SSE4, m12 carries the blend mask instead and the cf14-15 pair
    ; is kept in [tmp+48])
457*c0909341SAndroid Build Coastguard Worker    pxor             m1, m1
458*c0909341SAndroid Build Coastguard Worker    movu             m0, [fg_dataq+FGData.ar_coeffs_y+ 0]       ; cf0-15
    ; m1 = byte-wise sign mask of m0, so the unpacks below sign-extend to words
459*c0909341SAndroid Build Coastguard Worker    pcmpgtb          m1, m0
460*c0909341SAndroid Build Coastguard Worker    punpckhbw        m2, m0, m1
461*c0909341SAndroid Build Coastguard Worker    punpcklbw        m0, m1
462*c0909341SAndroid Build Coastguard Worker
463*c0909341SAndroid Build Coastguard Worker%if cpuflag(sse4)
464*c0909341SAndroid Build Coastguard Worker    pshufd           m4, m2, q3333
465*c0909341SAndroid Build Coastguard Worker%else
466*c0909341SAndroid Build Coastguard Worker    pshufd           m5, m2, q3333
467*c0909341SAndroid Build Coastguard Worker    mova       [tmp+48], m5
468*c0909341SAndroid Build Coastguard Worker%endif
469*c0909341SAndroid Build Coastguard Worker    pshufd           m3, m2, q2222
470*c0909341SAndroid Build Coastguard Worker    pshufd           m1, m2, q0000
471*c0909341SAndroid Build Coastguard Worker    pshufd           m2, m2, q1111
472*c0909341SAndroid Build Coastguard Worker    pshufd           m7, m0, q2222
473*c0909341SAndroid Build Coastguard Worker    pshufd           m6, m0, q1111
474*c0909341SAndroid Build Coastguard Worker    pshufd           m5, m0, q0000
475*c0909341SAndroid Build Coastguard Worker    pshufd           m0, m0, q3333
476*c0909341SAndroid Build Coastguard Worker
477*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
478*c0909341SAndroid Build Coastguard Worker    SWAP              0, 8
479*c0909341SAndroid Build Coastguard Worker    SWAP              1, 9
480*c0909341SAndroid Build Coastguard Worker    SWAP              2, 10
481*c0909341SAndroid Build Coastguard Worker    SWAP              3, 11
482*c0909341SAndroid Build Coastguard Worker    SWAP              4, 12
483*c0909341SAndroid Build Coastguard Worker%else
484*c0909341SAndroid Build Coastguard Worker%define m8 [rsp+4*16]
485*c0909341SAndroid Build Coastguard Worker%define m9 [esp+5*16]
486*c0909341SAndroid Build Coastguard Worker%define m10 [rsp+6*16]
487*c0909341SAndroid Build Coastguard Worker%define m11 [esp+7*16]
488*c0909341SAndroid Build Coastguard Worker%define m12 [rsp+8*16]
489*c0909341SAndroid Build Coastguard Worker    mova             m8, m0
490*c0909341SAndroid Build Coastguard Worker    mova             m9, m1
491*c0909341SAndroid Build Coastguard Worker    mova            m10, m2
492*c0909341SAndroid Build Coastguard Worker    mova            m11, m3
493*c0909341SAndroid Build Coastguard Worker    mova            m12, m4
494*c0909341SAndroid Build Coastguard Worker%endif
495*c0909341SAndroid Build Coastguard Worker
496*c0909341SAndroid Build Coastguard Worker    ; build cf20,round in tmp slot 2 ([tmp+32])
497*c0909341SAndroid Build Coastguard Worker    ; build cf21-23,round*2 in m13
498*c0909341SAndroid Build Coastguard Worker    pxor             m1, m1
499*c0909341SAndroid Build Coastguard Worker    movq             m0, [fg_dataq+FGData.ar_coeffs_y+16]       ; cf16-23
500*c0909341SAndroid Build Coastguard Worker    pcmpgtb          m1, m0
501*c0909341SAndroid Build Coastguard Worker    punpcklbw        m0, m1
502*c0909341SAndroid Build Coastguard Worker    pshufd           m1, m0, q0000
503*c0909341SAndroid Build Coastguard Worker    pshufd           m2, m0, q1111
504*c0909341SAndroid Build Coastguard Worker    mova       [tmp+ 0], m1
505*c0909341SAndroid Build Coastguard Worker    mova       [tmp+16], m2
506*c0909341SAndroid Build Coastguard Worker    psrldq           m3, m0, 10
507*c0909341SAndroid Build Coastguard Worker    pinsrw           m3, [base+round_vals+shiftq*2-10], 3
508*c0909341SAndroid Build Coastguard Worker
509*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
510*c0909341SAndroid Build Coastguard Worker    SWAP              3, 13
511*c0909341SAndroid Build Coastguard Worker%else
512*c0909341SAndroid Build Coastguard Worker%define m13 [esp+9*16]
513*c0909341SAndroid Build Coastguard Worker    mova            m13, m3
514*c0909341SAndroid Build Coastguard Worker%endif
515*c0909341SAndroid Build Coastguard Worker
    ; the rounding word inserted next to cf20 is later multiplied by pw_1
    ; (see the punpcklwd with [base+pw_1] in the loop)
516*c0909341SAndroid Build Coastguard Worker    pinsrw           m0, [base+round_vals+shiftq*2-12], 5
517*c0909341SAndroid Build Coastguard Worker    pshufd           m3, m0, q2222
518*c0909341SAndroid Build Coastguard Worker    mova       [tmp+32], m3
519*c0909341SAndroid Build Coastguard Worker
520*c0909341SAndroid Build Coastguard Worker    DEFINE_ARGS buf, fg_data, h, x
521*c0909341SAndroid Build Coastguard Worker    sub            bufq, 2*(82*73-(82*3+79))
522*c0909341SAndroid Build Coastguard Worker    mov              hd, 70
523*c0909341SAndroid Build Coastguard Worker.y_loop_ar3:
524*c0909341SAndroid Build Coastguard Worker    mov              xq, -76
525*c0909341SAndroid Build Coastguard Worker
526*c0909341SAndroid Build Coastguard Worker.x_loop_ar3:
527*c0909341SAndroid Build Coastguard Worker    movu             m0, [bufq+xq*2-82*6-6+ 0]      ; y=-3,x=[-3,+4]
528*c0909341SAndroid Build Coastguard Worker    movd             m1, [bufq+xq*2-82*6-6+16]      ; y=-3,x=[+5,+6]
529*c0909341SAndroid Build Coastguard Worker    palignr          m2, m1, m0, 2                  ; y=-3,x=[-2,+5]
530*c0909341SAndroid Build Coastguard Worker    palignr          m1, m1, m0, 12                 ; y=-3,x=[+3,+6]
531*c0909341SAndroid Build Coastguard Worker    punpckhwd        m3, m0, m2                     ; y=-3,x=[+1/+2,+2/+3,+3/+4,+4/+5]
532*c0909341SAndroid Build Coastguard Worker    punpcklwd        m0, m2                         ; y=-3,x=[-3/-2,-2/-1,-1/+0,+0/+1]
533*c0909341SAndroid Build Coastguard Worker    shufps           m2, m0, m3, q1032              ; y=-3,x=[-1/+0,+0/+1,+1/+2,+2/+3]
534*c0909341SAndroid Build Coastguard Worker
535*c0909341SAndroid Build Coastguard Worker    pmaddwd          m0, m5
536*c0909341SAndroid Build Coastguard Worker    pmaddwd          m2, m6
537*c0909341SAndroid Build Coastguard Worker    pmaddwd          m3, m7
538*c0909341SAndroid Build Coastguard Worker    paddd            m0, m2
539*c0909341SAndroid Build Coastguard Worker    paddd            m0, m3
540*c0909341SAndroid Build Coastguard Worker    ; m0 = top line first 6 multiplied by cf, m1 = top line last entry
541*c0909341SAndroid Build Coastguard Worker
542*c0909341SAndroid Build Coastguard Worker    movu             m2, [bufq+xq*2-82*4-6+ 0]      ; y=-2,x=[-3,+4]
543*c0909341SAndroid Build Coastguard Worker    movd             m3, [bufq+xq*2-82*4-6+16]      ; y=-2,x=[+5,+6]
544*c0909341SAndroid Build Coastguard Worker    punpcklwd        m1, m2                         ; y=-3/-2,x=[+3/-3,+4/-2,+5/-1,+6/+0]
545*c0909341SAndroid Build Coastguard Worker    palignr          m4, m3, m2, 2                  ; y=-2,x=[-2,+5]
546*c0909341SAndroid Build Coastguard Worker    palignr          m3, m3, m2, 4                  ; y=-2,x=[-1,+6]
547*c0909341SAndroid Build Coastguard Worker    punpckhwd        m2, m4, m3                     ; y=-2,x=[+2/+3,+3/+4,+4/+5,+5/+6]
548*c0909341SAndroid Build Coastguard Worker    punpcklwd        m4, m3                         ; y=-2,x=[-2/-1,-1/+0,+0/+1,+1/+2]
549*c0909341SAndroid Build Coastguard Worker    shufps           m3, m4, m2, q1032              ; y=-2,x=[+0/+1,+1/+2,+2/+3,+3/+4]
550*c0909341SAndroid Build Coastguard Worker
551*c0909341SAndroid Build Coastguard Worker    pmaddwd          m1, m8
552*c0909341SAndroid Build Coastguard Worker    pmaddwd          m4, m9
553*c0909341SAndroid Build Coastguard Worker    pmaddwd          m3, m10
554*c0909341SAndroid Build Coastguard Worker    pmaddwd          m2, m11
555*c0909341SAndroid Build Coastguard Worker    paddd            m1, m4
556*c0909341SAndroid Build Coastguard Worker    paddd            m3, m2
557*c0909341SAndroid Build Coastguard Worker    paddd            m0, m1
558*c0909341SAndroid Build Coastguard Worker    paddd            m0, m3
559*c0909341SAndroid Build Coastguard Worker    ; m0 = top 2 lines multiplied by cf
560*c0909341SAndroid Build Coastguard Worker
561*c0909341SAndroid Build Coastguard Worker    movu             m1, [bufq+xq*2-82*2-6+ 0]      ; y=-1,x=[-3,+4]
562*c0909341SAndroid Build Coastguard Worker    movd             m2, [bufq+xq*2-82*2-6+16]      ; y=-1,x=[+5,+6]
563*c0909341SAndroid Build Coastguard Worker    palignr          m3, m2, m1, 2                  ; y=-1,x=[-2,+5]
564*c0909341SAndroid Build Coastguard Worker    palignr          m2, m2, m1, 12                 ; y=-1,x=[+3,+6]
565*c0909341SAndroid Build Coastguard Worker    punpckhwd        m4, m1, m3                     ; y=-1,x=[+1/+2,+2/+3,+3/+4,+4/+5]
566*c0909341SAndroid Build Coastguard Worker    punpcklwd        m1, m3                         ; y=-1,x=[-3/-2,-2/-1,-1/+0,+0/+1]
567*c0909341SAndroid Build Coastguard Worker    shufps           m3, m1, m4, q1032              ; y=-1,x=[-1/+0,+0/+1,+1/+2,+2/+3]
    ; pair the last y=-1 samples with 1 so the (cf20, round) multiply below
    ; adds the rounding term
568*c0909341SAndroid Build Coastguard Worker    punpcklwd        m2, [base+pw_1]
569*c0909341SAndroid Build Coastguard Worker
570*c0909341SAndroid Build Coastguard Worker%if cpuflag(sse4)
571*c0909341SAndroid Build Coastguard Worker    pmaddwd          m1, m12
572*c0909341SAndroid Build Coastguard Worker%else
573*c0909341SAndroid Build Coastguard Worker    pmaddwd          m1, [tmp+48]
574*c0909341SAndroid Build Coastguard Worker%endif
575*c0909341SAndroid Build Coastguard Worker    pmaddwd          m3, [tmp+ 0]
576*c0909341SAndroid Build Coastguard Worker    pmaddwd          m4, [tmp+16]
577*c0909341SAndroid Build Coastguard Worker    pmaddwd          m2, [tmp+32]
578*c0909341SAndroid Build Coastguard Worker    paddd            m1, m3
579*c0909341SAndroid Build Coastguard Worker    paddd            m4, m2
580*c0909341SAndroid Build Coastguard Worker    paddd            m0, m1
581*c0909341SAndroid Build Coastguard Worker    paddd            m0, m4
582*c0909341SAndroid Build Coastguard Worker    ; m0 = top 3 lines multiplied by cf plus rounding for downshift
583*c0909341SAndroid Build Coastguard Worker
584*c0909341SAndroid Build Coastguard Worker    movu             m1, [bufq+xq*2-6]      ; y=0,x=[-3,+4]
    ; one output pixel per iteration; m0 is shifted down by one dword each
    ; time so its low dword always holds the top-rows sum for the current x
585*c0909341SAndroid Build Coastguard Worker.x_loop_ar3_inner:
586*c0909341SAndroid Build Coastguard Worker    pmaddwd          m2, m1, m13
587*c0909341SAndroid Build Coastguard Worker    pshufd           m3, m2, q1111
588*c0909341SAndroid Build Coastguard Worker    paddd            m2, m3                 ; left+cur
589*c0909341SAndroid Build Coastguard Worker    paddd            m2, m0                 ; add top
590*c0909341SAndroid Build Coastguard Worker    psrldq           m0, 4
591*c0909341SAndroid Build Coastguard Worker    psrad            m2, [fg_dataq+FGData.ar_coeff_shift]
592*c0909341SAndroid Build Coastguard Worker    packssdw         m2, m2
    ; clamp to [min_grain, max_grain]
593*c0909341SAndroid Build Coastguard Worker    pminsw           m2, m15
594*c0909341SAndroid Build Coastguard Worker    pmaxsw           m2, m14
    ; move the new pixel to word 2 and merge it into the shifted current row
595*c0909341SAndroid Build Coastguard Worker    pslldq           m2, 4
596*c0909341SAndroid Build Coastguard Worker    psrldq           m1, 2
597*c0909341SAndroid Build Coastguard Worker%if cpuflag(sse4)
598*c0909341SAndroid Build Coastguard Worker    pblendw          m1, m2, 00000100b
599*c0909341SAndroid Build Coastguard Worker%else
600*c0909341SAndroid Build Coastguard Worker    pand             m1, m12
601*c0909341SAndroid Build Coastguard Worker    pandn            m3, m12, m2
602*c0909341SAndroid Build Coastguard Worker    por              m1, m3
603*c0909341SAndroid Build Coastguard Worker%endif
604*c0909341SAndroid Build Coastguard Worker    ; overwrite a couple of pixels, should be ok
605*c0909341SAndroid Build Coastguard Worker    movq  [bufq+xq*2-4], m1
606*c0909341SAndroid Build Coastguard Worker    inc              xq
607*c0909341SAndroid Build Coastguard Worker    jz .x_loop_ar3_end
    ; recompute the top-rows sum only once every four pixels
608*c0909341SAndroid Build Coastguard Worker    test             xq, 3
609*c0909341SAndroid Build Coastguard Worker    jnz .x_loop_ar3_inner
610*c0909341SAndroid Build Coastguard Worker    jmp .x_loop_ar3
611*c0909341SAndroid Build Coastguard Worker
612*c0909341SAndroid Build Coastguard Worker.x_loop_ar3_end:
613*c0909341SAndroid Build Coastguard Worker    add            bufq, 82*2
614*c0909341SAndroid Build Coastguard Worker    dec              hd
615*c0909341SAndroid Build Coastguard Worker    jg .y_loop_ar3
616*c0909341SAndroid Build Coastguard Worker%if WIN64
    ; undo the manual stack realignment done at the top of .ar3
617*c0909341SAndroid Build Coastguard Worker    mov             rsp, r6
618*c0909341SAndroid Build Coastguard Worker%elif ARCH_X86_32
619*c0909341SAndroid Build Coastguard Worker%undef m8
620*c0909341SAndroid Build Coastguard Worker%undef m9
621*c0909341SAndroid Build Coastguard Worker%undef m10
622*c0909341SAndroid Build Coastguard Worker%undef m11
623*c0909341SAndroid Build Coastguard Worker%undef m12
624*c0909341SAndroid Build Coastguard Worker%undef m13
625*c0909341SAndroid Build Coastguard Worker%undef m14
626*c0909341SAndroid Build Coastguard Worker%undef m15
627*c0909341SAndroid Build Coastguard Worker%endif
628*c0909341SAndroid Build Coastguard Worker    RET
629*c0909341SAndroid Build Coastguard Worker
; generate_grain_uv_fn ss_name, ss_x, ss_y
;
; Expands to generate_grain_uv_<ss_name>_16bpc. %2 and %3 are used below as
; assembly-time flags selecting the horizontally / vertically subsampled code
; paths. The part up to `jmp r5` fills the chroma grain buffer with scaled
; entries of gaussian_sequence, four per iteration, driven by a 16-bit seed
; (FGData.seed xor a per-plane constant from pw_seed_xor). It then dispatches
; on FGData.ar_coeff_lag through a jump table to one of the auto-regression
; labels that follow.
630*c0909341SAndroid Build Coastguard Worker%macro generate_grain_uv_fn 3 ; ss_name, ss_x, ss_y
631*c0909341SAndroid Build Coastguard WorkerINIT_XMM ssse3
632*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
633*c0909341SAndroid Build Coastguard Workercglobal generate_grain_uv_%1_16bpc, 4, 11, 16, buf, bufy, fg_data, uv, bdmax, x, gaussian_reg, h, pic_reg
    ; constants are addressed relative to pb_mask through r8
634*c0909341SAndroid Build Coastguard Worker%define base r8-pb_mask
635*c0909341SAndroid Build Coastguard Worker    lea              r8, [pb_mask]
636*c0909341SAndroid Build Coastguard Worker    movifnidn    bdmaxd, bdmaxm
637*c0909341SAndroid Build Coastguard Worker    lea             r6d, [bdmaxq+1]
638*c0909341SAndroid Build Coastguard Worker%else
639*c0909341SAndroid Build Coastguard Workercglobal generate_grain_uv_%1_16bpc, 1, 7, 8, buf, x, pic_reg, fg_data, h
    ; constants are addressed relative to the section start through r2
640*c0909341SAndroid Build Coastguard Worker%define base r2-$$
641*c0909341SAndroid Build Coastguard Worker    LEA              r2, $$
642*c0909341SAndroid Build Coastguard Worker    mov        fg_dataq, r2m
    ; r4m is the argument named bdmax in the x86-64 prototype above
643*c0909341SAndroid Build Coastguard Worker    mov             r6d, r4m
644*c0909341SAndroid Build Coastguard Worker    inc             r6d
645*c0909341SAndroid Build Coastguard Worker%endif
646*c0909341SAndroid Build Coastguard Worker    movq             m1, [base+rnd_next_upperbit_mask]
647*c0909341SAndroid Build Coastguard Worker    movq             m4, [base+mul_bits]
648*c0909341SAndroid Build Coastguard Worker    movq             m7, [base+hmul_bits]
    ; r5 = grain_scale_shift minus the bitdepth term in r6; it selects the
    ; pmulhrsw factor loaded into m6 from the `round` table
649*c0909341SAndroid Build Coastguard Worker    mov             r5d, [fg_dataq+FGData.grain_scale_shift]
650*c0909341SAndroid Build Coastguard Worker    shr             r6d, 11             ; 0 for 10bpc, 2 for 12bpc
651*c0909341SAndroid Build Coastguard Worker    sub              r5, r6
652*c0909341SAndroid Build Coastguard Worker    SPLATW           m6, [base+round+r5*2-2]
653*c0909341SAndroid Build Coastguard Worker    mova             m5, [base+pb_mask]
654*c0909341SAndroid Build Coastguard Worker    SPLATW           m0, [fg_dataq+FGData.seed]
    ; per-plane seed modifier: pw_seed_xor is indexed by uv with a 4-byte stride
655*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
656*c0909341SAndroid Build Coastguard Worker    SPLATW           m2, [base+pw_seed_xor+uvq*4]
657*c0909341SAndroid Build Coastguard Worker%else
658*c0909341SAndroid Build Coastguard Worker    mov             r5d, r3m
659*c0909341SAndroid Build Coastguard Worker    SPLATW           m2, [base+pw_seed_xor+r5*4]
660*c0909341SAndroid Build Coastguard Worker%endif
661*c0909341SAndroid Build Coastguard Worker    pxor             m0, m2
662*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
663*c0909341SAndroid Build Coastguard Worker    lea              r6, [gaussian_sequence]
664*c0909341SAndroid Build Coastguard Worker%endif
    ; with %2 set: 73-35*%3 rows of 44 values each, rows 82 words apart;
    ; otherwise one flat run of 82*73 values. x counts up from a negative
    ; start towards zero, with bufq pre-advanced to compensate.
665*c0909341SAndroid Build Coastguard Worker%if %2
666*c0909341SAndroid Build Coastguard Worker    mov              hd, 73-35*%3
667*c0909341SAndroid Build Coastguard Worker    add            bufq, 44*2
668*c0909341SAndroid Build Coastguard Worker.loop_y:
669*c0909341SAndroid Build Coastguard Worker    mov              xq, -44
670*c0909341SAndroid Build Coastguard Worker%else
671*c0909341SAndroid Build Coastguard Worker    mov              xq, -82*73
672*c0909341SAndroid Build Coastguard Worker    add            bufq, 82*73*2
673*c0909341SAndroid Build Coastguard Worker%endif
674*c0909341SAndroid Build Coastguard Worker.loop_x:
675*c0909341SAndroid Build Coastguard Worker    pand             m2, m0, m1
676*c0909341SAndroid Build Coastguard Worker    psrlw            m3, m2, 10
677*c0909341SAndroid Build Coastguard Worker    por              m2, m3             ; bits 0xf, 0x1e, 0x3c and 0x78 are set
678*c0909341SAndroid Build Coastguard Worker    pmullw           m2, m4             ; bits 0x0f00 are set
679*c0909341SAndroid Build Coastguard Worker    pshufb           m3, m5, m2         ; set 15th bit for next 4 seeds
680*c0909341SAndroid Build Coastguard Worker    psllq            m2, m3, 30
681*c0909341SAndroid Build Coastguard Worker    por              m2, m3
682*c0909341SAndroid Build Coastguard Worker    psllq            m3, m2, 15
683*c0909341SAndroid Build Coastguard Worker    por              m2, m3             ; aggregate each bit into next seed's high bit
684*c0909341SAndroid Build Coastguard Worker    pmulhuw          m3, m0, m7
685*c0909341SAndroid Build Coastguard Worker    por              m2, m3             ; 4 next output seeds
    ; the last of the four new seeds becomes the state for the next iteration
686*c0909341SAndroid Build Coastguard Worker    pshuflw          m0, m2, q3333
    ; the seeds shifted right by 5 are the gather indices into gaussian_sequence
687*c0909341SAndroid Build Coastguard Worker    psrlw            m2, 5
688*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
689*c0909341SAndroid Build Coastguard Worker    vpgatherdw       m3, m2, r6, r9, r10, 4, 2
690*c0909341SAndroid Build Coastguard Worker%else
691*c0909341SAndroid Build Coastguard Worker    vpgatherdw       m3, m2, base+gaussian_sequence, r5, r6, 4, 2
692*c0909341SAndroid Build Coastguard Worker%endif
693*c0909341SAndroid Build Coastguard Worker    paddw            m3, m3             ; otherwise bpc=12 w/ grain_scale_shift=0
694*c0909341SAndroid Build Coastguard Worker                                        ; shifts by 0, which pmulhrsw does not support
695*c0909341SAndroid Build Coastguard Worker    pmulhrsw         m3, m6
696*c0909341SAndroid Build Coastguard Worker    movq    [bufq+xq*2], m3
697*c0909341SAndroid Build Coastguard Worker    add              xq, 4
698*c0909341SAndroid Build Coastguard Worker    jl .loop_x
699*c0909341SAndroid Build Coastguard Worker%if %2
700*c0909341SAndroid Build Coastguard Worker    add            bufq, 82*2
701*c0909341SAndroid Build Coastguard Worker    dec              hd
702*c0909341SAndroid Build Coastguard Worker    jg .loop_y
703*c0909341SAndroid Build Coastguard Worker%endif
704*c0909341SAndroid Build Coastguard Worker
705*c0909341SAndroid Build Coastguard Worker    ; auto-regression code
    ; the table holds 32-bit offsets relative to the table itself, indexed by
    ; ar_coeff_lag
706*c0909341SAndroid Build Coastguard Worker    movsxd           r5, [fg_dataq+FGData.ar_coeff_lag]
707*c0909341SAndroid Build Coastguard Worker    movsxd           r5, [base+generate_grain_uv_%1_16bpc_ssse3_table+r5*4]
708*c0909341SAndroid Build Coastguard Worker    lea              r5, [r5+base+generate_grain_uv_%1_16bpc_ssse3_table]
709*c0909341SAndroid Build Coastguard Worker    jmp              r5
710*c0909341SAndroid Build Coastguard Worker
; .ar0 -- chroma grain pass with no neighbouring-chroma taps: each chroma
; grain value only receives a contribution from the co-located luma grain.
;
; Per vector of eight values: the luma grain is loaded (and, when %2 / %3 are
; set, summed over horizontal pairs / two rows and rescaled by m6),
; multiplied by one signed coefficient read from FGData.ar_coeffs_uv at byte
; offset uv*28, scaled down, added to the chroma grain already in buf and
; clamped to [min_grain, max_grain] = [~(bdmax>>1), bdmax>>1].
; Runs over 70-35*%3 rows; the last vector of each row is only partly stored.
711*c0909341SAndroid Build Coastguard Worker.ar0:
712*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
713*c0909341SAndroid Build Coastguard Worker    DEFINE_ARGS buf, bufy, fg_data, uv, bdmax, shift
714*c0909341SAndroid Build Coastguard Worker%else
715*c0909341SAndroid Build Coastguard Worker    DEFINE_ARGS buf, bufy, pic_reg, fg_data, uv, shift
    ; two 16-byte stack slots: m14 (max_grain) and m12 (blend mask)
716*c0909341SAndroid Build Coastguard Worker    ALLOC_STACK  -16*2
717*c0909341SAndroid Build Coastguard Worker    mov           bufyq, r1m
718*c0909341SAndroid Build Coastguard Worker    mov             uvd, r3m
719*c0909341SAndroid Build Coastguard Worker%endif
720*c0909341SAndroid Build Coastguard Worker    imul            uvd, 28
721*c0909341SAndroid Build Coastguard Worker    mov          shiftd, [fg_dataq+FGData.ar_coeff_shift]
722*c0909341SAndroid Build Coastguard Worker    movd             m4, [fg_dataq+FGData.ar_coeffs_uv+uvq]
    ; NOTE(review): m3 is presumably the pmulhrsw factor that completes the
    ; ar_coeff_shift rounding shift after the fixed psrad by 5 in the loop --
    ; hmul_bits is not visible here, confirm
723*c0909341SAndroid Build Coastguard Worker    SPLATW           m3, [base+hmul_bits+shiftq*2-10]
724*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
725*c0909341SAndroid Build Coastguard Worker    sar          bdmaxd, 1
726*c0909341SAndroid Build Coastguard Worker    SPLATW           m1, bdmaxd                     ; max_grain
727*c0909341SAndroid Build Coastguard Worker%else
728*c0909341SAndroid Build Coastguard Worker    SPLATW           m1, r4m
729*c0909341SAndroid Build Coastguard Worker    psraw            m1, 1
730*c0909341SAndroid Build Coastguard Worker%endif
731*c0909341SAndroid Build Coastguard Worker    pcmpeqw          m7, m7
732*c0909341SAndroid Build Coastguard Worker    pxor             m7, m1                         ; min_grain
733*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
734*c0909341SAndroid Build Coastguard Worker    SWAP              1, 14
735*c0909341SAndroid Build Coastguard Worker    DEFINE_ARGS buf, bufy, h, x
736*c0909341SAndroid Build Coastguard Worker%else
737*c0909341SAndroid Build Coastguard Worker%define m14 [rsp+0*16]
738*c0909341SAndroid Build Coastguard Worker    mova            m14, m1
739*c0909341SAndroid Build Coastguard Worker    DEFINE_ARGS buf, bufy, pic_reg, h, x
740*c0909341SAndroid Build Coastguard Worker%endif
    ; sign-extend the coefficient bytes in m4 to words
741*c0909341SAndroid Build Coastguard Worker    pxor             m5, m5
742*c0909341SAndroid Build Coastguard Worker    pcmpgtb          m5, m4
743*c0909341SAndroid Build Coastguard Worker    punpcklbw        m4, m5
744*c0909341SAndroid Build Coastguard Worker%if %2
    ; NOTE(review): presumably the averaging factor for the 2 (or 4, with %3)
    ; luma samples summed per chroma sample -- confirm against hmul_bits
745*c0909341SAndroid Build Coastguard Worker    SPLATW           m6, [base+hmul_bits+2+%3*2]
746*c0909341SAndroid Build Coastguard Worker%endif
747*c0909341SAndroid Build Coastguard Worker    SPLATW           m4, m4
    ; m5 = 0, interleaved with the luma words before pmaddwd
748*c0909341SAndroid Build Coastguard Worker    pxor             m5, m5
749*c0909341SAndroid Build Coastguard Worker%if %2
750*c0909341SAndroid Build Coastguard Worker%if !cpuflag(sse4)
    ; m12 = mask selecting words 6-7; and/andn replacement for
    ; pblendw m0, m1, 11000000b at .end
751*c0909341SAndroid Build Coastguard Worker    pcmpeqw          m2, m2
752*c0909341SAndroid Build Coastguard Worker    pslldq           m2, 12
753*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
754*c0909341SAndroid Build Coastguard Worker    SWAP              2, 12
755*c0909341SAndroid Build Coastguard Worker%else
756*c0909341SAndroid Build Coastguard Worker%define m12 [rsp+1*16]
757*c0909341SAndroid Build Coastguard Worker    mova            m12, m2
758*c0909341SAndroid Build Coastguard Worker%endif
759*c0909341SAndroid Build Coastguard Worker%endif
760*c0909341SAndroid Build Coastguard Worker%endif
761*c0909341SAndroid Build Coastguard Worker%if %2
762*c0909341SAndroid Build Coastguard Worker    sub            bufq, 2*(82*(73-35*%3)+82-(82*3+41))
763*c0909341SAndroid Build Coastguard Worker%else
764*c0909341SAndroid Build Coastguard Worker    sub            bufq, 2*(82*70-3)
765*c0909341SAndroid Build Coastguard Worker%endif
766*c0909341SAndroid Build Coastguard Worker    add           bufyq, 2*(3+82*3)
767*c0909341SAndroid Build Coastguard Worker    mov              hd, 70-35*%3
768*c0909341SAndroid Build Coastguard Worker.y_loop_ar0:
769*c0909341SAndroid Build Coastguard Worker    ; first 32 pixels
770*c0909341SAndroid Build Coastguard Worker    xor              xd, xd
771*c0909341SAndroid Build Coastguard Worker.x_loop_ar0:
772*c0909341SAndroid Build Coastguard Worker    movu             m0, [bufyq+xq*(2<<%2)]
773*c0909341SAndroid Build Coastguard Worker%if %2
774*c0909341SAndroid Build Coastguard Worker%if %3
775*c0909341SAndroid Build Coastguard Worker    movu             m2, [bufyq+xq*4+82*2]
776*c0909341SAndroid Build Coastguard Worker    paddw            m0, m2
777*c0909341SAndroid Build Coastguard Worker%endif
778*c0909341SAndroid Build Coastguard Worker    movu             m1, [bufyq+xq*4     +16]
779*c0909341SAndroid Build Coastguard Worker%if %3
780*c0909341SAndroid Build Coastguard Worker    movu             m2, [bufyq+xq*4+82*2+16]
781*c0909341SAndroid Build Coastguard Worker    paddw            m1, m2
782*c0909341SAndroid Build Coastguard Worker%endif
    ; sum horizontal luma pairs, then rescale the sum with m6
783*c0909341SAndroid Build Coastguard Worker    phaddw           m0, m1
784*c0909341SAndroid Build Coastguard Worker    pmulhrsw         m0, m6
785*c0909341SAndroid Build Coastguard Worker%endif
    ; luma * coefficient in 32 bits, then scale back down to words
786*c0909341SAndroid Build Coastguard Worker    punpckhwd        m1, m0, m5
787*c0909341SAndroid Build Coastguard Worker    punpcklwd        m0, m5
788*c0909341SAndroid Build Coastguard Worker    REPX {pmaddwd x, m4}, m0, m1
789*c0909341SAndroid Build Coastguard Worker    REPX {psrad x, 5}, m0, m1
790*c0909341SAndroid Build Coastguard Worker    packssdw         m0, m1
791*c0909341SAndroid Build Coastguard Worker    pmulhrsw         m0, m3
    ; add the existing chroma grain and clamp to [min_grain, max_grain]
792*c0909341SAndroid Build Coastguard Worker    movu             m1, [bufq+xq*2]
793*c0909341SAndroid Build Coastguard Worker    paddw            m0, m1
794*c0909341SAndroid Build Coastguard Worker    pminsw           m0, m14
795*c0909341SAndroid Build Coastguard Worker    pmaxsw           m0, m7
    ; the vector starting at x == 72-40*%2 is the last one of the row and is
    ; stored by .end instead
796*c0909341SAndroid Build Coastguard Worker    cmp              xd, 72-40*%2
797*c0909341SAndroid Build Coastguard Worker    je .end
798*c0909341SAndroid Build Coastguard Worker    movu    [bufq+xq*2], m0
799*c0909341SAndroid Build Coastguard Worker    add              xd, 8
800*c0909341SAndroid Build Coastguard Worker    jmp .x_loop_ar0
801*c0909341SAndroid Build Coastguard Worker
802*c0909341SAndroid Build Coastguard Worker    ; last 6/4 pixels
803*c0909341SAndroid Build Coastguard Worker.end:
804*c0909341SAndroid Build Coastguard Worker%if %2
    ; keep words 6-7 of the original chroma data (m1) so that only six new
    ; values are written by the full-width store
805*c0909341SAndroid Build Coastguard Worker%if cpuflag(sse4)
806*c0909341SAndroid Build Coastguard Worker    pblendw          m0, m1, 11000000b
807*c0909341SAndroid Build Coastguard Worker%else
808*c0909341SAndroid Build Coastguard Worker    pand             m1, m12
809*c0909341SAndroid Build Coastguard Worker    pandn            m2, m12, m0
810*c0909341SAndroid Build Coastguard Worker    por              m0, m1, m2
811*c0909341SAndroid Build Coastguard Worker%endif
812*c0909341SAndroid Build Coastguard Worker    movu    [bufq+xq*2], m0
813*c0909341SAndroid Build Coastguard Worker%else
    ; store only the low four values
814*c0909341SAndroid Build Coastguard Worker    movq    [bufq+xq*2], m0
815*c0909341SAndroid Build Coastguard Worker%endif
816*c0909341SAndroid Build Coastguard Worker
    ; next chroma row; luma advances two rows per chroma row when %3 is set
817*c0909341SAndroid Build Coastguard Worker    add            bufq, 82*2
818*c0909341SAndroid Build Coastguard Worker    add           bufyq, 82*(2<<%3)
819*c0909341SAndroid Build Coastguard Worker    dec              hd
820*c0909341SAndroid Build Coastguard Worker    jg .y_loop_ar0
821*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
822*c0909341SAndroid Build Coastguard Worker%undef m12
823*c0909341SAndroid Build Coastguard Worker%undef m14
824*c0909341SAndroid Build Coastguard Worker%endif
825*c0909341SAndroid Build Coastguard Worker    RET
826*c0909341SAndroid Build Coastguard Worker
827*c0909341SAndroid Build Coastguard Worker.ar1:
828*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
829*c0909341SAndroid Build Coastguard Worker    DEFINE_ARGS buf, bufy, fg_data, uv, max, cf3, min, val3, x
830*c0909341SAndroid Build Coastguard Worker%else
831*c0909341SAndroid Build Coastguard Worker    RESET_STACK_STATE
832*c0909341SAndroid Build Coastguard Worker    DEFINE_ARGS buf, shift, pic_reg, fg_data, uv, bufy, cf3
833*c0909341SAndroid Build Coastguard Worker    mov           bufyq, r1m
834*c0909341SAndroid Build Coastguard Worker    mov             uvd, r3m
835*c0909341SAndroid Build Coastguard Worker%endif
836*c0909341SAndroid Build Coastguard Worker    imul            uvd, 28
837*c0909341SAndroid Build Coastguard Worker    movsx          cf3d, byte [fg_dataq+FGData.ar_coeffs_uv+uvq+3]
838*c0909341SAndroid Build Coastguard Worker    movq             m4, [fg_dataq+FGData.ar_coeffs_uv+uvq]
839*c0909341SAndroid Build Coastguard Worker%if WIN64
840*c0909341SAndroid Build Coastguard Worker    DEFINE_ARGS shift, bufy, h, buf, max, cf3, min, val3, x, val0
841*c0909341SAndroid Build Coastguard Worker%if %2
842*c0909341SAndroid Build Coastguard Worker    lea            bufq, [r0-2*(82*(73-35*%3)+44-(82*3+41))]
843*c0909341SAndroid Build Coastguard Worker%else
844*c0909341SAndroid Build Coastguard Worker    lea            bufq, [r0-2*(82*69+3)]
845*c0909341SAndroid Build Coastguard Worker%endif
846*c0909341SAndroid Build Coastguard Worker%else
847*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
848*c0909341SAndroid Build Coastguard Worker    DEFINE_ARGS buf, bufy, h, shift, max, cf3, min, val3, x, val0
849*c0909341SAndroid Build Coastguard Worker%else
850*c0909341SAndroid Build Coastguard Worker    DEFINE_ARGS buf, shift, pic_reg, fg_data, val0, bufy, cf3
851*c0909341SAndroid Build Coastguard Worker%define hd dword r1m
852*c0909341SAndroid Build Coastguard Worker%define mind dword r3m
853*c0909341SAndroid Build Coastguard Worker%define maxd dword r4m
854*c0909341SAndroid Build Coastguard Worker%endif
855*c0909341SAndroid Build Coastguard Worker%if %2
856*c0909341SAndroid Build Coastguard Worker    sub            bufq, 2*(82*(73-35*%3)+44-(82*3+41))
857*c0909341SAndroid Build Coastguard Worker%else
858*c0909341SAndroid Build Coastguard Worker    sub            bufq, 2*(82*69+3)
859*c0909341SAndroid Build Coastguard Worker%endif
860*c0909341SAndroid Build Coastguard Worker%endif
861*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
862*c0909341SAndroid Build Coastguard Worker    mov          shiftd, [r2+FGData.ar_coeff_shift]
863*c0909341SAndroid Build Coastguard Worker%else
864*c0909341SAndroid Build Coastguard Worker    mov          shiftd, [r3+FGData.ar_coeff_shift]
865*c0909341SAndroid Build Coastguard Worker%endif
866*c0909341SAndroid Build Coastguard Worker    pxor             m5, m5
867*c0909341SAndroid Build Coastguard Worker    pcmpgtb          m5, m4
868*c0909341SAndroid Build Coastguard Worker    punpcklbw        m4, m5                 ; cf0-4 in words
869*c0909341SAndroid Build Coastguard Worker    pshuflw          m4, m4, q2100
870*c0909341SAndroid Build Coastguard Worker    psrldq           m4, 2                  ; cf0-3,4 in words
871*c0909341SAndroid Build Coastguard Worker    pshufd           m5, m4, q1111
872*c0909341SAndroid Build Coastguard Worker    pshufd           m4, m4, q0000
873*c0909341SAndroid Build Coastguard Worker    movd             m3, [base+round_vals+shiftq*2-12]    ; rnd
874*c0909341SAndroid Build Coastguard Worker    pxor             m6, m6
875*c0909341SAndroid Build Coastguard Worker    punpcklwd        m3, m6
876*c0909341SAndroid Build Coastguard Worker%if %2
877*c0909341SAndroid Build Coastguard Worker    SPLATW           m6, [base+hmul_bits+2+%3*2]
878*c0909341SAndroid Build Coastguard Worker%endif
879*c0909341SAndroid Build Coastguard Worker    SPLATD           m3, m3
880*c0909341SAndroid Build Coastguard Worker    add           bufyq, 2*(79+82*3)
881*c0909341SAndroid Build Coastguard Worker    mov              hd, 70-35*%3
882*c0909341SAndroid Build Coastguard Worker    sar            maxd, 1
883*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
884*c0909341SAndroid Build Coastguard Worker    mov            mind, maxd
885*c0909341SAndroid Build Coastguard Worker    xor            mind, -1
886*c0909341SAndroid Build Coastguard Worker%else
887*c0909341SAndroid Build Coastguard Worker    DEFINE_ARGS buf, shift, val3, x, val0, bufy, cf3
888*c0909341SAndroid Build Coastguard Worker    mov              r2, maxd
889*c0909341SAndroid Build Coastguard Worker    xor              r2, -1
890*c0909341SAndroid Build Coastguard Worker    mov            mind, r2
891*c0909341SAndroid Build Coastguard Worker%endif
892*c0909341SAndroid Build Coastguard Worker.y_loop_ar1:
893*c0909341SAndroid Build Coastguard Worker    mov              xq, -(76>>%2)
894*c0909341SAndroid Build Coastguard Worker    movsx         val3d, word [bufq+xq*2-2]
895*c0909341SAndroid Build Coastguard Worker.x_loop_ar1:
896*c0909341SAndroid Build Coastguard Worker    movu             m0, [bufq+xq*2-82*2-2] ; top/left
897*c0909341SAndroid Build Coastguard Worker%if %2
898*c0909341SAndroid Build Coastguard Worker    movu             m7, [bufyq+xq*4]
899*c0909341SAndroid Build Coastguard Worker%if %3
900*c0909341SAndroid Build Coastguard Worker    movu             m1, [bufyq+xq*4+82*2]
901*c0909341SAndroid Build Coastguard Worker    phaddw           m7, m1
902*c0909341SAndroid Build Coastguard Worker%else
903*c0909341SAndroid Build Coastguard Worker    phaddw           m7, m7
904*c0909341SAndroid Build Coastguard Worker%endif
905*c0909341SAndroid Build Coastguard Worker%else
906*c0909341SAndroid Build Coastguard Worker    movq             m7, [bufyq+xq*2]
907*c0909341SAndroid Build Coastguard Worker%endif
908*c0909341SAndroid Build Coastguard Worker    psrldq           m2, m0, 2              ; top
909*c0909341SAndroid Build Coastguard Worker    psrldq           m1, m0, 4              ; top/right
910*c0909341SAndroid Build Coastguard Worker    punpcklwd        m0, m2
911*c0909341SAndroid Build Coastguard Worker%if %2
912*c0909341SAndroid Build Coastguard Worker%if %3
913*c0909341SAndroid Build Coastguard Worker    pshufd           m2, m7, q3232
914*c0909341SAndroid Build Coastguard Worker    paddw            m7, m2
915*c0909341SAndroid Build Coastguard Worker%endif
916*c0909341SAndroid Build Coastguard Worker    pmulhrsw         m7, m6
917*c0909341SAndroid Build Coastguard Worker%endif
918*c0909341SAndroid Build Coastguard Worker    punpcklwd        m1, m7
919*c0909341SAndroid Build Coastguard Worker    pmaddwd          m0, m4
920*c0909341SAndroid Build Coastguard Worker    pmaddwd          m1, m5
921*c0909341SAndroid Build Coastguard Worker    paddd            m0, m1
922*c0909341SAndroid Build Coastguard Worker    paddd            m0, m3
923*c0909341SAndroid Build Coastguard Worker.x_loop_ar1_inner:
924*c0909341SAndroid Build Coastguard Worker    movd          val0d, m0
925*c0909341SAndroid Build Coastguard Worker    psrldq           m0, 4
926*c0909341SAndroid Build Coastguard Worker    imul          val3d, cf3d
927*c0909341SAndroid Build Coastguard Worker    add           val3d, val0d
928*c0909341SAndroid Build Coastguard Worker    sar           val3d, shiftb
929*c0909341SAndroid Build Coastguard Worker    movsx         val0d, word [bufq+xq*2]
930*c0909341SAndroid Build Coastguard Worker    add           val3d, val0d
931*c0909341SAndroid Build Coastguard Worker    cmp           val3d, maxd
932*c0909341SAndroid Build Coastguard Worker    cmovg         val3d, maxd
933*c0909341SAndroid Build Coastguard Worker    cmp           val3d, mind
934*c0909341SAndroid Build Coastguard Worker    cmovl         val3d, mind
935*c0909341SAndroid Build Coastguard Worker    mov word [bufq+xq*2], val3w
936*c0909341SAndroid Build Coastguard Worker    ; keep val3d in-place as left for next x iteration
937*c0909341SAndroid Build Coastguard Worker    inc              xq
938*c0909341SAndroid Build Coastguard Worker    jz .x_loop_ar1_end
939*c0909341SAndroid Build Coastguard Worker    test             xq, 3
940*c0909341SAndroid Build Coastguard Worker    jnz .x_loop_ar1_inner
941*c0909341SAndroid Build Coastguard Worker    jmp .x_loop_ar1
942*c0909341SAndroid Build Coastguard Worker
943*c0909341SAndroid Build Coastguard Worker.x_loop_ar1_end:
944*c0909341SAndroid Build Coastguard Worker    add            bufq, 82*2
945*c0909341SAndroid Build Coastguard Worker    add           bufyq, 82*2<<%3
946*c0909341SAndroid Build Coastguard Worker    dec              hd
947*c0909341SAndroid Build Coastguard Worker    jg .y_loop_ar1
948*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
949*c0909341SAndroid Build Coastguard Worker%undef maxd
950*c0909341SAndroid Build Coastguard Worker%undef mind
951*c0909341SAndroid Build Coastguard Worker%undef hd
952*c0909341SAndroid Build Coastguard Worker%endif
953*c0909341SAndroid Build Coastguard Worker    RET
954*c0909341SAndroid Build Coastguard Worker
955*c0909341SAndroid Build Coastguard Worker.ar2:
    ; .ar2 path: for each sample of buf, form a weighted sum of
    ;   - 5 samples on the row two above   (x = -2..+2),
    ;   - 5 samples on the row one above   (x = -2..+2),
    ;   - the 2 samples to the left        (x = -2, -1),
    ;   - 1 luma-derived sample read from bufy,
    ; add a rounding constant, shift right by FGData.ar_coeff_shift, add the
    ; result to the sample already in buf and clamp to [min_grain, max_grain].
    ; %2 / %3 are parameters of the enclosing macro (declared outside this
    ; chunk); presumably horizontal / vertical chroma subsampling flags.
956*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
957*c0909341SAndroid Build Coastguard Worker    DEFINE_ARGS buf, bufy, fg_data, uv, bdmax, shift
958*c0909341SAndroid Build Coastguard Worker%else
959*c0909341SAndroid Build Coastguard Worker    DEFINE_ARGS buf, bufy, pic_reg, fg_data, uv, shift
    ; reserve room for the eight 16-byte stack slots that stand in for
    ; m8-m15 on x86-32 (see the %define lines below)
960*c0909341SAndroid Build Coastguard Worker    ALLOC_STACK  -16*8
961*c0909341SAndroid Build Coastguard Worker    mov           bufyq, r1m
962*c0909341SAndroid Build Coastguard Worker    mov             uvd, r3m
963*c0909341SAndroid Build Coastguard Worker%endif
964*c0909341SAndroid Build Coastguard Worker    mov          shiftd, [fg_dataq+FGData.ar_coeff_shift]
    ; uv index -> byte offset into FGData.ar_coeffs_uv (the stride of 28
    ; presumably matches the per-plane row size; struct is defined elsewhere)
965*c0909341SAndroid Build Coastguard Worker    imul            uvd, 28
966*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
967*c0909341SAndroid Build Coastguard Worker    sar          bdmaxd, 1
968*c0909341SAndroid Build Coastguard Worker    SPLATW           m5, bdmaxd                 ; max_grain
969*c0909341SAndroid Build Coastguard Worker%else
970*c0909341SAndroid Build Coastguard Worker    SPLATW           m5, r4m
971*c0909341SAndroid Build Coastguard Worker    psraw            m5, 1
972*c0909341SAndroid Build Coastguard Worker%endif
    ; m6 = all ones; xor-ed with max_grain further down it becomes
    ; ~max_grain, i.e. -max_grain-1
973*c0909341SAndroid Build Coastguard Worker    pcmpeqw          m6, m6
974*c0909341SAndroid Build Coastguard Worker%if !cpuflag(sse4)
    ; no pblendw available: build m7 = every word set except word 1, used
    ; by the pand/pandn/por merge in the inner loop
975*c0909341SAndroid Build Coastguard Worker    pcmpeqw          m7, m7
976*c0909341SAndroid Build Coastguard Worker    psrldq           m7, 14
977*c0909341SAndroid Build Coastguard Worker    pslldq           m7, 2
978*c0909341SAndroid Build Coastguard Worker    pxor             m7, m6
979*c0909341SAndroid Build Coastguard Worker%endif
980*c0909341SAndroid Build Coastguard Worker    pxor             m6, m5                    ; min_grain
981*c0909341SAndroid Build Coastguard Worker%if %2 && cpuflag(sse4)
    ; m7 = pmulhrsw multiplier for the luma average, broadcast from
    ; hmul_bits (table contents are not visible in this chunk)
982*c0909341SAndroid Build Coastguard Worker    SPLATW           m7, [base+hmul_bits+2+%3*2]
983*c0909341SAndroid Build Coastguard Worker%endif
984*c0909341SAndroid Build Coastguard Worker
    ; park the loop constants: m13 = max_grain, m14 = min_grain,
    ; m15 = blend mask (!sse4) or luma multiplier (sse4 with %2).
    ; With sse4 and %2 == 0 nothing above wrote m7; m15 is not read by the
    ; loop in that configuration.
985*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
986*c0909341SAndroid Build Coastguard Worker    SWAP              5, 13
987*c0909341SAndroid Build Coastguard Worker    SWAP              6, 14
988*c0909341SAndroid Build Coastguard Worker    SWAP              7, 15
989*c0909341SAndroid Build Coastguard Worker%else
990*c0909341SAndroid Build Coastguard Worker%define m13 [rsp+5*16]
991*c0909341SAndroid Build Coastguard Worker%define m14 [rsp+6*16]
992*c0909341SAndroid Build Coastguard Worker%define m15 [rsp+7*16]
993*c0909341SAndroid Build Coastguard Worker    mova            m13, m5
994*c0909341SAndroid Build Coastguard Worker    mova            m14, m6
995*c0909341SAndroid Build Coastguard Worker    mova            m15, m7
996*c0909341SAndroid Build Coastguard Worker%endif
997*c0909341SAndroid Build Coastguard Worker
998*c0909341SAndroid Build Coastguard Worker    ; coef values
    ; load 16 coefficient bytes and sign-extend them to words:
    ; m0 = bytes 0-7, m2 = bytes 8-15
999*c0909341SAndroid Build Coastguard Worker    movu             m0, [fg_dataq+FGData.ar_coeffs_uv+uvq+0]
1000*c0909341SAndroid Build Coastguard Worker    pxor             m1, m1
1001*c0909341SAndroid Build Coastguard Worker    pcmpgtb          m1, m0
1002*c0909341SAndroid Build Coastguard Worker    punpckhbw        m2, m0, m1
1003*c0909341SAndroid Build Coastguard Worker    punpcklbw        m0, m1
    ; word 5 of m2 (the slot next to coefficient 12) is overwritten with the
    ; round_vals entry selected by shift, so the rounding term rides along
    ; with the luma coefficient in a single pmaddwd
1004*c0909341SAndroid Build Coastguard Worker    pinsrw           m2, [base+round_vals-12+shiftq*2], 5
1005*c0909341SAndroid Build Coastguard Worker
    ; broadcast one coefficient pair per register. After the SWAP / stack
    ; stores below:
    ;   m6  = coef 0/1     m7  = coef 2/3     m8  = coef 4/5
    ;   m9  = coef 6/7     m10 = coef 8/9     m11 = coef 10/11
    ;   m12 = coef 12 / rounding constant
1006*c0909341SAndroid Build Coastguard Worker    pshufd           m6, m0, q0000
1007*c0909341SAndroid Build Coastguard Worker    pshufd           m7, m0, q1111
1008*c0909341SAndroid Build Coastguard Worker    pshufd           m1, m0, q3333
1009*c0909341SAndroid Build Coastguard Worker    pshufd           m0, m0, q2222
1010*c0909341SAndroid Build Coastguard Worker    pshufd           m3, m2, q1111
1011*c0909341SAndroid Build Coastguard Worker    pshufd           m4, m2, q2222
1012*c0909341SAndroid Build Coastguard Worker    pshufd           m2, m2, q0000
1013*c0909341SAndroid Build Coastguard Worker
1014*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
1015*c0909341SAndroid Build Coastguard Worker    SWAP              0, 8
1016*c0909341SAndroid Build Coastguard Worker    SWAP              1, 9
1017*c0909341SAndroid Build Coastguard Worker    SWAP              2, 10
1018*c0909341SAndroid Build Coastguard Worker    SWAP              3, 11
1019*c0909341SAndroid Build Coastguard Worker    SWAP              4, 12
1020*c0909341SAndroid Build Coastguard Worker%else
1021*c0909341SAndroid Build Coastguard Worker%define m8 [rsp+0*16]
1022*c0909341SAndroid Build Coastguard Worker%define m9 [rsp+1*16]
1023*c0909341SAndroid Build Coastguard Worker%define m10 [rsp+2*16]
1024*c0909341SAndroid Build Coastguard Worker%define m11 [rsp+3*16]
1025*c0909341SAndroid Build Coastguard Worker%define m12 [rsp+4*16]
1026*c0909341SAndroid Build Coastguard Worker    mova             m8, m0
1027*c0909341SAndroid Build Coastguard Worker    mova             m9, m1
1028*c0909341SAndroid Build Coastguard Worker    mova            m10, m2
1029*c0909341SAndroid Build Coastguard Worker    mova            m11, m3
1030*c0909341SAndroid Build Coastguard Worker    mova            m12, m4
1031*c0909341SAndroid Build Coastguard Worker%endif
1032*c0909341SAndroid Build Coastguard Worker
    ; uv, bdmax and shift are no longer needed as named registers; the
    ; argument names are re-declared so that h and x become available
1033*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
1034*c0909341SAndroid Build Coastguard Worker    DEFINE_ARGS buf, bufy, fg_data, h, x
1035*c0909341SAndroid Build Coastguard Worker%else
1036*c0909341SAndroid Build Coastguard Worker    DEFINE_ARGS buf, bufy, pic_reg, fg_data, h, x
1037*c0909341SAndroid Build Coastguard Worker%endif
    ; move bufq / bufyq to the first row to be filtered. The offsets are
    ; relative to wherever the pointers stand on entry to .ar2, which is set
    ; up outside this chunk. Rows of buf are 82 words (82*2 bytes) apart.
1038*c0909341SAndroid Build Coastguard Worker%if %2
1039*c0909341SAndroid Build Coastguard Worker    sub            bufq, 2*(82*(73-35*%3)+44-(82*3+41))
1040*c0909341SAndroid Build Coastguard Worker%else
1041*c0909341SAndroid Build Coastguard Worker    sub            bufq, 2*(82*69+3)
1042*c0909341SAndroid Build Coastguard Worker%endif
1043*c0909341SAndroid Build Coastguard Worker    add           bufyq, 2*(79+82*3)
    ; row count: 70, or 35 when %3 is set
1044*c0909341SAndroid Build Coastguard Worker    mov              hd, 70-35*%3
1045*c0909341SAndroid Build Coastguard Worker.y_loop_ar2:
    ; x counts up from -76 (or -38 when %2 is set) to 0; bufq+xq*2 is the
    ; sample currently being filtered
1046*c0909341SAndroid Build Coastguard Worker    mov              xq, -(76>>%2)
1047*c0909341SAndroid Build Coastguard Worker
1048*c0909341SAndroid Build Coastguard Worker.x_loop_ar2:
    ; vector part: accumulate in m0 the contribution of the two rows above
    ; (plus luma and rounding) for four consecutive x positions, one per dword
1049*c0909341SAndroid Build Coastguard Worker    movu             m0, [bufq+xq*2-82*4-4]     ; y=-2,x=[-2,+5]
1050*c0909341SAndroid Build Coastguard Worker    movu             m5, [bufq+xq*2-82*2-4]     ; y=-1,x=[-2,+5]
1051*c0909341SAndroid Build Coastguard Worker    psrldq           m4, m0, 2                  ; y=-2,x=[-1,+5]
1052*c0909341SAndroid Build Coastguard Worker    psrldq           m1, m0, 4                  ; y=-2,x=[-0,+5]
1053*c0909341SAndroid Build Coastguard Worker    psrldq           m3, m0, 6                  ; y=-2,x=[+1,+5]
1054*c0909341SAndroid Build Coastguard Worker    psrldq           m2, m0, 8                  ; y=-2,x=[+2,+5]
1055*c0909341SAndroid Build Coastguard Worker    punpcklwd        m0, m4                     ; y=-2,x=[-2/-1,-1/+0,+0/+1,+1/+2]
1056*c0909341SAndroid Build Coastguard Worker    punpcklwd        m1, m3                     ; y=-2,x=[+0/+1,+1/+2,+2/+3,+3/+4]
1057*c0909341SAndroid Build Coastguard Worker    punpcklwd        m2, m5                     ; y=-2/-1,x=[+2/-2,+3/-1,+4/+0,+5/+1]
1058*c0909341SAndroid Build Coastguard Worker    pmaddwd          m0, m6
1059*c0909341SAndroid Build Coastguard Worker    pmaddwd          m1, m7
1060*c0909341SAndroid Build Coastguard Worker    pmaddwd          m2, m8
1061*c0909341SAndroid Build Coastguard Worker    paddd            m0, m1
1062*c0909341SAndroid Build Coastguard Worker    paddd            m0, m2
1063*c0909341SAndroid Build Coastguard Worker    psrldq           m3, m5, 2                  ; y=-1,x=[-1,+5]
1064*c0909341SAndroid Build Coastguard Worker    psrldq           m1, m5, 4                  ; y=-1,x=[-0,+5]
1065*c0909341SAndroid Build Coastguard Worker    psrldq           m4, m5, 6                  ; y=-1,x=[+1,+5]
1066*c0909341SAndroid Build Coastguard Worker    psrldq           m2, m5, 8                  ; y=-1,x=[+2,+5]
    ; pair up y=-1 samples as x=-1/+0 (m3) and x=+1/+2 (m4)
1067*c0909341SAndroid Build Coastguard Worker    punpcklwd        m3, m1
1068*c0909341SAndroid Build Coastguard Worker    punpcklwd        m4, m2
1069*c0909341SAndroid Build Coastguard Worker    pmaddwd          m3, m9
1070*c0909341SAndroid Build Coastguard Worker    pmaddwd          m4, m10
1071*c0909341SAndroid Build Coastguard Worker    paddd            m3, m4
1072*c0909341SAndroid Build Coastguard Worker    paddd            m0, m3
1073*c0909341SAndroid Build Coastguard Worker
1074*c0909341SAndroid Build Coastguard Worker    ; luma component & rounding
    ; %2 set: two adjacent luma words are summed per output sample (phaddw);
    ; %3 set: the luma row 82*2 bytes further on is added in as well.
    ; pmulhrsw then scales the sum back down: by 8192 it yields (x+2)>>2,
    ; by 16384 it yields (x+1)>>1 (the sse4 path takes its factor from m15).
1075*c0909341SAndroid Build Coastguard Worker%if %2
1076*c0909341SAndroid Build Coastguard Worker    movu             m1, [bufyq+xq*4]
1077*c0909341SAndroid Build Coastguard Worker%if %3
1078*c0909341SAndroid Build Coastguard Worker    movu             m2, [bufyq+xq*4+82*2]
1079*c0909341SAndroid Build Coastguard Worker    phaddw           m1, m2
1080*c0909341SAndroid Build Coastguard Worker    pshufd           m2, m1, q3232
1081*c0909341SAndroid Build Coastguard Worker    paddw            m1, m2
1082*c0909341SAndroid Build Coastguard Worker%else
1083*c0909341SAndroid Build Coastguard Worker    phaddw           m1, m1
1084*c0909341SAndroid Build Coastguard Worker%endif
1085*c0909341SAndroid Build Coastguard Worker%if cpuflag(sse4)
1086*c0909341SAndroid Build Coastguard Worker    pmulhrsw         m1, m15
1087*c0909341SAndroid Build Coastguard Worker%elif %3
1088*c0909341SAndroid Build Coastguard Worker    pmulhrsw         m1, [base+pw_8192]
1089*c0909341SAndroid Build Coastguard Worker%else
1090*c0909341SAndroid Build Coastguard Worker    pmulhrsw         m1, [base+pw_16384]
1091*c0909341SAndroid Build Coastguard Worker%endif
1092*c0909341SAndroid Build Coastguard Worker%else
1093*c0909341SAndroid Build Coastguard Worker    movq             m1, [bufyq+xq*2]
1094*c0909341SAndroid Build Coastguard Worker%endif
    ; interleave luma with pw_1 (presumably words of 1) so that pmaddwd with
    ; m12 yields luma*coef12 + rounding constant per dword
1095*c0909341SAndroid Build Coastguard Worker    punpcklwd        m1, [base+pw_1]
1096*c0909341SAndroid Build Coastguard Worker    pmaddwd          m1, m12
1097*c0909341SAndroid Build Coastguard Worker    paddd            m0, m1
1098*c0909341SAndroid Build Coastguard Worker
    ; m1 = sliding window over the current row, m2 = the four samples about
    ; to be filtered, sign-extended to dwords
1099*c0909341SAndroid Build Coastguard Worker    movu             m1, [bufq+xq*2-4]      ; y=0,x=[-2,+5]
1100*c0909341SAndroid Build Coastguard Worker    pshufd           m2, m1, q3321
1101*c0909341SAndroid Build Coastguard Worker    pxor             m3, m3
1102*c0909341SAndroid Build Coastguard Worker    pcmpgtw          m3, m2
1103*c0909341SAndroid Build Coastguard Worker    punpcklwd        m2, m3                 ; y=0,x=[0,3] in dword
1104*c0909341SAndroid Build Coastguard Worker.x_loop_ar2_inner:
    ; serial part, one sample per iteration: the two left neighbours in the
    ; low dword of m1 include the sample written by the previous iteration
1105*c0909341SAndroid Build Coastguard Worker    pmaddwd          m3, m1, m11
1106*c0909341SAndroid Build Coastguard Worker    paddd            m3, m0
1107*c0909341SAndroid Build Coastguard Worker    psrldq           m0, 4                  ; shift top to next pixel
    ; the shift count is read from memory: the `shift` register name was
    ; dropped by the DEFINE_ARGS before the loop
1108*c0909341SAndroid Build Coastguard Worker    psrad            m3, [fg_dataq+FGData.ar_coeff_shift]
1109*c0909341SAndroid Build Coastguard Worker    ; only the low dword of m3 (the current pixel) matters; packssdw below saturates it to a word for the clamp
1110*c0909341SAndroid Build Coastguard Worker    paddd            m3, m2
1111*c0909341SAndroid Build Coastguard Worker    packssdw         m3, m3
1112*c0909341SAndroid Build Coastguard Worker    pminsw           m3, m13
1113*c0909341SAndroid Build Coastguard Worker    pmaxsw           m3, m14
    ; advance the row window by one sample and drop the new value into
    ; word 1 of m1, so word 0 = x-1 and word 1 = freshly filtered x
1114*c0909341SAndroid Build Coastguard Worker    psrldq           m1, 2
1115*c0909341SAndroid Build Coastguard Worker    pslldq           m3, 2
1116*c0909341SAndroid Build Coastguard Worker    psrldq           m2, 4
1117*c0909341SAndroid Build Coastguard Worker%if cpuflag(sse4)
1118*c0909341SAndroid Build Coastguard Worker    pblendw          m1, m3, 00000010b
1119*c0909341SAndroid Build Coastguard Worker%else
1120*c0909341SAndroid Build Coastguard Worker    pand             m1, m15
1121*c0909341SAndroid Build Coastguard Worker    pandn            m4, m15, m3
1122*c0909341SAndroid Build Coastguard Worker    por              m1, m4
1123*c0909341SAndroid Build Coastguard Worker%endif
1124*c0909341SAndroid Build Coastguard Worker    ; overwrite previous pixel, should be ok
    ; (movd stores two words: x-1, rewritten with the value it already
    ; holds, and the new sample at x)
1125*c0909341SAndroid Build Coastguard Worker    movd  [bufq+xq*2-2], m1
1126*c0909341SAndroid Build Coastguard Worker    inc              xq
1127*c0909341SAndroid Build Coastguard Worker    jz .x_loop_ar2_end
    ; m0 and m2 only carry four samples: rebuild them in the vector part
    ; each time xq reaches a multiple of 4
1128*c0909341SAndroid Build Coastguard Worker    test             xq, 3
1129*c0909341SAndroid Build Coastguard Worker    jnz .x_loop_ar2_inner
1130*c0909341SAndroid Build Coastguard Worker    jmp .x_loop_ar2
1131*c0909341SAndroid Build Coastguard Worker
1132*c0909341SAndroid Build Coastguard Worker.x_loop_ar2_end:
    ; next row: buf advances one row, bufy one row (two when %3 is set)
1133*c0909341SAndroid Build Coastguard Worker    add            bufq, 82*2
1134*c0909341SAndroid Build Coastguard Worker    add           bufyq, 82*2<<%3
1135*c0909341SAndroid Build Coastguard Worker    dec              hd
1136*c0909341SAndroid Build Coastguard Worker    jg .y_loop_ar2
    ; drop the x86-32 stack-slot aliases for m13-m15 (the m8-m12 aliases
    ; stay defined past this point)
1137*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
1138*c0909341SAndroid Build Coastguard Worker%undef m13
1139*c0909341SAndroid Build Coastguard Worker%undef m14
1140*c0909341SAndroid Build Coastguard Worker%undef m15
1141*c0909341SAndroid Build Coastguard Worker%endif
1142*c0909341SAndroid Build Coastguard Worker    RET
1143*c0909341SAndroid Build Coastguard Worker
1144*c0909341SAndroid Build Coastguard Worker.ar3:
1145*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
1146*c0909341SAndroid Build Coastguard Worker    DEFINE_ARGS buf, bufy, fg_data, uv, bdmax, shift
1147*c0909341SAndroid Build Coastguard Worker%if WIN64
1148*c0909341SAndroid Build Coastguard Worker    mov              r6, rsp
1149*c0909341SAndroid Build Coastguard Worker    and             rsp, ~15
1150*c0909341SAndroid Build Coastguard Worker    sub             rsp, 96
1151*c0909341SAndroid Build Coastguard Worker    %define         tmp  rsp
1152*c0909341SAndroid Build Coastguard Worker%else
1153*c0909341SAndroid Build Coastguard Worker    %define         tmp  rsp+stack_offset-120
1154*c0909341SAndroid Build Coastguard Worker%endif
1155*c0909341SAndroid Build Coastguard Worker%else
1156*c0909341SAndroid Build Coastguard Worker    DEFINE_ARGS buf, bufy, pic_reg, fg_data, uv, shift
1157*c0909341SAndroid Build Coastguard Worker    ALLOC_STACK  -16*14
1158*c0909341SAndroid Build Coastguard Worker    mov           bufyq, r1m
1159*c0909341SAndroid Build Coastguard Worker    mov             uvd, r3m
1160*c0909341SAndroid Build Coastguard Worker    %define         tmp  rsp
1161*c0909341SAndroid Build Coastguard Worker%endif
1162*c0909341SAndroid Build Coastguard Worker    mov          shiftd, [fg_dataq+FGData.ar_coeff_shift]
1163*c0909341SAndroid Build Coastguard Worker    imul            uvd, 28
1164*c0909341SAndroid Build Coastguard Worker    SPLATW           m4, [base+round_vals-12+shiftq*2]
1165*c0909341SAndroid Build Coastguard Worker    pxor             m5, m5
1166*c0909341SAndroid Build Coastguard Worker    pcmpgtw          m5, m4
1167*c0909341SAndroid Build Coastguard Worker    punpcklwd        m4, m5
1168*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
1169*c0909341SAndroid Build Coastguard Worker    sar          bdmaxd, 1
1170*c0909341SAndroid Build Coastguard Worker    SPLATW           m6, bdmaxd                 ; max_grain
1171*c0909341SAndroid Build Coastguard Worker%else
1172*c0909341SAndroid Build Coastguard Worker    SPLATW           m6, r4m
1173*c0909341SAndroid Build Coastguard Worker    psraw            m6, 1
1174*c0909341SAndroid Build Coastguard Worker%endif
1175*c0909341SAndroid Build Coastguard Worker    pcmpeqw          m7, m7
1176*c0909341SAndroid Build Coastguard Worker%if !cpuflag(sse4)
1177*c0909341SAndroid Build Coastguard Worker    pcmpeqw          m3, m3
1178*c0909341SAndroid Build Coastguard Worker    psrldq           m3, 14
1179*c0909341SAndroid Build Coastguard Worker    pslldq           m3, 4
1180*c0909341SAndroid Build Coastguard Worker    pxor             m3, m7
1181*c0909341SAndroid Build Coastguard Worker%endif
1182*c0909341SAndroid Build Coastguard Worker    pxor             m7, m6                     ; min_grain
1183*c0909341SAndroid Build Coastguard Worker%if %2 && cpuflag(sse4)
1184*c0909341SAndroid Build Coastguard Worker    SPLATW           m3, [base+hmul_bits+2+%3*2]
1185*c0909341SAndroid Build Coastguard Worker%endif
1186*c0909341SAndroid Build Coastguard Worker
1187*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
1188*c0909341SAndroid Build Coastguard Worker    SWAP              3, 11
1189*c0909341SAndroid Build Coastguard Worker    SWAP              4, 12
1190*c0909341SAndroid Build Coastguard Worker    SWAP              6, 14
1191*c0909341SAndroid Build Coastguard Worker    SWAP              7, 15
1192*c0909341SAndroid Build Coastguard Worker%else
1193*c0909341SAndroid Build Coastguard Worker%define m11 [rsp+ 9*16]
1194*c0909341SAndroid Build Coastguard Worker%define m12 [rsp+10*16]
1195*c0909341SAndroid Build Coastguard Worker%define m14 [rsp+12*16]
1196*c0909341SAndroid Build Coastguard Worker%define m15 [rsp+13*16]
1197*c0909341SAndroid Build Coastguard Worker    mova            m11, m3
1198*c0909341SAndroid Build Coastguard Worker    mova            m12, m4
1199*c0909341SAndroid Build Coastguard Worker    mova            m14, m6
1200*c0909341SAndroid Build Coastguard Worker    mova            m15, m7
1201*c0909341SAndroid Build Coastguard Worker%endif
1202*c0909341SAndroid Build Coastguard Worker
1203*c0909341SAndroid Build Coastguard Worker    ; cf from y=-3,x=-3 until y=-3,x=-2
1204*c0909341SAndroid Build Coastguard Worker    movu             m0, [fg_dataq+FGData.ar_coeffs_uv+uvq+ 0]
1205*c0909341SAndroid Build Coastguard Worker    pxor             m1, m1
1206*c0909341SAndroid Build Coastguard Worker    pcmpgtb          m1, m0
1207*c0909341SAndroid Build Coastguard Worker    punpckhbw        m2, m0, m1
1208*c0909341SAndroid Build Coastguard Worker    punpcklbw        m0, m1
1209*c0909341SAndroid Build Coastguard Worker    pshufd           m1, m0, q0000
1210*c0909341SAndroid Build Coastguard Worker    pshufd           m3, m0, q1111
1211*c0909341SAndroid Build Coastguard Worker    pshufd           m4, m0, q2222
1212*c0909341SAndroid Build Coastguard Worker    pshufd           m0, m0, q3333
1213*c0909341SAndroid Build Coastguard Worker    pshufd           m5, m2, q0000
1214*c0909341SAndroid Build Coastguard Worker    pshufd           m6, m2, q1111
1215*c0909341SAndroid Build Coastguard Worker    mova     [tmp+16*0], m1
1216*c0909341SAndroid Build Coastguard Worker    mova     [tmp+16*1], m3
1217*c0909341SAndroid Build Coastguard Worker    mova     [tmp+16*2], m4
1218*c0909341SAndroid Build Coastguard Worker    mova     [tmp+16*3], m0
1219*c0909341SAndroid Build Coastguard Worker    mova     [tmp+16*4], m5
1220*c0909341SAndroid Build Coastguard Worker    mova     [tmp+16*5], m6
1221*c0909341SAndroid Build Coastguard Worker    pshufd           m6, m2, q2222
1222*c0909341SAndroid Build Coastguard Worker    pshufd           m7, m2, q3333
1223*c0909341SAndroid Build Coastguard Worker
1224*c0909341SAndroid Build Coastguard Worker    ; cf from y=-1,x=-1 to y=0,x=-1 + luma component
1225*c0909341SAndroid Build Coastguard Worker    movu             m0, [fg_dataq+FGData.ar_coeffs_uv+uvq+16]
1226*c0909341SAndroid Build Coastguard Worker    pxor             m1, m1
1227*c0909341SAndroid Build Coastguard Worker    pcmpgtb          m1, m0
1228*c0909341SAndroid Build Coastguard Worker    punpckhbw        m2, m0, m1                 ; luma
1229*c0909341SAndroid Build Coastguard Worker    punpcklbw        m0, m1
1230*c0909341SAndroid Build Coastguard Worker    pshufd           m3, m0, q3232
1231*c0909341SAndroid Build Coastguard Worker    psrldq           m5, m0, 10
1232*c0909341SAndroid Build Coastguard Worker    ; y=0,x=[-3 to -1] + "1.0" for current pixel
1233*c0909341SAndroid Build Coastguard Worker    pinsrw           m5, [base+round_vals-10+shiftq*2], 3
1234*c0909341SAndroid Build Coastguard Worker    ; y=-1,x=[-1 to +2]
1235*c0909341SAndroid Build Coastguard Worker    pshufd           m1, m0, q0000
1236*c0909341SAndroid Build Coastguard Worker    pshufd           m0, m0, q1111
1237*c0909341SAndroid Build Coastguard Worker    ; y=-1,x=+3 + luma
1238*c0909341SAndroid Build Coastguard Worker    punpcklwd        m3, m2
1239*c0909341SAndroid Build Coastguard Worker    pshufd           m3, m3, q0000
1240*c0909341SAndroid Build Coastguard Worker
1241*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
1242*c0909341SAndroid Build Coastguard Worker    SWAP              1, 8
1243*c0909341SAndroid Build Coastguard Worker    SWAP              0, 9
1244*c0909341SAndroid Build Coastguard Worker    SWAP              3, 10
1245*c0909341SAndroid Build Coastguard Worker    SWAP              5, 13
1246*c0909341SAndroid Build Coastguard Worker    DEFINE_ARGS buf, bufy, fg_data, h, x
1247*c0909341SAndroid Build Coastguard Worker%else
1248*c0909341SAndroid Build Coastguard Worker%define m8  [rsp+ 6*16]
1249*c0909341SAndroid Build Coastguard Worker%define m9  [rsp+ 7*16]
1250*c0909341SAndroid Build Coastguard Worker%define m10 [rsp+ 8*16]
1251*c0909341SAndroid Build Coastguard Worker%define m13 [rsp+11*16]
1252*c0909341SAndroid Build Coastguard Worker    mova             m8, m1
1253*c0909341SAndroid Build Coastguard Worker    mova             m9, m0
1254*c0909341SAndroid Build Coastguard Worker    mova            m10, m3
1255*c0909341SAndroid Build Coastguard Worker    mova            m13, m5
1256*c0909341SAndroid Build Coastguard Worker    DEFINE_ARGS buf, bufy, pic_reg, fg_data, h, x
1257*c0909341SAndroid Build Coastguard Worker%endif
1258*c0909341SAndroid Build Coastguard Worker%if %2
1259*c0909341SAndroid Build Coastguard Worker    sub            bufq, 2*(82*(73-35*%3)+44-(82*3+41))
1260*c0909341SAndroid Build Coastguard Worker%else
1261*c0909341SAndroid Build Coastguard Worker    sub            bufq, 2*(82*69+3)
1262*c0909341SAndroid Build Coastguard Worker%endif
1263*c0909341SAndroid Build Coastguard Worker    add           bufyq, 2*(79+82*3)
1264*c0909341SAndroid Build Coastguard Worker    mov              hd, 70-35*%3
1265*c0909341SAndroid Build Coastguard Worker.y_loop_ar3:
1266*c0909341SAndroid Build Coastguard Worker    mov              xq, -(76>>%2)
1267*c0909341SAndroid Build Coastguard Worker
1268*c0909341SAndroid Build Coastguard Worker.x_loop_ar3:
1269*c0909341SAndroid Build Coastguard Worker    ; first line
1270*c0909341SAndroid Build Coastguard Worker    movu             m0, [bufq+xq*2-82*6-6+ 0]      ; y=-3,x=[-3,+4]
1271*c0909341SAndroid Build Coastguard Worker    movd             m1, [bufq+xq*2-82*6-6+16]      ; y=-3,x=[+5,+6]
1272*c0909341SAndroid Build Coastguard Worker    palignr          m2, m1, m0, 2                  ; y=-3,x=[-2,+5]
1273*c0909341SAndroid Build Coastguard Worker    palignr          m1, m1, m0, 12                 ; y=-3,x=[+3,+6]
1274*c0909341SAndroid Build Coastguard Worker    punpckhwd        m3, m0, m2                     ; y=-3,x=[+1/+2,+2/+3,+3/+4,+4/+5]
1275*c0909341SAndroid Build Coastguard Worker    punpcklwd        m0, m2                         ; y=-3,x=[-3/-2,-2/-1,-1/+0,+0/+1]
1276*c0909341SAndroid Build Coastguard Worker    shufps           m2, m0, m3, q1032              ; y=-3,x=[-1/+0,+0/+1,+1/+2,+2/+3]
1277*c0909341SAndroid Build Coastguard Worker
1278*c0909341SAndroid Build Coastguard Worker    pmaddwd          m0, [tmp+0*16]
1279*c0909341SAndroid Build Coastguard Worker    pmaddwd          m2, [tmp+1*16]
1280*c0909341SAndroid Build Coastguard Worker    pmaddwd          m3, [tmp+2*16]
1281*c0909341SAndroid Build Coastguard Worker    paddd            m0, m2
1282*c0909341SAndroid Build Coastguard Worker    paddd            m0, m3                         ; first 6 x of top y
1283*c0909341SAndroid Build Coastguard Worker
1284*c0909341SAndroid Build Coastguard Worker    ; second line [m0/1 are busy]
1285*c0909341SAndroid Build Coastguard Worker    movu             m2, [bufq+xq*2-82*4-6+ 0]      ; y=-2,x=[-3,+4]
1286*c0909341SAndroid Build Coastguard Worker    movd             m3, [bufq+xq*2-82*4-6+16]      ; y=-2,x=[+5,+6]
1287*c0909341SAndroid Build Coastguard Worker    punpcklwd        m1, m2                         ; y=-3/-2,x=[+3/-3,+4/-2,+5/-1,+6/+0]
1288*c0909341SAndroid Build Coastguard Worker    palignr          m4, m3, m2, 2                  ; y=-2,x=[-2,+5]
1289*c0909341SAndroid Build Coastguard Worker    palignr          m3, m3, m2, 4                  ; y=-2,x=[-2,+5]
1290*c0909341SAndroid Build Coastguard Worker    punpckhwd        m5, m4, m3                     ; y=-2,x=[+2/+3,+3/+4,+4/+5,+5/+6]
1291*c0909341SAndroid Build Coastguard Worker    punpcklwd        m4, m3                         ; y=-2,x=[-2/-1,-1/+0,+0/+1,+1/+2]
1292*c0909341SAndroid Build Coastguard Worker    shufps           m3, m4, m5, q1032              ; t=-2,x=[+0/+1,+1/+2,+2/+3,+3/+4]
1293*c0909341SAndroid Build Coastguard Worker    pmaddwd          m1, [tmp+3*16]
1294*c0909341SAndroid Build Coastguard Worker    pmaddwd          m4, [tmp+4*16]
1295*c0909341SAndroid Build Coastguard Worker    pmaddwd          m3, [tmp+5*16]
1296*c0909341SAndroid Build Coastguard Worker    pmaddwd          m5, m6
1297*c0909341SAndroid Build Coastguard Worker    paddd            m1, m4
1298*c0909341SAndroid Build Coastguard Worker    paddd            m3, m5
1299*c0909341SAndroid Build Coastguard Worker    paddd            m0, m1
1300*c0909341SAndroid Build Coastguard Worker    paddd            m0, m3                         ; top 2 lines
1301*c0909341SAndroid Build Coastguard Worker
1302*c0909341SAndroid Build Coastguard Worker    ; third line [m0 is busy] & luma + round
1303*c0909341SAndroid Build Coastguard Worker    movu             m1, [bufq+xq*2-82*2-6+ 0]      ; y=-1,x=[-3,+4]
1304*c0909341SAndroid Build Coastguard Worker    movd             m2, [bufq+xq*2-82*2-6+16]      ; y=-1,x=[+5,+6]
1305*c0909341SAndroid Build Coastguard Worker%if %2
1306*c0909341SAndroid Build Coastguard Worker    movu             m5, [bufyq+xq*4]
1307*c0909341SAndroid Build Coastguard Worker%if %3
1308*c0909341SAndroid Build Coastguard Worker    movu             m4, [bufyq+xq*4+82*2]
1309*c0909341SAndroid Build Coastguard Worker    phaddw           m5, m4
1310*c0909341SAndroid Build Coastguard Worker%else
1311*c0909341SAndroid Build Coastguard Worker    phaddw           m5, m5
1312*c0909341SAndroid Build Coastguard Worker%endif
1313*c0909341SAndroid Build Coastguard Worker%else
1314*c0909341SAndroid Build Coastguard Worker    movq             m5, [bufyq+xq*2]
1315*c0909341SAndroid Build Coastguard Worker%endif
1316*c0909341SAndroid Build Coastguard Worker    palignr          m3, m2, m1, 2                  ; y=-1,x=[-2,+5]
1317*c0909341SAndroid Build Coastguard Worker    palignr          m2, m2, m1, 12                 ; y=-1,x=[+3,+6]
1318*c0909341SAndroid Build Coastguard Worker%if %3
1319*c0909341SAndroid Build Coastguard Worker    pshufd           m4, m5, q3232
1320*c0909341SAndroid Build Coastguard Worker    paddw            m5, m4
1321*c0909341SAndroid Build Coastguard Worker%endif
1322*c0909341SAndroid Build Coastguard Worker%if %2
1323*c0909341SAndroid Build Coastguard Worker%if cpuflag(sse4)
1324*c0909341SAndroid Build Coastguard Worker    pmulhrsw         m5, m11
1325*c0909341SAndroid Build Coastguard Worker%elif %3
1326*c0909341SAndroid Build Coastguard Worker    pmulhrsw         m5, [base+pw_8192]
1327*c0909341SAndroid Build Coastguard Worker%else
1328*c0909341SAndroid Build Coastguard Worker    pmulhrsw         m5, [base+pw_16384]
1329*c0909341SAndroid Build Coastguard Worker%endif
1330*c0909341SAndroid Build Coastguard Worker%endif
1331*c0909341SAndroid Build Coastguard Worker    punpckhwd        m4, m1, m3                     ; y=-1,x=[+1/+2,+2/+3,+3/+4,+4/+5]
1332*c0909341SAndroid Build Coastguard Worker    punpcklwd        m1, m3                         ; y=-1,x=[-3/-2,-2/-1,-1/+0,+0/+1]
1333*c0909341SAndroid Build Coastguard Worker    shufps           m3, m1, m4, q1032              ; y=-1,x=[-1/+0,+0/+1,+1/+2,+2/+3]
1334*c0909341SAndroid Build Coastguard Worker    punpcklwd        m2, m5
1335*c0909341SAndroid Build Coastguard Worker    pmaddwd          m1, m7
1336*c0909341SAndroid Build Coastguard Worker    pmaddwd          m3, m8
1337*c0909341SAndroid Build Coastguard Worker    pmaddwd          m4, m9
1338*c0909341SAndroid Build Coastguard Worker    pmaddwd          m2, m10
1339*c0909341SAndroid Build Coastguard Worker    paddd            m1, m3
1340*c0909341SAndroid Build Coastguard Worker    paddd            m4, m2
1341*c0909341SAndroid Build Coastguard Worker    paddd            m0, m12                        ; += round
1342*c0909341SAndroid Build Coastguard Worker    paddd            m1, m4
1343*c0909341SAndroid Build Coastguard Worker    paddd            m0, m1
1344*c0909341SAndroid Build Coastguard Worker
1345*c0909341SAndroid Build Coastguard Worker    movu             m1, [bufq+xq*2-6]      ; y=0,x=[-3,+4]
1346*c0909341SAndroid Build Coastguard Worker.x_loop_ar3_inner:
1347*c0909341SAndroid Build Coastguard Worker    pmaddwd          m2, m1, m13
1348*c0909341SAndroid Build Coastguard Worker    pshufd           m3, m2, q1111
1349*c0909341SAndroid Build Coastguard Worker    paddd            m2, m3                 ; left+cur
1350*c0909341SAndroid Build Coastguard Worker    paddd            m2, m0                 ; add top
1351*c0909341SAndroid Build Coastguard Worker    psrldq           m0, 4
1352*c0909341SAndroid Build Coastguard Worker    psrad            m2, [fg_dataq+FGData.ar_coeff_shift]
1353*c0909341SAndroid Build Coastguard Worker    packssdw         m2, m2
1354*c0909341SAndroid Build Coastguard Worker    pminsw           m2, m14
1355*c0909341SAndroid Build Coastguard Worker    pmaxsw           m2, m15
1356*c0909341SAndroid Build Coastguard Worker    pslldq           m2, 4
1357*c0909341SAndroid Build Coastguard Worker    psrldq           m1, 2
1358*c0909341SAndroid Build Coastguard Worker%if cpuflag(sse4)
1359*c0909341SAndroid Build Coastguard Worker    pblendw          m1, m2, 00000100b
1360*c0909341SAndroid Build Coastguard Worker%else
1361*c0909341SAndroid Build Coastguard Worker    pand             m1, m11
1362*c0909341SAndroid Build Coastguard Worker    pandn            m3, m11, m2
1363*c0909341SAndroid Build Coastguard Worker    por              m1, m3
1364*c0909341SAndroid Build Coastguard Worker%endif
1365*c0909341SAndroid Build Coastguard Worker    ; overwrite previous pixels, should be ok
1366*c0909341SAndroid Build Coastguard Worker    movq  [bufq+xq*2-4], m1
1367*c0909341SAndroid Build Coastguard Worker    inc              xq
1368*c0909341SAndroid Build Coastguard Worker    jz .x_loop_ar3_end
1369*c0909341SAndroid Build Coastguard Worker    test             xq, 3
1370*c0909341SAndroid Build Coastguard Worker    jnz .x_loop_ar3_inner
1371*c0909341SAndroid Build Coastguard Worker    jmp .x_loop_ar3
1372*c0909341SAndroid Build Coastguard Worker
1373*c0909341SAndroid Build Coastguard Worker.x_loop_ar3_end:
1374*c0909341SAndroid Build Coastguard Worker    add            bufq, 82*2
1375*c0909341SAndroid Build Coastguard Worker    add           bufyq, 82*2<<%3
1376*c0909341SAndroid Build Coastguard Worker    dec              hd
1377*c0909341SAndroid Build Coastguard Worker    jg .y_loop_ar3
1378*c0909341SAndroid Build Coastguard Worker%if WIN64
1379*c0909341SAndroid Build Coastguard Worker    mov             rsp, r6
1380*c0909341SAndroid Build Coastguard Worker%elif ARCH_X86_32
1381*c0909341SAndroid Build Coastguard Worker%undef m8
1382*c0909341SAndroid Build Coastguard Worker%undef m9
1383*c0909341SAndroid Build Coastguard Worker%undef m10
1384*c0909341SAndroid Build Coastguard Worker%undef m11
1385*c0909341SAndroid Build Coastguard Worker%undef m12
1386*c0909341SAndroid Build Coastguard Worker%undef m13
1387*c0909341SAndroid Build Coastguard Worker%undef m14
1388*c0909341SAndroid Build Coastguard Worker%undef m15
1389*c0909341SAndroid Build Coastguard Worker%endif
1390*c0909341SAndroid Build Coastguard Worker    RET
1391*c0909341SAndroid Build Coastguard Worker%endmacro
1392*c0909341SAndroid Build Coastguard Worker
; Instantiate the chroma grain generator macro once per chroma layout.
; Arguments are (%1, %2, %3). As used in the .x_loop_ar3 section of the macro:
;   %2 != 0: luma is loaded from [bufyq+xq*4] and adjacent words are summed
;            pairwise (phaddw) before scaling with pmulhrsw;
;   %2 == 0: luma is loaded 1:1 from [bufyq+xq*2].
;   %3 != 0: the luma row 82*2 bytes further on is loaded and summed in too;
;            bufyq advances by 82*2<<%3 bytes per generated row.
; NOTE(review): %1 (420/422/444) is presumably the layout tag used in the
; generated function name; its use is outside the visible part - confirm.
1393*c0909341SAndroid Build Coastguard Workergenerate_grain_uv_fn 420, 1, 1
1394*c0909341SAndroid Build Coastguard Workergenerate_grain_uv_fn 422, 1, 0
1395*c0909341SAndroid Build Coastguard Workergenerate_grain_uv_fn 444, 0, 0
1396*c0909341SAndroid Build Coastguard Worker
; SCRATCH src, dst, slot
;   Makes the value currently held in m<src> available under the name m<dst>.
;     %1 = source register number
;     %2 = register number the value is referred to by afterwards
;     %3 = stack slot index, in units of mmsize (only used when ARCH_X86_32)
;   ARCH_X86_32: the value is stored to [rsp+slot*mmsize] and m<dst> is
;                redefined as that memory operand, so later uses of m<dst>
;                read it back from the stack. The caller must have reserved
;                that stack space.
;   otherwise:   no store is emitted; SWAP is invoked on the two numbers.
;                NOTE(review): SWAP is defined outside this chunk, presumably
;                the x86inc register-renaming macro - confirm.
;   Example from this file: "SCRATCH 1, 9, 0" lets later code use m9 for the
;   value computed in m1.
1397*c0909341SAndroid Build Coastguard Worker%macro SCRATCH 3
1398*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
1399*c0909341SAndroid Build Coastguard Worker    mova [rsp+%3*mmsize], m%1
; from here on m%2 expands to the stack slot just written (the %define remains
; in effect after the macro ends, until m%2 is redefined or %undef'd)
1400*c0909341SAndroid Build Coastguard Worker%define m%2 [rsp+%3*mmsize]
1401*c0909341SAndroid Build Coastguard Worker%else
1402*c0909341SAndroid Build Coastguard Worker    SWAP             %1, %2
1403*c0909341SAndroid Build Coastguard Worker%endif
1404*c0909341SAndroid Build Coastguard Worker%endmacro
1405*c0909341SAndroid Build Coastguard Worker
1406*c0909341SAndroid Build Coastguard WorkerINIT_XMM ssse3
1407*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
1408*c0909341SAndroid Build Coastguard Worker%if STACK_ALIGNMENT < mmsize
1409*c0909341SAndroid Build Coastguard Workercglobal fgy_32x32xn_16bpc, 0, 7, 8, 0-(8 * mmsize + 12 * gprsize), \
1410*c0909341SAndroid Build Coastguard Worker        dst, src, scaling, unused1, fg_data, picptr, unused2
1411*c0909341SAndroid Build Coastguard Worker    ; copy stack arguments to new position post-alignment, so that we
1412*c0909341SAndroid Build Coastguard Worker    ; don't have to keep the old stack location in a separate register
1413*c0909341SAndroid Build Coastguard Worker    mov              r0, r0m
1414*c0909341SAndroid Build Coastguard Worker    mov              r1, r2m
1415*c0909341SAndroid Build Coastguard Worker    mov              r2, r4m
1416*c0909341SAndroid Build Coastguard Worker    mov              r3, r6m
1417*c0909341SAndroid Build Coastguard Worker    mov              r4, r7m
1418*c0909341SAndroid Build Coastguard Worker    mov              r5, r8m
1419*c0909341SAndroid Build Coastguard Worker
1420*c0909341SAndroid Build Coastguard Worker%define r0m [rsp+8*mmsize+ 3*gprsize]
1421*c0909341SAndroid Build Coastguard Worker%define r2m [rsp+8*mmsize+ 5*gprsize]
1422*c0909341SAndroid Build Coastguard Worker%define r4m [rsp+8*mmsize+ 7*gprsize]
1423*c0909341SAndroid Build Coastguard Worker%define r6m [rsp+8*mmsize+ 9*gprsize]
1424*c0909341SAndroid Build Coastguard Worker%define r7m [rsp+8*mmsize+10*gprsize]
1425*c0909341SAndroid Build Coastguard Worker%define r8m [rsp+8*mmsize+11*gprsize]
1426*c0909341SAndroid Build Coastguard Worker
1427*c0909341SAndroid Build Coastguard Worker    mov             r0m, r0
1428*c0909341SAndroid Build Coastguard Worker    mov             r2m, r1
1429*c0909341SAndroid Build Coastguard Worker    mov             r4m, r2
1430*c0909341SAndroid Build Coastguard Worker    mov             r6m, r3
1431*c0909341SAndroid Build Coastguard Worker    mov             r7m, r4
1432*c0909341SAndroid Build Coastguard Worker    mov             r8m, r5
1433*c0909341SAndroid Build Coastguard Worker%else
1434*c0909341SAndroid Build Coastguard Workercglobal fgy_32x32xn_16bpc, 0, 7, 8, 8 * mmsize + 4 * gprsize, \
1435*c0909341SAndroid Build Coastguard Worker        dst, src, scaling, unused1, fg_data, picptr, unused2
1436*c0909341SAndroid Build Coastguard Worker%endif
1437*c0909341SAndroid Build Coastguard Worker    mov            srcq, srcm
1438*c0909341SAndroid Build Coastguard Worker    mov        scalingq, r5m
1439*c0909341SAndroid Build Coastguard Worker    mov        fg_dataq, r3m
1440*c0909341SAndroid Build Coastguard Worker%if STACK_ALIGNMENT < mmsize
1441*c0909341SAndroid Build Coastguard Worker    mov              r6, r9m
1442*c0909341SAndroid Build Coastguard Worker
1443*c0909341SAndroid Build Coastguard Worker%define r9m [rsp+8*mmsize+ 4*gprsize]
1444*c0909341SAndroid Build Coastguard Worker%define r3m [rsp+8*mmsize+ 6*gprsize]
1445*c0909341SAndroid Build Coastguard Worker%define r5m [rsp+8*mmsize+ 8*gprsize]
1446*c0909341SAndroid Build Coastguard Worker
1447*c0909341SAndroid Build Coastguard Worker    mov             r9m, r6
1448*c0909341SAndroid Build Coastguard Worker%endif
1449*c0909341SAndroid Build Coastguard Worker    LEA              r5, $$
1450*c0909341SAndroid Build Coastguard Worker%define base r5-$$
1451*c0909341SAndroid Build Coastguard Worker    mov             r5m, picptrq
1452*c0909341SAndroid Build Coastguard Worker%else
1453*c0909341SAndroid Build Coastguard Workercglobal fgy_32x32xn_16bpc, 6, 15, 16, dst, src, stride, fg_data, w, scaling, grain_lut
1454*c0909341SAndroid Build Coastguard Worker    lea              r8, [pb_mask]
1455*c0909341SAndroid Build Coastguard Worker%define base r8-pb_mask
1456*c0909341SAndroid Build Coastguard Worker%endif
1457*c0909341SAndroid Build Coastguard Worker    mov             r6d, [fg_dataq+FGData.scaling_shift]
1458*c0909341SAndroid Build Coastguard Worker    SPLATW           m3, [base+mul_bits+r6*2-14]
1459*c0909341SAndroid Build Coastguard Worker    mov             r6d, [fg_dataq+FGData.clip_to_restricted_range]
1460*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
1461*c0909341SAndroid Build Coastguard Worker    DECLARE_REG_TMP   0, 3
1462*c0909341SAndroid Build Coastguard Worker%else
1463*c0909341SAndroid Build Coastguard Worker    DECLARE_REG_TMP   9, 10
1464*c0909341SAndroid Build Coastguard Worker%endif
1465*c0909341SAndroid Build Coastguard Worker    mov             t0d, r9m        ; bdmax
1466*c0909341SAndroid Build Coastguard Worker    sar             t0d, 11         ; is_12bpc
1467*c0909341SAndroid Build Coastguard Worker    inc             t0d
1468*c0909341SAndroid Build Coastguard Worker    mov             t1d, r6d
1469*c0909341SAndroid Build Coastguard Worker    imul            t1d, t0d
1470*c0909341SAndroid Build Coastguard Worker    dec             t0d
1471*c0909341SAndroid Build Coastguard Worker    SPLATW           m5, [base+min+t1*2]
1472*c0909341SAndroid Build Coastguard Worker    lea             t0d, [t0d*3]
1473*c0909341SAndroid Build Coastguard Worker    lea             t0d, [r6d*2+t0d]
1474*c0909341SAndroid Build Coastguard Worker    SPLATW           m4, [base+max+t0*2]
1475*c0909341SAndroid Build Coastguard Worker    SPLATW           m2, r9m
1476*c0909341SAndroid Build Coastguard Worker
1477*c0909341SAndroid Build Coastguard Worker    pcmpeqw          m1, m1
1478*c0909341SAndroid Build Coastguard Worker    psraw            m7, m2, 1              ; max_grain
1479*c0909341SAndroid Build Coastguard Worker    pxor             m1, m7                 ; min_grain
1480*c0909341SAndroid Build Coastguard Worker    SPLATD           m6, [base+pd_16]
1481*c0909341SAndroid Build Coastguard Worker
1482*c0909341SAndroid Build Coastguard Worker    SCRATCH           1,  9, 0
1483*c0909341SAndroid Build Coastguard Worker    SCRATCH           2, 10, 1
1484*c0909341SAndroid Build Coastguard Worker    SCRATCH           3, 11, 2
1485*c0909341SAndroid Build Coastguard Worker    SCRATCH           4, 12, 3
1486*c0909341SAndroid Build Coastguard Worker    SCRATCH           5, 13, 4
1487*c0909341SAndroid Build Coastguard Worker    SCRATCH           6, 14, 5
1488*c0909341SAndroid Build Coastguard Worker    SCRATCH           7, 15, 6
1489*c0909341SAndroid Build Coastguard Worker
1490*c0909341SAndroid Build Coastguard Worker    mova             m6, [base+pw_27_17_17_27]   ; for horizontal filter
1491*c0909341SAndroid Build Coastguard Worker
1492*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
1493*c0909341SAndroid Build Coastguard Worker    DEFINE_ARGS dst, src, scaling, sby, fg_data, picptr, unused2
1494*c0909341SAndroid Build Coastguard Worker    DECLARE_REG_TMP   0
1495*c0909341SAndroid Build Coastguard Worker%else
1496*c0909341SAndroid Build Coastguard Worker    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, unused1, \
1497*c0909341SAndroid Build Coastguard Worker                sby, see
1498*c0909341SAndroid Build Coastguard Worker    DECLARE_REG_TMP   7
1499*c0909341SAndroid Build Coastguard Worker%endif
1500*c0909341SAndroid Build Coastguard Worker
1501*c0909341SAndroid Build Coastguard Worker    mov            sbyd, r8m
1502*c0909341SAndroid Build Coastguard Worker    movzx           t0d, byte [fg_dataq+FGData.overlap_flag]
1503*c0909341SAndroid Build Coastguard Worker    test            t0d, t0d
1504*c0909341SAndroid Build Coastguard Worker    jz .no_vertical_overlap
1505*c0909341SAndroid Build Coastguard Worker    test           sbyd, sbyd
1506*c0909341SAndroid Build Coastguard Worker    jnz .vertical_overlap
1507*c0909341SAndroid Build Coastguard Worker.no_vertical_overlap:
1508*c0909341SAndroid Build Coastguard Worker    mov       dword r8m, t0d
1509*c0909341SAndroid Build Coastguard Worker
1510*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
1511*c0909341SAndroid Build Coastguard Worker    DEFINE_ARGS dst, src, scaling, see, fg_data, picptr, unused
1512*c0909341SAndroid Build Coastguard Worker    imul           seed, (173 << 24) | 37
1513*c0909341SAndroid Build Coastguard Worker%else
1514*c0909341SAndroid Build Coastguard Worker    imul           seed, sbyd, (173 << 24) | 37
1515*c0909341SAndroid Build Coastguard Worker%endif
1516*c0909341SAndroid Build Coastguard Worker    add            seed, (105 << 24) | 178
1517*c0909341SAndroid Build Coastguard Worker    rol            seed, 8
1518*c0909341SAndroid Build Coastguard Worker    movzx          seed, seew
1519*c0909341SAndroid Build Coastguard Worker    xor            seed, [fg_dataq+FGData.seed]
1520*c0909341SAndroid Build Coastguard Worker
1521*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
1522*c0909341SAndroid Build Coastguard Worker    DEFINE_ARGS dst, src, scaling, see, w, picptr, src_bak
1523*c0909341SAndroid Build Coastguard Worker
1524*c0909341SAndroid Build Coastguard Worker    mov             r3m, seed
1525*c0909341SAndroid Build Coastguard Worker    mov              wq, r4m
1526*c0909341SAndroid Build Coastguard Worker%else
1527*c0909341SAndroid Build Coastguard Worker    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
1528*c0909341SAndroid Build Coastguard Worker                unused1, unused2, see, src_bak
1529*c0909341SAndroid Build Coastguard Worker%endif
1530*c0909341SAndroid Build Coastguard Worker
1531*c0909341SAndroid Build Coastguard Worker    lea        src_bakq, [srcq+wq*2]
1532*c0909341SAndroid Build Coastguard Worker    mov            r9mp, src_bakq
1533*c0909341SAndroid Build Coastguard Worker    neg              wq
1534*c0909341SAndroid Build Coastguard Worker    sub           dstmp, srcq
1535*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
1536*c0909341SAndroid Build Coastguard Worker    mov             r4m, wq
1537*c0909341SAndroid Build Coastguard Worker%endif
1538*c0909341SAndroid Build Coastguard Worker
1539*c0909341SAndroid Build Coastguard Worker.loop_x:
1540*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
1541*c0909341SAndroid Build Coastguard Worker    mov            seed, r3m
1542*c0909341SAndroid Build Coastguard Worker%endif
1543*c0909341SAndroid Build Coastguard Worker    mov             r6d, seed
1544*c0909341SAndroid Build Coastguard Worker    or             seed, 0xEFF4
1545*c0909341SAndroid Build Coastguard Worker    shr             r6d, 1
1546*c0909341SAndroid Build Coastguard Worker    test           seeb, seeh
1547*c0909341SAndroid Build Coastguard Worker    lea            seed, [r6+0x8000]
1548*c0909341SAndroid Build Coastguard Worker    cmovp          seed, r6d                ; updated seed
1549*c0909341SAndroid Build Coastguard Worker
1550*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
1551*c0909341SAndroid Build Coastguard Worker    mov             r3m, seed
1552*c0909341SAndroid Build Coastguard Worker
1553*c0909341SAndroid Build Coastguard Worker    DEFINE_ARGS dst, src, scaling, offy, unused1, unused2, offx
1554*c0909341SAndroid Build Coastguard Worker
1555*c0909341SAndroid Build Coastguard Worker    mov           offxd, offyd
1556*c0909341SAndroid Build Coastguard Worker%else
1557*c0909341SAndroid Build Coastguard Worker    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
1558*c0909341SAndroid Build Coastguard Worker                offx, offy, see, src_bak
1559*c0909341SAndroid Build Coastguard Worker
1560*c0909341SAndroid Build Coastguard Worker    mov           offyd, seed
1561*c0909341SAndroid Build Coastguard Worker    mov           offxd, seed
1562*c0909341SAndroid Build Coastguard Worker%endif
1563*c0909341SAndroid Build Coastguard Worker    ror           offyd, 8
1564*c0909341SAndroid Build Coastguard Worker    shr           offxd, 12
1565*c0909341SAndroid Build Coastguard Worker    and           offyd, 0xf
1566*c0909341SAndroid Build Coastguard Worker    imul          offyd, 164
1567*c0909341SAndroid Build Coastguard Worker    lea           offyq, [offyq+offxq*2+747] ; offy*stride+offx
1568*c0909341SAndroid Build Coastguard Worker
1569*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
1570*c0909341SAndroid Build Coastguard Worker    DEFINE_ARGS dst, src, scaling, offxy, h, picptr, grain_lut
1571*c0909341SAndroid Build Coastguard Worker%else
1572*c0909341SAndroid Build Coastguard Worker    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
1573*c0909341SAndroid Build Coastguard Worker                h, offxy, see, src_bak
1574*c0909341SAndroid Build Coastguard Worker%endif
1575*c0909341SAndroid Build Coastguard Worker
1576*c0909341SAndroid Build Coastguard Worker.loop_x_odd:
1577*c0909341SAndroid Build Coastguard Worker    movzx            hd, word r7m
1578*c0909341SAndroid Build Coastguard Worker    mov      grain_lutq, grain_lutmp
1579*c0909341SAndroid Build Coastguard Worker.loop_y:
1580*c0909341SAndroid Build Coastguard Worker    ; src
1581*c0909341SAndroid Build Coastguard Worker    pand             m0, m10, [srcq+ 0]
1582*c0909341SAndroid Build Coastguard Worker    pand             m1, m10, [srcq+16]          ; m0-1: src as word
1583*c0909341SAndroid Build Coastguard Worker
1584*c0909341SAndroid Build Coastguard Worker    ; scaling[src]
1585*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
1586*c0909341SAndroid Build Coastguard Worker    vpgatherdw       m2, m0, scalingq-1, r0, r5, 8, 1, m4
1587*c0909341SAndroid Build Coastguard Worker    vpgatherdw       m3, m1, scalingq-1, r0, r5, 8, 1, m4
1588*c0909341SAndroid Build Coastguard Worker%else
1589*c0909341SAndroid Build Coastguard Worker    vpgatherdw       m2, m0, scalingq-1, r11, r13, 8, 1, m4
1590*c0909341SAndroid Build Coastguard Worker    vpgatherdw       m3, m1, scalingq-1, r11, r13, 8, 1, m4
1591*c0909341SAndroid Build Coastguard Worker%endif
1592*c0909341SAndroid Build Coastguard Worker    REPX   {psrlw x, 8}, m2, m3
1593*c0909341SAndroid Build Coastguard Worker
1594*c0909341SAndroid Build Coastguard Worker    ; grain = grain_lut[offy+y][offx+x]
1595*c0909341SAndroid Build Coastguard Worker    movu             m4, [grain_lutq+offxyq*2]
1596*c0909341SAndroid Build Coastguard Worker    movu             m5, [grain_lutq+offxyq*2+16]
1597*c0909341SAndroid Build Coastguard Worker
1598*c0909341SAndroid Build Coastguard Worker    ; noise = round2(scaling[src] * grain, scaling_shift)
1599*c0909341SAndroid Build Coastguard Worker    REPX {pmullw x, m11}, m2, m3
1600*c0909341SAndroid Build Coastguard Worker    pmulhrsw         m4, m2
1601*c0909341SAndroid Build Coastguard Worker    pmulhrsw         m5, m3
1602*c0909341SAndroid Build Coastguard Worker
1603*c0909341SAndroid Build Coastguard Worker    ; dst = clip_pixel(src, noise)
1604*c0909341SAndroid Build Coastguard Worker    paddw            m0, m4
1605*c0909341SAndroid Build Coastguard Worker    paddw            m1, m5
1606*c0909341SAndroid Build Coastguard Worker    pmaxsw           m0, m13
1607*c0909341SAndroid Build Coastguard Worker    pmaxsw           m1, m13
1608*c0909341SAndroid Build Coastguard Worker    pminsw           m0, m12
1609*c0909341SAndroid Build Coastguard Worker    pminsw           m1, m12
1610*c0909341SAndroid Build Coastguard Worker    movifnidn      dstq, dstmp
1611*c0909341SAndroid Build Coastguard Worker    mova [dstq+srcq+ 0], m0
1612*c0909341SAndroid Build Coastguard Worker    mova [dstq+srcq+16], m1
1613*c0909341SAndroid Build Coastguard Worker
1614*c0909341SAndroid Build Coastguard Worker    add            srcq, r2mp               ; src += stride
1615*c0909341SAndroid Build Coastguard Worker    add      grain_lutq, 82*2
1616*c0909341SAndroid Build Coastguard Worker    dec              hd
1617*c0909341SAndroid Build Coastguard Worker    jg .loop_y
1618*c0909341SAndroid Build Coastguard Worker
1619*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
1620*c0909341SAndroid Build Coastguard Worker    add            r4mp, 16
1621*c0909341SAndroid Build Coastguard Worker%else
1622*c0909341SAndroid Build Coastguard Worker    add              wq, 16
1623*c0909341SAndroid Build Coastguard Worker%endif
1624*c0909341SAndroid Build Coastguard Worker    jge .end
1625*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
1626*c0909341SAndroid Build Coastguard Worker    mov            srcq, r9mp
1627*c0909341SAndroid Build Coastguard Worker    add            srcq, r4mp
1628*c0909341SAndroid Build Coastguard Worker    add            srcq, r4mp
1629*c0909341SAndroid Build Coastguard Worker%else
1630*c0909341SAndroid Build Coastguard Worker    mov        src_bakq, r9mp
1631*c0909341SAndroid Build Coastguard Worker    lea            srcq, [src_bakq+wq*2]
1632*c0909341SAndroid Build Coastguard Worker%endif
1633*c0909341SAndroid Build Coastguard Worker    btc       dword r8m, 2
1634*c0909341SAndroid Build Coastguard Worker    jc .next_blk
1635*c0909341SAndroid Build Coastguard Worker    add          offxyd, 16
1636*c0909341SAndroid Build Coastguard Worker    test      dword r8m, 2
1637*c0909341SAndroid Build Coastguard Worker    jz .loop_x_odd
1638*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
1639*c0909341SAndroid Build Coastguard Worker    add dword [rsp+8*mmsize+1*gprsize], 16
1640*c0909341SAndroid Build Coastguard Worker%else
1641*c0909341SAndroid Build Coastguard Worker    add            r12d, 16                 ; top_offxy += 16
1642*c0909341SAndroid Build Coastguard Worker%endif
1643*c0909341SAndroid Build Coastguard Worker    jmp .loop_x_odd_v_overlap
1644*c0909341SAndroid Build Coastguard Worker
1645*c0909341SAndroid Build Coastguard Worker.next_blk:
1646*c0909341SAndroid Build Coastguard Worker    test      dword r8m, 1
1647*c0909341SAndroid Build Coastguard Worker    jz .loop_x
1648*c0909341SAndroid Build Coastguard Worker
1649*c0909341SAndroid Build Coastguard Worker    ; r8m = sbym
1650*c0909341SAndroid Build Coastguard Worker    test      dword r8m, 2
1651*c0909341SAndroid Build Coastguard Worker    jnz .loop_x_hv_overlap
1652*c0909341SAndroid Build Coastguard Worker
1653*c0909341SAndroid Build Coastguard Worker    ; horizontal overlap (without vertical overlap)
1654*c0909341SAndroid Build Coastguard Worker.loop_x_h_overlap:
1655*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
1656*c0909341SAndroid Build Coastguard Worker    add          offxyd, 16
1657*c0909341SAndroid Build Coastguard Worker    mov [rsp+8*mmsize+0*gprsize], offxyd
1658*c0909341SAndroid Build Coastguard Worker    DEFINE_ARGS dst, src, scaling, see, w, picptr, src_bak
1659*c0909341SAndroid Build Coastguard Worker    mov            seed, r3m
1660*c0909341SAndroid Build Coastguard Worker%endif
1661*c0909341SAndroid Build Coastguard Worker
1662*c0909341SAndroid Build Coastguard Worker    mov             r6d, seed
1663*c0909341SAndroid Build Coastguard Worker    or             seed, 0xEFF4
1664*c0909341SAndroid Build Coastguard Worker    shr             r6d, 1
1665*c0909341SAndroid Build Coastguard Worker    test           seeb, seeh
1666*c0909341SAndroid Build Coastguard Worker    lea            seed, [r6+0x8000]
1667*c0909341SAndroid Build Coastguard Worker    cmovp          seed, r6d                ; updated seed
1668*c0909341SAndroid Build Coastguard Worker
1669*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
1670*c0909341SAndroid Build Coastguard Worker    mov             r3m, seed
1671*c0909341SAndroid Build Coastguard Worker
1672*c0909341SAndroid Build Coastguard Worker    DEFINE_ARGS dst, src, scaling, offy, h, picptr, offx
1673*c0909341SAndroid Build Coastguard Worker
1674*c0909341SAndroid Build Coastguard Worker    mov           offxd, offyd
1675*c0909341SAndroid Build Coastguard Worker%else
1676*c0909341SAndroid Build Coastguard Worker    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
1677*c0909341SAndroid Build Coastguard Worker                offx, offy, see, src_bak, left_offxy
1678*c0909341SAndroid Build Coastguard Worker
1679*c0909341SAndroid Build Coastguard Worker    lea     left_offxyd, [offyd+16]         ; previous column's offy*stride+offx
1680*c0909341SAndroid Build Coastguard Worker
1681*c0909341SAndroid Build Coastguard Worker    mov           offyd, seed
1682*c0909341SAndroid Build Coastguard Worker    mov           offxd, seed
1683*c0909341SAndroid Build Coastguard Worker%endif
1684*c0909341SAndroid Build Coastguard Worker    ror           offyd, 8
1685*c0909341SAndroid Build Coastguard Worker    shr           offxd, 12
1686*c0909341SAndroid Build Coastguard Worker    and           offyd, 0xf
1687*c0909341SAndroid Build Coastguard Worker    imul          offyd, 164
1688*c0909341SAndroid Build Coastguard Worker    lea           offyq, [offyq+offxq*2+747] ; offy*stride+offx
1689*c0909341SAndroid Build Coastguard Worker
1690*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
1691*c0909341SAndroid Build Coastguard Worker    DEFINE_ARGS dst, src, scaling, offxy, h, picptr, grain_lut
1692*c0909341SAndroid Build Coastguard Worker%else
1693*c0909341SAndroid Build Coastguard Worker    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
1694*c0909341SAndroid Build Coastguard Worker                h, offxy, see, src_bak, left_offxy
1695*c0909341SAndroid Build Coastguard Worker%endif
1696*c0909341SAndroid Build Coastguard Worker
1697*c0909341SAndroid Build Coastguard Worker    mov              hd, dword r7m
1698*c0909341SAndroid Build Coastguard Worker    mov      grain_lutq, grain_lutmp
1699*c0909341SAndroid Build Coastguard Worker.loop_y_h_overlap:
1700*c0909341SAndroid Build Coastguard Worker    ; grain = grain_lut[offy+y][offx+x]
1701*c0909341SAndroid Build Coastguard Worker    movu             m5, [grain_lutq+offxyq*2]
1702*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
1703*c0909341SAndroid Build Coastguard Worker    mov              r5, [rsp+8*mmsize+0*gprsize]
1704*c0909341SAndroid Build Coastguard Worker    movd             m4, [grain_lutq+r5*2]
1705*c0909341SAndroid Build Coastguard Worker%else
1706*c0909341SAndroid Build Coastguard Worker    movd             m4, [grain_lutq+left_offxyq*2]
1707*c0909341SAndroid Build Coastguard Worker%endif
1708*c0909341SAndroid Build Coastguard Worker    punpcklwd        m4, m5
1709*c0909341SAndroid Build Coastguard Worker    pmaddwd          m4, m6
1710*c0909341SAndroid Build Coastguard Worker    paddd            m4, m14
1711*c0909341SAndroid Build Coastguard Worker    psrad            m4, 5
1712*c0909341SAndroid Build Coastguard Worker    packssdw         m4, m4
1713*c0909341SAndroid Build Coastguard Worker    pminsw           m4, m15
1714*c0909341SAndroid Build Coastguard Worker    pmaxsw           m4, m9
1715*c0909341SAndroid Build Coastguard Worker    shufps           m4, m5, q3210
1716*c0909341SAndroid Build Coastguard Worker
1717*c0909341SAndroid Build Coastguard Worker    ; src
1718*c0909341SAndroid Build Coastguard Worker    pand             m0, m10, [srcq+ 0]
1719*c0909341SAndroid Build Coastguard Worker    pand             m1, m10, [srcq+16]          ; m0-1: src as word
1720*c0909341SAndroid Build Coastguard Worker
1721*c0909341SAndroid Build Coastguard Worker    ; scaling[src]
1722*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
1723*c0909341SAndroid Build Coastguard Worker    vpgatherdw       m2, m0, scalingq-1, r0, r5, 8, 1, m5
1724*c0909341SAndroid Build Coastguard Worker    vpgatherdw       m3, m1, scalingq-1, r0, r5, 8, 1, m5
1725*c0909341SAndroid Build Coastguard Worker%else
1726*c0909341SAndroid Build Coastguard Worker    vpgatherdw       m2, m0, scalingq-1, r13, r14, 8, 1, m5
1727*c0909341SAndroid Build Coastguard Worker    vpgatherdw       m3, m1, scalingq-1, r13, r14, 8, 1, m5
1728*c0909341SAndroid Build Coastguard Worker%endif
1729*c0909341SAndroid Build Coastguard Worker    REPX   {psrlw x, 8}, m2, m3
1730*c0909341SAndroid Build Coastguard Worker
1731*c0909341SAndroid Build Coastguard Worker    ; noise = round2(scaling[src] * grain, scaling_shift)
1732*c0909341SAndroid Build Coastguard Worker    movu             m5, [grain_lutq+offxyq*2+16]
1733*c0909341SAndroid Build Coastguard Worker    REPX {pmullw x, m11}, m2, m3
1734*c0909341SAndroid Build Coastguard Worker    pmulhrsw         m4, m2
1735*c0909341SAndroid Build Coastguard Worker    pmulhrsw         m5, m3
1736*c0909341SAndroid Build Coastguard Worker
1737*c0909341SAndroid Build Coastguard Worker    ; dst = clip_pixel(src, noise)
1738*c0909341SAndroid Build Coastguard Worker    paddw            m0, m4
1739*c0909341SAndroid Build Coastguard Worker    paddw            m1, m5
1740*c0909341SAndroid Build Coastguard Worker    pmaxsw           m0, m13
1741*c0909341SAndroid Build Coastguard Worker    pmaxsw           m1, m13
1742*c0909341SAndroid Build Coastguard Worker    pminsw           m0, m12
1743*c0909341SAndroid Build Coastguard Worker    pminsw           m1, m12
1744*c0909341SAndroid Build Coastguard Worker    movifnidn      dstq, dstmp
1745*c0909341SAndroid Build Coastguard Worker    mova [dstq+srcq+ 0], m0
1746*c0909341SAndroid Build Coastguard Worker    mova [dstq+srcq+16], m1
1747*c0909341SAndroid Build Coastguard Worker
1748*c0909341SAndroid Build Coastguard Worker    add            srcq, r2mp
1749*c0909341SAndroid Build Coastguard Worker    add      grain_lutq, 82*2
1750*c0909341SAndroid Build Coastguard Worker    dec              hd
1751*c0909341SAndroid Build Coastguard Worker    jg .loop_y_h_overlap
1752*c0909341SAndroid Build Coastguard Worker
1753*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
1754*c0909341SAndroid Build Coastguard Worker    add            r4mp, 16
1755*c0909341SAndroid Build Coastguard Worker%else
1756*c0909341SAndroid Build Coastguard Worker    add              wq, 16
1757*c0909341SAndroid Build Coastguard Worker%endif
1758*c0909341SAndroid Build Coastguard Worker    jge .end
1759*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
1760*c0909341SAndroid Build Coastguard Worker    mov            srcq, r9mp
1761*c0909341SAndroid Build Coastguard Worker    add            srcq, r4mp
1762*c0909341SAndroid Build Coastguard Worker    add            srcq, r4mp
1763*c0909341SAndroid Build Coastguard Worker%else
1764*c0909341SAndroid Build Coastguard Worker    mov        src_bakq, r9mp
1765*c0909341SAndroid Build Coastguard Worker    lea            srcq, [src_bakq+wq*2]
1766*c0909341SAndroid Build Coastguard Worker%endif
1767*c0909341SAndroid Build Coastguard Worker    or        dword r8m, 4
1768*c0909341SAndroid Build Coastguard Worker    add          offxyd, 16
1769*c0909341SAndroid Build Coastguard Worker
1770*c0909341SAndroid Build Coastguard Worker    ; r8m = sbym
1771*c0909341SAndroid Build Coastguard Worker    test      dword r8m, 2
1772*c0909341SAndroid Build Coastguard Worker    jz .loop_x_odd
1773*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
1774*c0909341SAndroid Build Coastguard Worker    add dword [rsp+8*mmsize+1*gprsize], 16
1775*c0909341SAndroid Build Coastguard Worker%else
1776*c0909341SAndroid Build Coastguard Worker    add            r12d, 16                 ; top_offxy += 16
1777*c0909341SAndroid Build Coastguard Worker%endif
1778*c0909341SAndroid Build Coastguard Worker    jmp .loop_x_odd_v_overlap
1779*c0909341SAndroid Build Coastguard Worker
1780*c0909341SAndroid Build Coastguard Worker.end:
1781*c0909341SAndroid Build Coastguard Worker    RET
1782*c0909341SAndroid Build Coastguard Worker
1783*c0909341SAndroid Build Coastguard Worker.vertical_overlap:
1784*c0909341SAndroid Build Coastguard Worker    or              t0d, 2
1785*c0909341SAndroid Build Coastguard Worker    mov             r8m, t0d
1786*c0909341SAndroid Build Coastguard Worker
1787*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
1788*c0909341SAndroid Build Coastguard Worker    DEFINE_ARGS dst, src, scaling, sby, fg_data, picptr, unused
1789*c0909341SAndroid Build Coastguard Worker%else
1790*c0909341SAndroid Build Coastguard Worker    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, unused1, \
1791*c0909341SAndroid Build Coastguard Worker                sby, see
1792*c0909341SAndroid Build Coastguard Worker%endif
1793*c0909341SAndroid Build Coastguard Worker
1794*c0909341SAndroid Build Coastguard Worker    movzx          sbyd, sbyb
1795*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
1796*c0909341SAndroid Build Coastguard Worker    imul             r4, [fg_dataq+FGData.seed], 0x00010001
1797*c0909341SAndroid Build Coastguard Worker    DEFINE_ARGS dst, src, scaling, sby, see, picptr, unused
1798*c0909341SAndroid Build Coastguard Worker%else
1799*c0909341SAndroid Build Coastguard Worker    imul           seed, [fg_dataq+FGData.seed], 0x00010001
1800*c0909341SAndroid Build Coastguard Worker%endif
1801*c0909341SAndroid Build Coastguard Worker    imul            t0d, sbyd, 173 * 0x00010001
1802*c0909341SAndroid Build Coastguard Worker    imul           sbyd, 37 * 0x01000100
1803*c0909341SAndroid Build Coastguard Worker    add             t0d, (105 << 16) | 188
1804*c0909341SAndroid Build Coastguard Worker    add            sbyd, (178 << 24) | (141 << 8)
1805*c0909341SAndroid Build Coastguard Worker    and             t0d, 0x00ff00ff
1806*c0909341SAndroid Build Coastguard Worker    and            sbyd, 0xff00ff00
1807*c0909341SAndroid Build Coastguard Worker    xor            seed, t0d
1808*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
1809*c0909341SAndroid Build Coastguard Worker    xor            sbyd, seed
1810*c0909341SAndroid Build Coastguard Worker
1811*c0909341SAndroid Build Coastguard Worker    DEFINE_ARGS dst, src, scaling, see, w, picptr, src_bak
1812*c0909341SAndroid Build Coastguard Worker
1813*c0909341SAndroid Build Coastguard Worker    mov             r3m, seed
1814*c0909341SAndroid Build Coastguard Worker    mov              wq, r4m
1815*c0909341SAndroid Build Coastguard Worker%else
1816*c0909341SAndroid Build Coastguard Worker    xor            seed, sbyd               ; (cur_seed << 16) | top_seed
1817*c0909341SAndroid Build Coastguard Worker
1818*c0909341SAndroid Build Coastguard Worker    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
1819*c0909341SAndroid Build Coastguard Worker                unused1, unused2, see, src_bak
1820*c0909341SAndroid Build Coastguard Worker%endif
1821*c0909341SAndroid Build Coastguard Worker
1822*c0909341SAndroid Build Coastguard Worker    lea        src_bakq, [srcq+wq*2]
1823*c0909341SAndroid Build Coastguard Worker    mov            r9mp, src_bakq
1824*c0909341SAndroid Build Coastguard Worker    neg              wq
1825*c0909341SAndroid Build Coastguard Worker    sub           dstmp, srcq
1826*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
1827*c0909341SAndroid Build Coastguard Worker    mov             r4m, wq
1828*c0909341SAndroid Build Coastguard Worker%endif
1829*c0909341SAndroid Build Coastguard Worker
1830*c0909341SAndroid Build Coastguard Worker.loop_x_v_overlap:
1831*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
1832*c0909341SAndroid Build Coastguard Worker    mov              r5, r5m
1833*c0909341SAndroid Build Coastguard Worker    SPLATD           m7, [base+pw_27_17_17_27]
1834*c0909341SAndroid Build Coastguard Worker    mov            seed, r3m
1835*c0909341SAndroid Build Coastguard Worker%else
1836*c0909341SAndroid Build Coastguard Worker    SPLATD           m7, [pw_27_17_17_27]
1837*c0909341SAndroid Build Coastguard Worker%endif
1838*c0909341SAndroid Build Coastguard Worker
1839*c0909341SAndroid Build Coastguard Worker    ; we assume from the block above that bits 8-15 of r7d are zeroed
1840*c0909341SAndroid Build Coastguard Worker    mov             r6d, seed
1841*c0909341SAndroid Build Coastguard Worker    or             seed, 0xeff4eff4
1842*c0909341SAndroid Build Coastguard Worker    test           seeb, seeh
1843*c0909341SAndroid Build Coastguard Worker    setp            t0b                     ; parity of top_seed
1844*c0909341SAndroid Build Coastguard Worker    shr            seed, 16
1845*c0909341SAndroid Build Coastguard Worker    shl             t0d, 16
1846*c0909341SAndroid Build Coastguard Worker    test           seeb, seeh
1847*c0909341SAndroid Build Coastguard Worker    setp            t0b                     ; parity of cur_seed
1848*c0909341SAndroid Build Coastguard Worker    or              r6d, 0x00010001
1849*c0909341SAndroid Build Coastguard Worker    xor             t0d, r6d
1850*c0909341SAndroid Build Coastguard Worker    mov            seed, t0d
1851*c0909341SAndroid Build Coastguard Worker    ror            seed, 1                  ; updated (cur_seed << 16) | top_seed
1852*c0909341SAndroid Build Coastguard Worker
1853*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
1854*c0909341SAndroid Build Coastguard Worker    mov             r3m, seed
1855*c0909341SAndroid Build Coastguard Worker
1856*c0909341SAndroid Build Coastguard Worker    DEFINE_ARGS dst, src, scaling, offy, unused1, unused2, offx
1857*c0909341SAndroid Build Coastguard Worker
1858*c0909341SAndroid Build Coastguard Worker    mov           offxd, offyd
1859*c0909341SAndroid Build Coastguard Worker%else
1860*c0909341SAndroid Build Coastguard Worker    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
1861*c0909341SAndroid Build Coastguard Worker                offx, offy, see, src_bak, unused, top_offxy
1862*c0909341SAndroid Build Coastguard Worker
1863*c0909341SAndroid Build Coastguard Worker    mov           offyd, seed
1864*c0909341SAndroid Build Coastguard Worker    mov           offxd, seed
1865*c0909341SAndroid Build Coastguard Worker%endif
1866*c0909341SAndroid Build Coastguard Worker    ror           offyd, 8
1867*c0909341SAndroid Build Coastguard Worker    ror           offxd, 12
1868*c0909341SAndroid Build Coastguard Worker    and           offyd, 0xf000f
1869*c0909341SAndroid Build Coastguard Worker    and           offxd, 0xf000f
1870*c0909341SAndroid Build Coastguard Worker    imul          offyd, 164
1871*c0909341SAndroid Build Coastguard Worker    ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy
1872*c0909341SAndroid Build Coastguard Worker    lea           offyq, [offyq+offxq*2+0x10001*747+32*82]
1873*c0909341SAndroid Build Coastguard Worker
1874*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
1875*c0909341SAndroid Build Coastguard Worker    DEFINE_ARGS top_offxy, src, scaling, offxy, h, picptr, grain_lut
1876*c0909341SAndroid Build Coastguard Worker%else
1877*c0909341SAndroid Build Coastguard Worker    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
1878*c0909341SAndroid Build Coastguard Worker                h, offxy, see, src_bak, unused, top_offxy
1879*c0909341SAndroid Build Coastguard Worker%endif
1880*c0909341SAndroid Build Coastguard Worker
1881*c0909341SAndroid Build Coastguard Worker    movzx    top_offxyd, offxyw
1882*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
1883*c0909341SAndroid Build Coastguard Worker    mov [rsp+8*mmsize+1*gprsize], top_offxyd
1884*c0909341SAndroid Build Coastguard Worker
1885*c0909341SAndroid Build Coastguard Worker    DEFINE_ARGS dst, src, scaling, offxy, h, picptr, grain_lut
1886*c0909341SAndroid Build Coastguard Worker%endif
1887*c0909341SAndroid Build Coastguard Worker    shr          offxyd, 16
1888*c0909341SAndroid Build Coastguard Worker
1889*c0909341SAndroid Build Coastguard Worker.loop_x_odd_v_overlap:
1890*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
1891*c0909341SAndroid Build Coastguard Worker    mov              r5, r5m
1892*c0909341SAndroid Build Coastguard Worker%endif
1893*c0909341SAndroid Build Coastguard Worker    SPLATD           m7, [PIC_ptr(pw_27_17_17_27)]
1894*c0909341SAndroid Build Coastguard Worker    mov              hd, dword r7m
1895*c0909341SAndroid Build Coastguard Worker    mov      grain_lutq, grain_lutmp
1896*c0909341SAndroid Build Coastguard Worker.loop_y_v_overlap:
1897*c0909341SAndroid Build Coastguard Worker    ; grain = grain_lut[offy+y][offx+x]
1898*c0909341SAndroid Build Coastguard Worker    movu             m3, [grain_lutq+offxyq*2]
1899*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
1900*c0909341SAndroid Build Coastguard Worker    mov              r5, [rsp+8*mmsize+1*gprsize]
1901*c0909341SAndroid Build Coastguard Worker    movu             m2, [grain_lutq+r5*2]
1902*c0909341SAndroid Build Coastguard Worker%else
1903*c0909341SAndroid Build Coastguard Worker    movu             m2, [grain_lutq+top_offxyq*2]
1904*c0909341SAndroid Build Coastguard Worker%endif
1905*c0909341SAndroid Build Coastguard Worker    punpckhwd        m4, m2, m3
1906*c0909341SAndroid Build Coastguard Worker    punpcklwd        m2, m3
1907*c0909341SAndroid Build Coastguard Worker    REPX {pmaddwd x, m7}, m4, m2
1908*c0909341SAndroid Build Coastguard Worker    REPX {paddd   x, m14}, m4, m2
1909*c0909341SAndroid Build Coastguard Worker    REPX {psrad   x, 5}, m4, m2
1910*c0909341SAndroid Build Coastguard Worker    packssdw         m2, m4
1911*c0909341SAndroid Build Coastguard Worker    pminsw           m2, m15
1912*c0909341SAndroid Build Coastguard Worker    pmaxsw           m2, m9
1913*c0909341SAndroid Build Coastguard Worker    movu             m4, [grain_lutq+offxyq*2+16]
1914*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
1915*c0909341SAndroid Build Coastguard Worker    movu             m3, [grain_lutq+r5*2+16]
1916*c0909341SAndroid Build Coastguard Worker%else
1917*c0909341SAndroid Build Coastguard Worker    movu             m3, [grain_lutq+top_offxyq*2+16]
1918*c0909341SAndroid Build Coastguard Worker%endif
1919*c0909341SAndroid Build Coastguard Worker    punpckhwd        m5, m3, m4
1920*c0909341SAndroid Build Coastguard Worker    punpcklwd        m3, m4
1921*c0909341SAndroid Build Coastguard Worker    REPX {pmaddwd x, m7}, m5, m3
1922*c0909341SAndroid Build Coastguard Worker    REPX {paddd   x, m14}, m5, m3
1923*c0909341SAndroid Build Coastguard Worker    REPX {psrad   x, 5}, m5, m3
1924*c0909341SAndroid Build Coastguard Worker    packssdw         m3, m5
1925*c0909341SAndroid Build Coastguard Worker    pminsw           m3, m15
1926*c0909341SAndroid Build Coastguard Worker    pmaxsw           m3, m9
1927*c0909341SAndroid Build Coastguard Worker
1928*c0909341SAndroid Build Coastguard Worker    ; src
1929*c0909341SAndroid Build Coastguard Worker    pand             m0, m10, [srcq+ 0]          ; m0-1: src as word
1930*c0909341SAndroid Build Coastguard Worker    pand             m1, m10, [srcq+16]          ; m0-1: src as word
1931*c0909341SAndroid Build Coastguard Worker
1932*c0909341SAndroid Build Coastguard Worker    ; scaling[src]
1933*c0909341SAndroid Build Coastguard Worker    ; noise = round2(scaling[src] * grain, scaling_shift)
1934*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
1935*c0909341SAndroid Build Coastguard Worker    vpgatherdw       m4, m0, scalingq-1, r0, r5, 8, 1, m5
1936*c0909341SAndroid Build Coastguard Worker%else
1937*c0909341SAndroid Build Coastguard Worker    vpgatherdw       m4, m0, scalingq-1, r11, r13, 8, 1, m5
1938*c0909341SAndroid Build Coastguard Worker%endif
1939*c0909341SAndroid Build Coastguard Worker    psrlw            m4, 8
1940*c0909341SAndroid Build Coastguard Worker    pmullw           m4, m11
1941*c0909341SAndroid Build Coastguard Worker    pmulhrsw         m4, m2
1942*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
1943*c0909341SAndroid Build Coastguard Worker    vpgatherdw       m5, m1, scalingq-1, r0, r5, 8, 1, m2
1944*c0909341SAndroid Build Coastguard Worker%else
1945*c0909341SAndroid Build Coastguard Worker    vpgatherdw       m5, m1, scalingq-1, r11, r13, 8, 1, m2
1946*c0909341SAndroid Build Coastguard Worker%endif
1947*c0909341SAndroid Build Coastguard Worker    psrlw            m5, 8
1948*c0909341SAndroid Build Coastguard Worker    pmullw           m5, m11
1949*c0909341SAndroid Build Coastguard Worker    pmulhrsw         m5, m3
1950*c0909341SAndroid Build Coastguard Worker
1951*c0909341SAndroid Build Coastguard Worker    ; dst = clip_pixel(src, noise)
1952*c0909341SAndroid Build Coastguard Worker    paddw            m0, m4
1953*c0909341SAndroid Build Coastguard Worker    paddw            m1, m5
1954*c0909341SAndroid Build Coastguard Worker    pmaxsw           m0, m13
1955*c0909341SAndroid Build Coastguard Worker    pmaxsw           m1, m13
1956*c0909341SAndroid Build Coastguard Worker    pminsw           m0, m12
1957*c0909341SAndroid Build Coastguard Worker    pminsw           m1, m12
1958*c0909341SAndroid Build Coastguard Worker    movifnidn      dstq, dstmp
1959*c0909341SAndroid Build Coastguard Worker    mova [dstq+srcq+ 0], m0
1960*c0909341SAndroid Build Coastguard Worker    mova [dstq+srcq+16], m1
1961*c0909341SAndroid Build Coastguard Worker
1962*c0909341SAndroid Build Coastguard Worker    add            srcq, r2mp
1963*c0909341SAndroid Build Coastguard Worker    add      grain_lutq, 82*2
1964*c0909341SAndroid Build Coastguard Worker    dec              hw
1965*c0909341SAndroid Build Coastguard Worker    jz .end_y_v_overlap
1966*c0909341SAndroid Build Coastguard Worker    ; 2 lines get vertical overlap, then fall back to non-overlap code for
1967*c0909341SAndroid Build Coastguard Worker    ; remaining (up to) 30 lines
1968*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
1969*c0909341SAndroid Build Coastguard Worker    mov              r5, r5m
1970*c0909341SAndroid Build Coastguard Worker%endif
1971*c0909341SAndroid Build Coastguard Worker    SPLATD           m7, [PIC_ptr(pw_27_17_17_27)+4]
1972*c0909341SAndroid Build Coastguard Worker    xor              hd, 0x10000
1973*c0909341SAndroid Build Coastguard Worker    test             hd, 0x10000
1974*c0909341SAndroid Build Coastguard Worker    jnz .loop_y_v_overlap
1975*c0909341SAndroid Build Coastguard Worker    jmp .loop_y
1976*c0909341SAndroid Build Coastguard Worker
1977*c0909341SAndroid Build Coastguard Worker.end_y_v_overlap:
1978*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
1979*c0909341SAndroid Build Coastguard Worker    add            r4mp, 16
1980*c0909341SAndroid Build Coastguard Worker%else
1981*c0909341SAndroid Build Coastguard Worker    add              wq, 16
1982*c0909341SAndroid Build Coastguard Worker%endif
1983*c0909341SAndroid Build Coastguard Worker    jge .end_hv
1984*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
1985*c0909341SAndroid Build Coastguard Worker    mov            srcq, r9mp
1986*c0909341SAndroid Build Coastguard Worker    add            srcq, r4mp
1987*c0909341SAndroid Build Coastguard Worker    add            srcq, r4mp
1988*c0909341SAndroid Build Coastguard Worker%else
1989*c0909341SAndroid Build Coastguard Worker    mov        src_bakq, r9mp
1990*c0909341SAndroid Build Coastguard Worker    lea            srcq, [src_bakq+wq*2]
1991*c0909341SAndroid Build Coastguard Worker%endif
1992*c0909341SAndroid Build Coastguard Worker    btc       dword r8m, 2
1993*c0909341SAndroid Build Coastguard Worker    jc .next_blk_v
1994*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
1995*c0909341SAndroid Build Coastguard Worker    add dword [rsp+8*mmsize+1*gprsize], 16
1996*c0909341SAndroid Build Coastguard Worker%else
1997*c0909341SAndroid Build Coastguard Worker    add      top_offxyd, 16
1998*c0909341SAndroid Build Coastguard Worker%endif
1999*c0909341SAndroid Build Coastguard Worker    add          offxyd, 16
2000*c0909341SAndroid Build Coastguard Worker    jmp .loop_x_odd_v_overlap
2001*c0909341SAndroid Build Coastguard Worker
2002*c0909341SAndroid Build Coastguard Worker.next_blk_v:
2003*c0909341SAndroid Build Coastguard Worker    ; since fg_dataq.overlap is guaranteed to be set, we never jump
2004*c0909341SAndroid Build Coastguard Worker    ; back to .loop_x_v_overlap, and instead always fall-through to
2005*c0909341SAndroid Build Coastguard Worker    ; h+v overlap
2006*c0909341SAndroid Build Coastguard Worker
2007*c0909341SAndroid Build Coastguard Worker.loop_x_hv_overlap:
2008*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
2009*c0909341SAndroid Build Coastguard Worker    DEFINE_ARGS dst, src, scaling, see, w, picptr, src_bak
2010*c0909341SAndroid Build Coastguard Worker
2011*c0909341SAndroid Build Coastguard Worker    mov              r0, [rsp+8*mmsize+1*gprsize]
2012*c0909341SAndroid Build Coastguard Worker    add              r3, 16
2013*c0909341SAndroid Build Coastguard Worker    add              r0, 16
2014*c0909341SAndroid Build Coastguard Worker    mov [rsp+8*mmsize+0*gprsize], r3 ; left_offxy
2015*c0909341SAndroid Build Coastguard Worker    mov [rsp+8*mmsize+2*gprsize], r0 ; topleft_offxy
2016*c0909341SAndroid Build Coastguard Worker
2017*c0909341SAndroid Build Coastguard Worker    mov            seed, r3m
2018*c0909341SAndroid Build Coastguard Worker    xor              r0, r0
2019*c0909341SAndroid Build Coastguard Worker%else
2020*c0909341SAndroid Build Coastguard Worker    ; we assume from the block above that bits 8-15 of r7d are zeroed
2021*c0909341SAndroid Build Coastguard Worker%endif
2022*c0909341SAndroid Build Coastguard Worker    mov             r6d, seed
2023*c0909341SAndroid Build Coastguard Worker    or             seed, 0xeff4eff4
2024*c0909341SAndroid Build Coastguard Worker    test           seeb, seeh
2025*c0909341SAndroid Build Coastguard Worker    setp            t0b                     ; parity of top_seed
2026*c0909341SAndroid Build Coastguard Worker    shr            seed, 16
2027*c0909341SAndroid Build Coastguard Worker    shl             t0d, 16
2028*c0909341SAndroid Build Coastguard Worker    test           seeb, seeh
2029*c0909341SAndroid Build Coastguard Worker    setp            t0b                     ; parity of cur_seed
2030*c0909341SAndroid Build Coastguard Worker    or              r6d, 0x00010001
2031*c0909341SAndroid Build Coastguard Worker    xor             t0d, r6d
2032*c0909341SAndroid Build Coastguard Worker    mov            seed, t0d
2033*c0909341SAndroid Build Coastguard Worker    ror            seed, 1                  ; updated (cur_seed << 16) | top_seed
2034*c0909341SAndroid Build Coastguard Worker
2035*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
2036*c0909341SAndroid Build Coastguard Worker    mov             r3m, seed
2037*c0909341SAndroid Build Coastguard Worker
2038*c0909341SAndroid Build Coastguard Worker    DEFINE_ARGS  dst, src, scaling, offy, w, picptr, offx
2039*c0909341SAndroid Build Coastguard Worker
2040*c0909341SAndroid Build Coastguard Worker    mov           offxd, offyd
2041*c0909341SAndroid Build Coastguard Worker%else
2042*c0909341SAndroid Build Coastguard Worker    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
2043*c0909341SAndroid Build Coastguard Worker                offx, offy, see, src_bak, left_offxy, top_offxy, topleft_offxy
2044*c0909341SAndroid Build Coastguard Worker
2045*c0909341SAndroid Build Coastguard Worker    lea  topleft_offxyq, [top_offxyq+16]
2046*c0909341SAndroid Build Coastguard Worker    lea     left_offxyq, [offyq+16]
2047*c0909341SAndroid Build Coastguard Worker    mov           offyd, seed
2048*c0909341SAndroid Build Coastguard Worker    mov           offxd, seed
2049*c0909341SAndroid Build Coastguard Worker%endif
2050*c0909341SAndroid Build Coastguard Worker    ror           offyd, 8
2051*c0909341SAndroid Build Coastguard Worker    ror           offxd, 12
2052*c0909341SAndroid Build Coastguard Worker    and           offyd, 0xf000f
2053*c0909341SAndroid Build Coastguard Worker    and           offxd, 0xf000f
2054*c0909341SAndroid Build Coastguard Worker    imul          offyd, 164
2055*c0909341SAndroid Build Coastguard Worker    ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy
2056*c0909341SAndroid Build Coastguard Worker    lea           offyq, [offyq+offxq*2+0x10001*747+32*82]
2057*c0909341SAndroid Build Coastguard Worker
2058*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
2059*c0909341SAndroid Build Coastguard Worker    DEFINE_ARGS top_offxy, src, scaling, offxy, w, picptr, grain_lut
2060*c0909341SAndroid Build Coastguard Worker%else
2061*c0909341SAndroid Build Coastguard Worker    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
2062*c0909341SAndroid Build Coastguard Worker                h, offxy, see, src_bak, left_offxy, top_offxy, topleft_offxy
2063*c0909341SAndroid Build Coastguard Worker%endif
2064*c0909341SAndroid Build Coastguard Worker
2065*c0909341SAndroid Build Coastguard Worker    movzx    top_offxyd, offxyw
2066*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
2067*c0909341SAndroid Build Coastguard Worker    mov [rsp+8*mmsize+1*gprsize], top_offxyd
2068*c0909341SAndroid Build Coastguard Worker
2069*c0909341SAndroid Build Coastguard Worker    DEFINE_ARGS dst, src, scaling, offxy, h, picptr, grain_lut
2070*c0909341SAndroid Build Coastguard Worker%endif
2071*c0909341SAndroid Build Coastguard Worker    shr          offxyd, 16
2072*c0909341SAndroid Build Coastguard Worker
2073*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
2074*c0909341SAndroid Build Coastguard Worker    mov              r5, r5m
2075*c0909341SAndroid Build Coastguard Worker%endif
2076*c0909341SAndroid Build Coastguard Worker    SPLATD           m7, [PIC_ptr(pw_27_17_17_27)]
2077*c0909341SAndroid Build Coastguard Worker
2078*c0909341SAndroid Build Coastguard Worker    movzx            hd, word r7m
2079*c0909341SAndroid Build Coastguard Worker    mov      grain_lutq, grain_lutmp
2080*c0909341SAndroid Build Coastguard Worker.loop_y_hv_overlap:
2081*c0909341SAndroid Build Coastguard Worker    ; grain = grain_lut[offy+y][offx+x]
2082*c0909341SAndroid Build Coastguard Worker    movu             m2, [grain_lutq+offxyq*2]
2083*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
2084*c0909341SAndroid Build Coastguard Worker    mov              r0, [rsp+8*mmsize+1*gprsize] ; top_offxy
2085*c0909341SAndroid Build Coastguard Worker    mov              r5, [rsp+8*mmsize+0*gprsize] ; left_offxy
2086*c0909341SAndroid Build Coastguard Worker    movu             m4, [grain_lutq+r0*2]
2087*c0909341SAndroid Build Coastguard Worker    movd             m5, [grain_lutq+r5*2]
2088*c0909341SAndroid Build Coastguard Worker    mov              r5, [rsp+8*mmsize+2*gprsize] ; topleft_offxy
2089*c0909341SAndroid Build Coastguard Worker    movd             m3, [grain_lutq+r5*2]
2090*c0909341SAndroid Build Coastguard Worker%else
2091*c0909341SAndroid Build Coastguard Worker    movu             m4, [grain_lutq+top_offxyq*2]
2092*c0909341SAndroid Build Coastguard Worker    movd             m5, [grain_lutq+left_offxyq*2]
2093*c0909341SAndroid Build Coastguard Worker    movd             m3, [grain_lutq+topleft_offxyq*2]
2094*c0909341SAndroid Build Coastguard Worker%endif
2095*c0909341SAndroid Build Coastguard Worker    ; do h interpolation first (so top | top/left -> top, left | cur -> cur)
2096*c0909341SAndroid Build Coastguard Worker    punpcklwd        m5, m2
2097*c0909341SAndroid Build Coastguard Worker    punpcklwd        m3, m4
2098*c0909341SAndroid Build Coastguard Worker    REPX {pmaddwd x, m6}, m5, m3
2099*c0909341SAndroid Build Coastguard Worker    REPX {paddd   x, m14}, m5, m3
2100*c0909341SAndroid Build Coastguard Worker    REPX {psrad   x, 5}, m5, m3
2101*c0909341SAndroid Build Coastguard Worker    packssdw         m5, m3
2102*c0909341SAndroid Build Coastguard Worker    pminsw           m5, m15
2103*c0909341SAndroid Build Coastguard Worker    pmaxsw           m5, m9
2104*c0909341SAndroid Build Coastguard Worker    shufps           m3, m5, m2, q3210
2105*c0909341SAndroid Build Coastguard Worker    shufps           m5, m4, q3232
2106*c0909341SAndroid Build Coastguard Worker    ; followed by v interpolation (top | cur -> cur)
2107*c0909341SAndroid Build Coastguard Worker    movu             m0, [grain_lutq+offxyq*2+16]
2108*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
2109*c0909341SAndroid Build Coastguard Worker    movu             m1, [grain_lutq+r0*2+16]
2110*c0909341SAndroid Build Coastguard Worker%else
2111*c0909341SAndroid Build Coastguard Worker    movu             m1, [grain_lutq+top_offxyq*2+16]
2112*c0909341SAndroid Build Coastguard Worker%endif
2113*c0909341SAndroid Build Coastguard Worker    punpcklwd        m2, m5, m3
2114*c0909341SAndroid Build Coastguard Worker    punpckhwd        m5, m3
2115*c0909341SAndroid Build Coastguard Worker    punpcklwd        m3, m1, m0
2116*c0909341SAndroid Build Coastguard Worker    punpckhwd        m1, m0
2117*c0909341SAndroid Build Coastguard Worker    REPX {pmaddwd x, m7}, m2, m5, m3, m1
2118*c0909341SAndroid Build Coastguard Worker    REPX {paddd   x, m14}, m2, m5, m3, m1
2119*c0909341SAndroid Build Coastguard Worker    REPX {psrad   x, 5}, m2, m5, m3, m1
2120*c0909341SAndroid Build Coastguard Worker    packssdw         m2, m5
2121*c0909341SAndroid Build Coastguard Worker    packssdw         m3, m1
2122*c0909341SAndroid Build Coastguard Worker    REPX {pminsw x, m15}, m2, m3
2123*c0909341SAndroid Build Coastguard Worker    REPX {pmaxsw x, m9}, m2, m3
2124*c0909341SAndroid Build Coastguard Worker
2125*c0909341SAndroid Build Coastguard Worker    ; src
2126*c0909341SAndroid Build Coastguard Worker    pand             m0, m10, [srcq+ 0]
2127*c0909341SAndroid Build Coastguard Worker    pand             m1, m10, [srcq+16]          ; m0-1: src as word
2128*c0909341SAndroid Build Coastguard Worker
2129*c0909341SAndroid Build Coastguard Worker    ; scaling[src]
2130*c0909341SAndroid Build Coastguard Worker    ; noise = round2(scaling[src] * grain, scaling_shift)
2131*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
2132*c0909341SAndroid Build Coastguard Worker    vpgatherdw       m4, m0, scalingq-1, r0, r5, 8, 1, m5
2133*c0909341SAndroid Build Coastguard Worker%else
2134*c0909341SAndroid Build Coastguard Worker    vpgatherdw       m4, m0, scalingq-1, r14, r10, 8, 1, m5
2135*c0909341SAndroid Build Coastguard Worker%endif
2136*c0909341SAndroid Build Coastguard Worker    psrlw            m4, 8
2137*c0909341SAndroid Build Coastguard Worker    pmullw           m4, m11
2138*c0909341SAndroid Build Coastguard Worker    pmulhrsw         m2, m4
2139*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
2140*c0909341SAndroid Build Coastguard Worker    vpgatherdw       m5, m1, scalingq-1, r0, r5, 8, 1, m4
2141*c0909341SAndroid Build Coastguard Worker%else
2142*c0909341SAndroid Build Coastguard Worker    vpgatherdw       m5, m1, scalingq-1, r14, r10, 8, 1, m4
2143*c0909341SAndroid Build Coastguard Worker%endif
2144*c0909341SAndroid Build Coastguard Worker    psrlw            m5, 8
2145*c0909341SAndroid Build Coastguard Worker    pmullw           m5, m11
2146*c0909341SAndroid Build Coastguard Worker    pmulhrsw         m3, m5
2147*c0909341SAndroid Build Coastguard Worker
2148*c0909341SAndroid Build Coastguard Worker    ; dst = clip_pixel(src, noise)
2149*c0909341SAndroid Build Coastguard Worker    paddw            m0, m2
2150*c0909341SAndroid Build Coastguard Worker    paddw            m1, m3
2151*c0909341SAndroid Build Coastguard Worker    pmaxsw           m0, m13
2152*c0909341SAndroid Build Coastguard Worker    pmaxsw           m1, m13
2153*c0909341SAndroid Build Coastguard Worker    pminsw           m0, m12
2154*c0909341SAndroid Build Coastguard Worker    pminsw           m1, m12
2155*c0909341SAndroid Build Coastguard Worker    movifnidn      dstq, dstmp
2156*c0909341SAndroid Build Coastguard Worker    mova [dstq+srcq+ 0], m0
2157*c0909341SAndroid Build Coastguard Worker    mova [dstq+srcq+16], m1
2158*c0909341SAndroid Build Coastguard Worker
2159*c0909341SAndroid Build Coastguard Worker    add            srcq, r2mp
2160*c0909341SAndroid Build Coastguard Worker    add      grain_lutq, 82*2
2161*c0909341SAndroid Build Coastguard Worker    dec              hw
2162*c0909341SAndroid Build Coastguard Worker    jz .end_y_hv_overlap
2163*c0909341SAndroid Build Coastguard Worker    ; 2 lines get h+v overlap, then fall back to the h-overlap-only code
2164*c0909341SAndroid Build Coastguard Worker    ; (.loop_y_h_overlap) for the remaining (up to) 30 lines
2165*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
2166*c0909341SAndroid Build Coastguard Worker    mov              r5, r5m
2167*c0909341SAndroid Build Coastguard Worker%endif
2168*c0909341SAndroid Build Coastguard Worker    SPLATD           m7, [PIC_ptr(pw_27_17_17_27)+4]
2169*c0909341SAndroid Build Coastguard Worker    xor              hd, 0x10000
2170*c0909341SAndroid Build Coastguard Worker    test             hd, 0x10000
2171*c0909341SAndroid Build Coastguard Worker    jnz .loop_y_hv_overlap
2172*c0909341SAndroid Build Coastguard Worker    jmp .loop_y_h_overlap
2173*c0909341SAndroid Build Coastguard Worker
; Reached from the row loop above once `dec hw` hits zero: the current 16-pixel
; column is finished. Advance to the next column, or leave via .end_hv.
2174*c0909341SAndroid Build Coastguard Worker.end_y_hv_overlap:
2175*c0909341SAndroid Build Coastguard Worker    or        dword r8m, 4                  ; set bit 2 of the flag word kept in r8m
    ; w += 16. On x86-32 the counter is updated directly in its stack slot.
    ; NOTE(review): w is presumably a negative count running up towards zero -- confirm
    ; against the function prologue, which is outside this excerpt.
2176*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
2177*c0909341SAndroid Build Coastguard Worker    add            r4mp, 16
2178*c0909341SAndroid Build Coastguard Worker%else
2179*c0909341SAndroid Build Coastguard Worker    add              wq, 16
2180*c0909341SAndroid Build Coastguard Worker%endif
2181*c0909341SAndroid Build Coastguard Worker    jge .end_hv                             ; flags come from the add: done when w+16 >= 0 (signed)
    ; Not done: move both grain offsets 16 entries to the right and rebuild the
    ; source pointer as [r9mp] + w*2 (the x86-32 branch adds w twice instead of scaling).
2182*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
2183*c0909341SAndroid Build Coastguard Worker    mov              r5, r5m                ; reload r5 from its spill slot (presumably the PIC base -- confirm)
2184*c0909341SAndroid Build Coastguard Worker    add          offxyd, 16
2185*c0909341SAndroid Build Coastguard Worker    add dword [rsp+8*mmsize+1*gprsize], 16 ; top_offxy += 16
2186*c0909341SAndroid Build Coastguard Worker    mov            srcq, r9mp
2187*c0909341SAndroid Build Coastguard Worker    add            srcq, r4mp
2188*c0909341SAndroid Build Coastguard Worker    add            srcq, r4mp               ; srcq = [r9mp] + 2*w
2189*c0909341SAndroid Build Coastguard Worker%else
2190*c0909341SAndroid Build Coastguard Worker    add          offxyd, 16
2191*c0909341SAndroid Build Coastguard Worker    add      top_offxyd, 16
2192*c0909341SAndroid Build Coastguard Worker    mov        src_bakq, r9mp
2193*c0909341SAndroid Build Coastguard Worker    lea            srcq, [src_bakq+wq*2]
2194*c0909341SAndroid Build Coastguard Worker%endif
2195*c0909341SAndroid Build Coastguard Worker    jmp .loop_x_odd_v_overlap
2196*c0909341SAndroid Build Coastguard Worker
; Exit point of the function; target of the `jge .end_hv` taken when the
; column counter has run out.
2197*c0909341SAndroid Build Coastguard Worker.end_hv:
2198*c0909341SAndroid Build Coastguard Worker    RET
; NOTE(review): DECLARE_ARG is an x86inc macro; this line presumably re-creates the
; default r0m..r9m stack-argument definitions for the code that follows, undoing
; per-function `%define rNm [rsp+...]` overrides like the ones FGUV_FN installs
; below -- confirm against x86inc.asm and the start of this function.
2199*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
2200*c0909341SAndroid Build Coastguard Worker    DECLARE_ARG 0, 1, 2, 3, 4, 5, 6, 7, 8, 9
2201*c0909341SAndroid Build Coastguard Worker%endif
2202*c0909341SAndroid Build Coastguard Worker
2203*c0909341SAndroid Build Coastguard Worker%macro FGUV_FN 3 ; name, ss_hor, ss_ver
2204*c0909341SAndroid Build Coastguard WorkerINIT_XMM ssse3
2205*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
2206*c0909341SAndroid Build Coastguard Worker%if STACK_ALIGNMENT < mmsize
2207*c0909341SAndroid Build Coastguard Workercglobal fguv_32x32xn_i%1_16bpc, 0, 7, 8, 0-(8 * mmsize + 16 * gprsize), \
2208*c0909341SAndroid Build Coastguard Worker        tmp, src, scaling, h, fg_data, picptr, unused
2209*c0909341SAndroid Build Coastguard Worker    mov              r0, r0m
2210*c0909341SAndroid Build Coastguard Worker    mov              r1, r1m
2211*c0909341SAndroid Build Coastguard Worker    mov              r2, r2m
2212*c0909341SAndroid Build Coastguard Worker    mov              r4, r3m
2213*c0909341SAndroid Build Coastguard Worker    mov              r3, r4m
2214*c0909341SAndroid Build Coastguard Worker    mov              r5, r5m
2215*c0909341SAndroid Build Coastguard Worker%define r0m [rsp+8*mmsize+ 3*gprsize]
2216*c0909341SAndroid Build Coastguard Worker%define r1m [rsp+8*mmsize+ 4*gprsize]
2217*c0909341SAndroid Build Coastguard Worker%define r2m [rsp+8*mmsize+ 5*gprsize]
2218*c0909341SAndroid Build Coastguard Worker%define r3m [rsp+8*mmsize+ 6*gprsize]
2219*c0909341SAndroid Build Coastguard Worker%define r4m [rsp+8*mmsize+ 7*gprsize]
2220*c0909341SAndroid Build Coastguard Worker%define r5m [rsp+8*mmsize+ 8*gprsize]
2221*c0909341SAndroid Build Coastguard Worker    mov             r0m, r0
2222*c0909341SAndroid Build Coastguard Worker    mov             r2m, r2
2223*c0909341SAndroid Build Coastguard Worker    mov             r4m, r3
2224*c0909341SAndroid Build Coastguard Worker    mov             r5m, r5
2225*c0909341SAndroid Build Coastguard Worker
2226*c0909341SAndroid Build Coastguard Worker    mov              r0, r6m
2227*c0909341SAndroid Build Coastguard Worker    mov              r2, r7m
2228*c0909341SAndroid Build Coastguard Worker    mov              r3, r8m
2229*c0909341SAndroid Build Coastguard Worker    mov              r5, r9m
2230*c0909341SAndroid Build Coastguard Worker%define r6m [rsp+8*mmsize+ 9*gprsize]
2231*c0909341SAndroid Build Coastguard Worker%define r7m [rsp+8*mmsize+10*gprsize]
2232*c0909341SAndroid Build Coastguard Worker%define r8m [rsp+8*mmsize+11*gprsize]
2233*c0909341SAndroid Build Coastguard Worker%define r9m [rsp+8*mmsize+12*gprsize]
2234*c0909341SAndroid Build Coastguard Worker    mov             r6m, r0
2235*c0909341SAndroid Build Coastguard Worker    mov             r7m, r2
2236*c0909341SAndroid Build Coastguard Worker    mov             r8m, r3
2237*c0909341SAndroid Build Coastguard Worker    mov             r9m, r5
2238*c0909341SAndroid Build Coastguard Worker
2239*c0909341SAndroid Build Coastguard Worker    mov              r2, r10m
2240*c0909341SAndroid Build Coastguard Worker    mov              r3, r11m
2241*c0909341SAndroid Build Coastguard Worker    mov              r5, r12m
2242*c0909341SAndroid Build Coastguard Worker    mov              r0, r13m
2243*c0909341SAndroid Build Coastguard Worker%define r10m [rsp+8*mmsize+13*gprsize]
2244*c0909341SAndroid Build Coastguard Worker%define r11m [rsp+8*mmsize+14*gprsize]
2245*c0909341SAndroid Build Coastguard Worker%define r12m [rsp+8*mmsize+15*gprsize]
2246*c0909341SAndroid Build Coastguard Worker    mov            r10m, r2
2247*c0909341SAndroid Build Coastguard Worker    mov            r11m, r3
2248*c0909341SAndroid Build Coastguard Worker    mov            r12m, r5
2249*c0909341SAndroid Build Coastguard Worker
2250*c0909341SAndroid Build Coastguard Worker    SPLATW           m2, r13m
2251*c0909341SAndroid Build Coastguard Worker%else
2252*c0909341SAndroid Build Coastguard Workercglobal fguv_32x32xn_i%1_16bpc, 0, 7, 8, 8 * mmsize + (4) * gprsize, \
2253*c0909341SAndroid Build Coastguard Worker        tmp, src, scaling, h, fg_data, picptr, unused
2254*c0909341SAndroid Build Coastguard Worker    mov            srcq, srcm
2255*c0909341SAndroid Build Coastguard Worker    mov        fg_dataq, r3m
2256*c0909341SAndroid Build Coastguard Worker%endif
2257*c0909341SAndroid Build Coastguard Worker    LEA              r5, $$
2258*c0909341SAndroid Build Coastguard Worker%define base r5-$$
2259*c0909341SAndroid Build Coastguard Worker
2260*c0909341SAndroid Build Coastguard Worker    DECLARE_REG_TMP   0, 2, 3
2261*c0909341SAndroid Build Coastguard Worker%else
2262*c0909341SAndroid Build Coastguard Workercglobal fguv_32x32xn_i%1_16bpc, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \
2263*c0909341SAndroid Build Coastguard Worker                                      grain_lut, h, sby, luma, lstride, uv_pl, is_id
2264*c0909341SAndroid Build Coastguard Worker%define base r8-pb_mask
2265*c0909341SAndroid Build Coastguard Worker    lea              r8, [pb_mask]
2266*c0909341SAndroid Build Coastguard Worker
2267*c0909341SAndroid Build Coastguard Worker    DECLARE_REG_TMP   9, 10, 11
2268*c0909341SAndroid Build Coastguard Worker%endif
2269*c0909341SAndroid Build Coastguard Worker    mov             r6d, [fg_dataq+FGData.scaling_shift]
2270*c0909341SAndroid Build Coastguard Worker    SPLATW           m3, [base+mul_bits+r6*2-14]
2271*c0909341SAndroid Build Coastguard Worker    mov             r6d, [fg_dataq+FGData.clip_to_restricted_range]
2272*c0909341SAndroid Build Coastguard Worker%if STACK_ALIGNMENT >= mmsize
2273*c0909341SAndroid Build Coastguard Worker    mov             t0d, r13m               ; bdmax
2274*c0909341SAndroid Build Coastguard Worker%endif
2275*c0909341SAndroid Build Coastguard Worker    sar             t0d, 11                 ; is_12bpc
2276*c0909341SAndroid Build Coastguard Worker    inc             t0d
2277*c0909341SAndroid Build Coastguard Worker    mov             t1d, r6d
2278*c0909341SAndroid Build Coastguard Worker    imul            t1d, t0d
2279*c0909341SAndroid Build Coastguard Worker    dec             t0d
2280*c0909341SAndroid Build Coastguard Worker    SPLATW           m5, [base+min+t1*2]
2281*c0909341SAndroid Build Coastguard Worker    lea             t1d, [t0d*3]
2282*c0909341SAndroid Build Coastguard Worker    mov             t2d, r12m
2283*c0909341SAndroid Build Coastguard Worker    inc             t2d
2284*c0909341SAndroid Build Coastguard Worker    imul            r6d, t2d
2285*c0909341SAndroid Build Coastguard Worker    add             t1d, r6d
2286*c0909341SAndroid Build Coastguard Worker    SPLATW           m4, [base+max+t1*2]
2287*c0909341SAndroid Build Coastguard Worker%if STACK_ALIGNMENT >= mmsize
2288*c0909341SAndroid Build Coastguard Worker    SPLATW           m2, r13m
2289*c0909341SAndroid Build Coastguard Worker%endif
2290*c0909341SAndroid Build Coastguard Worker
2291*c0909341SAndroid Build Coastguard Worker    SCRATCH           2, 10, 2
2292*c0909341SAndroid Build Coastguard Worker    SCRATCH           3, 11, 3
2293*c0909341SAndroid Build Coastguard Worker    SCRATCH           4, 12, 4
2294*c0909341SAndroid Build Coastguard Worker    SCRATCH           5, 13, 5
2295*c0909341SAndroid Build Coastguard Worker
2296*c0909341SAndroid Build Coastguard Worker%define mzero m7
2297*c0909341SAndroid Build Coastguard Worker
2298*c0909341SAndroid Build Coastguard Worker%if %3
2299*c0909341SAndroid Build Coastguard Worker    SPLATD           m2, [base+pw_23_22]
2300*c0909341SAndroid Build Coastguard Worker%endif
2301*c0909341SAndroid Build Coastguard Worker
2302*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
2303*c0909341SAndroid Build Coastguard Worker    mov        scalingq, r5m
2304*c0909341SAndroid Build Coastguard Worker    mov             r5m, r5
2305*c0909341SAndroid Build Coastguard Worker%else
2306*c0909341SAndroid Build Coastguard Worker    mov           r13mp, strideq
2307*c0909341SAndroid Build Coastguard Worker%endif
2308*c0909341SAndroid Build Coastguard Worker
2309*c0909341SAndroid Build Coastguard Worker    pcmpeqw          m0, m0
2310*c0909341SAndroid Build Coastguard Worker    psraw            m1, m10, 1
2311*c0909341SAndroid Build Coastguard Worker    pxor             m0, m1
2312*c0909341SAndroid Build Coastguard Worker
2313*c0909341SAndroid Build Coastguard Worker    SCRATCH           0,  8, 0
2314*c0909341SAndroid Build Coastguard Worker    SCRATCH           1,  9, 1
2315*c0909341SAndroid Build Coastguard Worker
2316*c0909341SAndroid Build Coastguard Worker    cmp byte [fg_dataq+FGData.chroma_scaling_from_luma], 0
2317*c0909341SAndroid Build Coastguard Worker    jne .csfl
2318*c0909341SAndroid Build Coastguard Worker
2319*c0909341SAndroid Build Coastguard Worker%macro %%FGUV_32x32xN_LOOP 3 ; not-csfl, ss_h, ss_v
2320*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
2321*c0909341SAndroid Build Coastguard Worker    DEFINE_ARGS dst, src, scaling, sby, fg_data, picptr, overlap
2322*c0909341SAndroid Build Coastguard Worker
2323*c0909341SAndroid Build Coastguard Worker    DECLARE_REG_TMP    0
2324*c0909341SAndroid Build Coastguard Worker%else
2325*c0909341SAndroid Build Coastguard Worker    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, unused, sby, see, overlap
2326*c0909341SAndroid Build Coastguard Worker
2327*c0909341SAndroid Build Coastguard Worker    DECLARE_REG_TMP    9
2328*c0909341SAndroid Build Coastguard Worker%endif
2329*c0909341SAndroid Build Coastguard Worker
2330*c0909341SAndroid Build Coastguard Worker%if %1
2331*c0909341SAndroid Build Coastguard Worker    mov             r6d, r11m
2332*c0909341SAndroid Build Coastguard Worker    SPLATW           m0, [fg_dataq+FGData.uv_mult+r6*4]
2333*c0909341SAndroid Build Coastguard Worker    SPLATW           m1, [fg_dataq+FGData.uv_luma_mult+r6*4]
2334*c0909341SAndroid Build Coastguard Worker    punpcklwd        m6, m1, m0
2335*c0909341SAndroid Build Coastguard Worker    SPLATW           m5, [fg_dataq+FGData.uv_offset+r6*4]
2336*c0909341SAndroid Build Coastguard Worker    SPLATD           m7, [base+pw_4+t0*4]
2337*c0909341SAndroid Build Coastguard Worker    pmullw           m5, m7
2338*c0909341SAndroid Build Coastguard Worker%else
2339*c0909341SAndroid Build Coastguard Worker    SPLATD           m6, [base+pd_16]
2340*c0909341SAndroid Build Coastguard Worker%if %2
2341*c0909341SAndroid Build Coastguard Worker    mova             m5, [base+pw_23_22]
2342*c0909341SAndroid Build Coastguard Worker%else
2343*c0909341SAndroid Build Coastguard Worker    mova             m5, [base+pw_27_17_17_27]
2344*c0909341SAndroid Build Coastguard Worker%endif
2345*c0909341SAndroid Build Coastguard Worker%endif
2346*c0909341SAndroid Build Coastguard Worker
2347*c0909341SAndroid Build Coastguard Worker    SCRATCH           6, 14, 6
2348*c0909341SAndroid Build Coastguard Worker    SCRATCH           5, 15, 7
2349*c0909341SAndroid Build Coastguard Worker
2350*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
2351*c0909341SAndroid Build Coastguard Worker    DECLARE_REG_TMP   0
2352*c0909341SAndroid Build Coastguard Worker%else
2353*c0909341SAndroid Build Coastguard Worker    DECLARE_REG_TMP   7
2354*c0909341SAndroid Build Coastguard Worker%endif
2355*c0909341SAndroid Build Coastguard Worker
2356*c0909341SAndroid Build Coastguard Worker    mov            sbyd, r8m
2357*c0909341SAndroid Build Coastguard Worker    mov             t0d, [fg_dataq+FGData.overlap_flag]
2358*c0909341SAndroid Build Coastguard Worker    test            t0d, t0d
2359*c0909341SAndroid Build Coastguard Worker    jz %%no_vertical_overlap
2360*c0909341SAndroid Build Coastguard Worker    test           sbyd, sbyd
2361*c0909341SAndroid Build Coastguard Worker    jnz %%vertical_overlap
2362*c0909341SAndroid Build Coastguard Worker
2363*c0909341SAndroid Build Coastguard Worker%%no_vertical_overlap:
2364*c0909341SAndroid Build Coastguard Worker    mov             r8m, t0d
2365*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
2366*c0909341SAndroid Build Coastguard Worker    DEFINE_ARGS dst, src, scaling, see, fg_data, picptr, overlap
2367*c0909341SAndroid Build Coastguard Worker    imul           seed, (173 << 24) | 37
2368*c0909341SAndroid Build Coastguard Worker%else
2369*c0909341SAndroid Build Coastguard Worker    imul           seed, sbyd, (173 << 24) | 37
2370*c0909341SAndroid Build Coastguard Worker%endif
2371*c0909341SAndroid Build Coastguard Worker    add            seed, (105 << 24) | 178
2372*c0909341SAndroid Build Coastguard Worker    rol            seed, 8
2373*c0909341SAndroid Build Coastguard Worker    movzx          seed, seew
2374*c0909341SAndroid Build Coastguard Worker    xor            seed, [fg_dataq+FGData.seed]
2375*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
2376*c0909341SAndroid Build Coastguard Worker    mov             r3m, seed
2377*c0909341SAndroid Build Coastguard Worker
2378*c0909341SAndroid Build Coastguard Worker    DEFINE_ARGS dst, src, scaling, see, w, picptr, luma
2379*c0909341SAndroid Build Coastguard Worker
2380*c0909341SAndroid Build Coastguard Worker    mov            dstq, r0mp
2381*c0909341SAndroid Build Coastguard Worker    mov           lumaq, r9mp
2382*c0909341SAndroid Build Coastguard Worker    mov              wq, r4m
2383*c0909341SAndroid Build Coastguard Worker    lea              r3, [srcq+wq*2]
2384*c0909341SAndroid Build Coastguard Worker    mov            r1mp, r3
2385*c0909341SAndroid Build Coastguard Worker    lea              r3, [dstq+wq*2]
2386*c0909341SAndroid Build Coastguard Worker    mov           r11mp, r3
2387*c0909341SAndroid Build Coastguard Worker    lea              r3, [lumaq+wq*(2<<%2)]
2388*c0909341SAndroid Build Coastguard Worker    mov           r12mp, r3
2389*c0909341SAndroid Build Coastguard Worker%if %3
2390*c0909341SAndroid Build Coastguard Worker    shl           r10mp, 1
2391*c0909341SAndroid Build Coastguard Worker%endif
2392*c0909341SAndroid Build Coastguard Worker%else
2393*c0909341SAndroid Build Coastguard Worker    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
2394*c0909341SAndroid Build Coastguard Worker                unused2, unused3, see, unused4, unused5, unused6, luma, lstride
2395*c0909341SAndroid Build Coastguard Worker
2396*c0909341SAndroid Build Coastguard Worker    mov        lstrideq, r10mp
2397*c0909341SAndroid Build Coastguard Worker%if %3
2398*c0909341SAndroid Build Coastguard Worker    add        lstrideq, lstrideq
2399*c0909341SAndroid Build Coastguard Worker%endif
2400*c0909341SAndroid Build Coastguard Worker    mov           lumaq, r9mp
2401*c0909341SAndroid Build Coastguard Worker    lea             r10, [srcq+wq*2]
2402*c0909341SAndroid Build Coastguard Worker    lea             r11, [dstq+wq*2]
2403*c0909341SAndroid Build Coastguard Worker    lea             r12, [lumaq+wq*(2<<%2)]
2404*c0909341SAndroid Build Coastguard Worker    mov           r10mp, r10
2405*c0909341SAndroid Build Coastguard Worker    mov           r11mp, r11
2406*c0909341SAndroid Build Coastguard Worker    mov           r12mp, r12
2407*c0909341SAndroid Build Coastguard Worker%endif
2408*c0909341SAndroid Build Coastguard Worker    neg              wq
2409*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
2410*c0909341SAndroid Build Coastguard Worker    mov           r4mp, wq
2411*c0909341SAndroid Build Coastguard Worker%endif
2412*c0909341SAndroid Build Coastguard Worker
; Entry for a new grain block in the path without overlap blending: step the
; pseudo-random seed once, then turn the new seed into this block's starting
; offset (offxy, in 16-bit entries) inside grain_lut.
2413*c0909341SAndroid Build Coastguard Worker%%loop_x:
2414*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
2415*c0909341SAndroid Build Coastguard Worker    mov            seed, r3m                ; the seed is kept in the r3m slot between blocks
2416*c0909341SAndroid Build Coastguard Worker%endif
2417*c0909341SAndroid Build Coastguard Worker
    ; Seed step: new = (old >> 1), with bit 15 set to the XOR of old bits 0, 1, 3 and 12.
    ; The OR forces every other bit of the low 16 bits to one; in `seeb AND seeh` the
    ; forced positions contribute four ones (an even count), so PF reports the parity
    ; of exactly those four seed bits. lea does not touch the flags set by test.
    ; NOTE(review): the +0x8000 only acts as "set bit 15" if the old seed fits in
    ; 16 bits -- confirm the width of FGData.seed.
2418*c0909341SAndroid Build Coastguard Worker    mov             r6d, seed
2419*c0909341SAndroid Build Coastguard Worker    or             seed, 0xEFF4
2420*c0909341SAndroid Build Coastguard Worker    shr             r6d, 1
2421*c0909341SAndroid Build Coastguard Worker    test           seeb, seeh
2422*c0909341SAndroid Build Coastguard Worker    lea            seed, [r6+0x8000]        ; candidate with feedback bit = 1
2423*c0909341SAndroid Build Coastguard Worker    cmovp          seed, r6d               ; updated seed
2424*c0909341SAndroid Build Coastguard Worker
    ; On x86-32 `offy` is bound to the same register slot as `see` (4th name in both
    ; DEFINE_ARGS lists), so the seed is spilled first and only offx needs a copy.
2425*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
2426*c0909341SAndroid Build Coastguard Worker    mov             r3m, seed
2427*c0909341SAndroid Build Coastguard Worker
2428*c0909341SAndroid Build Coastguard Worker    DEFINE_ARGS dst, src, scaling, offy, w, picptr, offx
2429*c0909341SAndroid Build Coastguard Worker
2430*c0909341SAndroid Build Coastguard Worker    mov           offxd, offyd
2431*c0909341SAndroid Build Coastguard Worker%else
2432*c0909341SAndroid Build Coastguard Worker    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
2433*c0909341SAndroid Build Coastguard Worker                offx, offy, see, unused1, unused2, unused3, luma, lstride
2434*c0909341SAndroid Build Coastguard Worker
2435*c0909341SAndroid Build Coastguard Worker    mov           offxd, seed
2436*c0909341SAndroid Build Coastguard Worker    mov           offyd, seed
2437*c0909341SAndroid Build Coastguard Worker%endif
2438*c0909341SAndroid Build Coastguard Worker    ror           offyd, 8
2439*c0909341SAndroid Build Coastguard Worker    shr           offxd, 12                 ; offx = seed >> 12
2440*c0909341SAndroid Build Coastguard Worker    and           offyd, 0xf                ; offy = (seed >> 8) & 15
2441*c0909341SAndroid Build Coastguard Worker    imul          offyd, 164>>%3            ; 164 = 2 rows of 82 entries; one row when ss_v (%3) is set
2442*c0909341SAndroid Build Coastguard Worker    lea           offyq, [offyq+offxq*(2-%2)+(3+(6>>%3))*82+3+(6>>%2)]  ; offy*stride+offx
2443*c0909341SAndroid Build Coastguard Worker
    ; From here the combined offset is called offxy; on x86-32 the `w` slot is
    ; renamed `h` and the `offx` slot `grain_lut`.
2444*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
2445*c0909341SAndroid Build Coastguard Worker    DEFINE_ARGS dst, src, scaling, offxy, h, picptr, grain_lut
2446*c0909341SAndroid Build Coastguard Worker%else
2447*c0909341SAndroid Build Coastguard Worker    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
2448*c0909341SAndroid Build Coastguard Worker                h, offxy, see, unused1, unused2, unused3, luma, lstride
2449*c0909341SAndroid Build Coastguard Worker%endif
2450*c0909341SAndroid Build Coastguard Worker
2451*c0909341SAndroid Build Coastguard Worker%if %2 == 0
2452*c0909341SAndroid Build Coastguard Worker%%loop_x_odd:
2453*c0909341SAndroid Build Coastguard Worker%endif
2454*c0909341SAndroid Build Coastguard Worker    mov              hd, r7m
2455*c0909341SAndroid Build Coastguard Worker    mov      grain_lutq, grain_lutmp
; Row loop of the no-overlap path: one iteration applies grain to 16 chroma
; pixels (two 16-byte vectors of words) of one row, then steps every pointer
; down one row. Runs until h reaches zero.
; Constants set up earlier in this macro:
;   m10 = bdmax, m11 = mul_bits entry selected by scaling_shift,
;   m12 = max clamp, m13 = min clamp,
;   m14 = { uv_luma_mult, uv_mult } word pairs and m15 = scaled uv_offset (only when %1).
2456*c0909341SAndroid Build Coastguard Worker%%loop_y:
2457*c0909341SAndroid Build Coastguard Worker    ; src
2458*c0909341SAndroid Build Coastguard Worker    mova             m0, [srcq]
2459*c0909341SAndroid Build Coastguard Worker    mova             m1, [srcq+16]          ; m0-1: src as word
2460*c0909341SAndroid Build Coastguard Worker
2461*c0909341SAndroid Build Coastguard Worker    ; luma_src
2462*c0909341SAndroid Build Coastguard Worker    pxor          mzero, mzero
    ; x86-32 has no spare register for the luma pointer across the row: it is
    ; reloaded from r9m here and written back, already advanced, a few lines below.
2463*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
2464*c0909341SAndroid Build Coastguard Worker    DEFINE_ARGS dst, src, scaling, offxy, h, luma, grain_lut
2465*c0909341SAndroid Build Coastguard Worker
2466*c0909341SAndroid Build Coastguard Worker    mov           lumaq, r9m
2467*c0909341SAndroid Build Coastguard Worker%endif
2468*c0909341SAndroid Build Coastguard Worker    mova             m4, [lumaq+ 0]
2469*c0909341SAndroid Build Coastguard Worker    mova             m6, [lumaq+(16<<%2)]
    ; With horizontal subsampling (%2) 32 luma words are read: phaddw sums each
    ; adjacent pair, and the pavgw with zero further down turns each sum into (a+b+1)>>1.
2470*c0909341SAndroid Build Coastguard Worker%if %2
2471*c0909341SAndroid Build Coastguard Worker    phaddw           m4, [lumaq+16]
2472*c0909341SAndroid Build Coastguard Worker    phaddw           m6, [lumaq+48]
2473*c0909341SAndroid Build Coastguard Worker%endif
2474*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
2475*c0909341SAndroid Build Coastguard Worker    add           lumaq, r10mp              ; next luma row (r10mp was doubled at setup when %3)
2476*c0909341SAndroid Build Coastguard Worker    mov             r9m, lumaq
2477*c0909341SAndroid Build Coastguard Worker%endif
2478*c0909341SAndroid Build Coastguard Worker%if %2
2479*c0909341SAndroid Build Coastguard Worker    pavgw            m4, mzero
2480*c0909341SAndroid Build Coastguard Worker    pavgw            m6, mzero
2481*c0909341SAndroid Build Coastguard Worker%endif
2482*c0909341SAndroid Build Coastguard Worker
    ; %1 set (not chroma_scaling_from_luma): the scaling index is
    ;   clip(((luma*uv_luma_mult + chroma*uv_mult) >> 6) + m15, 0, bdmax).
    ; Otherwise the luma value itself is used, ANDed with bdmax.
2483*c0909341SAndroid Build Coastguard Worker%if %1
2484*c0909341SAndroid Build Coastguard Worker    punpckhwd        m3, m4, m0
2485*c0909341SAndroid Build Coastguard Worker    punpcklwd        m4, m0
2486*c0909341SAndroid Build Coastguard Worker    punpckhwd        m5, m6, m1
2487*c0909341SAndroid Build Coastguard Worker    punpcklwd        m6, m1                 ; { luma, chroma }
2488*c0909341SAndroid Build Coastguard Worker    REPX {pmaddwd x, m14}, m3, m4, m5, m6
2489*c0909341SAndroid Build Coastguard Worker    REPX {psrad   x, 6}, m3, m4, m5, m6
2490*c0909341SAndroid Build Coastguard Worker    packssdw         m4, m3
2491*c0909341SAndroid Build Coastguard Worker    packssdw         m6, m5
2492*c0909341SAndroid Build Coastguard Worker    REPX {paddw x, m15}, m4, m6
2493*c0909341SAndroid Build Coastguard Worker    REPX {pmaxsw x, mzero}, m4, m6
2494*c0909341SAndroid Build Coastguard Worker    REPX {pminsw x, m10}, m4, m6             ; clip_pixel()
2495*c0909341SAndroid Build Coastguard Worker%else
2496*c0909341SAndroid Build Coastguard Worker    REPX  {pand x, m10}, m4, m6
2497*c0909341SAndroid Build Coastguard Worker%endif
2498*c0909341SAndroid Build Coastguard Worker
2499*c0909341SAndroid Build Coastguard Worker    ; scaling[luma_src]
    ; vpgatherdw is a macro defined elsewhere in this file; the two GPRs passed to it
    ; are scratch. On x86-32 those are r0 and r5, i.e. the dst and luma slots, which
    ; is why dst is reloaded before the store below.
    ; NOTE(review): the `scalingq-1` base together with the psrlw 8 presumably leaves
    ; the wanted table byte in the low half of each word -- confirm in the macro.
2500*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
2501*c0909341SAndroid Build Coastguard Worker    vpgatherdw       m3, m4, scalingq-1, r0, r5, 8, 1
2502*c0909341SAndroid Build Coastguard Worker    vpgatherdw       m5, m6, scalingq-1, r0, r5, 8, 1
2503*c0909341SAndroid Build Coastguard Worker%else
2504*c0909341SAndroid Build Coastguard Worker    vpgatherdw       m3, m4, scalingq-1, r10, r12, 8, 1
2505*c0909341SAndroid Build Coastguard Worker    vpgatherdw       m5, m6, scalingq-1, r10, r12, 8, 1
2506*c0909341SAndroid Build Coastguard Worker%endif
2507*c0909341SAndroid Build Coastguard Worker    REPX   {psrlw x, 8}, m3, m5
2508*c0909341SAndroid Build Coastguard Worker
2509*c0909341SAndroid Build Coastguard Worker    ; grain = grain_lut[offy+y][offx+x]
2510*c0909341SAndroid Build Coastguard Worker    movu             m4, [grain_lutq+offxyq*2]       ; unaligned load; offxy counts 16-bit entries
2511*c0909341SAndroid Build Coastguard Worker    movu             m6, [grain_lutq+offxyq*2+16]
2512*c0909341SAndroid Build Coastguard Worker
2513*c0909341SAndroid Build Coastguard Worker    ; noise = round2(scaling[luma_src] * grain, scaling_shift)
2514*c0909341SAndroid Build Coastguard Worker    REPX {pmullw x, m11}, m3, m5
2515*c0909341SAndroid Build Coastguard Worker    pmulhrsw         m4, m3
2516*c0909341SAndroid Build Coastguard Worker    pmulhrsw         m6, m5
2517*c0909341SAndroid Build Coastguard Worker
2518*c0909341SAndroid Build Coastguard Worker    ; dst = clip_pixel(src, noise)
2519*c0909341SAndroid Build Coastguard Worker    paddw            m0, m4
2520*c0909341SAndroid Build Coastguard Worker    paddw            m1, m6
2521*c0909341SAndroid Build Coastguard Worker    pmaxsw           m0, m13
2522*c0909341SAndroid Build Coastguard Worker    pmaxsw           m1, m13
2523*c0909341SAndroid Build Coastguard Worker    pminsw           m0, m12
2524*c0909341SAndroid Build Coastguard Worker    pminsw           m1, m12
2525*c0909341SAndroid Build Coastguard Worker    movifnidn      dstq, dstmp              ; fetch dst from its stack slot where it is not register-resident
2526*c0909341SAndroid Build Coastguard Worker    mova      [dstq+ 0], m0
2527*c0909341SAndroid Build Coastguard Worker    mova      [dstq+16], m1
2528*c0909341SAndroid Build Coastguard Worker
    ; Next row. The chroma stride is read from memory: r2mp on x86-32, and r13mp on
    ; x86-64, where the prologue stored strideq.
2529*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
2530*c0909341SAndroid Build Coastguard Worker    add            srcq, r2mp
2531*c0909341SAndroid Build Coastguard Worker    add            dstq, r2mp
2532*c0909341SAndroid Build Coastguard Worker    mov           dstmp, dstq
2533*c0909341SAndroid Build Coastguard Worker%else
2534*c0909341SAndroid Build Coastguard Worker    add            srcq, r13mp
2535*c0909341SAndroid Build Coastguard Worker    add            dstq, r13mp
2536*c0909341SAndroid Build Coastguard Worker    add           lumaq, lstrideq
2537*c0909341SAndroid Build Coastguard Worker%endif
2538*c0909341SAndroid Build Coastguard Worker    add      grain_lutq, 82*2               ; one grain_lut row = 82 entries of 2 bytes
2539*c0909341SAndroid Build Coastguard Worker    dec              hd
2540*c0909341SAndroid Build Coastguard Worker    jg %%loop_y
2541*c0909341SAndroid Build Coastguard Worker
; All rows of the current 16-pixel column are done: advance w by 16, rebuild
; the column pointers and pick the entry point for the next column.
; On x86-32 `w` is bound to the slot that held `h` in the row loop (5th name in
; DEFINE_ARGS), so it is reloaded from r4mp first.
2542*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
2543*c0909341SAndroid Build Coastguard Worker    DEFINE_ARGS dst, src, scaling, offxy, w, picptr, luma
2544*c0909341SAndroid Build Coastguard Worker
2545*c0909341SAndroid Build Coastguard Worker    mov              wq, r4mp
2546*c0909341SAndroid Build Coastguard Worker%endif
2547*c0909341SAndroid Build Coastguard Worker    add              wq, 16
2548*c0909341SAndroid Build Coastguard Worker    jge %%end                               ; w was negated at setup; non-negative means every column is done
    ; The stack slots read here hold the end-of-row pointers (base + w*2) saved
    ; during setup; indexing them with the negative w yields the current column.
    ; Luma advances twice as fast when horizontally subsampled (%2).
2549*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
2550*c0909341SAndroid Build Coastguard Worker    mov            srcq, r1mp
2551*c0909341SAndroid Build Coastguard Worker%else
2552*c0909341SAndroid Build Coastguard Worker    mov            srcq, r10mp
2553*c0909341SAndroid Build Coastguard Worker%endif
2554*c0909341SAndroid Build Coastguard Worker    mov            dstq, r11mp
2555*c0909341SAndroid Build Coastguard Worker    mov           lumaq, r12mp
2556*c0909341SAndroid Build Coastguard Worker    lea            srcq, [srcq+wq*2]
2557*c0909341SAndroid Build Coastguard Worker    lea            dstq, [dstq+wq*2]
2558*c0909341SAndroid Build Coastguard Worker    lea           lumaq, [lumaq+wq*(2<<%2)]
2559*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
2560*c0909341SAndroid Build Coastguard Worker    mov             r0m, dstq               ; spill what the row loop reloads from memory
2561*c0909341SAndroid Build Coastguard Worker    mov             r9m, lumaq
2562*c0909341SAndroid Build Coastguard Worker    mov             r4m, wq
2563*c0909341SAndroid Build Coastguard Worker%endif
    ; Without horizontal subsampling, bit 2 of r8m is toggled once per 16-pixel
    ; column. btc leaves the old bit in CF: if it was set, go on to %%next_blk.
    ; If it was clear, stay with the same grain offsets shifted by 16 entries and
    ; branch on bit 1 of r8m between the plain and the vertical-overlap loop entry.
2564*c0909341SAndroid Build Coastguard Worker%if %2 == 0
2565*c0909341SAndroid Build Coastguard Worker    btc       dword r8m, 2
2566*c0909341SAndroid Build Coastguard Worker    jc %%next_blk
2567*c0909341SAndroid Build Coastguard Worker    add          offxyd, 16
2568*c0909341SAndroid Build Coastguard Worker    test      dword r8m, 2
2569*c0909341SAndroid Build Coastguard Worker    jz %%loop_x_odd
    ; NOTE(review): the slot/register bumped here is presumably top_offxy, the grain
    ; offset of the block above used by the vertical-overlap code -- confirm there.
2570*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
2571*c0909341SAndroid Build Coastguard Worker    add dword [rsp+8*mmsize+1*gprsize], 16
2572*c0909341SAndroid Build Coastguard Worker%else
2573*c0909341SAndroid Build Coastguard Worker    add            r11d, 16
2574*c0909341SAndroid Build Coastguard Worker%endif
2575*c0909341SAndroid Build Coastguard Worker    jmp %%loop_x_odd_v_overlap
2576*c0909341SAndroid Build Coastguard Worker%%next_blk:
2577*c0909341SAndroid Build Coastguard Worker%endif
    ; Bit 0 of r8m clear -> no overlap blending, start the next block at %%loop_x.
    ; NOTE(review): on the no-vertical-overlap path r8m was loaded from
    ; FGData.overlap_flag, so bit 0 reflects that field -- confirm what the
    ; vertical-overlap path stores in r8m.
2578*c0909341SAndroid Build Coastguard Worker    test      dword r8m, 1
2579*c0909341SAndroid Build Coastguard Worker    je %%loop_x
2580*c0909341SAndroid Build Coastguard Worker
2581*c0909341SAndroid Build Coastguard Worker    ; r8m = sbym
2582*c0909341SAndroid Build Coastguard Worker    test      dword r8m, 2
2583*c0909341SAndroid Build Coastguard Worker    jnz %%loop_x_hv_overlap                 ; bit 1 set: blend horizontally and vertically; else fall through
2584*c0909341SAndroid Build Coastguard Worker
2585*c0909341SAndroid Build Coastguard Worker    ; horizontal overlap (without vertical overlap)
2586*c0909341SAndroid Build Coastguard Worker%%loop_x_h_overlap:
2587*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
2588*c0909341SAndroid Build Coastguard Worker    add          offxyd, 16
2589*c0909341SAndroid Build Coastguard Worker    mov [rsp+8*mmsize+0*gprsize], offxyd
2590*c0909341SAndroid Build Coastguard Worker
2591*c0909341SAndroid Build Coastguard Worker    DEFINE_ARGS dst, src, scaling, see, w, picptr, grain_lut
2592*c0909341SAndroid Build Coastguard Worker
2593*c0909341SAndroid Build Coastguard Worker    mov            seed, r3m
2594*c0909341SAndroid Build Coastguard Worker%endif
2595*c0909341SAndroid Build Coastguard Worker    mov             r6d, seed
2596*c0909341SAndroid Build Coastguard Worker    or             seed, 0xEFF4
2597*c0909341SAndroid Build Coastguard Worker    shr             r6d, 1
2598*c0909341SAndroid Build Coastguard Worker    test           seeb, seeh
2599*c0909341SAndroid Build Coastguard Worker    lea            seed, [r6+0x8000]
2600*c0909341SAndroid Build Coastguard Worker    cmovp          seed, r6d               ; updated seed
2601*c0909341SAndroid Build Coastguard Worker
2602*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
2603*c0909341SAndroid Build Coastguard Worker    mov             r3m, seed
2604*c0909341SAndroid Build Coastguard Worker
2605*c0909341SAndroid Build Coastguard Worker    DEFINE_ARGS dst, src, scaling, offy, w, picptr, offx
2606*c0909341SAndroid Build Coastguard Worker
2607*c0909341SAndroid Build Coastguard Worker    mov           offxd, offyd
2608*c0909341SAndroid Build Coastguard Worker%else
2609*c0909341SAndroid Build Coastguard Worker    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
2610*c0909341SAndroid Build Coastguard Worker                offx, offy, see, left_offxy, unused1, unused2, luma, lstride
2611*c0909341SAndroid Build Coastguard Worker
2612*c0909341SAndroid Build Coastguard Worker    lea     left_offxyd, [offyd+16]         ; previous column's offy*stride+offx
2613*c0909341SAndroid Build Coastguard Worker    mov           offxd, seed
2614*c0909341SAndroid Build Coastguard Worker    mov           offyd, seed
2615*c0909341SAndroid Build Coastguard Worker%endif
2616*c0909341SAndroid Build Coastguard Worker    ror           offyd, 8
2617*c0909341SAndroid Build Coastguard Worker    shr           offxd, 12
2618*c0909341SAndroid Build Coastguard Worker    and           offyd, 0xf
2619*c0909341SAndroid Build Coastguard Worker    imul          offyd, 164>>%3
2620*c0909341SAndroid Build Coastguard Worker    lea           offyq, [offyq+offxq*(2-%2)+(3+(6>>%3))*82+3+(6>>%2)]  ; offy*stride+offx
2621*c0909341SAndroid Build Coastguard Worker
2622*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
2623*c0909341SAndroid Build Coastguard Worker    DEFINE_ARGS dst, src, scaling, offxy, h, picptr, grain_lut
2624*c0909341SAndroid Build Coastguard Worker%else
2625*c0909341SAndroid Build Coastguard Worker    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
2626*c0909341SAndroid Build Coastguard Worker                h, offxy, see, left_offxy, unused1, unused2, luma, lstride
2627*c0909341SAndroid Build Coastguard Worker%endif
2628*c0909341SAndroid Build Coastguard Worker
2629*c0909341SAndroid Build Coastguard Worker    mov              hd, r7m
2630*c0909341SAndroid Build Coastguard Worker    mov      grain_lutq, grain_lutmp
2631*c0909341SAndroid Build Coastguard Worker%%loop_y_h_overlap:
    ; Row loop for one 16-word-wide column whose left edge overlaps the
    ; previous column's grain block.  Per row: build the luma-derived scaling
    ; index, blend the leftmost grain words with the left neighbour's grain,
    ; scale the grain, add it to src and store the clamped result to dst.
    ; m0/m1 hold src words 0-7 / 8-15 for the whole iteration.
2632*c0909341SAndroid Build Coastguard Worker    mova             m0, [srcq]
2633*c0909341SAndroid Build Coastguard Worker    mova             m1, [srcq+16]
2634*c0909341SAndroid Build Coastguard Worker
2635*c0909341SAndroid Build Coastguard Worker    ; luma_src
    ; mzero is cleared here and used below by pavgw (%2) and pmaxsw (%1).
2636*c0909341SAndroid Build Coastguard Worker    pxor          mzero, mzero
2637*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
    ; x86-32: the sixth GPR switches role from picptr to luma; the luma
    ; pointer itself lives in the r9m stack slot between rows.
2638*c0909341SAndroid Build Coastguard Worker    DEFINE_ARGS dst, src, scaling, offxy, h, luma, grain_lut
2639*c0909341SAndroid Build Coastguard Worker    mov           lumaq, r9m
2640*c0909341SAndroid Build Coastguard Worker%endif
    ; Second luma vector sits at +16 bytes when %2 == 0 and at +32 when
    ; %2 == 1 (the vectors at +16 and +48 are then folded in by phaddw).
2641*c0909341SAndroid Build Coastguard Worker    mova             m4, [lumaq+ 0]
2642*c0909341SAndroid Build Coastguard Worker    mova             m6, [lumaq+(16<<%2)]
2643*c0909341SAndroid Build Coastguard Worker%if %2
    ; Sum each horizontally adjacent pair of luma words.
    ; NOTE(review): %2 looks like the horizontal-subsampling flag; confirm
    ; against the macro's parameter list.
2644*c0909341SAndroid Build Coastguard Worker    phaddw           m4, [lumaq+16]
2645*c0909341SAndroid Build Coastguard Worker    phaddw           m6, [lumaq+48]
2646*c0909341SAndroid Build Coastguard Worker%endif
2647*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
    ; Step the saved luma pointer by r10mp for the next row (the x86-64
    ; path does the equivalent add with lstrideq at the bottom of the loop).
2648*c0909341SAndroid Build Coastguard Worker    add           lumaq, r10mp
2649*c0909341SAndroid Build Coastguard Worker    mov             r9m, lumaq
2650*c0909341SAndroid Build Coastguard Worker%endif
2651*c0909341SAndroid Build Coastguard Worker%if %2
    ; pavgw against zero computes (x + 1) >> 1, turning the pair sums above
    ; into rounded pair averages.
2652*c0909341SAndroid Build Coastguard Worker    pavgw            m4, mzero
2653*c0909341SAndroid Build Coastguard Worker    pavgw            m6, mzero
2654*c0909341SAndroid Build Coastguard Worker%endif
2655*c0909341SAndroid Build Coastguard Worker
2656*c0909341SAndroid Build Coastguard Worker%if %1
    ; Interleave luma with src words, take the per-pair dot product with the
    ; multipliers in m14, shift right by 6, add the offset in m15 and clamp
    ; to [0, m10].
    ; NOTE(review): %1 presumably selects the "scaling index is a mix of luma
    ; and chroma" variant; confirm at the macro definition.
2657*c0909341SAndroid Build Coastguard Worker    punpckhwd        m3, m4, m0
2658*c0909341SAndroid Build Coastguard Worker    punpcklwd        m4, m0
2659*c0909341SAndroid Build Coastguard Worker    punpckhwd        m5, m6, m1
2660*c0909341SAndroid Build Coastguard Worker    punpcklwd        m6, m1                 ; { luma, chroma }
2661*c0909341SAndroid Build Coastguard Worker    REPX {pmaddwd x, m14}, m3, m4, m5, m6
2662*c0909341SAndroid Build Coastguard Worker    REPX {psrad   x, 6}, m3, m4, m5, m6
2663*c0909341SAndroid Build Coastguard Worker    packssdw         m4, m3
2664*c0909341SAndroid Build Coastguard Worker    packssdw         m6, m5
2665*c0909341SAndroid Build Coastguard Worker    REPX {paddw x, m15}, m4, m6
2666*c0909341SAndroid Build Coastguard Worker    REPX {pmaxsw x, mzero}, m4, m6
2667*c0909341SAndroid Build Coastguard Worker    REPX {pminsw x, m10}, m4, m6             ; clip_pixel()
2668*c0909341SAndroid Build Coastguard Worker%else
    ; Luma-only index: just mask the luma words with m10.
2669*c0909341SAndroid Build Coastguard Worker    REPX  {pand x, m10}, m4, m6
2670*c0909341SAndroid Build Coastguard Worker%endif
2671*c0909341SAndroid Build Coastguard Worker
2672*c0909341SAndroid Build Coastguard Worker    ; grain = grain_lut[offy+y][offx+x]
    ; m7 = current block's grain words 0-7; m5 = one dword (two words) of the
    ; left neighbour's grain.  On x86-32 the left offset is read from the
    ; stack slot [rsp+8*mmsize+0*gprsize], using r5 as scratch.
2673*c0909341SAndroid Build Coastguard Worker    movu             m7, [grain_lutq+offxyq*2]
2674*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
2675*c0909341SAndroid Build Coastguard Worker    mov              r5, [rsp+8*mmsize+0*gprsize]
2676*c0909341SAndroid Build Coastguard Worker    movd             m5, [grain_lutq+r5*2]
2677*c0909341SAndroid Build Coastguard Worker%else
2678*c0909341SAndroid Build Coastguard Worker    movd             m5, [grain_lutq+left_offxyq*2+ 0]
2679*c0909341SAndroid Build Coastguard Worker%endif
2680*c0909341SAndroid Build Coastguard Worker    punpcklwd        m5, m7                ; {left0, cur0}
2681*c0909341SAndroid Build Coastguard Worker%if %1
2682*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
    ; r5 was just used as scratch; reload it from r5m before the PIC_ptr()
    ; operands below (presumably r5 is the PIC base on x86-32 -- confirm).
2683*c0909341SAndroid Build Coastguard Worker    mov              r5, r5m
2684*c0909341SAndroid Build Coastguard Worker%endif
    ; Overlap weights come from memory in this variant; the table differs
    ; depending on %2.
2685*c0909341SAndroid Build Coastguard Worker%if %2
2686*c0909341SAndroid Build Coastguard Worker    pmaddwd          m5, [PIC_ptr(pw_23_22)]
2687*c0909341SAndroid Build Coastguard Worker%else
2688*c0909341SAndroid Build Coastguard Worker    pmaddwd          m5, [PIC_ptr(pw_27_17_17_27)]
2689*c0909341SAndroid Build Coastguard Worker%endif
2690*c0909341SAndroid Build Coastguard Worker    paddd            m5, [PIC_ptr(pd_16)]
2691*c0909341SAndroid Build Coastguard Worker%else
    ; In this variant m15 / m14 are used as the overlap weights / rounding
    ; term instead of memory operands.
2692*c0909341SAndroid Build Coastguard Worker    pmaddwd          m5, m15
2693*c0909341SAndroid Build Coastguard Worker    paddd            m5, m14
2694*c0909341SAndroid Build Coastguard Worker%endif
    ; Finish the weighted sum (>> 5 after the rounding add), narrow back to
    ; words and clamp the blended grain to [m8, m9].
2695*c0909341SAndroid Build Coastguard Worker    psrad            m5, 5
2696*c0909341SAndroid Build Coastguard Worker    packssdw         m5, m5
2697*c0909341SAndroid Build Coastguard Worker    pmaxsw           m5, m8
2698*c0909341SAndroid Build Coastguard Worker    pminsw           m5, m9
    ; q3210 keeps dwords 0-1 of m5 and takes dwords 2-3 from m7: grain words
    ; 0-3 come from the blended vector, words 4-7 are the untouched current
    ; grain.  m3 = current grain words 8-15 (no blending needed).
2699*c0909341SAndroid Build Coastguard Worker    shufps           m5, m7, q3210
2700*c0909341SAndroid Build Coastguard Worker    movu             m3, [grain_lutq+offxyq*2+16]
2701*c0909341SAndroid Build Coastguard Worker
2702*c0909341SAndroid Build Coastguard Worker    ; scaling[luma_src]
    ; NOTE(review): the two GPR arguments (r0/r5 or r2/r12) are presumably
    ; scratch registers for the gather macro; that would explain why dstq is
    ; re-fetched via movifnidn before the store on x86-32.  Confirm against
    ; the vpgatherdw macro definition.
2703*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
2704*c0909341SAndroid Build Coastguard Worker    vpgatherdw       m7, m4, scalingq-1, r0, r5, 8, 1
2705*c0909341SAndroid Build Coastguard Worker    vpgatherdw       m4, m6, scalingq-1, r0, r5, 8, 1
2706*c0909341SAndroid Build Coastguard Worker%else
2707*c0909341SAndroid Build Coastguard Worker    vpgatherdw       m7, m4, scalingq-1, r2, r12, 8, 1
2708*c0909341SAndroid Build Coastguard Worker    vpgatherdw       m4, m6, scalingq-1, r2, r12, 8, 1
2709*c0909341SAndroid Build Coastguard Worker%endif
    ; Keep only the upper byte of each gathered word.
2710*c0909341SAndroid Build Coastguard Worker    REPX   {psrlw x, 8}, m7, m4
2711*c0909341SAndroid Build Coastguard Worker
2712*c0909341SAndroid Build Coastguard Worker    ; noise = round2(scaling[luma_src] * grain, scaling_shift)
    ; The scaling values are pre-multiplied by m11 so that pmulhrsw's
    ; built-in rounding shift performs the round2().
2713*c0909341SAndroid Build Coastguard Worker    REPX {pmullw x, m11}, m7, m4
2714*c0909341SAndroid Build Coastguard Worker    pmulhrsw         m5, m7
2715*c0909341SAndroid Build Coastguard Worker    pmulhrsw         m3, m4
2716*c0909341SAndroid Build Coastguard Worker
2717*c0909341SAndroid Build Coastguard Worker    ; dst = clip_pixel(src, noise)
    ; Clamp to [m13, m12] after adding the noise, then store 16 words.
2718*c0909341SAndroid Build Coastguard Worker    paddw            m0, m5
2719*c0909341SAndroid Build Coastguard Worker    paddw            m1, m3
2720*c0909341SAndroid Build Coastguard Worker    pmaxsw           m0, m13
2721*c0909341SAndroid Build Coastguard Worker    pmaxsw           m1, m13
2722*c0909341SAndroid Build Coastguard Worker    pminsw           m0, m12
2723*c0909341SAndroid Build Coastguard Worker    pminsw           m1, m12
2724*c0909341SAndroid Build Coastguard Worker    movifnidn      dstq, dstmp
2725*c0909341SAndroid Build Coastguard Worker    mova      [dstq+ 0], m0
2726*c0909341SAndroid Build Coastguard Worker    mova      [dstq+16], m1
2727*c0909341SAndroid Build Coastguard Worker
    ; Advance to the next row.  The stride is read from r2mp on x86-32 and
    ; from r13mp on x86-64; x86-32 also writes the new dst back to dstmp.
2728*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
2729*c0909341SAndroid Build Coastguard Worker    add            srcq, r2mp
2730*c0909341SAndroid Build Coastguard Worker    add            dstq, r2mp
2731*c0909341SAndroid Build Coastguard Worker    mov           dstmp, dstq
2732*c0909341SAndroid Build Coastguard Worker%else
2733*c0909341SAndroid Build Coastguard Worker    add            srcq, r13mp
2734*c0909341SAndroid Build Coastguard Worker    add            dstq, r13mp
2735*c0909341SAndroid Build Coastguard Worker    add           lumaq, lstrideq
2736*c0909341SAndroid Build Coastguard Worker%endif
    ; One grain_lut row is 82 words (164 bytes); loop while rows remain.
2737*c0909341SAndroid Build Coastguard Worker    add      grain_lutq, 82*2
2738*c0909341SAndroid Build Coastguard Worker    dec              hd
2739*c0909341SAndroid Build Coastguard Worker    jg %%loop_y_h_overlap
2740*c0909341SAndroid Build Coastguard Worker
    ; Column advance after the h-overlap row loop.  w is a negative column
    ; counter that counts up towards zero in steps of 16; once it reaches
    ; zero (or above) the function returns through %%end.
2741*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
    ; x86-32: the fifth GPR switches role from h back to w, reloaded from
    ; its stack slot.
2742*c0909341SAndroid Build Coastguard Worker    DEFINE_ARGS dst, src, scaling, offxy, w, luma, grain_lut
2743*c0909341SAndroid Build Coastguard Worker    mov              wq, r4mp
2744*c0909341SAndroid Build Coastguard Worker%endif
2745*c0909341SAndroid Build Coastguard Worker    add              wq, 16
2746*c0909341SAndroid Build Coastguard Worker    jge %%end
    ; Rebuild the per-column pointers from the saved base pointers (r1mp or
    ; r10mp for src, r11mp for dst, r12mp for luma) plus the signed column
    ; index: 2 bytes per column for src/dst, (2<<%2) bytes per column for
    ; luma.
2747*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
2748*c0909341SAndroid Build Coastguard Worker    mov            srcq, r1mp
2749*c0909341SAndroid Build Coastguard Worker%else
2750*c0909341SAndroid Build Coastguard Worker    mov            srcq, r10mp
2751*c0909341SAndroid Build Coastguard Worker%endif
2752*c0909341SAndroid Build Coastguard Worker    mov            dstq, r11mp
2753*c0909341SAndroid Build Coastguard Worker    mov           lumaq, r12mp
2754*c0909341SAndroid Build Coastguard Worker    lea            srcq, [srcq+wq*2]
2755*c0909341SAndroid Build Coastguard Worker    lea            dstq, [dstq+wq*2]
2756*c0909341SAndroid Build Coastguard Worker    lea           lumaq, [lumaq+wq*(2<<%2)]
2757*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
    ; x86-32: spill dst, luma and w back to their stack slots, since the
    ; row loops reload them from there.
2758*c0909341SAndroid Build Coastguard Worker    mov            r0mp, dstq
2759*c0909341SAndroid Build Coastguard Worker    mov            r9mp, lumaq
2760*c0909341SAndroid Build Coastguard Worker    mov             r4m, wq
2761*c0909341SAndroid Build Coastguard Worker%endif
2762*c0909341SAndroid Build Coastguard Worker
2763*c0909341SAndroid Build Coastguard Worker%if %2
    ; Bit 1 of r8m is set by %%vertical_overlap: when set, the next column
    ; needs both horizontal and vertical blending, otherwise horizontal only.
2764*c0909341SAndroid Build Coastguard Worker    ; r8m = sbym
2765*c0909341SAndroid Build Coastguard Worker    test      dword r8m, 2
2766*c0909341SAndroid Build Coastguard Worker    jne %%loop_x_hv_overlap
2767*c0909341SAndroid Build Coastguard Worker    jmp %%loop_x_h_overlap
2768*c0909341SAndroid Build Coastguard Worker%else
    ; %2 == 0: set bit 2 of r8m and move the grain offset 16 words to the
    ; right, then continue in one of the "odd" column loops, which reuse the
    ; current offsets instead of drawing new ones.
    ; NOTE(review): bit 2 appears to mark "currently in the second 16-column
    ; half of a grain block" (it is toggled with btc elsewhere); confirm.
2769*c0909341SAndroid Build Coastguard Worker    or        dword r8m, 4
2770*c0909341SAndroid Build Coastguard Worker    add          offxyd, 16
2771*c0909341SAndroid Build Coastguard Worker
2772*c0909341SAndroid Build Coastguard Worker    ; r8m = sbym
2773*c0909341SAndroid Build Coastguard Worker    test      dword r8m, 2
2774*c0909341SAndroid Build Coastguard Worker    jz %%loop_x_odd
    ; Vertical overlap is active: the top block's offset (stack slot on
    ; x86-32, r11 on x86-64) must move right by the same 16 words.
2775*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
2776*c0909341SAndroid Build Coastguard Worker    add dword [rsp+8*mmsize+1*gprsize], 16
2777*c0909341SAndroid Build Coastguard Worker%else
2778*c0909341SAndroid Build Coastguard Worker    add            r11d, 16                 ; top_offxy += 16
2779*c0909341SAndroid Build Coastguard Worker%endif
2780*c0909341SAndroid Build Coastguard Worker    jmp %%loop_x_odd_v_overlap
2781*c0909341SAndroid Build Coastguard Worker%endif
2782*c0909341SAndroid Build Coastguard Worker
    ; Common exit for the paths above.
2783*c0909341SAndroid Build Coastguard Worker%%end:
2784*c0909341SAndroid Build Coastguard Worker    RET
2785*c0909341SAndroid Build Coastguard Worker
2786*c0909341SAndroid Build Coastguard Worker%%vertical_overlap:
    ; Entry for block rows that must also blend with the block row above.
    ; Records that fact as bit 1 of the flags word kept in r8m, builds a
    ; packed (cur_seed << 16) | top_seed value, and sets up the per-row
    ; base pointers and the negative column counter.
2787*c0909341SAndroid Build Coastguard Worker    or              t0d, 2
2788*c0909341SAndroid Build Coastguard Worker    mov             r8m, t0d
2789*c0909341SAndroid Build Coastguard Worker
2790*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
2791*c0909341SAndroid Build Coastguard Worker    DEFINE_ARGS dst, src, scaling, sby, fg_data, picptr, overlap
2792*c0909341SAndroid Build Coastguard Worker%else
2793*c0909341SAndroid Build Coastguard Worker    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, unused, \
2794*c0909341SAndroid Build Coastguard Worker                sby, see, unused1, unused2, unused3, lstride
2795*c0909341SAndroid Build Coastguard Worker%endif
2796*c0909341SAndroid Build Coastguard Worker
    ; Only the low byte of sby takes part in the seed hash.
2797*c0909341SAndroid Build Coastguard Worker    movzx          sbyd, sbyb
    ; Multiplying by 0x00010001 copies the frame seed into both 16-bit
    ; halves.  On x86-32 the product is written to r4 (still named fg_data),
    ; which the following DEFINE_ARGS renames to see.
    ; NOTE(review): this relies on FGData.seed fitting in 16 bits; confirm.
2798*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
2799*c0909341SAndroid Build Coastguard Worker    imul             r4, [fg_dataq+FGData.seed], 0x00010001
2800*c0909341SAndroid Build Coastguard Worker
2801*c0909341SAndroid Build Coastguard Worker    DEFINE_ARGS tmp, src, scaling, sby, see, picptr, unused
2802*c0909341SAndroid Build Coastguard Worker%else
2803*c0909341SAndroid Build Coastguard Worker    imul           seed, [fg_dataq+FGData.seed], 0x00010001
2804*c0909341SAndroid Build Coastguard Worker%endif
    ; Row hash, computed for two rows at once:
    ;   t0d  bytes 0 / 2 = (sby*173 + 188) & 0xff / (sby*173 + 105) & 0xff
    ;   sbyd bytes 1 / 3 = (sby*37  + 141) & 0xff / (sby*37  + 178) & 0xff
    ; Since 188 == 105 - 173 (mod 256) and 141 == 178 - 37, the low 16-bit
    ; half is the same hash evaluated at sby - 1, i.e. for the row above.
2805*c0909341SAndroid Build Coastguard Worker    imul            t0d, sbyd, 173 * 0x00010001
2806*c0909341SAndroid Build Coastguard Worker    imul           sbyd, 37 * 0x01000100
2807*c0909341SAndroid Build Coastguard Worker    add             t0d, (105 << 16) | 188
2808*c0909341SAndroid Build Coastguard Worker    add            sbyd, (178 << 24) | (141 << 8)
2809*c0909341SAndroid Build Coastguard Worker    and             t0d, 0x00ff00ff
2810*c0909341SAndroid Build Coastguard Worker    and            sbyd, 0xff00ff00
2811*c0909341SAndroid Build Coastguard Worker    xor            seed, t0d
2812*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
    ; x86-32: the final xor accumulates into sby's register (r3), which the
    ; DEFINE_ARGS below renames to see; the packed seed is then kept in r3m.
2813*c0909341SAndroid Build Coastguard Worker    xor            sbyd, seed
2814*c0909341SAndroid Build Coastguard Worker
2815*c0909341SAndroid Build Coastguard Worker    DEFINE_ARGS dst, src, scaling, see, w, picptr, luma
2816*c0909341SAndroid Build Coastguard Worker
2817*c0909341SAndroid Build Coastguard Worker    mov             r3m, seed
2818*c0909341SAndroid Build Coastguard Worker    mov            dstq, r0mp
2819*c0909341SAndroid Build Coastguard Worker    mov           lumaq, r9mp
2820*c0909341SAndroid Build Coastguard Worker    mov              wq, r4m
    ; Save end-of-row base pointers (start + w columns) for src, dst and
    ; luma; r3 is reused as scratch now that the seed is stored in r3m.
2821*c0909341SAndroid Build Coastguard Worker    lea              r3, [srcq+wq*2]
2822*c0909341SAndroid Build Coastguard Worker    mov            r1mp, r3
2823*c0909341SAndroid Build Coastguard Worker    lea              r3, [dstq+wq*2]
2824*c0909341SAndroid Build Coastguard Worker    mov           r11mp, r3
2825*c0909341SAndroid Build Coastguard Worker    lea              r3, [lumaq+wq*(2<<%2)]
2826*c0909341SAndroid Build Coastguard Worker    mov           r12mp, r3
2827*c0909341SAndroid Build Coastguard Worker%if %3
    ; Double the stored luma stride when %3 is set.
    ; NOTE(review): %3 looks like the vertical-subsampling flag; confirm.
2828*c0909341SAndroid Build Coastguard Worker    shl           r10mp, 1
2829*c0909341SAndroid Build Coastguard Worker%endif
2830*c0909341SAndroid Build Coastguard Worker%else
2831*c0909341SAndroid Build Coastguard Worker    xor            seed, sbyd               ; (cur_seed << 16) | top_seed
2832*c0909341SAndroid Build Coastguard Worker
2833*c0909341SAndroid Build Coastguard Worker    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
2834*c0909341SAndroid Build Coastguard Worker                unused1, unused2, see, unused3, unused4, unused5, luma, lstride
2835*c0909341SAndroid Build Coastguard Worker
    ; x86-64: luma stride (doubled when %3 is set) and luma pointer are
    ; loaded into registers; the end-of-row base pointers are computed in
    ; r10-r12 and saved to the r10mp-r12mp slots.
2836*c0909341SAndroid Build Coastguard Worker    mov        lstrideq, r10mp
2837*c0909341SAndroid Build Coastguard Worker%if %3
2838*c0909341SAndroid Build Coastguard Worker    add        lstrideq, lstrideq
2839*c0909341SAndroid Build Coastguard Worker%endif
2840*c0909341SAndroid Build Coastguard Worker    mov           lumaq, r9mp
2841*c0909341SAndroid Build Coastguard Worker    lea             r10, [srcq+wq*2]
2842*c0909341SAndroid Build Coastguard Worker    lea             r11, [dstq+wq*2]
2843*c0909341SAndroid Build Coastguard Worker    lea             r12, [lumaq+wq*(2<<%2)]
2844*c0909341SAndroid Build Coastguard Worker    mov           r10mp, r10
2845*c0909341SAndroid Build Coastguard Worker    mov           r11mp, r11
2846*c0909341SAndroid Build Coastguard Worker    mov           r12mp, r12
2847*c0909341SAndroid Build Coastguard Worker%endif
    ; From here on w is a negative column index relative to the saved
    ; end-of-row pointers; it is stepped by +16 until it reaches zero.
2848*c0909341SAndroid Build Coastguard Worker    neg              wq
2849*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
2850*c0909341SAndroid Build Coastguard Worker    mov             r4m, wq
2851*c0909341SAndroid Build Coastguard Worker%endif
2852*c0909341SAndroid Build Coastguard Worker
2853*c0909341SAndroid Build Coastguard Worker%%loop_x_v_overlap:
    ; Per-column setup for a column that blends with the block above only.
    ; Steps both packed 16-bit seeds once, derives the grain_lut offsets for
    ; the current and the top block from them, and initialises the row loop.
2854*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
2855*c0909341SAndroid Build Coastguard Worker    mov            seed, r3m
2856*c0909341SAndroid Build Coastguard Worker    xor             t0d, t0d
2857*c0909341SAndroid Build Coastguard Worker%else
2858*c0909341SAndroid Build Coastguard Worker    ; we assume from the block above that bits 8-15 of r7d are zero'ed
2859*c0909341SAndroid Build Coastguard Worker%endif
    ; Seed update for both halves at once.  OR-ing with 0xeff4 leaves only
    ; bits 0, 1, 3 and 12 of a 16-bit seed variable (all other bits become
    ; 1, and an even number of those survive the AND), so the parity flag of
    ; "test seeb, seeh" is the parity of exactly those four tap bits.
    ; The first setp (top_seed) is moved up to bit 16 and the second
    ; (cur_seed) lands in bit 0; this crossing is intentional, because the
    ; final "ror seed, 1" moves bit 16 to bit 15 (new MSB of top_seed) and
    ; bit 0 to bit 31 (new MSB of cur_seed).
    ; setp yields 1 for even parity, so XOR-ing it with the bit forced to 1
    ; by "or r6d, 0x00010001" leaves the XOR of the tap bits as feedback.
2860*c0909341SAndroid Build Coastguard Worker    mov             r6d, seed
2861*c0909341SAndroid Build Coastguard Worker    or             seed, 0xeff4eff4
2862*c0909341SAndroid Build Coastguard Worker    test           seeb, seeh
2863*c0909341SAndroid Build Coastguard Worker    setp            t0b                     ; parity of top_seed
2864*c0909341SAndroid Build Coastguard Worker    shr            seed, 16
2865*c0909341SAndroid Build Coastguard Worker    shl             t0d, 16
2866*c0909341SAndroid Build Coastguard Worker    test           seeb, seeh
2867*c0909341SAndroid Build Coastguard Worker    setp            t0b                     ; parity of cur_seed
2868*c0909341SAndroid Build Coastguard Worker    or              r6d, 0x00010001
2869*c0909341SAndroid Build Coastguard Worker    xor             t0d, r6d
2870*c0909341SAndroid Build Coastguard Worker    mov            seed, t0d
2871*c0909341SAndroid Build Coastguard Worker    ror            seed, 1                  ; updated (cur_seed << 16) | top_seed
2872*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
    ; x86-32: persist the seed, then rename see's register to offy and copy
    ; it into offx so both start from the packed seed.
2873*c0909341SAndroid Build Coastguard Worker    mov             r3m, seed
2874*c0909341SAndroid Build Coastguard Worker
2875*c0909341SAndroid Build Coastguard Worker    DEFINE_ARGS dst, src, scaling, offy, w, picptr, offx
2876*c0909341SAndroid Build Coastguard Worker
2877*c0909341SAndroid Build Coastguard Worker    mov           offxd, offyd
2878*c0909341SAndroid Build Coastguard Worker%else
2879*c0909341SAndroid Build Coastguard Worker    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
2880*c0909341SAndroid Build Coastguard Worker                offx, offy, see, unused1, top_offxy, unused2, luma, lstride
2881*c0909341SAndroid Build Coastguard Worker
2882*c0909341SAndroid Build Coastguard Worker    mov           offyd, seed
2883*c0909341SAndroid Build Coastguard Worker    mov           offxd, seed
2884*c0909341SAndroid Build Coastguard Worker%endif
    ; Extract, for each 16-bit half, seed bits 8-11 as offy and bits 12-15
    ; as offx (the rotates bring them to bits 0-3 / 16-19, and the 0xf000f
    ; mask keeps one nibble per half).
2885*c0909341SAndroid Build Coastguard Worker    ror           offyd, 8
2886*c0909341SAndroid Build Coastguard Worker    ror           offxd, 12
2887*c0909341SAndroid Build Coastguard Worker    and           offyd, 0xf000f
2888*c0909341SAndroid Build Coastguard Worker    and           offxd, 0xf000f
    ; offy is scaled by 164>>%3, i.e. two (or one) grain_lut rows of 82
    ; words.  The lea adds the fixed origin to both halves (the 0x10001
    ; factor) and an extra (32>>%3) rows to the low half only, so the top
    ; block's offset points further down inside its grain block.
2889*c0909341SAndroid Build Coastguard Worker    imul          offyd, 164>>%3
2890*c0909341SAndroid Build Coastguard Worker    ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy
2891*c0909341SAndroid Build Coastguard Worker    lea           offyq, [offyq+offxq*(2-%2)+0x10001*((3+(6>>%3))*82+3+(6>>%2))+(32>>%3)*82]
2892*c0909341SAndroid Build Coastguard Worker
2893*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
2894*c0909341SAndroid Build Coastguard Worker    DEFINE_ARGS top_offxy, src, scaling, offxy, h, picptr, grain_lut
2895*c0909341SAndroid Build Coastguard Worker%else
2896*c0909341SAndroid Build Coastguard Worker    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
2897*c0909341SAndroid Build Coastguard Worker                h, offxy, see, unused1, top_offxy, unused2, luma, lstride
2898*c0909341SAndroid Build Coastguard Worker%endif
    ; Split the packed offsets: low 16 bits -> top_offxy (spilled to a stack
    ; slot on x86-32, where it temporarily borrows dst's register), high 16
    ; bits -> offxy for the current block.
2899*c0909341SAndroid Build Coastguard Worker    movzx    top_offxyd, offxyw
2900*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
2901*c0909341SAndroid Build Coastguard Worker    mov [rsp+8*mmsize+1*gprsize], top_offxyd
2902*c0909341SAndroid Build Coastguard Worker    DEFINE_ARGS dst, src, scaling, offxy, h, picptr, grain_lut
2903*c0909341SAndroid Build Coastguard Worker%endif
2904*c0909341SAndroid Build Coastguard Worker    shr          offxyd, 16
2905*c0909341SAndroid Build Coastguard Worker
    ; Secondary entry (only when %2 == 0) for the column that reuses the
    ; current offsets shifted right by 16 instead of drawing new ones.
2906*c0909341SAndroid Build Coastguard Worker%if %2 == 0
2907*c0909341SAndroid Build Coastguard Worker%%loop_x_odd_v_overlap:
2908*c0909341SAndroid Build Coastguard Worker%endif
2909*c0909341SAndroid Build Coastguard Worker%if %3 == 0
    ; Load the weight pair for the first overlapped row into every dword of
    ; m2 (the second row's pair is loaded from +4 inside the row loop).
2910*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
2911*c0909341SAndroid Build Coastguard Worker    mov              r5, r5m
2912*c0909341SAndroid Build Coastguard Worker%endif
2913*c0909341SAndroid Build Coastguard Worker    SPLATD           m2, [PIC_ptr(pw_27_17_17_27)]
2914*c0909341SAndroid Build Coastguard Worker%endif
2915*c0909341SAndroid Build Coastguard Worker
    ; Row count and grain_lut base for the row loop that follows.
2916*c0909341SAndroid Build Coastguard Worker    mov              hd, r7m
2917*c0909341SAndroid Build Coastguard Worker    mov      grain_lutq, grain_lutmp
2918*c0909341SAndroid Build Coastguard Worker%%loop_y_v_overlap:
    ; Row loop for the rows that overlap the block above (no left overlap).
    ; Per row: blend all 16 grain words of the current block with the same
    ; columns of the top block using the weight pair in m2, then apply the
    ; usual scaling, add-to-src and clamp.  Runs for one row when %3 is set
    ; and for two rows otherwise, then continues in %%loop_y.
2919*c0909341SAndroid Build Coastguard Worker    ; grain = grain_lut[offy+y][offx+x]
2920*c0909341SAndroid Build Coastguard Worker    movu             m3, [grain_lutq+offxyq*2]
2921*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
    ; r0 (dst's register) is borrowed for top_offxy; dstq is re-fetched with
    ; movifnidn before the store.
2922*c0909341SAndroid Build Coastguard Worker    mov              r0, [rsp+mmsize*8+gprsize*1] ; top_offxy
2923*c0909341SAndroid Build Coastguard Worker    movu             m5, [grain_lutq+r0*2]
2924*c0909341SAndroid Build Coastguard Worker%else
2925*c0909341SAndroid Build Coastguard Worker    movu             m5, [grain_lutq+top_offxyq*2]
2926*c0909341SAndroid Build Coastguard Worker%endif
    ; Weighted sum of each {top, cur} pair, rounding add (pd_16 from memory
    ; when %1, otherwise m14), >> 5, pack to words and clamp to [m8, m9].
    ; Result: m3 = blended grain words 0-7.
2927*c0909341SAndroid Build Coastguard Worker    punpckhwd        m7, m5, m3
2928*c0909341SAndroid Build Coastguard Worker    punpcklwd        m5, m3                 ; {top/cur interleaved}
2929*c0909341SAndroid Build Coastguard Worker    REPX {pmaddwd x, m2}, m7, m5
2930*c0909341SAndroid Build Coastguard Worker%if %1
2931*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
2932*c0909341SAndroid Build Coastguard Worker    mov              r5, r5m
2933*c0909341SAndroid Build Coastguard Worker%endif
2934*c0909341SAndroid Build Coastguard Worker    REPX  {paddd x, [PIC_ptr(pd_16)]}, m7, m5
2935*c0909341SAndroid Build Coastguard Worker%else
2936*c0909341SAndroid Build Coastguard Worker    REPX  {paddd x, m14}, m7, m5
2937*c0909341SAndroid Build Coastguard Worker%endif
2938*c0909341SAndroid Build Coastguard Worker    REPX   {psrad x, 5}, m7, m5
2939*c0909341SAndroid Build Coastguard Worker    packssdw         m3, m5, m7
2940*c0909341SAndroid Build Coastguard Worker    pmaxsw           m3, m8
2941*c0909341SAndroid Build Coastguard Worker    pminsw           m3, m9
2942*c0909341SAndroid Build Coastguard Worker
    ; Same blend for grain words 8-15; result in m4.
2943*c0909341SAndroid Build Coastguard Worker    ; grain = grain_lut[offy+y][offx+x]
2944*c0909341SAndroid Build Coastguard Worker    movu             m4, [grain_lutq+offxyq*2+16]
2945*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
2946*c0909341SAndroid Build Coastguard Worker    movu             m5, [grain_lutq+r0*2+16]
2947*c0909341SAndroid Build Coastguard Worker%else
2948*c0909341SAndroid Build Coastguard Worker    movu             m5, [grain_lutq+top_offxyq*2+16]
2949*c0909341SAndroid Build Coastguard Worker%endif
2950*c0909341SAndroid Build Coastguard Worker    punpckhwd        m7, m5, m4
2951*c0909341SAndroid Build Coastguard Worker    punpcklwd        m5, m4                 ; {top/cur interleaved}
2952*c0909341SAndroid Build Coastguard Worker    REPX {pmaddwd x, m2}, m7, m5
2953*c0909341SAndroid Build Coastguard Worker%if %1
2954*c0909341SAndroid Build Coastguard Worker    REPX  {paddd x, [PIC_ptr(pd_16)]}, m7, m5
2955*c0909341SAndroid Build Coastguard Worker%else
2956*c0909341SAndroid Build Coastguard Worker    REPX  {paddd x, m14}, m7, m5
2957*c0909341SAndroid Build Coastguard Worker%endif
2958*c0909341SAndroid Build Coastguard Worker    REPX   {psrad x, 5}, m7, m5
2959*c0909341SAndroid Build Coastguard Worker    packssdw         m4, m5, m7
2960*c0909341SAndroid Build Coastguard Worker    pmaxsw           m4, m8
2961*c0909341SAndroid Build Coastguard Worker    pminsw           m4, m9
2962*c0909341SAndroid Build Coastguard Worker
2963*c0909341SAndroid Build Coastguard Worker    ; src
2964*c0909341SAndroid Build Coastguard Worker    mova             m0, [srcq]
2965*c0909341SAndroid Build Coastguard Worker    mova             m1, [srcq+16]
2966*c0909341SAndroid Build Coastguard Worker
2967*c0909341SAndroid Build Coastguard Worker    ; luma_src
2968*c0909341SAndroid Build Coastguard Worker    pxor          mzero, mzero
2969*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
    ; x86-32: the sixth GPR switches role from picptr to luma; the luma
    ; pointer is kept in the r9mp stack slot between rows.
2970*c0909341SAndroid Build Coastguard Worker    DEFINE_ARGS dst, src, scaling, offxy, h, luma, grain_lut
2971*c0909341SAndroid Build Coastguard Worker
2972*c0909341SAndroid Build Coastguard Worker    mov           lumaq, r9mp
2973*c0909341SAndroid Build Coastguard Worker%endif
    ; Second luma vector sits at +16 bytes when %2 == 0 and at +32 when
    ; %2 == 1; with %2 set, phaddw + pavgw-with-zero reduce each adjacent
    ; pair of luma words to its rounded average ((a + b + 1) >> 1).
2974*c0909341SAndroid Build Coastguard Worker    mova             m5, [lumaq+ 0]
2975*c0909341SAndroid Build Coastguard Worker    mova             m6, [lumaq+(16<<%2)]
2976*c0909341SAndroid Build Coastguard Worker%if %2
2977*c0909341SAndroid Build Coastguard Worker    phaddw           m5, [lumaq+16]
2978*c0909341SAndroid Build Coastguard Worker    phaddw           m6, [lumaq+48]
2979*c0909341SAndroid Build Coastguard Worker%endif
2980*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
    ; Step the saved luma pointer by r10mp for the next row (the x86-64
    ; path adds lstrideq after the store instead).
2981*c0909341SAndroid Build Coastguard Worker    add           lumaq, r10mp
2982*c0909341SAndroid Build Coastguard Worker    mov            r9mp, lumaq
2983*c0909341SAndroid Build Coastguard Worker%endif
2984*c0909341SAndroid Build Coastguard Worker%if %2
2985*c0909341SAndroid Build Coastguard Worker    pavgw            m5, mzero
2986*c0909341SAndroid Build Coastguard Worker    pavgw            m6, mzero
2987*c0909341SAndroid Build Coastguard Worker%endif
2988*c0909341SAndroid Build Coastguard Worker
2989*c0909341SAndroid Build Coastguard Worker%if %1
    ; Mix luma and src words through the m14 multipliers, >> 6, add m15 and
    ; clamp to [0, m10].  m7 is the only scratch register here, so the two
    ; halves are processed one after the other.
    ; NOTE(review): mzero is cleared a second time before the clamp,
    ; presumably because it can alias m7 on some targets; confirm against
    ; the mzero definition.
2990*c0909341SAndroid Build Coastguard Worker    punpckhwd        m7, m5, m0
2991*c0909341SAndroid Build Coastguard Worker    punpcklwd        m5, m0
2992*c0909341SAndroid Build Coastguard Worker    REPX {pmaddwd x, m14}, m7, m5
2993*c0909341SAndroid Build Coastguard Worker    REPX {psrad   x, 6}, m7, m5
2994*c0909341SAndroid Build Coastguard Worker    packssdw         m5, m7
2995*c0909341SAndroid Build Coastguard Worker    punpckhwd        m7, m6, m1
2996*c0909341SAndroid Build Coastguard Worker    punpcklwd        m6, m1                 ; { luma, chroma }
2997*c0909341SAndroid Build Coastguard Worker    REPX {pmaddwd x, m14}, m7, m6
2998*c0909341SAndroid Build Coastguard Worker    REPX {psrad   x, 6}, m7, m6
2999*c0909341SAndroid Build Coastguard Worker    packssdw         m6, m7
3000*c0909341SAndroid Build Coastguard Worker    pxor          mzero, mzero
3001*c0909341SAndroid Build Coastguard Worker    REPX {paddw x, m15}, m5, m6
3002*c0909341SAndroid Build Coastguard Worker    REPX {pmaxsw x, mzero}, m5, m6
3003*c0909341SAndroid Build Coastguard Worker    REPX {pminsw x, m10}, m5, m6            ; clip_pixel()
3004*c0909341SAndroid Build Coastguard Worker%else
    ; Luma-only index: just mask the luma words with m10.
3005*c0909341SAndroid Build Coastguard Worker    REPX  {pand x, m10}, m5, m6
3006*c0909341SAndroid Build Coastguard Worker%endif
3007*c0909341SAndroid Build Coastguard Worker
3008*c0909341SAndroid Build Coastguard Worker    ; scaling[luma_src]
    ; NOTE(review): the GPR arguments (r0/r5 or r10/r12) are presumably
    ; scratch registers for the gather macro; confirm against the
    ; vpgatherdw macro definition.
3009*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
3010*c0909341SAndroid Build Coastguard Worker    vpgatherdw       m7, m5, scalingq-1, r0, r5, 8, 1
3011*c0909341SAndroid Build Coastguard Worker    vpgatherdw       m5, m6, scalingq-1, r0, r5, 8, 1
3012*c0909341SAndroid Build Coastguard Worker%else
3013*c0909341SAndroid Build Coastguard Worker    vpgatherdw       m7, m5, scalingq-1, r10, r12, 8, 1
3014*c0909341SAndroid Build Coastguard Worker    vpgatherdw       m5, m6, scalingq-1, r10, r12, 8, 1
3015*c0909341SAndroid Build Coastguard Worker%endif
    ; Keep only the upper byte of each gathered word.
3016*c0909341SAndroid Build Coastguard Worker    REPX   {psrlw x, 8}, m7, m5
3017*c0909341SAndroid Build Coastguard Worker
3018*c0909341SAndroid Build Coastguard Worker    ; noise = round2(scaling[luma_src] * grain, scaling_shift)
    ; The scaling values are pre-multiplied by m11 so that pmulhrsw's
    ; built-in rounding shift performs the round2().
3019*c0909341SAndroid Build Coastguard Worker    REPX {pmullw x, m11}, m7, m5
3020*c0909341SAndroid Build Coastguard Worker    pmulhrsw         m3, m7
3021*c0909341SAndroid Build Coastguard Worker    pmulhrsw         m4, m5
3022*c0909341SAndroid Build Coastguard Worker
3023*c0909341SAndroid Build Coastguard Worker    ; dst = clip_pixel(src, noise)
    ; Clamp to [m13, m12] after adding the noise, then store 16 words.
3024*c0909341SAndroid Build Coastguard Worker    paddw            m0, m3
3025*c0909341SAndroid Build Coastguard Worker    paddw            m1, m4
3026*c0909341SAndroid Build Coastguard Worker    pmaxsw           m0, m13
3027*c0909341SAndroid Build Coastguard Worker    pmaxsw           m1, m13
3028*c0909341SAndroid Build Coastguard Worker    pminsw           m0, m12
3029*c0909341SAndroid Build Coastguard Worker    pminsw           m1, m12
3030*c0909341SAndroid Build Coastguard Worker    movifnidn      dstq, dstmp
3031*c0909341SAndroid Build Coastguard Worker    mova      [dstq+ 0], m0
3032*c0909341SAndroid Build Coastguard Worker    mova      [dstq+16], m1
3033*c0909341SAndroid Build Coastguard Worker
    ; The row count is decremented as a 16-bit quantity because bit 16 of hd
    ; doubles as the "second overlap row" toggle used by btc below.
3034*c0909341SAndroid Build Coastguard Worker    dec              hw
3035*c0909341SAndroid Build Coastguard Worker    jle %%end_y_v_overlap
    ; Advance to the next row (stride from r2mp on x86-32, r13mp on x86-64).
3036*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
3037*c0909341SAndroid Build Coastguard Worker    add            srcq, r2mp
3038*c0909341SAndroid Build Coastguard Worker    add            dstq, r2mp
3039*c0909341SAndroid Build Coastguard Worker    mov           dstmp, dstq
3040*c0909341SAndroid Build Coastguard Worker%else
3041*c0909341SAndroid Build Coastguard Worker    add            srcq, r13mp
3042*c0909341SAndroid Build Coastguard Worker    add            dstq, r13mp
3043*c0909341SAndroid Build Coastguard Worker    add           lumaq, lstrideq
3044*c0909341SAndroid Build Coastguard Worker%endif
3045*c0909341SAndroid Build Coastguard Worker    add      grain_lutq, 82*2
3046*c0909341SAndroid Build Coastguard Worker%if %3
    ; With %3 set only one row is blended; the remaining rows go through
    ; the %%loop_y row loop.
3047*c0909341SAndroid Build Coastguard Worker    jmp %%loop_y
3048*c0909341SAndroid Build Coastguard Worker%else
    ; btc flips bit 16 of hd and returns its old value in CF.  First pass:
    ; the bit was clear, so load the second row's weight pair and run one
    ; more blended row.  Second pass: the bit was set (and is now cleared
    ; again), so continue in %%loop_y.
3049*c0909341SAndroid Build Coastguard Worker    btc              hd, 16
3050*c0909341SAndroid Build Coastguard Worker    jc %%loop_y
3051*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
3052*c0909341SAndroid Build Coastguard Worker    mov              r5, r5m
3053*c0909341SAndroid Build Coastguard Worker%endif
3054*c0909341SAndroid Build Coastguard Worker    SPLATD           m2, [PIC_ptr(pw_27_17_17_27)+4]
3055*c0909341SAndroid Build Coastguard Worker    jmp %%loop_y_v_overlap
3056*c0909341SAndroid Build Coastguard Worker%endif
3057*c0909341SAndroid Build Coastguard Worker
3058*c0909341SAndroid Build Coastguard Worker%%end_y_v_overlap:
    ; Reached when the row count hits zero inside the v-overlap row loop.
    ; Advances to the next 16-column step (w is a negative counter stepping
    ; towards zero), rebuilds the column pointers and picks the loop for the
    ; next column.
3059*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
3060*c0909341SAndroid Build Coastguard Worker    DEFINE_ARGS dst, src, scaling, offxy, w, luma, grain_lut
3061*c0909341SAndroid Build Coastguard Worker
3062*c0909341SAndroid Build Coastguard Worker    mov              wq, r4m
3063*c0909341SAndroid Build Coastguard Worker%endif
3064*c0909341SAndroid Build Coastguard Worker    add              wq, 16
3065*c0909341SAndroid Build Coastguard Worker    jge %%end_hv
    ; Column pointers = saved base pointers + signed column index
    ; (2 bytes per column for src/dst, (2<<%2) bytes per column for luma).
3066*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
3067*c0909341SAndroid Build Coastguard Worker    mov            srcq, r1mp
3068*c0909341SAndroid Build Coastguard Worker%else
3069*c0909341SAndroid Build Coastguard Worker    mov            srcq, r10mp
3070*c0909341SAndroid Build Coastguard Worker%endif
3071*c0909341SAndroid Build Coastguard Worker    mov            dstq, r11mp
3072*c0909341SAndroid Build Coastguard Worker    mov           lumaq, r12mp
3073*c0909341SAndroid Build Coastguard Worker    lea            srcq, [srcq+wq*2]
3074*c0909341SAndroid Build Coastguard Worker    lea            dstq, [dstq+wq*2]
3075*c0909341SAndroid Build Coastguard Worker    lea           lumaq, [lumaq+wq*(2<<%2)]
3076*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
    ; x86-32: spill dst, luma and w back to their stack slots.
3077*c0909341SAndroid Build Coastguard Worker    mov            r0mp, dstq
3078*c0909341SAndroid Build Coastguard Worker    mov            r9mp, lumaq
3079*c0909341SAndroid Build Coastguard Worker    mov             r4m, wq
3080*c0909341SAndroid Build Coastguard Worker%endif
3081*c0909341SAndroid Build Coastguard Worker
3082*c0909341SAndroid Build Coastguard Worker%if %2
3083*c0909341SAndroid Build Coastguard Worker    ; since fg_dataq.overlap is guaranteed to be set, we never jump
3084*c0909341SAndroid Build Coastguard Worker    ; back to .loop_x_v_overlap, and instead always fall-through to
3085*c0909341SAndroid Build Coastguard Worker    ; h+v overlap
3086*c0909341SAndroid Build Coastguard Worker%else
    ; btc flips bit 2 of r8m and returns its old value in CF.  If the bit
    ; was set, the next column goes to the h+v overlap loop.  Otherwise the
    ; bit is now set and the next column reuses the current offsets shifted
    ; right by 16 words (both offxy and the top block's offset, which lives
    ; in a stack slot on x86-32 and in r11 on x86-64).
3087*c0909341SAndroid Build Coastguard Worker    btc       dword r8m, 2
3088*c0909341SAndroid Build Coastguard Worker    jc %%loop_x_hv_overlap
3089*c0909341SAndroid Build Coastguard Worker    add          offxyd, 16
3090*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
3091*c0909341SAndroid Build Coastguard Worker    add dword [rsp+8*mmsize+1*gprsize], 16
3092*c0909341SAndroid Build Coastguard Worker%else
3093*c0909341SAndroid Build Coastguard Worker    add            r11d, 16
3094*c0909341SAndroid Build Coastguard Worker%endif
3095*c0909341SAndroid Build Coastguard Worker    jmp %%loop_x_odd_v_overlap
3096*c0909341SAndroid Build Coastguard Worker%endif
3097*c0909341SAndroid Build Coastguard Worker
3098*c0909341SAndroid Build Coastguard Worker%%loop_x_hv_overlap:
3099*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
3100*c0909341SAndroid Build Coastguard Worker    DEFINE_ARGS dst, src, scaling, offxy, w, picptr, grain_lut
3101*c0909341SAndroid Build Coastguard Worker
3102*c0909341SAndroid Build Coastguard Worker    mov             t0d, [rsp+mmsize*8+gprsize*1] ; top_offxy
3103*c0909341SAndroid Build Coastguard Worker    add          offxyd, 16
3104*c0909341SAndroid Build Coastguard Worker    add             t0d, 16
3105*c0909341SAndroid Build Coastguard Worker    mov [rsp+mmsize*8+gprsize*0], offxyd ; left_offxyd
3106*c0909341SAndroid Build Coastguard Worker    mov [rsp+mmsize*8+gprsize*2], t0d ; topleft_offxyd
3107*c0909341SAndroid Build Coastguard Worker
3108*c0909341SAndroid Build Coastguard Worker    DEFINE_ARGS dst, src, scaling, see, w, picptr, grain_lut
3109*c0909341SAndroid Build Coastguard Worker
3110*c0909341SAndroid Build Coastguard Worker    mov            seed, r3m
3111*c0909341SAndroid Build Coastguard Worker    xor             t0d, t0d
3112*c0909341SAndroid Build Coastguard Worker%else
3113*c0909341SAndroid Build Coastguard Worker    ; we assume from the block above that bits 8-15 of r7d are zero'ed
3114*c0909341SAndroid Build Coastguard Worker%endif
3115*c0909341SAndroid Build Coastguard Worker    mov             r6d, seed
3116*c0909341SAndroid Build Coastguard Worker    or             seed, 0xeff4eff4
3117*c0909341SAndroid Build Coastguard Worker    test           seeb, seeh
3118*c0909341SAndroid Build Coastguard Worker    setp            t0b                     ; parity of top_seed
3119*c0909341SAndroid Build Coastguard Worker    shr            seed, 16
3120*c0909341SAndroid Build Coastguard Worker    shl             t0d, 16
3121*c0909341SAndroid Build Coastguard Worker    test           seeb, seeh
3122*c0909341SAndroid Build Coastguard Worker    setp            t0b                     ; parity of cur_seed
3123*c0909341SAndroid Build Coastguard Worker    or              r6d, 0x00010001
3124*c0909341SAndroid Build Coastguard Worker    xor             t0d, r6d
3125*c0909341SAndroid Build Coastguard Worker    mov            seed, t0d
3126*c0909341SAndroid Build Coastguard Worker    ror            seed, 1                  ; updated (cur_seed << 16) | top_seed
3127*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
3128*c0909341SAndroid Build Coastguard Worker    mov             r3m, seed
3129*c0909341SAndroid Build Coastguard Worker
3130*c0909341SAndroid Build Coastguard Worker    DEFINE_ARGS dst, src, scaling, offy, w, picptr, offx
3131*c0909341SAndroid Build Coastguard Worker
3132*c0909341SAndroid Build Coastguard Worker    mov           offxd, offyd
3133*c0909341SAndroid Build Coastguard Worker%else
3134*c0909341SAndroid Build Coastguard Worker    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
3135*c0909341SAndroid Build Coastguard Worker                offx, offy, see, left_offxy, top_offxy, topleft_offxy, luma, lstride
3136*c0909341SAndroid Build Coastguard Worker
3137*c0909341SAndroid Build Coastguard Worker    lea  topleft_offxyq, [top_offxyq+16]
3138*c0909341SAndroid Build Coastguard Worker    lea     left_offxyq, [offyq+16]
3139*c0909341SAndroid Build Coastguard Worker    mov           offyd, seed
3140*c0909341SAndroid Build Coastguard Worker    mov           offxd, seed
3141*c0909341SAndroid Build Coastguard Worker%endif
3142*c0909341SAndroid Build Coastguard Worker    ror           offyd, 8
3143*c0909341SAndroid Build Coastguard Worker    ror           offxd, 12
3144*c0909341SAndroid Build Coastguard Worker    and           offyd, 0xf000f
3145*c0909341SAndroid Build Coastguard Worker    and           offxd, 0xf000f
3146*c0909341SAndroid Build Coastguard Worker    imul          offyd, 164>>%3
3147*c0909341SAndroid Build Coastguard Worker    ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy
3148*c0909341SAndroid Build Coastguard Worker    lea           offyq, [offyq+offxq*(2-%2)+0x10001*((3+(6>>%3))*82+3+(6>>%2))+(32>>%3)*82]
3149*c0909341SAndroid Build Coastguard Worker
3150*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
3151*c0909341SAndroid Build Coastguard Worker    DEFINE_ARGS dst, src, scaling, offxy, h, picptr, top_offxy
3152*c0909341SAndroid Build Coastguard Worker%else
3153*c0909341SAndroid Build Coastguard Worker    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
3154*c0909341SAndroid Build Coastguard Worker                h, offxy, see, left_offxy, top_offxy, topleft_offxy, luma, lstride
3155*c0909341SAndroid Build Coastguard Worker%endif
3156*c0909341SAndroid Build Coastguard Worker    movzx    top_offxyd, offxyw
3157*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
3158*c0909341SAndroid Build Coastguard Worker    mov [rsp+8*mmsize+1*gprsize], top_offxyd
3159*c0909341SAndroid Build Coastguard Worker
3160*c0909341SAndroid Build Coastguard Worker    DEFINE_ARGS dst, src, scaling, offxy, h, picptr, grain_lut
3161*c0909341SAndroid Build Coastguard Worker%endif
3162*c0909341SAndroid Build Coastguard Worker    shr          offxyd, 16
3163*c0909341SAndroid Build Coastguard Worker
3164*c0909341SAndroid Build Coastguard Worker%if %3 == 0
3165*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
3166*c0909341SAndroid Build Coastguard Worker    mov              r5, r5m
3167*c0909341SAndroid Build Coastguard Worker%endif
3168*c0909341SAndroid Build Coastguard Worker    SPLATD           m2, [PIC_ptr(pw_27_17_17_27)]
3169*c0909341SAndroid Build Coastguard Worker%endif
3170*c0909341SAndroid Build Coastguard Worker
3171*c0909341SAndroid Build Coastguard Worker    mov              hd, r7m
3172*c0909341SAndroid Build Coastguard Worker    mov      grain_lutq, grain_lutmp
3173*c0909341SAndroid Build Coastguard Worker%%loop_y_hv_overlap:
3174*c0909341SAndroid Build Coastguard Worker    ; grain = grain_lut[offy+y][offx+x]
3175*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
3176*c0909341SAndroid Build Coastguard Worker    mov              r5, [rsp+8*mmsize+0*gprsize] ; left_offxy
3177*c0909341SAndroid Build Coastguard Worker    mov              r0, [rsp+8*mmsize+1*gprsize] ; top_offxy
3178*c0909341SAndroid Build Coastguard Worker    movd             m5, [grain_lutq+r5*2]
3179*c0909341SAndroid Build Coastguard Worker%else
3180*c0909341SAndroid Build Coastguard Worker    movd             m5, [grain_lutq+left_offxyq*2]
3181*c0909341SAndroid Build Coastguard Worker%endif
3182*c0909341SAndroid Build Coastguard Worker    movu             m7, [grain_lutq+offxyq*2]
3183*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
3184*c0909341SAndroid Build Coastguard Worker    mov              r5, [rsp+8*mmsize+2*gprsize]
3185*c0909341SAndroid Build Coastguard Worker    movu             m4, [grain_lutq+r0*2]
3186*c0909341SAndroid Build Coastguard Worker%if %2
3187*c0909341SAndroid Build Coastguard Worker    pinsrw           m5, [grain_lutq+r5*2], 2
3188*c0909341SAndroid Build Coastguard Worker%else
3189*c0909341SAndroid Build Coastguard Worker    movd             m3, [grain_lutq+r5*2]
3190*c0909341SAndroid Build Coastguard Worker%endif
3191*c0909341SAndroid Build Coastguard Worker%else
3192*c0909341SAndroid Build Coastguard Worker    movu             m4, [grain_lutq+top_offxyq*2]
3193*c0909341SAndroid Build Coastguard Worker%if %2
3194*c0909341SAndroid Build Coastguard Worker    pinsrw           m5, [grain_lutq+topleft_offxyq*2], 2 ; { left, _, top/left }
3195*c0909341SAndroid Build Coastguard Worker%else
3196*c0909341SAndroid Build Coastguard Worker    movd             m3, [grain_lutq+topleft_offxyq*2]
3197*c0909341SAndroid Build Coastguard Worker%endif
3198*c0909341SAndroid Build Coastguard Worker%endif
3199*c0909341SAndroid Build Coastguard Worker%if %2 == 0
3200*c0909341SAndroid Build Coastguard Worker    punpckldq        m5, m3
3201*c0909341SAndroid Build Coastguard Worker%endif
3202*c0909341SAndroid Build Coastguard Worker    punpckldq        m3, m7, m4             ; { cur0/1,top0/1,cur2/3,top2/3 }
3203*c0909341SAndroid Build Coastguard Worker    punpcklwd        m5, m3                 ; { left/cur0,_/cur1,topleft/top0,_/top1 }
3204*c0909341SAndroid Build Coastguard Worker%if %1
3205*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
3206*c0909341SAndroid Build Coastguard Worker    mov              r5, r5m
3207*c0909341SAndroid Build Coastguard Worker%endif
3208*c0909341SAndroid Build Coastguard Worker%if %2
3209*c0909341SAndroid Build Coastguard Worker    movddup          m0, [PIC_ptr(pw_23_22)]
3210*c0909341SAndroid Build Coastguard Worker%else
3211*c0909341SAndroid Build Coastguard Worker    movddup          m0, [PIC_ptr(pw_27_17_17_27)]
3212*c0909341SAndroid Build Coastguard Worker%endif
3213*c0909341SAndroid Build Coastguard Worker%else
3214*c0909341SAndroid Build Coastguard Worker    pshufd           m0, m15, q1010
3215*c0909341SAndroid Build Coastguard Worker%endif
3216*c0909341SAndroid Build Coastguard Worker    pmaddwd          m5, m0
3217*c0909341SAndroid Build Coastguard Worker%if %1
3218*c0909341SAndroid Build Coastguard Worker    paddd            m5, [PIC_ptr(pd_16)]
3219*c0909341SAndroid Build Coastguard Worker%else
3220*c0909341SAndroid Build Coastguard Worker    paddd            m5, m14
3221*c0909341SAndroid Build Coastguard Worker%endif
3222*c0909341SAndroid Build Coastguard Worker    psrad            m5, 5
3223*c0909341SAndroid Build Coastguard Worker    packssdw         m5, m5
3224*c0909341SAndroid Build Coastguard Worker    pmaxsw           m5, m8
3225*c0909341SAndroid Build Coastguard Worker    pminsw           m5, m9
3226*c0909341SAndroid Build Coastguard Worker    shufps           m5, m3, q3210          ; cur0/1,top0/1,cur2/3,top2/3
3227*c0909341SAndroid Build Coastguard Worker    shufps           m3, m5, m7, q3220      ; cur0-7 post-h_filter
3228*c0909341SAndroid Build Coastguard Worker    shufps           m5, m4, q3231          ; top0-7 post-h_filter
3229*c0909341SAndroid Build Coastguard Worker
3230*c0909341SAndroid Build Coastguard Worker    punpckhwd        m7, m5, m3
3231*c0909341SAndroid Build Coastguard Worker    punpcklwd        m5, m3                 ; {top/cur interleaved}
3232*c0909341SAndroid Build Coastguard Worker    REPX {pmaddwd x, m2}, m7, m5
3233*c0909341SAndroid Build Coastguard Worker%if %1
3234*c0909341SAndroid Build Coastguard Worker    REPX  {paddd x, [PIC_ptr(pd_16)]}, m5, m7
3235*c0909341SAndroid Build Coastguard Worker%else
3236*c0909341SAndroid Build Coastguard Worker    REPX  {paddd x, m14}, m5, m7
3237*c0909341SAndroid Build Coastguard Worker%endif
3238*c0909341SAndroid Build Coastguard Worker    REPX   {psrad x, 5}, m5, m7
3239*c0909341SAndroid Build Coastguard Worker    packssdw         m3, m5, m7
3240*c0909341SAndroid Build Coastguard Worker    pmaxsw           m3, m8
3241*c0909341SAndroid Build Coastguard Worker    pminsw           m3, m9
3242*c0909341SAndroid Build Coastguard Worker
3243*c0909341SAndroid Build Coastguard Worker    ; right half
3244*c0909341SAndroid Build Coastguard Worker    movu             m4, [grain_lutq+offxyq*2+16]
3245*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
3246*c0909341SAndroid Build Coastguard Worker    movu             m0, [grain_lutq+r0*2+16]
3247*c0909341SAndroid Build Coastguard Worker%else
3248*c0909341SAndroid Build Coastguard Worker    movu             m0, [grain_lutq+top_offxyq*2+16]
3249*c0909341SAndroid Build Coastguard Worker%endif
3250*c0909341SAndroid Build Coastguard Worker    punpckhwd        m1, m0, m4
3251*c0909341SAndroid Build Coastguard Worker    punpcklwd        m0, m4                 ; {top/cur interleaved}
3252*c0909341SAndroid Build Coastguard Worker    REPX {pmaddwd x, m2}, m1, m0
3253*c0909341SAndroid Build Coastguard Worker%if %1
3254*c0909341SAndroid Build Coastguard Worker    REPX  {paddd x, [PIC_ptr(pd_16)]}, m1, m0
3255*c0909341SAndroid Build Coastguard Worker%else
3256*c0909341SAndroid Build Coastguard Worker    REPX  {paddd x, m14}, m1, m0
3257*c0909341SAndroid Build Coastguard Worker%endif
3258*c0909341SAndroid Build Coastguard Worker    REPX   {psrad x, 5}, m1, m0
3259*c0909341SAndroid Build Coastguard Worker    packssdw         m4, m0, m1
3260*c0909341SAndroid Build Coastguard Worker    pmaxsw           m4, m8
3261*c0909341SAndroid Build Coastguard Worker    pminsw           m4, m9
3262*c0909341SAndroid Build Coastguard Worker
3263*c0909341SAndroid Build Coastguard Worker    ; src
3264*c0909341SAndroid Build Coastguard Worker    mova             m0, [srcq]
3265*c0909341SAndroid Build Coastguard Worker    mova             m1, [srcq+16]
3266*c0909341SAndroid Build Coastguard Worker
3267*c0909341SAndroid Build Coastguard Worker    ; luma_src
3268*c0909341SAndroid Build Coastguard Worker    pxor          mzero, mzero
3269*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
3270*c0909341SAndroid Build Coastguard Worker    DEFINE_ARGS dst, src, scaling, offxy, h, luma, grain_lut
3271*c0909341SAndroid Build Coastguard Worker
3272*c0909341SAndroid Build Coastguard Worker    mov           lumaq, r9mp
3273*c0909341SAndroid Build Coastguard Worker%endif
3274*c0909341SAndroid Build Coastguard Worker    mova             m6, [lumaq+ 0]
3275*c0909341SAndroid Build Coastguard Worker    mova             m5, [lumaq+(16<<%2)]
3276*c0909341SAndroid Build Coastguard Worker%if %2
3277*c0909341SAndroid Build Coastguard Worker    phaddw           m6, [lumaq+16]
3278*c0909341SAndroid Build Coastguard Worker    phaddw           m5, [lumaq+48]
3279*c0909341SAndroid Build Coastguard Worker%endif
3280*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
3281*c0909341SAndroid Build Coastguard Worker    add           lumaq, r10mp
3282*c0909341SAndroid Build Coastguard Worker    mov            r9mp, lumaq
3283*c0909341SAndroid Build Coastguard Worker%endif
3284*c0909341SAndroid Build Coastguard Worker%if %2
3285*c0909341SAndroid Build Coastguard Worker    pavgw            m6, mzero
3286*c0909341SAndroid Build Coastguard Worker    pavgw            m5, mzero
3287*c0909341SAndroid Build Coastguard Worker%endif
3288*c0909341SAndroid Build Coastguard Worker
3289*c0909341SAndroid Build Coastguard Worker%if %1
3290*c0909341SAndroid Build Coastguard Worker    punpckhwd        m7, m6, m0
3291*c0909341SAndroid Build Coastguard Worker    punpcklwd        m6, m0
3292*c0909341SAndroid Build Coastguard Worker    REPX {pmaddwd x, m14}, m7, m6
3293*c0909341SAndroid Build Coastguard Worker    REPX {psrad   x, 6}, m7, m6
3294*c0909341SAndroid Build Coastguard Worker    packssdw         m6, m7
3295*c0909341SAndroid Build Coastguard Worker    punpckhwd        m7, m5, m1
3296*c0909341SAndroid Build Coastguard Worker    punpcklwd        m5, m1                 ; { luma, chroma }
3297*c0909341SAndroid Build Coastguard Worker    REPX {pmaddwd x, m14}, m7, m5
3298*c0909341SAndroid Build Coastguard Worker    REPX {psrad   x, 6}, m7, m5
3299*c0909341SAndroid Build Coastguard Worker    packssdw         m5, m7
3300*c0909341SAndroid Build Coastguard Worker    pxor          mzero, mzero
3301*c0909341SAndroid Build Coastguard Worker    REPX {paddw x, m15}, m6, m5
3302*c0909341SAndroid Build Coastguard Worker    REPX {pmaxsw x, mzero}, m6, m5
3303*c0909341SAndroid Build Coastguard Worker    REPX {pminsw x, m10}, m6, m5            ; clip_pixel()
3304*c0909341SAndroid Build Coastguard Worker%else
3305*c0909341SAndroid Build Coastguard Worker    REPX  {pand x, m10}, m6, m5
3306*c0909341SAndroid Build Coastguard Worker%endif
3307*c0909341SAndroid Build Coastguard Worker
3308*c0909341SAndroid Build Coastguard Worker    ; scaling[luma_src]
3309*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
3310*c0909341SAndroid Build Coastguard Worker    vpgatherdw       m7, m6, scalingq-1, r0, r5, 8, 1
3311*c0909341SAndroid Build Coastguard Worker    vpgatherdw       m6, m5, scalingq-1, r0, r5, 8, 1
3312*c0909341SAndroid Build Coastguard Worker%else
3313*c0909341SAndroid Build Coastguard Worker%if %3 == 0
3314*c0909341SAndroid Build Coastguard Worker    ; register shortage :)
3315*c0909341SAndroid Build Coastguard Worker    push            r12
3316*c0909341SAndroid Build Coastguard Worker%endif
3317*c0909341SAndroid Build Coastguard Worker    vpgatherdw       m7, m6, scalingq-1, r2, r12, 8, 1
3318*c0909341SAndroid Build Coastguard Worker    vpgatherdw       m6, m5, scalingq-1, r2, r12, 8, 1
3319*c0909341SAndroid Build Coastguard Worker%if %3 == 0
3320*c0909341SAndroid Build Coastguard Worker    pop             r12
3321*c0909341SAndroid Build Coastguard Worker%endif
3322*c0909341SAndroid Build Coastguard Worker%endif
3323*c0909341SAndroid Build Coastguard Worker    REPX   {psrlw x, 8}, m7, m6
3324*c0909341SAndroid Build Coastguard Worker
3325*c0909341SAndroid Build Coastguard Worker    ; noise = round2(scaling[luma_src] * grain, scaling_shift)
3326*c0909341SAndroid Build Coastguard Worker    REPX {pmullw x, m11}, m7, m6
3327*c0909341SAndroid Build Coastguard Worker    pmulhrsw         m3, m7
3328*c0909341SAndroid Build Coastguard Worker    pmulhrsw         m4, m6
3329*c0909341SAndroid Build Coastguard Worker
3330*c0909341SAndroid Build Coastguard Worker    ; dst = clip_pixel(src, noise)
3331*c0909341SAndroid Build Coastguard Worker    paddw            m0, m3
3332*c0909341SAndroid Build Coastguard Worker    paddw            m1, m4
3333*c0909341SAndroid Build Coastguard Worker    pmaxsw           m0, m13
3334*c0909341SAndroid Build Coastguard Worker    pmaxsw           m1, m13
3335*c0909341SAndroid Build Coastguard Worker    pminsw           m0, m12
3336*c0909341SAndroid Build Coastguard Worker    pminsw           m1, m12
3337*c0909341SAndroid Build Coastguard Worker    movifnidn      dstq, dstmp
3338*c0909341SAndroid Build Coastguard Worker    mova      [dstq+ 0], m0
3339*c0909341SAndroid Build Coastguard Worker    mova      [dstq+16], m1
3340*c0909341SAndroid Build Coastguard Worker
3341*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
3342*c0909341SAndroid Build Coastguard Worker    add            srcq, r2mp
3343*c0909341SAndroid Build Coastguard Worker    add            dstq, r2mp
3344*c0909341SAndroid Build Coastguard Worker    mov           dstmp, dstq
3345*c0909341SAndroid Build Coastguard Worker%else
3346*c0909341SAndroid Build Coastguard Worker    add            srcq, r13mp
3347*c0909341SAndroid Build Coastguard Worker    add            dstq, r13mp
3348*c0909341SAndroid Build Coastguard Worker    add           lumaq, lstrideq
3349*c0909341SAndroid Build Coastguard Worker%endif
3350*c0909341SAndroid Build Coastguard Worker    add      grain_lutq, 82*2
3351*c0909341SAndroid Build Coastguard Worker    dec              hw
3352*c0909341SAndroid Build Coastguard Worker%if %3
3353*c0909341SAndroid Build Coastguard Worker    jg %%loop_y_h_overlap
3354*c0909341SAndroid Build Coastguard Worker%else
3355*c0909341SAndroid Build Coastguard Worker    jle %%end_y_hv_overlap
3356*c0909341SAndroid Build Coastguard Worker    btc              hd, 16
3357*c0909341SAndroid Build Coastguard Worker    jc %%loop_y_h_overlap
3358*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
3359*c0909341SAndroid Build Coastguard Worker    mov              r5, r5m
3360*c0909341SAndroid Build Coastguard Worker%endif
3361*c0909341SAndroid Build Coastguard Worker    SPLATD           m2, [PIC_ptr(pw_27_17_17_27)+4]
3362*c0909341SAndroid Build Coastguard Worker    jmp %%loop_y_hv_overlap
3363*c0909341SAndroid Build Coastguard Worker%%end_y_hv_overlap:
3364*c0909341SAndroid Build Coastguard Worker%endif
3365*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
3366*c0909341SAndroid Build Coastguard Worker    DEFINE_ARGS dst, src, scaling, offxy, w, luma, grain_lut
3367*c0909341SAndroid Build Coastguard Worker
3368*c0909341SAndroid Build Coastguard Worker    mov              wq, r4m
3369*c0909341SAndroid Build Coastguard Worker%endif
3370*c0909341SAndroid Build Coastguard Worker    add              wq, 16
3371*c0909341SAndroid Build Coastguard Worker    jge %%end_hv
3372*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
3373*c0909341SAndroid Build Coastguard Worker    mov            srcq, r1mp
3374*c0909341SAndroid Build Coastguard Worker%else
3375*c0909341SAndroid Build Coastguard Worker    mov            srcq, r10mp
3376*c0909341SAndroid Build Coastguard Worker%endif
3377*c0909341SAndroid Build Coastguard Worker    mov            dstq, r11mp
3378*c0909341SAndroid Build Coastguard Worker    mov           lumaq, r12mp
3379*c0909341SAndroid Build Coastguard Worker    lea            srcq, [srcq+wq*2]
3380*c0909341SAndroid Build Coastguard Worker    lea            dstq, [dstq+wq*2]
3381*c0909341SAndroid Build Coastguard Worker    lea           lumaq, [lumaq+wq*(2<<%2)]
3382*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
3383*c0909341SAndroid Build Coastguard Worker    mov           dstmp, dstq
3384*c0909341SAndroid Build Coastguard Worker    mov            r9mp, lumaq
3385*c0909341SAndroid Build Coastguard Worker    mov             r4m, wq
3386*c0909341SAndroid Build Coastguard Worker%endif
3387*c0909341SAndroid Build Coastguard Worker%if %2
3388*c0909341SAndroid Build Coastguard Worker    jmp %%loop_x_hv_overlap
3389*c0909341SAndroid Build Coastguard Worker%else
3390*c0909341SAndroid Build Coastguard Worker    or        dword r8m, 4
3391*c0909341SAndroid Build Coastguard Worker    add          offxyd, 16
3392*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
3393*c0909341SAndroid Build Coastguard Worker    add dword [rsp+8*mmsize+1*gprsize], 16
3394*c0909341SAndroid Build Coastguard Worker%else
3395*c0909341SAndroid Build Coastguard Worker    add            r11d, 16                 ; top_offxy += 16
3396*c0909341SAndroid Build Coastguard Worker%endif
3397*c0909341SAndroid Build Coastguard Worker    jmp %%loop_x_odd_v_overlap
3398*c0909341SAndroid Build Coastguard Worker%endif
3399*c0909341SAndroid Build Coastguard Worker
3400*c0909341SAndroid Build Coastguard Worker%%end_hv:
3401*c0909341SAndroid Build Coastguard Worker    RET
3402*c0909341SAndroid Build Coastguard Worker%endmacro
3403*c0909341SAndroid Build Coastguard Worker
3404*c0909341SAndroid Build Coastguard Worker    %%FGUV_32x32xN_LOOP 1, %2, %3
3405*c0909341SAndroid Build Coastguard Worker.csfl:
3406*c0909341SAndroid Build Coastguard Worker    %%FGUV_32x32xN_LOOP 0, %2, %3
3407*c0909341SAndroid Build Coastguard Worker
3408*c0909341SAndroid Build Coastguard Worker%if STACK_ALIGNMENT < mmsize
3409*c0909341SAndroid Build Coastguard WorkerDECLARE_ARG 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12
3410*c0909341SAndroid Build Coastguard Worker%endif
3411*c0909341SAndroid Build Coastguard Worker%endmacro
3412*c0909341SAndroid Build Coastguard Worker
; Instantiate the chroma film-grain code once per layout.
;
; Each FGUV_FN expansion emits the %%FGUV_32x32xN_LOOP body twice: first with
; its %1 = 1, then again under the .csfl label with %1 = 0. Arguments 2 and 3
; given here are forwarded unchanged as %2 and %3 of that loop macro, where:
;   %2 != 0: luma is stepped at twice the chroma width (wq*(2<<%2)) and
;            adjacent luma pairs are combined with phaddw + pavgw before the
;            scaling lookup; the h-overlap blend uses pw_23_22 instead of
;            pw_27_17_17_27.
;   %3 != 0: the grain row-offset multiplier is halved (164>>%3) and the
;            vertical-overlap path blends a single row before returning to the
;            non-overlap row loop; with %3 == 0 a second overlapped row is
;            processed using the coefficient pair at pw_27_17_17_27+4.
; NOTE(review): %2/%3 look like horizontal/vertical chroma subsampling flags,
; and argument 1 (420/422/444) is presumably the layout tag used to build the
; function name -- its use is outside this view; confirm against the FGUV_FN
; header.
3413*c0909341SAndroid Build Coastguard WorkerFGUV_FN 420, 1, 1
3414*c0909341SAndroid Build Coastguard WorkerFGUV_FN 422, 1, 0
3415*c0909341SAndroid Build Coastguard WorkerFGUV_FN 444, 0, 0
3416