; xref: /aosp_15_r20/external/libdav1d/src/x86/filmgrain_sse.asm (revision c09093415860a1c2373dacd84c4fde00c507cdfd)
; Copyright © 2019-2021, VideoLAN and dav1d authors
; Copyright © 2019, Two Orioles, LLC
; All rights reserved.
;
; Redistribution and use in source and binary forms, with or without
; modification, are permitted provided that the following conditions are met:
;
; 1. Redistributions of source code must retain the above copyright notice, this
;    list of conditions and the following disclaimer.
;
; 2. Redistributions in binary form must reproduce the above copyright notice,
;    this list of conditions and the following disclaimer in the documentation
;    and/or other materials provided with the distribution.
;
; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

%include "config.asm"
%include "ext/x86/x86inc.asm"
%include "x86/filmgrain_common.asm"

; Read-only constant tables for the 8 bpc SSSE3 film grain routines.
; Notes below describe only the uses visible in generate_grain_y_8bpc;
; tables without a note are not referenced in that function's visible
; code (presumably they serve routines further down the file - confirm
; there before changing them).
SECTION_RODATA

pw_1024: times 8 dw 1024
pb_27_17_17_27: db 27, 17, 17, 27
                times 6 db 0, 32
pb_23_22_h: db 23, 22
            times 7 db 0, 32
pb_27_17: times 8 db 27, 17
pb_17_27: times 8 db 17, 27
pb_23_22: times 8 db 23, 22
; 16-entry byte LUT used as the pshufb source in the seed-update loop.
; Entry i is 0x80 when i has an odd number of set bits, else 0, so the
; lookup yields the parity of a 4-bit index in the byte's top bit.
pb_mask: db 0, 0x80, 0x80, 0, 0x80, 0, 0, 0x80, 0x80, 0, 0, 0x80, 0, 0x80, 0x80, 0
; Four word masks, each the previous one shifted left by one bit; ANDed
; with the broadcast seed to pick the bits feeding each of the next
; four seeds.
rnd_next_upperbit_mask: dw 0x100B, 0x2016, 0x402C, 0x8058
; Single 0xff byte in a field of zeros. Loaded with movd at offset +1
; (.ar2) or +0 (.ar3) so the 0xff lands in byte 2 or byte 3 of the mask
; register; .ar2 uses it with pand/pandn to merge one new pixel.
byte_blend: db 0, 0, 0, 0xff, 0, 0, 0, 0
pw_seed_xor: times 2 dw 0xb524
             times 2 dw 0x49d8
pb_1: times 4 db 1
; pmulhuw multipliers: 2^15, 2^14, 2^13, 2^12 give the seed shifted
; right by 1, 2, 3 and 4 bits in the four word lanes.
hmul_bits: dw 32768, 16384, 8192, 4096
; pmulhrsw multipliers indexed by FGData.grain_scale_shift; 2048, 1024
; and 512 correspond to a rounded right shift by 4, 5 and 6 bits.
round: dw 2048, 1024, 512
; pmullw multipliers for the masked seed bits; only the first four words
; are loaded (movq) by generate_grain_y_8bpc.
mul_bits: dw 256, 128, 64, 32, 16
; Rounding terms for the auto-regression filters. Addressed as
; round_vals + shift*2 - 12, i.e. word index (ar_coeff_shift - 6).
round_vals: dw 32, 64, 128, 256, 512
max: dw 255, 240, 235
min: dw 0, 16
; Constant 1, inserted into coefficient vectors so that the rounding
; term is added by the same pmaddwd that applies the coefficients.
pw_1: dw 1

; JMP_TABLE name, isa, n0 [, n1 ...]
;
; Emits a table of 32-bit entries, one per trailing argument. Each entry
; is the address of local label .ar<n> inside the function
; mangle(private_prefix_<name>_8bpc_<isa>) minus the address of the table
; itself, so the table holds position-independent offsets.
;
; Also defines <name>_8bpc_<isa>_table as an alias for the table label.
; Callers load the signed 32-bit entry, add the table address back and
; jump to the result (see the dispatch before .ar1 in
; generate_grain_y_8bpc).
;
;   %1  function base name, without the _8bpc suffix
;   %2  instruction-set suffix (e.g. ssse3)
;   %3+ label suffixes, emitted in the order given
%macro JMP_TABLE 2-*
    %xdefine %1_8bpc_%2_table %%table
    %xdefine %%base %1_8bpc_%2_table
    %xdefine %%prefix mangle(private_prefix %+ _%1_8bpc_%2)
    %%table:
    %rep %0 - 2
        ; %3 is the current label suffix; %rotate brings the next one
        ; into position for the following iteration.
        dd %%prefix %+ .ar%3 - %%base
        %rotate 1
    %endrep
%endmacro

; Jump tables with entries for the local labels .ar0-.ar3 of each grain
; generator. generate_grain_y_8bpc indexes its table with
; FGData.ar_coeff_lag; the uv variants are assumed to do the same
; (their bodies are not in the visible part of the file - confirm).
JMP_TABLE generate_grain_y, ssse3, 0, 1, 2, 3
JMP_TABLE generate_grain_uv_420, ssse3, 0, 1, 2, 3
JMP_TABLE generate_grain_uv_422, ssse3, 0, 1, 2, 3
JMP_TABLE generate_grain_uv_444, ssse3, 0, 1, 2, 3

SECTION .text

; PIC_ptr(a): address expression for the constant `a`.
; On x86-32 it expands to base+a, so the enclosing function must have
; defined `base` (generate_grain_y_8bpc defines it as r4-$$ after
; LEA r4, $$). On x86-64 it expands to the bare symbol.
%if ARCH_X86_32
%define PIC_ptr(a) base+a
%else
%define PIC_ptr(a) a
%endif

; SCRATCH src, dst, slot
;
; Makes the value currently in m<src> available under the name m<dst>.
;
;   x86-32: stores m<src> to [rsp + slot*mmsize] and redefines m<dst> as
;           that memory operand, so later uses of m<dst> read the stack
;           slot. The caller must have reserved the stack space
;           beforehand (the .ar2/.ar3 paths use ALLOC_STACK); mova is
;           the aligned move, so the slot is assumed to be
;           mmsize-aligned.
;   x86-64: SWAPs the register names m<src> and m<dst> (x86inc SWAP);
;           the slot argument is unused.
;
;   %1  source register number
;   %2  destination register number
;   %3  stack slot index (x86-32 only)
%macro SCRATCH 3
%if ARCH_X86_32
    mova [rsp+%3*mmsize], m%1
%define m%2 [rsp+%3*mmsize]
%else
    SWAP             %1, %2
%endif
%endmacro

87*c0909341SAndroid Build Coastguard WorkerINIT_XMM ssse3
88*c0909341SAndroid Build Coastguard Workercglobal generate_grain_y_8bpc, 2, 7 + 2 * ARCH_X86_64, 16, buf, fg_data
89*c0909341SAndroid Build Coastguard Worker    LEA              r4, $$
90*c0909341SAndroid Build Coastguard Worker%define base r4-$$
91*c0909341SAndroid Build Coastguard Worker    movq             m1, [base+rnd_next_upperbit_mask]
92*c0909341SAndroid Build Coastguard Worker    movq             m4, [base+mul_bits]
93*c0909341SAndroid Build Coastguard Worker    movq             m7, [base+hmul_bits]
94*c0909341SAndroid Build Coastguard Worker    mov             r2d, [fg_dataq+FGData.grain_scale_shift]
95*c0909341SAndroid Build Coastguard Worker    movd             m2, [base+round+r2*2]
96*c0909341SAndroid Build Coastguard Worker    movd             m0, [fg_dataq+FGData.seed]
97*c0909341SAndroid Build Coastguard Worker    mova             m5, [base+pb_mask]
98*c0909341SAndroid Build Coastguard Worker    pshuflw          m2, m2, q0000
99*c0909341SAndroid Build Coastguard Worker    pshuflw          m0, m0, q0000
100*c0909341SAndroid Build Coastguard Worker    mov              r2, -73*82
101*c0909341SAndroid Build Coastguard Worker    sub            bufq, r2
102*c0909341SAndroid Build Coastguard Worker    lea              r3, [base+gaussian_sequence]
103*c0909341SAndroid Build Coastguard Worker.loop:
104*c0909341SAndroid Build Coastguard Worker    pand             m6, m0, m1
105*c0909341SAndroid Build Coastguard Worker    psrlw            m3, m6, 10
106*c0909341SAndroid Build Coastguard Worker    por              m6, m3            ; bits 0xf, 0x1e, 0x3c and 0x78 are set
107*c0909341SAndroid Build Coastguard Worker    pmullw           m6, m4            ; bits 0x0f00 are set
108*c0909341SAndroid Build Coastguard Worker    pshufb           m3, m5, m6        ; set 15th bit for next 4 seeds
109*c0909341SAndroid Build Coastguard Worker    psllq            m6, m3, 30
110*c0909341SAndroid Build Coastguard Worker    por              m3, m6
111*c0909341SAndroid Build Coastguard Worker    psllq            m6, m3, 15
112*c0909341SAndroid Build Coastguard Worker    por              m3, m6            ; aggregate each bit into next seed's high bit
113*c0909341SAndroid Build Coastguard Worker    pmulhuw          m6, m0, m7
114*c0909341SAndroid Build Coastguard Worker    por              m3, m6            ; 4 next output seeds
115*c0909341SAndroid Build Coastguard Worker    pshuflw          m0, m3, q3333
116*c0909341SAndroid Build Coastguard Worker    psrlw            m3, 5
117*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
118*c0909341SAndroid Build Coastguard Worker    movq             r6, m3
119*c0909341SAndroid Build Coastguard Worker    mov              r8, r6
120*c0909341SAndroid Build Coastguard Worker    movzx           r5d, r6w
121*c0909341SAndroid Build Coastguard Worker    shr             r6d, 16
122*c0909341SAndroid Build Coastguard Worker    shr              r8, 32
123*c0909341SAndroid Build Coastguard Worker    movzx            r7, r8w
124*c0909341SAndroid Build Coastguard Worker    shr              r8, 16
125*c0909341SAndroid Build Coastguard Worker
126*c0909341SAndroid Build Coastguard Worker    movd             m6, [r3+r5*2]
127*c0909341SAndroid Build Coastguard Worker    pinsrw           m6, [r3+r6*2], 1
128*c0909341SAndroid Build Coastguard Worker    pinsrw           m6, [r3+r7*2], 2
129*c0909341SAndroid Build Coastguard Worker    pinsrw           m6, [r3+r8*2], 3
130*c0909341SAndroid Build Coastguard Worker%else
131*c0909341SAndroid Build Coastguard Worker    movd             r6, m3
132*c0909341SAndroid Build Coastguard Worker    pshuflw          m3, m3, q3232
133*c0909341SAndroid Build Coastguard Worker    movzx            r5, r6w
134*c0909341SAndroid Build Coastguard Worker    shr              r6, 16
135*c0909341SAndroid Build Coastguard Worker
136*c0909341SAndroid Build Coastguard Worker    movd             m6, [r3+r5*2]
137*c0909341SAndroid Build Coastguard Worker    pinsrw           m6, [r3+r6*2], 1
138*c0909341SAndroid Build Coastguard Worker
139*c0909341SAndroid Build Coastguard Worker    movd             r6, m3
140*c0909341SAndroid Build Coastguard Worker    movzx            r5, r6w
141*c0909341SAndroid Build Coastguard Worker    shr              r6, 16
142*c0909341SAndroid Build Coastguard Worker
143*c0909341SAndroid Build Coastguard Worker    pinsrw           m6, [r3+r5*2], 2
144*c0909341SAndroid Build Coastguard Worker    pinsrw           m6, [r3+r6*2], 3
145*c0909341SAndroid Build Coastguard Worker%endif
146*c0909341SAndroid Build Coastguard Worker    pmulhrsw         m6, m2
147*c0909341SAndroid Build Coastguard Worker    packsswb         m6, m6
148*c0909341SAndroid Build Coastguard Worker    movd      [bufq+r2], m6
149*c0909341SAndroid Build Coastguard Worker    add              r2, 4
150*c0909341SAndroid Build Coastguard Worker    jl .loop
151*c0909341SAndroid Build Coastguard Worker
152*c0909341SAndroid Build Coastguard Worker    ; auto-regression code
153*c0909341SAndroid Build Coastguard Worker    movsxd           r2, [fg_dataq+FGData.ar_coeff_lag]
154*c0909341SAndroid Build Coastguard Worker    movsxd           r2, [base+generate_grain_y_8bpc_ssse3_table+r2*4]
155*c0909341SAndroid Build Coastguard Worker    lea              r2, [r2+base+generate_grain_y_8bpc_ssse3_table]
156*c0909341SAndroid Build Coastguard Worker    jmp              r2
157*c0909341SAndroid Build Coastguard Worker
158*c0909341SAndroid Build Coastguard Worker.ar1:
159*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
160*c0909341SAndroid Build Coastguard Worker    DEFINE_ARGS buf, fg_data, cf3, unused, val3, min, max
161*c0909341SAndroid Build Coastguard Worker%elif WIN64
162*c0909341SAndroid Build Coastguard Worker    DEFINE_ARGS shift, fg_data, cf3, buf, val3, min, max, x, val0
163*c0909341SAndroid Build Coastguard Worker    mov            bufq, r0
164*c0909341SAndroid Build Coastguard Worker%else
165*c0909341SAndroid Build Coastguard Worker    DEFINE_ARGS buf, fg_data, cf3, shift, val3, min, max, x, val0
166*c0909341SAndroid Build Coastguard Worker%endif
167*c0909341SAndroid Build Coastguard Worker    movsx          cf3d, byte [fg_dataq+FGData.ar_coeffs_y+3]
168*c0909341SAndroid Build Coastguard Worker    movd             m4, [fg_dataq+FGData.ar_coeffs_y]
169*c0909341SAndroid Build Coastguard Worker    mov             ecx, [fg_dataq+FGData.ar_coeff_shift]
170*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
171*c0909341SAndroid Build Coastguard Worker    mov             r1m, cf3d
172*c0909341SAndroid Build Coastguard Worker    DEFINE_ARGS buf, shift, val3, min, max, x, val0
173*c0909341SAndroid Build Coastguard Worker%define hd r0mp
174*c0909341SAndroid Build Coastguard Worker%define cf3d r1mp
175*c0909341SAndroid Build Coastguard Worker%elif WIN64
176*c0909341SAndroid Build Coastguard Worker    DEFINE_ARGS shift, h, cf3, buf, val3, min, max, x, val0
177*c0909341SAndroid Build Coastguard Worker%else
178*c0909341SAndroid Build Coastguard Worker    DEFINE_ARGS buf, h, cf3, shift, val3, min, max, x, val0
179*c0909341SAndroid Build Coastguard Worker%endif
180*c0909341SAndroid Build Coastguard Worker    pxor             m6, m6
181*c0909341SAndroid Build Coastguard Worker    pcmpgtb          m7, m6, m4
182*c0909341SAndroid Build Coastguard Worker    punpcklbw        m4, m7
183*c0909341SAndroid Build Coastguard Worker    pinsrw           m4, [base+pw_1], 3
184*c0909341SAndroid Build Coastguard Worker    pshufd           m5, m4, q1111
185*c0909341SAndroid Build Coastguard Worker    pshufd           m4, m4, q0000
186*c0909341SAndroid Build Coastguard Worker    movd             m3, [base+round_vals+shiftq*2-12]    ; rnd
187*c0909341SAndroid Build Coastguard Worker    pshuflw          m3, m3, q0000
188*c0909341SAndroid Build Coastguard Worker    sub            bufq, 82*73-(82*3+79)
189*c0909341SAndroid Build Coastguard Worker    mov              hd, 70
190*c0909341SAndroid Build Coastguard Worker    mov            mind, -128
191*c0909341SAndroid Build Coastguard Worker    mov            maxd, 127
192*c0909341SAndroid Build Coastguard Worker.y_loop_ar1:
193*c0909341SAndroid Build Coastguard Worker    mov              xq, -76
194*c0909341SAndroid Build Coastguard Worker    movsx         val3d, byte [bufq+xq-1]
195*c0909341SAndroid Build Coastguard Worker.x_loop_ar1:
196*c0909341SAndroid Build Coastguard Worker    movq             m0, [bufq+xq-82-1]     ; top/left
197*c0909341SAndroid Build Coastguard Worker    pcmpgtb          m7, m6, m0
198*c0909341SAndroid Build Coastguard Worker    punpcklbw        m0, m7
199*c0909341SAndroid Build Coastguard Worker    psrldq           m2, m0, 2              ; top
200*c0909341SAndroid Build Coastguard Worker    psrldq           m1, m0, 4              ; top/right
201*c0909341SAndroid Build Coastguard Worker    punpcklwd        m0, m2
202*c0909341SAndroid Build Coastguard Worker    punpcklwd        m1, m3
203*c0909341SAndroid Build Coastguard Worker    pmaddwd          m0, m4
204*c0909341SAndroid Build Coastguard Worker    pmaddwd          m1, m5
205*c0909341SAndroid Build Coastguard Worker    paddd            m0, m1
206*c0909341SAndroid Build Coastguard Worker.x_loop_ar1_inner:
207*c0909341SAndroid Build Coastguard Worker    movd          val0d, m0
208*c0909341SAndroid Build Coastguard Worker    psrldq           m0, 4
209*c0909341SAndroid Build Coastguard Worker    imul          val3d, cf3d
210*c0909341SAndroid Build Coastguard Worker    add           val3d, val0d
211*c0909341SAndroid Build Coastguard Worker    sar           val3d, shiftb
212*c0909341SAndroid Build Coastguard Worker    movsx         val0d, byte [bufq+xq]
213*c0909341SAndroid Build Coastguard Worker    add           val3d, val0d
214*c0909341SAndroid Build Coastguard Worker    cmp           val3d, maxd
215*c0909341SAndroid Build Coastguard Worker    cmovns        val3d, maxd
216*c0909341SAndroid Build Coastguard Worker    cmp           val3d, mind
217*c0909341SAndroid Build Coastguard Worker    cmovs         val3d, mind
218*c0909341SAndroid Build Coastguard Worker    mov  byte [bufq+xq], val3b
219*c0909341SAndroid Build Coastguard Worker    ; keep val3d in-place as left for next x iteration
220*c0909341SAndroid Build Coastguard Worker    inc              xq
221*c0909341SAndroid Build Coastguard Worker    jz .x_loop_ar1_end
222*c0909341SAndroid Build Coastguard Worker    test             xq, 3
223*c0909341SAndroid Build Coastguard Worker    jnz .x_loop_ar1_inner
224*c0909341SAndroid Build Coastguard Worker    jmp .x_loop_ar1
225*c0909341SAndroid Build Coastguard Worker
226*c0909341SAndroid Build Coastguard Worker.x_loop_ar1_end:
227*c0909341SAndroid Build Coastguard Worker    add            bufq, 82
228*c0909341SAndroid Build Coastguard Worker    dec              hd
229*c0909341SAndroid Build Coastguard Worker    jg .y_loop_ar1
230*c0909341SAndroid Build Coastguard Worker.ar0:
231*c0909341SAndroid Build Coastguard Worker    RET
232*c0909341SAndroid Build Coastguard Worker
233*c0909341SAndroid Build Coastguard Worker.ar2:
234*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
235*c0909341SAndroid Build Coastguard Worker    ALLOC_STACK -16*8
236*c0909341SAndroid Build Coastguard Worker%endif
237*c0909341SAndroid Build Coastguard Worker    DEFINE_ARGS buf, fg_data, shift
238*c0909341SAndroid Build Coastguard Worker    mov          shiftd, [fg_dataq+FGData.ar_coeff_shift]
239*c0909341SAndroid Build Coastguard Worker    movd             m6, [base+round_vals-12+shiftq*2]
240*c0909341SAndroid Build Coastguard Worker    movd             m7, [base+byte_blend+1]
241*c0909341SAndroid Build Coastguard Worker    SCRATCH           7, 15, 7
242*c0909341SAndroid Build Coastguard Worker    movq             m0, [fg_dataq+FGData.ar_coeffs_y+0]    ; cf0-7
243*c0909341SAndroid Build Coastguard Worker    movd             m1, [fg_dataq+FGData.ar_coeffs_y+8]    ; cf8-11
244*c0909341SAndroid Build Coastguard Worker    pxor             m7, m7
245*c0909341SAndroid Build Coastguard Worker    pshuflw          m6, m6, q0000
246*c0909341SAndroid Build Coastguard Worker    punpcklwd        m6, m7
247*c0909341SAndroid Build Coastguard Worker    pcmpgtb          m4, m7, m0
248*c0909341SAndroid Build Coastguard Worker    pcmpgtb          m5, m7, m1
249*c0909341SAndroid Build Coastguard Worker    punpcklbw        m0, m4
250*c0909341SAndroid Build Coastguard Worker    punpcklbw        m1, m5
251*c0909341SAndroid Build Coastguard Worker    DEFINE_ARGS buf, fg_data, h, x
252*c0909341SAndroid Build Coastguard Worker    pshufd           m4, m1, q0000
253*c0909341SAndroid Build Coastguard Worker    pshufd           m5, m1, q1111
254*c0909341SAndroid Build Coastguard Worker    pshufd           m3, m0, q3333
255*c0909341SAndroid Build Coastguard Worker    pshufd           m2, m0, q2222
256*c0909341SAndroid Build Coastguard Worker    pshufd           m1, m0, q1111
257*c0909341SAndroid Build Coastguard Worker    pshufd           m0, m0, q0000
258*c0909341SAndroid Build Coastguard Worker    SCRATCH           0, 8,  0
259*c0909341SAndroid Build Coastguard Worker    SCRATCH           1, 9,  1
260*c0909341SAndroid Build Coastguard Worker    SCRATCH           2, 10, 2
261*c0909341SAndroid Build Coastguard Worker    SCRATCH           3, 11, 3
262*c0909341SAndroid Build Coastguard Worker    SCRATCH           4, 12, 4
263*c0909341SAndroid Build Coastguard Worker    SCRATCH           5, 13, 5
264*c0909341SAndroid Build Coastguard Worker    SCRATCH           6, 14, 6
265*c0909341SAndroid Build Coastguard Worker    sub            bufq, 82*73-(82*3+79)
266*c0909341SAndroid Build Coastguard Worker    mov              hd, 70
267*c0909341SAndroid Build Coastguard Worker.y_loop_ar2:
268*c0909341SAndroid Build Coastguard Worker    mov              xq, -76
269*c0909341SAndroid Build Coastguard Worker
270*c0909341SAndroid Build Coastguard Worker.x_loop_ar2:
271*c0909341SAndroid Build Coastguard Worker    movq             m0, [bufq+xq-82*2-2]   ; y=-2,x=[-2,+5]
272*c0909341SAndroid Build Coastguard Worker    movhps           m0, [bufq+xq-82*1-2]   ; y=-1,x=[-2,+5]
273*c0909341SAndroid Build Coastguard Worker    pcmpgtb          m2, m7, m0
274*c0909341SAndroid Build Coastguard Worker    punpckhbw        m1, m0, m2
275*c0909341SAndroid Build Coastguard Worker    punpcklbw        m0, m2
276*c0909341SAndroid Build Coastguard Worker    psrldq           m5, m0, 2              ; y=-2,x=[-1,+5]
277*c0909341SAndroid Build Coastguard Worker    psrldq           m3, m1, 2              ; y=-1,x=[-1,+5]
278*c0909341SAndroid Build Coastguard Worker    psrldq           m4, m1, 4              ; y=-1,x=[+0,+5]
279*c0909341SAndroid Build Coastguard Worker    punpcklwd        m2, m0, m5
280*c0909341SAndroid Build Coastguard Worker    punpcklwd        m3, m4
281*c0909341SAndroid Build Coastguard Worker    pmaddwd          m2, m8
282*c0909341SAndroid Build Coastguard Worker    pmaddwd          m3, m11
283*c0909341SAndroid Build Coastguard Worker    paddd            m2, m3
284*c0909341SAndroid Build Coastguard Worker
285*c0909341SAndroid Build Coastguard Worker    psrldq           m4, m0, 4              ; y=-2,x=[+0,+5]
286*c0909341SAndroid Build Coastguard Worker    psrldq           m5, m0, 6              ; y=-2,x=[+1,+5]
287*c0909341SAndroid Build Coastguard Worker    psrldq           m6, m0, 8              ; y=-2,x=[+2,+5]
288*c0909341SAndroid Build Coastguard Worker    punpcklwd        m4, m5
289*c0909341SAndroid Build Coastguard Worker    punpcklwd        m6, m1
290*c0909341SAndroid Build Coastguard Worker    psrldq           m5, m1, 6              ; y=-1,x=[+1,+5]
291*c0909341SAndroid Build Coastguard Worker    psrldq           m1, m1, 8              ; y=-1,x=[+2,+5]
292*c0909341SAndroid Build Coastguard Worker    punpcklwd        m5, m1
293*c0909341SAndroid Build Coastguard Worker    pmaddwd          m4, m9
294*c0909341SAndroid Build Coastguard Worker    pmaddwd          m6, m10
295*c0909341SAndroid Build Coastguard Worker    pmaddwd          m5, m12
296*c0909341SAndroid Build Coastguard Worker    paddd            m4, m6
297*c0909341SAndroid Build Coastguard Worker    paddd            m2, m5
298*c0909341SAndroid Build Coastguard Worker    paddd            m2, m4
299*c0909341SAndroid Build Coastguard Worker    paddd            m2, m14
300*c0909341SAndroid Build Coastguard Worker
301*c0909341SAndroid Build Coastguard Worker    movq             m0, [bufq+xq-2]        ; y=0,x=[-2,+5]
302*c0909341SAndroid Build Coastguard Worker.x_loop_ar2_inner:
303*c0909341SAndroid Build Coastguard Worker    pcmpgtb          m4, m7, m0
304*c0909341SAndroid Build Coastguard Worker    punpcklbw        m1, m0, m4
305*c0909341SAndroid Build Coastguard Worker    pmaddwd          m3, m1, m13
306*c0909341SAndroid Build Coastguard Worker    paddd            m3, m2
307*c0909341SAndroid Build Coastguard Worker    psrldq           m1, 4                  ; y=0,x=0
308*c0909341SAndroid Build Coastguard Worker    psrldq           m2, 4                  ; shift top to next pixel
309*c0909341SAndroid Build Coastguard Worker    psrad            m3, [fg_dataq+FGData.ar_coeff_shift]
310*c0909341SAndroid Build Coastguard Worker    ; don't packssdw since we only care about one value
311*c0909341SAndroid Build Coastguard Worker    paddw            m3, m1
312*c0909341SAndroid Build Coastguard Worker    packsswb         m3, m3
313*c0909341SAndroid Build Coastguard Worker    pslldq           m3, 2
314*c0909341SAndroid Build Coastguard Worker    pand             m3, m15
315*c0909341SAndroid Build Coastguard Worker    pandn            m1, m15, m0
316*c0909341SAndroid Build Coastguard Worker    por              m0, m1, m3
317*c0909341SAndroid Build Coastguard Worker    psrldq           m0, 1
318*c0909341SAndroid Build Coastguard Worker    ; overwrite 2 pixels, but that's ok
319*c0909341SAndroid Build Coastguard Worker    movd      [bufq+xq-1], m0
320*c0909341SAndroid Build Coastguard Worker    inc              xq
321*c0909341SAndroid Build Coastguard Worker    jz .x_loop_ar2_end
322*c0909341SAndroid Build Coastguard Worker    test             xq, 3
323*c0909341SAndroid Build Coastguard Worker    jnz .x_loop_ar2_inner
324*c0909341SAndroid Build Coastguard Worker    jmp .x_loop_ar2
325*c0909341SAndroid Build Coastguard Worker
326*c0909341SAndroid Build Coastguard Worker.x_loop_ar2_end:
327*c0909341SAndroid Build Coastguard Worker    add            bufq, 82
328*c0909341SAndroid Build Coastguard Worker    dec              hd
329*c0909341SAndroid Build Coastguard Worker    jg .y_loop_ar2
330*c0909341SAndroid Build Coastguard Worker    RET
331*c0909341SAndroid Build Coastguard Worker
332*c0909341SAndroid Build Coastguard Worker.ar3:
333*c0909341SAndroid Build Coastguard Worker    DEFINE_ARGS buf, fg_data, shift
334*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
335*c0909341SAndroid Build Coastguard Worker    ALLOC_STACK  -16*14
336*c0909341SAndroid Build Coastguard Worker%elif WIN64
337*c0909341SAndroid Build Coastguard Worker    SUB             rsp, 16*6
338*c0909341SAndroid Build Coastguard Worker%assign stack_size_padded (stack_size_padded+16*6)
339*c0909341SAndroid Build Coastguard Worker%assign stack_size (stack_size+16*6)
340*c0909341SAndroid Build Coastguard Worker%else
341*c0909341SAndroid Build Coastguard Worker    ALLOC_STACK  -16*6
342*c0909341SAndroid Build Coastguard Worker%endif
343*c0909341SAndroid Build Coastguard Worker    mov          shiftd, [fg_dataq+FGData.ar_coeff_shift]
344*c0909341SAndroid Build Coastguard Worker    movd             m6, [base+round_vals-12+shiftq*2]
345*c0909341SAndroid Build Coastguard Worker    movd             m7, [base+byte_blend]
346*c0909341SAndroid Build Coastguard Worker    movu             m0, [fg_dataq+FGData.ar_coeffs_y+ 0]   ; cf0-15
347*c0909341SAndroid Build Coastguard Worker    movq             m2, [fg_dataq+FGData.ar_coeffs_y+16]   ; cf16-23
348*c0909341SAndroid Build Coastguard Worker    pxor             m3, m3
349*c0909341SAndroid Build Coastguard Worker    pcmpgtb          m4, m3, m0
350*c0909341SAndroid Build Coastguard Worker    pcmpgtb          m3, m2
351*c0909341SAndroid Build Coastguard Worker    pshuflw          m6, m6, q0000
352*c0909341SAndroid Build Coastguard Worker    SCRATCH           6, 14, 12
353*c0909341SAndroid Build Coastguard Worker    SCRATCH           7, 15, 13
354*c0909341SAndroid Build Coastguard Worker    punpckhbw        m1, m0, m4
355*c0909341SAndroid Build Coastguard Worker    punpcklbw        m0, m4
356*c0909341SAndroid Build Coastguard Worker    punpcklbw        m2, m3
357*c0909341SAndroid Build Coastguard Worker    pshufd           m3, m0, q1111
358*c0909341SAndroid Build Coastguard Worker    pshufd           m4, m0, q2222
359*c0909341SAndroid Build Coastguard Worker    pshufd           m5, m0, q3333
360*c0909341SAndroid Build Coastguard Worker    pshufd           m0, m0, q0000
361*c0909341SAndroid Build Coastguard Worker    mova    [rsp+ 0*16], m0
362*c0909341SAndroid Build Coastguard Worker    mova    [rsp+ 1*16], m3
363*c0909341SAndroid Build Coastguard Worker    mova    [rsp+ 2*16], m4
364*c0909341SAndroid Build Coastguard Worker    mova    [rsp+ 3*16], m5
365*c0909341SAndroid Build Coastguard Worker    pshufd           m6, m1, q1111
366*c0909341SAndroid Build Coastguard Worker    pshufd           m7, m1, q2222
367*c0909341SAndroid Build Coastguard Worker    pshufd           m5, m1, q3333
368*c0909341SAndroid Build Coastguard Worker    pshufd           m1, m1, q0000
369*c0909341SAndroid Build Coastguard Worker    pshufd           m3, m2, q1111
370*c0909341SAndroid Build Coastguard Worker    psrldq           m0, m2, 10
371*c0909341SAndroid Build Coastguard Worker    pinsrw           m2, [base+pw_1], 5
372*c0909341SAndroid Build Coastguard Worker    pshufd           m4, m2, q2222
373*c0909341SAndroid Build Coastguard Worker    pshufd           m2, m2, q0000
374*c0909341SAndroid Build Coastguard Worker    pinsrw           m0, [base+round_vals+shiftq*2-10], 3
375*c0909341SAndroid Build Coastguard Worker    mova    [rsp+ 4*16], m1
376*c0909341SAndroid Build Coastguard Worker    mova    [rsp+ 5*16], m6
377*c0909341SAndroid Build Coastguard Worker    SCRATCH           7, 8,  6
378*c0909341SAndroid Build Coastguard Worker    SCRATCH           5, 9,  7
379*c0909341SAndroid Build Coastguard Worker    SCRATCH           2, 10, 8
380*c0909341SAndroid Build Coastguard Worker    SCRATCH           3, 11, 9
381*c0909341SAndroid Build Coastguard Worker    SCRATCH           4, 12, 10
382*c0909341SAndroid Build Coastguard Worker    SCRATCH           0, 13, 11
383*c0909341SAndroid Build Coastguard Worker    DEFINE_ARGS buf, fg_data, h, x
384*c0909341SAndroid Build Coastguard Worker    sub            bufq, 82*73-(82*3+79)
385*c0909341SAndroid Build Coastguard Worker    mov              hd, 70
386*c0909341SAndroid Build Coastguard Worker.y_loop_ar3:
387*c0909341SAndroid Build Coastguard Worker    mov              xq, -76
388*c0909341SAndroid Build Coastguard Worker
389*c0909341SAndroid Build Coastguard Worker.x_loop_ar3:
390*c0909341SAndroid Build Coastguard Worker    movu             m0, [bufq+xq-82*3-3]   ; y=-3,x=[-3,+12]
391*c0909341SAndroid Build Coastguard Worker    pxor             m3, m3
392*c0909341SAndroid Build Coastguard Worker    pcmpgtb          m3, m0
393*c0909341SAndroid Build Coastguard Worker    punpckhbw        m2, m0, m3
394*c0909341SAndroid Build Coastguard Worker    punpcklbw        m0, m3
395*c0909341SAndroid Build Coastguard Worker
396*c0909341SAndroid Build Coastguard Worker    psrldq           m5, m0, 2
397*c0909341SAndroid Build Coastguard Worker    psrldq           m6, m0, 4
398*c0909341SAndroid Build Coastguard Worker    psrldq           m7, m0, 6
399*c0909341SAndroid Build Coastguard Worker    punpcklwd        m4, m0, m5
400*c0909341SAndroid Build Coastguard Worker    punpcklwd        m6, m7
401*c0909341SAndroid Build Coastguard Worker    pmaddwd          m4, [rsp+ 0*16]
402*c0909341SAndroid Build Coastguard Worker    pmaddwd          m6, [rsp+ 1*16]
403*c0909341SAndroid Build Coastguard Worker    paddd            m4, m6
404*c0909341SAndroid Build Coastguard Worker
405*c0909341SAndroid Build Coastguard Worker    movu             m1, [bufq+xq-82*2-3]   ; y=-2,x=[-3,+12]
406*c0909341SAndroid Build Coastguard Worker    pxor             m5, m5
407*c0909341SAndroid Build Coastguard Worker    pcmpgtb          m5, m1
408*c0909341SAndroid Build Coastguard Worker    punpckhbw        m3, m1, m5
409*c0909341SAndroid Build Coastguard Worker    punpcklbw        m1, m5
410*c0909341SAndroid Build Coastguard Worker    palignr          m6, m2, m0, 10
411*c0909341SAndroid Build Coastguard Worker    palignr          m7, m2, m0, 12
412*c0909341SAndroid Build Coastguard Worker    psrldq           m0, 8
413*c0909341SAndroid Build Coastguard Worker    punpcklwd        m0, m6
414*c0909341SAndroid Build Coastguard Worker    punpcklwd        m7, m1
415*c0909341SAndroid Build Coastguard Worker    pmaddwd          m0, [rsp+ 2*16]
416*c0909341SAndroid Build Coastguard Worker    pmaddwd          m7, [rsp+ 3*16]
417*c0909341SAndroid Build Coastguard Worker    paddd            m0, m7
418*c0909341SAndroid Build Coastguard Worker    paddd            m0, m4
419*c0909341SAndroid Build Coastguard Worker
420*c0909341SAndroid Build Coastguard Worker    psrldq           m4, m1, 2
421*c0909341SAndroid Build Coastguard Worker    psrldq           m5, m1, 4
422*c0909341SAndroid Build Coastguard Worker    psrldq           m6, m1, 6
423*c0909341SAndroid Build Coastguard Worker    psrldq           m7, m1, 8
424*c0909341SAndroid Build Coastguard Worker    punpcklwd        m4, m5
425*c0909341SAndroid Build Coastguard Worker    punpcklwd        m6, m7
426*c0909341SAndroid Build Coastguard Worker    pmaddwd          m4, [rsp+ 4*16]
427*c0909341SAndroid Build Coastguard Worker    pmaddwd          m6, [rsp+ 5*16]
428*c0909341SAndroid Build Coastguard Worker    paddd            m4, m6
429*c0909341SAndroid Build Coastguard Worker    paddd            m0, m4
430*c0909341SAndroid Build Coastguard Worker
431*c0909341SAndroid Build Coastguard Worker    movu             m2, [bufq+xq-82*1-3]   ; y=-1,x=[-3,+12]
432*c0909341SAndroid Build Coastguard Worker    pxor             m7, m7
433*c0909341SAndroid Build Coastguard Worker    pcmpgtb          m7, m2
434*c0909341SAndroid Build Coastguard Worker    punpckhbw        m5, m2, m7
435*c0909341SAndroid Build Coastguard Worker    punpcklbw        m2, m7
436*c0909341SAndroid Build Coastguard Worker    palignr          m7, m3, m1, 10
437*c0909341SAndroid Build Coastguard Worker    palignr          m3, m1, 12
438*c0909341SAndroid Build Coastguard Worker    psrldq           m1, m2, 2
439*c0909341SAndroid Build Coastguard Worker    punpcklwd        m7, m3
440*c0909341SAndroid Build Coastguard Worker    punpcklwd        m3, m2, m1
441*c0909341SAndroid Build Coastguard Worker    pmaddwd          m7, m8
442*c0909341SAndroid Build Coastguard Worker    pmaddwd          m3, m9
443*c0909341SAndroid Build Coastguard Worker    paddd            m7, m3
444*c0909341SAndroid Build Coastguard Worker    paddd            m0, m7
445*c0909341SAndroid Build Coastguard Worker
446*c0909341SAndroid Build Coastguard Worker    psrldq           m6, m2, 4
447*c0909341SAndroid Build Coastguard Worker    psrldq           m1, m2, 6
448*c0909341SAndroid Build Coastguard Worker    psrldq           m3, m2, 8
449*c0909341SAndroid Build Coastguard Worker    palignr          m4, m5, m2, 10
450*c0909341SAndroid Build Coastguard Worker    palignr          m5, m5, m2, 12
451*c0909341SAndroid Build Coastguard Worker
452*c0909341SAndroid Build Coastguard Worker    punpcklwd        m6, m1
453*c0909341SAndroid Build Coastguard Worker    punpcklwd        m3, m4
454*c0909341SAndroid Build Coastguard Worker    punpcklwd        m5, m14
455*c0909341SAndroid Build Coastguard Worker    pmaddwd          m6, m10
456*c0909341SAndroid Build Coastguard Worker    pmaddwd          m3, m11
457*c0909341SAndroid Build Coastguard Worker    pmaddwd          m5, m12
458*c0909341SAndroid Build Coastguard Worker    paddd            m0, m6
459*c0909341SAndroid Build Coastguard Worker    paddd            m3, m5
460*c0909341SAndroid Build Coastguard Worker    paddd            m0, m3
461*c0909341SAndroid Build Coastguard Worker
462*c0909341SAndroid Build Coastguard Worker    movq             m1, [bufq+xq-3]        ; y=0,x=[-3,+4]
463*c0909341SAndroid Build Coastguard Worker.x_loop_ar3_inner:
464*c0909341SAndroid Build Coastguard Worker    pxor             m5, m5
465*c0909341SAndroid Build Coastguard Worker    pcmpgtb          m5, m1
466*c0909341SAndroid Build Coastguard Worker    punpcklbw        m2, m1, m5
467*c0909341SAndroid Build Coastguard Worker    pmaddwd          m2, m13
468*c0909341SAndroid Build Coastguard Worker    pshufd           m3, m2, q1111
469*c0909341SAndroid Build Coastguard Worker    paddd            m2, m3                 ; left+cur
470*c0909341SAndroid Build Coastguard Worker    paddd            m2, m0                 ; add top
471*c0909341SAndroid Build Coastguard Worker    psrldq           m0, 4
472*c0909341SAndroid Build Coastguard Worker    psrad            m2, [fg_dataq+FGData.ar_coeff_shift]
473*c0909341SAndroid Build Coastguard Worker    ; don't packssdw since we only care about one value
474*c0909341SAndroid Build Coastguard Worker    packsswb         m2, m2
475*c0909341SAndroid Build Coastguard Worker    pslldq           m2, 3
476*c0909341SAndroid Build Coastguard Worker    pand             m2, m15
477*c0909341SAndroid Build Coastguard Worker    pandn            m3, m15, m1
478*c0909341SAndroid Build Coastguard Worker    por              m1, m2, m3
479*c0909341SAndroid Build Coastguard Worker    movd    [bufq+xq-3], m1
480*c0909341SAndroid Build Coastguard Worker    psrldq           m1, 1
481*c0909341SAndroid Build Coastguard Worker    inc              xq
482*c0909341SAndroid Build Coastguard Worker    jz .x_loop_ar3_end
483*c0909341SAndroid Build Coastguard Worker    test             xq, 3
484*c0909341SAndroid Build Coastguard Worker    jnz .x_loop_ar3_inner
485*c0909341SAndroid Build Coastguard Worker    jmp .x_loop_ar3
486*c0909341SAndroid Build Coastguard Worker
487*c0909341SAndroid Build Coastguard Worker.x_loop_ar3_end:
488*c0909341SAndroid Build Coastguard Worker    add            bufq, 82
489*c0909341SAndroid Build Coastguard Worker    dec              hd
490*c0909341SAndroid Build Coastguard Worker    jg .y_loop_ar3
491*c0909341SAndroid Build Coastguard Worker    RET
492*c0909341SAndroid Build Coastguard Worker
493*c0909341SAndroid Build Coastguard Worker%macro generate_grain_uv_fn 3 ; ss_name, ss_x, ss_y
; Expands to the SSSE3 function generate_grain_uv_<ss_name>_8bpc(buf, bufy,
; fg_data, uv). ss_x / ss_y are assemble-time flags selecting the horizontally /
; vertically sub-sampled code paths.
; This first part fills buf with scaled gaussian noise, four signed bytes per
; iteration, and then jumps through a table indexed by FGData.ar_coeff_lag to
; one of the auto-regression filters that follow.
; cglobal, movifnidn, LEA and the rN/mN/<name>q aliases are macros defined
; outside this chunk (x86inc-style) -- NOTE(review): confirm their exact
; semantics there.
494*c0909341SAndroid Build Coastguard WorkerINIT_XMM ssse3
495*c0909341SAndroid Build Coastguard Workercglobal generate_grain_uv_%1_8bpc, 1, 7 + 3 * ARCH_X86_64, 16, buf, bufy, fg_data, uv
496*c0909341SAndroid Build Coastguard Worker    movifnidn        r2, r2mp               ; r2 = fg_data
497*c0909341SAndroid Build Coastguard Worker    movifnidn        r3, r3mp               ; r3 = uv
498*c0909341SAndroid Build Coastguard Worker    LEA              r4, $$                 ; r4 = start of the current section
; From here on base+symbol addresses a constant relative to r4.
499*c0909341SAndroid Build Coastguard Worker%define base r4-$$
500*c0909341SAndroid Build Coastguard Worker    movq             m1, [base+rnd_next_upperbit_mask] ; m1/m4/m7/m5: constants for the seed update below
501*c0909341SAndroid Build Coastguard Worker    movq             m4, [base+mul_bits]
502*c0909341SAndroid Build Coastguard Worker    movq             m7, [base+hmul_bits]
503*c0909341SAndroid Build Coastguard Worker    mov             r5d, [fg_dataq+FGData.grain_scale_shift]
504*c0909341SAndroid Build Coastguard Worker    movd             m6, [base+round+r5*2]  ; word-indexed by grain_scale_shift
505*c0909341SAndroid Build Coastguard Worker    mova             m5, [base+pb_mask]
506*c0909341SAndroid Build Coastguard Worker    movd             m0, [fg_dataq+FGData.seed]
507*c0909341SAndroid Build Coastguard Worker    movd             m2, [base+pw_seed_xor+uvq*4] ; per-plane xor value, dword-indexed by uv
508*c0909341SAndroid Build Coastguard Worker    pxor             m0, m2
509*c0909341SAndroid Build Coastguard Worker    pshuflw          m6, m6, q0000          ; broadcast the multiplier to the 4 low words
510*c0909341SAndroid Build Coastguard Worker    pshuflw          m0, m0, q0000          ; broadcast the seed to the 4 low words
511*c0909341SAndroid Build Coastguard Worker    lea              r6, [base+gaussian_sequence] ; r6 = table of 16-bit entries (indexed *2 below)
; ss_x path: (73 - 35*ss_y) rows of 44 bytes each, row stride 82.
; The row counter is r7d on x86-64 and the r3 stack slot (r3mp) on x86-32.
512*c0909341SAndroid Build Coastguard Worker%if %2
513*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
514*c0909341SAndroid Build Coastguard Worker    mov             r7d, 73-35*%3
515*c0909341SAndroid Build Coastguard Worker%else
516*c0909341SAndroid Build Coastguard Worker    mov            r3mp, 73-35*%3
517*c0909341SAndroid Build Coastguard Worker%endif
518*c0909341SAndroid Build Coastguard Worker    add            bufq, 44                 ; point at the row end; r5 indexes backwards from it
519*c0909341SAndroid Build Coastguard Worker.loop_y:
520*c0909341SAndroid Build Coastguard Worker    mov              r5, -44                ; negative column index, counts up to 0
521*c0909341SAndroid Build Coastguard Worker.loop_x:
; Non-ss_x path: one flat loop over 82*73 consecutive bytes.
522*c0909341SAndroid Build Coastguard Worker%else
523*c0909341SAndroid Build Coastguard Worker    mov              r5, -82*73             ; negative byte index, counts up to 0
524*c0909341SAndroid Build Coastguard Worker    sub            bufq, r5                 ; bufq += 82*73 (one past the last byte written)
525*c0909341SAndroid Build Coastguard Worker.loop:
526*c0909341SAndroid Build Coastguard Worker%endif
; Advance the pseudo-random generator: derive the next four 16-bit seeds from m0.
527*c0909341SAndroid Build Coastguard Worker    pand             m2, m0, m1
528*c0909341SAndroid Build Coastguard Worker    psrlw            m3, m2, 10
529*c0909341SAndroid Build Coastguard Worker    por              m2, m3             ; bits 0xf, 0x1e, 0x3c and 0x78 are set
530*c0909341SAndroid Build Coastguard Worker    pmullw           m2, m4             ; bits 0x0f00 are set
531*c0909341SAndroid Build Coastguard Worker    pshufb           m3, m5, m2         ; set 15th bit for next 4 seeds
532*c0909341SAndroid Build Coastguard Worker    psllq            m2, m3, 30
533*c0909341SAndroid Build Coastguard Worker    por              m3, m2
534*c0909341SAndroid Build Coastguard Worker    psllq            m2, m3, 15
535*c0909341SAndroid Build Coastguard Worker    por              m3, m2             ; aggregate each bit into next seed's high bit
536*c0909341SAndroid Build Coastguard Worker    pmulhuw          m2, m0, m7
537*c0909341SAndroid Build Coastguard Worker    por              m2, m3             ; 4 next output seeds
538*c0909341SAndroid Build Coastguard Worker    pshuflw          m0, m2, q3333      ; keep the last new seed (broadcast) for the next iteration
539*c0909341SAndroid Build Coastguard Worker    psrlw            m2, 5              ; seed >> 5 = index into gaussian_sequence
; Gather gaussian_sequence[index] for the four indices into words 0..3 of m3.
; Each movd pulls out two 16-bit indices, which are split into a low and a
; high half. x86-64 uses r8/r9 as scratch; x86-32 reuses r1/r2.
540*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
541*c0909341SAndroid Build Coastguard Worker    movd            r9d, m2
542*c0909341SAndroid Build Coastguard Worker    pshuflw          m2, m2, q3232      ; move indices 2 and 3 down into the low dword
543*c0909341SAndroid Build Coastguard Worker    movzx            r8, r9w
544*c0909341SAndroid Build Coastguard Worker    shr              r9, 16
545*c0909341SAndroid Build Coastguard Worker
546*c0909341SAndroid Build Coastguard Worker    movd             m3, [r6+r8*2]
547*c0909341SAndroid Build Coastguard Worker    pinsrw           m3, [r6+r9*2], 1
548*c0909341SAndroid Build Coastguard Worker
549*c0909341SAndroid Build Coastguard Worker    movd            r9d, m2
550*c0909341SAndroid Build Coastguard Worker    movzx            r8, r9w
551*c0909341SAndroid Build Coastguard Worker    shr              r9, 16
552*c0909341SAndroid Build Coastguard Worker
553*c0909341SAndroid Build Coastguard Worker    pinsrw           m3, [r6+r8*2], 2
554*c0909341SAndroid Build Coastguard Worker    pinsrw           m3, [r6+r9*2], 3
555*c0909341SAndroid Build Coastguard Worker%else
556*c0909341SAndroid Build Coastguard Worker    movd             r2, m2             ; overwrites r2 (fg_data); it is reloaded after the loop
557*c0909341SAndroid Build Coastguard Worker    pshuflw          m2, m2, q3232
558*c0909341SAndroid Build Coastguard Worker    movzx            r1, r2w
559*c0909341SAndroid Build Coastguard Worker    shr              r2, 16
560*c0909341SAndroid Build Coastguard Worker
561*c0909341SAndroid Build Coastguard Worker    movd             m3, [r6+r1*2]
562*c0909341SAndroid Build Coastguard Worker    pinsrw           m3, [r6+r2*2], 1
563*c0909341SAndroid Build Coastguard Worker
564*c0909341SAndroid Build Coastguard Worker    movd             r2, m2
565*c0909341SAndroid Build Coastguard Worker    movzx            r1, r2w
566*c0909341SAndroid Build Coastguard Worker    shr              r2, 16
567*c0909341SAndroid Build Coastguard Worker
568*c0909341SAndroid Build Coastguard Worker    pinsrw           m3, [r6+r1*2], 2
569*c0909341SAndroid Build Coastguard Worker    pinsrw           m3, [r6+r2*2], 3
570*c0909341SAndroid Build Coastguard Worker%endif
571*c0909341SAndroid Build Coastguard Worker    pmulhrsw         m3, m6             ; rounding multiply by round[grain_scale_shift]
572*c0909341SAndroid Build Coastguard Worker    packsswb         m3, m3             ; saturate the words to signed bytes
573*c0909341SAndroid Build Coastguard Worker    movd      [bufq+r5], m3             ; store four grain bytes
574*c0909341SAndroid Build Coastguard Worker    add              r5, 4
575*c0909341SAndroid Build Coastguard Worker%if %2
576*c0909341SAndroid Build Coastguard Worker    jl .loop_x                          ; until the column index reaches 0
577*c0909341SAndroid Build Coastguard Worker    add            bufq, 82             ; next row
578*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
579*c0909341SAndroid Build Coastguard Worker    dec             r7d
580*c0909341SAndroid Build Coastguard Worker%else
581*c0909341SAndroid Build Coastguard Worker    dec            r3mp
582*c0909341SAndroid Build Coastguard Worker%endif
583*c0909341SAndroid Build Coastguard Worker    jg .loop_y
584*c0909341SAndroid Build Coastguard Worker%else
585*c0909341SAndroid Build Coastguard Worker    jl .loop                            ; until the byte index reaches 0
586*c0909341SAndroid Build Coastguard Worker%endif
587*c0909341SAndroid Build Coastguard Worker
; x86-32 used r2 as scratch in the loop above, so fetch fg_data again.
588*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
589*c0909341SAndroid Build Coastguard Worker    mov              r2, r2mp
590*c0909341SAndroid Build Coastguard Worker%endif
591*c0909341SAndroid Build Coastguard Worker
592*c0909341SAndroid Build Coastguard Worker    ; auto-regression code
; Dispatch on the filter lag: the table holds 32-bit offsets relative to its
; own address, so the target is table + table[ar_coeff_lag].
593*c0909341SAndroid Build Coastguard Worker    movsxd           r5, [fg_dataq+FGData.ar_coeff_lag]
594*c0909341SAndroid Build Coastguard Worker    movsxd           r5, [base+generate_grain_uv_%1_8bpc_ssse3_table+r5*4]
595*c0909341SAndroid Build Coastguard Worker    lea              r5, [r5+base+generate_grain_uv_%1_8bpc_ssse3_table]
596*c0909341SAndroid Build Coastguard Worker    jmp              r5
597*c0909341SAndroid Build Coastguard Worker
598*c0909341SAndroid Build Coastguard Worker.ar0:
; Lag-0 filter: every chroma grain byte in the filtered area gets
; (luma grain * coefficient 0), rounded and scaled via hmul_bits[shift], added
; to it with signed saturation. No chroma neighbours are read.
; With ss_x the luma value is a rounded combination of 2 horizontally adjacent
; luma bytes; with ss_y the row below (luma stride 82) is included as well.
; Rows processed: 70 - 35*ss_y. Pixels written per row: 32+6 with ss_x,
; 64+12 without.
; Register roles inside the loops: m5 = coefficient (8 words), m9 = shift
; multiplier, m8 = byte mask for the partial last store, m7 = pb_1,
; m6 = luma down-scale multiplier (ss_x only).
599*c0909341SAndroid Build Coastguard Worker    DEFINE_ARGS buf, bufy, fg_data, uv, unused, shift
600*c0909341SAndroid Build Coastguard Worker    movifnidn     bufyq, bufymp
; NOTE(review): presumably these two 16-byte slots back the SCRATCH registers
; (m8/m9) on x86-32 -- confirm against the SCRATCH macro.
601*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
602*c0909341SAndroid Build Coastguard Worker    ALLOC_STACK   -2*16
603*c0909341SAndroid Build Coastguard Worker%endif
604*c0909341SAndroid Build Coastguard Worker    imul            uvd, 28                 ; byte offset into ar_coeffs_uv (28 is presumably the per-plane size)
605*c0909341SAndroid Build Coastguard Worker    mov          shiftd, [fg_dataq+FGData.ar_coeff_shift]
606*c0909341SAndroid Build Coastguard Worker    movd             m5, [fg_dataq+FGData.ar_coeffs_uv+uvq] ; only byte 0 (the luma coefficient) is used
607*c0909341SAndroid Build Coastguard Worker    movd             m4, [base+hmul_bits+shiftq*2] ; word-indexed by ar_coeff_shift
608*c0909341SAndroid Build Coastguard Worker    DEFINE_ARGS buf, bufy, h, x
609*c0909341SAndroid Build Coastguard Worker    pxor             m0, m0
610*c0909341SAndroid Build Coastguard Worker    pcmpgtb          m0, m5                 ; 0xff for each negative byte
611*c0909341SAndroid Build Coastguard Worker    punpcklbw        m5, m0                 ; sign-extend the coefficient bytes to words
612*c0909341SAndroid Build Coastguard Worker    movd             m7, [base+pb_1]        ; bytes of 1, judging by the name
613*c0909341SAndroid Build Coastguard Worker%if %2
614*c0909341SAndroid Build Coastguard Worker    movd             m6, [base+hmul_bits+2+%3*2]
615*c0909341SAndroid Build Coastguard Worker%endif
; Broadcast word 0 of m5 / m4 (and m6) to all eight words, and pb_1 to all
; sixteen bytes.
616*c0909341SAndroid Build Coastguard Worker    pshuflw          m5, m5, q0000
617*c0909341SAndroid Build Coastguard Worker    pshuflw          m4, m4, q0000
618*c0909341SAndroid Build Coastguard Worker    pshufd           m7, m7, q0000
619*c0909341SAndroid Build Coastguard Worker%if %2
620*c0909341SAndroid Build Coastguard Worker    pshuflw          m6, m6, q0000
621*c0909341SAndroid Build Coastguard Worker%endif
622*c0909341SAndroid Build Coastguard Worker    punpcklqdq       m5, m5
623*c0909341SAndroid Build Coastguard Worker    punpcklqdq       m4, m4
624*c0909341SAndroid Build Coastguard Worker%if %2
625*c0909341SAndroid Build Coastguard Worker    punpcklqdq       m6, m6
626*c0909341SAndroid Build Coastguard Worker%endif
627*c0909341SAndroid Build Coastguard Worker    pcmpeqw          m1, m1                 ; all ones
628*c0909341SAndroid Build Coastguard Worker    pslldq           m1, 12>>%2             ; mask: low 12 (ss_x: 6) bytes zero, the rest 0xff
; SCRATCH is defined outside this chunk; after these two lines the mask is
; referenced as m8 and the shift multiplier as m9, and m1/m4 are reused as
; temporaries.
629*c0909341SAndroid Build Coastguard Worker    SCRATCH           1, 8, 0
630*c0909341SAndroid Build Coastguard Worker    SCRATCH           4, 9, 1
; Rewind buf from where the noise loop left it. With the advances made there,
; both variants land on offset 82*3+3 from the start of buf (row 3, column 3).
631*c0909341SAndroid Build Coastguard Worker%if %2
632*c0909341SAndroid Build Coastguard Worker    sub            bufq, 82*(73-35*%3)+82-(82*3+41)
633*c0909341SAndroid Build Coastguard Worker%else
634*c0909341SAndroid Build Coastguard Worker    sub            bufq, 82*70-3
635*c0909341SAndroid Build Coastguard Worker%endif
636*c0909341SAndroid Build Coastguard Worker    add           bufyq, 3+82*3             ; luma pointer: row 3, column 3
637*c0909341SAndroid Build Coastguard Worker    mov              hd, 70-35*%3           ; number of rows to filter
638*c0909341SAndroid Build Coastguard Worker.y_loop_ar0:
639*c0909341SAndroid Build Coastguard Worker    xor              xd, xd                 ; x = 0
640*c0909341SAndroid Build Coastguard Worker.x_loop_ar0:
641*c0909341SAndroid Build Coastguard Worker    ; first 32 pixels
; ss_x: 32 luma bytes feed 16 chroma positions. pmaddubsw multiplies the
; signed luma bytes by pb_1 and adds horizontally adjacent pairs into words.
; With ss_y the sums of the next luma row are added; pmulhrsw by m6 then
; scales the sum down with rounding.
; Otherwise: 16 luma bytes are sign-extended into m0 (low 8) and m2 (high 8).
642*c0909341SAndroid Build Coastguard Worker%if %2
643*c0909341SAndroid Build Coastguard Worker    movu             m1, [bufyq+xq*2]
644*c0909341SAndroid Build Coastguard Worker%if %3
645*c0909341SAndroid Build Coastguard Worker    movu             m2, [bufyq+xq*2+82]
646*c0909341SAndroid Build Coastguard Worker%endif
647*c0909341SAndroid Build Coastguard Worker    movu             m3, [bufyq+xq*2+16]
648*c0909341SAndroid Build Coastguard Worker%if %3
649*c0909341SAndroid Build Coastguard Worker    movu             m4, [bufyq+xq*2+82+16]
650*c0909341SAndroid Build Coastguard Worker%endif
651*c0909341SAndroid Build Coastguard Worker    pmaddubsw        m0, m7, m1
652*c0909341SAndroid Build Coastguard Worker%if %3
653*c0909341SAndroid Build Coastguard Worker    pmaddubsw        m1, m7, m2
654*c0909341SAndroid Build Coastguard Worker%endif
655*c0909341SAndroid Build Coastguard Worker    pmaddubsw        m2, m7, m3
656*c0909341SAndroid Build Coastguard Worker%if %3
657*c0909341SAndroid Build Coastguard Worker    pmaddubsw        m3, m7, m4
658*c0909341SAndroid Build Coastguard Worker    paddw            m0, m1
659*c0909341SAndroid Build Coastguard Worker    paddw            m2, m3
660*c0909341SAndroid Build Coastguard Worker%endif
661*c0909341SAndroid Build Coastguard Worker    pmulhrsw         m0, m6
662*c0909341SAndroid Build Coastguard Worker    pmulhrsw         m2, m6
663*c0909341SAndroid Build Coastguard Worker%else
664*c0909341SAndroid Build Coastguard Worker    movu             m0, [bufyq+xq]
665*c0909341SAndroid Build Coastguard Worker    pxor             m6, m6
666*c0909341SAndroid Build Coastguard Worker    pcmpgtb          m6, m0
667*c0909341SAndroid Build Coastguard Worker    punpckhbw        m2, m0, m6
668*c0909341SAndroid Build Coastguard Worker    punpcklbw        m0, m6
669*c0909341SAndroid Build Coastguard Worker%endif
670*c0909341SAndroid Build Coastguard Worker    pmullw           m0, m5                 ; luma * coefficient (low 16 bits)
671*c0909341SAndroid Build Coastguard Worker    pmullw           m2, m5
672*c0909341SAndroid Build Coastguard Worker    pmulhrsw         m0, m9                 ; rounding multiply by hmul_bits[shift] (presumably a rounded >> shift)
673*c0909341SAndroid Build Coastguard Worker    pmulhrsw         m2, m9
674*c0909341SAndroid Build Coastguard Worker    movu             m1, [bufq+xq]          ; 16 current chroma grain bytes
675*c0909341SAndroid Build Coastguard Worker    pxor             m4, m4
676*c0909341SAndroid Build Coastguard Worker    pcmpgtb          m4, m1
677*c0909341SAndroid Build Coastguard Worker    punpckhbw        m3, m1, m4             ; sign-extend the high 8 bytes
; The non-ss_x branch unpacks into m6 so that m1 keeps the original bytes for
; the masked final store at .y_loop_final_ar0.
678*c0909341SAndroid Build Coastguard Worker%if %2
679*c0909341SAndroid Build Coastguard Worker    punpcklbw        m1, m4
680*c0909341SAndroid Build Coastguard Worker    paddw            m2, m3
681*c0909341SAndroid Build Coastguard Worker    paddw            m0, m1
682*c0909341SAndroid Build Coastguard Worker%else
683*c0909341SAndroid Build Coastguard Worker    punpcklbw        m6, m1, m4
684*c0909341SAndroid Build Coastguard Worker    paddw            m2, m3
685*c0909341SAndroid Build Coastguard Worker    paddw            m0, m6
686*c0909341SAndroid Build Coastguard Worker%endif
687*c0909341SAndroid Build Coastguard Worker    packsswb         m0, m2                 ; back to 16 signed-saturated bytes
688*c0909341SAndroid Build Coastguard Worker%if %2
689*c0909341SAndroid Build Coastguard Worker    movu      [bufq+xq], m0
690*c0909341SAndroid Build Coastguard Worker    add              xd, 16
691*c0909341SAndroid Build Coastguard Worker    cmp              xd, 32
692*c0909341SAndroid Build Coastguard Worker    jl .x_loop_ar0
693*c0909341SAndroid Build Coastguard Worker
694*c0909341SAndroid Build Coastguard Worker    ; last 6/12 pixels
; ss_x tail: eight values are computed the same way as above, but only the
; low 6 bytes are replaced; the m8 mask keeps the other 2 bytes as they were.
695*c0909341SAndroid Build Coastguard Worker    movu             m1, [bufyq+xq*(1+%2)]
696*c0909341SAndroid Build Coastguard Worker%if %3
697*c0909341SAndroid Build Coastguard Worker    movu             m2, [bufyq+xq*2+82]
698*c0909341SAndroid Build Coastguard Worker%endif
699*c0909341SAndroid Build Coastguard Worker    pmaddubsw        m0, m7, m1
700*c0909341SAndroid Build Coastguard Worker%if %3
701*c0909341SAndroid Build Coastguard Worker    pmaddubsw        m1, m7, m2
702*c0909341SAndroid Build Coastguard Worker    paddw            m0, m1
703*c0909341SAndroid Build Coastguard Worker%endif
704*c0909341SAndroid Build Coastguard Worker    pmulhrsw         m0, m6
705*c0909341SAndroid Build Coastguard Worker    pmullw           m0, m5
706*c0909341SAndroid Build Coastguard Worker    pmulhrsw         m0, m9
707*c0909341SAndroid Build Coastguard Worker    movq             m1, [bufq+xq]          ; 8 current chroma grain bytes
708*c0909341SAndroid Build Coastguard Worker    pxor             m4, m4
709*c0909341SAndroid Build Coastguard Worker    pcmpgtb          m4, m1
710*c0909341SAndroid Build Coastguard Worker    punpcklbw        m2, m1, m4
711*c0909341SAndroid Build Coastguard Worker    paddw            m0, m2
712*c0909341SAndroid Build Coastguard Worker    packsswb         m0, m0
713*c0909341SAndroid Build Coastguard Worker    pandn            m2, m8, m0             ; new bytes where the mask is 0
714*c0909341SAndroid Build Coastguard Worker    pand             m1, m8                 ; old bytes where the mask is 0xff
715*c0909341SAndroid Build Coastguard Worker    por              m2, m1
716*c0909341SAndroid Build Coastguard Worker    movq      [bufq+xq], m2
717*c0909341SAndroid Build Coastguard Worker%else
718*c0909341SAndroid Build Coastguard Worker    add              xd, 16
719*c0909341SAndroid Build Coastguard Worker    cmp              xd, 80                 ; fifth group of 16: only 12 of its bytes are written
720*c0909341SAndroid Build Coastguard Worker    je .y_loop_final_ar0
721*c0909341SAndroid Build Coastguard Worker    movu   [bufq+xq-16], m0                 ; x was already advanced, hence the -16
722*c0909341SAndroid Build Coastguard Worker    jmp .x_loop_ar0
723*c0909341SAndroid Build Coastguard Worker.y_loop_final_ar0:
724*c0909341SAndroid Build Coastguard Worker    pandn            m2, m8, m0             ; low 12 bytes: new values
725*c0909341SAndroid Build Coastguard Worker    pand             m1, m8                 ; high 4 bytes: original buf contents (m1 still holds them)
726*c0909341SAndroid Build Coastguard Worker    por              m2, m1
727*c0909341SAndroid Build Coastguard Worker    movu   [bufq+xq-16], m2
728*c0909341SAndroid Build Coastguard Worker%endif
729*c0909341SAndroid Build Coastguard Worker
730*c0909341SAndroid Build Coastguard Worker    add            bufq, 82                 ; next chroma row
731*c0909341SAndroid Build Coastguard Worker    add           bufyq, 82<<%3             ; next luma row (two rows with ss_y)
732*c0909341SAndroid Build Coastguard Worker    dec              hd
733*c0909341SAndroid Build Coastguard Worker    jg .y_loop_ar0
734*c0909341SAndroid Build Coastguard Worker    RET
735*c0909341SAndroid Build Coastguard Worker
736*c0909341SAndroid Build Coastguard Worker.ar1:
; Lag-1 filter. With c0..c4 = ar_coeffs_uv[uv][0..4], each output pixel is
;   sum = c0*topleft + c1*top + c2*topright + c4*luma + rnd
;   out = clamp(cur + ((sum + c3*left) >> shift), -128, 127)
; "sum" is computed for four pixels at a time in SIMD. The c3*left term is
; applied in a scalar inner loop because left is the previous output.
; Rows processed: 70 - 35*ss_y. Pixels per row: 76 (38 with ss_x).
; The variable-count sar below takes its count in cl, so shift has to live in
; (r)cx. NOTE(review): the per-ABI DEFINE_ARGS variants presumably exist to
; arrange that -- confirm against the register mapping in x86inc.
; On x86-32 there are not enough registers: cf3 is kept in the r3 stack slot,
; the luma pointer in r1mp and the row counter in r0mp.
737*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
738*c0909341SAndroid Build Coastguard Worker    RESET_STACK_STATE
739*c0909341SAndroid Build Coastguard Worker%endif
740*c0909341SAndroid Build Coastguard Worker    DEFINE_ARGS buf, bufy, fg_data, uv, val3, cf3, min, max, x
741*c0909341SAndroid Build Coastguard Worker    imul            uvd, 28                 ; byte offset into ar_coeffs_uv (28 is presumably the per-plane size)
742*c0909341SAndroid Build Coastguard Worker    movsx          cf3d, byte [fg_dataq+FGData.ar_coeffs_uv+uvq+3] ; c3 = coefficient for the left neighbour
743*c0909341SAndroid Build Coastguard Worker    movd             m4, [fg_dataq+FGData.ar_coeffs_uv+uvq-1] ; bytes: (one before c0), c0, c1, c2
744*c0909341SAndroid Build Coastguard Worker    pinsrw           m4, [fg_dataq+FGData.ar_coeffs_uv+uvq+4], 2 ; bytes 4-5: c4, c5
745*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
746*c0909341SAndroid Build Coastguard Worker    mov            r3mp, cf3d
747*c0909341SAndroid Build Coastguard Worker    DEFINE_ARGS buf, shift, fg_data, val3, min, max, x
748*c0909341SAndroid Build Coastguard Worker%elif WIN64
749*c0909341SAndroid Build Coastguard Worker    DEFINE_ARGS shift, bufy, fg_data, buf, val3, cf3, min, max, x
750*c0909341SAndroid Build Coastguard Worker    mov            bufq, r0
751*c0909341SAndroid Build Coastguard Worker%else
752*c0909341SAndroid Build Coastguard Worker    DEFINE_ARGS buf, bufy, fg_data, shift, val3, cf3, min, max, x
753*c0909341SAndroid Build Coastguard Worker%endif
754*c0909341SAndroid Build Coastguard Worker    mov          shiftd, [fg_dataq+FGData.ar_coeff_shift]
755*c0909341SAndroid Build Coastguard Worker    movd             m3, [base+round_vals+shiftq*2-12]    ; rnd
756*c0909341SAndroid Build Coastguard Worker%if %2
757*c0909341SAndroid Build Coastguard Worker    movd             m7, [base+pb_1]
758*c0909341SAndroid Build Coastguard Worker    movd             m6, [base+hmul_bits+2+%3*2]
759*c0909341SAndroid Build Coastguard Worker%endif
760*c0909341SAndroid Build Coastguard Worker    psrldq           m4, 1                  ; drop the leading byte: c0, c1, c2, c4, c5 (c3 was skipped)
761*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
762*c0909341SAndroid Build Coastguard Worker    DEFINE_ARGS buf, shift, val0, val3, min, max, x
763*c0909341SAndroid Build Coastguard Worker%elif WIN64
764*c0909341SAndroid Build Coastguard Worker    DEFINE_ARGS shift, bufy, h, buf, val3, cf3, min, max, x, val0
765*c0909341SAndroid Build Coastguard Worker%else
766*c0909341SAndroid Build Coastguard Worker    DEFINE_ARGS buf, bufy, h, shift, val3, cf3, min, max, x, val0
767*c0909341SAndroid Build Coastguard Worker%endif
768*c0909341SAndroid Build Coastguard Worker    pxor             m5, m5
769*c0909341SAndroid Build Coastguard Worker    punpcklwd        m3, m5                 ; zero-extend rnd to a dword
770*c0909341SAndroid Build Coastguard Worker%if %2
771*c0909341SAndroid Build Coastguard Worker    punpcklwd        m6, m6
772*c0909341SAndroid Build Coastguard Worker%endif
773*c0909341SAndroid Build Coastguard Worker    pcmpgtb          m5, m4
774*c0909341SAndroid Build Coastguard Worker    punpcklbw        m4, m5                 ; sign-extend the coefficients to words
775*c0909341SAndroid Build Coastguard Worker    pshufd           m5, m4, q1111          ; m5 = (c2, c4) word pair in every dword
776*c0909341SAndroid Build Coastguard Worker    pshufd           m4, m4, q0000          ; m4 = (c0, c1) word pair in every dword
777*c0909341SAndroid Build Coastguard Worker    pshufd           m3, m3, q0000          ; m3 = rnd in every dword
; Rewind buf from where the noise loop left it, so that it points one past the
; last filtered column of row 3 (offset 82*3+41 with ss_x, 82*3+79 without).
; x then runs from -(76>>ss_x) up to 0, i.e. from column 3 onwards.
778*c0909341SAndroid Build Coastguard Worker%if %2
779*c0909341SAndroid Build Coastguard Worker    pshufd           m7, m7, q0000
780*c0909341SAndroid Build Coastguard Worker    pshufd           m6, m6, q0000
781*c0909341SAndroid Build Coastguard Worker    sub            bufq, 82*(73-35*%3)+44-(82*3+41)
782*c0909341SAndroid Build Coastguard Worker%else
783*c0909341SAndroid Build Coastguard Worker    sub            bufq, 82*69+3
784*c0909341SAndroid Build Coastguard Worker%endif
785*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
786*c0909341SAndroid Build Coastguard Worker    add            r1mp, 79+82*3
787*c0909341SAndroid Build Coastguard Worker    mov            r0mp, 70-35*%3
788*c0909341SAndroid Build Coastguard Worker%else
789*c0909341SAndroid Build Coastguard Worker    add           bufyq, 79+82*3            ; luma pointer: row 3, column 79 (indexed with negative x)
790*c0909341SAndroid Build Coastguard Worker    mov              hd, 70-35*%3           ; number of rows to filter
791*c0909341SAndroid Build Coastguard Worker%endif
792*c0909341SAndroid Build Coastguard Worker    mov            mind, -128               ; clamp bounds of a signed byte
793*c0909341SAndroid Build Coastguard Worker    mov            maxd, 127
794*c0909341SAndroid Build Coastguard Worker.y_loop_ar1:
795*c0909341SAndroid Build Coastguard Worker    mov              xq, -(76>>%2)
796*c0909341SAndroid Build Coastguard Worker    movsx         val3d, byte [bufq+xq-1]   ; left neighbour of the first pixel in this row
797*c0909341SAndroid Build Coastguard Worker.x_loop_ar1:
; Fetch the luma term for the next four pixels as words in m2.
; ss_x: 8 luma bytes are pair-summed via pmaddubsw with pb_1 (plus the row
; below with ss_y), then scaled down with rounding by m6.
; Otherwise: 4 luma bytes are sign-extended.
; x86-32 reloads the luma pointer from its stack slot into r2 every time; r2
; is also val0, which is overwritten in the inner loop.
798*c0909341SAndroid Build Coastguard Worker%if %2
799*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
800*c0909341SAndroid Build Coastguard Worker    mov              r2, r1mp
801*c0909341SAndroid Build Coastguard Worker    movq             m0, [r2+xq*2]
802*c0909341SAndroid Build Coastguard Worker%if %3
803*c0909341SAndroid Build Coastguard Worker    movq             m1, [r2+xq*2+82]
804*c0909341SAndroid Build Coastguard Worker%endif
805*c0909341SAndroid Build Coastguard Worker%else
806*c0909341SAndroid Build Coastguard Worker    movq             m0, [bufyq+xq*2]
807*c0909341SAndroid Build Coastguard Worker%if %3
808*c0909341SAndroid Build Coastguard Worker    movq             m1, [bufyq+xq*2+82]
809*c0909341SAndroid Build Coastguard Worker%endif
810*c0909341SAndroid Build Coastguard Worker%endif
811*c0909341SAndroid Build Coastguard Worker    pmaddubsw        m2, m7, m0
812*c0909341SAndroid Build Coastguard Worker%if %3
813*c0909341SAndroid Build Coastguard Worker    pmaddubsw        m0, m7, m1
814*c0909341SAndroid Build Coastguard Worker    paddw            m2, m0
815*c0909341SAndroid Build Coastguard Worker%endif
816*c0909341SAndroid Build Coastguard Worker    pmulhrsw         m2, m6
817*c0909341SAndroid Build Coastguard Worker%else
818*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
819*c0909341SAndroid Build Coastguard Worker    mov              r2, r1mp
820*c0909341SAndroid Build Coastguard Worker    movd             m2, [r2+xq]
821*c0909341SAndroid Build Coastguard Worker%else
822*c0909341SAndroid Build Coastguard Worker    movd             m2, [bufyq+xq]
823*c0909341SAndroid Build Coastguard Worker%endif
824*c0909341SAndroid Build Coastguard Worker    pxor             m0, m0
825*c0909341SAndroid Build Coastguard Worker    pcmpgtb          m0, m2
826*c0909341SAndroid Build Coastguard Worker    punpcklbw        m2, m0
827*c0909341SAndroid Build Coastguard Worker%endif
828*c0909341SAndroid Build Coastguard Worker
829*c0909341SAndroid Build Coastguard Worker    movq             m0, [bufq+xq-82-1]     ; top/left
830*c0909341SAndroid Build Coastguard Worker    pxor             m1, m1
831*c0909341SAndroid Build Coastguard Worker    pcmpgtb          m1, m0
832*c0909341SAndroid Build Coastguard Worker    punpcklbw        m0, m1                 ; sign-extend the row above to words
833*c0909341SAndroid Build Coastguard Worker    psrldq           m1, m0, 4              ; top/right
834*c0909341SAndroid Build Coastguard Worker    punpcklwd        m1, m2                 ; (topright, luma) word pairs
835*c0909341SAndroid Build Coastguard Worker    psrldq           m2, m0, 2              ; top
836*c0909341SAndroid Build Coastguard Worker    punpcklwd        m0, m2                 ; (topleft, top) word pairs
837*c0909341SAndroid Build Coastguard Worker    pmaddwd          m0, m4                 ; c0*topleft + c1*top
838*c0909341SAndroid Build Coastguard Worker    pmaddwd          m1, m5                 ; c2*topright + c4*luma
839*c0909341SAndroid Build Coastguard Worker    paddd            m0, m1
840*c0909341SAndroid Build Coastguard Worker    paddd            m0, m3                 ; + rnd; four dword partial sums
841*c0909341SAndroid Build Coastguard Worker.x_loop_ar1_inner:
842*c0909341SAndroid Build Coastguard Worker    movd          val0d, m0                 ; partial sum for the current pixel
843*c0909341SAndroid Build Coastguard Worker    psrldq           m0, 4                  ; queue up the next pixel's sum
844*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
845*c0909341SAndroid Build Coastguard Worker    imul          val3d, r3mp
846*c0909341SAndroid Build Coastguard Worker%else
847*c0909341SAndroid Build Coastguard Worker    imul          val3d, cf3d               ; left * c3
848*c0909341SAndroid Build Coastguard Worker%endif
849*c0909341SAndroid Build Coastguard Worker    add           val3d, val0d
850*c0909341SAndroid Build Coastguard Worker    sar           val3d, shiftb             ; arithmetic shift by ar_coeff_shift
851*c0909341SAndroid Build Coastguard Worker    movsx         val0d, byte [bufq+xq]     ; current grain value
852*c0909341SAndroid Build Coastguard Worker    add           val3d, val0d
; Clamp to [min, max] using the sign of the cmp result: take max when
; val3 - max is not negative, take min when val3 - min is negative.
853*c0909341SAndroid Build Coastguard Worker    cmp           val3d, maxd
854*c0909341SAndroid Build Coastguard Worker    cmovns        val3d, maxd
855*c0909341SAndroid Build Coastguard Worker    cmp           val3d, mind
856*c0909341SAndroid Build Coastguard Worker    cmovs         val3d, mind
857*c0909341SAndroid Build Coastguard Worker    mov  byte [bufq+xq], val3b
858*c0909341SAndroid Build Coastguard Worker    ; keep val3d in-place as left for next x iteration
859*c0909341SAndroid Build Coastguard Worker    inc              xq
860*c0909341SAndroid Build Coastguard Worker    jz .x_loop_ar1_end                      ; x reached 0: row finished
861*c0909341SAndroid Build Coastguard Worker    test             xq, 3
862*c0909341SAndroid Build Coastguard Worker    jnz .x_loop_ar1_inner                   ; sums left in m0: stay in the scalar loop
863*c0909341SAndroid Build Coastguard Worker    jmp .x_loop_ar1                         ; x is a multiple of 4: recompute four sums
864*c0909341SAndroid Build Coastguard Worker
865*c0909341SAndroid Build Coastguard Worker.x_loop_ar1_end:
866*c0909341SAndroid Build Coastguard Worker    add            bufq, 82                 ; next chroma row
867*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
868*c0909341SAndroid Build Coastguard Worker    add            r1mp, 82<<%3
869*c0909341SAndroid Build Coastguard Worker    dec            r0mp
870*c0909341SAndroid Build Coastguard Worker%else
871*c0909341SAndroid Build Coastguard Worker    add           bufyq, 82<<%3             ; next luma row (two rows with ss_y)
872*c0909341SAndroid Build Coastguard Worker    dec              hd
873*c0909341SAndroid Build Coastguard Worker%endif
874*c0909341SAndroid Build Coastguard Worker    jg .y_loop_ar1
875*c0909341SAndroid Build Coastguard Worker    RET
876*c0909341SAndroid Build Coastguard Worker
877*c0909341SAndroid Build Coastguard Worker.ar2:
878*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
879*c0909341SAndroid Build Coastguard Worker    ALLOC_STACK   -8*16
880*c0909341SAndroid Build Coastguard Worker%endif
881*c0909341SAndroid Build Coastguard Worker    DEFINE_ARGS buf, bufy, fg_data, uv, unused, shift
882*c0909341SAndroid Build Coastguard Worker    movifnidn     bufyq, bufymp
883*c0909341SAndroid Build Coastguard Worker    mov          shiftd, [fg_dataq+FGData.ar_coeff_shift]
884*c0909341SAndroid Build Coastguard Worker    imul            uvd, 28
885*c0909341SAndroid Build Coastguard Worker    movd             m7, [base+round_vals-12+shiftq*2]
886*c0909341SAndroid Build Coastguard Worker    movu             m0, [fg_dataq+FGData.ar_coeffs_uv+uvq+0]   ; cf0-12
887*c0909341SAndroid Build Coastguard Worker    pxor             m2, m2
888*c0909341SAndroid Build Coastguard Worker    pcmpgtb          m2, m0
889*c0909341SAndroid Build Coastguard Worker    punpckhbw        m1, m0, m2
890*c0909341SAndroid Build Coastguard Worker    punpcklbw        m0, m2
891*c0909341SAndroid Build Coastguard Worker    pinsrw           m1, [base+pw_1], 5
892*c0909341SAndroid Build Coastguard Worker    punpcklwd        m7, m7
893*c0909341SAndroid Build Coastguard Worker    pshufd           m7, m7, q0000
894*c0909341SAndroid Build Coastguard Worker    DEFINE_ARGS buf, bufy, fg_data, h, unused, x
895*c0909341SAndroid Build Coastguard Worker    pshufd           m4, m1, q0000
896*c0909341SAndroid Build Coastguard Worker    pshufd           m5, m1, q1111
897*c0909341SAndroid Build Coastguard Worker    pshufd           m6, m1, q2222
898*c0909341SAndroid Build Coastguard Worker    pshufd           m3, m0, q3333
899*c0909341SAndroid Build Coastguard Worker    pshufd           m2, m0, q2222
900*c0909341SAndroid Build Coastguard Worker    pshufd           m1, m0, q1111
901*c0909341SAndroid Build Coastguard Worker    pshufd           m0, m0, q0000
902*c0909341SAndroid Build Coastguard Worker    SCRATCH           0, 8,  0
903*c0909341SAndroid Build Coastguard Worker    SCRATCH           1, 9,  1
904*c0909341SAndroid Build Coastguard Worker    SCRATCH           2, 10, 2
905*c0909341SAndroid Build Coastguard Worker    SCRATCH           3, 11, 3
906*c0909341SAndroid Build Coastguard Worker    SCRATCH           4, 12, 4
907*c0909341SAndroid Build Coastguard Worker    SCRATCH           5, 13, 5
908*c0909341SAndroid Build Coastguard Worker    SCRATCH           6, 14, 6
909*c0909341SAndroid Build Coastguard Worker    SCRATCH           7, 15, 7
910*c0909341SAndroid Build Coastguard Worker%if %2
911*c0909341SAndroid Build Coastguard Worker    movd             m7, [base+hmul_bits+2+%3*2]
912*c0909341SAndroid Build Coastguard Worker    movd             m6, [base+pb_1]
913*c0909341SAndroid Build Coastguard Worker    punpcklwd        m7, m7
914*c0909341SAndroid Build Coastguard Worker    pshufd           m6, m6, q0000
915*c0909341SAndroid Build Coastguard Worker    pshufd           m7, m7, q0000
916*c0909341SAndroid Build Coastguard Worker    sub            bufq, 82*(73-35*%3)+44-(82*3+41)
917*c0909341SAndroid Build Coastguard Worker%else
918*c0909341SAndroid Build Coastguard Worker    sub            bufq, 82*69+3
919*c0909341SAndroid Build Coastguard Worker%endif
920*c0909341SAndroid Build Coastguard Worker    add           bufyq, 79+82*3
921*c0909341SAndroid Build Coastguard Worker    mov              hd, 70-35*%3
922*c0909341SAndroid Build Coastguard Worker.y_loop_ar2:
923*c0909341SAndroid Build Coastguard Worker    mov              xq, -(76>>%2)
924*c0909341SAndroid Build Coastguard Worker
925*c0909341SAndroid Build Coastguard Worker.x_loop_ar2:
926*c0909341SAndroid Build Coastguard Worker    pxor             m2, m2
927*c0909341SAndroid Build Coastguard Worker    movq             m0, [bufq+xq-82*2-2]   ; y=-2,x=[-2,+5]
928*c0909341SAndroid Build Coastguard Worker    movhps           m0, [bufq+xq-82*1-2]   ; y=-1,x=[-2,+5]
929*c0909341SAndroid Build Coastguard Worker    pcmpgtb          m2, m0
930*c0909341SAndroid Build Coastguard Worker    punpckhbw        m1, m0, m2
931*c0909341SAndroid Build Coastguard Worker    punpcklbw        m0, m2
932*c0909341SAndroid Build Coastguard Worker    psrldq           m5, m0, 2              ; y=-2,x=[-1,+5]
933*c0909341SAndroid Build Coastguard Worker    psrldq           m3, m1, 2              ; y=-1,x=[-1,+5]
934*c0909341SAndroid Build Coastguard Worker    psrldq           m4, m1, 4              ; y=-1,x=[+0,+5]
935*c0909341SAndroid Build Coastguard Worker    punpcklwd        m2, m0, m5
936*c0909341SAndroid Build Coastguard Worker    punpcklwd        m3, m4
937*c0909341SAndroid Build Coastguard Worker    pmaddwd          m2, m8
938*c0909341SAndroid Build Coastguard Worker    pmaddwd          m3, m11
939*c0909341SAndroid Build Coastguard Worker    paddd            m2, m3
940*c0909341SAndroid Build Coastguard Worker
941*c0909341SAndroid Build Coastguard Worker    psrldq           m4, m0, 4              ; y=-2,x=[+0,+5]
942*c0909341SAndroid Build Coastguard Worker    psrldq           m5, m0, 6              ; y=-2,x=[+1,+5]
943*c0909341SAndroid Build Coastguard Worker    psrldq           m0, 8                  ; y=-2,x=[+2,+5]
944*c0909341SAndroid Build Coastguard Worker    punpcklwd        m4, m5
945*c0909341SAndroid Build Coastguard Worker    punpcklwd        m0, m1
946*c0909341SAndroid Build Coastguard Worker    psrldq           m3, m1, 6              ; y=-1,x=[+1,+5]
947*c0909341SAndroid Build Coastguard Worker    psrldq           m1, m1, 8              ; y=-1,x=[+2,+5]
948*c0909341SAndroid Build Coastguard Worker    punpcklwd        m3, m1
949*c0909341SAndroid Build Coastguard Worker    pmaddwd          m4, m9
950*c0909341SAndroid Build Coastguard Worker    pmaddwd          m0, m10
951*c0909341SAndroid Build Coastguard Worker    pmaddwd          m3, m12
952*c0909341SAndroid Build Coastguard Worker    paddd            m4, m0
953*c0909341SAndroid Build Coastguard Worker    paddd            m2, m3
954*c0909341SAndroid Build Coastguard Worker    paddd            m2, m4
955*c0909341SAndroid Build Coastguard Worker
956*c0909341SAndroid Build Coastguard Worker%if %2
957*c0909341SAndroid Build Coastguard Worker    movq             m1, [bufyq+xq*2]
958*c0909341SAndroid Build Coastguard Worker%if %3
959*c0909341SAndroid Build Coastguard Worker    movq             m3, [bufyq+xq*2+82]
960*c0909341SAndroid Build Coastguard Worker%endif
961*c0909341SAndroid Build Coastguard Worker    pmaddubsw        m0, m6, m1
962*c0909341SAndroid Build Coastguard Worker%if %3
963*c0909341SAndroid Build Coastguard Worker    pmaddubsw        m1, m6, m3
964*c0909341SAndroid Build Coastguard Worker    paddw            m0, m1
965*c0909341SAndroid Build Coastguard Worker%endif
966*c0909341SAndroid Build Coastguard Worker    pmulhrsw         m0, m7
967*c0909341SAndroid Build Coastguard Worker%else
968*c0909341SAndroid Build Coastguard Worker    movd             m0, [bufyq+xq]
969*c0909341SAndroid Build Coastguard Worker    pxor             m1, m1
970*c0909341SAndroid Build Coastguard Worker    pcmpgtb          m1, m0
971*c0909341SAndroid Build Coastguard Worker    punpcklbw        m0, m1
972*c0909341SAndroid Build Coastguard Worker%endif
973*c0909341SAndroid Build Coastguard Worker    punpcklwd        m0, m15
974*c0909341SAndroid Build Coastguard Worker    pmaddwd          m0, m14
975*c0909341SAndroid Build Coastguard Worker    paddd            m2, m0
976*c0909341SAndroid Build Coastguard Worker
977*c0909341SAndroid Build Coastguard Worker    movq             m0, [bufq+xq-2]        ; y=0,x=[-2,+5]
978*c0909341SAndroid Build Coastguard Worker    pxor             m4, m4
979*c0909341SAndroid Build Coastguard Worker    movd             m5, [base+byte_blend+1]
980*c0909341SAndroid Build Coastguard Worker    punpcklbw        m5, m5
981*c0909341SAndroid Build Coastguard Worker.x_loop_ar2_inner:
982*c0909341SAndroid Build Coastguard Worker    pcmpgtb          m1, m4, m0
983*c0909341SAndroid Build Coastguard Worker    punpcklbw        m0, m1
984*c0909341SAndroid Build Coastguard Worker    pmaddwd          m3, m0, m13
985*c0909341SAndroid Build Coastguard Worker    paddd            m3, m2
986*c0909341SAndroid Build Coastguard Worker    psrldq           m2, 4                  ; shift top to next pixel
987*c0909341SAndroid Build Coastguard Worker    psrad            m3, [fg_dataq+FGData.ar_coeff_shift]
988*c0909341SAndroid Build Coastguard Worker    pslldq           m3, 4
989*c0909341SAndroid Build Coastguard Worker    pand             m3, m5
990*c0909341SAndroid Build Coastguard Worker    paddw            m0, m3
991*c0909341SAndroid Build Coastguard Worker    packsswb         m0, m0
992*c0909341SAndroid Build Coastguard Worker    movd    [bufq+xq-2], m0
993*c0909341SAndroid Build Coastguard Worker    psrldq           m0, 1
994*c0909341SAndroid Build Coastguard Worker    inc              xq
995*c0909341SAndroid Build Coastguard Worker    jz .x_loop_ar2_end
996*c0909341SAndroid Build Coastguard Worker    test             xq, 3
997*c0909341SAndroid Build Coastguard Worker    jnz .x_loop_ar2_inner
998*c0909341SAndroid Build Coastguard Worker    jmp .x_loop_ar2
999*c0909341SAndroid Build Coastguard Worker
1000*c0909341SAndroid Build Coastguard Worker.x_loop_ar2_end:
1001*c0909341SAndroid Build Coastguard Worker    add            bufq, 82
1002*c0909341SAndroid Build Coastguard Worker    add           bufyq, 82<<%3
1003*c0909341SAndroid Build Coastguard Worker    dec              hd
1004*c0909341SAndroid Build Coastguard Worker    jg .y_loop_ar2
1005*c0909341SAndroid Build Coastguard Worker    RET
1006*c0909341SAndroid Build Coastguard Worker
1007*c0909341SAndroid Build Coastguard Worker.ar3:
1008*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
1009*c0909341SAndroid Build Coastguard Worker    RESET_STACK_STATE
1010*c0909341SAndroid Build Coastguard Worker%endif
1011*c0909341SAndroid Build Coastguard Worker    DEFINE_ARGS buf, bufy, fg_data, uv, unused, shift
1012*c0909341SAndroid Build Coastguard Worker    movifnidn     bufyq, bufymp
1013*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
1014*c0909341SAndroid Build Coastguard Worker    ALLOC_STACK  -15*16
1015*c0909341SAndroid Build Coastguard Worker%else
1016*c0909341SAndroid Build Coastguard Worker    SUB             rsp, 16*7
1017*c0909341SAndroid Build Coastguard Worker%assign stack_size_padded (stack_size_padded+16*7)
1018*c0909341SAndroid Build Coastguard Worker%assign stack_size (stack_size+16*7)
1019*c0909341SAndroid Build Coastguard Worker%endif
1020*c0909341SAndroid Build Coastguard Worker    mov          shiftd, [fg_dataq+FGData.ar_coeff_shift]
1021*c0909341SAndroid Build Coastguard Worker    imul            uvd, 28
1022*c0909341SAndroid Build Coastguard Worker
1023*c0909341SAndroid Build Coastguard Worker    movu             m0, [fg_dataq+FGData.ar_coeffs_uv+uvq+ 0]   ; cf0-15
1024*c0909341SAndroid Build Coastguard Worker    pxor             m3, m3
1025*c0909341SAndroid Build Coastguard Worker    pcmpgtb          m3, m0
1026*c0909341SAndroid Build Coastguard Worker    punpckhbw        m1, m0, m3
1027*c0909341SAndroid Build Coastguard Worker    punpcklbw        m0, m3
1028*c0909341SAndroid Build Coastguard Worker    pshufd           m2, m0, q1111
1029*c0909341SAndroid Build Coastguard Worker    pshufd           m3, m0, q2222
1030*c0909341SAndroid Build Coastguard Worker    pshufd           m4, m0, q3333
1031*c0909341SAndroid Build Coastguard Worker    pshufd           m0, m0, q0000
1032*c0909341SAndroid Build Coastguard Worker    pshufd           m5, m1, q1111
1033*c0909341SAndroid Build Coastguard Worker    pshufd           m6, m1, q2222
1034*c0909341SAndroid Build Coastguard Worker    pshufd           m7, m1, q3333
1035*c0909341SAndroid Build Coastguard Worker    pshufd           m1, m1, q0000
1036*c0909341SAndroid Build Coastguard Worker    mova    [rsp+ 0*16], m0
1037*c0909341SAndroid Build Coastguard Worker    mova    [rsp+ 1*16], m2
1038*c0909341SAndroid Build Coastguard Worker    mova    [rsp+ 2*16], m3
1039*c0909341SAndroid Build Coastguard Worker    mova    [rsp+ 3*16], m4
1040*c0909341SAndroid Build Coastguard Worker    mova    [rsp+ 4*16], m1
1041*c0909341SAndroid Build Coastguard Worker    mova    [rsp+ 5*16], m5
1042*c0909341SAndroid Build Coastguard Worker    mova    [rsp+ 6*16], m6
1043*c0909341SAndroid Build Coastguard Worker    SCRATCH           7, 8, 7
1044*c0909341SAndroid Build Coastguard Worker
1045*c0909341SAndroid Build Coastguard Worker    movu             m2, [fg_dataq+FGData.ar_coeffs_uv+uvq+16]   ; cf16-24 [24=luma]
1046*c0909341SAndroid Build Coastguard Worker    pxor             m4, m4
1047*c0909341SAndroid Build Coastguard Worker    pcmpgtb          m4, m2
1048*c0909341SAndroid Build Coastguard Worker    punpckhbw        m5, m2, m4
1049*c0909341SAndroid Build Coastguard Worker    punpcklbw        m2, m4
1050*c0909341SAndroid Build Coastguard Worker    pshufd           m4, m2, q3232
1051*c0909341SAndroid Build Coastguard Worker    punpcklwd        m3, m4, m5
1052*c0909341SAndroid Build Coastguard Worker    pshuflw          m5, m4, q3321
1053*c0909341SAndroid Build Coastguard Worker    pshufd           m4, m3, q0000
1054*c0909341SAndroid Build Coastguard Worker    pshufd           m3, m2, q1111
1055*c0909341SAndroid Build Coastguard Worker    pshufd           m2, m2, q0000
1056*c0909341SAndroid Build Coastguard Worker    pinsrw           m5, [base+round_vals+shiftq*2-10], 3
1057*c0909341SAndroid Build Coastguard Worker    SCRATCH           2, 9,  8
1058*c0909341SAndroid Build Coastguard Worker    SCRATCH           3, 10, 9
1059*c0909341SAndroid Build Coastguard Worker    SCRATCH           4, 11, 10
1060*c0909341SAndroid Build Coastguard Worker    SCRATCH           5, 12, 11
1061*c0909341SAndroid Build Coastguard Worker
1062*c0909341SAndroid Build Coastguard Worker    movd             m2, [base+round_vals-12+shiftq*2]
1063*c0909341SAndroid Build Coastguard Worker%if %2
1064*c0909341SAndroid Build Coastguard Worker    movd             m1, [base+pb_1]
1065*c0909341SAndroid Build Coastguard Worker    movd             m3, [base+hmul_bits+2+%3*2]
1066*c0909341SAndroid Build Coastguard Worker%endif
1067*c0909341SAndroid Build Coastguard Worker    pxor             m0, m0
1068*c0909341SAndroid Build Coastguard Worker    punpcklwd        m2, m0
1069*c0909341SAndroid Build Coastguard Worker%if %2
1070*c0909341SAndroid Build Coastguard Worker    punpcklwd        m3, m3
1071*c0909341SAndroid Build Coastguard Worker%endif
1072*c0909341SAndroid Build Coastguard Worker    pshufd           m2, m2, q0000
1073*c0909341SAndroid Build Coastguard Worker%if %2
1074*c0909341SAndroid Build Coastguard Worker    pshufd           m1, m1, q0000
1075*c0909341SAndroid Build Coastguard Worker    pshufd           m3, m3, q0000
1076*c0909341SAndroid Build Coastguard Worker    SCRATCH           1, 13, 12
1077*c0909341SAndroid Build Coastguard Worker%endif
1078*c0909341SAndroid Build Coastguard Worker    SCRATCH           2, 14, 13
1079*c0909341SAndroid Build Coastguard Worker%if %2
1080*c0909341SAndroid Build Coastguard Worker    SCRATCH           3, 15, 14
1081*c0909341SAndroid Build Coastguard Worker%endif
1082*c0909341SAndroid Build Coastguard Worker
1083*c0909341SAndroid Build Coastguard Worker    DEFINE_ARGS buf, bufy, fg_data, h, unused, x
1084*c0909341SAndroid Build Coastguard Worker%if %2
1085*c0909341SAndroid Build Coastguard Worker    sub            bufq, 82*(73-35*%3)+44-(82*3+41)
1086*c0909341SAndroid Build Coastguard Worker%else
1087*c0909341SAndroid Build Coastguard Worker    sub            bufq, 82*69+3
1088*c0909341SAndroid Build Coastguard Worker%endif
1089*c0909341SAndroid Build Coastguard Worker    add           bufyq, 79+82*3
1090*c0909341SAndroid Build Coastguard Worker    mov              hd, 70-35*%3
1091*c0909341SAndroid Build Coastguard Worker.y_loop_ar3:
1092*c0909341SAndroid Build Coastguard Worker    mov              xq, -(76>>%2)
1093*c0909341SAndroid Build Coastguard Worker
1094*c0909341SAndroid Build Coastguard Worker.x_loop_ar3:
1095*c0909341SAndroid Build Coastguard Worker    movu             m0, [bufq+xq-82*3-3]   ; y=-3,x=[-3,+12]
1096*c0909341SAndroid Build Coastguard Worker    pxor             m4, m4
1097*c0909341SAndroid Build Coastguard Worker    pcmpgtb          m4, m0
1098*c0909341SAndroid Build Coastguard Worker    punpckhbw        m3, m0, m4
1099*c0909341SAndroid Build Coastguard Worker    punpcklbw        m0, m4
1100*c0909341SAndroid Build Coastguard Worker
1101*c0909341SAndroid Build Coastguard Worker    psrldq           m5, m0, 2
1102*c0909341SAndroid Build Coastguard Worker    psrldq           m6, m0, 4
1103*c0909341SAndroid Build Coastguard Worker    psrldq           m7, m0, 6
1104*c0909341SAndroid Build Coastguard Worker    punpcklwd        m4, m0, m5
1105*c0909341SAndroid Build Coastguard Worker    punpcklwd        m6, m7
1106*c0909341SAndroid Build Coastguard Worker    pmaddwd          m4, [rsp+ 0*16]
1107*c0909341SAndroid Build Coastguard Worker    pmaddwd          m6, [rsp+ 1*16]
1108*c0909341SAndroid Build Coastguard Worker    paddd            m4, m6
1109*c0909341SAndroid Build Coastguard Worker
1110*c0909341SAndroid Build Coastguard Worker    palignr          m2, m3, m0, 10
1111*c0909341SAndroid Build Coastguard Worker    palignr          m3, m0, 12
1112*c0909341SAndroid Build Coastguard Worker    psrldq           m0, 8
1113*c0909341SAndroid Build Coastguard Worker
1114*c0909341SAndroid Build Coastguard Worker    movu             m1, [bufq+xq-82*2-3]   ; y=-2,x=[-3,+12]
1115*c0909341SAndroid Build Coastguard Worker    pxor             m6, m6
1116*c0909341SAndroid Build Coastguard Worker    pcmpgtb          m6, m1
1117*c0909341SAndroid Build Coastguard Worker    punpckhbw        m5, m1, m6
1118*c0909341SAndroid Build Coastguard Worker    punpcklbw        m1, m6
1119*c0909341SAndroid Build Coastguard Worker
1120*c0909341SAndroid Build Coastguard Worker    punpcklwd        m0, m2
1121*c0909341SAndroid Build Coastguard Worker    punpcklwd        m3, m1
1122*c0909341SAndroid Build Coastguard Worker    pmaddwd          m0, [rsp+ 2*16]
1123*c0909341SAndroid Build Coastguard Worker    pmaddwd          m3, [rsp+ 3*16]
1124*c0909341SAndroid Build Coastguard Worker    paddd            m0, m3
1125*c0909341SAndroid Build Coastguard Worker    paddd            m0, m4
1126*c0909341SAndroid Build Coastguard Worker
1127*c0909341SAndroid Build Coastguard Worker    movu             m2, [bufq+xq-82*1-3]   ; y=-1,x=[-3,+12]
1128*c0909341SAndroid Build Coastguard Worker    pxor             m7, m7
1129*c0909341SAndroid Build Coastguard Worker    pcmpgtb          m7, m2
1130*c0909341SAndroid Build Coastguard Worker    punpckhbw        m6, m2, m7
1131*c0909341SAndroid Build Coastguard Worker    punpcklbw        m2, m7
1132*c0909341SAndroid Build Coastguard Worker
1133*c0909341SAndroid Build Coastguard Worker    palignr          m3, m5, m1, 10
1134*c0909341SAndroid Build Coastguard Worker    palignr          m5, m1, 12
1135*c0909341SAndroid Build Coastguard Worker    psrldq           m4, m2, 2
1136*c0909341SAndroid Build Coastguard Worker
1137*c0909341SAndroid Build Coastguard Worker    punpcklwd        m3, m5
1138*c0909341SAndroid Build Coastguard Worker    punpcklwd        m5, m2, m4
1139*c0909341SAndroid Build Coastguard Worker    pmaddwd          m3, [rsp+ 6*16]
1140*c0909341SAndroid Build Coastguard Worker    pmaddwd          m5, m8
1141*c0909341SAndroid Build Coastguard Worker    paddd            m3, m5
1142*c0909341SAndroid Build Coastguard Worker    paddd            m0, m3
1143*c0909341SAndroid Build Coastguard Worker
1144*c0909341SAndroid Build Coastguard Worker    psrldq           m3, m1, 2
1145*c0909341SAndroid Build Coastguard Worker    psrldq           m4, m1, 4
1146*c0909341SAndroid Build Coastguard Worker    psrldq           m5, m1, 6
1147*c0909341SAndroid Build Coastguard Worker    psrldq           m1, 8
1148*c0909341SAndroid Build Coastguard Worker
1149*c0909341SAndroid Build Coastguard Worker    punpcklwd        m3, m4
1150*c0909341SAndroid Build Coastguard Worker    punpcklwd        m5, m1
1151*c0909341SAndroid Build Coastguard Worker    pmaddwd          m3, [rsp+ 4*16]
1152*c0909341SAndroid Build Coastguard Worker    pmaddwd          m5, [rsp+ 5*16]
1153*c0909341SAndroid Build Coastguard Worker    paddd            m3, m5
1154*c0909341SAndroid Build Coastguard Worker    paddd            m0, m3
1155*c0909341SAndroid Build Coastguard Worker
1156*c0909341SAndroid Build Coastguard Worker%if %2
1157*c0909341SAndroid Build Coastguard Worker    movq             m1, [bufyq+xq*2]
1158*c0909341SAndroid Build Coastguard Worker%if %3
1159*c0909341SAndroid Build Coastguard Worker    movq             m3, [bufyq+xq*2+82]
1160*c0909341SAndroid Build Coastguard Worker%endif
1161*c0909341SAndroid Build Coastguard Worker    pmaddubsw        m7, m13, m1
1162*c0909341SAndroid Build Coastguard Worker%if %3
1163*c0909341SAndroid Build Coastguard Worker    pmaddubsw        m5, m13, m3
1164*c0909341SAndroid Build Coastguard Worker    paddw            m7, m5
1165*c0909341SAndroid Build Coastguard Worker%endif
1166*c0909341SAndroid Build Coastguard Worker    pmulhrsw         m7, m15
1167*c0909341SAndroid Build Coastguard Worker%else
1168*c0909341SAndroid Build Coastguard Worker    movd             m7, [bufyq+xq]
1169*c0909341SAndroid Build Coastguard Worker    pxor             m1, m1
1170*c0909341SAndroid Build Coastguard Worker    pcmpgtb          m1, m7
1171*c0909341SAndroid Build Coastguard Worker    punpcklbw        m7, m1
1172*c0909341SAndroid Build Coastguard Worker%endif
1173*c0909341SAndroid Build Coastguard Worker
1174*c0909341SAndroid Build Coastguard Worker    psrldq           m1, m2, 4
1175*c0909341SAndroid Build Coastguard Worker    psrldq           m3, m2, 6
1176*c0909341SAndroid Build Coastguard Worker    palignr          m4, m6, m2, 10
1177*c0909341SAndroid Build Coastguard Worker    palignr          m6, m2, 12
1178*c0909341SAndroid Build Coastguard Worker    psrldq           m2, 8
1179*c0909341SAndroid Build Coastguard Worker
1180*c0909341SAndroid Build Coastguard Worker    punpcklwd        m1, m3
1181*c0909341SAndroid Build Coastguard Worker    punpcklwd        m2, m4
1182*c0909341SAndroid Build Coastguard Worker    punpcklwd        m6, m7
1183*c0909341SAndroid Build Coastguard Worker    pmaddwd          m1, m9
1184*c0909341SAndroid Build Coastguard Worker    pmaddwd          m2, m10
1185*c0909341SAndroid Build Coastguard Worker    pmaddwd          m6, m11
1186*c0909341SAndroid Build Coastguard Worker    paddd            m1, m2
1187*c0909341SAndroid Build Coastguard Worker    paddd            m0, m6
1188*c0909341SAndroid Build Coastguard Worker    paddd            m0, m1
1189*c0909341SAndroid Build Coastguard Worker    paddd            m0, m14
1190*c0909341SAndroid Build Coastguard Worker
1191*c0909341SAndroid Build Coastguard Worker    movq             m1, [bufq+xq-3]        ; y=0,x=[-3,+4]
1192*c0909341SAndroid Build Coastguard Worker    pxor             m4, m4
1193*c0909341SAndroid Build Coastguard Worker    movd             m5, [base+byte_blend]
1194*c0909341SAndroid Build Coastguard Worker.x_loop_ar3_inner:
1195*c0909341SAndroid Build Coastguard Worker    pcmpgtb          m2, m4, m1
1196*c0909341SAndroid Build Coastguard Worker    punpcklbw        m3, m1, m2
1197*c0909341SAndroid Build Coastguard Worker    pmaddwd          m2, m3, m12
1198*c0909341SAndroid Build Coastguard Worker    pshufd           m3, m2, q1111
1199*c0909341SAndroid Build Coastguard Worker    paddd            m2, m3                 ; left+cur
1200*c0909341SAndroid Build Coastguard Worker    paddd            m2, m0                 ; add top
1201*c0909341SAndroid Build Coastguard Worker    psrldq           m0, 4
1202*c0909341SAndroid Build Coastguard Worker    psrad            m2, [fg_dataq+FGData.ar_coeff_shift]
1203*c0909341SAndroid Build Coastguard Worker    ; don't packssdw, we only care about one value
1204*c0909341SAndroid Build Coastguard Worker    packsswb         m2, m2
1205*c0909341SAndroid Build Coastguard Worker    pandn            m3, m5, m1
1206*c0909341SAndroid Build Coastguard Worker    pslld            m2, 24
1207*c0909341SAndroid Build Coastguard Worker    pand             m2, m5
1208*c0909341SAndroid Build Coastguard Worker    por              m1, m2, m3
1209*c0909341SAndroid Build Coastguard Worker    movd    [bufq+xq-3], m1
1210*c0909341SAndroid Build Coastguard Worker    psrldq           m1, 1
1211*c0909341SAndroid Build Coastguard Worker    inc              xq
1212*c0909341SAndroid Build Coastguard Worker    jz .x_loop_ar3_end
1213*c0909341SAndroid Build Coastguard Worker    test             xq, 3
1214*c0909341SAndroid Build Coastguard Worker    jnz .x_loop_ar3_inner
1215*c0909341SAndroid Build Coastguard Worker    jmp .x_loop_ar3
1216*c0909341SAndroid Build Coastguard Worker
1217*c0909341SAndroid Build Coastguard Worker.x_loop_ar3_end:
1218*c0909341SAndroid Build Coastguard Worker    add            bufq, 82
1219*c0909341SAndroid Build Coastguard Worker    add           bufyq, 82<<%3
1220*c0909341SAndroid Build Coastguard Worker    dec              hd
1221*c0909341SAndroid Build Coastguard Worker    jg .y_loop_ar3
1222*c0909341SAndroid Build Coastguard Worker    RET
1223*c0909341SAndroid Build Coastguard Worker%endmacro
1224*c0909341SAndroid Build Coastguard Worker
; Instantiate the chroma grain generator three times.
; As used in the macro body:
;   2nd argument (%2): when non-zero, the luma grain is read at bufy+x*2 and
;       adjacent bytes are pair-summed (pmaddubsw), and the per-row x count
;       becomes 76>>%2.
;   3rd argument (%3): when non-zero, a second luma row (+82) is added in,
;       bufy advances by 82<<%3 per output row, and the row count is 70-35*%3.
; NOTE(review): the 1st argument is presumably the function-name suffix /
; chroma layout tag; the macro header is not visible here -- confirm.
1225*c0909341SAndroid Build Coastguard Workergenerate_grain_uv_fn 420, 1, 1
1226*c0909341SAndroid Build Coastguard Workergenerate_grain_uv_fn 422, 1, 0
1227*c0909341SAndroid Build Coastguard Workergenerate_grain_uv_fn 444, 0, 0
1228*c0909341SAndroid Build Coastguard Worker
; vpgatherdw: gather eight 16-bit values using scalar loads.
; Each 16-bit lane of src is zero-extended and added to base as a byte
; offset; the word found there is written to the same lane of dst.
;   %1 dst          xmm register that receives the eight gathered words
;   %2 src          xmm register holding the eight 16-bit offsets
;   %3 base         base register/expression used inside [%3+offset]
;   %4, %5          scratch GPRs, passed without a size suffix (the macro
;                   appends d/w itself); both are overwritten
;   %6 tmp_xmm_reg  optional xmm scratch. If omitted, src (%2) is used as the
;                   scratch register and its contents are destroyed.
; Notes:
;   - dst must not alias src or the scratch xmm: dst is written in the first
;     iteration while later offsets are still being read from the scratch.
;   - Lane 0 is fetched with movd, i.e. 4 bytes are read at base+offset[0];
;     lane 1 is then overwritten by the following pinsrw.
;   - shr modifies the flags.
1229*c0909341SAndroid Build Coastguard Worker%macro vpgatherdw 5-6 ; dst, src, base, tmp_gpr[x2], tmp_xmm_reg
; %%idx = index of the first dst lane filled by the current %rep iteration
1230*c0909341SAndroid Build Coastguard Worker%assign %%idx 0
; scratch xmm defaults to src itself; a 6th argument overrides it
1231*c0909341SAndroid Build Coastguard Worker%define %%tmp %2
1232*c0909341SAndroid Build Coastguard Worker%if %0 == 6
1233*c0909341SAndroid Build Coastguard Worker%define %%tmp %6
1234*c0909341SAndroid Build Coastguard Worker%endif
; four iterations, two lanes (one dword of offsets) per iteration
1235*c0909341SAndroid Build Coastguard Worker%rep 4
1236*c0909341SAndroid Build Coastguard Worker%if %%idx == 0
1237*c0909341SAndroid Build Coastguard Worker    movd        %5 %+ d, %2 ; offsets for lanes 0 and 1
1238*c0909341SAndroid Build Coastguard Worker    pshuflw       %%tmp, %2, q3232 ; scratch low dword = offsets 2,3; high qword (offsets 4-7) is copied as-is
1239*c0909341SAndroid Build Coastguard Worker%else
1240*c0909341SAndroid Build Coastguard Worker    movd        %5 %+ d, %%tmp ; next pair of offsets from the scratch low dword
1241*c0909341SAndroid Build Coastguard Worker%if %%idx == 2
1242*c0909341SAndroid Build Coastguard Worker    punpckhqdq    %%tmp, %%tmp ; bring offsets 4-7 down into the low qword
1243*c0909341SAndroid Build Coastguard Worker%elif %%idx == 4
1244*c0909341SAndroid Build Coastguard Worker    psrlq         %%tmp, 32 ; bring offsets 6,7 down into the low dword
1245*c0909341SAndroid Build Coastguard Worker%endif
1246*c0909341SAndroid Build Coastguard Worker%endif
1247*c0909341SAndroid Build Coastguard Worker    movzx       %4 %+ d, %5 %+ w ; %4 = offset for the even lane (zero-extended)
1248*c0909341SAndroid Build Coastguard Worker    shr         %5 %+ d, 16 ; %5 = offset for the odd lane
1249*c0909341SAndroid Build Coastguard Worker
1250*c0909341SAndroid Build Coastguard Worker%if %%idx == 0
1251*c0909341SAndroid Build Coastguard Worker    movd             %1, [%3+%4] ; lane 0 (dword load; also clears the rest of dst)
1252*c0909341SAndroid Build Coastguard Worker%else
1253*c0909341SAndroid Build Coastguard Worker    pinsrw           %1, [%3+%4], %%idx + 0 ; even lane of this pair
1254*c0909341SAndroid Build Coastguard Worker%endif
1255*c0909341SAndroid Build Coastguard Worker    pinsrw           %1, [%3+%5], %%idx + 1 ; odd lane of this pair
1256*c0909341SAndroid Build Coastguard Worker%assign %%idx %%idx+2
1257*c0909341SAndroid Build Coastguard Worker%endrep
1258*c0909341SAndroid Build Coastguard Worker%endmacro
1259*c0909341SAndroid Build Coastguard Worker
1260*c0909341SAndroid Build Coastguard WorkerINIT_XMM ssse3
1261*c0909341SAndroid Build Coastguard Worker; fgy_32x32xn(dst, src, stride, fg_data, w, scaling, grain_lut, h, sby)
1262*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
1263*c0909341SAndroid Build Coastguard Worker%if STACK_ALIGNMENT < mmsize
1264*c0909341SAndroid Build Coastguard Workercglobal fgy_32x32xn_8bpc, 0, 7, 16, 0 - (5 * mmsize + 16 * gprsize), \
1265*c0909341SAndroid Build Coastguard Worker        dst, src, scaling, unused1, fg_data, picptr, unused2
1266*c0909341SAndroid Build Coastguard Worker    ; copy stack arguments to new position post-alignment, so that we
1267*c0909341SAndroid Build Coastguard Worker    ; don't have to keep the old stack location in a separate register
1268*c0909341SAndroid Build Coastguard Worker    mov              r0, r0m
1269*c0909341SAndroid Build Coastguard Worker    mov              r1, r2m
1270*c0909341SAndroid Build Coastguard Worker    mov              r2, r4m
1271*c0909341SAndroid Build Coastguard Worker    mov              r3, r6m
1272*c0909341SAndroid Build Coastguard Worker    mov              r4, r7m
1273*c0909341SAndroid Build Coastguard Worker    mov              r5, r8m
1274*c0909341SAndroid Build Coastguard Worker
1275*c0909341SAndroid Build Coastguard Worker    mov [rsp+5*mmsize+ 4*gprsize], r0
1276*c0909341SAndroid Build Coastguard Worker    mov [rsp+5*mmsize+ 6*gprsize], r1
1277*c0909341SAndroid Build Coastguard Worker    mov [rsp+5*mmsize+ 8*gprsize], r2
1278*c0909341SAndroid Build Coastguard Worker    mov [rsp+5*mmsize+10*gprsize], r3
1279*c0909341SAndroid Build Coastguard Worker    mov [rsp+5*mmsize+11*gprsize], r4
1280*c0909341SAndroid Build Coastguard Worker    mov [rsp+5*mmsize+12*gprsize], r5
1281*c0909341SAndroid Build Coastguard Worker%else
1282*c0909341SAndroid Build Coastguard Workercglobal fgy_32x32xn_8bpc, 0, 7, 16, 5 * mmsize + 4 * gprsize, \
1283*c0909341SAndroid Build Coastguard Worker        dst, src, scaling, unused1, fg_data, picptr, unused2
1284*c0909341SAndroid Build Coastguard Worker%endif
1285*c0909341SAndroid Build Coastguard Worker    mov            srcq, srcm
1286*c0909341SAndroid Build Coastguard Worker    mov        fg_dataq, r3m
1287*c0909341SAndroid Build Coastguard Worker    mov        scalingq, r5m
1288*c0909341SAndroid Build Coastguard Worker%if STACK_ALIGNMENT < mmsize
1289*c0909341SAndroid Build Coastguard Worker%define r0m [rsp+5*mmsize+ 4*gprsize]
1290*c0909341SAndroid Build Coastguard Worker%define r1m [rsp+5*mmsize+ 5*gprsize]
1291*c0909341SAndroid Build Coastguard Worker%define r2m [rsp+5*mmsize+ 6*gprsize]
1292*c0909341SAndroid Build Coastguard Worker%define r3m [rsp+5*mmsize+ 7*gprsize]
1293*c0909341SAndroid Build Coastguard Worker%define r4m [rsp+5*mmsize+ 8*gprsize]
1294*c0909341SAndroid Build Coastguard Worker%define r5m [rsp+5*mmsize+ 9*gprsize]
1295*c0909341SAndroid Build Coastguard Worker%define r6m [rsp+5*mmsize+10*gprsize]
1296*c0909341SAndroid Build Coastguard Worker%define r7m [rsp+5*mmsize+11*gprsize]
1297*c0909341SAndroid Build Coastguard Worker%define r8m [rsp+5*mmsize+12*gprsize]
1298*c0909341SAndroid Build Coastguard Worker%endif
1299*c0909341SAndroid Build Coastguard Worker    LEA              r5, pb_mask
1300*c0909341SAndroid Build Coastguard Worker%define base r5-pb_mask
1301*c0909341SAndroid Build Coastguard Worker    mov             r5m, picptrq
1302*c0909341SAndroid Build Coastguard Worker%else
1303*c0909341SAndroid Build Coastguard Workercglobal fgy_32x32xn_8bpc, 6, 15, 16, dst, src, stride, fg_data, w, scaling, grain_lut
1304*c0909341SAndroid Build Coastguard Worker    lea              r7, [pb_mask]
1305*c0909341SAndroid Build Coastguard Worker%define base r7-pb_mask
1306*c0909341SAndroid Build Coastguard Worker%endif
1307*c0909341SAndroid Build Coastguard Worker    mov             r6d, [fg_dataq+FGData.scaling_shift]
1308*c0909341SAndroid Build Coastguard Worker    movd             m3, [base+mul_bits+r6*2-14]
1309*c0909341SAndroid Build Coastguard Worker    mov             r6d, [fg_dataq+FGData.clip_to_restricted_range]
1310*c0909341SAndroid Build Coastguard Worker    movd             m4, [base+max+r6*4]
1311*c0909341SAndroid Build Coastguard Worker    movd             m5, [base+min+r6*2]
1312*c0909341SAndroid Build Coastguard Worker    punpcklwd        m3, m3
1313*c0909341SAndroid Build Coastguard Worker    punpcklwd        m4, m4
1314*c0909341SAndroid Build Coastguard Worker    punpcklwd        m5, m5
1315*c0909341SAndroid Build Coastguard Worker    pshufd           m3, m3, q0000
1316*c0909341SAndroid Build Coastguard Worker    pshufd           m4, m4, q0000
1317*c0909341SAndroid Build Coastguard Worker    pshufd           m5, m5, q0000
1318*c0909341SAndroid Build Coastguard Worker    SCRATCH           3, 11, 0
1319*c0909341SAndroid Build Coastguard Worker    SCRATCH           4, 12, 1
1320*c0909341SAndroid Build Coastguard Worker    SCRATCH           5, 13, 2
1321*c0909341SAndroid Build Coastguard Worker
1322*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
1323*c0909341SAndroid Build Coastguard Worker    DEFINE_ARGS dst, src, scaling, sby, fg_data, picptr, overlap
1324*c0909341SAndroid Build Coastguard Worker%else
1325*c0909341SAndroid Build Coastguard Worker    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, unused, sby, see, overlap
1326*c0909341SAndroid Build Coastguard Worker%endif
1327*c0909341SAndroid Build Coastguard Worker
1328*c0909341SAndroid Build Coastguard Worker    mov            sbyd, r8m
1329*c0909341SAndroid Build Coastguard Worker    mov        overlapd, [fg_dataq+FGData.overlap_flag] ; left_overlap: overlap & 1
1330*c0909341SAndroid Build Coastguard Worker    test       overlapd, overlapd
1331*c0909341SAndroid Build Coastguard Worker    jz .no_vertical_overlap
1332*c0909341SAndroid Build Coastguard Worker    mova             m6, [base+pw_1024]
1333*c0909341SAndroid Build Coastguard Worker    mova             m7, [base+pb_27_17_17_27]
1334*c0909341SAndroid Build Coastguard Worker    SCRATCH           6, 14, 3
1335*c0909341SAndroid Build Coastguard Worker    SCRATCH           7, 15, 4
1336*c0909341SAndroid Build Coastguard Worker    test           sbyd, sbyd
1337*c0909341SAndroid Build Coastguard Worker    jnz .vertical_overlap
1338*c0909341SAndroid Build Coastguard Worker    ; fall-through
1339*c0909341SAndroid Build Coastguard Worker
1340*c0909341SAndroid Build Coastguard Worker.no_vertical_overlap:
1341*c0909341SAndroid Build Coastguard Worker    mov             r8m, overlapd
1342*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
1343*c0909341SAndroid Build Coastguard Worker    DEFINE_ARGS dst, src, scaling, see, fg_data, picptr, unused
1344*c0909341SAndroid Build Coastguard Worker    imul           seed, (173 << 24) | 37
1345*c0909341SAndroid Build Coastguard Worker%else
1346*c0909341SAndroid Build Coastguard Worker    imul           seed, sbyd, (173 << 24) | 37
1347*c0909341SAndroid Build Coastguard Worker%endif
1348*c0909341SAndroid Build Coastguard Worker    add            seed, (105 << 24) | 178
1349*c0909341SAndroid Build Coastguard Worker    rol            seed, 8
1350*c0909341SAndroid Build Coastguard Worker    movzx          seed, seew
1351*c0909341SAndroid Build Coastguard Worker    xor            seed, [fg_dataq+FGData.seed]
1352*c0909341SAndroid Build Coastguard Worker
1353*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
1354*c0909341SAndroid Build Coastguard Worker    DEFINE_ARGS dst, src, scaling, see, w, picptr, src_bak
1355*c0909341SAndroid Build Coastguard Worker
1356*c0909341SAndroid Build Coastguard Worker    mov             r3m, seed
1357*c0909341SAndroid Build Coastguard Worker    mov              wq, r4m
1358*c0909341SAndroid Build Coastguard Worker%else
1359*c0909341SAndroid Build Coastguard Worker    DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \
1360*c0909341SAndroid Build Coastguard Worker                unused1, unused2, see, unused3
1361*c0909341SAndroid Build Coastguard Worker%endif
1362*c0909341SAndroid Build Coastguard Worker
1363*c0909341SAndroid Build Coastguard Worker    lea        src_bakq, [srcq+wq]
1364*c0909341SAndroid Build Coastguard Worker    neg              wq
1365*c0909341SAndroid Build Coastguard Worker    sub           dstmp, srcq
1366*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
1367*c0909341SAndroid Build Coastguard Worker    mov             r1m, src_bakq
1368*c0909341SAndroid Build Coastguard Worker    mov             r4m, wq
1369*c0909341SAndroid Build Coastguard Worker    DEFINE_ARGS dst, src, scaling, see, unused1, unused2, unused3
1370*c0909341SAndroid Build Coastguard Worker%endif
1371*c0909341SAndroid Build Coastguard Worker
1372*c0909341SAndroid Build Coastguard Worker.loop_x:
1373*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
1374*c0909341SAndroid Build Coastguard Worker    mov            seed, r3m
1375*c0909341SAndroid Build Coastguard Worker%endif
1376*c0909341SAndroid Build Coastguard Worker    mov             r6d, seed
1377*c0909341SAndroid Build Coastguard Worker    or             seed, 0xEFF4
1378*c0909341SAndroid Build Coastguard Worker    shr             r6d, 1
1379*c0909341SAndroid Build Coastguard Worker    test           seeb, seeh
1380*c0909341SAndroid Build Coastguard Worker    lea            seed, [r6+0x8000]
1381*c0909341SAndroid Build Coastguard Worker    cmovp          seed, r6d                ; updated seed
1382*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
1383*c0909341SAndroid Build Coastguard Worker    mov             r3m, seed
1384*c0909341SAndroid Build Coastguard Worker
1385*c0909341SAndroid Build Coastguard Worker    DEFINE_ARGS dst, src, scaling, offy, unused1, unused2, offx
1386*c0909341SAndroid Build Coastguard Worker
1387*c0909341SAndroid Build Coastguard Worker    mov           offxd, offyd
1388*c0909341SAndroid Build Coastguard Worker%else
1389*c0909341SAndroid Build Coastguard Worker    DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \
1390*c0909341SAndroid Build Coastguard Worker                offx, offy, see, unused
1391*c0909341SAndroid Build Coastguard Worker
1392*c0909341SAndroid Build Coastguard Worker    mov           offyd, seed
1393*c0909341SAndroid Build Coastguard Worker    mov           offxd, seed
1394*c0909341SAndroid Build Coastguard Worker%endif
1395*c0909341SAndroid Build Coastguard Worker    ror           offyd, 8
1396*c0909341SAndroid Build Coastguard Worker    shr           offxd, 12
1397*c0909341SAndroid Build Coastguard Worker    and           offyd, 0xf
1398*c0909341SAndroid Build Coastguard Worker    imul          offyd, 164
1399*c0909341SAndroid Build Coastguard Worker    lea           offyq, [offyq+offxq*2+747] ; offy*stride+offx
1400*c0909341SAndroid Build Coastguard Worker
1401*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
1402*c0909341SAndroid Build Coastguard Worker    ; r0m=dst, r1m=src_bak, r2m=stride, r3m=see, r4m=w, r5m=picptr,
1403*c0909341SAndroid Build Coastguard Worker    ; r6m=grain_lut, r7m=h, r8m=overlap_v|h
1404*c0909341SAndroid Build Coastguard Worker    DEFINE_ARGS dst, src, scaling, offxy, h, picptr, grain_lut
1405*c0909341SAndroid Build Coastguard Worker%else
1406*c0909341SAndroid Build Coastguard Worker    DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \
1407*c0909341SAndroid Build Coastguard Worker                h, offxy, see, unused
1408*c0909341SAndroid Build Coastguard Worker%endif
1409*c0909341SAndroid Build Coastguard Worker
1410*c0909341SAndroid Build Coastguard Worker.loop_x_odd:
1411*c0909341SAndroid Build Coastguard Worker    mov              hd, r7m
1412*c0909341SAndroid Build Coastguard Worker    mov      grain_lutq, grain_lutmp
1413*c0909341SAndroid Build Coastguard Worker.loop_y:
1414*c0909341SAndroid Build Coastguard Worker    ; src
1415*c0909341SAndroid Build Coastguard Worker    mova             m0, [srcq]
1416*c0909341SAndroid Build Coastguard Worker    pxor             m2, m2
1417*c0909341SAndroid Build Coastguard Worker    punpckhbw        m1, m0, m2
1418*c0909341SAndroid Build Coastguard Worker    punpcklbw        m0, m2                 ; m0-1: src as word
1419*c0909341SAndroid Build Coastguard Worker
1420*c0909341SAndroid Build Coastguard Worker    ; scaling[src]
1421*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
1422*c0909341SAndroid Build Coastguard Worker    vpgatherdw       m4, m0, scalingq-1, r0, r5, m3
1423*c0909341SAndroid Build Coastguard Worker    vpgatherdw       m5, m1, scalingq-1, r0, r5, m3
1424*c0909341SAndroid Build Coastguard Worker%else
1425*c0909341SAndroid Build Coastguard Worker    vpgatherdw       m4, m0, scalingq-1, r12, r13, m3
1426*c0909341SAndroid Build Coastguard Worker    vpgatherdw       m5, m1, scalingq-1, r12, r13, m3
1427*c0909341SAndroid Build Coastguard Worker%endif
1428*c0909341SAndroid Build Coastguard Worker    REPX {psrlw x, 8}, m4, m5
1429*c0909341SAndroid Build Coastguard Worker
1430*c0909341SAndroid Build Coastguard Worker    ; grain = grain_lut[offy+y][offx+x]
1431*c0909341SAndroid Build Coastguard Worker    movu             m3, [grain_lutq+offxyq]
1432*c0909341SAndroid Build Coastguard Worker    pcmpgtb          m7, m2, m3
1433*c0909341SAndroid Build Coastguard Worker    punpcklbw        m2, m3, m7
1434*c0909341SAndroid Build Coastguard Worker    punpckhbw        m3, m7
1435*c0909341SAndroid Build Coastguard Worker
1436*c0909341SAndroid Build Coastguard Worker    ; noise = round2(scaling[src] * grain, scaling_shift)
1437*c0909341SAndroid Build Coastguard Worker    pmullw           m2, m4
1438*c0909341SAndroid Build Coastguard Worker    pmullw           m3, m5
1439*c0909341SAndroid Build Coastguard Worker    pmulhrsw         m2, m11
1440*c0909341SAndroid Build Coastguard Worker    pmulhrsw         m3, m11
1441*c0909341SAndroid Build Coastguard Worker
1442*c0909341SAndroid Build Coastguard Worker    ; dst = clip_pixel(src, noise)
1443*c0909341SAndroid Build Coastguard Worker    paddw            m0, m2
1444*c0909341SAndroid Build Coastguard Worker    paddw            m1, m3
1445*c0909341SAndroid Build Coastguard Worker    pmaxsw           m0, m13
1446*c0909341SAndroid Build Coastguard Worker    pmaxsw           m1, m13
1447*c0909341SAndroid Build Coastguard Worker    pminsw           m0, m12
1448*c0909341SAndroid Build Coastguard Worker    pminsw           m1, m12
1449*c0909341SAndroid Build Coastguard Worker    packuswb         m0, m1
1450*c0909341SAndroid Build Coastguard Worker    movifnidn      dstq, dstmp
1451*c0909341SAndroid Build Coastguard Worker    mova    [dstq+srcq], m0
1452*c0909341SAndroid Build Coastguard Worker
1453*c0909341SAndroid Build Coastguard Worker    add            srcq, r2mp
1454*c0909341SAndroid Build Coastguard Worker    add      grain_lutq, 82
1455*c0909341SAndroid Build Coastguard Worker    dec              hd
1456*c0909341SAndroid Build Coastguard Worker    jg .loop_y
1457*c0909341SAndroid Build Coastguard Worker
1458*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
1459*c0909341SAndroid Build Coastguard Worker    add            r4mp, 16
1460*c0909341SAndroid Build Coastguard Worker%else
1461*c0909341SAndroid Build Coastguard Worker    add              wq, 16
1462*c0909341SAndroid Build Coastguard Worker%endif
1463*c0909341SAndroid Build Coastguard Worker    jge .end
1464*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
1465*c0909341SAndroid Build Coastguard Worker    mov            srcq, r1mp
1466*c0909341SAndroid Build Coastguard Worker    add            srcq, r4mp
1467*c0909341SAndroid Build Coastguard Worker%else
1468*c0909341SAndroid Build Coastguard Worker    lea            srcq, [src_bakq+wq]
1469*c0909341SAndroid Build Coastguard Worker%endif
1470*c0909341SAndroid Build Coastguard Worker    btc       dword r8m, 2
1471*c0909341SAndroid Build Coastguard Worker    jc .next_blk
1472*c0909341SAndroid Build Coastguard Worker
1473*c0909341SAndroid Build Coastguard Worker    add          offxyd, 16
1474*c0909341SAndroid Build Coastguard Worker    test      dword r8m, 2              ; r8m & 2 = have_top_overlap
1475*c0909341SAndroid Build Coastguard Worker    jz .loop_x_odd
1476*c0909341SAndroid Build Coastguard Worker
1477*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
1478*c0909341SAndroid Build Coastguard Worker    add dword [rsp+5*mmsize+1*gprsize], 16
1479*c0909341SAndroid Build Coastguard Worker%else
1480*c0909341SAndroid Build Coastguard Worker    add            r11d, 16             ; top_offxyd
1481*c0909341SAndroid Build Coastguard Worker%endif
1482*c0909341SAndroid Build Coastguard Worker    jnz .loop_x_odd_v_overlap
1483*c0909341SAndroid Build Coastguard Worker
1484*c0909341SAndroid Build Coastguard Worker.next_blk:
1485*c0909341SAndroid Build Coastguard Worker    test      dword r8m, 1
1486*c0909341SAndroid Build Coastguard Worker    jz .loop_x
1487*c0909341SAndroid Build Coastguard Worker
1488*c0909341SAndroid Build Coastguard Worker    test      dword r8m, 2
1489*c0909341SAndroid Build Coastguard Worker    jnz .loop_x_hv_overlap
1490*c0909341SAndroid Build Coastguard Worker
1491*c0909341SAndroid Build Coastguard Worker    ; horizontal overlap (without vertical overlap)
1492*c0909341SAndroid Build Coastguard Worker.loop_x_h_overlap:
1493*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
1494*c0909341SAndroid Build Coastguard Worker    ; r0m=dst, r1m=src_bak, r2m=stride, r3m=see, r4m=w, r5m=picptr,
1495*c0909341SAndroid Build Coastguard Worker    ; r6m=grain_lut, r7m=h, r8m=overlap_v|h
1496*c0909341SAndroid Build Coastguard Worker    DEFINE_ARGS dst, src, scaling, offxy, unused1, unused2, unused3
1497*c0909341SAndroid Build Coastguard Worker
1498*c0909341SAndroid Build Coastguard Worker    add          offxyd, 16                 ; left_offxyd
1499*c0909341SAndroid Build Coastguard Worker    mov [rsp+5*mmsize+0*gprsize], offxyd
1500*c0909341SAndroid Build Coastguard Worker
1501*c0909341SAndroid Build Coastguard Worker    DEFINE_ARGS dst, src, scaling, see, unused1, unused2, unused3
1502*c0909341SAndroid Build Coastguard Worker
1503*c0909341SAndroid Build Coastguard Worker    mov            seed, r3m
1504*c0909341SAndroid Build Coastguard Worker%else
1505*c0909341SAndroid Build Coastguard Worker    DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \
1506*c0909341SAndroid Build Coastguard Worker                offx, offy, see, left_offxy
1507*c0909341SAndroid Build Coastguard Worker
1508*c0909341SAndroid Build Coastguard Worker    lea     left_offxyd, [offyd+16]         ; previous column's offy*stride+offx
1509*c0909341SAndroid Build Coastguard Worker%endif
1510*c0909341SAndroid Build Coastguard Worker
1511*c0909341SAndroid Build Coastguard Worker    mov             r6d, seed
1512*c0909341SAndroid Build Coastguard Worker    or             seed, 0xEFF4
1513*c0909341SAndroid Build Coastguard Worker    shr             r6d, 1
1514*c0909341SAndroid Build Coastguard Worker    test           seeb, seeh
1515*c0909341SAndroid Build Coastguard Worker    lea            seed, [r6+0x8000]
1516*c0909341SAndroid Build Coastguard Worker    cmovp          seed, r6d                ; updated seed
1517*c0909341SAndroid Build Coastguard Worker
1518*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
1519*c0909341SAndroid Build Coastguard Worker    mov             r3m, seed
1520*c0909341SAndroid Build Coastguard Worker
1521*c0909341SAndroid Build Coastguard Worker    DEFINE_ARGS dst, src, scaling, offy, unused1, unused2, offx
1522*c0909341SAndroid Build Coastguard Worker
1523*c0909341SAndroid Build Coastguard Worker    mov           offxd, offyd
1524*c0909341SAndroid Build Coastguard Worker%else
1525*c0909341SAndroid Build Coastguard Worker    mov           offyd, seed
1526*c0909341SAndroid Build Coastguard Worker    mov           offxd, seed
1527*c0909341SAndroid Build Coastguard Worker%endif
1528*c0909341SAndroid Build Coastguard Worker    ror           offyd, 8
1529*c0909341SAndroid Build Coastguard Worker    shr           offxd, 12
1530*c0909341SAndroid Build Coastguard Worker    and           offyd, 0xf
1531*c0909341SAndroid Build Coastguard Worker    imul          offyd, 164
1532*c0909341SAndroid Build Coastguard Worker    lea           offyq, [offyq+offxq*2+747] ; offy*stride+offx
1533*c0909341SAndroid Build Coastguard Worker
1534*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
1535*c0909341SAndroid Build Coastguard Worker    DEFINE_ARGS dst, src, scaling, offxy, h, picptr, grain_lut
1536*c0909341SAndroid Build Coastguard Worker%else
1537*c0909341SAndroid Build Coastguard Worker    DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \
1538*c0909341SAndroid Build Coastguard Worker                h, offxy, see, left_offxy
1539*c0909341SAndroid Build Coastguard Worker%endif
1540*c0909341SAndroid Build Coastguard Worker
1541*c0909341SAndroid Build Coastguard Worker    mov              hd, r7m
1542*c0909341SAndroid Build Coastguard Worker    mov      grain_lutq, grain_lutmp
1543*c0909341SAndroid Build Coastguard Worker.loop_y_h_overlap:
1544*c0909341SAndroid Build Coastguard Worker    ; src
1545*c0909341SAndroid Build Coastguard Worker    mova             m0, [srcq]
1546*c0909341SAndroid Build Coastguard Worker    pxor             m2, m2
1547*c0909341SAndroid Build Coastguard Worker    punpckhbw        m1, m0, m2
1548*c0909341SAndroid Build Coastguard Worker    punpcklbw        m0, m2                 ; m0-1: src as word
1549*c0909341SAndroid Build Coastguard Worker
1550*c0909341SAndroid Build Coastguard Worker    ; scaling[src]
1551*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
1552*c0909341SAndroid Build Coastguard Worker    vpgatherdw       m4, m0, scalingq-1, r0, r5, m3
1553*c0909341SAndroid Build Coastguard Worker    vpgatherdw       m5, m1, scalingq-1, r0, r5, m3
1554*c0909341SAndroid Build Coastguard Worker%else
1555*c0909341SAndroid Build Coastguard Worker    vpgatherdw       m4, m0, scalingq-1, r12, r13, m3
1556*c0909341SAndroid Build Coastguard Worker    vpgatherdw       m5, m1, scalingq-1, r12, r13, m3
1557*c0909341SAndroid Build Coastguard Worker%endif
1558*c0909341SAndroid Build Coastguard Worker    REPX {psrlw x, 8}, m4, m5
1559*c0909341SAndroid Build Coastguard Worker
1560*c0909341SAndroid Build Coastguard Worker    ; grain = grain_lut[offy+y][offx+x]
1561*c0909341SAndroid Build Coastguard Worker    movu             m3, [grain_lutq+offxyq]
1562*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
1563*c0909341SAndroid Build Coastguard Worker    mov              r5, [rsp+5*mmsize+0*gprsize]
1564*c0909341SAndroid Build Coastguard Worker    movd             m7, [grain_lutq+r5]
1565*c0909341SAndroid Build Coastguard Worker%else
1566*c0909341SAndroid Build Coastguard Worker    movd             m7, [grain_lutq+left_offxyq]
1567*c0909341SAndroid Build Coastguard Worker%endif
1568*c0909341SAndroid Build Coastguard Worker    punpcklbw        m7, m3
1569*c0909341SAndroid Build Coastguard Worker    pmaddubsw        m6, m15, m7
1570*c0909341SAndroid Build Coastguard Worker    pmulhrsw         m6, m14
1571*c0909341SAndroid Build Coastguard Worker    packsswb         m6, m6
1572*c0909341SAndroid Build Coastguard Worker    shufps           m6, m3, q3210
1573*c0909341SAndroid Build Coastguard Worker    pcmpgtb          m2, m6
1574*c0909341SAndroid Build Coastguard Worker    punpcklbw        m7, m6, m2
1575*c0909341SAndroid Build Coastguard Worker    punpckhbw        m6, m2
1576*c0909341SAndroid Build Coastguard Worker
1577*c0909341SAndroid Build Coastguard Worker    ; noise = round2(scaling[src] * grain, scaling_shift)
1578*c0909341SAndroid Build Coastguard Worker    pmullw           m7, m4
1579*c0909341SAndroid Build Coastguard Worker    pmullw           m6, m5
1580*c0909341SAndroid Build Coastguard Worker    pmulhrsw         m7, m11
1581*c0909341SAndroid Build Coastguard Worker    pmulhrsw         m6, m11
1582*c0909341SAndroid Build Coastguard Worker
1583*c0909341SAndroid Build Coastguard Worker    ; dst = clip_pixel(src, noise)
1584*c0909341SAndroid Build Coastguard Worker    paddw            m0, m7
1585*c0909341SAndroid Build Coastguard Worker    paddw            m1, m6
1586*c0909341SAndroid Build Coastguard Worker    pmaxsw           m0, m13
1587*c0909341SAndroid Build Coastguard Worker    pmaxsw           m1, m13
1588*c0909341SAndroid Build Coastguard Worker    pminsw           m0, m12
1589*c0909341SAndroid Build Coastguard Worker    pminsw           m1, m12
1590*c0909341SAndroid Build Coastguard Worker    packuswb         m0, m1
1591*c0909341SAndroid Build Coastguard Worker    movifnidn      dstq, dstmp
1592*c0909341SAndroid Build Coastguard Worker    mova    [dstq+srcq], m0
1593*c0909341SAndroid Build Coastguard Worker
1594*c0909341SAndroid Build Coastguard Worker    add            srcq, r2mp
1595*c0909341SAndroid Build Coastguard Worker    add      grain_lutq, 82
1596*c0909341SAndroid Build Coastguard Worker    dec              hd
1597*c0909341SAndroid Build Coastguard Worker    jg .loop_y_h_overlap
1598*c0909341SAndroid Build Coastguard Worker
1599*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
1600*c0909341SAndroid Build Coastguard Worker    add            r4mp, 16
1601*c0909341SAndroid Build Coastguard Worker%else
1602*c0909341SAndroid Build Coastguard Worker    add              wq, 16
1603*c0909341SAndroid Build Coastguard Worker%endif
1604*c0909341SAndroid Build Coastguard Worker    jge .end
1605*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
1606*c0909341SAndroid Build Coastguard Worker    mov            srcq, r1m
1607*c0909341SAndroid Build Coastguard Worker    add            srcq, r4m
1608*c0909341SAndroid Build Coastguard Worker%else
1609*c0909341SAndroid Build Coastguard Worker    lea            srcq, [src_bakq+wq]
1610*c0909341SAndroid Build Coastguard Worker%endif
1611*c0909341SAndroid Build Coastguard Worker    xor       dword r8m, 4
1612*c0909341SAndroid Build Coastguard Worker    add          offxyd, 16
1613*c0909341SAndroid Build Coastguard Worker
1614*c0909341SAndroid Build Coastguard Worker    ; since this half-block had left-overlap, the next does not
1615*c0909341SAndroid Build Coastguard Worker    test      dword r8m, 2              ; have_top_overlap
1616*c0909341SAndroid Build Coastguard Worker    jz .loop_x_odd
1617*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
1618*c0909341SAndroid Build Coastguard Worker    add dword [rsp+5*mmsize+1*gprsize], 16
1619*c0909341SAndroid Build Coastguard Worker%else
1620*c0909341SAndroid Build Coastguard Worker    add            r11d, 16             ; top_offxyd
1621*c0909341SAndroid Build Coastguard Worker%endif
1622*c0909341SAndroid Build Coastguard Worker    jmp .loop_x_odd_v_overlap
1623*c0909341SAndroid Build Coastguard Worker
1624*c0909341SAndroid Build Coastguard Worker.end:
1625*c0909341SAndroid Build Coastguard Worker    RET
1626*c0909341SAndroid Build Coastguard Worker
1627*c0909341SAndroid Build Coastguard Worker.vertical_overlap:
1628*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
1629*c0909341SAndroid Build Coastguard Worker    DEFINE_ARGS dst, src, scaling, sby, fg_data, picptr, overlap
1630*c0909341SAndroid Build Coastguard Worker%else
1631*c0909341SAndroid Build Coastguard Worker    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, tmp, sby, see, overlap
1632*c0909341SAndroid Build Coastguard Worker%endif
1633*c0909341SAndroid Build Coastguard Worker
1634*c0909341SAndroid Build Coastguard Worker    or         overlapd, 2                  ; top_overlap: overlap & 2
1635*c0909341SAndroid Build Coastguard Worker    mov             r8m, overlapd
1636*c0909341SAndroid Build Coastguard Worker    movzx          sbyd, sbyb
1637*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
1638*c0909341SAndroid Build Coastguard Worker    imul             r4, [fg_dataq+FGData.seed], 0x00010001
1639*c0909341SAndroid Build Coastguard Worker    DEFINE_ARGS tmp, src, scaling, sby, see, picptr, unused
1640*c0909341SAndroid Build Coastguard Worker%else
1641*c0909341SAndroid Build Coastguard Worker    imul           seed, [fg_dataq+FGData.seed], 0x00010001
1642*c0909341SAndroid Build Coastguard Worker%endif
1643*c0909341SAndroid Build Coastguard Worker    imul           tmpd, sbyd, 173 * 0x00010001
1644*c0909341SAndroid Build Coastguard Worker    imul           sbyd, 37 * 0x01000100
1645*c0909341SAndroid Build Coastguard Worker    add            tmpd, (105 << 16) | 188
1646*c0909341SAndroid Build Coastguard Worker    add            sbyd, (178 << 24) | (141 << 8)
1647*c0909341SAndroid Build Coastguard Worker    and            tmpd, 0x00ff00ff
1648*c0909341SAndroid Build Coastguard Worker    and            sbyd, 0xff00ff00
1649*c0909341SAndroid Build Coastguard Worker    xor            seed, tmpd
1650*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
1651*c0909341SAndroid Build Coastguard Worker    xor            sbyd, seed               ; (cur_seed << 16) | top_seed
1652*c0909341SAndroid Build Coastguard Worker
1653*c0909341SAndroid Build Coastguard Worker    DEFINE_ARGS dst, src, scaling, see, w, picptr, src_bak
1654*c0909341SAndroid Build Coastguard Worker
1655*c0909341SAndroid Build Coastguard Worker    mov             r3m, seed
1656*c0909341SAndroid Build Coastguard Worker    mov              wq, r4m
1657*c0909341SAndroid Build Coastguard Worker%else
1658*c0909341SAndroid Build Coastguard Worker    xor            seed, sbyd               ; (cur_seed << 16) | top_seed
1659*c0909341SAndroid Build Coastguard Worker
1660*c0909341SAndroid Build Coastguard Worker    DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \
1661*c0909341SAndroid Build Coastguard Worker                tmp, unused2, see, unused3
1662*c0909341SAndroid Build Coastguard Worker%endif
1663*c0909341SAndroid Build Coastguard Worker
1664*c0909341SAndroid Build Coastguard Worker    lea        src_bakq, [srcq+wq]
1665*c0909341SAndroid Build Coastguard Worker    neg              wq
1666*c0909341SAndroid Build Coastguard Worker    sub           dstmp, srcq
1667*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
1668*c0909341SAndroid Build Coastguard Worker    mov             r1m, src_bakq
1669*c0909341SAndroid Build Coastguard Worker    mov             r4m, wq
1670*c0909341SAndroid Build Coastguard Worker    DEFINE_ARGS tmp, src, scaling, see, unused1, picptr, unused2
1671*c0909341SAndroid Build Coastguard Worker%endif
1672*c0909341SAndroid Build Coastguard Worker
1673*c0909341SAndroid Build Coastguard Worker.loop_x_v_overlap:
1674*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
1675*c0909341SAndroid Build Coastguard Worker    mov            seed, r3m
1676*c0909341SAndroid Build Coastguard Worker%endif
1677*c0909341SAndroid Build Coastguard Worker    ; we assume from the block above that bits 8-15 of tmpd are zero'ed,
1678*c0909341SAndroid Build Coastguard Worker    ; because of the 'and tmpd, 0x00ff00ff' above
1679*c0909341SAndroid Build Coastguard Worker    mov             r6d, seed
1680*c0909341SAndroid Build Coastguard Worker    or             seed, 0xeff4eff4
1681*c0909341SAndroid Build Coastguard Worker    test           seeb, seeh
1682*c0909341SAndroid Build Coastguard Worker    setp           tmpb                     ; parity of top_seed
1683*c0909341SAndroid Build Coastguard Worker    shr            seed, 16
1684*c0909341SAndroid Build Coastguard Worker    shl            tmpd, 16
1685*c0909341SAndroid Build Coastguard Worker    test           seeb, seeh
1686*c0909341SAndroid Build Coastguard Worker    setp           tmpb                     ; parity of cur_seed
1687*c0909341SAndroid Build Coastguard Worker    or              r6d, 0x00010001
1688*c0909341SAndroid Build Coastguard Worker    xor            tmpd, r6d
1689*c0909341SAndroid Build Coastguard Worker    mov            seed, tmpd
1690*c0909341SAndroid Build Coastguard Worker    ror            seed, 1                  ; updated (cur_seed << 16) | top_seed
1691*c0909341SAndroid Build Coastguard Worker
1692*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
1693*c0909341SAndroid Build Coastguard Worker    mov             r3m, seed
1694*c0909341SAndroid Build Coastguard Worker
1695*c0909341SAndroid Build Coastguard Worker    DEFINE_ARGS dst, src, scaling, offy, unused1, unused2, offx
1696*c0909341SAndroid Build Coastguard Worker
1697*c0909341SAndroid Build Coastguard Worker    mov           offxd, offyd
1698*c0909341SAndroid Build Coastguard Worker%else
1699*c0909341SAndroid Build Coastguard Worker    DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \
1700*c0909341SAndroid Build Coastguard Worker                offx, offy, see, unused, top_offxy
1701*c0909341SAndroid Build Coastguard Worker
1702*c0909341SAndroid Build Coastguard Worker    mov           offyd, seed
1703*c0909341SAndroid Build Coastguard Worker    mov           offxd, seed
1704*c0909341SAndroid Build Coastguard Worker%endif
1705*c0909341SAndroid Build Coastguard Worker
1706*c0909341SAndroid Build Coastguard Worker    ror           offyd, 8
1707*c0909341SAndroid Build Coastguard Worker    ror           offxd, 12
1708*c0909341SAndroid Build Coastguard Worker    and           offyd, 0xf000f
1709*c0909341SAndroid Build Coastguard Worker    and           offxd, 0xf000f
1710*c0909341SAndroid Build Coastguard Worker    imul          offyd, 164
1711*c0909341SAndroid Build Coastguard Worker    ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy
1712*c0909341SAndroid Build Coastguard Worker    lea           offyq, [offyq+offxq*2+0x10001*747+32*82]
1713*c0909341SAndroid Build Coastguard Worker
1714*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
1715*c0909341SAndroid Build Coastguard Worker    DEFINE_ARGS top_offxy, src, scaling, offxy, h, picptr, grain_lut
1716*c0909341SAndroid Build Coastguard Worker%else
1717*c0909341SAndroid Build Coastguard Worker    DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \
1718*c0909341SAndroid Build Coastguard Worker                h, offxy, see, unused, top_offxy
1719*c0909341SAndroid Build Coastguard Worker%endif
1720*c0909341SAndroid Build Coastguard Worker
1721*c0909341SAndroid Build Coastguard Worker    movzx    top_offxyd, offxyw
1722*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
1723*c0909341SAndroid Build Coastguard Worker    mov [rsp+5*mmsize+1*gprsize], top_offxyd
1724*c0909341SAndroid Build Coastguard Worker
1725*c0909341SAndroid Build Coastguard Worker    DEFINE_ARGS dst, src, scaling, offxy, h, picptr, grain_lut
1726*c0909341SAndroid Build Coastguard Worker%endif
1727*c0909341SAndroid Build Coastguard Worker    shr          offxyd, 16
1728*c0909341SAndroid Build Coastguard Worker
1729*c0909341SAndroid Build Coastguard Worker.loop_x_odd_v_overlap:                      ; column setup: rows below blend this block's grain with the block above only
1730*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
1731*c0909341SAndroid Build Coastguard Worker    mov              r5, r5m                ; NOTE(review): presumably the table base that `base` is defined against - set up outside this view
1732*c0909341SAndroid Build Coastguard Worker    lea              r5, [base+pb_27_17]    ; r5 = address of the first-row vertical blend weights
1733*c0909341SAndroid Build Coastguard Worker    mov [rsp+5*mmsize+12], r5               ; weight pointer lives in a stack slot; it is stepped once per row below
1734*c0909341SAndroid Build Coastguard Worker%else
1735*c0909341SAndroid Build Coastguard Worker    mova             m8, [pb_27_17]         ; x86-64 keeps the first-row weights in m8 instead
1736*c0909341SAndroid Build Coastguard Worker%endif
1737*c0909341SAndroid Build Coastguard Worker    mov              hd, r7m                ; reload the row counter from the r7m slot
1738*c0909341SAndroid Build Coastguard Worker    mov      grain_lutq, grain_lutmp        ; restart at the first grain_lut row for this column
1739*c0909341SAndroid Build Coastguard Worker.loop_y_v_overlap:                          ; one row per iteration; executed for at most two rows (see btc below)
1740*c0909341SAndroid Build Coastguard Worker    ; src
1741*c0909341SAndroid Build Coastguard Worker    mova             m0, [srcq]             ; aligned load of one vector of source pixels
1742*c0909341SAndroid Build Coastguard Worker    pxor             m2, m2                 ; m2 = 0, used for zero-extension and again for the sign mask below
1743*c0909341SAndroid Build Coastguard Worker    punpckhbw        m1, m0, m2
1744*c0909341SAndroid Build Coastguard Worker    punpcklbw        m0, m2                 ; m0-1: src as word
1745*c0909341SAndroid Build Coastguard Worker
1746*c0909341SAndroid Build Coastguard Worker    ; scaling[src]
    ; vpgatherdw is a macro defined outside this view; the two GPRs and m3 are
    ; passed as temporaries (presumably clobbered - TODO confirm).
1747*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
1748*c0909341SAndroid Build Coastguard Worker    vpgatherdw       m4, m0, scalingq-1, r0, r5, m3
1749*c0909341SAndroid Build Coastguard Worker    vpgatherdw       m5, m1, scalingq-1, r0, r5, m3
1750*c0909341SAndroid Build Coastguard Worker%else
1751*c0909341SAndroid Build Coastguard Worker    vpgatherdw       m4, m0, scalingq-1, r12, r13, m3
1752*c0909341SAndroid Build Coastguard Worker    vpgatherdw       m5, m1, scalingq-1, r12, r13, m3
1753*c0909341SAndroid Build Coastguard Worker%endif
1754*c0909341SAndroid Build Coastguard Worker    REPX {psrlw x, 8}, m4, m5               ; base is scalingq-1, so presumably the wanted byte sits in the high half of each word; shift it down
1755*c0909341SAndroid Build Coastguard Worker
1756*c0909341SAndroid Build Coastguard Worker    ; grain = grain_lut[offy+y][offx+x]
1757*c0909341SAndroid Build Coastguard Worker    movu             m3, [grain_lutq+offxyq] ; grain row of the current block
1758*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
1759*c0909341SAndroid Build Coastguard Worker    mov              r5, [rsp+5*mmsize+1*gprsize] ; top_offxy is kept in this stack slot on x86-32
1760*c0909341SAndroid Build Coastguard Worker    movu             m7, [grain_lutq+r5]    ; grain row of the block above
1761*c0909341SAndroid Build Coastguard Worker%else
1762*c0909341SAndroid Build Coastguard Worker    movu             m7, [grain_lutq+top_offxyq] ; grain row of the block above
1763*c0909341SAndroid Build Coastguard Worker%endif
1764*c0909341SAndroid Build Coastguard Worker    punpckhbw        m6, m7, m3             ; interleave into (top, cur) byte pairs, high half
1765*c0909341SAndroid Build Coastguard Worker    punpcklbw        m7, m3                 ; same, low half
1766*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
1767*c0909341SAndroid Build Coastguard Worker    mov              r5, [rsp+5*mmsize+12]  ; current row's weight pointer
1768*c0909341SAndroid Build Coastguard Worker    pmaddubsw        m3, [r5], m6           ; weights are the unsigned operand, grain the signed one: w_top*top + w_cur*cur
1769*c0909341SAndroid Build Coastguard Worker    pmaddubsw        m6, [r5], m7
1770*c0909341SAndroid Build Coastguard Worker%else
1771*c0909341SAndroid Build Coastguard Worker    pmaddubsw        m3, m8, m6             ; weights are the unsigned operand, grain the signed one: w_top*top + w_cur*cur
1772*c0909341SAndroid Build Coastguard Worker    pmaddubsw        m6, m8, m7
1773*c0909341SAndroid Build Coastguard Worker%endif
1774*c0909341SAndroid Build Coastguard Worker    pmulhrsw         m3, m14                ; m14 is loaded outside this view - presumably the rounding multiplier for the weight scale (TODO confirm)
1775*c0909341SAndroid Build Coastguard Worker    pmulhrsw         m6, m14
1776*c0909341SAndroid Build Coastguard Worker    packsswb         m6, m3                 ; saturate back to signed bytes: low half from m6, high half from m3
1777*c0909341SAndroid Build Coastguard Worker    pcmpgtb          m7, m2, m6             ; m2 is still zero here, so m7 = 0xff where the blended grain is negative
1778*c0909341SAndroid Build Coastguard Worker    punpcklbw        m2, m6, m7             ; sign-extend blended grain to words (low half -> m2)
1779*c0909341SAndroid Build Coastguard Worker    punpckhbw        m6, m7                 ; sign-extend blended grain to words (high half -> m6)
1780*c0909341SAndroid Build Coastguard Worker
1781*c0909341SAndroid Build Coastguard Worker    ; noise = round2(scaling[src] * grain, scaling_shift)
1782*c0909341SAndroid Build Coastguard Worker    pmullw           m2, m4
1783*c0909341SAndroid Build Coastguard Worker    pmullw           m6, m5
1784*c0909341SAndroid Build Coastguard Worker    pmulhrsw         m2, m11                ; m11 is loaded outside this view - presumably the scaling_shift multiplier
1785*c0909341SAndroid Build Coastguard Worker    pmulhrsw         m6, m11
1786*c0909341SAndroid Build Coastguard Worker
1787*c0909341SAndroid Build Coastguard Worker    ; dst = clip_pixel(src, noise)
1788*c0909341SAndroid Build Coastguard Worker    paddw            m0, m2
1789*c0909341SAndroid Build Coastguard Worker    paddw            m1, m6
1790*c0909341SAndroid Build Coastguard Worker    pmaxsw           m0, m13                ; clamp from below with m13 (set outside this view)
1791*c0909341SAndroid Build Coastguard Worker    pmaxsw           m1, m13
1792*c0909341SAndroid Build Coastguard Worker    pminsw           m0, m12                ; clamp from above with m12 (set outside this view)
1793*c0909341SAndroid Build Coastguard Worker    pminsw           m1, m12
1794*c0909341SAndroid Build Coastguard Worker    packuswb         m0, m1                 ; back to unsigned bytes
1795*c0909341SAndroid Build Coastguard Worker    movifnidn      dstq, dstmp              ; x86inc helper: emits a load only when dstq and dstmp are not the same operand
1796*c0909341SAndroid Build Coastguard Worker    mova    [dstq+srcq], m0                 ; NOTE(review): dst is addressed relative to src - presumably dstq holds dst-src; confirm in the prologue
1797*c0909341SAndroid Build Coastguard Worker
1798*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
1799*c0909341SAndroid Build Coastguard Worker    add dword [rsp+5*mmsize+12], mmsize     ; step to the next weight vector; presumably pb_17_27 directly follows pb_27_17 in memory
1800*c0909341SAndroid Build Coastguard Worker%else
1801*c0909341SAndroid Build Coastguard Worker    mova             m8, [pb_17_27]         ; second-row weights
1802*c0909341SAndroid Build Coastguard Worker%endif
1803*c0909341SAndroid Build Coastguard Worker    add            srcq, r2mp               ; next source row (adds the value in the r2 argument slot)
1804*c0909341SAndroid Build Coastguard Worker    add      grain_lutq, 82                 ; next grain_lut row (82 is the same row pitch used in the offset computation)
1805*c0909341SAndroid Build Coastguard Worker    dec              hw                     ; 16-bit decrement: leaves bit 16 of hd (the toggle below) untouched
1806*c0909341SAndroid Build Coastguard Worker    jz .end_y_v_overlap
1807*c0909341SAndroid Build Coastguard Worker    ; 2 lines get vertical overlap, then fall back to non-overlap code for
1808*c0909341SAndroid Build Coastguard Worker    ; remaining (up to) 30 lines
1809*c0909341SAndroid Build Coastguard Worker    btc              hd, 16                 ; CF = old bit 16, then flip it: CF is clear after row 0 and set after row 1
1810*c0909341SAndroid Build Coastguard Worker    jnc .loop_y_v_overlap
1811*c0909341SAndroid Build Coastguard Worker    jmp .loop_y                             ; .loop_y is defined outside this view
1812*c0909341SAndroid Build Coastguard Worker
1813*c0909341SAndroid Build Coastguard Worker.end_y_v_overlap:                           ; column finished: advance 16 pixels and pick the variant for the next column
1814*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
1815*c0909341SAndroid Build Coastguard Worker    add            r4mp, 16                 ; x86-32 keeps the column counter in the r4 argument slot
1816*c0909341SAndroid Build Coastguard Worker%else
1817*c0909341SAndroid Build Coastguard Worker    add              wq, 16
1818*c0909341SAndroid Build Coastguard Worker%endif
1819*c0909341SAndroid Build Coastguard Worker    jge .end_hv                             ; uses the flags of the add above: stop once the counter is no longer negative
1820*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
1821*c0909341SAndroid Build Coastguard Worker    mov            srcq, r1mp               ; rebuild src for the new column from the value saved in the r1 slot
1822*c0909341SAndroid Build Coastguard Worker    add            srcq, r4mp               ; ... plus the updated column counter
1823*c0909341SAndroid Build Coastguard Worker%else
1824*c0909341SAndroid Build Coastguard Worker    lea            srcq, [src_bakq+wq]      ; rebuild src for the new column
1825*c0909341SAndroid Build Coastguard Worker%endif
1826*c0909341SAndroid Build Coastguard Worker    btc       dword r8m, 2                  ; CF = old bit 2 of the flag word in r8m, then flip it
1827*c0909341SAndroid Build Coastguard Worker    jc .loop_x_hv_overlap                   ; bit was set: next column also needs left overlap (presumably the start of a new 32-px block - TODO confirm)
1828*c0909341SAndroid Build Coastguard Worker    add          offxyd, 16                 ; otherwise stay in the same grain block, 16 columns further right
1829*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
1830*c0909341SAndroid Build Coastguard Worker    add dword [rsp+5*mmsize+1*gprsize], 16  ; top_offxy += 16 (stack-resident on x86-32)
1831*c0909341SAndroid Build Coastguard Worker%else
1832*c0909341SAndroid Build Coastguard Worker    add      top_offxyd, 16
1833*c0909341SAndroid Build Coastguard Worker%endif
1834*c0909341SAndroid Build Coastguard Worker    jmp .loop_x_odd_v_overlap
1835*c0909341SAndroid Build Coastguard Worker
1836*c0909341SAndroid Build Coastguard Worker.loop_x_hv_overlap:                         ; column setup when both the left and the top neighbour contribute grain
    ; Steps: (1) derive left/topleft offsets from the previous column's offsets,
    ; (2) advance both packed 16-bit seeds by one step, (3) turn the new seeds
    ; into grain_lut offsets for the current and the top block.
    ; DEFINE_ARGS only renames registers (x86inc); it emits no instructions.
1837*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
1838*c0909341SAndroid Build Coastguard Worker    mov              r5, r5m                ; NOTE(review): presumably the table base that `base` is defined against - set up outside this view
1839*c0909341SAndroid Build Coastguard Worker    lea              r5, [base+pb_27_17]    ; r5 = address of the first-row vertical blend weights
1840*c0909341SAndroid Build Coastguard Worker    mov [rsp+5*mmsize+12], r5               ; weight pointer slot, stepped once per row in the y loop
1841*c0909341SAndroid Build Coastguard Worker
1842*c0909341SAndroid Build Coastguard Worker    DEFINE_ARGS tmp, src, scaling, offxy, w, picptr, src_bak
1843*c0909341SAndroid Build Coastguard Worker
1844*c0909341SAndroid Build Coastguard Worker    mov              r5, [rsp+5*mmsize+1*gprsize] ; previous column's top_offxy
1845*c0909341SAndroid Build Coastguard Worker    mov              r4, offxyd             ; previous column's offxy
1846*c0909341SAndroid Build Coastguard Worker    add              r5, 16
1847*c0909341SAndroid Build Coastguard Worker    add              r4, 16
1848*c0909341SAndroid Build Coastguard Worker    mov [rsp+5*mmsize+2*gprsize], r5        ; topleft_offxy
1849*c0909341SAndroid Build Coastguard Worker    mov [rsp+5*mmsize+0*gprsize], r4        ; left_offxy
1850*c0909341SAndroid Build Coastguard Worker
1851*c0909341SAndroid Build Coastguard Worker    DEFINE_ARGS tmp, src, scaling, see, w, picptr, src_bak
1852*c0909341SAndroid Build Coastguard Worker
1853*c0909341SAndroid Build Coastguard Worker    xor            tmpd, tmpd               ; setp below writes only tmpb, so the upper bits must start at zero
1854*c0909341SAndroid Build Coastguard Worker    mov            seed, r3m                ; packed seeds are kept in the r3m slot on x86-32
1855*c0909341SAndroid Build Coastguard Worker%else
1856*c0909341SAndroid Build Coastguard Worker    mova             m8, [pb_27_17]         ; first-row vertical blend weights
1857*c0909341SAndroid Build Coastguard Worker
1858*c0909341SAndroid Build Coastguard Worker    DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \
1859*c0909341SAndroid Build Coastguard Worker                tmp, unused2, see, unused3
1860*c0909341SAndroid Build Coastguard Worker
1861*c0909341SAndroid Build Coastguard Worker    ; we assume from the block above that bits 8-15 of tmpd are zero'ed
1862*c0909341SAndroid Build Coastguard Worker%endif
    ; Seed step for both 16-bit halves at once. OR-ing with 0xeff4 forces every
    ; bit except 0, 1, 3 and 12 of each half to one, so the parity of
    ; (low byte AND high byte) depends only on those four bits.
1863*c0909341SAndroid Build Coastguard Worker    mov             r6d, seed               ; keep an unmodified copy of both seeds
1864*c0909341SAndroid Build Coastguard Worker    or             seed, 0xeff4eff4
1865*c0909341SAndroid Build Coastguard Worker    test           seeb, seeh
1866*c0909341SAndroid Build Coastguard Worker    setp           tmpb                     ; parity of top_seed
1867*c0909341SAndroid Build Coastguard Worker    shr            seed, 16                 ; bring the upper half (cur_seed) down for the second parity test
1868*c0909341SAndroid Build Coastguard Worker    shl            tmpd, 16                 ; park the top_seed parity in bit 16
1869*c0909341SAndroid Build Coastguard Worker    test           seeb, seeh
1870*c0909341SAndroid Build Coastguard Worker    setp           tmpb                     ; parity of cur_seed
1871*c0909341SAndroid Build Coastguard Worker    or              r6d, 0x00010001         ; force bit 0 of each half to one so the xor below yields the inverted parity there
1872*c0909341SAndroid Build Coastguard Worker    xor            tmpd, r6d                ; bit 0 / bit 16 now hold the new bits; remaining bits are the old seeds
1873*c0909341SAndroid Build Coastguard Worker    mov            seed, tmpd
1874*c0909341SAndroid Build Coastguard Worker    ror            seed, 1                  ; updated (cur_seed << 16) | top_seed
1875*c0909341SAndroid Build Coastguard Worker
1876*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
1877*c0909341SAndroid Build Coastguard Worker    mov             r3m, seed               ; write the advanced seeds back to their slot
1878*c0909341SAndroid Build Coastguard Worker
1879*c0909341SAndroid Build Coastguard Worker    DEFINE_ARGS dst, src, scaling, offy, unused1, unused2, offx
1880*c0909341SAndroid Build Coastguard Worker
1881*c0909341SAndroid Build Coastguard Worker    mov           offxd, offyd              ; offy is the register that was just named see, so it already holds the seeds
1882*c0909341SAndroid Build Coastguard Worker%else
1883*c0909341SAndroid Build Coastguard Worker    DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \
1884*c0909341SAndroid Build Coastguard Worker                offx, offy, see, left_offxy, top_offxy, topleft_offxy
1885*c0909341SAndroid Build Coastguard Worker
1886*c0909341SAndroid Build Coastguard Worker    lea  topleft_offxyq, [top_offxyq+16]    ; previous column's top_offxy + 16
1887*c0909341SAndroid Build Coastguard Worker    lea     left_offxyq, [offyq+16]         ; offy shares its register with offxy of the loop's DEFINE_ARGS, so this is the previous offxy + 16
1888*c0909341SAndroid Build Coastguard Worker    mov           offyd, seed
1889*c0909341SAndroid Build Coastguard Worker    mov           offxd, seed
1890*c0909341SAndroid Build Coastguard Worker%endif
1891*c0909341SAndroid Build Coastguard Worker    ror           offyd, 8                  ; bits 8-11 of each seed half -> bits 0-3 / 16-19
1892*c0909341SAndroid Build Coastguard Worker    ror           offxd, 12                 ; bits 12-15 of each seed half -> bits 0-3 / 16-19
1893*c0909341SAndroid Build Coastguard Worker    and           offyd, 0xf000f
1894*c0909341SAndroid Build Coastguard Worker    and           offxd, 0xf000f
1895*c0909341SAndroid Build Coastguard Worker    imul          offyd, 164                ; 164 = 2*82, i.e. two grain_lut rows per offy unit
1896*c0909341SAndroid Build Coastguard Worker    ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy
    ; 0x10001*747 adds 747 (= 9*82 + 9) to both halves; 32*82 lands in the low
    ; (top) half only, i.e. the top block is read 32 rows further down.
1897*c0909341SAndroid Build Coastguard Worker    lea           offyq, [offyq+offxq*2+0x10001*747+32*82]
1898*c0909341SAndroid Build Coastguard Worker
1899*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
1900*c0909341SAndroid Build Coastguard Worker    DEFINE_ARGS dst, src, scaling, offxy, h, picptr, grain_lut
1901*c0909341SAndroid Build Coastguard Worker
1902*c0909341SAndroid Build Coastguard Worker    movzx            r5, offxyw             ; top_offxy
1903*c0909341SAndroid Build Coastguard Worker    mov [rsp+5*mmsize+1*gprsize], r5        ; top_offxy is stack-resident on x86-32
1904*c0909341SAndroid Build Coastguard Worker%else
1905*c0909341SAndroid Build Coastguard Worker    DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \
1906*c0909341SAndroid Build Coastguard Worker                h, offxy, see, left_offxy, top_offxy, topleft_offxy
1907*c0909341SAndroid Build Coastguard Worker
1908*c0909341SAndroid Build Coastguard Worker    movzx    top_offxyd, offxyw             ; low half = top_offxy
1909*c0909341SAndroid Build Coastguard Worker%endif
1910*c0909341SAndroid Build Coastguard Worker    shr          offxyd, 16                 ; high half = offset into the current block
1911*c0909341SAndroid Build Coastguard Worker
1912*c0909341SAndroid Build Coastguard Worker    mov              hd, r7m                ; reload the row counter from the r7m slot
1913*c0909341SAndroid Build Coastguard Worker    mov      grain_lutq, grain_lutmp        ; restart at the first grain_lut row for this column
1914*c0909341SAndroid Build Coastguard Worker.loop_y_hv_overlap:                         ; one row per iteration: blend horizontally, then vertically, then apply noise
1915*c0909341SAndroid Build Coastguard Worker    ; grain = grain_lut[offy+y][offx+x]
1916*c0909341SAndroid Build Coastguard Worker    movu             m3, [grain_lutq+offxyq] ; cur
1917*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
1918*c0909341SAndroid Build Coastguard Worker    mov              r5, [rsp+5*mmsize+1*gprsize]   ; top_offxy
1919*c0909341SAndroid Build Coastguard Worker    mov              r0, [rsp+5*mmsize+0*gprsize]   ; left_offxy
1920*c0909341SAndroid Build Coastguard Worker    movu             m6, [grain_lutq+r5]    ; top
1921*c0909341SAndroid Build Coastguard Worker    mov              r5, [rsp+5*mmsize+2*gprsize]   ; topleft_offxy
1922*c0909341SAndroid Build Coastguard Worker    movd             m4, [grain_lutq+r0]    ; left: only 4 bytes are loaded
1923*c0909341SAndroid Build Coastguard Worker    movd             m7, [grain_lutq+r5]    ; topleft: only 4 bytes are loaded
1924*c0909341SAndroid Build Coastguard Worker%else
1925*c0909341SAndroid Build Coastguard Worker    movu             m6, [grain_lutq+top_offxyq]     ; top
1926*c0909341SAndroid Build Coastguard Worker    movd             m4, [grain_lutq+left_offxyq]    ; left: only 4 bytes are loaded
1927*c0909341SAndroid Build Coastguard Worker    movd             m7, [grain_lutq+topleft_offxyq] ; topleft: only 4 bytes are loaded
1928*c0909341SAndroid Build Coastguard Worker%endif
1929*c0909341SAndroid Build Coastguard Worker    ; do h interpolation first (so top | top/left -> top, left | cur -> cur)
1930*c0909341SAndroid Build Coastguard Worker    punpcklbw        m4, m3                 ; (left, cur) byte pairs; pairs past the 4 loaded bytes are (0, cur)
1931*c0909341SAndroid Build Coastguard Worker    punpcklbw        m7, m6                 ; (topleft, top) byte pairs
1932*c0909341SAndroid Build Coastguard Worker    pmaddubsw        m2, m15, m4            ; m15 = horizontal weights, loaded outside this view (presumably pass-through for the (0, x) pairs - TODO confirm)
1933*c0909341SAndroid Build Coastguard Worker    pmaddubsw        m4, m15, m7
1934*c0909341SAndroid Build Coastguard Worker    pmulhrsw         m2, m14                ; m14 is loaded outside this view - presumably the rounding multiplier for the weight scale
1935*c0909341SAndroid Build Coastguard Worker    pmulhrsw         m4, m14
1936*c0909341SAndroid Build Coastguard Worker    packsswb         m2, m2
1937*c0909341SAndroid Build Coastguard Worker    packsswb         m4, m4
1938*c0909341SAndroid Build Coastguard Worker    shufps           m2, m3, q3210          ; low 8 bytes: h-blended cur, high 8 bytes: untouched cur
1939*c0909341SAndroid Build Coastguard Worker    shufps           m4, m6, q3210          ; low 8 bytes: h-blended top, high 8 bytes: untouched top
1940*c0909341SAndroid Build Coastguard Worker    ; followed by v interpolation (top | cur -> cur)
1941*c0909341SAndroid Build Coastguard Worker    punpcklbw        m3, m4, m2             ; (top, cur) byte pairs, low half
1942*c0909341SAndroid Build Coastguard Worker    punpckhbw        m4, m2                 ; (top, cur) byte pairs, high half
1943*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
1944*c0909341SAndroid Build Coastguard Worker    mov              r5, [rsp+5*mmsize+12]  ; current row's vertical weight pointer
1945*c0909341SAndroid Build Coastguard Worker    pmaddubsw        m7, [r5], m4           ; weights are the unsigned operand, grain the signed one
1946*c0909341SAndroid Build Coastguard Worker    pmaddubsw        m4, [r5], m3
1947*c0909341SAndroid Build Coastguard Worker%else
1948*c0909341SAndroid Build Coastguard Worker    pmaddubsw        m7, m8, m4             ; weights are the unsigned operand, grain the signed one
1949*c0909341SAndroid Build Coastguard Worker    pmaddubsw        m4, m8, m3
1950*c0909341SAndroid Build Coastguard Worker%endif
1951*c0909341SAndroid Build Coastguard Worker    pmulhrsw         m7, m14
1952*c0909341SAndroid Build Coastguard Worker    pmulhrsw         m4, m14
1953*c0909341SAndroid Build Coastguard Worker    packsswb         m4, m7                 ; saturate to signed bytes: low half from m4, high half from m7
1954*c0909341SAndroid Build Coastguard Worker    pxor             m2, m2                 ; m2 was used as a temporary above; zero it for the sign mask and unpacking
1955*c0909341SAndroid Build Coastguard Worker    pcmpgtb          m7, m2, m4             ; m7 = 0xff where the blended grain is negative
1956*c0909341SAndroid Build Coastguard Worker    punpcklbw        m3, m4, m7             ; sign-extend blended grain to words (low half -> m3)
1957*c0909341SAndroid Build Coastguard Worker    punpckhbw        m4, m7                 ; sign-extend blended grain to words (high half -> m4)
1958*c0909341SAndroid Build Coastguard Worker
1959*c0909341SAndroid Build Coastguard Worker    ; src
1960*c0909341SAndroid Build Coastguard Worker    mova             m0, [srcq]             ; aligned load of one vector of source pixels
1961*c0909341SAndroid Build Coastguard Worker    punpckhbw        m1, m0, m2
1962*c0909341SAndroid Build Coastguard Worker    punpcklbw        m0, m2                 ; m0-1: src as word
1963*c0909341SAndroid Build Coastguard Worker
1964*c0909341SAndroid Build Coastguard Worker    ; scaling[src]
    ; vpgatherdw is a macro defined outside this view; the two GPRs and m7 are
    ; passed as temporaries (presumably clobbered - TODO confirm).
1965*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
1966*c0909341SAndroid Build Coastguard Worker    vpgatherdw       m5, m0, scalingq-1, r0, r5, m7
1967*c0909341SAndroid Build Coastguard Worker    vpgatherdw       m6, m1, scalingq-1, r0, r5, m7
1968*c0909341SAndroid Build Coastguard Worker%else
1969*c0909341SAndroid Build Coastguard Worker    vpgatherdw       m5, m0, scalingq-1, r13, r14, m7
1970*c0909341SAndroid Build Coastguard Worker    vpgatherdw       m6, m1, scalingq-1, r13, r14, m7
1971*c0909341SAndroid Build Coastguard Worker%endif
1972*c0909341SAndroid Build Coastguard Worker    REPX {psrlw x, 8}, m5, m6               ; base is scalingq-1, so presumably the wanted byte sits in the high half of each word; shift it down
1973*c0909341SAndroid Build Coastguard Worker
1974*c0909341SAndroid Build Coastguard Worker    ; noise = round2(scaling[src] * grain, scaling_shift)
1975*c0909341SAndroid Build Coastguard Worker    pmullw           m3, m5
1976*c0909341SAndroid Build Coastguard Worker    pmullw           m4, m6
1977*c0909341SAndroid Build Coastguard Worker    pmulhrsw         m3, m11                ; m11 is loaded outside this view - presumably the scaling_shift multiplier
1978*c0909341SAndroid Build Coastguard Worker    pmulhrsw         m4, m11
1979*c0909341SAndroid Build Coastguard Worker
1980*c0909341SAndroid Build Coastguard Worker    ; dst = clip_pixel(src, noise)
1981*c0909341SAndroid Build Coastguard Worker    paddw            m0, m3
1982*c0909341SAndroid Build Coastguard Worker    paddw            m1, m4
1983*c0909341SAndroid Build Coastguard Worker    pmaxsw           m0, m13                ; clamp from below with m13 (set outside this view)
1984*c0909341SAndroid Build Coastguard Worker    pmaxsw           m1, m13
1985*c0909341SAndroid Build Coastguard Worker    pminsw           m0, m12                ; clamp from above with m12 (set outside this view)
1986*c0909341SAndroid Build Coastguard Worker    pminsw           m1, m12
1987*c0909341SAndroid Build Coastguard Worker    packuswb         m0, m1                 ; back to unsigned bytes
1988*c0909341SAndroid Build Coastguard Worker    movifnidn      dstq, dstmp              ; x86inc helper: emits a load only when dstq and dstmp are not the same operand
1989*c0909341SAndroid Build Coastguard Worker    mova    [dstq+srcq], m0                 ; NOTE(review): dst is addressed relative to src - presumably dstq holds dst-src; confirm in the prologue
1990*c0909341SAndroid Build Coastguard Worker
1991*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
1992*c0909341SAndroid Build Coastguard Worker    add dword [rsp+5*mmsize+12], mmsize     ; step to the next weight vector; presumably pb_17_27 directly follows pb_27_17 in memory
1993*c0909341SAndroid Build Coastguard Worker%else
1994*c0909341SAndroid Build Coastguard Worker    mova             m8, [pb_17_27]         ; second-row weights
1995*c0909341SAndroid Build Coastguard Worker%endif
1996*c0909341SAndroid Build Coastguard Worker    add            srcq, r2mp               ; next source row (adds the value in the r2 argument slot)
1997*c0909341SAndroid Build Coastguard Worker    add      grain_lutq, 82                 ; next grain_lut row
1998*c0909341SAndroid Build Coastguard Worker    dec              hw                     ; 16-bit decrement: leaves bit 16 of hd (the toggle below) untouched
1999*c0909341SAndroid Build Coastguard Worker    jz .end_y_hv_overlap
2000*c0909341SAndroid Build Coastguard Worker    ; 2 lines get vertical overlap, then fall back to non-overlap code for
2001*c0909341SAndroid Build Coastguard Worker    ; remaining (up to) 30 lines
2002*c0909341SAndroid Build Coastguard Worker    btc              hd, 16                 ; CF = old bit 16, then flip it: CF is clear after row 0 and set after row 1
2003*c0909341SAndroid Build Coastguard Worker    jnc .loop_y_hv_overlap
2004*c0909341SAndroid Build Coastguard Worker    jmp .loop_y_h_overlap                   ; .loop_y_h_overlap is defined outside this view
2005*c0909341SAndroid Build Coastguard Worker
2006*c0909341SAndroid Build Coastguard Worker.end_y_hv_overlap:                          ; column finished: advance 16 pixels; the next column always takes the v-overlap-only path
2007*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
2008*c0909341SAndroid Build Coastguard Worker    add            r4mp, 16                 ; x86-32 keeps the column counter in the r4 argument slot
2009*c0909341SAndroid Build Coastguard Worker%else
2010*c0909341SAndroid Build Coastguard Worker    add              wq, 16
2011*c0909341SAndroid Build Coastguard Worker%endif
2012*c0909341SAndroid Build Coastguard Worker    jge .end_hv                             ; uses the flags of the add above: stop once the counter is no longer negative
2013*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
2014*c0909341SAndroid Build Coastguard Worker    mov            srcq, r1m                ; rebuild src for the new column from the value saved in the r1 slot
2015*c0909341SAndroid Build Coastguard Worker    add            srcq, r4m                ; ... plus the updated column counter
2016*c0909341SAndroid Build Coastguard Worker%else
2017*c0909341SAndroid Build Coastguard Worker    lea            srcq, [src_bakq+wq]      ; rebuild src for the new column
2018*c0909341SAndroid Build Coastguard Worker%endif
2019*c0909341SAndroid Build Coastguard Worker    xor       dword r8m, 4                  ; flip bit 2 of the flag word in r8m (the bit that .end_y_v_overlap tests with btc)
2020*c0909341SAndroid Build Coastguard Worker    add          offxyd, 16                 ; same grain block, 16 columns further right
2021*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
2022*c0909341SAndroid Build Coastguard Worker    add dword [rsp+5*mmsize+1*gprsize], 16  ; top_offxy += 16 (stack-resident on x86-32)
2023*c0909341SAndroid Build Coastguard Worker%else
2024*c0909341SAndroid Build Coastguard Worker    add      top_offxyd, 16
2025*c0909341SAndroid Build Coastguard Worker%endif
2026*c0909341SAndroid Build Coastguard Worker    jmp .loop_x_odd_v_overlap
2027*c0909341SAndroid Build Coastguard Worker
2028*c0909341SAndroid Build Coastguard Worker.end_hv:                                    ; common exit for both overlap column loops
2029*c0909341SAndroid Build Coastguard Worker    RET                                     ; x86inc return macro (epilogue is generated by x86inc)
2030*c0909341SAndroid Build Coastguard Worker
2031*c0909341SAndroid Build Coastguard Worker%macro FGUV_FN 3 ; name, ss_hor, ss_ver
    ; Emits fguv_32x32xn_i<name>_8bpc. The entry sequence below captures the
    ; arguments, sets up the constant-table base, and broadcasts the
    ; scaling-shift multiplier and the clip bounds into m11/m12/m13 slots.
2032*c0909341SAndroid Build Coastguard WorkerINIT_XMM ssse3
2033*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
2034*c0909341SAndroid Build Coastguard Worker; fguv_32x32xn_i420_ssse3(dst, src, stride, fg_data, w, scaling, grain_lut, h,
2035*c0909341SAndroid Build Coastguard Worker;                         sby, luma, lstride, uv_pl, is_id)
2036*c0909341SAndroid Build Coastguard Worker%if STACK_ALIGNMENT < mmsize
    ; In this configuration nine arguments are copied into this function's own
    ; frame, and the %defines further down re-point r0m..r12m at those slots.
    ; NOTE(review): the negative stack size is an x86inc convention - confirm
    ; its exact semantics in x86inc.asm.
2037*c0909341SAndroid Build Coastguard WorkerDECLARE_ARG 0, 1, 2, 3, 4, 5, 6, 7, 8
2038*c0909341SAndroid Build Coastguard Workercglobal fguv_32x32xn_i%1_8bpc, 0, 7, 8, 0 - (7 * mmsize + (13 + 3) * gprsize), \
2039*c0909341SAndroid Build Coastguard Worker        tmp, src, scaling, h, fg_data, picptr, unused
2040*c0909341SAndroid Build Coastguard Worker    mov              r0, r0m                ; dst
2041*c0909341SAndroid Build Coastguard Worker    mov              r1, r2m                ; stride
2042*c0909341SAndroid Build Coastguard Worker    mov              r2, r4m                ; w
2043*c0909341SAndroid Build Coastguard Worker    mov              r3, r6m                ; grain_lut
2044*c0909341SAndroid Build Coastguard Worker    mov              r4, r7m                ; h
2045*c0909341SAndroid Build Coastguard Worker    mov [rsp+7*mmsize+3*gprsize], r0        ; -> local r0m
2046*c0909341SAndroid Build Coastguard Worker    mov [rsp+7*mmsize+5*gprsize], r1        ; -> local r2m
2047*c0909341SAndroid Build Coastguard Worker    mov [rsp+7*mmsize+7*gprsize], r2        ; -> local r4m
2048*c0909341SAndroid Build Coastguard Worker    mov [rsp+7*mmsize+9*gprsize], r3        ; -> local r6m
2049*c0909341SAndroid Build Coastguard Worker    mov [rsp+7*mmsize+10*gprsize], r4       ; -> local r7m
2050*c0909341SAndroid Build Coastguard Worker
2051*c0909341SAndroid Build Coastguard Worker    mov              r0, r8m                ; sby
2052*c0909341SAndroid Build Coastguard Worker    mov              r1, r9m                ; luma
2053*c0909341SAndroid Build Coastguard Worker    mov              r2, r10m               ; lstride
2054*c0909341SAndroid Build Coastguard Worker    mov              r4, r11m               ; uv_pl
2055*c0909341SAndroid Build Coastguard Worker    mov              r3, r12m               ; is_id: not copied to a slot, stays in r3 for the `test r3, r3` below
2056*c0909341SAndroid Build Coastguard Worker    mov [rsp+7*mmsize+11*gprsize], r0       ; -> local r8m
2057*c0909341SAndroid Build Coastguard Worker    mov [rsp+7*mmsize+12*gprsize], r1       ; -> local r9m
2058*c0909341SAndroid Build Coastguard Worker    mov [rsp+7*mmsize+13*gprsize], r2       ; -> local r10m
2059*c0909341SAndroid Build Coastguard Worker    mov [rsp+7*mmsize+14*gprsize], r4       ; -> local r11m
2060*c0909341SAndroid Build Coastguard Worker%else
2061*c0909341SAndroid Build Coastguard Workercglobal fguv_32x32xn_i%1_8bpc, 0, 7, 8, 7 * mmsize + (4) * gprsize, \
2062*c0909341SAndroid Build Coastguard Worker        tmp, src, scaling, h, fg_data, picptr, unused
2063*c0909341SAndroid Build Coastguard Worker%endif
    ; These three loads are assembled before r1m/r3m/r5m are redefined below,
    ; so they go through x86inc's own argument definitions.
2064*c0909341SAndroid Build Coastguard Worker    mov            srcq, srcm
2065*c0909341SAndroid Build Coastguard Worker    mov        fg_dataq, r3m
2066*c0909341SAndroid Build Coastguard Worker    mov        scalingq, r5m
2067*c0909341SAndroid Build Coastguard Worker%if STACK_ALIGNMENT < mmsize
    ; From here on rNm names the local slot at index N+3. Slots for r1m, r3m,
    ; r5m and r12m were not filled by the copies above.
2068*c0909341SAndroid Build Coastguard Worker%define r0m [rsp+7*mmsize+ 3*gprsize]
2069*c0909341SAndroid Build Coastguard Worker%define r1m [rsp+7*mmsize+ 4*gprsize]
2070*c0909341SAndroid Build Coastguard Worker%define r2m [rsp+7*mmsize+ 5*gprsize]
2071*c0909341SAndroid Build Coastguard Worker%define r3m [rsp+7*mmsize+ 6*gprsize]
2072*c0909341SAndroid Build Coastguard Worker%define r4m [rsp+7*mmsize+ 7*gprsize]
2073*c0909341SAndroid Build Coastguard Worker%define r5m [rsp+7*mmsize+ 8*gprsize]
2074*c0909341SAndroid Build Coastguard Worker%define r6m [rsp+7*mmsize+ 9*gprsize]
2075*c0909341SAndroid Build Coastguard Worker%define r7m [rsp+7*mmsize+10*gprsize]
2076*c0909341SAndroid Build Coastguard Worker%define r8m [rsp+7*mmsize+11*gprsize]
2077*c0909341SAndroid Build Coastguard Worker%define r9m [rsp+7*mmsize+12*gprsize]
2078*c0909341SAndroid Build Coastguard Worker%define r10m [rsp+7*mmsize+13*gprsize]
2079*c0909341SAndroid Build Coastguard Worker%define r11m [rsp+7*mmsize+14*gprsize]
2080*c0909341SAndroid Build Coastguard Worker%define r12m [rsp+7*mmsize+15*gprsize]
2081*c0909341SAndroid Build Coastguard Worker%endif
2082*c0909341SAndroid Build Coastguard Worker    LEA              r5, pb_mask            ; r5 = address of pb_mask (LEA is a macro defined outside this view)
2083*c0909341SAndroid Build Coastguard Worker%define base r5-pb_mask
2084*c0909341SAndroid Build Coastguard Worker    mov             r5m, r5                 ; keep the table base in the r5m slot (scaling was already read from it above)
2085*c0909341SAndroid Build Coastguard Worker%else
2086*c0909341SAndroid Build Coastguard Workercglobal fguv_32x32xn_i%1_8bpc, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \
2087*c0909341SAndroid Build Coastguard Worker                                     grain_lut, tmp, sby, luma, lstride, uv_pl, is_id
2088*c0909341SAndroid Build Coastguard Worker    lea              r8, [pb_mask]          ; r8 = address of pb_mask, the anchor for [base+...] operands
2089*c0909341SAndroid Build Coastguard Worker%define base r8-pb_mask
2090*c0909341SAndroid Build Coastguard Worker%endif
2091*c0909341SAndroid Build Coastguard Worker    mov             r6d, [fg_dataq+FGData.scaling_shift]
2092*c0909341SAndroid Build Coastguard Worker    movd             m3, [base+mul_bits+r6*2-14] ; word entry mul_bits[scaling_shift - 7]
2093*c0909341SAndroid Build Coastguard Worker    mov             r6d, [fg_dataq+FGData.clip_to_restricted_range]
2094*c0909341SAndroid Build Coastguard Worker    lea            tmpd, [r6d*2]            ; tmp = 2 * clip flag, the alternative index into max
2095*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32 && STACK_ALIGNMENT < mmsize
2096*c0909341SAndroid Build Coastguard Worker    test             r3, r3                 ; is_id was left in r3 by the argument copy above
2097*c0909341SAndroid Build Coastguard Worker%else
2098*c0909341SAndroid Build Coastguard Worker    cmp      dword r12m, 0                      ; is_idm
2099*c0909341SAndroid Build Coastguard Worker%endif
2100*c0909341SAndroid Build Coastguard Worker    movd             m5, [base+min+r6*2]    ; min[clip]; movd leaves the flags from the is_id test intact
2101*c0909341SAndroid Build Coastguard Worker    cmovne          r6d, tmpd               ; is_id != 0: index max with 2*clip instead of clip
2102*c0909341SAndroid Build Coastguard Worker    movd             m4, [base+max+r6*2]
2103*c0909341SAndroid Build Coastguard Worker    punpcklwd        m3, m3                 ; duplicate the low word, then ...
2104*c0909341SAndroid Build Coastguard Worker    punpcklwd        m5, m5
2105*c0909341SAndroid Build Coastguard Worker    punpcklwd        m4, m4
2106*c0909341SAndroid Build Coastguard Worker    pshufd           m3, m3, q0000          ; ... broadcast it to every word lane
2107*c0909341SAndroid Build Coastguard Worker    pshufd           m5, m5, q0000
2108*c0909341SAndroid Build Coastguard Worker    pshufd           m4, m4, q0000
    ; SCRATCH is a macro defined outside this view; presumably it makes the
    ; value available under the second register name (TODO confirm).
2109*c0909341SAndroid Build Coastguard Worker    SCRATCH           3, 11, 0
2110*c0909341SAndroid Build Coastguard Worker    SCRATCH           4, 12, 1
2111*c0909341SAndroid Build Coastguard Worker    SCRATCH           5, 13, 2
2112*c0909341SAndroid Build Coastguard Worker
2113*c0909341SAndroid Build Coastguard Worker    cmp byte [fg_dataq+FGData.chroma_scaling_from_luma], 0
2114*c0909341SAndroid Build Coastguard Worker    jne .csfl                               ; .csfl is defined outside this view
2115*c0909341SAndroid Build Coastguard Worker
2116*c0909341SAndroid Build Coastguard Worker%macro %%FGUV_32x32xN_LOOP 3 ; not-csfl, ss_hor, ss_ver
    ; Setup that runs before the column loop: optional per-plane multipliers,
    ; overlap weight constants, the per-row-of-blocks seed, and base pointers.
    ; DEFINE_ARGS only renames registers (x86inc); it emits no instructions.
2117*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
2118*c0909341SAndroid Build Coastguard Worker    DEFINE_ARGS dst, src, scaling, sby, fg_data, picptr, overlap
2119*c0909341SAndroid Build Coastguard Worker%else
2120*c0909341SAndroid Build Coastguard Worker    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, unused, sby, see, overlap
2121*c0909341SAndroid Build Coastguard Worker%endif
2122*c0909341SAndroid Build Coastguard Worker
2123*c0909341SAndroid Build Coastguard Worker%if %1
2124*c0909341SAndroid Build Coastguard Worker    mov             r6d, dword r11m         ; uv_pl (12th argument) indexes the per-plane 4-byte entries below
2125*c0909341SAndroid Build Coastguard Worker    movd             m0, [fg_dataq+FGData.uv_mult+r6*4]
2126*c0909341SAndroid Build Coastguard Worker    movd             m1, [fg_dataq+FGData.uv_luma_mult+r6*4]
2127*c0909341SAndroid Build Coastguard Worker    punpcklbw        m6, m1, m0             ; byte pairs (uv_luma_mult, uv_mult)
2128*c0909341SAndroid Build Coastguard Worker    movd             m7, [fg_dataq+FGData.uv_offset+r6*4]
2129*c0909341SAndroid Build Coastguard Worker    punpcklwd        m6, m6                 ; duplicate the low word, then ...
2130*c0909341SAndroid Build Coastguard Worker    punpcklwd        m7, m7
2131*c0909341SAndroid Build Coastguard Worker    pshufd           m6, m6, q0000          ; ... broadcast it to every word lane
2132*c0909341SAndroid Build Coastguard Worker    pshufd           m7, m7, q0000
2133*c0909341SAndroid Build Coastguard Worker    SCRATCH           6, 14, 3
2134*c0909341SAndroid Build Coastguard Worker    SCRATCH           7, 15, 4
2135*c0909341SAndroid Build Coastguard Worker%endif
2136*c0909341SAndroid Build Coastguard Worker
2137*c0909341SAndroid Build Coastguard Worker    mov            sbyd, r8m
2138*c0909341SAndroid Build Coastguard Worker    mov        overlapd, [fg_dataq+FGData.overlap_flag] ; left_overlap: overlap & 1
2139*c0909341SAndroid Build Coastguard Worker    test       overlapd, overlapd
2140*c0909341SAndroid Build Coastguard Worker    jz %%no_vertical_overlap                ; overlap disabled: the weight constants below are not loaded
2141*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
2142*c0909341SAndroid Build Coastguard Worker%if %2
2143*c0909341SAndroid Build Coastguard Worker    mova             m1, [base+pb_23_22_h]  ; weight table used when ss_hor is set
2144*c0909341SAndroid Build Coastguard Worker%else
2145*c0909341SAndroid Build Coastguard Worker    mova             m1, [base+pb_27_17_17_27] ; weight table used when ss_hor is clear
2146*c0909341SAndroid Build Coastguard Worker%endif
2147*c0909341SAndroid Build Coastguard Worker    mova             m0, [base+pw_1024]
2148*c0909341SAndroid Build Coastguard Worker%else
2149*c0909341SAndroid Build Coastguard Worker%if %2
2150*c0909341SAndroid Build Coastguard Worker    mova             m1, [pb_23_22_h]       ; weight table used when ss_hor is set
2151*c0909341SAndroid Build Coastguard Worker%else
2152*c0909341SAndroid Build Coastguard Worker    mova             m1, [pb_27_17_17_27]   ; weight table used when ss_hor is clear
2153*c0909341SAndroid Build Coastguard Worker%endif
2154*c0909341SAndroid Build Coastguard Worker    mova             m0, [pw_1024]
2155*c0909341SAndroid Build Coastguard Worker%endif
2156*c0909341SAndroid Build Coastguard Worker    SCRATCH           0, 8, 5
2157*c0909341SAndroid Build Coastguard Worker    SCRATCH           1, 9, 6
2158*c0909341SAndroid Build Coastguard Worker    test           sbyd, sbyd
2159*c0909341SAndroid Build Coastguard Worker    jnz %%vertical_overlap                  ; sby != 0 takes the vertical-overlap path (defined outside this view)
2160*c0909341SAndroid Build Coastguard Worker    ; fall-through
2161*c0909341SAndroid Build Coastguard Worker
2162*c0909341SAndroid Build Coastguard Worker%%no_vertical_overlap:
2163*c0909341SAndroid Build Coastguard Worker    mov             r8m, overlapd           ; sby is already in a register, so its r8m slot is reused for the overlap flag
    ; The next four instructions compute, in one 32-bit register,
    ;   (((sby*37 + 178) & 0xff) << 8) | ((sby*173 + 105) & 0xff)
    ; which is then xor-ed with fg_data->seed.
2164*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
2165*c0909341SAndroid Build Coastguard Worker    DEFINE_ARGS dst, src, scaling, see, fg_data, picptr, overlap
2166*c0909341SAndroid Build Coastguard Worker    imul           seed, (173 << 24) | 37   ; see is the register that was named sby, so this multiplies sby in place
2167*c0909341SAndroid Build Coastguard Worker%else
2168*c0909341SAndroid Build Coastguard Worker    imul           seed, sbyd, (173 << 24) | 37
2169*c0909341SAndroid Build Coastguard Worker%endif
2170*c0909341SAndroid Build Coastguard Worker    add            seed, (105 << 24) | 178
2171*c0909341SAndroid Build Coastguard Worker    rol            seed, 8                  ; top byte -> bits 0-7, low byte -> bits 8-15
2172*c0909341SAndroid Build Coastguard Worker    movzx          seed, seew               ; keep only those 16 bits
2173*c0909341SAndroid Build Coastguard Worker    xor            seed, [fg_dataq+FGData.seed]
2174*c0909341SAndroid Build Coastguard Worker
2175*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
2176*c0909341SAndroid Build Coastguard Worker    mov             r3m, seed               ; the r3m slot holds the seed from here on (fg_data was read from it earlier)
2177*c0909341SAndroid Build Coastguard Worker
2178*c0909341SAndroid Build Coastguard Worker    DEFINE_ARGS luma, src, scaling, see, w, picptr, src_bak
2179*c0909341SAndroid Build Coastguard Worker%define luma_bakq lumaq
2180*c0909341SAndroid Build Coastguard Worker
2181*c0909341SAndroid Build Coastguard Worker    mov              wq, r4m
2182*c0909341SAndroid Build Coastguard Worker%if %3
2183*c0909341SAndroid Build Coastguard Worker    shl           r10mp, 1                  ; ss_ver: double lstride in its slot
2184*c0909341SAndroid Build Coastguard Worker%endif
2185*c0909341SAndroid Build Coastguard Worker%else
2186*c0909341SAndroid Build Coastguard Worker    DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \
2187*c0909341SAndroid Build Coastguard Worker                unused2, unused3, see, overlap, unused4, src_bak, lstride, luma_bak
2188*c0909341SAndroid Build Coastguard Worker
2189*c0909341SAndroid Build Coastguard Worker    mov        lstrideq, r10mp
2190*c0909341SAndroid Build Coastguard Worker%endif
2191*c0909341SAndroid Build Coastguard Worker
2192*c0909341SAndroid Build Coastguard Worker    mov           lumaq, r9mp
2193*c0909341SAndroid Build Coastguard Worker    lea        src_bakq, [srcq+wq]          ; src_bak = src + w (end of the row)
2194*c0909341SAndroid Build Coastguard Worker    lea       luma_bakq, [lumaq+wq*(1+%2)]  ; luma end pointer; on x86-32 luma_bakq is lumaq itself, so luma is advanced in place
2195*c0909341SAndroid Build Coastguard Worker    neg              wq                     ; w becomes a negative offset from the end pointers
2196*c0909341SAndroid Build Coastguard Worker    sub            r0mp, srcq               ; dst slot now holds dst - src
2197*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
2198*c0909341SAndroid Build Coastguard Worker    mov             r1m, src_bakq           ; spill src_bak into the r1m slot
2199*c0909341SAndroid Build Coastguard Worker    mov            r11m, luma_bakq          ; spill luma_bak into the r11m slot (uv_pl was consumed above when %1 is set)
2200*c0909341SAndroid Build Coastguard Worker    mov             r4m, wq                 ; r4m slot now holds -w
2201*c0909341SAndroid Build Coastguard Worker
2202*c0909341SAndroid Build Coastguard Worker    DEFINE_ARGS tmp, src, scaling, see, unused1, picptr, unused2
2203*c0909341SAndroid Build Coastguard Worker%else
2204*c0909341SAndroid Build Coastguard Worker    mov           r11mp, src_bakq           ; spill src_bak into the r11 slot
2205*c0909341SAndroid Build Coastguard Worker    mov           r12mp, strideq            ; spill stride into the r12 slot
2206*c0909341SAndroid Build Coastguard Worker%endif
2207*c0909341SAndroid Build Coastguard Worker
2208*c0909341SAndroid Build Coastguard Worker%%loop_x:                                  ; start of a column block without overlap: step the seed and derive the grain_lut offset
2209*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
2210*c0909341SAndroid Build Coastguard Worker    mov            seed, r3m               ; x86-32 keeps the seed in the r3m slot between columns
2211*c0909341SAndroid Build Coastguard Worker%endif
2212*c0909341SAndroid Build Coastguard Worker    mov             r6d, seed
2213*c0909341SAndroid Build Coastguard Worker    or             seed, 0xEFF4            ; force every bit except 0, 1, 3 and 12 to one
2214*c0909341SAndroid Build Coastguard Worker    shr             r6d, 1                 ; r6d = old seed >> 1
2215*c0909341SAndroid Build Coastguard Worker    test           seeb, seeh              ; PF = even parity of (low byte & high byte), i.e. of old bits 0, 1, 3 and 12
2216*c0909341SAndroid Build Coastguard Worker    lea            seed, [r6+0x8000]       ; candidate with bit 15 set (lea leaves the flags from test intact)
2217*c0909341SAndroid Build Coastguard Worker    cmovp          seed, r6d               ; updated seed
2218*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
2219*c0909341SAndroid Build Coastguard Worker    mov             r3m, seed              ; write the stepped seed back for the next column
2220*c0909341SAndroid Build Coastguard Worker
2221*c0909341SAndroid Build Coastguard Worker    DEFINE_ARGS dst, src, scaling, offy, w, picptr, offx
2222*c0909341SAndroid Build Coastguard Worker
2223*c0909341SAndroid Build Coastguard Worker    mov           offxd, offyd             ; offy occupies the register that was `see`, so both now hold the seed
2224*c0909341SAndroid Build Coastguard Worker%else
2225*c0909341SAndroid Build Coastguard Worker    DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \
2226*c0909341SAndroid Build Coastguard Worker                offx, offy, see, overlap, unused1, unused2, lstride
2227*c0909341SAndroid Build Coastguard Worker
2228*c0909341SAndroid Build Coastguard Worker    mov           offyd, seed
2229*c0909341SAndroid Build Coastguard Worker    mov           offxd, seed
2230*c0909341SAndroid Build Coastguard Worker%endif
2231*c0909341SAndroid Build Coastguard Worker    ror           offyd, 8                 ; bring seed bits 8.. down to bit 0
2232*c0909341SAndroid Build Coastguard Worker    shr           offxd, 12                ; offx = seed >> 12
2233*c0909341SAndroid Build Coastguard Worker    and           offyd, 0xf               ; offy = (seed >> 8) & 15
2234*c0909341SAndroid Build Coastguard Worker    imul          offyd, 164>>%3           ; offy * 164, or * 82 when %3 is set (82 is the per-row step applied to grain_lutq)
2235*c0909341SAndroid Build Coastguard Worker    lea           offyq, [offyq+offxq*(2-%2)+(3+(6>>%3))*82+(3+(6>>%2))]  ; offy*stride+offx
2236*c0909341SAndroid Build Coastguard Worker
2237*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
2238*c0909341SAndroid Build Coastguard Worker    DEFINE_ARGS luma, src, scaling, offxy, h, picptr, grain_lut
2239*c0909341SAndroid Build Coastguard Worker%else
2240*c0909341SAndroid Build Coastguard Worker    DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \
2241*c0909341SAndroid Build Coastguard Worker                h, offxy, see, overlap, unused1, unused2, lstride, luma_bak
2242*c0909341SAndroid Build Coastguard Worker%endif
2243*c0909341SAndroid Build Coastguard Worker
2244*c0909341SAndroid Build Coastguard Worker%%loop_x_odd:                              ; process one 16-pixel wide column with the offxy already in place (no overlap blending)
2245*c0909341SAndroid Build Coastguard Worker    mov              hd, r7m               ; row counter for this column
2246*c0909341SAndroid Build Coastguard Worker    mov      grain_lutq, grain_lutmp       ; restart at the top of the grain LUT
2247*c0909341SAndroid Build Coastguard Worker%%loop_y:                                  ; one iteration = one row of 16 output pixels
2248*c0909341SAndroid Build Coastguard Worker    ; src: load luma and chroma source for this row
2249*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
2250*c0909341SAndroid Build Coastguard Worker    mov           lumaq, r9mp              ; x86-32 keeps the running luma pointer in the r9m slot
2251*c0909341SAndroid Build Coastguard Worker%endif
2252*c0909341SAndroid Build Coastguard Worker%if %2
2253*c0909341SAndroid Build Coastguard Worker    mova             m4, [lumaq+ 0]        ; 32 luma bytes are read for 16 outputs when %2 is set
2254*c0909341SAndroid Build Coastguard Worker    mova             m6, [lumaq+16]
2255*c0909341SAndroid Build Coastguard Worker    mova             m0, [srcq]            ; 16 source pixels
2256*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
2257*c0909341SAndroid Build Coastguard Worker    add           lumaq, r10mp             ; advance luma by the value in r10m and store it back right away
2258*c0909341SAndroid Build Coastguard Worker    mov            r9mp, lumaq
2259*c0909341SAndroid Build Coastguard Worker    mov              r5, r5m               ; reload r5 for the base-relative constant access below
2260*c0909341SAndroid Build Coastguard Worker    movd             m7, [base+pb_1]
2261*c0909341SAndroid Build Coastguard Worker%else
2262*c0909341SAndroid Build Coastguard Worker    movd             m7, [pb_1]
2263*c0909341SAndroid Build Coastguard Worker%endif
2264*c0909341SAndroid Build Coastguard Worker    pshufd           m7, m7, q0000         ; broadcast the loaded dword to the whole register
2265*c0909341SAndroid Build Coastguard Worker    pxor             m2, m2                ; m2 = 0, used below for pavgw and for byte->word unpacking
2266*c0909341SAndroid Build Coastguard Worker    pmaddubsw        m4, m7                ; sum each horizontal pair of luma bytes (assuming pb_1 holds bytes of 1)
2267*c0909341SAndroid Build Coastguard Worker    pmaddubsw        m6, m7
2268*c0909341SAndroid Build Coastguard Worker    pavgw            m4, m2                ; (sum + 1) >> 1: rounded average of the pair, as words
2269*c0909341SAndroid Build Coastguard Worker    pavgw            m6, m2
2270*c0909341SAndroid Build Coastguard Worker%else
2271*c0909341SAndroid Build Coastguard Worker    mova             m4, [lumaq]           ; one luma byte per output pixel
2272*c0909341SAndroid Build Coastguard Worker    mova             m0, [srcq]
2273*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
2274*c0909341SAndroid Build Coastguard Worker    add           lumaq, r10mp
2275*c0909341SAndroid Build Coastguard Worker    mov            r9mp, lumaq
2276*c0909341SAndroid Build Coastguard Worker%endif
2277*c0909341SAndroid Build Coastguard Worker    pxor             m2, m2
2278*c0909341SAndroid Build Coastguard Worker%endif
2279*c0909341SAndroid Build Coastguard Worker
2280*c0909341SAndroid Build Coastguard Worker%if %1
2281*c0909341SAndroid Build Coastguard Worker%if %2
2282*c0909341SAndroid Build Coastguard Worker    packuswb         m4, m6                 ; luma
2283*c0909341SAndroid Build Coastguard Worker%endif
2284*c0909341SAndroid Build Coastguard Worker    punpckhbw        m6, m4, m0
2285*c0909341SAndroid Build Coastguard Worker    punpcklbw        m4, m0                 ; { luma, chroma }
2286*c0909341SAndroid Build Coastguard Worker    pmaddubsw        m6, m14                ; weighted sum of each luma/chroma pair; NOTE(review): m14 presumably holds the two multipliers - set up outside this view
2287*c0909341SAndroid Build Coastguard Worker    pmaddubsw        m4, m14
2288*c0909341SAndroid Build Coastguard Worker    psraw            m6, 6
2289*c0909341SAndroid Build Coastguard Worker    psraw            m4, 6
2290*c0909341SAndroid Build Coastguard Worker    paddw            m6, m15                ; add the word constant kept in m15
2291*c0909341SAndroid Build Coastguard Worker    paddw            m4, m15
2292*c0909341SAndroid Build Coastguard Worker    packuswb         m4, m6                 ; pack+unpack = clip
2293*c0909341SAndroid Build Coastguard Worker    punpckhbw        m6, m4, m2
2294*c0909341SAndroid Build Coastguard Worker    punpcklbw        m4, m2
2295*c0909341SAndroid Build Coastguard Worker%elif %2 == 0
2296*c0909341SAndroid Build Coastguard Worker    punpckhbw        m6, m4, m2             ; plain luma bytes -> words (with %2 set they already are words)
2297*c0909341SAndroid Build Coastguard Worker    punpcklbw        m4, m2
2298*c0909341SAndroid Build Coastguard Worker%endif
2299*c0909341SAndroid Build Coastguard Worker
2300*c0909341SAndroid Build Coastguard Worker    ; scaling[luma_src]: m4/m6 are the indices; NOTE(review): the last two vpgatherdw arguments look like scratch GPRs, which would explain the per-row reloads of lumaq/r5/dstq - confirm against the macro
2301*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
2302*c0909341SAndroid Build Coastguard Worker    vpgatherdw       m7, m4, scalingq-1, r0, r5
2303*c0909341SAndroid Build Coastguard Worker    vpgatherdw       m5, m6, scalingq-1, r0, r5
2304*c0909341SAndroid Build Coastguard Worker%else
2305*c0909341SAndroid Build Coastguard Worker    vpgatherdw       m7, m4, scalingq-1, r12, r2
2306*c0909341SAndroid Build Coastguard Worker    vpgatherdw       m5, m6, scalingq-1, r12, r2
2307*c0909341SAndroid Build Coastguard Worker%endif
2308*c0909341SAndroid Build Coastguard Worker    REPX {psrlw x, 8}, m7, m5
2309*c0909341SAndroid Build Coastguard Worker    ; the gather base is scaling-1, so the byte kept by the shift above is presumably scaling[index] itself
2310*c0909341SAndroid Build Coastguard Worker    ; unpack chroma_source
2311*c0909341SAndroid Build Coastguard Worker    punpckhbw        m1, m0, m2
2312*c0909341SAndroid Build Coastguard Worker    punpcklbw        m0, m2                 ; m0-1: src as word
2313*c0909341SAndroid Build Coastguard Worker
2314*c0909341SAndroid Build Coastguard Worker    ; grain = grain_lut[offy+y][offx+x]
2315*c0909341SAndroid Build Coastguard Worker    movu             m3, [grain_lutq+offxyq+ 0]
2316*c0909341SAndroid Build Coastguard Worker    pcmpgtb          m6, m2, m3             ; m6 = (0 > grain): 0xff for negative grain bytes (m2 is still zero)
2317*c0909341SAndroid Build Coastguard Worker    punpcklbw        m2, m3, m6             ; sign-extend the grain bytes to words: low half in m2 ...
2318*c0909341SAndroid Build Coastguard Worker    punpckhbw        m3, m6                 ; ... high half in m3
2319*c0909341SAndroid Build Coastguard Worker
2320*c0909341SAndroid Build Coastguard Worker    ; noise = round2(scaling[luma_src] * grain, scaling_shift)
2321*c0909341SAndroid Build Coastguard Worker    pmullw           m2, m7
2322*c0909341SAndroid Build Coastguard Worker    pmullw           m3, m5
2323*c0909341SAndroid Build Coastguard Worker    pmulhrsw         m2, m11                ; rounding multiply by the word constant in m11 (implements the round2 above)
2324*c0909341SAndroid Build Coastguard Worker    pmulhrsw         m3, m11
2325*c0909341SAndroid Build Coastguard Worker
2326*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
2327*c0909341SAndroid Build Coastguard Worker    DEFINE_ARGS dst, src, scaling, offxy, h, picptr, grain_lut
2328*c0909341SAndroid Build Coastguard Worker%endif
2329*c0909341SAndroid Build Coastguard Worker
2330*c0909341SAndroid Build Coastguard Worker    ; dst = clip_pixel(src, noise)
2331*c0909341SAndroid Build Coastguard Worker    paddw            m0, m2
2332*c0909341SAndroid Build Coastguard Worker    paddw            m1, m3
2333*c0909341SAndroid Build Coastguard Worker    pmaxsw           m0, m13                ; lower bound from m13
2334*c0909341SAndroid Build Coastguard Worker    pmaxsw           m1, m13
2335*c0909341SAndroid Build Coastguard Worker    pminsw           m0, m12                ; upper bound from m12
2336*c0909341SAndroid Build Coastguard Worker    pminsw           m1, m12
2337*c0909341SAndroid Build Coastguard Worker    packuswb         m0, m1
2338*c0909341SAndroid Build Coastguard Worker    movifnidn      dstq, dstmp
2339*c0909341SAndroid Build Coastguard Worker    mova    [dstq+srcq], m0                 ; the dst slot holds dst - src (see `sub r0mp, srcq` in the setup), so this addresses the dst row
2340*c0909341SAndroid Build Coastguard Worker
2341*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
2342*c0909341SAndroid Build Coastguard Worker    add            srcq, r2mp               ; next source row
2343*c0909341SAndroid Build Coastguard Worker    ; we already incremented lumaq above
2344*c0909341SAndroid Build Coastguard Worker%else
2345*c0909341SAndroid Build Coastguard Worker    add            srcq, r12mp              ; next source row; the stride was saved to r12m by the setup code
2346*c0909341SAndroid Build Coastguard Worker%if %3
2347*c0909341SAndroid Build Coastguard Worker    lea           lumaq, [lumaq+lstrideq*2] ; advance luma by two lstride steps per output row when %3 is set
2348*c0909341SAndroid Build Coastguard Worker%else
2349*c0909341SAndroid Build Coastguard Worker    add           lumaq, lstrideq
2350*c0909341SAndroid Build Coastguard Worker%endif
2351*c0909341SAndroid Build Coastguard Worker%endif
2352*c0909341SAndroid Build Coastguard Worker    add      grain_lutq, 82                 ; next grain LUT row
2353*c0909341SAndroid Build Coastguard Worker    dec              hw                     ; only the low 16 bits of h are decremented and tested
2354*c0909341SAndroid Build Coastguard Worker    jg %%loop_y
2355*c0909341SAndroid Build Coastguard Worker
2356*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
2357*c0909341SAndroid Build Coastguard Worker    DEFINE_ARGS luma, src, scaling, offxy, w, picptr, grain_lut
2358*c0909341SAndroid Build Coastguard Worker
2359*c0909341SAndroid Build Coastguard Worker    mov              wq, r4m               ; x86-32: the (negative) column counter lives in the r4m slot
2360*c0909341SAndroid Build Coastguard Worker%endif
2361*c0909341SAndroid Build Coastguard Worker    add              wq, 16                ; column finished: advance by 16 pixels
2362*c0909341SAndroid Build Coastguard Worker    jge %%end                              ; counter reached >= 0: nothing left to process
2363*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
2364*c0909341SAndroid Build Coastguard Worker    mov            srcq, r1mp              ; src_bak (saved by the setup code)
2365*c0909341SAndroid Build Coastguard Worker    mov           lumaq, r11mp             ; luma_bak; luma_bakq is an alias of lumaq on x86-32, so the lea below uses it as base
2366*c0909341SAndroid Build Coastguard Worker%else
2367*c0909341SAndroid Build Coastguard Worker    mov            srcq, r11mp             ; src_bak (saved by the setup code)
2368*c0909341SAndroid Build Coastguard Worker%endif
2369*c0909341SAndroid Build Coastguard Worker    lea           lumaq, [luma_bakq+wq*(1+%2)] ; top of the next luma column
2370*c0909341SAndroid Build Coastguard Worker    add            srcq, wq                ; top of the next source column
2371*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
2372*c0909341SAndroid Build Coastguard Worker    mov             r4m, wq
2373*c0909341SAndroid Build Coastguard Worker    mov             r9m, lumaq
2374*c0909341SAndroid Build Coastguard Worker%endif
2375*c0909341SAndroid Build Coastguard Worker%if %2 == 0
2376*c0909341SAndroid Build Coastguard Worker    ; adjust top_offxy (only for %2 == 0, where each offset serves two 16-pixel columns)
2377*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
2378*c0909341SAndroid Build Coastguard Worker    add dword [rsp+7*mmsize+1*gprsize], 16
2379*c0909341SAndroid Build Coastguard Worker%else
2380*c0909341SAndroid Build Coastguard Worker    add            r11d, 16
2381*c0909341SAndroid Build Coastguard Worker%endif
2382*c0909341SAndroid Build Coastguard Worker    add          offxyd, 16                ; move 16 grain columns to the right
2383*c0909341SAndroid Build Coastguard Worker    btc       dword r8m, 2                 ; toggle bit 2 of the flags; CF = its previous value
2384*c0909341SAndroid Build Coastguard Worker    jc %%loop_x_even                       ; bit was set: the second column is done too, so pick a new offset
2385*c0909341SAndroid Build Coastguard Worker    test      dword r8m, 2                 ; otherwise run the second column with the same seed
2386*c0909341SAndroid Build Coastguard Worker    jz %%loop_x_odd
2387*c0909341SAndroid Build Coastguard Worker    jmp %%loop_x_odd_v_overlap
2388*c0909341SAndroid Build Coastguard Worker%%loop_x_even:
2389*c0909341SAndroid Build Coastguard Worker%endif
2390*c0909341SAndroid Build Coastguard Worker    test      dword r8m, 1                 ; NOTE(review): bit 0 presumably enables horizontal overlap - confirm where overlapd is loaded
2391*c0909341SAndroid Build Coastguard Worker    jz %%loop_x
2392*c0909341SAndroid Build Coastguard Worker
2393*c0909341SAndroid Build Coastguard Worker    ; r8m = sbym slot, which holds the overlap flags here (bit 1 is set by %%vertical_overlap)
2394*c0909341SAndroid Build Coastguard Worker    test      dword r8m, 2
2395*c0909341SAndroid Build Coastguard Worker    jne %%loop_x_hv_overlap
2396*c0909341SAndroid Build Coastguard Worker
2397*c0909341SAndroid Build Coastguard Worker    ; horizontal overlap (without vertical overlap)
2398*c0909341SAndroid Build Coastguard Worker%%loop_x_h_overlap:                        ; remember the previous column's grain offset, then step the seed for the new one
2399*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
2400*c0909341SAndroid Build Coastguard Worker%if %2
2401*c0909341SAndroid Build Coastguard Worker    lea              r6, [offxyd+16]       ; previous offxy + 16
2402*c0909341SAndroid Build Coastguard Worker    mov [rsp+7*mmsize+0*gprsize], r6       ; x86-32 keeps left_offxy in this stack slot
2403*c0909341SAndroid Build Coastguard Worker%else
2404*c0909341SAndroid Build Coastguard Worker    mov [rsp+7*mmsize+0*gprsize], offxyd   ; for %2 == 0 offxy was already advanced by 16 after each column
2405*c0909341SAndroid Build Coastguard Worker%endif
2406*c0909341SAndroid Build Coastguard Worker
2407*c0909341SAndroid Build Coastguard Worker    DEFINE_ARGS luma, src, scaling, see, w, picptr, grain_lut
2408*c0909341SAndroid Build Coastguard Worker
2409*c0909341SAndroid Build Coastguard Worker    mov            seed, r3m               ; reload the seed from its stack slot
2410*c0909341SAndroid Build Coastguard Worker%else
2411*c0909341SAndroid Build Coastguard Worker    DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \
2412*c0909341SAndroid Build Coastguard Worker                offx, offy, see, left_offxy, unused1, unused2, lstride
2413*c0909341SAndroid Build Coastguard Worker    ; offy takes the argument position that offxy had in the row loops, so offyd still holds the previous offxy here
2414*c0909341SAndroid Build Coastguard Worker%if %2
2415*c0909341SAndroid Build Coastguard Worker    lea     left_offxyd, [offyd+16]         ; previous column's offy*stride+offx
2416*c0909341SAndroid Build Coastguard Worker%else
2417*c0909341SAndroid Build Coastguard Worker    mov     left_offxyd, offyd              ; for %2 == 0 the value was already advanced by 16 after each column
2418*c0909341SAndroid Build Coastguard Worker%endif
2419*c0909341SAndroid Build Coastguard Worker%endif
2420*c0909341SAndroid Build Coastguard Worker    mov             r6d, seed
2421*c0909341SAndroid Build Coastguard Worker    or             seed, 0xEFF4             ; force every bit except 0, 1, 3 and 12 to one
2422*c0909341SAndroid Build Coastguard Worker    shr             r6d, 1                  ; r6d = old seed >> 1
2423*c0909341SAndroid Build Coastguard Worker    test           seeb, seeh               ; PF = even parity of old seed bits 0, 1, 3 and 12
2424*c0909341SAndroid Build Coastguard Worker    lea            seed, [r6+0x8000]        ; candidate with bit 15 set (lea keeps the flags)
2425*c0909341SAndroid Build Coastguard Worker    cmovp          seed, r6d                ; updated seed
2426*c0909341SAndroid Build Coastguard Worker
2427*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
2428*c0909341SAndroid Build Coastguard Worker    mov             r3m, seed               ; write the stepped seed back for the next column
2429*c0909341SAndroid Build Coastguard Worker
2430*c0909341SAndroid Build Coastguard Worker    DEFINE_ARGS luma, src, scaling, offy, w, picptr, offx
2431*c0909341SAndroid Build Coastguard Worker
2432*c0909341SAndroid Build Coastguard Worker    mov          offxd, offyd               ; offy occupies the register that was `see`, so both now hold the seed
2433*c0909341SAndroid Build Coastguard Worker%else
2434*c0909341SAndroid Build Coastguard Worker    DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \
2435*c0909341SAndroid Build Coastguard Worker                offx, offy, see, left_offxy, unused1, unused2, lstride
2436*c0909341SAndroid Build Coastguard Worker
2437*c0909341SAndroid Build Coastguard Worker    mov           offyd, seed
2438*c0909341SAndroid Build Coastguard Worker    mov           offxd, seed
2439*c0909341SAndroid Build Coastguard Worker%endif
2440*c0909341SAndroid Build Coastguard Worker    ror           offyd, 8                  ; bring seed bits 8.. down to bit 0
2441*c0909341SAndroid Build Coastguard Worker    shr           offxd, 12                 ; offx = seed >> 12
2442*c0909341SAndroid Build Coastguard Worker    and           offyd, 0xf                ; offy = (seed >> 8) & 15
2443*c0909341SAndroid Build Coastguard Worker    imul          offyd, 164>>%3            ; offy * 164, or * 82 when %3 is set (82 is the per-row step applied to grain_lutq)
2444*c0909341SAndroid Build Coastguard Worker    lea           offyq, [offyq+offxq*(2-%2)+(3+(6>>%3))*82+3+(6>>%2)]  ; offy*stride+offx
2445*c0909341SAndroid Build Coastguard Worker
2446*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
2447*c0909341SAndroid Build Coastguard Worker    DEFINE_ARGS luma, src, scaling, offxy, h, picptr, grain_lut
2448*c0909341SAndroid Build Coastguard Worker%else
2449*c0909341SAndroid Build Coastguard Worker    DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \
2450*c0909341SAndroid Build Coastguard Worker                h, offxy, see, left_offxy, unused1, unused2, lstride, luma_bak
2451*c0909341SAndroid Build Coastguard Worker%endif
2452*c0909341SAndroid Build Coastguard Worker
2453*c0909341SAndroid Build Coastguard Worker    mov              hd, r7m                ; row counter for this column
2454*c0909341SAndroid Build Coastguard Worker    mov      grain_lutq, grain_lutmp        ; restart at the top of the grain LUT
2455*c0909341SAndroid Build Coastguard Worker%%loop_y_h_overlap:                        ; one row of 16 pixels; the leftmost grain values are blended with the previous column's grain
2456*c0909341SAndroid Build Coastguard Worker    ; src: load luma and chroma source for this row
2457*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
2458*c0909341SAndroid Build Coastguard Worker    mov           lumaq, r9mp              ; x86-32 keeps the running luma pointer in the r9m slot
2459*c0909341SAndroid Build Coastguard Worker%endif
2460*c0909341SAndroid Build Coastguard Worker%if %2
2461*c0909341SAndroid Build Coastguard Worker    mova             m4, [lumaq+ 0]        ; 32 luma bytes are read for 16 outputs when %2 is set
2462*c0909341SAndroid Build Coastguard Worker    mova             m6, [lumaq+16]
2463*c0909341SAndroid Build Coastguard Worker    mova             m0, [srcq]            ; 16 source pixels
2464*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
2465*c0909341SAndroid Build Coastguard Worker    add           lumaq, r10mp             ; advance luma by the value in r10m and store it back right away
2466*c0909341SAndroid Build Coastguard Worker    mov            r9mp, lumaq
2467*c0909341SAndroid Build Coastguard Worker    mov              r5, r5m               ; reload r5 for the base-relative constant access below
2468*c0909341SAndroid Build Coastguard Worker    movd             m7, [base+pb_1]
2469*c0909341SAndroid Build Coastguard Worker%else
2470*c0909341SAndroid Build Coastguard Worker    movd             m7, [pb_1]
2471*c0909341SAndroid Build Coastguard Worker%endif
2472*c0909341SAndroid Build Coastguard Worker    pshufd           m7, m7, q0000         ; broadcast the loaded dword to the whole register
2473*c0909341SAndroid Build Coastguard Worker    pxor             m2, m2                ; m2 = 0, used below for pavgw and for byte->word unpacking
2474*c0909341SAndroid Build Coastguard Worker    pmaddubsw        m4, m7                ; sum each horizontal pair of luma bytes (assuming pb_1 holds bytes of 1)
2475*c0909341SAndroid Build Coastguard Worker    pmaddubsw        m6, m7
2476*c0909341SAndroid Build Coastguard Worker    pavgw            m4, m2                ; (sum + 1) >> 1: rounded average of the pair, as words
2477*c0909341SAndroid Build Coastguard Worker    pavgw            m6, m2
2478*c0909341SAndroid Build Coastguard Worker%else
2479*c0909341SAndroid Build Coastguard Worker    mova             m4, [lumaq]           ; one luma byte per output pixel
2480*c0909341SAndroid Build Coastguard Worker    mova             m0, [srcq]
2481*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
2482*c0909341SAndroid Build Coastguard Worker    add           lumaq, r10mp
2483*c0909341SAndroid Build Coastguard Worker    mov            r9mp, lumaq
2484*c0909341SAndroid Build Coastguard Worker%endif
2485*c0909341SAndroid Build Coastguard Worker    pxor             m2, m2
2486*c0909341SAndroid Build Coastguard Worker%endif
2487*c0909341SAndroid Build Coastguard Worker
2488*c0909341SAndroid Build Coastguard Worker%if %1
2489*c0909341SAndroid Build Coastguard Worker%if %2
2490*c0909341SAndroid Build Coastguard Worker    packuswb         m4, m6                 ; luma
2491*c0909341SAndroid Build Coastguard Worker%endif
2492*c0909341SAndroid Build Coastguard Worker    punpckhbw        m6, m4, m0
2493*c0909341SAndroid Build Coastguard Worker    punpcklbw        m4, m0                 ; { luma, chroma }
2494*c0909341SAndroid Build Coastguard Worker    pmaddubsw        m6, m14                ; weighted sum of each luma/chroma pair; NOTE(review): m14 presumably holds the two multipliers - set up outside this view
2495*c0909341SAndroid Build Coastguard Worker    pmaddubsw        m4, m14
2496*c0909341SAndroid Build Coastguard Worker    psraw            m6, 6
2497*c0909341SAndroid Build Coastguard Worker    psraw            m4, 6
2498*c0909341SAndroid Build Coastguard Worker    paddw            m6, m15                ; add the word constant kept in m15
2499*c0909341SAndroid Build Coastguard Worker    paddw            m4, m15
2500*c0909341SAndroid Build Coastguard Worker    packuswb         m4, m6                 ; pack+unpack = clip
2501*c0909341SAndroid Build Coastguard Worker    punpckhbw        m6, m4, m2
2502*c0909341SAndroid Build Coastguard Worker    punpcklbw        m4, m2
2503*c0909341SAndroid Build Coastguard Worker%elif %2 == 0
2504*c0909341SAndroid Build Coastguard Worker    punpckhbw        m6, m4, m2             ; plain luma bytes -> words (with %2 set they already are words)
2505*c0909341SAndroid Build Coastguard Worker    punpcklbw        m4, m2
2506*c0909341SAndroid Build Coastguard Worker%endif
2507*c0909341SAndroid Build Coastguard Worker
2508*c0909341SAndroid Build Coastguard Worker    ; scaling[luma_src]: m4/m6 are the indices; NOTE(review): the last two vpgatherdw arguments look like scratch GPRs - confirm against the macro
2509*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
2510*c0909341SAndroid Build Coastguard Worker    vpgatherdw       m7, m4, scalingq-1, r0, r5
2511*c0909341SAndroid Build Coastguard Worker    vpgatherdw       m5, m6, scalingq-1, r0, r5
2512*c0909341SAndroid Build Coastguard Worker%else
2513*c0909341SAndroid Build Coastguard Worker    vpgatherdw       m7, m4, scalingq-1, r12, r2
2514*c0909341SAndroid Build Coastguard Worker    vpgatherdw       m5, m6, scalingq-1, r12, r2
2515*c0909341SAndroid Build Coastguard Worker%endif
2516*c0909341SAndroid Build Coastguard Worker    REPX {psrlw x, 8}, m7, m5
2517*c0909341SAndroid Build Coastguard Worker    ; the gather base is scaling-1, so the byte kept by the shift above is presumably scaling[index] itself
2518*c0909341SAndroid Build Coastguard Worker    ; unpack chroma_source
2519*c0909341SAndroid Build Coastguard Worker    punpckhbw        m1, m0, m2
2520*c0909341SAndroid Build Coastguard Worker    punpcklbw        m0, m2                 ; m0-1: src as word
2521*c0909341SAndroid Build Coastguard Worker
2522*c0909341SAndroid Build Coastguard Worker    ; grain = grain_lut[offy+y][offx+x]
2523*c0909341SAndroid Build Coastguard Worker    movu             m4, [grain_lutq+offxyq+ 0]      ; 16 grain bytes of the current column
2524*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
2525*c0909341SAndroid Build Coastguard Worker    mov              r0, [rsp+7*mmsize+0*gprsize]    ; left_offxy, stored by %%loop_x_h_overlap
2526*c0909341SAndroid Build Coastguard Worker    movd             m2, [grain_lutq+r0+ 0]          ; 4 grain bytes from the previous column's block
2527*c0909341SAndroid Build Coastguard Worker%else
2528*c0909341SAndroid Build Coastguard Worker    movd             m2, [grain_lutq+left_offxyq+ 0] ; 4 grain bytes from the previous column's block
2529*c0909341SAndroid Build Coastguard Worker%endif
2530*c0909341SAndroid Build Coastguard Worker    punpcklbw        m2, m4                 ; interleave { left, current } grain bytes
2531*c0909341SAndroid Build Coastguard Worker    pmaddubsw        m3, m9, m2             ; weighted sum per pair; NOTE(review): m9 = blend weights, m8 = rounding multiplier, both set via SCRATCH before this section - confirm
2532*c0909341SAndroid Build Coastguard Worker    pmulhrsw         m3, m8
2533*c0909341SAndroid Build Coastguard Worker    packsswb         m3, m3                 ; back to signed bytes with saturation
2534*c0909341SAndroid Build Coastguard Worker    shufps           m3, m4, q3210          ; low 8 bytes: blended result, high 8 bytes: unmodified current grain
2535*c0909341SAndroid Build Coastguard Worker    pxor             m4, m4
2536*c0909341SAndroid Build Coastguard Worker    pcmpgtb          m4, m3                 ; m4 = (0 > grain): sign mask
2537*c0909341SAndroid Build Coastguard Worker    punpcklbw        m2, m3, m4             ; sign-extend the grain bytes to words: low half in m2 ...
2538*c0909341SAndroid Build Coastguard Worker    punpckhbw        m3, m4                 ; ... high half in m3
2539*c0909341SAndroid Build Coastguard Worker
2540*c0909341SAndroid Build Coastguard Worker    ; noise = round2(scaling[luma_src] * grain, scaling_shift)
2541*c0909341SAndroid Build Coastguard Worker    pmullw           m2, m7
2542*c0909341SAndroid Build Coastguard Worker    pmullw           m3, m5
2543*c0909341SAndroid Build Coastguard Worker    pmulhrsw         m2, m11                ; rounding multiply by the word constant in m11 (implements the round2 above)
2544*c0909341SAndroid Build Coastguard Worker    pmulhrsw         m3, m11
2545*c0909341SAndroid Build Coastguard Worker
2546*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
2547*c0909341SAndroid Build Coastguard Worker    DEFINE_ARGS dst, src, scaling, offxy, h, picptr, grain_lut
2548*c0909341SAndroid Build Coastguard Worker%endif
2549*c0909341SAndroid Build Coastguard Worker
2550*c0909341SAndroid Build Coastguard Worker    ; dst = clip_pixel(src, noise)
2551*c0909341SAndroid Build Coastguard Worker    paddw            m0, m2
2552*c0909341SAndroid Build Coastguard Worker    paddw            m1, m3
2553*c0909341SAndroid Build Coastguard Worker    pmaxsw           m0, m13                ; lower bound from m13
2554*c0909341SAndroid Build Coastguard Worker    pmaxsw           m1, m13
2555*c0909341SAndroid Build Coastguard Worker    pminsw           m0, m12                ; upper bound from m12
2556*c0909341SAndroid Build Coastguard Worker    pminsw           m1, m12
2557*c0909341SAndroid Build Coastguard Worker    packuswb         m0, m1
2558*c0909341SAndroid Build Coastguard Worker    movifnidn      dstq, dstmp
2559*c0909341SAndroid Build Coastguard Worker    mova    [dstq+srcq], m0                 ; the dst slot holds dst - src (see `sub r0mp, srcq` in the setup), so this addresses the dst row
2560*c0909341SAndroid Build Coastguard Worker
2561*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
2562*c0909341SAndroid Build Coastguard Worker    add            srcq, r2mp               ; next source row
2563*c0909341SAndroid Build Coastguard Worker    ; lumaq has already been incremented above
2564*c0909341SAndroid Build Coastguard Worker%else
2565*c0909341SAndroid Build Coastguard Worker    add            srcq, r12mp              ; next source row; the stride was saved to r12m by the setup code
2566*c0909341SAndroid Build Coastguard Worker%if %3
2567*c0909341SAndroid Build Coastguard Worker    lea           lumaq, [lumaq+lstrideq*2] ; advance luma by two lstride steps per output row when %3 is set
2568*c0909341SAndroid Build Coastguard Worker%else
2569*c0909341SAndroid Build Coastguard Worker    add           lumaq, lstrideq
2570*c0909341SAndroid Build Coastguard Worker%endif
2571*c0909341SAndroid Build Coastguard Worker%endif
2572*c0909341SAndroid Build Coastguard Worker    add      grain_lutq, 82                 ; next grain LUT row
2573*c0909341SAndroid Build Coastguard Worker    dec              hw                     ; only the low 16 bits of h are decremented and tested
2574*c0909341SAndroid Build Coastguard Worker    jg %%loop_y_h_overlap
2575*c0909341SAndroid Build Coastguard Worker    ; column finished: advance to the next 16-pixel column
2576*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
2577*c0909341SAndroid Build Coastguard Worker    DEFINE_ARGS luma, src, scaling, offxy, w, picptr, grain_lut
2578*c0909341SAndroid Build Coastguard Worker
2579*c0909341SAndroid Build Coastguard Worker    mov              wq, r4m                ; x86-32: the (negative) column counter lives in the r4m slot
2580*c0909341SAndroid Build Coastguard Worker%endif
2581*c0909341SAndroid Build Coastguard Worker    add              wq, 16
2582*c0909341SAndroid Build Coastguard Worker    jge %%end                               ; counter reached >= 0: nothing left to process
2583*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
2584*c0909341SAndroid Build Coastguard Worker    mov            srcq, r1mp               ; src_bak (saved by the setup code)
2585*c0909341SAndroid Build Coastguard Worker    mov           lumaq, r11mp              ; luma_bak; luma_bakq is an alias of lumaq on x86-32
2586*c0909341SAndroid Build Coastguard Worker%else
2587*c0909341SAndroid Build Coastguard Worker    mov            srcq, r11mp              ; src_bak (saved by the setup code)
2588*c0909341SAndroid Build Coastguard Worker%endif
2589*c0909341SAndroid Build Coastguard Worker    lea           lumaq, [luma_bakq+wq*(1+%2)] ; top of the next luma column
2590*c0909341SAndroid Build Coastguard Worker    add            srcq, wq                 ; top of the next source column
2591*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
2592*c0909341SAndroid Build Coastguard Worker    mov             r4m, wq
2593*c0909341SAndroid Build Coastguard Worker    mov             r9m, lumaq
2594*c0909341SAndroid Build Coastguard Worker%endif
2595*c0909341SAndroid Build Coastguard Worker%if %2 == 0
2596*c0909341SAndroid Build Coastguard Worker    xor       dword r8m, 4                  ; toggle bit 2 of the flags: the next column reuses this offset
2597*c0909341SAndroid Build Coastguard Worker    ; adjust top_offxyd
2598*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
2599*c0909341SAndroid Build Coastguard Worker    add dword [rsp+7*mmsize+1*gprsize], 16
2600*c0909341SAndroid Build Coastguard Worker%else
2601*c0909341SAndroid Build Coastguard Worker    add            r11d, 16
2602*c0909341SAndroid Build Coastguard Worker%endif
2603*c0909341SAndroid Build Coastguard Worker    add          offxyd, 16                 ; move 16 grain columns to the right
2604*c0909341SAndroid Build Coastguard Worker%endif
2605*c0909341SAndroid Build Coastguard Worker
2606*c0909341SAndroid Build Coastguard Worker    ; r8m = sbym slot, which holds the overlap flags here (bit 1 is set by %%vertical_overlap)
2607*c0909341SAndroid Build Coastguard Worker    test      dword r8m, 2
2608*c0909341SAndroid Build Coastguard Worker%if %2
2609*c0909341SAndroid Build Coastguard Worker    jne %%loop_x_hv_overlap
2610*c0909341SAndroid Build Coastguard Worker    jmp %%loop_x_h_overlap                  ; with %2 set every following column is blended with its left neighbour
2611*c0909341SAndroid Build Coastguard Worker%else
2612*c0909341SAndroid Build Coastguard Worker    jne %%loop_x_odd_v_overlap
2613*c0909341SAndroid Build Coastguard Worker    jmp %%loop_x_odd                        ; the following column uses the same offset and runs without left blending
2614*c0909341SAndroid Build Coastguard Worker%endif
2615*c0909341SAndroid Build Coastguard Worker
2616*c0909341SAndroid Build Coastguard Worker%%end:                                     ; common exit, reached from the column loops once the width counter is >= 0
2617*c0909341SAndroid Build Coastguard Worker    RET
2618*c0909341SAndroid Build Coastguard Worker
2619*c0909341SAndroid Build Coastguard Worker%%vertical_overlap:
2620*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
2621*c0909341SAndroid Build Coastguard Worker    DEFINE_ARGS dst, src, scaling, sby, fg_data, picptr, overlap
2622*c0909341SAndroid Build Coastguard Worker%else
2623*c0909341SAndroid Build Coastguard Worker    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, tmp, sby, see, overlap
2624*c0909341SAndroid Build Coastguard Worker%endif
2625*c0909341SAndroid Build Coastguard Worker
2626*c0909341SAndroid Build Coastguard Worker    or         overlapd, 2                  ; top_overlap: overlap & 2
2627*c0909341SAndroid Build Coastguard Worker    mov             r8m, overlapd
2628*c0909341SAndroid Build Coastguard Worker    movzx          sbyd, sbyb
2629*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
2630*c0909341SAndroid Build Coastguard Worker    imul             r4, [fg_dataq+FGData.seed], 0x00010001
2631*c0909341SAndroid Build Coastguard Worker    DEFINE_ARGS tmp, src, scaling, sby, see, picptr, unused
2632*c0909341SAndroid Build Coastguard Worker%else
2633*c0909341SAndroid Build Coastguard Worker    imul           seed, [fg_dataq+FGData.seed], 0x00010001
2634*c0909341SAndroid Build Coastguard Worker%endif
2635*c0909341SAndroid Build Coastguard Worker    imul           tmpd, sbyd, 173 * 0x00010001
2636*c0909341SAndroid Build Coastguard Worker    imul           sbyd, 37 * 0x01000100
2637*c0909341SAndroid Build Coastguard Worker    add            tmpd, (105 << 16) | 188
2638*c0909341SAndroid Build Coastguard Worker    add            sbyd, (178 << 24) | (141 << 8)
2639*c0909341SAndroid Build Coastguard Worker    and            tmpd, 0x00ff00ff
2640*c0909341SAndroid Build Coastguard Worker    and            sbyd, 0xff00ff00
2641*c0909341SAndroid Build Coastguard Worker    xor            seed, tmpd
2642*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
2643*c0909341SAndroid Build Coastguard Worker    xor            sbyd, seed               ; (cur_seed << 16) | top_seed
2644*c0909341SAndroid Build Coastguard Worker
2645*c0909341SAndroid Build Coastguard Worker    DEFINE_ARGS luma, src, scaling, see, w, picptr, src_bak
2646*c0909341SAndroid Build Coastguard Worker
2647*c0909341SAndroid Build Coastguard Worker    mov             r3m, seed
2648*c0909341SAndroid Build Coastguard Worker    mov              wq, r4m
2649*c0909341SAndroid Build Coastguard Worker%if %3
2650*c0909341SAndroid Build Coastguard Worker    shl           r10mp, 1
2651*c0909341SAndroid Build Coastguard Worker%endif
2652*c0909341SAndroid Build Coastguard Worker%else
2653*c0909341SAndroid Build Coastguard Worker    xor            seed, sbyd               ; (cur_seed << 16) | top_seed
2654*c0909341SAndroid Build Coastguard Worker
2655*c0909341SAndroid Build Coastguard Worker    DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \
2656*c0909341SAndroid Build Coastguard Worker                tmp, unused2, see, overlap, unused3, src_bak, lstride, luma_bak
2657*c0909341SAndroid Build Coastguard Worker
2658*c0909341SAndroid Build Coastguard Worker    mov        lstrideq, r10mp
2659*c0909341SAndroid Build Coastguard Worker%endif
2660*c0909341SAndroid Build Coastguard Worker
2661*c0909341SAndroid Build Coastguard Worker    mov           lumaq, r9mp
2662*c0909341SAndroid Build Coastguard Worker    lea        src_bakq, [srcq+wq]
2663*c0909341SAndroid Build Coastguard Worker    lea       luma_bakq, [lumaq+wq*(1+%2)]
2664*c0909341SAndroid Build Coastguard Worker    neg              wq
2665*c0909341SAndroid Build Coastguard Worker    sub            r0mp, srcq
2666*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
2667*c0909341SAndroid Build Coastguard Worker    mov             r1m, src_bakq
2668*c0909341SAndroid Build Coastguard Worker    mov            r11m, luma_bakq
2669*c0909341SAndroid Build Coastguard Worker    mov             r4m, wq
2670*c0909341SAndroid Build Coastguard Worker
2671*c0909341SAndroid Build Coastguard Worker    DEFINE_ARGS tmp, src, scaling, see, unused1, picptr, unused2
2672*c0909341SAndroid Build Coastguard Worker%else
2673*c0909341SAndroid Build Coastguard Worker    mov           r11mp, src_bakq
2674*c0909341SAndroid Build Coastguard Worker    mov           r12mp, strideq
2675*c0909341SAndroid Build Coastguard Worker%endif
2676*c0909341SAndroid Build Coastguard Worker
2677*c0909341SAndroid Build Coastguard Worker%%loop_x_v_overlap:
2678*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
2679*c0909341SAndroid Build Coastguard Worker    mov            seed, r3m
2680*c0909341SAndroid Build Coastguard Worker    xor            tmpd, tmpd
2681*c0909341SAndroid Build Coastguard Worker%endif
2682*c0909341SAndroid Build Coastguard Worker    ; we assume from the block above that bits 8-15 of tmpd are zeroed
2683*c0909341SAndroid Build Coastguard Worker    mov             r6d, seed
2684*c0909341SAndroid Build Coastguard Worker    or             seed, 0xeff4eff4
2685*c0909341SAndroid Build Coastguard Worker    test           seeb, seeh
2686*c0909341SAndroid Build Coastguard Worker    setp           tmpb                     ; parity of top_seed
2687*c0909341SAndroid Build Coastguard Worker    shr            seed, 16
2688*c0909341SAndroid Build Coastguard Worker    shl            tmpd, 16
2689*c0909341SAndroid Build Coastguard Worker    test           seeb, seeh
2690*c0909341SAndroid Build Coastguard Worker    setp           tmpb                     ; parity of cur_seed
2691*c0909341SAndroid Build Coastguard Worker    or              r6d, 0x00010001
2692*c0909341SAndroid Build Coastguard Worker    xor            tmpd, r6d
2693*c0909341SAndroid Build Coastguard Worker    mov            seed, tmpd
2694*c0909341SAndroid Build Coastguard Worker    ror            seed, 1                  ; updated (cur_seed << 16) | top_seed
2695*c0909341SAndroid Build Coastguard Worker
2696*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
2697*c0909341SAndroid Build Coastguard Worker    mov             r3m, seed
2698*c0909341SAndroid Build Coastguard Worker
2699*c0909341SAndroid Build Coastguard Worker    DEFINE_ARGS dst, src, scaling, offy, h, picptr, offx
2700*c0909341SAndroid Build Coastguard Worker
2701*c0909341SAndroid Build Coastguard Worker    mov           offxd, offyd
2702*c0909341SAndroid Build Coastguard Worker%else
2703*c0909341SAndroid Build Coastguard Worker    DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \
2704*c0909341SAndroid Build Coastguard Worker                offx, offy, see, overlap, top_offxy, unused, lstride
2705*c0909341SAndroid Build Coastguard Worker
2706*c0909341SAndroid Build Coastguard Worker    mov           offxd, seed
2707*c0909341SAndroid Build Coastguard Worker    mov           offyd, seed
2708*c0909341SAndroid Build Coastguard Worker%endif
2709*c0909341SAndroid Build Coastguard Worker    ror           offyd, 8
2710*c0909341SAndroid Build Coastguard Worker    ror           offxd, 12
2711*c0909341SAndroid Build Coastguard Worker    and           offyd, 0xf000f
2712*c0909341SAndroid Build Coastguard Worker    and           offxd, 0xf000f
2713*c0909341SAndroid Build Coastguard Worker    imul          offyd, 164>>%3
2714*c0909341SAndroid Build Coastguard Worker    ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy
2715*c0909341SAndroid Build Coastguard Worker    lea           offyq, [offyq+offxq*(2-%2)+0x10001*((3+(6>>%3))*82+3+(6>>%2))+(32>>%3)*82]
2716*c0909341SAndroid Build Coastguard Worker
2717*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
2718*c0909341SAndroid Build Coastguard Worker    DEFINE_ARGS tmp, src, scaling, offxy, h, picptr, top_offxy
2719*c0909341SAndroid Build Coastguard Worker%else
2720*c0909341SAndroid Build Coastguard Worker    DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \
2721*c0909341SAndroid Build Coastguard Worker                h, offxy, see, overlap, top_offxy, unused, lstride, luma_bak
2722*c0909341SAndroid Build Coastguard Worker%endif
2723*c0909341SAndroid Build Coastguard Worker
2724*c0909341SAndroid Build Coastguard Worker    movzx    top_offxyd, offxyw
2725*c0909341SAndroid Build Coastguard Worker    shr          offxyd, 16
2726*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
2727*c0909341SAndroid Build Coastguard Worker    mov [rsp+7*mmsize+1*gprsize], top_offxyd
2728*c0909341SAndroid Build Coastguard Worker
2729*c0909341SAndroid Build Coastguard Worker    DEFINE_ARGS luma, src, scaling, offxy, h, picptr, grain_lut
2730*c0909341SAndroid Build Coastguard Worker%endif
2731*c0909341SAndroid Build Coastguard Worker
2732*c0909341SAndroid Build Coastguard Worker%%loop_x_odd_v_overlap:
2733*c0909341SAndroid Build Coastguard Worker    mov              hd, r7m
2734*c0909341SAndroid Build Coastguard Worker    mov      grain_lutq, grain_lutmp
2735*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
2736*c0909341SAndroid Build Coastguard Worker    mov              r5, r5m
2737*c0909341SAndroid Build Coastguard Worker%endif
2738*c0909341SAndroid Build Coastguard Worker%if %3
2739*c0909341SAndroid Build Coastguard Worker    mova             m1, [PIC_ptr(pb_23_22)]
2740*c0909341SAndroid Build Coastguard Worker%else
2741*c0909341SAndroid Build Coastguard Worker    mova             m1, [PIC_ptr(pb_27_17)]
2742*c0909341SAndroid Build Coastguard Worker%endif
2743*c0909341SAndroid Build Coastguard Worker%%loop_y_v_overlap:
2744*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
2745*c0909341SAndroid Build Coastguard Worker    mov           lumaq, r9mp
2746*c0909341SAndroid Build Coastguard Worker%endif
2747*c0909341SAndroid Build Coastguard Worker%if %2
2748*c0909341SAndroid Build Coastguard Worker    mova             m4, [lumaq+ 0]
2749*c0909341SAndroid Build Coastguard Worker    mova             m6, [lumaq+16]
2750*c0909341SAndroid Build Coastguard Worker    mova             m0, [srcq]
2751*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
2752*c0909341SAndroid Build Coastguard Worker    add           lumaq, r10mp
2753*c0909341SAndroid Build Coastguard Worker    mov            r9mp, lumaq
2754*c0909341SAndroid Build Coastguard Worker    mov              r5, r5m
2755*c0909341SAndroid Build Coastguard Worker    movd             m7, [base+pb_1]
2756*c0909341SAndroid Build Coastguard Worker%else
2757*c0909341SAndroid Build Coastguard Worker    movd             m7, [pb_1]
2758*c0909341SAndroid Build Coastguard Worker%endif
2759*c0909341SAndroid Build Coastguard Worker    pshufd           m7, m7, q0000
2760*c0909341SAndroid Build Coastguard Worker    pxor             m2, m2
2761*c0909341SAndroid Build Coastguard Worker    pmaddubsw        m4, m7
2762*c0909341SAndroid Build Coastguard Worker    pmaddubsw        m6, m7
2763*c0909341SAndroid Build Coastguard Worker    pavgw            m4, m2
2764*c0909341SAndroid Build Coastguard Worker    pavgw            m6, m2
2765*c0909341SAndroid Build Coastguard Worker%else
2766*c0909341SAndroid Build Coastguard Worker    mova             m4, [lumaq]
2767*c0909341SAndroid Build Coastguard Worker    mova             m0, [srcq]
2768*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
2769*c0909341SAndroid Build Coastguard Worker    add           lumaq, r10mp
2770*c0909341SAndroid Build Coastguard Worker    mov            r9mp, lumaq
2771*c0909341SAndroid Build Coastguard Worker%endif
2772*c0909341SAndroid Build Coastguard Worker    pxor             m2, m2
2773*c0909341SAndroid Build Coastguard Worker%endif
2774*c0909341SAndroid Build Coastguard Worker
2775*c0909341SAndroid Build Coastguard Worker%if %1
2776*c0909341SAndroid Build Coastguard Worker%if %2
2777*c0909341SAndroid Build Coastguard Worker    packuswb         m4, m6                 ; luma
2778*c0909341SAndroid Build Coastguard Worker%endif
2779*c0909341SAndroid Build Coastguard Worker    punpckhbw        m6, m4, m0
2780*c0909341SAndroid Build Coastguard Worker    punpcklbw        m4, m0                 ; { luma, chroma }
2781*c0909341SAndroid Build Coastguard Worker    pmaddubsw        m6, m14
2782*c0909341SAndroid Build Coastguard Worker    pmaddubsw        m4, m14
2783*c0909341SAndroid Build Coastguard Worker    psraw            m6, 6
2784*c0909341SAndroid Build Coastguard Worker    psraw            m4, 6
2785*c0909341SAndroid Build Coastguard Worker    paddw            m6, m15
2786*c0909341SAndroid Build Coastguard Worker    paddw            m4, m15
2787*c0909341SAndroid Build Coastguard Worker    packuswb         m4, m6                 ; pack+unpack = clip
2788*c0909341SAndroid Build Coastguard Worker    punpckhbw        m6, m4, m2
2789*c0909341SAndroid Build Coastguard Worker    punpcklbw        m4, m2
2790*c0909341SAndroid Build Coastguard Worker%elif %2 == 0
2791*c0909341SAndroid Build Coastguard Worker    punpckhbw        m6, m4, m2
2792*c0909341SAndroid Build Coastguard Worker    punpcklbw        m4, m2
2793*c0909341SAndroid Build Coastguard Worker%endif
2794*c0909341SAndroid Build Coastguard Worker
2795*c0909341SAndroid Build Coastguard Worker    ; scaling[luma_src]
2796*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
2797*c0909341SAndroid Build Coastguard Worker    vpgatherdw       m7, m4, scalingq-1, r0, r5
2798*c0909341SAndroid Build Coastguard Worker    vpgatherdw       m5, m6, scalingq-1, r0, r5
2799*c0909341SAndroid Build Coastguard Worker%else
2800*c0909341SAndroid Build Coastguard Worker    vpgatherdw       m7, m4, scalingq-1, r12, r2
2801*c0909341SAndroid Build Coastguard Worker    vpgatherdw       m5, m6, scalingq-1, r12, r2
2802*c0909341SAndroid Build Coastguard Worker%endif
2803*c0909341SAndroid Build Coastguard Worker    REPX {psrlw x, 8}, m7, m5
2804*c0909341SAndroid Build Coastguard Worker
2805*c0909341SAndroid Build Coastguard Worker    ; grain = grain_lut[offy+y][offx+x]
2806*c0909341SAndroid Build Coastguard Worker    movu             m3, [grain_lutq+offxyq]
2807*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
2808*c0909341SAndroid Build Coastguard Worker    mov              r0, [rsp+7*mmsize+1*gprsize]
2809*c0909341SAndroid Build Coastguard Worker    movu             m4, [grain_lutq+r0]
2810*c0909341SAndroid Build Coastguard Worker%else
2811*c0909341SAndroid Build Coastguard Worker    movu             m4, [grain_lutq+top_offxyq]
2812*c0909341SAndroid Build Coastguard Worker%endif
2813*c0909341SAndroid Build Coastguard Worker    punpckhbw        m6, m4, m3
2814*c0909341SAndroid Build Coastguard Worker    punpcklbw        m4, m3
2815*c0909341SAndroid Build Coastguard Worker    pmaddubsw        m2, m1, m6
2816*c0909341SAndroid Build Coastguard Worker    pmaddubsw        m3, m1, m4
2817*c0909341SAndroid Build Coastguard Worker    pmulhrsw         m2, m8
2818*c0909341SAndroid Build Coastguard Worker    pmulhrsw         m3, m8
2819*c0909341SAndroid Build Coastguard Worker    packsswb         m3, m2
2820*c0909341SAndroid Build Coastguard Worker    pxor             m6, m6
2821*c0909341SAndroid Build Coastguard Worker    pcmpgtb          m6, m3
2822*c0909341SAndroid Build Coastguard Worker    punpcklbw        m2, m3, m6
2823*c0909341SAndroid Build Coastguard Worker    punpckhbw        m3, m6
2824*c0909341SAndroid Build Coastguard Worker
2825*c0909341SAndroid Build Coastguard Worker    ; noise = round2(scaling[luma_src] * grain, scaling_shift)
2826*c0909341SAndroid Build Coastguard Worker    pmullw           m2, m7
2827*c0909341SAndroid Build Coastguard Worker    pmullw           m3, m5
2828*c0909341SAndroid Build Coastguard Worker    pmulhrsw         m2, m11
2829*c0909341SAndroid Build Coastguard Worker    pmulhrsw         m3, m11
2830*c0909341SAndroid Build Coastguard Worker
2831*c0909341SAndroid Build Coastguard Worker    ; unpack chroma_source
2832*c0909341SAndroid Build Coastguard Worker    pxor             m4, m4
2833*c0909341SAndroid Build Coastguard Worker    punpckhbw        m6, m0, m4
2834*c0909341SAndroid Build Coastguard Worker    punpcklbw        m0, m4                 ; m0-1: src as word
2835*c0909341SAndroid Build Coastguard Worker
2836*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
2837*c0909341SAndroid Build Coastguard Worker    DEFINE_ARGS dst, src, scaling, offxy, h, picptr, grain_lut
2838*c0909341SAndroid Build Coastguard Worker%endif
2839*c0909341SAndroid Build Coastguard Worker
2840*c0909341SAndroid Build Coastguard Worker    ; dst = clip_pixel(src, noise)
2841*c0909341SAndroid Build Coastguard Worker    paddw            m0, m2
2842*c0909341SAndroid Build Coastguard Worker    paddw            m6, m3
2843*c0909341SAndroid Build Coastguard Worker    pmaxsw           m0, m13
2844*c0909341SAndroid Build Coastguard Worker    pmaxsw           m6, m13
2845*c0909341SAndroid Build Coastguard Worker    pminsw           m0, m12
2846*c0909341SAndroid Build Coastguard Worker    pminsw           m6, m12
2847*c0909341SAndroid Build Coastguard Worker    packuswb         m0, m6
2848*c0909341SAndroid Build Coastguard Worker    movifnidn      dstq, dstmp
2849*c0909341SAndroid Build Coastguard Worker    mova    [dstq+srcq], m0
2850*c0909341SAndroid Build Coastguard Worker
2851*c0909341SAndroid Build Coastguard Worker    dec              hw
2852*c0909341SAndroid Build Coastguard Worker    je %%end_y_v_overlap
2853*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
2854*c0909341SAndroid Build Coastguard Worker    add            srcq, r2mp
2855*c0909341SAndroid Build Coastguard Worker    ; lumaq has already been incremented above
2856*c0909341SAndroid Build Coastguard Worker%else
2857*c0909341SAndroid Build Coastguard Worker    add            srcq, r12mp
2858*c0909341SAndroid Build Coastguard Worker%if %3
2859*c0909341SAndroid Build Coastguard Worker    lea           lumaq, [lumaq+lstrideq*2]
2860*c0909341SAndroid Build Coastguard Worker%else
2861*c0909341SAndroid Build Coastguard Worker    add           lumaq, lstrideq
2862*c0909341SAndroid Build Coastguard Worker%endif
2863*c0909341SAndroid Build Coastguard Worker%endif
2864*c0909341SAndroid Build Coastguard Worker    add      grain_lutq, 82
2865*c0909341SAndroid Build Coastguard Worker%if %3 == 0
2866*c0909341SAndroid Build Coastguard Worker    btc              hd, 16
2867*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
2868*c0909341SAndroid Build Coastguard Worker    mov              r5, r5m
2869*c0909341SAndroid Build Coastguard Worker%endif
2870*c0909341SAndroid Build Coastguard Worker    mova             m1, [PIC_ptr(pb_17_27)]
2871*c0909341SAndroid Build Coastguard Worker    jnc %%loop_y_v_overlap
2872*c0909341SAndroid Build Coastguard Worker%endif
2873*c0909341SAndroid Build Coastguard Worker    jmp %%loop_y
2874*c0909341SAndroid Build Coastguard Worker
2875*c0909341SAndroid Build Coastguard Worker%%end_y_v_overlap:
2876*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
2877*c0909341SAndroid Build Coastguard Worker    DEFINE_ARGS luma, src, scaling, offxy, w, picptr, grain_lut
2878*c0909341SAndroid Build Coastguard Worker
2879*c0909341SAndroid Build Coastguard Worker    mov              wq, r4m
2880*c0909341SAndroid Build Coastguard Worker%endif
2881*c0909341SAndroid Build Coastguard Worker    add              wq, 16
2882*c0909341SAndroid Build Coastguard Worker    jge %%end_hv
2883*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
2884*c0909341SAndroid Build Coastguard Worker    mov            srcq, r1mp
2885*c0909341SAndroid Build Coastguard Worker    mov           lumaq, r11mp
2886*c0909341SAndroid Build Coastguard Worker%else
2887*c0909341SAndroid Build Coastguard Worker    mov            srcq, r11mp
2888*c0909341SAndroid Build Coastguard Worker%endif
2889*c0909341SAndroid Build Coastguard Worker    lea           lumaq, [luma_bakq+wq*(1+%2)]
2890*c0909341SAndroid Build Coastguard Worker    add            srcq, wq
2891*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
2892*c0909341SAndroid Build Coastguard Worker    mov             r4m, wq
2893*c0909341SAndroid Build Coastguard Worker    mov             r9m, lumaq
2894*c0909341SAndroid Build Coastguard Worker%endif
2895*c0909341SAndroid Build Coastguard Worker
2896*c0909341SAndroid Build Coastguard Worker%if %2
2897*c0909341SAndroid Build Coastguard Worker    ; since fg_dataq.overlap is guaranteed to be set, we never jump
2898*c0909341SAndroid Build Coastguard Worker    ; back to .loop_x_v_overlap, and instead always fall-through to
2899*c0909341SAndroid Build Coastguard Worker    ; h+v overlap
2900*c0909341SAndroid Build Coastguard Worker%else
2901*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
2902*c0909341SAndroid Build Coastguard Worker    add dword [rsp+7*mmsize+1*gprsize], 16
2903*c0909341SAndroid Build Coastguard Worker%else
2904*c0909341SAndroid Build Coastguard Worker    add      top_offxyd, 16
2905*c0909341SAndroid Build Coastguard Worker%endif
2906*c0909341SAndroid Build Coastguard Worker    add          offxyd, 16
2907*c0909341SAndroid Build Coastguard Worker    btc       dword r8m, 2
2908*c0909341SAndroid Build Coastguard Worker    jnc %%loop_x_odd_v_overlap
2909*c0909341SAndroid Build Coastguard Worker%endif
2910*c0909341SAndroid Build Coastguard Worker
2911*c0909341SAndroid Build Coastguard Worker%%loop_x_hv_overlap:
2912*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
2913*c0909341SAndroid Build Coastguard Worker    DEFINE_ARGS tmp, src, scaling, offxy, w, picptr, unused
2914*c0909341SAndroid Build Coastguard Worker
2915*c0909341SAndroid Build Coastguard Worker    mov              r6, [rsp+7*mmsize+1*gprsize]
2916*c0909341SAndroid Build Coastguard Worker%if %2
2917*c0909341SAndroid Build Coastguard Worker    lea              r0, [r3d+16]
2918*c0909341SAndroid Build Coastguard Worker    add              r6, 16
2919*c0909341SAndroid Build Coastguard Worker    mov [rsp+7*mmsize+0*gprsize], r0        ; left_offxy
2920*c0909341SAndroid Build Coastguard Worker%else
2921*c0909341SAndroid Build Coastguard Worker    mov [rsp+7*mmsize+0*gprsize], r3        ; left_offxy
2922*c0909341SAndroid Build Coastguard Worker%endif
2923*c0909341SAndroid Build Coastguard Worker    mov [rsp+7*mmsize+2*gprsize], r6        ; topleft_offxy
2924*c0909341SAndroid Build Coastguard Worker
2925*c0909341SAndroid Build Coastguard Worker    DEFINE_ARGS tmp, src, scaling, see, w, picptr, unused
2926*c0909341SAndroid Build Coastguard Worker
2927*c0909341SAndroid Build Coastguard Worker    mov            seed, r3m
2928*c0909341SAndroid Build Coastguard Worker    xor            tmpd, tmpd
2929*c0909341SAndroid Build Coastguard Worker%else
2930*c0909341SAndroid Build Coastguard Worker    DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \
2931*c0909341SAndroid Build Coastguard Worker                tmp, offxy, see, left_offxy, top_offxy, topleft_offxy, lstride
2932*c0909341SAndroid Build Coastguard Worker
2933*c0909341SAndroid Build Coastguard Worker%if %2
2934*c0909341SAndroid Build Coastguard Worker    lea  topleft_offxyq, [top_offxyq+16]
2935*c0909341SAndroid Build Coastguard Worker    lea     left_offxyq, [offxyq+16]
2936*c0909341SAndroid Build Coastguard Worker%else
2937*c0909341SAndroid Build Coastguard Worker    mov  topleft_offxyq, top_offxyq
2938*c0909341SAndroid Build Coastguard Worker    mov     left_offxyq, offxyq
2939*c0909341SAndroid Build Coastguard Worker%endif
2940*c0909341SAndroid Build Coastguard Worker
2941*c0909341SAndroid Build Coastguard Worker    ; we assume from the block above that bits 8-15 of tmpd are zeroed
2942*c0909341SAndroid Build Coastguard Worker%endif
2943*c0909341SAndroid Build Coastguard Worker    mov             r6d, seed
2944*c0909341SAndroid Build Coastguard Worker    or             seed, 0xeff4eff4
2945*c0909341SAndroid Build Coastguard Worker    test           seeb, seeh
2946*c0909341SAndroid Build Coastguard Worker    setp           tmpb                     ; parity of top_seed
2947*c0909341SAndroid Build Coastguard Worker    shr            seed, 16
2948*c0909341SAndroid Build Coastguard Worker    shl            tmpd, 16
2949*c0909341SAndroid Build Coastguard Worker    test           seeb, seeh
2950*c0909341SAndroid Build Coastguard Worker    setp           tmpb                     ; parity of cur_seed
2951*c0909341SAndroid Build Coastguard Worker    or              r6d, 0x00010001
2952*c0909341SAndroid Build Coastguard Worker    xor            tmpd, r6d
2953*c0909341SAndroid Build Coastguard Worker    mov            seed, tmpd
2954*c0909341SAndroid Build Coastguard Worker    ror            seed, 1                  ; updated (cur_seed << 16) | top_seed
2955*c0909341SAndroid Build Coastguard Worker
2956*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
2957*c0909341SAndroid Build Coastguard Worker    mov             r3m, seed
2958*c0909341SAndroid Build Coastguard Worker
2959*c0909341SAndroid Build Coastguard Worker    DEFINE_ARGS tmp, src, scaling, offy, w, picptr, offx
2960*c0909341SAndroid Build Coastguard Worker
2961*c0909341SAndroid Build Coastguard Worker    mov           offxd, offyd
2962*c0909341SAndroid Build Coastguard Worker%else
2963*c0909341SAndroid Build Coastguard Worker    DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \
2964*c0909341SAndroid Build Coastguard Worker                offx, offy, see, left_offxy, top_offxy, topleft_offxy, lstride
2965*c0909341SAndroid Build Coastguard Worker
2966*c0909341SAndroid Build Coastguard Worker    mov           offxd, seed
2967*c0909341SAndroid Build Coastguard Worker    mov           offyd, seed
2968*c0909341SAndroid Build Coastguard Worker%endif
2969*c0909341SAndroid Build Coastguard Worker    ror           offyd, 8
2970*c0909341SAndroid Build Coastguard Worker    ror           offxd, 12
2971*c0909341SAndroid Build Coastguard Worker    and           offyd, 0xf000f
2972*c0909341SAndroid Build Coastguard Worker    and           offxd, 0xf000f
2973*c0909341SAndroid Build Coastguard Worker    imul          offyd, 164>>%3
2974*c0909341SAndroid Build Coastguard Worker    ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy
2975*c0909341SAndroid Build Coastguard Worker    lea           offyq, [offyq+offxq*(2-%2)+0x10001*((3+(6>>%3))*82+3+(6>>%2))+(32>>%3)*82]
2976*c0909341SAndroid Build Coastguard Worker
2977*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
2978*c0909341SAndroid Build Coastguard Worker    DEFINE_ARGS top_offxy, src, scaling, offxy, h, picptr, grain_lut
2979*c0909341SAndroid Build Coastguard Worker%else
2980*c0909341SAndroid Build Coastguard Worker    DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \
2981*c0909341SAndroid Build Coastguard Worker                h, offxy, see, left_offxy, top_offxy, topleft_offxy, lstride, luma_bak
2982*c0909341SAndroid Build Coastguard Worker%endif
2983*c0909341SAndroid Build Coastguard Worker
2984*c0909341SAndroid Build Coastguard Worker    movzx    top_offxyd, offxyw
2985*c0909341SAndroid Build Coastguard Worker    shr          offxyd, 16
2986*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
2987*c0909341SAndroid Build Coastguard Worker    mov [rsp+7*mmsize+1*gprsize], top_offxyd
2988*c0909341SAndroid Build Coastguard Worker%endif
2989*c0909341SAndroid Build Coastguard Worker
2990*c0909341SAndroid Build Coastguard Worker    mov              hd, r7m
2991*c0909341SAndroid Build Coastguard Worker    mov      grain_lutq, grain_lutmp
2992*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
2993*c0909341SAndroid Build Coastguard Worker    mov              r5, r5m
2994*c0909341SAndroid Build Coastguard Worker%endif
2995*c0909341SAndroid Build Coastguard Worker%if %3
2996*c0909341SAndroid Build Coastguard Worker    mova             m3, [PIC_ptr(pb_23_22)]
2997*c0909341SAndroid Build Coastguard Worker%else
2998*c0909341SAndroid Build Coastguard Worker    mova             m3, [PIC_ptr(pb_27_17)]
2999*c0909341SAndroid Build Coastguard Worker%endif
3000*c0909341SAndroid Build Coastguard Worker%%loop_y_hv_overlap:
3001*c0909341SAndroid Build Coastguard Worker    ; grain = grain_lut[offy+y][offx+x]
3002*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
3003*c0909341SAndroid Build Coastguard Worker    mov              r0, [rsp+7*mmsize+2*gprsize]       ; topleft_offxy
3004*c0909341SAndroid Build Coastguard Worker    mov              r5, [rsp+7*mmsize+1*gprsize]       ; top_offxy
3005*c0909341SAndroid Build Coastguard Worker    movd             m1, [grain_lutq+r0]
3006*c0909341SAndroid Build Coastguard Worker    mov              r0, [rsp+7*mmsize+0*gprsize]       ; left_offxy
3007*c0909341SAndroid Build Coastguard Worker%else
3008*c0909341SAndroid Build Coastguard Worker    movd             m1, [grain_lutq+topleft_offxyq]
3009*c0909341SAndroid Build Coastguard Worker%endif
3010*c0909341SAndroid Build Coastguard Worker    movu             m2, [grain_lutq+offxyq]
3011*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
3012*c0909341SAndroid Build Coastguard Worker    movu             m6, [grain_lutq+r5]
3013*c0909341SAndroid Build Coastguard Worker    movd             m4, [grain_lutq+r0]
3014*c0909341SAndroid Build Coastguard Worker%else
3015*c0909341SAndroid Build Coastguard Worker    movu             m6, [grain_lutq+top_offxyq]
3016*c0909341SAndroid Build Coastguard Worker    movd             m4, [grain_lutq+left_offxyq]
3017*c0909341SAndroid Build Coastguard Worker%endif
3018*c0909341SAndroid Build Coastguard Worker    ; do h interpolation first (so top | top/left -> top, left | cur -> cur)
3019*c0909341SAndroid Build Coastguard Worker    punpcklbw        m1, m6
3020*c0909341SAndroid Build Coastguard Worker    punpcklbw        m4, m2
3021*c0909341SAndroid Build Coastguard Worker    pmaddubsw        m0, m9, m1
3022*c0909341SAndroid Build Coastguard Worker    pmaddubsw        m1, m9, m4
3023*c0909341SAndroid Build Coastguard Worker    REPX {pmulhrsw x, m8}, m0, m1
3024*c0909341SAndroid Build Coastguard Worker    packsswb         m0, m1
3025*c0909341SAndroid Build Coastguard Worker    shufps           m4, m0, m2, q3232
3026*c0909341SAndroid Build Coastguard Worker    shufps           m0, m6, q3210
3027*c0909341SAndroid Build Coastguard Worker    ; followed by v interpolation (top | cur -> cur)
3028*c0909341SAndroid Build Coastguard Worker    punpcklbw        m2, m0, m4
3029*c0909341SAndroid Build Coastguard Worker    punpckhbw        m0, m4
3030*c0909341SAndroid Build Coastguard Worker    pmaddubsw        m4, m3, m0
3031*c0909341SAndroid Build Coastguard Worker    pmaddubsw        m1, m3, m2
3032*c0909341SAndroid Build Coastguard Worker    pmulhrsw         m4, m8
3033*c0909341SAndroid Build Coastguard Worker    pmulhrsw         m1, m8
3034*c0909341SAndroid Build Coastguard Worker    packsswb         m1, m4
3035*c0909341SAndroid Build Coastguard Worker
3036*c0909341SAndroid Build Coastguard Worker    ; src
3037*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
3038*c0909341SAndroid Build Coastguard Worker    DEFINE_ARGS luma, src, scaling, offxy, w, picptr, grain_lut
3039*c0909341SAndroid Build Coastguard Worker
3040*c0909341SAndroid Build Coastguard Worker    mov           lumaq, r9mp
3041*c0909341SAndroid Build Coastguard Worker%endif
3042*c0909341SAndroid Build Coastguard Worker%if %2
3043*c0909341SAndroid Build Coastguard Worker    mova             m4, [lumaq+ 0]
3044*c0909341SAndroid Build Coastguard Worker    mova             m6, [lumaq+16]
3045*c0909341SAndroid Build Coastguard Worker    mova             m0, [srcq]
3046*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
3047*c0909341SAndroid Build Coastguard Worker    add           lumaq, r10mp
3048*c0909341SAndroid Build Coastguard Worker    mov            r9mp, lumaq
3049*c0909341SAndroid Build Coastguard Worker    mov              r5, r5m
3050*c0909341SAndroid Build Coastguard Worker    movd             m7, [base+pb_1]
3051*c0909341SAndroid Build Coastguard Worker%else
3052*c0909341SAndroid Build Coastguard Worker    movd             m7, [pb_1]
3053*c0909341SAndroid Build Coastguard Worker%endif
3054*c0909341SAndroid Build Coastguard Worker    pshufd           m7, m7, q0000
3055*c0909341SAndroid Build Coastguard Worker    pxor             m2, m2
3056*c0909341SAndroid Build Coastguard Worker    pmaddubsw        m4, m7
3057*c0909341SAndroid Build Coastguard Worker    pmaddubsw        m6, m7
3058*c0909341SAndroid Build Coastguard Worker    pavgw            m4, m2
3059*c0909341SAndroid Build Coastguard Worker    pavgw            m6, m2
3060*c0909341SAndroid Build Coastguard Worker%else
3061*c0909341SAndroid Build Coastguard Worker    mova             m4, [lumaq]
3062*c0909341SAndroid Build Coastguard Worker    mova             m0, [srcq]
3063*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
3064*c0909341SAndroid Build Coastguard Worker    add           lumaq, r10mp
3065*c0909341SAndroid Build Coastguard Worker    mov            r9mp, lumaq
3066*c0909341SAndroid Build Coastguard Worker%endif
3067*c0909341SAndroid Build Coastguard Worker    pxor             m2, m2
3068*c0909341SAndroid Build Coastguard Worker%endif
3069*c0909341SAndroid Build Coastguard Worker
3070*c0909341SAndroid Build Coastguard Worker%if %1
3071*c0909341SAndroid Build Coastguard Worker%if %2
3072*c0909341SAndroid Build Coastguard Worker    packuswb         m4, m6                 ; luma
3073*c0909341SAndroid Build Coastguard Worker%endif
3074*c0909341SAndroid Build Coastguard Worker    punpckhbw        m6, m4, m0
3075*c0909341SAndroid Build Coastguard Worker    punpcklbw        m4, m0                 ; { luma, chroma }
3076*c0909341SAndroid Build Coastguard Worker    pmaddubsw        m6, m14
3077*c0909341SAndroid Build Coastguard Worker    pmaddubsw        m4, m14
3078*c0909341SAndroid Build Coastguard Worker    psraw            m6, 6
3079*c0909341SAndroid Build Coastguard Worker    psraw            m4, 6
3080*c0909341SAndroid Build Coastguard Worker    paddw            m6, m15
3081*c0909341SAndroid Build Coastguard Worker    paddw            m4, m15
3082*c0909341SAndroid Build Coastguard Worker    packuswb         m4, m6                 ; pack+unpack = clip
3083*c0909341SAndroid Build Coastguard Worker    punpckhbw        m6, m4, m2
3084*c0909341SAndroid Build Coastguard Worker    punpcklbw        m4, m2
3085*c0909341SAndroid Build Coastguard Worker%elif %2 == 0
3086*c0909341SAndroid Build Coastguard Worker    punpckhbw        m6, m4, m2
3087*c0909341SAndroid Build Coastguard Worker    punpcklbw        m4, m2
3088*c0909341SAndroid Build Coastguard Worker%endif
3089*c0909341SAndroid Build Coastguard Worker
3090*c0909341SAndroid Build Coastguard Worker    ; scaling[src]
3091*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
3092*c0909341SAndroid Build Coastguard Worker    vpgatherdw       m7, m4, scalingq-1, r0, r5
3093*c0909341SAndroid Build Coastguard Worker    vpgatherdw       m5, m6, scalingq-1, r0, r5
3094*c0909341SAndroid Build Coastguard Worker%else
3095*c0909341SAndroid Build Coastguard Worker%if %3
3096*c0909341SAndroid Build Coastguard Worker    vpgatherdw       m7, m4, scalingq-1, r2, r12
3097*c0909341SAndroid Build Coastguard Worker    vpgatherdw       m5, m6, scalingq-1, r2, r12
3098*c0909341SAndroid Build Coastguard Worker%else
3099*c0909341SAndroid Build Coastguard Worker    vpgatherdw       m7, m4, scalingq-1, r2, r13
3100*c0909341SAndroid Build Coastguard Worker    vpgatherdw       m5, m6, scalingq-1, r2, r13
3101*c0909341SAndroid Build Coastguard Worker%endif
3102*c0909341SAndroid Build Coastguard Worker%endif
3103*c0909341SAndroid Build Coastguard Worker    REPX {psrlw x, 8}, m7, m5
3104*c0909341SAndroid Build Coastguard Worker
3105*c0909341SAndroid Build Coastguard Worker    ; unpack grain
3106*c0909341SAndroid Build Coastguard Worker    pxor             m4, m4
3107*c0909341SAndroid Build Coastguard Worker    pcmpgtb          m4, m1
3108*c0909341SAndroid Build Coastguard Worker    punpcklbw        m2, m1, m4
3109*c0909341SAndroid Build Coastguard Worker    punpckhbw        m1, m4
3110*c0909341SAndroid Build Coastguard Worker
3111*c0909341SAndroid Build Coastguard Worker    ; noise = round2(scaling[src] * grain, scaling_shift)
3112*c0909341SAndroid Build Coastguard Worker    pmullw           m2, m7
3113*c0909341SAndroid Build Coastguard Worker    pmullw           m1, m5
3114*c0909341SAndroid Build Coastguard Worker    pmulhrsw         m2, m11
3115*c0909341SAndroid Build Coastguard Worker    pmulhrsw         m1, m11
3116*c0909341SAndroid Build Coastguard Worker
3117*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
3118*c0909341SAndroid Build Coastguard Worker    DEFINE_ARGS dst, src, scaling, offxy, h, picptr, grain_lut
3119*c0909341SAndroid Build Coastguard Worker%endif
3120*c0909341SAndroid Build Coastguard Worker
3121*c0909341SAndroid Build Coastguard Worker    ; unpack chroma source
3122*c0909341SAndroid Build Coastguard Worker    pxor             m4, m4
3123*c0909341SAndroid Build Coastguard Worker    punpckhbw        m5, m0, m4
3124*c0909341SAndroid Build Coastguard Worker    punpcklbw        m0, m4                 ; m0/m5: src as word
3125*c0909341SAndroid Build Coastguard Worker
3126*c0909341SAndroid Build Coastguard Worker    ; dst = clip_pixel(src, noise)
3127*c0909341SAndroid Build Coastguard Worker    paddw            m0, m2
3128*c0909341SAndroid Build Coastguard Worker    paddw            m5, m1
3129*c0909341SAndroid Build Coastguard Worker    pmaxsw           m0, m13
3130*c0909341SAndroid Build Coastguard Worker    pmaxsw           m5, m13
3131*c0909341SAndroid Build Coastguard Worker    pminsw           m0, m12
3132*c0909341SAndroid Build Coastguard Worker    pminsw           m5, m12
3133*c0909341SAndroid Build Coastguard Worker    packuswb         m0, m5
3134*c0909341SAndroid Build Coastguard Worker    movifnidn      dstq, dstmp
3135*c0909341SAndroid Build Coastguard Worker    mova    [dstq+srcq], m0
3136*c0909341SAndroid Build Coastguard Worker
3137*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
3138*c0909341SAndroid Build Coastguard Worker    add            srcq, r2mp
3139*c0909341SAndroid Build Coastguard Worker    ; lumaq has been adjusted above already
3140*c0909341SAndroid Build Coastguard Worker%else
3141*c0909341SAndroid Build Coastguard Worker    add            srcq, r12mp
3142*c0909341SAndroid Build Coastguard Worker%if %3
3143*c0909341SAndroid Build Coastguard Worker    lea           lumaq, [lumaq+lstrideq*(1+%2)]
3144*c0909341SAndroid Build Coastguard Worker%else
3145*c0909341SAndroid Build Coastguard Worker    add           lumaq, r10mp
3146*c0909341SAndroid Build Coastguard Worker%endif
3147*c0909341SAndroid Build Coastguard Worker%endif
3148*c0909341SAndroid Build Coastguard Worker    add      grain_lutq, 82
3149*c0909341SAndroid Build Coastguard Worker    dec              hw
3150*c0909341SAndroid Build Coastguard Worker%if %3
3151*c0909341SAndroid Build Coastguard Worker    jg %%loop_y_h_overlap
3152*c0909341SAndroid Build Coastguard Worker%else
3153*c0909341SAndroid Build Coastguard Worker    jle %%end_y_hv_overlap
3154*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
3155*c0909341SAndroid Build Coastguard Worker    mov              r5, r5m
3156*c0909341SAndroid Build Coastguard Worker%endif
3157*c0909341SAndroid Build Coastguard Worker    mova             m3, [PIC_ptr(pb_17_27)]
3158*c0909341SAndroid Build Coastguard Worker    btc              hd, 16
3159*c0909341SAndroid Build Coastguard Worker    jnc %%loop_y_hv_overlap
3160*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
3161*c0909341SAndroid Build Coastguard Worker    mov        lstrideq, r10mp
3162*c0909341SAndroid Build Coastguard Worker%endif
3163*c0909341SAndroid Build Coastguard Worker    jmp %%loop_y_h_overlap
3164*c0909341SAndroid Build Coastguard Worker%%end_y_hv_overlap:
3165*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
3166*c0909341SAndroid Build Coastguard Worker    mov        lstrideq, r10mp
3167*c0909341SAndroid Build Coastguard Worker%endif
3168*c0909341SAndroid Build Coastguard Worker%endif
3169*c0909341SAndroid Build Coastguard Worker
3170*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
3171*c0909341SAndroid Build Coastguard Worker    DEFINE_ARGS luma, src, scaling, offxy, w, picptr, grain_lut
3172*c0909341SAndroid Build Coastguard Worker
3173*c0909341SAndroid Build Coastguard Worker    mov              wq, r4m
3174*c0909341SAndroid Build Coastguard Worker%endif
3175*c0909341SAndroid Build Coastguard Worker    add              wq, 16
3176*c0909341SAndroid Build Coastguard Worker    jge %%end_hv
3177*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
3178*c0909341SAndroid Build Coastguard Worker    mov            srcq, r1mp
3179*c0909341SAndroid Build Coastguard Worker    mov           lumaq, r11mp
3180*c0909341SAndroid Build Coastguard Worker%else
3181*c0909341SAndroid Build Coastguard Worker    mov            srcq, r11mp
3182*c0909341SAndroid Build Coastguard Worker%endif
3183*c0909341SAndroid Build Coastguard Worker    lea           lumaq, [luma_bakq+wq*(1+%2)]
3184*c0909341SAndroid Build Coastguard Worker    add            srcq, wq
3185*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
3186*c0909341SAndroid Build Coastguard Worker    mov             r4m, wq
3187*c0909341SAndroid Build Coastguard Worker    mov             r9m, lumaq
3188*c0909341SAndroid Build Coastguard Worker%endif
3189*c0909341SAndroid Build Coastguard Worker%if %2
3190*c0909341SAndroid Build Coastguard Worker    jmp %%loop_x_hv_overlap
3191*c0909341SAndroid Build Coastguard Worker%else
3192*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
3193*c0909341SAndroid Build Coastguard Worker    add dword [rsp+7*mmsize+1*gprsize], 16
3194*c0909341SAndroid Build Coastguard Worker%else
3195*c0909341SAndroid Build Coastguard Worker    add      top_offxyd, 16
3196*c0909341SAndroid Build Coastguard Worker%endif
3197*c0909341SAndroid Build Coastguard Worker    add          offxyd, 16
3198*c0909341SAndroid Build Coastguard Worker    xor       dword r8m, 4
3199*c0909341SAndroid Build Coastguard Worker    jmp %%loop_x_odd_v_overlap
3200*c0909341SAndroid Build Coastguard Worker%endif
3201*c0909341SAndroid Build Coastguard Worker
3202*c0909341SAndroid Build Coastguard Worker%%end_hv:
3203*c0909341SAndroid Build Coastguard Worker    RET
3204*c0909341SAndroid Build Coastguard Worker%endmacro
3205*c0909341SAndroid Build Coastguard Worker
3206*c0909341SAndroid Build Coastguard Worker    %%FGUV_32x32xN_LOOP 1, %2, %3
3207*c0909341SAndroid Build Coastguard Worker.csfl:
3208*c0909341SAndroid Build Coastguard Worker    %%FGUV_32x32xN_LOOP 0, %2, %3
3209*c0909341SAndroid Build Coastguard Worker%endmacro
3210*c0909341SAndroid Build Coastguard Worker
; Instantiate FGUV_FN once per argument triple. The second and third arguments
; are forwarded unchanged (as %2/%3) to both %%FGUV_32x32xN_LOOP expansions in
; the macro tail directly above (first with a leading 1, then, after the .csfl
; label, with a leading 0). Inside that loop macro, %2 scales the horizontal
; luma step (luma_bakq+wq*(1+%2)) and %3 selects how lumaq advances per row.
; NOTE(review): the first argument (420/422/444) is presumably the chroma
; layout used to form the generated symbol name -- the macro header is not
; visible from here, confirm there.
; NOTE(review): in the hv-overlap loop the `%if %3` row advance multiplies
; lstrideq by (1+%2) rather than (1+%3). That is equivalent for the three
; triples below (%3 is 1 only together with %2 == 1), but would give a
; single-row step for a new instantiation with %2 == 0 and %3 == 1 --
; TODO confirm the intent before adding such a variant.
3211*c0909341SAndroid Build Coastguard WorkerFGUV_FN 420, 1, 1
3212*c0909341SAndroid Build Coastguard Worker
; When the stack alignment is smaller than the vector size, re-issue
; DECLARE_ARG for arguments 0-12 before the next instantiation.
; NOTE(review): presumably this resets the argument-access macros (rNm/rNmp)
; that the preceding function body relied on or redefined -- confirm against
; DECLARE_ARG in x86inc.asm.
3213*c0909341SAndroid Build Coastguard Worker%if STACK_ALIGNMENT < mmsize
3214*c0909341SAndroid Build Coastguard WorkerDECLARE_ARG 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12
3215*c0909341SAndroid Build Coastguard Worker%endif
3216*c0909341SAndroid Build Coastguard Worker
3217*c0909341SAndroid Build Coastguard WorkerFGUV_FN 422, 1, 0
3218*c0909341SAndroid Build Coastguard Worker
; Same conditional DECLARE_ARG reset as above, ahead of the last instantiation.
3219*c0909341SAndroid Build Coastguard Worker%if STACK_ALIGNMENT < mmsize
3220*c0909341SAndroid Build Coastguard WorkerDECLARE_ARG 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12
3221*c0909341SAndroid Build Coastguard Worker%endif
3222*c0909341SAndroid Build Coastguard Worker
3223*c0909341SAndroid Build Coastguard WorkerFGUV_FN 444, 0, 0
3224