1; Copyright © 2019-2021, VideoLAN and dav1d authors
2; Copyright © 2019, Two Orioles, LLC
3; All rights reserved.
4;
5; Redistribution and use in source and binary forms, with or without
6; modification, are permitted provided that the following conditions are met:
7;
8; 1. Redistributions of source code must retain the above copyright notice, this
9;    list of conditions and the following disclaimer.
10;
11; 2. Redistributions in binary form must reproduce the above copyright notice,
12;    this list of conditions and the following disclaimer in the documentation
13;    and/or other materials provided with the distribution.
14;
15; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
16; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
17; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
18; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
19; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
20; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
21; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
22; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
23; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
24; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
25
26%include "config.asm"
27%include "ext/x86/x86inc.asm"
28%include "x86/filmgrain_common.asm"
29
30SECTION_RODATA
31
32pw_1024: times 8 dw 1024
33pb_27_17_17_27: db 27, 17, 17, 27
34                times 6 db 0, 32
35pb_23_22_h: db 23, 22
36            times 7 db 0, 32
37pb_27_17: times 8 db 27, 17
38pb_17_27: times 8 db 17, 27
39pb_23_22: times 8 db 23, 22
40pb_mask: db 0, 0x80, 0x80, 0, 0x80, 0, 0, 0x80, 0x80, 0, 0, 0x80, 0, 0x80, 0x80, 0
41rnd_next_upperbit_mask: dw 0x100B, 0x2016, 0x402C, 0x8058
42byte_blend: db 0, 0, 0, 0xff, 0, 0, 0, 0
43pw_seed_xor: times 2 dw 0xb524
44             times 2 dw 0x49d8
45pb_1: times 4 db 1
46hmul_bits: dw 32768, 16384, 8192, 4096
47round: dw 2048, 1024, 512
48mul_bits: dw 256, 128, 64, 32, 16
49round_vals: dw 32, 64, 128, 256, 512
50max: dw 255, 240, 235
51min: dw 0, 16
52pw_1: dw 1
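; Note (informative): hmul_bits, round and mul_bits sit back to back and
; together form one descending power-of-two run (32768 .. 16), so some
; shift-dependent lookups can index a little past a table's nominal end and
; still read the intended 1<<n constant; round/round_vals supply rounding
; terms, and max/min are the clip bounds for full-range vs. restricted-range
; output.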
53
54%macro JMP_TABLE 2-*
55    %xdefine %1_8bpc_%2_table %%table
56    %xdefine %%base %1_8bpc_%2_table
57    %xdefine %%prefix mangle(private_prefix %+ _%1_8bpc_%2)
58    %%table:
59    %rep %0 - 2
60        dd %%prefix %+ .ar%3 - %%base
61        %rotate 1
62    %endrep
63%endmacro
64
65JMP_TABLE generate_grain_y, ssse3, 0, 1, 2, 3
66JMP_TABLE generate_grain_uv_420, ssse3, 0, 1, 2, 3
67JMP_TABLE generate_grain_uv_422, ssse3, 0, 1, 2, 3
68JMP_TABLE generate_grain_uv_444, ssse3, 0, 1, 2, 3
69
70SECTION .text
71
72%if ARCH_X86_32
73%define PIC_ptr(a) base+a
74%else
75%define PIC_ptr(a) a
76%endif
77
78%macro SCRATCH 3
79%if ARCH_X86_32
80    mova [rsp+%3*mmsize], m%1
81%define m%2 [rsp+%3*mmsize]
82%else
83    SWAP             %1, %2
84%endif
85%endmacro
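; SCRATCH: on x86-32 only xmm0-7 exist, so "registers" m8 and up are kept as
; stack spills and the m%2 name is redefined to a memory operand; on x86-64
; it is just a register rename via SWAP.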
86
87INIT_XMM ssse3
88cglobal generate_grain_y_8bpc, 2, 7 + 2 * ARCH_X86_64, 16, buf, fg_data
89    LEA              r4, $$
90%define base r4-$$
91    movq             m1, [base+rnd_next_upperbit_mask]
92    movq             m4, [base+mul_bits]
93    movq             m7, [base+hmul_bits]
94    mov             r2d, [fg_dataq+FGData.grain_scale_shift]
95    movd             m2, [base+round+r2*2]
96    movd             m0, [fg_dataq+FGData.seed]
97    mova             m5, [base+pb_mask]
98    pshuflw          m2, m2, q0000
99    pshuflw          m0, m0, q0000
100    mov              r2, -73*82
101    sub            bufq, r2
102    lea              r3, [base+gaussian_sequence]
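    ; Informative sketch (not from the original source) of what one .loop
    ; iteration computes: four steps of the AV1 grain PRNG are evaluated in
    ; parallel, and the top 11 bits of each intermediate seed index
    ; gaussian_sequence. Scalar equivalent, roughly:
    ;   bit    = ((s >> 0) ^ (s >> 1) ^ (s >> 3) ^ (s >> 12)) & 1;
    ;   s      = (s >> 1) | (bit << 15);
    ;   buf[i] = round2(gaussian_sequence[s >> 5], 4 + grain_scale_shift);
    ; rnd_next_upperbit_mask holds the tap masks for the four steps and
    ; pb_mask maps the resulting parities to the new seeds' high bits.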
103.loop:
104    pand             m6, m0, m1
105    psrlw            m3, m6, 10
106    por              m6, m3            ; bits 0xf, 0x1e, 0x3c and 0x78 are set
107    pmullw           m6, m4            ; bits 0x0f00 are set
108    pshufb           m3, m5, m6        ; set 15th bit for next 4 seeds
109    psllq            m6, m3, 30
110    por              m3, m6
111    psllq            m6, m3, 15
112    por              m3, m6            ; aggregate each bit into next seed's high bit
113    pmulhuw          m6, m0, m7
114    por              m3, m6            ; 4 next output seeds
115    pshuflw          m0, m3, q3333
116    psrlw            m3, 5
117%if ARCH_X86_64
118    movq             r6, m3
119    mov              r8, r6
120    movzx           r5d, r6w
121    shr             r6d, 16
122    shr              r8, 32
123    movzx            r7, r8w
124    shr              r8, 16
125
126    movd             m6, [r3+r5*2]
127    pinsrw           m6, [r3+r6*2], 1
128    pinsrw           m6, [r3+r7*2], 2
129    pinsrw           m6, [r3+r8*2], 3
130%else
131    movd             r6, m3
132    pshuflw          m3, m3, q3232
133    movzx            r5, r6w
134    shr              r6, 16
135
136    movd             m6, [r3+r5*2]
137    pinsrw           m6, [r3+r6*2], 1
138
139    movd             r6, m3
140    movzx            r5, r6w
141    shr              r6, 16
142
143    pinsrw           m6, [r3+r5*2], 2
144    pinsrw           m6, [r3+r6*2], 3
145%endif
146    pmulhrsw         m6, m2
147    packsswb         m6, m6
148    movd      [bufq+r2], m6
149    add              r2, 4
150    jl .loop
151
152    ; auto-regression code
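    ; (ar_coeff_lag is 0-3; the JMP_TABLE above stores the .ar0-.ar3 label
    ; offsets, so this is a position-independent dispatch on the lag.)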
153    movsxd           r2, [fg_dataq+FGData.ar_coeff_lag]
154    movsxd           r2, [base+generate_grain_y_8bpc_ssse3_table+r2*4]
155    lea              r2, [r2+base+generate_grain_y_8bpc_ssse3_table]
156    jmp              r2
157
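; Informative sketch (not from the original source) of the lag-1 recurrence
; applied below; names are illustrative, cf[0..3] = FGData.ar_coeffs_y[0..3],
; shift = FGData.ar_coeff_shift:
;   for (y = 3; y < 73; y++)
;       for (x = 3; x < 79; x++) {
;           int s = cf[0]*buf[y-1][x-1] + cf[1]*buf[y-1][x]
;                 + cf[2]*buf[y-1][x+1] + cf[3]*buf[y][x-1];
;           buf[y][x] = iclip(buf[y][x] + ((s + (1 << (shift-1))) >> shift),
;                             -128, 127);
;       }
; The three taps from the row above are done with pmaddwd; the left tap uses
; the previous output, so it is evaluated serially in .x_loop_ar1_inner.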
158.ar1:
159%if ARCH_X86_32
160    DEFINE_ARGS buf, fg_data, cf3, unused, val3, min, max
161%elif WIN64
162    DEFINE_ARGS shift, fg_data, cf3, buf, val3, min, max, x, val0
163    mov            bufq, r0
164%else
165    DEFINE_ARGS buf, fg_data, cf3, shift, val3, min, max, x, val0
166%endif
167    movsx          cf3d, byte [fg_dataq+FGData.ar_coeffs_y+3]
168    movd             m4, [fg_dataq+FGData.ar_coeffs_y]
169    mov             ecx, [fg_dataq+FGData.ar_coeff_shift]
170%if ARCH_X86_32
171    mov             r1m, cf3d
172    DEFINE_ARGS buf, shift, val3, min, max, x, val0
173%define hd r0mp
174%define cf3d r1mp
175%elif WIN64
176    DEFINE_ARGS shift, h, cf3, buf, val3, min, max, x, val0
177%else
178    DEFINE_ARGS buf, h, cf3, shift, val3, min, max, x, val0
179%endif
180    pxor             m6, m6
181    pcmpgtb          m7, m6, m4
182    punpcklbw        m4, m7
183    pinsrw           m4, [base+pw_1], 3
184    pshufd           m5, m4, q1111
185    pshufd           m4, m4, q0000
186    movd             m3, [base+round_vals+shiftq*2-12]    ; rnd
187    pshuflw          m3, m3, q0000
188    sub            bufq, 82*73-(82*3+79)
189    mov              hd, 70
190    mov            mind, -128
191    mov            maxd, 127
192.y_loop_ar1:
193    mov              xq, -76
194    movsx         val3d, byte [bufq+xq-1]
195.x_loop_ar1:
196    movq             m0, [bufq+xq-82-1]     ; top/left
197    pcmpgtb          m7, m6, m0
198    punpcklbw        m0, m7
199    psrldq           m2, m0, 2              ; top
200    psrldq           m1, m0, 4              ; top/right
201    punpcklwd        m0, m2
202    punpcklwd        m1, m3
203    pmaddwd          m0, m4
204    pmaddwd          m1, m5
205    paddd            m0, m1
206.x_loop_ar1_inner:
207    movd          val0d, m0
208    psrldq           m0, 4
209    imul          val3d, cf3d
210    add           val3d, val0d
211    sar           val3d, shiftb
212    movsx         val0d, byte [bufq+xq]
213    add           val3d, val0d
214    cmp           val3d, maxd
215    cmovns        val3d, maxd
216    cmp           val3d, mind
217    cmovs         val3d, mind
218    mov  byte [bufq+xq], val3b
219    ; keep val3d in-place as left for next x iteration
220    inc              xq
221    jz .x_loop_ar1_end
222    test             xq, 3
223    jnz .x_loop_ar1_inner
224    jmp .x_loop_ar1
225
226.x_loop_ar1_end:
227    add            bufq, 82
228    dec              hd
229    jg .y_loop_ar1
230.ar0:
231    RET
232
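; Lag-2 case: 12 coefficients cover the two rows above (x-2..x+2) plus the two
; pixels to the left; the row taps are vectorised below, while the two left
; taps depend on freshly written output and are handled serially in
; .x_loop_ar2_inner.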
233.ar2:
234%if ARCH_X86_32
235    ALLOC_STACK -16*8
236%endif
237    DEFINE_ARGS buf, fg_data, shift
238    mov          shiftd, [fg_dataq+FGData.ar_coeff_shift]
239    movd             m6, [base+round_vals-12+shiftq*2]
240    movd             m7, [base+byte_blend+1]
241    SCRATCH           7, 15, 7
242    movq             m0, [fg_dataq+FGData.ar_coeffs_y+0]    ; cf0-7
243    movd             m1, [fg_dataq+FGData.ar_coeffs_y+8]    ; cf8-11
244    pxor             m7, m7
245    pshuflw          m6, m6, q0000
246    punpcklwd        m6, m7
247    pcmpgtb          m4, m7, m0
248    pcmpgtb          m5, m7, m1
249    punpcklbw        m0, m4
250    punpcklbw        m1, m5
251    DEFINE_ARGS buf, fg_data, h, x
252    pshufd           m4, m1, q0000
253    pshufd           m5, m1, q1111
254    pshufd           m3, m0, q3333
255    pshufd           m2, m0, q2222
256    pshufd           m1, m0, q1111
257    pshufd           m0, m0, q0000
258    SCRATCH           0, 8,  0
259    SCRATCH           1, 9,  1
260    SCRATCH           2, 10, 2
261    SCRATCH           3, 11, 3
262    SCRATCH           4, 12, 4
263    SCRATCH           5, 13, 5
264    SCRATCH           6, 14, 6
265    sub            bufq, 82*73-(82*3+79)
266    mov              hd, 70
267.y_loop_ar2:
268    mov              xq, -76
269
270.x_loop_ar2:
271    movq             m0, [bufq+xq-82*2-2]   ; y=-2,x=[-2,+5]
272    movhps           m0, [bufq+xq-82*1-2]   ; y=-1,x=[-2,+5]
273    pcmpgtb          m2, m7, m0
274    punpckhbw        m1, m0, m2
275    punpcklbw        m0, m2
276    psrldq           m5, m0, 2              ; y=-2,x=[-1,+5]
277    psrldq           m3, m1, 2              ; y=-1,x=[-1,+5]
278    psrldq           m4, m1, 4              ; y=-1,x=[+0,+5]
279    punpcklwd        m2, m0, m5
280    punpcklwd        m3, m4
281    pmaddwd          m2, m8
282    pmaddwd          m3, m11
283    paddd            m2, m3
284
285    psrldq           m4, m0, 4              ; y=-2,x=[+0,+5]
286    psrldq           m5, m0, 6              ; y=-2,x=[+1,+5]
287    psrldq           m6, m0, 8              ; y=-2,x=[+2,+5]
288    punpcklwd        m4, m5
289    punpcklwd        m6, m1
290    psrldq           m5, m1, 6              ; y=-1,x=[+1,+5]
291    psrldq           m1, m1, 8              ; y=-1,x=[+2,+5]
292    punpcklwd        m5, m1
293    pmaddwd          m4, m9
294    pmaddwd          m6, m10
295    pmaddwd          m5, m12
296    paddd            m4, m6
297    paddd            m2, m5
298    paddd            m2, m4
299    paddd            m2, m14
300
301    movq             m0, [bufq+xq-2]        ; y=0,x=[-2,+5]
302.x_loop_ar2_inner:
303    pcmpgtb          m4, m7, m0
304    punpcklbw        m1, m0, m4
305    pmaddwd          m3, m1, m13
306    paddd            m3, m2
307    psrldq           m1, 4                  ; y=0,x=0
308    psrldq           m2, 4                  ; shift top to next pixel
309    psrad            m3, [fg_dataq+FGData.ar_coeff_shift]
310    ; don't packssdw since we only care about one value
311    paddw            m3, m1
312    packsswb         m3, m3
313    pslldq           m3, 2
314    pand             m3, m15
315    pandn            m1, m15, m0
316    por              m0, m1, m3
317    psrldq           m0, 1
318    ; overwrite 2 pixels, but that's ok
319    movd      [bufq+xq-1], m0
320    inc              xq
321    jz .x_loop_ar2_end
322    test             xq, 3
323    jnz .x_loop_ar2_inner
324    jmp .x_loop_ar2
325
326.x_loop_ar2_end:
327    add            bufq, 82
328    dec              hd
329    jg .y_loop_ar2
330    RET
331
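; Lag-3 case: 24 coefficients cover the three rows above (x-3..x+3) plus the
; three pixels to the left; as with lag 1/2, the left taps are evaluated
; serially in .x_loop_ar3_inner.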
332.ar3:
333    DEFINE_ARGS buf, fg_data, shift
334%if ARCH_X86_32
335    ALLOC_STACK  -16*14
336%elif WIN64
337    SUB             rsp, 16*6
338%assign stack_size_padded (stack_size_padded+16*6)
339%assign stack_size (stack_size+16*6)
340%else
341    ALLOC_STACK  -16*6
342%endif
343    mov          shiftd, [fg_dataq+FGData.ar_coeff_shift]
344    movd             m6, [base+round_vals-12+shiftq*2]
345    movd             m7, [base+byte_blend]
346    movu             m0, [fg_dataq+FGData.ar_coeffs_y+ 0]   ; cf0-15
347    movq             m2, [fg_dataq+FGData.ar_coeffs_y+16]   ; cf16-23
348    pxor             m3, m3
349    pcmpgtb          m4, m3, m0
350    pcmpgtb          m3, m2
351    pshuflw          m6, m6, q0000
352    SCRATCH           6, 14, 12
353    SCRATCH           7, 15, 13
354    punpckhbw        m1, m0, m4
355    punpcklbw        m0, m4
356    punpcklbw        m2, m3
357    pshufd           m3, m0, q1111
358    pshufd           m4, m0, q2222
359    pshufd           m5, m0, q3333
360    pshufd           m0, m0, q0000
361    mova    [rsp+ 0*16], m0
362    mova    [rsp+ 1*16], m3
363    mova    [rsp+ 2*16], m4
364    mova    [rsp+ 3*16], m5
365    pshufd           m6, m1, q1111
366    pshufd           m7, m1, q2222
367    pshufd           m5, m1, q3333
368    pshufd           m1, m1, q0000
369    pshufd           m3, m2, q1111
370    psrldq           m0, m2, 10
371    pinsrw           m2, [base+pw_1], 5
372    pshufd           m4, m2, q2222
373    pshufd           m2, m2, q0000
374    pinsrw           m0, [base+round_vals+shiftq*2-10], 3
375    mova    [rsp+ 4*16], m1
376    mova    [rsp+ 5*16], m6
377    SCRATCH           7, 8,  6
378    SCRATCH           5, 9,  7
379    SCRATCH           2, 10, 8
380    SCRATCH           3, 11, 9
381    SCRATCH           4, 12, 10
382    SCRATCH           0, 13, 11
383    DEFINE_ARGS buf, fg_data, h, x
384    sub            bufq, 82*73-(82*3+79)
385    mov              hd, 70
386.y_loop_ar3:
387    mov              xq, -76
388
389.x_loop_ar3:
390    movu             m0, [bufq+xq-82*3-3]   ; y=-3,x=[-3,+12]
391    pxor             m3, m3
392    pcmpgtb          m3, m0
393    punpckhbw        m2, m0, m3
394    punpcklbw        m0, m3
395
396    psrldq           m5, m0, 2
397    psrldq           m6, m0, 4
398    psrldq           m7, m0, 6
399    punpcklwd        m4, m0, m5
400    punpcklwd        m6, m7
401    pmaddwd          m4, [rsp+ 0*16]
402    pmaddwd          m6, [rsp+ 1*16]
403    paddd            m4, m6
404
405    movu             m1, [bufq+xq-82*2-3]   ; y=-2,x=[-3,+12]
406    pxor             m5, m5
407    pcmpgtb          m5, m1
408    punpckhbw        m3, m1, m5
409    punpcklbw        m1, m5
410    palignr          m6, m2, m0, 10
411    palignr          m7, m2, m0, 12
412    psrldq           m0, 8
413    punpcklwd        m0, m6
414    punpcklwd        m7, m1
415    pmaddwd          m0, [rsp+ 2*16]
416    pmaddwd          m7, [rsp+ 3*16]
417    paddd            m0, m7
418    paddd            m0, m4
419
420    psrldq           m4, m1, 2
421    psrldq           m5, m1, 4
422    psrldq           m6, m1, 6
423    psrldq           m7, m1, 8
424    punpcklwd        m4, m5
425    punpcklwd        m6, m7
426    pmaddwd          m4, [rsp+ 4*16]
427    pmaddwd          m6, [rsp+ 5*16]
428    paddd            m4, m6
429    paddd            m0, m4
430
431    movu             m2, [bufq+xq-82*1-3]   ; y=-1,x=[-3,+12]
432    pxor             m7, m7
433    pcmpgtb          m7, m2
434    punpckhbw        m5, m2, m7
435    punpcklbw        m2, m7
436    palignr          m7, m3, m1, 10
437    palignr          m3, m1, 12
438    psrldq           m1, m2, 2
439    punpcklwd        m7, m3
440    punpcklwd        m3, m2, m1
441    pmaddwd          m7, m8
442    pmaddwd          m3, m9
443    paddd            m7, m3
444    paddd            m0, m7
445
446    psrldq           m6, m2, 4
447    psrldq           m1, m2, 6
448    psrldq           m3, m2, 8
449    palignr          m4, m5, m2, 10
450    palignr          m5, m5, m2, 12
451
452    punpcklwd        m6, m1
453    punpcklwd        m3, m4
454    punpcklwd        m5, m14
455    pmaddwd          m6, m10
456    pmaddwd          m3, m11
457    pmaddwd          m5, m12
458    paddd            m0, m6
459    paddd            m3, m5
460    paddd            m0, m3
461
462    movq             m1, [bufq+xq-3]        ; y=0,x=[-3,+4]
463.x_loop_ar3_inner:
464    pxor             m5, m5
465    pcmpgtb          m5, m1
466    punpcklbw        m2, m1, m5
467    pmaddwd          m2, m13
468    pshufd           m3, m2, q1111
469    paddd            m2, m3                 ; left+cur
470    paddd            m2, m0                 ; add top
471    psrldq           m0, 4
472    psrad            m2, [fg_dataq+FGData.ar_coeff_shift]
473    ; don't packssdw since we only care about one value
474    packsswb         m2, m2
475    pslldq           m2, 3
476    pand             m2, m15
477    pandn            m3, m15, m1
478    por              m1, m2, m3
479    movd    [bufq+xq-3], m1
480    psrldq           m1, 1
481    inc              xq
482    jz .x_loop_ar3_end
483    test             xq, 3
484    jnz .x_loop_ar3_inner
485    jmp .x_loop_ar3
486
487.x_loop_ar3_end:
488    add            bufq, 82
489    dec              hd
490    jg .y_loop_ar3
491    RET
492
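; Chroma grain generation. %1 is the layout name, %2/%3 are the horizontal and
; vertical subsampling flags, so the grain grid is 44x38 (420), 44x73 (422) or
; 82x73 (444). The seed is XORed with pw_seed_xor[uv] to decorrelate the two
; planes, and the chroma AR filters take an extra luma tap: the collocated
; luma grain (bufy) is averaged via pb_1/pmaddubsw when subsampled and
; weighted by the ar_coeffs_uv coefficient that follows the spatial taps.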
493%macro generate_grain_uv_fn 3 ; ss_name, ss_x, ss_y
494INIT_XMM ssse3
495cglobal generate_grain_uv_%1_8bpc, 1, 7 + 3 * ARCH_X86_64, 16, buf, bufy, fg_data, uv
496    movifnidn        r2, r2mp
497    movifnidn        r3, r3mp
498    LEA              r4, $$
499%define base r4-$$
500    movq             m1, [base+rnd_next_upperbit_mask]
501    movq             m4, [base+mul_bits]
502    movq             m7, [base+hmul_bits]
503    mov             r5d, [fg_dataq+FGData.grain_scale_shift]
504    movd             m6, [base+round+r5*2]
505    mova             m5, [base+pb_mask]
506    movd             m0, [fg_dataq+FGData.seed]
507    movd             m2, [base+pw_seed_xor+uvq*4]
508    pxor             m0, m2
509    pshuflw          m6, m6, q0000
510    pshuflw          m0, m0, q0000
511    lea              r6, [base+gaussian_sequence]
512%if %2
513%if ARCH_X86_64
514    mov             r7d, 73-35*%3
515%else
516    mov            r3mp, 73-35*%3
517%endif
518    add            bufq, 44
519.loop_y:
520    mov              r5, -44
521.loop_x:
522%else
523    mov              r5, -82*73
524    sub            bufq, r5
525.loop:
526%endif
527    pand             m2, m0, m1
528    psrlw            m3, m2, 10
529    por              m2, m3             ; bits 0xf, 0x1e, 0x3c and 0x78 are set
530    pmullw           m2, m4             ; bits 0x0f00 are set
531    pshufb           m3, m5, m2         ; set 15th bit for next 4 seeds
532    psllq            m2, m3, 30
533    por              m3, m2
534    psllq            m2, m3, 15
535    por              m3, m2             ; aggregate each bit into next seed's high bit
536    pmulhuw          m2, m0, m7
537    por              m2, m3             ; 4 next output seeds
538    pshuflw          m0, m2, q3333
539    psrlw            m2, 5
540%if ARCH_X86_64
541    movd            r9d, m2
542    pshuflw          m2, m2, q3232
543    movzx            r8, r9w
544    shr              r9, 16
545
546    movd             m3, [r6+r8*2]
547    pinsrw           m3, [r6+r9*2], 1
548
549    movd            r9d, m2
550    movzx            r8, r9w
551    shr              r9, 16
552
553    pinsrw           m3, [r6+r8*2], 2
554    pinsrw           m3, [r6+r9*2], 3
555%else
556    movd             r2, m2
557    pshuflw          m2, m2, q3232
558    movzx            r1, r2w
559    shr              r2, 16
560
561    movd             m3, [r6+r1*2]
562    pinsrw           m3, [r6+r2*2], 1
563
564    movd             r2, m2
565    movzx            r1, r2w
566    shr              r2, 16
567
568    pinsrw           m3, [r6+r1*2], 2
569    pinsrw           m3, [r6+r2*2], 3
570%endif
571    pmulhrsw         m3, m6
572    packsswb         m3, m3
573    movd      [bufq+r5], m3
574    add              r5, 4
575%if %2
576    jl .loop_x
577    add            bufq, 82
578%if ARCH_X86_64
579    dec             r7d
580%else
581    dec            r3mp
582%endif
583    jg .loop_y
584%else
585    jl .loop
586%endif
587
588%if ARCH_X86_32
589    mov              r2, r2mp
590%endif
591
592    ; auto-regression code
593    movsxd           r5, [fg_dataq+FGData.ar_coeff_lag]
594    movsxd           r5, [base+generate_grain_uv_%1_8bpc_ssse3_table+r5*4]
595    lea              r5, [r5+base+generate_grain_uv_%1_8bpc_ssse3_table]
596    jmp              r5
597
598.ar0:
599    DEFINE_ARGS buf, bufy, fg_data, uv, unused, shift
600    movifnidn     bufyq, bufymp
601%if ARCH_X86_32
602    ALLOC_STACK   -2*16
603%endif
604    imul            uvd, 28
605    mov          shiftd, [fg_dataq+FGData.ar_coeff_shift]
606    movd             m5, [fg_dataq+FGData.ar_coeffs_uv+uvq]
607    movd             m4, [base+hmul_bits+shiftq*2]
608    DEFINE_ARGS buf, bufy, h, x
609    pxor             m0, m0
610    pcmpgtb          m0, m5
611    punpcklbw        m5, m0
612    movd             m7, [base+pb_1]
613%if %2
614    movd             m6, [base+hmul_bits+2+%3*2]
615%endif
616    pshuflw          m5, m5, q0000
617    pshuflw          m4, m4, q0000
618    pshufd           m7, m7, q0000
619%if %2
620    pshuflw          m6, m6, q0000
621%endif
622    punpcklqdq       m5, m5
623    punpcklqdq       m4, m4
624%if %2
625    punpcklqdq       m6, m6
626%endif
627    pcmpeqw          m1, m1
628    pslldq           m1, 12>>%2
629    SCRATCH           1, 8, 0
630    SCRATCH           4, 9, 1
631%if %2
632    sub            bufq, 82*(73-35*%3)+82-(82*3+41)
633%else
634    sub            bufq, 82*70-3
635%endif
636    add           bufyq, 3+82*3
637    mov              hd, 70-35*%3
638.y_loop_ar0:
639    xor              xd, xd
640.x_loop_ar0:
641    ; first 32 pixels
642%if %2
643    movu             m1, [bufyq+xq*2]
644%if %3
645    movu             m2, [bufyq+xq*2+82]
646%endif
647    movu             m3, [bufyq+xq*2+16]
648%if %3
649    movu             m4, [bufyq+xq*2+82+16]
650%endif
651    pmaddubsw        m0, m7, m1
652%if %3
653    pmaddubsw        m1, m7, m2
654%endif
655    pmaddubsw        m2, m7, m3
656%if %3
657    pmaddubsw        m3, m7, m4
658    paddw            m0, m1
659    paddw            m2, m3
660%endif
661    pmulhrsw         m0, m6
662    pmulhrsw         m2, m6
663%else
664    movu             m0, [bufyq+xq]
665    pxor             m6, m6
666    pcmpgtb          m6, m0
667    punpckhbw        m2, m0, m6
668    punpcklbw        m0, m6
669%endif
670    pmullw           m0, m5
671    pmullw           m2, m5
672    pmulhrsw         m0, m9
673    pmulhrsw         m2, m9
674    movu             m1, [bufq+xq]
675    pxor             m4, m4
676    pcmpgtb          m4, m1
677    punpckhbw        m3, m1, m4
678%if %2
679    punpcklbw        m1, m4
680    paddw            m2, m3
681    paddw            m0, m1
682%else
683    punpcklbw        m6, m1, m4
684    paddw            m2, m3
685    paddw            m0, m6
686%endif
687    packsswb         m0, m2
688%if %2
689    movu      [bufq+xq], m0
690    add              xd, 16
691    cmp              xd, 32
692    jl .x_loop_ar0
693
694    ; last 6/12 pixels
695    movu             m1, [bufyq+xq*(1+%2)]
696%if %3
697    movu             m2, [bufyq+xq*2+82]
698%endif
699    pmaddubsw        m0, m7, m1
700%if %3
701    pmaddubsw        m1, m7, m2
702    paddw            m0, m1
703%endif
704    pmulhrsw         m0, m6
705    pmullw           m0, m5
706    pmulhrsw         m0, m9
707    movq             m1, [bufq+xq]
708    pxor             m4, m4
709    pcmpgtb          m4, m1
710    punpcklbw        m2, m1, m4
711    paddw            m0, m2
712    packsswb         m0, m0
713    pandn            m2, m8, m0
714    pand             m1, m8
715    por              m2, m1
716    movq      [bufq+xq], m2
717%else
718    add              xd, 16
719    cmp              xd, 80
720    je .y_loop_final_ar0
721    movu   [bufq+xq-16], m0
722    jmp .x_loop_ar0
723.y_loop_final_ar0:
724    pandn            m2, m8, m0
725    pand             m1, m8
726    por              m2, m1
727    movu   [bufq+xq-16], m2
728%endif
729
730    add            bufq, 82
731    add           bufyq, 82<<%3
732    dec              hd
733    jg .y_loop_ar0
734    RET
735
736.ar1:
737%if ARCH_X86_32
738    RESET_STACK_STATE
739%endif
740    DEFINE_ARGS buf, bufy, fg_data, uv, val3, cf3, min, max, x
741    imul            uvd, 28
742    movsx          cf3d, byte [fg_dataq+FGData.ar_coeffs_uv+uvq+3]
743    movd             m4, [fg_dataq+FGData.ar_coeffs_uv+uvq-1]
744    pinsrw           m4, [fg_dataq+FGData.ar_coeffs_uv+uvq+4], 2
745%if ARCH_X86_32
746    mov            r3mp, cf3d
747    DEFINE_ARGS buf, shift, fg_data, val3, min, max, x
748%elif WIN64
749    DEFINE_ARGS shift, bufy, fg_data, buf, val3, cf3, min, max, x
750    mov            bufq, r0
751%else
752    DEFINE_ARGS buf, bufy, fg_data, shift, val3, cf3, min, max, x
753%endif
754    mov          shiftd, [fg_dataq+FGData.ar_coeff_shift]
755    movd             m3, [base+round_vals+shiftq*2-12]    ; rnd
756%if %2
757    movd             m7, [base+pb_1]
758    movd             m6, [base+hmul_bits+2+%3*2]
759%endif
760    psrldq           m4, 1
761%if ARCH_X86_32
762    DEFINE_ARGS buf, shift, val0, val3, min, max, x
763%elif WIN64
764    DEFINE_ARGS shift, bufy, h, buf, val3, cf3, min, max, x, val0
765%else
766    DEFINE_ARGS buf, bufy, h, shift, val3, cf3, min, max, x, val0
767%endif
768    pxor             m5, m5
769    punpcklwd        m3, m5
770%if %2
771    punpcklwd        m6, m6
772%endif
773    pcmpgtb          m5, m4
774    punpcklbw        m4, m5
775    pshufd           m5, m4, q1111
776    pshufd           m4, m4, q0000
777    pshufd           m3, m3, q0000
778%if %2
779    pshufd           m7, m7, q0000
780    pshufd           m6, m6, q0000
781    sub            bufq, 82*(73-35*%3)+44-(82*3+41)
782%else
783    sub            bufq, 82*69+3
784%endif
785%if ARCH_X86_32
786    add            r1mp, 79+82*3
787    mov            r0mp, 70-35*%3
788%else
789    add           bufyq, 79+82*3
790    mov              hd, 70-35*%3
791%endif
792    mov            mind, -128
793    mov            maxd, 127
794.y_loop_ar1:
795    mov              xq, -(76>>%2)
796    movsx         val3d, byte [bufq+xq-1]
797.x_loop_ar1:
798%if %2
799%if ARCH_X86_32
800    mov              r2, r1mp
801    movq             m0, [r2+xq*2]
802%if %3
803    movq             m1, [r2+xq*2+82]
804%endif
805%else
806    movq             m0, [bufyq+xq*2]
807%if %3
808    movq             m1, [bufyq+xq*2+82]
809%endif
810%endif
811    pmaddubsw        m2, m7, m0
812%if %3
813    pmaddubsw        m0, m7, m1
814    paddw            m2, m0
815%endif
816    pmulhrsw         m2, m6
817%else
818%if ARCH_X86_32
819    mov              r2, r1mp
820    movd             m2, [r2+xq]
821%else
822    movd             m2, [bufyq+xq]
823%endif
824    pxor             m0, m0
825    pcmpgtb          m0, m2
826    punpcklbw        m2, m0
827%endif
828
829    movq             m0, [bufq+xq-82-1]     ; top/left
830    pxor             m1, m1
831    pcmpgtb          m1, m0
832    punpcklbw        m0, m1
833    psrldq           m1, m0, 4              ; top/right
834    punpcklwd        m1, m2
835    psrldq           m2, m0, 2              ; top
836    punpcklwd        m0, m2
837    pmaddwd          m0, m4
838    pmaddwd          m1, m5
839    paddd            m0, m1
840    paddd            m0, m3
841.x_loop_ar1_inner:
842    movd          val0d, m0
843    psrldq           m0, 4
844%if ARCH_X86_32
845    imul          val3d, r3mp
846%else
847    imul          val3d, cf3d
848%endif
849    add           val3d, val0d
850    sar           val3d, shiftb
851    movsx         val0d, byte [bufq+xq]
852    add           val3d, val0d
853    cmp           val3d, maxd
854    cmovns        val3d, maxd
855    cmp           val3d, mind
856    cmovs         val3d, mind
857    mov  byte [bufq+xq], val3b
858    ; keep val3d in-place as left for next x iteration
859    inc              xq
860    jz .x_loop_ar1_end
861    test             xq, 3
862    jnz .x_loop_ar1_inner
863    jmp .x_loop_ar1
864
865.x_loop_ar1_end:
866    add            bufq, 82
867%if ARCH_X86_32
868    add            r1mp, 82<<%3
869    dec            r0mp
870%else
871    add           bufyq, 82<<%3
872    dec              hd
873%endif
874    jg .y_loop_ar1
875    RET
876
877.ar2:
878%if ARCH_X86_32
879    ALLOC_STACK   -8*16
880%endif
881    DEFINE_ARGS buf, bufy, fg_data, uv, unused, shift
882    movifnidn     bufyq, bufymp
883    mov          shiftd, [fg_dataq+FGData.ar_coeff_shift]
884    imul            uvd, 28
885    movd             m7, [base+round_vals-12+shiftq*2]
886    movu             m0, [fg_dataq+FGData.ar_coeffs_uv+uvq+0]   ; cf0-12
887    pxor             m2, m2
888    pcmpgtb          m2, m0
889    punpckhbw        m1, m0, m2
890    punpcklbw        m0, m2
891    pinsrw           m1, [base+pw_1], 5
892    punpcklwd        m7, m7
893    pshufd           m7, m7, q0000
894    DEFINE_ARGS buf, bufy, fg_data, h, unused, x
895    pshufd           m4, m1, q0000
896    pshufd           m5, m1, q1111
897    pshufd           m6, m1, q2222
898    pshufd           m3, m0, q3333
899    pshufd           m2, m0, q2222
900    pshufd           m1, m0, q1111
901    pshufd           m0, m0, q0000
902    SCRATCH           0, 8,  0
903    SCRATCH           1, 9,  1
904    SCRATCH           2, 10, 2
905    SCRATCH           3, 11, 3
906    SCRATCH           4, 12, 4
907    SCRATCH           5, 13, 5
908    SCRATCH           6, 14, 6
909    SCRATCH           7, 15, 7
910%if %2
911    movd             m7, [base+hmul_bits+2+%3*2]
912    movd             m6, [base+pb_1]
913    punpcklwd        m7, m7
914    pshufd           m6, m6, q0000
915    pshufd           m7, m7, q0000
916    sub            bufq, 82*(73-35*%3)+44-(82*3+41)
917%else
918    sub            bufq, 82*69+3
919%endif
920    add           bufyq, 79+82*3
921    mov              hd, 70-35*%3
922.y_loop_ar2:
923    mov              xq, -(76>>%2)
924
925.x_loop_ar2:
926    pxor             m2, m2
927    movq             m0, [bufq+xq-82*2-2]   ; y=-2,x=[-2,+5]
928    movhps           m0, [bufq+xq-82*1-2]   ; y=-1,x=[-2,+5]
929    pcmpgtb          m2, m0
930    punpckhbw        m1, m0, m2
931    punpcklbw        m0, m2
932    psrldq           m5, m0, 2              ; y=-2,x=[-1,+5]
933    psrldq           m3, m1, 2              ; y=-1,x=[-1,+5]
934    psrldq           m4, m1, 4              ; y=-1,x=[+0,+5]
935    punpcklwd        m2, m0, m5
936    punpcklwd        m3, m4
937    pmaddwd          m2, m8
938    pmaddwd          m3, m11
939    paddd            m2, m3
940
941    psrldq           m4, m0, 4              ; y=-2,x=[+0,+5]
942    psrldq           m5, m0, 6              ; y=-2,x=[+1,+5]
943    psrldq           m0, 8                  ; y=-2,x=[+2,+5]
944    punpcklwd        m4, m5
945    punpcklwd        m0, m1
946    psrldq           m3, m1, 6              ; y=-1,x=[+1,+5]
947    psrldq           m1, m1, 8              ; y=-1,x=[+2,+5]
948    punpcklwd        m3, m1
949    pmaddwd          m4, m9
950    pmaddwd          m0, m10
951    pmaddwd          m3, m12
952    paddd            m4, m0
953    paddd            m2, m3
954    paddd            m2, m4
955
956%if %2
957    movq             m1, [bufyq+xq*2]
958%if %3
959    movq             m3, [bufyq+xq*2+82]
960%endif
961    pmaddubsw        m0, m6, m1
962%if %3
963    pmaddubsw        m1, m6, m3
964    paddw            m0, m1
965%endif
966    pmulhrsw         m0, m7
967%else
968    movd             m0, [bufyq+xq]
969    pxor             m1, m1
970    pcmpgtb          m1, m0
971    punpcklbw        m0, m1
972%endif
973    punpcklwd        m0, m15
974    pmaddwd          m0, m14
975    paddd            m2, m0
976
977    movq             m0, [bufq+xq-2]        ; y=0,x=[-2,+5]
978    pxor             m4, m4
979    movd             m5, [base+byte_blend+1]
980    punpcklbw        m5, m5
981.x_loop_ar2_inner:
982    pcmpgtb          m1, m4, m0
983    punpcklbw        m0, m1
984    pmaddwd          m3, m0, m13
985    paddd            m3, m2
986    psrldq           m2, 4                  ; shift top to next pixel
987    psrad            m3, [fg_dataq+FGData.ar_coeff_shift]
988    pslldq           m3, 4
989    pand             m3, m5
990    paddw            m0, m3
991    packsswb         m0, m0
992    movd    [bufq+xq-2], m0
993    psrldq           m0, 1
994    inc              xq
995    jz .x_loop_ar2_end
996    test             xq, 3
997    jnz .x_loop_ar2_inner
998    jmp .x_loop_ar2
999
1000.x_loop_ar2_end:
1001    add            bufq, 82
1002    add           bufyq, 82<<%3
1003    dec              hd
1004    jg .y_loop_ar2
1005    RET
1006
1007.ar3:
1008%if ARCH_X86_32
1009    RESET_STACK_STATE
1010%endif
1011    DEFINE_ARGS buf, bufy, fg_data, uv, unused, shift
1012    movifnidn     bufyq, bufymp
1013%if ARCH_X86_32
1014    ALLOC_STACK  -15*16
1015%else
1016    SUB             rsp, 16*7
1017%assign stack_size_padded (stack_size_padded+16*7)
1018%assign stack_size (stack_size+16*7)
1019%endif
1020    mov          shiftd, [fg_dataq+FGData.ar_coeff_shift]
1021    imul            uvd, 28
1022
1023    movu             m0, [fg_dataq+FGData.ar_coeffs_uv+uvq+ 0]   ; cf0-15
1024    pxor             m3, m3
1025    pcmpgtb          m3, m0
1026    punpckhbw        m1, m0, m3
1027    punpcklbw        m0, m3
1028    pshufd           m2, m0, q1111
1029    pshufd           m3, m0, q2222
1030    pshufd           m4, m0, q3333
1031    pshufd           m0, m0, q0000
1032    pshufd           m5, m1, q1111
1033    pshufd           m6, m1, q2222
1034    pshufd           m7, m1, q3333
1035    pshufd           m1, m1, q0000
1036    mova    [rsp+ 0*16], m0
1037    mova    [rsp+ 1*16], m2
1038    mova    [rsp+ 2*16], m3
1039    mova    [rsp+ 3*16], m4
1040    mova    [rsp+ 4*16], m1
1041    mova    [rsp+ 5*16], m5
1042    mova    [rsp+ 6*16], m6
1043    SCRATCH           7, 8, 7
1044
1045    movu             m2, [fg_dataq+FGData.ar_coeffs_uv+uvq+16]   ; cf16-24 [24=luma]
1046    pxor             m4, m4
1047    pcmpgtb          m4, m2
1048    punpckhbw        m5, m2, m4
1049    punpcklbw        m2, m4
1050    pshufd           m4, m2, q3232
1051    punpcklwd        m3, m4, m5
1052    pshuflw          m5, m4, q3321
1053    pshufd           m4, m3, q0000
1054    pshufd           m3, m2, q1111
1055    pshufd           m2, m2, q0000
1056    pinsrw           m5, [base+round_vals+shiftq*2-10], 3
1057    SCRATCH           2, 9,  8
1058    SCRATCH           3, 10, 9
1059    SCRATCH           4, 11, 10
1060    SCRATCH           5, 12, 11
1061
1062    movd             m2, [base+round_vals-12+shiftq*2]
1063%if %2
1064    movd             m1, [base+pb_1]
1065    movd             m3, [base+hmul_bits+2+%3*2]
1066%endif
1067    pxor             m0, m0
1068    punpcklwd        m2, m0
1069%if %2
1070    punpcklwd        m3, m3
1071%endif
1072    pshufd           m2, m2, q0000
1073%if %2
1074    pshufd           m1, m1, q0000
1075    pshufd           m3, m3, q0000
1076    SCRATCH           1, 13, 12
1077%endif
1078    SCRATCH           2, 14, 13
1079%if %2
1080    SCRATCH           3, 15, 14
1081%endif
1082
1083    DEFINE_ARGS buf, bufy, fg_data, h, unused, x
1084%if %2
1085    sub            bufq, 82*(73-35*%3)+44-(82*3+41)
1086%else
1087    sub            bufq, 82*69+3
1088%endif
1089    add           bufyq, 79+82*3
1090    mov              hd, 70-35*%3
1091.y_loop_ar3:
1092    mov              xq, -(76>>%2)
1093
1094.x_loop_ar3:
1095    movu             m0, [bufq+xq-82*3-3]   ; y=-3,x=[-3,+12]
1096    pxor             m4, m4
1097    pcmpgtb          m4, m0
1098    punpckhbw        m3, m0, m4
1099    punpcklbw        m0, m4
1100
1101    psrldq           m5, m0, 2
1102    psrldq           m6, m0, 4
1103    psrldq           m7, m0, 6
1104    punpcklwd        m4, m0, m5
1105    punpcklwd        m6, m7
1106    pmaddwd          m4, [rsp+ 0*16]
1107    pmaddwd          m6, [rsp+ 1*16]
1108    paddd            m4, m6
1109
1110    palignr          m2, m3, m0, 10
1111    palignr          m3, m0, 12
1112    psrldq           m0, 8
1113
1114    movu             m1, [bufq+xq-82*2-3]   ; y=-2,x=[-3,+12]
1115    pxor             m6, m6
1116    pcmpgtb          m6, m1
1117    punpckhbw        m5, m1, m6
1118    punpcklbw        m1, m6
1119
1120    punpcklwd        m0, m2
1121    punpcklwd        m3, m1
1122    pmaddwd          m0, [rsp+ 2*16]
1123    pmaddwd          m3, [rsp+ 3*16]
1124    paddd            m0, m3
1125    paddd            m0, m4
1126
1127    movu             m2, [bufq+xq-82*1-3]   ; y=-1,x=[-3,+12]
1128    pxor             m7, m7
1129    pcmpgtb          m7, m2
1130    punpckhbw        m6, m2, m7
1131    punpcklbw        m2, m7
1132
1133    palignr          m3, m5, m1, 10
1134    palignr          m5, m1, 12
1135    psrldq           m4, m2, 2
1136
1137    punpcklwd        m3, m5
1138    punpcklwd        m5, m2, m4
1139    pmaddwd          m3, [rsp+ 6*16]
1140    pmaddwd          m5, m8
1141    paddd            m3, m5
1142    paddd            m0, m3
1143
1144    psrldq           m3, m1, 2
1145    psrldq           m4, m1, 4
1146    psrldq           m5, m1, 6
1147    psrldq           m1, 8
1148
1149    punpcklwd        m3, m4
1150    punpcklwd        m5, m1
1151    pmaddwd          m3, [rsp+ 4*16]
1152    pmaddwd          m5, [rsp+ 5*16]
1153    paddd            m3, m5
1154    paddd            m0, m3
1155
1156%if %2
1157    movq             m1, [bufyq+xq*2]
1158%if %3
1159    movq             m3, [bufyq+xq*2+82]
1160%endif
1161    pmaddubsw        m7, m13, m1
1162%if %3
1163    pmaddubsw        m5, m13, m3
1164    paddw            m7, m5
1165%endif
1166    pmulhrsw         m7, m15
1167%else
1168    movd             m7, [bufyq+xq]
1169    pxor             m1, m1
1170    pcmpgtb          m1, m7
1171    punpcklbw        m7, m1
1172%endif
1173
1174    psrldq           m1, m2, 4
1175    psrldq           m3, m2, 6
1176    palignr          m4, m6, m2, 10
1177    palignr          m6, m2, 12
1178    psrldq           m2, 8
1179
1180    punpcklwd        m1, m3
1181    punpcklwd        m2, m4
1182    punpcklwd        m6, m7
1183    pmaddwd          m1, m9
1184    pmaddwd          m2, m10
1185    pmaddwd          m6, m11
1186    paddd            m1, m2
1187    paddd            m0, m6
1188    paddd            m0, m1
1189    paddd            m0, m14
1190
1191    movq             m1, [bufq+xq-3]        ; y=0,x=[-3,+4]
1192    pxor             m4, m4
1193    movd             m5, [base+byte_blend]
1194.x_loop_ar3_inner:
1195    pcmpgtb          m2, m4, m1
1196    punpcklbw        m3, m1, m2
1197    pmaddwd          m2, m3, m12
1198    pshufd           m3, m2, q1111
1199    paddd            m2, m3                 ; left+cur
1200    paddd            m2, m0                 ; add top
1201    psrldq           m0, 4
1202    psrad            m2, [fg_dataq+FGData.ar_coeff_shift]
1203    ; don't packssdw, we only care about one value
1204    packsswb         m2, m2
1205    pandn            m3, m5, m1
1206    pslld            m2, 24
1207    pand             m2, m5
1208    por              m1, m2, m3
1209    movd    [bufq+xq-3], m1
1210    psrldq           m1, 1
1211    inc              xq
1212    jz .x_loop_ar3_end
1213    test             xq, 3
1214    jnz .x_loop_ar3_inner
1215    jmp .x_loop_ar3
1216
1217.x_loop_ar3_end:
1218    add            bufq, 82
1219    add           bufyq, 82<<%3
1220    dec              hd
1221    jg .y_loop_ar3
1222    RET
1223%endmacro
1224
1225generate_grain_uv_fn 420, 1, 1
1226generate_grain_uv_fn 422, 1, 0
1227generate_grain_uv_fn 444, 0, 0
1228
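; vpgatherdw: SSSE3 has no gather instruction, so this emulates one: the
; 16-bit indices in %2 are extracted through two GPRs (%4/%5) and the table
; entries at %3 are assembled into %1 with pinsrw, 8 words per invocation.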
1229%macro vpgatherdw 5-6 ; dst, src, base, tmp_gpr[x2], tmp_xmm_reg
1230%assign %%idx 0
1231%define %%tmp %2
1232%if %0 == 6
1233%define %%tmp %6
1234%endif
1235%rep 4
1236%if %%idx == 0
1237    movd        %5 %+ d, %2
1238    pshuflw       %%tmp, %2, q3232
1239%else
1240    movd        %5 %+ d, %%tmp
1241%if %%idx == 2
1242    punpckhqdq    %%tmp, %%tmp
1243%elif %%idx == 4
1244    psrlq         %%tmp, 32
1245%endif
1246%endif
1247    movzx       %4 %+ d, %5 %+ w
1248    shr         %5 %+ d, 16
1249
1250%if %%idx == 0
1251    movd             %1, [%3+%4]
1252%else
1253    pinsrw           %1, [%3+%4], %%idx + 0
1254%endif
1255    pinsrw           %1, [%3+%5], %%idx + 1
1256%assign %%idx %%idx+2
1257%endrep
1258%endmacro
1259
1260INIT_XMM ssse3
1261; fgy_32x32xn(dst, src, stride, fg_data, w, scaling, grain_lut, h, sby)
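; Applies luma grain to a row of blocks (h rows, w columns). For each 32x32
; block, a seed derived from sby picks a pseudo-random offset into grain_lut;
; per pixel the result is, in essence (informative, names illustrative):
;   noise  = round2(scaling[src[x]] * grain[x], scaling_shift)
;   dst[x] = clip(src[x] + noise, min, max)
; with min/max taken from the clip_to_restricted_range tables, and the first
; two rows/columns of a block optionally blended with the neighbouring block's
; grain when the corresponding overlap flag is set.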
1262%if ARCH_X86_32
1263%if STACK_ALIGNMENT < mmsize
1264cglobal fgy_32x32xn_8bpc, 0, 7, 16, 0 - (5 * mmsize + 16 * gprsize), \
1265        dst, src, scaling, unused1, fg_data, picptr, unused2
1266    ; copy stack arguments to new position post-alignment, so that we
1267    ; don't have to keep the old stack location in a separate register
1268    mov              r0, r0m
1269    mov              r1, r2m
1270    mov              r2, r4m
1271    mov              r3, r6m
1272    mov              r4, r7m
1273    mov              r5, r8m
1274
1275    mov [rsp+5*mmsize+ 4*gprsize], r0
1276    mov [rsp+5*mmsize+ 6*gprsize], r1
1277    mov [rsp+5*mmsize+ 8*gprsize], r2
1278    mov [rsp+5*mmsize+10*gprsize], r3
1279    mov [rsp+5*mmsize+11*gprsize], r4
1280    mov [rsp+5*mmsize+12*gprsize], r5
1281%else
1282cglobal fgy_32x32xn_8bpc, 0, 7, 16, 5 * mmsize + 4 * gprsize, \
1283        dst, src, scaling, unused1, fg_data, picptr, unused2
1284%endif
1285    mov            srcq, srcm
1286    mov        fg_dataq, r3m
1287    mov        scalingq, r5m
1288%if STACK_ALIGNMENT < mmsize
1289%define r0m [rsp+5*mmsize+ 4*gprsize]
1290%define r1m [rsp+5*mmsize+ 5*gprsize]
1291%define r2m [rsp+5*mmsize+ 6*gprsize]
1292%define r3m [rsp+5*mmsize+ 7*gprsize]
1293%define r4m [rsp+5*mmsize+ 8*gprsize]
1294%define r5m [rsp+5*mmsize+ 9*gprsize]
1295%define r6m [rsp+5*mmsize+10*gprsize]
1296%define r7m [rsp+5*mmsize+11*gprsize]
1297%define r8m [rsp+5*mmsize+12*gprsize]
1298%endif
1299    LEA              r5, pb_mask
1300%define base r5-pb_mask
1301    mov             r5m, picptrq
1302%else
1303cglobal fgy_32x32xn_8bpc, 6, 15, 16, dst, src, stride, fg_data, w, scaling, grain_lut
1304    lea              r7, [pb_mask]
1305%define base r7-pb_mask
1306%endif
1307    mov             r6d, [fg_dataq+FGData.scaling_shift]
1308    movd             m3, [base+mul_bits+r6*2-14]
1309    mov             r6d, [fg_dataq+FGData.clip_to_restricted_range]
1310    movd             m4, [base+max+r6*4]
1311    movd             m5, [base+min+r6*2]
1312    punpcklwd        m3, m3
1313    punpcklwd        m4, m4
1314    punpcklwd        m5, m5
1315    pshufd           m3, m3, q0000
1316    pshufd           m4, m4, q0000
1317    pshufd           m5, m5, q0000
1318    SCRATCH           3, 11, 0
1319    SCRATCH           4, 12, 1
1320    SCRATCH           5, 13, 2
1321
1322%if ARCH_X86_32
1323    DEFINE_ARGS dst, src, scaling, sby, fg_data, picptr, overlap
1324%else
1325    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, unused, sby, see, overlap
1326%endif
1327
1328    mov            sbyd, r8m
1329    mov        overlapd, [fg_dataq+FGData.overlap_flag] ; left_overlap: overlap & 1
1330    test       overlapd, overlapd
1331    jz .no_vertical_overlap
1332    mova             m6, [base+pw_1024]
1333    mova             m7, [base+pb_27_17_17_27]
1334    SCRATCH           6, 14, 3
1335    SCRATCH           7, 15, 4
1336    test           sbyd, sbyd
1337    jnz .vertical_overlap
1338    ; fall-through
1339
1340.no_vertical_overlap:
1341    mov             r8m, overlapd
1342%if ARCH_X86_32
1343    DEFINE_ARGS dst, src, scaling, see, fg_data, picptr, unused
1344    imul           seed, (173 << 24) | 37
1345%else
1346    imul           seed, sbyd, (173 << 24) | 37
1347%endif
1348    add            seed, (105 << 24) | 178
1349    rol            seed, 8
1350    movzx          seed, seew
1351    xor            seed, [fg_dataq+FGData.seed]
1352
1353%if ARCH_X86_32
1354    DEFINE_ARGS dst, src, scaling, see, w, picptr, src_bak
1355
1356    mov             r3m, seed
1357    mov              wq, r4m
1358%else
1359    DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \
1360                unused1, unused2, see, unused3
1361%endif
1362
1363    lea        src_bakq, [srcq+wq]
1364    neg              wq
1365    sub           dstmp, srcq
1366%if ARCH_X86_32
1367    mov             r1m, src_bakq
1368    mov             r4m, wq
1369    DEFINE_ARGS dst, src, scaling, see, unused1, unused2, unused3
1370%endif
1371
1372.loop_x:
1373%if ARCH_X86_32
1374    mov            seed, r3m
1375%endif
1376    mov             r6d, seed
1377    or             seed, 0xEFF4
1378    shr             r6d, 1
1379    test           seeb, seeh
1380    lea            seed, [r6+0x8000]
1381    cmovp          seed, r6d                ; updated seed
1382%if ARCH_X86_32
1383    mov             r3m, seed
1384
1385    DEFINE_ARGS dst, src, scaling, offy, unused1, unused2, offx
1386
1387    mov           offxd, offyd
1388%else
1389    DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \
1390                offx, offy, see, unused
1391
1392    mov           offyd, seed
1393    mov           offxd, seed
1394%endif
1395    ror           offyd, 8
1396    shr           offxd, 12
1397    and           offyd, 0xf
1398    imul          offyd, 164
1399    lea           offyq, [offyq+offxq*2+747] ; offy*stride+offx
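    ; offx/offy are 4-bit random values scaled by 2 (stride 82, hence *164 and
    ; *2) and biased by 9 rows + 9 columns (747 = 9*82 + 9), so reads start
    ; inside the border of the 82x73 grain LUT.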
1400
1401%if ARCH_X86_32
1402    ; r0m=dst, r1m=src_bak, r2m=stride, r3m=see, r4m=w, r5m=picptr,
1403    ; r6m=grain_lut, r7m=h, r8m=overlap_v|h
1404    DEFINE_ARGS dst, src, scaling, offxy, h, picptr, grain_lut
1405%else
1406    DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \
1407                h, offxy, see, unused
1408%endif
1409
1410.loop_x_odd:
1411    mov              hd, r7m
1412    mov      grain_lutq, grain_lutmp
1413.loop_y:
1414    ; src
1415    mova             m0, [srcq]
1416    pxor             m2, m2
1417    punpckhbw        m1, m0, m2
1418    punpcklbw        m0, m2                 ; m0-1: src as word
1419
1420    ; scaling[src]
1421%if ARCH_X86_32
1422    vpgatherdw       m4, m0, scalingq-1, r0, r5, m3
1423    vpgatherdw       m5, m1, scalingq-1, r0, r5, m3
1424%else
1425    vpgatherdw       m4, m0, scalingq-1, r12, r13, m3
1426    vpgatherdw       m5, m1, scalingq-1, r12, r13, m3
1427%endif
1428    REPX {psrlw x, 8}, m4, m5
1429
1430    ; grain = grain_lut[offy+y][offx+x]
1431    movu             m3, [grain_lutq+offxyq]
1432    pcmpgtb          m7, m2, m3
1433    punpcklbw        m2, m3, m7
1434    punpckhbw        m3, m7
1435
1436    ; noise = round2(scaling[src] * grain, scaling_shift)
1437    pmullw           m2, m4
1438    pmullw           m3, m5
1439    pmulhrsw         m2, m11
1440    pmulhrsw         m3, m11
1441
1442    ; dst = clip_pixel(src, noise)
1443    paddw            m0, m2
1444    paddw            m1, m3
1445    pmaxsw           m0, m13
1446    pmaxsw           m1, m13
1447    pminsw           m0, m12
1448    pminsw           m1, m12
1449    packuswb         m0, m1
1450    movifnidn      dstq, dstmp
1451    mova    [dstq+srcq], m0
1452
1453    add            srcq, r2mp
1454    add      grain_lutq, 82
1455    dec              hd
1456    jg .loop_y
1457
1458%if ARCH_X86_32
1459    add            r4mp, 16
1460%else
1461    add              wq, 16
1462%endif
1463    jge .end
1464%if ARCH_X86_32
1465    mov            srcq, r1mp
1466    add            srcq, r4mp
1467%else
1468    lea            srcq, [src_bakq+wq]
1469%endif
1470    btc       dword r8m, 2
1471    jc .next_blk
1472
1473    add          offxyd, 16
1474    test      dword r8m, 2              ; r8m & 2 = have_top_overlap
1475    jz .loop_x_odd
1476
1477%if ARCH_X86_32
1478    add dword [rsp+5*mmsize+1*gprsize], 16
1479%else
1480    add            r11d, 16             ; top_offxyd
1481%endif
1482    jnz .loop_x_odd_v_overlap
1483
1484.next_blk:
1485    test      dword r8m, 1
1486    jz .loop_x
1487
1488    test      dword r8m, 2
1489    jnz .loop_x_hv_overlap
1490
1491    ; horizontal overlap (without vertical overlap)
1492.loop_x_h_overlap:
1493%if ARCH_X86_32
1494    ; r0m=dst, r1m=src_bak, r2m=stride, r3m=see, r4m=w, r5m=picptr,
1495    ; r6m=grain_lut, r7m=h, r8m=overlap_v|h
1496    DEFINE_ARGS dst, src, scaling, offxy, unused1, unused2, unused3
1497
1498    add          offxyd, 16                 ; left_offxyd
1499    mov [rsp+5*mmsize+0*gprsize], offxyd
1500
1501    DEFINE_ARGS dst, src, scaling, see, unused1, unused2, unused3
1502
1503    mov            seed, r3m
1504%else
1505    DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \
1506                offx, offy, see, left_offxy
1507
1508    lea     left_offxyd, [offyd+16]         ; previous column's offy*stride+offx
1509%endif
1510
1511    mov             r6d, seed
1512    or             seed, 0xEFF4
1513    shr             r6d, 1
1514    test           seeb, seeh
1515    lea            seed, [r6+0x8000]
1516    cmovp          seed, r6d                ; updated seed
1517
1518%if ARCH_X86_32
1519    mov             r3m, seed
1520
1521    DEFINE_ARGS dst, src, scaling, offy, unused1, unused2, offx
1522
1523    mov           offxd, offyd
1524%else
1525    mov           offyd, seed
1526    mov           offxd, seed
1527%endif
1528    ror           offyd, 8
1529    shr           offxd, 12
1530    and           offyd, 0xf
1531    imul          offyd, 164
1532    lea           offyq, [offyq+offxq*2+747] ; offy*stride+offx
1533
1534%if ARCH_X86_32
1535    DEFINE_ARGS dst, src, scaling, offxy, h, picptr, grain_lut
1536%else
1537    DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \
1538                h, offxy, see, left_offxy
1539%endif
1540
1541    mov              hd, r7m
1542    mov      grain_lutq, grain_lutmp
1543.loop_y_h_overlap:
1544    ; src
1545    mova             m0, [srcq]
1546    pxor             m2, m2
1547    punpckhbw        m1, m0, m2
1548    punpcklbw        m0, m2                 ; m0-1: src as word
1549
1550    ; scaling[src]
1551%if ARCH_X86_32
1552    vpgatherdw       m4, m0, scalingq-1, r0, r5, m3
1553    vpgatherdw       m5, m1, scalingq-1, r0, r5, m3
1554%else
1555    vpgatherdw       m4, m0, scalingq-1, r12, r13, m3
1556    vpgatherdw       m5, m1, scalingq-1, r12, r13, m3
1557%endif
1558    REPX {psrlw x, 8}, m4, m5
1559
1560    ; grain = grain_lut[offy+y][offx+x]
1561    movu             m3, [grain_lutq+offxyq]
1562%if ARCH_X86_32
1563    mov              r5, [rsp+5*mmsize+0*gprsize]
1564    movd             m7, [grain_lutq+r5]
1565%else
1566    movd             m7, [grain_lutq+left_offxyq]
1567%endif
1568    punpcklbw        m7, m3
1569    pmaddubsw        m6, m15, m7
1570    pmulhrsw         m6, m14
1571    packsswb         m6, m6
1572    shufps           m6, m3, q3210
1573    pcmpgtb          m2, m6
1574    punpcklbw        m7, m6, m2
1575    punpckhbw        m6, m2
1576
1577    ; noise = round2(scaling[src] * grain, scaling_shift)
1578    pmullw           m7, m4
1579    pmullw           m6, m5
1580    pmulhrsw         m7, m11
1581    pmulhrsw         m6, m11
1582
1583    ; dst = clip_pixel(src, noise)
1584    paddw            m0, m7
1585    paddw            m1, m6
1586    pmaxsw           m0, m13
1587    pmaxsw           m1, m13
1588    pminsw           m0, m12
1589    pminsw           m1, m12
1590    packuswb         m0, m1
1591    movifnidn      dstq, dstmp
1592    mova    [dstq+srcq], m0
1593
1594    add            srcq, r2mp
1595    add      grain_lutq, 82
1596    dec              hd
1597    jg .loop_y_h_overlap
1598
1599%if ARCH_X86_32
1600    add            r4mp, 16
1601%else
1602    add              wq, 16
1603%endif
1604    jge .end
1605%if ARCH_X86_32
1606    mov            srcq, r1m
1607    add            srcq, r4m
1608%else
1609    lea            srcq, [src_bakq+wq]
1610%endif
1611    xor       dword r8m, 4
1612    add          offxyd, 16
1613
1614    ; since this half-block had left-overlap, the next does not
1615    test      dword r8m, 2              ; have_top_overlap
1616    jz .loop_x_odd
1617%if ARCH_X86_32
1618    add dword [rsp+5*mmsize+1*gprsize], 16
1619%else
1620    add            r11d, 16             ; top_offxyd
1621%endif
1622    jmp .loop_x_odd_v_overlap
1623
1624.end:
1625    RET
1626
1627.vertical_overlap:
1628%if ARCH_X86_32
1629    DEFINE_ARGS dst, src, scaling, sby, fg_data, picptr, overlap
1630%else
1631    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, tmp, sby, see, overlap
1632%endif
1633
1634    or         overlapd, 2                  ; top_overlap: overlap & 2
1635    mov             r8m, overlapd
1636    movzx          sbyd, sbyb
1637%if ARCH_X86_32
1638    imul             r4, [fg_dataq+FGData.seed], 0x00010001
1639    DEFINE_ARGS tmp, src, scaling, sby, see, picptr, unused
1640%else
1641    imul           seed, [fg_dataq+FGData.seed], 0x00010001
1642%endif
1643    imul           tmpd, sbyd, 173 * 0x00010001
1644    imul           sbyd, 37 * 0x01000100
1645    add            tmpd, (105 << 16) | 188
1646    add            sbyd, (178 << 24) | (141 << 8)
1647    and            tmpd, 0x00ff00ff
1648    and            sbyd, 0xff00ff00
1649    xor            seed, tmpd
1650%if ARCH_X86_32
1651    xor            sbyd, seed               ; (cur_seed << 16) | top_seed
1652
1653    DEFINE_ARGS dst, src, scaling, see, w, picptr, src_bak
1654
1655    mov             r3m, seed
1656    mov              wq, r4m
1657%else
1658    xor            seed, sbyd               ; (cur_seed << 16) | top_seed
1659
1660    DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \
1661                tmp, unused2, see, unused3
1662%endif
1663
1664    lea        src_bakq, [srcq+wq]
1665    neg              wq
1666    sub           dstmp, srcq
1667%if ARCH_X86_32
1668    mov             r1m, src_bakq
1669    mov             r4m, wq
1670    DEFINE_ARGS tmp, src, scaling, see, unused1, picptr, unused2
1671%endif
1672
1673.loop_x_v_overlap:
1674%if ARCH_X86_32
1675    mov            seed, r3m
1676%endif
1677    ; we assume from the block above that bits 8-15 of tmpd are zero'ed,
1678    ; because of the 'and tmpd, 0x00ff00ff' above
1679    mov             r6d, seed
1680    or             seed, 0xeff4eff4
1681    test           seeb, seeh
1682    setp           tmpb                     ; parity of top_seed
1683    shr            seed, 16
1684    shl            tmpd, 16
1685    test           seeb, seeh
1686    setp           tmpb                     ; parity of cur_seed
1687    or              r6d, 0x00010001
1688    xor            tmpd, r6d
1689    mov            seed, tmpd
1690    ror            seed, 1                  ; updated (cur_seed << 16) | top_seed
1691
1692%if ARCH_X86_32
1693    mov             r3m, seed
1694
1695    DEFINE_ARGS dst, src, scaling, offy, unused1, unused2, offx
1696
1697    mov           offxd, offyd
1698%else
1699    DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \
1700                offx, offy, see, unused, top_offxy
1701
1702    mov           offyd, seed
1703    mov           offxd, seed
1704%endif
1705
1706    ror           offyd, 8
1707    ror           offxd, 12
1708    and           offyd, 0xf000f
1709    and           offxd, 0xf000f
1710    imul          offyd, 164
1711    ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy
1712    lea           offyq, [offyq+offxq*2+0x10001*747+32*82]
1713
1714%if ARCH_X86_32
1715    DEFINE_ARGS top_offxy, src, scaling, offxy, h, picptr, grain_lut
1716%else
1717    DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \
1718                h, offxy, see, unused, top_offxy
1719%endif
1720
1721    movzx    top_offxyd, offxyw
1722%if ARCH_X86_32
1723    mov [rsp+5*mmsize+1*gprsize], top_offxyd
1724
1725    DEFINE_ARGS dst, src, scaling, offxy, h, picptr, grain_lut
1726%endif
1727    shr          offxyd, 16
1728
1729.loop_x_odd_v_overlap:
1730%if ARCH_X86_32
1731    mov              r5, r5m
1732    lea              r5, [base+pb_27_17]
1733    mov [rsp+5*mmsize+12], r5
1734%else
1735    mova             m8, [pb_27_17]
1736%endif
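    ; m8 (or, on x86-32, the pointer spilled above) holds the 27/17 weights for
    ; the first overlapped row; pb_17_27 directly follows pb_27_17 in RODATA,
    ; so advancing by mmsize (or reloading below) gives the 17/27 weights for
    ; the second row. The blend is round2(w0*top + w1*cur, 5) via pmulhrsw
    ; with pw_1024.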
1737    mov              hd, r7m
1738    mov      grain_lutq, grain_lutmp
1739.loop_y_v_overlap:
1740    ; src
1741    mova             m0, [srcq]
1742    pxor             m2, m2
1743    punpckhbw        m1, m0, m2
1744    punpcklbw        m0, m2                 ; m0-1: src as word
1745
1746    ; scaling[src]
1747%if ARCH_X86_32
1748    vpgatherdw       m4, m0, scalingq-1, r0, r5, m3
1749    vpgatherdw       m5, m1, scalingq-1, r0, r5, m3
1750%else
1751    vpgatherdw       m4, m0, scalingq-1, r12, r13, m3
1752    vpgatherdw       m5, m1, scalingq-1, r12, r13, m3
1753%endif
1754    REPX {psrlw x, 8}, m4, m5
1755
1756    ; grain = grain_lut[offy+y][offx+x]
1757    movu             m3, [grain_lutq+offxyq]
1758%if ARCH_X86_32
1759    mov              r5, [rsp+5*mmsize+1*gprsize]
1760    movu             m7, [grain_lutq+r5]
1761%else
1762    movu             m7, [grain_lutq+top_offxyq]
1763%endif
1764    punpckhbw        m6, m7, m3
1765    punpcklbw        m7, m3
1766%if ARCH_X86_32
1767    mov              r5, [rsp+5*mmsize+12]
1768    pmaddubsw        m3, [r5], m6
1769    pmaddubsw        m6, [r5], m7
1770%else
1771    pmaddubsw        m3, m8, m6
1772    pmaddubsw        m6, m8, m7
1773%endif
1774    pmulhrsw         m3, m14
1775    pmulhrsw         m6, m14
1776    packsswb         m6, m3
1777    pcmpgtb          m7, m2, m6
1778    punpcklbw        m2, m6, m7
1779    punpckhbw        m6, m7
1780
1781    ; noise = round2(scaling[src] * grain, scaling_shift)
1782    pmullw           m2, m4
1783    pmullw           m6, m5
1784    pmulhrsw         m2, m11
1785    pmulhrsw         m6, m11
1786
1787    ; dst = clip_pixel(src, noise)
1788    paddw            m0, m2
1789    paddw            m1, m6
1790    pmaxsw           m0, m13
1791    pmaxsw           m1, m13
1792    pminsw           m0, m12
1793    pminsw           m1, m12
1794    packuswb         m0, m1
1795    movifnidn      dstq, dstmp
1796    mova    [dstq+srcq], m0
1797
1798%if ARCH_X86_32
1799    add dword [rsp+5*mmsize+12], mmsize
1800%else
1801    mova             m8, [pb_17_27]
1802%endif
1803    add            srcq, r2mp
1804    add      grain_lutq, 82
1805    dec              hw
1806    jz .end_y_v_overlap
1807    ; 2 lines get vertical overlap, then fall back to non-overlap code for
1808    ; remaining (up to) 30 lines
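    ; (bit 16 of hd is used as a one-shot flag: the first btc sets it with carry
    ; clear, so we loop once more; the second btc clears it with carry set and we
    ; drop into the plain .loop_y code)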
1809    btc              hd, 16
1810    jnc .loop_y_v_overlap
1811    jmp .loop_y
1812
1813.end_y_v_overlap:
1814%if ARCH_X86_32
1815    add            r4mp, 16
1816%else
1817    add              wq, 16
1818%endif
1819    jge .end_hv
1820%if ARCH_X86_32
1821    mov            srcq, r1mp
1822    add            srcq, r4mp
1823%else
1824    lea            srcq, [src_bakq+wq]
1825%endif
1826    btc       dword r8m, 2
1827    jc .loop_x_hv_overlap
1828    add          offxyd, 16
1829%if ARCH_X86_32
1830    add dword [rsp+5*mmsize+1*gprsize], 16
1831%else
1832    add      top_offxyd, 16
1833%endif
1834    jmp .loop_x_odd_v_overlap
1835
1836.loop_x_hv_overlap:
1837%if ARCH_X86_32
1838    mov              r5, r5m
1839    lea              r5, [base+pb_27_17]
1840    mov [rsp+5*mmsize+12], r5
1841
1842    DEFINE_ARGS tmp, src, scaling, offxy, w, picptr, src_bak
1843
1844    mov              r5, [rsp+5*mmsize+1*gprsize]
1845    mov              r4, offxyd
1846    add              r5, 16
1847    add              r4, 16
1848    mov [rsp+5*mmsize+2*gprsize], r5        ; topleft_offxy
1849    mov [rsp+5*mmsize+0*gprsize], r4        ; left_offxy
1850
1851    DEFINE_ARGS tmp, src, scaling, see, w, picptr, src_bak
1852
1853    xor            tmpd, tmpd
1854    mov            seed, r3m
1855%else
1856    mova             m8, [pb_27_17]
1857
1858    DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \
1859                tmp, unused2, see, unused3
1860
1861    ; we assume from the block above that bits 8-15 of tmpd are zeroed
1862%endif
1863    mov             r6d, seed
1864    or             seed, 0xeff4eff4
1865    test           seeb, seeh
1866    setp           tmpb                     ; parity of top_seed
1867    shr            seed, 16
1868    shl            tmpd, 16
1869    test           seeb, seeh
1870    setp           tmpb                     ; parity of cur_seed
1871    or              r6d, 0x00010001
1872    xor            tmpd, r6d
1873    mov            seed, tmpd
1874    ror            seed, 1                  ; updated (cur_seed << 16) | top_seed
1875
1876%if ARCH_X86_32
1877    mov             r3m, seed
1878
1879    DEFINE_ARGS dst, src, scaling, offy, unused1, unused2, offx
1880
1881    mov           offxd, offyd
1882%else
1883    DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \
1884                offx, offy, see, left_offxy, top_offxy, topleft_offxy
1885
1886    lea  topleft_offxyq, [top_offxyq+16]
1887    lea     left_offxyq, [offyq+16]
1888    mov           offyd, seed
1889    mov           offxd, seed
1890%endif
1891    ror           offyd, 8
1892    ror           offxd, 12
1893    and           offyd, 0xf000f
1894    and           offxd, 0xf000f
1895    imul          offyd, 164
1896    ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy
1897    lea           offyq, [offyq+offxq*2+0x10001*747+32*82]
1898
1899%if ARCH_X86_32
1900    DEFINE_ARGS dst, src, scaling, offxy, h, picptr, grain_lut
1901
1902    movzx            r5, offxyw             ; top_offxy
1903    mov [rsp+5*mmsize+1*gprsize], r5
1904%else
1905    DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \
1906                h, offxy, see, left_offxy, top_offxy, topleft_offxy
1907
1908    movzx    top_offxyd, offxyw
1909%endif
1910    shr          offxyd, 16
1911
1912    mov              hd, r7m
1913    mov      grain_lutq, grain_lutmp
1914.loop_y_hv_overlap:
1915    ; grain = grain_lut[offy+y][offx+x]
1916    movu             m3, [grain_lutq+offxyq]
1917%if ARCH_X86_32
1918    mov              r5, [rsp+5*mmsize+1*gprsize]   ; top_offxy
1919    mov              r0, [rsp+5*mmsize+0*gprsize]   ; left_offxy
1920    movu             m6, [grain_lutq+r5]
1921    mov              r5, [rsp+5*mmsize+2*gprsize]   ; topleft_offxy
1922    movd             m4, [grain_lutq+r0]
1923    movd             m7, [grain_lutq+r5]
1924%else
1925    movu             m6, [grain_lutq+top_offxyq]
1926    movd             m4, [grain_lutq+left_offxyq]
1927    movd             m7, [grain_lutq+topleft_offxyq]
1928%endif
1929    ; do h interpolation first (so top | top/left -> top, left | cur -> cur)
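    ; (both blends are round2(27*a + 17*b, 5)-style weightings: the horizontal pass
    ; blends the left/topleft column into the current/top grain with the 27,17,17,27
    ; weights, the vertical pass then blends top into cur with the 27,17 / 17,27 row
    ; weights -- an explanatory note, not spec text)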
1930    punpcklbw        m4, m3
1931    punpcklbw        m7, m6
1932    pmaddubsw        m2, m15, m4
1933    pmaddubsw        m4, m15, m7
1934    pmulhrsw         m2, m14
1935    pmulhrsw         m4, m14
1936    packsswb         m2, m2
1937    packsswb         m4, m4
1938    shufps           m2, m3, q3210
1939    shufps           m4, m6, q3210
1940    ; followed by v interpolation (top | cur -> cur)
1941    punpcklbw        m3, m4, m2
1942    punpckhbw        m4, m2
1943%if ARCH_X86_32
1944    mov              r5, [rsp+5*mmsize+12]
1945    pmaddubsw        m7, [r5], m4
1946    pmaddubsw        m4, [r5], m3
1947%else
1948    pmaddubsw        m7, m8, m4
1949    pmaddubsw        m4, m8, m3
1950%endif
1951    pmulhrsw         m7, m14
1952    pmulhrsw         m4, m14
1953    packsswb         m4, m7
1954    pxor             m2, m2
1955    pcmpgtb          m7, m2, m4
1956    punpcklbw        m3, m4, m7
1957    punpckhbw        m4, m7
1958
1959    ; src
1960    mova             m0, [srcq]
1961    punpckhbw        m1, m0, m2
1962    punpcklbw        m0, m2                 ; m0-1: src as word
1963
1964    ; scaling[src]
1965%if ARCH_X86_32
1966    vpgatherdw       m5, m0, scalingq-1, r0, r5, m7
1967    vpgatherdw       m6, m1, scalingq-1, r0, r5, m7
1968%else
1969    vpgatherdw       m5, m0, scalingq-1, r13, r14, m7
1970    vpgatherdw       m6, m1, scalingq-1, r13, r14, m7
1971%endif
1972    REPX {psrlw x, 8}, m5, m6
1973
1974    ; noise = round2(scaling[src] * grain, scaling_shift)
1975    pmullw           m3, m5
1976    pmullw           m4, m6
1977    pmulhrsw         m3, m11
1978    pmulhrsw         m4, m11
1979
1980    ; dst = clip_pixel(src, noise)
1981    paddw            m0, m3
1982    paddw            m1, m4
1983    pmaxsw           m0, m13
1984    pmaxsw           m1, m13
1985    pminsw           m0, m12
1986    pminsw           m1, m12
1987    packuswb         m0, m1
1988    movifnidn      dstq, dstmp
1989    mova    [dstq+srcq], m0
1990
1991%if ARCH_X86_32
1992    add dword [rsp+5*mmsize+12], mmsize
1993%else
1994    mova             m8, [pb_17_27]
1995%endif
1996    add            srcq, r2mp
1997    add      grain_lutq, 82
1998    dec              hw
1999    jz .end_y_hv_overlap
2000    ; 2 lines get vertical overlap, then fall back to non-overlap code for
2001    ; remaining (up to) 30 lines
2002    btc              hd, 16
2003    jnc .loop_y_hv_overlap
2004    jmp .loop_y_h_overlap
2005
2006.end_y_hv_overlap:
2007%if ARCH_X86_32
2008    add            r4mp, 16
2009%else
2010    add              wq, 16
2011%endif
2012    jge .end_hv
2013%if ARCH_X86_32
2014    mov            srcq, r1m
2015    add            srcq, r4m
2016%else
2017    lea            srcq, [src_bakq+wq]
2018%endif
2019    xor       dword r8m, 4
2020    add          offxyd, 16
2021%if ARCH_X86_32
2022    add dword [rsp+5*mmsize+1*gprsize], 16
2023%else
2024    add      top_offxyd, 16
2025%endif
2026    jmp .loop_x_odd_v_overlap
2027
2028.end_hv:
2029    RET
2030
2031%macro FGUV_FN 3 ; name, ss_hor, ss_ver
2032INIT_XMM ssse3
2033%if ARCH_X86_32
2034; fguv_32x32xn_i%1_8bpc_ssse3(dst, src, stride, fg_data, w, scaling, grain_lut, h,
2035;                             sby, luma, lstride, uv_pl, is_id)
2036%if STACK_ALIGNMENT < mmsize
2037DECLARE_ARG 0, 1, 2, 3, 4, 5, 6, 7, 8
2038cglobal fguv_32x32xn_i%1_8bpc, 0, 7, 8, 0 - (7 * mmsize + (13 + 3) * gprsize), \
2039        tmp, src, scaling, h, fg_data, picptr, unused
2040    mov              r0, r0m
2041    mov              r1, r2m
2042    mov              r2, r4m
2043    mov              r3, r6m
2044    mov              r4, r7m
2045    mov [rsp+7*mmsize+3*gprsize], r0
2046    mov [rsp+7*mmsize+5*gprsize], r1
2047    mov [rsp+7*mmsize+7*gprsize], r2
2048    mov [rsp+7*mmsize+9*gprsize], r3
2049    mov [rsp+7*mmsize+10*gprsize], r4
2050
2051    mov              r0, r8m
2052    mov              r1, r9m
2053    mov              r2, r10m
2054    mov              r4, r11m
2055    mov              r3, r12m
2056    mov [rsp+7*mmsize+11*gprsize], r0
2057    mov [rsp+7*mmsize+12*gprsize], r1
2058    mov [rsp+7*mmsize+13*gprsize], r2
2059    mov [rsp+7*mmsize+14*gprsize], r4
2060%else
2061cglobal fguv_32x32xn_i%1_8bpc, 0, 7, 8, 7 * mmsize + (4) * gprsize, \
2062        tmp, src, scaling, h, fg_data, picptr, unused
2063%endif
2064    mov            srcq, srcm
2065    mov        fg_dataq, r3m
2066    mov        scalingq, r5m
2067%if STACK_ALIGNMENT < mmsize
2068%define r0m [rsp+7*mmsize+ 3*gprsize]
2069%define r1m [rsp+7*mmsize+ 4*gprsize]
2070%define r2m [rsp+7*mmsize+ 5*gprsize]
2071%define r3m [rsp+7*mmsize+ 6*gprsize]
2072%define r4m [rsp+7*mmsize+ 7*gprsize]
2073%define r5m [rsp+7*mmsize+ 8*gprsize]
2074%define r6m [rsp+7*mmsize+ 9*gprsize]
2075%define r7m [rsp+7*mmsize+10*gprsize]
2076%define r8m [rsp+7*mmsize+11*gprsize]
2077%define r9m [rsp+7*mmsize+12*gprsize]
2078%define r10m [rsp+7*mmsize+13*gprsize]
2079%define r11m [rsp+7*mmsize+14*gprsize]
2080%define r12m [rsp+7*mmsize+15*gprsize]
2081%endif
2082    LEA              r5, pb_mask
2083%define base r5-pb_mask
2084    mov             r5m, r5
2085%else
2086cglobal fguv_32x32xn_i%1_8bpc, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \
2087                                     grain_lut, tmp, sby, luma, lstride, uv_pl, is_id
2088    lea              r8, [pb_mask]
2089%define base r8-pb_mask
2090%endif
2091    mov             r6d, [fg_dataq+FGData.scaling_shift]
2092    movd             m3, [base+mul_bits+r6*2-14]
2093    mov             r6d, [fg_dataq+FGData.clip_to_restricted_range]
2094    lea            tmpd, [r6d*2]
2095%if ARCH_X86_32 && STACK_ALIGNMENT < mmsize
2096    test             r3, r3
2097%else
2098    cmp      dword r12m, 0                      ; is_idm
2099%endif
2100    movd             m5, [base+min+r6*2]
2101    cmovne          r6d, tmpd
2102    movd             m4, [base+max+r6*2]
2103    punpcklwd        m3, m3
2104    punpcklwd        m5, m5
2105    punpcklwd        m4, m4
2106    pshufd           m3, m3, q0000
2107    pshufd           m5, m5, q0000
2108    pshufd           m4, m4, q0000
2109    SCRATCH           3, 11, 0
2110    SCRATCH           4, 12, 1
2111    SCRATCH           5, 13, 2
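    ; summary of the setup above (explanatory note): m11 holds 2^(15-scaling_shift),
    ; so the later pmulhrsw by it performs round2(x, scaling_shift); m13/m12 hold the
    ; min/max clamp bounds: [0, 255] for full range, [16, 240] for restricted-range
    ; chroma, and [16, 235] when is_id is set (identity matrix, so Y-range limits
    ; apply to these planes)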
2112
2113    cmp byte [fg_dataq+FGData.chroma_scaling_from_luma], 0
2114    jne .csfl
2115
2116%macro %%FGUV_32x32xN_LOOP 3 ; not-csfl, ss_hor, ss_ver
2117%if ARCH_X86_32
2118    DEFINE_ARGS dst, src, scaling, sby, fg_data, picptr, overlap
2119%else
2120    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, unused, sby, see, overlap
2121%endif
2122
2123%if %1
2124    mov             r6d, dword r11m
2125    movd             m0, [fg_dataq+FGData.uv_mult+r6*4]
2126    movd             m1, [fg_dataq+FGData.uv_luma_mult+r6*4]
2127    punpcklbw        m6, m1, m0
2128    movd             m7, [fg_dataq+FGData.uv_offset+r6*4]
2129    punpcklwd        m6, m6
2130    punpcklwd        m7, m7
2131    pshufd           m6, m6, q0000
2132    pshufd           m7, m7, q0000
2133    SCRATCH           6, 14, 3
2134    SCRATCH           7, 15, 4
2135%endif
2136
2137    mov            sbyd, r8m
2138    mov        overlapd, [fg_dataq+FGData.overlap_flag] ; left_overlap: overlap & 1
2139    test       overlapd, overlapd
2140    jz %%no_vertical_overlap
2141%if ARCH_X86_32
2142%if %2
2143    mova             m1, [base+pb_23_22_h]
2144%else
2145    mova             m1, [base+pb_27_17_17_27]
2146%endif
2147    mova             m0, [base+pw_1024]
2148%else
2149%if %2
2150    mova             m1, [pb_23_22_h]
2151%else
2152    mova             m1, [pb_27_17_17_27]
2153%endif
2154    mova             m0, [pw_1024]
2155%endif
2156    SCRATCH           0, 8, 5
2157    SCRATCH           1, 9, 6
2158    test           sbyd, sbyd
2159    jnz %%vertical_overlap
2160    ; fall-through
2161
2162%%no_vertical_overlap:
2163    mov             r8m, overlapd
2164%if ARCH_X86_32
2165    DEFINE_ARGS dst, src, scaling, see, fg_data, picptr, overlap
2166    imul           seed, (173 << 24) | 37
2167%else
2168    imul           seed, sbyd, (173 << 24) | 37
2169%endif
2170    add            seed, (105 << 24) | 178
2171    rol            seed, 8
2172    movzx          seed, seew
2173    xor            seed, [fg_dataq+FGData.seed]
2174
2175%if ARCH_X86_32
2176    mov             r3m, seed
2177
2178    DEFINE_ARGS luma, src, scaling, see, w, picptr, src_bak
2179%define luma_bakq lumaq
2180
2181    mov              wq, r4m
2182%if %3
2183    shl           r10mp, 1
2184%endif
2185%else
2186    DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \
2187                unused2, unused3, see, overlap, unused4, src_bak, lstride, luma_bak
2188
2189    mov        lstrideq, r10mp
2190%endif
2191
2192    mov           lumaq, r9mp
2193    lea        src_bakq, [srcq+wq]
2194    lea       luma_bakq, [lumaq+wq*(1+%2)]
2195    neg              wq
2196    sub            r0mp, srcq
2197%if ARCH_X86_32
2198    mov             r1m, src_bakq
2199    mov            r11m, luma_bakq
2200    mov             r4m, wq
2201
2202    DEFINE_ARGS tmp, src, scaling, see, unused1, picptr, unused2
2203%else
2204    mov           r11mp, src_bakq
2205    mov           r12mp, strideq
2206%endif
2207
2208%%loop_x:
2209%if ARCH_X86_32
2210    mov            seed, r3m
2211%endif
2212    mov             r6d, seed
2213    or             seed, 0xEFF4
2214    shr             r6d, 1
2215    test           seeb, seeh
2216    lea            seed, [r6+0x8000]
2217    cmovp          seed, r6d               ; updated seed
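    ; (single-seed form of the LFSR update noted in the luma function above: the
    ; parity of tap bits 0, 1, 3, 12 -- exposed by the or with 0xEFF4 -- becomes the
    ; new top bit of seed >> 1)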
2218%if ARCH_X86_32
2219    mov             r3m, seed
2220
2221    DEFINE_ARGS dst, src, scaling, offy, w, picptr, offx
2222
2223    mov           offxd, offyd
2224%else
2225    DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \
2226                offx, offy, see, overlap, unused1, unused2, lstride
2227
2228    mov           offyd, seed
2229    mov           offxd, seed
2230%endif
2231    ror           offyd, 8
2232    shr           offxd, 12
2233    and           offyd, 0xf
2234    imul          offyd, 164>>%3
2235    lea           offyq, [offyq+offxq*(2-%2)+(3+(6>>%3))*82+(3+(6>>%2))]  ; offy*stride+offx
2236
2237%if ARCH_X86_32
2238    DEFINE_ARGS luma, src, scaling, offxy, h, picptr, grain_lut
2239%else
2240    DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \
2241                h, offxy, see, overlap, unused1, unused2, lstride, luma_bak
2242%endif
2243
2244%%loop_x_odd:
2245    mov              hd, r7m
2246    mov      grain_lutq, grain_lutmp
2247%%loop_y:
2248    ; src
2249%if ARCH_X86_32
2250    mov           lumaq, r9mp
2251%endif
2252%if %2
2253    mova             m4, [lumaq+ 0]
2254    mova             m6, [lumaq+16]
2255    mova             m0, [srcq]
2256%if ARCH_X86_32
2257    add           lumaq, r10mp
2258    mov            r9mp, lumaq
2259    mov              r5, r5m
2260    movd             m7, [base+pb_1]
2261%else
2262    movd             m7, [pb_1]
2263%endif
2264    pshufd           m7, m7, q0000
2265    pxor             m2, m2
2266    pmaddubsw        m4, m7
2267    pmaddubsw        m6, m7
2268    pavgw            m4, m2
2269    pavgw            m6, m2
2270%else
2271    mova             m4, [lumaq]
2272    mova             m0, [srcq]
2273%if ARCH_X86_32
2274    add           lumaq, r10mp
2275    mov            r9mp, lumaq
2276%endif
2277    pxor             m2, m2
2278%endif
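    ; scalar sketch of the block above (illustrative, not the C code): with
    ; horizontal subsampling (%2), luma_src[x] = (luma[2*x] + luma[2*x+1] + 1) >> 1
    ; (pmaddubsw by ones, then pavgw against zero); otherwise luma_src[x] = luma[x].
    ; vertical subsampling (%3) is handled by stepping lumaq two source rows at a
    ; time rather than by averaging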
2279
2280%if %1
2281%if %2
2282    packuswb         m4, m6                 ; luma
2283%endif
2284    punpckhbw        m6, m4, m0
2285    punpcklbw        m4, m0                 ; { luma, chroma }
2286    pmaddubsw        m6, m14
2287    pmaddubsw        m4, m14
2288    psraw            m6, 6
2289    psraw            m4, 6
2290    paddw            m6, m15
2291    paddw            m4, m15
2292    packuswb         m4, m6                 ; pack+unpack = clip
2293    punpckhbw        m6, m4, m2
2294    punpcklbw        m4, m2
2295%elif %2 == 0
2296    punpckhbw        m6, m4, m2
2297    punpcklbw        m4, m2
2298%endif
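    ; in the not-csfl variant (%1), the block above maps the value used for the
    ; scaling lookup; roughly (a sketch using the FGData field names):
    ;   val = clip_u8(((luma * uv_luma_mult + chroma * uv_mult) >> 6) + uv_offset);
    ; m14 holds the interleaved multipliers, m15 the broadcast offset, and the
    ; packuswb/punpck pair at the end is what performs the clip to [0, 255]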
2299
2300    ; scaling[luma_src]
2301%if ARCH_X86_32
2302    vpgatherdw       m7, m4, scalingq-1, r0, r5
2303    vpgatherdw       m5, m6, scalingq-1, r0, r5
2304%else
2305    vpgatherdw       m7, m4, scalingq-1, r12, r2
2306    vpgatherdw       m5, m6, scalingq-1, r12, r2
2307%endif
2308    REPX {psrlw x, 8}, m7, m5
2309
2310    ; unpack chroma_source
2311    punpckhbw        m1, m0, m2
2312    punpcklbw        m0, m2                 ; m0-1: src as word
2313
2314    ; grain = grain_lut[offy+y][offx+x]
2315    movu             m3, [grain_lutq+offxyq+ 0]
2316    pcmpgtb          m6, m2, m3
2317    punpcklbw        m2, m3, m6
2318    punpckhbw        m3, m6
2319
2320    ; noise = round2(scaling[luma_src] * grain, scaling_shift)
2321    pmullw           m2, m7
2322    pmullw           m3, m5
2323    pmulhrsw         m2, m11
2324    pmulhrsw         m3, m11
2325
2326%if ARCH_X86_32
2327    DEFINE_ARGS dst, src, scaling, offxy, h, picptr, grain_lut
2328%endif
2329
2330    ; dst = clip_pixel(src, noise)
2331    paddw            m0, m2
2332    paddw            m1, m3
2333    pmaxsw           m0, m13
2334    pmaxsw           m1, m13
2335    pminsw           m0, m12
2336    pminsw           m1, m12
2337    packuswb         m0, m1
2338    movifnidn      dstq, dstmp
2339    mova    [dstq+srcq], m0
2340
2341%if ARCH_X86_32
2342    add            srcq, r2mp
2343    ; we already incremented lumaq above
2344%else
2345    add            srcq, r12mp
2346%if %3
2347    lea           lumaq, [lumaq+lstrideq*2]
2348%else
2349    add           lumaq, lstrideq
2350%endif
2351%endif
2352    add      grain_lutq, 82
2353    dec              hw
2354    jg %%loop_y
2355
2356%if ARCH_X86_32
2357    DEFINE_ARGS luma, src, scaling, offxy, w, picptr, grain_lut
2358
2359    mov              wq, r4m
2360%endif
2361    add              wq, 16
2362    jge %%end
2363%if ARCH_X86_32
2364    mov            srcq, r1mp
2365    mov           lumaq, r11mp
2366%else
2367    mov            srcq, r11mp
2368%endif
2369    lea           lumaq, [luma_bakq+wq*(1+%2)]
2370    add            srcq, wq
2371%if ARCH_X86_32
2372    mov             r4m, wq
2373    mov             r9m, lumaq
2374%endif
2375%if %2 == 0
2376    ; adjust top_offxy
2377%if ARCH_X86_32
2378    add dword [rsp+7*mmsize+1*gprsize], 16
2379%else
2380    add            r11d, 16
2381%endif
2382    add          offxyd, 16
2383    btc       dword r8m, 2
2384    jc %%loop_x_even
2385    test      dword r8m, 2
2386    jz %%loop_x_odd
2387    jmp %%loop_x_odd_v_overlap
2388%%loop_x_even:
2389%endif
2390    test      dword r8m, 1
2391    jz %%loop_x
2392
2393    ; the 'top overlap' bit (2) of the overlap flags in r8m is only set when sby != 0
2394    test      dword r8m, 2
2395    jne %%loop_x_hv_overlap
2396
2397    ; horizontal overlap (without vertical overlap)
2398%%loop_x_h_overlap:
2399%if ARCH_X86_32
2400%if %2
2401    lea              r6, [offxyd+16]
2402    mov [rsp+7*mmsize+0*gprsize], r6
2403%else
2404    mov [rsp+7*mmsize+0*gprsize], offxyd
2405%endif
2406
2407    DEFINE_ARGS luma, src, scaling, see, w, picptr, grain_lut
2408
2409    mov            seed, r3m
2410%else
2411    DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \
2412                offx, offy, see, left_offxy, unused1, unused2, lstride
2413
2414%if %2
2415    lea     left_offxyd, [offyd+16]         ; previous column's offy*stride+offx
2416%else
2417    mov     left_offxyd, offyd
2418%endif
2419%endif
2420    mov             r6d, seed
2421    or             seed, 0xEFF4
2422    shr             r6d, 1
2423    test           seeb, seeh
2424    lea            seed, [r6+0x8000]
2425    cmovp          seed, r6d                ; updated seed
2426
2427%if ARCH_X86_32
2428    mov             r3m, seed
2429
2430    DEFINE_ARGS luma, src, scaling, offy, w, picptr, offx
2431
2432    mov          offxd, offyd
2433%else
2434    DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \
2435                offx, offy, see, left_offxy, unused1, unused2, lstride
2436
2437    mov           offyd, seed
2438    mov           offxd, seed
2439%endif
2440    ror           offyd, 8
2441    shr           offxd, 12
2442    and           offyd, 0xf
2443    imul          offyd, 164>>%3
2444    lea           offyq, [offyq+offxq*(2-%2)+(3+(6>>%3))*82+3+(6>>%2)]  ; offy*stride+offx
2445
2446%if ARCH_X86_32
2447    DEFINE_ARGS luma, src, scaling, offxy, h, picptr, grain_lut
2448%else
2449    DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \
2450                h, offxy, see, left_offxy, unused1, unused2, lstride, luma_bak
2451%endif
2452
2453    mov              hd, r7m
2454    mov      grain_lutq, grain_lutmp
2455%%loop_y_h_overlap:
2456    ; src
2457%if ARCH_X86_32
2458    mov           lumaq, r9mp
2459%endif
2460%if %2
2461    mova             m4, [lumaq+ 0]
2462    mova             m6, [lumaq+16]
2463    mova             m0, [srcq]
2464%if ARCH_X86_32
2465    add           lumaq, r10mp
2466    mov            r9mp, lumaq
2467    mov              r5, r5m
2468    movd             m7, [base+pb_1]
2469%else
2470    movd             m7, [pb_1]
2471%endif
2472    pshufd           m7, m7, q0000
2473    pxor             m2, m2
2474    pmaddubsw        m4, m7
2475    pmaddubsw        m6, m7
2476    pavgw            m4, m2
2477    pavgw            m6, m2
2478%else
2479    mova             m4, [lumaq]
2480    mova             m0, [srcq]
2481%if ARCH_X86_32
2482    add           lumaq, r10mp
2483    mov            r9mp, lumaq
2484%endif
2485    pxor             m2, m2
2486%endif
2487
2488%if %1
2489%if %2
2490    packuswb         m4, m6                 ; luma
2491%endif
2492    punpckhbw        m6, m4, m0
2493    punpcklbw        m4, m0                 ; { luma, chroma }
2494    pmaddubsw        m6, m14
2495    pmaddubsw        m4, m14
2496    psraw            m6, 6
2497    psraw            m4, 6
2498    paddw            m6, m15
2499    paddw            m4, m15
2500    packuswb         m4, m6                 ; pack+unpack = clip
2501    punpckhbw        m6, m4, m2
2502    punpcklbw        m4, m2
2503%elif %2 == 0
2504    punpckhbw        m6, m4, m2
2505    punpcklbw        m4, m2
2506%endif
2507
2508    ; scaling[luma_src]
2509%if ARCH_X86_32
2510    vpgatherdw       m7, m4, scalingq-1, r0, r5
2511    vpgatherdw       m5, m6, scalingq-1, r0, r5
2512%else
2513    vpgatherdw       m7, m4, scalingq-1, r12, r2
2514    vpgatherdw       m5, m6, scalingq-1, r12, r2
2515%endif
2516    REPX {psrlw x, 8}, m7, m5
2517
2518    ; unpack chroma_source
2519    punpckhbw        m1, m0, m2
2520    punpcklbw        m0, m2                 ; m0-1: src as word
2521
2522    ; grain = grain_lut[offy+y][offx+x]
2523    movu             m4, [grain_lutq+offxyq+ 0]
2524%if ARCH_X86_32
2525    mov              r0, [rsp+7*mmsize+0*gprsize]
2526    movd             m2, [grain_lutq+r0+ 0]
2527%else
2528    movd             m2, [grain_lutq+left_offxyq+ 0]
2529%endif
2530    punpcklbw        m2, m4
2531    pmaddubsw        m3, m9, m2
2532    pmulhrsw         m3, m8
2533    packsswb         m3, m3
2534    shufps           m3, m4, q3210
2535    pxor             m4, m4
2536    pcmpgtb          m4, m3
2537    punpcklbw        m2, m3, m4
2538    punpckhbw        m3, m4
2539
2540    ; noise = round2(scaling[luma_src] * grain, scaling_shift)
2541    pmullw           m2, m7
2542    pmullw           m3, m5
2543    pmulhrsw         m2, m11
2544    pmulhrsw         m3, m11
2545
2546%if ARCH_X86_32
2547    DEFINE_ARGS dst, src, scaling, offxy, h, picptr, grain_lut
2548%endif
2549
2550    ; dst = clip_pixel(src, noise)
2551    paddw            m0, m2
2552    paddw            m1, m3
2553    pmaxsw           m0, m13
2554    pmaxsw           m1, m13
2555    pminsw           m0, m12
2556    pminsw           m1, m12
2557    packuswb         m0, m1
2558    movifnidn      dstq, dstmp
2559    mova    [dstq+srcq], m0
2560
2561%if ARCH_X86_32
2562    add            srcq, r2mp
2563    ; lumaq has already been incremented above
2564%else
2565    add            srcq, r12mp
2566%if %3
2567    lea           lumaq, [lumaq+lstrideq*2]
2568%else
2569    add           lumaq, lstrideq
2570%endif
2571%endif
2572    add      grain_lutq, 82
2573    dec              hw
2574    jg %%loop_y_h_overlap
2575
2576%if ARCH_X86_32
2577    DEFINE_ARGS luma, src, scaling, offxy, w, picptr, grain_lut
2578
2579    mov              wq, r4m
2580%endif
2581    add              wq, 16
2582    jge %%end
2583%if ARCH_X86_32
2584    mov            srcq, r1mp
2585    mov           lumaq, r11mp
2586%else
2587    mov            srcq, r11mp
2588%endif
2589    lea           lumaq, [luma_bakq+wq*(1+%2)]
2590    add            srcq, wq
2591%if ARCH_X86_32
2592    mov             r4m, wq
2593    mov             r9m, lumaq
2594%endif
2595%if %2 == 0
2596    xor       dword r8m, 4
2597    ; adjust top_offxyd
2598%if ARCH_X86_32
2599    add dword [rsp+7*mmsize+1*gprsize], 16
2600%else
2601    add            r11d, 16
2602%endif
2603    add          offxyd, 16
2604%endif
2605
2606    ; the 'top overlap' bit (2) of the overlap flags in r8m is only set when sby != 0
2607    test      dword r8m, 2
2608%if %2
2609    jne %%loop_x_hv_overlap
2610    jmp %%loop_x_h_overlap
2611%else
2612    jne %%loop_x_odd_v_overlap
2613    jmp %%loop_x_odd
2614%endif
2615
2616%%end:
2617    RET
2618
2619%%vertical_overlap:
2620%if ARCH_X86_32
2621    DEFINE_ARGS dst, src, scaling, sby, fg_data, picptr, overlap
2622%else
2623    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, tmp, sby, see, overlap
2624%endif
2625
2626    or         overlapd, 2                  ; top_overlap: overlap & 2
2627    mov             r8m, overlapd
2628    movzx          sbyd, sbyb
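    ; the next few instructions derive both per-row seeds at once, following the
    ; spec's per-row reseeding (a summary, not quoted text): for a row r,
    ;   seed(r) = data.seed ^ (((r*37 + 178) & 0xff) << 8) ^ ((r*173 + 105) & 0xff)
    ; the constants 188 and 141 are 105-173 and 178-37 modulo 256, so the low half
    ; ends up as seed(sby-1) (top) and the high half as seed(sby) (cur)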
2629%if ARCH_X86_32
2630    imul             r4, [fg_dataq+FGData.seed], 0x00010001
2631    DEFINE_ARGS tmp, src, scaling, sby, see, picptr, unused
2632%else
2633    imul           seed, [fg_dataq+FGData.seed], 0x00010001
2634%endif
2635    imul           tmpd, sbyd, 173 * 0x00010001
2636    imul           sbyd, 37 * 0x01000100
2637    add            tmpd, (105 << 16) | 188
2638    add            sbyd, (178 << 24) | (141 << 8)
2639    and            tmpd, 0x00ff00ff
2640    and            sbyd, 0xff00ff00
2641    xor            seed, tmpd
2642%if ARCH_X86_32
2643    xor            sbyd, seed               ; (cur_seed << 16) | top_seed
2644
2645    DEFINE_ARGS luma, src, scaling, see, w, picptr, src_bak
2646
2647    mov             r3m, seed
2648    mov              wq, r4m
2649%if %3
2650    shl           r10mp, 1
2651%endif
2652%else
2653    xor            seed, sbyd               ; (cur_seed << 16) | top_seed
2654
2655    DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \
2656                tmp, unused2, see, overlap, unused3, src_bak, lstride, luma_bak
2657
2658    mov        lstrideq, r10mp
2659%endif
2660
2661    mov           lumaq, r9mp
2662    lea        src_bakq, [srcq+wq]
2663    lea       luma_bakq, [lumaq+wq*(1+%2)]
2664    neg              wq
2665    sub            r0mp, srcq
2666%if ARCH_X86_32
2667    mov             r1m, src_bakq
2668    mov            r11m, luma_bakq
2669    mov             r4m, wq
2670
2671    DEFINE_ARGS tmp, src, scaling, see, unused1, picptr, unused2
2672%else
2673    mov           r11mp, src_bakq
2674    mov           r12mp, strideq
2675%endif
2676
2677%%loop_x_v_overlap:
2678%if ARCH_X86_32
2679    mov            seed, r3m
2680    xor            tmpd, tmpd
2681%endif
2682    ; we assume from the block above that bits 8-15 of tmpd are zeroed
2683    mov             r6d, seed
2684    or             seed, 0xeff4eff4
2685    test           seeb, seeh
2686    setp           tmpb                     ; parity of top_seed
2687    shr            seed, 16
2688    shl            tmpd, 16
2689    test           seeb, seeh
2690    setp           tmpb                     ; parity of cur_seed
2691    or              r6d, 0x00010001
2692    xor            tmpd, r6d
2693    mov            seed, tmpd
2694    ror            seed, 1                  ; updated (cur_seed << 16) | top_seed
2695
2696%if ARCH_X86_32
2697    mov             r3m, seed
2698
2699    DEFINE_ARGS dst, src, scaling, offy, h, picptr, offx
2700
2701    mov           offxd, offyd
2702%else
2703    DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \
2704                offx, offy, see, overlap, top_offxy, unused, lstride
2705
2706    mov           offxd, seed
2707    mov           offyd, seed
2708%endif
2709    ror           offyd, 8
2710    ror           offxd, 12
2711    and           offyd, 0xf000f
2712    and           offxd, 0xf000f
2713    imul          offyd, 164>>%3
2714    ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy
2715    lea           offyq, [offyq+offxq*(2-%2)+0x10001*((3+(6>>%3))*82+3+(6>>%2))+(32>>%3)*82]
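    ; (chroma analogue of the luma offset computation: the grain border term
    ; (3+(6>>%3))*82 + 3+(6>>%2) shrinks with the subsampling, offx steps by 1
    ; instead of 2 when horizontally subsampled, and (32>>%3)*82 again points the
    ; top half at the rows of the block above that overlap this one)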
2716
2717%if ARCH_X86_32
2718    DEFINE_ARGS tmp, src, scaling, offxy, h, picptr, top_offxy
2719%else
2720    DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \
2721                h, offxy, see, overlap, top_offxy, unused, lstride, luma_bak
2722%endif
2723
2724    movzx    top_offxyd, offxyw
2725    shr          offxyd, 16
2726%if ARCH_X86_32
2727    mov [rsp+7*mmsize+1*gprsize], top_offxyd
2728
2729    DEFINE_ARGS luma, src, scaling, offxy, h, picptr, grain_lut
2730%endif
2731
2732%%loop_x_odd_v_overlap:
2733    mov              hd, r7m
2734    mov      grain_lutq, grain_lutmp
2735%if ARCH_X86_32
2736    mov              r5, r5m
2737%endif
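    ; weight selection below, summarizing the spec's chroma overlap (not quoted
    ; text): with vertical subsampling (%3) a single chroma row overlaps and is
    ; blended with weights 23/22; otherwise two rows are blended, the first with
    ; 27/17 and the second with 17/27 (pb_17_27 is loaded further down)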
2738%if %3
2739    mova             m1, [PIC_ptr(pb_23_22)]
2740%else
2741    mova             m1, [PIC_ptr(pb_27_17)]
2742%endif
2743%%loop_y_v_overlap:
2744%if ARCH_X86_32
2745    mov           lumaq, r9mp
2746%endif
2747%if %2
2748    mova             m4, [lumaq+ 0]
2749    mova             m6, [lumaq+16]
2750    mova             m0, [srcq]
2751%if ARCH_X86_32
2752    add           lumaq, r10mp
2753    mov            r9mp, lumaq
2754    mov              r5, r5m
2755    movd             m7, [base+pb_1]
2756%else
2757    movd             m7, [pb_1]
2758%endif
2759    pshufd           m7, m7, q0000
2760    pxor             m2, m2
2761    pmaddubsw        m4, m7
2762    pmaddubsw        m6, m7
2763    pavgw            m4, m2
2764    pavgw            m6, m2
2765%else
2766    mova             m4, [lumaq]
2767    mova             m0, [srcq]
2768%if ARCH_X86_32
2769    add           lumaq, r10mp
2770    mov            r9mp, lumaq
2771%endif
2772    pxor             m2, m2
2773%endif
2774
2775%if %1
2776%if %2
2777    packuswb         m4, m6                 ; luma
2778%endif
2779    punpckhbw        m6, m4, m0
2780    punpcklbw        m4, m0                 ; { luma, chroma }
2781    pmaddubsw        m6, m14
2782    pmaddubsw        m4, m14
2783    psraw            m6, 6
2784    psraw            m4, 6
2785    paddw            m6, m15
2786    paddw            m4, m15
2787    packuswb         m4, m6                 ; pack+unpack = clip
2788    punpckhbw        m6, m4, m2
2789    punpcklbw        m4, m2
2790%elif %2 == 0
2791    punpckhbw        m6, m4, m2
2792    punpcklbw        m4, m2
2793%endif
2794
2795    ; scaling[luma_src]
2796%if ARCH_X86_32
2797    vpgatherdw       m7, m4, scalingq-1, r0, r5
2798    vpgatherdw       m5, m6, scalingq-1, r0, r5
2799%else
2800    vpgatherdw       m7, m4, scalingq-1, r12, r2
2801    vpgatherdw       m5, m6, scalingq-1, r12, r2
2802%endif
2803    REPX {psrlw x, 8}, m7, m5
2804
2805    ; grain = grain_lut[offy+y][offx+x]
2806    movu             m3, [grain_lutq+offxyq]
2807%if ARCH_X86_32
2808    mov              r0, [rsp+7*mmsize+1*gprsize]
2809    movu             m4, [grain_lutq+r0]
2810%else
2811    movu             m4, [grain_lutq+top_offxyq]
2812%endif
2813    punpckhbw        m6, m4, m3
2814    punpcklbw        m4, m3
2815    pmaddubsw        m2, m1, m6
2816    pmaddubsw        m3, m1, m4
2817    pmulhrsw         m2, m8
2818    pmulhrsw         m3, m8
2819    packsswb         m3, m2
2820    pxor             m6, m6
2821    pcmpgtb          m6, m3
2822    punpcklbw        m2, m3, m6
2823    punpckhbw        m3, m6
2824
2825    ; noise = round2(scaling[luma_src] * grain, scaling_shift)
2826    pmullw           m2, m7
2827    pmullw           m3, m5
2828    pmulhrsw         m2, m11
2829    pmulhrsw         m3, m11
2830
2831    ; unpack chroma_source
2832    pxor             m4, m4
2833    punpckhbw        m6, m0, m4
2834    punpcklbw        m0, m4                 ; m0-1: src as word
2835
2836%if ARCH_X86_32
2837    DEFINE_ARGS dst, src, scaling, offxy, h, picptr, grain_lut
2838%endif
2839
2840    ; dst = clip_pixel(src, noise)
2841    paddw            m0, m2
2842    paddw            m6, m3
2843    pmaxsw           m0, m13
2844    pmaxsw           m6, m13
2845    pminsw           m0, m12
2846    pminsw           m6, m12
2847    packuswb         m0, m6
2848    movifnidn      dstq, dstmp
2849    mova    [dstq+srcq], m0
2850
2851    dec              hw
2852    je %%end_y_v_overlap
2853%if ARCH_X86_32
2854    add            srcq, r2mp
2855    ; lumaq has already been incremented above
2856%else
2857    add            srcq, r12mp
2858%if %3
2859    lea           lumaq, [lumaq+lstrideq*2]
2860%else
2861    add           lumaq, lstrideq
2862%endif
2863%endif
2864    add      grain_lutq, 82
2865%if %3 == 0
2866    btc              hd, 16
2867%if ARCH_X86_32
2868    mov              r5, r5m
2869%endif
2870    mova             m1, [PIC_ptr(pb_17_27)]
2871    jnc %%loop_y_v_overlap
2872%endif
2873    jmp %%loop_y
2874
2875%%end_y_v_overlap:
2876%if ARCH_X86_32
2877    DEFINE_ARGS luma, src, scaling, offxy, w, picptr, grain_lut
2878
2879    mov              wq, r4m
2880%endif
2881    add              wq, 16
2882    jge %%end_hv
2883%if ARCH_X86_32
2884    mov            srcq, r1mp
2885    mov           lumaq, r11mp
2886%else
2887    mov            srcq, r11mp
2888%endif
2889    lea           lumaq, [luma_bakq+wq*(1+%2)]
2890    add            srcq, wq
2891%if ARCH_X86_32
2892    mov             r4m, wq
2893    mov             r9m, lumaq
2894%endif
2895
2896%if %2
2897    ; since fg_dataq.overlap is guaranteed to be set, we never jump
2898    ; back to %%loop_x_v_overlap, and instead always fall through to
2899    ; h+v overlap
2900%else
2901%if ARCH_X86_32
2902    add dword [rsp+7*mmsize+1*gprsize], 16
2903%else
2904    add      top_offxyd, 16
2905%endif
2906    add          offxyd, 16
2907    btc       dword r8m, 2
2908    jnc %%loop_x_odd_v_overlap
2909%endif
2910
2911%%loop_x_hv_overlap:
2912%if ARCH_X86_32
2913    DEFINE_ARGS tmp, src, scaling, offxy, w, picptr, unused
2914
2915    mov              r6, [rsp+7*mmsize+1*gprsize]
2916%if %2
2917    lea              r0, [r3d+16]
2918    add              r6, 16
2919    mov [rsp+7*mmsize+0*gprsize], r0        ; left_offxy
2920%else
2921    mov [rsp+7*mmsize+0*gprsize], r3        ; left_offxy
2922%endif
2923    mov [rsp+7*mmsize+2*gprsize], r6        ; topleft_offxy
2924
2925    DEFINE_ARGS tmp, src, scaling, see, w, picptr, unused
2926
2927    mov            seed, r3m
2928    xor            tmpd, tmpd
2929%else
2930    DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \
2931                tmp, offxy, see, left_offxy, top_offxy, topleft_offxy, lstride
2932
2933%if %2
2934    lea  topleft_offxyq, [top_offxyq+16]
2935    lea     left_offxyq, [offxyq+16]
2936%else
2937    mov  topleft_offxyq, top_offxyq
2938    mov     left_offxyq, offxyq
2939%endif
2940
2941    ; we assume from the block above that bits 8-15 of tmpd are zeroed
2942%endif
2943    mov             r6d, seed
2944    or             seed, 0xeff4eff4
2945    test           seeb, seeh
2946    setp           tmpb                     ; parity of top_seed
2947    shr            seed, 16
2948    shl            tmpd, 16
2949    test           seeb, seeh
2950    setp           tmpb                     ; parity of cur_seed
2951    or              r6d, 0x00010001
2952    xor            tmpd, r6d
2953    mov            seed, tmpd
2954    ror            seed, 1                  ; updated (cur_seed << 16) | top_seed
2955
2956%if ARCH_X86_32
2957    mov             r3m, seed
2958
2959    DEFINE_ARGS tmp, src, scaling, offy, w, picptr, offx
2960
2961    mov           offxd, offyd
2962%else
2963    DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \
2964                offx, offy, see, left_offxy, top_offxy, topleft_offxy, lstride
2965
2966    mov           offxd, seed
2967    mov           offyd, seed
2968%endif
2969    ror           offyd, 8
2970    ror           offxd, 12
2971    and           offyd, 0xf000f
2972    and           offxd, 0xf000f
2973    imul          offyd, 164>>%3
2974    ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy
2975    lea           offyq, [offyq+offxq*(2-%2)+0x10001*((3+(6>>%3))*82+3+(6>>%2))+(32>>%3)*82]
2976
2977%if ARCH_X86_32
2978    DEFINE_ARGS top_offxy, src, scaling, offxy, h, picptr, grain_lut
2979%else
2980    DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \
2981                h, offxy, see, left_offxy, top_offxy, topleft_offxy, lstride, luma_bak
2982%endif
2983
2984    movzx    top_offxyd, offxyw
2985    shr          offxyd, 16
2986%if ARCH_X86_32
2987    mov [rsp+7*mmsize+1*gprsize], top_offxyd
2988%endif
2989
2990    mov              hd, r7m
2991    mov      grain_lutq, grain_lutmp
2992%if ARCH_X86_32
2993    mov              r5, r5m
2994%endif
2995%if %3
2996    mova             m3, [PIC_ptr(pb_23_22)]
2997%else
2998    mova             m3, [PIC_ptr(pb_27_17)]
2999%endif
3000%%loop_y_hv_overlap:
3001    ; grain = grain_lut[offy+y][offx+x]
3002%if ARCH_X86_32
3003    mov              r0, [rsp+7*mmsize+2*gprsize]       ; topleft_offxy
3004    mov              r5, [rsp+7*mmsize+1*gprsize]       ; top_offxy
3005    movd             m1, [grain_lutq+r0]
3006    mov              r0, [rsp+7*mmsize+0*gprsize]       ; left_offxy
3007%else
3008    movd             m1, [grain_lutq+topleft_offxyq]
3009%endif
3010    movu             m2, [grain_lutq+offxyq]
3011%if ARCH_X86_32
3012    movu             m6, [grain_lutq+r5]
3013    movd             m4, [grain_lutq+r0]
3014%else
3015    movu             m6, [grain_lutq+top_offxyq]
3016    movd             m4, [grain_lutq+left_offxyq]
3017%endif
3018    ; do h interpolation first (so top | top/left -> top, left | cur -> cur)
3019    punpcklbw        m1, m6
3020    punpcklbw        m4, m2
3021    pmaddubsw        m0, m9, m1
3022    pmaddubsw        m1, m9, m4
3023    REPX {pmulhrsw x, m8}, m0, m1
3024    packsswb         m0, m1
3025    shufps           m4, m0, m2, q3232
3026    shufps           m0, m6, q3210
3027    ; followed by v interpolation (top | cur -> cur)
3028    punpcklbw        m2, m0, m4
3029    punpckhbw        m0, m4
3030    pmaddubsw        m4, m3, m0
3031    pmaddubsw        m1, m3, m2
3032    pmulhrsw         m4, m8
3033    pmulhrsw         m1, m8
3034    packsswb         m1, m4
3035
3036    ; src
3037%if ARCH_X86_32
3038    DEFINE_ARGS luma, src, scaling, offxy, w, picptr, grain_lut
3039
3040    mov           lumaq, r9mp
3041%endif
3042%if %2
3043    mova             m4, [lumaq+ 0]
3044    mova             m6, [lumaq+16]
3045    mova             m0, [srcq]
3046%if ARCH_X86_32
3047    add           lumaq, r10mp
3048    mov            r9mp, lumaq
3049    mov              r5, r5m
3050    movd             m7, [base+pb_1]
3051%else
3052    movd             m7, [pb_1]
3053%endif
3054    pshufd           m7, m7, q0000
3055    pxor             m2, m2
3056    pmaddubsw        m4, m7
3057    pmaddubsw        m6, m7
3058    pavgw            m4, m2
3059    pavgw            m6, m2
3060%else
3061    mova             m4, [lumaq]
3062    mova             m0, [srcq]
3063%if ARCH_X86_32
3064    add           lumaq, r10mp
3065    mov            r9mp, lumaq
3066%endif
3067    pxor             m2, m2
3068%endif
3069
3070%if %1
3071%if %2
3072    packuswb         m4, m6                 ; luma
3073%endif
3074    punpckhbw        m6, m4, m0
3075    punpcklbw        m4, m0                 ; { luma, chroma }
3076    pmaddubsw        m6, m14
3077    pmaddubsw        m4, m14
3078    psraw            m6, 6
3079    psraw            m4, 6
3080    paddw            m6, m15
3081    paddw            m4, m15
3082    packuswb         m4, m6                 ; pack+unpack = clip
3083    punpckhbw        m6, m4, m2
3084    punpcklbw        m4, m2
3085%elif %2 == 0
3086    punpckhbw        m6, m4, m2
3087    punpcklbw        m4, m2
3088%endif
3089
3090    ; scaling[luma_src]
3091%if ARCH_X86_32
3092    vpgatherdw       m7, m4, scalingq-1, r0, r5
3093    vpgatherdw       m5, m6, scalingq-1, r0, r5
3094%else
3095%if %3
3096    vpgatherdw       m7, m4, scalingq-1, r2, r12
3097    vpgatherdw       m5, m6, scalingq-1, r2, r12
3098%else
3099    vpgatherdw       m7, m4, scalingq-1, r2, r13
3100    vpgatherdw       m5, m6, scalingq-1, r2, r13
3101%endif
3102%endif
3103    REPX {psrlw x, 8}, m7, m5
3104
3105    ; unpack grain
3106    pxor             m4, m4
3107    pcmpgtb          m4, m1
3108    punpcklbw        m2, m1, m4
3109    punpckhbw        m1, m4
3110
3111    ; noise = round2(scaling[luma_src] * grain, scaling_shift)
3112    pmullw           m2, m7
3113    pmullw           m1, m5
3114    pmulhrsw         m2, m11
3115    pmulhrsw         m1, m11
3116
3117%if ARCH_X86_32
3118    DEFINE_ARGS dst, src, scaling, offxy, h, picptr, grain_lut
3119%endif
3120
3121    ; unpack chroma source
3122    pxor             m4, m4
3123    punpckhbw        m5, m0, m4
3124    punpcklbw        m0, m4                 ; m0-1: src as word
3125
3126    ; dst = clip_pixel(src, noise)
3127    paddw            m0, m2
3128    paddw            m5, m1
3129    pmaxsw           m0, m13
3130    pmaxsw           m5, m13
3131    pminsw           m0, m12
3132    pminsw           m5, m12
3133    packuswb         m0, m5
3134    movifnidn      dstq, dstmp
3135    mova    [dstq+srcq], m0
3136
3137%if ARCH_X86_32
3138    add            srcq, r2mp
3139    ; lumaq has been adjusted above already
3140%else
3141    add            srcq, r12mp
3142%if %3
3143    lea           lumaq, [lumaq+lstrideq*(1+%2)]
3144%else
3145    add           lumaq, r10mp
3146%endif
3147%endif
3148    add      grain_lutq, 82
3149    dec              hw
3150%if %3
3151    jg %%loop_y_h_overlap
3152%else
3153    jle %%end_y_hv_overlap
3154%if ARCH_X86_32
3155    mov              r5, r5m
3156%endif
3157    mova             m3, [PIC_ptr(pb_17_27)]
3158    btc              hd, 16
3159    jnc %%loop_y_hv_overlap
3160%if ARCH_X86_64
3161    mov        lstrideq, r10mp
3162%endif
3163    jmp %%loop_y_h_overlap
3164%%end_y_hv_overlap:
3165%if ARCH_X86_64
3166    mov        lstrideq, r10mp
3167%endif
3168%endif
3169
3170%if ARCH_X86_32
3171    DEFINE_ARGS luma, src, scaling, offxy, w, picptr, grain_lut
3172
3173    mov              wq, r4m
3174%endif
3175    add              wq, 16
3176    jge %%end_hv
3177%if ARCH_X86_32
3178    mov            srcq, r1mp
3179    mov           lumaq, r11mp
3180%else
3181    mov            srcq, r11mp
3182%endif
3183    lea           lumaq, [luma_bakq+wq*(1+%2)]
3184    add            srcq, wq
3185%if ARCH_X86_32
3186    mov             r4m, wq
3187    mov             r9m, lumaq
3188%endif
3189%if %2
3190    jmp %%loop_x_hv_overlap
3191%else
3192%if ARCH_X86_32
3193    add dword [rsp+7*mmsize+1*gprsize], 16
3194%else
3195    add      top_offxyd, 16
3196%endif
3197    add          offxyd, 16
3198    xor       dword r8m, 4
3199    jmp %%loop_x_odd_v_overlap
3200%endif
3201
3202%%end_hv:
3203    RET
3204%endmacro
3205
3206    %%FGUV_32x32xN_LOOP 1, %2, %3
3207.csfl:
3208    %%FGUV_32x32xN_LOOP 0, %2, %3
3209%endmacro
3210
3211FGUV_FN 420, 1, 1
3212
3213%if STACK_ALIGNMENT < mmsize
3214DECLARE_ARG 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12
3215%endif
3216
3217FGUV_FN 422, 1, 0
3218
3219%if STACK_ALIGNMENT < mmsize
3220DECLARE_ARG 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12
3221%endif
3222
3223FGUV_FN 444, 0, 0
3224